├── erag
│   ├── utils
│   │   ├── __init__.py
│   │   └── utils.py
│   ├── __init__.py
│   └── eval.py
├── pyproject.toml
├── setup.py
├── LICENSE
└── README.md

--------------------------------------------------------------------------------
/erag/utils/__init__.py:
--------------------------------------------------------------------------------
from .utils import *

--------------------------------------------------------------------------------
/erag/__init__.py:
--------------------------------------------------------------------------------
from .eval import eval
from .utils import utils

--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"

--------------------------------------------------------------------------------
/erag/utils/utils.py:
--------------------------------------------------------------------------------

def batchify_text_generator(func):
    # Wraps a single-input text generator func(query, documents) so that it
    # accepts a dictionary mapping each query to its retrieved documents.
    def batchified_text_gen(current_input):
        answers = dict()
        for key, value in current_input.items():
            answers[key] = func(key, value)
        return answers
    return batchified_text_gen

def batchify_downstream_metric(func):
    # Wraps a single-input metric func(generated, expected) so that it accepts
    # dictionaries of generated and expected outputs keyed by query.
    def batchified_metric(generated, expected):
        answers = dict()
        for key, value in generated.items():
            answers[key] = func(value, expected[key])
        return answers
    return batchified_metric

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import setuptools

with open("README.md", "r") as fh:
    long_description = fh.read()

setuptools.setup(
    name="erag",
    version="0.0.1",
    author="Alireza Salemi",
    author_email="asalemi@cs.umass.edu",
    description="The implementation of the eRAG score.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/alirezasalemi7/eRAG",
    packages=setuptools.find_packages(
        include=['erag*'],  # ['*'] by default
        exclude=['erag.tests', 'erag.eval']
    ),
    install_requires=['pytrec_eval == 0.5'],
)

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 Alireza Salemi

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/erag/eval.py:
--------------------------------------------------------------------------------
from typing import Dict, Callable, List, Union, Set
import pytrec_eval

def eval(
    retrieval_results: Dict[str, List[str]],
    expected_outputs: Dict[str, List[str]],
    text_generator: Callable[[Dict[str, List[str]]], Dict[str, str]],
    downstream_metric: Callable[[Dict[str, str], Dict[str, List[str]]], Dict[str, Union[int, float]]],
    retrieval_metrics: Set[str],
    **args,
):
    """
    This function returns the eRAG score, as introduced in "Evaluating Retrieval Quality
    in Retrieval-Augmented Generation" (https://doi.org/10.1145/3626772.3657957).

    Args:
        retrieval_results ('Dict[str, List[str]]'):
            A dictionary whose keys are the text inputs and whose values are the lists of
            retrieval results, in 'str' format, that should be evaluated.

        expected_outputs ('Dict[str, List[str]]'):
            A dictionary whose keys are the text inputs and whose values are the lists of
            expected outputs that the 'text_generator' function should generate for each input.

        text_generator ('Callable[[Dict[str, List[str]]], Dict[str, str]]'):
            A callable object that takes a dictionary mapping each textual input to its retrieval
            list and returns a dictionary mapping each textual input to the corresponding generated output.

        downstream_metric ('Callable[[Dict[str, str], Dict[str, List[str]]], Dict[str, Union[int, float]]]'):
            A callable object that takes a dictionary mapping textual inputs to the corresponding prediction
            text as the first argument and a dictionary mapping textual inputs to the corresponding list of
            gold outputs as the second argument, and returns a score for each prediction text.
            The generated scores should be in the range [0, 1].

        retrieval_metrics ('Set[str]'):
            The set of information retrieval metrics that should be used to evaluate the retrieval results.
            We follow the same format as the pytrec_eval library for defining metrics: "https://github.com/cvangysel/pytrec_eval"

    Returns:
        A dictionary containing the per-input eRAG scores and the aggregated eRAG score.
    """

    assert set(retrieval_results.keys()) == set(expected_outputs.keys()), 'The keys in retrieval results and expected outputs do not match.'

    max_length_retrieval_lists = max(len(lst) for lst in retrieval_results.values())
    # Create one single-document input per (query, document) pair.
    flatten_inputs = {
        f'{query}@{i}' : {'query' : query, 'document' : [doc]} for query, documents in retrieval_results.items() for i, doc in enumerate(documents)
    }

    evaluation_scores = dict()

    # Process the i-th retrieved document of every query together, so that the
    # text generator receives batched inputs.
    for i in range(max_length_retrieval_lists):
        current_input = dict()
        current_expected_outputs = dict()
        for query in retrieval_results.keys():
            if f'{query}@{i}' in flatten_inputs.keys():
                item = flatten_inputs[f'{query}@{i}']
                current_input[item['query']] = item['document']
                current_expected_outputs[item['query']] = expected_outputs[item['query']]
        current_generated_outputs = text_generator(current_input)
        assert set(current_input.keys()) == set(current_generated_outputs.keys()), 'The text_generator function did not return outputs for all given inputs.'
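        # Score each per-document generation against the gold outputs; these
        # downstream scores serve as document-level relevance labels for eRAG.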
        current_evaluation_scores = downstream_metric(current_generated_outputs, current_expected_outputs)
        assert set(current_generated_outputs.keys()) == set(current_evaluation_scores.keys()), 'The downstream_metric function did not return evaluation scores for all given inputs.'
        for query, score in current_evaluation_scores.items():
            evaluation_scores[f'{query}@{i}'] = score

    qrel = dict()
    run = dict()

    binary_downstream_metric = True

    # Build a pytrec_eval-style run (the original ranking) and qrel (the
    # per-document downstream scores used as relevance labels).
    for query in retrieval_results.keys():
        run[query] = dict()
        qrel[query] = dict()
        for j in range(len(retrieval_results[query])):
            run[query][str(j)] = len(retrieval_results[query]) - j
            qrel[query][str(j)] = evaluation_scores[f'{query}@{j}']

            if qrel[query][str(j)] in [0, 1]:
                qrel[query][str(j)] = int(qrel[query][str(j)])
            if qrel[query][str(j)] not in [0, 1]:
                binary_downstream_metric = False
                if qrel[query][str(j)] > 1 or qrel[query][str(j)] < 0:
                    raise RuntimeError('The values returned by downstream_metric must be in the range [0, 1].')

    if binary_downstream_metric:
        evaluator = pytrec_eval.RelevanceEvaluator(qrel, retrieval_metrics)
        results = evaluator.evaluate(run)
    else:
        # pytrec_eval expects integer relevance labels, so continuous downstream
        # scores are aggregated manually (precision and success only).
        results = dict()
        for query, labels in qrel.items():
            results[query] = dict()
            for metric in retrieval_metrics:
                if "_" in metric:
                    metric_without_cut = metric[:metric.find("_")]
                    if metric_without_cut not in {'success', 'P'}:
                        raise RuntimeError('The provided retrieval metrics cannot be used with a continuous downstream metric. The supported retrieval metrics are ["success", "P"].')
                    cut_value = int(metric[metric.find("_")+1:])
                else:
                    metric_without_cut = metric
                    cut_value = len(labels)
                if metric_without_cut == 'success':
                    max_value = 0
                    for i in range(cut_value):
                        max_value = max(max_value, labels[str(i)])
                    results[query][metric] = max_value
                elif metric_without_cut == 'P':
                    mean_value = 0
                    for i in range(cut_value):
                        mean_value += labels[str(i)]
                    results[query][metric] = mean_value / cut_value
                else:
                    raise RuntimeError('The provided retrieval metrics cannot be used with a continuous downstream metric. The supported retrieval metrics are ["success", "P"].')
    final_results = {'per_input' : results, 'aggregated' : dict()}
    for metric in retrieval_metrics:
        values = [value[metric] for key, value in results.items()]
        final_results['aggregated'][metric] = sum(values) / len(values)
    return final_results

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# eRAG: Evaluating Retrieval Quality in Retrieval-Augmented Generation

This repository contains the code and package for the SIGIR 2024 paper titled [Evaluating Retrieval Quality in Retrieval-Augmented Generation](https://doi.org/10.1145/3626772.3657957).

Evaluating retrieval-augmented generation (RAG) presents challenges, particularly for retrieval models within these systems. Traditional end-to-end evaluation methods are computationally expensive. Furthermore, evaluation of the retrieval model's performance based on query-document relevance labels shows a small correlation with the RAG system's downstream performance.
We propose a novel evaluation approach, eRAG, where each document in the retrieval list is individually utilized by the large language model within the RAG system. The output generated for each document is then evaluated based on the downstream task ground truth labels. In this manner, the downstream performance for each document serves as its relevance label. We employ various downstream task metrics to obtain document-level annotations and aggregate them using set-based or ranking metrics. Extensive experiments on a wide range of datasets demonstrate that eRAG achieves a higher correlation with downstream RAG performance compared to baseline methods, with improvements in Kendall's $\tau$ correlation ranging from 0.168 to 0.494. Additionally, eRAG offers significant computational advantages, improving runtime and consuming up to 50 times less GPU memory than end-to-end evaluation.


## Installation

You can install the eRAG evaluation package using the following command:

```
pip install erag
```

## Documentation

To calculate the eRAG score, you should use the `eval` function with the following arguments:

- **retrieval_results**: This is a dictionary in which the keys are the queries and the values are lists containing the text of the retrieved documents for each query. e.g.,

```
retrieval_results = {
    "What position does Harry play on the Gryffindor Quidditch team?" : [
        "Quidditch /ˈkwɪdɪtʃ/ is a fictional sport invented by author J. K. Rowling for her fantasy book series Harry Potter. It first appeared in the novel Harry Potter and the Philosopher's Stone (1997). In the series, Quidditch is portrayed as a dangerous but popular sport played by witches and wizards riding flying broomsticks.",
        "Matches are played on a large oval pitch with three ring-shaped goals of different heights on each side, between two opposing teams of seven players each: three Chasers, two Beaters, the Keeper, and the Seeker. The Chasers and the Keeper respectively score with and defend the goals against the Quaffle; the two Beaters bat the Bludgers away from their teammates and towards their opponents; and the Seeker locates and catches the Golden Snitch, whose capture simultaneously wins the Seeker's team 150 points and ends the game. The team with the most points at the end wins.",
        "Harry Potter plays as Seeker for his house team at Hogwarts. Regional and international Quidditch competitions are mentioned throughout the series. Aspects of the sport's history are revealed in Quidditch Through the Ages, published by Rowling in 2001 to benefit Comic Relief."
    ],
    "Who is the Headmaster of Hogwarts when Harry arrives?" : [
        "Prof. Albus Percival Wulfric Brian Dumbledore is a fictional character in J. K. Rowling's Harry Potter series. For most of the series, he is the headmaster of the wizarding school Hogwarts. As part of his backstory, it is revealed that he is the founder and leader of the Order of the Phoenix, an organisation dedicated to fighting Lord Voldemort, the main antagonist of the series.",
        "Dumbledore was portrayed by Richard Harris in the film adaptations of Harry Potter and the Philosopher's Stone (2001) and Harry Potter and the Chamber of Secrets (2002). Following Harris' death in October 2002, Michael Gambon portrayed Dumbledore in the six remaining Harry Potter films from 2004 to 2011. Jude Law portrayed Dumbledore as a middle-aged man in the prequel films Fantastic Beasts: The Crimes of Grindelwald (2018) and Fantastic Beasts: The Secrets of Dumbledore (2022)."
    ]
}
```

- **expected_outputs**: This is a dictionary in which the keys are the queries and the values are lists of the corresponding expected outputs for each query. Note that the keys in *expected_outputs* and *retrieval_results* must be the same. e.g.,

```
expected_outputs = {
    "What position does Harry play on the Gryffindor Quidditch team?" : ["seeker"],
    "Who is the Headmaster of Hogwarts when Harry arrives?" : ["Albus Dumbledore", "Albus Percival Wulfric Brian Dumbledore", "Dumbledore"]
}
```


- **text_generator**: This is a function that takes a dictionary as input, where the keys are the queries and the values are the retrieved documents, and returns a dictionary where the keys are the queries and the values are the strings generated by the generative model in the RAG pipeline for the corresponding query. e.g.,

```
def text_generator(queries_and_documents):
    from openai import OpenAI
    client = OpenAI(api_key="...")
    results = dict()
    for question, documents in queries_and_documents.items():
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": f"question: {question} context: {' '.join(documents)} answer: ",
                }
            ],
            model="gpt-4-1106-preview",
        )
        results[question] = chat_completion.choices[0].message.content
    return results
```

In this example, we utilize *GPT-4* to generate outputs for each input question using the retrieved documents.

**Note:** If you already have a function that, given a single query and its set of retrieved documents, generates the output for that input, you can use the provided utility function to make it work on a batch of inputs, as shown below:

```
from erag.utils import batchify_text_generator

def text_generator_single_input(query, documents):
    ...
    return output

batch_text_generator = batchify_text_generator(text_generator_single_input)
```

Now, you can use *batch_text_generator* for eRAG.
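For instance, a minimal sketch of a single-input generator built on a local model and wrapped with *batchify_text_generator* might look like the following (the Hugging Face pipeline, the `google/flan-t5-base` model, the prompt format, and the generation settings are illustrative assumptions, not part of eRAG):

```
from erag.utils import batchify_text_generator
from transformers import pipeline

# Hypothetical local generator; any instruction-tuned model could be used instead.
generator = pipeline("text2text-generation", model="google/flan-t5-base")

def flan_t5_single_input(query, documents):
    # Build a simple question + context prompt from the retrieved documents.
    prompt = f"question: {query} context: {' '.join(documents)}"
    return generator(prompt, max_new_tokens=32)[0]["generated_text"]

# batchify_text_generator turns the per-query function into the batched
# interface (a dictionary of query -> documents) that erag.eval expects.
batch_text_generator = batchify_text_generator(flan_t5_single_input)
```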
- **downstream_metric**: This is a function that takes two dictionaries as input; the first argument is a dictionary whose keys are the input queries and whose values are the texts generated by the *text_generator* function, and the second argument is *expected_outputs*, explained above. The job of this function is to evaluate the usefulness of each generated text by comparing it to the expected outputs. Note that the returned values must be between 0 and 1. e.g.,

```
from rouge_score import rouge_scorer

def rouge_metric(generated_outputs, expected_outputs):
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    results = dict()
    for query, gen_output in generated_outputs.items():
        expe_outputs_query = expected_outputs[query]
        max_value = 0
        for exp_output in expe_outputs_query:
            max_value = max(scorer.score(exp_output, gen_output)['rougeL'].fmeasure, max_value)
        results[query] = max_value
    return results
```

**Note:** If you already have a function that, given a single generated output and a list of expected outputs, produces a score for that input, you can use the provided utility function to make it work on a batch of inputs:

```
from erag.utils import batchify_downstream_metric

def score_single_input(generated_output, expected_outputs):
    ...
    return output

batch_scorer = batchify_downstream_metric(score_single_input)
```

Now, you can use *batch_scorer* for eRAG.

- **retrieval_metrics**: This is a set of retrieval metric names that eRAG uses to aggregate the scores of individual documents in the ranked list for each query. We follow the [*pytrec_eval*](https://github.com/cvangysel/pytrec_eval) format for naming metrics. Note that when the values returned by *downstream_metric* are not binary (zero or one), the only supported metrics are precision ('P') and hit ratio ('success'). e.g.,

```
retrieval_metrics = {'P_10', 'success', 'recall', 'map'}
```


Given all these arguments, you can use the following code to evaluate your results:

```
import erag

results = erag.eval(
    retrieval_results = retrieval_results,
    expected_outputs = expected_outputs,
    text_generator = text_generator,
    downstream_metric = downstream_metric,
    retrieval_metrics = retrieval_metrics
)
```
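The call returns a dictionary with a `per_input` entry (one score dictionary per query) and an `aggregated` entry (the mean over all queries), following the structure built in `erag/eval.py`. A rough sketch of its shape for the example above, with purely illustrative numbers and metric keys following pytrec_eval's naming:

```
results = {
    'per_input' : {
        "What position does Harry play on the Gryffindor Quidditch team?" : {'P_10' : 0.3, 'map' : 0.5},
        "Who is the Headmaster of Hogwarts when Harry arrives?" : {'P_10' : 0.2, 'map' : 1.0}
    },
    'aggregated' : {'P_10' : 0.25, 'map' : 0.75}
}
```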
## Examples

This [Colab notebook](https://colab.research.google.com/drive/1kMPRGowsVse56iGOei2Xaolk_zFw01S_?usp=sharing) shows examples of how to use eRAG for evaluating retrieval results in a RAG pipeline.

## Reference

[Evaluating Retrieval Quality in Retrieval-Augmented Generation](https://doi.org/10.1145/3626772.3657957)

```
@inproceedings{salemi2024evaluating,
  author = {Salemi, Alireza and Zamani, Hamed},
  title = {Evaluating Retrieval Quality in Retrieval-Augmented Generation},
  year = {2024},
  isbn = {9798400704314},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3626772.3657957},
  doi = {10.1145/3626772.3657957},
  booktitle = {Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval},
  pages = {2395–2400},
  numpages = {6},
  keywords = {evaluation, retrieval quality, retrieval-augmented generation},
  location = {Washington DC, USA},
  series = {SIGIR '24}
}
```

## Acknowledgments

This work was supported in part by the Center for Intelligent Information Retrieval, in part by Lowe’s, and in part by an Amazon Research Award, Fall 2022 CFP. Any opinions, findings, and conclusions or recommendations expressed in this material are those of the authors and do not necessarily reflect those of the sponsor.

--------------------------------------------------------------------------------