├── erag
│   ├── utils
│   │   ├── __init__.py
│   │   └── utils.py
│   ├── __init__.py
│   └── eval.py
├── pyproject.toml
├── setup.py
├── LICENSE
└── README.md

--------------------------------------------------------------------------------
/erag/utils/__init__.py:
--------------------------------------------------------------------------------
from .utils import *

--------------------------------------------------------------------------------
/erag/__init__.py:
--------------------------------------------------------------------------------
from .eval import eval
from .utils import utils

--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"

--------------------------------------------------------------------------------
/erag/utils/utils.py:
--------------------------------------------------------------------------------

def batchify_text_generator(func):
    # Wraps a single-input text generator func(query, documents) so that it
    # accepts a dictionary mapping each query to its retrieved documents.
    def batchified_text_gen(current_input):
        answers = dict()
        for key, value in current_input.items():
            answers[key] = func(key, value)
        return answers
    return batchified_text_gen

def batchify_downstream_metric(func):
    # Wraps a single-input metric func(generated, expected) so that it accepts
    # dictionaries of generated and expected outputs keyed by query.
    def batchified_metric(generated, expected):
        answers = dict()
        for key, value in generated.items():
            answers[key] = func(value, expected[key])
        return answers
    return batchified_metric

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import setuptools

with open("README.md", "r") as fh:
    long_description = fh.read()

setuptools.setup(
    name="erag",
    version="0.0.1",
    author="Alireza Salemi",
    author_email="asalemi@cs.umass.edu",
    description="The implementation of the eRAG score.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/alirezasalemi7/eRAG",
    packages=setuptools.find_packages(
        include=['erag*'],  # ['*'] by default
        exclude=['erag.tests', 'erag.eval']
    ),
    install_requires=['pytrec_eval == 0.5'],
)

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 Alireza Salemi

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/erag/eval.py:
--------------------------------------------------------------------------------
from typing import Dict, Callable, List, Union, Set
import pytrec_eval

def eval(
    retrieval_results: Dict[str, List[str]],
    expected_outputs: Dict[str, List[str]],
    text_generator: Callable[[Dict[str, List[str]]], Dict[str, str]],
    downstream_metric: Callable[[Dict[str, str], Dict[str, List[str]]], Dict[str, Union[int, float]]],
    retrieval_metrics: Set[str],
    **args,
):
    """
    This function returns the eRAG score, as introduced in "Evaluating Retrieval Quality
    in Retrieval-Augmented Generation" (https://doi.org/10.1145/3626772.3657957).

    Args:
        retrieval_results ('Dict[str, List[str]]'):
            A dictionary whose keys are the text inputs and whose values are the lists of
            retrieval results, in 'str' format, that should be evaluated.

        expected_outputs ('Dict[str, List[str]]'):
            A dictionary whose keys are the text inputs and whose values are the lists of
            expected outputs that the 'text_generator' function should generate for each input.

        text_generator ('Callable[[Dict[str, List[str]]], Dict[str, str]]'):
            A callable object that takes a dictionary mapping each textual input to its retrieval
            list and returns a dictionary mapping each textual input to the corresponding generated output.

        downstream_metric ('Callable[[Dict[str, str], Dict[str, List[str]]], Dict[str, Union[int, float]]]'):
            A callable object that takes a dictionary mapping textual inputs to the corresponding prediction
            text as the first argument and a dictionary mapping textual inputs to the corresponding list of
            gold outputs as the second argument, and returns a score for each prediction text.
            The generated scores should be in the range [0, 1].

        retrieval_metrics ('Set[str]'):
            The set of information retrieval metrics that should be used to evaluate the retrieval results.
            We follow the same format as the pytrec_eval library for defining metrics: "https://github.com/cvangysel/pytrec_eval"

    Returns:
        A dictionary containing the per-input eRAG scores and the aggregated eRAG score.
    """

    assert set(retrieval_results.keys()) == set(expected_outputs.keys()), 'The keys in retrieval results and expected outputs do not match.'

    max_length_retrieval_lists = max(len(lst) for lst in retrieval_results.values())
    # Create one single-document input per (query, document) pair.
    flatten_inputs = {
        f'{query}@{i}' : {'query' : query, 'document' : [doc]} for query, documents in retrieval_results.items() for i, doc in enumerate(documents)
    }

    evaluation_scores = dict()

    # Process the i-th retrieved document of every query together, so that the
    # text generator receives batched inputs.
    for i in range(max_length_retrieval_lists):
        current_input = dict()
        current_expected_outputs = dict()
        for query in retrieval_results.keys():
            if f'{query}@{i}' in flatten_inputs.keys():
                item = flatten_inputs[f'{query}@{i}']
                current_input[item['query']] = item['document']
                current_expected_outputs[item['query']] = expected_outputs[item['query']]
        current_generated_outputs = text_generator(current_input)
        assert set(current_input.keys()) == set(current_generated_outputs.keys()), 'The text_generator function did not return outputs for all given inputs.'
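        # Score each per-document generation against the gold outputs; these
        # downstream scores serve as document-level relevance labels for eRAG.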
        current_evaluation_scores = downstream_metric(current_generated_outputs, current_expected_outputs)
        assert set(current_generated_outputs.keys()) == set(current_evaluation_scores.keys()), 'The downstream_metric function did not return evaluation scores for all given inputs.'
        for query, score in current_evaluation_scores.items():
            evaluation_scores[f'{query}@{i}'] = score

    qrel = dict()
    run = dict()

    binary_downstream_metric = True

    # Build a pytrec_eval-style run (the original ranking) and qrel (the
    # per-document downstream scores used as relevance labels).
    for query in retrieval_results.keys():
        run[query] = dict()
        qrel[query] = dict()
        for j in range(len(retrieval_results[query])):
            run[query][str(j)] = len(retrieval_results[query]) - j
            qrel[query][str(j)] = evaluation_scores[f'{query}@{j}']

            if qrel[query][str(j)] in [0, 1]:
                qrel[query][str(j)] = int(qrel[query][str(j)])
            if qrel[query][str(j)] not in [0, 1]:
                binary_downstream_metric = False
                if qrel[query][str(j)] > 1 or qrel[query][str(j)] < 0:
                    raise RuntimeError('The values returned by downstream_metric must be in the range [0, 1].')

    if binary_downstream_metric:
        evaluator = pytrec_eval.RelevanceEvaluator(qrel, retrieval_metrics)
        results = evaluator.evaluate(run)
    else:
        # pytrec_eval expects integer relevance labels, so continuous downstream
        # scores are aggregated manually (precision and success only).
        results = dict()
        for query, labels in qrel.items():
            results[query] = dict()
            for metric in retrieval_metrics:
                if "_" in metric:
                    metric_without_cut = metric[:metric.find("_")]
                    if metric_without_cut not in {'success', 'P'}:
                        raise RuntimeError('The provided retrieval metrics cannot be used with a continuous downstream metric. The supported retrieval metrics are ["success", "P"].')
                    cut_value = int(metric[metric.find("_")+1:])
                else:
                    metric_without_cut = metric
                    cut_value = len(labels)
                if metric_without_cut == 'success':
                    max_value = 0
                    for i in range(cut_value):
                        max_value = max(max_value, labels[str(i)])
                    results[query][metric] = max_value
                elif metric_without_cut == 'P':
                    mean_value = 0
                    for i in range(cut_value):
                        mean_value += labels[str(i)]
                    results[query][metric] = mean_value / cut_value
                else:
                    raise RuntimeError('The provided retrieval metrics cannot be used with a continuous downstream metric. The supported retrieval metrics are ["success", "P"].')
    final_results = {'per_input' : results, 'aggregated' : dict()}
    for metric in retrieval_metrics:
        values = [value[metric] for key, value in results.items()]
        final_results['aggregated'][metric] = sum(values) / len(values)
    return final_results

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# eRAG: Evaluating Retrieval Quality in Retrieval-Augmented Generation

This repository contains the code and package for the SIGIR 2024 paper titled [Evaluating Retrieval Quality in Retrieval-Augmented Generation](https://doi.org/10.1145/3626772.3657957).

Evaluating retrieval-augmented generation (RAG) presents challenges, particularly for retrieval models within these systems. Traditional end-to-end evaluation methods are computationally expensive. Furthermore, evaluation of the retrieval model's performance based on query-document relevance labels shows a small correlation with the RAG system's downstream performance.
We propose a novel evaluation approach, eRAG, where each document in the retrieval list is individually utilized by the large language model within the RAG system. The output generated for each document is then evaluated based on the downstream task ground truth labels. In this manner, the downstream performance for each document serves as its relevance label. We employ various downstream task metrics to obtain document-level annotations and aggregate them using set-based or ranking metrics. Extensive experiments on a wide range of datasets demonstrate that eRAG achieves a higher correlation with downstream RAG performance compared to baseline methods, with improvements in Kendall's $\tau$ correlation ranging from 0.168 to 0.494. Additionally, eRAG offers significant computational advantages, improving runtime and consuming up to 50 times less GPU memory than end-to-end evaluation.


## Installation

You can install the eRAG evaluation package using the following command:

```
pip install erag
```

## Documentation

To calculate the eRAG score, you should use the `eval` function with the following arguments:

- **retrieval_results**: This is a dictionary in which the keys are the queries and the values are lists containing the text of the retrieved documents for each query. e.g.,

```
retrieval_results = {
    "What position does Harry play on the Gryffindor Quidditch team?" : [
        "Quidditch /ˈkwɪdɪtʃ/ is a fictional sport invented by author J. K. Rowling for her fantasy book series Harry Potter. It first appeared in the novel Harry Potter and the Philosopher's Stone (1997). In the series, Quidditch is portrayed as a dangerous but popular sport played by witches and wizards riding flying broomsticks.",
        "Matches are played on a large oval pitch with three ring-shaped goals of different heights on each side, between two opposing teams of seven players each: three Chasers, two Beaters, the Keeper, and the Seeker. The Chasers and the Keeper respectively score with and defend the goals against the Quaffle; the two Beaters bat the Bludgers away from their teammates and towards their opponents; and the Seeker locates and catches the Golden Snitch, whose capture simultaneously wins the Seeker's team 150 points and ends the game. The team with the most points at the end wins.",
        "Harry Potter plays as Seeker for his house team at Hogwarts. Regional and international Quidditch competitions are mentioned throughout the series. Aspects of the sport's history are revealed in Quidditch Through the Ages, published by Rowling in 2001 to benefit Comic Relief."
    ],
    "Who is the Headmaster of Hogwarts when Harry arrives?" : [
        "Prof. Albus Percival Wulfric Brian Dumbledore is a fictional character in J. K. Rowling's Harry Potter series. For most of the series, he is the headmaster of the wizarding school Hogwarts. As part of his backstory, it is revealed that he is the founder and leader of the Order of the Phoenix, an organisation dedicated to fighting Lord Voldemort, the main antagonist of the series.",
        "Dumbledore was portrayed by Richard Harris in the film adaptations of Harry Potter and the Philosopher's Stone (2001) and Harry Potter and the Chamber of Secrets (2002). Following Harris' death in October 2002, Michael Gambon portrayed Dumbledore in the six remaining Harry Potter films from 2004 to 2011. Jude Law portrayed Dumbledore as a middle-aged man in the prequel films Fantastic Beasts: The Crimes of Grindelwald (2018) and Fantastic Beasts: The Secrets of Dumbledore (2022)."
    ]
}
```

- **expected_outputs**: This is a dictionary in which the keys are the queries and the values are lists of the corresponding expected outputs for each query. Note that the keys in *expected_outputs* and *retrieval_results* must be the same. e.g.,

```
expected_outputs = {
    "What position does Harry play on the Gryffindor Quidditch team?" : ["seeker"],
    "Who is the Headmaster of Hogwarts when Harry arrives?" : ["Albus Dumbledore", "Albus Percival Wulfric Brian Dumbledore", "Dumbledore"]
}
```


- **text_generator**: This is a function that takes a dictionary as input, where the keys are the queries and the values are the retrieved documents, and returns a dictionary where the keys are the queries and the values are the strings generated by the generative model in the RAG pipeline for the corresponding query. e.g.,

```
def text_generator(queries_and_documents):
    from openai import OpenAI
    client = OpenAI(api_key="...")
    results = dict()
    for question, documents in queries_and_documents.items():
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": f"question: {question} context: {' '.join(documents)} answer: ",
                }
            ],
            model="gpt-4-1106-preview",
        )
        results[question] = chat_completion.choices[0].message.content
    return results
```

In this example, we utilize *GPT-4* to generate outputs for each input question using the retrieved documents.

**Note:** If you already have a function that, given a single query and its set of retrieved documents, generates the output for that input, you can use the provided utility function to make it work on a batch of inputs, as shown below:

```
from erag.utils import batchify_text_generator

def text_generator_single_input(query, documents):
    ...
    return output

batch_text_generator = batchify_text_generator(text_generator_single_input)
```

Now, you can use *batch_text_generator* for eRAG.
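For instance, a minimal sketch of a single-input generator built on a local model and wrapped with *batchify_text_generator* might look like the following (the Hugging Face pipeline, the `google/flan-t5-base` model, the prompt format, and the generation settings are illustrative assumptions, not part of eRAG):

```
from erag.utils import batchify_text_generator
from transformers import pipeline

# Hypothetical local generator; any instruction-tuned model could be used instead.
generator = pipeline("text2text-generation", model="google/flan-t5-base")

def flan_t5_single_input(query, documents):
    # Build a simple question + context prompt from the retrieved documents.
    prompt = f"question: {query} context: {' '.join(documents)}"
    return generator(prompt, max_new_tokens=32)[0]["generated_text"]

# batchify_text_generator turns the per-query function into the batched
# interface (a dictionary of query -> documents) that erag.eval expects.
batch_text_generator = batchify_text_generator(flan_t5_single_input)
```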
- **downstream_metric**: This is a function that takes two dictionaries as input; the first argument is a dictionary whose keys are the input queries and whose values are the texts generated by the *text_generator* function, and the second argument is *expected_outputs*, explained above. The job of this function is to evaluate the usefulness of each generated text by comparing it to the expected outputs. Note that the returned values must be between 0 and 1. e.g.,

```
from rouge_score import rouge_scorer

def rouge_metric(generated_outputs, expected_outputs):
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    results = dict()
    for query, gen_output in generated_outputs.items():
        expe_outputs_query = expected_outputs[query]
        max_value = 0
        for exp_output in expe_outputs_query:
            max_value = max(scorer.score(exp_output, gen_output)['rougeL'].fmeasure, max_value)
        results[query] = max_value
    return results
```

**Note:** If you already have a function that, given a single generated output and a list of expected outputs, produces a score for that input, you can use the provided utility function to make it work on a batch of inputs:

```
from erag.utils import batchify_downstream_metric

def score_single_input(generated_output, expected_outputs):
    ...
    return output

batch_scorer = batchify_downstream_metric(score_single_input)
```

Now, you can use *batch_scorer* for eRAG.

- **retrieval_metrics**: This is a set of retrieval metric names that eRAG uses to aggregate the scores of individual documents in the ranked list for each query. We follow the [*pytrec_eval*](https://github.com/cvangysel/pytrec_eval) format for naming metrics. Note that when the values returned by *downstream_metric* are not binary (zero or one), the only supported metrics are precision ('P') and hit ratio ('success'). e.g.,

```
retrieval_metrics = {'P_10', 'success', 'recall', 'map'}
```


Given all these arguments, you can use the following code to evaluate your results:

```
import erag

results = erag.eval(
    retrieval_results = retrieval_results,
    expected_outputs = expected_outputs,
    text_generator = text_generator,
    downstream_metric = downstream_metric,
    retrieval_metrics = retrieval_metrics
)
```
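The call returns a dictionary with a `per_input` entry (one score dictionary per query) and an `aggregated` entry (the mean over all queries), following the structure built in `erag/eval.py`. A rough sketch of its shape for the example above, with purely illustrative numbers and metric keys following pytrec_eval's naming:

```
results = {
    'per_input' : {
        "What position does Harry play on the Gryffindor Quidditch team?" : {'P_10' : 0.3, 'map' : 0.5},
        "Who is the Headmaster of Hogwarts when Harry arrives?" : {'P_10' : 0.2, 'map' : 1.0}
    },
    'aggregated' : {'P_10' : 0.25, 'map' : 0.75}
}
```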
## Examples

This [Colab notebook](https://colab.research.google.com/drive/1kMPRGowsVse56iGOei2Xaolk_zFw01S_?usp=sharing) shows examples of how to use eRAG for evaluating retrieval results in a RAG pipeline.

## Reference

[Evaluating Retrieval Quality in Retrieval-Augmented Generation](https://doi.org/10.1145/3626772.3657957)

```
@inproceedings{salemi2024evaluating,
  author = {Salemi, Alireza and Zamani, Hamed},
  title = {Evaluating Retrieval Quality in Retrieval-Augmented Generation},
  year = {2024},
  isbn = {9798400704314},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3626772.3657957},
  doi = {10.1145/3626772.3657957},
  booktitle = {Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval},
  pages = {2395–2400},
  numpages = {6},
  keywords = {evaluation, retrieval quality, retrieval-augmented generation},
  location = {Washington DC, USA},
  series = {SIGIR '24}
}
```

## Acknowledgments

This work was supported in part by the Center for Intelligent Information Retrieval, in part by Lowe’s, and in part by an Amazon Research Award, Fall 2022 CFP. Any opinions, findings, and conclusions or recommendations expressed in this material are those of the authors and do not necessarily reflect those of the sponsor.

--------------------------------------------------------------------------------