├── .gitignore ├── LICENSE ├── README.md ├── figs ├── ragas_metrics.png └── score_distribs.png ├── rag-data ├── amnesty_qa-new-01.jsonl ├── amnesty_qa-new-01.tsv ├── amnesty_qa-new-02.jsonl ├── more-q-01.jsonl └── output-01.tsv ├── requirements.txt ├── resources ├── configs │ ├── answer_correctness-best.json │ ├── answer_relevance-best.json │ ├── context_precision-best.json │ ├── context_recall-best.json │ ├── context_relevance-best.json │ └── faithfulness-best.json ├── demodata │ └── amnesty_qa.jsonl ├── prompts │ ├── answer_correctness_1.txt │ ├── answer_relevance_1.txt │ ├── answer_relevance_2.txt │ ├── context_precision_1.txt │ ├── context_recall_1.txt │ ├── context_relevance_1.txt │ ├── faithfulness_1.txt │ ├── faithfulness_2.txt │ └── more_questions.txt └── reports │ ├── dspy-reports │ ├── answer_correctness_report.tsv │ ├── answer_relevance_report.tsv │ ├── context_precision_report.tsv │ ├── context_recall_report.tsv │ ├── context_relevance_report.tsv │ ├── context_utilization_report.tsv │ └── faithfulness_report.tsv │ └── lcel-reports │ ├── answer_correctness_report.tsv │ ├── answer_relevance_report.tsv │ ├── answer_similarity_report.tsv │ ├── context_precision_report.tsv │ ├── context_recall_report.tsv │ ├── context_relevance_report.tsv │ ├── context_utilization_report.tsv │ └── faithfulness_report.tsv └── src ├── convert_amnestyQA_to_desired_format.ipynb ├── generate_datasets.py ├── learned ├── __init__.py ├── answer_correctness.py ├── answer_relevance.py ├── context_precision.py ├── context_recall.py ├── context_relevance.py ├── faithfulness.py └── learning_utils.py ├── make_more_questions.py ├── metrics.py ├── prompted ├── __init__.py ├── answer_correctness.py ├── answer_relevance.py ├── answer_similarity.py ├── context_precision.py ├── context_recall.py ├── context_relevance.py ├── faithfulness.py ├── more_questions.py └── prompt_utils.py ├── run_learned_metrics.py ├── run_prompted_metrics.py ├── score_distributions.py ├── streamlit_app.py └── 
test_nltk.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into 
this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | # application specific 163 | **/data 164 | **/notes.txt 165 | **/try 166 | 167 | # mac thing 168 | .DS_Store 169 | **/.DS_Store -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # llm-rag-eval 2 | 3 | Large Language Model (LLM) powered evaluator for Retrieval Augmented Generation (RAG) pipelines. 4 | 5 | 6 | 7 | ## Google AI Hackathon Submission 8 | 9 | [Submission Video Link](https://youtu.be/yOIU65chc98) 10 | 11 | ### Inspiration 12 | 13 | Our project is inspired by the [RAGAS](https://github.com/explodinggradients/ragas) project which defines and implements 8 metrics to evaluate inputs and outputs of a Retrieval Augmented Generation (RAG) pipeline, and by ideas from the [ARES paper](https://arxiv.org/abs/2311.09476), which attempts to calibrate these LLM evaluators against human evaluators. 14 | 15 | ### What it does 16 | 17 | It provides an LLM based framework to evaluate the performance of RAG systems using a set of metrics that are optimized for the application domain it (the RAG system) operates in. We have used the Gemini Pro 1.0 from Google AI as the LLM the framework uses. 
We have also used the Google AI embedding model to generate embeddings for some of the metrics. 18 | 19 | ### How we built it 20 | 21 | 1. We re-implemented the RAGAS metrics using [LangChain Expression Language (LCEL)](https://python.langchain.com/docs/expression_language/) so we could access outputs of intermediate steps in metrics calculation. 22 | 2. We then implemented the metrics using [DSPy (Declarative Self-improving Language Programs in Python)](https://github.com/stanfordnlp/dspy) and optimized the prompts to minimize score difference with LCEL using a subset of examples for Few Shot Learning (using Bootstrap Few Shot with Random Search). 23 | 3. We evaluated the confidence of scores produced by LCEL and DSPy metric implementations. 24 | 4. We are building a tool that allows human oversight on the LCEL outputs (including intermediate steps) for Active Learning supervision. 25 | 5. We will re-optimize the DSPy metrics using recalculated scores based on tool updates. 26 | 27 | ### Challenges we ran into 28 | 29 | * DSPy has a steep learning curve and it is still a work in progress, so some parts of it don't work as expected 30 | * Our project grew iteratively as our understanding of the problem space grew, so we had to do some steps sequentially, leading to wasted time 31 | 32 | ### Accomplishments that we're proud of 33 | 34 | * How team members from different parts of the world came together and pooled their skills towards our common goal of building a set of domain optimized metrics. 35 | 36 | ### What we learned 37 | 38 | * We gained greater insight into the RAGAS metrics once we implemented them ourselves. We gained additional insight when building the tool using the intermediate outputs. 
39 | * Our team was not familiar with DSPy at all; we learned to use it and are very impressed with its capabilities 40 | 41 | ### What's next for llm-rag-eval 42 | 43 | We notice that most of our metrics involve predictive steps, where we predict a binary outcome given a pair of strings. These seem like variants of NLI (Natural Language Inference) which could be handled by non-LLM models, which are not only cheaper but also don't suffer from hallucinations, leading to more repeatable evaluations. It will require more data to train them, so we are starting to generate synthetic data, but this has other dependencies before we can start to offload these steps to smaller models. 44 | 45 | 46 | ## Running Prompted RAGAS metrics 47 | 48 | The following RAGAS metrics have been (re-)implemented in this project (because 49 | I had trouble making them work as-is, and because they are conceptually quite 50 | simple to implement). 51 | 52 | * [Faithfulness](https://docs.ragas.io/en/stable/concepts/metrics/faithfulness.html) -- given question, context and answer, measures factual consistency of the answer against the given context. 53 | * [Answer Relevance](https://docs.ragas.io/en/stable/concepts/metrics/answer_relevance.html) -- given question, context and answer, measures how pertinent the answer is to the question. 54 | * [Context Precision](https://docs.ragas.io/en/stable/concepts/metrics/context_precision.html) -- given question, context and ground truth, measures whether statements in the ground truth are found in context. 55 | * Context Utilization -- same as Context Precision, using the predicted answer instead of the ground truth. 56 | * [Context Relevance](https://docs.ragas.io/en/stable/concepts/metrics/context_relevancy.html) -- given question and context, measures relevance of the context against the question.
57 | * [Context Recall](https://docs.ragas.io/en/stable/concepts/metrics/context_recall.html) -- given the context and answer, measures the extent to which context aligns with the answer. 58 | * [Answer Similarity](https://docs.ragas.io/en/stable/concepts/metrics/semantic_similarity.html) -- given answer and ground truth, measures semantic similarity between them (cosine or cross encoder similarity). 59 | * [Answer Correctness](https://docs.ragas.io/en/stable/concepts/metrics/answer_correctness.html) -- given answer and ground truth, measures the accuracy of the answer given the facts in these two texts. 60 | 61 | The metrics described above can be run against your dataset by calling the `run_prompted_metrics.py` script in the `src` directory, with the path to the input (provided in JSON-L), the path to the output (TSV) file, and the appropriate metric name. 62 | 63 | ``` 64 | $ python3 run_prompted_metrics.py --help 65 | usage: run_prompted_metrics.py [-h] --metric 66 | {answer_correctness,answer_relevance,answer_similarity,context_precision,context_recall,context_relevance,context_utilization,faithfulness} 67 | --input INPUT_JSONL [--output OUTPUT_TSV] [--parallel] [--cross-encoder] 68 | 69 | options: 70 | -h, --help show this help message and exit 71 | --metric {answer_correctness,answer_relevance,answer_similarity,context_precision,context_recall,context_relevance,context_utilization,faithfulness} 72 | The metric to compute 73 | --input INPUT_JSONL 74 | Full path to evaluation data in JSONL format 75 | --output OUTPUT_TSV 76 | Full path to output TSV file 77 | --parallel Run in parallel where possible (default false) 78 | --cross-encoder Use cross-encoder similarity scoring (default false) 79 | ``` 80 | 81 | Ideally, we want to generate metrics from a running RAG pipeline, but in order to simplify the development process, we have isolated the evaluation functionality, feeding it the input it needs via a JSON-L file. Each line of the JSON-L file represents a single RAG transaction.
The required fields are as follows. 82 | 83 | ``` 84 | { 85 | "id": {qid: int}, 86 | "query": {query: str}, 87 | "context": [ 88 | { 89 | "id": {chunk_id: str}, 90 | "chunk_text": {chunk_text: str}, 91 | ... other relevant fields 92 | }, 93 | ... more context elements 94 | ], 95 | "ideal_answer": {ground_truth: str}, 96 | "predicted_answer": {answer: str} 97 | } 98 | ``` 99 | 100 | We have used the [AmnestyQA](https://huggingface.co/datasets/explodinggradients/amnesty_qa) dataset on HuggingFace as our reference dataset. You can find a copy of that data in the format described above. 101 | 102 | ## Running Learned RAGAS metrics 103 | 104 | We have used DSPy to optimize our prompts for the AmnestyQA dataset. At a very high level, this involves using random subsets of the training data (in our case outputs from our prompted RAG metrics) and finding the best subset of examples that produces the most optimized prompt. 105 | 106 | The DSPy implementations look for the optimized configuration in `resources/configs`. If it doesn't find it, then it looks for the dataset to allow it to optimize itself first. We have provided configurations for RAGAS metrics prompts optimized for AmnestyQA, but you probably need to generate optimized versions for your own dataset. To do so, you need to run the `generate_datasets.py` script, which will extract data from running the RAGAS prompts against the LLM and write it out as a JSON file into `data/dspy-datasets` (that's where the DSPy fine-tuning code expects to find it).
The command to generate a dataset to fine-tune a DSPy prompt for a particular metric is shown below: 107 | 108 | ``` 109 | $ python3 generate_datasets.py --help 110 | usage: generate_datasets.py [-h] --metric 111 | {answer_correctness,answer_relevance,answer_similarity,context_precision,context_recall,context_relevance,context_utilization,faithfulness} 112 | --input INPUT --output OUTPUT [--parallel] [--debug] 113 | 114 | options: 115 | -h, --help show this help message and exit 116 | --metric {answer_correctness,answer_relevance,answer_similarity,context_precision,context_recall,context_relevance,context_utilization,faithfulness} 117 | The metric to generate datasets for 118 | --input INPUT Full path to input JSONL file 119 | --output OUTPUT Full path to output directory 120 | --parallel Run in parallel where possible (default false) 121 | --debug Turn debugging on (default: false) 122 | ``` 123 | 124 | To re-run the optimization locally, remove the configuration file for the metric from the `resources/configs` directory. The next time you run `run_learned_metrics.py` it will re-optimize (this is a fairly lengthy process but doesn't require a GPU). Leave the config file alone to re-use the prompt optimized for AmnestyQA.
125 | 126 | ``` 127 | $ python3 run_learned_metrics.py --help 128 | usage: run_learned_metrics.py [-h] --metric 129 | {answer_correctness,answer_relevance,answer_similarity,context_precision,context_recall,context_relevance,context_utilization,faithfulness} 130 | --input INPUT [--output OUTPUT] [--cross-encoder] [--model-temp MODEL_TEMP] 131 | 132 | options: 133 | -h, --help show this help message and exit 134 | --metric {answer_correctness,answer_relevance,answer_similarity,context_precision,context_recall,context_relevance,context_utilization,faithfulness} 135 | The metric to compute 136 | --input INPUT Full path to evaluation data in JSONL format 137 | --output OUTPUT Full path to output TSV file 138 | --cross-encoder Use cross-encoder similarity scoring (default true) 139 | --model-temp MODEL_TEMP 140 | The temperature of the model - between 0.0 and 1.0 (default 0.0) 141 | ``` 142 | 143 | ## Comparing Prompted vs Learned Metrics 144 | 145 | After computing the different metrics using the "prompted" approach using LCEL and the "optimized from data" approach using DSPy, we plot the scores as shown below. 146 | 147 | 148 | 149 | Visually, at least for some of the metrics, the DSPy approach seems to produce scores whose histograms are narrower around 0 and 1. In order to quantify this intuition, we measure deviations of the scores from 0.5 on the upper and lower side, then compute the standard deviation of the deviations for both LCEL and DSPy scores for each metric. The results are summarized below. 
150 | 151 | | Metric | LCEL | DSPy | 152 | |--------|------|------| 153 | | Faithfulness | 0.162 | **0.000** | 154 | | Context Relevance | 0.191 | **0.184** | 155 | | Answer Correctness | 0.194 | **0.193** | 156 | | Context Recall | 0.191 | **0.183** | 157 | | Context Utilization | **0.186** | 0.189 | 158 | | Context Precision | 0.191 | **0.178** | 159 | | Answer Relevance | **0.049** | 0.064 | 160 | 161 | As can be seen, for most of the metrics, optimized prompts from DSPy produce more confident scores. In many cases, the difference is quite small, which may be attributed to the relatively few examples we are working with. 162 | -------------------------------------------------------------------------------- /figs/ragas_metrics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sujitpal/llm-rag-eval/8936ae484d7ab78c9debfcdabc3a2f14eea38ac2/figs/ragas_metrics.png -------------------------------------------------------------------------------- /figs/score_distribs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sujitpal/llm-rag-eval/8936ae484d7ab78c9debfcdabc3a2f14eea38ac2/figs/score_distribs.png -------------------------------------------------------------------------------- /rag-data/amnesty_qa-new-01.tsv: -------------------------------------------------------------------------------- 1 | #QID FAITHFULNESS 2 | 20 1.000 3 | 21 1.000 4 | 22 1.000 5 | 23 1.000 6 | 24 1.000 7 | 25 1.000 8 | 26 1.000 9 | 27 1.000 10 | 28 1.000 11 | 29 1.000 12 | 30 1.000 13 | 31 1.000 14 | 32 1.000 15 | 33 1.000 16 | 34 1.000 17 | 35 1.000 18 | 36 1.000 19 | 37 1.000 20 | 38 1.000 21 | 39 1.000 22 | 40 0.500 23 | 41 0.500 24 | 42 1.000 25 | 43 0.500 26 | 44 1.000 27 | 45 1.000 28 | 46 0.500 29 | 47 1.000 30 | 48 1.000 31 | 49 1.000 32 | 50 1.000 33 | 51 1.000 34 | 52 1.000 35 | 53 1.000 36 | 54 1.000 37 | 55 0.400 38 | 56 1.000 39 | 
-------------------------------------------------------------------------------- /rag-data/amnesty_qa-new-02.jsonl: -------------------------------------------------------------------------------- 1 | {"query": "What is the Roe effect?", "predicted_answer": "The Roe effect is a hypothesis that suggests that the practice of abortion will eventually lead to the restriction or illegalization of abortion due to the erosion of the political base of abortion rights supporters.", "context": [{"id": "1", "chunk_text": "Roe effect | The Roe effect is a hypothesis about the long-term effect of abortion on the political balance of the United States, which suggests that since supporters of abortion rights cause the erosion of their own political base, the practice of abortion will eventually lead to the restriction or illegalization of abortion. It is named after \"Roe v. Wade\", the U.S. Supreme Court case that effectively legalized abortion nationwide in the U.S. Its best-known proponent is James Taranto of the \"Wall Street Journal\" who coined the phrase \"Roe effect\" in \"Best of the Web Today\", his OpinionJournal.com column."}], "id": 20} 2 | {"query": "What is the purpose of the Human Life Amendment?", "predicted_answer": "The Human Life Amendment is a proposed amendment to the United States Constitution that would overturn the Supreme Court's 1973 \"Roe v. Wade\" decision, which legalized abortion nationwide. The amendment would prohibit both Congress and the states from legalizing abortion.", "context": [{"id": "1", "chunk_text": "Human Life Amendment | The Human Life Amendment is the name of multiple proposals to amend the United States Constitution that would have the effect of overturning the Supreme Court 1973 decision \"Roe v. Wade\", which ruled that prohibitions against abortion were unconstitutional. All of these amendment proposals seek to overturn \"Roe v. 
Wade\", but most of them go further by forbidding both Congress and the states from legalizing abortion. Some of the proposals define human life as beginning with conception or fertilization."}], "id": 21} 3 | {"query": "What is the purpose of greenhouse gas accounting?", "predicted_answer": "Greenhouse gas accounting is a way to inventory and audit greenhouse gas (GHG) emissions. It quantifies the total greenhouse gases produced directly and indirectly from a business or organization's activities, providing information for understanding and managing climate change impacts.", "context": [{"id": "1", "chunk_text": "Greenhouse gas accounting | Greenhouse gas accounting describes the way to inventory and audit greenhouse gas (GHG) emissions. A corporate or organisational greenhouse gas (GHG) emissions assessment quantifies the total greenhouse gases produced directly and indirectly from a business or organisation\u2019s activities. Also known as a carbon footprint, it is a business tool that provides information with a basis for understanding and managing climate change impacts."}], "id": 22} 4 | {"query": "What was the outcome of the American Electric Power Co. v. Connecticut case?", "predicted_answer": "In American Electric Power Company v. Connecticut, the U.S. Supreme Court ruled that corporations cannot be sued for greenhouse gas emissions under federal common law because the Clean Air Act delegates the management of carbon dioxide and other GHG emissions to the Environmental Protection Agency (EPA).", "context": [{"id": "1", "chunk_text": "American Electric Power Co. v. Connecticut | American Electric Power Company v. Connecticut, 564 U.S. 
410 (2011) , was a United States Supreme Court case in which the Court, in an 8\u20130 decision, held that corporations cannot be sued for greenhouse gas emissions (GHGs) under federal common law, primarily because the Clean Air Act (CAA) delegates the management of carbon dioxide and other GHG emissions to the Environmental Protection Agency (EPA). Brought to court in July 2004 in the Southern District of New York, this was the first global warming case based on a public nuisance claim."}], "id": 23} 5 | {"query": "What is the primary activity of the Carbon Disclosure Project (CDP)?", "predicted_answer": "The primary activity of the Carbon Disclosure Project (CDP) is to disclose the greenhouse gas (GHG) emissions of major corporations.", "context": [{"id": "1", "chunk_text": "Carbon Disclosure Project | The CDP (formerly the \"Carbon Disclosure Project\") is an organisation based in the United Kingdom which works with shareholders and corporations to disclose the greenhouse gas (GHG) emissions of major corporations. As disclosure of emission related data as CDP's primary activity, the quality of the data reported to CDP is key."}], "id": 24} 6 | {"query": "Where is the Weyburn-Midale Carbon Dioxide Project located?", "predicted_answer": "The Weyburn-Midale Carbon Dioxide Project is located in Midale, Saskatchewan, Canada.", "context": [{"id": "1", "chunk_text": "Weyburn-Midale Carbon Dioxide Project | The Weyburn-Midale Carbon Dioxide Project (or IEA GHG Weyburn-Midale CO Monitoring and Storage Project) is, as of 2008, the world's largest carbon capture and storage project. 
It is located in Midale, Saskatchewan, Canada."}], "id": 25} 7 | {"query": "What is Owens Wiwa's area of expertise?", "predicted_answer": "Owens Wiwa is an internationally renowned expert on the effects of globalization, especially as it relates to the business practices of Royal Dutch Shell in the Niger Delta.", "context": [{"id": "1", "chunk_text": "Owens Wiwa | Monday Owens Wiwa (born 10 October 1957 in Bori, Nigeria) is a medical doctor and human rights activist. He is the brother of executed Ogoni leader Ken Saro-Wiwa, and the son of Ogoni chief Jim Wiwa. Wiwa is an internationally renowned expert on the effects of globalisation, especially as it relates to the highly controversial business practices of Royal Dutch Shell in the Niger Delta. Vice-chairman of the Toronto chapter of the Sierra Club Canada and an active member of Amnesty International, Wiwa is frequently called upon to advocate for development programs in Canada and abroad and to campaign for increased corporate responsibility. This work has taken him to Ireland, which he visits in support of the Shell to Sea campaign. Currently, he is the Country Director for the Clinton Health Access Initiative in Nigeria."}], "id": 26} 8 | {"query": "What did Amnesty International do in response to the Zakoora and Tengpora Massacre?", "predicted_answer": "Amnesty International issued an appeal for urgent action on Kashmir.", "context": [{"id": "1", "chunk_text": "Zakoora and Tengpora massacre | The Zukoora And Tengpora Massacre was the killing of protesters calling for the implementation of a United Nations resolution regarding a plebiscite in Kashmir at Zakoora Crossing and Tengpora Bypass Road in Srinagar on 1 March 1990, in which 33 people were killed and 47 injured. 
It led Amnesty International to issue an appeal for urgent action on Kashmir."}], "id": 27} 9 | -------------------------------------------------------------------------------- /rag-data/more-q-01.jsonl: -------------------------------------------------------------------------------- 1 | {"query": "What is the Roe effect?", "predicted_answer": "The Roe effect is a hypothesis that suggests that the practice of abortion will eventually lead to the restriction or illegalization of abortion due to the erosion of the political base of abortion rights supporters.", "ideal_answer": "The Roe effect is a hypothesis that suggests that the practice of abortion will eventually lead to the restriction or illegalization of abortion due to the erosion of the political base of abortion rights supporters.", "context": [{"id": "0", "chunk_text": "Roe effect | The Roe effect is a hypothesis about the long-term effect of abortion on the political balance of the United States, which suggests that since supporters of abortion rights cause the erosion of their own political base, the practice of abortion will eventually lead to the restriction or illegalization of abortion."}], "id": 0} 2 | {"query": "What is the purpose of the Human Life Amendment?", "predicted_answer": "The Human Life Amendment is a proposal to amend the United States Constitution to overturn the Supreme Court's Roe v. Wade decision, which legalized abortion nationwide, and to forbid both Congress and the states from legalizing abortion.", "ideal_answer": "The Human Life Amendment is a proposal to amend the United States Constitution to overturn the Supreme Court's Roe v. 
Wade decision, which legalized abortion nationwide, and to forbid both Congress and the states from legalizing abortion.", "context": [{"id": "0", "chunk_text": "Human Life Amendment | The Human Life Amendment is the name of multiple proposals to amend the United States Constitution that would have the effect of overturning the Supreme Court 1973 decision \"Roe v. Wade\", which ruled that prohibitions against abortion were unconstitutional. All of these amendment proposals seek to overturn \"Roe v. Wade\", but most of them go further by forbidding both Congress and the states from legalizing abortion."}], "id": 1} 3 | {"query": "What was the ruling in United States v. Vuitch?", "predicted_answer": "In United States v. Vuitch, the Supreme Court held that the District of Columbia's abortion law banning the practice except when necessary for the health or life of the woman was not unconstitutionally vague.", "ideal_answer": "In United States v. Vuitch, the Supreme Court held that the District of Columbia's abortion law banning the practice except when necessary for the health or life of the woman was not unconstitutionally vague.", "context": [{"id": "0", "chunk_text": "United States v. Vuitch | United States v. Vuitch, 402 U.S. 62 (1971) was a United States Supreme Court abortion rights case, which held that the District of Columbia's abortion law banning the practice except when necessary for the health or life of the woman was not unconstitutionally vague."}], "id": 2} 4 | {"query": "What is the purpose of greenhouse gas accounting?", "predicted_answer": "Greenhouse gas accounting is a way to inventory and audit greenhouse gas (GHG) emissions. It quantifies the total greenhouse gases produced directly and indirectly from a business or organization's activities, providing information for understanding and managing climate change impacts.", "ideal_answer": "Greenhouse gas accounting is a way to inventory and audit greenhouse gas (GHG) emissions. 
It quantifies the total greenhouse gases produced directly and indirectly from a business or organization's activities, providing information for understanding and managing climate change impacts.", "context": [{"id": "0", "chunk_text": "Greenhouse gas accounting | Greenhouse gas accounting describes the way to inventory and audit greenhouse gas (GHG) emissions. A corporate or organisational greenhouse gas (GHG) emissions assessment quantifies the total greenhouse gases produced directly and indirectly from a business or organisation\u2019s activities. Also known as a carbon footprint, it is a business tool that provides information with a basis for understanding and managing climate change impacts."}], "id": 3} 5 | {"query": "What was the outcome of the American Electric Power Co. v. Connecticut case?", "predicted_answer": "In American Electric Power Company v. Connecticut, the U.S. Supreme Court ruled that corporations cannot be sued for greenhouse gas emissions under federal common law because the Clean Air Act delegates the management of carbon dioxide and other GHG emissions to the Environmental Protection Agency (EPA).", "ideal_answer": "In American Electric Power Company v. Connecticut, the U.S. Supreme Court ruled that corporations cannot be sued for greenhouse gas emissions under federal common law because the Clean Air Act delegates the management of carbon dioxide and other GHG emissions to the Environmental Protection Agency (EPA).", "context": [{"id": "0", "chunk_text": "American Electric Power Co. v. Connecticut | American Electric Power Company v. Connecticut, 564 U.S. 410 (2011) , was a United States Supreme Court case in which the Court, in an 8\u20130 decision, held that corporations cannot be sued for greenhouse gas emissions (GHGs) under federal common law, primarily because the Clean Air Act (CAA) delegates the management of carbon dioxide and other GHG emissions to the Environmental Protection Agency (EPA). 
Brought to court in July 2004 in the Southern District of New York, this was the first global warming case based on a public nuisance claim."}], "id": 4} 6 | {"query": "What is the primary activity of the Carbon Disclosure Project (CDP)?", "predicted_answer": "The Carbon Disclosure Project (CDP) works with shareholders and corporations to disclose the greenhouse gas (GHG) emissions of major corporations.", "ideal_answer": "The Carbon Disclosure Project (CDP) works with shareholders and corporations to disclose the greenhouse gas (GHG) emissions of major corporations.", "context": [{"id": "0", "chunk_text": "Carbon Disclosure Project | The CDP (formerly the \"Carbon Disclosure Project\") is an organisation based in the United Kingdom which works with shareholders and corporations to disclose the greenhouse gas (GHG) emissions of major corporations. As disclosure of emission related data as CDP\\'s primary activity, the quality of the data reported to CDP is key."}], "id": 5} 7 | -------------------------------------------------------------------------------- /rag-data/output-01.tsv: -------------------------------------------------------------------------------- 1 | #QID CONTEXT_RECALL 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py 2 | accelerate 3 | Jinja2 4 | dspy-ai 5 | dspy-ai[chromadb] 6 | evaluate 7 | google-generativeai 8 | langchain 9 | langchain-google-genai 10 | nltk 11 | pillow 12 | polars 13 | rouge-score 14 | scikit-learn 15 | tensorboardX 16 | transformers[torch] 17 | xmltodict 18 | streamlit -------------------------------------------------------------------------------- /resources/configs/answer_correctness-best.json: -------------------------------------------------------------------------------- 1 | { 2 | "fact_extractor": { 3 | "lm": null, 4 | "traces": [], 5 | "train": [], 6 | "demos": [ 7 | { 8 | 
"augmented": true, 9 | "answer": "The consequences of criminalizing abortion for marginalized individuals include increased stigma, lack of information, and disinformation. This can have severe and irreversible effects on these individuals. Girls and young women may be forced to carry pregnancies resulting from sexual violence due to a lack of knowledge about their rights. Marginalized individuals, such as those living in poverty, historically discriminated against, Indigenous and Afro-descendent women, migrants, and refugees, are disproportionately affected by abortion criminalization. The criminalization of abortion is a major factor contributing to the high number of unsafe abortions, which leads to increased maternal mortality and morbidity. Access to health services is undermined, resulting in preventable maternal deaths and complications. Marginalized individuals are forced to resort to unsafe clandestine abortion methods, putting their lives and health at risk. In Nigeria, restrictive abortion laws make it difficult to access safe abortion care.", 10 | "facts": "1. Criminalizing abortion increases stigma for marginalized individuals.\n2. Criminalizing abortion leads to a lack of information and disinformation for marginalized individuals.\n3. Girls and young women may be forced to carry pregnancies resulting from sexual violence due to a lack of knowledge about their rights.\n4. Marginalized individuals are disproportionately affected by abortion criminalization.\n5. Criminalization of abortion contributes to the high number of unsafe abortions.\n6. Unsafe abortions lead to increased maternal mortality and morbidity.\n7. Access to health services is undermined by abortion criminalization.\n8. Marginalized individuals are forced to resort to unsafe clandestine abortion methods.\n9. Restrictive abortion laws in Nigeria make it difficult to access safe abortion care." 
11 | }, 12 | { 13 | "augmented": true, 14 | "answer": "Criminalizing abortion can have severe consequences for marginalized individuals, particularly those who are already facing various forms of discrimination and inequality. Some of the consequences include:\n\n1. Health risks: When abortion is criminalized, individuals may resort to unsafe and illegal methods to terminate their pregnancies, leading to serious health complications and even death. Marginalized individuals, who often have limited access to healthcare and resources, are more likely to be affected by these risks.\n\n2. Economic burden: Marginalized individuals, who are more likely to face financial constraints, may struggle to afford safe and legal abortions in countries where it is criminalized. This can lead to increased financial burdens, as they may have to travel long distances or seek expensive underground services.\n\n3. Limited reproductive choices: Criminalizing abortion restricts individuals' reproductive choices and autonomy. Marginalized individuals, who already face various barriers to accessing healthcare and education, may be further limited in their ability to make informed decisions about their reproductive health.\n\n4. Increased inequality: Criminalizing abortion disproportionately affects marginalized individuals, exacerbating existing inequalities. Women from low-income backgrounds, racial and ethnic minorities, LGBTQ+ individuals, and those living in rural areas often face additional barriers to accessing safe and legal abortion services, further widening the gap between privileged and marginalized groups.\n\n5. Criminalization of pregnancy: In some cases, criminalizing abortion can lead to the criminalization of pregnancy itself. 
This can result in the surveillance and punishment of pregnant individuals, particularly those from marginalized communities, who may be subjected to invasive monitoring, discrimination, and even imprisonment for actions perceived as harmful to the fetus.\n\n6. Mental health impact: The criminalization of abortion can have significant psychological consequences for marginalized individuals. The stigma and shame associated with seeking illegal abortions or being unable to access safe services can lead to increased stress, anxiety, and depression.\n\n7. Reproductive injustice: Criminalizing abortion perpetuates reproductive injustice by denying marginalized individuals the right to control their own bodies and make decisions about their reproductive health. It reinforces systemic inequalities and denies them the same reproductive rights and freedoms enjoyed by more privileged individuals.\n\nIt is important to note that these consequences are not limited to criminalizing abortion alone but are often intertwined with broader social, economic, and healthcare disparities faced by marginalized communities.", 15 | "facts": "1. Criminalizing abortion can lead to unsafe and illegal methods of termination, resulting in health complications and death, especially for marginalized individuals.\n2. Marginalized individuals may face financial burdens due to limited access to safe and legal abortions in criminalized countries.\n3. Criminalizing abortion restricts reproductive choices and autonomy, particularly for marginalized individuals who face barriers to healthcare and education.\n4. Criminalizing abortion exacerbates existing inequalities, disproportionately affecting marginalized groups such as low-income women, racial and ethnic minorities, LGBTQ+ individuals, and rural residents.\n5. 
In some cases, criminalizing abortion can lead to the criminalization of pregnancy itself, resulting in surveillance, punishment, and discrimination against pregnant individuals from marginalized communities.\n6. The criminalization of abortion can have significant mental health impacts, including increased stress, anxiety, and depression, for marginalized individuals.\n7. Criminalizing abortion perpetuates reproductive injustice by denying marginalized individuals control over their bodies and reproductive health decisions.\n8. The consequences of criminalizing abortion are intertwined with broader social, economic, and healthcare disparities faced by marginalized communities.\n9. Criminalizing abortion can lead to increased inequality between privileged and marginalized groups.\n10. Criminalizing abortion can result in the denial of reproductive rights and freedoms for marginalized individuals." 16 | } 17 | ], 18 | "signature_instructions": " Extract 5-10 facts from the answer", 19 | "signature_prefix": "Facts:" 20 | }, 21 | "fact_grouper": { 22 | "lm": null, 23 | "traces": [], 24 | "train": [], 25 | "demos": [ 26 | { 27 | "augmented": true, 28 | "facts_g": "1. Criminalizing abortion increases stigma for marginalized individuals.\n2. Criminalizing abortion leads to a lack of information and disinformation for marginalized individuals.\n3. Girls and young women may be forced to carry pregnancies resulting from sexual violence due to a lack of knowledge about their rights.\n4. Marginalized individuals are disproportionately affected by abortion criminalization.\n5. Criminalization of abortion contributes to the high number of unsafe abortions.\n6. Unsafe abortions lead to increased maternal mortality and morbidity.\n7. Access to health services is undermined by abortion criminalization.\n8. Marginalized individuals are forced to resort to unsafe clandestine abortion methods.\n9. 
Restrictive abortion laws in Nigeria make it difficult to access safe abortion care.", 29 | "facts_a": "1. Criminalizing abortion can lead to unsafe and illegal methods of termination, resulting in health complications and death, especially for marginalized individuals.\n2. Marginalized individuals may face financial burdens due to limited access to safe and legal abortions in criminalized countries.\n3. Criminalizing abortion restricts reproductive choices and autonomy, particularly for marginalized individuals who face barriers to healthcare and education.\n4. Criminalizing abortion exacerbates existing inequalities, disproportionately affecting marginalized groups such as low-income women, racial and ethnic minorities, LGBTQ+ individuals, and rural residents.\n5. In some cases, criminalizing abortion can lead to the criminalization of pregnancy itself, resulting in surveillance, punishment, and discrimination against pregnant individuals from marginalized communities.\n6. The criminalization of abortion can have significant mental health impacts, including increased stress, anxiety, and depression, for marginalized individuals.\n7. Criminalizing abortion perpetuates reproductive injustice by denying marginalized individuals control over their bodies and reproductive health decisions.\n8. The consequences of criminalizing abortion are intertwined with broader social, economic, and healthcare disparities faced by marginalized communities.\n9. Criminalizing abortion can lead to increased inequality between privileged and marginalized groups.\n10. Criminalizing abortion can result in the denial of reproductive rights and freedoms for marginalized individuals.", 30 | "rationale": "produce the fact_groups. \n\n1. We start by comparing the facts from the ground truth (Facts G) with the facts from the answer (Facts A).\n2. For each fact in Facts G, we check if it is also present in Facts A.\n3. 
If the fact is present in both Facts G and Facts A, we classify it as a TP (true positive).\n4. If the fact is present in Facts A but not in Facts G, we classify it as a FP (false positive).\n5. If the fact is present in Facts G but not in Facts A, we classify it as a FN (false negative).\n6. Finally, we group the facts into JSON string with keys TP, FP, FN and list of facts for each as values.", 31 | "fact_groups": "```json\n{\n \"TP\": [\n \"Criminalizing abortion increases stigma for marginalized individuals.\",\n \"Criminalizing abortion leads to a lack of information and disinformation for marginalized individuals.\",\n \"Marginalized individuals are disproportionately affected by abortion criminalization.\",\n \"Criminalization of abortion contributes to the high number of unsafe abortions.\",\n \"Unsafe abortions lead to increased maternal mortality and morbidity.\",\n \"Access to health services is undermined by abortion criminalization.\",\n \"Marginalized individuals are forced to resort to unsafe clandestine abortion methods.\"\n ],\n \"FP\": [\n \"Criminalizing abortion can lead to unsafe and illegal methods of termination, resulting in health complications and death, especially for marginalized individuals.\",\n \"Marginalized individuals may face financial burdens due to limited access to safe and legal abortions in criminalized countries.\",\n \"Criminalizing abortion restricts reproductive choices and autonomy, particularly for marginalized individuals who face barriers to healthcare and education.\",\n \"Criminalizing abortion exacerbates existing inequalities, disproportionately affecting marginalized groups such as low-income women, racial and ethnic minorities, LGBTQ+ individuals, and rural residents.\",\n \"In some cases, criminalizing abortion can lead to the criminalization of pregnancy itself, resulting in surveillance, punishment, and discrimination against pregnant individuals from marginalized communities.\",\n \"The criminalization of 
abortion can have significant mental health impacts, including increased stress, anxiety, and depression, for marginalized individuals.\",\n \"Criminalizing abortion perpetuates reproductive injustice by denying marginalized individuals control over their bodies and reproductive health decisions.\",\n \"The consequences of criminalizing abortion are intertwined with broader social, economic, and healthcare disparities faced by marginalized communities.\",\n \"Criminalizing abortion can lead to increased inequality between privileged and marginalized groups.\",\n \"Criminalizing abortion can result in the denial of reproductive rights and freedoms for marginalized individuals.\"\n ],\n \"FN\": [\n \"Girls and young women may be forced to carry pregnancies resulting from sexual violence due to a lack of knowledge about their rights.\",\n \"Restrictive abortion laws in Nigeria make it difficult to access safe abortion care.\"\n ]\n}\n```" 32 | }, 33 | { 34 | "augmented": true, 35 | "facts_g": "1. The prosecution considers statements contrary to the official position as 'false' under Article 207.3.\n2. Statements must be in opposition to the official position of the Russian authorities to be considered 'false'.", 36 | "facts_a": "1. Article 207.3 of the Russian Criminal Code defines 'false' statements as those made intentionally to discredit the state or its authorities.\n2. The prosecution considers statements 'false' if they are made with the aim of spreading false information that could harm the public interest or cause significant damage.\n3. The prosecution will consider statements as false if they are deliberately made to mislead the public or undermine the credibility of the state.\n4. The prosecution will consider statements as false if they are made contrary to the official position.\n5. The prosecution will consider statements as false if they are made with the aim of discrediting the state or its authorities.", 37 | "rationale": "Facts G:\n1. 
The prosecution considers statements contrary to the official position as 'false' under Article 207.3.\n2. Statements must be in opposition to the official position of the Russian authorities to be considered 'false'.\n\nFacts A:\n1. Article 207.3 of the Russian Criminal Code defines 'false' statements as those made intentionally to discredit the state or its authorities.\n2. The prosecution considers statements 'false' if they are made with the aim of spreading false information that could harm the public interest or cause significant damage.\n3. The prosecution will consider statements as false if they are deliberately made to mislead the public or undermine the credibility of the state.\n4. The prosecution will consider statements as false if they are made contrary to the official position.\n5. The prosecution will consider statements as false if they are made with the aim of discrediting the state or its authorities.\n\nReasoning: Let's think step by step in order to produce the fact_groups.\n- TP: Fact 1 in G and Fact 4 in A are both about the prosecution considering statements contrary to the official position as 'false'.\n- FP: Fact 2 in A is about the prosecution considering statements 'false' if they are made with the aim of spreading false information that could harm the public interest or cause significant damage, which is not mentioned in G. Fact 3 in A is about the prosecution considering statements as false if they are deliberately made to mislead the public or undermine the credibility of the state, which is also not mentioned in G. 
Fact 5 in A is about the prosecution considering statements as false if they are made with the aim of discrediting the state or its authorities, which is already covered by Fact 1 in G.\n- FN: Fact 2 in G is about statements must be in opposition to the official position of the Russian authorities to be considered 'false', which is not mentioned in A.", 38 | "fact_groups": "```json\n{\n \"TP\": [\n \"The prosecution considers statements contrary to the official position as 'false' under Article 207.3.\"\n ],\n \"FP\": [\n \"The prosecution considers statements 'false' if they are made with the aim of spreading false information that could harm the public interest or cause significant damage.\",\n \"The prosecution will consider statements as false if they are deliberately made to mislead the public or undermine the credibility of the state.\",\n \"The prosecution will consider statements as false if they are made with the aim of discrediting the state or its authorities.\"\n ],\n \"FN\": [\n \"Statements must be in opposition to the official position of the Russian authorities to be considered 'false'.\"\n ]\n}\n```" 39 | } 40 | ], 41 | "signature_instructions": " Classify each answer fact with respect to ground truth fact as follows:\n - TP (true positive): fact is present in both answer and ground truth\n - FP (false positive): fact is present in answer but not in ground truth\n - FN (false negative): fact is present in ground truth not in answer.\n Group the input facts as numbered lists under these 3 categories\n ", 42 | "signature_prefix": "Fact Groups:", 43 | "extended_signature_instructions": " Classify each answer fact with respect to ground truth fact as follows:\n - TP (true positive): fact is present in both answer and ground truth\n - FP (false positive): fact is present in answer but not in ground truth\n - FN (false negative): fact is present in ground truth not in answer.\n Group the input facts as numbered lists under these 3 categories\n ", 44 | 
"extended_signature_prefix": "Fact Groups:" 45 | } 46 | } -------------------------------------------------------------------------------- /resources/configs/answer_relevance-best.json: -------------------------------------------------------------------------------- 1 | { 2 | "question_generator": { 3 | "lm": null, 4 | "traces": [], 5 | "train": [], 6 | "demos": [ 7 | { 8 | "augmented": true, 9 | "answer": "Everyone has the right to be fully informed about human rights violations, including the identities of the perpetrators and the fate of the victims. This right is recognized by various international human rights instruments, such as the Universal Declaration of Human Rights and the International Covenant on Civil and Political Rights.\n\nThe victims of human rights violations and their families have a particular right to know the truth about what happened to them. This includes knowing the identities of the perpetrators and the fate of the victims, such as whether they are alive or dead, and if dead, the circumstances of their death.\n\nFurthermore, society as a whole has a right to be informed about human rights violations. This is essential for holding perpetrators accountable, preventing future abuses, and ensuring justice and reparations for the victims. Access to information about human rights violations also allows for public scrutiny and debate, which are crucial for building a culture of human rights and promoting transparency and accountability.\n\nJournalists, human rights defenders, and civil society organizations play a vital role in investigating and reporting on human rights violations. They have the right to access information, conduct independent investigations, and disseminate their findings to the public. 
Governments have a responsibility to protect and facilitate these activities, ensuring that individuals and organizations can operate freely and without fear of reprisals.\n\nIn summary, the right to be fully informed about human rights violations, including the identities of the perpetrators and the fate of the victims, belongs to everyone. It is a fundamental aspect of promoting human rights, accountability, and justice.", 10 | "context": "1. The victims of gross human rights violations and their families, as well as members of society generally, have the right to know the whole truth about past human rights violations.\n2. The right to truth is engaged by the violation of the rights to access to justice, remedy, and information.\n3. Victims and their families possess an inalienable and non-derogable right to know the truth regarding the circumstances and reasons behind human rights violations, including the identities of the perpetrators and the fate of the victims.\n4. The right to know the fate and whereabouts of 'disappeared' relatives has been confirmed in the jurisprudence of international and regional human rights bodies, as well as of national courts.", 11 | "gen_questions": "1. What are the key provisions of international human rights instruments regarding the right to information about human rights violations?\n2. How does the right to truth contribute to holding perpetrators accountable and preventing future abuses?\n3. What is the role of journalists and civil society organizations in promoting the right to information about human rights violations?\n4. How does the right to know the fate and whereabouts of disappeared relatives relate to the right to truth?\n5. What are the responsibilities of governments in ensuring access to information about human rights violations?" 12 | }, 13 | { 14 | "augmented": true, 15 | "answer": "The purpose of the agreement known as 30x30 is to conserve and protect 30% of the planet's land and oceans by the year 2030. 
This agreement aims to address the global biodiversity crisis and the impacts of climate change by preserving and restoring ecosystems, promoting sustainable land and ocean management practices, and safeguarding the habitats of numerous plant and animal species. The goal is to ensure the long-term health and resilience of the planet's natural resources and to support the well-being of both human and non-human communities.", 16 | "context": "1. These include the agreement known as 30x30, adopted at the UN Conference on Biological Diversity (COP15) held in Montreal, Canada, in late 2022, which commits Parties to designate 30% of the world's land and sea masses as protected areas for biodiversity conservation by 2030.\n2. Protecting and restoring these ecosystems is one of the most efficient, equitable and cost-effective mechanisms to capture and store carbon without compromising human rights.\n3. However, the focus of the agreement should not be limited to the amount of area it covers, but rather to how it is managed, who selects the areas, and what biodiversity exists in them.\n4. Indigenous Peoples and local communities have a key role to play here.\n5. Unfortunately, at COP15, States failed to explicitly recognize Indigenous Peoples' lands and territories as a distinct category of protected area, and this threatens their rights.", 17 | "gen_questions": "1. What is the goal of the 30x30 agreement?\n2. What percentage of the planet's land and oceans does the 30x30 agreement aim to protect?\n3. What are the benefits of conserving and protecting 30% of the planet's land and oceans?\n4. What is the role of Indigenous Peoples and local communities in the 30x30 agreement?\n5. What are some of the challenges associated with implementing the 30x30 agreement?" 18 | } 19 | ], 20 | "signature_instructions": " Given the answer and context, generate 3 to 5 questions that can be \n answered by the answer. 
", 21 | "signature_prefix": "Gen Questions:" 22 | }, 23 | "answer_classifier": { 24 | "lm": null, 25 | "traces": [], 26 | "train": [], 27 | "demos": [ 28 | { 29 | "augmented": true, 30 | "question": "What are the key provisions of international human rights instruments regarding the right to information about human rights violations?", 31 | "context": "1. The victims of gross human rights violations and their families, as well as members of society generally, have the right to know the whole truth about past human rights violations.\n2. The right to truth is engaged by the violation of the rights to access to justice, remedy, and information.\n3. Victims and their families possess an inalienable and non-derogable right to know the truth regarding the circumstances and reasons behind human rights violations, including the identities of the perpetrators and the fate of the victims.\n4. The right to know the fate and whereabouts of 'disappeared' relatives has been confirmed in the jurisprudence of international and regional human rights bodies, as well as of national courts.", 32 | "rationale": "produce the noncommital. We first need to identify the key provisions of international human rights instruments regarding the right to information about human rights violations. These provisions can be found in various international human rights instruments, including the Universal Declaration of Human Rights, the International Covenant on Civil and Political Rights, and the Convention against Torture and Other Cruel, Inhuman or Degrading Treatment or Punishment. 
These instruments set out the right to information about human rights violations, as well as the obligations of states to provide such information.", 33 | "answer": "The key provisions of international human rights instruments regarding the right to information about human rights violations include the right to know the truth about past human rights violations, the right to access to justice, remedy, and information, and the right to know the fate and whereabouts of 'disappeared' relatives.", 34 | "noncommital": "no" 35 | }, 36 | { 37 | "augmented": true, 38 | "question": "How does the right to truth contribute to holding perpetrators accountable and preventing future abuses?", 39 | "context": "1. The victims of gross human rights violations and their families, as well as members of society generally, have the right to know the whole truth about past human rights violations.\n2. The right to truth is engaged by the violation of the rights to access to justice, remedy, and information.\n3. Victims and their families possess an inalienable and non-derogable right to know the truth regarding the circumstances and reasons behind human rights violations, including the identities of the perpetrators and the fate of the victims.\n4. The right to know the fate and whereabouts of 'disappeared' relatives has been confirmed in the jurisprudence of international and regional human rights bodies, as well as of national courts.", 40 | "rationale": "produce the noncommital. We first need to understand the concept of the right to truth. The right to truth is the right of victims of human rights violations and their families to know the whole truth about past human rights violations. This right is engaged by the violation of the rights to access to justice, remedy, and information. 
Victims and their families possess an inalienable and non-derogable right to know the truth regarding the circumstances and reasons behind human rights violations, including the identities of the perpetrators and the fate of the victims. The right to know the fate and whereabouts of 'disappeared' relatives has been confirmed in the jurisprudence of international and regional human rights bodies, as well as of national courts.", 41 | "answer": "The right to truth contributes to holding perpetrators accountable and preventing future abuses by providing victims and their families with the information they need to seek justice and to prevent similar violations from happening again.", 42 | "noncommital": "no" 43 | } 44 | ], 45 | "signature_instructions": " Given a question and its context, use only the context to generate\n an answer, and classify if the answer is commital or noncommital.\n ", 46 | "signature_prefix": "Noncommital:", 47 | "extended_signature_instructions": " Given a question and its context, use only the context to generate\n an answer, and classify if the answer is commital or noncommital.\n ", 48 | "extended_signature_prefix": "Noncommital:" 49 | } 50 | } -------------------------------------------------------------------------------- /resources/configs/context_precision-best.json: -------------------------------------------------------------------------------- 1 | { 2 | "usefulness_classifier": { 3 | "lm": null, 4 | "traces": [], 5 | "train": [], 6 | "demos": [ 7 | { 8 | "augmented": true, 9 | "question": "Who has the right to be fully informed about human rights violations, including the identities of the perpetrators and the fate of the victims?", 10 | "answer": "Everyone has the right to be fully informed about human rights violations, including the identities of the perpetrators and the fate of the victims. 
This right is recognized by various international human rights instruments, such as the Universal Declaration of Human Rights and the International Covenant on Civil and Political Rights.\n\nThe victims of human rights violations and their families have a particular right to know the truth about what happened to them. This includes knowing the identities of the perpetrators and the fate of the victims, such as whether they are alive or dead, and if dead, the circumstances of their death.\n\nFurthermore, society as a whole has a right to be informed about human rights violations. This is essential for holding perpetrators accountable, preventing future abuses, and ensuring justice and reparations for the victims. Access to information about human rights violations also allows for public scrutiny and debate, which are crucial for building a culture of human rights and promoting transparency and accountability.\n\nJournalists, human rights defenders, and civil society organizations play a vital role in investigating and reporting on human rights violations. They have the right to access information, conduct independent investigations, and disseminate their findings to the public. Governments have a responsibility to protect and facilitate these activities, ensuring that individuals and organizations can operate freely and without fear of reprisals.\n\nIn summary, the right to be fully informed about human rights violations, including the identities of the perpetrators and the fate of the victims, belongs to everyone. It is a fundamental aspect of promoting human rights, accountability, and justice.", 11 | "context": "The victims of gross human rights violations and their families, as well as members of society generally, have the right to know the whole truth about past human rights violations.", 12 | "rationale": "produce the score.\n1. 
The question asks who has the right to be fully informed about human rights violations, including the identities of the perpetrators and the fate of the victims.\n2. The context states that the victims of gross human rights violations and their families, as well as members of society generally, have the right to know the whole truth about past human rights violations.\n3. Therefore, the context is useful for answering the question.", 13 | "score": "yes" 14 | }, 15 | { 16 | "augmented": true, 17 | "question": "Who has the right to be fully informed about human rights violations, including the identities of the perpetrators and the fate of the victims?", 18 | "answer": "Everyone has the right to be fully informed about human rights violations, including the identities of the perpetrators and the fate of the victims. This right is recognized by various international human rights instruments, such as the Universal Declaration of Human Rights and the International Covenant on Civil and Political Rights.\n\nThe victims of human rights violations and their families have a particular right to know the truth about what happened to them. This includes knowing the identities of the perpetrators and the fate of the victims, such as whether they are alive or dead, and if dead, the circumstances of their death.\n\nFurthermore, society as a whole has a right to be informed about human rights violations. This is essential for holding perpetrators accountable, preventing future abuses, and ensuring justice and reparations for the victims. Access to information about human rights violations also allows for public scrutiny and debate, which are crucial for building a culture of human rights and promoting transparency and accountability.\n\nJournalists, human rights defenders, and civil society organizations play a vital role in investigating and reporting on human rights violations. 
They have the right to access information, conduct independent investigations, and disseminate their findings to the public. Governments have a responsibility to protect and facilitate these activities, ensuring that individuals and organizations can operate freely and without fear of reprisals.\n\nIn summary, the right to be fully informed about human rights violations, including the identities of the perpetrators and the fate of the victims, belongs to everyone. It is a fundamental aspect of promoting human rights, accountability, and justice.", 19 | "context": "The right to truth is engaged by the violation of the rights to access to justice, remedy, and information.", 20 | "rationale": "produce the score. The question asks who has the right to be fully informed about human rights violations, including the identities of the perpetrators and the fate of the victims. The answer states that everyone has this right, and it is recognized by various international human rights instruments. The context provided does not directly address the question or provide any additional information that is relevant to answering it. 
Therefore, the score is 0.0.", 21 | "score": "0.0" 22 | } 23 | ], 24 | "signature_instructions": " Given a question, an answer to the question, and supporting context,\n provide a yes\/no score indicating if the context was useful for\n answering the question.", 25 | "signature_prefix": "Score:", 26 | "extended_signature_instructions": " Given a question, an answer to the question, and supporting context,\n provide a yes\/no score indicating if the context was useful for\n answering the question.", 27 | "extended_signature_prefix": "Score:" 28 | } 29 | } -------------------------------------------------------------------------------- /resources/configs/context_recall-best.json: -------------------------------------------------------------------------------- 1 | { 2 | "attrib_clf": { 3 | "lm": null, 4 | "traces": [], 5 | "train": [], 6 | "demos": [ 7 | { 8 | "augmented": true, 9 | "answer": [ 10 | "The purpose of the agreement known as 30x30 is to conserve and protect 30% of the planet's land and oceans by the year 2030.", 11 | "This agreement aims to address the global biodiversity crisis and the impacts of climate change by preserving and restoring ecosystems, promoting sustainable land and ocean management practices, and safeguarding the habitats of numerous plant and animal species.", 12 | "The goal is to ensure the long-term health and resilience of the planet's natural resources and to support the well-being of both human and non-human communities." 13 | ], 14 | "context_item": "- These include the agreement known as 30x30, adopted at the UN Conference on Biological Diversity (COP15) held in Montreal, Canada, in late 2022, which commits Parties to designate 30% of the world's land and sea masses as protected areas for biodiversity conservation by 2030.", 15 | "rationale": "produce the scores. We start with the context item, which provides information about the 30x30 agreement. 
The first sentence of the answer provides a concise summary of the agreement's purpose, which is consistent with the information in the context item. The second sentence elaborates on the agreement's goals, which are also aligned with the information in the context item. The third sentence emphasizes the importance of the agreement for the long-term health of the planet and the well-being of human and non-human communities, which is not explicitly mentioned in the context item but is a reasonable inference based on the information provided.", 16 | "scores": "yes, yes, yes" 17 | }, 18 | { 19 | "answer": "Amnesty International urged its supporters to write letters to the Nigerian government, calling for an independent investigation into the killings of the Ogoni 9 and for those responsible to be brought to justice. They also called for an end to the human rights abuses in the Niger Delta region and for the protection of the rights of the Ogoni people.", 20 | "context": [ 21 | "Amnesty International called on its vast network of supporters to deluge Nigerian authorities first with appeals for the defenders\u2019 freedom, and later with letters of outrage." 22 | ], 23 | "score": "1.0" 24 | } 25 | ], 26 | "signature_instructions": " Given a context item and an answer, for each sentence in the answer,\n classify if the sentence can be attributed to the context item. ", 27 | "signature_prefix": "Scores:", 28 | "extended_signature_instructions": " Given a context item and an answer, for each sentence in the answer,\n classify if the sentence can be attributed to the context item. 
", 29 | "extended_signature_prefix": "Scores:" 30 | } 31 | } -------------------------------------------------------------------------------- /resources/configs/context_relevance-best.json: -------------------------------------------------------------------------------- 1 | { 2 | "nec_classifier": { 3 | "lm": null, 4 | "traces": [], 5 | "train": [], 6 | "demos": [ 7 | { 8 | "augmented": true, 9 | "question": "Which right guarantees access to comprehensive information about past human rights violations, including the identities of the perpetrators and the fate of the victims, as well as the circumstances surrounding the violations?", 10 | "ctx_sent": "The Act raises serious questions about its compatibility with the right to truth.", 11 | "rationale": "produce the score. We need to know which right guarantees access to comprehensive information about past human rights violations, including the identities of the perpetrators and the fate of the victims, as well as the circumstances surrounding the violations. 
The sentence does not provide this information, so it is not absolutely necessary to answer the question.", 12 | "score": "no" 13 | }, 14 | { 15 | "question": "What action did Amnesty International urge its supporters to take in response to the killing of the Ogoni 9?", 16 | "context": "- Amnesty International called on its vast network of supporters to deluge Nigerian authorities first with appeals for the defenders\u2019 freedom, and later with letters of outrage.", 17 | "score": "1.0" 18 | } 19 | ], 20 | "signature_instructions": " Given a question and a sentence from the context, classify\n if sentence is absolutely necessary to answer question\n ", 21 | "signature_prefix": "Score:", 22 | "extended_signature_instructions": " Given a question and a sentence from the context, classify\n if sentence is absolutely necessary to answer question\n ", 23 | "extended_signature_prefix": "Score:" 24 | } 25 | } -------------------------------------------------------------------------------- /resources/configs/faithfulness-best.json: -------------------------------------------------------------------------------- 1 | { 2 | "extractor": { 3 | "lm": null, 4 | "traces": [], 5 | "train": [], 6 | "demos": [ 7 | { 8 | "augmented": true, 9 | "question": "What role do social media companies play in protecting users' rights online, regardless of their language and political views?", 10 | "answer": "Social media companies play a crucial role in protecting users' rights online, regardless of their language and political views. Here are some key aspects of their role:\n\n1. Content moderation: Social media companies have the responsibility to moderate and remove content that violates their community guidelines, including hate speech, harassment, and misinformation. This helps protect users from harmful and abusive content, ensuring a safer online environment.\n\n2. 
Privacy and data protection: Companies must safeguard users' personal information and ensure their privacy rights are respected. This involves implementing robust data protection measures, obtaining user consent for data collection and usage, and providing transparent privacy policies.\n\n3. Algorithmic transparency: Social media platforms use algorithms to curate and personalize content for users. Ensuring transparency in these algorithms is crucial to prevent bias, discrimination, and the manipulation of information. Companies should disclose how algorithms work and take steps to address any unintended consequences.\n\n4. Accessibility and inclusivity: Social media companies should strive to make their platforms accessible to users of all languages and abilities. This includes providing language options, captioning for videos, and features that accommodate users with disabilities, ensuring equal access to information and participation.\n\n5. Fighting disinformation: Companies should actively combat the spread of disinformation and fake news on their platforms. This involves implementing fact-checking mechanisms, partnering with credible sources, and promoting media literacy to help users distinguish between reliable and misleading information.\n\n6. Transparency and accountability: Social media companies should be transparent about their policies, practices, and decision-making processes. They should provide clear avenues for users to report issues, appeal content removals, and seek redress. Regular transparency reports can help build trust and hold companies accountable.\n\n7. Collaboration with external stakeholders: Companies should collaborate with governments, civil society organizations, and experts to develop policies and practices that protect users' rights. 
Engaging in multi-stakeholder dialogues helps ensure diverse perspectives are considered and fosters a collective effort to address online challenges.\n\nOverall, social media companies have a responsibility to create a safe, inclusive, and rights-respecting online environment for all users, regardless of their language and political views.", 11 | "facts": "1. Social media companies moderate content to remove harmful and abusive content, protecting users from hate speech, harassment, and misinformation.\n2. Companies must safeguard users' personal information and ensure their privacy rights are respected, implementing robust data protection measures and providing transparent privacy policies.\n3. Social media platforms should strive to make their platforms accessible to users of all languages and abilities, providing language options, captioning for videos, and features that accommodate users with disabilities." 12 | }, 13 | { 14 | "augmented": true, 15 | "question": "What factors have contributed to the decline of independent civil society organizations in Nicaragua?", 16 | "answer": "There are several factors that have contributed to the decline of independent civil society organizations in Nicaragua:\n\n1. Repressive government policies: The Nicaraguan government, under President Daniel Ortega, has implemented a series of repressive policies that have targeted civil society organizations. These policies include restrictive laws, such as the Foreign Agents Law, which requires organizations receiving foreign funding to register as \"foreign agents\" and submit detailed financial reports. This has created a climate of fear and uncertainty for many organizations, leading to self-censorship and a decline in their activities.\n\n2. Harassment and intimidation: Civil society organizations in Nicaragua have faced harassment and intimidation from government authorities, including surveillance, arbitrary arrests, and physical attacks. 
Human rights defenders and activists have been particularly targeted, with many facing threats to their personal safety and the safety of their families. This has made it difficult for organizations to operate freely and has deterred many individuals from getting involved in civil society work.\n\n3. Lack of funding: The repressive policies and climate of fear have also made it difficult for civil society organizations to secure funding. Many international donors have been reluctant to support organizations in Nicaragua due to concerns about the government's crackdown on civil society. This has led to a decline in resources available for organizations to carry out their work, including advocacy, research, and service provision.\n\n4. Legal restrictions: The Nicaraguan government has implemented a series of legal restrictions that have made it difficult for civil society organizations to operate. These include laws that limit freedom of association, freedom of expression, and the right to peaceful assembly. Organizations that are critical of the government or engage in activities deemed \"political\" have faced increased scrutiny and legal challenges, further limiting their ability to operate independently.\n\n5. Polarization and division: The political situation in Nicaragua has become increasingly polarized, with the government cracking down on dissent and opposition voices. This has created a climate of fear and mistrust, making it difficult for civil society organizations to collaborate and coordinate their efforts. The lack of unity and solidarity among civil society actors has weakened their collective impact and made it easier for the government to suppress their activities.\n\nOverall, the decline of independent civil society organizations in Nicaragua can be attributed to a combination of repressive government policies, harassment and intimidation, lack of funding, legal restrictions, and polarization. 
These factors have created a hostile environment for civil society, leading to a decline in their numbers and activities.", 17 | "facts": "1. The Nicaraguan government has implemented repressive policies, such as the Foreign Agents Law, which has created a climate of fear and uncertainty for civil society organizations.\n2. Civil society organizations in Nicaragua have faced harassment and intimidation from government authorities, including surveillance, arbitrary arrests, and physical attacks.\n3. The lack of funding, due to the repressive policies and climate of fear, has made it difficult for civil society organizations to secure funding and carry out their work.\n4. The Nicaraguan government has implemented legal restrictions that limit freedom of association, freedom of expression, and the right to peaceful assembly, making it difficult for civil society organizations to operate independently.\n5. The political situation in Nicaragua has become increasingly polarized, creating a climate of fear and mistrust, making it difficult for civil society organizations to collaborate and coordinate their efforts." 18 | } 19 | ], 20 | "signature_instructions": " Given a question-answer pair, generate a list of 3-5 facts\n from the answer\n ", 21 | "signature_prefix": "Facts:" 22 | }, 23 | "scorer": { 24 | "lm": null, 25 | "traces": [], 26 | "train": [], 27 | "demos": [ 28 | { 29 | "augmented": true, 30 | "context": "1. Companies, including social media companies, have a responsibility to respect all human rights wherever they operate in the world.\n2. Social media companies involved in facilitating and moderating online content need to invest adequate resources in human oversight of artificial intelligence-driven content moderation systems to ensure all users can equally exercise their rights online, regardless of language and political views.\n3. 
They should uphold their human rights responsibilities by engaging in human rights due diligence, including to address risks and abuses arising from their business model.\n4. They should undertake proactive measures in educating users and raising awareness about security and privacy features on their platforms that will help WHRDs create a safer and less toxic online experience.", 31 | "fact": "Social media companies moderate content to remove harmful and abusive content, protecting users from hate speech, harassment, and misinformation.", 32 | "score": "Context:\n1. The victims of gross human rights violations and their families, as well as members of society generally, have the right to know the whole truth about past human rights violations.\n2. The right to truth is engaged by the violation of the rights to access to justice, remedy, and information.\n3. Victims and their families possess an inalienable and non-derogable right to know the truth regarding the circumstances and reasons behind human rights violations, including the identities of the perpetrators and the fate of the victims.\n4. The right to know the fate and whereabouts of 'disappeared' relatives has been confirmed in the jurisprudence of international and regional human rights bodies, as well as of national courts.\nFact: The right to truth is a fundamental human right that is essential for the victims of human rights violations to heal and move on with their lives.\nScore: yes" 33 | }, 34 | { 35 | "augmented": true, 36 | "context": "1. Companies, including social media companies, have a responsibility to respect all human rights wherever they operate in the world.\n2. Social media companies involved in facilitating and moderating online content need to invest adequate resources in human oversight of artificial intelligence-driven content moderation systems to ensure all users can equally exercise their rights online, regardless of language and political views.\n3. 
They should uphold their human rights responsibilities by engaging in human rights due diligence, including to address risks and abuses arising from their business model.\n4. They should undertake proactive measures in educating users and raising awareness about security and privacy features on their platforms that will help WHRDs create a safer and less toxic online experience.", 37 | "fact": "Companies must safeguard users' personal information and ensure their privacy rights are respected, implementing robust data protection measures and providing transparent privacy policies.", 38 | "score": "Context:\n1. The victims of gross human rights violations and their families, as well as members of society generally, have the right to know the whole truth about past human rights violations.\n2. The right to truth is engaged by the violation of the rights to access to justice, remedy, and information.\n3. Victims and their families possess an inalienable and non-derogable right to know the truth regarding the circumstances and reasons behind human rights violations, including the identities of the perpetrators and the fate of the victims.\n4. 
The right to know the fate and whereabouts of 'disappeared' relatives has been confirmed in the jurisprudence of international and regional human rights bodies, as well as of national courts.\nFact: The right to truth is a fundamental human right that is essential for the promotion of justice, accountability, and reconciliation.\nScore: yes" 39 | } 40 | ], 41 | "signature_instructions": " Classify if fact can be inferred from context ", 42 | "signature_prefix": "Score:" 43 | } 44 | } -------------------------------------------------------------------------------- /resources/prompts/answer_correctness_1.txt: -------------------------------------------------------------------------------- 1 | Given a ground truth and an answer, analyze each statement in the answer and 2 | classify them in one of the following categories: 3 | 4 | - TP (true positive): statements that are present in both the answer and the ground truth, 5 | - FP (false positive): statements present in the answer but not found in the ground truth, 6 | - FN (false negative): relevant statements found in the ground truth but omitted in the answer. 7 | 8 | You must classify each statement in exactly one category. Do not 9 | try to interpret the meaning of the ground truth or the answer, just compare the presence 10 | of the statements in them. 11 | 12 | Here are a few examples: 13 | 14 | answer: ```The sun is powered by nuclear fission, similar to nuclear reactors on Earth, and its primary function is to provide light to the solar system.``` 15 | ground_truth: ```The sun is actually powered by nuclear fusion, not fission. In its core, hydrogen atoms fuse to form helium, releasing a tremendous amount of energy. This energy is what lights up the sun and provides heat and light, essential for life on Earth. 
The sun's light also plays a critical role in Earth's climate system and helps to drive the weather and ocean currents.``` 16 | output: 17 | 18 | 19 | 20 | The sun's primary function is to provide light 21 | 22 | 23 | 24 | 25 | The sun is powered by nuclear fission 26 | similar to nuclear reactors on Earth 27 | 28 | 29 | 30 | 31 | The sun is powered by nuclear fusion, not fission 32 | In its core, hydrogen atoms fuse to form helium, releasing a tremendous amount of energy 33 | This energy provides heat and light, essential for life on Earth 34 | The sun's light plays a critical role in Earth's climate system 35 | The sun helps to drive the weather and ocean currents 36 | 37 | 38 | 39 | 40 | 41 | answer: ```The boiling point of water is 100 degrees Celsius at sea level.``` 42 | ground_truth: ```The boiling point of water is 100 degrees Celsius (212 degrees Fahrenheit) at sea level, but it can change with altitude.``` 43 | output: 44 | 45 | 46 | 47 | The boiling point of water is 100 degrees Celsius at sea level 48 | 49 | 50 | 51 | 52 | 53 | 54 | The boiling point can change with altitude 55 | The boiling point of water is 212 degrees Fahrenheit at sea level 56 | 57 | 58 | 59 | 60 | 61 | answer: ```{answer}``` 62 | ground_truth: ```{ground_truth}``` 63 | output: 64 | -------------------------------------------------------------------------------- /resources/prompts/answer_relevance_1.txt: -------------------------------------------------------------------------------- 1 | Generate {num_questions_to_generate} question(s) for the given answer and context. 2 | 3 | answer: {answer} 4 | context: {context} 5 | 6 | Return the output using the following XML format. 7 | 8 | 9 | 10 | generated question 1 11 | generated question 2 12 | ... 
13 | 14 | 15 | -------------------------------------------------------------------------------- /resources/prompts/answer_relevance_2.txt: -------------------------------------------------------------------------------- 1 | Given a question and its context, use ONLY the context to generate an answer, then 2 | identify if answer is noncommittal. 3 | Give noncommittal as 1 if the answer is noncommittal and 0 if the answer is committal. 4 | A noncommittal answer is one that is evasive, vague, or ambiguous. For example, 5 | "I don't know" or "I'm not sure" are noncommittal answers. 6 | 7 | Here are a few examples: 8 | 9 | question: Where was Albert Einstein born? 10 | context: Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time 11 | output: 12 | 13 | Where was Albert Einstein born? 14 | Albert Einstein was born in Germany. 15 | 0 16 | 17 | 18 | 19 | question: What unique ability does the newly discovered species of frog have? 20 | context: A recent scientific study has discovered a new species of frog in the Amazon rainforest that has the unique ability to change its skin color based on the temperature of its environment. 21 | output: 22 | 23 | What unique ability does the newly discovered species of frog have? 24 | It can change its skin color based on the temperature of its environment. 25 | 0 26 | 27 | 28 | 29 | question: What is the tallest mountain on Earth? 30 | context: The tallest mountain on Earth, measured from sea level, is Mt. Everest, a renowned peak located in the Himalayas. 31 | output: 32 | 33 | What is the tallest mountain on Earth? 34 | Everest 35 | 0 36 | 37 | 38 | 39 | question: What was the groundbreaking feature of the smartphone invented in 2023? 40 | context: In 2023, a groundbreaking invention was announced: a smartphone with a battery life of one month, revolutionizing the way people use mobile technology. 
41 | output: 42 | 43 | What was the groundbreaking feature of the smartphone invented in 2023? 44 | I don't know about the groundbreaking feature of the smartphone invented in 2023 as I am unaware of information beyond 2022. 45 | 1 46 | 47 | 48 | 49 | question: {question} 50 | context: {context} 51 | output: -------------------------------------------------------------------------------- /resources/prompts/context_precision_1.txt: -------------------------------------------------------------------------------- 1 | Given a question, an answer and a context, verify if the context was useful in arriving at 2 | the given answer. Give verdict as "1" if useful and "0" if not. 3 | 4 | Some examples are provided. 5 | 6 | question: What can you tell me about Albert Einstein? 7 | context: Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called "the world's most famous equation". He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius. 
8 | answer: Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. He received the 1921 Nobel Prize in Physics for his services to theoretical physics. He published 4 papers in 1905. Einstein moved to Switzerland in 1895 9 | output: 10 | 11 | The provided context was indeed useful in arriving at the given answer. The context includes key information about Albert Einstein's life and contributions, which are reflected in the answer. 12 | 1 13 | 14 | 15 | 16 | question: who won 2020 icc world cup? 17 | context: The 2022 ICC Men's T20 World Cup, held from October 16 to November 13, 2022, in Australia, was the eighth edition of the tournament. Originally scheduled for 2020, it was postponed due to the COVID-19 pandemic. England emerged victorious, defeating Pakistan by five wickets in the final to clinch their second ICC Men's T20 World Cup title. 18 | answer: England 19 | output: 20 | 21 | the context was useful in clarifying the situation regarding the 2020 ICC World Cup and indicating that England was the winner of the tournament that was intended to be held in 2020 but actually took place in 2022. 22 | 1 23 | 24 | 25 | 26 | question: What is the tallest mountain in the world? 27 | context: The Andes is the longest continental mountain range in the world, located in South America. It stretches across seven countries and features many of the highest peaks in the Western Hemisphere. The range is known for its diverse ecosystems, including the high-altitude Andean Plateau and the Amazon rainforest. 28 | answer: Mount Everest. 29 | output: 30 | 31 | the provided context discusses the Andes mountain range, which, while impressive, does not include Mount Everest or directly relate to the question about the world's tallest mountain. 
32 | 0 33 | 34 | 35 | 36 | question: {question} 37 | context: {context} 38 | answer: {answer} 39 | output: -------------------------------------------------------------------------------- /resources/prompts/context_recall_1.txt: -------------------------------------------------------------------------------- 1 | You are given a context and an answer. The answer is provided as a markdown list of 2 | answer sentences. 3 | 4 | Analyze each answer sentence and classify if the sentence can be attributed to the 5 | context. Give a verdict of "1" if the sentence can be attributed to the context and 6 | "0" if not. Provide reasoning for your verdict. 7 | 8 | Here are some examples: 9 | 10 | context: Albert Einstein (14 March 1879 - 18 April 1955) was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass-energy equivalence formula E = mc2, which arises from relativity theory, has been called 'the world's most famous equation'. He received the 1921 Nobel Prize in Physics 'for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect', a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius. 
11 | answer: ``` 12 | - Albert Einstein, born in 14 March 1879, was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. 13 | - He received the 1921 Nobel Prize in Physics for his services to theoretical physics. 14 | - He published 4 papers in 1905. 15 | - Einstein moved to Switzerland in 1895 16 | ``` 17 | output: 18 | 19 | 20 | Albert Einstein, born on 14 March 1879, was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. 21 | The date of birth of Einstein is mentioned clearly in the context. 22 | 1 23 | 24 | 25 | He received the 1921 Nobel Prize in Physics for his services to theoretical physics. 26 | The exact sentence is present in the given context. 27 | 1 28 | 29 | 30 | He published 4 papers in 1905. 31 | There is no mention about papers he wrote in the given context. 32 | 0 33 | 34 | 35 | Einstein moved to Switzerland in 1895. 36 | There is no supporting evidence for this in the given context. 37 | 0 38 | 39 | 40 | 41 | 42 | context: The 2022 ICC Men's T20 World Cup, held from October 16 to November 13, 2022, in Australia, was the eighth edition of the tournament. Originally scheduled for 2020, it was postponed due to the COVID-19 pandemic. England emerged victorious, defeating Pakistan by five wickets in the final to clinch their second ICC Men's T20 World Cup title. 43 | answer: ``` 44 | - England won the 2022 ICC Men's T20 World Cup. 45 | ``` 46 | output: 47 | 48 | 49 | England won the 2022 ICC Men's T20 World Cup. 50 | From context it is clear that England defeated Pakistan to win the World Cup. 51 | 1 52 | 53 | 54 | 55 | 56 | context: The primary fuel source for the sun is Hydrogen. 57 | answer: ``` 58 | - The Baltimore Sun is the primary newspaper for the Baltimore area. 59 | ``` 60 | output: 61 | 62 | 63 | The Baltimore Sun is the primary newspaper for the Baltimore area. 
64 | The context is unrelated to the sentence. 65 | 0 66 | 67 | 68 | 69 | 70 | context: {context} 71 | answer: ``` 72 | {answer} 73 | ``` 74 | output: -------------------------------------------------------------------------------- /resources/prompts/context_relevance_1.txt: -------------------------------------------------------------------------------- 1 | You are provided a question with a list of context sentences. The context sentences 2 | are provided as a Markdown list. 3 | 4 | For each pair of question and context sentence, identify if the sentence is absolutely 5 | necessary to answer the question. Give a verdict of "1" if the sentence is absolutely 6 | necessary to answer the question or "0" if not. 7 | 8 | Include in the output ONLY sentences with verdict of "1". 9 | 10 | question: {question} 11 | context: ``` 12 | {context} 13 | ``` 14 | 15 | Provide your response in the following XML format: 16 | 17 | 18 | 19 | 20 | text of sentence 21 | reasoning for why sentence is useful for question 22 | 0 or 1 23 | 24 | ... 25 | 26 | 27 | -------------------------------------------------------------------------------- /resources/prompts/faithfulness_1.txt: -------------------------------------------------------------------------------- 1 | Create two or more statements from each sentence in the given answer. 2 | 3 | Here are some examples: 4 | 5 | question: Who was Albert Einstein and what is he best known for? 6 | answer: He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics. 7 | statements: 8 | Albert Einstein, a German-born theoretical physicist, is renowned for being one of the most influential physicists in history. 9 | Albert Einstein was best known for his theory of relativity. 
10 | Einstein's contributions significantly advanced the field of quantum mechanics. 11 | Recognized globally, Einstein's work has profoundly impacted the scientific community. 12 | Einstein's groundbreaking theories continue to shape our understanding of physics today. 13 | 14 | 15 | question: Cadmium Chloride is slightly soluble in this chemical, it is also called what? 16 | answer: alcohol 17 | statements: 18 | Cadmium Chloride is slightly soluble in alcohol. 19 | 20 | 21 | question: Were Hitler and Benito Mussolini of the same nationality? 22 | answer: Sorry, I can't provide answer to that question. 23 | statements: 24 | 25 | 26 | 27 | question: {question} 28 | answer: {answer} 29 | statements: -------------------------------------------------------------------------------- /resources/prompts/faithfulness_2.txt: -------------------------------------------------------------------------------- 1 | Natural Language Inference. Use only "Yes" (1) or "No" (0). 2 | 3 | Here are some examples: 4 | 5 | context: John is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects. 6 | statements: 7 | John is majoring in Biology. 8 | John is taking a course on Artificial Intelligence. 9 | John is a dedicated student. 10 | John has a part-time job. 11 | 12 | answer: 13 | 14 | John is majoring in Biology. 15 | John's major is explicitly mentioned as Computer Science. There is no information suggesting he is majoring in Biology. 16 | 0 17 | 18 | 19 | John is taking a course on Artificial Intelligence. 20 | The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. Therefore, it cannot be deduced that John is taking a course on AI. 
21 | 0 22 | 23 | 24 | John is a dedicated student. 25 | The context states that he spends a significant amount of time studying and completing assignments. Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication. 26 | 1 27 | 28 | 29 | John has a part-time job. 30 | There is no information given in the context about John having a part-time job. 31 | 0 32 | 33 | 34 | 35 | context: Photosynthesis is a process used by plants, algae, and certain bacteria to convert light energy into chemical energy. 36 | statements: 37 | Albert Einstein was a genius. 38 | 39 | answer: 40 | 41 | Albert Einstein was a genius. 42 | The context and statement are unrelated. 43 | 0 44 | 45 | 46 | 47 | context: {context} 48 | statements: {statements_xml} 49 | answer: -------------------------------------------------------------------------------- /resources/prompts/more_questions.txt: -------------------------------------------------------------------------------- 1 | Similar to how the question 2 | question: {question} 3 | 4 | is answered by 5 | answer: {answer} 6 | 7 | generate {multiplier} new (question, answer, context) tuples specifically from the content in the following passages: 8 | 9 | {passages} 10 | 11 | Return these (question, answer, context) tuples in the following XML format: 12 | 13 | 14 | 15 | question 1 text> 16 | answer 1 text 17 | passage 1 text 18 | 19 | 20 | question 2 text> 21 | answer 2 text 22 | passage 2 text 23 | 24 | 25 | question 3 text> 26 | answer 3 text 27 | passage 3 text 28 | 29 | ... 
30 | 31 | -------------------------------------------------------------------------------- /resources/reports/dspy-reports/answer_correctness_report.tsv: -------------------------------------------------------------------------------- 1 | #QID ANSWER_CORRECTNESS 2 | 0 0.588 3 | 1 0.000 4 | 2 1.000 5 | 3 0.333 6 | 4 0.333 7 | 5 1.000 8 | 6 1.000 9 | 7 1.000 10 | 9 0.400 11 | 10 0.571 12 | 11 0.333 13 | 12 0.500 14 | 13 0.500 15 | 14 0.333 16 | 15 0.333 17 | 16 0.000 18 | 17 0.800 19 | 19 0.000 20 | -------------------------------------------------------------------------------- /resources/reports/dspy-reports/answer_relevance_report.tsv: -------------------------------------------------------------------------------- 1 | #QID ANSWER_RELEVANCE 2 | 0 0.000 3 | 1 0.860 4 | 2 0.949 5 | 3 0.845 6 | 4 0.837 7 | 5 0.833 8 | 6 0.850 9 | 7 0.893 10 | 8 0.795 11 | 9 0.000 12 | 10 0.816 13 | 11 0.799 14 | 12 0.808 15 | 13 0.866 16 | 14 0.000 17 | 15 0.855 18 | 16 0.885 19 | 17 0.894 20 | 18 0.880 21 | 19 0.809 22 | -------------------------------------------------------------------------------- /resources/reports/dspy-reports/context_precision_report.tsv: -------------------------------------------------------------------------------- 1 | #QID CONTEXT_PRECISION 2 | 0 0.749 3 | 1 0.556 4 | 2 0.500 5 | 3 1.000 6 | 4 1.000 7 | 5 1.000 8 | 6 0.167 9 | 7 0.604 10 | 8 0.242 11 | 9 0.111 12 | 10 0.875 13 | 11 1.000 14 | 12 0.500 15 | 13 0.200 16 | 14 1.000 17 | 15 0.875 18 | 16 1.000 19 | 17 0.292 20 | 18 0.550 21 | 19 0.333 22 | -------------------------------------------------------------------------------- /resources/reports/dspy-reports/context_recall_report.tsv: -------------------------------------------------------------------------------- 1 | #QID CONTEXT_RECALL 2 | 0 0.357 3 | 1 0.333 4 | 2 0.500 5 | 3 1.000 6 | 4 0.250 7 | 5 1.000 8 | 6 0.667 9 | 7 0.650 10 | 8 0.100 11 | 9 0.463 12 | 10 0.531 13 | 11 1.000 14 | 12 0.000 15 | 13 0.333 16 | 14 0.000 17 | 15 0.900 18 | 16 
1.000 19 | 17 0.050 20 | 18 0.314 21 | 19 0.000 22 | -------------------------------------------------------------------------------- /resources/reports/dspy-reports/context_relevance_report.tsv: -------------------------------------------------------------------------------- 1 | #QID CONTEXT_RELEVANCE 2 | 0 0.571 3 | 1 0.333 4 | 2 0.500 5 | 3 1.000 6 | 4 1.000 7 | 5 1.000 8 | 6 0.167 9 | 7 0.750 10 | 8 0.100 11 | 9 0.111 12 | 10 0.875 13 | 11 0.500 14 | 12 0.500 15 | 13 0.200 16 | 14 1.000 17 | 15 0.750 18 | 16 0.500 19 | 17 0.500 20 | 18 0.700 21 | 19 0.333 22 | -------------------------------------------------------------------------------- /resources/reports/dspy-reports/context_utilization_report.tsv: -------------------------------------------------------------------------------- 1 | #QID CONTEXT_UTILIZATION 2 | 0 0.497 3 | 1 0.333 4 | 2 0.500 5 | 3 1.000 6 | 4 0.000 7 | 5 1.000 8 | 6 0.167 9 | 7 0.604 10 | 8 0.000 11 | 9 0.111 12 | 10 0.346 13 | 11 0.500 14 | 12 0.000 15 | 13 0.200 16 | 14 1.000 17 | 15 0.179 18 | 16 1.000 19 | 17 0.125 20 | 18 0.400 21 | 19 0.000 22 | -------------------------------------------------------------------------------- /resources/reports/dspy-reports/faithfulness_report.tsv: -------------------------------------------------------------------------------- 1 | #QID FAITHFULNESS 2 | 0 1.000 3 | 1 1.000 4 | 2 1.000 5 | 3 1.000 6 | 4 1.000 7 | 5 1.000 8 | 6 1.000 9 | 7 1.000 10 | 8 1.000 11 | 9 1.000 12 | 10 1.000 13 | 11 1.000 14 | 12 0.000 15 | 13 1.000 16 | 14 1.000 17 | 15 1.000 18 | 16 1.000 19 | 17 1.000 20 | 18 1.000 21 | 19 1.000 22 | -------------------------------------------------------------------------------- /resources/reports/lcel-reports/answer_correctness_report.tsv: -------------------------------------------------------------------------------- 1 | #QID ANSWER_CORRECTNESS 2 | 0 0.538 3 | 1 0.167 4 | 2 0.462 5 | 3 0.747 6 | 4 0.000 7 | 5 0.490 8 | 6 1.000 9 | 7 0.286 10 | 8 0.000 11 | 9 0.571 12 | 10 1.000 13 | 11 
0.977 14 | 12 0.000 15 | 13 0.987 16 | 14 0.982 17 | 15 0.400 18 | 16 0.959 19 | 17 0.552 20 | 19 0.839 21 | -------------------------------------------------------------------------------- /resources/reports/lcel-reports/answer_relevance_report.tsv: -------------------------------------------------------------------------------- 1 | #QID ANSWER_RELEVANCE 2 | 0 0.881 3 | 1 0.827 4 | 2 0.910 5 | 3 0.839 6 | 4 0.908 7 | 5 0.894 8 | 6 0.000 9 | 7 0.899 10 | 8 0.874 11 | 9 0.850 12 | 10 0.914 13 | 11 0.852 14 | 12 0.928 15 | 13 0.894 16 | 14 0.000 17 | 15 0.947 18 | 16 0.945 19 | 17 0.877 20 | 19 0.835 21 | -------------------------------------------------------------------------------- /resources/reports/lcel-reports/answer_similarity_report.tsv: -------------------------------------------------------------------------------- 1 | #QID ANSWER_SIMILARITY 2 | 0 0.979 3 | 1 0.983 4 | 2 0.971 5 | 3 0.973 6 | 4 0.971 7 | 5 0.996 8 | 6 1.000 9 | 7 0.979 10 | 8 0.975 11 | 9 0.974 12 | 10 0.972 13 | 11 0.970 14 | 12 0.990 15 | 13 0.975 16 | 14 0.974 17 | 15 0.977 18 | 16 0.974 19 | 17 0.969 20 | 18 0.976 21 | 19 0.999 22 | -------------------------------------------------------------------------------- /resources/reports/lcel-reports/context_precision_report.tsv: -------------------------------------------------------------------------------- 1 | #QID CONTEXT_PRECISION 2 | 0 1.000 3 | 1 0.444 4 | 2 0.500 5 | 3 1.000 6 | 4 1.000 7 | 5 1.000 8 | 6 0.167 9 | 7 0.604 10 | 8 0.100 11 | 9 0.111 12 | 10 0.385 13 | 11 1.000 14 | 12 0.500 15 | 13 0.200 16 | 14 1.000 17 | 15 1.000 18 | 16 1.000 19 | 17 0.125 20 | 18 0.550 21 | 19 0.333 22 | -------------------------------------------------------------------------------- /resources/reports/lcel-reports/context_recall_report.tsv: -------------------------------------------------------------------------------- 1 | #QID CONTEXT_RECALL 2 | 0 0.347 3 | 1 0.260 4 | 2 0.500 5 | 3 1.000 6 | 4 1.000 7 | 5 0.500 8 | 6 0.333 9 | 7 0.750 10 | 8 
0.500 11 | 9 0.333 12 | 10 0.875 13 | 11 1.000 14 | 12 0.500 15 | 13 0.200 16 | 14 1.000 17 | 15 0.406 18 | 16 1.000 19 | 17 0.500 20 | 19 0.333 21 | -------------------------------------------------------------------------------- /resources/reports/lcel-reports/context_relevance_report.tsv: -------------------------------------------------------------------------------- 1 | #QID CONTEXT_RELEVANCE 2 | 0 0.857 3 | 1 0.444 4 | 2 0.500 5 | 3 1.000 6 | 4 1.000 7 | 5 1.000 8 | 6 0.600 9 | 7 1.000 10 | 8 0.200 11 | 9 0.111 12 | 10 0.875 13 | 11 1.000 14 | 12 0.500 15 | 13 0.200 16 | 14 1.000 17 | 16 0.500 18 | 17 0.750 19 | 18 0.429 20 | 19 0.333 21 | -------------------------------------------------------------------------------- /resources/reports/lcel-reports/context_utilization_report.tsv: -------------------------------------------------------------------------------- 1 | #QID CONTEXT_UTILIZATION 2 | 0 0.382 3 | 1 0.451 4 | 2 0.500 5 | 3 1.000 6 | 4 0.604 7 | 5 1.000 8 | 6 0.167 9 | 7 1.000 10 | 8 0.000 11 | 9 0.419 12 | 10 0.764 13 | 11 1.000 14 | 12 0.000 15 | 13 0.200 16 | 14 1.000 17 | 15 1.000 18 | 16 1.000 19 | 17 1.000 20 | 19 0.000 21 | -------------------------------------------------------------------------------- /resources/reports/lcel-reports/faithfulness_report.tsv: -------------------------------------------------------------------------------- 1 | #QID FAITHFULNESS 2 | 0 0.262 3 | 1 0.909 4 | 2 0.200 5 | 3 0.500 6 | 4 1.000 7 | 5 1.000 8 | 6 0.583 9 | 7 0.969 10 | 8 0.950 11 | 9 0.778 12 | 10 0.600 13 | 11 0.333 14 | 12 0.000 15 | 13 0.300 16 | 14 1.000 17 | 15 0.844 18 | 16 0.375 19 | 17 0.812 20 | 19 0.000 21 | -------------------------------------------------------------------------------- /src/convert_amnestyQA_to_desired_format.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 
| "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "source": [ 20 | "# this will convert the AmnestyQA dataset\n", 21 | "# https://huggingface.co/datasets/explodinggradients/amnesty_qa/blob/main/english.json\n", 22 | "# to the desired format that we are using in the competition\n", 23 | "# https://github.com/sujitpal/llm-rag-eval\n", 24 | "#\n", 25 | "# download the above json file to a file local to this notebook\n", 26 | "# (i had trouble reading this directly online - https://huggingface.co/datasets/explodinggradients/amnesty_qa/raw/main/english.json)" 27 | ], 28 | "metadata": { 29 | "id": "TuhTbzYKIqGG" 30 | }, 31 | "execution_count": 27, 32 | "outputs": [] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 28, 37 | "metadata": { 38 | "id": "XRqK1t_l0g7_" 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import json\n", 43 | "# input file\n", 44 | "input_file_path = 'english.json'\n", 45 | "# output file\n", 46 | "output_file_path = 'amnesty_qa.jsonl'" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "source": [ 52 | "def split_newlines(input_string):\n", 53 | " \"\"\"Split the input string at newline characters and return a list of strings.\"\"\"\n", 54 | " if '\\n' in input_string:\n", 55 | " return input_string.split('\\n')\n", 56 | " else:\n", 57 | " return [input_string]" 58 | ], 59 | "metadata": { 60 | "id": "wECIVkMrl_r_" 61 | }, 62 | "execution_count": 29, 63 | "outputs": [] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "source": [ 68 | "# Opening the JSON file and loading the data\n", 69 | "with open(input_file_path, 'r') as file:\n", 70 | " data = json.load(file)" 71 | ], 72 | "metadata": { 73 | "id": "piVFXu0Z0zL9" 74 | }, 75 | "execution_count": 31, 76 | "outputs": [] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "source": [ 81 | "# the data is not nested / folded\n", 82 | "# so each of components is read in 
and the output folding happens next\n", 83 | "questions = data.get('question')\n", 84 | "ground_truths = data.get('ground_truths')\n", 85 | "answers = data.get('answer')\n", 86 | "contexts = data.get('contexts')" 87 | ], 88 | "metadata": { 89 | "id": "hjIQKOEr0_dQ" 90 | }, 91 | "execution_count": 32, 92 | "outputs": [] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "source": [ 97 | "# output will be lines\n", 98 | "lines = []" 99 | ], 100 | "metadata": { 101 | "id": "Et8TapVb4pjj" 102 | }, 103 | "execution_count": 33, 104 | "outputs": [] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "source": [ 109 | "# assemble from the nth item from each component list\n", 110 | "# blow out the multi-line contexts into single line chunk dictionaries\n", 111 | "for i in range(len(questions)):\n", 112 | " line = {}\n", 113 | " line['id'] = i\n", 114 | " line['query'] = questions[i]\n", 115 | " context_list = []\n", 116 | " split_contexts = split_newlines(contexts[i][0])\n", 117 | " for j in range(len(split_contexts)):\n", 118 | " context_dict = {}\n", 119 | " context_dict['id'] = str(j)\n", 120 | " context_dict['chunk_text'] = split_contexts[j]\n", 121 | " context_list.append(context_dict)\n", 122 | " line['context'] = context_list\n", 123 | " line['ideal_answer'] = ground_truths[i][0]\n", 124 | " line['predicted_answer'] = answers[i]\n", 125 | " lines.append(line)" 126 | ], 127 | "metadata": { 128 | "id": "T5d9q2Jo1I9l" 129 | }, 130 | "execution_count": 34, 131 | "outputs": [] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "source": [ 136 | "# write the lines to the output file\n", 137 | "with open(output_file_path, 'w') as outfile:\n", 138 | " for dictionary in lines:\n", 139 | " json_line = json.dumps(dictionary) # Convert the dictionary to a JSON string\n", 140 | " outfile.write(json_line + '\\n') # Write the JSON string to the file and add a newline" 141 | ], 142 | "metadata": { 143 | "id": "CF7_eK3A1w20" 144 | }, 145 | "execution_count": 36, 146 | "outputs": [] 147 | }, 
148 | { 149 | "cell_type": "code", 150 | "source": [], 151 | "metadata": { 152 | "id": "CK2v4BF4rmwp" 153 | }, 154 | "execution_count": 36, 155 | "outputs": [] 156 | } 157 | ] 158 | } -------------------------------------------------------------------------------- /src/generate_datasets.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | import json 4 | import logging 5 | import os 6 | 7 | from dotenv import load_dotenv, find_dotenv 8 | from langchain_google_genai import ( 9 | ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings 10 | ) 11 | from typing import List 12 | 13 | import prompted.faithfulness as faithfulness_p 14 | import prompted.answer_relevance as answer_relevance_p 15 | import prompted.context_precision as context_precision_p 16 | import prompted.context_relevance as context_relevance_p 17 | import prompted.context_recall as context_recall_p 18 | import prompted.answer_correctness as answer_correctness_p 19 | from metrics import Metrics 20 | 21 | 22 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO) 23 | 24 | # Safety config 25 | 26 | from google.generativeai.types.safety_types import HarmBlockThreshold, HarmCategory 27 | 28 | safety_settings = { 29 | HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_ONLY_HIGH, 30 | HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_ONLY_HIGH, 31 | HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_ONLY_HIGH, 32 | HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_ONLY_HIGH 33 | } 34 | 35 | async def generate_faithfulness_dataset(id: int, 36 | question: str, 37 | answer: str, 38 | context: List[str], 39 | run_parallel: bool, 40 | model, 41 | logger, 42 | fout): 43 | statements = faithfulness_p._get_statements_from_answer( 44 | question, answer, model, logger) 45 | entailments = await faithfulness_p._get_entailments_from_context( 46 | context, 
statements, model, logger, 47 | parallel=run_parallel) 48 | score = faithfulness_p._compute_faithfulness(entailments) 49 | fout.write(json.dumps({ 50 | "id": id, 51 | "question": question, 52 | "context": context, 53 | "answer": answer, 54 | "statements": statements, 55 | "entailments": entailments, 56 | "score": score 57 | }) + "\n") 58 | 59 | 60 | async def generate_answer_relevance_dataset(id: int, 61 | question: str, 62 | context: List[str], 63 | answer: str, 64 | run_parallel: bool, 65 | model, 66 | encoder, 67 | logger, 68 | fout): 69 | context_str = answer_relevance_p._flatten_context(context) 70 | gen_questions = \ 71 | answer_relevance_p._generate_questions_from_answer_and_context( 72 | context_str, answer, 5, model, logger) 73 | qa_pairs = await answer_relevance_p._predict_noncommittal_from_questions( 74 | gen_questions, context_str, run_parallel, model, 75 | logger) 76 | score = answer_relevance_p._compute_answer_relevance( 77 | question, qa_pairs, encoder, logger) 78 | fout.write(json.dumps({ 79 | "id": id, 80 | "question": question, 81 | "context": context, 82 | "answer": answer, 83 | "gen_questions": gen_questions, 84 | "non_commitals": [qap.noncommittal for qap in qa_pairs], 85 | "score": score 86 | }) + "\n") 87 | 88 | 89 | async def generate_context_precision_dataset(id: int, 90 | question: str, 91 | answer: str, 92 | context: List[str], 93 | run_parallel: bool, 94 | model, 95 | logger, 96 | fout): 97 | precs = await context_precision_p._compute_usefulness_scores( 98 | question, context, answer, run_parallel, model, logger) 99 | score = context_precision_p._compute_content_precision(precs) 100 | fout.write(json.dumps({ 101 | "id": id, 102 | "question": question, 103 | "context": context, 104 | "answer": answer, 105 | "precision": precs, 106 | "score": score 107 | }) + "\n") 108 | 109 | 110 | async def generate_context_relevance_dataset(id: int, 111 | question: str, 112 | context: List[str], 113 | run_parallel: bool, 114 | model, 115 | logger, 116 | 
fout): 117 | num_total_sents, context_markdowns = \ 118 | context_relevance_p._convert_to_markdown_lists(context) 119 | score = 0.0 120 | if num_total_sents > 0: 121 | necessary_sents = \ 122 | await context_relevance_p._generate_necessity_verdicts( 123 | question, context_markdowns, run_parallel, model, logger) 124 | score = context_relevance_p._compute_context_relevance_score( 125 | num_total_sents, necessary_sents) 126 | fout.write(json.dumps({ 127 | "id": id, 128 | "question": question, 129 | "context": context, 130 | "context_sents": context_markdowns, 131 | "necessary_sents": necessary_sents, 132 | "score": score 133 | }) + "\n") 134 | 135 | 136 | async def generate_context_recall_dataset(id: int, 137 | context: List[str], 138 | answer: str, 139 | run_parallel: bool, 140 | model, 141 | logger, 142 | fout): 143 | answer_md = context_recall_p._convert_answer_to_markdown_list( 144 | answer, logger) 145 | inferences = await \ 146 | context_recall_p._classify_ans_sents_attributable_to_context( 147 | answer_md, context, run_parallel, model, logger) 148 | score = context_recall_p._compute_context_recall_score(inferences) 149 | fout.write(json.dumps({ 150 | "id": id, 151 | "context": context, 152 | "answer_md": answer_md, 153 | "answer": answer, 154 | "inferences": inferences, 155 | "score": score 156 | }) + "\n") 157 | 158 | 159 | async def generate_answer_correctness_dataset(id: int, 160 | answer: str, 161 | ideal_answer: str, 162 | model, 163 | logger, 164 | fout): 165 | classification = answer_correctness_p._do_classification( 166 | answer, ideal_answer, model, logger) 167 | statements_by_class_dict = {} 168 | for key in ["TP", "FP", "FN"]: 169 | statements_by_class_dict[key] = \ 170 | answer_correctness_p._get_statements_for_class( 171 | classification, key) 172 | score = answer_correctness_p._compute_answer_correctness_score( 173 | statements_by_class_dict) 174 | fout.write(json.dumps({ 175 | "id": id, 176 | "answer": answer, 177 | "ideal_answer": ideal_answer, 
async def runner():
    """ Command-line entry point: build a JSONL dataset for a single metric.

    Reads the input JSONL file record by record, dispatches each record to
    the dataset generator of the metric selected with ``--metric`` and lets
    that generator append its result to ``<output>/<metric>.jsonl``.

    The Gemini chat model is configured from the ``GOOGLE_API_KEY``
    environment variable (a ``.env`` file is loaded first).

    :raises NotImplementedError: when the answer similarity metric is
        selected and the input contains at least one record
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--metric", type=str,
                     choices=sorted([m.value for m in Metrics]),
                     required=True,
                     help="The metric to generate datasets for")
    cli.add_argument("--input", type=str, required=True,
                     help="Full path to input JSONL file")
    cli.add_argument("--output", type=str, required=True,
                     help="Full path to output directory")
    cli.add_argument("--parallel", action="store_true",
                     help="Run in parallel where possible (default false)")
    cli.add_argument("--debug", action="store_true",
                     help="Turn debugging on (default: false)")
    args = cli.parse_args()

    metric = args.metric
    input_fp = args.input
    output_fp = os.path.join(args.output, f"{metric}.jsonl")
    run_parallel = args.parallel

    load_dotenv(find_dotenv())

    logger = logging.getLogger(__name__)
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    model = ChatGoogleGenerativeAI(
        model="gemini-pro",
        api_key=os.environ["GOOGLE_API_KEY"],
        temperature=0.0,
        safety_settings=safety_settings)
    encoder = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    os.makedirs(args.output, exist_ok=True)

    with open(input_fp, "r", encoding="utf-8") as fin, \
            open(output_fp, "w", encoding="utf-8") as fout:
        for raw_line in fin:
            record = json.loads(raw_line)
            record_id = record["id"]
            question = record["query"]
            context = [chunk["chunk_text"] for chunk in record["context"]]
            answer = record["predicted_answer"]
            ideal_answer = record["ideal_answer"]

            logger.info(f"Processing query ({record_id}): {question}")

            selected = Metrics(metric)
            if selected == Metrics.FAITHFULNESS:
                await generate_faithfulness_dataset(
                    record_id, question, answer, context, run_parallel,
                    model, logger, fout)
            elif selected == Metrics.ANSWER_RELEVANCE:
                await generate_answer_relevance_dataset(
                    record_id, question, context, answer, run_parallel,
                    model, encoder, logger, fout)
            elif selected == Metrics.CONTEXT_PRECISION:
                await generate_context_precision_dataset(
                    record_id, question, answer, context, run_parallel,
                    model, logger, fout)
            elif selected == Metrics.CONTEXT_RELEVANCE:
                await generate_context_relevance_dataset(
                    record_id, question, context, run_parallel, model,
                    logger, fout)
            elif selected == Metrics.CONTEXT_RECALL:
                await generate_context_recall_dataset(
                    record_id, context, answer, run_parallel, model,
                    logger, fout)
            elif selected == Metrics.ANSWER_SIMILARITY:
                raise NotImplementedError(
                    "Use prompted version of answer similarity")
            elif selected == Metrics.ANSWER_CORRECTNESS:
                await generate_answer_correctness_dataset(
                    record_id, answer, ideal_answer, model, logger, fout)
            # any other metric value is silently ignored
class AnswerCorrectness(dspy.Module):
    """ Score an answer against a ground truth answer.

    Facts are extracted from both texts, the language model groups them
    into TP / FP / FN, and the group sizes are combined into the score
    ``tp / (tp + 0.5 * (fp + fn))``.
    """

    def __init__(self):
        super().__init__()
        self.fact_extractor = dspy.Predict(AnswerToFacts)
        self.fact_grouper = dspy.ChainOfThought(FactGroupings)

    def _parse_json_response(self, fact_groups: str) -> Dict[str, int]:
        """ Turn the model's JSON grouping into a count per group.

        :param fact_groups: model output expected to contain a JSON object
            whose values are lists of facts
        :return: mapping of group name to number of facts in that group
        :raises json.JSONDecodeError: if no parseable JSON object is found
        """
        payload = fact_groups.replace("```json", "").replace("```", "")
        # Keep only the outermost {...} so that any text the model puts
        # before or after the JSON object does not break parsing.
        start, end = payload.find("{"), payload.rfind("}")
        if start != -1 and end > start:
            payload = payload[start:end + 1]
        fact_groups_j = json.loads(payload)
        # A group reported as null / empty counts as zero facts.
        return {k: len(v) if v else 0 for k, v in fact_groups_j.items()}

    def forward(self, answer, ground_truth):
        """ Compute the answer correctness score.

        :param answer: the answer to evaluate
        :param ground_truth: the ideal answer to compare against
        :return: ``dspy.Prediction`` whose ``score`` is a float
        """
        dspy.logger.debug(f"input answer: {answer}, ground_truth: {ground_truth}")
        facts_g = self.fact_extractor(answer=ground_truth).facts
        dspy.logger.debug(f"facts from ground truth: {facts_g}")
        facts_a = self.fact_extractor(answer=answer).facts
        dspy.logger.debug(f"facts from answer: {facts_a}")
        fact_groups = self.fact_grouper(
            facts_g=facts_g, facts_a=facts_a).fact_groups
        dspy.logger.debug(f"fact groups: {fact_groups}")
        group_counts = self._parse_json_response(fact_groups)
        tp = group_counts.get("TP", 0)
        fp = group_counts.get("FP", 0)
        fn = group_counts.get("FN", 0)
        # tp == 0 would otherwise allow a 0 / 0 division
        score = tp / (tp + 0.5 * (fp + fn)) if tp > 0 else 0.0
        dspy.logger.debug(f"tp: {tp}, fp: {fp}, fn: {fn}, score: {score}")
        return dspy.Prediction(score=score)
class AnswerRelevance(dspy.Module):
    """ Score how relevant an answer is to the question it responds to.

    Questions are generated from the answer and its context; those whose
    context-only answer is not classified as noncommittal are compared
    with the original question through embedding cosine similarity.
    """

    def __init__(self, encoder):
        """
        :param encoder: object exposing ``embed_documents(list_of_str)``
        """
        super().__init__()
        self.question_generator = dspy.Predict(
            AnswerContextToGenQuestions)
        self.answer_classifier = dspy.ChainOfThought(
            QuestionContextGenQToNonCommital)
        self.encoder = encoder

    def _cosine_similarity(self, source, targets):
        """ Mean cosine similarity between ``source`` and each row of
        ``targets``.

        :param source: 1-d embedding of the original question
        :param targets: 2-d array, one embedding per generated question
        """
        source = source.reshape(1, -1)
        sims = np.dot(source, targets.T) / (
            np.linalg.norm(source) * np.linalg.norm(targets, axis=1))
        return np.mean(sims)

    def _compute_score(self, q_list: List[str]):
        """ Similarity of the first question to all remaining ones.

        :param q_list: original question followed by generated questions
        :return: mean cosine similarity, or 0.0 when there is nothing to
            compare the original question with
        """
        # Decide before embedding: with no generated question left the
        # score is 0.0 anyway, so the encoder call would be wasted.
        if len(q_list) < 2:
            return 0.0
        E = np.array(self.encoder.embed_documents(q_list))
        source, targets = E[0, :], E[1:, :]
        return self._cosine_similarity(source, targets)

    def forward(self, question: str, answer: str, context: str):
        """ Compute the answer relevance score.

        :param question: the original question
        :param answer: the answer to evaluate
        :param context: the context of the answer, as a single string
        :return: ``dspy.Prediction`` whose ``score`` is a string
        """
        dspy.logger.debug(f"input question: {question}, answer: {answer}, "
                          f"context: {context}")
        gen_questions = self.question_generator(
            answer=answer, context=context).gen_questions
        dspy.logger.debug(f"gen_questions: {gen_questions}")
        q_list = [question]
        for gen_q in string_to_list(gen_questions):
            # string_to_list keeps empty lines; they are not questions
            if not gen_q.strip():
                continue
            ans_cls = self.answer_classifier(question=gen_q, context=context)
            noncommital = ans_cls.noncommital
            if not string_to_bool(noncommital, choices=["yes", "no"]):
                q_list.append(gen_q)
        dspy.logger.debug(f"q_list: {q_list}")
        score = self._compute_score(q_list)
        dspy.logger.debug(f"score: {score}")
        return dspy.Prediction(score=str(score))
class ContextPrecision(dspy.Module):
    """ Score how precisely the retrieved context supports the answer.

    Each context chunk is classified as useful or not for answering the
    question; the verdicts are combined into a rank-weighted precision.
    """

    def __init__(self):
        # Initialise the dspy.Module base class like the other metric
        # modules do.
        super().__init__()
        self.model = None
        self.usefulness_classifier = dspy.ChainOfThought(
            QuestionAnswerContextToUseful)

    def forward(self, question: str, answer: str,
                context: List[str]) -> dspy.Prediction:
        """ Compute the context precision score.

        :param question: the question
        :param answer: the answer to the question
        :param context: context chunks in rank order; a newline-joined
            string (as produced by list_to_string) is also accepted
        :return: ``dspy.Prediction`` whose ``score`` is a string
        """
        dspy.logger.debug(f"input question: {question}, answer: {answer}, "
                          f"context: {context}")
        if isinstance(context, str):
            # Iterating a str would classify single characters. Split on
            # the newline that list_to_string joins with instead.
            # NOTE: a chunk that itself contains newlines is split into
            # several pieces, so this is only an approximation.
            context = [line for line in context.split("\n") if line.strip()]
        scores = []
        for ctx in context:
            pred = self.usefulness_classifier(question=question,
                                              answer=answer,
                                              context=ctx)
            scores.append(string_to_bool(pred.score, choices=["yes", "no"]))
        dspy.logger.debug(f"scores: {scores}")
        score = 0.0
        if scores:
            # weight at rank k = precision@k for useful chunks, else 0;
            # a running hit count avoids re-summing the prefix each time
            weights, hits = [], 0
            for rank, useful in enumerate(scores, start=1):
                hits += useful
                weights.append(hits / rank if useful else 0.0)
            dspy.logger.debug(f"weights: {weights}")
            score = sum(weights) / len(scores)
        dspy.logger.debug(f"score: {score}")
        return dspy.Prediction(score=str(score))
class ContextRecall(dspy.Module):
    """ Score how much of the answer can be attributed to the context.

    For every context item the model labels each answer sentence yes/no;
    the per-item share of "yes" labels is averaged over all items.
    """

    def __init__(self):
        super().__init__()
        self.attrib_clf = dspy.ChainOfThought(ContextItemAnswerToScore)

    def forward(self, context: List[str], answer: str):
        """ Compute the context recall score.

        :param context: list of context items
        :param answer: the answer whose sentences are checked
        :return: ``dspy.Prediction`` whose ``score`` is a string
        """
        dspy.logger.debug(f"input context: {context}, answer: {answer}")
        # Newlines become spaces (not nothing) so that text on adjacent
        # lines is not glued together before sentence splitting.
        # Only the first 10 sentences are used.
        answer_sents = [sent for sent
                        in nltk.sent_tokenize(answer.replace("\n", " "))
                        if sent.strip()][0:10]
        dspy.logger.debug(f"answer sentences: {answer_sents}")
        scores = []
        for context_item in context:
            # very short items are skipped and do not enter the average
            if len(context_item.strip()) < 10:
                continue
            ctx_score = 0.0
            try:
                ctx_scores = self.attrib_clf(
                    answer=answer_sents,
                    context_item=context_item).scores
                num_pos, num_neg = string_to_bool_array(
                    ctx_scores, choices=["yes", "no"])
                if num_pos + num_neg > 0:
                    ctx_score = num_pos / (num_pos + num_neg)
            except Exception as exc:
                # best effort: a failing item scores 0.0, but the failure
                # is reported instead of being dropped silently
                dspy.logger.warning(
                    f"context recall failed for a context item, "
                    f"scoring it 0.0: {exc!r}")
            scores.append(ctx_score)
        dspy.logger.debug(f"scores: {scores}")
        score = 0.0
        if scores:
            score = np.mean(scores)
        dspy.logger.debug(f"score: {score}")
        return dspy.Prediction(score=str(score))
def context_relevance_dataset(file_path: str):
    """ Load the context relevance dataset as a list of DSPy examples.

    Each line of the file is a JSON record with ``question``, ``context``
    and ``score`` keys. The context is handed over as the list of chunks
    it is stored as, because ``ContextRelevance.forward`` iterates over
    it chunk by chunk (as ``compute_context_relevance`` also does);
    joining it into one string would make that loop run per character.

    :param file_path: path to the JSONL dataset file
    :return: list of ``dspy.Example`` with inputs ``question``, ``context``
    :raises FileNotFoundError: if ``file_path`` does not exist
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(
            f"context relevance dataset: {file_path} not found, "
            "create it with generate_datasets.py first.")
    examples = []
    with open(file_path, "r", encoding="utf-8") as fin:
        for line in fin:
            record = json.loads(line)
            examples.append(dspy.Example(
                question=record["question"],
                context=record["context"],
                score=str(record["score"])
            ).with_inputs("question", "context"))
    return examples
class Faithfulness(dspy.Module):
    """ Score how faithful an answer is to its context.

    Facts are extracted from the question-answer pair and each fact is
    classified as inferable (or not) from the context; the score is the
    share of inferable facts.
    """

    def __init__(self):
        super().__init__()
        self.extractor = dspy.Predict(QuestAnswerToFacts)
        self.scorer = dspy.Predict(ContextFactsToScore)

    def forward(self, question: str, answer: str, context: str):
        """ Compute the faithfulness score.

        :param question: the question
        :param answer: the answer to extract facts from
        :param context: the context, as a single string
        :return: ``dspy.Prediction`` whose ``score`` is a string; the
            score is 0.0 when no fact could be extracted
        """
        dspy.logger.debug(f"input question: {question}, answer: {answer}, "
                          f"context: {context}")
        facts = self.extractor(question=question, answer=answer).facts
        dspy.logger.debug(f"facts: {facts}")
        scores = []
        for fact in string_to_list(facts):
            # string_to_list keeps empty lines; scoring them would spend
            # a model call on nothing and distort the average
            if not fact.strip():
                continue
            can_infer = self.scorer(context=context, fact=fact).score
            scores.append(string_to_bool(can_infer, ["yes", "no"]))
        dspy.logger.debug(f"scores: {scores}")
        # guard against an empty fact list
        score = sum(scores) / len(scores) if scores else 0.0
        dspy.logger.debug(f"score: {score}")
        return dspy.Prediction(score=str(score))
def string_to_list(xs: str, style: str = "dash") -> List[str]:
    """ Split a multi-line string into its lines, removing a leading
        bullet (number, dash or star) from every line that has one.

    :param xs: string with embedded newlines
    :param style: accepted for symmetry with list_to_string; the
        function body does not consult it
    :return: one entry per line of the input, empty lines included
    """
    def _without_bullet(line: str) -> str:
        # numbered bullets are tried first, then dash / star bullets
        numbered = NUM_BULLET_PATTERM.match(line)
        if numbered:
            return numbered.group(2)
        bulleted = GEN_BULLET_PATTERN.match(line)
        if bulleted:
            return bulleted.group(1)
        return line

    return [_without_bullet(line) for line in xs.split("\n")]
def string_to_bool_array(s: str, choices: List[str]) -> List[int]:
    """ Count how often each choice occurs in a string.

    Matching ignores case (like string_to_bool does) and only whole
    words are counted, so "not" or "know" is not taken for a "no".

    :param s: string to scan, e.g. a model response; None counts as empty
    :param choices: list of choice patterns, e.g. ["yes", "no"]
    :return: number of occurrences of each choice, in the order of choices
    """
    text = s or ""
    # each choice is still treated as a regular expression, only anchored
    # at word boundaries and matched case-insensitively
    patterns = [re.compile(rf"\b(?:{c})\b", re.IGNORECASE) for c in choices]
    return [len(p.findall(text)) for p in patterns]
def optimize_prompt(metric_name: str,
                    config_dir: str,
                    dataset_generator_fn: Callable,
                    dataset_fp: str,
                    score_metric: Callable,
                    metric_instance: dspy.Module):
    """ Return the metric module loaded with its best prompt configuration,
        training that configuration first if it does not exist yet.

    Training bootstraps few-shot demos on 70% of the dataset, saves up to
    4 candidate programs as ``<metric_name>-<idx>.json``, evaluates them
    on the remaining 30% and copies the best one to
    ``<metric_name>-best.json``.

    :param metric_name: name used to build the config file names
    :param config_dir: directory holding the config files
    :param dataset_generator_fn: function that loads examples from
        dataset_fp
    :param dataset_fp: path to the dataset file
    :param score_metric: metric function used for training and evaluation
    :param metric_instance: module instance to optimize and load into
    :return: metric_instance with the best configuration loaded
    """
    best_config_fp = os.path.join(config_dir, f"{metric_name}-best.json")

    # Train whenever the best config is missing. Checking for "any config
    # file" instead would skip training when only numbered candidates are
    # left over, and the load below would then fail.
    if not os.path.exists(best_config_fp):
        teleprompter = BootstrapFewShotWithRandomSearch(
            metric=score_metric,
            max_bootstrapped_demos=2,
            max_labeled_demos=2,
            num_threads=1
        )
        examples = dataset_generator_fn(dataset_fp)
        trainset, devset = train_test_split(examples, test_size=0.3,
                                            random_state=42)
        print(
            f"{metric_name} dataset sizes: "
            f"{len(trainset)}, {len(devset)}, total: {len(examples)}")

        print("--- training ---")
        optimized = teleprompter.compile(metric_instance, trainset=trainset)
        # the program is the last element of each candidate entry
        ensemble = [prog for *_, prog in optimized.candidate_programs[:4]]

        os.makedirs(config_dir, exist_ok=True)
        # fresh list, so that indexes line up with the ensemble order
        config_paths = []
        for idx, prog in enumerate(ensemble):
            config_path = os.path.join(
                config_dir, f"{metric_name}-{idx}.json")
            config_paths.append(config_path)
            prog.save(config_path)

        print("--- evaluation ---")
        evaluate = Evaluate(devset=devset, metric=score_metric,
                            num_threads=1, display_progress=True)
        scores = [evaluate(prog) for prog in ensemble]
        print(f"Evaluation scores: {scores}")
        best_prompt_id = int(np.argmax(scores))
        shutil.copy(config_paths[best_prompt_id], best_config_fp)

    prog = metric_instance
    prog.load(best_config_fp)
    return prog
start with (default 0)") 44 | parser.add_argument("--multiplier", type=int, required=False, 45 | help="The number of new questions to be generated PER question in input data (default 3)") 46 | parser.add_argument("--model-temp", type=float, required=False, 47 | help="The temperature of the model - between 0.0 and 1.0 (default 0.0)") 48 | parser.add_argument("--max", type=int, required=False, 49 | help="The maximum number of new questions to be generated total (no default)") 50 | 51 | args = parser.parse_args() 52 | input_fp = args.input_jsonl 53 | output_fp = args.output_jsonl 54 | if output_fp is None: 55 | output_fp = os.path.join(REPORTS_DIR, f"default_report.jsonl") 56 | debug = args.debug 57 | id_start = args.id_start 58 | if id_start is None: 59 | id_start = 0 60 | multiplier = args.multiplier 61 | if multiplier is None: 62 | multiplier = 3 63 | model_temp = args.model_temp 64 | if model_temp is None or model_temp > 1.0 or model_temp < 0.0: 65 | model_temp = 0.0 66 | maxq = args.max 67 | if maxq is None: 68 | maxq = 99999 69 | 70 | logger = logging.getLogger(__name__) 71 | logger.setLevel(logging.DEBUG if debug else logging.INFO) 72 | 73 | _ = load_dotenv(find_dotenv()) 74 | 75 | model = ChatGoogleGenerativeAI( 76 | model="gemini-pro", 77 | api_key=os.environ["GOOGLE_API_KEY"], 78 | temperature=model_temp, 79 | safety_settings=safety_settings) 80 | 81 | os.makedirs(REPORTS_DIR, exist_ok=True) 82 | 83 | with open(input_fp, "r", encoding="utf-8") as fin, \ 84 | open(output_fp, "w", encoding="utf-8") as fout: 85 | 86 | # fout.write("\t".join(["#QID", metric.upper()]) + "\n") 87 | q_counter = 0 88 | new_q_id = id_start 89 | for line in fin: 90 | record = json.loads(line) 91 | # extract relevant data to evaluate 92 | id = record["id"] 93 | # be done if hit max 94 | if q_counter + multiplier > maxq: 95 | break 96 | question = record["query"] 97 | context = [ctx["chunk_text"] for ctx in record["context"]] 98 | answer = record["predicted_answer"] 99 | ideal_answer = 
record["ideal_answer"] 100 | 101 | more_questions = compute_more_questions( 102 | question, ideal_answer, multiplier, model, logger) 103 | 104 | q_counter = q_counter + multiplier 105 | 106 | for q in more_questions: 107 | q["id"] = new_q_id 108 | new_q_id += 1 109 | fout.write(json.dumps(q) + '\n') 110 | 111 | 112 | if __name__ == "__main__": 113 | asyncio.run(runner()) 114 | -------------------------------------------------------------------------------- /src/metrics.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class Metrics(Enum): 5 | FAITHFULNESS = "faithfulness" 6 | ANSWER_RELEVANCE = "answer_relevance" 7 | CONTEXT_PRECISION = "context_precision" 8 | CONTEXT_UTILIZATION = "context_utilization" 9 | CONTEXT_RELEVANCE = "context_relevance" 10 | CONTEXT_RECALL = "context_recall" 11 | ANSWER_SIMILARITY = "answer_similarity" 12 | ANSWER_CORRECTNESS = "answer_correctness" 13 | 14 | 15 | -------------------------------------------------------------------------------- /src/prompted/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sujitpal/llm-rag-eval/8936ae484d7ab78c9debfcdabc3a2f14eea38ac2/src/prompted/__init__.py -------------------------------------------------------------------------------- /src/prompted/answer_correctness.py: -------------------------------------------------------------------------------- 1 | from langchain_core.output_parsers import StrOutputParser 2 | from langchain_core.language_models.chat_models import BaseChatModel 3 | from langchain.prompts import PromptTemplate 4 | from xml.sax.saxutils import escape 5 | 6 | from .prompt_utils import read_template_from_file, parse_response 7 | 8 | 9 | PROMPT_CLASSIFY_FACTS = "answer_correctness_1.txt" 10 | 11 | 12 | def _do_classification(answer: str, ideal_answer: str, 13 | model, logger): 14 | prompt_template = 
read_template_from_file(PROMPT_CLASSIFY_FACTS) 15 | prompt = PromptTemplate(template=prompt_template, 16 | input_variables=["answer", "ground_truth"]) 17 | chain = prompt | model | StrOutputParser() 18 | response = chain.invoke({ 19 | "answer": escape(answer), 20 | "ground_truth": escape(ideal_answer) 21 | }) 22 | result = parse_response(response) 23 | logger.debug("result:", result) 24 | classification = result.value["classes"] 25 | return classification 26 | 27 | 28 | def _get_statements_for_class(statements_dict, class_name): 29 | try: 30 | if statements_dict[class_name] is None: 31 | return [] 32 | else: 33 | return statements_dict[class_name]["sts"]["st"] 34 | except KeyError: 35 | return [] 36 | 37 | 38 | def _compute_answer_correctness_score(statements_by_class_dict): 39 | tp = len(statements_by_class_dict["TP"]) 40 | fp = len(statements_by_class_dict["FP"]) 41 | fn = len(statements_by_class_dict["FN"]) 42 | score = tp / (tp + 0.5 * (fp + fn)) if tp > 0 else 0.0 43 | return score 44 | 45 | 46 | def compute_answer_correctness(answer: str, 47 | ideal_answer: str, 48 | model: BaseChatModel, 49 | logger) -> float: 50 | classification = _do_classification(answer, ideal_answer, model, logger) 51 | statements_by_class_dict = {} 52 | for key in ["TP", "FP", "FN"]: 53 | statements_by_class_dict[key] = _get_statements_for_class( 54 | classification, key) 55 | score = _compute_answer_correctness_score(statements_by_class_dict) 56 | return score 57 | -------------------------------------------------------------------------------- /src/prompted/answer_relevance.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import numpy as np 3 | 4 | from langchain_core.output_parsers import StrOutputParser 5 | from langchain_core.embeddings import Embeddings 6 | from langchain_core.language_models.chat_models import BaseChatModel 7 | from langchain.prompts import PromptTemplate 8 | from pydantic import BaseModel, Field 9 | 
def _generate_questions_from_answer_and_context(context_flat: str,
                                                answer: str,
                                                num_questions_to_generate: int,
                                                model: BaseChatModel,
                                                logger) -> List[str]:
    """Ask the model for questions that ``answer`` could be answering.

    The prompt in ``PROMPT_GEN_QUESTIONS`` is filled with the requested
    number of questions, the answer and the flattened context, and the
    model response is parsed with ``parse_response``.

    Returns:
        The generated questions, always as a list. A response holding a
        single question (parsed as a bare string) is wrapped in a list and
        an empty ``questions`` element gives ``[]``, so callers can iterate
        over questions rather than over the characters of one question.
    """
    prompt_genq = read_template_from_file(PROMPT_GEN_QUESTIONS)
    prompt = PromptTemplate(template=prompt_genq,
                            input_variables=[
                                "num_questions_to_generate",
                                "answer", "context"])
    chain_genq = prompt | model | StrOutputParser()
    response = chain_genq.invoke({
        "num_questions_to_generate": num_questions_to_generate,
        "answer": answer,
        "context": context_flat
    })
    result = parse_response(response)

    questions_el = result.value["questions"]
    gen_questions = None if questions_el is None else questions_el["question"]
    if gen_questions is None:
        gen_questions = []
    elif not isinstance(gen_questions, list):
        # only one <question> element was returned
        gen_questions = [gen_questions]

    logger.debug(f"gen_questions: {gen_questions}")
    return gen_questions
def _compute_answer_relevance(question: str,
                              qa_pairs: List[ClassifiedQAPair],
                              encoder: Embeddings,
                              logger):
    """Score how close the generated questions are to the real question.

    Returns 0.0 (and logs a warning) when every generated QA pair is
    flagged non-committal (``noncommittal == "1"``); this is also the
    outcome for an empty ``qa_pairs``. Otherwise the original question and
    all generated questions are embedded in one batch and the mean cosine
    similarity between the original and the generated ones is returned.
    """
    every_noncommittal = all(pair.noncommittal == "1" for pair in qa_pairs)
    if every_noncommittal:
        logger.warning("cannot compute similarity, generated questions "
                       "are all non-committal")
        return 0.0

    # row 0 is the original question, the remaining rows are generated ones
    texts = [question] + [pair.question for pair in qa_pairs]
    matrix = np.array(encoder.embed_documents(texts))
    return _cosine_similarity(matrix[0, :], matrix[1:, :])
def compute_answer_similarity(ideal_answer: str,
                              predicted_answer: str,
                              encoder: Embeddings,
                              logger,
                              cross_encoder: bool = True) -> float:
    """Compute an embedding-based similarity between two answers.

    Args:
        ideal_answer: the reference answer.
        predicted_answer: the answer being evaluated.
        encoder: embedding model exposing ``embed_documents``.
        logger: logger used for debug / warning output.
        cross_encoder: when true, score at token level: every token of the
            ideal answer is matched to its most similar token (by cosine)
            in the predicted answer and the best-match similarities are
            averaged over the ideal-answer tokens. When false, the cosine
            similarity of the two whole-answer embeddings is used.

    Returns:
        The similarity as a plain ``float``. In token-level mode an answer
        without any token yields 0.0 instead of raising.
    """
    logger.debug(f"predicted answer: {predicted_answer}")
    logger.debug(f"ideal answer: {ideal_answer}")

    if cross_encoder:
        pa_words = nltk.word_tokenize(predicted_answer)
        ia_words = nltk.word_tokenize(ideal_answer)
        if not pa_words or not ia_words:
            # nothing to match against: avoids a division by zero and a
            # max() over an empty axis below
            logger.warning("cannot compute token-level similarity, "
                           "one of the answers has no tokens")
            return 0.0

        # embed both token lists in a single call, then split them again
        embs = encoder.embed_documents(pa_words + ia_words)
        pa_vecs = np.array(embs[:len(pa_words)])
        ia_vecs = np.array(embs[len(pa_words):])

        # pairwise cosine similarities, rows = ideal tokens,
        # columns = predicted tokens
        pairwise = np.dot(ia_vecs, pa_vecs.T) / (
            np.linalg.norm(ia_vecs, axis=1)[:, None] *
            np.linalg.norm(pa_vecs, axis=1)[None, :]
        )
        similarity = np.sum(np.max(pairwise, axis=1)) / len(ia_words)
    else:
        # use cosine similarity
        # NOTE: RAGAS docs mention using cross-encoder similarity but
        # the code uses cosine similarity
        embeddings = np.array(
            encoder.embed_documents([predicted_answer, ideal_answer]))
        source, target = embeddings[0, :], embeddings[1, :]
        similarity = np.dot(source, target) / (
            np.linalg.norm(source) * np.linalg.norm(target))

    return float(similarity)
{verdicts}") 59 | scores = [int(verdict.infer) for verdict in verdicts] 60 | return scores 61 | 62 | 63 | def _compute_content_precision(scores: List[int]) -> float: 64 | # precision@k (for k=1..K) discounted by by reciprocal of position 65 | weights = [sum(scores[:i + 1]) / (i + 1) * scores[i] 66 | for i in range(len(scores))] 67 | if len(scores) == 0: 68 | return 0.0 69 | context_precision = ( 70 | sum(w * s for w, s in zip(weights, scores)) / len(scores)) 71 | return context_precision 72 | 73 | 74 | async def compute_context_precision(question: str, 75 | context: List[str], 76 | answer: str, 77 | model: BaseChatModel, 78 | logger, 79 | parallel: bool = True) -> float: 80 | 81 | # prompt_template = read_template_from_file(PROMPT_CONTEXT_PRECISION) 82 | # prompt_cprec = PromptTemplate( 83 | # template=prompt_template, 84 | # input_variables=["question", "context", "answer"]) 85 | # chain_cprec = prompt_cprec | model | StrOutputParser() 86 | 87 | # verdicts = [] 88 | # if parallel: 89 | # tasks = [] 90 | # for ctx in context: 91 | # tasks.append(chain_cprec.ainvoke({ 92 | # "question": question, 93 | # "context": ctx, 94 | # "answer": answer 95 | # })) 96 | # responses = await asyncio.gather(*tasks) 97 | # for response in responses: 98 | # result = parse_response(response) 99 | # verdict = Verdict(**result.value["verdict"]) 100 | # verdicts.append(verdict) 101 | # else: 102 | # for ctx in context: 103 | # response = chain_cprec.invoke({ 104 | # "question": question, 105 | # "context": ctx, 106 | # "answer": answer 107 | # }) 108 | # result = parse_response(response) 109 | # verdicts.append(Verdict(**result.value["verdict"])) 110 | 111 | # logger.debug(f"verdicts: {verdicts}") 112 | precs = await _compute_usefulness_scores(question, context, answer, 113 | parallel, model, logger) 114 | context_precision = _compute_content_precision(precs) 115 | # # precision@k (for k=1..K) 116 | # scores = [int(verdict.infer) for verdict in verdicts] 117 | # # weighted by reciprocal 
async def _classify_ans_sents_attributable_to_context(answer_md: str,
                                                      context: List[str],
                                                      parallel: bool,
                                                      model,
                                                      logger) -> List[List[int]]:
    """Classify, per context chunk, which answer sentences it supports.

    The prompt in ``PROMPT_CLASSIFY_ATTRIB`` is run once per context chunk
    with the markdown-formatted answer.

    Args:
        answer_md: the answer as a markdown list of sentences.
        context: the context chunks.
        parallel: when true, all chunks are submitted concurrently through
            the chain's async API; otherwise they are processed one by one.
        model: the chat model the prompt is piped into.
        logger: logger used for debug output.

    Returns:
        One list per context chunk holding the integer ``infer`` value of
        every verdict parsed from the model response.
    """
    prompt_template = read_template_from_file(PROMPT_CLASSIFY_ATTRIB)
    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "answer"]
    )
    # named so it does not hide the module-level itertools.chain import
    attrib_chain = prompt | model | StrOutputParser()

    def _to_inferences(response) -> List[int]:
        # parse one model response into its list of 0/1 inferences
        result = parse_response(response)
        verdicts = parse_verdicts_from_result(result)
        return [int(verdict.infer) for verdict in verdicts]

    requests = [{"context": ctx, "answer": answer_md} for ctx in context]
    if parallel:
        # ainvoke returns awaitables; the synchronous invoke returns plain
        # strings, which asyncio.gather rejects
        responses = await asyncio.gather(
            *(attrib_chain.ainvoke(request) for request in requests))
        inferences = [_to_inferences(response) for response in responses]
    else:
        inferences = [_to_inferences(attrib_chain.invoke(request))
                      for request in requests]

    logger.debug(f"inferences: {inferences}")
    return inferences
def _convert_to_markdown_lists(context: List[str]
                               ) -> Tuple[int, List[str]]:
    """Convert every context chunk into a markdown list of its sentences.

    Args:
        context: the context chunks, one string per chunk.

    Returns:
        A pair ``(total_sents, context_markdowns)``: the number of sentences
        summed over all chunks, and the markdown list for each chunk in the
        same order as ``context``.
    """
    converted = [_convert_to_markdown_list(ctx) for ctx in context]
    total_sents = sum(num_sents for num_sents, _ in converted)
    context_markdowns = [context_md for _, context_md in converted]
    return total_sents, context_markdowns
necessary_sents.append([verdict.statement for verdict in verdicts 70 | if int(verdict.infer) == 1]) 71 | return necessary_sents 72 | 73 | 74 | def _compute_context_relevance_score(num_total_sents: int, 75 | necessary_sents: List[List[str]] 76 | ) -> float: 77 | num_necessary_sents = sum([len(ctx_sents) for ctx_sents in necessary_sents]) 78 | return num_necessary_sents / num_total_sents 79 | 80 | 81 | async def compute_context_relevance(question: str, 82 | context: List[str], 83 | model: BaseChatModel, 84 | logger, 85 | parallel: bool = True) -> float: 86 | 87 | num_total_sents, context_markdowns = _convert_to_markdown_lists(context) 88 | score = 0.0 89 | if num_total_sents > 0: 90 | necessary_sents = await _generate_necessity_verdicts( 91 | question, context_markdowns, parallel, model, logger) 92 | score = _compute_context_relevance_score( 93 | num_total_sents, necessary_sents) 94 | return score 95 | -------------------------------------------------------------------------------- /src/prompted/faithfulness.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from itertools import chain 4 | from langchain_core.output_parsers import StrOutputParser 5 | from langchain_core.language_models.chat_models import BaseChatModel 6 | from langchain.prompts import PromptTemplate 7 | from typing import List 8 | 9 | from .prompt_utils import read_template_from_file, parse_response, Verdict 10 | 11 | 12 | PROMPT_EXTRACT_STATEMENTS_FROM_ANSWER = "faithfulness_1.txt" 13 | PROMPT_INFER_ENTAILMENT_FROM_CONTEXT = "faithfulness_2.txt" 14 | 15 | 16 | def _reformat_statements_to_xml(statements: List[str]) -> str: 17 | statements_xml = [""] 18 | for statement in statements: 19 | statements_xml.append(f" {statement}") 20 | statements_xml.append("") 21 | return "\n".join(statements_xml) 22 | 23 | 24 | def _get_statements_from_answer(question: str, 25 | answer: str, 26 | model: BaseChatModel, 27 | logger) -> List[str]: 28 | 
async def _get_entailments_from_context(context: List[str],
                                        statements: List[str],
                                        model: BaseChatModel,
                                        logger,
                                        parallel: bool
                                        ) -> List[List[int]]:
    """Ask the model, per context chunk, which statements it entails.

    The statements are serialised once with ``_reformat_statements_to_xml``
    and the prompt in ``PROMPT_INFER_ENTAILMENT_FROM_CONTEXT`` is run for
    every context chunk.

    Args:
        context: the context chunks.
        statements: the statements extracted from the answer.
        model: the chat model the prompt is piped into.
        logger: logger used for debug output.
        parallel: when true, all chunks are submitted concurrently through
            the chain's async API; otherwise they are processed one by one.

    Returns:
        One list per context chunk holding the integer ``infer`` value of
        each verdict in the response. A response without any verdict
        contributes an empty list.
    """
    statements_xml = _reformat_statements_to_xml(statements)
    logger.debug(f"statements_xml: {statements_xml}")

    prompt_template = read_template_from_file(
        PROMPT_INFER_ENTAILMENT_FROM_CONTEXT)
    prompt_nli = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "statements_xml"])
    chain_nli = prompt_nli | model | StrOutputParser()

    def _to_entailments(response) -> List[int]:
        # parse one model response into its list of 0/1 entailments
        result = parse_response(response)
        logger.debug(f"entailment verdicts: {result}")
        verdicts_el = result.value["verdicts"]
        # an empty verdicts element is parsed as None
        verdicts = None if verdicts_el is None else verdicts_el["verdict"]
        if verdicts is None:
            return []
        if not isinstance(verdicts, list):
            # a single verdict is parsed as a dict rather than a list
            verdicts = [verdicts]
            logger.debug("made verdicts into []")
        return [int(Verdict(**v).infer) for v in verdicts]

    requests = [{"context": ctx, "statements_xml": statements_xml}
                for ctx in context]
    if parallel:
        responses = await asyncio.gather(
            *(chain_nli.ainvoke(request) for request in requests))
        entailments = [_to_entailments(response) for response in responses]
    else:
        entailments = [_to_entailments(chain_nli.invoke(request))
                       for request in requests]

    logger.debug(f"entailments: {entailments}")
    return entailments
def split_newlines(input_string: str) -> list:
    """Split the input string at newline characters and return a list of strings.

    A string without any newline comes back as a one-element list holding
    the string itself (``str.split`` already guarantees this, so no
    separate membership check is needed).
    """
    return input_string.split('\n')
def parse_verdicts_from_result(result) -> List[Verdict]:
    """Build ``Verdict`` objects from the ``verdicts`` element of a result.

    ``result.value["verdicts"]`` may be ``None`` (no verdicts: an empty
    list is returned), or a mapping whose ``verdict`` entry is either one
    dict (a single verdict) or an iterable of dicts (several verdicts).
    """
    container = result.value["verdicts"]
    if container is None:
        return []
    raw = container["verdict"]
    # a lone verdict arrives as a dict; treat it as a one-element sequence
    entries = [raw] if isinstance(raw, dict) else raw
    return [Verdict(**entry) for entry in entries]
| help="Use cross-encoder similarity scoring (default true)") 43 | parser.add_argument("--model-temp", type=float, required=False, 44 | help="The temperature of the model - between 0.0 and 1.0 (default 0.0)") 45 | parser.add_argument("--qs_to_skip", type=str, required=False, 46 | help="skip questions provided as a comma separated list of ints: '3,4,11'") 47 | parser.add_argument("--qs_to_use", type=str, required=False, 48 | help="only use questions provided as a comma separated list of ints: '3,4,11'") 49 | parser.add_argument("--debug", action="store_true", 50 | help="Turn debugging on (default: false)") 51 | 52 | args = parser.parse_args() 53 | metric = args.metric 54 | input_fp = args.input 55 | output_dir = args.output 56 | model_temp = args.model_temp 57 | if model_temp is None or model_temp > 1.0 or model_temp < 0.0: 58 | model_temp = 0.0 59 | debug = args.debug 60 | qs_to_skip = args.qs_to_skip 61 | if qs_to_skip is None: 62 | skip_qs = [] 63 | else: 64 | skip_qs = list(map(int, qs_to_skip.split(','))) 65 | 66 | qs_to_use = args.qs_to_use 67 | if qs_to_use is None: 68 | use_qs = [] 69 | else: 70 | use_qs = list(map(int, qs_to_use.split(','))) 71 | 72 | _ = load_dotenv(find_dotenv()) 73 | 74 | model = dspy.Google("models/gemini-1.0-pro", 75 | api_key=os.environ["GOOGLE_API_KEY"], 76 | max_output_tokens=1024, 77 | temperature=model_temp, 78 | safety_settings=safety_settings) 79 | dspy.settings.configure(lm=model) 80 | dspy.logger.level = logging.DEBUG if debug else logging.INFO 81 | 82 | encoder = GoogleGenerativeAIEmbeddings(model="models/embedding-001") 83 | 84 | os.makedirs(output_dir, exist_ok=True) 85 | output_fp = os.path.join(output_dir, f"{metric}_report.tsv") 86 | 87 | optimized_prompts = {} 88 | 89 | with open(input_fp, "r", encoding="utf-8") as fin, \ 90 | open(output_fp, "w", encoding="utf-8") as fout: 91 | 92 | fout.write("\t".join(["#QID", metric.upper()]) + "\n") 93 | for line in fin: 94 | record = json.loads(line) 95 | id = record["id"] 96 | # 
if int(id) % 4 != 0: 97 | # continue 98 | # use_qs will override skip_qs 99 | if int(id) in skip_qs and int(id) not in use_qs: 100 | continue 101 | question = record["query"] 102 | context = [ctx["chunk_text"] for ctx in record["context"]] 103 | answer = record["predicted_answer"] 104 | ideal_answer = record["ideal_answer"] 105 | 106 | 107 | match Metrics(metric): 108 | case Metrics.FAITHFULNESS: 109 | metric_value = compute_faithfulness( 110 | question, answer, context, optimized_prompts) 111 | case Metrics.ANSWER_RELEVANCE: 112 | metric_value = compute_answer_relevance( 113 | question, context, answer, optimized_prompts, 114 | encoder) 115 | case Metrics.CONTEXT_PRECISION: 116 | metric_value = compute_context_precision( 117 | question, ideal_answer, context, optimized_prompts) 118 | case Metrics.CONTEXT_UTILIZATION: 119 | metric_value = compute_context_precision( 120 | question, answer, context, optimized_prompts) 121 | case Metrics.CONTEXT_RELEVANCE: 122 | metric_value = compute_context_relevance( 123 | question, context, optimized_prompts) 124 | case Metrics.CONTEXT_RECALL: 125 | metric_value = compute_context_recall( 126 | context, answer, optimized_prompts) 127 | case Metrics.ANSWER_SIMILARITY: 128 | raise NotImplementedError( 129 | "Use prompted version of answer similarity") 130 | case Metrics.ANSWER_CORRECTNESS: 131 | metric_value = compute_answer_correctness( 132 | ideal_answer, answer, optimized_prompts) 133 | case _: 134 | print(f"Unsupported metric: {metric}") 135 | 136 | print(f"query ({id}): {question}, {metric}: {metric_value}") 137 | fout.write(f"{id}\t{metric_value:.3f}\n") 138 | 139 | if not debug: 140 | clean_up_log_files() 141 | 142 | if __name__ == "__main__": 143 | runner() 144 | -------------------------------------------------------------------------------- /src/run_prompted_metrics.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | import json 4 | import logging 5 
| import os 6 | 7 | from dotenv import find_dotenv, load_dotenv 8 | from langchain_google_genai import ( 9 | ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings 10 | ) 11 | 12 | from prompted.faithfulness import compute_faithfulness 13 | from prompted.answer_relevance import compute_answer_relevance 14 | from prompted.context_precision import compute_context_precision 15 | from prompted.context_relevance import compute_context_relevance 16 | from prompted.context_recall import compute_context_recall 17 | from prompted.answer_similarity import compute_answer_similarity 18 | from prompted.answer_correctness import compute_answer_correctness 19 | from metrics import Metrics 20 | 21 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO) 22 | 23 | # Safety config 24 | 25 | from google.generativeai.types.safety_types import HarmBlockThreshold, HarmCategory 26 | 27 | safety_settings = { 28 | HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_ONLY_HIGH, 29 | HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_ONLY_HIGH, 30 | HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_ONLY_HIGH, 31 | HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_ONLY_HIGH 32 | } 33 | 34 | async def runner(): 35 | 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument("--metric", type=str, 38 | choices=sorted([m.value for m in Metrics]), 39 | required=True, 40 | help="The metric to compute") 41 | parser.add_argument("--input", type=str, required=True, 42 | help="Full path to evaluation data in JSONL format") 43 | parser.add_argument("--output", type=str, required=True, 44 | help="Full path to output directory") 45 | parser.add_argument("--parallel", action="store_true", 46 | help="Run in parallel where possible (default false)") 47 | parser.add_argument("--cross-encoder", action="store_false", 48 | help="Use cross-encoder similarity scoring (default true)") 49 | parser.add_argument("--model-temp", 
type=float, required=False, 50 | help="The temperature of the model - between 0.0 and 1.0 (default 0.0)") 51 | parser.add_argument("--debug", action="store_true", 52 | help="Turn debugging on (default: false)") 53 | args = parser.parse_args() 54 | metric = args.metric 55 | input_fp = args.input 56 | output_dir = args.output 57 | run_in_parallel = args.parallel 58 | use_cross_encoder = args.cross_encoder 59 | model_temp = args.model_temp 60 | if model_temp is None or model_temp > 1.0 or model_temp < 0.0: 61 | model_temp = 0.0 62 | debug = args.debug 63 | 64 | logger = logging.getLogger(__name__) 65 | logger.setLevel(logging.DEBUG if debug else logging.INFO) 66 | 67 | _ = load_dotenv(find_dotenv()) 68 | 69 | model = ChatGoogleGenerativeAI( 70 | model="gemini-pro", 71 | api_key=os.environ["GOOGLE_API_KEY"], 72 | temperature=model_temp, 73 | safety_settings=safety_settings) 74 | encoder = GoogleGenerativeAIEmbeddings(model="models/embedding-001") 75 | 76 | os.makedirs(output_dir, exist_ok=True) 77 | output_fp = os.path.join(output_dir, f"{metric}_report.tsv") 78 | 79 | with open(input_fp, "r", encoding="utf-8") as fin, \ 80 | open(output_fp, "w", encoding="utf-8") as fout: 81 | 82 | fout.write("\t".join(["#QID", metric.upper()]) + "\n") 83 | for line in fin: 84 | record = json.loads(line) 85 | # extract relevant data to evaluate 86 | id = record["id"] 87 | if int(id) % 4 != 0: 88 | continue 89 | question = record["query"] 90 | context = [ctx["chunk_text"] for ctx in record["context"]] 91 | answer = record["predicted_answer"] 92 | ideal_answer = record["ideal_answer"] 93 | 94 | match Metrics(metric): 95 | case Metrics.FAITHFULNESS: 96 | metric_value = await compute_faithfulness( 97 | question, answer, context, model, logger, 98 | parallel=run_in_parallel) 99 | case Metrics.ANSWER_RELEVANCE: 100 | metric_value = await compute_answer_relevance( 101 | question, context, answer, model, encoder, logger, 102 | parallel=run_in_parallel) 103 | case Metrics.CONTEXT_PRECISION: 
104 | metric_value = await compute_context_precision( 105 | question, context, ideal_answer, model, logger, 106 | parallel=run_in_parallel) 107 | case Metrics.CONTEXT_UTILIZATION: 108 | metric_value = await compute_context_precision( 109 | question, context, answer, model, logger, 110 | parallel=run_in_parallel) 111 | case Metrics.CONTEXT_RELEVANCE: 112 | metric_value = await compute_context_relevance( 113 | question, context, model, logger, 114 | parallel=run_in_parallel) 115 | case Metrics.CONTEXT_RECALL: 116 | metric_value = await compute_context_recall( 117 | context, ideal_answer, model, logger, 118 | parallel=run_in_parallel) 119 | case Metrics.ANSWER_SIMILARITY: 120 | metric_value = compute_answer_similarity( 121 | answer, ideal_answer, encoder, logger, 122 | cross_encoder=use_cross_encoder) 123 | case Metrics.ANSWER_CORRECTNESS: 124 | metric_value = compute_answer_correctness( 125 | ideal_answer, answer, model, logger) 126 | case _: 127 | logger.error(f"Unsupported metric: {metric}") 128 | 129 | logger.info( 130 | f"query ({id}): {question}, {metric}: {metric_value}") 131 | fout.write(f"{id}\t{metric_value:.3f}\n") 132 | 133 | 134 | if __name__ == "__main__": 135 | asyncio.run(runner()) 136 | -------------------------------------------------------------------------------- /src/score_distributions.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import os 4 | 5 | 6 | REPORTS_DIR = "../resources/reports" 7 | LCEL_REPORTS_DIR = os.path.join(REPORTS_DIR, "lcel-reports") 8 | DSPY_REPORTS_DIR = os.path.join(REPORTS_DIR, "dspy-reports") 9 | 10 | 11 | def read_data(report_fp): 12 | scores = [] 13 | with open(report_fp) as f: 14 | for line in f: 15 | if line.startswith("#QID"): 16 | continue 17 | qid, score = line.strip().split("\t") 18 | scores.append(float(score)) 19 | return scores 20 | 21 | 22 | def bimodality(scores): 23 | """ Measure the "width" of the 
if __name__ == "__main__":
    # Compare the LCEL and DSPy score distributions metric by metric:
    # print a Markdown table of the bimodality() values and draw one
    # overlaid histogram per metric.

    # os.listdir() returns entries in arbitrary order and may include
    # unrelated files, so keep only report files and sort them to get a
    # reproducible table / subplot layout.
    report_fns = sorted(
        fn for fn in os.listdir(DSPY_REPORTS_DIR)
        if fn.endswith("_report.tsv"))

    # Four plots per row; grow the number of rows with the number of
    # reports instead of assuming at most eight.
    n_cols = 4
    n_rows = max(1, -(-len(report_fns) // n_cols))

    print("| Metric | LCEL | DSPy |")
    print("|--------|------|------|")

    plt.figure(figsize=(10, 5))
    for idx, report_fn in enumerate(report_fns):
        # "answer_relevance_report.tsv" -> "Answer Relevance"
        title = report_fn.split("_report")[0].replace("_", " ").title()
        # the LCEL report is looked up under the same file name
        dspy_report_fp = os.path.join(DSPY_REPORTS_DIR, report_fn)
        lcel_report_fp = os.path.join(LCEL_REPORTS_DIR, report_fn)
        dspy_scores = read_data(dspy_report_fp)
        lcel_scores = read_data(lcel_report_fp)
        lcel_bim = bimodality(lcel_scores)
        dspy_bim = bimodality(dspy_scores)
        print(f"| {title} | {lcel_bim:.3f} | {dspy_bim:.3f} |")
        plt.subplot(n_rows, n_cols, idx + 1)
        plt.hist(lcel_scores, bins=10, alpha=0.5, label="LCEL")
        plt.hist(dspy_scores, bins=10, alpha=0.5, label="DSPy")
        plt.title(title)
        if idx == 0:
            # one legend is enough; colours are the same in every subplot
            plt.legend(loc="best")

    plt.tight_layout()
    plt.show()
= _create_unverified_https_context 12 | 13 | 14 | # Download the necessary NLTK models (if not already downloaded) 15 | nltk.download('punkt') 16 | 17 | # Example usage of the tokenizer 18 | text = "Hello, how are you doing?" 19 | tokens = word_tokenize(text) 20 | print(tokens) 21 | 22 | print(nltk.data.path) --------------------------------------------------------------------------------