├── .gitignore
├── LICENSE
├── README.md
├── figs
    ├── ragas_metrics.png
    └── score_distribs.png
├── rag-data
    ├── amnesty_qa-new-01.jsonl
    ├── amnesty_qa-new-01.tsv
    ├── amnesty_qa-new-02.jsonl
    ├── more-q-01.jsonl
    └── output-01.tsv
├── requirements.txt
├── resources
    ├── configs
    │   ├── answer_correctness-best.json
    │   ├── answer_relevance-best.json
    │   ├── context_precision-best.json
    │   ├── context_recall-best.json
    │   ├── context_relevance-best.json
    │   └── faithfulness-best.json
    ├── demodata
    │   └── amnesty_qa.jsonl
    ├── prompts
    │   ├── answer_correctness_1.txt
    │   ├── answer_relevance_1.txt
    │   ├── answer_relevance_2.txt
    │   ├── context_precision_1.txt
    │   ├── context_recall_1.txt
    │   ├── context_relevance_1.txt
    │   ├── faithfulness_1.txt
    │   ├── faithfulness_2.txt
    │   └── more_questions.txt
    └── reports
    │   ├── dspy-reports
    │       ├── answer_correctness_report.tsv
    │       ├── answer_relevance_report.tsv
    │       ├── context_precision_report.tsv
    │       ├── context_recall_report.tsv
    │       ├── context_relevance_report.tsv
    │       ├── context_utilization_report.tsv
    │       └── faithfulness_report.tsv
    │   └── lcel-reports
    │       ├── answer_correctness_report.tsv
    │       ├── answer_relevance_report.tsv
    │       ├── answer_similarity_report.tsv
    │       ├── context_precision_report.tsv
    │       ├── context_recall_report.tsv
    │       ├── context_relevance_report.tsv
    │       ├── context_utilization_report.tsv
    │       └── faithfulness_report.tsv
└── src
    ├── convert_amnestyQA_to_desired_format.ipynb
    ├── generate_datasets.py
    ├── learned
        ├── __init__.py
        ├── answer_correctness.py
        ├── answer_relevance.py
        ├── context_precision.py
        ├── context_recall.py
        ├── context_relevance.py
        ├── faithfulness.py
        └── learning_utils.py
    ├── make_more_questions.py
    ├── metrics.py
    ├── prompted
        ├── __init__.py
        ├── answer_correctness.py
        ├── answer_relevance.py
        ├── answer_similarity.py
        ├── context_precision.py
        ├── context_recall.py
        ├── context_relevance.py
        ├── faithfulness.py
        ├── more_questions.py
        └── prompt_utils.py
    ├── run_learned_metrics.py
    ├── run_prompted_metrics.py
    ├── score_distributions.py
    ├── streamlit_app.py
    └── test_nltk.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
159 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | 
162 | # application specific
163 | **/data
164 | **/notes.txt
165 | **/try
166 | 
167 | # mac thing
168 | .DS_Store
169 | **/.DS_Store


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # llm-rag-eval
  2 | 
  3 | Large Language Model (LLM) powered evaluator for Retrieval Augmented Generation (RAG) pipelines.
  4 | 
  5 | <img src="figs/ragas_metrics.png"/>
  6 | 
  7 | ## Google AI Hackathon Submission
  8 | 
  9 | [Submission Video Link](https://youtu.be/yOIU65chc98)
 10 | 
 11 | ### Inspiration
 12 | 
 13 | Our project is inspired by the [RAGAS](https://github.com/explodinggradients/ragas) project which defines and implements 8 metrics to evaluate inputs and outputs of a Retrieval Augmented Generation (RAG) pipeline, and by ideas from the [ARES paper](https://arxiv.org/abs/2311.09476), which attempts to calibrate these LLM evaluators against human evaluators.
 14 | 
 15 | ### What it does
 16 | 
 17 | It provides an LLM-based framework to evaluate the performance of RAG systems using a set of metrics that are optimized for the application domain in which the RAG system operates. We use Gemini Pro 1.0 from Google AI as the framework's LLM, and the Google AI embedding model to generate embeddings for some of the metrics.
 18 | 
 19 | ### How we built it
 20 | 
 21 | 1. We re-implemented the RAGAS metrics using [LangChain Expression Language (LCEL)](https://python.langchain.com/docs/expression_language/) so we could access outputs of intermediate steps in metrics calculation.
 22 | 2. We then implemented the metrics using [DSPy (Declarative Self-improving Language Programs in Python)](https://github.com/stanfordnlp/dspy) and optimized the prompts to minimize score difference with LCEL using a subset of examples for Few Shot Learning (using Bootstrap Few Shot with Random Search).
 23 | 3. We evaluated the confidence of scores produced by LCEL and DSPy metric implementations.
 24 | 4. We are building a tool that allows human oversight on the LCEL outputs (including intermediate steps) for Active Learning supervision.
 25 | 5. We will re-optimize the DSPy metrics using recalculated scores based on tool updates.
 26 | 
 27 | ### Challenges we ran into
 28 | 
 29 | * DSPy has a steep learning curve and it is still a work in progress, so some parts of it don't work as expected
 30 | * Our project grew iteratively as our understanding of the problem space grew, so we had to do some steps sequentially, leading to wasted time
 31 | 
 32 | ### Accomplishments that we're proud of
 33 | 
 34 | * How team members from different parts of the world came together and pooled their skills towards our common goal of building a set of domain optimized metrics.
 35 | 
 36 | ### What we learned
 37 | 
 38 | * We gained greater insight into the RAGAS metrics once we implemented them ourselves. We gained additional insight when building the tool using the intermediate outputs.
 39 | * Our team was not familiar with DSPy at all; we learned to use it and are very impressed with its capabilities.
 40 | 
 41 | ### What's next for llm-rag-eval
 42 | 
 43 | We notice that most of our metrics involve predictive steps, where we predict a binary outcome given a pair of strings. These seem like variants of NLI (Natural Language Inference) which could be handled by non-LLM models, which are not only cheaper but also don't suffer from hallucinations, leading to more repeatable evaluations. It will require more data to train them, so we are starting to generate synthetic data, but this has other dependencies before we can start to offload these steps to smaller models.
 44 | 
 45 | 
 46 | ## Running Prompted RAGAS metrics
 47 | 
 48 | The following RAGAS metrics have been (re-)implemented in this project (because
 49 | we had trouble making them work as-is, and because they are conceptually quite
 50 | simple to implement).
 51 | 
 52 | * [Faithfulness](https://docs.ragas.io/en/stable/concepts/metrics/faithfulness.html) -- given question, context and answer, measures factual consistency of the answer against the given context.
 53 | * [Answer Relevance](https://docs.ragas.io/en/stable/concepts/metrics/answer_relevance.html) -- given question, context and answer, measures how pertinent the answer is to the question.
 54 | * [Context Precision](https://docs.ragas.io/en/stable/concepts/metrics/context_precision.html) -- given question, context and ground truth, measures whether statements in the ground truth are found in context.
 55 | * Context Utilization -- same as Context Precision, using the predicted answer instead of the ground truth.
 56 | * [Context Relevance](https://docs.ragas.io/en/stable/concepts/metrics/context_relevancy.html) -- given question and context, measures relevance of the context against the question.
 57 | * [Context Recall](https://docs.ragas.io/en/stable/concepts/metrics/context_recall.html) -- given the context and answer, measures extent to which context aligns with the answer.
 58 | * [Answer Similarity](https://docs.ragas.io/en/stable/concepts/metrics/semantic_similarity.html) -- given answer and ground truth, measures semantic similarity between them (cosine or cross encoder similarity).
 59 | * [Answer Correctness](https://docs.ragas.io/en/stable/concepts/metrics/answer_correctness.html) -- given answer and ground truth, measures the accuracy given facts in these two texts.
 60 | 
 61 | The metrics described above can be run against your dataset by calling the `run_prompted_metrics.py` script in the `src` directory, with the path to the input (provided in JSON-L), the path to the output (TSV) file, and the appropriate metric name.
 62 | 
 63 | ```
 64 | $ python3 run_prompted_metrics.py --help
 65 | usage: run_prompted_metrics.py [-h] --metric
 66 |                                {answer_correctness,answer_relevance,answer_similarity,context_precision,context_recall,context_relevance,context_utilization,faithfulness}
 67 |                                --input INPUT_JSONL [--output OUTPUT_TSV] [--parallel] [--cross-encoder]
 68 | 
 69 | options:
 70 |   -h, --help            show this help message and exit
 71 |   --metric {answer_correctness,answer_relevance,answer_similarity,context_precision,context_recall,context_relevance,context_utilization,faithfulness}
 72 |                         The metric to compute
 73 |   --input INPUT_JSONL
 74 |                         Full path to evaluation data in JSONL format
 75 |   --output OUTPUT_TSV
 76 |                         Full path to output TSV file
 77 |   --parallel            Run in parallel where possible (default false)
 78 |   --cross-encoder       Use cross-encoder similarity scoring (default false)
 79 | ```
 80 | 
 81 | Ideally, we want to generate metrics from a running RAG pipeline, but in order to simplify the development process, we have isolated the evaluation functionality, feeding it the input it needs via a JSON-L file. Each line of the JSON-L file represents a single RAG transaction. The required fields are as follows.
 82 | 
 83 | ```
 84 | {
 85 |     "id": {qid: int},
 86 |     "query": {query: str},
 87 |     "context": [
 88 |         {
 89 |             "id": {chunk_id: str},
 90 |             "chunk_text": {chunk_text: str},
 91 |             ... other relevant fields
 92 |         },
 93 |         ... more context elements
 94 |     ],
 95 |     "ideal_answer": {ground_truth: str},
 96 |     "predicted_answer": {answer: str}
 97 | }
 98 | ```
 99 | 
100 | We have used the [AmnestyQA](https://huggingface.co/datasets/explodinggradients/amnesty_qa) dataset on HuggingFace as our reference dataset. You can find a copy of that data, in the format described above, in this repository (see `resources/demodata/amnesty_qa.jsonl`).
101 | 
102 | ## Running Learned RAGAS metrics
103 | 
104 | We have used DSPy to optimize our prompts for the AmnestyQA dataset. At a very high level, this involves using random subsets of the training data (in our case outputs from our prompted RAG metrics) and finding the subset of examples that produces the best-optimized prompt.
105 | 
106 | The DSPy implementations look for the optimized configuration in `resources/configs`. If an implementation doesn't find one, it looks for the dataset so that it can optimize itself first. We have provided configurations for RAGAS metrics prompts optimized for AmnestyQA, but you probably need to generate optimized versions for your own dataset. To do so, you need to run the `generate_datasets.py` script, which will extract data from running the RAGAS prompts against the LLM and write it out as a JSON file into `data/dspy-datasets` (that's where the DSPy fine-tuning code expects to find it). The command to generate a dataset to fine-tune a DSPy prompt for a particular metric is shown below:
107 | 
108 | ```
109 | $ python3 generate_datasets.py --help
110 | usage: generate_datasets.py [-h] --metric
111 |                             {answer_correctness,answer_relevance,answer_similarity,context_precision,context_recall,context_relevance,context_utilization,faithfulness}
112 |                             --input INPUT --output OUTPUT [--parallel] [--debug]
113 | 
114 | options:
115 |   -h, --help            show this help message and exit
116 |   --metric {answer_correctness,answer_relevance,answer_similarity,context_precision,context_recall,context_relevance,context_utilization,faithfulness}
117 |                         The metric to generate datasets for
118 |   --input INPUT         Full path to input JSONL file
119 |   --output OUTPUT       Full path to output directory
120 |   --parallel            Run in parallel where possible (default false)
121 |   --debug               Turn debugging on (default: false)
122 | ```
123 | 
124 | To re-run the optimization locally, remove the configuration file for the metric from the `resources/configs` directory. The next time you run `run_learned_metrics.py` it will re-optimize (this is a fairly lengthy process but doesn't require a GPU). Leave the config file alone to re-use the prompt optimized for AmnestyQA.
125 | 
126 | ```
127 | $ python3 run_learned_metrics.py --help
128 | usage: run_learned_metrics.py [-h] --metric
129 |                               {answer_correctness,answer_relevance,answer_similarity,context_precision,context_recall,context_relevance,context_utilization,faithfulness}
130 |                               --input INPUT [--output OUTPUT] [--cross-encoder] [--model-temp MODEL_TEMP]
131 | 
132 | options:
133 |   -h, --help            show this help message and exit
134 |   --metric {answer_correctness,answer_relevance,answer_similarity,context_precision,context_recall,context_relevance,context_utilization,faithfulness}
135 |                         The metric to compute
136 |   --input INPUT         Full path to evaluation data in JSONL format
137 |   --output OUTPUT       Full path to output TSV file
138 |   --cross-encoder       Use cross-encoder similarity scoring (default true)
139 |   --model-temp MODEL_TEMP
140 |                         The temperature of the model - between 0.0 and 1.0 (default 0.0)
141 | ```
142 | 
143 | ## Comparing Prompted vs Learned Metrics
144 | 
145 | After computing the different metrics using the "prompted" approach using LCEL and the "optimized from data" approach using DSPy, we plot the scores as shown below.
146 | 
147 | <img src="figs/score_distribs.png"/>
148 | 
149 | Visually, at least for some of the metrics, the DSPy approach seems to produce scores whose histograms are narrower around 0 and 1. In order to quantify this intuition, we measure deviations of the scores from 0.5 on the upper and lower side, then compute the standard deviation of the deviations for both LCEL and DSPy scores for each metric. The results are summarized below.
150 | 
151 | | Metric | LCEL | DSPy |
152 | |--------|------|------|
153 | | Faithfulness | 0.162 | **0.000** |
154 | | Context Relevance | 0.191 | **0.184** |
155 | | Answer Correctness | 0.194 | **0.193** |
156 | | Context Recall | 0.191 | **0.183** |
157 | | Context Utilization | **0.186** | 0.189 |
158 | | Context Precision | 0.191 | **0.178** |
159 | | Answer Relevance | **0.049** | 0.064 |
160 | 
161 | As can be seen, for most of the metrics, optimized prompts from DSPy produce more confident scores. In many cases, the difference is quite small, which may be attributed to the relatively few examples we are working with.
162 | 


--------------------------------------------------------------------------------
/figs/ragas_metrics.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sujitpal/llm-rag-eval/8936ae484d7ab78c9debfcdabc3a2f14eea38ac2/figs/ragas_metrics.png


--------------------------------------------------------------------------------
/figs/score_distribs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sujitpal/llm-rag-eval/8936ae484d7ab78c9debfcdabc3a2f14eea38ac2/figs/score_distribs.png


--------------------------------------------------------------------------------
/rag-data/amnesty_qa-new-01.tsv:
--------------------------------------------------------------------------------
 1 | #QID	FAITHFULNESS
 2 | 20	1.000
 3 | 21	1.000
 4 | 22	1.000
 5 | 23	1.000
 6 | 24	1.000
 7 | 25	1.000
 8 | 26	1.000
 9 | 27	1.000
10 | 28	1.000
11 | 29	1.000
12 | 30	1.000
13 | 31	1.000
14 | 32	1.000
15 | 33	1.000
16 | 34	1.000
17 | 35	1.000
18 | 36	1.000
19 | 37	1.000
20 | 38	1.000
21 | 39	1.000
22 | 40	0.500
23 | 41	0.500
24 | 42	1.000
25 | 43	0.500
26 | 44	1.000
27 | 45	1.000
28 | 46	0.500
29 | 47	1.000
30 | 48	1.000
31 | 49	1.000
32 | 50	1.000
33 | 51	1.000
34 | 52	1.000
35 | 53	1.000
36 | 54	1.000
37 | 55	0.400
38 | 56	1.000
39 | 


--------------------------------------------------------------------------------
/rag-data/amnesty_qa-new-02.jsonl:
--------------------------------------------------------------------------------
1 | {"query": "What is the Roe effect?", "predicted_answer": "The Roe effect is a hypothesis that suggests that the practice of abortion will eventually lead to the restriction or illegalization of abortion due to the erosion of the political base of abortion rights supporters.", "context": [{"id": "1", "chunk_text": "Roe effect | The Roe effect is a hypothesis about the long-term effect of abortion on the political balance of the United States, which suggests that since supporters of abortion rights cause the erosion of their own political base, the practice of abortion will eventually lead to the restriction or illegalization of abortion. It is named after \"Roe v. Wade\", the U.S. Supreme Court case that effectively legalized abortion nationwide in the U.S. Its best-known proponent is James Taranto of the \"Wall Street Journal\" who coined the phrase \"Roe effect\" in \"Best of the Web Today\", his OpinionJournal.com column."}], "id": 20}
2 | {"query": "What is the purpose of the Human Life Amendment?", "predicted_answer": "The Human Life Amendment is a proposed amendment to the United States Constitution that would overturn the Supreme Court's 1973 \"Roe v. Wade\" decision, which legalized abortion nationwide. The amendment would prohibit both Congress and the states from legalizing abortion.", "context": [{"id": "1", "chunk_text": "Human Life Amendment | The Human Life Amendment is the name of multiple proposals to amend the United States Constitution that would have the effect of overturning the Supreme Court 1973 decision \"Roe v. Wade\", which ruled that prohibitions against abortion were unconstitutional. All of these amendment proposals seek to overturn \"Roe v. Wade\", but most of them go further by forbidding both Congress and the states from legalizing abortion. Some of the proposals define human life as beginning with conception or fertilization."}], "id": 21}
3 | {"query": "What is the purpose of greenhouse gas accounting?", "predicted_answer": "Greenhouse gas accounting is a way to inventory and audit greenhouse gas (GHG) emissions. It quantifies the total greenhouse gases produced directly and indirectly from a business or organization's activities, providing information for understanding and managing climate change impacts.", "context": [{"id": "1", "chunk_text": "Greenhouse gas accounting | Greenhouse gas accounting describes the way to inventory and audit greenhouse gas (GHG) emissions. A corporate or organisational greenhouse gas (GHG) emissions assessment quantifies the total greenhouse gases produced directly and indirectly from a business or organisation\u2019s activities. Also known as a carbon footprint, it is a business tool that provides information with a basis for understanding and managing climate change impacts."}], "id": 22}
4 | {"query": "What was the outcome of the American Electric Power Co. v. Connecticut case?", "predicted_answer": "In American Electric Power Company v. Connecticut, the U.S. Supreme Court ruled that corporations cannot be sued for greenhouse gas emissions under federal common law because the Clean Air Act delegates the management of carbon dioxide and other GHG emissions to the Environmental Protection Agency (EPA).", "context": [{"id": "1", "chunk_text": "American Electric Power Co. v. Connecticut | American Electric Power Company v. Connecticut, 564 U.S. 410 (2011) , was a United States Supreme Court case in which the Court, in an 8\u20130 decision, held that corporations cannot be sued for greenhouse gas emissions (GHGs) under federal common law, primarily because the Clean Air Act (CAA) delegates the management of carbon dioxide and other GHG emissions to the Environmental Protection Agency (EPA). Brought to court in July 2004 in the Southern District of New York, this was the first global warming case based on a public nuisance claim."}], "id": 23}
5 | {"query": "What is the primary activity of the Carbon Disclosure Project (CDP)?", "predicted_answer": "The primary activity of the Carbon Disclosure Project (CDP) is to disclose the greenhouse gas (GHG) emissions of major corporations.", "context": [{"id": "1", "chunk_text": "Carbon Disclosure Project | The CDP (formerly the \"Carbon Disclosure Project\") is an organisation based in the United Kingdom which works with shareholders and corporations to disclose the greenhouse gas (GHG) emissions of major corporations. As disclosure of emission related data as CDP's primary activity, the quality of the data reported to CDP is key."}], "id": 24}
6 | {"query": "Where is the Weyburn-Midale Carbon Dioxide Project located?", "predicted_answer": "The Weyburn-Midale Carbon Dioxide Project is located in Midale, Saskatchewan, Canada.", "context": [{"id": "1", "chunk_text": "Weyburn-Midale Carbon Dioxide Project | The Weyburn-Midale Carbon Dioxide Project (or IEA GHG Weyburn-Midale CO Monitoring and Storage Project) is, as of 2008, the world's largest carbon capture and storage project. It is located in Midale, Saskatchewan, Canada."}], "id": 25}
7 | {"query": "What is Owens Wiwa's area of expertise?", "predicted_answer": "Owens Wiwa is an internationally renowned expert on the effects of globalization, especially as it relates to the business practices of Royal Dutch Shell in the Niger Delta.", "context": [{"id": "1", "chunk_text": "Owens Wiwa | Monday Owens Wiwa (born 10 October 1957 in Bori, Nigeria) is a medical doctor and human rights activist. He is the brother of executed Ogoni leader Ken Saro-Wiwa, and the son of Ogoni chief Jim Wiwa. Wiwa is an internationally renowned expert on the effects of globalisation, especially as it relates to the highly controversial business practices of Royal Dutch Shell in the Niger Delta. Vice-chairman of the Toronto chapter of the Sierra Club Canada and an active member of Amnesty International, Wiwa is frequently called upon to advocate for development programs in Canada and abroad and to campaign for increased corporate responsibility. This work has taken him to Ireland, which he visits in support of the Shell to Sea campaign. Currently, he is the Country Director for the Clinton Health Access Initiative in Nigeria."}], "id": 26}
8 | {"query": "What did Amnesty International do in response to the Zakoora and Tengpora Massacre?", "predicted_answer": "Amnesty International issued an appeal for urgent action on Kashmir.", "context": [{"id": "1", "chunk_text": "Zakoora and Tengpora massacre | The Zukoora And Tengpora Massacre was the killing of protesters calling for the implementation of a United Nations resolution regarding a plebiscite in Kashmir at Zakoora Crossing and Tengpora Bypass Road in Srinagar on 1 March 1990, in which 33 people were killed and 47 injured. It led Amnesty International to issue an appeal for urgent action on Kashmir."}], "id": 27}
9 | 


--------------------------------------------------------------------------------
/rag-data/more-q-01.jsonl:
--------------------------------------------------------------------------------
1 | {"query": "What is the Roe effect?", "predicted_answer": "The Roe effect is a hypothesis that suggests that the practice of abortion will eventually lead to the restriction or illegalization of abortion due to the erosion of the political base of abortion rights supporters.", "ideal_answer": "The Roe effect is a hypothesis that suggests that the practice of abortion will eventually lead to the restriction or illegalization of abortion due to the erosion of the political base of abortion rights supporters.", "context": [{"id": "0", "chunk_text": "Roe effect | The Roe effect is a hypothesis about the long-term effect of abortion on the political balance of the United States, which suggests that since supporters of abortion rights cause the erosion of their own political base, the practice of abortion will eventually lead to the restriction or illegalization of abortion."}], "id": 0}
2 | {"query": "What is the purpose of the Human Life Amendment?", "predicted_answer": "The Human Life Amendment is a proposal to amend the United States Constitution to overturn the Supreme Court's Roe v. Wade decision, which legalized abortion nationwide, and to forbid both Congress and the states from legalizing abortion.", "ideal_answer": "The Human Life Amendment is a proposal to amend the United States Constitution to overturn the Supreme Court's Roe v. Wade decision, which legalized abortion nationwide, and to forbid both Congress and the states from legalizing abortion.", "context": [{"id": "0", "chunk_text": "Human Life Amendment | The Human Life Amendment is the name of multiple proposals to amend the United States Constitution that would have the effect of overturning the Supreme Court 1973 decision \"Roe v. Wade\", which ruled that prohibitions against abortion were unconstitutional. All of these amendment proposals seek to overturn \"Roe v. Wade\", but most of them go further by forbidding both Congress and the states from legalizing abortion."}], "id": 1}
3 | {"query": "What was the ruling in United States v. Vuitch?", "predicted_answer": "In United States v. Vuitch, the Supreme Court held that the District of Columbia's abortion law banning the practice except when necessary for the health or life of the woman was not unconstitutionally vague.", "ideal_answer": "In United States v. Vuitch, the Supreme Court held that the District of Columbia's abortion law banning the practice except when necessary for the health or life of the woman was not unconstitutionally vague.", "context": [{"id": "0", "chunk_text": "United States v. Vuitch | United States v. Vuitch, 402 U.S. 62 (1971) was a United States Supreme Court abortion rights case, which held that the District of Columbia's abortion law banning the practice except when necessary for the health or life of the woman was not unconstitutionally vague."}], "id": 2}
4 | {"query": "What is the purpose of greenhouse gas accounting?", "predicted_answer": "Greenhouse gas accounting is a way to inventory and audit greenhouse gas (GHG) emissions. It quantifies the total greenhouse gases produced directly and indirectly from a business or organization's activities, providing information for understanding and managing climate change impacts.", "ideal_answer": "Greenhouse gas accounting is a way to inventory and audit greenhouse gas (GHG) emissions. It quantifies the total greenhouse gases produced directly and indirectly from a business or organization's activities, providing information for understanding and managing climate change impacts.", "context": [{"id": "0", "chunk_text": "Greenhouse gas accounting | Greenhouse gas accounting describes the way to inventory and audit greenhouse gas (GHG) emissions. A corporate or organisational greenhouse gas (GHG) emissions assessment quantifies the total greenhouse gases produced directly and indirectly from a business or organisation\u2019s activities. Also known as a carbon footprint, it is a business tool that provides information with a basis for understanding and managing climate change impacts."}], "id": 3}
5 | {"query": "What was the outcome of the American Electric Power Co. v. Connecticut case?", "predicted_answer": "In American Electric Power Company v. Connecticut, the U.S. Supreme Court ruled that corporations cannot be sued for greenhouse gas emissions under federal common law because the Clean Air Act delegates the management of carbon dioxide and other GHG emissions to the Environmental Protection Agency (EPA).", "ideal_answer": "In American Electric Power Company v. Connecticut, the U.S. Supreme Court ruled that corporations cannot be sued for greenhouse gas emissions under federal common law because the Clean Air Act delegates the management of carbon dioxide and other GHG emissions to the Environmental Protection Agency (EPA).", "context": [{"id": "0", "chunk_text": "American Electric Power Co. v. Connecticut | American Electric Power Company v. Connecticut, 564 U.S. 410 (2011) , was a United States Supreme Court case in which the Court, in an 8\u20130 decision, held that corporations cannot be sued for greenhouse gas emissions (GHGs) under federal common law, primarily because the Clean Air Act (CAA) delegates the management of carbon dioxide and other GHG emissions to the Environmental Protection Agency (EPA). Brought to court in July 2004 in the Southern District of New York, this was the first global warming case based on a public nuisance claim."}], "id": 4}
6 | {"query": "What is the primary activity of the Carbon Disclosure Project (CDP)?", "predicted_answer": "The Carbon Disclosure Project (CDP) works with shareholders and corporations to disclose the greenhouse gas (GHG) emissions of major corporations.", "ideal_answer": "The Carbon Disclosure Project (CDP) works with shareholders and corporations to disclose the greenhouse gas (GHG) emissions of major corporations.", "context": [{"id": "0", "chunk_text": "Carbon Disclosure Project | The CDP (formerly the \"Carbon Disclosure Project\") is an organisation based in the United Kingdom which works with shareholders and corporations to disclose the greenhouse gas (GHG) emissions of major corporations. As disclosure of emission related data as CDP\\'s primary activity, the quality of the data reported to CDP is key."}], "id": 5}
7 | 


--------------------------------------------------------------------------------
/rag-data/output-01.tsv:
--------------------------------------------------------------------------------
1 | #QID	CONTEXT_RECALL
2 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | absl-py
 2 | accelerate
 3 | Jinja2
 4 | dspy-ai
 5 | dspy-ai[chromadb]
 6 | evaluate
 7 | google-generativeai
 8 | langchain
 9 | langchain-google-genai
10 | nltk
11 | pillow
12 | polars
13 | rouge-score
14 | scikit-learn
15 | tensorboardX
16 | transformers[torch]
17 | xmltodict
18 | streamlit


--------------------------------------------------------------------------------
/resources/configs/answer_correctness-best.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "fact_extractor": {
 3 |     "lm": null,
 4 |     "traces": [],
 5 |     "train": [],
 6 |     "demos": [
 7 |       {
 8 |         "augmented": true,
 9 |         "answer": "The consequences of criminalizing abortion for marginalized individuals include increased stigma, lack of information, and disinformation. This can have severe and irreversible effects on these individuals. Girls and young women may be forced to carry pregnancies resulting from sexual violence due to a lack of knowledge about their rights. Marginalized individuals, such as those living in poverty, historically discriminated against, Indigenous and Afro-descendent women, migrants, and refugees, are disproportionately affected by abortion criminalization. The criminalization of abortion is a major factor contributing to the high number of unsafe abortions, which leads to increased maternal mortality and morbidity. Access to health services is undermined, resulting in preventable maternal deaths and complications. Marginalized individuals are forced to resort to unsafe clandestine abortion methods, putting their lives and health at risk. In Nigeria, restrictive abortion laws make it difficult to access safe abortion care.",
10 |         "facts": "1. Criminalizing abortion increases stigma for marginalized individuals.\n2. Criminalizing abortion leads to a lack of information and disinformation for marginalized individuals.\n3. Girls and young women may be forced to carry pregnancies resulting from sexual violence due to a lack of knowledge about their rights.\n4. Marginalized individuals are disproportionately affected by abortion criminalization.\n5. Criminalization of abortion contributes to the high number of unsafe abortions.\n6. Unsafe abortions lead to increased maternal mortality and morbidity.\n7. Access to health services is undermined by abortion criminalization.\n8. Marginalized individuals are forced to resort to unsafe clandestine abortion methods.\n9. Restrictive abortion laws in Nigeria make it difficult to access safe abortion care."
11 |       },
12 |       {
13 |         "augmented": true,
14 |         "answer": "Criminalizing abortion can have severe consequences for marginalized individuals, particularly those who are already facing various forms of discrimination and inequality. Some of the consequences include:\n\n1. Health risks: When abortion is criminalized, individuals may resort to unsafe and illegal methods to terminate their pregnancies, leading to serious health complications and even death. Marginalized individuals, who often have limited access to healthcare and resources, are more likely to be affected by these risks.\n\n2. Economic burden: Marginalized individuals, who are more likely to face financial constraints, may struggle to afford safe and legal abortions in countries where it is criminalized. This can lead to increased financial burdens, as they may have to travel long distances or seek expensive underground services.\n\n3. Limited reproductive choices: Criminalizing abortion restricts individuals' reproductive choices and autonomy. Marginalized individuals, who already face various barriers to accessing healthcare and education, may be further limited in their ability to make informed decisions about their reproductive health.\n\n4. Increased inequality: Criminalizing abortion disproportionately affects marginalized individuals, exacerbating existing inequalities. Women from low-income backgrounds, racial and ethnic minorities, LGBTQ+ individuals, and those living in rural areas often face additional barriers to accessing safe and legal abortion services, further widening the gap between privileged and marginalized groups.\n\n5. Criminalization of pregnancy: In some cases, criminalizing abortion can lead to the criminalization of pregnancy itself. This can result in the surveillance and punishment of pregnant individuals, particularly those from marginalized communities, who may be subjected to invasive monitoring, discrimination, and even imprisonment for actions perceived as harmful to the fetus.\n\n6. 
Mental health impact: The criminalization of abortion can have significant psychological consequences for marginalized individuals. The stigma and shame associated with seeking illegal abortions or being unable to access safe services can lead to increased stress, anxiety, and depression.\n\n7. Reproductive injustice: Criminalizing abortion perpetuates reproductive injustice by denying marginalized individuals the right to control their own bodies and make decisions about their reproductive health. It reinforces systemic inequalities and denies them the same reproductive rights and freedoms enjoyed by more privileged individuals.\n\nIt is important to note that these consequences are not limited to criminalizing abortion alone but are often intertwined with broader social, economic, and healthcare disparities faced by marginalized communities.",
15 |         "facts": "1. Criminalizing abortion can lead to unsafe and illegal methods of termination, resulting in health complications and death, especially for marginalized individuals.\n2. Marginalized individuals may face financial burdens due to limited access to safe and legal abortions in criminalized countries.\n3. Criminalizing abortion restricts reproductive choices and autonomy, particularly for marginalized individuals who face barriers to healthcare and education.\n4. Criminalizing abortion exacerbates existing inequalities, disproportionately affecting marginalized groups such as low-income women, racial and ethnic minorities, LGBTQ+ individuals, and rural residents.\n5. In some cases, criminalizing abortion can lead to the criminalization of pregnancy itself, resulting in surveillance, punishment, and discrimination against pregnant individuals from marginalized communities.\n6. The criminalization of abortion can have significant mental health impacts, including increased stress, anxiety, and depression, for marginalized individuals.\n7. Criminalizing abortion perpetuates reproductive injustice by denying marginalized individuals control over their bodies and reproductive health decisions.\n8. The consequences of criminalizing abortion are intertwined with broader social, economic, and healthcare disparities faced by marginalized communities.\n9. Criminalizing abortion can lead to increased inequality between privileged and marginalized groups.\n10. Criminalizing abortion can result in the denial of reproductive rights and freedoms for marginalized individuals."
16 |       }
17 |     ],
18 |     "signature_instructions": " Extract 5-10 facts from the answer",
19 |     "signature_prefix": "Facts:"
20 |   },
21 |   "fact_grouper": {
22 |     "lm": null,
23 |     "traces": [],
24 |     "train": [],
25 |     "demos": [
26 |       {
27 |         "augmented": true,
28 |         "facts_g": "1. Criminalizing abortion increases stigma for marginalized individuals.\n2. Criminalizing abortion leads to a lack of information and disinformation for marginalized individuals.\n3. Girls and young women may be forced to carry pregnancies resulting from sexual violence due to a lack of knowledge about their rights.\n4. Marginalized individuals are disproportionately affected by abortion criminalization.\n5. Criminalization of abortion contributes to the high number of unsafe abortions.\n6. Unsafe abortions lead to increased maternal mortality and morbidity.\n7. Access to health services is undermined by abortion criminalization.\n8. Marginalized individuals are forced to resort to unsafe clandestine abortion methods.\n9. Restrictive abortion laws in Nigeria make it difficult to access safe abortion care.",
29 |         "facts_a": "1. Criminalizing abortion can lead to unsafe and illegal methods of termination, resulting in health complications and death, especially for marginalized individuals.\n2. Marginalized individuals may face financial burdens due to limited access to safe and legal abortions in criminalized countries.\n3. Criminalizing abortion restricts reproductive choices and autonomy, particularly for marginalized individuals who face barriers to healthcare and education.\n4. Criminalizing abortion exacerbates existing inequalities, disproportionately affecting marginalized groups such as low-income women, racial and ethnic minorities, LGBTQ+ individuals, and rural residents.\n5. In some cases, criminalizing abortion can lead to the criminalization of pregnancy itself, resulting in surveillance, punishment, and discrimination against pregnant individuals from marginalized communities.\n6. The criminalization of abortion can have significant mental health impacts, including increased stress, anxiety, and depression, for marginalized individuals.\n7. Criminalizing abortion perpetuates reproductive injustice by denying marginalized individuals control over their bodies and reproductive health decisions.\n8. The consequences of criminalizing abortion are intertwined with broader social, economic, and healthcare disparities faced by marginalized communities.\n9. Criminalizing abortion can lead to increased inequality between privileged and marginalized groups.\n10. Criminalizing abortion can result in the denial of reproductive rights and freedoms for marginalized individuals.",
30 |         "rationale": "produce the fact_groups. \n\n1. We start by comparing the facts from the ground truth (Facts G) with the facts from the answer (Facts A).\n2. For each fact in Facts G, we check if it is also present in Facts A.\n3. If the fact is present in both Facts G and Facts A, we classify it as a TP (true positive).\n4. If the fact is present in Facts A but not in Facts G, we classify it as a FP (false positive).\n5. If the fact is present in Facts G but not in Facts A, we classify it as a FN (false negative).\n6. Finally, we group the facts into JSON string with keys TP, FP, FN and list of facts for each as values.",
31 |         "fact_groups": "```json\n{\n  \"TP\": [\n    \"Criminalizing abortion increases stigma for marginalized individuals.\",\n    \"Criminalizing abortion leads to a lack of information and disinformation for marginalized individuals.\",\n    \"Marginalized individuals are disproportionately affected by abortion criminalization.\",\n    \"Criminalization of abortion contributes to the high number of unsafe abortions.\",\n    \"Unsafe abortions lead to increased maternal mortality and morbidity.\",\n    \"Access to health services is undermined by abortion criminalization.\",\n    \"Marginalized individuals are forced to resort to unsafe clandestine abortion methods.\"\n  ],\n  \"FP\": [\n    \"Criminalizing abortion can lead to unsafe and illegal methods of termination, resulting in health complications and death, especially for marginalized individuals.\",\n    \"Marginalized individuals may face financial burdens due to limited access to safe and legal abortions in criminalized countries.\",\n    \"Criminalizing abortion restricts reproductive choices and autonomy, particularly for marginalized individuals who face barriers to healthcare and education.\",\n    \"Criminalizing abortion exacerbates existing inequalities, disproportionately affecting marginalized groups such as low-income women, racial and ethnic minorities, LGBTQ+ individuals, and rural residents.\",\n    \"In some cases, criminalizing abortion can lead to the criminalization of pregnancy itself, resulting in surveillance, punishment, and discrimination against pregnant individuals from marginalized communities.\",\n    \"The criminalization of abortion can have significant mental health impacts, including increased stress, anxiety, and depression, for marginalized individuals.\",\n    \"Criminalizing abortion perpetuates reproductive injustice by denying marginalized individuals control over their bodies and reproductive health decisions.\",\n    \"The consequences of criminalizing 
abortion are intertwined with broader social, economic, and healthcare disparities faced by marginalized communities.\",\n    \"Criminalizing abortion can lead to increased inequality between privileged and marginalized groups.\",\n    \"Criminalizing abortion can result in the denial of reproductive rights and freedoms for marginalized individuals.\"\n  ],\n  \"FN\": [\n    \"Girls and young women may be forced to carry pregnancies resulting from sexual violence due to a lack of knowledge about their rights.\",\n    \"Restrictive abortion laws in Nigeria make it difficult to access safe abortion care.\"\n  ]\n}\n```"
32 |       },
33 |       {
34 |         "augmented": true,
35 |         "facts_g": "1. The prosecution considers statements contrary to the official position as 'false' under Article 207.3.\n2. Statements must be in opposition to the official position of the Russian authorities to be considered 'false'.",
36 |         "facts_a": "1. Article 207.3 of the Russian Criminal Code defines 'false' statements as those made intentionally to discredit the state or its authorities.\n2. The prosecution considers statements 'false' if they are made with the aim of spreading false information that could harm the public interest or cause significant damage.\n3. The prosecution will consider statements as false if they are deliberately made to mislead the public or undermine the credibility of the state.\n4. The prosecution will consider statements as false if they are made contrary to the official position.\n5. The prosecution will consider statements as false if they are made with the aim of discrediting the state or its authorities.",
37 |         "rationale": "Facts G:\n1. The prosecution considers statements contrary to the official position as 'false' under Article 207.3.\n2. Statements must be in opposition to the official position of the Russian authorities to be considered 'false'.\n\nFacts A:\n1. Article 207.3 of the Russian Criminal Code defines 'false' statements as those made intentionally to discredit the state or its authorities.\n2. The prosecution considers statements 'false' if they are made with the aim of spreading false information that could harm the public interest or cause significant damage.\n3. The prosecution will consider statements as false if they are deliberately made to mislead the public or undermine the credibility of the state.\n4. The prosecution will consider statements as false if they are made contrary to the official position.\n5. The prosecution will consider statements as false if they are made with the aim of discrediting the state or its authorities.\n\nReasoning: Let's think step by step in order to produce the fact_groups.\n- TP: Fact 1 in G and Fact 4 in A are both about the prosecution considering statements contrary to the official position as 'false'.\n- FP: Fact 2 in A is about the prosecution considering statements 'false' if they are made with the aim of spreading false information that could harm the public interest or cause significant damage, which is not mentioned in G. Fact 3 in A is about the prosecution considering statements as false if they are deliberately made to mislead the public or undermine the credibility of the state, which is also not mentioned in G. Fact 5 in A is about the prosecution considering statements as false if they are made with the aim of discrediting the state or its authorities, which is already covered by Fact 1 in G.\n- FN: Fact 2 in G is about statements must be in opposition to the official position of the Russian authorities to be considered 'false', which is not mentioned in A.",
38 |         "fact_groups": "```json\n{\n  \"TP\": [\n    \"The prosecution considers statements contrary to the official position as 'false' under Article 207.3.\"\n  ],\n  \"FP\": [\n    \"The prosecution considers statements 'false' if they are made with the aim of spreading false information that could harm the public interest or cause significant damage.\",\n    \"The prosecution will consider statements as false if they are deliberately made to mislead the public or undermine the credibility of the state.\",\n    \"The prosecution will consider statements as false if they are made with the aim of discrediting the state or its authorities.\"\n  ],\n  \"FN\": [\n    \"Statements must be in opposition to the official position of the Russian authorities to be considered 'false'.\"\n  ]\n}\n```"
39 |       }
40 |     ],
41 |     "signature_instructions": " Classify each answer fact with respect to ground truth fact as follows:\n        - TP (true positive): fact is present in both answer and ground truth\n        - FP (false positive): fact is present in answer but not in ground truth\n        - FN (false negative): fact is present in ground truth not in answer.\n        Group the input facts as numbered lists under these 3 categories\n    ",
42 |     "signature_prefix": "Fact Groups:",
43 |     "extended_signature_instructions": " Classify each answer fact with respect to ground truth fact as follows:\n        - TP (true positive): fact is present in both answer and ground truth\n        - FP (false positive): fact is present in answer but not in ground truth\n        - FN (false negative): fact is present in ground truth not in answer.\n        Group the input facts as numbered lists under these 3 categories\n    ",
44 |     "extended_signature_prefix": "Fact Groups:"
45 |   }
46 | }


--------------------------------------------------------------------------------
/resources/configs/answer_relevance-best.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "question_generator": {
 3 |     "lm": null,
 4 |     "traces": [],
 5 |     "train": [],
 6 |     "demos": [
 7 |       {
 8 |         "augmented": true,
 9 |         "answer": "Everyone has the right to be fully informed about human rights violations, including the identities of the perpetrators and the fate of the victims. This right is recognized by various international human rights instruments, such as the Universal Declaration of Human Rights and the International Covenant on Civil and Political Rights.\n\nThe victims of human rights violations and their families have a particular right to know the truth about what happened to them. This includes knowing the identities of the perpetrators and the fate of the victims, such as whether they are alive or dead, and if dead, the circumstances of their death.\n\nFurthermore, society as a whole has a right to be informed about human rights violations. This is essential for holding perpetrators accountable, preventing future abuses, and ensuring justice and reparations for the victims. Access to information about human rights violations also allows for public scrutiny and debate, which are crucial for building a culture of human rights and promoting transparency and accountability.\n\nJournalists, human rights defenders, and civil society organizations play a vital role in investigating and reporting on human rights violations. They have the right to access information, conduct independent investigations, and disseminate their findings to the public. Governments have a responsibility to protect and facilitate these activities, ensuring that individuals and organizations can operate freely and without fear of reprisals.\n\nIn summary, the right to be fully informed about human rights violations, including the identities of the perpetrators and the fate of the victims, belongs to everyone. It is a fundamental aspect of promoting human rights, accountability, and justice.",
10 |         "context": "1. The victims of gross human rights violations and their families, as well as members of society generally, have the right to know the whole truth about past human rights violations.\n2. The right to truth is engaged by the violation of the rights to access to justice, remedy, and information.\n3. Victims and their families possess an inalienable and non-derogable right to know the truth regarding the circumstances and reasons behind human rights violations, including the identities of the perpetrators and the fate of the victims.\n4. The right to know the fate and whereabouts of 'disappeared' relatives has been confirmed in the jurisprudence of international and regional human rights bodies, as well as of national courts.",
11 |         "gen_questions": "1. What are the key provisions of international human rights instruments regarding the right to information about human rights violations?\n2. How does the right to truth contribute to holding perpetrators accountable and preventing future abuses?\n3. What is the role of journalists and civil society organizations in promoting the right to information about human rights violations?\n4. How does the right to know the fate and whereabouts of disappeared relatives relate to the right to truth?\n5. What are the responsibilities of governments in ensuring access to information about human rights violations?"
12 |       },
13 |       {
14 |         "augmented": true,
15 |         "answer": "The purpose of the agreement known as 30x30 is to conserve and protect 30% of the planet's land and oceans by the year 2030. This agreement aims to address the global biodiversity crisis and the impacts of climate change by preserving and restoring ecosystems, promoting sustainable land and ocean management practices, and safeguarding the habitats of numerous plant and animal species. The goal is to ensure the long-term health and resilience of the planet's natural resources and to support the well-being of both human and non-human communities.",
16 |         "context": "1. These include the agreement known as 30x30, adopted at the UN Conference on Biological Diversity (COP15) held in Montreal, Canada, in late 2022, which commits Parties to designate 30% of the world's land and sea masses as protected areas for biodiversity conservation by 2030.\n2. Protecting and restoring these ecosystems is one of the most efficient, equitable and cost-effective mechanisms to capture and store carbon without compromising human rights.\n3. However, the focus of the agreement should not be limited to the amount of area it covers, but rather to how it is managed, who selects the areas, and what biodiversity exists in them.\n4. Indigenous Peoples and local communities have a key role to play here.\n5. Unfortunately, at COP15, States failed to explicitly recognize Indigenous Peoples' lands and territories as a distinct category of protected area, and this threatens their rights.",
17 |         "gen_questions": "1. What is the goal of the 30x30 agreement?\n2. What percentage of the planet's land and oceans does the 30x30 agreement aim to protect?\n3. What are the benefits of conserving and protecting 30% of the planet's land and oceans?\n4. What is the role of Indigenous Peoples and local communities in the 30x30 agreement?\n5. What are some of the challenges associated with implementing the 30x30 agreement?"
18 |       }
19 |     ],
20 |     "signature_instructions": " Given the answer and context, generate 3 to 5 questions that can be \n        answered by the answer. ",
21 |     "signature_prefix": "Gen Questions:"
22 |   },
23 |   "answer_classifier": {
24 |     "lm": null,
25 |     "traces": [],
26 |     "train": [],
27 |     "demos": [
28 |       {
29 |         "augmented": true,
30 |         "question": "What are the key provisions of international human rights instruments regarding the right to information about human rights violations?",
31 |         "context": "1. The victims of gross human rights violations and their families, as well as members of society generally, have the right to know the whole truth about past human rights violations.\n2. The right to truth is engaged by the violation of the rights to access to justice, remedy, and information.\n3. Victims and their families possess an inalienable and non-derogable right to know the truth regarding the circumstances and reasons behind human rights violations, including the identities of the perpetrators and the fate of the victims.\n4. The right to know the fate and whereabouts of 'disappeared' relatives has been confirmed in the jurisprudence of international and regional human rights bodies, as well as of national courts.",
32 |         "rationale": "produce the noncommital. We first need to identify the key provisions of international human rights instruments regarding the right to information about human rights violations. These provisions can be found in various international human rights instruments, including the Universal Declaration of Human Rights, the International Covenant on Civil and Political Rights, and the Convention against Torture and Other Cruel, Inhuman or Degrading Treatment or Punishment. These instruments set out the right to information about human rights violations, as well as the obligations of states to provide such information.",
33 |         "answer": "The key provisions of international human rights instruments regarding the right to information about human rights violations include the right to know the truth about past human rights violations, the right to access to justice, remedy, and information, and the right to know the fate and whereabouts of 'disappeared' relatives.",
34 |         "noncommital": "no"
35 |       },
36 |       {
37 |         "augmented": true,
38 |         "question": "How does the right to truth contribute to holding perpetrators accountable and preventing future abuses?",
39 |         "context": "1. The victims of gross human rights violations and their families, as well as members of society generally, have the right to know the whole truth about past human rights violations.\n2. The right to truth is engaged by the violation of the rights to access to justice, remedy, and information.\n3. Victims and their families possess an inalienable and non-derogable right to know the truth regarding the circumstances and reasons behind human rights violations, including the identities of the perpetrators and the fate of the victims.\n4. The right to know the fate and whereabouts of 'disappeared' relatives has been confirmed in the jurisprudence of international and regional human rights bodies, as well as of national courts.",
40 |         "rationale": "produce the noncommital. We first need to understand the concept of the right to truth. The right to truth is the right of victims of human rights violations and their families to know the whole truth about past human rights violations. This right is engaged by the violation of the rights to access to justice, remedy, and information. Victims and their families possess an inalienable and non-derogable right to know the truth regarding the circumstances and reasons behind human rights violations, including the identities of the perpetrators and the fate of the victims. The right to know the fate and whereabouts of 'disappeared' relatives has been confirmed in the jurisprudence of international and regional human rights bodies, as well as of national courts.",
41 |         "answer": "The right to truth contributes to holding perpetrators accountable and preventing future abuses by providing victims and their families with the information they need to seek justice and to prevent similar violations from happening again.",
42 |         "noncommital": "no"
43 |       }
44 |     ],
45 |     "signature_instructions": " Given a question and its context, use only the context to generate\n        an answer, and classify if the answer is commital or noncommital.\n    ",
46 |     "signature_prefix": "Noncommital:",
47 |     "extended_signature_instructions": " Given a question and its context, use only the context to generate\n        an answer, and classify if the answer is commital or noncommital.\n    ",
48 |     "extended_signature_prefix": "Noncommital:"
49 |   }
50 | }


--------------------------------------------------------------------------------
/resources/configs/context_precision-best.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "usefulness_classifier": {
 3 |     "lm": null,
 4 |     "traces": [],
 5 |     "train": [],
 6 |     "demos": [
 7 |       {
 8 |         "augmented": true,
 9 |         "question": "Who has the right to be fully informed about human rights violations, including the identities of the perpetrators and the fate of the victims?",
10 |         "answer": "Everyone has the right to be fully informed about human rights violations, including the identities of the perpetrators and the fate of the victims. This right is recognized by various international human rights instruments, such as the Universal Declaration of Human Rights and the International Covenant on Civil and Political Rights.\n\nThe victims of human rights violations and their families have a particular right to know the truth about what happened to them. This includes knowing the identities of the perpetrators and the fate of the victims, such as whether they are alive or dead, and if dead, the circumstances of their death.\n\nFurthermore, society as a whole has a right to be informed about human rights violations. This is essential for holding perpetrators accountable, preventing future abuses, and ensuring justice and reparations for the victims. Access to information about human rights violations also allows for public scrutiny and debate, which are crucial for building a culture of human rights and promoting transparency and accountability.\n\nJournalists, human rights defenders, and civil society organizations play a vital role in investigating and reporting on human rights violations. They have the right to access information, conduct independent investigations, and disseminate their findings to the public. Governments have a responsibility to protect and facilitate these activities, ensuring that individuals and organizations can operate freely and without fear of reprisals.\n\nIn summary, the right to be fully informed about human rights violations, including the identities of the perpetrators and the fate of the victims, belongs to everyone. It is a fundamental aspect of promoting human rights, accountability, and justice.",
11 |         "context": "The victims of gross human rights violations and their families, as well as members of society generally, have the right to know the whole truth about past human rights violations.",
12 |         "rationale": "produce the score.\n1. The question asks who has the right to be fully informed about human rights violations, including the identities of the perpetrators and the fate of the victims.\n2. The context states that the victims of gross human rights violations and their families, as well as members of society generally, have the right to know the whole truth about past human rights violations.\n3. Therefore, the context is useful for answering the question.",
13 |         "score": "yes"
14 |       },
15 |       {
16 |         "augmented": true,
17 |         "question": "Who has the right to be fully informed about human rights violations, including the identities of the perpetrators and the fate of the victims?",
18 |         "answer": "Everyone has the right to be fully informed about human rights violations, including the identities of the perpetrators and the fate of the victims. This right is recognized by various international human rights instruments, such as the Universal Declaration of Human Rights and the International Covenant on Civil and Political Rights.\n\nThe victims of human rights violations and their families have a particular right to know the truth about what happened to them. This includes knowing the identities of the perpetrators and the fate of the victims, such as whether they are alive or dead, and if dead, the circumstances of their death.\n\nFurthermore, society as a whole has a right to be informed about human rights violations. This is essential for holding perpetrators accountable, preventing future abuses, and ensuring justice and reparations for the victims. Access to information about human rights violations also allows for public scrutiny and debate, which are crucial for building a culture of human rights and promoting transparency and accountability.\n\nJournalists, human rights defenders, and civil society organizations play a vital role in investigating and reporting on human rights violations. They have the right to access information, conduct independent investigations, and disseminate their findings to the public. Governments have a responsibility to protect and facilitate these activities, ensuring that individuals and organizations can operate freely and without fear of reprisals.\n\nIn summary, the right to be fully informed about human rights violations, including the identities of the perpetrators and the fate of the victims, belongs to everyone. It is a fundamental aspect of promoting human rights, accountability, and justice.",
19 |         "context": "The right to truth is engaged by the violation of the rights to access to justice, remedy, and information.",
20 |         "rationale": "produce the score. The question asks who has the right to be fully informed about human rights violations, including the identities of the perpetrators and the fate of the victims. The answer states that everyone has this right, and it is recognized by various international human rights instruments. The context provided does not directly address the question or provide any additional information that is relevant to answering it. Therefore, the score is no.",
21 |         "score": "no"
22 |       }
23 |     ],
24 |     "signature_instructions": " Given a question, an answer to the question, and supporting context,\n        provide a yes\/no score indicating if the context was useful for\n        answering the question.",
25 |     "signature_prefix": "Score:",
26 |     "extended_signature_instructions": " Given a question, an answer to the question, and supporting context,\n        provide a yes\/no score indicating if the context was useful for\n        answering the question.",
27 |     "extended_signature_prefix": "Score:"
28 |   }
29 | }


--------------------------------------------------------------------------------
/resources/configs/context_recall-best.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "attrib_clf": {
 3 |     "lm": null,
 4 |     "traces": [],
 5 |     "train": [],
 6 |     "demos": [
 7 |       {
 8 |         "augmented": true,
 9 |         "answer": [
10 |           "The purpose of the agreement known as 30x30 is to conserve and protect 30% of the planet's land and oceans by the year 2030.",
11 |           "This agreement aims to address the global biodiversity crisis and the impacts of climate change by preserving and restoring ecosystems, promoting sustainable land and ocean management practices, and safeguarding the habitats of numerous plant and animal species.",
12 |           "The goal is to ensure the long-term health and resilience of the planet's natural resources and to support the well-being of both human and non-human communities."
13 |         ],
14 |         "context_item": "- These include the agreement known as 30x30, adopted at the UN Conference on Biological Diversity (COP15) held in Montreal, Canada, in late 2022, which commits Parties to designate 30% of the world's land and sea masses as protected areas for biodiversity conservation by 2030.",
15 |         "rationale": "produce the scores. We start with the context item, which provides information about the 30x30 agreement. The first sentence of the answer provides a concise summary of the agreement's purpose, which is consistent with the information in the context item. The second sentence elaborates on the agreement's goals, which are also aligned with the information in the context item. The third sentence emphasizes the importance of the agreement for the long-term health of the planet and the well-being of human and non-human communities, which is not explicitly mentioned in the context item but is a reasonable inference based on the information provided.",
16 |         "scores": "yes, yes, yes"
17 |       },
18 |       {
19 |         "answer": "Amnesty International urged its supporters to write letters to the Nigerian government, calling for an independent investigation into the killings of the Ogoni 9 and for those responsible to be brought to justice. They also called for an end to the human rights abuses in the Niger Delta region and for the protection of the rights of the Ogoni people.",
20 |         "context": [
21 |           "Amnesty International called on its vast network of supporters to deluge Nigerian authorities first with appeals for the defenders\u2019 freedom, and later with letters of outrage."
22 |         ],
23 |         "score": "1.0"
24 |       }
25 |     ],
26 |     "signature_instructions": " Given a context item and an answer, for each sentence in the answer,\n        classify if the sentence can be attributed to the context item. ",
27 |     "signature_prefix": "Scores:",
28 |     "extended_signature_instructions": " Given a context item and an answer, for each sentence in the answer,\n        classify if the sentence can be attributed to the context item. ",
29 |     "extended_signature_prefix": "Scores:"
30 |   }
31 | }


--------------------------------------------------------------------------------
/resources/configs/context_relevance-best.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "nec_classifier": {
 3 |     "lm": null,
 4 |     "traces": [],
 5 |     "train": [],
 6 |     "demos": [
 7 |       {
 8 |         "augmented": true,
 9 |         "question": "Which right guarantees access to comprehensive information about past human rights violations, including the identities of the perpetrators and the fate of the victims, as well as the circumstances surrounding the violations?",
10 |         "ctx_sent": "The Act raises serious questions about its compatibility with the right to truth.",
11 |         "rationale": "produce the score. We need to know which right guarantees access to comprehensive information about past human rights violations, including the identities of the perpetrators and the fate of the victims, as well as the circumstances surrounding the violations. The sentence does not provide this information, so it is not absolutely necessary to answer the question.",
12 |         "score": "no"
13 |       },
14 |       {
15 |         "question": "What action did Amnesty International urge its supporters to take in response to the killing of the Ogoni 9?",
16 |         "context": "- Amnesty International called on its vast network of supporters to deluge Nigerian authorities first with appeals for the defenders\u2019 freedom, and later with letters of outrage.",
17 |         "score": "1.0"
18 |       }
19 |     ],
20 |     "signature_instructions": " Given a question and a sentence from the context, classify\n        if sentence is absolutely necessary to answer question\n    ",
21 |     "signature_prefix": "Score:",
22 |     "extended_signature_instructions": " Given a question and a sentence from the context, classify\n        if sentence is absolutely necessary to answer question\n    ",
23 |     "extended_signature_prefix": "Score:"
24 |   }
25 | }


--------------------------------------------------------------------------------
/resources/configs/faithfulness-best.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "extractor": {
 3 |     "lm": null,
 4 |     "traces": [],
 5 |     "train": [],
 6 |     "demos": [
 7 |       {
 8 |         "augmented": true,
 9 |         "question": "What role do social media companies play in protecting users' rights online, regardless of their language and political views?",
10 |         "answer": "Social media companies play a crucial role in protecting users' rights online, regardless of their language and political views. Here are some key aspects of their role:\n\n1. Content moderation: Social media companies have the responsibility to moderate and remove content that violates their community guidelines, including hate speech, harassment, and misinformation. This helps protect users from harmful and abusive content, ensuring a safer online environment.\n\n2. Privacy and data protection: Companies must safeguard users' personal information and ensure their privacy rights are respected. This involves implementing robust data protection measures, obtaining user consent for data collection and usage, and providing transparent privacy policies.\n\n3. Algorithmic transparency: Social media platforms use algorithms to curate and personalize content for users. Ensuring transparency in these algorithms is crucial to prevent bias, discrimination, and the manipulation of information. Companies should disclose how algorithms work and take steps to address any unintended consequences.\n\n4. Accessibility and inclusivity: Social media companies should strive to make their platforms accessible to users of all languages and abilities. This includes providing language options, captioning for videos, and features that accommodate users with disabilities, ensuring equal access to information and participation.\n\n5. Fighting disinformation: Companies should actively combat the spread of disinformation and fake news on their platforms. This involves implementing fact-checking mechanisms, partnering with credible sources, and promoting media literacy to help users distinguish between reliable and misleading information.\n\n6. Transparency and accountability: Social media companies should be transparent about their policies, practices, and decision-making processes. 
They should provide clear avenues for users to report issues, appeal content removals, and seek redress. Regular transparency reports can help build trust and hold companies accountable.\n\n7. Collaboration with external stakeholders: Companies should collaborate with governments, civil society organizations, and experts to develop policies and practices that protect users' rights. Engaging in multi-stakeholder dialogues helps ensure diverse perspectives are considered and fosters a collective effort to address online challenges.\n\nOverall, social media companies have a responsibility to create a safe, inclusive, and rights-respecting online environment for all users, regardless of their language and political views.",
11 |         "facts": "1. Social media companies moderate content to remove harmful and abusive content, protecting users from hate speech, harassment, and misinformation.\n2. Companies must safeguard users' personal information and ensure their privacy rights are respected, implementing robust data protection measures and providing transparent privacy policies.\n3. Social media platforms should strive to make their platforms accessible to users of all languages and abilities, providing language options, captioning for videos, and features that accommodate users with disabilities."
12 |       },
13 |       {
14 |         "augmented": true,
15 |         "question": "What factors have contributed to the decline of independent civil society organizations in Nicaragua?",
16 |         "answer": "There are several factors that have contributed to the decline of independent civil society organizations in Nicaragua:\n\n1. Repressive government policies: The Nicaraguan government, under President Daniel Ortega, has implemented a series of repressive policies that have targeted civil society organizations. These policies include restrictive laws, such as the Foreign Agents Law, which requires organizations receiving foreign funding to register as \"foreign agents\" and submit detailed financial reports. This has created a climate of fear and uncertainty for many organizations, leading to self-censorship and a decline in their activities.\n\n2. Harassment and intimidation: Civil society organizations in Nicaragua have faced harassment and intimidation from government authorities, including surveillance, arbitrary arrests, and physical attacks. Human rights defenders and activists have been particularly targeted, with many facing threats to their personal safety and the safety of their families. This has made it difficult for organizations to operate freely and has deterred many individuals from getting involved in civil society work.\n\n3. Lack of funding: The repressive policies and climate of fear have also made it difficult for civil society organizations to secure funding. Many international donors have been reluctant to support organizations in Nicaragua due to concerns about the government's crackdown on civil society. This has led to a decline in resources available for organizations to carry out their work, including advocacy, research, and service provision.\n\n4. Legal restrictions: The Nicaraguan government has implemented a series of legal restrictions that have made it difficult for civil society organizations to operate. These include laws that limit freedom of association, freedom of expression, and the right to peaceful assembly. 
Organizations that are critical of the government or engage in activities deemed \"political\" have faced increased scrutiny and legal challenges, further limiting their ability to operate independently.\n\n5. Polarization and division: The political situation in Nicaragua has become increasingly polarized, with the government cracking down on dissent and opposition voices. This has created a climate of fear and mistrust, making it difficult for civil society organizations to collaborate and coordinate their efforts. The lack of unity and solidarity among civil society actors has weakened their collective impact and made it easier for the government to suppress their activities.\n\nOverall, the decline of independent civil society organizations in Nicaragua can be attributed to a combination of repressive government policies, harassment and intimidation, lack of funding, legal restrictions, and polarization. These factors have created a hostile environment for civil society, leading to a decline in their numbers and activities.",
17 |         "facts": "1. The Nicaraguan government has implemented repressive policies, such as the Foreign Agents Law, which has created a climate of fear and uncertainty for civil society organizations.\n2. Civil society organizations in Nicaragua have faced harassment and intimidation from government authorities, including surveillance, arbitrary arrests, and physical attacks.\n3. The lack of funding, due to the repressive policies and climate of fear, has made it difficult for civil society organizations to secure funding and carry out their work.\n4. The Nicaraguan government has implemented legal restrictions that limit freedom of association, freedom of expression, and the right to peaceful assembly, making it difficult for civil society organizations to operate independently.\n5. The political situation in Nicaragua has become increasingly polarized, creating a climate of fear and mistrust, making it difficult for civil society organizations to collaborate and coordinate their efforts."
18 |       }
19 |     ],
20 |     "signature_instructions": " Given a question-answer pair, generate a list of 3-5 facts\n        from the answer\n    ",
21 |     "signature_prefix": "Facts:"
22 |   },
23 |   "scorer": {
24 |     "lm": null,
25 |     "traces": [],
26 |     "train": [],
27 |     "demos": [
28 |       {
29 |         "augmented": true,
30 |         "context": "1. Companies, including social media companies, have a responsibility to respect all human rights wherever they operate in the world.\n2. Social media companies involved in facilitating and moderating online content need to invest adequate resources in human oversight of artificial intelligence-driven content moderation systems to ensure all users can equally exercise their rights online, regardless of language and political views.\n3. They should uphold their human rights responsibilities by engaging in human rights due diligence, including to address risks and abuses arising from their business model.\n4. They should undertake proactive measures in educating users and raising awareness about security and privacy features on their platforms that will help WHRDs create a safer and less toxic online experience.",
31 |         "fact": "Social media companies moderate content to remove harmful and abusive content, protecting users from hate speech, harassment, and misinformation.",
32 |         "score": "Context:\n1. The victims of gross human rights violations and their families, as well as members of society generally, have the right to know the whole truth about past human rights violations.\n2. The right to truth is engaged by the violation of the rights to access to justice, remedy, and information.\n3. Victims and their families possess an inalienable and non-derogable right to know the truth regarding the circumstances and reasons behind human rights violations, including the identities of the perpetrators and the fate of the victims.\n4. The right to know the fate and whereabouts of 'disappeared' relatives has been confirmed in the jurisprudence of international and regional human rights bodies, as well as of national courts.\nFact: The right to truth is a fundamental human right that is essential for the victims of human rights violations to heal and move on with their lives.\nScore: yes"
33 |       },
34 |       {
35 |         "augmented": true,
36 |         "context": "1. Companies, including social media companies, have a responsibility to respect all human rights wherever they operate in the world.\n2. Social media companies involved in facilitating and moderating online content need to invest adequate resources in human oversight of artificial intelligence-driven content moderation systems to ensure all users can equally exercise their rights online, regardless of language and political views.\n3. They should uphold their human rights responsibilities by engaging in human rights due diligence, including to address risks and abuses arising from their business model.\n4. They should undertake proactive measures in educating users and raising awareness about security and privacy features on their platforms that will help WHRDs create a safer and less toxic online experience.",
37 |         "fact": "Companies must safeguard users' personal information and ensure their privacy rights are respected, implementing robust data protection measures and providing transparent privacy policies.",
38 |         "score": "Context:\n1. The victims of gross human rights violations and their families, as well as members of society generally, have the right to know the whole truth about past human rights violations.\n2. The right to truth is engaged by the violation of the rights to access to justice, remedy, and information.\n3. Victims and their families possess an inalienable and non-derogable right to know the truth regarding the circumstances and reasons behind human rights violations, including the identities of the perpetrators and the fate of the victims.\n4. The right to know the fate and whereabouts of 'disappeared' relatives has been confirmed in the jurisprudence of international and regional human rights bodies, as well as of national courts.\nFact: The right to truth is a fundamental human right that is essential for the promotion of justice, accountability, and reconciliation.\nScore: yes"
39 |       }
40 |     ],
41 |     "signature_instructions": " Classify if fact can be inferred from context ",
42 |     "signature_prefix": "Score:"
43 |   }
44 | }


--------------------------------------------------------------------------------
/resources/prompts/answer_correctness_1.txt:
--------------------------------------------------------------------------------
 1 | Given a ground truth and an answer, analyze each statement in the answer and 
 2 | classify them in one of the following categories:
 3 | 
 4 | - TP (true positive): statements that are present in both the answer and the ground truth,
 5 | - FP (false positive): statements present in the answer but not found in the ground truth,
 6 | - FN (false negative): relevant statements found in the ground truth but omitted in the answer.
 7 | 
 8 | Each statement must be classified in exactly one category. Do not
 9 | try to interpret the meaning of the ground truth or the answer, just compare the presence
10 | of the statements in them.
11 | 
12 | Here are a few examples:
13 | 
14 | answer: ```The sun is powered by nuclear fission, similar to nuclear reactors on Earth, and its primary function is to provide light to the solar system.```
15 | ground_truth: ```The sun is actually powered by nuclear fusion, not fission. In its core, hydrogen atoms fuse to form helium, releasing a tremendous amount of energy. This energy is what lights up the sun and provides heat and light, essential for life on Earth. The sun's light also plays a critical role in Earth's climate system and helps to drive the weather and ocean currents.```
16 | output: <result>
17 |     <classes>
18 |         <TP>
19 |             <sts>
20 |                 <st>The sun's primary function is to provide light</st>
21 |             </sts>
22 |         </TP>
23 |         <FP>
24 |             <sts>
25 |                 <st>The sun is powered by nuclear fission</st>
26 |                 <st>similar to nuclear reactors on Earth</st>
27 |             </sts>
28 |         </FP>
29 |         <FN>
30 |             <sts>
31 |                 <st>The sun is powered by nuclear fusion, not fission</st>
32 |                 <st>In its core, hydrogen atoms fuse to form helium, releasing a tremendous amount of energy</st>
33 |                 <st>This energy provides heat and light, essential for life on Earth</st>
34 |                 <st>The sun's light plays a critical role in Earth's climate system</st>
35 |                 <st>The sun helps to drive the weather and ocean currents</st>
36 |             </sts>
37 |         </FN>
38 |     </classes>
39 | </result>
40 | 
41 | answer: ```The boiling point of water is 100 degrees Celsius at sea level.```
42 | ground_truth: ```The boiling point of water is 100 degrees Celsius (212 degrees Fahrenheit) at sea level, but it can change with altitude.```
43 | output: <result>
44 |     <classes>
45 |         <TP>
46 |             <sts>
47 |                 <st>The boiling point of water is 100 degrees Celsius at sea level</st>
48 |             </sts>
49 |         </TP>
50 |         <FP>
51 |         </FP>
52 |         <FN>
53 |             <sts>
54 |                 <st>The boiling point can change with altitude</st>
55 |                 <st>The boiling point of water is 212 degrees Fahrenheit at sea level</st>
56 |             </sts>
57 |         </FN>
58 |     </classes>
59 | </result>
60 | 
61 | answer: ```{answer}```
62 | ground_truth: ```{ground_truth}```
63 | output:
64 | 


--------------------------------------------------------------------------------
/resources/prompts/answer_relevance_1.txt:
--------------------------------------------------------------------------------
 1 | Generate {num_questions_to_generate} question(s) for the given answer and context.
 2 | 
 3 | answer: {answer}
 4 | context: {context}
 5 | 
 6 | Return the output using the following XML format.
 7 | 
 8 | <result>
 9 |     <questions>
10 |         <question>generated question 1</question>
11 |         <question>generated question 2</question>
12 |         ...
13 |     </questions>
14 | </result>
15 | 


--------------------------------------------------------------------------------
/resources/prompts/answer_relevance_2.txt:
--------------------------------------------------------------------------------
 1 | Given a question and its context, use ONLY the context to generate an answer, then 
 2 | identify if the answer is noncommittal.
 3 | Give noncommittal as 1 if the answer is noncommittal and 0 if the answer is committal.
 4 | A noncommittal answer is one that is evasive, vague, or ambiguous. For example, 
 5 | "I don't know" or "I'm not sure" are noncommittal answers.
 6 | 
 7 | Here are a few examples:
 8 | 
 9 | question: Where was Albert Einstein born?
10 | context: Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time
11 | output: <result>
12 |     <qa_pair>
13 |         <question>Where was Albert Einstein born?</question>
14 |         <answer>Albert Einstein was born in Germany.</answer>
15 |         <noncommittal>0</noncommittal>
16 |     </qa_pair>
17 | </result>
18 | 
19 | question: What unique ability does the newly discovered species of frog have?
20 | context: A recent scientific study has discovered a new species of frog in the Amazon rainforest that has the unique ability to change its skin color based on the temperature of its environment.
21 | output: <result>
22 |     <qa_pair>
23 |         <question>What unique ability does the newly discovered species of frog have?</question>
24 |         <answer>It can change its skin color based on the temperature of its environment.</answer>
25 |         <noncommittal>0</noncommittal>
26 |     </qa_pair>
27 | </result>
28 | 
29 | question: What is the tallest mountain on Earth?
30 | context: The tallest mountain on Earth, measured from sea level, is Mt. Everest, a renowned peak located in the Himalayas.
31 | output: <result>
32 |     <qa_pair>
33 |         <question>What is the tallest mountain on Earth?</question>
34 |         <answer>Everest</answer>
35 |         <noncommittal>0</noncommittal>
36 |     </qa_pair>
37 | </result>
38 | 
39 | question: What was the groundbreaking feature of the smartphone invented in 2023?
40 | context: In 2023, a groundbreaking invention was announced: a smartphone with a battery life of one month, revolutionizing the way people use mobile technology.
41 | output: <result>
42 |     <qa_pair>
43 |         <question>What was the groundbreaking feature of the smartphone invented in 2023?</question>
44 |         <answer>I don't know about the groundbreaking feature of the smartphone invented in 2023 as I am unaware of information beyond 2022.</answer>
45 |         <noncommittal>1</noncommittal>
46 |     </qa_pair>
47 | </result>
48 | 
49 | question: {question}
50 | context: {context}
51 | output: 


--------------------------------------------------------------------------------
/resources/prompts/context_precision_1.txt:
--------------------------------------------------------------------------------
 1 | Given a question, an answer, and a context, verify if the context was useful in arriving at
 2 | the given answer. Give verdict as "1" if useful and "0" if not.
 3 | 
 4 | Some examples are provided.
 5 | 
 6 | question: What can you tell me about Albert Einstein?
 7 | context: Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called "the world's most famous equation". He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius.
 8 | answer: Albert Einstein, born on 14 March 1879, was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. He received the 1921 Nobel Prize in Physics for his services to theoretical physics. He published 4 papers in 1905. Einstein moved to Switzerland in 1895
 9 | output: <result>
10 |     <verdict>
11 |         <reason>The provided context was indeed useful in arriving at the given answer. The context includes key information about Albert Einstein's life and contributions, which are reflected in the answer.</reason>
12 |         <infer>1</infer>
13 |     </verdict>
14 | </result>
15 | 
16 | question: who won 2020 icc world cup?
17 | context: The 2022 ICC Men's T20 World Cup, held from October 16 to November 13, 2022, in Australia, was the eighth edition of the tournament. Originally scheduled for 2020, it was postponed due to the COVID-19 pandemic. England emerged victorious, defeating Pakistan by five wickets in the final to clinch their second ICC Men's T20 World Cup title.
18 | answer: England
19 | output: <result>
20 |     <verdict>
21 |         <reason>the context was useful in clarifying the situation regarding the 2020 ICC World Cup and indicating that England was the winner of the tournament that was intended to be held in 2020 but actually took place in 2022.</reason>
22 |         <infer>1</infer>
23 |     </verdict>
24 | </result>
25 | 
26 | question: What is the tallest mountain in the world?
27 | context: The Andes is the longest continental mountain range in the world, located in South America. It stretches across seven countries and features many of the highest peaks in the Western Hemisphere. The range is known for its diverse ecosystems, including the high-altitude Andean Plateau and the Amazon rainforest.
28 | answer: Mount Everest.
29 | output: <result>
30 |     <verdict>
31 |         <reason>the provided context discusses the Andes mountain range, which, while impressive, does not include Mount Everest or directly relate to the question about the world's tallest mountain.</reason>
32 |         <infer>0</infer>
33 |     </verdict>
34 | </result>
35 | 
36 | question: {question}
37 | context: {context}
38 | answer: {answer}
39 | output:


--------------------------------------------------------------------------------
/resources/prompts/context_recall_1.txt:
--------------------------------------------------------------------------------
 1 | You are given a context and an answer. The answer is provided as a markdown list of
 2 | answer sentences.
 3 | 
 4 | Analyze each answer sentence and classify if the sentence can be attributed to the 
 5 | context. Give a verdict of "1" if the sentence can be attributed to the context and
 6 | "0" if not. Provide reasoning for your verdict.
 7 | 
 8 | Here are some examples:
 9 | 
10 | context: Albert Einstein (14 March 1879 - 18 April 1955) was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass-energy equivalence formula E = mc2, which arises from relativity theory, has been called 'the world's most famous equation'. He received the 1921 Nobel Prize in Physics 'for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect', a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius.
11 | answer: ```
12 | - Albert Einstein, born on 14 March 1879, was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time.
13 | - He received the 1921 Nobel Prize in Physics for his services to theoretical physics.
14 | - He published 4 papers in 1905. 
15 | - Einstein moved to Switzerland in 1895
16 | ```
17 | output: <result>
18 |     <verdicts>
19 |         <verdict>
20 |             <statement>Albert Einstein, born on 14 March 1879, was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time.</statement>
21 |             <reason>The date of birth of Einstein is mentioned clearly in the context.</reason>
22 |             <infer>1</infer>
23 |         </verdict>
24 |         <verdict>
25 |             <statement>He received the 1921 Nobel Prize in Physics for his services to theoretical physics.</statement>
26 |             <reason>The exact sentence is present in the given context.</reason>
27 |             <infer>1</infer>
28 |         </verdict>
29 |         <verdict>
30 |             <statement>He published 4 papers in 1905.</statement>
31 |             <reason>There is no mention about papers he wrote in the given context.</reason>
32 |             <infer>0</infer>
33 |         </verdict>
34 |         <verdict>
35 |             <statement>Einstein moved to Switzerland in 1895.</statement>
36 |             <reason>There is no supporting evidence for this in the given context.</reason>
37 |             <infer>0</infer>
38 |         </verdict>
39 |     </verdicts>
40 | </result>
41 | 
42 | context: The 2022 ICC Men's T20 World Cup, held from October 16 to November 13, 2022, in Australia, was the eighth edition of the tournament. Originally scheduled for 2020, it was postponed due to the COVID-19 pandemic. England emerged victorious, defeating Pakistan by five wickets in the final to clinch their second ICC Men's T20 World Cup title.
43 | answer: ```
44 | - England won the 2022 ICC Men's T20 World Cup.
45 | ```
46 | output: <result>
47 |     <verdicts>
48 |         <verdict>
49 |             <statement>England won the 2022 ICC Men's T20 World Cup.</statement>
50 |             <reason>From context it is clear that England defeated Pakistan to win the World Cup.</reason>
51 |             <infer>1</infer>
52 |         </verdict>
53 |     </verdicts>
54 | </result>
55 | 
56 | context: The primary fuel source for the sun is Hydrogen.
57 | answer: ```
58 | - The Baltimore Sun is the primary newspaper for the Baltimore area.
59 | ```
60 | output: <result>
61 |     <verdicts>
62 |         <verdict>
63 |             <statement>The Baltimore Sun is the primary newspaper for the Baltimore area.</statement>
64 |             <reason>The context is unrelated to the sentence.</reason>
65 |             <infer>0</infer>
66 |         </verdict>
67 |     </verdicts>
68 | </result>
69 | 
70 | context: {context}
71 | answer: ```
72 | {answer}
73 | ```
74 | output: 


--------------------------------------------------------------------------------
/resources/prompts/context_relevance_1.txt:
--------------------------------------------------------------------------------
 1 | You are provided a question with a list of context sentences. The context sentences
 2 | are provided as a Markdown list.
 3 | 
 4 | For each pair of question and context sentence, identify if the sentence is absolutely
 5 | necessary to answer the question. Give a verdict of "1" if the sentence is absolutely
 6 | necessary to answer the question or "0" if not.
 7 | 
 8 | Include in the output ONLY sentences with verdict of "1".
 9 | 
10 | question: {question}
11 | context: ```
12 | {context}
13 | ```
14 | 
15 | Provide your response in the following XML format:
16 | 
17 | <result>
18 |     <verdicts>
19 |         <verdict>
20 |             <statement>text of sentence</statement>
21 |             <reason>reasoning for why sentence is useful for question</reason>
22 |             <infer>0 or 1</infer>
23 |         </verdict>
24 |         ...
25 |     </verdicts>
26 | </result>
27 | 


--------------------------------------------------------------------------------
/resources/prompts/faithfulness_1.txt:
--------------------------------------------------------------------------------
 1 | Create two or more statements from each sentence in the given answer.
 2 | 
 3 | Here are some examples:
 4 | 
 5 | question: Who was Albert Einstein and what is he best known for?
 6 | answer: He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics.
 7 | statements: <result><statements>
 8 |     <statement>Albert Einstein, a German-born theoretical physicist, is renowned for being one of the most influential physicists in history.</statement>
 9 |     <statement>Albert Einstein was best known for his theory of relativity.</statement>
10 |     <statement>Einstein's contributions significantly advanced the field of quantum mechanics.</statement>
11 |     <statement>Recognized globally, Einstein's work has profoundly impacted the scientific community.</statement>
12 |     <statement>Einstein's groundbreaking theories continue to shape our understanding of physics today.</statement>
13 | </statements></result>
14 | 
15 | question: Cadmium Chloride is slightly soluble in this chemical, it is also called what?
16 | answer: alcohol
17 | statements: <result><statements>
18 |     <statement>Cadmium Chloride is slightly soluble in alcohol.</statement>
19 | </statements></result>
20 | 
21 | question: Were Hitler and Benito Mussolini of the same nationality?
22 | answer: Sorry, I can't provide answer to that question.
23 | statements: <result><statements>
24 |     <statement></statement>
25 | </statements></result>
26 | 
27 | question: {question}
28 | answer: {answer}
29 | statements:


--------------------------------------------------------------------------------
/resources/prompts/faithfulness_2.txt:
--------------------------------------------------------------------------------
 1 | Natural Language Inference. For each statement, decide whether it can be inferred from the context. Use only "Yes" (1) or "No" (0) as the verdict.
 2 | 
 3 | Here are some examples:
 4 | 
 5 | context: John is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects.
 6 | statements: <statements>
 7 |     <statement>John is majoring in Biology.</statement>
 8 |     <statement>John is taking a course on Artificial Intelligence.</statement>
 9 |     <statement>John is a dedicated student.</statement>
10 |     <statement>John has a part-time job.</statement>
11 | </statements>
12 | answer: <result><verdicts>
13 |     <verdict>
14 |         <statement>John is majoring in Biology.</statement>
15 |         <reason>John's major is explicitly mentioned as Computer Science. There is no information suggesting he is majoring in Biology.</reason>
16 |         <infer>0</infer>
17 |     </verdict>
18 |     <verdict>
19 |         <statement>John is taking a course on Artificial Intelligence.</statement>
20 |         <reason>The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. Therefore, it cannot be deduced that John is taking a course on AI.</reason>
21 |         <infer>0</infer>
22 |     </verdict>
23 |     <verdict>
24 |         <statement>John is a dedicated student.</statement>
25 |         <reason>The context states that he spends a significant amount of time studying and completing assignments. Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication.</reason>
26 |         <infer>1</infer>
27 |     </verdict>
28 |     <verdict>
29 |         <statement>John has a part-time job.</statement>
30 |         <reason>There is no information given in the context about John having a part-time job.</reason>
31 |         <infer>0</infer>
32 |     </verdict>
33 | </verdicts></result>
34 | 
35 | context: Photosynthesis is a process used by plants, algae, and certain bacteria to convert light energy into chemical energy.
36 | statements: <statements>
37 |     <statement>Albert Einstein was a genius.</statement>
38 | </statements>
39 | answer: <result><verdicts>
40 |     <verdict>
41 |         <statement>Albert Einstein was a genius.</statement>
42 |         <reason>The context and statement are unrelated.</reason>
43 |         <infer>0</infer>
44 |     </verdict>
45 | </verdicts></result>
46 | 
47 | context: {context}
48 | statements: {statements_xml}
49 | answer:


--------------------------------------------------------------------------------
/resources/prompts/more_questions.txt:
--------------------------------------------------------------------------------
 1 | Similar to how the question
 2 | question: {question}
 3 | 
 4 | is answered by
 5 | answer: {answer}
 6 | 
 7 | generate {multiplier} new (question, answer, context) tuples specifically from the content in the following passages:
 8 | 
 9 | {passages}
10 | 
11 | Return these (question, answer, context) tuples in the following XML format:
12 | 
13 | <result>
14 |     <tuples>
15 |         <question>question 1 text</question>
16 |         <answer>answer 1 text</answer>
17 |         <context>passage 1 text</context>
18 |     </tuples>
19 |     <tuples>
20 |         <question>question 2 text</question>
21 |         <answer>answer 2 text</answer>
22 |         <context>passage 2 text</context>
23 |     </tuples>
24 |     <tuples>
25 |         <question>question 3 text</question>
26 |         <answer>answer 3 text</answer>
27 |         <context>passage 3 text</context>
28 |     </tuples>
29 |     ...
30 | </result>
31 | 


--------------------------------------------------------------------------------
/resources/reports/dspy-reports/answer_correctness_report.tsv:
--------------------------------------------------------------------------------
 1 | #QID	ANSWER_CORRECTNESS
 2 | 0	0.588
 3 | 1	0.000
 4 | 2	1.000
 5 | 3	0.333
 6 | 4	0.333
 7 | 5	1.000
 8 | 6	1.000
 9 | 7	1.000
10 | 9	0.400
11 | 10	0.571
12 | 11	0.333
13 | 12	0.500
14 | 13	0.500
15 | 14	0.333
16 | 15	0.333
17 | 16	0.000
18 | 17	0.800
19 | 19	0.000
20 | 


--------------------------------------------------------------------------------
/resources/reports/dspy-reports/answer_relevance_report.tsv:
--------------------------------------------------------------------------------
 1 | #QID	ANSWER_RELEVANCE
 2 | 0	0.000
 3 | 1	0.860
 4 | 2	0.949
 5 | 3	0.845
 6 | 4	0.837
 7 | 5	0.833
 8 | 6	0.850
 9 | 7	0.893
10 | 8	0.795
11 | 9	0.000
12 | 10	0.816
13 | 11	0.799
14 | 12	0.808
15 | 13	0.866
16 | 14	0.000
17 | 15	0.855
18 | 16	0.885
19 | 17	0.894
20 | 18	0.880
21 | 19	0.809
22 | 


--------------------------------------------------------------------------------
/resources/reports/dspy-reports/context_precision_report.tsv:
--------------------------------------------------------------------------------
 1 | #QID	CONTEXT_PRECISION
 2 | 0	0.749
 3 | 1	0.556
 4 | 2	0.500
 5 | 3	1.000
 6 | 4	1.000
 7 | 5	1.000
 8 | 6	0.167
 9 | 7	0.604
10 | 8	0.242
11 | 9	0.111
12 | 10	0.875
13 | 11	1.000
14 | 12	0.500
15 | 13	0.200
16 | 14	1.000
17 | 15	0.875
18 | 16	1.000
19 | 17	0.292
20 | 18	0.550
21 | 19	0.333
22 | 


--------------------------------------------------------------------------------
/resources/reports/dspy-reports/context_recall_report.tsv:
--------------------------------------------------------------------------------
 1 | #QID	CONTEXT_RECALL
 2 | 0	0.357
 3 | 1	0.333
 4 | 2	0.500
 5 | 3	1.000
 6 | 4	0.250
 7 | 5	1.000
 8 | 6	0.667
 9 | 7	0.650
10 | 8	0.100
11 | 9	0.463
12 | 10	0.531
13 | 11	1.000
14 | 12	0.000
15 | 13	0.333
16 | 14	0.000
17 | 15	0.900
18 | 16	1.000
19 | 17	0.050
20 | 18	0.314
21 | 19	0.000
22 | 


--------------------------------------------------------------------------------
/resources/reports/dspy-reports/context_relevance_report.tsv:
--------------------------------------------------------------------------------
 1 | #QID	CONTEXT_RELEVANCE
 2 | 0	0.571
 3 | 1	0.333
 4 | 2	0.500
 5 | 3	1.000
 6 | 4	1.000
 7 | 5	1.000
 8 | 6	0.167
 9 | 7	0.750
10 | 8	0.100
11 | 9	0.111
12 | 10	0.875
13 | 11	0.500
14 | 12	0.500
15 | 13	0.200
16 | 14	1.000
17 | 15	0.750
18 | 16	0.500
19 | 17	0.500
20 | 18	0.700
21 | 19	0.333
22 | 


--------------------------------------------------------------------------------
/resources/reports/dspy-reports/context_utilization_report.tsv:
--------------------------------------------------------------------------------
 1 | #QID	CONTEXT_UTILIZATION
 2 | 0	0.497
 3 | 1	0.333
 4 | 2	0.500
 5 | 3	1.000
 6 | 4	0.000
 7 | 5	1.000
 8 | 6	0.167
 9 | 7	0.604
10 | 8	0.000
11 | 9	0.111
12 | 10	0.346
13 | 11	0.500
14 | 12	0.000
15 | 13	0.200
16 | 14	1.000
17 | 15	0.179
18 | 16	1.000
19 | 17	0.125
20 | 18	0.400
21 | 19	0.000
22 | 


--------------------------------------------------------------------------------
/resources/reports/dspy-reports/faithfulness_report.tsv:
--------------------------------------------------------------------------------
 1 | #QID	FAITHFULNESS
 2 | 0	1.000
 3 | 1	1.000
 4 | 2	1.000
 5 | 3	1.000
 6 | 4	1.000
 7 | 5	1.000
 8 | 6	1.000
 9 | 7	1.000
10 | 8	1.000
11 | 9	1.000
12 | 10	1.000
13 | 11	1.000
14 | 12	0.000
15 | 13	1.000
16 | 14	1.000
17 | 15	1.000
18 | 16	1.000
19 | 17	1.000
20 | 18	1.000
21 | 19	1.000
22 | 


--------------------------------------------------------------------------------
/resources/reports/lcel-reports/answer_correctness_report.tsv:
--------------------------------------------------------------------------------
 1 | #QID	ANSWER_CORRECTNESS
 2 | 0	0.538
 3 | 1	0.167
 4 | 2	0.462
 5 | 3	0.747
 6 | 4	0.000
 7 | 5	0.490
 8 | 6	1.000
 9 | 7	0.286
10 | 8	0.000
11 | 9	0.571
12 | 10	1.000
13 | 11	0.977
14 | 12	0.000
15 | 13	0.987
16 | 14	0.982
17 | 15	0.400
18 | 16	0.959
19 | 17	0.552
20 | 19	0.839
21 | 


--------------------------------------------------------------------------------
/resources/reports/lcel-reports/answer_relevance_report.tsv:
--------------------------------------------------------------------------------
 1 | #QID	ANSWER_RELEVANCE
 2 | 0	0.881
 3 | 1	0.827
 4 | 2	0.910
 5 | 3	0.839
 6 | 4	0.908
 7 | 5	0.894
 8 | 6	0.000
 9 | 7	0.899
10 | 8	0.874
11 | 9	0.850
12 | 10	0.914
13 | 11	0.852
14 | 12	0.928
15 | 13	0.894
16 | 14	0.000
17 | 15	0.947
18 | 16	0.945
19 | 17	0.877
20 | 19	0.835
21 | 


--------------------------------------------------------------------------------
/resources/reports/lcel-reports/answer_similarity_report.tsv:
--------------------------------------------------------------------------------
 1 | #QID	ANSWER_SIMILARITY
 2 | 0	0.979
 3 | 1	0.983
 4 | 2	0.971
 5 | 3	0.973
 6 | 4	0.971
 7 | 5	0.996
 8 | 6	1.000
 9 | 7	0.979
10 | 8	0.975
11 | 9	0.974
12 | 10	0.972
13 | 11	0.970
14 | 12	0.990
15 | 13	0.975
16 | 14	0.974
17 | 15	0.977
18 | 16	0.974
19 | 17	0.969
20 | 18	0.976
21 | 19	0.999
22 | 


--------------------------------------------------------------------------------
/resources/reports/lcel-reports/context_precision_report.tsv:
--------------------------------------------------------------------------------
 1 | #QID	CONTEXT_PRECISION
 2 | 0	1.000
 3 | 1	0.444
 4 | 2	0.500
 5 | 3	1.000
 6 | 4	1.000
 7 | 5	1.000
 8 | 6	0.167
 9 | 7	0.604
10 | 8	0.100
11 | 9	0.111
12 | 10	0.385
13 | 11	1.000
14 | 12	0.500
15 | 13	0.200
16 | 14	1.000
17 | 15	1.000
18 | 16	1.000
19 | 17	0.125
20 | 18	0.550
21 | 19	0.333
22 | 


--------------------------------------------------------------------------------
/resources/reports/lcel-reports/context_recall_report.tsv:
--------------------------------------------------------------------------------
 1 | #QID	CONTEXT_RECALL
 2 | 0	0.347
 3 | 1	0.260
 4 | 2	0.500
 5 | 3	1.000
 6 | 4	1.000
 7 | 5	0.500
 8 | 6	0.333
 9 | 7	0.750
10 | 8	0.500
11 | 9	0.333
12 | 10	0.875
13 | 11	1.000
14 | 12	0.500
15 | 13	0.200
16 | 14	1.000
17 | 15	0.406
18 | 16	1.000
19 | 17	0.500
20 | 19	0.333
21 | 


--------------------------------------------------------------------------------
/resources/reports/lcel-reports/context_relevance_report.tsv:
--------------------------------------------------------------------------------
 1 | #QID	CONTEXT_RELEVANCE
 2 | 0	0.857
 3 | 1	0.444
 4 | 2	0.500
 5 | 3	1.000
 6 | 4	1.000
 7 | 5	1.000
 8 | 6	0.600
 9 | 7	1.000
10 | 8	0.200
11 | 9	0.111
12 | 10	0.875
13 | 11	1.000
14 | 12	0.500
15 | 13	0.200
16 | 14	1.000
17 | 16	0.500
18 | 17	0.750
19 | 18	0.429
20 | 19	0.333
21 | 


--------------------------------------------------------------------------------
/resources/reports/lcel-reports/context_utilization_report.tsv:
--------------------------------------------------------------------------------
 1 | #QID	CONTEXT_UTILIZATION
 2 | 0	0.382
 3 | 1	0.451
 4 | 2	0.500
 5 | 3	1.000
 6 | 4	0.604
 7 | 5	1.000
 8 | 6	0.167
 9 | 7	1.000
10 | 8	0.000
11 | 9	0.419
12 | 10	0.764
13 | 11	1.000
14 | 12	0.000
15 | 13	0.200
16 | 14	1.000
17 | 15	1.000
18 | 16	1.000
19 | 17	1.000
20 | 19	0.000
21 | 


--------------------------------------------------------------------------------
/resources/reports/lcel-reports/faithfulness_report.tsv:
--------------------------------------------------------------------------------
 1 | #QID	FAITHFULNESS
 2 | 0	0.262
 3 | 1	0.909
 4 | 2	0.200
 5 | 3	0.500
 6 | 4	1.000
 7 | 5	1.000
 8 | 6	0.583
 9 | 7	0.969
10 | 8	0.950
11 | 9	0.778
12 | 10	0.600
13 | 11	0.333
14 | 12	0.000
15 | 13	0.300
16 | 14	1.000
17 | 15	0.844
18 | 16	0.375
19 | 17	0.812
20 | 19	0.000
21 | 


--------------------------------------------------------------------------------
/src/convert_amnestyQA_to_desired_format.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nbformat": 4,
  3 |   "nbformat_minor": 0,
  4 |   "metadata": {
  5 |     "colab": {
  6 |       "provenance": []
  7 |     },
  8 |     "kernelspec": {
  9 |       "name": "python3",
 10 |       "display_name": "Python 3"
 11 |     },
 12 |     "language_info": {
 13 |       "name": "python"
 14 |     }
 15 |   },
 16 |   "cells": [
 17 |     {
 18 |       "cell_type": "code",
 19 |       "source": [
 20 |         "# this will convert the AmnestyQA dataset\n",
 21 |         "# https://huggingface.co/datasets/explodinggradients/amnesty_qa/blob/main/english.json\n",
 22 |         "# to the desired format that we are using in the competition\n",
 23 |         "# https://github.com/sujitpal/llm-rag-eval\n",
 24 |         "#\n",
 25 |         "# download the above json file to a file local to this notebook\n",
 26 |         "# (i had trouble reading this directly online - https://huggingface.co/datasets/explodinggradients/amnesty_qa/raw/main/english.json)"
 27 |       ],
 28 |       "metadata": {
 29 |         "id": "TuhTbzYKIqGG"
 30 |       },
 31 |       "execution_count": 27,
 32 |       "outputs": []
 33 |     },
 34 |     {
 35 |       "cell_type": "code",
 36 |       "execution_count": 28,
 37 |       "metadata": {
 38 |         "id": "XRqK1t_l0g7_"
 39 |       },
 40 |       "outputs": [],
 41 |       "source": [
 42 |         "import json\n",
 43 |         "# input file\n",
 44 |         "input_file_path = 'english.json'\n",
 45 |         "# output file\n",
 46 |         "output_file_path = 'amnesty_qa.jsonl'"
 47 |       ]
 48 |     },
 49 |     {
 50 |       "cell_type": "code",
 51 |       "source": [
 52 |         "def split_newlines(input_string):\n",
 53 |         "    \"\"\"Split the input string at newline characters and return a list of strings.\"\"\"\n",
 54 |         "    if '\\n' in input_string:\n",
 55 |         "        return input_string.split('\\n')\n",
 56 |         "    else:\n",
 57 |         "        return [input_string]"
 58 |       ],
 59 |       "metadata": {
 60 |         "id": "wECIVkMrl_r_"
 61 |       },
 62 |       "execution_count": 29,
 63 |       "outputs": []
 64 |     },
 65 |     {
 66 |       "cell_type": "code",
 67 |       "source": [
 68 |         "# Opening the JSON file and loading the data\n",
 69 |         "with open(input_file_path, 'r') as file:\n",
 70 |         "    data = json.load(file)"
 71 |       ],
 72 |       "metadata": {
 73 |         "id": "piVFXu0Z0zL9"
 74 |       },
 75 |       "execution_count": 31,
 76 |       "outputs": []
 77 |     },
 78 |     {
 79 |       "cell_type": "code",
 80 |       "source": [
 81 |         "# the data is not nested / folded\n",
 82 |         "# so each of components is read in and the output folding happens next\n",
 83 |         "questions = data.get('question')\n",
 84 |         "ground_truths = data.get('ground_truths')\n",
 85 |         "answers = data.get('answer')\n",
 86 |         "contexts = data.get('contexts')"
 87 |       ],
 88 |       "metadata": {
 89 |         "id": "hjIQKOEr0_dQ"
 90 |       },
 91 |       "execution_count": 32,
 92 |       "outputs": []
 93 |     },
 94 |     {
 95 |       "cell_type": "code",
 96 |       "source": [
 97 |         "# output will be lines\n",
 98 |         "lines = []"
 99 |       ],
100 |       "metadata": {
101 |         "id": "Et8TapVb4pjj"
102 |       },
103 |       "execution_count": 33,
104 |       "outputs": []
105 |     },
106 |     {
107 |       "cell_type": "code",
108 |       "source": [
109 |         "# assemble from the nth item from each component list\n",
110 |         "# blow out the multi-line contexts into single line chunk dictionaries\n",
111 |         "for i in range(len(questions)):\n",
112 |         "  line = {}\n",
113 |         "  line['id'] = i\n",
114 |         "  line['query'] = questions[i]\n",
115 |         "  context_list = []\n",
116 |         "  split_contexts = split_newlines(contexts[i][0])\n",
117 |         "  for j in range(len(split_contexts)):\n",
118 |         "    context_dict = {}\n",
119 |         "    context_dict['id'] = str(j)\n",
120 |         "    context_dict['chunk_text'] = split_contexts[j]\n",
121 |         "    context_list.append(context_dict)\n",
122 |         "  line['context'] = context_list\n",
123 |         "  line['ideal_answer'] = ground_truths[i][0]\n",
124 |         "  line['predicted_answer'] = answers[i]\n",
125 |         "  lines.append(line)"
126 |       ],
127 |       "metadata": {
128 |         "id": "T5d9q2Jo1I9l"
129 |       },
130 |       "execution_count": 34,
131 |       "outputs": []
132 |     },
133 |     {
134 |       "cell_type": "code",
135 |       "source": [
136 |         "# write the lines to the output file\n",
137 |         "with open(output_file_path, 'w') as outfile:\n",
138 |         "    for dictionary in lines:\n",
139 |         "        json_line = json.dumps(dictionary)  # Convert the dictionary to a JSON string\n",
140 |         "        outfile.write(json_line + '\\n')  # Write the JSON string to the file and add a newline"
141 |       ],
142 |       "metadata": {
143 |         "id": "CF7_eK3A1w20"
144 |       },
145 |       "execution_count": 36,
146 |       "outputs": []
147 |     },
148 |     {
149 |       "cell_type": "code",
150 |       "source": [],
151 |       "metadata": {
152 |         "id": "CK2v4BF4rmwp"
153 |       },
154 |       "execution_count": 36,
155 |       "outputs": []
156 |     }
157 |   ]
158 | }


--------------------------------------------------------------------------------
/src/generate_datasets.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import asyncio
  3 | import json
  4 | import logging
  5 | import os
  6 | 
  7 | from dotenv import load_dotenv, find_dotenv
  8 | from langchain_google_genai import (
  9 |     ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
 10 | )
 11 | from typing import List
 12 | 
 13 | import prompted.faithfulness as faithfulness_p
 14 | import prompted.answer_relevance as answer_relevance_p
 15 | import prompted.context_precision as context_precision_p
 16 | import prompted.context_relevance as context_relevance_p
 17 | import prompted.context_recall as context_recall_p
 18 | import prompted.answer_correctness as answer_correctness_p
 19 | from metrics import Metrics
 20 | 
 21 | 
 22 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
 23 | 
 24 | # Safety config
 25 | 
 26 | from google.generativeai.types.safety_types import HarmBlockThreshold, HarmCategory
 27 | 
 28 | safety_settings = {
 29 |     HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
 30 |     HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_ONLY_HIGH,
 31 |     HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
 32 |     HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_ONLY_HIGH
 33 | }
 34 | 
 35 | async def generate_faithfulness_dataset(id: int,
 36 |                                         question: str,
 37 |                                         answer: str,
 38 |                                         context: List[str],
 39 |                                         run_parallel: bool,
 40 |                                         model,
 41 |                                         logger,
 42 |                                         fout):
 43 |     statements = faithfulness_p._get_statements_from_answer(
 44 |         question, answer, model, logger)
 45 |     entailments = await faithfulness_p._get_entailments_from_context(
 46 |         context, statements, model, logger,
 47 |         parallel=run_parallel)
 48 |     score = faithfulness_p._compute_faithfulness(entailments)
 49 |     fout.write(json.dumps({
 50 |         "id": id,
 51 |         "question": question,
 52 |         "context": context,
 53 |         "answer": answer,
 54 |         "statements": statements,
 55 |         "entailments": entailments,
 56 |         "score": score
 57 |     }) + "\n")
 58 | 
 59 | 
 60 | async def generate_answer_relevance_dataset(id: int,
 61 |                                             question: str,
 62 |                                             context: List[str],
 63 |                                             answer: str,
 64 |                                             run_parallel: bool,
 65 |                                             model,
 66 |                                             encoder,
 67 |                                             logger,
 68 |                                             fout):
 69 |     context_str = answer_relevance_p._flatten_context(context)
 70 |     gen_questions = \
 71 |         answer_relevance_p._generate_questions_from_answer_and_context(
 72 |             context_str, answer, 5, model, logger)
 73 |     qa_pairs = await answer_relevance_p._predict_noncommittal_from_questions(
 74 |         gen_questions, context_str, run_parallel, model,
 75 |         logger)
 76 |     score = answer_relevance_p._compute_answer_relevance(
 77 |         question, qa_pairs, encoder, logger)
 78 |     fout.write(json.dumps({
 79 |         "id": id,
 80 |         "question": question,
 81 |         "context": context,
 82 |         "answer": answer,
 83 |         "gen_questions": gen_questions,
 84 |         "non_commitals": [qap.noncommittal for qap in qa_pairs],
 85 |         "score": score
 86 |     }) + "\n")
 87 | 
 88 | 
 89 | async def generate_context_precision_dataset(id: int,
 90 |                                              question: str,
 91 |                                              answer: str,
 92 |                                              context: List[str],
 93 |                                              run_parallel: bool,
 94 |                                              model,
 95 |                                              logger,
 96 |                                              fout):
 97 |     precs = await context_precision_p._compute_usefulness_scores(
 98 |         question, context, answer, run_parallel, model, logger)
 99 |     score = context_precision_p._compute_content_precision(precs)
100 |     fout.write(json.dumps({
101 |         "id": id,
102 |         "question": question,
103 |         "context": context,
104 |         "answer": answer,
105 |         "precision": precs,
106 |         "score": score
107 |     }) + "\n")
108 | 
109 | 
async def generate_context_relevance_dataset(id: int,
                                             question: str,
                                             context: List[str],
                                             run_parallel: bool,
                                             model,
                                             logger,
                                             fout):
    """Compute context relevance for one record and append it to ``fout``.

    The context is converted to markdown lists of sentences; when at least
    one sentence exists, the necessity verdicts are generated and turned into
    a score. When the context yields no sentences the row is still written,
    with an empty ``necessary_sents`` list and a score of 0.0.

    Args:
        id: record identifier, copied into the output row.
        question: the query text.
        context: context chunk texts for the query.
        run_parallel: forwarded to the necessity-verdict helper.
        model: chat model handed to the ``prompted.context_relevance`` helpers.
        logger: logger handed to the ``prompted.context_relevance`` helpers.
        fout: writable text stream receiving one JSON line.
    """
    num_total_sents, context_markdowns = \
        context_relevance_p._convert_to_markdown_lists(context)

    # Defaults for the "no sentences" case. Previously necessary_sents was
    # only bound inside the branch below but used unconditionally afterwards,
    # which raised UnboundLocalError for an empty context.
    score = 0.0
    necessary_sents = []
    if num_total_sents > 0:
        necessary_sents = \
            await context_relevance_p._generate_necessity_verdicts(
                question, context_markdowns, run_parallel, model, logger)
        score = context_relevance_p._compute_context_relevance_score(
            num_total_sents, necessary_sents)

    fout.write(json.dumps({
        "id": id,
        "question": question,
        "context": context,
        "context_sents": context_markdowns,
        "necessary_sents": necessary_sents,
        "score": score
    }) + "\n")
134 | 
135 | 
async def generate_context_recall_dataset(id: int,
                                          context: List[str],
                                          answer: str,
                                          run_parallel: bool,
                                          model,
                                          logger,
                                          fout):
    """Compute context recall for one record and append it to ``fout``.

    The answer is converted to a markdown list, each item is classified as
    attributable to the context or not, and the classifications are reduced
    to a score.

    Args:
        id: record identifier, copied into the output row.
        context: context chunk texts for the query.
        answer: the answer whose sentences are checked against the context.
        run_parallel: forwarded to the attribution-classification helper.
        model: chat model handed to the ``prompted.context_recall`` helpers.
        logger: logger handed to the ``prompted.context_recall`` helpers.
        fout: writable text stream receiving one JSON line.
    """
    markdown_answer = context_recall_p._convert_answer_to_markdown_list(
        answer, logger)
    attributions = await (
        context_recall_p._classify_ans_sents_attributable_to_context(
            markdown_answer, context, run_parallel, model, logger))
    recall = context_recall_p._compute_context_recall_score(attributions)

    row = {
        "id": id,
        "context": context,
        "answer_md": markdown_answer,
        "answer": answer,
        "inferences": attributions,
        "score": recall,
    }
    fout.write(json.dumps(row) + "\n")
157 | 
158 | 
async def generate_answer_correctness_dataset(id: int,
                                              answer: str,
                                              ideal_answer: str,
                                              model,
                                              logger,
                                              fout):
    """Compute answer correctness for one record and append it to ``fout``.

    The answer is classified against the ideal answer, the statements for the
    TP, FP and FN classes are collected, and a score is computed from them.

    Args:
        id: record identifier, copied into the output row.
        answer: the predicted answer being evaluated.
        ideal_answer: the reference answer.
        model: chat model handed to the ``prompted.answer_correctness`` helpers.
        logger: logger handed to the ``prompted.answer_correctness`` helpers.
        fout: writable text stream receiving one JSON line.
    """
    classification = answer_correctness_p._do_classification(
        answer, ideal_answer, model, logger)
    by_class = {
        label: answer_correctness_p._get_statements_for_class(
            classification, label)
        for label in ("TP", "FP", "FN")
    }
    correctness = answer_correctness_p._compute_answer_correctness_score(
        by_class)

    row = {
        "id": id,
        "answer": answer,
        "ideal_answer": ideal_answer,
        "classification": by_class,
        "score": correctness,
    }
    fout.write(json.dumps(row) + "\n")
181 | 
182 | 
183 | async def runner():
184 | 
185 |     parser = argparse.ArgumentParser()
186 |     parser.add_argument("--metric", type=str,
187 |                         choices=sorted([m.value for m in Metrics]),
188 |                         required=True,
189 |                         help="The metric to generate datasets for")
190 |     parser.add_argument("--input", type=str, required=True,
191 |                         help="Full path to input JSONL file")
192 |     parser.add_argument("--output", type=str, required=True,
193 |                         help="Full path to output directory")
194 |     parser.add_argument("--parallel", action="store_true",
195 |                         help="Run in parallel where possible (default false)")
196 |     parser.add_argument("--debug", action="store_true",
197 |                         help="Turn debugging on (default: false)")
198 |     args = parser.parse_args()
199 |     metric = args.metric
200 |     input_fp = args.input
201 |     output_fp = os.path.join(args.output, f"{metric}.jsonl")
202 |     run_parallel = args.parallel
203 | 
204 |     _ = load_dotenv(find_dotenv())
205 | 
206 |     logger = logging.getLogger(__name__)
207 |     logger.setLevel(logging.DEBUG if args.debug else logging.INFO)
208 | 
209 |     model = ChatGoogleGenerativeAI(
210 |         model="gemini-pro",
211 |         api_key=os.environ["GOOGLE_API_KEY"],
212 |         temperature=0.0,
213 |         safety_settings=safety_settings)
214 |     encoder = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
215 | 
216 |     os.makedirs(args.output, exist_ok=True)
217 | 
218 |     with open(input_fp, "r", encoding="utf-8") as fin, \
219 |          open(output_fp, "w", encoding="utf-8") as fout:
220 |         for line in fin:
221 |             record = json.loads(line)
222 |             id = record["id"]
223 |             # if int(id) != 14:
224 |             #     continue
225 |             question = record["query"]
226 |             context = [ctx["chunk_text"] for ctx in record["context"]]
227 |             answer = record["predicted_answer"]
228 |             ideal_answer = record["ideal_answer"]
229 | 
230 |             logger.info(f"Processing query ({id}): {question}")
231 | 
232 |             match Metrics(metric):
233 |                 case Metrics.FAITHFULNESS:
234 |                     await generate_faithfulness_dataset(
235 |                         id, question, answer, context, run_parallel,
236 |                         model, logger, fout)
237 |                 case Metrics.ANSWER_RELEVANCE:
238 |                     await generate_answer_relevance_dataset(
239 |                         id, question, context, answer, run_parallel,
240 |                         model, encoder, logger, fout)
241 |                 case Metrics.CONTEXT_PRECISION:
242 |                     await generate_context_precision_dataset(
243 |                         id, question, answer, context, run_parallel,
244 |                         model, logger, fout)
245 |                 case Metrics.CONTEXT_RELEVANCE:
246 |                     await generate_context_relevance_dataset(
247 |                         id, question, context, run_parallel, model,
248 |                         logger, fout)
249 |                 case Metrics.CONTEXT_RECALL:
250 |                     await generate_context_recall_dataset(
251 |                         id, context, answer, run_parallel, model,
252 |                         logger, fout)
253 |                 case Metrics.ANSWER_SIMILARITY:
254 |                     raise NotImplementedError(
255 |                         "Use prompted version of answer similarity")
256 |                 case Metrics.ANSWER_CORRECTNESS:
257 |                     await generate_answer_correctness_dataset(
258 |                         id, answer, ideal_answer, model, logger, fout)
259 |                 case _:
260 |                     pass
261 | 
262 | 
# Script entry point: run the async command-line driver to completion.
if __name__ == "__main__":
    asyncio.run(runner())
265 | 


--------------------------------------------------------------------------------
/src/learned/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sujitpal/llm-rag-eval/8936ae484d7ab78c9debfcdabc3a2f14eea38ac2/src/learned/__init__.py


--------------------------------------------------------------------------------
/src/learned/answer_correctness.py:
--------------------------------------------------------------------------------
  1 | import dspy
  2 | import json
  3 | import os
  4 | 
  5 | from typing import Dict
  6 | 
  7 | from .learning_utils import score_metric, optimize_prompt
  8 | 
  9 | 
# Base directories. Both are relative paths, so they resolve against the
# working directory of the running process.
DATA_DIR = "../data"
RESOURCE_DIR = "../resources"

# Location of the answer-correctness JSONL dataset and of the configs
# directory; DATASET_FP and CONFIGS_DIR are passed to optimize_prompt() by
# compute_answer_correctness().
DATASET_DIR = os.path.join(DATA_DIR, "dspy-datasets")
DATASET_FP = os.path.join(DATASET_DIR, "answer_correctness.jsonl")
CONFIGS_DIR = os.path.join(RESOURCE_DIR, "configs")
# BEST_CONFIG and DEBUG are not referenced anywhere else in this module.
BEST_CONFIG = os.path.join(CONFIGS_DIR, "answer_correctness-best.json")
DEBUG = False
 18 | 
 19 | 
# DSPy signature: one input (answer text) and one output (extracted facts).
# NOTE: DSPy uses a signature's class docstring as the instruction sent to
# the model, so the docstring below is runtime text -- change it only to
# change the prompt.
class AnswerToFacts(dspy.Signature):
    """ Extract 5-10 facts from the answer"""
    answer = dspy.InputField(desc="answer to extract facts from")
    facts = dspy.OutputField(desc="facts extracted from answer")
 24 | 
 25 | 
# DSPy signature: takes the ground-truth facts and the answer facts and asks
# for a JSON string keyed by TP / FP / FN; AnswerCorrectness parses that JSON
# and counts the facts under each key.
# NOTE: DSPy uses a signature's class docstring as the instruction sent to
# the model, so the docstring below is runtime text -- change it only to
# change the prompt.
class FactGroupings(dspy.Signature):
    """ Classify each answer fact with respect to ground truth fact as follows:
        - TP (true positive): fact is present in both answer and ground truth
        - FP (false positive): fact is present in answer but not in ground truth
        - FN (false negative): fact is present in ground truth not in answer.
        Group the input facts as numbered lists under these 3 categories
    """
    facts_g = dspy.InputField(desc="facts from ground truth", format=str)
    facts_a = dspy.InputField(desc="facts from answer", format=str)
    fact_groups = dspy.OutputField(
        desc="groupings of facts as JSON string with keys TP, FP, FN and "
             "list of facts for each as values")
 38 | 
 39 | 
 40 | class AnswerCorrectness(dspy.Module):
 41 |     def __init__(self):
 42 |         super().__init__()
 43 |         self.fact_extractor = dspy.Predict(AnswerToFacts)
 44 |         self.fact_grouper = dspy.ChainOfThought(FactGroupings)
 45 | 
 46 |     def _parse_json_response(self, fact_groups: str) -> Dict[str, int]:
 47 |         fact_groups_j = json.loads(fact_groups
 48 |                                    .replace("```json", "")
 49 |                                    .replace("```", ""))
 50 |         return {k: len(v) for k, v in fact_groups_j.items()}
 51 | 
 52 |     def forward(self, answer, ground_truth):
 53 |         dspy.logger.debug(f"input answer: {answer}, ground_truth: {ground_truth}")
 54 |         facts_g = self.fact_extractor(answer=ground_truth).facts
 55 |         dspy.logger.debug(f"facts from ground truth: {facts_g}")
 56 |         facts_a = self.fact_extractor(answer=answer).facts
 57 |         dspy.logger.debug(f"facts from answer: {facts_a}")
 58 |         fact_groups = self.fact_grouper(
 59 |             facts_g=facts_g, facts_a=facts_a).fact_groups
 60 |         dspy.logger.debug(f"fact groups: {fact_groups}")
 61 |         group_counts = self._parse_json_response(fact_groups)
 62 |         tp = group_counts.get("TP", 0)
 63 |         fp = group_counts.get("FP", 0)
 64 |         fn = group_counts.get("FN", 0)
 65 |         score = tp / (tp + 0.5 * (fp + fn)) if tp > 0 else 0.0
 66 |         dspy.logger.debug(f"tp: {tp}, fp: {fp}, fn: {fn}, score: {score}")
 67 |         return dspy.Prediction(score=score)
 68 | 
 69 | 
 70 | def answer_correctness_dataset(file_path):
 71 |     if not os.path.exists(file_path):
 72 |         raise FileNotFoundError(
 73 |             f"answer correctness dataset {file_path} not found, "
 74 |             f"create it using generate_datasets.py first.")
 75 | 
 76 |     examples = []
 77 |     with open(file_path, "r") as fin:
 78 |         for line in fin:
 79 |             record = json.loads(line)
 80 |             answer = record["answer"]
 81 |             ground_truth = record["ideal_answer"]
 82 |             score = record["score"]
 83 |             examples.append(dspy.Example(
 84 |                 answer=answer, ground_truth=ground_truth, score=str(score))
 85 |                 .with_inputs("answer", "ground_truth"))
 86 |     return examples
 87 | 
 88 | 
 89 | def compute_answer_correctness(answer: str,
 90 |                                ideal_answer: str,
 91 |                                prompts_dict) -> float:
 92 |     try:
 93 |         answer_correctness_opt = prompts_dict["answer_correctness"]
 94 |     except KeyError:
 95 |         answer_correctness_opt = optimize_prompt("answer_correctness",
 96 |                                                  CONFIGS_DIR,
 97 |                                                  answer_correctness_dataset,
 98 |                                                  DATASET_FP,
 99 |                                                  score_metric,
100 |                                                  AnswerCorrectness())
101 |         prompts_dict["answer_correctness"] = answer_correctness_opt
102 |     pred = answer_correctness_opt(answer=answer, ground_truth=ideal_answer)        
103 |     return float(pred.score)
104 | 


--------------------------------------------------------------------------------
/src/learned/answer_relevance.py:
--------------------------------------------------------------------------------
  1 | import dspy
  2 | import json
  3 | import numpy as np
  4 | import os
  5 | 
  6 | from typing import List
  7 | 
  8 | from .learning_utils import (
  9 |     string_to_list, list_to_string, string_to_bool,
 10 |     score_metric, optimize_prompt
 11 | )
 12 | 
 13 | 
# Base directories. Both are relative paths, so they resolve against the
# working directory of the running process.
DATA_DIR = "../data"
RESOURCE_DIR = "../resources"

# Location of the answer-relevance JSONL dataset and of the configs
# directory; DATASET_FP and CONFIGS_DIR are passed to optimize_prompt() by
# compute_answer_relevance().
DATASET_DIR = os.path.join(DATA_DIR, "dspy-datasets")
DATASET_FP = os.path.join(DATASET_DIR, "answer_relevance.jsonl")
CONFIGS_DIR = os.path.join(RESOURCE_DIR, "configs")
 20 | 
 21 | 
# DSPy signature: answer and context in, generated questions out (as one
# string that AnswerRelevance.forward splits with string_to_list()).
# NOTE: DSPy uses a signature's class docstring as the instruction sent to
# the model, so the docstring below is runtime text -- change it only to
# change the prompt.
class AnswerContextToGenQuestions(dspy.Signature):
    """ Given the answer and context, generate 3 to 5 questions that can be 
        answered by the answer. """
    answer: str = dspy.InputField(desc="the answer")
    context: str = dspy.InputField(desc="the context of answer")
    gen_questions: str = dspy.OutputField(
        desc="list of questions that can be answered by answer")
 29 | 
 30 | 
# DSPy signature: question and context in; a generated answer plus a yes/no
# "noncommital" verdict out. AnswerRelevance.forward reads only the verdict.
# NOTE: DSPy uses a signature's class docstring as the instruction sent to
# the model, so the docstring below is runtime text -- change it only to
# change the prompt.
class QuestionContextGenQToNonCommital(dspy.Signature):
    """ Given a question and its context, use only the context to generate
        an answer, and classify if the answer is commital or noncommital.
    """
    question: str = dspy.InputField(desc="the question")
    context: str = dspy.InputField(desc="the context of question")
    answer: str = dspy.OutputField(desc="the generated answer")
    noncommital: str = dspy.OutputField(desc="yes or no")
 39 | 
 40 | 
 41 | class AnswerRelevance(dspy.Module):
 42 |     def __init__(self, encoder):
 43 |         super().__init__()
 44 |         self.question_generator = dspy.Predict(
 45 |             AnswerContextToGenQuestions)
 46 |         self.answer_classifier = dspy.ChainOfThought(
 47 |             QuestionContextGenQToNonCommital)
 48 |         self.encoder = encoder
 49 | 
 50 |     def _cosine_similarity(self, source, targets):
 51 |         source = source.reshape(1, -1)
 52 |         sims = np.dot(source, targets.T) / (
 53 |             np.linalg.norm(source) * np.linalg.norm(targets, axis=1))
 54 |         return np.mean(sims)
 55 | 
 56 |     def _compute_score(self, q_list: List[str]):
 57 |         embeddings = self.encoder.embed_documents(q_list)
 58 |         E = np.array(embeddings)
 59 |         source, targets = E[0, :], E[1:, :]
 60 |         if len(targets) == 0:
 61 |             return 0.0
 62 |         return self._cosine_similarity(source, targets)
 63 |     
 64 |     def forward(self, question: str, answer: str, context: str):
 65 |         dspy.logger.debug(f"input question: {question}, answer: {answer}, "
 66 |                           f"context: {context}")
 67 |         gen_questions = self.question_generator(
 68 |             answer=answer, context=context).gen_questions
 69 |         dspy.logger.debug(f"gen_questions: {gen_questions}")
 70 |         q_list = [question]
 71 |         for gen_q in string_to_list(gen_questions):
 72 |             ans_cls = self.answer_classifier(question=gen_q, context=context)
 73 |             noncommital = ans_cls.noncommital
 74 |             if not string_to_bool(noncommital, choices=["yes", "no"]):
 75 |                 q_list.append(gen_q)
 76 |         dspy.logger.debug(f"q_list: {q_list}")
 77 |         score = self._compute_score(q_list)
 78 |         dspy.logger.debug(f"score: {score}")
 79 |         return dspy.Prediction(score=str(score))
 80 | 
 81 | 
 82 | def answer_relevance_dataset(file_path):
 83 |     if not os.path.exists(file_path):
 84 |         raise FileNotFoundError(
 85 |             f"answer relevance dataset: {file_path} not found, "
 86 |             f"create it with generate_datasets.py first.")
 87 |     examples = []
 88 |     with open(file_path, "r", encoding="utf-8") as fin:
 89 |         for line in fin:
 90 |             record = json.loads(line)
 91 |             question = record["question"]
 92 |             answer = record["answer"]
 93 |             context = list_to_string(record["context"], style="number")
 94 |             score = record["score"]
 95 |             examples.append(dspy.Example(
 96 |                 question=question, answer=answer,
 97 |                 context=context, score=score)
 98 |                 .with_inputs("question", "answer", "context"))
 99 |     return examples
100 | 
101 | 
def compute_answer_relevance(question: str,
                             context: List[str],
                             answer: str,
                             prompts_dict, 
                             encoder):
    """Score how relevant ``answer`` is to ``question``.

    The optimized program is looked up in ``prompts_dict`` under
    ``"answer_relevance"``; if absent it is built with ``optimize_prompt``
    and stored back in ``prompts_dict`` for reuse.

    Args:
        question: the query text.
        context: context chunk texts; rendered as a numbered string.
        answer: the predicted answer.
        prompts_dict: mutable mapping caching optimized programs by name.
        encoder: embedding model, used only when the program must be built.

    Returns:
        The program's ``score`` converted to ``float``.
    """
    cache_key = "answer_relevance"
    try:
        program = prompts_dict[cache_key]
    except KeyError:
        program = optimize_prompt(cache_key,
                                  CONFIGS_DIR,
                                  answer_relevance_dataset,
                                  DATASET_FP,
                                  score_metric,
                                  AnswerRelevance(encoder=encoder))
        prompts_dict[cache_key] = program
    dspy.logger.debug(f"context: {context}")
    numbered_context = list_to_string(context, style="number")
    prediction = program(
        question=question, answer=answer, context=numbered_context)
    return float(prediction.score)
122 | 


--------------------------------------------------------------------------------
/src/learned/context_precision.py:
--------------------------------------------------------------------------------
 1 | import dspy
 2 | import json
 3 | import os
 4 | 
 5 | from typing import List
 6 | 
 7 | from .learning_utils import (
 8 |     list_to_string, string_to_bool, score_metric, optimize_prompt
 9 | )
10 | 
11 | 
# Base directories. Both are relative paths, so they resolve against the
# working directory of the running process.
DATA_DIR = "../data"
RESOURCE_DIR = "../resources"

# Location of the context-precision JSONL dataset and of the configs
# directory.
DATASET_DIR = os.path.join(DATA_DIR, "dspy-datasets")
DATASET_FP = os.path.join(DATASET_DIR, "context_precision.jsonl")
CONFIGS_DIR = os.path.join(RESOURCE_DIR, "configs")
18 | 
19 | 
# DSPy signature: question, answer and one context string in; a yes/no
# usefulness verdict out (ContextPrecision converts it with string_to_bool()).
# NOTE: DSPy uses a signature's class docstring as the instruction sent to
# the model, so the docstring below is runtime text -- change it only to
# change the prompt.
class QuestionAnswerContextToUseful(dspy.Signature):
    """ Given a question, an answer to the question, and supporting context,
        provide a yes/no score indicating if the context was useful for
        answering the question."""
    question: str = dspy.InputField(desc="the question")
    answer: str = dspy.InputField(desc="answer to question")
    context: str = dspy.InputField(
        desc="supporting context used to answer question")
    score: str = dspy.OutputField(desc="yes or no")
29 | 
30 | 
class ContextPrecision(dspy.Module):
    """ Score how useful the retrieved context items were for the answer.

        Every context item is classified yes/no by an LLM call and the
        verdicts are combined into one rank-aware score in [0, 1].
    """

    def __init__(self):
        # Initialise the dspy.Module base class, as the sibling metric
        # modules (ContextRecall, ContextRelevance, Faithfulness) do.
        super().__init__()
        self.model = None
        self.usefulness_classifier = dspy.ChainOfThought(
            QuestionAnswerContextToUseful)

    def forward(self, question: str, answer: str,
                context: List[str]) -> dspy.Prediction:
        """ Compute the context precision score.

            :param question: the question
            :param answer: the answer given to the question
            :param context: context items, in retrieval order
            :return: prediction whose ``score`` is the score as a string
                ("0.0" when there are no context items)
        """
        dspy.logger.debug(f"input question: {question}, answer: {answer}, "
                          f"context: {context}")
        scores = []
        for ctx in context:
            pred = self.usefulness_classifier(question=question,
                                              answer=answer,
                                              context=ctx)
            scores.append(string_to_bool(pred.score, choices=["yes", "no"]))
        dspy.logger.debug(f"scores: {scores}")
        score = 0.0
        if scores:
            # weight at rank k is precision@k, zeroed for non-useful items;
            # a running count replaces re-summing the prefix at every rank
            weights = []
            num_useful = 0
            for rank, useful in enumerate(scores, start=1):
                num_useful += useful
                weights.append(num_useful / rank * useful)
            dspy.logger.debug(f"weights: {weights}")
            # NOTE(review): normalised by the number of context items, not
            # by the number of useful ones -- confirm this is intended.
            score = (sum(w * s for w, s in
                         zip(weights, scores)) / len(scores))
        dspy.logger.debug(f"score: {score}")
        return dspy.Prediction(score=str(score))
57 | 
58 | 
def context_precision_dataset(file_path):
    """ Load the context precision training examples from a JSONL file.

        Each record supplies ``question``, ``context``, ``answer`` and
        ``score``. The context is kept as a list of strings because
        ContextPrecision.forward iterates over it item by item; flattening
        it to a single string would make the module iterate characters.

        :param file_path: path to the JSONL dataset
        :return: list of dspy.Example with question, context and answer
            marked as inputs
        :raises FileNotFoundError: if file_path does not exist
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(
            f"context precision dataset: {file_path} not found, "
            f"create it with generate_datasets.py first.")
    examples = []
    with open(file_path, "r", encoding="utf-8") as fin:
        for line in fin:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue
            record = json.loads(line)
            examples.append(dspy.Example(
                question=record["question"],
                context=record["context"],
                answer=record["answer"],
                score=str(record["score"]))
                .with_inputs("question", "context", "answer"))
    return examples
77 | 
78 | 
def compute_context_precision(question: str,
                              answer: str,
                              context: List[str],
                              prompts_dict):
    """ Return the context precision score as a float.

        The program stored in prompts_dict under "context_precision" is
        reused when present; otherwise one is obtained from
        optimize_prompt() and stored there for subsequent calls.

        :param question: the question
        :param answer: the answer to the question
        :param context: list of context items
        :param prompts_dict: mapping of metric name to program (updated)
        :return: the score as a float
    """
    key = "context_precision"
    if key not in prompts_dict:
        prompts_dict[key] = optimize_prompt("context_precision",
                                            CONFIGS_DIR,
                                            context_precision_dataset,
                                            DATASET_FP,
                                            score_metric,
                                            ContextPrecision())
    program = prompts_dict[key]
    prediction = program(question=question, answer=answer, context=context)
    return float(prediction.score)
97 | 


--------------------------------------------------------------------------------
/src/learned/context_recall.py:
--------------------------------------------------------------------------------
  1 | import dspy
  2 | import json
  3 | import nltk
  4 | import numpy as np
  5 | import os
  6 | 
  7 | from typing import List
  8 | 
  9 | from .learning_utils import string_to_bool_array, score_metric, optimize_prompt
 10 | 
 11 | 
# Relative locations of the data and resources directories.
DATA_DIR = "../data"
RESOURCE_DIR = "../resources"

# JSONL dataset read by context_recall_dataset() and the directory that
# optimize_prompt() uses for the context_recall prompt configs.
DATASET_DIR = os.path.join(DATA_DIR, "dspy-datasets")
DATASET_FP = os.path.join(DATASET_DIR, "context_recall.jsonl")
CONFIGS_DIR = os.path.join(RESOURCE_DIR, "configs")
# Path of the "-best" config for this metric.
BEST_CONFIG = os.path.join(CONFIGS_DIR, "context_recall-best.json")
 19 | 
 20 | 
# Signature used by ContextRecall: one call per context item, with the
# answer sentences passed together. The docstring and field descriptions
# belong to the signature definition and are left exactly as written.
class ContextItemAnswerToScore(dspy.Signature):
    """ Given a context item and an answer, for each sentence in the answer,
        classify if the sentence can be attributed to the context item. """
    # ContextRecall passes a list of sentences here; format=str is applied
    # to that value.
    answer = dspy.InputField(desc="the answer", format=str)
    context_item = dspy.InputField(desc="the context item")
    # Free text; the yes/no occurrences are counted by string_to_bool_array().
    scores = dspy.OutputField(
        desc="yes/no for each answer sentence if it is attributale to context")
 28 | 
 29 | 
 30 | class ContextRecall(dspy.Module):
 31 |     def __init__(self):
 32 |         super().__init__()
 33 |         self.attrib_clf = dspy.ChainOfThought(ContextItemAnswerToScore)
 34 | 
 35 |     def forward(self, context: List[str], answer: str):
 36 |         dspy.logger.debug(f"input context: {context}, answer: {answer}")
 37 |         answer_sents = [sent for sent
 38 |                         in nltk.sent_tokenize(answer.replace("\n", ""))
 39 |                         if len(sent.strip()) > 0][0:10]
 40 |         dspy.logger.debug(f"answer sentences: {answer_sents}")
 41 |         scores = []
 42 |         for context_item in context:
 43 |             if len(context_item.strip()) < 10:
 44 |                 continue
 45 |             ctx_score = 0.0
 46 |             try:
 47 |                 ctx_scores = self.attrib_clf(
 48 |                     answer=answer_sents,
 49 |                     context_item=context_item).scores
 50 |                 num_pos, num_neg = string_to_bool_array(
 51 |                     ctx_scores, choices=["yes", "no"])
 52 |                 if num_pos + num_neg > 0:
 53 |                     ctx_score = num_pos / (num_pos + num_neg)
 54 |             except Exception:
 55 |                 pass
 56 |             # print(f"context: {context_item}, score: {ctx_score}")
 57 |             scores.append(ctx_score)
 58 |         dspy.logger.debug(f"scores: {scores}")
 59 |         score = 0.0
 60 |         if len(scores) > 0:
 61 |             score = np.mean(scores)
 62 |         dspy.logger.debug(f"score: {score}")
 63 |         return dspy.Prediction(score=str(score))
 64 | 
 65 | 
 66 | def context_recall_dataset(file_path):
 67 |     if not os.path.exists(file_path):
 68 |         raise FileNotFoundError(
 69 |             f"context recall dataset: {file_path} not found, "
 70 |             "create it with generate_datasets.py first.")
 71 |     examples = []
 72 |     with open(file_path, "r", encoding="utf-8") as fin:
 73 |         for line in fin:
 74 |             record = json.loads(line)
 75 |             answer = record["answer"]
 76 |             context = record["context"]
 77 |             score = record["score"]
 78 |             examples.append(dspy.Example(
 79 |                 answer=answer,
 80 |                 context=context,
 81 |                 score=str(score))
 82 |                 .with_inputs("answer", "context"))
 83 |     return examples
 84 | 
 85 | 
 86 | def compute_context_recall(context: List[str],
 87 |                            answer: str,
 88 |                            prompts_dict):
 89 |     try:
 90 |         context_recall_opt = prompts_dict["context_recall"]
 91 |     except KeyError:
 92 |         context_recall_opt = optimize_prompt("context_recall",
 93 |                                              CONFIGS_DIR,
 94 |                                              context_recall_dataset,
 95 |                                              DATASET_FP,
 96 |                                              score_metric,
 97 |                                              ContextRecall())
 98 |         prompts_dict["context_recall"] = context_recall_opt
 99 |     pred = context_recall_opt(context=context, answer=answer)
100 |     return float(pred.score)
101 | 


--------------------------------------------------------------------------------
/src/learned/context_relevance.py:
--------------------------------------------------------------------------------
  1 | import dspy
  2 | import json
  3 | import nltk
  4 | import os
  5 | import time
  6 | 
  7 | import ssl
  8 | 
# Make sure the NLTK "punkt" tokenizer data is available at import time,
# downloading it when nltk.data.find() cannot locate it.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        # this ssl module has no _create_unverified_context; keep defaults
        pass
    else:
        # NOTE(review): this replaces ssl's default HTTPS context factory
        # with the unverified one for the rest of the process, not only
        # for the download below -- confirm that is acceptable.
        ssl._create_default_https_context = _create_unverified_https_context

    nltk.download('punkt')
 20 | 
 21 | from typing import List
 22 | 
 23 | from .learning_utils import (
 24 |     list_to_string, string_to_bool, score_metric,
 25 |     optimize_prompt
 26 | )
 27 | 
 28 | 
# Relative locations of the data and resources directories.
DATA_DIR = "../data"
RESOURCE_DIR = "../resources"

# JSONL dataset read by context_relevance_dataset() and the directory that
# optimize_prompt() uses for the context_relevance prompt configs.
DATASET_DIR = os.path.join(DATA_DIR, "dspy-datasets")
DATASET_FP = os.path.join(DATASET_DIR, "context_relevance.jsonl")
CONFIGS_DIR = os.path.join(RESOURCE_DIR, "configs")
 35 | 
 36 | 
 37 | class QuestionCtxSentToScore(dspy.Signature):
 38 |     """ Given a question and a sentence from the context, classify
 39 |         if sentence is absolutely necessary to answer question
 40 |     """
 41 |     question: str = dspy.InputField(desc="the question")
 42 |     ctx_sent: str = dspy.InputField(desc="a sentence from the context")
 43 |     score: float = dspy.OutputField(desc="yes or no")
 44 | 
 45 | 
 46 | class ContextRelevance(dspy.Module):
 47 |     def __init__(self):
 48 |         super().__init__()
 49 |         self.nec_classifier = dspy.ChainOfThought(QuestionCtxSentToScore)
 50 | 
 51 |     def forward(self, question: str, context: List[str]):
 52 |         dspy.logger.debug(f"input question: {question}, context: {context}")
 53 |         ctx_scores = []
 54 |         for ctx in context:
 55 |             sent_scores = []
 56 |             for ctx_sent in nltk.sent_tokenize(ctx):
 57 |                 score = self.nec_classifier(question=question,
 58 |                                             ctx_sent=ctx_sent).score
 59 |                 sent_scores.append(string_to_bool(score, choices=["yes", "no"]))
 60 |             if len(sent_scores) == 0:
 61 |                 ctx_scores.append(0.0)
 62 |             else:
 63 |                 ctx_scores.append(sum(sent_scores) / len(sent_scores))
 64 |             # to prevent ResourceExhaustedError
 65 |             time.sleep(0.3)
 66 |         dspy.logger.debug(f"context scores: {ctx_scores}")
 67 |         score = 0.0
 68 |         if len(ctx_scores) > 0:
 69 |             score = sum(ctx_scores) / len(ctx_scores)
 70 |         dspy.logger.debug(f"score: {score}")
 71 |         return dspy.Prediction(score=str(score))
 72 | 
 73 | 
 74 | def context_relevance_dataset(file_path: str):
 75 |     if not os.path.exists(file_path):
 76 |         raise FileNotFoundError(
 77 |             f"context relevance dataset: {file_path} not found, "
 78 |             "create it with generate_datasets.py first.")
 79 |     examples = []
 80 |     with open(file_path, "r", encoding="utf-8") as fin:
 81 |         for line in fin:
 82 |             record = json.loads(line)
 83 |             question = record["question"]
 84 |             context = record["context"]
 85 |             score = record["score"]
 86 |             examples.append(dspy.Example(
 87 |                 question=question,
 88 |                 context=list_to_string(context),
 89 |                 score=str(score)
 90 |             ).with_inputs("question", "context"))
 91 |     return examples
 92 | 
 93 | 
 94 | def compute_context_relevance(question: str,
 95 |                               context: List[str],
 96 |                               prompts_dict):
 97 |     try:
 98 |         context_relevance_opt = prompts_dict["context_relevance"]
 99 |     except KeyError:
100 |         context_relevance_opt = optimize_prompt("context_relevance",
101 |                                                 CONFIGS_DIR,
102 |                                                 context_relevance_dataset,
103 |                                                 DATASET_FP,
104 |                                                 score_metric,
105 |                                                 ContextRelevance())
106 |         prompts_dict["context_relevance"] = context_relevance_opt
107 |     pred = context_relevance_opt(question=question, context=context)
108 |     return float(pred.score)
109 | 


--------------------------------------------------------------------------------
/src/learned/faithfulness.py:
--------------------------------------------------------------------------------
 1 | import dspy
 2 | import json
 3 | import os
 4 | 
 5 | from typing import List
 6 | 
 7 | from .learning_utils import (
 8 |     list_to_string, string_to_list, string_to_bool,
 9 |     score_metric, optimize_prompt
10 | )
11 | 
12 | 
# Relative locations of the data and resources directories.
DATA_DIR = "../data"
RESOURCE_DIR = "../resources"

# JSONL dataset read by faithfulness_dataset() and the directory that
# optimize_prompt() uses for the faithfulness prompt configs.
DATASET_DIR = os.path.join(DATA_DIR, "dspy-datasets")
DATASET_FP = os.path.join(DATASET_DIR, "faithfulness.jsonl")
CONFIGS_DIR = os.path.join(RESOURCE_DIR, "configs")
19 | 
20 | 
# Signature used by Faithfulness to extract facts from an answer. The
# docstring and field descriptions belong to the signature definition and
# are left exactly as written.
class QuestAnswerToFacts(dspy.Signature):
    """ Given a question-answer pair, generate a list of 3-5 facts
        from the answer
    """
    question: str = dspy.InputField(desc="a question")
    answer: str = dspy.InputField(desc="an answer")
    # A single string; Faithfulness splits it into lines with string_to_list().
    facts: str = dspy.OutputField(desc="a list of facts")
28 | 
29 | 
# Signature used by Faithfulness to check one fact against the context.
# The docstring and field descriptions belong to the signature definition
# and are left exactly as written.
class ContextFactsToScore(dspy.Signature):
    """ Classify if fact can be inferred from context """
    context: str = dspy.InputField(desc="a context")
    fact: str = dspy.InputField(desc="a fact")
    # The model answers "yes" or "no" and the caller parses the text with
    # string_to_bool(), so the field is a string, not a bool.
    score: str = dspy.OutputField(
        desc="can fact be inferred from context? yes or no")
36 | 
37 | 
class Faithfulness(dspy.Module):
    """ Score the fraction of answer facts that can be inferred from context.

        Facts are first extracted from the question/answer pair, then each
        fact is classified yes/no against the context.
    """

    def __init__(self):
        super().__init__()
        self.extractor = dspy.Predict(QuestAnswerToFacts)
        self.scorer = dspy.Predict(ContextFactsToScore)

    def forward(self, question: str, answer: str, context: str):
        """ Compute the faithfulness score.

            :param question: the question
            :param answer: the answer to the question
            :param context: the context as a single string
            :return: dspy.Prediction whose ``score`` is the score as a
                string ("0.0" when no fact could be extracted)
        """
        dspy.logger.debug(f"input question: {question}, answer: {answer}, "
                          f"context: {context}")
        facts = self.extractor(question=question, answer=answer).facts
        dspy.logger.debug(f"facts: {facts}")
        scores = []
        for fact in string_to_list(facts):
            # blank lines in the extractor output are not facts; scoring
            # them would cost an LLM call and distort the average
            if not fact.strip():
                continue
            can_infer = self.scorer(context=context, fact=fact).score
            scores.append(string_to_bool(can_infer, ["yes", "no"]))
        dspy.logger.debug(f"scores: {scores}")
        # guard against an empty fact list, like the other metric modules
        score = sum(scores) / len(scores) if scores else 0.0
        dspy.logger.debug(f"score: {score}")
        return dspy.Prediction(score=str(score))
57 | 
58 | 
def faithfulness_dataset(file_path):
    """ Load the faithfulness training examples from a JSONL file.

        The ``context`` list of every record is flattened to one numbered
        string with list_to_string().

        :param file_path: path to the JSONL dataset; every line must hold
            ``question``, ``answer``, ``context`` and ``score``
        :return: list of dspy.Example with question, answer and context
            marked as inputs
        :raises FileNotFoundError: if file_path does not exist
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(
            f"Faithfulness dataset: {file_path} not found, "
            "create it with generate_datasets.py first.")

    def _to_example(raw_line):
        rec = json.loads(raw_line)
        q_text = rec["question"]
        a_text = rec["answer"]
        ctx_text = list_to_string(rec["context"], style="number")
        label = str(rec["score"])
        example = dspy.Example(question=q_text, answer=a_text,
                               context=ctx_text, score=label)
        return example.with_inputs("question", "answer", "context")

    with open(file_path, "r", encoding="utf-8") as fin:
        return [_to_example(raw_line) for raw_line in fin]
79 | 
80 | 
def compute_faithfulness(question: str,
                         answer: str,
                         context: List[str],
                         prompts_dict):
    """ Return the faithfulness score as a float.

        The program stored in prompts_dict under "faithfulness" is reused
        when present; otherwise one is obtained from optimize_prompt() and
        stored there for subsequent calls. The context list is flattened
        to a numbered string before the program is called.

        :param question: the question
        :param answer: the answer to the question
        :param context: list of context items
        :param prompts_dict: mapping of metric name to program (updated)
        :return: the score as a float
    """
    key = "faithfulness"
    if key not in prompts_dict:
        prompts_dict[key] = optimize_prompt("faithfulness",
                                            CONFIGS_DIR,
                                            faithfulness_dataset,
                                            DATASET_FP,
                                            score_metric,
                                            Faithfulness())
    program = prompts_dict[key]
    numbered_context = list_to_string(context, style="number")
    prediction = program(question=question, answer=answer,
                         context=numbered_context)
    return float(prediction.score)
99 | 


--------------------------------------------------------------------------------
/src/learned/learning_utils.py:
--------------------------------------------------------------------------------
  1 | import dspy
  2 | import glob
  3 | import re
  4 | import numpy as np
  5 | import os
  6 | import shutil
  7 | 
  8 | from dspy.evaluate import Evaluate
  9 | from dspy.teleprompt import BootstrapFewShotWithRandomSearch
 10 | from sklearn.model_selection import train_test_split
 11 | from typing import Callable, List
 12 | 
 13 | 
# Bullet character written by list_to_string() for each non-numbered style.
STYLE_TO_CHAR = {
    "dash": "-",
    "star": "*",
}
# Matches a numbered bullet ("1. text" or "1 text"); group 2 is the text.
NUM_BULLET_PATTERM = re.compile(r"^\d+(\.)?\s(.*?)$")
# Matches a "*" or "-" bullet followed by whitespace; group 1 is the text.
GEN_BULLET_PATTERN = re.compile(r"^[*-]\s(.*?)$")
 20 | 
 21 | 
 22 | def string_to_list(xs: str, style: str = "dash") -> List[str]:
 23 |     """ Convert a string with embedded newlines to a list of strings
 24 |         separated by newlines.
 25 | 
 26 |         :param xs: string with embedded newlines
 27 |         :param style: bullet style to strip (dash, star, number)
 28 |         :return: a list of strings
 29 |     """
 30 |     stripped_xs = []
 31 |     for x in xs.split("\n"):
 32 |         if NUM_BULLET_PATTERM.match(x):
 33 |             stripped_x = NUM_BULLET_PATTERM.match(x).group(2)
 34 |         elif GEN_BULLET_PATTERN.match(x):
 35 |             stripped_x = GEN_BULLET_PATTERN.match(x).group(1)
 36 |         else:
 37 |             stripped_x = x
 38 |         stripped_xs.append(stripped_x)
 39 |     return stripped_xs
 40 | 
 41 | 
 42 | def list_to_string(xs: List[str], style: str = "dash") -> str:
 43 |     """ Convert a list of strings to a string with embedded newlines.
 44 |         Strips (known) bullet chars off each string element and
 45 |         adds in the appropriate bullet specified by the style.
 46 | 
 47 |         :param xs: list of strings
 48 |         :param style: bullet style to strip (dash, star, number)
 49 |         :return: a string with embedded newlines
 50 |     """
 51 |     stripped_xs = []
 52 |     for i, x in enumerate(xs):
 53 |         if NUM_BULLET_PATTERM.match(x):
 54 |             stripped_x = NUM_BULLET_PATTERM.match(x).group(2)
 55 |         elif GEN_BULLET_PATTERN.match(x):
 56 |             stripped_x = GEN_BULLET_PATTERN.match(x).group(1)
 57 |         else:
 58 |             stripped_x = x
 59 |         if style == "number":
 60 |             stripped_x = f"{i+1}. {stripped_x}"
 61 |         elif style in ["dash", "star"]:
 62 |             bullet = STYLE_TO_CHAR.get(style, "-")
 63 |             stripped_x = f"{bullet} {stripped_x}"
 64 |         else:
 65 |             pass
 66 |         stripped_xs.append(stripped_x)
 67 |     return "\n".join(stripped_xs)
 68 | 
 69 | 
 70 | def string_to_bool(s: str, choices: List[str]) -> bool:
 71 |     """ Convert a string to a boolean given list of choices representing
 72 |         True and False response
 73 | 
 74 |         :param s: string to convert
 75 |         :param choices: list of strings ["True", "False]
 76 |         :return: boolean
 77 |     """
 78 |     matches = [i for i, c in enumerate(choices)
 79 |                if re.search(c.lower(), s.lower()) is not None]
 80 |     if len(matches) == 0:
 81 |         return False
 82 |     else:
 83 |         return True if matches[0] == 0 else False
 84 | 
 85 | 
 86 | def string_to_bool_array(s: str, choices: List[str]) -> List[bool]:
 87 |     """ Convert a string to a list of booleans given list of choices
 88 |         representing True and False response
 89 | 
 90 |         :param s: string to convert
 91 |         :param choices: list of strings ["True", "False]
 92 |         :return: counts for each choice found
 93 |     """
 94 |     patterns = [re.compile(c) for c in choices]
 95 |     counts = [len(re.findall(p, s)) for p in patterns]
 96 |     return counts
 97 | 
 98 | 
 99 | def strip_newlines(s: str) -> str:
100 |     """ Strip newlines from a string
101 | 
102 |         :param s: string to strip
103 |         :return: string with newlines stripped
104 |     """
105 |     s = re.sub(r"\n", " ", s)
106 |     s = re.sub(r"\s+", " ", s)
107 |     return s
108 | 
109 | 
def score_metric(example, pred, trace=None):
    """ Common score metric used in all learning metrics

        :param example: object whose ``score`` is the expected score
        :param pred: object whose ``score`` is the predicted score
        :param trace: when None, the closeness of the two scores is
            returned; otherwise the predicted score itself is returned
        :return: float
    """
    if trace is not None:
        return float(pred.score)    # inference
    expected = float(example.score)
    predicted = float(pred.score)
    return 1.0 - abs(expected - predicted)
116 | 
117 | 
def optimize_prompt(metric_name: str,
                    config_dir: str,
                    dataset_generator_fn: Callable,
                    dataset_fp: str,
                    score_metric: Callable,
                    metric_instance: dspy.Module):
    """ Return metric_instance loaded with the best prompt configuration.

        If ``<config_dir>/<metric_name>-best.json`` does not exist yet, the
        module is compiled with BootstrapFewShotWithRandomSearch on 70% of
        the dataset, up to 4 candidate programs are saved as
        ``<metric_name>-<idx>.json``, each is evaluated on the remaining
        30%, and the best-scoring file is copied to the ``-best`` path.
        Training is keyed on the ``-best`` file itself, so leftover
        candidate files without a ``-best`` file trigger a retrain rather
        than a failed load.

        :param metric_name: metric name, used as config file prefix
        :param config_dir: directory holding the config files
        :param dataset_generator_fn: callable taking dataset_fp and
            returning a list of examples
        :param dataset_fp: path to the dataset file
        :param score_metric: metric callable used for training and evaluation
        :param metric_instance: the dspy.Module to optimize
        :return: metric_instance after loading the best config
    """
    best_config_fp = os.path.join(config_dir, f"{metric_name}-best.json")

    if not os.path.exists(best_config_fp):
        teleprompter = BootstrapFewShotWithRandomSearch(
            metric=score_metric,
            max_bootstrapped_demos=2,
            max_labeled_demos=2,
            num_threads=1
        )
        examples = dataset_generator_fn(dataset_fp)
        trainset, devset = train_test_split(examples, test_size=0.3,
                                            random_state=42)
        print(
            f"{metric_name} dataset sizes: "
            f"{len(trainset)}, {len(devset)}, total: {len(examples)}")

        print("--- training ---")
        compiled = teleprompter.compile(metric_instance, trainset=trainset)
        # each candidate entry ends with the program itself
        ensemble = [prog for *_, prog in compiled.candidate_programs[:4]]

        os.makedirs(config_dir, exist_ok=True)
        # fresh list so its indexes line up with the evaluation scores
        config_paths = []
        for idx, prog in enumerate(ensemble):
            config_path = os.path.join(
                config_dir, f"{metric_name}-{idx}.json")
            config_paths.append(config_path)
            prog.save(config_path)

        print("--- evaluation ---")
        evaluate = Evaluate(devset=devset, metric=score_metric,
                            num_threads=1, display_progress=True)
        scores = [evaluate(prog) for prog in ensemble]
        print(f"Evaluation scores: {scores}")
        best_prompt_id = int(np.argmax(scores))
        shutil.copy(config_paths[best_prompt_id], best_config_fp)

    prog = metric_instance
    prog.load(best_config_fp)
    return prog
168 | 
169 | 
def clean_up_log_files():
    """ Remove the known usage/assertion log files from the current
        working directory, skipping any that do not exist.
    """
    candidates = ("openai_usage.log",
                  "azure_openai_usage.log",
                  "assertion.log")
    for path in filter(os.path.exists, candidates):
        os.remove(path)
177 | 


--------------------------------------------------------------------------------
/src/make_more_questions.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import asyncio
  3 | import json
  4 | import logging
  5 | import os
  6 | 
  7 | from dotenv import find_dotenv, load_dotenv
  8 | from enum import Enum
  9 | from langchain_google_genai import (
 10 |     ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
 11 |     )
 12 | 
 13 | from prompted.more_questions import compute_more_questions
 14 | 
 15 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
 16 | 
 17 | 
# Relative data directory and the reports directory used for the default
# output file of runner().
DATA_DIR = "../data"
REPORTS_DIR = os.path.join(DATA_DIR, "reports")

# Safety config: passed to ChatGoogleGenerativeAI in runner(). Each listed
# harm category uses the BLOCK_ONLY_HIGH threshold.

from google.generativeai.types.safety_types import HarmBlockThreshold, HarmCategory

safety_settings = {
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_ONLY_HIGH,
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_ONLY_HIGH
}
 31 | 
 32 | async def runner():
 33 | 
 34 |     parser = argparse.ArgumentParser()
 35 |     
 36 |     parser.add_argument("--input-jsonl", type=str, required=True,
 37 |                         help="Full path to evaluation data in JSONL format")
 38 |     parser.add_argument("--output-jsonl", type=str, required=False,
 39 |                         help="Full path to output file")
 40 |     parser.add_argument("--debug", action="store_true",
 41 |                         help="Turn debugging on (default: false)")
 42 |     parser.add_argument("--id-start", type=int, required=False,
 43 |                         help="The number that the question ids in the output should start with (default 0)")
 44 |     parser.add_argument("--multiplier", type=int, required=False,
 45 |                         help="The number of new questions to be generated PER question in input data (default 3)")
 46 |     parser.add_argument("--model-temp", type=float, required=False,
 47 |                         help="The temperature of the model - between 0.0 and 1.0 (default 0.0)")
 48 |     parser.add_argument("--max", type=int, required=False,
 49 |                         help="The maximum number of new questions to be generated total (no default)")
 50 |    
 51 |     args = parser.parse_args()
 52 |     input_fp = args.input_jsonl
 53 |     output_fp = args.output_jsonl
 54 |     if output_fp is None:
 55 |         output_fp = os.path.join(REPORTS_DIR, f"default_report.jsonl")
 56 |     debug = args.debug
 57 |     id_start = args.id_start
 58 |     if id_start is None:
 59 |         id_start = 0
 60 |     multiplier = args.multiplier
 61 |     if multiplier is None:
 62 |         multiplier = 3
 63 |     model_temp = args.model_temp
 64 |     if model_temp is None or model_temp > 1.0 or model_temp < 0.0:
 65 |         model_temp = 0.0
 66 |     maxq = args.max
 67 |     if maxq is None:
 68 |         maxq = 99999
 69 |     
 70 |     logger = logging.getLogger(__name__)
 71 |     logger.setLevel(logging.DEBUG if debug else logging.INFO)
 72 | 
 73 |     _ = load_dotenv(find_dotenv())
 74 | 
 75 |     model = ChatGoogleGenerativeAI(
 76 |         model="gemini-pro",
 77 |         api_key=os.environ["GOOGLE_API_KEY"],
 78 |         temperature=model_temp,
 79 |         safety_settings=safety_settings)
 80 | 
 81 |     os.makedirs(REPORTS_DIR, exist_ok=True)
 82 | 
 83 |     with open(input_fp, "r", encoding="utf-8") as fin, \
 84 |          open(output_fp, "w", encoding="utf-8") as fout:
 85 | 
 86 |         # fout.write("\t".join(["#QID", metric.upper()]) + "\n")
 87 |         q_counter = 0
 88 |         new_q_id = id_start
 89 |         for line in fin:
 90 |             record = json.loads(line)
 91 |             # extract relevant data to evaluate
 92 |             id = record["id"]
 93 |             # be done if hit max
 94 |             if q_counter + multiplier > maxq:
 95 |                 break
 96 |             question = record["query"]
 97 |             context = [ctx["chunk_text"] for ctx in record["context"]]
 98 |             answer = record["predicted_answer"]
 99 |             ideal_answer = record["ideal_answer"]
100 |             
101 |             more_questions = compute_more_questions(
102 |                 question, ideal_answer, multiplier, model, logger)
103 |             
104 |             q_counter = q_counter + multiplier
105 | 
106 |             for q in more_questions:
107 |                 q["id"] = new_q_id
108 |                 new_q_id += 1
109 |                 fout.write(json.dumps(q) + '\n')
110 | 
111 | 
112 | if __name__ == "__main__":
113 |     asyncio.run(runner())
114 | 


--------------------------------------------------------------------------------
/src/metrics.py:
--------------------------------------------------------------------------------
 1 | from enum import Enum
 2 | 
 3 | 
class Metrics(Enum):
    """Names of the evaluation metrics; each value is the metric's
    lower-case snake_case name."""
    FAITHFULNESS = "faithfulness"
    ANSWER_RELEVANCE = "answer_relevance"
    CONTEXT_PRECISION = "context_precision"
    CONTEXT_UTILIZATION = "context_utilization"
    CONTEXT_RELEVANCE = "context_relevance"
    CONTEXT_RECALL = "context_recall"
    ANSWER_SIMILARITY = "answer_similarity"
    ANSWER_CORRECTNESS = "answer_correctness"
13 | 
14 | 
15 | 


--------------------------------------------------------------------------------
/src/prompted/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sujitpal/llm-rag-eval/8936ae484d7ab78c9debfcdabc3a2f14eea38ac2/src/prompted/__init__.py


--------------------------------------------------------------------------------
/src/prompted/answer_correctness.py:
--------------------------------------------------------------------------------
 1 | from langchain_core.output_parsers import StrOutputParser
 2 | from langchain_core.language_models.chat_models import BaseChatModel
 3 | from langchain.prompts import PromptTemplate
 4 | from xml.sax.saxutils import escape
 5 | 
 6 | from .prompt_utils import read_template_from_file, parse_response
 7 | 
 8 | 
# Name of the prompt template that _do_classification() loads through
# read_template_from_file().
PROMPT_CLASSIFY_FACTS = "answer_correctness_1.txt"
10 | 
11 | 
def _do_classification(answer: str, ideal_answer: str,
                       model, logger):
    """Classify the statements of an answer against the ground truth.

    Fills the PROMPT_CLASSIFY_FACTS template with the XML-escaped answer
    and ideal answer, runs it through the model, and parses the response.

    :param answer: the answer to evaluate
    :param ideal_answer: the ground truth answer
    :param model: chat model placed in the prompt | model | parser chain
    :param logger: logger used for debug output
    :return: the ``classes`` entry of the parsed response value
    """
    prompt_template = read_template_from_file(PROMPT_CLASSIFY_FACTS)
    prompt = PromptTemplate(template=prompt_template,
                            input_variables=["answer", "ground_truth"])
    chain = prompt | model | StrOutputParser()
    response = chain.invoke({
        "answer": escape(answer),
        "ground_truth": escape(ideal_answer)
    })
    result = parse_response(response)
    # logging needs a %s placeholder for the argument; without one the
    # record fails to format instead of showing the result
    logger.debug("result: %s", result)
    classification = result.value["classes"]
    return classification
26 | 
27 | 
28 | def _get_statements_for_class(statements_dict, class_name):
29 |     try:
30 |         if statements_dict[class_name] is None:
31 |             return []
32 |         else:
33 |             return statements_dict[class_name]["sts"]["st"]
34 |     except KeyError:
35 |         return []
36 | 
37 | 
38 | def _compute_answer_correctness_score(statements_by_class_dict):
39 |     tp = len(statements_by_class_dict["TP"])
40 |     fp = len(statements_by_class_dict["FP"])
41 |     fn = len(statements_by_class_dict["FN"])
42 |     score = tp / (tp + 0.5 * (fp + fn)) if tp > 0 else 0.0
43 |     return score
44 | 
45 | 
def compute_answer_correctness(answer: str,
                               ideal_answer: str,
                               model: BaseChatModel,
                               logger) -> float:
    """Score an answer against the ideal answer.

    The answer's statements are classified into TP / FP / FN by the model
    and the score is computed from the sizes of the three groups.

    :param answer: the answer to evaluate
    :param ideal_answer: the ground truth answer
    :param model: chat model used for the classification
    :param logger: logger passed through to the classification step
    :return: the answer correctness score
    """
    classes = _do_classification(answer, ideal_answer, model, logger)
    grouped = {label: _get_statements_for_class(classes, label)
               for label in ["TP", "FP", "FN"]}
    return _compute_answer_correctness_score(grouped)
57 | 


--------------------------------------------------------------------------------
/src/prompted/answer_relevance.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import numpy as np
  3 | 
  4 | from langchain_core.output_parsers import StrOutputParser
  5 | from langchain_core.embeddings import Embeddings
  6 | from langchain_core.language_models.chat_models import BaseChatModel
  7 | from langchain.prompts import PromptTemplate
  8 | from pydantic import BaseModel, Field
  9 | from typing import List
 10 | 
 11 | from .prompt_utils import read_template_from_file, parse_response
 12 | 
 13 | 
# Prompt template file used to generate questions from an answer and context.
PROMPT_GEN_QUESTIONS = "answer_relevance_1.txt"
# Prompt template file used to answer each generated question from the
# context and classify that answer as committal or non-committal.
PROMPT_CLASSIFY_NONCOMMITTAL = "answer_relevance_2.txt"
 16 | 
 17 | 
class ClassifiedQAPair(BaseModel):
    """A generated question, its answer and its noncommittal flag.

    Instances are built from the ``qa_pair`` element of a parsed model
    response.
    """
    question: str = Field(alias="question", description="Generated question")
    answer: str = Field(alias="answer", description="answer to generated question")
    # kept as a string; callers compare it against "1"
    noncommittal: str = Field(alias="noncommittal", description="Noncommittal (0/1)")
 22 | 
 23 | 
 24 | def _cosine_similarity(query_vector, doc_vectors):
 25 |     query_vector = query_vector.reshape(1, -1)
 26 |     sims = np.dot(query_vector, doc_vectors.T) / (
 27 |         np.linalg.norm(query_vector) * np.linalg.norm(doc_vectors, axis=1)
 28 |     )
 29 |     return np.mean(sims)
 30 | 
 31 | 
 32 | def _flatten_context(context: List[str]) -> str:
 33 |     return " ".join([f"{i+1}. {chunk}" for i, chunk in enumerate(context)])
 34 | 
 35 | 
 36 | def _generate_questions_from_answer_and_context(context_flat: str,
 37 |                                                 answer: str,
 38 |                                                 num_questions_to_generate: int,
 39 |                                                 model: BaseChatModel,
 40 |                                                 logger):
 41 |     # generate questions
 42 |     prompt_genq = read_template_from_file(PROMPT_GEN_QUESTIONS)
 43 |     prompt = PromptTemplate(template=prompt_genq,
 44 |                             input_variables=[
 45 |                                 "num_questions_to_generate",
 46 |                                 "answer", "context"])
 47 |     chain_genq = prompt | model | StrOutputParser()
 48 |     response = chain_genq.invoke({
 49 |         "num_questions_to_generate": num_questions_to_generate,
 50 |         "answer": answer,
 51 |         "context": context_flat
 52 |     })
 53 |     result = parse_response(response)
 54 |     gen_questions = result.value["questions"]["question"]
 55 |     logger.debug(f"gen_questions: {gen_questions}")
 56 |     return gen_questions
 57 | 
 58 | 
 59 | async def _predict_noncommittal_from_questions(gen_questions: List[str],
 60 |                                                context_flat: str,
 61 |                                                parallel: bool,
 62 |                                                model: BaseChatModel,
 63 |                                                logger):
 64 |     # generate answers to generated questions based on context and
 65 |     # classify answers as committal or non-committal
 66 |     prompt_anc = read_template_from_file(PROMPT_CLASSIFY_NONCOMMITTAL)
 67 |     prompt = PromptTemplate(template=prompt_anc,
 68 |                             input_variables=["question", "context"])
 69 |     chain_anc = prompt | model | StrOutputParser()
 70 | 
 71 |     qa_pairs = []
 72 |     if parallel:
 73 |         tasks = []
 74 |         for gen_question in gen_questions:
 75 |             tasks.append(chain_anc.ainvoke({
 76 |                 "question": gen_question,
 77 |                 "context": context_flat
 78 |             }))
 79 |         responses = await asyncio.gather(*tasks)
 80 |         for response in responses:
 81 |             result = parse_response(response)
 82 |             qa_pair = ClassifiedQAPair(**result.value["qa_pair"])
 83 |             qa_pairs.append(qa_pair)
 84 |     else:
 85 |         for gen_question in gen_questions:
 86 |             response = chain_anc.invoke({
 87 |                 "question": gen_question,
 88 |                 "context": context_flat
 89 |             })
 90 |             result = parse_response(response)
 91 |             qa_pair = ClassifiedQAPair(**result.value["qa_pair"])
 92 |             qa_pairs.append(qa_pair)
 93 | 
 94 |     logger.debug(f"qa_pairs: {qa_pairs}")
 95 |     return qa_pairs
 96 | 
 97 | 
 98 | def _compute_answer_relevance(question: str,
 99 |                               qa_pairs: List[ClassifiedQAPair],
100 |                               encoder: Embeddings,
101 |                               logger):
102 |     # if all non-committal questions, then answer is not relevant
103 |     if np.all([qa_pair.noncommittal == "1" for qa_pair in qa_pairs]):
104 |         logger.warning("cannot compute similarity, generated questions "
105 |                        "are all non-committal")
106 |         return 0.0
107 |     else:
108 |         questions = [question]
109 |         questions.extend([qa_pair.question for qa_pair in qa_pairs])
110 |         embeddings = encoder.embed_documents(questions)
111 |         E = np.array(embeddings)
112 |         source, target = E[0, :], E[1:, :]
113 |         return _cosine_similarity(source, target)
114 | 
115 | 
async def compute_answer_relevance(question: str,
                                   context: List[str],
                                   answer: str,
                                   model: BaseChatModel,
                                   encoder: Embeddings,
                                   logger,
                                   num_questions_to_generate: int = 5,
                                   parallel: bool = True
                                   ) -> float:
    """Compute the answer relevance score.

    Flattens the context, generates questions from the answer and context,
    classifies the answers to those questions as committal or not, and
    scores the generated questions against ``question``.
    """
    flat_context = _flatten_context(context)
    generated = _generate_questions_from_answer_and_context(
        flat_context, answer, num_questions_to_generate, model, logger)
    classified_pairs = await _predict_noncommittal_from_questions(
        generated, flat_context, parallel, model, logger)
    return _compute_answer_relevance(
        question, classified_pairs, encoder, logger)
134 | 


--------------------------------------------------------------------------------
/src/prompted/answer_similarity.py:
--------------------------------------------------------------------------------
 1 | import nltk
 2 | import numpy as np
 3 | 
 4 | from langchain_core.embeddings import Embeddings
 5 | 
 6 | 
 7 | def compute_answer_similarity(ideal_answer: str,
 8 |                               predicted_answer: str,
 9 |                               encoder: Embeddings,
10 |                               logger,
11 |                               cross_encoder: bool = True) -> float:
12 | 
13 |     logger.debug(f"predicted answer: {predicted_answer}")
14 |     logger.debug(f"ideal answer: {ideal_answer}")
15 | 
16 |     if cross_encoder:
17 |         # use cross encoder similarity scoring (token level)
18 |         pa_words = [w for w in nltk.word_tokenize(predicted_answer)]
19 |         ia_words = [w for w in nltk.word_tokenize(ideal_answer)]
20 |         words = pa_words + ia_words
21 |         embs = encoder.embed_documents(words)
22 |         pa_vecs = np.array(embs[0:len(pa_words)])
23 |         ia_vecs = np.array(embs[len(pa_words):])
24 |         similarity = (np.sum(
25 |             np.max(
26 |                 np.dot(ia_vecs, pa_vecs.T) / (
27 |                     np.linalg.norm(ia_vecs, axis=1)[:, None] *
28 |                     np.linalg.norm(pa_vecs, axis=1)[None, :]
29 |                 ), axis=1
30 |             )
31 |         ) / len(ia_words))
32 |     else:
33 |         # use cosine similarity
34 |         # NOTE: RAGAS docs mention using cross-encoder similarity but
35 |         # the code uses cosine similarity
36 |         answers = [predicted_answer, ideal_answer]
37 |         embeddings = np.array(encoder.embed_documents(answers))
38 |         source, target = embeddings[0, :], embeddings[1, :]
39 |         similarity = np.dot(source, target) / (
40 |             np.linalg.norm(source) * np.linalg.norm(target))
41 | 
42 |     return similarity
43 | 


--------------------------------------------------------------------------------
/src/prompted/context_precision.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import time
  3 | 
  4 | from langchain_core.output_parsers import StrOutputParser
  5 | from langchain_core.language_models.chat_models import BaseChatModel
  6 | from langchain.prompts import PromptTemplate
  7 | from pydantic import BaseModel, Field
  8 | from typing import List
  9 | 
 10 | from .prompt_utils import read_template_from_file, parse_response
 11 | 
 12 | 
# File name of the prompt template loaded by _compute_usefulness_scores.
PROMPT_CONTEXT_PRECISION = "context_precision_1.txt"
 14 | 
 15 | 
class Verdict(BaseModel):
    """Verdict for one context chunk, built from the ``verdict`` element
    of a parsed model response."""
    reason: str = Field(alias="reason", description="Reason for verdict")
    # kept as a string; callers convert it with int()
    infer: str = Field(alias="infer", description="The inference (0/1)")
 19 | 
 20 | 
 21 | async def _compute_usefulness_scores(question: str,
 22 |                                      context: List[str],
 23 |                                      answer: str,
 24 |                                      parallel: bool,
 25 |                                      model,
 26 |                                      logger) -> List[int]:
 27 |     
 28 |     prompt_template = read_template_from_file(PROMPT_CONTEXT_PRECISION)
 29 |     prompt_cprec = PromptTemplate(
 30 |         template=prompt_template,
 31 |         input_variables=["question", "context", "answer"])
 32 |     chain_cprec = prompt_cprec | model | StrOutputParser()
 33 | 
 34 |     verdicts = []
 35 |     if parallel:
 36 |         tasks = []
 37 |         for ctx in context:
 38 |             tasks.append(chain_cprec.ainvoke({
 39 |                 "question": question,
 40 |                 "context": ctx,
 41 |                 "answer": answer
 42 |             }))
 43 |         responses = await asyncio.gather(*tasks)
 44 |         for response in responses:
 45 |             result = parse_response(response)
 46 |             verdict = Verdict(**result.value["verdict"])
 47 |             verdicts.append(verdict)        
 48 |     else:
 49 |         for ctx in context:
 50 |             response = chain_cprec.invoke({
 51 |                 "question": question,
 52 |                 "context": ctx,
 53 |                 "answer": answer
 54 |             })
 55 |             result = parse_response(response)
 56 |             verdicts.append(Verdict(**result.value["verdict"]))
 57 | 
 58 |     logger.debug(f"verdicts: {verdicts}")
 59 |     scores = [int(verdict.infer) for verdict in verdicts]
 60 |     return scores
 61 | 
 62 | 
 63 | def _compute_content_precision(scores: List[int]) -> float:
 64 |     # precision@k (for k=1..K) discounted by by reciprocal of position
 65 |     weights = [sum(scores[:i + 1]) / (i + 1) * scores[i]
 66 |                for i in range(len(scores))]
 67 |     if len(scores) == 0:
 68 |         return 0.0
 69 |     context_precision = (
 70 |         sum(w * s for w, s in zip(weights, scores)) / len(scores))
 71 |     return context_precision
 72 | 
 73 | 
 74 | async def compute_context_precision(question: str,
 75 |                                     context: List[str],
 76 |                                     answer: str,
 77 |                                     model: BaseChatModel,
 78 |                                     logger,
 79 |                                     parallel: bool = True) -> float:
 80 | 
 81 |     # prompt_template = read_template_from_file(PROMPT_CONTEXT_PRECISION)
 82 |     # prompt_cprec = PromptTemplate(
 83 |     #     template=prompt_template,
 84 |     #     input_variables=["question", "context", "answer"])
 85 |     # chain_cprec = prompt_cprec | model | StrOutputParser()
 86 | 
 87 |     # verdicts = []
 88 |     # if parallel:
 89 |     #     tasks = []
 90 |     #     for ctx in context:
 91 |     #         tasks.append(chain_cprec.ainvoke({
 92 |     #             "question": question,
 93 |     #             "context": ctx,
 94 |     #             "answer": answer
 95 |     #         }))
 96 |     #     responses = await asyncio.gather(*tasks)
 97 |     #     for response in responses:
 98 |     #         result = parse_response(response)
 99 |     #         verdict = Verdict(**result.value["verdict"])
100 |     #         verdicts.append(verdict)        
101 |     # else:
102 |     #     for ctx in context:
103 |     #         response = chain_cprec.invoke({
104 |     #             "question": question,
105 |     #             "context": ctx,
106 |     #             "answer": answer
107 |     #         })
108 |     #         result = parse_response(response)
109 |     #         verdicts.append(Verdict(**result.value["verdict"]))
110 | 
111 |     # logger.debug(f"verdicts: {verdicts}")
112 |     precs = await _compute_usefulness_scores(question, context, answer,
113 |                                              parallel, model, logger)
114 |     context_precision = _compute_content_precision(precs)
115 |     # # precision@k (for k=1..K)
116 |     # scores = [int(verdict.infer) for verdict in verdicts]
117 |     # # weighted by reciprocal of position
118 |     # weights = [sum(scores[:i + 1]) / (i + 1) * scores[i]
119 |     #            for i in range(len(scores))]
120 | 
121 |     # if len(scores) == 0:
122 |     #     return 0.0
123 |     # context_precision = (
124 |     #     sum(w * s for w, s in zip(weights, scores)) / len(scores))
125 |     return context_precision
126 | 
127 | 


--------------------------------------------------------------------------------
/src/prompted/context_recall.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import nltk
 3 | 
 4 | from itertools import chain
 5 | from langchain_core.output_parsers import StrOutputParser
 6 | from langchain_core.language_models.chat_models import BaseChatModel
 7 | from langchain.prompts import PromptTemplate
 8 | from typing import List
 9 | from xml.sax.saxutils import escape
10 | 
11 | from .prompt_utils import (
12 |     read_template_from_file, parse_response,
13 |     parse_verdicts_from_result
14 | )
15 | 
# File name of the prompt template loaded by
# _classify_ans_sents_attributable_to_context.
PROMPT_CLASSIFY_ATTRIB = "context_recall_1.txt"
17 | 
18 | 
def _convert_answer_to_markdown_list(answer: str,
                                     logger,
                                     max_sents: int = 10) -> str:
    """Turn an answer into a markdown bullet list of XML-escaped sentences.

    The answer is split into sentences, each sentence is escaped and
    prefixed with "- ". The list is then cut to at most ``max_sents``
    items to deal with context length limitations.
    """
    escaped_sents = [escape(sent) for sent in nltk.sent_tokenize(answer)]
    bullet_list = "\n".join(f"- {sent}" for sent in escaped_sents)
    # truncation is done by splitting the rendered list on the bullet
    # separator, so lines that were already in list format before sentence
    # tokenization count as items of their own
    kept_items = bullet_list.split("\n- ")[:max_sents]
    answer_markdown = "\n- ".join(kept_items)
    logger.debug(f"answer_md ({len(kept_items)} sentences): {answer_markdown}")
    return answer_markdown
33 | 
34 | 
async def _classify_ans_sents_attributable_to_context(answer_md: str,
                                                      context: List[str],
                                                      parallel: bool,
                                                      model,
                                                      logger
                                                      ) -> List[List[int]]:
    """Classify answer sentences as attributable to each context chunk.

    The attribution prompt is run once per context chunk against the same
    answer list.

    Args:
        answer_md: the answer rendered as a markdown list of sentences.
        context: the context chunks.
        parallel: if True, issue all LLM calls with ``ainvoke`` and await
            them together; otherwise call the chain sequentially.
        model: the LLM runnable placed in the middle of the chain.
        logger: logger used for debug output.

    Returns:
        One list per context chunk, holding the ``infer`` value of every
        verdict converted to int.
    """
    prompt_template = read_template_from_file(PROMPT_CLASSIFY_ATTRIB)
    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "answer"]
    )
    # not named ``chain`` so that itertools.chain, imported at module
    # level, is not shadowed
    attrib_chain = prompt | model | StrOutputParser()

    def _to_inferences(response):
        result = parse_response(response)
        verdicts = parse_verdicts_from_result(result)
        return [int(verdict.infer) for verdict in verdicts]

    inferences = []
    if parallel:
        # asyncio.gather needs awaitables, so the async ``ainvoke`` must be
        # used here; the synchronous ``invoke`` returns plain strings
        tasks = [attrib_chain.ainvoke({"context": ctx, "answer": answer_md})
                 for ctx in context]
        responses = await asyncio.gather(*tasks)
        for response in responses:
            inferences.append(_to_inferences(response))
    else:
        for ctx in context:
            response = attrib_chain.invoke({
                "context": ctx,
                "answer": answer_md
            })
            inferences.append(_to_inferences(response))

    logger.debug(f"inferences: {inferences}")
    return inferences
72 | 
73 | 
74 | def _compute_context_recall_score(inferences: List[int]) -> float:
75 |     inferences_f = list(chain(*inferences))
76 |     score = 0.0
77 |     if len(inferences_f) > 0:
78 |         score = sum(inferences_f) / len(inferences_f)
79 |     return score
80 | 
81 | 
async def compute_context_recall(context: List[str],
                                 answer: str,
                                 model: BaseChatModel,
                                 logger,
                                 parallel: bool = True) -> float:
    """Compute the context recall score.

    Renders the answer as a markdown sentence list, has the LLM classify
    the sentences against every context chunk and averages the verdicts.
    """
    answer_markdown = _convert_answer_to_markdown_list(answer, logger)
    per_context_inferences = await _classify_ans_sents_attributable_to_context(
        answer_markdown, context, parallel, model, logger)
    return _compute_context_recall_score(per_context_inferences)
93 | 


--------------------------------------------------------------------------------
/src/prompted/context_relevance.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import nltk
 3 | 
 4 | from langchain_core.output_parsers import StrOutputParser
 5 | from langchain_core.language_models.chat_models import BaseChatModel
 6 | from langchain.prompts import PromptTemplate
 7 | from typing import List, Tuple
 8 | 
 9 | from .prompt_utils import (
10 |     read_template_from_file, parse_response,
11 |     parse_verdicts_from_result
12 | )
13 | 
14 | 
# File name of the prompt template loaded by _generate_necessity_verdicts.
PROMPT_CLASSIFY_NECESSARY = "context_relevance_1.txt"
16 | 
17 | 
def _convert_to_markdown_list(context: str) -> Tuple[int, str]:
    """Split a context chunk into sentences and render them as bullets.

    Returns:
        The number of sentences and the markdown list, one "- sentence"
        line per sentence.
    """
    sentences = list(nltk.sent_tokenize(context))
    bullets = [f"- {sentence}" for sentence in sentences]
    return len(sentences), "\n".join(bullets)
24 | 
25 | 
def _convert_to_markdown_lists(context: List[str]
                               ) -> Tuple[int, List[str]]:
    """Render every context chunk as a markdown sentence list.

    Args:
        context: the context chunks, one string per chunk.

    Returns:
        The total number of sentences over all chunks, and the list of
        markdown lists (one per chunk, in chunk order).
    """
    total_sents = 0
    context_markdowns = []
    for ctx in context:
        num_sents, context_md = _convert_to_markdown_list(ctx)
        total_sents += num_sents
        context_markdowns.append(context_md)
    return total_sents, context_markdowns
34 | 
35 | 
async def _generate_necessity_verdicts(question: str,
                                       context_markdowns: List[str],
                                       parallel: bool,
                                       model,
                                       logger):
    """Collect, per context chunk, the sentences the LLM marks as necessary.

    The necessity prompt is run once per markdown list. A sentence is kept
    when the ``infer`` value of its verdict equals 1.

    Returns:
        One list of necessary statements per context chunk.
    """
    template_text = read_template_from_file(PROMPT_CLASSIFY_NECESSARY)
    necessity_chain = (
        PromptTemplate(template=template_text,
                       input_variables=["question", "context"])
        | model
        | StrOutputParser()
    )

    def _chain_input(markdown):
        return {"question": question, "context": markdown}

    def _necessary_statements(parsed):
        return [verdict.statement
                for verdict in parse_verdicts_from_result(parsed)
                if int(verdict.infer) == 1]

    if parallel:
        pending = [necessity_chain.ainvoke(_chain_input(markdown))
                   for markdown in context_markdowns]
        responses = await asyncio.gather(*pending)
        return [_necessary_statements(parse_response(response))
                for response in responses]

    necessary_sents = []
    for markdown in context_markdowns:
        result = parse_response(necessity_chain.invoke(_chain_input(markdown)))
        # only the sequential path logs the parsed result
        logger.debug(f"result: {result}")
        necessary_sents.append(_necessary_statements(result))
    return necessary_sents
72 | 
73 | 
74 | def _compute_context_relevance_score(num_total_sents: int,
75 |                                      necessary_sents: List[List[str]]
76 |                                      ) -> float:
77 |     num_necessary_sents = sum([len(ctx_sents) for ctx_sents in necessary_sents])
78 |     return num_necessary_sents / num_total_sents
79 | 
80 | 
async def compute_context_relevance(question: str,
                                    context: List[str],
                                    model: BaseChatModel,
                                    logger,
                                    parallel: bool = True) -> float:
    """Compute the context relevance score.

    Splits the context into sentences, has the LLM pick the sentences
    necessary to answer the question and returns their share of all
    sentences. Returns 0.0 without calling the LLM when the context
    contains no sentences.
    """
    num_total_sents, context_markdowns = _convert_to_markdown_lists(context)
    if num_total_sents <= 0:
        return 0.0
    necessary_sents = await _generate_necessity_verdicts(
        question, context_markdowns, parallel, model, logger)
    return _compute_context_relevance_score(num_total_sents, necessary_sents)
95 | 


--------------------------------------------------------------------------------
/src/prompted/faithfulness.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | 
  3 | from itertools import chain
  4 | from langchain_core.output_parsers import StrOutputParser
  5 | from langchain_core.language_models.chat_models import BaseChatModel
  6 | from langchain.prompts import PromptTemplate
  7 | from typing import List
  8 | 
  9 | from .prompt_utils import read_template_from_file, parse_response, Verdict
 10 | 
 11 | 
# Prompt template file loaded by _get_statements_from_answer.
PROMPT_EXTRACT_STATEMENTS_FROM_ANSWER = "faithfulness_1.txt"
# Prompt template file loaded by _get_entailments_from_context.
PROMPT_INFER_ENTAILMENT_FROM_CONTEXT = "faithfulness_2.txt"
 14 | 
 15 | 
 16 | def _reformat_statements_to_xml(statements: List[str]) -> str:
 17 |     statements_xml = ["<statements>"]
 18 |     for statement in statements:
 19 |         statements_xml.append(f" <statement>{statement}</statement>")
 20 |     statements_xml.append("</statements>")
 21 |     return "\n".join(statements_xml)
 22 | 
 23 | 
 24 | def _get_statements_from_answer(question: str,
 25 |                                 answer: str,
 26 |                                 model: BaseChatModel,
 27 |                                 logger) -> List[str]:
 28 |     prompt_template = read_template_from_file(
 29 |         PROMPT_EXTRACT_STATEMENTS_FROM_ANSWER)
 30 |     prompt_ans_to_stmt = PromptTemplate(
 31 |         template=prompt_template,
 32 |         input_variables=["question", "answer"])
 33 |     chain_ans_to_stmt = prompt_ans_to_stmt | model | StrOutputParser()
 34 |     response = chain_ans_to_stmt.invoke({
 35 |         "question": question,
 36 |         "answer": answer
 37 |     })
 38 |     logger.debug(f"response: {response}")
 39 |     result = parse_response(response)
 40 |     logger.debug(f"result (parsed response): {result}")
 41 |     statements = result.value["statements"]["statement"]
 42 |     if not isinstance(statements, list):
 43 |         statements = [statements]
 44 |         logger.debug("statement made into a []")
 45 | 
 46 |     logger.debug(f"statements: {statements}")
 47 |     return statements
 48 | 
 49 | 
 50 | async def _get_entailments_from_context(context: List[str],
 51 |                                         statements: List[str],
 52 |                                         model: BaseChatModel,
 53 |                                         logger,
 54 |                                         parallel: bool
 55 |                                         ) -> List[List[float]]:
 56 |     statements_xml = _reformat_statements_to_xml(statements)
 57 |     logger.debug(f"statements_xml: {statements_xml}")
 58 | 
 59 |     prompt_template = read_template_from_file(
 60 |         PROMPT_INFER_ENTAILMENT_FROM_CONTEXT)
 61 |     prompt_nli = PromptTemplate(
 62 |         template=prompt_template,
 63 |         input_variables=["context", "statements_xml"])
 64 |     chain_nli = prompt_nli | model | StrOutputParser()
 65 | 
 66 |     entailments = []
 67 |     if parallel:
 68 |         tasks = []
 69 |         for ctx in context:
 70 |             tasks.append(chain_nli.ainvoke({
 71 |                 "context": ctx,
 72 |                 "statements_xml": statements_xml
 73 |             }))
 74 |         responses = await asyncio.gather(*tasks)
 75 |         for response in responses:
 76 |             result = parse_response(response)
 77 |             logger.debug(f"entailment verdicts: {result}")
 78 |             verdicts = result.value["verdicts"]["verdict"]
 79 |             logger.debug(f"type(verdicts): {type(verdicts)}")
 80 |             if not isinstance(verdicts, list):
 81 |                 verdicts = [verdicts]
 82 |                 logger.debug(f"made verdicts into []")
 83 | 
 84 |             entailments.append([int(Verdict(**v).infer) for v in verdicts])
 85 |     else:
 86 |         for ctx in context:
 87 |             response = chain_nli.invoke({
 88 |                 "context": ctx,
 89 |                 "statements_xml": statements_xml
 90 |             })
 91 |             result = parse_response(response)
 92 |             logger.debug(f"entailment verdicts: {result}")
 93 |             verdicts = result.value["verdicts"]["verdict"]
 94 |             if not isinstance(verdicts, list):
 95 |                 verdicts = [verdicts]
 96 |                 logger.debug(f"made verdicts into []")
 97 | 
 98 |             entailments.append([int(Verdict(**v).infer) for v in verdicts])
 99 | 
100 |     logger.debug(f"entailments: {entailments}")
101 |     return entailments
102 | 
103 | 
104 | def _compute_faithfulness(entailments_lol: List[List[float]]) -> float:
105 |     entailments = list(chain.from_iterable(entailments_lol))
106 |     try:
107 |         faithfulness = sum(entailments) / len(entailments)
108 |     except ZeroDivisionError:
109 |         faithfulness = 0.0
110 |     return faithfulness
111 | 
112 | 
async def compute_faithfulness(question: str,
                               answer: str,
                               context: List[str],
                               model: BaseChatModel,
                               logger,
                               parallel: bool = True) -> float:
    """Compute the faithfulness score.

    Extracts statements from the answer, gets entailment verdicts for them
    against every context chunk and averages the verdicts.
    """
    extracted = _get_statements_from_answer(question, answer, model, logger)
    per_context_entailments = await _get_entailments_from_context(
        context, extracted, model, logger, parallel)
    return _compute_faithfulness(per_context_entailments)
124 | 


--------------------------------------------------------------------------------
/src/prompted/more_questions.py:
--------------------------------------------------------------------------------
 1 | from langchain_core.output_parsers import StrOutputParser
 2 | from langchain_core.language_models.chat_models import BaseChatModel
 3 | from langchain.prompts import PromptTemplate
 4 | from xml.sax.saxutils import escape
 5 | 
 6 | import dspy
 7 | import dsp
 8 | 
 9 | from .prompt_utils import read_template_from_file, parse_response
10 | 
# ColBERTv2 retriever bound to a hard-coded endpoint URL.
# NOTE: both statements below run at import time, so importing this module
# sets the retrieval model (rm) in dspy's global settings as a side effect.
colbertv2_wiki17_abstracts = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')
dspy.settings.configure(rm=colbertv2_wiki17_abstracts)

# File name of the prompt template loaded by compute_more_questions.
PROMPT_MORE_QUESTIONS = "more_questions.txt"
15 | 
def split_newlines(input_string):
    """Return the lines of ``input_string`` as a list of strings.

    A string without any newline character is returned as a one-element
    list.
    """
    has_newline = '\n' in input_string
    return input_string.split('\n') if has_newline else [input_string]
22 | 
def compute_more_questions(question: str,
                           ideal_answer: str,
                           multiplier: int,
                           model: BaseChatModel,
                           logger) -> list:
    """Generate additional question / answer / context records with the LLM.

    Retrieves ``multiplier`` passages for the question, passes them
    (XML-escaped) together with the question and answer to the prompt, and
    converts every tuple of the parsed response into a record.

    Returns:
        A list of dicts with the keys ``query``, ``predicted_answer``,
        ``ideal_answer`` and ``context``. ``context`` is a list of
        ``{"id", "chunk_text"}`` dicts, one per line of the tuple's context.
    """
    retrieved = dsp.retrieve(question, k=multiplier)
    escaped_passages = [escape(passage) for passage in retrieved]

    template_text = read_template_from_file(PROMPT_MORE_QUESTIONS)
    question_chain = (
        PromptTemplate(template=template_text,
                       input_variables=["question", "answer",
                                        "multiplier", "passages"])
        | model
        | StrOutputParser()
    )
    response = question_chain.invoke({
        "question": question,
        "answer": ideal_answer,
        "multiplier": multiplier,
        "passages": escaped_passages
    })

    parsed = parse_response(response)

    result_list = []
    for item in parsed.value["tuples"]:
        chunks = [
            {'id': str(position), 'chunk_text': line}
            for position, line in enumerate(split_newlines(item["context"]))
        ]
        # ideal_answer is just the predicted_answer, as a workaround
        result_list.append({
            "query": item["question"],
            "predicted_answer": item["answer"],
            "ideal_answer": item["answer"],
            "context": chunks,
        })
    logger.debug(f"result_list: \n{result_list}")

    return result_list
62 | 


--------------------------------------------------------------------------------
/src/prompted/prompt_utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import re
 3 | import xmltodict
 4 | 
 5 | from pydantic.generics import GenericModel
 6 | from pydantic import BaseModel, Field
 7 | from typing import List, TypeVar, Generic
 8 | 
 9 | 
10 | ################ parse LLM output to Pydantic object ################
11 | 
# Type parameter for the payload carried by Result.
T = TypeVar("T")


class Result(GenericModel, Generic[T]):
    """Generic wrapper around the content of a parsed response."""
    # populated from the ``result`` key (the field's alias)
    value: T = Field(alias="result")
17 | 
18 | 
def parse_response(response):
    """Parse an XML response into a Result object.

    If the stripped response is not already wrapped in ``<result>`` tags,
    the span from the first ``<result>`` to the last ``</result>`` is
    extracted when one exists; otherwise the text is parsed unchanged.
    When XML parsing fails the offending text is printed and the exception
    is re-raised.
    """
    open_tag, close_tag = "<result>", "</result>"
    text = response.strip()
    if not text.startswith(open_tag) or not text.endswith(close_tag):
        wrapped = re.compile(
            f"(?:{open_tag})(.*)(?:{close_tag})", re.DOTALL).search(text)
        if wrapped is not None:
            text = open_tag + wrapped.group(1) + close_tag
    try:
        parsed = xmltodict.parse(text)
    except Exception as exc:
        print("response:", text)
        raise exc
    return Result(**parsed)
36 | 
37 | 
38 | ################ parse LLM output to Pydantic object ################
39 | 
40 | 
# Default location of the prompt templates. The path is relative, so it is
# resolved against the process's current working directory.
PROMPT_DIR = "../resources/prompts"


def read_template_from_file(prompt_fn: str,
                            prompt_dir: str = PROMPT_DIR) -> str:
    """Return the complete text of a prompt template file.

    Args:
        prompt_fn: file name of the template inside ``prompt_dir``.
        prompt_dir: directory holding the templates (defaults to
            ``PROMPT_DIR``).

    Returns:
        The file contents, decoded as UTF-8.

    Raises:
        OSError: if the file cannot be opened.
    """
    template_path = os.path.join(prompt_dir, prompt_fn)
    with open(template_path, "r", encoding="utf-8") as template_file:
        return template_file.read()
50 | 
51 | 
52 | #################### verdict processing ################################
53 | 
54 | 
# One verdict entry parsed from the LLM output. All three fields are
# required strings, and each alias is identical to its field name.
class Verdict(BaseModel):
    statement: str = Field(alias="statement", description="The statement")
    reason: str = Field(alias="reason", description="Reason for verdict")
    # Kept as text: the description says the value is "0" or "1".
    infer: str = Field(alias="infer", description="The inference (0/1)")
59 | 
60 | 
def parse_verdicts_from_result(result) -> List[Verdict]:
    """Extract the verdicts contained in a parsed response.

    Args:
        result: object whose ``value`` mapping has a ``"verdicts"`` entry.
            That entry is either ``None`` or a mapping with a ``"verdict"``
            key holding one dict or an iterable of dicts.

    Returns:
        A list of ``Verdict`` objects; empty when ``"verdicts"`` is ``None``.

    Raises:
        KeyError: if ``"verdicts"`` or ``"verdict"`` is missing.
    """
    container = result.value["verdicts"]
    if container is None:
        return []
    entries = container["verdict"]
    # A lone verdict arrives as a single dict; wrap it so both shapes are
    # handled by the same construction below.
    if isinstance(entries, dict):
        entries = [entries]
    return [Verdict(**entry) for entry in entries]
71 | 


--------------------------------------------------------------------------------
/src/run_learned_metrics.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import dspy
  3 | import json
  4 | import logging
  5 | import os
  6 | 
  7 | from dotenv import find_dotenv, load_dotenv
  8 | from langchain_google_genai import GoogleGenerativeAIEmbeddings
  9 | 
 10 | from learned.faithfulness import compute_faithfulness
 11 | from learned.answer_relevance import compute_answer_relevance
 12 | from learned.context_precision import compute_context_precision
 13 | from learned.context_relevance import compute_context_relevance
 14 | from learned.context_recall import compute_context_recall
 15 | from learned.answer_correctness import compute_answer_correctness
 16 | from learned.learning_utils import clean_up_log_files
 17 | from metrics import Metrics
 18 | 
 19 | # Safety config
 20 | 
 21 | from google.generativeai.types.safety_types import HarmBlockThreshold, HarmCategory
 22 | 
 23 | safety_settings = {
 24 |     HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
 25 |     HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_ONLY_HIGH,
 26 |     HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
 27 |     HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_ONLY_HIGH
 28 | }
 29 | 
 30 | def runner():
 31 | 
 32 |     parser = argparse.ArgumentParser()
 33 |     parser.add_argument("--metric", type=str,
 34 |                         choices=sorted([m.value for m in Metrics]),
 35 |                         required=True,
 36 |                         help="The metric to compute")
 37 |     parser.add_argument("--input", type=str, required=True,
 38 |                         help="Full path to evaluation data in JSONL format")
 39 |     parser.add_argument("--output", type=str, required=True,
 40 |                         help="Full path to output directory")
 41 |     parser.add_argument("--cross-encoder", action="store_false",
 42 |                         help="Use cross-encoder similarity scoring (default true)")
 43 |     parser.add_argument("--model-temp", type=float, required=False,
 44 |                         help="The temperature of the model - between 0.0 and 1.0 (default 0.0)")
 45 |     parser.add_argument("--qs_to_skip", type=str, required=False,
 46 |                         help="skip questions provided as a comma separated list of ints: '3,4,11'")
 47 |     parser.add_argument("--qs_to_use", type=str, required=False,
 48 |                         help="only use questions provided as a comma separated list of ints: '3,4,11'")
 49 |     parser.add_argument("--debug", action="store_true",
 50 |                         help="Turn debugging on (default: false)")
 51 | 
 52 |     args = parser.parse_args()
 53 |     metric = args.metric
 54 |     input_fp = args.input
 55 |     output_dir = args.output
 56 |     model_temp = args.model_temp
 57 |     if model_temp is None or model_temp > 1.0 or model_temp < 0.0:
 58 |         model_temp = 0.0
 59 |     debug = args.debug
 60 |     qs_to_skip = args.qs_to_skip
 61 |     if qs_to_skip is None:
 62 |         skip_qs = []
 63 |     else:
 64 |         skip_qs = list(map(int, qs_to_skip.split(',')))
 65 | 
 66 |     qs_to_use = args.qs_to_use
 67 |     if qs_to_use is None:
 68 |         use_qs = []
 69 |     else:
 70 |         use_qs = list(map(int, qs_to_use.split(',')))
 71 | 
 72 |     _ = load_dotenv(find_dotenv())
 73 | 
 74 |     model = dspy.Google("models/gemini-1.0-pro",
 75 |                         api_key=os.environ["GOOGLE_API_KEY"],
 76 |                         max_output_tokens=1024,
 77 |                         temperature=model_temp,
 78 |                         safety_settings=safety_settings)
 79 |     dspy.settings.configure(lm=model)
 80 |     dspy.logger.level = logging.DEBUG if debug else logging.INFO
 81 | 
 82 |     encoder = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
 83 | 
 84 |     os.makedirs(output_dir, exist_ok=True)
 85 |     output_fp = os.path.join(output_dir, f"{metric}_report.tsv")
 86 | 
 87 |     optimized_prompts = {}
 88 | 
 89 |     with open(input_fp, "r", encoding="utf-8") as fin, \
 90 |          open(output_fp, "w", encoding="utf-8") as fout:
 91 | 
 92 |         fout.write("\t".join(["#QID", metric.upper()]) + "\n")
 93 |         for line in fin:
 94 |             record = json.loads(line)
 95 |             id = record["id"]
 96 |             # if int(id) % 4 != 0:
 97 |             #     continue
 98 |             # use_qs will override skip_qs
 99 |             if int(id) in skip_qs and int(id) not in use_qs:
100 |                 continue
101 |             question = record["query"]
102 |             context = [ctx["chunk_text"] for ctx in record["context"]]
103 |             answer = record["predicted_answer"]
104 |             ideal_answer = record["ideal_answer"]
105 |             
106 | 
107 |             match Metrics(metric):
108 |                 case Metrics.FAITHFULNESS:
109 |                     metric_value = compute_faithfulness(
110 |                         question, answer, context, optimized_prompts)
111 |                 case Metrics.ANSWER_RELEVANCE:
112 |                     metric_value = compute_answer_relevance(
113 |                         question, context, answer, optimized_prompts,
114 |                         encoder)
115 |                 case Metrics.CONTEXT_PRECISION:
116 |                     metric_value = compute_context_precision(
117 |                         question, ideal_answer, context, optimized_prompts)
118 |                 case Metrics.CONTEXT_UTILIZATION:
119 |                     metric_value = compute_context_precision(
120 |                         question, answer, context, optimized_prompts)
121 |                 case Metrics.CONTEXT_RELEVANCE:
122 |                     metric_value = compute_context_relevance(
123 |                         question, context, optimized_prompts)
124 |                 case Metrics.CONTEXT_RECALL:
125 |                     metric_value = compute_context_recall(
126 |                         context, answer, optimized_prompts)
127 |                 case Metrics.ANSWER_SIMILARITY:
128 |                     raise NotImplementedError(
129 |                         "Use prompted version of answer similarity")
130 |                 case Metrics.ANSWER_CORRECTNESS:
131 |                     metric_value = compute_answer_correctness(
132 |                         ideal_answer, answer, optimized_prompts)
133 |                 case _:
134 |                     print(f"Unsupported metric: {metric}")
135 | 
136 |             print(f"query ({id}): {question}, {metric}: {metric_value}")
137 |             fout.write(f"{id}\t{metric_value:.3f}\n")
138 | 
139 |     if not debug:
140 |         clean_up_log_files()
141 | 
142 | if __name__ == "__main__":
143 |     runner()
144 | 


--------------------------------------------------------------------------------
/src/run_prompted_metrics.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import asyncio
  3 | import json
  4 | import logging
  5 | import os
  6 | 
  7 | from dotenv import find_dotenv, load_dotenv
  8 | from langchain_google_genai import (
  9 |     ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
 10 | )
 11 | 
 12 | from prompted.faithfulness import compute_faithfulness
 13 | from prompted.answer_relevance import compute_answer_relevance
 14 | from prompted.context_precision import compute_context_precision
 15 | from prompted.context_relevance import compute_context_relevance
 16 | from prompted.context_recall import compute_context_recall
 17 | from prompted.answer_similarity import compute_answer_similarity
 18 | from prompted.answer_correctness import compute_answer_correctness
 19 | from metrics import Metrics
 20 | 
# Configure the root logger at import time: "LEVEL:message" lines at INFO.
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
 22 | 
 23 | # Safety config
 24 | 
 25 | from google.generativeai.types.safety_types import HarmBlockThreshold, HarmCategory
 26 | 
 27 | safety_settings = {
 28 |     HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
 29 |     HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_ONLY_HIGH,
 30 |     HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
 31 |     HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_ONLY_HIGH
 32 | }
 33 | 
 34 | async def runner():
 35 | 
 36 |     parser = argparse.ArgumentParser()
 37 |     parser.add_argument("--metric", type=str,
 38 |                         choices=sorted([m.value for m in Metrics]),
 39 |                         required=True,
 40 |                         help="The metric to compute")
 41 |     parser.add_argument("--input", type=str, required=True,
 42 |                         help="Full path to evaluation data in JSONL format")
 43 |     parser.add_argument("--output", type=str, required=True,
 44 |                         help="Full path to output directory")
 45 |     parser.add_argument("--parallel", action="store_true",
 46 |                         help="Run in parallel where possible (default false)")
 47 |     parser.add_argument("--cross-encoder", action="store_false",
 48 |                         help="Use cross-encoder similarity scoring (default true)")
 49 |     parser.add_argument("--model-temp", type=float, required=False,
 50 |                         help="The temperature of the model - between 0.0 and 1.0 (default 0.0)")
 51 |     parser.add_argument("--debug", action="store_true",
 52 |                         help="Turn debugging on (default: false)")
 53 |     args = parser.parse_args()
 54 |     metric = args.metric
 55 |     input_fp = args.input
 56 |     output_dir = args.output
 57 |     run_in_parallel = args.parallel
 58 |     use_cross_encoder = args.cross_encoder
 59 |     model_temp = args.model_temp
 60 |     if model_temp is None or model_temp > 1.0 or model_temp < 0.0:
 61 |         model_temp = 0.0
 62 |     debug = args.debug
 63 | 
 64 |     logger = logging.getLogger(__name__)
 65 |     logger.setLevel(logging.DEBUG if debug else logging.INFO)
 66 | 
 67 |     _ = load_dotenv(find_dotenv())
 68 | 
 69 |     model = ChatGoogleGenerativeAI(
 70 |         model="gemini-pro",
 71 |         api_key=os.environ["GOOGLE_API_KEY"],
 72 |         temperature=model_temp,
 73 |         safety_settings=safety_settings)
 74 |     encoder = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
 75 | 
 76 |     os.makedirs(output_dir, exist_ok=True)
 77 |     output_fp = os.path.join(output_dir, f"{metric}_report.tsv")
 78 | 
 79 |     with open(input_fp, "r", encoding="utf-8") as fin, \
 80 |          open(output_fp, "w", encoding="utf-8") as fout:
 81 | 
 82 |         fout.write("\t".join(["#QID", metric.upper()]) + "\n")
 83 |         for line in fin:
 84 |             record = json.loads(line)
 85 |             # extract relevant data to evaluate
 86 |             id = record["id"]
 87 |             if int(id) % 4 != 0:
 88 |                 continue
 89 |             question = record["query"]
 90 |             context = [ctx["chunk_text"] for ctx in record["context"]]
 91 |             answer = record["predicted_answer"]
 92 |             ideal_answer = record["ideal_answer"]
 93 | 
 94 |             match Metrics(metric):
 95 |                 case Metrics.FAITHFULNESS:
 96 |                     metric_value = await compute_faithfulness(
 97 |                         question, answer, context, model, logger,
 98 |                         parallel=run_in_parallel)
 99 |                 case Metrics.ANSWER_RELEVANCE:
100 |                     metric_value = await compute_answer_relevance(
101 |                         question, context, answer, model, encoder, logger,
102 |                         parallel=run_in_parallel)
103 |                 case Metrics.CONTEXT_PRECISION:
104 |                     metric_value = await compute_context_precision(
105 |                         question, context, ideal_answer, model, logger,
106 |                         parallel=run_in_parallel)
107 |                 case Metrics.CONTEXT_UTILIZATION:
108 |                     metric_value = await compute_context_precision(
109 |                         question, context, answer, model, logger,
110 |                         parallel=run_in_parallel)
111 |                 case Metrics.CONTEXT_RELEVANCE:
112 |                     metric_value = await compute_context_relevance(
113 |                         question, context, model, logger,
114 |                         parallel=run_in_parallel)
115 |                 case Metrics.CONTEXT_RECALL:
116 |                     metric_value = await compute_context_recall(
117 |                         context, ideal_answer, model, logger,
118 |                         parallel=run_in_parallel)
119 |                 case Metrics.ANSWER_SIMILARITY:
120 |                     metric_value = compute_answer_similarity(
121 |                         answer, ideal_answer, encoder, logger,
122 |                         cross_encoder=use_cross_encoder)
123 |                 case Metrics.ANSWER_CORRECTNESS:
124 |                     metric_value = compute_answer_correctness(
125 |                         ideal_answer, answer, model, logger)
126 |                 case _:
127 |                     logger.error(f"Unsupported metric: {metric}")
128 | 
129 |             logger.info(
130 |                 f"query ({id}): {question}, {metric}: {metric_value}")
131 |             fout.write(f"{id}\t{metric_value:.3f}\n")
132 | 
133 | 
134 | if __name__ == "__main__":
135 |     asyncio.run(runner())
136 | 


--------------------------------------------------------------------------------
/src/score_distributions.py:
--------------------------------------------------------------------------------
 1 | import matplotlib.pyplot as plt
 2 | import numpy as np
 3 | import os
 4 | 
 5 | 
 6 | REPORTS_DIR = "../resources/reports"
 7 | LCEL_REPORTS_DIR = os.path.join(REPORTS_DIR, "lcel-reports")
 8 | DSPY_REPORTS_DIR = os.path.join(REPORTS_DIR, "dspy-reports")
 9 | 
10 | 
def read_data(report_fp):
    """Read the score column of a metric report TSV file.

    The file is expected to hold ``qid<TAB>score`` rows. The ``#QID`` header
    row and blank lines (such as a trailing empty line) are skipped.

    Args:
        report_fp: path of the report file, read as UTF-8.

    Returns:
        The scores as a list of floats, in file order.

    Raises:
        ValueError: if a non-blank row does not have exactly two
            tab-separated columns or its score is not a number.
    """
    scores = []
    with open(report_fp, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # blank lines used to crash the two-column unpacking below
            if not line or line.startswith("#QID"):
                continue
            _, score = line.split("\t")
            scores.append(float(score))
    return scores
20 | 
21 | 
def bimodality(scores):
    """Return the spread of the scores' distances from the midpoint 0.5.

    Every score is replaced by its absolute distance from 0.5, and the
    (population) standard deviation of those distances is returned. The
    metrics are expected to lie in [0, 1]; a smaller value means the scores
    sit at a more uniform distance from the midpoint.

    Args:
        scores: sequence of numeric scores.

    Returns:
        The standard deviation as a NumPy float.
    """
    distances = np.abs(np.asarray(scores) - 0.5)
    return np.std(distances)
31 | 
32 | 
# Script entry point: print a Markdown table comparing the bimodality of the
# LCEL and DSPy score distributions and plot one histogram panel per metric.
if __name__ == "__main__":

    print("| Metric | LCEL | DSPy |")
    print("|--------|------|------|")

    # os.listdir() returns names in arbitrary order; sort them so the table
    # rows and subplot positions are reproducible, and ignore anything that
    # is not a report file (e.g. hidden files).
    report_fns = sorted(
        fn for fn in os.listdir(DSPY_REPORTS_DIR)
        if fn.endswith("_report.tsv"))

    plt.figure(figsize=(10, 5))
    for idx, report_fn in enumerate(report_fns):
        # "answer_relevance_report.tsv" -> "Answer Relevance"
        title = report_fn.split("_report")[0].replace("_", " ").title()
        # the LCEL report with the same file name is the comparison partner
        dspy_scores = read_data(os.path.join(DSPY_REPORTS_DIR, report_fn))
        lcel_scores = read_data(os.path.join(LCEL_REPORTS_DIR, report_fn))
        lcel_bim = bimodality(lcel_scores)
        dspy_bim = bimodality(dspy_scores)
        print(f"| {title} | {lcel_bim:.3f} | {dspy_bim:.3f} |")
        # 2 x 4 grid: room for up to eight metrics
        plt.subplot(2, 4, idx + 1)
        plt.hist(lcel_scores, bins=10, alpha=0.5, label="LCEL")
        plt.hist(dspy_scores, bins=10, alpha=0.5, label="DSPy")
        plt.title(title)
        if idx == 0:
            # one legend is enough; the colours are shared by all panels
            plt.legend(loc="best")

    plt.tight_layout()
    plt.show()
57 | 


--------------------------------------------------------------------------------
/src/streamlit_app.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
# Minimal Streamlit demo: show a slider and echo its value with its square.
selected_value = st.slider("Select a value")
st.write(selected_value, "squared is", selected_value * selected_value)


--------------------------------------------------------------------------------
/src/test_nltk.py:
--------------------------------------------------------------------------------
 1 | import nltk
 2 | from nltk.tokenize import word_tokenize
 3 | 
 4 | import ssl
 5 | 
 6 | try:
 7 |     _create_unverified_https_context = ssl._create_unverified_context
 8 | except AttributeError:
 9 |     pass
10 | else:
11 |     ssl._create_default_https_context = _create_unverified_https_context
12 | 
13 | 
14 | # Download the necessary NLTK models (if not already downloaded)
15 | nltk.download('punkt')
16 | 
17 | # Example usage of the tokenizer
18 | text = "Hello, how are you doing?"
19 | tokens = word_tokenize(text)
20 | print(tokens)
21 | 
22 | print(nltk.data.path)


--------------------------------------------------------------------------------