├── assets └── images │ ├── cranberry_jam.png │ ├── panel_output2.png │ ├── stacked_logo.png │ ├── uqlm_flow_ds.png │ ├── horizontal_logo.png │ ├── judges_graphic.png │ ├── uqensemble_tune.png │ ├── black_box_graphic.png │ ├── black_box_output4.png │ ├── uqensemble_output2.png │ ├── uqlm_flow_ds_dark.png │ ├── white_box_graphic.png │ ├── white_box_output2.png │ ├── judges_graphic_dark.png │ ├── black_box_graphic_dark.png │ ├── horizontal_logo_large.png │ ├── white_box_graphic_dark.png │ ├── uqensemble_generate_score.png │ ├── uqensemble_generate_score_dark.png │ └── COPYRIGHT.md ├── docs ├── source │ ├── _static │ │ ├── research │ │ │ └── logo.png │ │ ├── images │ │ │ ├── no_image.png │ │ │ ├── uqlm_flow_ds.png │ │ │ ├── favicon │ │ │ │ ├── favicon.ico │ │ │ │ ├── favicon-16x16.png │ │ │ │ ├── favicon-32x32.png │ │ │ │ ├── apple-touch-icon.png │ │ │ │ ├── android-chrome-192x192.png │ │ │ │ └── android-chrome-512x512.png │ │ │ ├── horizontal_logo.png │ │ │ ├── judges_graphic.png │ │ │ ├── panel_output2.png │ │ │ ├── black_box_graphic.png │ │ │ ├── black_box_output4.png │ │ │ ├── uqlm_flow_ds_dark.png │ │ │ ├── white_box_graphic.png │ │ │ ├── white_box_output2.png │ │ │ ├── judges_graphic_dark.png │ │ │ ├── uqensemble_output2.png │ │ │ ├── black_box_graphic_dark.png │ │ │ ├── horizontal_logo_large.png │ │ │ ├── horizontal_logo_no_bg.png │ │ │ ├── white_box_graphic_dark.png │ │ │ ├── uqensemble_generate_score.png │ │ │ ├── uqensemble_generate_score_dark.png │ │ │ └── COPYRIGHT.md │ │ └── custom.css │ ├── _templates │ │ ├── base.rst │ │ ├── class.rst │ │ └── module.rst │ ├── api.rst │ ├── conf.py │ ├── contribute.rst │ ├── _notebooks │ │ └── index.rst │ └── refs.bib ├── make.bat └── Makefile ├── .pre-commit-config.yaml ├── .github ├── dependabot.yml ├── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md └── workflows │ ├── ci.yaml │ ├── linting.yml │ ├── update_version_json.py │ └── documentation.yaml ├── tests ├── data │ ├── scorers │ │ ├── DATA_COPYRIGHT.md 
│ │ ├── test_data_panelquantifier.json │ │ ├── blackbox_results_file.json │ │ ├── whitebox_results_file.json │ │ ├── generate_data_semanticentropy.py │ │ ├── generate_data_whitebox.py │ │ ├── generate_data_blackbox.py │ │ ├── semanticentropy_results_file.json │ │ ├── generate_data_llmjudge.py │ │ ├── generate_data_ensemble.py │ │ ├── bsdetector_results_file.json │ │ └── ensemble_results_file.json │ └── similarity │ │ ├── DATA_COPYRIGHT.md │ │ └── generate_data_similarity.py ├── __init__.py ├── test_postprocessor.py ├── test_similarity.py ├── test_grader.py ├── test_semanticentropy.py ├── test_blackboxuq.py ├── test_nli.py ├── test_top_logprobs.py ├── test_logprobs_scorer.py ├── test_p_true.py ├── test_whiteboxuq.py ├── test_semanticdensity.py └── test_sampled_logprobs.py ├── uqlm ├── resources │ └── __init__.py ├── judges │ └── __init__.py ├── black_box │ ├── baseclass │ │ ├── __init__ .py │ │ └── similarity_scorer.py │ ├── __init__.py │ ├── match.py │ ├── bert.py │ ├── cosine.py │ └── consistency.py ├── white_box │ ├── baseclass │ │ ├── __init__.py │ │ └── logprobs_scorer.py │ ├── __init__.py │ ├── single_logprobs.py │ ├── top_logprobs.py │ └── p_true.py ├── scorers │ ├── baseclass │ │ └── __init__.py │ └── __init__.py ├── nli │ ├── __init__.py │ ├── nli.py │ └── cluster.py ├── calibration │ └── __init__.py ├── __init__.py └── utils │ ├── postprocessors.py │ ├── warn.py │ ├── device.py │ ├── results.py │ ├── __init__.py │ ├── display.py │ ├── llm_config.py │ └── grader.py ├── examples └── uqe_config_tuned.json ├── .gitignore ├── CONTRIBUTING.md ├── pyproject.toml └── CODE_OF_CONDUCT.md /assets/images/cranberry_jam.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/cranberry_jam.png -------------------------------------------------------------------------------- /assets/images/panel_output2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/panel_output2.png -------------------------------------------------------------------------------- /assets/images/stacked_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/stacked_logo.png -------------------------------------------------------------------------------- /assets/images/uqlm_flow_ds.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/uqlm_flow_ds.png -------------------------------------------------------------------------------- /assets/images/horizontal_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/horizontal_logo.png -------------------------------------------------------------------------------- /assets/images/judges_graphic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/judges_graphic.png -------------------------------------------------------------------------------- /assets/images/uqensemble_tune.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/uqensemble_tune.png -------------------------------------------------------------------------------- /assets/images/black_box_graphic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/black_box_graphic.png -------------------------------------------------------------------------------- /assets/images/black_box_output4.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/black_box_output4.png -------------------------------------------------------------------------------- /assets/images/uqensemble_output2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/uqensemble_output2.png -------------------------------------------------------------------------------- /assets/images/uqlm_flow_ds_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/uqlm_flow_ds_dark.png -------------------------------------------------------------------------------- /assets/images/white_box_graphic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/white_box_graphic.png -------------------------------------------------------------------------------- /assets/images/white_box_output2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/white_box_output2.png -------------------------------------------------------------------------------- /assets/images/judges_graphic_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/judges_graphic_dark.png -------------------------------------------------------------------------------- /docs/source/_static/research/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/research/logo.png 
-------------------------------------------------------------------------------- /assets/images/black_box_graphic_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/black_box_graphic_dark.png -------------------------------------------------------------------------------- /assets/images/horizontal_logo_large.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/horizontal_logo_large.png -------------------------------------------------------------------------------- /assets/images/white_box_graphic_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/white_box_graphic_dark.png -------------------------------------------------------------------------------- /docs/source/_static/images/no_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/no_image.png -------------------------------------------------------------------------------- /assets/images/uqensemble_generate_score.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/uqensemble_generate_score.png -------------------------------------------------------------------------------- /docs/source/_static/images/uqlm_flow_ds.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/uqlm_flow_ds.png -------------------------------------------------------------------------------- /docs/source/_static/images/favicon/favicon.ico: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/favicon/favicon.ico -------------------------------------------------------------------------------- /docs/source/_static/images/horizontal_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/horizontal_logo.png -------------------------------------------------------------------------------- /docs/source/_static/images/judges_graphic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/judges_graphic.png -------------------------------------------------------------------------------- /docs/source/_static/images/panel_output2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/panel_output2.png -------------------------------------------------------------------------------- /assets/images/uqensemble_generate_score_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/uqensemble_generate_score_dark.png -------------------------------------------------------------------------------- /docs/source/_static/images/black_box_graphic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/black_box_graphic.png -------------------------------------------------------------------------------- /docs/source/_static/images/black_box_output4.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/black_box_output4.png -------------------------------------------------------------------------------- /docs/source/_static/images/uqlm_flow_ds_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/uqlm_flow_ds_dark.png -------------------------------------------------------------------------------- /docs/source/_static/images/white_box_graphic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/white_box_graphic.png -------------------------------------------------------------------------------- /docs/source/_static/images/white_box_output2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/white_box_output2.png -------------------------------------------------------------------------------- /docs/source/_static/images/judges_graphic_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/judges_graphic_dark.png -------------------------------------------------------------------------------- /docs/source/_static/images/uqensemble_output2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/uqensemble_output2.png -------------------------------------------------------------------------------- /docs/source/_templates/base.rst: -------------------------------------------------------------------------------- 1 | {{ fullname | escape | underline}} 2 | 3 | .. currentmodule:: {{ module }} 4 | 5 | .. 
auto{{ objtype }}:: {{ objname }} -------------------------------------------------------------------------------- /docs/source/_static/images/black_box_graphic_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/black_box_graphic_dark.png -------------------------------------------------------------------------------- /docs/source/_static/images/favicon/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/favicon/favicon-16x16.png -------------------------------------------------------------------------------- /docs/source/_static/images/favicon/favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/favicon/favicon-32x32.png -------------------------------------------------------------------------------- /docs/source/_static/images/horizontal_logo_large.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/horizontal_logo_large.png -------------------------------------------------------------------------------- /docs/source/_static/images/horizontal_logo_no_bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/horizontal_logo_no_bg.png -------------------------------------------------------------------------------- /docs/source/_static/images/white_box_graphic_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/white_box_graphic_dark.png 
-------------------------------------------------------------------------------- /docs/source/_static/images/favicon/apple-touch-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/favicon/apple-touch-icon.png -------------------------------------------------------------------------------- /docs/source/_static/images/uqensemble_generate_score.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/uqensemble_generate_score.png -------------------------------------------------------------------------------- /docs/source/_static/images/favicon/android-chrome-192x192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/favicon/android-chrome-192x192.png -------------------------------------------------------------------------------- /docs/source/_static/images/favicon/android-chrome-512x512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/favicon/android-chrome-512x512.png -------------------------------------------------------------------------------- /docs/source/_static/images/uqensemble_generate_score_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/uqensemble_generate_score_dark.png -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | rev: v0.9.7 4 | hooks: 5 | - id: ruff 6 | 
args: [ --fix ] 7 | - id: ruff-format 8 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | directory: "/" # Location of your requirements.txt or other package manifest 5 | schedule: 6 | interval: "weekly" # Check for updates weekly 7 | target-branch: "develop" # Target branch for updates 8 | -------------------------------------------------------------------------------- /docs/source/api.rst: -------------------------------------------------------------------------------- 1 | API 2 | === 3 | 4 | .. autosummary:: 5 | :toctree: _autosummary 6 | :template: module.rst 7 | :recursive: 8 | 9 | uqlm.scorers 10 | uqlm.black_box 11 | uqlm.white_box 12 | uqlm.judges 13 | uqlm.nli 14 | uqlm.calibration 15 | uqlm.resources 16 | uqlm.utils -------------------------------------------------------------------------------- /assets/images/COPYRIGHT.md: -------------------------------------------------------------------------------- 1 | Copyright 2025 CVS Health and/or one of its affiliates 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at 4 | 5 | http://www.apache.org/licenses/LICENSE-2.0 6 | 7 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-------------------------------------------------------------------------------- /docs/source/_static/images/COPYRIGHT.md: -------------------------------------------------------------------------------- 1 | Copyright 2025 CVS Health and/or one of its affiliates 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at 4 | 5 | http://www.apache.org/licenses/LICENSE-2.0 6 | 7 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -------------------------------------------------------------------------------- /tests/data/scorers/DATA_COPYRIGHT.md: -------------------------------------------------------------------------------- 1 | Copyright 2025 CVS Health and/or one of its affiliates 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at 4 | 5 | http://www.apache.org/licenses/LICENSE-2.0 6 | 7 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
8 | -------------------------------------------------------------------------------- /tests/data/similarity/DATA_COPYRIGHT.md: -------------------------------------------------------------------------------- 1 | Copyright 2025 CVS Health and/or one of its affiliates 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at 4 | 5 | http://www.apache.org/licenses/LICENSE-2.0 6 | 7 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 8 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 CVS Health and/or one of its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/data/scorers/test_data_panelquantifier.json: -------------------------------------------------------------------------------- 1 | { 2 | "prompts": [ 3 | "What is Kathy Saltzman's occupation?", 4 | "What is Eleanor Davis's occupation?" 
5 | ], 6 | "responses": [ 7 | "Kathy Saltzman is a Senior Software Engineer at Dropbox.", 8 | "Eleanor Davis is a cartoonist and illustrator." 9 | ], 10 | "scores": { 11 | "judge_1": [0.8,0.9], 12 | "judge_2": [0.8, 0.9], 13 | "avg": [0.8, 0.9], 14 | "max": [0.8, 0.9], 15 | "min": [0.8, 0.9], 16 | "median": [0.8, 0.9] 17 | }, 18 | "metadata": { 19 | "num_judges": 2, 20 | "temperature": 0.7 21 | } 22 | } -------------------------------------------------------------------------------- /uqlm/resources/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 CVS Health and/or one of its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 
15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /uqlm/judges/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 CVS Health and/or one of its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from uqlm.judges.judge import LLMJudge 16 | 17 | __all__ = ["LLMJudge"] 18 | -------------------------------------------------------------------------------- /uqlm/black_box/baseclass/__init__ .py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 CVS Health and/or one of its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from uqlm.black_box.baseclass.similarity_scorer import SimilarityScorer 16 | 17 | __all__ = ["SimilarityScorer"] 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Version [e.g. 22] 29 | 30 | **Additional context** 31 | Add any other context about the problem here. 32 | -------------------------------------------------------------------------------- /uqlm/white_box/baseclass/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 CVS Health and/or one of its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from uqlm.white_box.baseclass.logprobs_scorer import LogprobsScorer 16 | 17 | __all__ = ["LogprobsScorer"] 18 | -------------------------------------------------------------------------------- /uqlm/scorers/baseclass/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 CVS Health and/or one of its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from uqlm.scorers.baseclass.uncertainty import UncertaintyQuantifier 16 | 17 | __all__ = ["UncertaintyQuantifier"] 18 | -------------------------------------------------------------------------------- /uqlm/nli/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 CVS Health and/or one of its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from uqlm.nli.nli import NLI 17 | from uqlm.nli.cluster import SemanticClusterer 18 | 19 | __all__ = ["NLI", "SemanticClusterer"] 20 | -------------------------------------------------------------------------------- /uqlm/calibration/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 CVS Health and/or one of its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from uqlm.calibration.score_calibrator import ScoreCalibrator 16 | from uqlm.calibration.evaluate import evaluate_calibration 17 | 18 | __all__ = ["ScoreCalibrator", "evaluate_calibration"] 19 | -------------------------------------------------------------------------------- /docs/source/_templates/class.rst: -------------------------------------------------------------------------------- 1 | {{ fullname | escape | underline}} 2 | 3 | .. currentmodule:: {{ module }} 4 | 5 | .. autoclass:: {{ objname }} 6 | :members: 7 | :show-inheritance: 8 | :inherited-members: 9 | 10 | {% block methods %} 11 | .. automethod:: __init__ 12 | 13 | {% if methods %} 14 | .. rubric:: {{ _('Methods') }} 15 | 16 | .. 
autosummary:: 17 | {% for item in methods %} 18 | ~{{ name }}.{{ item }} 19 | {%- endfor %} 20 | {% endif %} 21 | {% endblock %} 22 | 23 | {% block attributes %} 24 | {% if attributes %} 25 | .. rubric:: {{ _('Attributes') }} 26 | 27 | .. autosummary:: 28 | {% for item in attributes %} 29 | ~{{ name }}.{{ item }} 30 | {%- endfor %} 31 | {% endif %} 32 | {% endblock %} 33 | 34 | {% block references %} 35 | .. rubric:: {{ _('References') }} 36 | 37 | .. footbibliography:: 38 | {% endblock %} -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /uqlm/black_box/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 CVS Health and/or one of its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from uqlm.black_box.bert import BertScorer 16 | 17 | from uqlm.black_box.cosine import CosineScorer 18 | from uqlm.black_box.match import MatchScorer 19 | from uqlm.black_box.consistency import ConsistencyScorer 20 | 21 | __all__ = ["BertScorer", "CosineScorer", "MatchScorer", "ConsistencyScorer"] 22 | -------------------------------------------------------------------------------- /tests/test_postprocessor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 CVS Health and/or one of its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
# Raw model outputs mapped to the digit string expected from math_postprocessor.
TEST_DATA = {"$3.134": "3", "the answer is 12 cookies": "12", "Hmmm /n perhaps 555.,7&333$5x": "555"}


def test_math_postprocessor():
    """Check that every raw answer in TEST_DATA is reduced to its expected digits."""
    for raw_answer, expected in TEST_DATA.items():
        assert expected == math_postprocessor(raw_answer)
14 | 15 | from uqlm.scorers.ensemble import UQEnsemble 16 | from uqlm.scorers.entropy import SemanticEntropy 17 | from uqlm.scorers.panel import LLMPanel 18 | from uqlm.scorers.white_box import WhiteBoxUQ 19 | from uqlm.scorers.black_box import BlackBoxUQ 20 | 21 | __all__ = ["UQEnsemble", "SemanticEntropy", "LLMPanel", "WhiteBoxUQ", "BlackBoxUQ"] 22 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'uqlm/**' 7 | - 'tests/**' 8 | - 'poetry.lock' 9 | pull_request: 10 | paths: 11 | - 'uqlm/**' 12 | - 'tests/**' 13 | - 'poetry.lock' 14 | 15 | jobs: 16 | run-tests: 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | os: [ubuntu-latest, macos-latest, windows-latest] 21 | python-version: 22 | - "3.10" 23 | - "3.11" 24 | - "3.12" 25 | - "3.13.3" 26 | 27 | name: Test 28 | runs-on: ${{ matrix.os }} 29 | 30 | steps: 31 | - name: Checkout code 32 | uses: actions/checkout@v5 33 | 34 | - name: Set up Python 35 | uses: actions/setup-python@v5 36 | with: 37 | python-version: ${{matrix.python-version}} 38 | 39 | - name: Install dependencies 40 | run: python -m pip install pytest pytest-asyncio pytest-rerunfailures langchain-openai . 41 | 42 | - name: Run tests 43 | run: pytest -v 44 | -------------------------------------------------------------------------------- /uqlm/scorers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 CVS Health and/or one of its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from uqlm.scorers.ensemble import UQEnsemble 16 | from uqlm.scorers.entropy import SemanticEntropy 17 | from uqlm.scorers.panel import LLMPanel 18 | from uqlm.scorers.white_box import WhiteBoxUQ 19 | from uqlm.scorers.black_box import BlackBoxUQ 20 | from uqlm.scorers.density import SemanticDensity 21 | 22 | __all__ = ["UQEnsemble", "SemanticEntropy", "LLMPanel", "WhiteBoxUQ", "BlackBoxUQ", "SemanticDensity"] 23 | -------------------------------------------------------------------------------- /.github/workflows/linting.yml: -------------------------------------------------------------------------------- 1 | name: Linting with Ruff 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | - develop 8 | workflow_dispatch: 9 | 10 | concurrency: 11 | group: ${{ github.workflow }}-${{ github.ref }} 12 | cancel-in-progress: true 13 | 14 | jobs: 15 | ruff-formatting: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: Set up Python 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: "3.9" 23 | cache: 'pip' 24 | - name: Get Ruff version and install 25 | run: | 26 | pip install poetry 27 | RUFF_VERSION=$(poetry show --only=dev | grep '^ruff ' | awk '{print $3}') 28 | echo "Installing ruff==$RUFF_VERSION" 29 | pip install ruff==$RUFF_VERSION 30 | - name: Lint with Ruff 31 | run: | 32 | ruff check uqlm/ 33 | - name: Check for unformatted files 34 | run: | 35 | ruff format --check uqlm/ 36 | -------------------------------------------------------------------------------- 
class SimilarityScorer(ABC):
    """Interface that every text similarity scorer has to implement."""

    @abstractmethod
    def __init__(self):
        """Constructor placeholder; concrete scorers must define their own."""

    @abstractmethod
    def evaluate(self, responses: List[str], sampled_responses: List[str]) -> List[float]:
        """
        Metric computation placeholder; concrete scorers must override it.

        Parameters
        ----------
        responses : list of str
            Responses to be scored.

        sampled_responses : list of str
            Sampled responses used for the comparison. How they are paired
            with `responses` is defined by the concrete scorer.

        Returns
        -------
        list of float
            Scores computed by the concrete scorer.
        """
def math_postprocessor(input_string: str) -> str:
    """
    Extract the integer digits of a numerical answer embedded in text.

    Every digit that occurs before the first period is kept, in order. All other
    characters are dropped, and everything from the first period onward is ignored.

    Parameters
    ----------
    input_string: str
        The string from which the numerical answer will be extracted. Only the integer part is extracted.

    Returns
    -------
    str
        The digits found before the first period, concatenated. Empty when there are none.
    """
    # Only the text in front of the first "." can contribute digits.
    integer_part, _, _ = input_string.partition(".")
    return "".join(filter(str.isdigit, integer_part))
../../docs/$(VERSION)/ 33 | @cp ../versions.json ../../docs/versions.json 34 | 35 | local: 36 | @python -m http.server --directory ../../docs/$(VERSION)/ 8080 37 | -------------------------------------------------------------------------------- /examples/uqe_config_tuned.json: -------------------------------------------------------------------------------- 1 | { 2 | "weights": [ 3 | 0.0025387213438219183, 4 | 0.8341461711896818, 5 | 0.040041378980108795, 6 | 0.12327372848638753 7 | ], 8 | "thresh": 0.74, 9 | "components": [ 10 | "exact_match", 11 | "noncontradiction", 12 | "normalized_probability", 13 | "judge_1" 14 | ], 15 | "llm_config": { 16 | "class_name": "AzureChatOpenAI", 17 | "module": "langchain_openai.chat_models.azure", 18 | "deployment_name": "gpt-4o-mini", 19 | "logprobs": true, 20 | "model_version": "", 21 | "n": 5, 22 | "openai_api_type": "azure", 23 | "openai_api_version": "2024-02-15-preview", 24 | "profile": {}, 25 | "streaming": false, 26 | "use_previous_response_id": false, 27 | "verbose": false 28 | }, 29 | "llm_scorers": { 30 | "judge_1": { 31 | "class_name": "AzureChatOpenAI", 32 | "module": "langchain_openai.chat_models.azure", 33 | "deployment_name": "gpt-4o-mini", 34 | "logprobs": true, 35 | "model_version": "", 36 | "n": 5, 37 | "openai_api_type": "azure", 38 | "openai_api_version": "2024-02-15-preview", 39 | "profile": {}, 40 | "streaming": false, 41 | "use_previous_response_id": false, 42 | "verbose": false 43 | } 44 | } 45 | } -------------------------------------------------------------------------------- /uqlm/utils/warn.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 CVS Health and/or one of its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
class UQLMBetaWarning(Warning):
    """Custom warning class for beta features in UQLM."""


def beta_warning(message: str) -> None:
    """
    Issue a beta warning with a custom message.

    Parameters
    ----------
    message : str
        Text of the warning.
    """
    # stacklevel=2 attributes the warning to the caller of this helper
    # rather than to this module.
    warnings.warn(message, category=UQLMBetaWarning, stacklevel=2)


class UQLMDeprecationWarning(Warning):
    """Custom warning class for future deprecation of features in UQLM."""


def deprecation_warning(message: str) -> None:
    """
    Issue a deprecation warning with a custom message.

    Parameters
    ----------
    message : str
        Text of the warning.
    """
    # stacklevel=2 attributes the warning to the caller of this helper
    # rather than to this module.
    warnings.warn(message, category=UQLMDeprecationWarning, stacklevel=2)
def _version_key(folder_name):
    """Return the numeric parts of a ``v<major>.<minor>...`` folder name, or None if it is not one."""
    try:
        return tuple(int(part) for part in folder_name[1:].split("."))
    except ValueError:
        return None


def rebuild_version_json(current_version, gh_pages_path, site_url="https://cvs-health.github.io/uqlm"):
    """
    Rewrite ``versions.json`` in the published docs folder.

    The first entry always describes `current_version` and points at
    ``<site_url>/latest/``. It is followed by one entry per ``v<version>``
    directory found in `gh_pages_path`, ordered from newest to oldest.

    Parameters
    ----------
    current_version : str
        Version string (without the leading "v") to label as latest.

    gh_pages_path : str or pathlib.Path
        Directory that contains the versioned documentation folders. The
        ``versions.json`` file is written into this directory.

    site_url : str
        Base URL used to build the link of every entry.
    """
    root = Path(gh_pages_path)
    version_json_path = root / "versions.json"

    # Keep only directories named like v1.2.3. 'latest' and any other folder
    # that merely starts with "v" (e.g. "vendor") are skipped instead of
    # breaking the numeric sort below.
    versioned = []
    for path in root.iterdir():
        if not (path.is_dir() and path.name.startswith("v")):
            continue
        key = _version_key(path.name)
        if key is not None:
            versioned.append((key, path.name[1:]))  # strip leading 'v'

    # Compare versions numerically so that 0.10.x sorts above 0.9.x.
    versioned.sort(key=lambda item: item[0], reverse=True)

    entries = [{"name": f"v{current_version} (latest)", "version": current_version, "url": f"{site_url}/latest/"}]
    entries.extend({"name": f"v{version}", "version": version, "url": f"{site_url}/v{version}/"} for _, version in versioned)

    # Save versions.json
    with open(version_json_path, "w", encoding="utf-8") as f:
        json.dump(entries, f, indent=4)


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python update_version_json.py <current_version> <gh_pages_path>")
        sys.exit(1)

    current_version = sys.argv[1]
    gh_pages_path = sys.argv[2]
    rebuild_version_json(current_version, gh_pages_path)
autosummary:: 59 | :toctree: 60 | :template: module.rst 61 | :recursive: 62 | {% for item in modules %} 63 | {{ item }} 64 | {%- endfor %} 65 | {% endif %} 66 | {% endblock %} -------------------------------------------------------------------------------- /tests/data/scorers/blackbox_results_file.json: -------------------------------------------------------------------------------- 1 | {"data": {"responses": ["30", "8", "17", "$5", "11"], "sampled_responses": [["30", "30", "30", "30", "30"], ["8", "8 emails", "8", "8", "8"], ["17", "17 marbles", "17", "17", "17"], ["$5", "$5", "$5", "$5", "$5"], ["11", "11", "11", "11", "11"]], "prompts": ["When you solve this math problem only return the answer with no additional text.\n7 red peaches, 15 yellow peaches and 8 green peaches are in the basket. How many peaches are in the basket?", "When you solve this math problem only return the answer with no additional text.\nJack received 3 emails and 64 letters in the morning. He then received 5 emails and 54 letters in the afternoon. How many emails did jack receive in the day?", "When you solve this math problem only return the answer with no additional text.\nEd had 2 more marbles than Doug. Doug lost some of his marbles at the playground. Now Ed has 19 more marbles than doug. How many marbles did Doug lose?", "When you solve this math problem only return the answer with no additional text.\nDan has $ 3 left with him after he bought a candy bar for $ 2. How much money did he have initially?", "When you solve this math problem only return the answer with no additional text.\nJake has 13 fewer peaches and 3 more apples than Steven. Steven has 9 peaches and 8 apples. 
class UQResult:
    def __init__(self, result: Dict[str, Any]) -> None:
        """
        Container for the output of an UncertaintyQuantifier.

        Parameters
        ----------
        result: dict
            A dictionary that is defined during `evaluate` or `tune_params` method
        """
        self.result_dict = result
        self.metadata = result.get("metadata")
        self.data = result.get("data")

    def to_dict(self) -> Dict[str, Any]:
        """
        Returns the dictionary this object was created from
        """
        return self.result_dict

    def to_df(self) -> pd.DataFrame:
        """
        Returns the "data" entry of the result as a pd.DataFrame.

        Column names ending in "s" lose that trailing "s", except for
        "sampled_responses" and "raw_sampled_responses", which are kept as is.
        """
        keep_plural = ["sampled_responses", "raw_sampled_responses"]
        data = self.result_dict["data"]

        singular_names = {}
        for column in data.keys():
            if column.endswith("s") and column not in keep_plural:
                singular_names[column] = column[:-1]

        frame = pd.DataFrame(data)
        return frame.rename(columns=singular_names)
Steven has 9 peaches and 8 apples. How many apples does Jake have?"], "responses": ["30", "8", "17", "$5", "11"], "logprobs": [[{"token": "30", "bytes": [51, 48], "logprob": -9.0883464e-07, "top_logprobs": []}], [{"token": "8", "bytes": [56], "logprob": -1.2664457e-06, "top_logprobs": []}], [{"token": "17", "bytes": [49, 55], "logprob": -0.0007860411, "top_logprobs": []}], [{"token": "$", "bytes": [36], "logprob": -0.038273167, "top_logprobs": []}, {"token": "5", "bytes": [53], "logprob": -0.00026145502, "top_logprobs": []}], [{"token": "11", "bytes": [49, 49], "logprob": -2.5226382e-05, "top_logprobs": []}]], "normalized_probability": [0.999999091165773, 0.9999987335551019, 0.9992142677493774, 0.9809171172485425, 0.9999747739361825], "min_probability": [0.999999091165773, 0.9999987335551019, 0.9992142677493774, 0.9624499954009256, 0.9999747739361825]}, "metadata": {"temperature": 1.0}} -------------------------------------------------------------------------------- /uqlm/white_box/single_logprobs.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 CVS Health and/or one of its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
SINGLE_LOGPROBS_SCORER_NAMES = ["normalized_probability", "min_probability", "sequence_probability"]


class SingleLogprobsScorer(LogprobsScorer):
    def __init__(self, scorers: List[str] = SINGLE_LOGPROBS_SCORER_NAMES):
        """
        Class for computing WhiteBox UQ scores with a single generation

        Parameters
        ----------
        scorers : list of str
            Names of the scores to compute. Each must be one of
            SINGLE_LOGPROBS_SCORER_NAMES. Defaults to all of them.
        """
        super().__init__()
        # Copy so that later changes to `self.scorers` can never mutate the
        # module-level default list (or a list owned by the caller).
        self.scorers = list(scorers)

    def evaluate(self, logprobs_results: List[List[Dict[str, Any]]]) -> Dict[str, List[float]]:
        """
        Compute scores from logprobs results

        Parameters
        ----------
        logprobs_results : list of list of dict
            One list of logprob entries per response.

        Returns
        -------
        dict
            Maps each name in `self.scorers` to its list of scores.

        Raises
        ------
        KeyError
            If `self.scorers` contains a name that is not a supported scorer.
        """
        score_functions = {"normalized_probability": self._norm_prob, "min_probability": self._min_prob, "sequence_probability": self._seq_prob}
        # Only the requested scores are computed; unrequested ones are skipped entirely.
        return {name: self._compute_single_generation_scores(logprobs_results, score_functions[name]) for name in self.scorers}

    def _min_prob(self, single_response_logprobs: List[Dict[str, Any]]) -> float:
        """Compute minimum token probability"""
        probs = self.extract_probs(single_response_logprobs)
        return np.min(probs)
from uqlm.utils.plots import plot_model_accuracies, plot_filtered_accuracy, plot_ranked_auc
from uqlm.utils.dataloader import load_dataset, load_example_dataset
from uqlm.utils.postprocessors import math_postprocessor
from uqlm.utils.response_generator import ResponseGenerator
from uqlm.utils.results import UQResult
from uqlm.utils.tuner import Tuner
from uqlm.utils.grader import LLMGrader
from uqlm.utils.llm_config import save_llm_config, load_llm_config
from uqlm.utils.display import ConditionalBarColumn, ConditionalTimeElapsedColumn, ConditionalTextColumn, ConditionalSpinnerColumn
from uqlm.utils.warn import beta_warning, deprecation_warning
from uqlm.utils.device import get_best_device

# Public API of uqlm.utils; every name is listed exactly once.
__all__ = [
    "plot_model_accuracies",
    "plot_filtered_accuracy",
    "plot_ranked_auc",
    "load_dataset",
    "load_example_dataset",
    "math_postprocessor",
    "ResponseGenerator",
    "UQResult",
    "Tuner",
    "LLMGrader",
    "save_llm_config",
    "load_llm_config",
    "ConditionalBarColumn",
    "ConditionalTimeElapsedColumn",
    "ConditionalTextColumn",
    "ConditionalSpinnerColumn",
    "beta_warning",
    "deprecation_warning",
    "get_best_device",
]
# Task descriptions that act as section headers; the columns below render nothing for them.
HEADERS = ["🤖 Generation", "📈 Scoring", "⚙️ Optimization", "🤖🧮 Generation with Logprobs", "", " - [black]Grading responses against provided ground truth answers with default grader..."]
# Task descriptions for which the text column shows a percentage.
OPTIMIZATION_TASKS = [" - [black]Optimizing weights...", " - [black]Jointly optimizing weights and threshold using grid search...", " - [black]Optimizing weights using grid search...", " - [black]Optimizing threshold with grid search..."]


class ConditionalBarColumn(BarColumn):
    """Bar column that is blank for header tasks."""

    def render(self, task):
        """Return "" when the task description is a header, else the default rendering."""
        is_header = task.description in HEADERS
        return "" if is_header else super().render(task)


class ConditionalTimeElapsedColumn(TimeElapsedColumn):
    """Elapsed-time column that is blank for header tasks."""

    def render(self, task):
        """Return "" when the task description is a header, else the default rendering."""
        is_header = task.description in HEADERS
        return "" if is_header else super().render(task)


class ConditionalTextColumn(TextColumn):
    """Text column that is blank for headers and shows a percentage for optimization tasks."""

    def render(self, task):
        """Return "", a percentage string, or the default rendering depending on the task description."""
        description = task.description
        if description in HEADERS:
            return ""
        if description in OPTIMIZATION_TASKS:
            return f"[progress.percentage]{task.percentage:>3.0f}%"
        return super().render(task)


class ConditionalSpinnerColumn(SpinnerColumn):
    """Spinner column that is blank for header tasks."""

    def render(self, task):
        """Return "" when the task description is a header, else the default rendering."""
        is_header = task.description in HEADERS
        return "" if is_header else super().render(task)
async def main():
    """Generate reference ``SemanticEntropy`` results and save them as JSON.

    Reads the Azure OpenAI settings from a ``.env`` file, builds prompts from
    the last five rows of the ``gsm8k`` example dataset, scores them with
    ``SemanticEntropy`` and writes ``results.to_dict()`` to
    ``semanticentropy_results_file.json`` in the current working directory.
    """
    # User to populate .env file with API credentials
    load_dotenv(find_dotenv())

    API_KEY = os.getenv("API_KEY")
    API_BASE = os.getenv("API_BASE")
    API_TYPE = os.getenv("API_TYPE")
    API_VERSION = os.getenv("API_VERSION")
    DEPLOYMENT_NAME = os.getenv("DEPLOYMENT_NAME")

    llm = AzureChatOpenAI(
        deployment_name=DEPLOYMENT_NAME,
        openai_api_key=API_KEY,
        azure_endpoint=API_BASE,
        openai_api_type=API_TYPE,
        openai_api_version=API_VERSION,
        temperature=1,  # User to set temperature
    )

    # Last five rows of the gsm8k example dataset are used as the prompt set
    gsm8k = load_example_dataset("gsm8k").rename(columns={"question_concat": "question", "Answer": "answer"})[["question", "answer"]].tail(5)

    # Define prompts
    MATH_INSTRUCTION = "Solve the math problem, but return only the numerical answer.\n"
    prompts = [MATH_INSTRUCTION + prompt for prompt in gsm8k.question]

    se = SemanticEntropy(llm=llm, use_best=False)

    results = await se.generate_and_score(prompts=prompts)

    results_file = "semanticentropy_results_file.json"
    with open(results_file, "w") as f:
        json.dump(results.to_dict(), f)


if __name__ == "__main__":
    # main() is a coroutine function: calling it bare only creates a coroutine
    # object that is never awaited, so it must be driven by an event loop.
    import asyncio

    asyncio.run(main())
async def main():
    """Generate reference ``WhiteBoxUQ`` results and save them as JSON.

    Builds prompts from the last five rows of the ``svamp`` example dataset,
    reads the Azure OpenAI settings from a ``.env`` file, scores the prompts
    with ``WhiteBoxUQ`` and writes ``results.to_dict()`` to
    ``whitebox_results_file.json`` in the current working directory.
    """
    # svamp dataset to be used as a prod dataset
    svamp = load_example_dataset("svamp").rename(columns={"question_concat": "question", "Answer": "answer"})[["question", "answer"]].tail(5)

    # Define prompts
    MATH_INSTRUCTION = "When you solve this math problem only return the answer with no additional text.\n"
    prompts = [MATH_INSTRUCTION + prompt for prompt in svamp.question]

    # User to populate .env file with API credentials
    load_dotenv(find_dotenv())

    API_KEY = os.getenv("API_KEY")
    API_BASE = os.getenv("API_BASE")
    API_TYPE = os.getenv("API_TYPE")
    API_VERSION = os.getenv("API_VERSION")
    DEPLOYMENT_NAME = os.getenv("DEPLOYMENT_NAME")

    # This will be our main LLM for generation
    gpt = AzureChatOpenAI(
        deployment_name=DEPLOYMENT_NAME,
        openai_api_key=API_KEY,
        azure_endpoint=API_BASE,
        openai_api_type=API_TYPE,
        openai_api_version=API_VERSION,
        temperature=1,  # User to set temperature
    )

    wbuq = WhiteBoxUQ(llm=gpt)

    results = await wbuq.generate_and_score(prompts=prompts)

    results_file = "whitebox_results_file.json"
    with open(results_file, "w") as f:
        json.dump(results.to_dict(), f)


if __name__ == "__main__":
    # main() is a coroutine function: calling it bare only creates a coroutine
    # object that is never awaited, so it must be driven by an event loop.
    import asyncio

    asyncio.run(main())
async def main():
    """Generate reference ``BlackBoxUQ`` results and save them as JSON.

    Builds prompts from the last five rows of the ``svamp`` example dataset,
    reads the Azure OpenAI settings from a ``.env`` file, scores the prompts
    with ``BlackBoxUQ`` (five sampled responses per prompt) and writes
    ``results.to_dict()`` to ``blackbox_results_file.json`` in the current
    working directory.
    """
    # svamp dataset to be used as a prod dataset
    svamp = load_example_dataset("svamp").rename(columns={"question_concat": "question", "Answer": "answer"})[["question", "answer"]].tail(5)

    # Define prompts
    MATH_INSTRUCTION = "When you solve this math problem only return the answer with no additional text.\n"
    prompts = [MATH_INSTRUCTION + prompt for prompt in svamp.question]

    # User to populate .env file with API credentials
    load_dotenv(find_dotenv())

    API_KEY = os.getenv("API_KEY")
    API_BASE = os.getenv("API_BASE")
    API_TYPE = os.getenv("API_TYPE")
    API_VERSION = os.getenv("API_VERSION")
    DEPLOYMENT_NAME = os.getenv("DEPLOYMENT_NAME")

    # This will be our main LLM for generation
    gpt = AzureChatOpenAI(
        deployment_name=DEPLOYMENT_NAME,
        openai_api_key=API_KEY,
        azure_endpoint=API_BASE,
        openai_api_type=API_TYPE,
        openai_api_version=API_VERSION,
        temperature=1,  # User to set temperature
    )

    bbuq = BlackBoxUQ(llm=gpt, scorers=["noncontradiction", "exact_match", "semantic_negentropy"])

    results = await bbuq.generate_and_score(prompts=prompts, num_responses=5)

    results_file = "blackbox_results_file.json"
    with open(results_file, "w") as f:
        json.dump(results.to_dict(), f)


if __name__ == "__main__":
    # main() is a coroutine function: calling it bare only creates a coroutine
    # object that is never awaited, so it must be driven by an event loop.
    import asyncio

    asyncio.run(main())
If he makes 12 bracelets and after buying the cookies still has $3, how much did the box of cookies cost?", "Solve the math problem, but return only the numerical answer.\nColin can skip at six times the speed that Brandon can. Brandon can skip at one-third the speed that Tony can. And Tony can skip at twice the speed that Bruce can. At what speed, in miles per hour, can Colin skip if Bruce skips at 1 mile per hour?", "Solve the math problem, but return only the numerical answer.\nJanet, a third grade teacher, is picking up the sack lunch order from a local deli for the field trip she is taking her class on. There are 35 children in her class, 5 volunteer chaperones, and herself. She she also ordered three additional sack lunches, just in case there was a problem. Each sack lunch costs $7. How much do all the lunches cost in total?", "Solve the math problem, but return only the numerical answer.\nAt 30, Anika is 4/3 the age of Maddie. What would be their average age in 15 years?"]}, "metadata": {"parameters": {"temperature": 1.0, "sampling_temperature": 1.0, "num_responses": 5}}} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cov.xml 56 | cov-term-missing.txt 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | docs/ 78 | docs_srcs/ 79 | 80 | # PyBuilder 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 101 | __pypackages__/ 102 | 103 | # Celery stuff 104 | celerybeat-schedule 105 | celerybeat.pid 106 | 107 | # SageMath parsed files 108 | *.sage.py 109 | 110 | # Environments 111 | .env 112 | .venv 113 | env/ 114 | venv/ 115 | ENV/ 116 | env.bak/ 117 | venv.bak/ 118 | 119 | # Spyder project settings 120 | .spyderproject 121 | .spyproject 122 | 123 | # Rope project settings 124 | .ropeproject 125 | 126 | # mkdocs documentation 127 | /site 128 | 129 | # mypy 130 | .mypy_cache/ 131 | .dmypy.json 132 | dmypy.json 133 | 134 | # Pyre type checker 135 | .pyre/ 136 | 137 | # macos 138 | *.DS_Store 139 | 140 | # download data 141 | **/BLEURT-20/ 142 | 143 | # for dev 144 | /experiments 145 | 146 | 147 | .vscode/ 148 | .settings/ -------------------------------------------------------------------------------- /uqlm/black_box/match.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 CVS Health and/or one of its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
class MatchScorer(SimilarityScorer):
    def __init__(self) -> None:
        """
        Class for computing exact match rates between original responses and candidates. This
        method is based on Cole et al.(2023) :footcite:`cole2023selectivelyansweringambiguousquestions`.
        """
        pass

    def evaluate(self, responses: List[str], sampled_responses: List[List[str]], progress_bar: Optional[Progress] = None) -> List[float]:
        """
        This method computes exact match rates for the provided pairs of texts.

        Parameters
        ----------
        responses : list of strings
            Original LLM response

        sampled_responses : list of list of strings
            Candidate responses to be compared to the original response

        progress_bar : rich.progress.Progress, default=None
            If provided, displays a progress bar while scoring responses

        Returns
        -------
        List of float
            Exact match rates, one per (response, candidates) pair
        """
        if progress_bar:
            progress_task = progress_bar.add_task(" - Scoring responses with exact match...", total=len(responses))
        results = []
        # Responses and their candidate lists are paired positionally.
        for response, candidates in zip(responses, sampled_responses):
            results.append(self._compute_score(response=response, candidates=candidates))
            if progress_bar:
                progress_bar.update(progress_task, advance=1)
        # NOTE(review): short pause kept from the original implementation;
        # presumably it lets the progress display refresh before returning - confirm.
        time.sleep(0.1)
        return results

    @staticmethod
    def _compute_score(response: str, candidates: List[str]) -> float:
        """
        Get mean exact match rate between response and set of candidates.

        Parameters
        ----------
        response : str
            Original response

        candidates : list of strings
            Candidate responses compared to ``response`` with ``==``

        Returns
        -------
        float
            Fraction of candidates that equal ``response``. An empty ``candidates``
            list yields ``nan``, which is what ``np.mean`` returns for an empty input.
        """
        return np.mean([1 if response == c else 0 for c in candidates])
def test_bert():
    """BertScorer scores on CPU match the stored reference values within 1e-5."""
    scorer = BertScorer(device="cpu")
    observed = scorer.evaluate(responses=responses, sampled_responses=sampled_responses)
    expected = data["bert_result"]
    assert all([abs(score - expected[idx]) < 1e-5 for idx, score in enumerate(observed)])


def test_cosine(monkeypatch):
    """CosineScorer scores match the stored reference values when the encoder is mocked."""
    embeddings1, embeddings2 = data["embeddings1"], data["embeddings2"]

    cosine = CosineScorer()

    # Mock return from ('SentenceTransformer.encode' method)
    def mock_encode(*args, **kwargs):
        # Serve stored embeddings alternately: take from the first list while it is
        # at least as long as the second, otherwise from the second.
        source = embeddings1 if len(embeddings1) >= len(embeddings2) else embeddings2
        return np.array(source.pop(0))

    monkeypatch.setattr(cosine.model, "encode", mock_encode)

    observed = cosine.evaluate(responses=responses, sampled_responses=sampled_responses)
    expected = data["cosine_result"]
    assert all([abs(score - expected[idx]) < 1e-5 for idx, score in enumerate(observed)])


def test_exact_match():
    """MatchScorer scores match the stored reference values within 1e-5."""
    scorer = MatchScorer()
    observed = scorer.evaluate(responses=responses, sampled_responses=sampled_responses)
    expected = data["match_result"]
    assert all([abs(score - expected[idx]) < 1e-5 for idx, score in enumerate(observed)])


def test_abstract_base_class():
    """Test to cover abstract base class"""

    class TestSimilarityScorer(SimilarityScorer):
        """Minimal concrete subclass that delegates to the base implementations."""

        def __init__(self):
            super().__init__()

        def evaluate(self, responses, sampled_responses):
            # Call the base method so that it is executed, then return a fixed score.
            super().evaluate(responses, sampled_responses)
            return [1.0]

    concrete = TestSimilarityScorer()
    outcome = concrete.evaluate(["test"], ["sample"])
    assert outcome == [1.0]
@pytest.mark.asyncio
async def test_grade_responses():
    """Test the grade_responses method"""
    # Async response generator whose generate_responses call yields canned grades.
    fake_generator = AsyncMock()
    fake_generator.generate_responses.return_value = {"data": {"response": ["yes", "no", "yes"]}}

    fake_llm = MagicMock()
    fake_llm.response_generator = fake_generator

    grader = LLMGrader(llm=fake_llm)
    grader.response_generator = fake_generator

    prompts = ["What is 2+2?", "What is the capital of France?", "What is 5*5?"]
    responses = ["4", "Berlin", "25"]
    answers = [["4"], ["Paris"], ["25"]]

    grades = await grader.grade_responses(prompts, responses, answers)
    assert grades == [True, False, True]


def test_extract_grades():
    """Test the _extract_grades method"""
    # Raw grader output paired with the boolean grade it must map to.
    cases = [("yes", True), ("no", False), ("YES", True), ("NO", False), ("maybe", False)]
    for raw, expected in cases:
        assert LLMGrader._extract_grades(raw) is expected
59 | 60 | **Ground Truth Answers (Answer Key):** 61 | ['4', 'four'] 62 | 63 | **Proposed Answer to Grade:** 64 | 4 65 | 66 | Now your answer is (yes or no): 67 | """ 68 | result = LLMGrader._construct_grader_prompt(prompt, response, acceptable_answers) 69 | assert result.strip() == expected_prompt.strip() 70 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to UQLM 2 | 3 | Welcome and thank you for considering contributing to UQLM! 4 | 5 | It takes a lot of time and effort to use software, much less build upon it, so we deeply appreciate your desire to help make this project thrive. 6 | 7 | ## Table of Contents 8 | 9 | 1. [How to Contribute](#how-to-contribute) 10 | - [Reporting Bugs](#reporting-bugs) 11 | - [Suggesting Enhancements](#suggesting-enhancements) 12 | - [Pull Requests](#pull-requests) 13 | 2. [Development Setup](#development-setup) 14 | 3. [Style Guides](#style-guides) 15 | - [Code Style](#code-style) 16 | 17 | ## How to Contribute 18 | 19 | ### Reporting Bugs 20 | 21 | If you find a bug, please report it by opening an issue on GitHub. Include as much detail as possible: 22 | - Steps to reproduce the bug. 23 | - Expected and actual behavior. 24 | - Screenshots if applicable. 25 | - Any other information that might help us understand the problem. 26 | 27 | ### Suggesting Enhancements 28 | 29 | We welcome suggestions for new features or improvements. To suggest an enhancement, please open an issue on GitHub and include: 30 | - A clear description of the suggested enhancement. 31 | - Why you believe this enhancement would be useful. 32 | - Any relevant examples or mockups. 33 | 34 | ### Pull Requests 35 | 36 | 1. Fork the repository. 37 | 2. Create a new branch (`git checkout -b feature/your-feature-name`). 38 | 3. Make your changes. 39 | 4. Commit your changes (`git commit -m 'Add some feature'`). 40 | 5.
Push to the branch (`git push origin feature/your-feature-name`). 41 | 6. Open a pull request. 42 | 43 | Please ensure your pull request adheres to the following guidelines: 44 | - Follow the project's code style. 45 | - Include tests for any new features or bug fixes. 46 | 47 | ## Development Setup 48 | 49 | 1. Clone the repository: `git clone https://github.com/cvs-health/uqlm.git` 50 | 2. Navigate to the project directory: `cd uqlm` 51 | 3. Create and activate a virtual environment (using `venv` or `conda`) 52 | 4. Install poetry (if you don't already have it): `pip install poetry` 53 | 5. Install uqlm with dev dependencies: `poetry install --with dev` 54 | 6. Install our pre-commit hooks to ensure code style compliance: `pre-commit install` 55 | 7. Run tests to ensure everything is working: `pre-commit run --all-files` 56 | 57 | You're ready to develop! 58 | 59 | ## Style Guides 60 | 61 | ### Code Style 62 | 63 | - We use [Ruff](https://github.com/astral-sh/ruff) to lint and format our files. 64 | - Our pre-commit hook will run Ruff linting and formatting when you commit. 65 | - You can manually run Ruff at any time (see [Ruff usage](https://github.com/astral-sh/ruff#usage)). 66 | 67 | Please ensure your code is properly formatted and linted before committing. 68 | 69 | ## License 70 | 71 | Before contributing to this CVS Health sponsored project, you will need to sign the associated [Contributor License Agreement (CLA)](https://forms.office.com/r/iFZWwzjt9C) 72 | 73 | --- 74 | 75 | Thanks again for using and supporting uqlm! -------------------------------------------------------------------------------- /tests/test_semanticentropy.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 CVS Health and/or one of its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
@pytest.mark.flaky(reruns=3)
@pytest.mark.asyncio
async def test_semanticentropy(monkeypatch):
    """Check SemanticEntropy output against the stored reference file with generation mocked."""
    PROMPTS = data["prompts"]
    MOCKED_RESPONSES = data["responses"]
    MOCKED_SAMPLED_RESPONSES = data["sampled_responses"]

    # Initiate SemanticEntropy class object
    se_object = SemanticEntropy(llm=mock_object, use_best=False, device="cpu")

    # Replacement for generate_original_responses: returns the stored responses
    # and sets placeholder (None) logprobs on the scorer.
    async def mock_generate_original_responses(*args, **kwargs):
        se_object.logprobs = [None] * 5
        return MOCKED_RESPONSES

    # Replacement for generate_candidate_responses: returns the stored sampled
    # responses and sets placeholder (None) logprobs for every sample.
    async def mock_generate_candidate_responses(*args, **kwargs):
        se_object.multiple_logprobs = [[None] * 5] * 5
        return MOCKED_SAMPLED_RESPONSES

    monkeypatch.setattr(se_object, "generate_original_responses", mock_generate_original_responses)
    monkeypatch.setattr(se_object, "generate_candidate_responses", mock_generate_candidate_responses)

    # Exercise both the silent path and the progress-bar path.
    for show_progress_bars in [False, True]:
        se_results = await se_object.generate_and_score(prompts=PROMPTS, show_progress_bars=show_progress_bars)
        # Clear logprobs, then score the stored responses directly; the
        # assertions below check the result of this score() call.
        se_object.logprobs = None
        se_results = se_object.score(responses=MOCKED_RESPONSES, sampled_responses=MOCKED_SAMPLED_RESPONSES)
        assert se_results.data["responses"] == data["responses"]
        assert se_results.data["sampled_responses"] == data["sampled_responses"]
        assert se_results.data["prompts"] == data["prompts"]
        # Numeric outputs are compared with an absolute tolerance of 1e-5.
        assert all([abs(se_results.data["discrete_entropy_values"][i] - data["entropy_values"][i]) < 1e-5 for i in range(len(PROMPTS))])
        assert all([abs(se_results.data["discrete_confidence_scores"][i] - data["confidence_scores"][i]) < 1e-5 for i in range(len(PROMPTS))])
        assert se_results.metadata == metadata
async def main():
    """Generate reference similarity-scorer results and save them as JSON.

    Loads ``responses`` and ``sampled_responses`` from
    ``scorers/bsdetector_results_file.json`` in the parent of the current
    working directory, runs the BERT, BLEURT, cosine and exact-match scorers
    on them, and writes all inputs, scores and intermediate embeddings to
    ``similarity_results_file.json`` in the current working directory.
    """
    # Load data
    # NOTE(review): the input path is relative to the parent of the working
    # directory, so this presumably must be run from tests/data/similarity - confirm.
    datafile_path = os.path.join(os.path.dirname(os.getcwd()), "scorers", "bsdetector_results_file.json")
    with open(datafile_path, "r") as f:
        data = json.load(f)

    responses = data["responses"]
    sampled_responses = data["sampled_responses"]

    store_results = dict()
    store_results.update({"responses": responses, "sampled_responses": sampled_responses})

    # 1. Bert Scorer
    bert = BertScorer()
    bert_result = bert.evaluate(responses=responses, sampled_responses=sampled_responses)

    store_results.update({"bert_result": bert_result})

    # 2. Bleurt Scorer
    bluert = BLEURTScorer()
    bluert_result = bluert.evaluate(responses=responses, sampled_responses=sampled_responses)
    # Raw BLEURT scores of each response against its own candidates.
    bluert_scorer_result = [bluert.bleurt_scorer.score(references=[response] * len(candidates), candidates=candidates) for response, candidates in zip(responses, sampled_responses)]

    store_results.update({"bluert_result": bluert_result, "bluert_score": bluert_scorer_result})

    # 3. Cosine Similarity Scorer
    cosine = CosineScorer()
    cosine_result = cosine.evaluate(responses=responses, sampled_responses=sampled_responses)
    # Store the embeddings as well: embeddings1 repeats the original response once
    # per candidate, embeddings2 holds the candidates themselves.
    embeddings1, embeddings2 = [], []
    for response, candidates in zip(responses, sampled_responses):
        embeddings1.append(cosine.model.encode([response] * len(candidates)).tolist())
        embeddings2.append(cosine.model.encode(candidates).tolist())

    store_results.update({"cosine_result": cosine_result, "embeddings1": embeddings1, "embeddings2": embeddings2})

    # 4. Exact Match scorer
    match = MatchScorer()
    match_result = match.evaluate(responses=responses, sampled_responses=sampled_responses)

    store_results.update({"match_result": match_result})

    # Store results
    results_file = "similarity_results_file.json"
    with open(results_file, "w") as f:
        json.dump(store_results, f)


if __name__ == "__main__":
    asyncio.run(main())
async def main():
    """Generate reference ``LLMJudge`` results for every scoring template and save them as JSON.

    Reads the Azure OpenAI settings from a ``.env`` file, generates one
    response per prompt, has an ``LLMJudge`` score those responses once per
    scoring template, and writes the prompts, responses and per-template
    judge output to ``llmjudge_results_file.json`` in the current working
    directory.
    """
    # Results are generated from these inputs using the "exai-gpt-35-turbo-16k" model
    prompts = ["Which part of the human body produces insulin?", "What color are the two stars on the national flag of Syria", "How many 'm's are there in the word strawberry"]

    # User to populate .env file with API credentials
    load_dotenv(find_dotenv())

    API_KEY = os.getenv("API_KEY")
    API_BASE = os.getenv("API_BASE")
    API_TYPE = os.getenv("API_TYPE")
    API_VERSION = os.getenv("API_VERSION")
    DEPLOYMENT_NAME = os.getenv("DEPLOYMENT_NAME")

    original_llm = AzureChatOpenAI(
        deployment_name=DEPLOYMENT_NAME,
        openai_api_key=API_KEY,
        azure_endpoint=API_BASE,
        openai_api_type=API_TYPE,
        openai_api_version=API_VERSION,
        temperature=1,  # User to set temperature
    )

    rg = ResponseGenerator(llm=original_llm, max_calls_per_min=250)
    generations = await rg.generate_responses(prompts=prompts, count=1)
    responses = generations["data"]["response"]

    # Generate data for all templates
    templates = ["true_false_uncertain", "true_false", "continuous", "likert"]
    # Structure: one file with all template data
    all_results = {
        "prompts": prompts,
        "responses": responses,
        "templates": {},  # This will hold data for each template
    }
    for template in templates:
        # A fresh judge is built for each scoring template.
        judge = LLMJudge(llm=original_llm, max_calls_per_min=250, scoring_template=template)
        judge_result = await judge.judge_responses(prompts=prompts, responses=responses)
        extract_answer = judge._extract_answers(responses=judge_result["judge_responses"])
        # Store results for this template
        all_results["templates"][template] = {"judge_result": judge_result, "extract_answer": extract_answer}
    # Save single comprehensive file
    results_file = "llmjudge_results_file.json"
    with open(results_file, "w") as f:
        json.dump(all_results, f)


if __name__ == "__main__":
    asyncio.run(main())
"sphinxcontrib.bibtex", # Bibliographic references 33 | "sphinx_favicon", # Add favicon 34 | "nbsphinx", # Execute Jupyter notebooks + OSX brew install pandoc 35 | ] 36 | nbsphinx_execute="never" 37 | 38 | bibtex_bibfiles = ["refs.bib"] 39 | 40 | autosummary_generate = True 41 | 42 | templates_path = ['_templates'] 43 | 44 | html_static_path = ['_static'] 45 | 46 | html_css_files = ['custom.css'] 47 | 48 | exclude_patterns = [] 49 | 50 | # -- Options for HTML output ------------------------------------------------- 51 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 52 | 53 | favicons = [ 54 | { 55 | "rel": "icon", 56 | "sizes": "16x16", 57 | "href": "images/favicon/favicon-16x16.png", 58 | "type": "image/png" 59 | }, 60 | { 61 | "rel": "icon", 62 | "sizes": "32x32", 63 | "href": "images/favicon/favicon-32x32.png", 64 | "type": "image/png" 65 | }, 66 | { 67 | "rel": "apple-touch-icon", 68 | "sizes": "180x180", 69 | "href": "images/favicon/apple-touch-icon.png", 70 | "type": "image/png" 71 | }, 72 | ] 73 | 74 | html_theme = 'pydata_sphinx_theme' 75 | 76 | html_favicon = '_static/images/favicon/favicon.ico' 77 | 78 | html_theme_options = { 79 | "github_url": "https://github.com/cvs-health/uqlm", 80 | "navbar_align": "left", 81 | "navbar_end": ["version-switcher", "theme-switcher", "navbar-icon-links"], 82 | "switcher": { 83 | "json_url": "https://cvs-health.github.io/uqlm/versions.json", 84 | "version_match": release, 85 | }, 86 | "logo": { 87 | "image_light": "_static/images/horizontal_logo.png", 88 | "image_dark": "_static/images/horizontal_logo_no_bg.png", 89 | }, 90 | } 91 | 92 | source_suffix = [".rst"] 93 | -------------------------------------------------------------------------------- /tests/test_blackboxuq.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 CVS Health and/or one of its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the 
"License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import pytest 16 | import json 17 | from uqlm.scorers import BlackBoxUQ 18 | from uqlm.scorers.baseclass.uncertainty import DEFAULT_BLACK_BOX_SCORERS 19 | from langchain_openai import AzureChatOpenAI 20 | 21 | datafile_path = "tests/data/scorers/blackbox_results_file.json" 22 | with open(datafile_path, "r") as f: 23 | expected_result = json.load(f) 24 | 25 | data = expected_result["data"] 26 | metadata = expected_result["metadata"] 27 | 28 | PROMPTS = data["prompts"] 29 | MOCKED_RESPONSES = data["responses"] 30 | MOCKED_SAMPLED_RESPONSES = data["sampled_responses"] 31 | 32 | 33 | @pytest.fixture 34 | def mock_llm(): 35 | """Define mock LLM object using pytest.fixture.""" 36 | return AzureChatOpenAI(deployment_name="YOUR-DEPLOYMENT", temperature=1, api_key="SECRET_API_KEY", api_version="2024-05-01-preview", azure_endpoint="https://mocked.endpoint.com") 37 | 38 | 39 | @pytest.mark.flaky(reruns=3) 40 | @pytest.mark.asyncio 41 | async def test_bbuq(monkeypatch, mock_llm): 42 | uqe = BlackBoxUQ(llm=mock_llm, scorers=["noncontradiction", "exact_match", "semantic_negentropy"], device="cpu") 43 | 44 | async def mock_generate_original_responses(*args, **kwargs): 45 | uqe.logprobs = [None] * 5 46 | return MOCKED_RESPONSES 47 | 48 | async def mock_generate_candidate_responses(*args, **kwargs): 49 | uqe.multiple_logprobs = [[None] * 5] * 5 50 | return MOCKED_SAMPLED_RESPONSES 51 | 52 | monkeypatch.setattr(uqe, "generate_original_responses", 
mock_generate_original_responses) 53 | monkeypatch.setattr(uqe, "generate_candidate_responses", mock_generate_candidate_responses) 54 | for show_progress_bars in [False, True]: 55 | results = await uqe.generate_and_score(prompts=PROMPTS, num_responses=5, show_progress_bars=show_progress_bars) 56 | 57 | assert all([results.data["exact_match"][i] == pytest.approx(data["exact_match"][i]) for i in range(len(PROMPTS))]) 58 | 59 | assert all([results.data["noncontradiction"][i] == pytest.approx(data["noncontradiction"][i]) for i in range(len(PROMPTS))]) 60 | 61 | assert all([results.data["semantic_negentropy"][i] == pytest.approx(data["semantic_negentropy"][i]) for i in range(len(PROMPTS))]) 62 | 63 | assert results.metadata == metadata 64 | 65 | # Test invalid scorer 66 | with pytest.raises(ValueError): 67 | BlackBoxUQ(llm=mock_llm, scorers=["invalid_scorer"], device="cpu") 68 | 69 | # Test default scorers 70 | uqe_default = BlackBoxUQ(llm=mock_llm, scorers=None, device="cpu") 71 | assert len(uqe_default.scorers) == len(DEFAULT_BLACK_BOX_SCORERS) 72 | 73 | BlackBoxUQ(llm=mock_llm, scorers=["bert_score"], device="cpu") 74 | -------------------------------------------------------------------------------- /docs/source/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* Custom styles */ 2 | .wy-side-nav-search { 3 | background-color: #2980B9; 4 | } 5 | .wy-nav-content { 6 | max-width: 1200px; 7 | } 8 | .highlight { 9 | background: #f8f9fa; 10 | } 11 | /* Custom admonitions */ 12 | .admonition.note { 13 | background: #e7f2fa; 14 | } 15 | .admonition.warning { 16 | background: #fff3cd; 17 | } 18 | /* Custom link colors */ 19 | a { 20 | color: #2980B9; 21 | } 22 | a:hover { 23 | color: #3091d1; 24 | } 25 | 26 | /* Custom styles for the gallery */ 27 | div.sphx-glr-thumbnails { 28 | display: grid; 29 | grid-template-columns: repeat(3, minmax(0, 1fr)); 30 | gap: 20px; 31 | padding: 20px; 32 | } 33 | 34 | 35 | 
/* Highlight a gallery card on hover */
div.sphx-glr-thumbcontainer:hover {
    border: 1px solid #0066cc;
    box-shadow: 0 0 15px rgba(0,0,0,0.1);
}

/* Gallery footer and download links */
.sphx-glr-footer {
    text-align: center;
    margin: 2em 0;
}
.sphx-glr-download {
    margin: 1em 0;
}


/* Same hover border colour for thumb containers that are not <div> elements */
.sphx-glr-thumbcontainer:hover {
    border-color: #0066cc;
}

/* Make container relative for absolute positioning of link */
div.sphx-glr-thumbcontainer {
    position: relative;
    border: solid #ccc 1px;
    border-radius: 4px;
    overflow: hidden;
    background: #212529;
    display: flex;
    flex-direction: column; /* Stack children vertically */
}

/* Make the link cover the entire container */
div.sphx-glr-thumbcontainer a {
    position: absolute;
    top: 0;
    left: 0;
    width: 100%;
    height: 100%;
    z-index: 1;
}
/* Force image to appear first */
div.sphx-glr-thumbcontainer img {
    width: 100%;
    height: 150px;
    object-fit: contain;
    padding: 5px;
    order: 0; /* This makes the image appear first */
}

/* Title/caption styling */

div.sphx-glr-thumbnail-title {
    text-align: center;
    color: #4FB6D6;
    padding: 8px 5px;
    margin: 0;
    font-size: 0.9em;
    background: #212529;
    order: 1; /* This makes the title appear after the image */
}

/* Hide doc captions ONLY in the gallery thumbnails */
.sphx-glr-thumbcontainer .docutils.container p,
.sphx-glr-thumbcontainer span.doc,
.sphx-glr-thumbcontainer .docutils.container .caption-text {
    display: none !important;
}
/* Keep doc visible everywhere else */
.docutils.container p,
span.doc {
    display: inline-block; /* Default display for docs outside gallery */
}

/* Images that shrink with their container and are centred horizontally */
.responsive-img {
    max-width: 100%;
    height: auto;
    display: block;
    margin: 0 auto;
}

/* Hide References section when it's empty */
.references-section:empty, 115 | .references-section:only-child { 116 | display: none; 117 | } 118 | 119 | /* Add some styling to make the References section less prominent */ 120 | .references-section { 121 | margin-top: 2em; 122 | padding-top: 1em; 123 | border-top: 1px solid #eee; 124 | } 125 | 126 | /* Make the References heading less prominent */ 127 | .references-section .rubric { 128 | font-size: 1.2em; 129 | color: #666; 130 | } -------------------------------------------------------------------------------- /uqlm/black_box/bert.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 CVS Health and/or one of its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import numpy as np 17 | from typing import Any, List, Optional 18 | from bert_score import BERTScorer 19 | import torch 20 | 21 | from uqlm.black_box.baseclass.similarity_scorer import SimilarityScorer 22 | from uqlm.utils.device import get_best_device 23 | 24 | import time 25 | from rich.progress import Progress 26 | 27 | 28 | class BertScorer(SimilarityScorer): 29 | def __init__(self, device: Any = None) -> None: 30 | """ 31 | Class for computing BERTScore values between original responses and candidates. For more on 32 | BERTScore, refer to Zhang et al.(2020) :footcite:`zhang2020bertscoreevaluatingtextgeneration`. 
33 | 34 | Parameters 35 | ---------- 36 | device : torch.device input or torch.device object, default=None 37 | Specifies the device that classifiers use for prediction. Set to "cuda" for classifiers to be able to 38 | leverage the GPU. 39 | """ 40 | # Handle device detection 41 | if device is None: 42 | device = get_best_device() 43 | elif isinstance(device, str): 44 | device = torch.device(device) 45 | 46 | from transformers import logging 47 | 48 | logging.set_verbosity_error() 49 | self.bert_scorer = BERTScorer(device=device, lang="en") 50 | 51 | def evaluate(self, responses: List[str], sampled_responses: List[List[str]], progress_bar: Optional[Progress] = None) -> List[float]: 52 | """ 53 | This method computes model-based text similarity metrics values for the provided pairs of texts. 54 | 55 | Parameters 56 | ---------- 57 | responses : list of strings 58 | Original LLM response 59 | 60 | sampled_responses : list of list of strings 61 | Candidate responses to be compared to the original response 62 | 63 | progress_bar : rich.progress.Progress, default=None 64 | If provided, displays a progress bar while scoring responses 65 | 66 | Returns 67 | ------- 68 | List of float 69 | Mean BertScore values 70 | """ 71 | if progress_bar: 72 | progress_task = progress_bar.add_task(" - Scoring responses with BERTScore...", total=len(responses)) 73 | results = [] 74 | for i in range(len(responses)): 75 | score = self._compute_score(response=responses[i], candidates=sampled_responses[i]) 76 | results.append(score) 77 | if progress_bar: 78 | progress_bar.update(progress_task, advance=1) 79 | time.sleep(0.1) 80 | return results 81 | 82 | def _compute_score(self, response: str, candidates: List[str]) -> float: 83 | """Compute mean BERTScore between a response and candidate responses""" 84 | num_responses = len(candidates) 85 | duplicated_response = [response] * num_responses 86 | P, R, F1 = self.bert_scorer.score(list(duplicated_response), refs=list(candidates)) 87 | return 
np.mean([float(f) for f in F1]) 88 | -------------------------------------------------------------------------------- /tests/test_nli.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 CVS Health and/or one of its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import gc 16 | import pytest 17 | from uqlm.nli.nli import NLI 18 | 19 | 20 | @pytest.fixture 21 | def text1(): 22 | return "Question: What is captial of France, Answer: Paris" 23 | 24 | 25 | @pytest.fixture 26 | def text2(): 27 | return "Question: What is captial of France, Answer: Capital of France is Paris city." 


@pytest.fixture
def nli_model():
    """NLI model on CPU with default verbosity."""
    return NLI(device="cpu")


@pytest.fixture
def nli_model_cpu():
    """NLI model on CPU with ``verbose=True``."""
    return NLI(verbose=True, device="cpu")


@pytest.mark.flaky(reruns=3)
def test_nli(text1, text2, nli_model):
    """Check the first predicted probability for the (text1, text2) pair against a stored value."""
    probabilities = nli_model.predict(text1, text2)
    # Drop the local reference to the model and collect before asserting.
    del nli_model
    gc.collect()
    assert abs(float(probabilities[0][0]) - 0.00159405) < 1e-5


# @pytest.mark.flaky(reruns=3)
# def test_nli2(text1, nli_model_cpu):
#     result = nli_model_cpu._observed_consistency_i(original=text1, candidates=[text1] * 5, use_best=False, compute_entropy=False)
#     assert result["nli_score_i"] == 1
#     assert result["discrete_semantic_entropy"] is None
#     assert result["tokenprob_semantic_entropy"] is None


@pytest.mark.flaky(reruns=3)
def test_nli3(text1, text2, nli_model_cpu):
    """An over-long first text must trigger the truncation UserWarning."""
    expected_warning = "Maximum response length exceeded for NLI comparison. Truncation will occur. To adjust, change the value of max_length"

    # text1 repeated 50 times is used as the over-long input.
    with pytest.warns(UserWarning, match=expected_warning):
        nli_model_cpu.predict(text1 * 50, text2)
    del nli_model_cpu
    gc.collect()


# @pytest.mark.flaky(reruns=3)
# def test_nli4(nli_model_cpu):
#     text1 = "Capital of France is Paris"
#     text2 = " Paris is the capital of France"
#     text3 = "Rome is the capital of Italy"
#     logprobs_results = [
#         [{"token": "Capital", "logprob": 0.6}, {"token": "of", "logprob": 0.5}, {"token": "France", "logprob": 0.3}, {"token": "is", "logprob": 0.3}, {"token": "Paris", "logprob": 0.3}],
#         [{"token": "Paris", "logprob": 0.75}, {"token": "is", "logprob": 0.8}, {"token": "the", "logprob": 0.9}, {"token": "capital", "logprob": 0.6}, {"token": "of", "logprob": 0.6}, {"token": "France", "logprob": 0.6}],
#         [{"token": "Rome", "logprob": 0.75}, {"token": "is", "logprob": 0.8}, {"token": "the", "logprob": 0.9}, {"token": "capital", "logprob": 0.6}, {"token": "of", "logprob": 0.6}, {"token": "Italy", "logprob": 0.6}],
#     ]
#     best_response, semantic_negentropy, nli_scores, tokenprob_semantic_entropy = nli_model_cpu._semantic_entropy_process(candidates=[text1, text2, text3], i=1, logprobs_results=logprobs_results)

#     assert best_response == text2
#     assert pytest.approx(semantic_negentropy, abs=1e-5) == 0.6365141682948128
#     assert pytest.approx(list(nli_scores.values()), abs=1e-5) == [0.9997053, 0.9997053, 0.24012965, 0.24012965]
#     assert pytest.approx(tokenprob_semantic_entropy, abs=1e-5) == 0.6918935849478249
#     del nli_model_cpu
#     gc.collect()

# --------------------------------------------------------------------------------
# /uqlm/utils/llm_config.py:
# --------------------------------------------------------------------------------
# Copyright 2025 CVS Health and/or one of its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14 | 15 | import json 16 | from importlib import import_module 17 | from typing import Any, Dict 18 | from langchain_core.language_models.chat_models import BaseChatModel 19 | 20 | 21 | def _is_serializable(value: Any) -> bool: 22 | """Check if a value is JSON serializable.""" 23 | try: 24 | json.dumps(value) 25 | return True 26 | except (TypeError, ValueError): 27 | return False 28 | 29 | 30 | def save_llm_config(llm: BaseChatModel) -> Dict[str, Any]: 31 | """ 32 | Extract and save LLM configuration by capturing all available parameters. 33 | 34 | Parameters 35 | ---------- 36 | llm : BaseChatModel 37 | The LLM instance to extract config from 38 | 39 | Returns 40 | ------- 41 | dict 42 | Dictionary containing LLM configuration 43 | """ 44 | config = {"class_name": llm.__class__.__name__, "module": llm.__class__.__module__} 45 | 46 | # Internal LangChain attributes that shouldn't be passed to constructors 47 | internal_attrs = {"config_specs", "lc_attributes", "lc_secrets", "model_computed_fields", "model_config", "model_kwargs", "disabled_params", "include_response_headers", "stream_usage", "validate_base_url", "disable_streaming"} 48 | 49 | # Endpoint and URL attributes that should not be saved (will be loaded from environment) 50 | endpoint_attrs = {"base_url", "endpoint", "azure_endpoint", "openai_api_base", "api_base", "api_url", "url"} 51 | 52 | # Save all attributes that are serializable and not None 53 | for attr_name in dir(llm): 54 | # Skip private attributes, methods, special attributes, internal LangChain attrs, and endpoint attrs 55 | if attr_name.startswith("_") or callable(getattr(llm, attr_name)) or attr_name in internal_attrs or attr_name in endpoint_attrs: 56 | continue 57 | 58 | try: 59 | value = getattr(llm, attr_name) 60 | if value is not None and _is_serializable(value): 61 | config[attr_name] = value 62 | except (AttributeError, TypeError): 63 | # Skip attributes that can't be accessed or would cause warnings 64 | continue 65 | 66 | return 
config 67 | 68 | 69 | def load_llm_config(llm_config: Dict[str, Any]) -> BaseChatModel: 70 | """ 71 | Recreate LLM instance from saved configuration. 72 | 73 | Parameters 74 | ---------- 75 | llm_config : dict 76 | Dictionary containing LLM configuration 77 | 78 | Returns 79 | ------- 80 | BaseChatModel 81 | Recreated LLM instance 82 | """ 83 | try: 84 | # Import the LLM class 85 | module = import_module(llm_config["module"]) 86 | llm_class = getattr(module, llm_config["class_name"]) 87 | 88 | # Extract all parameters except class info 89 | llm_params = {k: v for k, v in llm_config.items() if k not in ["class_name", "module"]} 90 | 91 | # Create LLM instance 92 | return llm_class(**llm_params) 93 | except Exception as e: 94 | raise ValueError(f"Could not recreate LLM from config: {e}") from e 95 | -------------------------------------------------------------------------------- /.github/workflows/documentation.yaml: -------------------------------------------------------------------------------- 1 | # .github/workflows/docs.yml 2 | 3 | name: Build & Deploy Sphinx Docs 4 | 5 | on: 6 | push: 7 | tags: 8 | - 'v*' 9 | 10 | permissions: 11 | contents: write 12 | 13 | jobs: 14 | docs: 15 | runs-on: ubuntu-latest 16 | env: 17 | PANDOC_VERSION: ${{ vars.PANDOC_VERSION }} 18 | 19 | steps: 20 | - name: Checkout repository 21 | uses: actions/checkout@v4 22 | 23 | - name: Get tag name 24 | id: get_tag 25 | run: echo "tag=${GITHUB_REF_NAME}" >> $GITHUB_OUTPUT 26 | 27 | - name: Show tag 28 | run: | 29 | echo "Tag: ${{ steps.get_tag.outputs.tag }}" 30 | 31 | - name: Extract version without "v" 32 | id: version 33 | run: | 34 | RAW_TAG="${GITHUB_REF_NAME}" 35 | VERSION="${RAW_TAG#v}" 36 | VERSION="${VERSION%.*}" 37 | echo "clean_version=$VERSION" >> $GITHUB_OUTPUT 38 | echo $clean_version 39 | 40 | - name: Update conf.py release version 41 | run: | 42 | sed -i "s/^release = .*/release = '${{ steps.version.outputs.clean_version }}'/" docs/source/conf.py 43 | head -n 20 
docs/source/conf.py 44 | 45 | - name: Set up Python 46 | uses: actions/setup-python@v5 47 | with: 48 | python-version: 3.12 49 | 50 | - name: Install Poetry 51 | run: | 52 | pip install poetry 53 | 54 | - name: Download and install Pandoc 55 | run: | 56 | FILE="pandoc-${PANDOC_VERSION}-1-amd64.deb" 57 | URL="https://github.com/jgm/pandoc/releases/download/${PANDOC_VERSION}/${FILE}" 58 | 59 | echo "Downloading $FILE..." 60 | curl -L -o pandoc.deb "$URL" 61 | 62 | echo "Installing Pandoc..." 63 | sudo dpkg -i pandoc.deb 64 | 65 | - name: Verify Pandoc version 66 | run: pandoc --version 67 | 68 | - name: Install dependencies 69 | run: | 70 | poetry lock 71 | poetry install --with docs 72 | eval $(poetry env activate) 73 | 74 | - name: Checkout gh-pages branch to get versions.json 75 | uses: actions/checkout@v4 76 | with: 77 | ref: gh-pages 78 | path: gh-pages 79 | 80 | - name: Update version.json 81 | run: | 82 | VERSION=${{ steps.version.outputs.clean_version }} 83 | python .github/workflows/update_version_json.py "$VERSION" "gh-pages" 84 | cat gh-pages/versions.json 85 | mkdir docsVersion 86 | cp gh-pages/versions.json docsVersion/versions.json 87 | 88 | - name: Build Sphinx docs 89 | run: | 90 | eval $(poetry env activate) 91 | make -C docs clean 92 | make -C docs html 93 | 94 | - name: Deploy to GitHub Pages 95 | uses: peaceiris/actions-gh-pages@v4 96 | with: 97 | github_token: ${{ secrets.GITHUB_TOKEN }} 98 | publish_dir: ./docsVersion 99 | keep_files: true 100 | 101 | - name: Deploy to GitHub Pages 102 | uses: peaceiris/actions-gh-pages@v4 103 | with: 104 | github_token: ${{ secrets.GITHUB_TOKEN }} 105 | publish_dir: ./docs/build/html 106 | destination_dir: v${{ steps.version.outputs.clean_version }} 107 | keep_files: true 108 | 109 | - name: Deploy to GitHub Pages 110 | uses: peaceiris/actions-gh-pages@v4 111 | with: 112 | github_token: ${{ secrets.GITHUB_TOKEN }} 113 | publish_dir: ./docs/build/html 114 | destination_dir: latest 115 | keep_files: true 116 | 
# --------------------------------------------------------------------------------
# /uqlm/white_box/baseclass/logprobs_scorer.py:
# --------------------------------------------------------------------------------
# Copyright 2025 CVS Health and/or one of its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from abc import ABC
import numpy as np
from typing import List, Dict, Any, Optional, Callable


class LogprobsScorer(ABC):
    """
    Base class with shared helpers for scorers that work on token log-probabilities.

    A "single response" is a list of per-token dicts. The helpers below read the
    ``"logprob"`` key of each dict and, for the top-k variants, the ``"logprob"``
    key of each item under ``"top_logprobs"``.
    """

    def __init__(self):
        pass

    def _norm_prob(self, single_response_logprobs: List[Dict[str, Any]]) -> float:
        """Compute length-normalized sequence probability (exp of the mean token logprob)"""
        logprobs = self.extract_logprobs(single_response_logprobs)
        return np.exp(np.mean(logprobs))

    def _seq_prob(self, single_response_logprobs: List[Dict[str, Any]]) -> float:
        """Compute sequence probability (product of the token probabilities)"""
        probs = self.extract_probs(single_response_logprobs)
        return np.prod(probs)

    def _entropy_from_logprobs(self, logprobs_list: np.ndarray) -> float:
        """Compute entropy from list of logprobs"""
        probs_list = np.exp(logprobs_list)
        return self._entropy_from_probs(probs_list)

    def extract_probs(self, single_response_logprobs: List[Dict[str, Any]]) -> np.ndarray:
        """Extract probabilities from token data"""
        return np.exp(self.extract_logprobs(single_response_logprobs))

    @staticmethod
    def _compute_single_generation_scores(logprobs_results: List[List[Dict[str, Any]]], score_fn: Callable) -> List[float]:
        """Generic method to compute scores using the provided scoring function; empty/missing logprobs give NaN"""
        return [np.nan if not r else score_fn(r) for r in logprobs_results]

    @staticmethod
    def _entropy_from_probs(probs_list: np.ndarray, texts: Optional[List[str]] = None) -> float:
        """
        Compute entropy (natural log) from a list of probabilities.

        Parameters
        ----------
        probs_list : array-like of float
            Probabilities (not necessarily normalized); they are rescaled to sum to 1

        texts : list of str, default=None
            If provided, one text per probability; probabilities of identical texts
            are summed before the entropy is computed

        Returns
        -------
        float
            Entropy of the (aggregated) normalized distribution
        """
        probs = np.asarray(probs_list, dtype=float)
        normalized_probs = probs / np.sum(probs)  # normalize probabilities to sum to 1

        if texts is not None:
            # Account for duplicates: identical texts are one event with the summed probability
            aggregated_probs = {}
            for text, prob in zip(texts, normalized_probs):
                aggregated_probs[text] = aggregated_probs.get(text, 0.0) + prob
            normalized_probs = np.array(list(aggregated_probs.values()))

        # Zero-probability events contribute nothing (0 * log 0 := 0). Without this mask
        # np.log(0) = -inf and 0 * -inf = NaN would poison the whole sum. Using `!= 0`
        # (not `> 0`) lets NaN or negative inputs still propagate as NaN.
        nonzero_probs = normalized_probs[normalized_probs != 0]
        return -np.sum(nonzero_probs * np.log(nonzero_probs))

    @staticmethod
    def extract_top_logprobs(single_response_logprobs: List[Dict[str, Any]]) -> List[np.ndarray]:
        """Extract top log probabilities for each token"""
        return [np.array([item["logprob"] for item in d["top_logprobs"]]) for d in single_response_logprobs]

    @staticmethod
    def extract_logprobs(single_response_logprobs: List[Dict[str, Any]]) -> np.ndarray:
        """Extract log probabilities from token data"""
        return np.array([d["logprob"] for d in single_response_logprobs])

# --------------------------------------------------------------------------------
# /uqlm/white_box/top_logprobs.py:
# --------------------------------------------------------------------------------
# Copyright 2025 CVS Health and/or one of its affiliates
#
# Licensed under the Apache License,
class TopLogprobsScorer(LogprobsScorer):
    """White-box scorer that derives confidence scores from per-token top logprobs.

    The available score names are those listed in ``TOP_LOGPROBS_SCORER_NAMES``.
    """

    def __init__(self, scorers: List[str] = TOP_LOGPROBS_SCORER_NAMES):
        """Class for computing WhiteBox UQ scores with a single generation

        Parameters
        ----------
        scorers : list of str, default=TOP_LOGPROBS_SCORER_NAMES
            Names of the scores that ``evaluate`` should return.
        """
        super().__init__()
        # Store a copy: the default argument is the module-level constant, and keeping a
        # reference to it would let edits to ``self.scorers`` leak into every other instance.
        self.scorers = list(scorers)

    def evaluate(self, logprobs_results: List[List[Dict[str, Any]]]) -> Dict[str, List[float]]:
        """Compute scores from top logprobs results

        Parameters
        ----------
        logprobs_results : list of list of dict
            One list of per-token logprob records for each response.

        Returns
        -------
        dict
            Maps each name in ``self.scorers`` to a list with one score per response.

        Raises
        ------
        KeyError
            If ``self.scorers`` contains a name that is not a supported scorer.
        """
        score_functions = {
            "mean_token_negentropy": self._mean_token_negentropy,
            "min_token_negentropy": self._min_token_negentropy,
            "probability_margin": self._probability_margin,
        }
        # Only the requested scores are computed; unrequested ones are never evaluated.
        return {name: self._compute_single_generation_scores(logprobs_results, score_functions[name]) for name in self.scorers}

    def _compute_token_entropies(self, single_response_logprobs: List[Dict[str, Any]]) -> np.ndarray:
        """Compute entropy for each token in the sequence"""
        top_logprobs_list = self.extract_top_logprobs(single_response_logprobs)
        return np.array([self._entropy_from_logprobs(top_logprobs) for top_logprobs in top_logprobs_list])

    def _compute_token_negentropies(self, single_response_logprobs: List[Dict[str, Any]]) -> np.ndarray:
        """Compute negentropy for each token in the sequence

        Negentropy is ``1 - entropy / log(k)``, where ``k`` is the number of top logprobs
        available for the token.
        """
        entropies = self._compute_token_entropies(single_response_logprobs)
        top_logprobs_list = self.extract_top_logprobs(single_response_logprobs)
        k_values = np.array([len(top_logprobs) for top_logprobs in top_logprobs_list])
        # NOTE(review): log(k) is 0 when k == 1 and -inf when k == 0, so the division below
        # is not meaningful for those tokens — confirm what callers expect in that case.
        max_entropies = np.log(k_values)
        negentropies = 1 - entropies / max_entropies
        return negentropies

    def _mean_token_negentropy(self, single_response_logprobs: List[Dict[str, Any]]) -> float:
        """Compute mean token negentropy across the sequence"""
        negentropies = self._compute_token_negentropies(single_response_logprobs)
        return np.mean(negentropies)

    def _min_token_negentropy(self, single_response_logprobs: List[Dict[str, Any]]) -> float:
        """Compute minimum token negentropy across the sequence"""
        negentropies = self._compute_token_negentropies(single_response_logprobs)
        return np.min(negentropies)

    def _probability_margin(self, single_response_logprobs: List[Dict[str, Any]]) -> float:
        """Compute mean probability margin (difference between top two probabilities)

        Returns ``nan`` when there are no tokens, or when any token has fewer than two
        top logprobs (in which case a message is printed).
        """
        top_logprobs_list = self.extract_top_logprobs(single_response_logprobs)
        if len(top_logprobs_list) == 0:
            # Same result np.mean([]) would give, without numpy's "empty slice" RuntimeWarning.
            return np.nan
        margins = []
        try:
            for top_logprobs in top_logprobs_list:
                probs = np.sort(np.exp(top_logprobs))[::-1]
                # probs[1] raises IndexError when fewer than two alternatives are present.
                margins.append(probs[0] - probs[1])
        except IndexError:
            print("top_logprobs were not available. Unable to compute associated scores.")
            return np.nan
        return np.mean(margins)
def test_mean_token_negentropy(mock_logprobs_results, scorer, monkeypatch):
    """Mean token negentropy is a float inside [0, 1] when entropy is stubbed."""

    def fake_extract(logprobs):
        # Stand-in for extract_top_logprobs: hand back the raw "logprobs" lists.
        return [logprob["logprobs"] for logprob in logprobs]

    def fake_entropy(logprobs):
        # Stand-in for _entropy_from_logprobs: constant entropy for every token.
        return 0.5

    monkeypatch.setattr(scorer, "extract_top_logprobs", fake_extract)
    monkeypatch.setattr(scorer, "_entropy_from_logprobs", fake_entropy)

    first_sequence = mock_logprobs_results[0]
    result = scorer._mean_token_negentropy(first_sequence)
    assert isinstance(result, float)
    assert 0.0 <= result <= 1.0


def test_min_token_negentropy(mock_logprobs_results, scorer, monkeypatch):
    """Minimum token negentropy is a float inside [0, 1] when entropy is stubbed."""

    def fake_extract(logprobs):
        # Stand-in for extract_top_logprobs: hand back the raw "logprobs" lists.
        return [logprob["logprobs"] for logprob in logprobs]

    def fake_entropy(logprobs):
        # Stand-in for _entropy_from_logprobs: constant entropy for every token.
        return 0.5

    monkeypatch.setattr(scorer, "extract_top_logprobs", fake_extract)
    monkeypatch.setattr(scorer, "_entropy_from_logprobs", fake_entropy)

    first_sequence = mock_logprobs_results[0]
    result = scorer._min_token_negentropy(first_sequence)
    assert isinstance(result, float)
    assert 0.0 <= result <= 1.0


def test_probability_margin(mock_logprobs_results, scorer, monkeypatch):
    """Probability margin is a float inside [0, 1] for the mock sequence."""

    def fake_extract(logprobs):
        # Stand-in for extract_top_logprobs: hand back the raw "logprobs" lists.
        return [logprob["logprobs"] for logprob in logprobs]

    monkeypatch.setattr(scorer, "extract_top_logprobs", fake_extract)

    first_sequence = mock_logprobs_results[0]
    result = scorer._probability_margin(first_sequence)
    assert isinstance(result, float)
    assert 0.0 <= result <= 1.0
class CosineScorer(SimilarityScorer):
    def __init__(self, transformer: str = "all-MiniLM-L6-v2") -> None:
        """Compute cosine similarity between original and candidate responses.

        Parameters
        ----------
        transformer : str (HuggingFace sentence transformer), default='all-MiniLM-L6-v2'
            Specifies which huggingface sentence transformer to use when computing cosine distance. See
            https://huggingface.co/sentence-transformers?sort_models=likes#models
            for more information. The recommended sentence transformer is 'all-MiniLM-L6-v2'.
        """
        # Imported here rather than at module level, so the dependency is only loaded
        # when a CosineScorer is actually constructed.
        from sentence_transformers import SentenceTransformer

        self.transformer = transformer
        self.model = SentenceTransformer(f"sentence-transformers/{transformer}")

    def evaluate(self, responses: List[str], sampled_responses: List[List[str]], progress_bar: Optional[Progress] = None) -> List[float]:
        """
        This method computes model-based text similarity metrics values for the provided pairs of texts.

        Parameters
        ----------
        responses : list of strings
            Original LLM response

        sampled_responses : list of list of strings
            Candidate responses to be compared to the original response

        progress_bar : rich.progress.Progress, default=None
            If provided, displays a progress bar while scoring responses

        Returns
        -------
        List of float
            Mean cosine similarity values
        """
        if progress_bar:
            progress_task = progress_bar.add_task(" - Scoring responses with cosine similarity...", total=len(responses))
        results = []
        for i, response in enumerate(responses):
            # Indexing (not zip) keeps the IndexError when sampled_responses is too short.
            results.append(self._compute_score(response=response, candidates=sampled_responses[i]))
            if progress_bar:
                progress_bar.update(progress_task, advance=1)
        # NOTE(review): short fixed pause kept from the original; presumably it lets the
        # progress display refresh — confirm before removing.
        time.sleep(0.1)
        return results

    def _get_embeddings(self, texts1: List[str], texts2: List[str]) -> Tuple[Any, Any]:
        """
        Helper function to get embeddings

        Encodes both lists with ``self.model`` and returns the two results as a pair.
        """
        embeddings1 = self.model.encode(texts1)
        embeddings2 = self.model.encode(texts2)
        return embeddings1, embeddings2

    def _compute_score(self, response: str, candidates: List[str]) -> float:
        """
        Helper function to get the mean normalized cosine similarity

        Each cosine value is mapped to ``0.5 + cosine / 2`` before averaging.
        Returns ``nan`` when ``candidates`` is empty.
        """
        if len(candidates) == 0:
            # Nothing to compare against: skip the model call and avoid numpy's
            # "mean of empty slice" RuntimeWarning while returning the same value.
            return np.nan
        duplicate_responses = [response] * len(candidates)
        embeddings1, embeddings2 = self._get_embeddings(duplicate_responses, candidates)
        cosine_list = []
        for response_embedding, candidate_embedding in zip(embeddings1, embeddings2):
            cosine_i = np.dot(response_embedding, candidate_embedding) / (norm(response_embedding) * norm(candidate_embedding))
            cosine_list.append(0.5 + cosine_i / 2)
        return np.mean(cosine_list)
async def main():
    """Generate ensemble scorer results for the last five SVAMP questions and dump them to JSON.

    Builds three ``UQEnsemble`` configurations against the same Azure chat model and writes
    their results to ``ensemble_results_file.json`` under the keys ``ensemble1``,
    ``bsdetector`` and ``ensemble2``.
    """
    # Last five rows of the svamp example dataset act as the "prod" dataset
    column_names = {"question_concat": "question", "Answer": "answer"}
    svamp = load_example_dataset("svamp").rename(columns=column_names)[["question", "answer"]].tail(5)

    # Every question is prefixed with the same instruction
    MATH_INSTRUCTION = "When you solve this math problem only return the answer with no additional text.\n"
    prompts = [MATH_INSTRUCTION + question for question in svamp.question]

    # API credentials are read from a user-populated .env file
    load_dotenv(find_dotenv())
    API_KEY = os.getenv("API_KEY")
    API_BASE = os.getenv("API_BASE")
    API_TYPE = os.getenv("API_TYPE")
    API_VERSION = os.getenv("API_VERSION")
    DEPLOYMENT_NAME = os.getenv("DEPLOYMENT_NAME")

    # Main LLM used for generation (and, below, as a judge)
    gpt = AzureChatOpenAI(
        deployment_name=DEPLOYMENT_NAME,
        openai_api_key=API_KEY,
        azure_endpoint=API_BASE,
        openai_api_type=API_TYPE,
        openai_api_version=API_VERSION,
        temperature=1,  # User to set temperature
    )

    def math_postprocessor(s: str) -> str:
        """Keep only the digit characters of ``s``."""
        return "".join(filter(str.isdigit, s))

    # Keyword arguments shared by all three ensembles
    shared_kwargs = {
        "llm": gpt,
        "max_calls_per_min": 250,
        "postprocessor": math_postprocessor,
        "use_n_param": False,  # Set True if using AzureChatOpenAI for faster generation
    }

    store_results = {}

    # Ensemble 1: two black-box scorers, one white-box scorer and the LLM itself as judge
    components = [
        "exact_match",  # Measures proportion of candidate responses that match original response
        "noncontradiction",  # mean non-contradiction probability between candidate responses and original response
        "min_probability",  # measures semantic volatility
        gpt,  # Using same LLM as external judge for testing
    ]
    uqe = UQEnsemble(**shared_kwargs, scorers=components)
    results = await uqe.generate_and_score(prompts=prompts, num_responses=5)
    store_results["ensemble1"] = results.to_dict()

    # Ensemble with no explicit scorers (stored under "bsdetector")
    uqe = UQEnsemble(**shared_kwargs)
    results = await uqe.generate_and_score(prompts=prompts, num_responses=5)
    store_results["bsdetector"] = results.to_dict()

    # Ensemble 2: white-box scorer plus LLM judge, default number of responses
    components1 = [
        "min_probability",  # measures semantic volatility
        gpt,  # Using same LLM as external judge for testing
    ]
    uqe1 = UQEnsemble(**shared_kwargs, scorers=components1)
    results1 = await uqe1.generate_and_score(prompts=prompts)
    store_results["ensemble2"] = results1.to_dict()

    results_file = "ensemble_results_file.json"
    with open(results_file, "w") as f:
        json.dump(store_results, f)
class PTrueScorer:
    """Scores a response by asking an LLM whether it is true and reading the first token's logprob."""

    def __init__(self, llm: BaseChatModel, max_calls_per_min: Optional[int] = None) -> None:
        """
        Parameters
        ----------
        llm : BaseChatModel
            Chat model used for the True/False query. Its ``logprobs`` attribute is set to True.

        max_calls_per_min : int, default=None
            Forwarded to ``ResponseGenerator``.
        """
        llm.logprobs = True
        self.response_generator = ResponseGenerator(llm, max_calls_per_min=max_calls_per_min)
        self.response_generator.response_generator_type = "p_true"

    async def evaluate(self, prompts: List[str], responses: List[str], sampled_responses: Optional[List[List[str]]] = None, progress_bar: Optional[Progress] = None) -> Dict[str, List[float]]:
        """
        Build one P(True) prompt per (prompt, response) pair, query the LLM, and score each reply.

        Parameters
        ----------
        prompts : list of str
            Original prompts.

        responses : list of str
            Original responses to be judged.

        sampled_responses : list of list of str, default=None
            Optional alternative answers listed in the P(True) prompt for each response.

        progress_bar : rich.progress.Progress, default=None
            Forwarded to the response generator.

        Returns
        -------
        dict
            ``{"p_true": [...]}`` with one score per generated reply.
        """
        import asyncio  # local import: used only for the non-blocking pause below

        if not sampled_responses:
            sampled_responses = [None] * len(responses)

        ptrue_prompts = [
            self._construct_ptrue_prompt(original_prompt=prompt_i, original_response=response_i, sampled_responses=sampled_i)
            for prompt_i, response_i, sampled_i in zip(prompts, responses, sampled_responses)
        ]
        ptrue_responses = await self.response_generator.generate_responses(prompts=ptrue_prompts, system_prompt=PTRUE_SYSTEM_PROMPT, progress_bar=progress_bar)
        # Same 0.1 s pause as before, but awaited so the event loop is not blocked.
        await asyncio.sleep(0.1)
        logprob_results = ptrue_responses["metadata"]["logprobs"]
        ptrue_scores = [self._extract_ptrue_from_logprobs_result(logprob_result) for logprob_result in logprob_results]
        return {"p_true": ptrue_scores}

    @staticmethod
    def _extract_ptrue_from_logprobs_result(logprobs_result: List[Dict[str, Any]]) -> float:
        """
        Convert the first token's logprob into P(True).

        Returns ``exp(logprob)`` when the first token starts with "true", ``1 - exp(logprob)``
        when it starts with "false", and ``nan`` when the token is anything else, the logprob
        is missing, or there are no tokens at all.
        """
        if not logprobs_result:
            # No token data for this reply: report "no score" instead of raising.
            return np.nan

        first_token_data = logprobs_result[0]
        token = first_token_data.get("token", "").strip().lower()
        logprob = first_token_data.get("logprob", None)
        if logprob is None:
            # Previously fell through and returned None; nan matches the other "no score" path.
            return np.nan

        prob = np.exp(logprob)
        if token.startswith("true"):
            return prob  # High prob means high P_true
        if token.startswith("false"):
            return 1.0 - prob  # High prob of False means low P_true
        return np.nan

    @staticmethod
    def _construct_ptrue_prompt(original_prompt: str, original_response: str, sampled_responses: Optional[List[str]] = None) -> str:
        """
        Build the P(True) prompt for one question/answer pair.

        When ``sampled_responses`` is given and, together with the original response, contains
        more than one distinct answer, those answers are listed before the proposed answer.
        """
        proposed_answers_text = ""
        if sampled_responses:
            # dict.fromkeys de-duplicates while keeping first-seen order, so the prompt is
            # reproducible across runs (set() order depends on string hash randomization).
            unique_responses = list(dict.fromkeys(sampled_responses + [original_response]))

            if len(unique_responses) > 1:
                proposed_answers_text = "\n\nHere are some possible answers:\n" + "".join(possible_answer + "\n" for possible_answer in unique_responses)

        ptrue_prompt = f"""
        Question: {original_prompt}
        {proposed_answers_text}
        Proposed Answer: {original_response}

        Is the proposed answer to the question true or false? Answer with only one word true/false.

        True or False:
        """
        return ptrue_prompt
More specifically, insulin is produced by specialized cells called beta cells in the pancreas. The release of insulin helps regulate blood sugar levels and is essential for the body's ability to use and store glucose for energy.", "The two stars on the national flag of Syria are red.", "There are no 'm's in the word \"strawberry.\""], "sampled_responses": [["The pancreas is the organ in the human body that produces insulin. Insulin is a hormone that helps regulate blood sugar levels.", "The pancreas is the organ in the human body that produces insulin. Insulin is a hormone that regulates blood sugar levels. It is produced by the beta cells in the islets of Langerhans in the pancreas.", "Insulin is produced by the beta cells in the pancreas, specifically in clusters called the Islets of Langerhans. These cells play a crucial role in regulating blood sugar levels.", "The pancreas is the organ in the human body that produces insulin. Insulin is produced by specific cells within the pancreas called beta cells, located in the islets of Langerhans.", "The pancreas is the organ in the human body that produces insulin. 
Insulin is produced and released by special cells called beta cells in the pancreas."], ["The two stars on the national flag of Syria are both red.", "The two stars on the national flag of Syria are red.", "The two stars on the national flag of Syria are colored red.", "The two stars on the national flag of Syria are black.", "The two stars on the national flag of Syria are green."], ["There are no 'm's in the word \"strawberry.\"", "There are no 'm's in the word strawberry.", "There is only one 'm' in the word \"strawberry\".", "There are zero 'm's in the word \"strawberry.\"", "There are 0 'm's in the word \"strawberry.\""]], "confidence_scores": [0.8596294307708741, 0.6640560493469239, 0.47659782075881957], "sr_scores": [1.0, 1.0, 0.0], "oc_scores": [0.799470615386963, 0.5200800704956056, 0.6808540296554566], "indicator_scores": [0.0, 0.2, 0.2], "sr_data": {"self_reflection_prompts": ["Question: Which part of the human body produces insulin?, Proposed Answer: The pancreas is the organ in the human body that produces insulin. More specifically, insulin is produced by specialized cells called beta cells in the pancreas. The release of insulin helps regulate blood sugar levels and is essential for the body's ability to use and store glucose for energy.. Your task is to look at the question and answer provided and determine if the answer is correct. You are to respond with ONLY one of: \"Correct\", \"Incorrect\", or \"I am not sure\". YOUR ANSWER MUST ONLY CONTAIN ONE OF \"Correct\", \"Incorrect\", or \"I am not sure\". DO NOT ANSWER THE QUESTION AGAIN. ONLY DETERMINE IF THE ANSWER TO THE QUESTION IS \"Correct\", \"Incorrect\", or \"I am not sure\".", "Question: What color are the two stars on the national flag of Syria, Proposed Answer: The two stars on the national flag of Syria are red.. Your task is to look at the question and answer provided and determine if the answer is correct. 
You are to respond with ONLY one of: \"Correct\", \"Incorrect\", or \"I am not sure\". YOUR ANSWER MUST ONLY CONTAIN ONE OF \"Correct\", \"Incorrect\", or \"I am not sure\". DO NOT ANSWER THE QUESTION AGAIN. ONLY DETERMINE IF THE ANSWER TO THE QUESTION IS \"Correct\", \"Incorrect\", or \"I am not sure\".", "Question: How many 'm's are there in the word strawberry, Proposed Answer: There are no 'm's in the word \"strawberry.\". Your task is to look at the question and answer provided and determine if the answer is correct. You are to respond with ONLY one of: \"Correct\", \"Incorrect\", or \"I am not sure\". YOUR ANSWER MUST ONLY CONTAIN ONE OF \"Correct\", \"Incorrect\", or \"I am not sure\". DO NOT ANSWER THE QUESTION AGAIN. ONLY DETERMINE IF THE ANSWER TO THE QUESTION IS \"Correct\", \"Incorrect\", or \"I am not sure\"."], "self_reflection_responses": ["Correct", "Correct", "Incorrect"], "self_reflection_scores": [1.0, 1.0, 0.0]}, "correct_indicators": [true, true, false], "updated_oc_scores": [0.8454951459984235, 0.5385066827051275, 0.7085361990580302], "updated_confidence_scores": [0.976214751274521, 0.9289554143270282, 0.10907560046902409], "optimized_parameters": {"weights": [0.8460550066262872, 0.15394499337371276], "thresh": 0.11}} -------------------------------------------------------------------------------- /uqlm/utils/grader.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 CVS Health and/or one of its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import time 17 | from typing import List, Optional 18 | from rich.progress import Progress 19 | from langchain_core.language_models.chat_models import BaseChatModel 20 | from uqlm.utils.response_generator import ResponseGenerator 21 | 22 | GRADER_SYSTEM_PROMPT = """ 23 | You are an expert grading assistant designed to evaluate answers against a provided answer key. Your task is to determine whether a proposed answer is correct by comparing it to the ground truth answer(s). 24 | 25 | ## Your Responsibilities: 26 | 27 | 1. **Accept the ground truth as absolute**: The provided answer key contains the gold standard answer(s) and must be treated as correct, regardless of your own knowledge or beliefs. 28 | 29 | 2. **Evaluate the proposed answer**: Determine if the proposed answer aligns with any of the ground truth answers in terms of factual content, not just wording. 30 | 31 | 3. **Focus on semantic equivalence**: Look for meaning rather than exact wording. Two answers can be expressed differently but still be semantically equivalent. 32 | 33 | 4. **Provide ONLY a binary judgment**: Your entire response must be either the single word "yes" or "no" based solely on the answer's alignment with any of the ground truth answers. Answer "yes" if correct, "no" if incorrect. 34 | 35 | 5. **Avoid any explanation or reasoning**: Do not provide any justification, commentary, or additional text beyond the single word judgment. 36 | 37 | 6. **Be charitable but accurate**: Give credit when the proposed answer captures the essential elements of any of the ground truth answers, but don't overlook substantive differences. 38 | 39 | Remember: You must return ONLY the word "yes" or "no" with no additional text. The ground truth answer(s) must be treated as correct even if you believe otherwise. 
class LLMGrader:
    """Grades responses against an answer key by asking an LLM for a yes/no judgment."""

    def __init__(self, llm: BaseChatModel, max_calls_per_min: Optional[int] = None) -> None:
        """
        Parameters
        ----------
        llm : BaseChatModel
            Chat model used as the grader. Its ``logprobs`` attribute is set to True.

        max_calls_per_min : int, default=None
            Forwarded to ``ResponseGenerator``.
        """
        llm.logprobs = True
        self.response_generator = ResponseGenerator(llm, max_calls_per_min=max_calls_per_min)
        self.response_generator.response_generator_type = "grader"

    async def grade_responses(self, prompts: List[str], responses: List[str], answers: List[str], progress_bar: Optional[Progress] = None) -> List[bool]:
        """
        Grade each response against its answer key entry.

        Parameters
        ----------
        prompts : list of str
            Original questions.

        responses : list of str
            Proposed answers to grade.

        answers : list
            Answer key entry for each question; each entry is inserted into the grader prompt as-is.

        progress_bar : rich.progress.Progress, default=None
            Forwarded to the response generator.

        Returns
        -------
        list of bool
            One grade per grader reply; True when the reply contains "yes".
        """
        import asyncio  # local import: used only for the non-blocking pause below

        grader_prompts = [self._construct_grader_prompt(prompt, response, answer) for prompt, response, answer in zip(prompts, responses, answers)]
        grader_responses = await self.response_generator.generate_responses(prompts=grader_prompts, system_prompt=GRADER_SYSTEM_PROMPT, progress_bar=progress_bar)
        # Same 0.1 s pause as before, but awaited so the event loop is not blocked.
        await asyncio.sleep(0.1)
        bool_grades = [self._extract_grades(grader_response) for grader_response in grader_responses["data"]["response"]]
        return bool_grades

    @staticmethod
    def _extract_grades(grader_response: str) -> bool:
        """Return True when the reply contains "yes" (case-insensitive); False for anything else."""
        # The former "no" branch and the fallback both returned False, so a single
        # membership test expresses the same rule.
        return "yes" in grader_response.strip().lower()

    @staticmethod
    def _construct_grader_prompt(prompt: str, response: str, acceptable_answers: List[str]) -> str:
        """Build the grading prompt from the question, the answer key entry and the proposed answer."""
        grader_prompt = f"""
        Your task is to grade the following proposed answer against the provided answer key. The ground truth is the gold standard regardless of any other information you may have. Return ONLY the word "yes" or "no", with no additional text, based on whether the proposed answer aligns with any of the ground truth answers. Answer "yes" if correct, "no" if incorrect.

        **Question:**
        {prompt}

        **Ground Truth Answers (Answer Key):**
        {acceptable_answers}

        **Proposed Answer to Grade:**
        {response}

        Now your answer is (yes or no):
        """
        return grader_prompt
@pytest.fixture
def mock_single_response_logprobs():
    """Two-token mock response, each token carrying three top logprobs."""
    first_token = {"logprob": -0.1, "top_logprobs": [{"logprob": -0.1}, {"logprob": -1.0}, {"logprob": -2.0}]}
    second_token = {"logprob": -0.2, "top_logprobs": [{"logprob": -0.2}, {"logprob": -0.5}, {"logprob": -1.5}]}
    return [first_token, second_token]


@pytest.fixture
def mock_logprobs_results(mock_single_response_logprobs):
    """The same mock response repeated twice, as a batch of results."""
    return [mock_single_response_logprobs] * 2


@pytest.fixture
def scorer():
    """A plain LogprobsScorer instance."""
    return LogprobsScorer()


def test_norm_prob(mock_single_response_logprobs, scorer):
    """_norm_prob returns a float in (0, 1]."""
    value = scorer._norm_prob(mock_single_response_logprobs)
    assert isinstance(value, float)
    assert 0.0 < value <= 1.0


def test_seq_prob(mock_single_response_logprobs, scorer):
    """_seq_prob returns a float in (0, 1]."""
    value = scorer._seq_prob(mock_single_response_logprobs)
    assert isinstance(value, float)
    assert 0.0 < value <= 1.0


def test_entropy_from_logprobs(scorer):
    """_entropy_from_logprobs returns a non-negative float."""
    value = scorer._entropy_from_logprobs(np.array([-0.1, -0.2, -0.3]))
    assert isinstance(value, float)
    assert value >= 0.0


def test_entropy_from_probs(scorer):
    """_entropy_from_probs returns a non-negative float."""
    value = scorer._entropy_from_probs(np.array([0.5, 0.3, 0.2]))
    assert isinstance(value, float)
    assert value >= 0.0


def test_entropy_from_probs_with_texts(scorer):
    """_entropy_from_probs also accepts a list of texts as second argument."""
    probabilities = np.array([0.5, 0.3, 0.2])
    labels = ["a", "b", "a"]
    value = scorer._entropy_from_probs(probabilities, labels)
    assert isinstance(value, float)
    assert value >= 0.0


def test_extract_probs(mock_single_response_logprobs, scorer):
    """extract_probs yields one probability in (0, 1] per token."""
    probs = scorer.extract_probs(mock_single_response_logprobs)
    expected_shape = (len(mock_single_response_logprobs),)
    assert isinstance(probs, np.ndarray)
    assert probs.shape == expected_shape
    assert np.all(probs > 0.0) and np.all(probs <= 1.0)


def test_extract_logprobs(mock_single_response_logprobs, scorer):
    """extract_logprobs yields one negative logprob per token."""
    logprobs = scorer.extract_logprobs(mock_single_response_logprobs)
    expected_shape = (len(mock_single_response_logprobs),)
    assert isinstance(logprobs, np.ndarray)
    assert logprobs.shape == expected_shape
    assert np.all(logprobs < 0.0)  # every mock logprob is below zero


def test_extract_top_logprobs(mock_single_response_logprobs, scorer):
    """extract_top_logprobs yields one non-empty array per token."""
    per_token = scorer.extract_top_logprobs(mock_single_response_logprobs)
    assert isinstance(per_token, list)
    assert len(per_token) == len(mock_single_response_logprobs)
    for token_top_logprobs in per_token:
        assert isinstance(token_top_logprobs, np.ndarray)
        assert token_top_logprobs.shape[0] > 0


def test_compute_single_generation_scores(mock_logprobs_results, scorer):
    """_compute_single_generation_scores applies the score function once per response."""

    def constant_score(single_response_logprobs):
        return 0.9

    scores = scorer._compute_single_generation_scores(mock_logprobs_results, constant_score)
    assert isinstance(scores, list)
    assert len(scores) == len(mock_logprobs_results)
    assert all(score == 0.9 for score in scores)
# REUSABLE TEST DATA
MOCKED_PROMPTS = ["What is 2+2?"]
MOCKED_RESPONSES = ["4"]
MOCKED_SAMPLED_RESPONSES = [["4", "5"]]


# REUSABLE MOCK OBJECT CREATOR
def create_mock_llm():
    """Return a MagicMock (spec'd to AzureChatOpenAI) whose ``agenerate`` yields one canned generation."""
    mock_llm = MagicMock(spec=AzureChatOpenAI)
    mock_llm.logprobs = True
    mock_llm.temperature = 0.7

    class MockGeneration:
        def __init__(self):
            self.text = "Mocked response"
            self.generation_info = {"logprobs_result": [{"token": "True", "logprob": -0.1}]}

    class MockResult:
        def __init__(self):
            self.generations = [[MockGeneration()]]

    # Mock the agenerate method
    async def mock_agenerate(messages, **kwargs):
        return MockResult()

    mock_llm.agenerate = mock_agenerate
    return mock_llm


@pytest.fixture
def mock_response_generator():
    """Fixture to create a mock ResponseGenerator returning one "True" and one "False" logprob result."""
    mock_response_generator = AsyncMock()
    mock_response_generator.generate_responses = AsyncMock(return_value={"metadata": {"logprobs": [[{"token": "True", "logprob": -0.1}], [{"token": "False", "logprob": -2.0}]]}})
    return mock_response_generator


@pytest.fixture
def ptrue_scorer(mock_response_generator, monkeypatch):
    """Fixture to create a PTrueScorer with a mocked ResponseGenerator."""
    mock_llm = create_mock_llm()

    # Replace the ResponseGenerator with the mock
    monkeypatch.setattr(ResponseGenerator, "__init__", lambda self, *args, **kwargs: None)
    monkeypatch.setattr(ResponseGenerator, "generate_responses", mock_response_generator.generate_responses)

    scorer = PTrueScorer(llm=mock_llm)
    scorer.response_generator = mock_response_generator
    return scorer


@pytest.mark.asyncio
async def test_ptrue_scorer_evaluate(ptrue_scorer, mock_response_generator):
    """Test the evaluate method of PTrueScorer."""
    result = await ptrue_scorer.evaluate(MOCKED_PROMPTS, MOCKED_RESPONSES, MOCKED_SAMPLED_RESPONSES)

    # Verify the ResponseGenerator was called with the correct arguments
    mock_response_generator.generate_responses.assert_called_once()
    _, kwargs = mock_response_generator.generate_responses.call_args

    # Normalize the actual prompt to remove extra whitespace
    actual_prompt = kwargs["prompts"][0].strip()
    expected_prompt_start = "Question: What is 2+2?"

    assert actual_prompt.startswith(expected_prompt_start), f"Expected prompt to start with '{expected_prompt_start}', but got '{actual_prompt}'"

    assert kwargs["system_prompt"] == PTRUE_SYSTEM_PROMPT

    # Verify the result
    assert "p_true" in result
    assert len(result["p_true"]) == 2
    # The scores are computed via exp(); compare with a tolerance rather than bit-for-bit
    # so the test does not depend on the platform's floating-point rounding.
    assert result["p_true"] == pytest.approx([0.9048374180359595, 0.8646647167633873])  # Based on mocked logprobs


def test_extract_ptrue_from_logprobs_result():
    """Test the _extract_ptrue_from_logprobs_result method."""
    import math

    logprobs_result = [{"token": "True", "logprob": -0.1}]
    score = PTrueScorer._extract_ptrue_from_logprobs_result(logprobs_result)
    assert score == pytest.approx(0.9048, rel=1e-3)

    logprobs_result = [{"token": "False", "logprob": -0.1}]
    score = PTrueScorer._extract_ptrue_from_logprobs_result(logprobs_result)
    assert score == pytest.approx(0.0952, rel=1e-3)

    # A token that is neither "True" nor "False" is expected to produce NaN
    logprobs_result = [{"token": "Unknown", "logprob": -0.1}]
    score = PTrueScorer._extract_ptrue_from_logprobs_result(logprobs_result)
    assert math.isnan(score)


def test_construct_ptrue_prompt():
    """Test the _construct_ptrue_prompt method."""
    prompt = "What is 2+2?"
    response = "4"
    sampled_responses = ["4", "5"]

    result = PTrueScorer._construct_ptrue_prompt(prompt, response, sampled_responses)
    assert "Question: What is 2+2?" in result
    assert "Proposed Answer: 4" in result
    assert "Here are some possible answers:" in result
    assert "4" in result
    assert "5" in result

    # Test without sampled_responses
    result = PTrueScorer._construct_ptrue_prompt(prompt, response, None)
    assert "Here are some possible answers:" not in result
"docs_src"] # never include documentation in build 36 | keywords = ["LLM", "large language model", "LLM evaluation", "hallucination", "uncertainty quantification"] # Add your keywords here 37 | 38 | [tool.poetry.dependencies] 39 | python = ">=3.10, <4.0" 40 | langchain = ">=0.3.7,<1.1.0" 41 | langchain-model-profiles = "^0.0.3" 42 | transformers = "^4.45.2" 43 | scikit-learn = [ 44 | { version = "^1.5.2", markers = "python_version < '3.13'" }, 45 | { version = "^1.7.0", markers = "python_version >= '3.13'" } 46 | ] 47 | numpy = [ 48 | { version = "^1.26.4", markers = "python_version < '3.13'" }, 49 | { version = ">=2.3.1", markers = "python_version >= '3.13'" } 50 | ] 51 | scipy = {version = "^1.15.0", markers = "python_version >= '3.13'"} 52 | matplotlib = "^3.10.0" 53 | optuna = "^4.0.0" 54 | bert-score = "^0.3.0" 55 | pandas = "^2.3.0" 56 | sentence-transformers = ">=3.4,<6.0" 57 | datasets = ">=3.3.2,<5.0.0" 58 | rich = "^13.8.0" 59 | ipywidgets = "^8.1.7" 60 | 61 | [tool.poetry.group.dev] 62 | optional = true 63 | 64 | [tool.poetry.group.dev.dependencies] 65 | python-dotenv = "^1.2.0" 66 | ruff = "0.9.7" 67 | pre-commit = "^4.1.0" 68 | ipykernel = "^6.29.5" 69 | langchain-openai = ">=0.2.6" 70 | langchain-google-vertexai = ">=2.0.8" 71 | 72 | [tool.poetry.group.docs] 73 | optional = true 74 | 75 | [tool.poetry.group.docs.dependencies] 76 | sphinx= "7.4.7" 77 | pydata_sphinx_theme = "0.16.1" 78 | sphinxcontrib-bibtex = "2.6.3" 79 | sphinx-autodoc-typehints = "2.3.0" 80 | sphinx-gallery = "0.18.0" 81 | sphinx-favicon = "1.0.1" 82 | nbsphinx = "0.9.6" 83 | 84 | 85 | [tool.poetry.group.test] 86 | optional = true 87 | 88 | [tool.poetry.group.test.dependencies] 89 | ipykernel = "^6.29.5" 90 | langchain-openai = ">=0.2.6" 91 | pytest = "^8.3.5" 92 | langchain-google-vertexai = ">=2.0.8" 93 | pytest-asyncio = ">=0.25.3,<1.2.0" 94 | pytest-cov = ">=6,<8" 95 | pytest-rerunfailures = "^16.0" 96 | 97 | 98 | 99 | [tool.pytest.ini_options] 100 | reruns = 3 101 | 
reruns_delay = 2.0 102 | markers = [ 103 | "asyncio" 104 | ] 105 | 106 | [build-system] 107 | requires = ["poetry-core"] 108 | build-backend = "poetry.core.masonry.api" 109 | 110 | 111 | [tool.ruff] 112 | # Exclude a variety of commonly ignored directories. 113 | exclude = [ 114 | ".bzr", 115 | ".direnv", 116 | ".eggs", 117 | ".git", 118 | ".git-rewrite", 119 | ".hg", 120 | ".ipynb_checkpoints", 121 | ".mypy_cache", 122 | ".nox", 123 | ".pants.d", 124 | ".pyenv", 125 | ".pytest_cache", 126 | ".pytype", 127 | ".ruff_cache", 128 | ".svn", 129 | ".tox", 130 | ".venv", 131 | ".vscode", 132 | "__pypackages__", 133 | "_build", 134 | "buck-out", 135 | "build", 136 | "dist", 137 | "node_modules", 138 | "site-packages", 139 | "venv", 140 | ] 141 | # Core settings 142 | line-length = 400 143 | 144 | [tool.ruff.lint] 145 | #What rules to enable 146 | select = ["E", "F"] 147 | # E = pycodestyle errors 148 | # F = pyflakes 149 | # I = isort (import sorting) 150 | # B = bugbear (best practices) 151 | # UP = pyupgrade (modernization) 152 | # D = pydocstyle (docstring rules) 153 | # S = bandit (security) 154 | 155 | #What rules to ignore 156 | ignore = [] 157 | 158 | 159 | [tool.ruff.format] 160 | quote-style = "double" 161 | indent-style = "space" 162 | skip-magic-trailing-comma = true 163 | line-ending = "lf" 164 | docstring-code-format = true -------------------------------------------------------------------------------- /docs/source/contribute.rst: -------------------------------------------------------------------------------- 1 | .. _contribute: 2 | 3 | Contributing to UQLM 4 | ==================== 5 | 6 | Welcome and thank you for considering contributing to UQLM! 7 | 8 | It takes a lot of time and effort to use software much less build upon it, so we deeply appreciate your desire to help make this project thrive. 9 | 10 | Table of Contents 11 | ----------------- 12 | 1. 
:ref:`How to Contribute <how-to-contribute>` 13 | * :ref:`Reporting Bugs <reporting-bugs>` 14 | * :ref:`Suggesting Enhancements <suggesting-enhancements>` 15 | * :ref:`Pull Requests <pull-requests>` 16 | 2. :ref:`Development Setup <development-setup>` 17 | 3. :ref:`Style Guides <style-guides>` 18 | * :ref:`Code Style <code-style>` 19 | 4. :ref:`License <license>` 20 | 21 | .. _how-to-contribute: 22 | 23 | How to Contribute 24 | ----------------- 25 | 26 | .. _reporting-bugs: 27 | 28 | Reporting Bugs 29 | ************** 30 | 31 | If you find a bug, please report it by opening an issue on GitHub. Include as much detail as possible: 32 | * Steps to reproduce the bug. 33 | * Expected and actual behavior. 34 | * Screenshots if applicable. 35 | * Any other information that might help us understand the problem. 36 | 37 | .. _suggesting-enhancements: 38 | 39 | Suggesting Enhancements 40 | *********************** 41 | 42 | We welcome suggestions for new features or improvements. To suggest an enhancement, please open an issue on GitHub and include: 43 | 44 | * A clear description of the suggested enhancement. 45 | * Why you believe this enhancement would be useful. 46 | * Any relevant examples or mockups. 47 | 48 | .. _pull-requests: 49 | 50 | Pull Requests 51 | ************* 52 | 53 | 1. Fork the repository. 54 | 2. Create a new branch (``git checkout -b feature/your-feature-name``). 55 | 3. Make your changes. 56 | 4. Commit your changes (``git commit -m 'Add some feature'``). 57 | 5. Push to the branch (``git push origin feature/your-feature-name``). 58 | 6. Open a pull request. 59 | 60 | Please ensure your pull request adheres to the following guidelines: 61 | 62 | * Follow the project's code style. 63 | * Include tests for any new features or bug fixes. 64 | 65 | .. _development-setup: 66 | 67 | Development Setup 68 | ----------------- 69 | 70 | 1. Clone the repository: ``git clone https://github.com/cvs-health/uqlm.git`` 71 | 2. Navigate to the project directory: ``cd uqlm`` 72 | 3. Create and activate a virtual environment (using ``venv`` or ``conda``) 73 | 4.
Install poetry (if you don't already have it): ``pip install poetry`` 74 | 5. Install uqlm with dev dependencies: ``poetry install --with dev`` 75 | 6. Install our pre-commit hooks to ensure code style compliance: ``pre-commit install`` 76 | 7. Run tests to ensure everything is working: ``pre-commit run --all-files`` 77 | 78 | You're ready to develop! 79 | 80 | **For documentation contributions** 81 | 82 | Our documentation lives on the gh-pages branch and is hosted via GitHub Pages. 83 | 84 | There are two relevant directories: 85 | 86 | * ``docs_src`` - where source documentation files are located 87 | * ``docs`` - where the built documentation is located that is served by GitHub Pages 88 | 89 | To build the documentation locally: 90 | 91 | #. Create a virtual environment with your favorite tool (e.g. conda, virtualenv, uv, etc.) 92 | 93 | #. Check out the ``gh-pages`` branch and create a new branch from it 94 | 95 | #. Navigate to the ``docs_src/latest`` directory 96 | 97 | * If this is a version upgrade: 98 | 99 | #. Copy ``latest`` contents to the ``docs_src/{version_number}`` folder and update the version in the ``conf.py`` file 100 | 101 | #. Copy ``latest`` contents from ``docs/`` to ``docs/{version_number}`` folder 102 | 103 | #. Update the versions in ``docs_src/latest/index.rst`` file and ``docs_src/versions.json`` 104 | 105 | #. ``cd uqlm`` 106 | 107 | #. ``pip install -e .`` # installs current uqlm repo as package to environment 108 | 109 | #. ``cd docs_src/latest`` 110 | 111 | #. ``brew install pandoc`` # to use nbsphinx extension 112 | 113 | #. ``make install`` # installs sphinx related python packages 114 | 115 | #. ``make github`` # builds docs html 116 | 117 | #. ``make local`` # locally test doc site 118 | 119 | 120 | .. _style-guides: 121 | 122 | Style Guides 123 | ------------ 124 | 125 | .. _code-style: 126 | 127 | Code Style 128 | ********** 129 | 130 | - We use `Ruff `_ to lint and format our files.
131 | - Our pre-commit hook will run Ruff linting and formatting when you commit. 132 | - You can manually run Ruff at any time `Ruff usage `_. 133 | 134 | Please ensure your code is properly formatted and linted before committing. 135 | 136 | .. _license: 137 | 138 | License 139 | ------- 140 | 141 | Before contributing to this CVS Health sponsored project, you will need to sign the associated `Contributor License Agreement (CLA) `_. 142 | 143 | 144 | Thanks again for using and supporting uqlm! -------------------------------------------------------------------------------- /docs/source/_notebooks/index.rst: -------------------------------------------------------------------------------- 1 | Example Notebooks 2 | ================= 3 | 4 | UQLM offers a broad collection of tutorial notebooks to demonstrate usage of the various scorers. These notebooks aim to have versatile coverage of various LLMs and datasets, but you can easily replace them with your LLM and dataset of choice. Below is a list of these tutorials: 5 | 6 | 7 | 8 | .. raw:: html 9 | 10 |
11 | 12 | .. thumbnail-parent-div-open 13 | 14 | 15 | .. raw:: html 16 | 17 |
18 | 19 | .. only:: html 20 | 21 | .. image:: /_static/images/no_image.png 22 | :alt: 23 | 24 | :doc:`examples/black_box_demo` 25 | 26 | .. raw:: html 27 | 28 |
Black-Box Demo
29 |
30 | 31 | 32 | .. raw:: html 33 | 34 |
35 | 36 | .. only:: html 37 | 38 | .. image:: /_static/images/no_image.png 39 | :alt: 40 | 41 | :doc:`examples/white_box_single_generation_demo` 42 | 43 | .. raw:: html 44 | 45 |
White-Box Single-Generation Demo
46 |
47 | 48 | 49 | .. raw:: html 50 | 51 |
52 | 53 | .. only:: html 54 | 55 | .. image:: /_static/images/no_image.png 56 | :alt: 57 | 58 | :doc:`examples/white_box_multi_generation_demo` 59 | 60 | .. raw:: html 61 | 62 |
White-Box Multi-Generation Demo
63 |
64 | 65 | 66 | .. raw:: html 67 | 68 |
69 | 70 | .. only:: html 71 | 72 | .. image:: /_static/images/no_image.png 73 | :alt: 74 | 75 | :doc:`examples/ensemble_off_the_shelf_demo` 76 | 77 | .. raw:: html 78 | 79 |
BS Detector Off-the-Shelf Ensemble Demo
80 |
81 | 82 | 83 | .. raw:: html 84 | 85 |
86 | 87 | .. only:: html 88 | 89 | .. image:: /_static/images/no_image.png 90 | :alt: 91 | 92 | :doc:`examples/ensemble_tuning_demo` 93 | 94 | .. raw:: html 95 | 96 |
Ensemble Uncertainty Quantification Demo
97 |
98 | 99 | 100 | .. raw:: html 101 | 102 |
103 | 104 | .. only:: html 105 | 106 | .. image:: /_static/images/no_image.png 107 | :alt: 108 | 109 | :doc:`examples/judges_demo` 110 | 111 | .. raw:: html 112 | 113 |
LLM-as-a-Judge Demo
114 |
115 | 116 | 117 | .. raw:: html 118 | 119 |
120 | 121 | .. only:: html 122 | 123 | .. image:: /_static/images/no_image.png 124 | :alt: 125 | 126 | :doc:`examples/multimodal_demo` 127 | 128 | .. raw:: html 129 | 130 |
Multimodal Demo
131 |
132 | 133 | 134 | .. raw:: html 135 | 136 |
137 | 138 | .. only:: html 139 | 140 | .. image:: /_static/images/no_image.png 141 | :alt: 142 | 143 | :doc:`examples/semantic_entropy_demo` 144 | 145 | .. raw:: html 146 | 147 |
Semantic Entropy Demo
148 |
149 | 150 | 151 | .. raw:: html 152 | 153 |
154 | 155 | .. only:: html 156 | 157 | .. image:: /_static/images/no_image.png 158 | :alt: 159 | 160 | :doc:`examples/semantic_density_demo` 161 | 162 | .. raw:: html 163 | 164 |
Semantic Density Demo
165 |
166 | 167 | 168 | .. raw:: html 169 | 170 |
171 | 172 | .. only:: html 173 | 174 | .. image:: /_static/images/no_image.png 175 | :alt: 176 | 177 | :doc:`examples/score_calibration_demo` 178 | 179 | .. raw:: html 180 | 181 |
Score Calibration Demo
182 |
183 | 184 | 185 | .. thumbnail-parent-div-close 186 | 187 | 188 | .. raw:: html 189 | 190 |
class ConsistencyScorer(SimilarityScorer):
    def __init__(self, nli_model_name: str = "microsoft/deberta-large-mnli", max_length: int = 2000, use_best: bool = False, scorers: Optional[List[str]] = None):
        """
        Initialize the ConsistencyScorer.

        Parameters
        ----------
        nli_model_name : str, default="microsoft/deberta-large-mnli"
            Name of the NLI model; forwarded to ``NLI``.

        max_length : int, default=2000
            Maximum response length; forwarded to ``NLI``.

        use_best : bool, default=False
            Specifies whether to swap the original response for the uncertainty-minimized response
            based on semantic entropy clusters.

        scorers : list of str, default=None
            Names of the NLI-based scores to compute. Each name ``s`` is read from the NLI result under the
            key ``s + "_score"``. If None, defaults to ``["noncontradiction", "entailment"]``.
        """
        super().__init__()
        self.nli_model_name = nli_model_name
        self.max_length = max_length
        self.use_best = use_best
        self.nli = NLI(nli_model_name=nli_model_name, max_length=max_length)
        # A fresh list per instance: a list literal as the default would be shared by every instance.
        self.scorers = ["noncontradiction", "entailment"] if scorers is None else scorers

    def evaluate(self, responses: List[str], sampled_responses: List[List[str]], available_nli_scores: Optional[Dict[str, Dict[Tuple[str, str], float]]] = None, progress_bar: Optional[Progress] = None) -> Dict[str, Any]:
        """
        Evaluate confidence scores on LLM responses.

        Parameters
        ----------
        responses : list of strings
            Original LLM response

        sampled_responses : list of list of strings
            Sampled candidate responses to be compared to the original response

        available_nli_scores : dict, default=None
            Previously computed NLI scores, keyed first by scorer name and then by a
            ``(candidate, response)`` pair. Matching entries are reused instead of calling the NLI model.
            If None, no precomputed scores are used.

        progress_bar : rich.progress.Progress, default=None
            If provided, displays a progress bar while scoring responses

        Returns
        -------
        Dict
            Dictionary containing mean NLI and (optionally) semantic entropy scores.
            The dictionary will also contain original and multiple responses, updated if `use_best` is True

        Notes
        -----
        ``responses`` and ``sampled_responses`` are updated in place, element by element, with the values
        returned by the per-prompt scoring step.
        """
        # A fresh dict per call: a ``dict()`` default would be shared across calls and instances.
        self.available_nli_scores = {} if available_nli_scores is None else available_nli_scores
        # Guard against an empty batch, where ``sampled_responses[0]`` would raise IndexError.
        self.num_responses = len(sampled_responses[0]) if sampled_responses else 0
        observed_consistency_data = {"noncontradiction": [], "entailment": [], "discrete_semantic_entropy": [], "tokenprob_semantic_entropy": [], "responses": responses, "sampled_responses": sampled_responses}

        if progress_bar:
            progress_task = progress_bar.add_task(" - Scoring responses with entailment/contradiction...", total=len(responses))
        for i, response in enumerate(responses):
            oc_result_i = self._observed_consistency_i(original=response, candidates=sampled_responses[i])
            for scorer in self.scorers:
                observed_consistency_data[scorer].append(oc_result_i[scorer])
            responses[i] = oc_result_i["response"]  # Replace with optimized response if use_best
            sampled_responses[i] = oc_result_i["candidates"]  # Replace with updated candidates if use_best
            if progress_bar:
                progress_bar.update(progress_task, advance=1)
        time.sleep(0.1)

        if self.use_best:
            observed_consistency_data["responses"] = responses
            observed_consistency_data["sampled_responses"] = sampled_responses
        return observed_consistency_data

    def _observed_consistency_i(self, original: str, candidates: List[str]) -> Dict[str, Any]:
        """
        Compute observed consistency score on the provided original response and multiple candidates.

        Parameters
        ----------
        original : str
            The original response for one prompt.

        candidates : list of str
            Sampled responses for the same prompt.

        Returns
        -------
        Dict
            One mean score per entry of ``self.scorers``, plus ``"response"`` (the response the candidates
            were compared against) and ``"candidates"`` (the responses it was compared to).
        """
        best_response = original
        if self.use_best:
            all_responses = [original] + candidates

            self.clusterer = SemanticClusterer(nli=self.nli)
            _, response_probabilities = self.clusterer.compute_response_probabilities(logprobs_results=None, num_responses=len(all_responses))
            best_response, _, _, _ = self.clusterer.evaluate(responses=all_responses, response_probabilities=response_probabilities)

            # list.remove() mutates in place and returns None, so its result must not be assigned:
            # doing so left ``candidates`` as None and broke the scoring loop below.
            candidates = list(all_responses)
            candidates.remove(best_response)
            self.available_nli_scores = self.clusterer.nli_scores

        nli_scores = {s_: [] for s_ in self.scorers}
        for candidate in candidates:
            # The NLI result for a pair holds every "<scorer>_score" value, so compute it at most once
            # per candidate and share it across scorers instead of re-running the model for each scorer.
            nli_result = None
            for s_ in self.scorers:
                if s_ in self.available_nli_scores:
                    if (candidate, best_response) in self.available_nli_scores[s_]:
                        nli_scores[s_].append(self.available_nli_scores[s_][(candidate, best_response)])
                        continue
                if nli_result is None:
                    nli_result = self.nli.get_nli_results(response1=best_response, response2=candidate)
                nli_scores[s_].append(nli_result[s_ + "_score"])

        result = {n: np.mean(nli_scores[n]) for n in self.scorers}
        result.update({"candidates": candidates, "response": best_response})
        return result
datafile_path = "tests/data/scorers/whitebox_results_file.json"
with open(datafile_path, "r") as f:
    expected_result = json.load(f)

data = expected_result["data"]
metadata = expected_result["metadata"]

PROMPTS = data["prompts"]
MOCKED_RESPONSES = data["responses"]
MOCKED_LOGPROBS = data["logprobs"]

mock_object = AzureChatOpenAI(deployment_name="YOUR-DEPLOYMENT", temperature=1.0, api_key="SECRET_API_KEY", api_version="2024-05-01-preview", azure_endpoint="https://mocked.endpoint.com")


@pytest.mark.asyncio
async def test_whiteboxuq_basic(monkeypatch):
    """Scores and metadata match the stored expectations, with and without progress bars."""
    wbuq = WhiteBoxUQ(llm=mock_object, scorers=["normalized_probability", "min_probability"])

    async def fake_original_responses(*args, **kwargs):
        # Stand in for the LLM call: expose the stored logprobs and return the stored responses
        wbuq.logprobs = MOCKED_LOGPROBS
        return MOCKED_RESPONSES

    monkeypatch.setattr(wbuq, "generate_original_responses", fake_original_responses)

    for show_progress_bars in [False, True]:
        results = await wbuq.generate_and_score(prompts=PROMPTS, show_progress_bars=show_progress_bars)

        for idx in range(len(PROMPTS)):
            assert results.data["normalized_probability"][idx] == pytest.approx(data["normalized_probability"][idx])
            assert results.data["min_probability"][idx] == pytest.approx(data["min_probability"][idx])

        assert results.metadata == metadata


@pytest.mark.asyncio
async def test_whiteboxuq_top_logprobs(monkeypatch):
    """The sequence_probability scorer produces a result column."""
    wbuq = WhiteBoxUQ(llm=mock_object, scorers=["sequence_probability"])

    async def fake_original_responses(*args, **kwargs):
        wbuq.logprobs = MOCKED_LOGPROBS
        return MOCKED_RESPONSES

    monkeypatch.setattr(wbuq, "generate_original_responses", fake_original_responses)

    results = await wbuq.generate_and_score(prompts=PROMPTS, show_progress_bars=False)
    assert "sequence_probability" in results.data


@pytest.mark.asyncio
async def test_whiteboxuq_sampled_logprobs(monkeypatch):
    """The monte_carlo_probability scorer produces a result column when candidates are sampled."""
    wbuq = WhiteBoxUQ(llm=mock_object, scorers=["monte_carlo_probability"])

    async def fake_original_responses(*args, **kwargs):
        wbuq.logprobs = MOCKED_LOGPROBS
        return MOCKED_RESPONSES

    async def fake_candidate_responses(*args, **kwargs):
        wbuq.multiple_logprobs = [[[{"token": "Hello", "logprob": -0.1}]]] * len(PROMPTS)
        return [["Hello world"] * 5] * len(PROMPTS)

    monkeypatch.setattr(wbuq, "generate_original_responses", fake_original_responses)
    monkeypatch.setattr(wbuq, "generate_candidate_responses", fake_candidate_responses)

    results = await wbuq.generate_and_score(prompts=PROMPTS, show_progress_bars=False)
    assert "monte_carlo_probability" in results.data


@pytest.mark.asyncio
async def test_whiteboxuq_p_true(monkeypatch):
    """The p_true scorer's output is surfaced in the results."""
    wbuq = WhiteBoxUQ(llm=mock_object, scorers=["p_true"])

    async def fake_original_responses(*args, **kwargs):
        wbuq.logprobs = MOCKED_LOGPROBS
        return MOCKED_RESPONSES

    async def fake_p_true_evaluate(*args, **kwargs):
        return {"p_true": [0.9] * len(PROMPTS)}

    monkeypatch.setattr(wbuq, "generate_original_responses", fake_original_responses)
    monkeypatch.setattr(wbuq.p_true_scorer, "evaluate", fake_p_true_evaluate)

    results = await wbuq.generate_and_score(prompts=PROMPTS, show_progress_bars=False)
    assert "p_true" in results.data


def test_whiteboxuq_invalid_scorer():
    """An unknown scorer name is rejected at construction time."""
    with pytest.raises(ValueError, match="Invalid scorer provided: invalid_scorer"):
        WhiteBoxUQ(llm=mock_object, scorers=["invalid_scorer"])


@pytest.mark.asyncio
async def test_whiteboxuq_top_logprobs_full(monkeypatch):
    """The mean_token_negentropy scorer produces a result column when top-k logprobs are requested."""
    wbuq = WhiteBoxUQ(llm=mock_object, scorers=["mean_token_negentropy"], top_k_logprobs=10)

    async def fake_original_responses(*args, **kwargs):
        wbuq.logprobs = MOCKED_LOGPROBS
        return MOCKED_RESPONSES

    monkeypatch.setattr(wbuq, "generate_original_responses", fake_original_responses)

    # Optional: replace the scorer's evaluate so it is guaranteed to return something
    wbuq.top_logprobs_scorer.evaluate = lambda logprobs_results: {"mean_token_negentropy": [0.8] * len(PROMPTS)}

    results = await wbuq.generate_and_score(prompts=PROMPTS, show_progress_bars=False)
    assert "mean_token_negentropy" in results.data
class NLI:
    def __init__(self, device: Any = None, verbose: bool = False, nli_model_name: str = "microsoft/deberta-large-mnli", max_length: int = 2000) -> None:
        """
        A class for computing NLI-based confidence scores. This class offers two types of confidence scores, namely
        noncontradiction probability :footcite:`chen2023quantifyinguncertaintyanswerslanguage` and semantic entropy
        :footcite:`farquhar2024detectinghallucinations`.

        Parameters
        ----------
        device : torch.device input or torch.device object, default=None
            Specifies the device that classifiers use for prediction. Set to "cuda" for classifiers to be able to
            leverage the GPU. If None, the device is chosen by ``get_best_device()``.

        verbose : bool, default=False
            Specifies whether to print verbose status updates of NLI scoring process

        nli_model_name : str, default="microsoft/deberta-large-mnli"
            Specifies which NLI model to use. Must be acceptable input to AutoTokenizer.from_pretrained() and
            AutoModelForSequenceClassification.from_pretrained()

        max_length : int, default=2000
            Specifies the maximum allowed string length (in characters). Responses longer than this value will be
            truncated to avoid OutOfMemoryError
        """
        # Handle device detection
        if device is None:
            device = get_best_device()
        elif isinstance(device, str):
            device = torch.device(device)

        self.device = device
        self.verbose = verbose
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(nli_model_name)
        model = AutoModelForSequenceClassification.from_pretrained(nli_model_name)
        self.model = model.to(self.device) if self.device else model
        # Index order of the probabilities returned by ``predict``
        self.label_mapping = ["contradiction", "neutral", "entailment"]
        # Cache of raw probabilities from ``get_nli_results``, keyed by "<response1>_<response2>"
        self.probabilities = dict()

    def predict(self, premise: str, hypothesis: str) -> Any:
        """
        Compute the NLI class probabilities for the provided inputs.

        Parameters
        ----------
        premise : str
            An input for the sequence classification DeBERTa model.

        hypothesis : str
            An input for the sequence classification DeBERTa model.

        Returns
        -------
        numpy.ndarray
            Probabilities computed by NLI model (softmax over the last axis of the model's logits),
            ordered as in ``self.label_mapping``.
        """
        if len(premise) > self.max_length or len(hypothesis) > self.max_length:
            warnings.warn("Maximum response length exceeded for NLI comparison. Truncation will occur. To adjust, change the value of max_length")
        concat = premise[0 : self.max_length] + " [SEP] " + hypothesis[0 : self.max_length]
        encoded_inputs = self.tokenizer(concat, padding=True, return_tensors="pt")
        if self.device:
            encoded_inputs = {name: tensor.to(self.device) for name, tensor in encoded_inputs.items()}
        # Inference only: skip building the autograd graph, which would cost memory for no benefit.
        with torch.no_grad():
            logits = self.model(**encoded_inputs).logits
        np_logits = logits.detach().cpu().numpy() if self.device else logits.detach().numpy()
        # Subtract the row maximum before exponentiating. The softmax is mathematically unchanged,
        # but large logits no longer overflow to inf and produce NaN probabilities.
        exp_logits = np.exp(np_logits - np_logits.max(axis=-1, keepdims=True))
        probabilities = exp_logits / exp_logits.sum(axis=-1, keepdims=True)
        return probabilities

    def get_nli_results(self, response1: str, response2: str) -> Dict[str, Any]:
        """
        Compute mean NLI scores for a pair of responses and determine whether entailment exists.

        The NLI model is run in both directions (response1 as premise, then response2 as premise) and the
        two results are averaged. Identical responses short-circuit to a perfect score without calling the
        model.

        Parameters
        ----------
        response1 : str
            First response of the pair.

        response2 : str
            Second response of the pair.

        Returns
        -------
        Dict
            ``"noncontradiction_score"``: mean of (1 - contradiction probability) over both directions;
            ``"entailment"``: True if either direction's most likely label is "entailment";
            ``"entailment_score"``: mean entailment probability over both directions.
        """
        if response1 == response2:
            return {"noncontradiction_score": 1, "entailment": True, "entailment_score": 1}

        left = self.predict(premise=response1, hypothesis=response2)
        left_label = self.label_mapping[left.argmax(axis=1)[0]]

        right = self.predict(premise=response2, hypothesis=response1)
        right_label = self.label_mapping[right.argmax(axis=1)[0]]

        # Column 0 is the contradiction probability, the last column the entailment probability
        s1, s2 = 1 - left[:, 0], 1 - right[:, 0]

        entailment = left_label == "entailment" or right_label == "entailment"
        avg_noncontradiction_score = ((s1 + s2) / 2)[0]
        avg_entailment_score = ((left[:, -1] + right[:, -1]) / 2)[0]
        self.probabilities.update({f"{response1}_{response2}": left, f"{response2}_{response1}": right})
        return {"noncontradiction_score": avg_noncontradiction_score, "entailment": entailment, "entailment_score": avg_entailment_score}
# Recorded scorer output used as the reference by test_semanticdensity.
datafile_path = "tests/data/scorers/semanticdensity_results_file.json"
with open(datafile_path, "r") as f:
    expected_result = json.load(f)

data = expected_result["data"]
metadata = expected_result["metadata"]

# Placeholder LLM object; test_semanticdensity monkeypatches the generation methods that would use it.
mock_object = AzureChatOpenAI(deployment_name="YOUR-DEPLOYMENT", temperature=1, api_key="SECRET_API_KEY", api_version="2024-05-01-preview", azure_endpoint="https://mocked.endpoint.com")


@pytest.mark.flaky(reruns=3)
@pytest.mark.asyncio
async def test_semanticdensity(monkeypatch):
    """Compare SemanticDensity output with the recorded results file, with response generation stubbed out."""
    prompts = data["prompts"]
    responses = data["responses"]
    sampled = data["sampled_responses"]
    expected_density = data["semantic_density_values"]

    scorer = SemanticDensity(llm=mock_object, device="cpu")

    async def fake_original_responses(*args, **kwargs):
        # Mirrors the side effect of the real method: logprobs are set on the scorer.
        scorer.logprobs = [None] * 5
        return responses

    async def fake_candidate_responses(*args, **kwargs):
        scorer.multiple_logprobs = data["multiple_logprobs"]
        return sampled

    monkeypatch.setattr(scorer, "generate_original_responses", fake_original_responses)
    monkeypatch.setattr(scorer, "generate_candidate_responses", fake_candidate_responses)

    for show_progress_bars in (True, False):
        generated = await scorer.generate_and_score(prompts=prompts, show_progress_bars=show_progress_bars)
        scorer.logprobs = None
        scored = scorer.score(responses=responses, sampled_responses=sampled)

        assert scored.data["responses"] == data["responses"]
        assert scored.data["sampled_responses"] == data["sampled_responses"]
        assert scored.data["prompts"] == data["prompts"]

        actual_density = scored.data["semantic_density_values"]
        for idx in range(len(prompts)):
            assert abs(actual_density[idx] - expected_density[idx]) < 1e-5

        assert generated.metadata == metadata


@pytest.mark.asyncio
async def test_generate_and_score_mocked():
    """generate_and_score must forward prompts/num_responses to the generation methods and call score once."""
    mock_llm = MagicMock()
    mock_llm.logprobs = True

    scorer = SemanticDensity(llm=mock_llm)
    # Replace every collaborator so only the orchestration logic runs.
    for helper_name in ("_setup_nli", "_construct_progress_bar", "_display_generation_header"):
        setattr(scorer, helper_name, MagicMock())
    scorer.generate_original_responses = AsyncMock(return_value=["response1", "response2"])
    scorer.generate_candidate_responses = AsyncMock(return_value=[["sample1", "sample2"], ["sample3", "sample4"]])
    scorer.score = MagicMock(return_value=UQResult({"data": {}, "metadata": {}}))

    prompts = ["prompt1", "prompt2"]

    # Manually set prompts since score is mocked
    scorer.prompts = prompts

    outcome = await scorer.generate_and_score(prompts, num_responses=2)

    assert isinstance(outcome, UQResult)
    assert scorer.prompts == prompts
    assert scorer.num_responses == 2
    scorer.generate_original_responses.assert_called_once_with(prompts, progress_bar=scorer.progress_bar)
    scorer.generate_candidate_responses.assert_called_once_with(prompts, num_responses=2, progress_bar=scorer.progress_bar)
    scorer.score.assert_called_once()


def test_score_mocked():
    """score must expose density values and logprobs in its result when the density computation is mocked."""
    scorer = SemanticDensity()
    scorer._semantic_density_process = MagicMock(return_value=("density_value", None))
    for helper_name in ("_construct_progress_bar", "_display_scoring_header", "_stop_progress_bar"):
        setattr(scorer, helper_name, MagicMock())
    scorer._construct_black_box_return_data = MagicMock(return_value={})

    progress_bar = MagicMock()
    progress_bar.add_task = MagicMock(return_value="task_id")
    progress_bar.update = MagicMock()
    scorer.progress_bar = progress_bar

    # Required attributes
    score_kwargs = {
        "prompts": ["prompt1", "prompt2"],
        "responses": ["response1", "response2"],
        "sampled_responses": [["sample1", "sample2"], ["sample3", "sample4"]],
        "logprobs_results": [None, None],
        "sampled_logprobs_results": [["logprob1", "logprob2"], ["logprob3", "logprob4"]],
    }

    outcome = scorer.score(**score_kwargs)

    assert "semantic_density_values" in outcome.data
    assert "multiple_logprobs" in outcome.data
    scorer._semantic_density_process.assert_called()
characteristics, gender 9 | identity and expression, level of experience, education, socio-economic status, 10 | nationality, personal appearance, race, caste, color, religion, or sexual 11 | identity and orientation. 12 | 13 | We pledge to act and interact in ways that contribute to an open, welcoming, 14 | diverse, inclusive, and healthy community. 15 | 16 | ## Our Standards 17 | 18 | Examples of behavior that contributes to a positive environment for our 19 | community include: 20 | 21 | * Demonstrating empathy and kindness toward other people 22 | * Being respectful of differing opinions, viewpoints, and experiences 23 | * Giving and gracefully accepting constructive feedback 24 | * Accepting responsibility and apologizing to those affected by our mistakes, 25 | and learning from the experience 26 | * Focusing on what is best not just for us as individuals, but for the overall 27 | community 28 | 29 | Examples of unacceptable behavior include: 30 | 31 | * The use of sexualized language or imagery, and sexual attention or advances of 32 | any kind 33 | * Trolling, insulting or derogatory comments, and personal or political attacks 34 | * Public or private harassment 35 | * Publishing others' private information, such as a physical or email address, 36 | without their explicit permission 37 | * Other conduct which could reasonably be considered inappropriate in a 38 | professional setting 39 | 40 | ## Enforcement Responsibilities 41 | 42 | Community leaders are responsible for clarifying and enforcing our standards of 43 | acceptable behavior and will take appropriate and fair corrective action in 44 | response to any behavior that they deem inappropriate, threatening, offensive, 45 | or harmful. 
46 | 47 | Community leaders have the right and responsibility to remove, edit, or reject 48 | comments, commits, code, wiki edits, issues, and other contributions that are 49 | not aligned to this Code of Conduct, and will communicate reasons for moderation 50 | decisions when appropriate. 51 | 52 | ## Scope 53 | 54 | This Code of Conduct applies within all community spaces, and also applies when 55 | an individual is officially representing the community in public spaces. 56 | Examples of representing our community include using an official email address, 57 | posting via an official social media account, or acting as an appointed 58 | representative at an online or offline event. 59 | 60 | ## Enforcement 61 | 62 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 63 | reported to the community leaders responsible for enforcement at 64 | dylan.bouchard@cvshealth.com. 65 | All complaints will be reviewed and investigated promptly and fairly. 66 | 67 | All community leaders are obligated to respect the privacy and security of the 68 | reporter of any incident. 69 | 70 | ## Enforcement Guidelines 71 | 72 | Community leaders will follow these Community Impact Guidelines in determining 73 | the consequences for any action they deem in violation of this Code of Conduct: 74 | 75 | ### 1. Correction 76 | 77 | **Community Impact**: Use of inappropriate language or other behavior deemed 78 | unprofessional or unwelcome in the community. 79 | 80 | **Consequence**: A private, written warning from community leaders, providing 81 | clarity around the nature of the violation and an explanation of why the 82 | behavior was inappropriate. A public apology may be requested. 83 | 84 | ### 2. Warning 85 | 86 | **Community Impact**: A violation through a single incident or series of 87 | actions. 88 | 89 | **Consequence**: A warning with consequences for continued behavior. 
No 90 | interaction with the people involved, including unsolicited interaction with 91 | those enforcing the Code of Conduct, for a specified period of time. This 92 | includes avoiding interactions in community spaces as well as external channels 93 | like social media. Violating these terms may lead to a temporary or permanent 94 | ban. 95 | 96 | ### 3. Temporary Ban 97 | 98 | **Community Impact**: A serious violation of community standards, including 99 | sustained inappropriate behavior. 100 | 101 | **Consequence**: A temporary ban from any sort of interaction or public 102 | communication with the community for a specified period of time. No public or 103 | private interaction with the people involved, including unsolicited interaction 104 | with those enforcing the Code of Conduct, is allowed during this period. 105 | Violating these terms may lead to a permanent ban. 106 | 107 | ### 4. Permanent Ban 108 | 109 | **Community Impact**: Demonstrating a pattern of violation of community 110 | standards, including sustained inappropriate behavior, harassment of an 111 | individual, or aggression toward or disparagement of classes of individuals. 112 | 113 | **Consequence**: A permanent ban from any sort of public interaction within the 114 | community. 115 | 116 | ## Attribution 117 | 118 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 119 | version 2.1, available at 120 | [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. 121 | 122 | Community Impact Guidelines were inspired by 123 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. 124 | 125 | For answers to common questions about this code of conduct, see the FAQ at 126 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at 127 | [https://www.contributor-covenant.org/translations][translations]. 
128 | 129 | [homepage]: https://www.contributor-covenant.org 130 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html 131 | [Mozilla CoC]: https://github.com/mozilla/diversity 132 | [FAQ]: https://www.contributor-covenant.org/faq 133 | [translations]: https://www.contributor-covenant.org/translations 134 | -------------------------------------------------------------------------------- /docs/source/refs.bib: -------------------------------------------------------------------------------- 1 | @misc{bouchard2025actionableframeworkassessingbias, 2 | title={An Actionable Framework for Assessing Bias and Fairness in Large Language Model Use Cases}, 3 | author={Dylan Bouchard}, 4 | year={2025}, 5 | eprint={2407.10853}, 6 | archivePrefix={arXiv}, 7 | primaryClass={cs.CL}, 8 | url={https://arxiv.org/abs/2407.10853}, 9 | } 10 | 11 | # BLACK-BOX SCORERS 12 | # Contradiction Probability 13 | 14 | @misc{chen2023quantifyinguncertaintyanswerslanguage, 15 | title={Quantifying Uncertainty in Answers from any Language Model and Enhancing their Trustworthiness}, 16 | author={Jiuhai Chen and Jonas Mueller}, 17 | year={2023}, 18 | eprint={2308.16175}, 19 | archivePrefix={arXiv}, 20 | primaryClass={cs.CL}, 21 | url={https://arxiv.org/abs/2308.16175}, 22 | } 23 | 24 | @misc{lin2024generatingconfidenceuncertaintyquantification, 25 | title={Generating with Confidence: Uncertainty Quantification for Black-box Large Language Models}, 26 | author={Zhen Lin and Shubhendu Trivedi and Jimeng Sun}, 27 | year={2024}, 28 | eprint={2305.19187}, 29 | archivePrefix={arXiv}, 30 | primaryClass={cs.CL}, 31 | url={https://arxiv.org/abs/2305.19187}, 32 | } 33 | 34 | 35 | # Semantic Entropy 36 | 37 | @misc{farquhar2024detectinghallucinations, 38 | title={Detecting Hallucinations in Large Language Models Using Semantic Entropy}, 39 | author={Sebastian Farquhar and Jannik Kossen and Lorenz Kuhn and Yarin Gal}, 40 | year={2024}, 41 | url={https://doi.org/10.1038/s41586-024-07421-0}, 
 42 | } 43 | 44 | @misc{kuhn2023semanticuncertaintylinguisticinvariances, 45 | title={Semantic Uncertainty: Linguistic Invariances for Uncertainty Estimation in Natural Language Generation}, 46 | author={Lorenz Kuhn and Yarin Gal and Sebastian Farquhar}, 47 | year={2023}, 48 | eprint={2302.09664}, 49 | archivePrefix={arXiv}, 50 | primaryClass={cs.CL}, 51 | url={https://arxiv.org/abs/2302.09664}, 52 | } 53 | 54 | # Semantic Density 55 | 56 | @misc{qiu2024semanticdensityuncertaintyquantification, 57 | title={Semantic Density: Uncertainty Quantification for Large Language Models through Confidence Measurement in Semantic Space}, 58 | author={Xin Qiu and Risto Miikkulainen}, 59 | year={2024}, 60 | eprint={2405.13845}, 61 | archivePrefix={arXiv}, 62 | primaryClass={cs.CL}, 63 | url={https://arxiv.org/abs/2405.13845}, 64 | } 65 | 66 | # Exact Match 67 | 68 | @misc{cole2023selectivelyansweringambiguousquestions, 69 | title={Selectively Answering Ambiguous Questions}, 70 | author={Jeremy R. Cole and Michael J. Q. Zhang and Daniel Gillick and Julian Martin Eisenschlos and Bhuwan Dhingra and Jacob Eisenstein}, 71 | year={2023}, 72 | eprint={2305.14613}, 73 | archivePrefix={arXiv}, 74 | primaryClass={cs.CL}, 75 | url={https://arxiv.org/abs/2305.14613}, 76 | } 77 | 78 | # BERT-score 79 | 80 | @misc{zhang2020bertscoreevaluatingtextgeneration, 81 | title={BERTScore: Evaluating Text Generation with BERT}, 82 | author={Tianyi Zhang and Varsha Kishore and Felix Wu and Kilian Q. Weinberger and Yoav Artzi}, 83 | year={2020}, 84 | eprint={1904.09675}, 85 | archivePrefix={arXiv}, 86 | primaryClass={cs.CL}, 87 | url={https://arxiv.org/abs/1904.09675}, 88 | } 89 | 90 | # BLEURT-score 91 | 92 | @misc{sellam2020bleurtlearningrobustmetrics, 93 | title={BLEURT: Learning Robust Metrics for Text Generation}, 94 | author={Thibault Sellam and Dipanjan Das and Ankur P.
Parikh}, 95 | year={2020}, 96 | eprint={2004.04696}, 97 | archivePrefix={arXiv}, 98 | primaryClass={cs.CL}, 99 | url={https://arxiv.org/abs/2004.04696}, 100 | } 101 | 102 | # Cosine Similarity 103 | 104 | @misc{shorinwa2024surveyuncertaintyquantificationlarge, 105 | title={A Survey on Uncertainty Quantification of Large Language Models: Taxonomy, Open Research Challenges, and Future Directions}, 106 | author={Ola Shorinwa and Zhiting Mei and Justin Lidard and Allen Z. Ren and Anirudha Majumdar}, 107 | year={2024}, 108 | eprint={2412.05563}, 109 | archivePrefix={arXiv}, 110 | primaryClass={cs.CL}, 111 | url={https://arxiv.org/abs/2412.05563}, 112 | } 113 | 114 | # WHITE-BOX SCORERS 115 | 116 | # Minimum Token Probability 117 | 118 | @misc{manakul2023selfcheckgptzeroresourceblackboxhallucination, 119 | title={SelfCheckGPT: Zero-Resource Black-Box Hallucination Detection for Generative Large Language Models}, 120 | author={Potsawee Manakul and Adian Liusie and Mark J. F. Gales}, 121 | year={2023}, 122 | eprint={2303.08896}, 123 | archivePrefix={arXiv}, 124 | primaryClass={cs.CL}, 125 | url={https://arxiv.org/abs/2303.08896}, 126 | } 127 | 128 | # Length-Normalized Joint Token Probability 129 | 130 | @misc{malinin2021uncertaintyestimationautoregressivestructured, 131 | title={Uncertainty Estimation in Autoregressive Structured Prediction}, 132 | author={Andrey Malinin and Mark Gales}, 133 | year={2021}, 134 | eprint={2002.07650}, 135 | archivePrefix={arXiv}, 136 | primaryClass={stat.ML}, 137 | url={https://arxiv.org/abs/2002.07650}, 138 | } 139 | 140 | # LLM-as-a-Judge Scorers 141 | 142 | # Categorical LLM-as-a-Judge 143 | 144 | @misc{luo2023chatgptfactualinconsistencyevaluator, 145 | title={ChatGPT as a Factual Inconsistency Evaluator for Text Summarization}, 146 | author={Zheheng Luo and Qianqian Xie and Sophia Ananiadou}, 147 | year={2023}, 148 | eprint={2303.15621}, 149 | archivePrefix={arXiv}, 150 | primaryClass={cs.CL}, 151 | 
@pytest.fixture
def scorer():
    """Provide a SampledLogprobsScorer built around a MagicMock LLM."""
    return SampledLogprobsScorer(llm=MagicMock())


def test_initialization(scorer):
    """A freshly built scorer exposes the expected default configuration."""
    assert scorer.scorers == SAMPLED_LOGPROBS_SCORER_NAMES
    assert scorer.llm is not None
    assert scorer.nli_model_name == "microsoft/deberta-large-mnli"
    assert scorer.max_length == 2000
    assert scorer.prompts_in_nli is True
    assert scorer.length_normalize is True


@pytest.mark.parametrize("scorer_name", SAMPLED_LOGPROBS_SCORER_NAMES)
def test_evaluate_with_mocked_scorers(scorer, scorer_name):
    """evaluate() must call the method backing the single configured scorer and report its output."""
    evaluate_kwargs = {
        "responses": ["response1", "response2"],
        "sampled_responses": [["sample1", "sample2"], ["sample3", "sample4"]],
        "logprobs_results": [[{"logprob": -0.1}], [{"logprob": -0.2}]],
        "sampled_logprobs_results": [[[{"logprob": -0.15}]], [[{"logprob": -0.25}]]],
        "prompts": ["prompt1", "prompt2"],
    }

    # Mock individual scorer methods
    with patch.object(scorer, "monte_carlo_probability", return_value=[0.5, 0.6]) as mc_mock, \
            patch.object(scorer, "compute_consistency_confidence", return_value=[0.7, 0.8]) as cc_mock, \
            patch.object(scorer, "compute_semantic_negentropy", return_value=[0.9, 1.0]) as sn_mock, \
            patch.object(scorer, "compute_semantic_density", return_value=[1.1, 1.2]) as sd_mock:
        scorer.scorers = [scorer_name]
        outcome = scorer.evaluate(**evaluate_kwargs)

    # Verify the correct scorer method was called
    mock_for_scorer = {
        "monte_carlo_probability": mc_mock,
        "consistency_and_confidence": cc_mock,
        "semantic_negentropy": sn_mock,
        "semantic_density": sd_mock,
    }
    if scorer_name in mock_for_scorer:
        mock_for_scorer[scorer_name].assert_called_once()

    # Verify the result contains the correct scorer output
    assert scorer_name in outcome
    assert isinstance(outcome[scorer_name], list)


def test_monte_carlo_probability(scorer):
    """monte_carlo_probability returns one list entry per response."""
    responses = ["response1", "response2"]
    logprobs_results = [[{"logprob": -0.1}], [{"logprob": -0.2}]]
    sampled_logprobs_results = [[[{"logprob": -0.15}]], [[{"logprob": -0.25}]]]

    # Mock _compute_single_generation_scores
    single_generation_patch = patch.object(scorer, "_compute_single_generation_scores", return_value=[0.8, 0.9])
    with single_generation_patch:
        outcome = scorer.monte_carlo_probability(responses=responses, logprobs_results=logprobs_results, sampled_logprobs_results=sampled_logprobs_results)

    assert isinstance(outcome, list)
    assert len(outcome) == len(responses)


def test_compute_consistency_confidence(scorer):
    """compute_consistency_confidence returns one list entry per response."""
    responses = ["response1", "response2"]
    sampled_responses = [["sample1", "sample2"], ["sample3", "sample4"]]
    logprobs_results = [[{"logprob": -0.1}], [{"logprob": -0.2}]]

    # Mock CosineScorer and _compute_single_generation_scores
    cosine_patch = patch("uqlm.black_box.cosine.CosineScorer.evaluate", return_value=[0.5, 0.6])
    single_generation_patch = patch.object(scorer, "_compute_single_generation_scores", return_value=[0.7, 0.8])
    with cosine_patch, single_generation_patch:
        outcome = scorer.compute_consistency_confidence(responses=responses, sampled_responses=sampled_responses, logprobs_results=logprobs_results)

    assert isinstance(outcome, list)
    assert len(outcome) == len(responses)


def test_compute_semantic_negentropy(scorer):
    """compute_semantic_negentropy returns one list entry per response."""
    responses = ["response1", "response2"]
    call_kwargs = {
        "responses": responses,
        "prompts": ["prompt1", "prompt2"],
        "sampled_responses": [["sample1", "sample2"], ["sample3", "sample4"]],
        "logprobs_results": [[{"logprob": -0.1}], [{"logprob": -0.2}]],
        "sampled_logprobs_results": [[[{"logprob": -0.15}]], [[{"logprob": -0.25}]]],
    }

    # Mock SemanticEntropy
    fake_entropy_result = MagicMock(to_dict=lambda: {"data": {"tokenprob_confidence_scores": [0.9, 1.0]}})
    with patch("uqlm.scorers.entropy.SemanticEntropy.score", return_value=fake_entropy_result):
        outcome = scorer.compute_semantic_negentropy(**call_kwargs)

    assert isinstance(outcome, list)
    assert len(outcome) == len(responses)


def test_compute_semantic_density(scorer):
    """compute_semantic_density returns the density values produced by the patched SemanticDensity.score."""
    responses = ["response1", "response2"]
    call_kwargs = {
        "responses": responses,
        "sampled_responses": [["sample1", "sample2"], ["sample3", "sample4"]],
        "logprobs_results": [[{"logprob": -0.1}], [{"logprob": -0.2}]],
        "sampled_logprobs_results": [[[{"logprob": -0.15}]], [[{"logprob": -0.25}]]],
        "prompts": ["prompt1", "prompt2"],
    }

    # Mock the semantic_negentropy_scorer and its clusterer
    fake_clusterer = MagicMock()
    fake_clusterer.nli.probabilities = [0.1, 0.2]
    fake_negentropy_scorer = MagicMock()
    fake_negentropy_scorer.clusterer = fake_clusterer

    # Assign the mocked semantic_negentropy_scorer to the scorer
    scorer.semantic_negentropy_scorer = fake_negentropy_scorer

    # Mock SemanticDensity
    fake_density_result = MagicMock(to_dict=lambda: {"data": {"semantic_density_values": [1.1, 1.2]}})
    with patch("uqlm.scorers.density.SemanticDensity.score", return_value=fake_density_result):
        outcome = scorer.compute_semantic_density(**call_kwargs)

    assert isinstance(outcome, list)
    assert len(outcome) == len(responses)
    assert outcome == [1.1, 1.2]
class SemanticClusterer:
    """
    Group responses into semantic clusters using pairwise NLI entailment.

    Parameters
    ----------
    nli : NLI, default=None
        Object exposing ``get_nli_results(response1=..., response2=...)`` that returns a dict
        with the keys "noncontradiction_score", "entailment" and "entailment_score".

    length_normalize : bool, default=False
        Whether sequence probabilities computed from token logprobs are length-normalized.
    """

    # The annotation is quoted so it is not evaluated when the class is defined.
    def __init__(self, nli: "NLI" = None, length_normalize: bool = False):
        self.nli = nli
        self.length_normalize = length_normalize
        # Pairwise NLI scores accumulated over every call to ``evaluate``, keyed by (text1, text2).
        self.nli_scores = {"noncontradiction": dict(), "entailment": dict()}

    def evaluate(self, responses: List[str], prompt: str = None, response_probabilities: List[float] = None) -> Tuple[str, List[List[str]], List[float], List[List[int]]]:
        """
        Cluster the responses and select the best one.

        Parameters
        ----------
        responses : list of str
            Responses to cluster. Must not be empty.

        prompt : str, default=None
            If provided, it is prepended to each response before the NLI comparison.

        response_probabilities : list of float, default=None
            One probability per response. If None, every response gets the same weight.

        Returns
        -------
        tuple
            (best_response, clustered_responses, cluster_probabilities, cluster_indices)

        Raises
        ------
        ValueError
            If ``responses`` is empty.
        """
        if not responses:
            raise ValueError("responses must contain at least one response")
        if response_probabilities is None:
            # Fall back to uniform weights instead of failing on ``None[j]`` further down.
            response_probabilities = [1 / len(responses)] * len(responses)

        clustered_responses, cluster_indices, noncontradiction_scores, entailment_scores = self.cluster_responses(responses=responses, prompt=prompt)
        self.nli_scores["noncontradiction"].update(noncontradiction_scores)
        self.nli_scores["entailment"].update(entailment_scores)
        cluster_probabilities = self.compute_cluster_probabilities(response_probabilities=response_probabilities, cluster_indices=cluster_indices)
        best_response = self.best_response_selection(clustered_responses=clustered_responses, cluster_probabilities=cluster_probabilities)
        return best_response, clustered_responses, cluster_probabilities, cluster_indices

    def cluster_responses(self, responses: List[str], prompt: str = None) -> Tuple[List[List[str]], List[Any], Dict[Tuple[str, str], float], Dict[Tuple[str, str], float]]:
        """
        Create clusters from a list of responses based on the semantic meaning of each response.

        The first response seeds the first cluster. Every later response is compared with the
        first member of each existing cluster and joins each cluster for which the NLI model
        reports entailment; if none matches, it starts a new cluster.

        Parameters
        ----------
        responses : list of str
            A list of model responses

        prompt : str, default=None
            A prompt for the responses. If provided, "<prompt>\\n<response>" is what gets compared.

        Returns
        -------
        tuple
            ``(clusters, cluster_indices, noncontradiction_scores, entailment_scores)``.
            ``clusters`` is a list of lists of responses, ``cluster_indices`` holds the positions
            (in ``responses``) of each cluster's members in insertion order, and the two score
            dicts are keyed by ``(text1, text2)`` in both orders. All four are empty for empty input.
        """
        if not responses:
            return [], [], {}, {}

        clusters, cluster_indices = [deque([responses[0]])], [deque([0])]
        noncontradiction_scores = {}
        entailments = {}
        entailment_scores = {}
        for i in range(1, len(responses)):
            text2 = f"{prompt}\n{responses[i]}" if prompt else responses[i]
            new_cluster_indicator = True
            for j, cluster in enumerate(clusters):
                text1 = f"{prompt}\n{cluster[0]}" if prompt else cluster[0]
                key, rev_key = (text1, text2), (text2, text1)
                if key in noncontradiction_scores:
                    # Do not recompute if pair already assessed
                    entailment = entailments[key]
                else:
                    # Compute nli score and entailment if pair not yet assessed
                    nli_result = self.nli.get_nli_results(response1=text1, response2=text2)
                    noncontradiction_score, entailment, entailment_score = nli_result["noncontradiction_score"], nli_result["entailment"], nli_result["entailment_score"]
                    noncontradiction_scores[key], noncontradiction_scores[rev_key] = noncontradiction_score, noncontradiction_score
                    entailments[key], entailments[rev_key] = entailment, entailment
                    entailment_scores[key], entailment_scores[rev_key] = entailment_score, entailment_score
                if entailment:
                    # NOTE(review): there is no break here, so a response that matches several
                    # clusters is added to each of them - confirm this is intended.
                    new_cluster_indicator = False
                    cluster.append(responses[i])
                    cluster_indices[j].append(i)

            if new_cluster_indicator:
                clusters.append(deque([responses[i]]))
                cluster_indices.append(deque([i]))

        # Arrange cluster so that first element is mode (if exists) else longest.
        # NOTE(review): only the responses are re-ordered; cluster_indices keep insertion order.
        clusters = [self._sort_responses(list(cluster)) for cluster in clusters]

        return clusters, cluster_indices, noncontradiction_scores, entailment_scores

    def compute_response_probabilities(self, logprobs_results: List[List[Dict[str, Any]]], num_responses: int = None) -> Tuple[List[float], List[float]]:
        """
        Compute token-probability based and uniform response probabilities.

        Parameters
        ----------
        logprobs_results : list of list of dict
            Token-level logprobs per response; each dict must have a "logprob" key.
            Falsy entries yield ``np.nan``. May be None or empty.

        num_responses : int, default=None
            Number of responses. If None, it is taken from ``len(logprobs_results)``.

        Returns
        -------
        tuple
            ``(tokenprob_response_probabilities, uniform_response_probabilities)``; the first
            element is None when ``logprobs_results`` is None or empty.

        Raises
        ------
        ValueError
            If neither ``num_responses`` nor ``logprobs_results`` is provided.
        """
        if num_responses is None:
            if logprobs_results is None:
                raise ValueError("num_responses must be provided when logprobs_results is None")
            num_responses = len(logprobs_results)
        uniform_response_probabilities = [1 / num_responses] * num_responses
        tokenprob_response_probabilities = [self.length_norm_sequence_prob(logprobs_i, self.length_normalize) if logprobs_i else np.nan for logprobs_i in logprobs_results] if logprobs_results else None
        return tokenprob_response_probabilities, uniform_response_probabilities

    def compute_cluster_probabilities(self, response_probabilities: List[float], cluster_indices: List[List[int]]) -> List[float]:
        """Sum the response probabilities of each cluster's members and normalize the sums to add up to 1."""
        cluster_probabilities = [sum(response_probabilities[j] for j in cluster_index) for cluster_index in cluster_indices]
        return self._normalize_cluster_probabilities(cluster_probabilities=cluster_probabilities)

    @staticmethod
    def length_norm_sequence_prob(logprobs: List[Dict[str, Any]], length_normalize: bool = True) -> float:
        """
        Compute the sequence probability from token logprobs.

        Returns ``exp(sum(logprob))``, or ``exp(mean(logprob))`` when ``length_normalize`` is True.
        ``logprobs`` must be non-empty when ``length_normalize`` is True.
        """
        factor = 1 / len(logprobs) if length_normalize else 1
        return np.exp(np.sum([d["logprob"] for d in logprobs]) * factor)

    @staticmethod
    def best_response_selection(clustered_responses: List[List[str]], cluster_probabilities: List[float]) -> str:
        """Return the first response of the cluster with the highest probability (first cluster wins ties)."""
        return clustered_responses[cluster_probabilities.index(max(cluster_probabilities))][0]

    @staticmethod
    def _normalize_cluster_probabilities(cluster_probabilities: List[float]) -> List[float]:
        """Divide every cluster probability by the total so that the result sums to 1."""
        total_probability = sum(cluster_probabilities)
        return [cp_i / total_probability for cp_i in cluster_probabilities]

    @staticmethod
    def _sort_responses(responses: List[str]) -> List[str]:
        """
        Sort the responses of one cluster.

        If some response occurs more than once, the most common one comes first and the rest
        follow alphabetically; otherwise responses are ordered from longest to shortest.
        """
        counter = Counter(responses)
        mode_str, count = counter.most_common(1)[0]
        if count > 1:
            return sorted(responses, key=lambda x: (x != mode_str, x))
        else:
            return sorted(responses, key=len, reverse=True)
How many apples does Jake have?"], "responses": ["30", "8", "17", "5", "11"], "sampled_responses": [["30", "30", "30", "30", "30"], ["8", "8", "8", "8", "8"], ["17", "17", "17", "17", "17"], ["5", "5", "5", "5", "5"], ["11", "11", "11", "11", "11"]], "ensemble_scores": [0.9999998323932312, 0.9999993853802055, 0.9997710794982175, 0.7865512731195814, 0.9999998323932312], "exact_match": [1.0, 1.0, 1.0, 1.0, 1.0], "noncontradiction": [1.0, 1.0, 1.0, 1.0, 1.0], "min_probability": [0.9999993295729247, 0.9999975415208221, 0.99908431799287, 0.14620509247832553, 0.9999993295729247], "judge_1": [1.0, 1.0, 1.0, 1.0, 1.0]}, "metadata": {"temperature": 1.0, "sampling_temperature": 1.0, "num_responses": 5, "thresh": 0.5, "weights": [0.25, 0.25, 0.25, 0.25], "logprobs": [[{"token": "30", "bytes": [51, 48], "logprob": -6.704273e-07, "top_logprobs": []}], [{"token": "8", "bytes": [56], "logprob": -2.4584822e-06, "top_logprobs": []}], [{"token": "17", "bytes": [49, 55], "logprob": -0.0009161015, "top_logprobs": []}], [{"token": "5", "bytes": [53], "logprob": -1.9227449, "top_logprobs": []}], [{"token": "11", "bytes": [49, 49], "logprob": -6.704273e-07, "top_logprobs": []}]]}}, "bsdetector": {"data": {"prompts": ["When you solve this math problem only return the answer with no additional text.\n7 red peaches, 15 yellow peaches and 8 green peaches are in the basket. How many peaches are in the basket?", "When you solve this math problem only return the answer with no additional text.\nJack received 3 emails and 64 letters in the morning. He then received 5 emails and 54 letters in the afternoon. How many emails did jack receive in the day?", "When you solve this math problem only return the answer with no additional text.\nEd had 2 more marbles than Doug. Doug lost some of his marbles at the playground. Now Ed has 19 more marbles than doug. 
How many marbles did Doug lose?", "When you solve this math problem only return the answer with no additional text.\nDan has $ 3 left with him after he bought a candy bar for $ 2. How much money did he have initially?", "When you solve this math problem only return the answer with no additional text.\nJake has 13 fewer peaches and 3 more apples than Steven. Steven has 9 peaches and 8 apples. How many apples does Jake have?"], "responses": ["30", "8", "17", "5", "11"], "sampled_responses": [["30", "30", "30", "30", "30"], ["8", "8", "8", "8", "8"], ["17", "17", "17", "17", "17"], ["5", "5", "5", "5", "5"], ["11", "11", "11", "11", "11"]], "ensemble_scores": [1.0, 1.0, 1.0, 1.0, 1.0], "noncontradiction": [1.0, 1.0, 1.0, 1.0, 1.0], "exact_match": [1.0, 1.0, 1.0, 1.0, 1.0], "judge_1": [1.0, 1.0, 1.0, 1.0, 1.0]}, "metadata": {"temperature": 1.0, "sampling_temperature": 1.0, "num_responses": 5, "thresh": 0.5, "weights": [0.5599999999999999, 0.13999999999999999, 0.3], "logprobs": [[{"token": "30", "bytes": [51, 48], "logprob": -6.704273e-07, "top_logprobs": []}], [{"token": "8", "bytes": [56], "logprob": -3.1737043e-06, "top_logprobs": []}], [{"token": "17", "bytes": [49, 55], "logprob": -0.0009161015, "top_logprobs": []}], [{"token": "$", "bytes": [36], "logprob": -0.080879845, "top_logprobs": []}, {"token": "5", "bytes": [53], "logprob": -0.001702437, "top_logprobs": []}], [{"token": "11", "bytes": [49, 49], "logprob": -5.5122365e-07, "top_logprobs": []}]]}}, "ensemble2": {"data": {"prompts": ["When you solve this math problem only return the answer with no additional text.\n7 red peaches, 15 yellow peaches and 8 green peaches are in the basket. How many peaches are in the basket?", "When you solve this math problem only return the answer with no additional text.\nJack received 3 emails and 64 letters in the morning. He then received 5 emails and 54 letters in the afternoon. 
How many emails did jack receive in the day?", "When you solve this math problem only return the answer with no additional text.\nEd had 2 more marbles than Doug. Doug lost some of his marbles at the playground. Now Ed has 19 more marbles than doug. How many marbles did Doug lose?", "When you solve this math problem only return the answer with no additional text.\nDan has $ 3 left with him after he bought a candy bar for $ 2. How much money did he have initially?", "When you solve this math problem only return the answer with no additional text.\nJake has 13 fewer peaches and 3 more apples than Steven. Steven has 9 peaches and 8 apples. How many apples does Jake have?"], "responses": ["30", "8", "17", "5", "11"], "sampled_responses": [null, null, null, null, null], "ensemble_scores": [0.9999996647864624, 0.999998770760411, 0.5911959418126744, 0.9399798095276081, 0.9999996647864624], "min_probability": [0.9999993295729247, 0.9999975415208221, 0.1823918836253489, 0.8799596190552161, 0.9999993295729247], "judge_1": [1.0, 1.0, 1.0, 1.0, 1.0]}, "metadata": {"temperature": 1.0, "sampling_temperature": 1.0, "num_responses": 5, "thresh": 0.5, "weights": [0.5, 0.5], "logprobs": [[{"token": "30", "bytes": [51, 48], "logprob": -6.704273e-07, "top_logprobs": []}], [{"token": "8", "bytes": [56], "logprob": -2.4584822e-06, "top_logprobs": []}], [{"token": "17", "bytes": [49, 55], "logprob": -0.0012972581, "top_logprobs": []}, {"token": " mar", "bytes": [32, 109, 97, 114], "logprob": -1.7015977, "top_logprobs": []}, {"token": "bles", "bytes": [98, 108, 101, 115], "logprob": 0.0, "top_logprobs": []}, {"token": ".", "bytes": [46], "logprob": -0.69366264, "top_logprobs": []}], [{"token": "$", "bytes": [36], "logprob": -0.12787926, "top_logprobs": []}, {"token": "5", "bytes": [53], "logprob": -0.0036003059, "top_logprobs": []}], [{"token": "11", "bytes": [49, 49], "logprob": -6.704273e-07, "top_logprobs": []}]]}}} 
--------------------------------------------------------------------------------