├── assets
    └── images
    │   ├── cranberry_jam.png
    │   ├── panel_output2.png
    │   ├── stacked_logo.png
    │   ├── uqlm_flow_ds.png
    │   ├── horizontal_logo.png
    │   ├── judges_graphic.png
    │   ├── uqensemble_tune.png
    │   ├── black_box_graphic.png
    │   ├── black_box_output4.png
    │   ├── uqensemble_output2.png
    │   ├── uqlm_flow_ds_dark.png
    │   ├── white_box_graphic.png
    │   ├── white_box_output2.png
    │   ├── judges_graphic_dark.png
    │   ├── black_box_graphic_dark.png
    │   ├── horizontal_logo_large.png
    │   ├── white_box_graphic_dark.png
    │   ├── uqensemble_generate_score.png
    │   ├── uqensemble_generate_score_dark.png
    │   └── COPYRIGHT.md
├── docs
    ├── source
    │   ├── _static
    │   │   ├── research
    │   │   │   └── logo.png
    │   │   ├── images
    │   │   │   ├── no_image.png
    │   │   │   ├── uqlm_flow_ds.png
    │   │   │   ├── favicon
    │   │   │   │   ├── favicon.ico
    │   │   │   │   ├── favicon-16x16.png
    │   │   │   │   ├── favicon-32x32.png
    │   │   │   │   ├── apple-touch-icon.png
    │   │   │   │   ├── android-chrome-192x192.png
    │   │   │   │   └── android-chrome-512x512.png
    │   │   │   ├── horizontal_logo.png
    │   │   │   ├── judges_graphic.png
    │   │   │   ├── panel_output2.png
    │   │   │   ├── black_box_graphic.png
    │   │   │   ├── black_box_output4.png
    │   │   │   ├── uqlm_flow_ds_dark.png
    │   │   │   ├── white_box_graphic.png
    │   │   │   ├── white_box_output2.png
    │   │   │   ├── judges_graphic_dark.png
    │   │   │   ├── uqensemble_output2.png
    │   │   │   ├── black_box_graphic_dark.png
    │   │   │   ├── horizontal_logo_large.png
    │   │   │   ├── horizontal_logo_no_bg.png
    │   │   │   ├── white_box_graphic_dark.png
    │   │   │   ├── uqensemble_generate_score.png
    │   │   │   ├── uqensemble_generate_score_dark.png
    │   │   │   └── COPYRIGHT.md
    │   │   └── custom.css
    │   ├── _templates
    │   │   ├── base.rst
    │   │   ├── class.rst
    │   │   └── module.rst
    │   ├── api.rst
    │   ├── conf.py
    │   ├── contribute.rst
    │   ├── _notebooks
    │   │   └── index.rst
    │   └── refs.bib
    ├── make.bat
    └── Makefile
├── .pre-commit-config.yaml
├── .github
    ├── dependabot.yml
    ├── ISSUE_TEMPLATE
    │   ├── feature_request.md
    │   └── bug_report.md
    └── workflows
    │   ├── ci.yaml
    │   ├── linting.yml
    │   ├── update_version_json.py
    │   └── documentation.yaml
├── tests
    ├── data
    │   ├── scorers
    │   │   ├── DATA_COPYRIGHT.md
    │   │   ├── test_data_panelquantifier.json
    │   │   ├── blackbox_results_file.json
    │   │   ├── whitebox_results_file.json
    │   │   ├── generate_data_semanticentropy.py
    │   │   ├── generate_data_whitebox.py
    │   │   ├── generate_data_blackbox.py
    │   │   ├── semanticentropy_results_file.json
    │   │   ├── generate_data_llmjudge.py
    │   │   ├── generate_data_ensemble.py
    │   │   ├── bsdetector_results_file.json
    │   │   └── ensemble_results_file.json
    │   └── similarity
    │   │   ├── DATA_COPYRIGHT.md
    │   │   └── generate_data_similarity.py
    ├── __init__.py
    ├── test_postprocessor.py
    ├── test_similarity.py
    ├── test_grader.py
    ├── test_semanticentropy.py
    ├── test_blackboxuq.py
    ├── test_nli.py
    ├── test_top_logprobs.py
    ├── test_logprobs_scorer.py
    ├── test_p_true.py
    ├── test_whiteboxuq.py
    ├── test_semanticdensity.py
    └── test_sampled_logprobs.py
├── uqlm
    ├── resources
    │   └── __init__.py
    ├── judges
    │   └── __init__.py
    ├── black_box
    │   ├── baseclass
    │   │   ├── __init__ .py
    │   │   └── similarity_scorer.py
    │   ├── __init__.py
    │   ├── match.py
    │   ├── bert.py
    │   ├── cosine.py
    │   └── consistency.py
    ├── white_box
    │   ├── baseclass
    │   │   ├── __init__.py
    │   │   └── logprobs_scorer.py
    │   ├── __init__.py
    │   ├── single_logprobs.py
    │   ├── top_logprobs.py
    │   └── p_true.py
    ├── scorers
    │   ├── baseclass
    │   │   └── __init__.py
    │   └── __init__.py
    ├── nli
    │   ├── __init__.py
    │   ├── nli.py
    │   └── cluster.py
    ├── calibration
    │   └── __init__.py
    ├── __init__.py
    └── utils
    │   ├── postprocessors.py
    │   ├── warn.py
    │   ├── device.py
    │   ├── results.py
    │   ├── __init__.py
    │   ├── display.py
    │   ├── llm_config.py
    │   └── grader.py
├── examples
    └── uqe_config_tuned.json
├── .gitignore
├── CONTRIBUTING.md
├── pyproject.toml
└── CODE_OF_CONDUCT.md


/assets/images/cranberry_jam.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/cranberry_jam.png


--------------------------------------------------------------------------------
/assets/images/panel_output2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/panel_output2.png


--------------------------------------------------------------------------------
/assets/images/stacked_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/stacked_logo.png


--------------------------------------------------------------------------------
/assets/images/uqlm_flow_ds.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/uqlm_flow_ds.png


--------------------------------------------------------------------------------
/assets/images/horizontal_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/horizontal_logo.png


--------------------------------------------------------------------------------
/assets/images/judges_graphic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/judges_graphic.png


--------------------------------------------------------------------------------
/assets/images/uqensemble_tune.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/uqensemble_tune.png


--------------------------------------------------------------------------------
/assets/images/black_box_graphic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/black_box_graphic.png


--------------------------------------------------------------------------------
/assets/images/black_box_output4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/black_box_output4.png


--------------------------------------------------------------------------------
/assets/images/uqensemble_output2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/uqensemble_output2.png


--------------------------------------------------------------------------------
/assets/images/uqlm_flow_ds_dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/uqlm_flow_ds_dark.png


--------------------------------------------------------------------------------
/assets/images/white_box_graphic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/white_box_graphic.png


--------------------------------------------------------------------------------
/assets/images/white_box_output2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/white_box_output2.png


--------------------------------------------------------------------------------
/assets/images/judges_graphic_dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/judges_graphic_dark.png


--------------------------------------------------------------------------------
/docs/source/_static/research/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/research/logo.png


--------------------------------------------------------------------------------
/assets/images/black_box_graphic_dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/black_box_graphic_dark.png


--------------------------------------------------------------------------------
/assets/images/horizontal_logo_large.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/horizontal_logo_large.png


--------------------------------------------------------------------------------
/assets/images/white_box_graphic_dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/white_box_graphic_dark.png


--------------------------------------------------------------------------------
/docs/source/_static/images/no_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/no_image.png


--------------------------------------------------------------------------------
/assets/images/uqensemble_generate_score.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/uqensemble_generate_score.png


--------------------------------------------------------------------------------
/docs/source/_static/images/uqlm_flow_ds.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/uqlm_flow_ds.png


--------------------------------------------------------------------------------
/docs/source/_static/images/favicon/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/favicon/favicon.ico


--------------------------------------------------------------------------------
/docs/source/_static/images/horizontal_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/horizontal_logo.png


--------------------------------------------------------------------------------
/docs/source/_static/images/judges_graphic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/judges_graphic.png


--------------------------------------------------------------------------------
/docs/source/_static/images/panel_output2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/panel_output2.png


--------------------------------------------------------------------------------
/assets/images/uqensemble_generate_score_dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/assets/images/uqensemble_generate_score_dark.png


--------------------------------------------------------------------------------
/docs/source/_static/images/black_box_graphic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/black_box_graphic.png


--------------------------------------------------------------------------------
/docs/source/_static/images/black_box_output4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/black_box_output4.png


--------------------------------------------------------------------------------
/docs/source/_static/images/uqlm_flow_ds_dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/uqlm_flow_ds_dark.png


--------------------------------------------------------------------------------
/docs/source/_static/images/white_box_graphic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/white_box_graphic.png


--------------------------------------------------------------------------------
/docs/source/_static/images/white_box_output2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/white_box_output2.png


--------------------------------------------------------------------------------
/docs/source/_static/images/judges_graphic_dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/judges_graphic_dark.png


--------------------------------------------------------------------------------
/docs/source/_static/images/uqensemble_output2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/uqensemble_output2.png


--------------------------------------------------------------------------------
/docs/source/_templates/base.rst:
--------------------------------------------------------------------------------
1 | {{ fullname | escape | underline}}
2 | 
3 | .. currentmodule:: {{ module }}
4 | 
5 | .. auto{{ objtype }}:: {{ objname }}


--------------------------------------------------------------------------------
/docs/source/_static/images/black_box_graphic_dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/black_box_graphic_dark.png


--------------------------------------------------------------------------------
/docs/source/_static/images/favicon/favicon-16x16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/favicon/favicon-16x16.png


--------------------------------------------------------------------------------
/docs/source/_static/images/favicon/favicon-32x32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/favicon/favicon-32x32.png


--------------------------------------------------------------------------------
/docs/source/_static/images/horizontal_logo_large.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/horizontal_logo_large.png


--------------------------------------------------------------------------------
/docs/source/_static/images/horizontal_logo_no_bg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/horizontal_logo_no_bg.png


--------------------------------------------------------------------------------
/docs/source/_static/images/white_box_graphic_dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/white_box_graphic_dark.png


--------------------------------------------------------------------------------
/docs/source/_static/images/favicon/apple-touch-icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/favicon/apple-touch-icon.png


--------------------------------------------------------------------------------
/docs/source/_static/images/uqensemble_generate_score.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/uqensemble_generate_score.png


--------------------------------------------------------------------------------
/docs/source/_static/images/favicon/android-chrome-192x192.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/favicon/android-chrome-192x192.png


--------------------------------------------------------------------------------
/docs/source/_static/images/favicon/android-chrome-512x512.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/favicon/android-chrome-512x512.png


--------------------------------------------------------------------------------
/docs/source/_static/images/uqensemble_generate_score_dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvs-health/uqlm/HEAD/docs/source/_static/images/uqensemble_generate_score_dark.png


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 |   - repo: https://github.com/astral-sh/ruff-pre-commit
3 |     rev: v0.9.7
4 |     hooks:
5 |       - id: ruff
6 |         args: [ --fix ]
7 |       - id: ruff-format
8 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 |   - package-ecosystem: "pip"
4 |     directory: "/" # Location of your requirements.txt or other package manifest
5 |     schedule:
6 |       interval: "weekly" # Check for updates weekly
7 |     target-branch: "develop" # Target branch for updates
8 | 


--------------------------------------------------------------------------------
/docs/source/api.rst:
--------------------------------------------------------------------------------
 1 | API
 2 | ===
 3 | 
 4 | .. autosummary::
 5 |     :toctree: _autosummary
 6 |     :template: module.rst
 7 |     :recursive:
 8 | 
 9 |     uqlm.scorers
10 |     uqlm.black_box
11 |     uqlm.white_box
12 |     uqlm.judges
13 |     uqlm.nli
14 |     uqlm.calibration
15 |     uqlm.resources
16 |     uqlm.utils


--------------------------------------------------------------------------------
/assets/images/COPYRIGHT.md:
--------------------------------------------------------------------------------
1 | Copyright 2025 CVS Health and/or one of its affiliates
2 | 
3 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
4 | 
5 |     http://www.apache.org/licenses/LICENSE-2.0
6 | 
7 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.


--------------------------------------------------------------------------------
/docs/source/_static/images/COPYRIGHT.md:
--------------------------------------------------------------------------------
1 | Copyright 2025 CVS Health and/or one of its affiliates
2 | 
3 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
4 | 
5 |     http://www.apache.org/licenses/LICENSE-2.0
6 | 
7 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.


--------------------------------------------------------------------------------
/tests/data/scorers/DATA_COPYRIGHT.md:
--------------------------------------------------------------------------------
1 | Copyright 2025 CVS Health and/or one of its affiliates
2 | 
3 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
4 | 
5 |     http://www.apache.org/licenses/LICENSE-2.0
6 | 
7 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
8 | 


--------------------------------------------------------------------------------
/tests/data/similarity/DATA_COPYRIGHT.md:
--------------------------------------------------------------------------------
1 | Copyright 2025 CVS Health and/or one of its affiliates
2 | 
3 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
4 | 
5 |     http://www.apache.org/licenses/LICENSE-2.0
6 | 
7 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
8 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tests/data/scorers/test_data_panelquantifier.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "prompts": [
 3 |         "What is Kathy Saltzman's occupation?",
 4 |         "What is Eleanor Davis's occupation?"
 5 |     ],
 6 |     "responses": [
 7 |         "Kathy Saltzman is a Senior Software Engineer at Dropbox.",
 8 |         "Eleanor Davis is a cartoonist and illustrator."
 9 |     ],
10 |     "scores": {
11 |         "judge_1": [0.8, 0.9],
12 |         "judge_2": [0.8, 0.9],
13 |         "avg": [0.8, 0.9],
14 |         "max": [0.8, 0.9],
15 |         "min": [0.8, 0.9],
16 |         "median": [0.8, 0.9]
17 |     },
18 |     "metadata": {
19 |         "num_judges": 2,
20 |         "temperature": 0.7
21 |     }
22 | }


--------------------------------------------------------------------------------
/uqlm/resources/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 | 


--------------------------------------------------------------------------------
/uqlm/judges/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from uqlm.judges.judge import LLMJudge
16 | 
17 | __all__ = ["LLMJudge"]
18 | 


--------------------------------------------------------------------------------
/uqlm/black_box/baseclass/__init__ .py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from uqlm.black_box.similarity_scorer import SimilarityScorer
16 | 
17 | __all__ = ["SimilarityScorer"]
18 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 | 
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 | 
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 | 
26 | **Desktop (please complete the following information):**
27 |  - OS: [e.g. iOS]
28 |  - Version: [e.g. 22]
29 | 
30 | **Additional context**
31 | Add any other context about the problem here.
32 | 


--------------------------------------------------------------------------------
/uqlm/white_box/baseclass/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from uqlm.white_box.baseclass.logprobs_scorer import LogprobsScorer
16 | 
17 | __all__ = ["LogprobsScorer"]
18 | 


--------------------------------------------------------------------------------
/uqlm/scorers/baseclass/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from uqlm.scorers.baseclass.uncertainty import UncertaintyQuantifier
16 | 
17 | __all__ = ["UncertaintyQuantifier"]
18 | 


--------------------------------------------------------------------------------
/uqlm/nli/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | from uqlm.nli.nli import NLI
17 | from uqlm.nli.cluster import SemanticClusterer
18 | 
19 | __all__ = ["NLI", "SemanticClusterer"]
20 | 


--------------------------------------------------------------------------------
/uqlm/calibration/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from uqlm.calibration.score_calibrator import ScoreCalibrator
16 | from uqlm.calibration.evaluate import evaluate_calibration
17 | 
18 | __all__ = ["ScoreCalibrator", "evaluate_calibration"]
19 | 


--------------------------------------------------------------------------------
/docs/source/_templates/class.rst:
--------------------------------------------------------------------------------
 1 | {{ fullname | escape | underline}}
 2 | 
 3 | .. currentmodule:: {{ module }}
 4 | 
 5 | .. autoclass:: {{ objname }}
 6 |    :members:
 7 |    :show-inheritance:
 8 |    :inherited-members:
 9 | 
10 |    {% block methods %}
11 |    .. automethod:: __init__
12 | 
13 |    {% if methods %}
14 |    .. rubric:: {{ _('Methods') }}
15 | 
16 |    .. autosummary::
17 |    {% for item in methods %}
18 |       ~{{ name }}.{{ item }}
19 |    {%- endfor %}
20 |    {% endif %}
21 |    {% endblock %}
22 | 
23 |    {% block attributes %}
24 |    {% if attributes %}
25 |    .. rubric:: {{ _('Attributes') }}
26 | 
27 |    .. autosummary::
28 |    {% for item in attributes %}
29 |       ~{{ name }}.{{ item }}
30 |    {%- endfor %}
31 |    {% endif %}
32 |    {% endblock %}
33 | 
34 | {% block references %}
35 | .. rubric:: {{ _('References') }}
36 | 
37 | .. footbibliography::
38 | {% endblock %}   


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
@ECHO OFF

REM Switch to the directory that holds this script so the relative
REM SOURCEDIR / BUILDDIR paths below resolve; undone by popd at :end.
pushd %~dp0

REM Command file for Sphinx documentation

REM Fall back to "sphinx-build" unless the caller already set SPHINXBUILD.
if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build

REM Probe the build command with all output discarded; the errorlevel check
REM below reports the case where the command could not be found.
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.https://www.sphinx-doc.org/
	exit /b 1
)

REM Without a target argument, print the Sphinx help instead of building.
if "%1" == "" goto help

REM Pass the requested target (first argument) straight to Sphinx via -M.
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
REM Return to the directory that was current before pushd.
popd
36 | 


--------------------------------------------------------------------------------
/uqlm/black_box/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from uqlm.black_box.bert import BertScorer
16 | 
17 | from uqlm.black_box.cosine import CosineScorer
18 | from uqlm.black_box.match import MatchScorer
19 | from uqlm.black_box.consistency import ConsistencyScorer
20 | 
21 | __all__ = ["BertScorer", "CosineScorer", "MatchScorer", "ConsistencyScorer"]
22 | 


--------------------------------------------------------------------------------
/tests/test_postprocessor.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from uqlm.utils.postprocessors import math_postprocessor
16 | 
17 | 
18 | TEST_DATA = {"$3.134": "3", "the answer is 12 cookies": "12", "Hmmm /n perhaps 555.,7&333$5x": "555"}
19 | 
20 | 
def test_math_postprocessor():
    """Every raw answer in TEST_DATA must be reduced to its expected digit string."""
    for raw_answer, expected in TEST_DATA.items():
        assert expected == math_postprocessor(raw_answer)
24 | 


--------------------------------------------------------------------------------
/uqlm/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from uqlm.scorers.ensemble import UQEnsemble
16 | from uqlm.scorers.entropy import SemanticEntropy
17 | from uqlm.scorers.panel import LLMPanel
18 | from uqlm.scorers.white_box import WhiteBoxUQ
19 | from uqlm.scorers.black_box import BlackBoxUQ
20 | 
21 | __all__ = ["UQEnsemble", "SemanticEntropy", "LLMPanel", "WhiteBoxUQ", "BlackBoxUQ"]
22 | 


--------------------------------------------------------------------------------
/.github/workflows/ci.yaml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | 
 3 | on:
 4 |   push:
 5 |     paths:
 6 |       - 'uqlm/**'
 7 |       - 'tests/**'
 8 |       - 'poetry.lock'
 9 |   pull_request:
10 |     paths:
11 |       - 'uqlm/**'
12 |       - 'tests/**'
13 |       - 'poetry.lock'
14 | 
15 | jobs:
16 |   run-tests:
17 |     strategy:
18 |       fail-fast: false
19 |       matrix:
20 |         os: [ubuntu-latest, macos-latest, windows-latest]
21 |         python-version:
22 |           - "3.10"
23 |           - "3.11"
24 |           - "3.12"
25 |           - "3.13.3"
26 | 
27 |     name: Test
28 |     runs-on: ${{ matrix.os }}
29 | 
30 |     steps:
31 |       - name: Checkout code
32 |         uses: actions/checkout@v5
33 | 
34 |       - name: Set up Python
35 |         uses: actions/setup-python@v5
36 |         with:
37 |           python-version: ${{matrix.python-version}}
38 | 
39 |       - name: Install dependencies
40 |         run: python -m pip install pytest pytest-asyncio pytest-rerunfailures langchain-openai .
41 | 
42 |       - name: Run tests
43 |         run: pytest -v
44 | 


--------------------------------------------------------------------------------
/uqlm/scorers/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from uqlm.scorers.ensemble import UQEnsemble
16 | from uqlm.scorers.entropy import SemanticEntropy
17 | from uqlm.scorers.panel import LLMPanel
18 | from uqlm.scorers.white_box import WhiteBoxUQ
19 | from uqlm.scorers.black_box import BlackBoxUQ
20 | from uqlm.scorers.density import SemanticDensity
21 | 
22 | __all__ = ["UQEnsemble", "SemanticEntropy", "LLMPanel", "WhiteBoxUQ", "BlackBoxUQ", "SemanticDensity"]
23 | 


--------------------------------------------------------------------------------
/.github/workflows/linting.yml:
--------------------------------------------------------------------------------
 1 | name: Linting with Ruff
 2 | 
 3 | on:
 4 |     pull_request:
 5 |         branches:
 6 |           - main
 7 |           - develop
 8 |     workflow_dispatch:
 9 | 
10 | concurrency:
11 |     group: ${{ github.workflow }}-${{ github.ref }}
12 |     cancel-in-progress: true
13 | 
14 | jobs:
15 |     ruff-formatting:
16 |         runs-on: ubuntu-latest
17 |         steps:
18 |         - uses: actions/checkout@v4
19 |         - name: Set up Python
20 |           uses: actions/setup-python@v5
21 |           with:
22 |             python-version: "3.9"
23 |             cache: 'pip'
24 |         - name: Get Ruff version and install
25 |           run: |
26 |               pip install poetry
27 |               RUFF_VERSION=$(poetry show --only=dev | grep '^ruff ' | awk '{print $3}')
28 |               echo "Installing ruff==$RUFF_VERSION"
29 |               pip install ruff==$RUFF_VERSION
30 |         - name: Lint with Ruff
31 |           run: |
32 |             ruff check uqlm/
33 |         - name: Check for unformatted files
34 |           run: |
35 |             ruff format --check uqlm/
36 | 


--------------------------------------------------------------------------------
/uqlm/black_box/baseclass/similarity_scorer.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from abc import ABC, abstractmethod
16 | from typing import List
17 | 
18 | 
class SimilarityScorer(ABC):
    """
    Abstract interface for text similarity scorers.

    Both the constructor and ``evaluate`` are abstract, so the class can only be
    used through a subclass that overrides both of them.
    """

    @abstractmethod
    def __init__(self):
        """Abstract constructor; concrete scorers supply their own."""

    @abstractmethod
    def evaluate(self, responses: List[str], sampled_responses: List[str]) -> List[float]:
        """
        Abstract method for metric computation.

        Parameters
        ----------
        responses : list of str
            Responses to be scored.

        sampled_responses : list of str
            Sampled responses the scores are computed against.
            NOTE(review): exact pairing/nesting is defined by the subclasses - confirm there.

        Returns
        -------
        list of float
            Scores produced by the concrete scorer.
        """
31 | 


--------------------------------------------------------------------------------
/uqlm/white_box/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from uqlm.white_box.single_logprobs import SingleLogprobsScorer, SINGLE_LOGPROBS_SCORER_NAMES
16 | from uqlm.white_box.top_logprobs import TopLogprobsScorer, TOP_LOGPROBS_SCORER_NAMES
17 | from uqlm.white_box.sampled_logprobs import SampledLogprobsScorer, SAMPLED_LOGPROBS_SCORER_NAMES
18 | from uqlm.white_box.p_true import PTrueScorer
19 | 
20 | __all__ = ["SingleLogprobsScorer", "TopLogprobsScorer", "SampledLogprobsScorer", "PTrueScorer", "SINGLE_LOGPROBS_SCORER_NAMES", "TOP_LOGPROBS_SCORER_NAMES", "SAMPLED_LOGPROBS_SCORER_NAMES"]
21 | 


--------------------------------------------------------------------------------
/uqlm/utils/postprocessors.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
def math_postprocessor(input_string: str) -> str:
    """
    Extract the integer part of a numerical answer from a string.

    Characters are scanned from left to right. Digit characters are kept in
    order, every other character is dropped, and scanning stops at the first
    ``"."`` so that nothing after a decimal point is included.

    Parameters
    ----------
    input_string: str
        The string from which the numerical answer will be extracted. Only the integer part is extracted.

    Returns
    -------
    str
        The postprocessed string containing the integer part of the answer
        (empty if no digit occurs before the first ``"."``).
    """
    kept_digits = []
    for symbol in input_string:
        if symbol == ".":
            # Decimal point reached: ignore the rest of the string.
            break
        if symbol.isdigit():
            kept_digits.append(symbol)
    return "".join(kept_digits)
36 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 
22 | # Get the parent dir name which is this docs' version
23 | VERSION := $(notdir $(CURDIR))
24 | 
# Neither target creates a file of its own name; declaring them phony keeps
# them runnable even if a file called "github" or "local" exists here.
.PHONY: github local

# Rebuild the HTML documentation from a clean state and publish it, together
# with versions.json, under ../../docs/$(VERSION).
github:
	@rm -rf build/html build/doctrees source/_autosummary/*.rst
	@cp -rf ../../assets/* source/_static/
	@cp -rf ../../examples/* source/_notebooks/examples/
	@$(MAKE) html
	@mkdir -p ../../docs/$(VERSION)
	@rm -rf ../../docs/$(VERSION)/*
	@cp -a build/html/. ../../docs/$(VERSION)/
	@cp ../versions.json ../../docs/versions.json

# Serve the published docs for this version on port 8080.
local:
	@python -m http.server --directory ../../docs/$(VERSION)/ 8080
37 | 


--------------------------------------------------------------------------------
/examples/uqe_config_tuned.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "weights": [
 3 |     0.0025387213438219183,
 4 |     0.8341461711896818,
 5 |     0.040041378980108795,
 6 |     0.12327372848638753
 7 |   ],
 8 |   "thresh": 0.74,
 9 |   "components": [
10 |     "exact_match",
11 |     "noncontradiction",
12 |     "normalized_probability",
13 |     "judge_1"
14 |   ],
15 |   "llm_config": {
16 |     "class_name": "AzureChatOpenAI",
17 |     "module": "langchain_openai.chat_models.azure",
18 |     "deployment_name": "gpt-4o-mini",
19 |     "logprobs": true,
20 |     "model_version": "",
21 |     "n": 5,
22 |     "openai_api_type": "azure",
23 |     "openai_api_version": "2024-02-15-preview",
24 |     "profile": {},
25 |     "streaming": false,
26 |     "use_previous_response_id": false,
27 |     "verbose": false
28 |   },
29 |   "llm_scorers": {
30 |     "judge_1": {
31 |       "class_name": "AzureChatOpenAI",
32 |       "module": "langchain_openai.chat_models.azure",
33 |       "deployment_name": "gpt-4o-mini",
34 |       "logprobs": true,
35 |       "model_version": "",
36 |       "n": 5,
37 |       "openai_api_type": "azure",
38 |       "openai_api_version": "2024-02-15-preview",
39 |       "profile": {},
40 |       "streaming": false,
41 |       "use_previous_response_id": false,
42 |       "verbose": false
43 |     }
44 |   }
45 | }


--------------------------------------------------------------------------------
/uqlm/utils/warn.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import warnings
16 | 
17 | 
class UQLMBetaWarning(Warning):
    """Warning category for UQLM features that are still in beta."""
22 | 
23 | 
def beta_warning(message: str) -> None:
    """
    Issues a beta warning with a custom message.

    Parameters
    ----------
    message : str
        Text of the warning, emitted with category ``UQLMBetaWarning``.
    """
    # stacklevel=2 attributes the warning to the code that called this helper
    # instead of to this line.
    warnings.warn(message, category=UQLMBetaWarning, stacklevel=2)
27 | 
28 | 
class UQLMDeprecationWarning(Warning):
    """Warning category for UQLM features that are scheduled for deprecation."""
33 | 
34 | 
def deprecation_warning(message: str) -> None:
    """
    Issues a deprecation warning with a custom message.

    Parameters
    ----------
    message : str
        Text of the warning, emitted with category ``UQLMDeprecationWarning``.
    """
    # stacklevel=2 attributes the warning to the code that called this helper
    # instead of to this line.
    warnings.warn(message, category=UQLMDeprecationWarning, stacklevel=2)
38 | 


--------------------------------------------------------------------------------
/uqlm/utils/device.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import torch
16 | 
17 | 
def get_best_device() -> torch.device:
    """
    Pick the most capable PyTorch device that is currently available.

    Availability is checked in priority order: CUDA first, then MPS, and the
    CPU is used when neither of them is available.

    Returns
    -------
    torch.device
        The best available device.

    Examples
    --------
    >>> device = get_best_device()
    >>> print(f"Using {device.type} device")
    """
    if torch.cuda.is_available():
        backend = "cuda"
    elif torch.backends.mps.is_available():
        backend = "mps"
    else:
        backend = "cpu"
    return torch.device(backend)
39 | 


--------------------------------------------------------------------------------
/.github/workflows/update_version_json.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import sys
 3 | from pathlib import Path
 4 | 
 5 | 
 6 | def rebuild_version_json(current_version, gh_pages_path, site_url="https://cvs-health.github.io/uqlm"):
 7 |     version_json_path = Path(gh_pages_path) / "versions.json"
 8 |     entries = []
 9 | 
10 |     # List only v* folders, ignore 'latest'
11 |     folders = [p for p in Path(gh_pages_path).iterdir() if p.is_dir() and p.name.startswith("v")]
12 | 
13 |     folders = sorted(folders, key=lambda f: tuple([int(x) for x in f.name[1:].split(".")]), reverse=True)
14 | 
15 |     entries.append({"name": f"v{current_version} (latest)", "version": current_version, "url": f"{site_url}/latest/"})
16 |     for folder in folders:
17 |         version = folder.name[1:]  # strip leading 'v'
18 |         entry = {"name": f"v{version}", "version": version, "url": f"{site_url}/v{version}/"}
19 | 
20 |         entries.append(entry)
21 | 
22 |     # Save version.json
23 |     with open(version_json_path, "w") as f:
24 |         json.dump(entries, f, indent=4)
25 | 
26 | 
if __name__ == "__main__":
    # Expect exactly two arguments: the current version and the gh-pages checkout path.
    if len(sys.argv) != 3:
        # The usage text names this script (update_version_json.py).
        print("Usage: python update_version_json.py <current_version> <gh_pages_path>")
        sys.exit(1)

    current_version = sys.argv[1]
    gh_pages_path = sys.argv[2]
    rebuild_version_json(current_version, gh_pages_path)
35 | 


--------------------------------------------------------------------------------
/docs/source/_templates/module.rst:
--------------------------------------------------------------------------------
 1 | {{ fullname | escape | underline}}
 2 | 
 3 | .. automodule:: {{ fullname }}
 4 | 
 5 |    {% block attributes %}
 6 |    {% if attributes %}
 7 |    .. rubric:: {{ _('Module Attributes') }}
 8 | 
 9 |    .. autosummary::
10 |       :toctree:
11 |    {% for item in attributes %}
12 |       {{ item }}
13 |    {%- endfor %}
14 |    {% endif %}
15 |    {% endblock %}
16 | 
17 |    {% block functions %}
18 |    {% if functions %}
19 |    .. rubric:: {{ _('Functions') }}
20 | 
21 |    .. autosummary::
22 |       :toctree:
23 |    {% for item in functions %}
24 |       {{ item }}
25 |    {%- endfor %}
26 |    {% endif %}
27 |    {% endblock %}
28 | 
29 |    {% block classes %}
30 |    {% if classes %}
31 |    .. rubric:: {{ _('Classes') }}
32 | 
33 |    .. autosummary::
34 |       :toctree:
35 |       :template: class.rst
36 |    {% for item in classes %}
37 |       {{ item }}
38 |    {%- endfor %}
39 |    {% endif %}
40 |    {% endblock %}
41 | 
42 |    {% block exceptions %}
43 |    {% if exceptions %}
44 |    .. rubric:: {{ _('Exceptions') }}
45 | 
46 |    .. autosummary::
47 |       :toctree:
48 |    {% for item in exceptions %}
49 |       {{ item }}
50 |    {%- endfor %}
51 |    {% endif %}
52 |    {% endblock %}
53 | 
54 | {% block modules %}
55 | {% if modules %}
56 | .. rubric:: Modules
57 | 
58 | .. autosummary::
59 |    :toctree:
60 |    :template: module.rst
61 |    :recursive:
62 | {% for item in modules %}
63 |    {{ item }}
64 | {%- endfor %}
65 | {% endif %}
66 | {% endblock %}


--------------------------------------------------------------------------------
/tests/data/scorers/blackbox_results_file.json:
--------------------------------------------------------------------------------
1 | {"data": {"responses": ["30", "8", "17", "$5", "11"], "sampled_responses": [["30", "30", "30", "30", "30"], ["8", "8 emails", "8", "8", "8"], ["17", "17 marbles", "17", "17", "17"], ["$5", "$5", "$5", "$5", "$5"], ["11", "11", "11", "11", "11"]], "prompts": ["When you solve this math problem only return the answer with no additional text.\n7 red peaches, 15 yellow peaches and 8 green peaches are in the basket. How many peaches are in the basket?", "When you solve this math problem only return the answer with no additional text.\nJack received 3 emails and 64 letters in the morning. He then received 5 emails and 54 letters in the afternoon. How many emails did jack receive in the day?", "When you solve this math problem only return the answer with no additional text.\nEd had 2 more marbles than Doug. Doug lost some of his marbles at the playground. Now Ed has 19 more marbles than doug. How many marbles did Doug lose?", "When you solve this math problem only return the answer with no additional text.\nDan has $ 3 left with him after he bought a candy bar for $ 2. How much money did he have initially?", "When you solve this math problem only return the answer with no additional text.\nJake has 13 fewer peaches and 3 more apples than Steven. Steven has 9 peaches and 8 apples. How many apples does Jake have?"], "exact_match": [1.0, 0.8, 0.8, 1.0, 1.0], "semantic_negentropy": [1.0, 1.0, 1.0, 1.0, 1.0], "noncontradiction": [1.0, 0.9909547328948974, 0.9893265008926392, 1.0, 1.0]}, "metadata": {"temperature": 1.0, "sampling_temperature": 1.0, "num_responses": 5, "scorers": ["noncontradiction", "exact_match", "semantic_negentropy"]}}


--------------------------------------------------------------------------------
/uqlm/utils/results.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from typing import Dict, Any
16 | import pandas as pd
17 | 
18 | 
class UQResult:
    """
    Class that characterizes result of an UncertaintyQuantifier.

    Attributes
    ----------
    data
        Value stored under ``"data"`` in the result dictionary (``None`` if absent).

    metadata
        Value stored under ``"metadata"`` in the result dictionary (``None`` if absent).

    result_dict : dict
        The result dictionary exactly as it was passed in.
    """

    def __init__(self, result: Dict[str, Any]) -> None:
        """
        Parameters
        ----------
        result: dict
            A dictionary that is defined during `evaluate` or `tune_params` method
        """
        self.result_dict = result
        self.data = result.get("data")
        self.metadata = result.get("metadata")

    def to_dict(self) -> Dict[str, Any]:
        """
        Return the result as the dictionary it was created from.
        """
        return self.result_dict

    def to_df(self) -> pd.DataFrame:
        """
        Return the ``"data"`` part of the result as a pd.DataFrame.

        Column names ending in "s" lose that final "s" (e.g. "responses"
        becomes "response"), except for "sampled_responses" and
        "raw_sampled_responses", which are kept unchanged.
        """
        keep_as_is = ["sampled_responses", "raw_sampled_responses"]
        singular_names = {}
        for column in self.result_dict["data"].keys():
            if column.endswith("s") and column not in keep_as_is:
                singular_names[column] = column[:-1]

        frame = pd.DataFrame(self.result_dict["data"])
        return frame.rename(columns=singular_names)
46 | 


--------------------------------------------------------------------------------
/tests/data/scorers/whitebox_results_file.json:
--------------------------------------------------------------------------------
1 | {"data": {"prompts": ["When you solve this math problem only return the answer with no additional text.\n7 red peaches, 15 yellow peaches and 8 green peaches are in the basket. How many peaches are in the basket?", "When you solve this math problem only return the answer with no additional text.\nJack received 3 emails and 64 letters in the morning. He then received 5 emails and 54 letters in the afternoon. How many emails did jack receive in the day?", "When you solve this math problem only return the answer with no additional text.\nEd had 2 more marbles than Doug. Doug lost some of his marbles at the playground. Now Ed has 19 more marbles than doug. How many marbles did Doug lose?", "When you solve this math problem only return the answer with no additional text.\nDan has $ 3 left with him after he bought a candy bar for $ 2. How much money did he have initially?", "When you solve this math problem only return the answer with no additional text.\nJake has 13 fewer peaches and 3 more apples than Steven. Steven has 9 peaches and 8 apples. How many apples does Jake have?"], "responses": ["30", "8", "17", "$5", "11"], "logprobs": [[{"token": "30", "bytes": [51, 48], "logprob": -9.0883464e-07, "top_logprobs": []}], [{"token": "8", "bytes": [56], "logprob": -1.2664457e-06, "top_logprobs": []}], [{"token": "17", "bytes": [49, 55], "logprob": -0.0007860411, "top_logprobs": []}], [{"token": "$", "bytes": [36], "logprob": -0.038273167, "top_logprobs": []}, {"token": "5", "bytes": [53], "logprob": -0.00026145502, "top_logprobs": []}], [{"token": "11", "bytes": [49, 49], "logprob": -2.5226382e-05, "top_logprobs": []}]], "normalized_probability": [0.999999091165773, 0.9999987335551019, 0.9992142677493774, 0.9809171172485425, 0.9999747739361825], "min_probability": [0.999999091165773, 0.9999987335551019, 0.9992142677493774, 0.9624499954009256, 0.9999747739361825]}, "metadata": {"temperature": 1.0}}


--------------------------------------------------------------------------------
/uqlm/white_box/single_logprobs.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | import numpy as np
17 | from typing import List, Dict, Any
18 | from uqlm.white_box.baseclass.logprobs_scorer import LogprobsScorer
19 | 
20 | 
21 | SINGLE_LOGPROBS_SCORER_NAMES = ["normalized_probability", "min_probability", "sequence_probability"]
22 | 
23 | 
class SingleLogprobsScorer(LogprobsScorer):
    """Scorer that derives white-box UQ scores from the logprobs of a single generation."""

    def __init__(self, scorers: List[str] = SINGLE_LOGPROBS_SCORER_NAMES):
        """
        Class for computing WhiteBox UQ scores with a single generation

        Parameters
        ----------
        scorers : list of str, default=SINGLE_LOGPROBS_SCORER_NAMES
            Names of the scores that `evaluate` should return.
        """
        super().__init__()
        # Store a copy: the default is the shared module-level list, so keeping a
        # reference would let a mutation of `self.scorers` change the default for
        # every other instance (and for the caller's own list).
        self.scorers = list(scorers)

    def evaluate(self, logprobs_results: List[List[Dict[str, Any]]]) -> Dict[str, List[float]]:
        """
        Compute scores from logprobs results

        Parameters
        ----------
        logprobs_results : list of list of dict
            One list of per-token logprob dictionaries per response.

        Returns
        -------
        dict
            Maps each name in `self.scorers` to its list of scores. A name that is
            not one of the three supported scorers raises a KeyError.
        """
        # NOTE(review): `_norm_prob`, `_seq_prob` and `_compute_single_generation_scores`
        # are not defined in this class; presumably inherited from LogprobsScorer - confirm.
        score_functions = {"normalized_probability": self._norm_prob, "min_probability": self._min_prob, "sequence_probability": self._seq_prob}
        # Only the requested scorers are computed; the others are never evaluated.
        return {name: self._compute_single_generation_scores(logprobs_results, score_functions[name]) for name in self.scorers}

    def _min_prob(self, single_response_logprobs: List[Dict[str, Any]]) -> float:
        """Compute minimum token probability"""
        probs = self.extract_probs(single_response_logprobs)
        return np.min(probs)
39 | 


--------------------------------------------------------------------------------
/uqlm/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | from uqlm.utils.plots import plot_model_accuracies, plot_filtered_accuracy, plot_ranked_auc
17 | from uqlm.utils.dataloader import load_dataset, load_example_dataset
18 | from uqlm.utils.postprocessors import math_postprocessor
19 | from uqlm.utils.response_generator import ResponseGenerator
20 | from uqlm.utils.results import UQResult
21 | from uqlm.utils.tuner import Tuner
22 | from uqlm.utils.grader import LLMGrader
23 | from uqlm.utils.llm_config import save_llm_config, load_llm_config
24 | from uqlm.utils.display import ConditionalBarColumn, ConditionalTimeElapsedColumn, ConditionalTextColumn, ConditionalSpinnerColumn
25 | from uqlm.utils.warn import beta_warning, deprecation_warning
26 | from uqlm.utils.device import get_best_device
27 | 
# Public API of uqlm.utils; each name is listed exactly once, in import order.
__all__ = [
    "plot_model_accuracies",
    "plot_filtered_accuracy",
    "plot_ranked_auc",
    "load_dataset",
    "load_example_dataset",
    "math_postprocessor",
    "ResponseGenerator",
    "UQResult",
    "Tuner",
    "LLMGrader",
    "save_llm_config",
    "load_llm_config",
    "ConditionalBarColumn",
    "ConditionalTimeElapsedColumn",
    "ConditionalTextColumn",
    "ConditionalSpinnerColumn",
    "beta_warning",
    "deprecation_warning",
    "get_best_device",
]
50 | 


--------------------------------------------------------------------------------
/uqlm/utils/display.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | from rich.progress import SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn
17 | 
# Task descriptions treated as header rows: every Conditional* column in this
# module renders an empty string for a task whose description is in this list.
HEADERS = ["🤖 Generation", "📈 Scoring", "⚙️ Optimization", "🤖🧮 Generation with Logprobs", "", "  - [black]Grading responses against provided ground truth answers with default grader..."]
# Task descriptions for which ConditionalTextColumn renders only a percentage.
OPTIMIZATION_TASKS = ["  - [black]Optimizing weights...", "  - [black]Jointly optimizing weights and threshold using grid search...", "  - [black]Optimizing weights using grid search...", "  - [black]Optimizing threshold with grid search..."]
20 | 
21 | 
class ConditionalBarColumn(BarColumn):
    """Bar column that renders nothing for header tasks."""

    def render(self, task):
        """Return "" when the task description is in HEADERS, else the regular bar."""
        is_header = task.description in HEADERS
        return "" if is_header else super().render(task)
27 | 
28 | 
class ConditionalTimeElapsedColumn(TimeElapsedColumn):
    """Elapsed-time column that renders nothing for header tasks."""

    def render(self, task):
        """Return "" when the task description is in HEADERS, else the elapsed time."""
        is_header = task.description in HEADERS
        return "" if is_header else super().render(task)
34 | 
35 | 
class ConditionalTextColumn(TextColumn):
    """Text column with special rendering for header and optimization tasks."""

    def render(self, task):
        """
        Return "" for header tasks, a bare percentage for optimization tasks,
        and the regular text rendering for everything else.
        """
        description = task.description
        if description in HEADERS:
            return ""
        if description in OPTIMIZATION_TASKS:
            return f"[progress.percentage]{task.percentage:>3.0f}%"
        return super().render(task)
43 | 
44 | 
class ConditionalSpinnerColumn(SpinnerColumn):
    """Spinner column that renders nothing for header tasks."""

    def render(self, task):
        """Return "" when the task description is in HEADERS, else the spinner."""
        is_header = task.description in HEADERS
        return "" if is_header else super().render(task)
50 | 


--------------------------------------------------------------------------------
/tests/data/scorers/generate_data_semanticentropy.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | import os
17 | import json
18 | from dotenv import load_dotenv, find_dotenv
19 | from langchain_openai import AzureChatOpenAI
20 | 
21 | from uqlm.utils import load_example_dataset
22 | from uqlm.scorers import SemanticEntropy
23 | 
24 | 
async def main():
    """Generate SemanticEntropy results for five "gsm8k" example prompts and dump them to JSON."""
    # User to populate .env file with API credentials
    load_dotenv(find_dotenv())

    llm = AzureChatOpenAI(
        deployment_name=os.getenv("DEPLOYMENT_NAME"),
        openai_api_key=os.getenv("API_KEY"),
        azure_endpoint=os.getenv("API_BASE"),
        openai_api_type=os.getenv("API_TYPE"),
        openai_api_version=os.getenv("API_VERSION"),
        temperature=1,  # User to set temperature
    )

    # gsm8k example dataset (last five rows) to be used as a prod dataset
    # NOTE(review): the column rename mirrors the svamp scripts; presumably it is a
    # no-op if gsm8k already exposes "question"/"answer" - confirm.
    column_map = {"question_concat": "question", "Answer": "answer"}
    gsm8k = load_example_dataset("gsm8k").rename(columns=column_map)
    gsm8k = gsm8k[["question", "answer"]].tail(5)

    # Define prompts
    MATH_INSTRUCTION = "Solve the math problem, but return only the numerical answer.\n"
    prompts = [MATH_INSTRUCTION + question for question in gsm8k.question]

    se = SemanticEntropy(llm=llm, use_best=False)
    results = await se.generate_and_score(prompts=prompts)

    with open("semanticentropy_results_file.json", "w") as f:
        json.dump(results.to_dict(), f)
58 | 
59 | 
if __name__ == "__main__":
    # main() is a coroutine function: calling it bare only creates a coroutine
    # object that is never executed, so nothing would be generated or written.
    # Drive it with an event loop instead.
    import asyncio

    asyncio.run(main())
62 | 


--------------------------------------------------------------------------------
/tests/data/scorers/generate_data_whitebox.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | import os
17 | import json
18 | from dotenv import load_dotenv, find_dotenv
19 | 
20 | from uqlm.utils.dataloader import load_example_dataset
21 | from uqlm.scorers import WhiteBoxUQ
22 | from langchain_openai import AzureChatOpenAI
23 | 
24 | 
async def main():
    """Generate WhiteBoxUQ results for five "svamp" example prompts and dump them to JSON."""
    # svamp dataset to be used as a prod dataset
    column_map = {"question_concat": "question", "Answer": "answer"}
    svamp = load_example_dataset("svamp").rename(columns=column_map)
    svamp = svamp[["question", "answer"]].tail(5)

    # Define prompts
    MATH_INSTRUCTION = "When you solve this math problem only return the answer with no additional text.\n"
    prompts = [MATH_INSTRUCTION + question for question in svamp.question]

    # User to populate .env file with API credentials
    load_dotenv(find_dotenv())

    # This will be our main LLM for generation
    gpt = AzureChatOpenAI(
        deployment_name=os.getenv("DEPLOYMENT_NAME"),
        openai_api_key=os.getenv("API_KEY"),
        azure_endpoint=os.getenv("API_BASE"),
        openai_api_type=os.getenv("API_TYPE"),
        openai_api_version=os.getenv("API_VERSION"),
        temperature=1,  # User to set temperature
    )

    wbuq = WhiteBoxUQ(llm=gpt)
    results = await wbuq.generate_and_score(prompts=prompts)

    with open("whitebox_results_file.json", "w") as f:
        json.dump(results.to_dict(), f)
59 | 
60 | 
if __name__ == "__main__":
    # main() is a coroutine function: calling it bare only creates a coroutine
    # object that is never executed, so nothing would be generated or written.
    # Drive it with an event loop instead.
    import asyncio

    asyncio.run(main())
63 | 


--------------------------------------------------------------------------------
/tests/data/scorers/generate_data_blackbox.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | import os
17 | import json
18 | from dotenv import load_dotenv, find_dotenv
19 | 
20 | from uqlm.utils.dataloader import load_example_dataset
21 | from uqlm.scorers import BlackBoxUQ
22 | from langchain_openai import AzureChatOpenAI
23 | 
24 | 
async def main():
    """Generate BlackBoxUQ results for five "svamp" example prompts and dump them to JSON."""
    # svamp dataset to be used as a prod dataset
    column_map = {"question_concat": "question", "Answer": "answer"}
    svamp = load_example_dataset("svamp").rename(columns=column_map)
    svamp = svamp[["question", "answer"]].tail(5)

    # Define prompts
    MATH_INSTRUCTION = "When you solve this math problem only return the answer with no additional text.\n"
    prompts = [MATH_INSTRUCTION + question for question in svamp.question]

    # User to populate .env file with API credentials
    load_dotenv(find_dotenv())

    # This will be our main LLM for generation
    gpt = AzureChatOpenAI(
        deployment_name=os.getenv("DEPLOYMENT_NAME"),
        openai_api_key=os.getenv("API_KEY"),
        azure_endpoint=os.getenv("API_BASE"),
        openai_api_type=os.getenv("API_TYPE"),
        openai_api_version=os.getenv("API_VERSION"),
        temperature=1,  # User to set temperature
    )

    selected_scorers = ["noncontradiction", "exact_match", "semantic_negentropy"]
    bbuq = BlackBoxUQ(llm=gpt, scorers=selected_scorers)
    results = await bbuq.generate_and_score(prompts=prompts, num_responses=5)

    with open("blackbox_results_file.json", "w") as f:
        json.dump(results.to_dict(), f)
59 | 
60 | 
if __name__ == "__main__":
    # main() is a coroutine function: calling it bare only creates a coroutine
    # object that is never executed, so nothing would be generated or written.
    # Drive it with an event loop instead.
    import asyncio

    asyncio.run(main())
63 | 


--------------------------------------------------------------------------------
/tests/data/scorers/semanticentropy_results_file.json:
--------------------------------------------------------------------------------
1 | {"data": {"responses": ["5", "$3", "12", "308", "35"], "entropy_values": [0.0, 1.3296613488547582, 0.6365141682948128, 0.45056120886630463, 0.8675632284814612], "confidence_scores": [1.0, 0.25790187148969435, 0.644754678724236, 0.7485370014199393, 0.5158037429793888], "sampled_responses": [["5", "5", "5", "5", "5 miles"], ["$9", "$6", "Josh makes 12 bracelets. His cost for supplies is:\n12 bracelets * $1/bracelet = $12\n\nHe sells each bracelet for $1.50, so his revenue from selling 12 bracelets is:\n12 bracelets * $1.50/bracelet = $18\n\nHis profit, therefore, is:\n$18 - $12 = $6\n\nAfter buying the cookies, he still has $3, so the cost of the cookies is:\n$6 - $3 = $3\n\nTherefore, the cost of the box of cookies is $3.", "$6", "$9"], ["12", "36", "12", "12", "36"], ["308", "308", "308", "315", "308"], ["32.5", "32", "32.5", "30.", "32"]], "prompts": ["Solve the math problem, but return only the numerical answer.\nVery early this morning, Elise left home in a cab headed for the hospital. Fortunately, the roads were clear, and the cab company only charged her a base price of $3, and $4 for every mile she traveled. If Elise paid a total of $23, how far is the hospital from her house?", "Solve the math problem, but return only the numerical answer.\nJosh is saving up for a box of cookies. To raise the money, he is going to make bracelets and sell them. It costs $1 for supplies for each bracelet and he sells each one for $1.5. If he makes 12 bracelets and after buying the cookies still has $3, how much did the box of cookies cost?", "Solve the math problem, but return only the numerical answer.\nColin can skip at six times the speed that Brandon can.  Brandon can skip at one-third the speed that Tony can.  And Tony can skip at twice the speed that Bruce can.  
At what speed, in miles per hour, can Colin skip if Bruce skips at 1 mile per hour?", "Solve the math problem, but return only the numerical answer.\nJanet, a third grade teacher, is picking up the sack lunch order from a local deli for the field trip she is taking her class on. There are 35 children in her class, 5 volunteer chaperones, and herself. She she also ordered three additional sack lunches, just in case there was a problem. Each sack lunch costs $7. How much do all the lunches cost in total?", "Solve the math problem, but return only the numerical answer.\nAt 30, Anika is 4/3 the age of Maddie. What would be their average age in 15 years?"]}, "metadata": {"parameters": {"temperature": 1.0, "sampling_temperature": 1.0, "num_responses": 5}}}


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | .DS_Store
  2 | 
  3 | # Byte-compiled / optimized / DLL files
  4 | __pycache__/
  5 | *.py[cod]
  6 | *$py.class
  7 | 
  8 | # C extensions
  9 | *.so
 10 | 
 11 | # Distribution / packaging
 12 | .Python
 13 | build/
 14 | develop-eggs/
 15 | dist/
 16 | downloads/
 17 | eggs/
 18 | .eggs/
 19 | lib/
 20 | lib64/
 21 | parts/
 22 | sdist/
 23 | var/
 24 | wheels/
 25 | pip-wheel-metadata/
 26 | share/python-wheels/
 27 | *.egg-info/
 28 | .installed.cfg
 29 | *.egg
 30 | MANIFEST
 31 | 
 32 | # PyInstaller
 33 | #  Usually these files are written by a python script from a template
 34 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 35 | *.manifest
 36 | *.spec
 37 | 
 38 | # Installer logs
 39 | pip-log.txt
 40 | pip-delete-this-directory.txt
 41 | 
 42 | # Unit test / coverage reports
 43 | htmlcov/
 44 | .tox/
 45 | .nox/
 46 | .coverage
 47 | .coverage.*
 48 | .cache
 49 | nosetests.xml
 50 | coverage.xml
 51 | *.cover
 52 | *.py,cover
 53 | .hypothesis/
 54 | .pytest_cache/
 55 | cov.xml
 56 | cov-term-missing.txt
 57 | 
 58 | # Translations
 59 | *.mo
 60 | *.pot
 61 | 
 62 | # Django stuff:
 63 | *.log
 64 | local_settings.py
 65 | db.sqlite3
 66 | db.sqlite3-journal
 67 | 
 68 | # Flask stuff:
 69 | instance/
 70 | .webassets-cache
 71 | 
 72 | # Scrapy stuff:
 73 | .scrapy
 74 | 
 75 | # Sphinx documentation
 76 | docs/_build/
 77 | docs/
 78 | docs_srcs/
 79 | 
 80 | # PyBuilder
 81 | target/
 82 | 
 83 | # Jupyter Notebook
 84 | .ipynb_checkpoints
 85 | 
 86 | # IPython
 87 | profile_default/
 88 | ipython_config.py
 89 | 
 90 | # pyenv
 91 | .python-version
 92 | 
 93 | # pipenv
 94 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 95 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 96 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 97 | #   install all needed dependencies.
 98 | #Pipfile.lock
 99 | 
100 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
101 | __pypackages__/
102 | 
103 | # Celery stuff
104 | celerybeat-schedule
105 | celerybeat.pid
106 | 
107 | # SageMath parsed files
108 | *.sage.py
109 | 
110 | # Environments
111 | .env
112 | .venv
113 | env/
114 | venv/
115 | ENV/
116 | env.bak/
117 | venv.bak/
118 | 
119 | # Spyder project settings
120 | .spyderproject
121 | .spyproject
122 | 
123 | # Rope project settings
124 | .ropeproject
125 | 
126 | # mkdocs documentation
127 | /site
128 | 
129 | # mypy
130 | .mypy_cache/
131 | .dmypy.json
132 | dmypy.json
133 | 
134 | # Pyre type checker
135 | .pyre/
136 | 
137 | # macos
138 | *.DS_Store
139 | 
140 | # download data
141 | **/BLEURT-20/
142 | 
143 | # for dev
144 | /experiments
145 | 
146 | 
147 | .vscode/
148 | .settings/


--------------------------------------------------------------------------------
/uqlm/black_box/match.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | import numpy as np
17 | from typing import List, Optional
18 | from rich.progress import Progress
19 | 
20 | from uqlm.black_box.baseclass.similarity_scorer import SimilarityScorer
21 | 
22 | import time
23 | 
24 | 
class MatchScorer(SimilarityScorer):
    def __init__(self) -> None:
        """
        Class for computing exact match rates between original responses and candidates. This
        method is based on Cole et al.(2023) :footcite:`cole2023selectivelyansweringambiguousquestions`.
        """
        pass

    def evaluate(self, responses: List[str], sampled_responses: List[List[str]], progress_bar: Optional[Progress] = None) -> List[float]:
        """
        This method computes exact match rates for the provided pairs of texts.

        Parameters
        ----------
        responses : list of strings
            Original LLM response

        sampled_responses : list of list of strings
            Candidate responses to be compared to the original response

        progress_bar : rich.progress.Progress, default=None
            If provided, displays a progress bar while scoring responses

        Returns
        -------
        List of float
            Exact match rates
        """
        progress_task = None
        if progress_bar:
            progress_task = progress_bar.add_task("  - Scoring responses with exact match...", total=len(responses))
        results = []
        for response, candidates in zip(responses, sampled_responses):
            results.append(self._compute_score(response=response, candidates=candidates))
            if progress_bar:
                progress_bar.update(progress_task, advance=1)
        if progress_bar:
            # Brief pause after the last update, presumably so the progress display
            # can show the finished bar. Skipped when there is no bar to show.
            time.sleep(0.1)
        return results

    @staticmethod
    def _compute_score(response: str, candidates: List[str]) -> float:
        """
        Get mean exact match rate between response and set of candidates

        Returns the fraction of `candidates` that are string-equal to `response`.
        An empty `candidates` list yields NaN (mean of an empty list).
        """
        return np.mean([1 if response == c else 0 for c in candidates])
68 | 


--------------------------------------------------------------------------------
/tests/test_similarity.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import json
16 | import numpy as np
17 | from uqlm.black_box import BertScorer, CosineScorer, MatchScorer
18 | from uqlm.black_box.baseclass.similarity_scorer import SimilarityScorer
19 | 
# Stored fixture with the inputs and the expected scorer outputs for this module.
# The path is relative to the current working directory.
datafile_path = "tests/data/similarity/similarity_results_file.json"
with open(datafile_path, "r") as f:
    data = json.load(f)

responses, sampled_responses = data["responses"], data["sampled_responses"]
26 | 
27 | 
def test_bert():
    """BertScorer output on the fixture must match the stored scores within 1e-5."""
    scorer = BertScorer(device="cpu")
    actual = scorer.evaluate(responses=responses, sampled_responses=sampled_responses)
    assert all(abs(score - data["bert_result"][idx]) < 1e-5 for idx, score in enumerate(actual))
32 | 
33 | 
def test_cosine(monkeypatch):
    """CosineScorer output, with the encoder mocked by stored embeddings, must match the stored scores within 1e-5."""
    # Work on copies: mock_encode consumes these lists with pop(0). Popping the
    # lists held inside the module-level `data` would leave the shared fixture
    # empty for any rerun or other test in this module.
    embeddings1, embeddings2 = list(data["embeddings1"]), list(data["embeddings2"])

    cosine = CosineScorer()

    # Mock return from  ('SentenceTransformer.encode' method)
    def mock_encode(*args, **kwargs):
        # Hand out the next stored embedding, taking from whichever list is
        # currently at least as long as the other (embeddings1 wins ties).
        if len(embeddings1) >= len(embeddings2):
            return np.array(embeddings1.pop(0))
        return np.array(embeddings2.pop(0))

    monkeypatch.setattr(cosine.model, "encode", mock_encode)

    cosine_result = cosine.evaluate(responses=responses, sampled_responses=sampled_responses)
    assert all([abs(cosine_result[i] - data["cosine_result"][i]) < 1e-5 for i in range(len(cosine_result))])
49 | 
50 | 
def test_exact_match():
    """MatchScorer output on the fixture must match the stored scores within 1e-5."""
    scorer = MatchScorer()
    actual = scorer.evaluate(responses=responses, sampled_responses=sampled_responses)
    assert all(abs(score - data["match_result"][idx]) < 1e-5 for idx, score in enumerate(actual))
55 | 
56 | 
def test_abstract_base_class():
    """Test to cover abstract base class"""

    class _MinimalScorer(SimilarityScorer):
        """Smallest concrete subclass; it calls through to the base-class methods."""

        def __init__(self):
            super().__init__()

        def evaluate(self, responses, sampled_responses):
            # Invoke the base implementation so that its body is executed too
            super().evaluate(responses, sampled_responses)
            return [1.0]

    outcome = _MinimalScorer().evaluate(["test"], ["sample"])
    assert outcome == [1.0]
71 | 


--------------------------------------------------------------------------------
/tests/test_grader.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import pytest
16 | from unittest.mock import AsyncMock, MagicMock
17 | from uqlm.utils.grader import LLMGrader
18 | 
19 | 
@pytest.mark.asyncio
async def test_grade_responses():
    """Test the grade_responses method"""
    # Stub generator whose canned replies stand in for the grader LLM's output
    generator_stub = AsyncMock()
    generator_stub.generate_responses.return_value = {"data": {"response": ["yes", "no", "yes"]}}
    llm_stub = MagicMock()
    llm_stub.response_generator = generator_stub

    grader = LLMGrader(llm=llm_stub)
    grader.response_generator = generator_stub

    questions = ["What is 2+2?", "What is the capital of France?", "What is 5*5?"]
    proposed = ["4", "Berlin", "25"]
    answer_key = [["4"], ["Paris"], ["25"]]

    grades = await grader.grade_responses(questions, proposed, answer_key)
    assert grades == [True, False, True]
37 | 
38 | 
def test_extract_grades():
    """Test the _extract_grades method"""
    # (raw grader text, expected boolean grade)
    expectations = [("yes", True), ("no", False), ("YES", True), ("NO", False), ("maybe", False)]
    for raw_text, expected in expectations:
        assert LLMGrader._extract_grades(raw_text) is expected
46 | 
47 | 
def test_construct_grader_prompt():
    """Test the _construct_grader_prompt method"""
    prompt = "What is 2+2?"
    response = "4"
    acceptable_answers = ["4", "four"]

    # The literal below is whitespace-sensitive: only the leading and trailing
    # whitespace is stripped before comparison, so the interior indentation and
    # blank lines must match the constructed prompt exactly.
    expected_prompt = """
        Your task is to grade the following proposed answer against the provided answer key. The ground truth is the gold standard regardless of any other information you may have. Return ONLY the word "yes" or "no", with no additional text, based on whether the proposed answer aligns with any of the ground truth answers. Answer "yes" if correct, "no" if incorrect.

        **Question:**
        What is 2+2?

        **Ground Truth Answers (Answer Key):**
        ['4', 'four']

        **Proposed Answer to Grade:**
        4

        Now your answer is (yes or no):
        """
    result = LLMGrader._construct_grader_prompt(prompt, response, acceptable_answers)
    assert result.strip() == expected_prompt.strip()
70 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing to UQLM
 2 | 
 3 | Welcome and thank you for considering contributing to UQLM!
 4 | 
It takes a lot of time and effort to use software, much less build upon it, so we deeply appreciate your desire to help make this project thrive.
 6 | 
 7 | ## Table of Contents
 8 | 
 9 | 1. [How to Contribute](#how-to-contribute)
10 |     - [Reporting Bugs](#reporting-bugs)
11 |     - [Suggesting Enhancements](#suggesting-enhancements)
12 |     - [Pull Requests](#pull-requests)
13 | 2. [Development Setup](#development-setup)
14 | 3. [Style Guides](#style-guides)
    - [Code Style](#code-style)
4. [License](#license)
16 | 
17 | ## How to Contribute
18 | 
19 | ### Reporting Bugs
20 | 
21 | If you find a bug, please report it by opening an issue on GitHub. Include as much detail as possible:
22 | - Steps to reproduce the bug.
23 | - Expected and actual behavior.
24 | - Screenshots if applicable.
25 | - Any other information that might help us understand the problem.
26 | 
27 | ### Suggesting Enhancements
28 | 
29 | We welcome suggestions for new features or improvements. To suggest an enhancement, please open an issue on GitHub and include:
30 | - A clear description of the suggested enhancement.
31 | - Why you believe this enhancement would be useful.
32 | - Any relevant examples or mockups.
33 | 
34 | ### Pull Requests
35 | 
36 | 1. Fork the repository.
37 | 2. Create a new branch (`git checkout -b feature/your-feature-name`).
38 | 3. Make your changes.
39 | 4. Commit your changes (`git commit -m 'Add some feature'`).
40 | 5. Push to the branch (`git push origin feature/your-feature-name`).
41 | 6. Open a pull request.
42 | 
43 | Please ensure your pull request adheres to the following guidelines:
44 | - Follow the project's code style.
45 | - Include tests for any new features or bug fixes.
46 | 
47 | ## Development Setup
48 | 
49 | 1. Clone the repository: `git clone https://github.com/cvs-health/uqlm.git`
50 | 2. Navigate to the project directory: `cd uqlm`
51 | 3. Create and activate a virtual environment (using `venv` or `conda`)
52 | 4. Install poetry (if you don't already have it): `pip install poetry`
53 | 5. Install uqlm with dev dependencies: `poetry install --with dev`
54 | 6. Install our pre-commit hooks to ensure code style compliance: `pre-commit install`
55 | 7. Run tests to ensure everything is working: `pre-commit run --all-files`
56 | 
57 | You're ready to develop!
58 | 
59 | ## Style Guides
60 | 
61 | ### Code Style
62 | 
63 | - We use [Ruff](https://github.com/astral-sh/ruff) to lint and format our files.
64 | - Our pre-commit hook will run Ruff linting and formatting when you commit.
65 | - You can manually run Ruff at any time (see [Ruff usage](https://github.com/astral-sh/ruff#usage)).
66 | 
67 | Please ensure your code is properly formatted and linted before committing.
68 | 
69 | ## License
70 | 
Before contributing to this CVS Health-sponsored project, you will need to sign the associated [Contributor License Agreement (CLA)](https://forms.office.com/r/iFZWwzjt9C).
72 | 
73 | ---
74 | 
75 | Thanks again for using and supporting uqlm!


--------------------------------------------------------------------------------
/tests/test_semanticentropy.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import pytest
16 | import json
17 | from uqlm.scorers import SemanticEntropy
18 | from langchain_openai import AzureChatOpenAI
19 | 
20 | datafile_path = "tests/data/scorers/semanticentropy_results_file.json"
21 | with open(datafile_path, "r") as f:
22 |     expected_result = json.load(f)
23 | 
24 | data = expected_result["data"]
25 | metadata = expected_result["metadata"]
26 | 
27 | mock_object = AzureChatOpenAI(deployment_name="YOUR-DEPLOYMENT", temperature=1, api_key="SECRET_API_KEY", api_version="2024-05-01-preview", azure_endpoint="https://mocked.endpoint.com")
28 | 
29 | 
30 | @pytest.mark.flaky(reruns=3)
31 | @pytest.mark.asyncio
32 | async def test_semanticentropy(monkeypatch):
33 |     PROMPTS = data["prompts"]
34 |     MOCKED_RESPONSES = data["responses"]
35 |     MOCKED_SAMPLED_RESPONSES = data["sampled_responses"]
36 | 
37 |     # Initiate SemanticEntropy class object
38 |     se_object = SemanticEntropy(llm=mock_object, use_best=False, device="cpu")
39 | 
40 |     async def mock_generate_original_responses(*args, **kwargs):
41 |         se_object.logprobs = [None] * 5
42 |         return MOCKED_RESPONSES
43 | 
44 |     async def mock_generate_candidate_responses(*args, **kwargs):
45 |         se_object.multiple_logprobs = [[None] * 5] * 5
46 |         return MOCKED_SAMPLED_RESPONSES
47 | 
48 |     monkeypatch.setattr(se_object, "generate_original_responses", mock_generate_original_responses)
49 |     monkeypatch.setattr(se_object, "generate_candidate_responses", mock_generate_candidate_responses)
50 | 
51 |     for show_progress_bars in [False, True]:
52 |         se_results = await se_object.generate_and_score(prompts=PROMPTS, show_progress_bars=show_progress_bars)
53 |         se_object.logprobs = None
54 |         se_results = se_object.score(responses=MOCKED_RESPONSES, sampled_responses=MOCKED_SAMPLED_RESPONSES)
55 |         assert se_results.data["responses"] == data["responses"]
56 |         assert se_results.data["sampled_responses"] == data["sampled_responses"]
57 |         assert se_results.data["prompts"] == data["prompts"]
58 |         assert all([abs(se_results.data["discrete_entropy_values"][i] - data["entropy_values"][i]) < 1e-5 for i in range(len(PROMPTS))])
59 |         assert all([abs(se_results.data["discrete_confidence_scores"][i] - data["confidence_scores"][i]) < 1e-5 for i in range(len(PROMPTS))])
60 |         assert se_results.metadata == metadata
61 | 


--------------------------------------------------------------------------------
/tests/data/similarity/generate_data_similarity.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import os
16 | import json
17 | import asyncio
18 | 
19 | from uqlm.black_box import BertScorer, BLEURTScorer, CosineScorer, MatchScorer
20 | 
21 | 
22 | async def main():
23 |     # Load data
24 |     current_directory = os.getcwd()
25 |     datafile_path = os.path.join("/".join(current_directory.split("/")[:-1]), "scorers/bsdetector_results_file.json")
26 |     with open(datafile_path, "r") as f:
27 |         data = json.load(f)
28 | 
29 |     responses = data["responses"]
30 |     sampled_responses = data["sampled_responses"]
31 | 
32 |     store_results = dict()
33 |     store_results.update({"responses": responses, "sampled_responses": sampled_responses})
34 | 
35 |     # 1. Bert Scorer
36 |     bert = BertScorer()
37 |     bert_result = bert.evaluate(responses=responses, sampled_responses=sampled_responses)
38 | 
39 |     store_results.update(
40 |         {
41 |             "bert_result": bert_result
42 |             # 'F1': F1
43 |         }
44 |     )
45 | 
46 |     # 2. Bleurt Scorer
47 |     bluert = BLEURTScorer()
48 |     bluert_result = bluert.evaluate(responses=responses, sampled_responses=sampled_responses)
49 |     bluert_scorer_result = []
50 |     for i in range(len(responses)):
51 |         bluert_scorer_result.append(bluert.bleurt_scorer.score(references=[responses[i]] * len(sampled_responses[i]), candidates=sampled_responses[i]))
52 | 
53 |     store_results.update({"bluert_result": bluert_result, "bluert_score": bluert_scorer_result})
54 | 
55 |     # 3. Cosine Similarity Scorer
56 |     cosine = CosineScorer()
57 |     cosine_result = cosine.evaluate(responses=responses, sampled_responses=sampled_responses)
58 |     embeddings1, embeddings2 = [], []
59 |     for i in range(len(responses)):
60 |         embeddings1.append(cosine.model.encode([responses[i]] * len(sampled_responses[i])).tolist())
61 |         embeddings2.append(cosine.model.encode(sampled_responses[i]).tolist())
62 | 
63 |     store_results.update({"cosine_result": cosine_result, "embeddings1": embeddings1, "embeddings2": embeddings2})
64 | 
65 |     # 4. Exact Match scorer
66 |     match = MatchScorer()
67 |     match_result = match.evaluate(responses=responses, sampled_responses=sampled_responses)
68 | 
69 |     store_results.update({"match_result": match_result})
70 | 
71 |     # Store results
72 |     results_file = "similarity_results_file.json"
73 |     with open(results_file, "w") as f:
74 |         json.dump(store_results, f)
75 | 
76 | 
77 | if __name__ == "__main__":
78 |     asyncio.run(main())
79 | 


--------------------------------------------------------------------------------
/tests/data/scorers/generate_data_llmjudge.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import os
16 | import asyncio
17 | import json
18 | from dotenv import load_dotenv, find_dotenv
19 | from uqlm.judges import LLMJudge
20 | from uqlm.utils import ResponseGenerator
21 | from langchain_openai import AzureChatOpenAI
22 | 
23 | 
24 | async def main():
25 |     # This notebook generate results based on these input & using "exai-gpt-35-turbo-16k" model
26 |     prompts = ["Which part of the human body produces insulin?", "What color are the two stars on the national flag of Syria", "How many 'm's are there in the word strawberry"]
27 | 
28 |     # User to populate .env file with API credentials
29 |     load_dotenv(find_dotenv())
30 | 
31 |     API_KEY = os.getenv("API_KEY")
32 |     API_BASE = os.getenv("API_BASE")
33 |     API_TYPE = os.getenv("API_TYPE")
34 |     API_VERSION = os.getenv("API_VERSION")
35 |     DEPLOYMENT_NAME = os.getenv("DEPLOYMENT_NAME")
36 | 
37 |     original_llm = AzureChatOpenAI(
38 |         deployment_name=DEPLOYMENT_NAME,
39 |         openai_api_key=API_KEY,
40 |         azure_endpoint=API_BASE,
41 |         openai_api_type=API_TYPE,
42 |         openai_api_version=API_VERSION,
43 |         temperature=1,  # User to set temperature
44 |     )
45 | 
46 |     rg = ResponseGenerator(llm=original_llm, max_calls_per_min=250)
47 |     generations = await rg.generate_responses(prompts=prompts, count=1)
48 |     responses = generations["data"]["response"]
49 | 
50 |     judge = LLMJudge(llm=original_llm, max_calls_per_min=250)
51 | 
52 |     # Generate data for all templates
53 |     templates = ["true_false_uncertain", "true_false", "continuous", "likert"]
54 |     # Structure: one file with all template data
55 |     all_results = {
56 |         "prompts": prompts,
57 |         "responses": responses,
58 |         "templates": {},  # This will hold data for each template
59 |     }
60 |     for template in templates:
61 |         judge = LLMJudge(llm=original_llm, max_calls_per_min=250, scoring_template=template)
62 |         judge_result = await judge.judge_responses(prompts=prompts, responses=responses)
63 |         extract_answer = judge._extract_answers(responses=judge_result["judge_responses"])
64 |         # Store results for this template
65 |         all_results["templates"][template] = {"judge_result": judge_result, "extract_answer": extract_answer}
66 |     # Save single comprehensive file
67 |     results_file = "llmjudge_results_file.json"
68 |     with open(results_file, "w") as f:
69 |         json.dump(all_results, f)
70 | 
71 | 
72 | if __name__ == "__main__":
73 |     asyncio.run(main())
74 | 


--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | #
 3 | # For the full list of built-in configuration values, see the documentation:
 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
 5 | 
 6 | # -- Path setup --------------------------------------------------------------
 7 | import os
 8 | import sys
 9 | import importlib.metadata
10 | sys.path.insert(0, os.path.abspath('../../../uqlm'))  # resolved against the current working directory; NOTE(review): from docs/source this is the repository root only if the checkout directory is named "uqlm" - confirm
11 | 
12 | # -- Project information -----------------------------------------------------
13 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
14 | 
15 | project = 'uqlm'
16 | copyright = '2025, CVS Health'
17 | author = 'Dylan Bouchard, Mohit Singh Chauhan'
18 | release = '0.1'  # also passed to the version switcher as "version_match" (see html_theme_options)
19 | # version = importlib.metadata.version("uqlm")
20 | # release = ".".join(version.rsplit(".")[:-1])
21 | 
22 | # -- General configuration ---------------------------------------------------
23 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
24 | 
25 | extensions = [
26 |     'sphinx.ext.autodoc', # Core library for html generation from docstrings
27 |     "sphinx_autodoc_typehints", # Automatically document type hints
28 |     'sphinx.ext.autosummary',  # Create neat summary tables
29 |     'sphinx.ext.napoleon', # NumPy and Google style docstrings parsing
30 |     "sphinx.ext.duration", # build duration
31 |     "sphinx.ext.doctest", # Test snippets in the documentation
32 |     "sphinxcontrib.bibtex", # Bibliographic references
33 |     "sphinx_favicon", # Add favicon
34 |     "nbsphinx", # Render Jupyter notebooks (needs pandoc; on macOS: brew install pandoc)
35 | ]
36 | nbsphinx_execute="never"  # notebooks are never executed during the docs build
37 | 
38 | bibtex_bibfiles = ["refs.bib"]
39 | 
40 | autosummary_generate = True
41 | 
42 | templates_path = ['_templates']
43 | 
44 | html_static_path = ['_static']
45 | 
46 | html_css_files = ['custom.css']
47 | 
48 | exclude_patterns = []
49 | 
50 | # -- Options for HTML output -------------------------------------------------
51 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
52 | 
53 | favicons = [
54 |     {
55 |         "rel": "icon",
56 |         "sizes": "16x16",
57 |         "href": "images/favicon/favicon-16x16.png",
58 |         "type": "image/png"
59 |     },
60 |     {
61 |         "rel": "icon",
62 |         "sizes": "32x32",
63 |         "href": "images/favicon/favicon-32x32.png",
64 |         "type": "image/png"
65 |     },
66 |     {
67 |         "rel": "apple-touch-icon",
68 |         "sizes": "180x180",
69 |         "href": "images/favicon/apple-touch-icon.png",
70 |         "type": "image/png"
71 |     },
72 | ]
73 | 
74 | html_theme = 'pydata_sphinx_theme'
75 | 
76 | html_favicon = '_static/images/favicon/favicon.ico'
77 | 
78 | html_theme_options = {
79 |     "github_url": "https://github.com/cvs-health/uqlm",
80 |     "navbar_align": "left",
81 |     "navbar_end": ["version-switcher", "theme-switcher", "navbar-icon-links"],
82 |     "switcher": {
83 |         "json_url": "https://cvs-health.github.io/uqlm/versions.json",
84 |         "version_match": release,
85 |     },
86 |     "logo": {
87 |         "image_light": "_static/images/horizontal_logo.png",
88 |         "image_dark": "_static/images/horizontal_logo_no_bg.png",
89 |     },
90 | }
91 | 
92 | source_suffix = [".rst"]
93 | 


--------------------------------------------------------------------------------
/tests/test_blackboxuq.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import pytest
16 | import json
17 | from uqlm.scorers import BlackBoxUQ
18 | from uqlm.scorers.baseclass.uncertainty import DEFAULT_BLACK_BOX_SCORERS
19 | from langchain_openai import AzureChatOpenAI
20 | 
21 | datafile_path = "tests/data/scorers/blackbox_results_file.json"
22 | with open(datafile_path, "r") as f:
23 |     expected_result = json.load(f)
24 | 
25 | data = expected_result["data"]
26 | metadata = expected_result["metadata"]
27 | 
28 | PROMPTS = data["prompts"]
29 | MOCKED_RESPONSES = data["responses"]
30 | MOCKED_SAMPLED_RESPONSES = data["sampled_responses"]
31 | 
32 | 
33 | @pytest.fixture
34 | def mock_llm():
35 |     """Define mock LLM object using pytest.fixture."""
36 |     return AzureChatOpenAI(deployment_name="YOUR-DEPLOYMENT", temperature=1, api_key="SECRET_API_KEY", api_version="2024-05-01-preview", azure_endpoint="https://mocked.endpoint.com")
37 | 
38 | 
39 | @pytest.mark.flaky(reruns=3)
40 | @pytest.mark.asyncio
41 | async def test_bbuq(monkeypatch, mock_llm):
42 |     uqe = BlackBoxUQ(llm=mock_llm, scorers=["noncontradiction", "exact_match", "semantic_negentropy"], device="cpu")
43 | 
44 |     async def mock_generate_original_responses(*args, **kwargs):
45 |         uqe.logprobs = [None] * 5
46 |         return MOCKED_RESPONSES
47 | 
48 |     async def mock_generate_candidate_responses(*args, **kwargs):
49 |         uqe.multiple_logprobs = [[None] * 5] * 5
50 |         return MOCKED_SAMPLED_RESPONSES
51 | 
52 |     monkeypatch.setattr(uqe, "generate_original_responses", mock_generate_original_responses)
53 |     monkeypatch.setattr(uqe, "generate_candidate_responses", mock_generate_candidate_responses)
54 |     for show_progress_bars in [False, True]:
55 |         results = await uqe.generate_and_score(prompts=PROMPTS, num_responses=5, show_progress_bars=show_progress_bars)
56 | 
57 |         assert all([results.data["exact_match"][i] == pytest.approx(data["exact_match"][i]) for i in range(len(PROMPTS))])
58 | 
59 |         assert all([results.data["noncontradiction"][i] == pytest.approx(data["noncontradiction"][i]) for i in range(len(PROMPTS))])
60 | 
61 |         assert all([results.data["semantic_negentropy"][i] == pytest.approx(data["semantic_negentropy"][i]) for i in range(len(PROMPTS))])
62 | 
63 |         assert results.metadata == metadata
64 | 
65 |     # Test invalid scorer
66 |     with pytest.raises(ValueError):
67 |         BlackBoxUQ(llm=mock_llm, scorers=["invalid_scorer"], device="cpu")
68 | 
69 |     # Test default scorers
70 |     uqe_default = BlackBoxUQ(llm=mock_llm, scorers=None, device="cpu")
71 |     assert len(uqe_default.scorers) == len(DEFAULT_BLACK_BOX_SCORERS)
72 | 
73 |     BlackBoxUQ(llm=mock_llm, scorers=["bert_score"], device="cpu")
74 | 


--------------------------------------------------------------------------------
/docs/source/_static/custom.css:
--------------------------------------------------------------------------------
  1 | /* Custom styles. NOTE(review): the .wy-* selectors below look like sphinx_rtd_theme class names - confirm they still apply to the configured theme */
  2 | .wy-side-nav-search {
  3 |     background-color: #2980B9;
  4 |  }
  5 |  .wy-nav-content {
  6 |     max-width: 1200px;
  7 |  }
  8 |  .highlight {
  9 |     background: #f8f9fa;
 10 |  }
 11 |  /* Custom admonitions */
 12 |  .admonition.note {
 13 |     background: #e7f2fa;
 14 |  }
 15 |  .admonition.warning {
 16 |     background: #fff3cd;
 17 |  }
 18 |  /* Custom link colors */
 19 |  a {
 20 |     color: #2980B9;
 21 |  }
 22 |  a:hover {
 23 |     color: #3091d1;
 24 |  }
 25 |  
 26 |  /* Custom styles for the gallery: thumbnails are laid out in a three-column grid */
 27 |  div.sphx-glr-thumbnails {
 28 |    display: grid;
 29 |    grid-template-columns: repeat(3, minmax(0, 1fr));
 30 |    gap: 20px;
 31 |    padding: 20px;
 32 |  }
 33 |  
 34 |  
 35 |  div.sphx-glr-thumbcontainer:hover {
 36 |    border: 1px solid #0066cc;
 37 |    box-shadow: 0 0 15px rgba(0,0,0,0.1);
 38 |  }
 39 |  
 40 |  .sphx-glr-footer {
 41 |    text-align: center;
 42 |    margin: 2em 0;
 43 |  }
 44 |  .sphx-glr-download {
 45 |    margin: 1em 0;
 46 |  }
 47 |  
 48 |  /* NOTE(review): the hover border colour is already set by the div.sphx-glr-thumbcontainer:hover rule above */
 49 |  .sphx-glr-thumbcontainer:hover {
 50 |    border-color: #0066cc;
 51 |  }
 52 |  
 53 |  /* Make container relative for absolute positioning of link */
 54 |  div.sphx-glr-thumbcontainer {
 55 |    position: relative;
 56 |    border: solid #ccc 1px;
 57 |    border-radius: 4px;
 58 |    overflow: hidden;
 59 |    background: #212529;
 60 |    display: flex;
 61 |    flex-direction: column;  /* Stack children vertically */
 62 |  }
 63 |  
 64 |  /* Make the link cover the entire container */
 65 |  div.sphx-glr-thumbcontainer a {
 66 |    position: absolute;
 67 |    top: 0;
 68 |    left: 0;
 69 |    width: 100%;
 70 |    height: 100%;
 71 |    z-index: 1;
 72 |  }
 73 |  /* Size the thumbnail image and place it first in the flex column */
 74 |  div.sphx-glr-thumbcontainer img {
 75 |    width: 100%;
 76 |    height: 150px;
 77 |    object-fit: contain;
 78 |    padding: 5px;
 79 |    order: 0;  /* This makes the image appear first */
 80 |  }
 81 |  
 82 |  /* Title/caption styling */
 83 |  
 84 |  div.sphx-glr-thumbnail-title {
 85 |    text-align: center;
 86 |    color: #4FB6D6;
 87 |    padding: 8px 5px;
 88 |    margin: 0;
 89 |    font-size: 0.9em;
 90 |    background: #212529;
 91 |    order: 1;  /* This makes the title appear after the image */
 92 |  }
 93 |  
 94 |  /* Hide doc captions ONLY in the gallery thumbnails */
 95 |  .sphx-glr-thumbcontainer .docutils.container p,
 96 |  .sphx-glr-thumbcontainer span.doc,
 97 |  .sphx-glr-thumbcontainer .docutils.container .caption-text {
 98 |     display: none !important;
 99 |  }
100 |  /* Keep doc visible everywhere else */
101 |  .docutils.container p,
102 |  span.doc {
103 |     display: inline-block;  /* Default display for docs outside gallery */
104 |  }
105 | /* Images with this class shrink to fit their container, keep their aspect ratio and are centred */
106 | .responsive-img {
107 |     max-width: 100%;
108 |     height: auto;
109 |     display: block;
110 |     margin: 0 auto;
111 | }
112 | 
113 | /* Hide References section when it's empty or is the only child of its parent */
114 | .references-section:empty,
115 | .references-section:only-child {
116 |     display: none;
117 | }
118 | 
119 | /* Add some styling to make the References section less prominent */
120 | .references-section {
121 |     margin-top: 2em;
122 |     padding-top: 1em;
123 |     border-top: 1px solid #eee;
124 | }
125 | 
126 | /* Make the References heading less prominent */
127 | .references-section .rubric {
128 |     font-size: 1.2em;
129 |     color: #666;
130 | }


--------------------------------------------------------------------------------
/uqlm/black_box/bert.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | import numpy as np
17 | from typing import Any, List, Optional
18 | from bert_score import BERTScorer
19 | import torch
20 | 
21 | from uqlm.black_box.baseclass.similarity_scorer import SimilarityScorer
22 | from uqlm.utils.device import get_best_device
23 | 
24 | import time
25 | from rich.progress import Progress
26 | 
27 | 
28 | class BertScorer(SimilarityScorer):
29 |     def __init__(self, device: Any = None) -> None:
30 |         """
31 |         Class for computing BERTScore values between original responses and candidates. For more on
32 |         BERTScore, refer to Zhang et al.(2020) :footcite:`zhang2020bertscoreevaluatingtextgeneration`.
33 | 
34 |         Parameters
35 |         ----------
36 |         device : torch.device input or torch.device object, default=None
37 |             Specifies the device that classifiers use for prediction. Set to "cuda" for classifiers to be able to
38 |             leverage the GPU.
39 |         """
40 |         # Handle device detection
41 |         if device is None:
42 |             device = get_best_device()
43 |         elif isinstance(device, str):
44 |             device = torch.device(device)
45 | 
46 |         from transformers import logging
47 | 
48 |         logging.set_verbosity_error()
49 |         self.bert_scorer = BERTScorer(device=device, lang="en")
50 | 
51 |     def evaluate(self, responses: List[str], sampled_responses: List[List[str]], progress_bar: Optional[Progress] = None) -> List[float]:
52 |         """
53 |         This method computes model-based text similarity metrics values for the provided pairs of texts.
54 | 
55 |         Parameters
56 |         ----------
57 |         responses : list of strings
58 |             Original LLM response
59 | 
60 |         sampled_responses : list of list of strings
61 |             Candidate responses to be compared to the original response
62 | 
63 |         progress_bar : rich.progress.Progress, default=None
64 |             If provided, displays a progress bar while scoring responses
65 | 
66 |         Returns
67 |         -------
68 |         List of float
69 |             Mean BertScore values
70 |         """
71 |         if progress_bar:
72 |             progress_task = progress_bar.add_task("  - Scoring responses with BERTScore...", total=len(responses))
73 |         results = []
74 |         for i in range(len(responses)):
75 |             score = self._compute_score(response=responses[i], candidates=sampled_responses[i])
76 |             results.append(score)
77 |             if progress_bar:
78 |                 progress_bar.update(progress_task, advance=1)
79 |         time.sleep(0.1)
80 |         return results
81 | 
82 |     def _compute_score(self, response: str, candidates: List[str]) -> float:
83 |         """Compute mean BERTScore between a response and candidate responses"""
84 |         num_responses = len(candidates)
85 |         duplicated_response = [response] * num_responses
86 |         P, R, F1 = self.bert_scorer.score(list(duplicated_response), refs=list(candidates))
87 |         return np.mean([float(f) for f in F1])
88 | 


--------------------------------------------------------------------------------
/tests/test_nli.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import gc
16 | import pytest
17 | from uqlm.nli.nli import NLI
18 | 
19 | 
20 | @pytest.fixture
21 | def text1():  # first text passed to NLI.predict in the tests below
22 |     return "Question: What is captial of France, Answer: Paris"
23 | 
24 | 
25 | @pytest.fixture
26 | def text2():  # second text passed to NLI.predict in the tests below
27 |     return "Question: What is captial of France, Answer: Capital of France is Paris city."
28 | 
29 | 
30 | @pytest.fixture
31 | def nli_model():  # NLI instance on CPU, with verbose left at its default
32 |     return NLI(device="cpu")
33 | 
34 | 
35 | @pytest.fixture
36 | def nli_model_cpu():  # NLI instance on CPU with verbose=True; used by the truncation-warning test
37 |     return NLI(verbose=True, device="cpu")
38 | 
39 | 
40 | @pytest.mark.flaky(reruns=3)
41 | def test_nli(text1, text2, nli_model):
42 |     probabilities = nli_model.predict(text1, text2)
43 |     del nli_model
44 |     gc.collect()
45 |     assert abs(float(probabilities[0][0]) - 0.00159405) < 1e-5
46 | 
47 | 
48 | # @pytest.mark.flaky(reruns=3)
49 | # def test_nli2(text1, nli_model_cpu):
50 | #     result = nli_model_cpu._observed_consistency_i(original=text1, candidates=[text1] * 5, use_best=False, compute_entropy=False)
51 | #     assert result["nli_score_i"] == 1
52 | #     assert result["discrete_semantic_entropy"] is None
53 | #     assert result["tokenprob_semantic_entropy"] is None
54 | 
55 | 
56 | @pytest.mark.flaky(reruns=3)
57 | def test_nli3(text1, text2, nli_model_cpu):
58 |     expected_warning = "Maximum response length exceeded for NLI comparison. Truncation will occur. To adjust, change the value of max_length"
59 | 
60 |     with pytest.warns(UserWarning, match=expected_warning):
61 |         nli_model_cpu.predict(text1 * 50, text2)
62 |     del nli_model_cpu
63 |     gc.collect()
64 | 
65 | 
66 | # @pytest.mark.flaky(reruns=3)
67 | # def test_nli4(nli_model_cpu):
68 | #     text1 = "Capital of France is Paris"
69 | #     text2 = " Paris is the capital of France"
70 | #     text3 = "Rome is the capital of Italy"
71 | #     logprobs_results = [
72 | #         [{"token": "Capital", "logprob": 0.6}, {"token": "of", "logprob": 0.5}, {"token": "France", "logprob": 0.3}, {"token": "is", "logprob": 0.3}, {"token": "Paris", "logprob": 0.3}],
73 | #         [{"token": "Paris", "logprob": 0.75}, {"token": "is", "logprob": 0.8}, {"token": "the", "logprob": 0.9}, {"token": "capital", "logprob": 0.6}, {"token": "of", "logprob": 0.6}, {"token": "France", "logprob": 0.6}],
74 | #         [{"token": "Rome", "logprob": 0.75}, {"token": "is", "logprob": 0.8}, {"token": "the", "logprob": 0.9}, {"token": "capital", "logprob": 0.6}, {"token": "of", "logprob": 0.6}, {"token": "Italy", "logprob": 0.6}],
75 | #     ]
76 | #     best_response, semantic_negentropy, nli_scores, tokenprob_semantic_entropy = nli_model_cpu._semantic_entropy_process(candidates=[text1, text2, text3], i=1, logprobs_results=logprobs_results)
77 | 
78 | #     assert best_response == text2
79 | #     assert pytest.approx(semantic_negentropy, abs=1e-5) == 0.6365141682948128
80 | #     assert pytest.approx(list(nli_scores.values()), abs=1e-5) == [0.9997053, 0.9997053, 0.24012965, 0.24012965]
81 | #     assert pytest.approx(tokenprob_semantic_entropy, abs=1e-5) == 0.6918935849478249
82 | #     del nli_model_cpu
83 | #     gc.collect()
84 | 


--------------------------------------------------------------------------------
/uqlm/utils/llm_config.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import json
16 | from importlib import import_module
17 | from typing import Any, Dict
18 | from langchain_core.language_models.chat_models import BaseChatModel
19 | 
20 | 
21 | def _is_serializable(value: Any) -> bool:
22 |     """Check if a value is JSON serializable."""
23 |     try:
24 |         json.dumps(value)
25 |         return True
26 |     except (TypeError, ValueError):
27 |         return False
28 | 
29 | 
def save_llm_config(llm: BaseChatModel) -> Dict[str, Any]:
    """
    Extract and save LLM configuration by capturing all available parameters.

    Every public, non-callable attribute of ``llm`` whose value is not None and
    is JSON serializable is copied into the result, except for the internal and
    endpoint attribute names listed below.

    Parameters
    ----------
    llm : BaseChatModel
        The LLM instance to extract config from

    Returns
    -------
    dict
        Dictionary containing LLM configuration. Always holds "class_name" and
        "module" (taken from ``llm.__class__``) plus one entry per saved attribute.
    """
    config = {"class_name": llm.__class__.__name__, "module": llm.__class__.__module__}

    # Internal LangChain attributes that shouldn't be passed to constructors
    internal_attrs = {"config_specs", "lc_attributes", "lc_secrets", "model_computed_fields", "model_config", "model_kwargs", "disabled_params", "include_response_headers", "stream_usage", "validate_base_url", "disable_streaming"}

    # Endpoint and URL attributes that should not be saved (will be loaded from environment)
    endpoint_attrs = {"base_url", "endpoint", "azure_endpoint", "openai_api_base", "api_base", "api_url", "url"}

    excluded_attrs = internal_attrs | endpoint_attrs

    # Save all attributes that are serializable and not None
    for attr_name in dir(llm):
        # Filter on the name first so that private and excluded attributes are never
        # read at all; reading some of them may raise or emit warnings.
        if attr_name.startswith("_") or attr_name in excluded_attrs:
            continue

        try:
            # Single guarded read: an attribute that cannot be accessed is skipped
            # instead of aborting the whole save.
            value = getattr(llm, attr_name)
            if value is None or callable(value):
                continue
            if _is_serializable(value):
                config[attr_name] = value
        except (AttributeError, TypeError):
            # Skip attributes that can't be accessed or would cause warnings
            continue

    return config
67 | 
68 | 
def load_llm_config(llm_config: Dict[str, Any]) -> BaseChatModel:
    """
    Rebuild an LLM instance from a configuration dictionary.

    The class is located via the "module" and "class_name" entries; every other
    entry is forwarded to the class constructor as a keyword argument.

    Parameters
    ----------
    llm_config : dict
        Dictionary containing LLM configuration

    Returns
    -------
    BaseChatModel
        Recreated LLM instance

    Raises
    ------
    ValueError
        If anything fails while importing the class or constructing the instance
    """
    reserved_keys = ("class_name", "module")
    try:
        # Resolve the class object from its module path and name
        llm_class = getattr(import_module(llm_config["module"]), llm_config["class_name"])

        # Everything that is not class info becomes a constructor argument
        constructor_kwargs = {}
        for key, value in llm_config.items():
            if key not in reserved_keys:
                constructor_kwargs[key] = value

        return llm_class(**constructor_kwargs)
    except Exception as e:
        raise ValueError(f"Could not recreate LLM from config: {e}") from e
95 | 


--------------------------------------------------------------------------------
/.github/workflows/documentation.yaml:
--------------------------------------------------------------------------------
  1 | # .github/workflows/docs.yml
  2 | 
  3 | name: Build & Deploy Sphinx Docs
  4 | 
  5 | on:
  6 |   push:
  7 |     tags:
  8 |       - 'v*'
  9 | 
 10 | permissions:
 11 |   contents: write 
 12 | 
 13 | jobs:
 14 |   docs:
 15 |     runs-on: ubuntu-latest
 16 |     env:
 17 |       PANDOC_VERSION: ${{ vars.PANDOC_VERSION }}
 18 | 
 19 |     steps:
 20 |       - name: Checkout repository
 21 |         uses: actions/checkout@v4
 22 | 
 23 |       - name: Get tag name
 24 |         id: get_tag
 25 |         run: echo "tag=${GITHUB_REF_NAME}" >> $GITHUB_OUTPUT
 26 | 
 27 |       - name: Show tag
 28 |         run: |
 29 |           echo "Tag: ${{ steps.get_tag.outputs.tag }}"
 30 | 
 31 |       - name: Extract version without "v"
 32 |         id: version
 33 |         run: |
 34 |           RAW_TAG="${GITHUB_REF_NAME}"     
 35 |           VERSION="${RAW_TAG#v}"
 36 |           VERSION="${VERSION%.*}"
 37 |           echo "clean_version=$VERSION" >> $GITHUB_OUTPUT
 38 |           echo $clean_version
 39 | 
 40 |       - name: Update conf.py release version
 41 |         run: |
 42 |           sed -i "s/^release = .*/release = '${{ steps.version.outputs.clean_version }}'/" docs/source/conf.py
 43 |           head -n 20 docs/source/conf.py
 44 | 
 45 |       - name: Set up Python
 46 |         uses: actions/setup-python@v5
 47 |         with:
 48 |           python-version: 3.12
 49 | 
 50 |       - name: Install Poetry
 51 |         run: |
 52 |           pip install poetry
 53 | 
 54 |       - name: Download and install Pandoc
 55 |         run: |
 56 |           FILE="pandoc-${PANDOC_VERSION}-1-amd64.deb"
 57 |           URL="https://github.com/jgm/pandoc/releases/download/${PANDOC_VERSION}/${FILE}"
 58 | 
 59 |           echo "Downloading $FILE..."
 60 |           curl -L -o pandoc.deb "$URL"
 61 | 
 62 |           echo "Installing Pandoc..."
 63 |           sudo dpkg -i pandoc.deb
 64 | 
 65 |       - name: Verify Pandoc version
 66 |         run: pandoc --version
 67 | 
 68 |       - name: Install dependencies
 69 |         run: |
 70 |           poetry lock
 71 |           poetry install --with docs
 72 |           eval $(poetry env activate)
 73 | 
 74 |       - name: Checkout gh-pages branch to get versions.json
 75 |         uses: actions/checkout@v4
 76 |         with:
 77 |           ref: gh-pages
 78 |           path: gh-pages           
 79 | 
 80 |       - name: Update version.json
 81 |         run: |
 82 |           VERSION=${{ steps.version.outputs.clean_version }}
 83 |           python .github/workflows/update_version_json.py "$VERSION" "gh-pages"
 84 |           cat gh-pages/versions.json
 85 |           mkdir docsVersion
 86 |           cp gh-pages/versions.json docsVersion/versions.json           
 87 |       
 88 |       - name: Build Sphinx docs
 89 |         run: |
 90 |           eval $(poetry env activate)
 91 |           make -C docs clean
 92 |           make -C docs html
 93 | 
 94 |       - name: Deploy to GitHub Pages
 95 |         uses: peaceiris/actions-gh-pages@v4
 96 |         with:
 97 |           github_token: ${{ secrets.GITHUB_TOKEN }}
 98 |           publish_dir: ./docsVersion
 99 |           keep_files: true
100 |       
101 |       - name: Deploy to GitHub Pages
102 |         uses: peaceiris/actions-gh-pages@v4
103 |         with:
104 |           github_token: ${{ secrets.GITHUB_TOKEN }}
105 |           publish_dir: ./docs/build/html
106 |           destination_dir: v${{ steps.version.outputs.clean_version }}
107 |           keep_files: true
108 | 
109 |       - name: Deploy to GitHub Pages
110 |         uses: peaceiris/actions-gh-pages@v4
111 |         with:
112 |           github_token: ${{ secrets.GITHUB_TOKEN }}
113 |           publish_dir: ./docs/build/html
114 |           destination_dir: latest
115 |           keep_files: true
116 | 


--------------------------------------------------------------------------------
/uqlm/white_box/baseclass/logprobs_scorer.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | from abc import ABC
17 | import numpy as np
18 | from typing import List, Dict, Any, Optional, Callable
19 | 
20 | 
class LogprobsScorer(ABC):
    """
    Base class with shared helpers for scorers that work on token log-probabilities.

    A response is handled as a list of per-token dicts. The helpers read the
    "logprob" key of each dict and, for top-k data, the "top_logprobs" key
    (a list of dicts that each carry their own "logprob").
    """

    def __init__(self):
        pass

    def _norm_prob(self, single_response_logprobs: List[Dict[str, Any]]) -> float:
        """Compute length-normalized sequence probability (exp of the mean token logprob)"""
        logprobs = self.extract_logprobs(single_response_logprobs)
        return np.exp(np.mean(logprobs))

    def _seq_prob(self, single_response_logprobs: List[Dict[str, Any]]) -> float:
        """Compute sequence probability (product of the token probabilities)"""
        probs = self.extract_probs(single_response_logprobs)
        return np.prod(probs)

    def _entropy_from_logprobs(self, logprobs_list: np.ndarray) -> float:
        """Compute entropy from list of logprobs"""
        probs_list = np.exp(logprobs_list)
        return self._entropy_from_probs(probs_list)

    def extract_probs(self, single_response_logprobs: List[Dict[str, Any]]) -> np.ndarray:
        """Extract probabilities from token data"""
        return np.exp(self.extract_logprobs(single_response_logprobs))

    @staticmethod
    def _compute_single_generation_scores(logprobs_results: List[List[Dict[str, Any]]], score_fn: Callable) -> List[float]:
        """
        Apply ``score_fn`` to each response's token data.

        Responses whose token data is empty or None get ``np.nan`` instead of a score.
        """
        return [np.nan if not r else score_fn(r) for r in logprobs_results]

    @staticmethod
    def _entropy_from_probs(probs_list: np.ndarray, texts: Optional[List[str]] = None) -> float:
        """
        Compute entropy from a list of probabilities.

        Parameters
        ----------
        probs_list : np.ndarray
            Probabilities; they are rescaled to sum to 1 before the entropy is computed

        texts : list of str, default=None
            If provided, probabilities of entries with the same text are summed
            so that duplicates count as one event

        Returns
        -------
        float
            Entropy (natural log) of the normalized distribution. Events with
            probability exactly 0 contribute 0.
        """
        normalized_probs = probs_list / np.sum(probs_list)  # normalize probabilities to sum to 1

        if texts is None:
            # Case 1: If no responses are provided, treat all probabilities as distinct events
            event_probs = normalized_probs
        else:
            # Case 2: If responses, account for duplicates
            aggregated_probs = {}
            for text, prob in zip(texts, normalized_probs):
                aggregated_probs[text] = aggregated_probs.get(text, 0.0) + prob
            event_probs = np.array(list(aggregated_probs.values()))

        # Drop exact zeros: 0 * log(0) would evaluate to NaN and poison the sum,
        # while the limit p*log(p) -> 0 as p -> 0 means such events add nothing.
        # NaN entries (e.g. from an all-zero input) are kept, so they still propagate.
        event_probs = np.asarray(event_probs)
        nonzero_probs = event_probs[event_probs != 0]
        return -np.sum(nonzero_probs * np.log(nonzero_probs))

    @staticmethod
    def extract_top_logprobs(single_response_logprobs: List[Dict[str, Any]]) -> List[np.ndarray]:
        """Extract top log probabilities for each token (one array per token)"""
        return [np.array([item["logprob"] for item in d["top_logprobs"]]) for d in single_response_logprobs]

    @staticmethod
    def extract_logprobs(single_response_logprobs: List[Dict[str, Any]]) -> np.ndarray:
        """Extract log probabilities from token data"""
        return np.array([d["logprob"] for d in single_response_logprobs])
81 | 


--------------------------------------------------------------------------------
/uqlm/white_box/top_logprobs.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | import numpy as np
17 | from typing import List, Dict, Any
18 | from uqlm.white_box.baseclass.logprobs_scorer import LogprobsScorer
19 | 
20 | 
# Scorer names understood by TopLogprobsScorer.evaluate; also the default value of its `scorers` argument.
TOP_LOGPROBS_SCORER_NAMES = ["min_token_negentropy", "mean_token_negentropy", "probability_margin"]
22 | 
23 | 
class TopLogprobsScorer(LogprobsScorer):
    def __init__(self, scorers: List[str] = TOP_LOGPROBS_SCORER_NAMES):
        """
        Class for computing WhiteBox UQ scores with a single generation

        Parameters
        ----------
        scorers : list of str, default=TOP_LOGPROBS_SCORER_NAMES
            Names of the scores that ``evaluate`` should return
        """
        super().__init__()
        # Copy so that changes to self.scorers can never alter the caller's list
        # or the shared module-level default.
        self.scorers = list(scorers)

    def evaluate(self, logprobs_results: List[List[Dict[str, Any]]]) -> Dict[str, List[float]]:
        """
        Compute scores from top logprobs results

        Parameters
        ----------
        logprobs_results : list of list of dict
            Token data for each response; each token dict is read through
            ``extract_top_logprobs``

        Returns
        -------
        dict
            One list of scores (one entry per response) for each name in ``self.scorers``

        Raises
        ------
        KeyError
            If ``self.scorers`` contains a name that is not a known scorer
        """
        score_fns = {"mean_token_negentropy": self._mean_token_negentropy, "min_token_negentropy": self._min_token_negentropy, "probability_margin": self._probability_margin}
        # Only the requested scorers are run, so unrequested ones cost nothing
        # and cannot print their "not available" message.
        return {k: self._compute_single_generation_scores(logprobs_results, score_fns[k]) for k in self.scorers}

    def _compute_token_entropies(self, single_response_logprobs: List[Dict[str, Any]]) -> np.ndarray:
        """Compute entropy for each token in the sequence"""
        top_logprobs_list = self.extract_top_logprobs(single_response_logprobs)
        return np.array([self._entropy_from_logprobs(top_logprobs) for top_logprobs in top_logprobs_list])

    def _compute_token_negentropies(self, single_response_logprobs: List[Dict[str, Any]]) -> np.ndarray:
        """
        Compute negentropy for each token in the sequence

        Negentropy is 1 - H / log(k), where H is the token's entropy and k is the
        number of top logprobs available for that token.
        """
        entropies = self._compute_token_entropies(single_response_logprobs)
        top_logprobs_list = self.extract_top_logprobs(single_response_logprobs)
        k_values = np.array([len(top_logprobs) for top_logprobs in top_logprobs_list])
        # NOTE: for k == 1 this is log(1) == 0, so the division below is by zero
        max_entropies = np.log(k_values)
        negentropies = 1 - entropies / max_entropies
        return negentropies

    def _mean_token_negentropy(self, single_response_logprobs: List[Dict[str, Any]]) -> float:
        """Compute mean token negentropy across the sequence"""
        negentropies = self._compute_token_negentropies(single_response_logprobs)
        return np.mean(negentropies)

    def _min_token_negentropy(self, single_response_logprobs: List[Dict[str, Any]]) -> float:
        """Compute minimum token negentropy across the sequence"""
        negentropies = self._compute_token_negentropies(single_response_logprobs)
        return np.min(negentropies)

    def _probability_margin(self, single_response_logprobs: List[Dict[str, Any]]) -> float:
        """
        Compute mean probability margin (difference between top two probabilities)

        Returns ``np.nan`` (after printing a message) when a token has fewer than
        two top logprobs.
        """
        top_logprobs_list = self.extract_top_logprobs(single_response_logprobs)
        margins = []
        try:
            for top_logprobs in top_logprobs_list:
                probs = np.exp(top_logprobs)
                probs = np.sort(probs)[::-1]  # descending, so [0] and [1] are the two largest
                margin = probs[0] - probs[1]
                margins.append(margin)
            return np.mean(margins)
        except IndexError:
            print("top_logprobs were not available. Unable to compute associated scores.")
            return np.nan
73 | 


--------------------------------------------------------------------------------
/tests/test_top_logprobs.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import pytest
16 | import numpy as np
17 | from uqlm.white_box.top_logprobs import TopLogprobsScorer, TOP_LOGPROBS_SCORER_NAMES
18 | 
19 | 
@pytest.fixture
def mock_logprobs_results():
    """Two mock responses, each made of two tokens carrying three candidate logprobs."""
    first_response = [{"token": "a", "logprobs": [-0.1, -1.0, -2.0]}, {"token": "b", "logprobs": [-0.2, -0.5, -1.5]}]
    second_response = [{"token": "c", "logprobs": [-0.3, -0.7, -1.2]}, {"token": "d", "logprobs": [-0.4, -0.8, -1.0]}]
    return [first_response, second_response]
24 | 
25 | 
@pytest.fixture
def scorer():
    """Provide a TopLogprobsScorer constructed with its default arguments."""
    default_scorer = TopLogprobsScorer()
    return default_scorer
30 | 
31 | 
def test_evaluate(mock_logprobs_results, scorer, monkeypatch):
    """evaluate returns one score per response for every default scorer name."""

    def fake_extract(token_data):
        # The mock data stores candidates under "logprobs", so read them from there
        return [entry["logprobs"] for entry in token_data]

    def fake_entropy(_logprobs):
        # Fixed entropy keeps the expected values independent of the real computation
        return 0.5

    monkeypatch.setattr(scorer, "extract_top_logprobs", fake_extract)
    monkeypatch.setattr(scorer, "_entropy_from_logprobs", fake_entropy)

    scores = scorer.evaluate(mock_logprobs_results)

    # Every scorer name must be present
    assert set(scores.keys()) == set(TOP_LOGPROBS_SCORER_NAMES)

    # Each score list must have one entry per response
    expected_length = len(mock_logprobs_results)
    for name in scores:
        assert len(scores[name]) == expected_length
48 | 
49 | 
def test_mean_token_negentropy(mock_logprobs_results, scorer, monkeypatch):
    """_mean_token_negentropy returns a float in [0, 1] for the first mock response."""

    def fake_extract(token_data):
        return [entry["logprobs"] for entry in token_data]

    def fake_entropy(_logprobs):
        return 0.5

    monkeypatch.setattr(scorer, "extract_top_logprobs", fake_extract)
    monkeypatch.setattr(scorer, "_entropy_from_logprobs", fake_entropy)

    first_response = mock_logprobs_results[0]
    value = scorer._mean_token_negentropy(first_response)
    assert isinstance(value, float)
    assert 0.0 <= value <= 1.0
61 | 
62 | 
def test_min_token_negentropy(mock_logprobs_results, scorer, monkeypatch):
    """_min_token_negentropy returns a float in [0, 1] for the first mock response."""

    def fake_extract(token_data):
        return [entry["logprobs"] for entry in token_data]

    def fake_entropy(_logprobs):
        return 0.5

    monkeypatch.setattr(scorer, "extract_top_logprobs", fake_extract)
    monkeypatch.setattr(scorer, "_entropy_from_logprobs", fake_entropy)

    first_response = mock_logprobs_results[0]
    value = scorer._min_token_negentropy(first_response)
    assert isinstance(value, float)
    assert 0.0 <= value <= 1.0
74 | 
75 | 
def test_probability_margin(mock_logprobs_results, scorer, monkeypatch):
    """_probability_margin returns a float in [0, 1] for the first mock response."""

    def fake_extract(token_data):
        return [entry["logprobs"] for entry in token_data]

    monkeypatch.setattr(scorer, "extract_top_logprobs", fake_extract)

    first_response = mock_logprobs_results[0]
    margin = scorer._probability_margin(first_response)
    assert isinstance(margin, float)
    assert 0.0 <= margin <= 1.0
84 | 
85 | 
def test_probability_margin_with_empty_logprobs(scorer):
    """_probability_margin yields NaN when it is given no token data."""
    no_tokens = []
    margin = scorer._probability_margin(no_tokens)
    assert np.isnan(margin)
90 | 


--------------------------------------------------------------------------------
/uqlm/black_box/cosine.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | from typing import Any, List, Tuple, Optional
17 | 
18 | import numpy as np
19 | from numpy.linalg import norm
20 | import time
21 | from rich.progress import Progress
22 | 
23 | from uqlm.black_box.baseclass.similarity_scorer import SimilarityScorer
24 | 
25 | 
class CosineScorer(SimilarityScorer):
    def __init__(self, transformer: str = "all-MiniLM-L6-v2") -> None:
        """Compute cosine similarity between original and candidate responses.

        Parameters
        ----------
        transformer : str (HuggingFace sentence transformer), default='all-MiniLM-L6-v2'
            Specifies which huggingface sentence transformer to use when computing cosine distance. See
            https://huggingface.co/sentence-transformers?sort_models=likes#models
            for more information. The recommended sentence transformer is 'all-MiniLM-L6-v2'.
        """
        # Imported here rather than at module level, so the package is only loaded on construction
        from sentence_transformers import SentenceTransformer

        self.transformer = transformer
        self.model = SentenceTransformer(f"sentence-transformers/{transformer}")

    def evaluate(self, responses: List[str], sampled_responses: List[List[str]], progress_bar: Optional[Progress] = None) -> List[float]:
        """
        Score each original response against its own list of candidate responses.

        Parameters
        ----------
        responses : list of strings
            Original LLM response

        sampled_responses : list of list of strings
            Candidate responses to be compared to the original response

        progress_bar : rich.progress.Progress, default=None
            If provided, displays a progress bar while scoring responses

        Returns
        -------
        List of float
            Mean cosine similarity values
        """
        if progress_bar:
            progress_task = progress_bar.add_task("  - Scoring responses with cosine similarity...", total=len(responses))
        results = []
        for idx, response in enumerate(responses):
            results.append(self._compute_score(response=response, candidates=sampled_responses[idx]))
            if progress_bar:
                progress_bar.update(progress_task, advance=1)
        time.sleep(0.1)
        return results

    def _get_embeddings(self, texts1: List[str], texts2: List[str]) -> Tuple[Any, Any]:
        """
        Encode both lists of texts with the sentence transformer and return the two results
        """
        return self.model.encode(texts1), self.model.encode(texts2)

    def _compute_score(self, response: str, candidates: List[str]) -> float:
        """
        Mean of the rescaled cosine similarities between ``response`` and each candidate
        """
        # The response is repeated so that it is paired one-to-one with the candidates
        repeated_response = [response for _ in range(len(candidates))]
        response_vecs, candidate_vecs = self._get_embeddings(repeated_response, candidates)
        scaled_similarities = []
        for idx in range(len(response_vecs)):
            u, v = response_vecs[idx], candidate_vecs[idx]
            cosine = np.dot(u, v) / (norm(u) * norm(v))
            # Map cosine from [-1, 1] onto [0, 1]
            scaled_similarities.append(0.5 + cosine / 2)
        return np.mean(scaled_similarities)
93 | 


--------------------------------------------------------------------------------
/tests/data/scorers/generate_data_ensemble.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2025 CVS Health and/or one of its affiliates
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | import asyncio
 16 | import os
 17 | import json
 18 | from dotenv import load_dotenv, find_dotenv
 19 | 
 20 | from uqlm.utils.dataloader import load_example_dataset
 21 | from uqlm.scorers import UQEnsemble
 22 | from langchain_openai import AzureChatOpenAI
 23 | 
 24 | 
 25 | async def main():
 26 |     # svamp dataset to be used as a prod dataset
 27 |     svamp = load_example_dataset("svamp").rename(columns={"question_concat": "question", "Answer": "answer"})[["question", "answer"]].tail(5)
 28 | 
 29 |     # Define prompts
 30 |     MATH_INSTRUCTION = "When you solve this math problem only return the answer with no additional text.\n"
 31 |     prompts = [MATH_INSTRUCTION + prompt for prompt in svamp.question]
 32 | 
 33 |     # User to populate .env file with API credentials
 34 |     load_dotenv(find_dotenv())
 35 | 
 36 |     API_KEY = os.getenv("API_KEY")
 37 |     API_BASE = os.getenv("API_BASE")
 38 |     API_TYPE = os.getenv("API_TYPE")
 39 |     API_VERSION = os.getenv("API_VERSION")
 40 |     DEPLOYMENT_NAME = os.getenv("DEPLOYMENT_NAME")
 41 | 
 42 |     # This will be our main LLM for generation
 43 |     gpt = AzureChatOpenAI(
 44 |         deployment_name=DEPLOYMENT_NAME,
 45 |         openai_api_key=API_KEY,
 46 |         azure_endpoint=API_BASE,
 47 |         openai_api_type=API_TYPE,
 48 |         openai_api_version=API_VERSION,
 49 |         temperature=1,  # User to set temperature
 50 |     )
 51 | 
 52 |     def math_postprocessor(s: str) -> str:
 53 |         """Helper function to strip non-numeric characters"""
 54 |         return "".join(c for c in s if c.isdigit())
 55 | 
 56 |     components = [
 57 |         "exact_match",  # Measures proportion of candidate responses that match original response
 58 |         "noncontradiction",  # mean non-contradiction probability between candidate responses and original response
 59 |         "min_probability",  # measures semantic volatility
 60 |         gpt,  # Using same LLM as external judge for testing
 61 |     ]
 62 | 
 63 |     uqe = UQEnsemble(
 64 |         llm=gpt,
 65 |         max_calls_per_min=250,
 66 |         postprocessor=math_postprocessor,
 67 |         use_n_param=False,  # Set True if using AzureChatOpenAI for faster generation
 68 |         scorers=components,
 69 |     )
 70 | 
 71 |     results = await uqe.generate_and_score(prompts=prompts, num_responses=5)
 72 |     store_results = {"ensemble1": results.to_dict()}
 73 | 
 74 |     uqe = UQEnsemble(
 75 |         llm=gpt,
 76 |         max_calls_per_min=250,
 77 |         postprocessor=math_postprocessor,
 78 |         use_n_param=False,  # Set True if using AzureChatOpenAI for faster generation
 79 |     )
 80 | 
 81 |     results = await uqe.generate_and_score(prompts=prompts, num_responses=5)
 82 |     store_results["bsdetector"] = results.to_dict()
 83 | 
 84 |     components1 = [
 85 |         "min_probability",  # measures semantic volatility
 86 |         gpt,  # Using same LLM as external judge for testing
 87 |     ]
 88 | 
 89 |     uqe1 = UQEnsemble(
 90 |         llm=gpt,
 91 |         max_calls_per_min=250,
 92 |         postprocessor=math_postprocessor,
 93 |         use_n_param=False,  # Set True if using AzureChatOpenAI for faster generation
 94 |         scorers=components1,
 95 |     )
 96 | 
 97 |     results1 = await uqe1.generate_and_score(prompts=prompts)
 98 |     store_results["ensemble2"] = results1.to_dict()
 99 | 
100 |     results_file = "ensemble_results_file.json"
101 |     with open(results_file, "w") as f:
102 |         json.dump(store_results, f)
103 | 
104 | 
105 | if __name__ == "__main__":
106 |     asyncio.run(main())
107 | 


--------------------------------------------------------------------------------
/uqlm/white_box/p_true.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | import time
17 | from typing import Any, Dict, List, Optional
18 | import numpy as np
19 | from rich.progress import Progress
20 | from langchain_core.language_models.chat_models import BaseChatModel
21 | from uqlm.utils.response_generator import ResponseGenerator
22 | 
# System prompt that PTrueScorer.evaluate passes to the response generator with every P(True) query.
PTRUE_SYSTEM_PROMPT = """
Your task is to determine whether a given answer to a question is correct.

Guidelines for your evaluation:
- Do NOT penalize phrasing differences
- Respond with EXACTLY one word: "True" or "False"
- Answer "True" if the response is correct
- Answer "False" if the response is incorrect
- Do not explain your reasoning or provide any additional commentary
"""
33 | 
34 | 
class PTrueScorer:
    """
    Scores responses by asking an LLM whether each one is true and reading the
    probability of its first answer token from the returned logprobs.
    """

    def __init__(self, llm: BaseChatModel, max_calls_per_min: Optional[int] = None) -> None:
        """
        Parameters
        ----------
        llm : BaseChatModel
            LLM used to judge the answers. Its ``logprobs`` attribute is set to True
            on the object that is passed in.

        max_calls_per_min : int, default=None
            Forwarded to ``ResponseGenerator``
        """
        llm.logprobs = True
        self.response_generator = ResponseGenerator(llm, max_calls_per_min=max_calls_per_min)
        self.response_generator.response_generator_type = "p_true"

    async def evaluate(self, prompts: List[str], responses: List[str], sampled_responses: Optional[List[List[str]]] = None, progress_bar: Optional[Progress] = None) -> Dict[str, List[float]]:
        """
        Compute a P(True) score for each prompt/response pair.

        Parameters
        ----------
        prompts : list of str
            Original prompts

        responses : list of str
            Original responses, one per prompt

        sampled_responses : list of list of str, default=None
            Optional extra responses per prompt that are listed in the query as possible answers

        progress_bar : rich.progress.Progress, default=None
            Forwarded to the response generator

        Returns
        -------
        dict
            ``{"p_true": scores}`` with one score per generated judgement
        """
        import asyncio  # local import: only needed for the non-blocking pause below

        if not sampled_responses:
            sampled_responses = [None] * len(responses)

        ptrue_prompts = [self._construct_ptrue_prompt(original_prompt=original_prompt_i, original_response=original_response_i, sampled_responses=sampled_responses_i) for original_prompt_i, original_response_i, sampled_responses_i in zip(prompts, responses, sampled_responses)]
        ptrue_responses = await self.response_generator.generate_responses(prompts=ptrue_prompts, system_prompt=PTRUE_SYSTEM_PROMPT, progress_bar=progress_bar)
        # Pause without blocking the event loop (time.sleep would stall every other task)
        await asyncio.sleep(0.1)
        logprob_results = ptrue_responses["metadata"]["logprobs"]
        ptrue_scores = [self._extract_ptrue_from_logprobs_result(logprob_result) for logprob_result in logprob_results]
        return {"p_true": ptrue_scores}

    @staticmethod
    def _extract_ptrue_from_logprobs_result(logprobs_result: List[Dict[str, Any]]) -> float:
        """
        Turn the first token of a judgement into a P(True) value.

        Returns the token probability if the token starts with "true", one minus
        the probability if it starts with "false", and ``np.nan`` when the token
        is neither, when no logprob is present, or when there is no token data.
        """
        if not logprobs_result:
            # No token data for this judgement: report a missing score instead of raising
            return np.nan

        first_token_data = logprobs_result[0]
        token = first_token_data.get("token", "").strip().lower()
        logprob = first_token_data.get("logprob", None)

        if logprob is None:
            return np.nan

        prob = np.exp(logprob)
        if token.startswith("true"):
            return prob  # High prob means high P_true
        if token.startswith("false"):
            return 1.0 - prob  # High prob of False means low P_true
        return np.nan

    @staticmethod
    def _construct_ptrue_prompt(original_prompt: str, original_response: str, sampled_responses: Optional[List[str]] = None) -> str:
        """
        Build the query text for one prompt/response pair.

        When ``sampled_responses`` is given and there is more than one distinct
        answer (sampled plus original), the distinct answers are listed before
        the proposed answer.
        """
        proposed_answers_text = ""
        if sampled_responses:
            # dict.fromkeys removes duplicates while keeping first-seen order, so the
            # same inputs always produce the same prompt (set() order can vary between runs)
            unique_responses = list(dict.fromkeys(sampled_responses + [original_response]))

            if len(unique_responses) > 1:
                proposed_answers_text = "\n\nHere are some possible answers:\n"
                for possible_answer in unique_responses:
                    proposed_answers_text += possible_answer + "\n"

        ptrue_prompt = f"""
    Question: {original_prompt}
    {proposed_answers_text}
    Proposed Answer: {original_response}

    Is the proposed answer to the question true or false? Answer with only one word true/false.

    True or False:
        """
        return ptrue_prompt
88 | 


--------------------------------------------------------------------------------
/tests/data/scorers/bsdetector_results_file.json:
--------------------------------------------------------------------------------
1 | {"prompts": ["Which part of the human body produces insulin?", "What color are the two stars on the national flag of Syria", "How many 'm's are there in the word strawberry"], "responses": ["The pancreas is the organ in the human body that produces insulin. More specifically, insulin is produced by specialized cells called beta cells in the pancreas. The release of insulin helps regulate blood sugar levels and is essential for the body's ability to use and store glucose for energy.", "The two stars on the national flag of Syria are red.", "There are no 'm's in the word \"strawberry.\""], "sampled_responses": [["The pancreas is the organ in the human body that produces insulin. Insulin is a hormone that helps regulate blood sugar levels.", "The pancreas is the organ in the human body that produces insulin. Insulin is a hormone that regulates blood sugar levels. It is produced by the beta cells in the islets of Langerhans in the pancreas.", "Insulin is produced by the beta cells in the pancreas, specifically in clusters called the Islets of Langerhans. These cells play a crucial role in regulating blood sugar levels.", "The pancreas is the organ in the human body that produces insulin. Insulin is produced by specific cells within the pancreas called beta cells, located in the islets of Langerhans.", "The pancreas is the organ in the human body that produces insulin. 
Insulin is produced and released by special cells called beta cells in the pancreas."], ["The two stars on the national flag of Syria are both red.", "The two stars on the national flag of Syria are red.", "The two stars on the national flag of Syria are colored red.", "The two stars on the national flag of Syria are black.", "The two stars on the national flag of Syria are green."], ["There are no 'm's in the word \"strawberry.\"", "There are no 'm's in the word strawberry.", "There is only one 'm' in the word \"strawberry\".", "There are zero 'm's in the word \"strawberry.\"", "There are 0 'm's in the word \"strawberry.\""]], "confidence_scores": [0.8596294307708741, 0.6640560493469239, 0.47659782075881957], "sr_scores": [1.0, 1.0, 0.0], "oc_scores": [0.799470615386963, 0.5200800704956056, 0.6808540296554566], "indicator_scores": [0.0, 0.2, 0.2], "sr_data": {"self_reflection_prompts": ["Question: Which part of the human body produces insulin?, Proposed Answer: The pancreas is the organ in the human body that produces insulin. More specifically, insulin is produced by specialized cells called beta cells in the pancreas. The release of insulin helps regulate blood sugar levels and is essential for the body's ability to use and store glucose for energy.. Your task is to look at the question and answer provided and determine if the answer is correct. You are to respond with ONLY one of: \"Correct\", \"Incorrect\", or \"I am not sure\". YOUR ANSWER MUST ONLY CONTAIN ONE OF \"Correct\", \"Incorrect\", or \"I am not sure\". DO NOT ANSWER THE QUESTION AGAIN. ONLY DETERMINE IF THE ANSWER TO THE QUESTION IS \"Correct\", \"Incorrect\", or \"I am not sure\".", "Question: What color are the two stars on the national flag of Syria, Proposed Answer: The two stars on the national flag of Syria are red.. Your task is to look at the question and answer provided and determine if the answer is correct. 
You are to respond with ONLY one of: \"Correct\", \"Incorrect\", or \"I am not sure\". YOUR ANSWER MUST ONLY CONTAIN ONE OF \"Correct\", \"Incorrect\", or \"I am not sure\". DO NOT ANSWER THE QUESTION AGAIN. ONLY DETERMINE IF THE ANSWER TO THE QUESTION IS \"Correct\", \"Incorrect\", or \"I am not sure\".", "Question: How many 'm's are there in the word strawberry, Proposed Answer: There are no 'm's in the word \"strawberry.\". Your task is to look at the question and answer provided and determine if the answer is correct. You are to respond with ONLY one of: \"Correct\", \"Incorrect\", or \"I am not sure\". YOUR ANSWER MUST ONLY CONTAIN ONE OF \"Correct\", \"Incorrect\", or \"I am not sure\". DO NOT ANSWER THE QUESTION AGAIN. ONLY DETERMINE IF THE ANSWER TO THE QUESTION IS \"Correct\", \"Incorrect\", or \"I am not sure\"."], "self_reflection_responses": ["Correct", "Correct", "Incorrect"], "self_reflection_scores": [1.0, 1.0, 0.0]}, "correct_indicators": [true, true, false], "updated_oc_scores": [0.8454951459984235, 0.5385066827051275, 0.7085361990580302], "updated_confidence_scores": [0.976214751274521, 0.9289554143270282, 0.10907560046902409], "optimized_parameters": {"weights": [0.8460550066262872, 0.15394499337371276], "thresh": 0.11}}


--------------------------------------------------------------------------------
/uqlm/utils/grader.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 CVS Health and/or one of its affiliates
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | import time
17 | from typing import List, Optional
18 | from rich.progress import Progress
19 | from langchain_core.language_models.chat_models import BaseChatModel
20 | from uqlm.utils.response_generator import ResponseGenerator
21 | 
22 | GRADER_SYSTEM_PROMPT = """
23 | You are an expert grading assistant designed to evaluate answers against a provided answer key. Your task is to determine whether a proposed answer is correct by comparing it to the ground truth answer(s).
24 | 
25 | ## Your Responsibilities:
26 | 
27 | 1. **Accept the ground truth as absolute**: The provided answer key contains the gold standard answer(s) and must be treated as correct, regardless of your own knowledge or beliefs.
28 | 
29 | 2. **Evaluate the proposed answer**: Determine if the proposed answer aligns with any of the ground truth answers in terms of factual content, not just wording.
30 | 
31 | 3. **Focus on semantic equivalence**: Look for meaning rather than exact wording. Two answers can be expressed differently but still be semantically equivalent.
32 | 
33 | 4. **Provide ONLY a binary judgment**: Your entire response must be either the single word "yes" or "no" based solely on the answer's alignment with any of the ground truth answers. Answer "yes" if correct, "no" if incorrect.
34 | 
35 | 5. **Avoid any explanation or reasoning**: Do not provide any justification, commentary, or additional text beyond the single word judgment.
36 | 
37 | 6. **Be charitable but accurate**: Give credit when the proposed answer captures the essential elements of any of the ground truth answers, but don't overlook substantive differences.
38 | 
39 | Remember: You must return ONLY the word "yes" or "no" with no additional text. The ground truth answer(s) must be treated as correct even if you believe otherwise.
40 | """
41 | 
42 | 
43 | class LLMGrader:
44 |     def __init__(self, llm: BaseChatModel, max_calls_per_min: Optional[int] = None) -> None:
45 |         llm.logprobs = True
46 |         self.response_generator = ResponseGenerator(llm, max_calls_per_min=max_calls_per_min)
47 |         self.response_generator.response_generator_type = "grader"
48 | 
49 |     async def grade_responses(self, prompts: List[str], responses: List[str], answers: List[str], progress_bar: Optional[Progress] = None) -> List[bool]:
50 |         grader_prompts = [self._construct_grader_prompt(prompt, response, answer) for prompt, response, answer in zip(prompts, responses, answers)]
51 |         grader_responses = await self.response_generator.generate_responses(prompts=grader_prompts, system_prompt=GRADER_SYSTEM_PROMPT, progress_bar=progress_bar)
52 |         time.sleep(0.1)
53 |         bool_grades = [self._extract_grades(grader_response) for grader_response in grader_responses["data"]["response"]]
54 |         return bool_grades
55 | 
56 |     @staticmethod
57 |     def _extract_grades(grader_response: str) -> bool:
58 |         grader_response_stripped = grader_response.strip().lower()
59 |         if "yes" in grader_response_stripped:
60 |             return True
61 |         elif "no" in grader_response_stripped:
62 |             return False
63 |         else:
64 |             return False
65 | 
66 |     @staticmethod
67 |     def _construct_grader_prompt(prompt: str, response: str, acceptable_answers: List[str]) -> str:
68 |         grader_prompt = f"""
69 |         Your task is to grade the following proposed answer against the provided answer key. The ground truth is the gold standard regardless of any other information you may have. Return ONLY the word "yes" or "no", with no additional text, based on whether the proposed answer aligns with any of the ground truth answers. Answer "yes" if correct, "no" if incorrect.
70 | 
71 |         **Question:**
72 |         {prompt}
73 | 
74 |         **Ground Truth Answers (Answer Key):**
75 |         {acceptable_answers}
76 | 
77 |         **Proposed Answer to Grade:**
78 |         {response}
79 | 
80 |         Now your answer is (yes or no):
81 |         """
82 |         return grader_prompt
83 | 


--------------------------------------------------------------------------------
/tests/test_logprobs_scorer.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2025 CVS Health and/or one of its affiliates
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | import pytest
 16 | import numpy as np
 17 | from uqlm.white_box.baseclass.logprobs_scorer import LogprobsScorer
 18 | 
 19 | 
 20 | @pytest.fixture
 21 | def mock_single_response_logprobs():
 22 |     """Fixture to provide mock single response logprobs."""
 23 |     return [{"logprob": -0.1, "top_logprobs": [{"logprob": -0.1}, {"logprob": -1.0}, {"logprob": -2.0}]}, {"logprob": -0.2, "top_logprobs": [{"logprob": -0.2}, {"logprob": -0.5}, {"logprob": -1.5}]}]
 24 | 
 25 | 
 26 | @pytest.fixture
 27 | def mock_logprobs_results(mock_single_response_logprobs):
 28 |     """Fixture to provide mock logprobs results."""
 29 |     return [mock_single_response_logprobs, mock_single_response_logprobs]
 30 | 
 31 | 
 32 | @pytest.fixture
 33 | def scorer():
 34 |     """Fixture returning a LogprobsScorer constructed with no arguments."""
 35 |     return LogprobsScorer()
 36 | 
 37 | 
 38 | def test_norm_prob(mock_single_response_logprobs, scorer):
 39 |     """Test the _norm_prob method."""
 40 |     result = scorer._norm_prob(mock_single_response_logprobs)
 41 |     assert isinstance(result, float)
 42 |     assert result > 0.0 and result <= 1.0
 43 | 
 44 | 
 45 | def test_seq_prob(mock_single_response_logprobs, scorer):
 46 |     """Test the _seq_prob method."""
 47 |     result = scorer._seq_prob(mock_single_response_logprobs)
 48 |     assert isinstance(result, float)
 49 |     assert result > 0.0 and result <= 1.0
 50 | 
 51 | 
 52 | def test_entropy_from_logprobs(scorer):
 53 |     """Test the _entropy_from_logprobs method."""
 54 |     logprobs_list = np.array([-0.1, -0.2, -0.3])
 55 |     result = scorer._entropy_from_logprobs(logprobs_list)
 56 |     assert isinstance(result, float)
 57 |     assert result >= 0.0
 58 | 
 59 | 
 60 | def test_entropy_from_probs(scorer):
 61 |     """Test the _entropy_from_probs method."""
 62 |     probs_list = np.array([0.5, 0.3, 0.2])
 63 |     result = scorer._entropy_from_probs(probs_list)
 64 |     assert isinstance(result, float)
 65 |     assert result >= 0.0
 66 | 
 67 | 
 68 | def test_entropy_from_probs_with_texts(scorer):
 69 |     """Test the _entropy_from_probs method with texts."""
 70 |     probs_list = np.array([0.5, 0.3, 0.2])
 71 |     texts = ["a", "b", "a"]
 72 |     result = scorer._entropy_from_probs(probs_list, texts)
 73 |     assert isinstance(result, float)
 74 |     assert result >= 0.0
 75 | 
 76 | 
 77 | def test_extract_probs(mock_single_response_logprobs, scorer):
 78 |     """Test the extract_probs method."""
 79 |     result = scorer.extract_probs(mock_single_response_logprobs)
 80 |     assert isinstance(result, np.ndarray)
 81 |     assert result.shape == (len(mock_single_response_logprobs),)
 82 |     assert np.all(result > 0.0) and np.all(result <= 1.0)
 83 | 
 84 | 
 85 | def test_extract_logprobs(mock_single_response_logprobs, scorer):
 86 |     """Test the extract_logprobs method."""
 87 |     result = scorer.extract_logprobs(mock_single_response_logprobs)
 88 |     assert isinstance(result, np.ndarray)
 89 |     assert result.shape == (len(mock_single_response_logprobs),)
 90 |     assert np.all(result < 0.0)  # Logprobs should be negative
 91 | 
 92 | 
 93 | def test_extract_top_logprobs(mock_single_response_logprobs, scorer):
 94 |     """Test the extract_top_logprobs method."""
 95 |     result = scorer.extract_top_logprobs(mock_single_response_logprobs)
 96 |     assert isinstance(result, list)
 97 |     assert len(result) == len(mock_single_response_logprobs)
 98 |     for top_logprobs in result:
 99 |         assert isinstance(top_logprobs, np.ndarray)
100 |         assert top_logprobs.shape[0] > 0
101 | 
102 | 
103 | def test_compute_single_generation_scores(mock_logprobs_results, scorer):
104 |     """Test the _compute_single_generation_scores method."""
105 | 
106 |     def mock_score_fn(single_response_logprobs):
107 |         return 0.9
108 | 
109 |     result = scorer._compute_single_generation_scores(mock_logprobs_results, mock_score_fn)
110 |     assert isinstance(result, list)
111 |     assert len(result) == len(mock_logprobs_results)
112 |     assert all(score == 0.9 for score in result)
113 | 


--------------------------------------------------------------------------------
/tests/test_p_true.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | from unittest.mock import AsyncMock, MagicMock
  3 | from uqlm.white_box.p_true import PTrueScorer, PTRUE_SYSTEM_PROMPT
  4 | from uqlm.utils.response_generator import ResponseGenerator
  5 | from langchain_openai import AzureChatOpenAI
  6 | 
  7 | # REUSABLE TEST DATA
  8 | MOCKED_PROMPTS = ["What is 2+2?"]
  9 | MOCKED_RESPONSES = ["4"]
 10 | MOCKED_SAMPLED_RESPONSES = [["4", "5"]]
 11 | 
 12 | 
 13 | # REUSABLE MOCK OBJECT CREATOR
 14 | def create_mock_llm():
 15 |     """Reusable mock LLM object"""
 16 |     mock_llm = MagicMock(spec=AzureChatOpenAI)
 17 |     mock_llm.logprobs = True
 18 |     mock_llm.temperature = 0.7
 19 | 
 20 |     # Mock the agenerate method
 21 |     async def mock_agenerate(messages, **kwargs):
 22 |         class MockGeneration:
 23 |             def __init__(self):
 24 |                 self.text = "Mocked response"
 25 |                 self.generation_info = {"logprobs_result": [{"token": "True", "logprob": -0.1}]}
 26 | 
 27 |         class MockResult:
 28 |             def __init__(self):
 29 |                 self.generations = [[MockGeneration()]]
 30 | 
 31 |         return MockResult()
 32 | 
 33 |     mock_llm.agenerate = mock_agenerate
 34 |     return mock_llm
 35 | 
 36 | 
 37 | @pytest.fixture
 38 | def mock_response_generator():
 39 |     """Fixture to create a mock ResponseGenerator."""
 40 |     mock_response_generator = AsyncMock()
 41 |     mock_response_generator.generate_responses = AsyncMock(return_value={"metadata": {"logprobs": [[{"token": "True", "logprob": -0.1}], [{"token": "False", "logprob": -2.0}]]}})
 42 |     return mock_response_generator
 43 | 
 44 | 
 45 | @pytest.fixture
 46 | def ptrue_scorer(mock_response_generator, monkeypatch):
 47 |     """Fixture to create a PTrueScorer with a mocked ResponseGenerator."""
 48 |     mock_llm = create_mock_llm()
 49 | 
 50 |     # Replace the ResponseGenerator with the mock
 51 |     monkeypatch.setattr(ResponseGenerator, "__init__", lambda self, *args, **kwargs: None)
 52 |     monkeypatch.setattr(ResponseGenerator, "generate_responses", mock_response_generator.generate_responses)
 53 | 
 54 |     scorer = PTrueScorer(llm=mock_llm)
 55 |     scorer.response_generator = mock_response_generator
 56 |     return scorer
 57 | 
 58 | 
 59 | @pytest.mark.asyncio
 60 | async def test_ptrue_scorer_evaluate(ptrue_scorer, mock_response_generator):
 61 |     """Test the evaluate method of PTrueScorer."""
 62 |     result = await ptrue_scorer.evaluate(MOCKED_PROMPTS, MOCKED_RESPONSES, MOCKED_SAMPLED_RESPONSES)
 63 | 
 64 |     # Verify the ResponseGenerator was called with the correct arguments
 65 |     mock_response_generator.generate_responses.assert_called_once()
 66 |     args, kwargs = mock_response_generator.generate_responses.call_args
 67 | 
 68 |     # Normalize the actual prompt to remove extra whitespace
 69 |     actual_prompt = kwargs["prompts"][0].strip()
 70 |     expected_prompt_start = "Question: What is 2+2?"
 71 | 
 72 |     assert actual_prompt.startswith(expected_prompt_start), f"Expected prompt to start with '{expected_prompt_start}', but got '{actual_prompt}'"
 73 | 
 74 |     assert kwargs["system_prompt"] == PTRUE_SYSTEM_PROMPT
 75 | 
 76 |     # Verify the result
 77 |     assert "p_true" in result
 78 |     assert len(result["p_true"]) == 2
 79 |     assert result["p_true"] == [0.9048374180359595, 0.8646647167633873]  # Based on mocked logprobs
 80 | 
 81 | 
 82 | def test_extract_ptrue_from_logprobs_result():
 83 |     """Test the _extract_ptrue_from_logprobs_result method."""
 84 |     logprobs_result = [{"token": "True", "logprob": -0.1}]
 85 |     score = PTrueScorer._extract_ptrue_from_logprobs_result(logprobs_result)
 86 |     assert score == pytest.approx(0.9048, rel=1e-3)
 87 | 
 88 |     logprobs_result = [{"token": "False", "logprob": -0.1}]
 89 |     score = PTrueScorer._extract_ptrue_from_logprobs_result(logprobs_result)
 90 |     assert score == pytest.approx(0.0952, rel=1e-3)
 91 | 
 92 |     logprobs_result = [{"token": "Unknown", "logprob": -0.1}]
 93 |     score = PTrueScorer._extract_ptrue_from_logprobs_result(logprobs_result)
 94 |     assert score != score  # NaN check
 95 | 
 96 | 
 97 | def test_construct_ptrue_prompt():
 98 |     """Test the _construct_ptrue_prompt method."""
 99 |     prompt = "What is 2+2?"
100 |     response = "4"
101 |     sampled_responses = ["4", "5"]
102 | 
103 |     result = PTrueScorer._construct_ptrue_prompt(prompt, response, sampled_responses)
104 |     assert "Question: What is 2+2?" in result
105 |     assert "Proposed Answer: 4" in result
106 |     assert "Here are some possible answers:" in result
107 |     assert "4" in result
108 |     assert "5" in result
109 | 
110 |     # Test without sampled_responses
111 |     result = PTrueScorer._construct_ptrue_prompt(prompt, response, None)
112 |     assert "Here are some possible answers:" not in result
113 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
  1 | [tool.poetry]
  2 | name = "uqlm"
  3 | version = "0.4.5"
  4 | description = "UQLM (Uncertainty Quantification for Language Models) is a Python package for UQ-based LLM hallucination detection."
  5 | authors = ["Dylan Bouchard <dylan.bouchard@cvshealth.com>", "Mohit Singh Chauhan <mohitsingh.chauhan@cvshealth.com>"]
  6 | maintainers = [
  7 |     "Dylan Bouchard <dbouchard92@gmail.com>", 
  8 |     "Mohit Singh Chauhan <mohitcek@gmail.com>",
  9 |     "David Skarbrevik <skarbrevik@gmail.com>",
 10 |     "Ho-Kyeong Ra <doyajii1@gmail.com>",
 11 |     "Viren Bajaj <virenbajaj4@gmail.com>",
 12 |     "Zeya Ahmad <zeyahmd@gmail.com>"
 13 | ]
 14 | repository = "https://github.com/cvs-health/uqlm"
 15 | homepage = "https://github.com/cvs-health/uqlm"
 16 | documentation = "https://cvs-health.github.io/uqlm/latest/index.html"
 17 | license = "Apache-2.0"
 18 | readme = "assets/README_PYPI.md"
 19 | classifiers = [
 20 |     "Programming Language :: Python :: 3 :: Only",
 21 |     "Operating System :: OS Independent",
 22 |     "Topic :: Scientific/Engineering :: Artificial Intelligence",
 23 |     "Topic :: Software Development :: Libraries :: Python Modules"
 24 | ]
 25 | packages = [
 26 |     { include = "uqlm" },
 27 |     { include = "uqlm/scorers" },
 28 |     { include = "uqlm/judges" },
 29 |     { include = "uqlm/black_box" },
 30 |     { include = "uqlm/white_box" },
 31 |     { include = "uqlm/calibration" },
 32 |     { include = "uqlm/resources" },
 33 |     { include = "uqlm/utils" },
 34 | ]
 35 | exclude = ["docs", "docs_src"] # never include documentation in build
 36 | keywords = ["LLM", "large language model", "LLM evaluation", "hallucination", "uncertainty quantification"]  # Add your keywords here
 37 | 
 38 | [tool.poetry.dependencies]
 39 | python = ">=3.10, <4.0"
 40 | langchain = ">=0.3.7,<1.1.0"
 41 | langchain-model-profiles = "^0.0.3"
 42 | transformers = "^4.45.2"
 43 | scikit-learn = [
 44 |     { version = "^1.5.2", markers = "python_version < '3.13'" },
 45 |     { version = "^1.7.0", markers = "python_version >= '3.13'" }
 46 | ]
 47 | numpy = [
 48 |     { version = "^1.26.4", markers = "python_version < '3.13'" },
 49 |     { version = ">=2.3.1", markers = "python_version >= '3.13'" }
 50 | ]
 51 | scipy = {version = "^1.15.0", markers = "python_version >= '3.13'"}
 52 | matplotlib = "^3.10.0"
 53 | optuna = "^4.0.0"
 54 | bert-score = "^0.3.0"
 55 | pandas = "^2.3.0"
 56 | sentence-transformers = ">=3.4,<6.0"
 57 | datasets = ">=3.3.2,<5.0.0"
 58 | rich = "^13.8.0"
 59 | ipywidgets = "^8.1.7"
 60 | 
 61 | [tool.poetry.group.dev]
 62 | optional = true
 63 | 
 64 | [tool.poetry.group.dev.dependencies]
 65 | python-dotenv = "^1.2.0"
 66 | ruff = "0.9.7"
 67 | pre-commit = "^4.1.0"
 68 | ipykernel = "^6.29.5"
 69 | langchain-openai = ">=0.2.6"
 70 | langchain-google-vertexai = ">=2.0.8"
 71 | 
 72 | [tool.poetry.group.docs]
 73 | optional = true
 74 | 
 75 | [tool.poetry.group.docs.dependencies]
 76 | sphinx= "7.4.7"
 77 | pydata_sphinx_theme = "0.16.1"
 78 | sphinxcontrib-bibtex = "2.6.3"
 79 | sphinx-autodoc-typehints = "2.3.0"
 80 | sphinx-gallery = "0.18.0"
 81 | sphinx-favicon = "1.0.1"
 82 | nbsphinx = "0.9.6"
 83 | 
 84 | 
 85 | [tool.poetry.group.test]
 86 | optional = true
 87 | 
 88 | [tool.poetry.group.test.dependencies]
 89 | ipykernel = "^6.29.5"
 90 | langchain-openai = ">=0.2.6"
 91 | pytest = "^8.3.5"
 92 | langchain-google-vertexai = ">=2.0.8"
 93 | pytest-asyncio = ">=0.25.3,<1.2.0"
 94 | pytest-cov = ">=6,<8"
 95 | pytest-rerunfailures = "^16.0"
 96 | 
 97 | 
 98 | 
 99 | [tool.pytest.ini_options]
100 | reruns = 3
101 | reruns_delay = 2.0
102 | markers = [
103 |     "asyncio"
104 | ]
105 | 
106 | [build-system]
107 | requires = ["poetry-core"]
108 | build-backend = "poetry.core.masonry.api"
109 | 
110 | 
111 | [tool.ruff]
112 | # Exclude a variety of commonly ignored directories.
113 | exclude = [
114 |     ".bzr",
115 |     ".direnv",
116 |     ".eggs",
117 |     ".git",
118 |     ".git-rewrite",
119 |     ".hg",
120 |     ".ipynb_checkpoints",
121 |     ".mypy_cache",
122 |     ".nox",
123 |     ".pants.d",
124 |     ".pyenv",
125 |     ".pytest_cache",
126 |     ".pytype",
127 |     ".ruff_cache",
128 |     ".svn",
129 |     ".tox",
130 |     ".venv",
131 |     ".vscode",
132 |     "__pypackages__",
133 |     "_build",
134 |     "buck-out",
135 |     "build",
136 |     "dist",
137 |     "node_modules",
138 |     "site-packages",
139 |     "venv",
140 | ]
141 | # Core settings
142 | line-length = 400
143 | 
144 | [tool.ruff.lint]
145 | #What rules to enable
146 | select = ["E", "F"]
147 | # E = pycodestyle errors
148 | # F = pyflakes
149 | # I = isort (import sorting)
150 | # B = bugbear (best practices)
151 | # UP = pyupgrade (modernization)
152 | # D = pydocstyle (docstring rules)
153 | # S = bandit (security)
154 | 
155 | #What rules to ignore
156 | ignore = []
157 | 
158 | 
159 | [tool.ruff.format]
160 | quote-style = "double"
161 | indent-style = "space"
162 | skip-magic-trailing-comma = true
163 | line-ending = "lf"
164 | docstring-code-format = true


--------------------------------------------------------------------------------
/docs/source/contribute.rst:
--------------------------------------------------------------------------------
  1 | .. _contribute:
  2 | 
  3 | Contributing to UQLM
  4 | ====================
  5 | 
  6 | Welcome and thank you for considering contributing to UQLM!
  7 | 
  8 | It takes a lot of time and effort to use software, much less build upon it, so we deeply appreciate your desire to help make this project thrive.
  9 | 
 10 | Table of Contents
 11 | -----------------
 12 | 1. :ref:`How to Contribute<how-to-contribute>`
 13 |     * :ref:`Reporting Bugs<reporting-bugs>`
 14 |     * :ref:`Suggesting Enhancements<suggesting-enhancements>`
 15 |     * :ref:`Pull Requests<pull-requests>`
 16 | 2. :ref:`Development Setup<development-setup>`
 17 | 3. :ref:`Style Guides<style-guides>`
 18 |     * :ref:`Code Style<code-style>`
 19 | 4. :ref:`License<license>`
 20 | 
 21 | .. _how-to-contribute:
 22 | 
 23 | How to Contribute
 24 | -----------------
 25 | 
 26 | .. _reporting-bugs:
 27 | 
 28 | Reporting Bugs
 29 | **************
 30 | 
 31 | If you find a bug, please report it by opening an issue on GitHub. Include as much detail as possible:
 32 | * Steps to reproduce the bug.
 33 | * Expected and actual behavior.
 34 | * Screenshots if applicable.
 35 | * Any other information that might help us understand the problem.
 36 | 
 37 | .. _suggesting-enhancements:
 38 | 
 39 | Suggesting Enhancements
 40 | ***********************
 41 | 
 42 | We welcome suggestions for new features or improvements. To suggest an enhancement, please open an issue on GitHub and include:
 43 | 
 44 | * A clear description of the suggested enhancement.
 45 | * Why you believe this enhancement would be useful.
 46 | * Any relevant examples or mockups.
 47 | 
 48 | .. _pull-requests:
 49 | 
 50 | Pull Requests
 51 | *************
 52 | 
 53 | 1. Fork the repository.
 54 | 2. Create a new branch (``git checkout -b feature/your-feature-name``).
 55 | 3. Make your changes.
 56 | 4. Commit your changes (``git commit -m 'Add some feature'``).
 57 | 5. Push to the branch (``git push origin feature/your-feature-name``).
 58 | 6. Open a pull request.
 59 | 
 60 | Please ensure your pull request adheres to the following guidelines:
 61 | 
 62 | * Follow the project's code style.
 63 | * Include tests for any new features or bug fixes.
 64 | 
 65 | .. _development-setup:
 66 | 
 67 | Development Setup
 68 | -----------------
 69 | 
 70 | 1. Clone the repository: ``git clone https://github.com/cvs-health/uqlm.git``
 71 | 2. Navigate to the project directory: ``cd uqlm``
 72 | 3. Create and activate a virtual environment (using ``venv`` or ``conda``)
 73 | 4. Install poetry (if you don't already have it): ``pip install poetry``
 74 | 5. Install uqlm with dev dependencies: ``poetry install --with dev``
 75 | 6. Install our pre-commit hooks to ensure code style compliance: ``pre-commit install``
 76 | 7. Run tests to ensure everything is working: ``pre-commit run --all-files``
 77 | 
 78 | You're ready to develop!
 79 | 
 80 | **For documentation contributions**
 81 | 
 82 | Our documentation lives on the gh-pages branch and is hosted via GitHub Pages.
 83 | 
 84 | There are two relevant directories:
 85 | 
 86 | * ``docs_src`` - where source documentation files are located
 87 | * ``docs`` - where the built documentation is located that is served by GitHub Pages
 88 | 
 89 | To build the documentation locally:
 90 | 
 91 | #. Create a virtual environment with your favorite tool (e.g., conda, virtualenv, uv, etc.)
 92 | 
 93 | #. Check out the ``gh-pages`` branch and create a new branch from it
 94 | 
 95 | #. Navigate to the ``docs_src/latest`` directory
 96 | 
 97 |    * If this is a version upgrade:
 98 | 
 99 |      #. Copy the ``latest`` contents to the ``docs_src/{version_number}`` folder and update the version in the ``conf.py`` file
100 | 
101 |      #. Copy the ``latest`` contents from ``docs/`` to the ``docs/{version_number}`` folder
102 | 
103 |      #. Update the versions in the ``docs_src/latest/index.rst`` file and in ``docs_src/versions.json``
104 | 
105 | #. ``cd uqlm``
106 | 
107 | #. ``pip install -e .`` # installs current uqlm repo as package to environment
108 | 
109 | #. ``cd docs_src/latest``
110 | 
111 | #. ``brew install pandoc`` # to use nbsphinx extension
112 | 
113 | #. ``make install`` # installs sphinx related python packages
114 | 
115 | #. ``make github`` # builds docs html
116 | 
117 | #. ``make local`` # locally test doc site
118 | 
119 | 
120 | .. _style-guides:
121 | 
122 | Style Guides
123 | ------------
124 | 
125 | .. _code-style:
126 | 
127 | Code Style
128 | **********
129 | 
130 | - We use `Ruff <https://github.com/astral-sh/ruff>`_ to lint and format our files.
131 | - Our pre-commit hook will run Ruff linting and formatting when you commit.
132 | - You can manually run Ruff at any time (see `Ruff usage <https://github.com/astral-sh/ruff#usage>`_).
133 | 
134 | Please ensure your code is properly formatted and linted before committing.
135 | 
136 | .. _license:
137 | 
138 | License
139 | -------
140 | 
141 | Before contributing to this CVS Health sponsored project, you will need to sign the associated `Contributor License Agreement (CLA) <https://TBD>`_.
142 | 
143 | 
144 | Thanks again for using and supporting uqlm!


--------------------------------------------------------------------------------
/docs/source/_notebooks/index.rst:
--------------------------------------------------------------------------------
  1 | Example Notebooks
  2 | =================
  3 | 
  4 | UQLM offers a broad collection of tutorial notebooks to demonstrate usage of the various scorers. These notebooks aim to have versatile coverage of various LLMs and datasets, but you can easily replace them with your LLM and dataset of choice. Below is a list of these tutorials:
  5 | 
  6 | 
  7 | 
  8 | .. raw:: html
  9 | 
 10 |     <div class="sphx-glr-thumbnails">
 11 | 
 12 | .. thumbnail-parent-div-open
 13 | 
 14 | 
 15 | .. raw:: html
 16 | 
 17 |     <div class="sphx-glr-thumbcontainer" tooltip="Black-Box Demo">
 18 | 
 19 | .. only:: html
 20 | 
 21 |   .. image:: /_static/images/no_image.png
 22 |     :alt:
 23 | 
 24 |   :doc:`examples/black_box_demo`
 25 | 
 26 | .. raw:: html
 27 | 
 28 |       <div class="sphx-glr-thumbnail-title">Black-Box Demo</div>
 29 |     </div>
 30 |     
 31 | 
 32 | .. raw:: html
 33 | 
 34 |     <div class="sphx-glr-thumbcontainer" tooltip="White-Box Single-Generation Demo">
 35 | 
 36 | .. only:: html
 37 | 
 38 |   .. image:: /_static/images/no_image.png
 39 |     :alt:
 40 | 
 41 |   :doc:`examples/white_box_single_generation_demo`
 42 | 
 43 | .. raw:: html
 44 | 
 45 |       <div class="sphx-glr-thumbnail-title">White-Box Single-Generation Demo</div>
 46 |     </div>
 47 |     
 48 |     
 49 | .. raw:: html
 50 | 
 51 |     <div class="sphx-glr-thumbcontainer" tooltip="White-Box Multi-Generation Demo">
 52 | 
 53 | .. only:: html
 54 | 
 55 |   .. image:: /_static/images/no_image.png
 56 |     :alt:
 57 | 
 58 |   :doc:`examples/white_box_multi_generation_demo`
 59 | 
 60 | .. raw:: html
 61 | 
 62 |       <div class="sphx-glr-thumbnail-title">White-Box Multi-Generation Demo</div>
 63 |     </div>
 64 | 
 65 | 
 66 | .. raw:: html
 67 | 
 68 |     <div class="sphx-glr-thumbcontainer" tooltip="BS Detector Off-the-Shelf Ensemble Demo">
 69 | 
 70 | .. only:: html
 71 | 
 72 |   .. image:: /_static/images/no_image.png
 73 |     :alt:
 74 | 
 75 |   :doc:`examples/ensemble_off_the_shelf_demo`
 76 | 
 77 | .. raw:: html
 78 | 
 79 |       <div class="sphx-glr-thumbnail-title">BS Detector Off-the-Shelf Ensemble Demo</div>
 80 |     </div>
 81 | 
 82 | 
 83 | .. raw:: html
 84 | 
 85 |     <div class="sphx-glr-thumbcontainer" tooltip="Ensemble Uncertainty Quantification Demo">
 86 | 
 87 | .. only:: html
 88 | 
 89 |   .. image:: /_static/images/no_image.png
 90 |     :alt:
 91 | 
 92 |   :doc:`examples/ensemble_tuning_demo`
 93 | 
 94 | .. raw:: html
 95 | 
 96 |       <div class="sphx-glr-thumbnail-title">Ensemble Uncertainty Quantification Demo</div>
 97 |     </div>  
 98 | 
 99 | 
100 | .. raw:: html
101 | 
102 |     <div class="sphx-glr-thumbcontainer" tooltip="LLM-as-a-Judge Demo">
103 | 
104 | .. only:: html
105 | 
106 |   .. image:: /_static/images/no_image.png
107 |     :alt:
108 | 
109 |   :doc:`examples/judges_demo`
110 | 
111 | .. raw:: html
112 | 
113 |       <div class="sphx-glr-thumbnail-title">LLM-as-a-Judge Demo</div>
114 |     </div>
115 |     
116 |     
117 | .. raw:: html
118 | 
119 |     <div class="sphx-glr-thumbcontainer" tooltip="Multimodal Demo">
120 | 
121 | .. only:: html
122 | 
123 |   .. image:: /_static/images/no_image.png
124 |     :alt:
125 | 
126 |   :doc:`examples/multimodal_demo`
127 | 
128 | .. raw:: html
129 | 
130 |       <div class="sphx-glr-thumbnail-title">Multimodal Demo</div>
131 |     </div>
132 | 
133 | 
134 | .. raw:: html
135 | 
136 |     <div class="sphx-glr-thumbcontainer" tooltip="Semantic Entropy Demo">
137 | 
138 | .. only:: html
139 | 
140 |   .. image:: /_static/images/no_image.png
141 |     :alt:
142 | 
143 |   :doc:`examples/semantic_entropy_demo`
144 | 
145 | .. raw:: html
146 | 
147 |       <div class="sphx-glr-thumbnail-title">Semantic Entropy Demo</div>
148 |     </div>
149 |     
150 |     
151 | .. raw:: html
152 | 
153 |     <div class="sphx-glr-thumbcontainer" tooltip="Semantic Density Demo">
154 | 
155 | .. only:: html
156 | 
157 |   .. image:: /_static/images/no_image.png
158 |     :alt:
159 | 
160 |   :doc:`examples/semantic_density_demo`
161 | 
162 | .. raw:: html
163 | 
164 |       <div class="sphx-glr-thumbnail-title">Semantic Density Demo</div>
165 |     </div>
166 |     
167 |     
168 | .. raw:: html
169 | 
170 |     <div class="sphx-glr-thumbcontainer" tooltip="Score Calibration Demo">
171 | 
172 | .. only:: html
173 | 
174 |   .. image:: /_static/images/no_image.png
175 |     :alt:
176 | 
177 |   :doc:`examples/score_calibration_demo`
178 | 
179 | .. raw:: html
180 | 
181 |       <div class="sphx-glr-thumbnail-title">Score Calibration Demo</div>
182 |     </div>
183 | 
184 | 
185 | .. thumbnail-parent-div-close
186 | 
187 | 
188 | .. raw:: html
189 | 
190 |     </div>
191 | 
192 | 
193 | .. toctree::
194 |    :hidden:
195 | 
196 |    examples/ensemble_off_the_shelf_demo.ipynb
197 |    examples/ensemble_tuning_demo.ipynb
198 |    examples/judges_demo.ipynb
199 |    examples/semantic_entropy_demo.ipynb
200 |    examples/semantic_density_demo.ipynb
201 |    examples/white_box_multi_generation_demo.ipynb
202 |    examples/white_box_single_generation_demo.ipynb
203 |    examples/black_box_demo.ipynb
204 |    examples/multimodal_demo.ipynb
205 |    examples/score_calibration_demo.ipynb


--------------------------------------------------------------------------------
/uqlm/black_box/consistency.py:
--------------------------------------------------------------------------------
  1 | from typing import List, Dict, Any, Optional, Tuple
  2 | import time
  3 | import numpy as np
  4 | from rich.progress import Progress
  5 | from uqlm.black_box.baseclass.similarity_scorer import SimilarityScorer
  6 | from uqlm.nli.nli import NLI
  7 | from uqlm.nli.cluster import SemanticClusterer
  8 | 
  9 | 
 10 | class ConsistencyScorer(SimilarityScorer):
 11 |     def __init__(self, nli_model_name: str = "microsoft/deberta-large-mnli", max_length: int = 2000, use_best: bool = False, scorers: List[str] = ["noncontradiction", "entailment"]):
 12 |         """
 13 |         Initialize the NonContradictionScorer.
 14 | 
 15 |         Parameters
 16 |         ----------
 17 |         use_best : bool, default=False
 18 |             Specifies whether to swap the original response for the uncertainty-minimized response
 19 |             based on semantic entropy clusters.
 20 |         """
 21 |         super().__init__()
 22 |         self.nli_model_name = nli_model_name
 23 |         self.max_length = max_length
 24 |         self.use_best = use_best
 25 |         self.nli = NLI(nli_model_name=nli_model_name, max_length=max_length)
 26 |         self.scorers = scorers
 27 | 
 28 |     def evaluate(self, responses: List[str], sampled_responses: List[List[str]], available_nli_scores: Dict[Tuple[str, str], float] = dict(), progress_bar: Optional[Progress] = None) -> Dict[str, Any]:
 29 |         """
 30 |         Evaluate confidence scores on LLM responses.
 31 | 
 32 |         Parameters
 33 |         ----------
 34 |         responses : list of strings
 35 |             Original LLM response
 36 | 
 37 |         sampled_responses : list of list of strings
 38 |             Sampled candidate responses to be compared to the original response
 39 | 
 40 |         progress_bar : rich.progress.Progress, default=None
 41 |             If provided, displays a progress bar while scoring responses
 42 | 
 43 |         Returns
 44 |         -------
 45 |         Dict
 46 |             Dictionary containing mean NLI and (optionally) semantic entropy scores.
 47 |             The dictionary will also contain original and multiple responses, updated if `use_best` is True
 48 |         """
 49 |         self.available_nli_scores = available_nli_scores
 50 |         self.num_responses = len(sampled_responses[0])
 51 |         observed_consistency_data = {"noncontradiction": [], "entailment": [], "discrete_semantic_entropy": [], "tokenprob_semantic_entropy": [], "responses": responses, "sampled_responses": sampled_responses}
 52 | 
 53 |         def _process_i(i, response):
 54 |             oc_result_i = self._observed_consistency_i(original=response, candidates=sampled_responses[i])
 55 |             for scorer in self.scorers:
 56 |                 observed_consistency_data[scorer].append(oc_result_i[scorer])
 57 |             responses[i] = oc_result_i["response"]  # Replace with optimized response if use_best
 58 |             sampled_responses[i] = oc_result_i["candidates"]  # Replace with updated candidates if use_best
 59 | 
 60 |         if progress_bar:
 61 |             progress_task = progress_bar.add_task("  - Scoring responses with entailment/contradiction...", total=len(responses))
 62 |         for i, response in enumerate(responses):
 63 |             _process_i(i, response)
 64 |             if progress_bar:
 65 |                 progress_bar.update(progress_task, advance=1)
 66 |         time.sleep(0.1)
 67 | 
 68 |         if self.use_best:
 69 |             observed_consistency_data["responses"] = responses
 70 |             observed_consistency_data["sampled_responses"] = sampled_responses
 71 |         return observed_consistency_data
 72 | 
 73 |     def _observed_consistency_i(self, original: str, candidates: List[str]) -> Dict[str, Any]:
 74 |         """
 75 |         Compute observed consistency score on the provided original response and multiple candidates.
 76 |         """
 77 |         best_response = original
 78 |         if self.use_best:
 79 |             all_responses = [original] + candidates
 80 | 
 81 |             self.clusterer = SemanticClusterer(nli=self.nli)
 82 |             _, response_probabilities = self.clusterer.compute_response_probabilities(logprobs_results=None, num_responses=len(all_responses))
 83 |             best_response, _, _, _ = self.clusterer.evaluate(responses=all_responses, response_probabilities=response_probabilities)
 84 | 
 85 |             candidates = all_responses.remove(best_response)
 86 |             self.available_nli_scores = self.clusterer.nli_scores
 87 | 
 88 |         nli_scores = {}
 89 |         for s_ in self.scorers:
 90 |             nli_scores[s_] = []
 91 |             for candidate in candidates:
 92 |                 if s_ in self.available_nli_scores:
 93 |                     if (candidate, best_response) in self.available_nli_scores[s_]:
 94 |                         nli_scores[s_].append(self.available_nli_scores[s_][(candidate, best_response)])
 95 |                         continue
 96 |                 nli_scores[s_].append(self.nli.get_nli_results(response1=best_response, response2=candidate)[s_ + "_score"])
 97 | 
 98 |         result = {n: np.mean(nli_scores[n]) for n in self.scorers}
 99 |         result.update({"candidates": candidates, "response": best_response})
100 |         return result
101 | 


--------------------------------------------------------------------------------
/tests/test_whiteboxuq.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2025 CVS Health and/or one of its affiliates
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | import pytest
 16 | import json
 17 | from uqlm.scorers import WhiteBoxUQ
 18 | from langchain_openai import AzureChatOpenAI
 19 | 
 20 | datafile_path = "tests/data/scorers/whitebox_results_file.json"
 21 | with open(datafile_path, "r") as f:
 22 |     expected_result = json.load(f)
 23 | # Expected scores ("data") and expected result metadata ("metadata") from the fixture file
 24 | data = expected_result["data"]
 25 | metadata = expected_result["metadata"]
 26 | # Inputs replayed by the mocked generation functions in the tests below
 27 | PROMPTS = data["prompts"]
 28 | MOCKED_RESPONSES = data["responses"]
 29 | MOCKED_LOGPROBS = data["logprobs"]
 30 | # Placeholder credentials and endpoint; the async tests below patch out response generation
 31 | mock_object = AzureChatOpenAI(deployment_name="YOUR-DEPLOYMENT", temperature=1.0, api_key="SECRET_API_KEY", api_version="2024-05-01-preview", azure_endpoint="https://mocked.endpoint.com")
 32 | 
 33 | 
 34 | @pytest.mark.asyncio
 35 | async def test_whiteboxuq_basic(monkeypatch):
 36 |     wbuq = WhiteBoxUQ(llm=mock_object, scorers=["normalized_probability", "min_probability"])
 37 |     # Replaces LLM generation: stores fixture logprobs on the scorer and returns fixture responses
 38 |     async def mock_generate_original_responses(*args, **kwargs):
 39 |         wbuq.logprobs = MOCKED_LOGPROBS
 40 |         return MOCKED_RESPONSES
 41 | 
 42 |     monkeypatch.setattr(wbuq, "generate_original_responses", mock_generate_original_responses)
 43 |     # Scores and metadata must match the fixture both without and with progress bars
 44 |     for show_progress_bars in [False, True]:
 45 |         results = await wbuq.generate_and_score(prompts=PROMPTS, show_progress_bars=show_progress_bars)
 46 | 
 47 |         for i in range(len(PROMPTS)):
 48 |             assert results.data["normalized_probability"][i] == pytest.approx(data["normalized_probability"][i])
 49 |             assert results.data["min_probability"][i] == pytest.approx(data["min_probability"][i])
 50 | 
 51 |         assert results.metadata == metadata
 52 | 
 53 | 
 54 | @pytest.mark.asyncio
 55 | async def test_whiteboxuq_top_logprobs(monkeypatch):
 56 |     wbuq = WhiteBoxUQ(llm=mock_object, scorers=["sequence_probability"])
 57 |     # Replaces LLM generation: stores fixture logprobs on the scorer and returns fixture responses
 58 |     async def mock_generate_original_responses(*args, **kwargs):
 59 |         wbuq.logprobs = MOCKED_LOGPROBS
 60 |         return MOCKED_RESPONSES
 61 | 
 62 |     monkeypatch.setattr(wbuq, "generate_original_responses", mock_generate_original_responses)
 63 |     # Smoke check: only the presence of the score key is asserted, not its values
 64 |     results = await wbuq.generate_and_score(prompts=PROMPTS, show_progress_bars=False)
 65 |     assert "sequence_probability" in results.data
 66 | 
 67 | 
 68 | @pytest.mark.asyncio
 69 | async def test_whiteboxuq_sampled_logprobs(monkeypatch):
 70 |     wbuq = WhiteBoxUQ(llm=mock_object, scorers=["monte_carlo_probability"])
 71 |     # Replaces LLM generation: stores fixture logprobs on the scorer and returns fixture responses
 72 |     async def mock_generate_original_responses(*args, **kwargs):
 73 |         wbuq.logprobs = MOCKED_LOGPROBS
 74 |         return MOCKED_RESPONSES
 75 |     # Replaces candidate sampling with canned logprobs and five identical responses per prompt
 76 |     async def mock_generate_candidate_responses(*args, **kwargs):
 77 |         wbuq.multiple_logprobs = [[[{"token": "Hello", "logprob": -0.1}]]] * len(PROMPTS)
 78 |         return [["Hello world"] * 5] * len(PROMPTS)
 79 | 
 80 |     monkeypatch.setattr(wbuq, "generate_original_responses", mock_generate_original_responses)
 81 |     monkeypatch.setattr(wbuq, "generate_candidate_responses", mock_generate_candidate_responses)
 82 |     # Smoke check: only the presence of the score key is asserted, not its values
 83 |     results = await wbuq.generate_and_score(prompts=PROMPTS, show_progress_bars=False)
 84 |     assert "monte_carlo_probability" in results.data
 85 | 
 86 | 
 87 | @pytest.mark.asyncio
 88 | async def test_whiteboxuq_p_true(monkeypatch):
 89 |     wbuq = WhiteBoxUQ(llm=mock_object, scorers=["p_true"])
 90 |     # Replaces LLM generation: stores fixture logprobs on the scorer and returns fixture responses
 91 |     async def mock_generate_original_responses(*args, **kwargs):
 92 |         wbuq.logprobs = MOCKED_LOGPROBS
 93 |         return MOCKED_RESPONSES
 94 |     # Replaces the p_true scorer's evaluate with a constant 0.9 for every prompt
 95 |     async def mock_p_true_evaluate(*args, **kwargs):
 96 |         return {"p_true": [0.9] * len(PROMPTS)}
 97 | 
 98 |     monkeypatch.setattr(wbuq, "generate_original_responses", mock_generate_original_responses)
 99 |     monkeypatch.setattr(wbuq.p_true_scorer, "evaluate", mock_p_true_evaluate)
100 |     # Smoke check: only the presence of the score key is asserted, not its values
101 |     results = await wbuq.generate_and_score(prompts=PROMPTS, show_progress_bars=False)
102 |     assert "p_true" in results.data
103 | 
104 | 
105 | def test_whiteboxuq_invalid_scorer():  # An unknown scorer name must be rejected at construction time
106 |     with pytest.raises(ValueError, match="Invalid scorer provided: invalid_scorer"):
107 |         WhiteBoxUQ(llm=mock_object, scorers=["invalid_scorer"])
108 | 
109 | 
110 | @pytest.mark.asyncio
111 | async def test_whiteboxuq_top_logprobs_full(monkeypatch):
112 |     wbuq = WhiteBoxUQ(llm=mock_object, scorers=["mean_token_negentropy"], top_k_logprobs=10)
113 |     # Replaces LLM generation: stores fixture logprobs on the scorer and returns fixture responses
114 |     async def mock_generate_original_responses(*args, **kwargs):
115 |         wbuq.logprobs = MOCKED_LOGPROBS
116 |         return MOCKED_RESPONSES
117 | 
118 |     monkeypatch.setattr(wbuq, "generate_original_responses", mock_generate_original_responses)
119 | 
120 |     # Optional: replace the scorer's evaluate (assigned directly, not via monkeypatch) so it returns a fixed 0.8 per prompt
121 |     wbuq.top_logprobs_scorer.evaluate = lambda logprobs_results: {"mean_token_negentropy": [0.8] * len(PROMPTS)}
122 |     # Smoke check: only the presence of the score key is asserted, not its values
123 |     results = await wbuq.generate_and_score(prompts=PROMPTS, show_progress_bars=False)
124 |     assert "mean_token_negentropy" in results.data
125 | 


--------------------------------------------------------------------------------
/uqlm/nli/nli.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2025 CVS Health and/or one of its affiliates
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | import numpy as np
 16 | import warnings
 17 | import torch
 18 | from typing import Any, Dict
 19 | from transformers import AutoTokenizer, AutoModelForSequenceClassification
 20 | from transformers import logging
 21 | 
 22 | from uqlm.utils.device import get_best_device
 23 | 
 24 | logging.set_verbosity_error()
 25 | 
 26 | 
 27 | class NLI:
 28 |     def __init__(self, device: Any = None, verbose: bool = False, nli_model_name: str = "microsoft/deberta-large-mnli", max_length: int = 2000) -> None:
 29 |         """
 30 |         A class to computing NLI-based confidence scores. This class offers two types of confidence scores, namely
 31 |         noncontradiction probability :footcite:`chen2023quantifyinguncertaintyanswerslanguage` and semantic entropy
 32 |         :footcite:`farquhar2024detectinghallucinations`.
 33 | 
 34 |         Parameters
 35 |         ----------
 36 |         device : torch.device input or torch.device object, default=None
 37 |             Specifies the device that classifiers use for prediction. Set to "cuda" for classifiers to be able to
 38 |             leverage the GPU.
 39 | 
 40 |         verbose : bool, default=False
 41 |             Specifies whether to print verbose status updates of NLI scoring process
 42 | 
 43 |         nli_model_name : str, default="microsoft/deberta-large-mnli"
 44 |             Specifies which NLI model to use. Must be acceptable input to AutoTokenizer.from_pretrained() and
 45 |             AutoModelForSequenceClassification.from_pretrained()
 46 | 
 47 |         max_length : int, default=2000
 48 |             Specifies the maximum allowed string length. Responses longer than this value will be truncated to
 49 |             avoid OutOfMemoryError
 50 |         """
 51 |         # Handle device detection
 52 |         if device is None:
 53 |             device = get_best_device()
 54 |         elif isinstance(device, str):
 55 |             device = torch.device(device)
 56 | 
 57 |         self.device = device
 58 |         self.verbose = verbose
 59 |         self.max_length = max_length
 60 |         self.tokenizer = AutoTokenizer.from_pretrained(nli_model_name)
 61 |         model = AutoModelForSequenceClassification.from_pretrained(nli_model_name)
 62 |         self.model = model.to(self.device) if self.device else model
 63 |         self.label_mapping = ["contradiction", "neutral", "entailment"]
 64 |         self.probabilities = dict()
 65 | 
 66 |     def predict(self, premise: str, hypothesis: str) -> Any:
 67 |         """
 68 |         This method compute probability of contradiction on the provide inputs.
 69 | 
 70 |         Parameters
 71 |         ----------
 72 |         premise : str
 73 |             An input for the sequence classification DeBERTa model.
 74 | 
 75 |         hypothesis : str
 76 |             An input for the sequence classification DeBERTa model.
 77 | 
 78 |         Returns
 79 |         -------
 80 |         numpy.ndarray
 81 |             Probabilities computed by NLI model
 82 |         """
 83 |         if len(premise) > self.max_length or len(hypothesis) > self.max_length:
 84 |             warnings.warn("Maximum response length exceeded for NLI comparison. Truncation will occur. To adjust, change the value of max_length")
 85 |         concat = premise[0 : self.max_length] + " [SEP] " + hypothesis[0 : self.max_length]
 86 |         encoded_inputs = self.tokenizer(concat, padding=True, return_tensors="pt")
 87 |         if self.device:
 88 |             encoded_inputs = {name: tensor.to(self.device) for name, tensor in encoded_inputs.items()}
 89 |         logits = self.model(**encoded_inputs).logits
 90 |         np_logits = logits.detach().cpu().numpy() if self.device else logits.detach().numpy()
 91 |         probabilites = np.exp(np_logits) / np.exp(np_logits).sum(axis=-1, keepdims=True)
 92 |         return probabilites
 93 | 
 94 |     def get_nli_results(self, response1: str, response2: str) -> Dict[str, Any]:
 95 |         """This method computes mean NLI score and determines whether entailment exists."""
 96 |         if response1 == response2:
 97 |             avg_noncontradiction_score, entailment, avg_entailment_score = 1, True, 1
 98 |         else:
 99 |             left = self.predict(premise=response1, hypothesis=response2)
100 |             left_label = self.label_mapping[left.argmax(axis=1)[0]]
101 | 
102 |             right = self.predict(premise=response2, hypothesis=response1)
103 |             right_label = self.label_mapping[right.argmax(axis=1)[0]]
104 |             s1, s2 = 1 - left[:, 0], 1 - right[:, 0]
105 | 
106 |             entailment = left_label == "entailment" or right_label == "entailment"
107 |             avg_noncontradiction_score = ((s1 + s2) / 2)[0]
108 |             avg_entailment_score = ((left[:, -1] + right[:, -1]) / 2)[0]
109 |             self.probabilities.update({f"{response1}_{response2}": left, f"{response2}_{response1}": right})
110 |         return {"noncontradiction_score": avg_noncontradiction_score, "entailment": entailment, "entailment_score": avg_entailment_score}
111 | 


--------------------------------------------------------------------------------
/tests/test_semanticdensity.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2025 CVS Health and/or one of its affiliates
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | import pytest
 16 | import json
 17 | from uqlm.scorers import SemanticDensity
 18 | from unittest.mock import AsyncMock, MagicMock
 19 | from uqlm.utils.results import UQResult
 20 | from langchain_openai import AzureChatOpenAI
 21 | 
 22 | datafile_path = "tests/data/scorers/semanticdensity_results_file.json"
 23 | with open(datafile_path, "r") as f:
 24 |     expected_result = json.load(f)
 25 | # Fixture inputs and expected values ("data") and expected result metadata ("metadata")
 26 | data = expected_result["data"]
 27 | metadata = expected_result["metadata"]
 28 | # Placeholder credentials and endpoint; test_semanticdensity patches out response generation
 29 | mock_object = AzureChatOpenAI(deployment_name="YOUR-DEPLOYMENT", temperature=1, api_key="SECRET_API_KEY", api_version="2024-05-01-preview", azure_endpoint="https://mocked.endpoint.com")
 30 | 
 31 | 
 32 | @pytest.mark.flaky(reruns=3)
 33 | @pytest.mark.asyncio
 34 | async def test_semanticdensity(monkeypatch):
 35 |     PROMPTS = data["prompts"]
 36 |     MOCKED_RESPONSES = data["responses"]
 37 |     MOCKED_SAMPLED_RESPONSES = data["sampled_responses"]
 38 | 
 39 |     # Instantiate the SemanticDensity scorer on CPU
 40 |     sd_object = SemanticDensity(llm=mock_object, device="cpu")
 41 |     # Replaces LLM generation; NOTE(review): the literal 5 presumably equals the number of fixture prompts -- confirm
 42 |     async def mock_generate_original_responses(*args, **kwargs):
 43 |         sd_object.logprobs = [None] * 5
 44 |         return MOCKED_RESPONSES
 45 |     # Replaces candidate sampling with the fixture's sampled responses and their logprobs
 46 |     async def mock_generate_candidate_responses(*args, **kwargs):
 47 |         sd_object.multiple_logprobs = data["multiple_logprobs"]
 48 |         return MOCKED_SAMPLED_RESPONSES
 49 | 
 50 |     monkeypatch.setattr(sd_object, "generate_original_responses", mock_generate_original_responses)
 51 |     monkeypatch.setattr(sd_object, "generate_candidate_responses", mock_generate_candidate_responses)
 52 |     # Values are checked on the score() result; metadata is checked on the generate_and_score() result
 53 |     for show_progress_bars in [True, False]:
 54 |         se_results = await sd_object.generate_and_score(prompts=PROMPTS, show_progress_bars=show_progress_bars)
 55 |         sd_object.logprobs = None
 56 |         sd_results = sd_object.score(responses=MOCKED_RESPONSES, sampled_responses=MOCKED_SAMPLED_RESPONSES)
 57 |         assert sd_results.data["responses"] == data["responses"]
 58 |         assert sd_results.data["sampled_responses"] == data["sampled_responses"]
 59 |         assert sd_results.data["prompts"] == data["prompts"]
 60 |         assert all([abs(sd_results.data["semantic_density_values"][i] - data["semantic_density_values"][i]) < 1e-5 for i in range(len(PROMPTS))])
 61 |         assert se_results.metadata == metadata
 62 | 
 63 | 
 64 | @pytest.mark.asyncio
 65 | async def test_generate_and_score_mocked():
 66 |     mock_llm = MagicMock()
 67 |     mock_llm.logprobs = True
 68 |     # NLI setup, progress display, generation and scoring are all replaced with mocks below
 69 |     semantic_density = SemanticDensity(llm=mock_llm)
 70 |     semantic_density._setup_nli = MagicMock()
 71 |     semantic_density._construct_progress_bar = MagicMock()
 72 |     semantic_density._display_generation_header = MagicMock()
 73 |     semantic_density.generate_original_responses = AsyncMock(return_value=["response1", "response2"])
 74 |     semantic_density.generate_candidate_responses = AsyncMock(return_value=[["sample1", "sample2"], ["sample3", "sample4"]])
 75 |     semantic_density.score = MagicMock(return_value=UQResult({"data": {}, "metadata": {}}))
 76 | 
 77 |     prompts = ["prompt1", "prompt2"]
 78 | 
 79 |     # Manually set prompts since score is mocked
 80 |     semantic_density.prompts = prompts
 81 | 
 82 |     result = await semantic_density.generate_and_score(prompts, num_responses=2)
 83 |     # The call must record its inputs and invoke each mocked stage exactly once
 84 |     assert isinstance(result, UQResult)
 85 |     assert semantic_density.prompts == prompts
 86 |     assert semantic_density.num_responses == 2
 87 |     semantic_density.generate_original_responses.assert_called_once_with(prompts, progress_bar=semantic_density.progress_bar)
 88 |     semantic_density.generate_candidate_responses.assert_called_once_with(prompts, num_responses=2, progress_bar=semantic_density.progress_bar)
 89 |     semantic_density.score.assert_called_once()
 90 | 
 91 | 
 92 | def test_score_mocked():
 93 |     semantic_density = SemanticDensity()
 94 |     semantic_density._semantic_density_process = MagicMock(return_value=("density_value", None))
 95 |     semantic_density._construct_progress_bar = MagicMock()
 96 |     semantic_density._display_scoring_header = MagicMock()
 97 |     semantic_density._stop_progress_bar = MagicMock()
 98 |     semantic_density._construct_black_box_return_data = MagicMock(return_value={})
 99 |     semantic_density.progress_bar = MagicMock()
100 |     semantic_density.progress_bar.add_task = MagicMock(return_value="task_id")
101 |     semantic_density.progress_bar.update = MagicMock()
102 | 
103 |     # Placeholder inputs passed to score(); the density computation itself is mocked above
104 |     responses = ["response1", "response2"]
105 |     sampled_responses = [["sample1", "sample2"], ["sample3", "sample4"]]
106 |     prompts = ["prompt1", "prompt2"]
107 |     sampled_logprobs_results = [["logprob1", "logprob2"], ["logprob3", "logprob4"]]
108 |     logprobs_results = [None, None]
109 | 
110 |     result = semantic_density.score(prompts=prompts, responses=responses, sampled_responses=sampled_responses, logprobs_results=logprobs_results, sampled_logprobs_results=sampled_logprobs_results)
111 |     # Only the output keys and the call into the density routine are asserted, not numeric values
112 |     assert "semantic_density_values" in result.data
113 |     assert "multiple_logprobs" in result.data
114 |     semantic_density._semantic_density_process.assert_called()
115 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
  1 | 
  2 | # Contributor Covenant Code of Conduct
  3 | 
  4 | ## Our Pledge
  5 | 
  6 | We as members, contributors, and leaders pledge to make participation in our
  7 | community a harassment-free experience for everyone, regardless of age, body
  8 | size, visible or invisible disability, ethnicity, sex characteristics, gender
  9 | identity and expression, level of experience, education, socio-economic status,
 10 | nationality, personal appearance, race, caste, color, religion, or sexual
 11 | identity and orientation.
 12 | 
 13 | We pledge to act and interact in ways that contribute to an open, welcoming,
 14 | diverse, inclusive, and healthy community.
 15 | 
 16 | ## Our Standards
 17 | 
 18 | Examples of behavior that contributes to a positive environment for our
 19 | community include:
 20 | 
 21 | * Demonstrating empathy and kindness toward other people
 22 | * Being respectful of differing opinions, viewpoints, and experiences
 23 | * Giving and gracefully accepting constructive feedback
 24 | * Accepting responsibility and apologizing to those affected by our mistakes,
 25 |   and learning from the experience
 26 | * Focusing on what is best not just for us as individuals, but for the overall
 27 |   community
 28 | 
 29 | Examples of unacceptable behavior include:
 30 | 
 31 | * The use of sexualized language or imagery, and sexual attention or advances of
 32 |   any kind
 33 | * Trolling, insulting or derogatory comments, and personal or political attacks
 34 | * Public or private harassment
 35 | * Publishing others' private information, such as a physical or email address,
 36 |   without their explicit permission
 37 | * Other conduct which could reasonably be considered inappropriate in a
 38 |   professional setting
 39 | 
 40 | ## Enforcement Responsibilities
 41 | 
 42 | Community leaders are responsible for clarifying and enforcing our standards of
 43 | acceptable behavior and will take appropriate and fair corrective action in
 44 | response to any behavior that they deem inappropriate, threatening, offensive,
 45 | or harmful.
 46 | 
 47 | Community leaders have the right and responsibility to remove, edit, or reject
 48 | comments, commits, code, wiki edits, issues, and other contributions that are
 49 | not aligned to this Code of Conduct, and will communicate reasons for moderation
 50 | decisions when appropriate.
 51 | 
 52 | ## Scope
 53 | 
 54 | This Code of Conduct applies within all community spaces, and also applies when
 55 | an individual is officially representing the community in public spaces.
 56 | Examples of representing our community include using an official email address,
 57 | posting via an official social media account, or acting as an appointed
 58 | representative at an online or offline event.
 59 | 
 60 | ## Enforcement
 61 | 
 62 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
 63 | reported to the community leaders responsible for enforcement at
 64 | dylan.bouchard@cvshealth.com.
 65 | All complaints will be reviewed and investigated promptly and fairly.
 66 | 
 67 | All community leaders are obligated to respect the privacy and security of the
 68 | reporter of any incident.
 69 | 
 70 | ## Enforcement Guidelines
 71 | 
 72 | Community leaders will follow these Community Impact Guidelines in determining
 73 | the consequences for any action they deem in violation of this Code of Conduct:
 74 | 
 75 | ### 1. Correction
 76 | 
 77 | **Community Impact**: Use of inappropriate language or other behavior deemed
 78 | unprofessional or unwelcome in the community.
 79 | 
 80 | **Consequence**: A private, written warning from community leaders, providing
 81 | clarity around the nature of the violation and an explanation of why the
 82 | behavior was inappropriate. A public apology may be requested.
 83 | 
 84 | ### 2. Warning
 85 | 
 86 | **Community Impact**: A violation through a single incident or series of
 87 | actions.
 88 | 
 89 | **Consequence**: A warning with consequences for continued behavior. No
 90 | interaction with the people involved, including unsolicited interaction with
 91 | those enforcing the Code of Conduct, for a specified period of time. This
 92 | includes avoiding interactions in community spaces as well as external channels
 93 | like social media. Violating these terms may lead to a temporary or permanent
 94 | ban.
 95 | 
 96 | ### 3. Temporary Ban
 97 | 
 98 | **Community Impact**: A serious violation of community standards, including
 99 | sustained inappropriate behavior.
100 | 
101 | **Consequence**: A temporary ban from any sort of interaction or public
102 | communication with the community for a specified period of time. No public or
103 | private interaction with the people involved, including unsolicited interaction
104 | with those enforcing the Code of Conduct, is allowed during this period.
105 | Violating these terms may lead to a permanent ban.
106 | 
107 | ### 4. Permanent Ban
108 | 
109 | **Community Impact**: Demonstrating a pattern of violation of community
110 | standards, including sustained inappropriate behavior, harassment of an
111 | individual, or aggression toward or disparagement of classes of individuals.
112 | 
113 | **Consequence**: A permanent ban from any sort of public interaction within the
114 | community.
115 | 
116 | ## Attribution
117 | 
118 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
119 | version 2.1, available at
120 | [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
121 | 
122 | Community Impact Guidelines were inspired by
123 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC].
124 | 
125 | For answers to common questions about this code of conduct, see the FAQ at
126 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
127 | [https://www.contributor-covenant.org/translations][translations].
128 | 
129 | [homepage]: https://www.contributor-covenant.org
130 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
131 | [Mozilla CoC]: https://github.com/mozilla/diversity
132 | [FAQ]: https://www.contributor-covenant.org/faq
133 | [translations]: https://www.contributor-covenant.org/translations
134 | 


--------------------------------------------------------------------------------
/docs/source/refs.bib:
--------------------------------------------------------------------------------
  1 | @misc{bouchard2025actionableframeworkassessingbias,
  2 |       title={An Actionable Framework for Assessing Bias and Fairness in Large Language Model Use Cases}, 
  3 |       author={Dylan Bouchard},
  4 |       year={2025},
  5 |       eprint={2407.10853},
  6 |       archivePrefix={arXiv},
  7 |       primaryClass={cs.CL},
  8 |       url={https://arxiv.org/abs/2407.10853}, 
  9 | }
 10 | 
 11 | # BLACK-BOX SCORERS
 12 | # Contradiction Probability
 13 | 
 14 | @misc{chen2023quantifyinguncertaintyanswerslanguage,
 15 |       title={Quantifying Uncertainty in Answers from any Language Model and Enhancing their Trustworthiness}, 
 16 |       author={Jiuhai Chen and Jonas Mueller},
 17 |       year={2023},
 18 |       eprint={2308.16175},
 19 |       archivePrefix={arXiv},
 20 |       primaryClass={cs.CL},
 21 |       url={https://arxiv.org/abs/2308.16175}, 
 22 | }
 23 | 
 24 | @misc{lin2024generatingconfidenceuncertaintyquantification,
 25 |       title={Generating with Confidence: Uncertainty Quantification for Black-box Large Language Models}, 
 26 |       author={Zhen Lin and Shubhendu Trivedi and Jimeng Sun},
 27 |       year={2024},
 28 |       eprint={2305.19187},
 29 |       archivePrefix={arXiv},
 30 |       primaryClass={cs.CL},
 31 |       url={https://arxiv.org/abs/2305.19187}, 
 32 | }
 33 | 
 34 | 
 35 | # Semantic Entropy
 36 | 
 37 | @misc{farquhar2024detectinghallucinations,
 38 |       title={Detecting Hallucinations in Large Language Models Using Semantic Entropy},
 39 |       author={Sebastian Farquhar and Jannik Kossen and Lorenz Kuhn and Yarin Gal},
 40 |       year={2024},
 41 |       url={https://doi.org/10.1038/s41586-024-07421-0},
 42 | }
 43 | 
 44 | @misc{kuhn2023semanticuncertaintylinguisticinvariances,
 45 |       title={Semantic Uncertainty: Linguistic Invariances for Uncertainty Estimation in Natural Language Generation}, 
 46 |       author={Lorenz Kuhn and Yarin Gal and Sebastian Farquhar},
 47 |       year={2023},
 48 |       eprint={2302.09664},
 49 |       archivePrefix={arXiv},
 50 |       primaryClass={cs.CL},
 51 |       url={https://arxiv.org/abs/2302.09664}, 
 52 | }
 53 | 
 54 | # Semantic Density
 55 | 
 56 | @misc{qiu2024semanticdensityuncertaintyquantification,
 57 |       title={Semantic Density: Uncertainty Quantification for Large Language Models through Confidence Measurement in Semantic Space}, 
 58 |       author={Xin Qiu and Risto Miikkulainen},
 59 |       year={2024},
 60 |       eprint={2405.13845},
 61 |       archivePrefix={arXiv},
 62 |       primaryClass={cs.CL},
 63 |       url={https://arxiv.org/abs/2405.13845}, 
 64 | }
 65 | 
 66 | # Exact Match
 67 | 
 68 | @misc{cole2023selectivelyansweringambiguousquestions,
 69 |       title={Selectively Answering Ambiguous Questions}, 
 70 |       author={Jeremy R. Cole and Michael J. Q. Zhang and Daniel Gillick and Julian Martin Eisenschlos and Bhuwan Dhingra and Jacob Eisenstein},
 71 |       year={2023},
 72 |       eprint={2305.14613},
 73 |       archivePrefix={arXiv},
 74 |       primaryClass={cs.CL},
 75 |       url={https://arxiv.org/abs/2305.14613}, 
 76 | }
 77 | 
 78 | # BERT-score
 79 | 
 80 | @misc{zhang2020bertscoreevaluatingtextgeneration,
 81 |       title={BERTScore: Evaluating Text Generation with BERT}, 
 82 |       author={Tianyi Zhang and Varsha Kishore and Felix Wu and Kilian Q. Weinberger and Yoav Artzi},
 83 |       year={2020},
 84 |       eprint={1904.09675},
 85 |       archivePrefix={arXiv},
 86 |       primaryClass={cs.CL},
 87 |       url={https://arxiv.org/abs/1904.09675}, 
 88 | }
 89 | 
 90 | # BLEURT-score
 91 | 
 92 | @misc{sellam2020bleurtlearningrobustmetrics,
 93 |       title={BLEURT: Learning Robust Metrics for Text Generation}, 
 94 |       author={Thibault Sellam and Dipanjan Das and Ankur P. Parikh},
 95 |       year={2020},
 96 |       eprint={2004.04696},
 97 |       archivePrefix={arXiv},
 98 |       primaryClass={cs.CL},
 99 |       url={https://arxiv.org/abs/2004.04696}, 
100 | }
101 | 
102 | # Cosine Similarity
103 | 
104 | @misc{shorinwa2024surveyuncertaintyquantificationlarge,
105 |       title={A Survey on Uncertainty Quantification of Large Language Models: Taxonomy, Open Research Challenges, and Future Directions}, 
106 |       author={Ola Shorinwa and Zhiting Mei and Justin Lidard and Allen Z. Ren and Anirudha Majumdar},
107 |       year={2024},
108 |       eprint={2412.05563},
109 |       archivePrefix={arXiv},
110 |       primaryClass={cs.CL},
111 |       url={https://arxiv.org/abs/2412.05563}, 
112 | }
113 | 
114 | # WHITE-BOX SCORERS
115 | 
116 | # Minimum Token Probability
117 | 
118 | @misc{manakul2023selfcheckgptzeroresourceblackboxhallucination,
119 |       title={SelfCheckGPT: Zero-Resource Black-Box Hallucination Detection for Generative Large Language Models}, 
120 |       author={Potsawee Manakul and Adian Liusie and Mark J. F. Gales},
121 |       year={2023},
122 |       eprint={2303.08896},
123 |       archivePrefix={arXiv},
124 |       primaryClass={cs.CL},
125 |       url={https://arxiv.org/abs/2303.08896}, 
126 | }
127 | 
128 | # Length-Normalized Joint Token Probability
129 | 
130 | @misc{malinin2021uncertaintyestimationautoregressivestructured,
131 |       title={Uncertainty Estimation in Autoregressive Structured Prediction}, 
132 |       author={Andrey Malinin and Mark Gales},
133 |       year={2021},
134 |       eprint={2002.07650},
135 |       archivePrefix={arXiv},
136 |       primaryClass={stat.ML},
137 |       url={https://arxiv.org/abs/2002.07650}, 
138 | }
139 | 
140 | # LLM-as-a-Judge Scorers
141 | 
142 | # Categorical LLM-as-a-Judge
143 | 
144 | @misc{luo2023chatgptfactualinconsistencyevaluator,
145 |       title={ChatGPT as a Factual Inconsistency Evaluator for Text Summarization}, 
146 |       author={Zheheng Luo and Qianqian Xie and Sophia Ananiadou},
147 |       year={2023},
148 |       eprint={2303.15621},
149 |       archivePrefix={arXiv},
150 |       primaryClass={cs.CL},
151 |       url={https://arxiv.org/abs/2303.15621}, 
152 | }
153 | 
154 | # Continuous LLM-as-a-Judge
155 | 
156 | # Panel of LLM Judges
157 | 
158 | @misc{verga2024replacingjudgesjuriesevaluating,
159 |       title={Replacing Judges with Juries: Evaluating LLM Generations with a Panel of Diverse Models}, 
160 |       author={Pat Verga and Sebastian Hofstatter and Sophia Althammer and Yixuan Su and Aleksandra Piktus and Arkady Arkhangorodsky and Minjie Xu and Naomi White and Patrick Lewis},
161 |       year={2024},
162 |       eprint={2404.18796},
163 |       archivePrefix={arXiv},
164 |       primaryClass={cs.CL},
165 |       url={https://arxiv.org/abs/2404.18796}, 
166 | }
167 | 


--------------------------------------------------------------------------------
/tests/test_sampled_logprobs.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | from unittest.mock import MagicMock, patch
  3 | from uqlm.white_box.sampled_logprobs import SampledLogprobsScorer, SAMPLED_LOGPROBS_SCORER_NAMES
  4 | 
  5 | 
  6 | @pytest.fixture
  7 | def scorer():
  8 |     """Fixture to create a SampledLogprobsScorer instance."""
  9 |     mock_llm = MagicMock()
 10 |     return SampledLogprobsScorer(llm=mock_llm)
 11 | 
 12 | 
 13 | def test_initialization(scorer):
 14 |     """Test the initialization of SampledLogprobsScorer."""
 15 |     assert scorer.scorers == SAMPLED_LOGPROBS_SCORER_NAMES
 16 |     assert scorer.llm is not None
 17 |     assert scorer.nli_model_name == "microsoft/deberta-large-mnli"
 18 |     assert scorer.max_length == 2000
 19 |     assert scorer.prompts_in_nli is True
 20 |     assert scorer.length_normalize is True
 21 | 
 22 | 
 23 | @pytest.mark.parametrize("scorer_name", SAMPLED_LOGPROBS_SCORER_NAMES)
 24 | def test_evaluate_with_mocked_scorers(scorer, scorer_name):
 25 |     """Test the evaluate method with mocked scorers."""
 26 |     responses = ["response1", "response2"]
 27 |     sampled_responses = [["sample1", "sample2"], ["sample3", "sample4"]]
 28 |     logprobs_results = [[{"logprob": -0.1}], [{"logprob": -0.2}]]
 29 |     sampled_logprobs_results = [[[{"logprob": -0.15}]], [[{"logprob": -0.25}]]]
 30 |     prompts = ["prompt1", "prompt2"]
 31 | 
 32 |     # Mock individual scorer methods
 33 |     with patch.object(scorer, "monte_carlo_probability", return_value=[0.5, 0.6]) as mock_mc, patch.object(scorer, "compute_consistency_confidence", return_value=[0.7, 0.8]) as mock_cc, patch.object(scorer, "compute_semantic_negentropy", return_value=[0.9, 1.0]) as mock_sn, patch.object(scorer, "compute_semantic_density", return_value=[1.1, 1.2]) as mock_sd:
 34 |         scorer.scorers = [scorer_name]
 35 |         result = scorer.evaluate(responses=responses, sampled_responses=sampled_responses, logprobs_results=logprobs_results, sampled_logprobs_results=sampled_logprobs_results, prompts=prompts)
 36 | 
 37 |         # Verify the correct scorer method was called
 38 |         if scorer_name == "monte_carlo_probability":
 39 |             mock_mc.assert_called_once()
 40 |         elif scorer_name == "consistency_and_confidence":
 41 |             mock_cc.assert_called_once()
 42 |         elif scorer_name == "semantic_negentropy":
 43 |             mock_sn.assert_called_once()
 44 |         elif scorer_name == "semantic_density":
 45 |             mock_sd.assert_called_once()
 46 | 
 47 |         # Verify the result contains the correct scorer output
 48 |         assert scorer_name in result
 49 |         assert isinstance(result[scorer_name], list)
 50 | 
 51 | 
 52 | def test_monte_carlo_probability(scorer):
 53 |     """Test the monte_carlo_probability method."""
 54 |     responses = ["response1", "response2"]
 55 |     logprobs_results = [[{"logprob": -0.1}], [{"logprob": -0.2}]]
 56 |     sampled_logprobs_results = [[[{"logprob": -0.15}]], [[{"logprob": -0.25}]]]
 57 | 
 58 |     # Mock _compute_single_generation_scores
 59 |     with patch.object(scorer, "_compute_single_generation_scores", return_value=[0.8, 0.9]):
 60 |         result = scorer.monte_carlo_probability(responses=responses, logprobs_results=logprobs_results, sampled_logprobs_results=sampled_logprobs_results)
 61 |         assert isinstance(result, list)
 62 |         assert len(result) == len(responses)
 63 | 
 64 | 
 65 | def test_compute_consistency_confidence(scorer):
 66 |     """Test the compute_consistency_confidence method."""
 67 |     responses = ["response1", "response2"]
 68 |     sampled_responses = [["sample1", "sample2"], ["sample3", "sample4"]]
 69 |     logprobs_results = [[{"logprob": -0.1}], [{"logprob": -0.2}]]
 70 | 
 71 |     # Mock CosineScorer and _compute_single_generation_scores
 72 |     with patch("uqlm.black_box.cosine.CosineScorer.evaluate", return_value=[0.5, 0.6]), patch.object(scorer, "_compute_single_generation_scores", return_value=[0.7, 0.8]):
 73 |         result = scorer.compute_consistency_confidence(responses=responses, sampled_responses=sampled_responses, logprobs_results=logprobs_results)
 74 |         assert isinstance(result, list)
 75 |         assert len(result) == len(responses)
 76 | 
 77 | 
 78 | def test_compute_semantic_negentropy(scorer):
 79 |     """Test the compute_semantic_negentropy method."""
 80 |     responses = ["response1", "response2"]
 81 |     prompts = ["prompt1", "prompt2"]
 82 |     sampled_responses = [["sample1", "sample2"], ["sample3", "sample4"]]
 83 |     logprobs_results = [[{"logprob": -0.1}], [{"logprob": -0.2}]]
 84 |     sampled_logprobs_results = [[[{"logprob": -0.15}]], [[{"logprob": -0.25}]]]
 85 | 
 86 |     # Mock SemanticEntropy
 87 |     with patch("uqlm.scorers.entropy.SemanticEntropy.score", return_value=MagicMock(to_dict=lambda: {"data": {"tokenprob_confidence_scores": [0.9, 1.0]}})):
 88 |         result = scorer.compute_semantic_negentropy(responses=responses, prompts=prompts, sampled_responses=sampled_responses, logprobs_results=logprobs_results, sampled_logprobs_results=sampled_logprobs_results)
 89 |         assert isinstance(result, list)
 90 |         assert len(result) == len(responses)
 91 | 
 92 | 
 93 | def test_compute_semantic_density(scorer):
 94 |     """Test the compute_semantic_density method."""
 95 |     responses = ["response1", "response2"]
 96 |     sampled_responses = [["sample1", "sample2"], ["sample3", "sample4"]]
 97 |     logprobs_results = [[{"logprob": -0.1}], [{"logprob": -0.2}]]
 98 |     sampled_logprobs_results = [[[{"logprob": -0.15}]], [[{"logprob": -0.25}]]]
 99 |     prompts = ["prompt1", "prompt2"]
100 | 
101 |     # Mock the semantic_negentropy_scorer and its clusterer
102 |     mock_clusterer = MagicMock()
103 |     mock_clusterer.nli.probabilities = [0.1, 0.2]
104 |     mock_semantic_negentropy_scorer = MagicMock()
105 |     mock_semantic_negentropy_scorer.clusterer = mock_clusterer
106 | 
107 |     # Assign the mocked semantic_negentropy_scorer to the scorer
108 |     scorer.semantic_negentropy_scorer = mock_semantic_negentropy_scorer
109 | 
110 |     # Mock SemanticDensity
111 |     with patch("uqlm.scorers.density.SemanticDensity.score", return_value=MagicMock(to_dict=lambda: {"data": {"semantic_density_values": [1.1, 1.2]}})):
112 |         result = scorer.compute_semantic_density(responses=responses, sampled_responses=sampled_responses, logprobs_results=logprobs_results, sampled_logprobs_results=sampled_logprobs_results, prompts=prompts)
113 |         assert isinstance(result, list)
114 |         assert len(result) == len(responses)
115 |         assert result == [1.1, 1.2]
116 | 


--------------------------------------------------------------------------------
/uqlm/nli/cluster.py:
--------------------------------------------------------------------------------
  1 | from collections import deque, Counter
  2 | from typing import Any, Dict, List, Tuple
  3 | from uqlm.nli.nli import NLI
  4 | import numpy as np
  5 | 
  6 | 
  7 | class SemanticClusterer:
  8 |     def __init__(self, nli: NLI = None, length_normalize: bool = False):
  9 |         self.nli = nli
 10 |         self.length_normalize = length_normalize
 11 |         self.nli_scores = {"noncontradiction": dict(), "entailment": dict()}
 12 | 
 13 |     def evaluate(self, responses: List[str], prompt: str = None, response_probabilities: List[float] = None) -> Tuple[str, List[List[str]], List[float], Dict[Tuple[str, str], float]]:
 14 |         """
 15 |         Evaluate the cluster of responses.
 16 |         """
 17 |         clustered_responses, cluster_indices, noncontradiction_scores, entailment_scores = self.cluster_responses(responses=responses, prompt=prompt)
 18 |         self.nli_scores["noncontradiction"].update(noncontradiction_scores)
 19 |         self.nli_scores["entailment"].update(entailment_scores)
 20 |         cluster_probabilities = self.compute_cluster_probabilities(response_probabilities=response_probabilities, cluster_indices=cluster_indices)
 21 |         best_response = self.best_response_selection(clustered_responses=clustered_responses, cluster_probabilities=cluster_probabilities)
 22 |         return best_response, clustered_responses, cluster_probabilities, cluster_indices
 23 | 
 24 |     def cluster_responses(self, responses: List[str], prompt: str = None) -> Any:
 25 |         """
 26 |         This method create clusters from a list of responses based on the semantic meaning of each response.
 27 | 
 28 |         Parameters
 29 |         ----------
 30 |         responses : list of str, default=None
 31 |             A list of model responses
 32 | 
 33 |         prompt : str, default=None
 34 |             A prompt for the responses.
 35 | 
 36 |         Returns
 37 |         ----------
 38 |         A list of lists, where each list represents a cluster.
 39 |         """
 40 |         clusters, cluster_indices = [deque([responses[0]])], [deque([0])]
 41 |         noncontradiction_scores = {}
 42 |         entailments = {}
 43 |         entailment_scores = {}
 44 |         for i in range(1, len(responses)):
 45 |             new_cluster_indicator = True
 46 |             for j, cluster in enumerate(clusters):
 47 |                 text1 = f"{prompt}\n{cluster[0]}" if prompt else cluster[0]
 48 |                 text2 = f"{prompt}\n{responses[i]}" if prompt else responses[i]
 49 |                 key, rev_key = (text1, text2), (text2, text1)
 50 |                 if key in noncontradiction_scores:
 51 |                     # Do not recompute if pair already assessed
 52 |                     entailment = entailments[key]
 53 |                 else:
 54 |                     # Compute nli score and entailment if pair not yet assessed
 55 |                     nli_result = self.nli.get_nli_results(response1=text1, response2=text2)
 56 |                     noncontradiction_score, entailment, entailment_score = nli_result["noncontradiction_score"], nli_result["entailment"], nli_result["entailment_score"]
 57 |                     noncontradiction_scores[key], noncontradiction_scores[rev_key] = noncontradiction_score, noncontradiction_score
 58 |                     entailments[key], entailments[rev_key] = entailment, entailment
 59 |                     entailment_scores[key], entailment_scores[rev_key] = entailment_score, entailment_score
 60 |                 if entailment:
 61 |                     new_cluster_indicator = False
 62 |                     cluster.append(responses[i])
 63 |                     cluster_indices[j].append(i)
 64 | 
 65 |             if new_cluster_indicator:
 66 |                 clusters.append(deque([responses[i]]))
 67 |                 cluster_indices.append(deque([i]))
 68 | 
 69 |         # Arrange cluster so that first element is mode (if exists) else longest
 70 |         clusters = [self._sort_responses(list(cluster)) for cluster in clusters]
 71 | 
 72 |         return clusters, cluster_indices, noncontradiction_scores, entailment_scores
 73 | 
 74 |     def compute_response_probabilities(self, logprobs_results: List[List[Dict[str, Any]]], num_responses: int = None) -> List[float]:
 75 |         """Compute response probabilities"""
 76 |         uniform_response_probabilities = [1 / num_responses] * num_responses
 77 |         tokenprob_response_probabilities = [self.length_norm_sequence_prob(logprobs_i, self.length_normalize) if logprobs_i else np.nan for logprobs_i in logprobs_results] if logprobs_results else None
 78 |         return tokenprob_response_probabilities, uniform_response_probabilities
 79 | 
 80 |     def compute_cluster_probabilities(self, response_probabilities: List[float], cluster_indices: List[List[int]]) -> List[float]:
 81 |         """Compute cluster probabilities"""
 82 |         cluster_probabilities = [0] * len(cluster_indices)
 83 |         for i, cluster_index in enumerate(cluster_indices):
 84 |             cluster_probabilities[i] = sum([response_probabilities[j] for j in cluster_index])
 85 |         return self._normalize_cluster_probabilities(cluster_probabilities=cluster_probabilities)
 86 | 
 87 |     @staticmethod
 88 |     def length_norm_sequence_prob(logprobs: List[Dict[str, Any]], length_normalize: bool = True) -> float:
 89 |         "Compute length normalized sequence logprob"
 90 |         factor = 1 / len(logprobs) if length_normalize else 1
 91 |         return np.exp(np.sum([d["logprob"] for d in logprobs]) * factor)
 92 | 
 93 |     @staticmethod
 94 |     def best_response_selection(clustered_responses: List[List[str]], cluster_probabilities: List[float]) -> str:
 95 |         """Select the best response from the clustered responses based on the cluster probabilities"""
 96 |         return clustered_responses[cluster_probabilities.index(max(cluster_probabilities))][0]
 97 | 
 98 |     @staticmethod
 99 |     def _normalize_cluster_probabilities(cluster_probabilities: List[float]) -> float:
100 |         """Normalize cluster probabilities"""
101 |         total_probability = sum(cluster_probabilities)
102 |         return [cp_i / total_probability for cp_i in cluster_probabilities]
103 | 
104 |     @staticmethod
105 |     def _sort_responses(responses: List[str]) -> List[str]:
106 |         """Sorts responses in a cluster"""
107 |         counter = Counter(responses)
108 |         mode_str, count = counter.most_common(1)[0]
109 |         if count > 1:
110 |             return sorted(responses, key=lambda x: (x != mode_str, x))
111 |         else:
112 |             return sorted(responses, key=len, reverse=True)
113 | 


--------------------------------------------------------------------------------
/tests/data/scorers/ensemble_results_file.json:
--------------------------------------------------------------------------------
1 | {"ensemble1": {"data": {"prompts": ["When you solve this math problem only return the answer with no additional text.\n7 red peaches, 15 yellow peaches and 8 green peaches are in the basket. How many peaches are in the basket?", "When you solve this math problem only return the answer with no additional text.\nJack received 3 emails and 64 letters in the morning. He then received 5 emails and 54 letters in the afternoon. How many emails did jack receive in the day?", "When you solve this math problem only return the answer with no additional text.\nEd had 2 more marbles than Doug. Doug lost some of his marbles at the playground. Now Ed has 19 more marbles than doug. How many marbles did Doug lose?", "When you solve this math problem only return the answer with no additional text.\nDan has $ 3 left with him after he bought a candy bar for $ 2. How much money did he have initially?", "When you solve this math problem only return the answer with no additional text.\nJake has 13 fewer peaches and 3 more apples than Steven. Steven has 9 peaches and 8 apples. 
How many apples does Jake have?"], "responses": ["30", "8", "17", "5", "11"], "sampled_responses": [["30", "30", "30", "30", "30"], ["8", "8", "8", "8", "8"], ["17", "17", "17", "17", "17"], ["5", "5", "5", "5", "5"], ["11", "11", "11", "11", "11"]], "ensemble_scores": [0.9999998323932312, 0.9999993853802055, 0.9997710794982175, 0.7865512731195814, 0.9999998323932312], "exact_match": [1.0, 1.0, 1.0, 1.0, 1.0], "noncontradiction": [1.0, 1.0, 1.0, 1.0, 1.0], "min_probability": [0.9999993295729247, 0.9999975415208221, 0.99908431799287, 0.14620509247832553, 0.9999993295729247], "judge_1": [1.0, 1.0, 1.0, 1.0, 1.0]}, "metadata": {"temperature": 1.0, "sampling_temperature": 1.0, "num_responses": 5, "thresh": 0.5, "weights": [0.25, 0.25, 0.25, 0.25], "logprobs": [[{"token": "30", "bytes": [51, 48], "logprob": -6.704273e-07, "top_logprobs": []}], [{"token": "8", "bytes": [56], "logprob": -2.4584822e-06, "top_logprobs": []}], [{"token": "17", "bytes": [49, 55], "logprob": -0.0009161015, "top_logprobs": []}], [{"token": "5", "bytes": [53], "logprob": -1.9227449, "top_logprobs": []}], [{"token": "11", "bytes": [49, 49], "logprob": -6.704273e-07, "top_logprobs": []}]]}}, "bsdetector": {"data": {"prompts": ["When you solve this math problem only return the answer with no additional text.\n7 red peaches, 15 yellow peaches and 8 green peaches are in the basket. How many peaches are in the basket?", "When you solve this math problem only return the answer with no additional text.\nJack received 3 emails and 64 letters in the morning. He then received 5 emails and 54 letters in the afternoon. How many emails did jack receive in the day?", "When you solve this math problem only return the answer with no additional text.\nEd had 2 more marbles than Doug. Doug lost some of his marbles at the playground. Now Ed has 19 more marbles than doug. 
How many marbles did Doug lose?", "When you solve this math problem only return the answer with no additional text.\nDan has $ 3 left with him after he bought a candy bar for $ 2. How much money did he have initially?", "When you solve this math problem only return the answer with no additional text.\nJake has 13 fewer peaches and 3 more apples than Steven. Steven has 9 peaches and 8 apples. How many apples does Jake have?"], "responses": ["30", "8", "17", "5", "11"], "sampled_responses": [["30", "30", "30", "30", "30"], ["8", "8", "8", "8", "8"], ["17", "17", "17", "17", "17"], ["5", "5", "5", "5", "5"], ["11", "11", "11", "11", "11"]], "ensemble_scores": [1.0, 1.0, 1.0, 1.0, 1.0], "noncontradiction": [1.0, 1.0, 1.0, 1.0, 1.0], "exact_match": [1.0, 1.0, 1.0, 1.0, 1.0], "judge_1": [1.0, 1.0, 1.0, 1.0, 1.0]}, "metadata": {"temperature": 1.0, "sampling_temperature": 1.0, "num_responses": 5, "thresh": 0.5, "weights": [0.5599999999999999, 0.13999999999999999, 0.3], "logprobs": [[{"token": "30", "bytes": [51, 48], "logprob": -6.704273e-07, "top_logprobs": []}], [{"token": "8", "bytes": [56], "logprob": -3.1737043e-06, "top_logprobs": []}], [{"token": "17", "bytes": [49, 55], "logprob": -0.0009161015, "top_logprobs": []}], [{"token": "$", "bytes": [36], "logprob": -0.080879845, "top_logprobs": []}, {"token": "5", "bytes": [53], "logprob": -0.001702437, "top_logprobs": []}], [{"token": "11", "bytes": [49, 49], "logprob": -5.5122365e-07, "top_logprobs": []}]]}}, "ensemble2": {"data": {"prompts": ["When you solve this math problem only return the answer with no additional text.\n7 red peaches, 15 yellow peaches and 8 green peaches are in the basket. How many peaches are in the basket?", "When you solve this math problem only return the answer with no additional text.\nJack received 3 emails and 64 letters in the morning. He then received 5 emails and 54 letters in the afternoon. 
How many emails did jack receive in the day?", "When you solve this math problem only return the answer with no additional text.\nEd had 2 more marbles than Doug. Doug lost some of his marbles at the playground. Now Ed has 19 more marbles than doug. How many marbles did Doug lose?", "When you solve this math problem only return the answer with no additional text.\nDan has $ 3 left with him after he bought a candy bar for $ 2. How much money did he have initially?", "When you solve this math problem only return the answer with no additional text.\nJake has 13 fewer peaches and 3 more apples than Steven. Steven has 9 peaches and 8 apples. How many apples does Jake have?"], "responses": ["30", "8", "17", "5", "11"], "sampled_responses": [null, null, null, null, null], "ensemble_scores": [0.9999996647864624, 0.999998770760411, 0.5911959418126744, 0.9399798095276081, 0.9999996647864624], "min_probability": [0.9999993295729247, 0.9999975415208221, 0.1823918836253489, 0.8799596190552161, 0.9999993295729247], "judge_1": [1.0, 1.0, 1.0, 1.0, 1.0]}, "metadata": {"temperature": 1.0, "sampling_temperature": 1.0, "num_responses": 5, "thresh": 0.5, "weights": [0.5, 0.5], "logprobs": [[{"token": "30", "bytes": [51, 48], "logprob": -6.704273e-07, "top_logprobs": []}], [{"token": "8", "bytes": [56], "logprob": -2.4584822e-06, "top_logprobs": []}], [{"token": "17", "bytes": [49, 55], "logprob": -0.0012972581, "top_logprobs": []}, {"token": " mar", "bytes": [32, 109, 97, 114], "logprob": -1.7015977, "top_logprobs": []}, {"token": "bles", "bytes": [98, 108, 101, 115], "logprob": 0.0, "top_logprobs": []}, {"token": ".", "bytes": [46], "logprob": -0.69366264, "top_logprobs": []}], [{"token": "$", "bytes": [36], "logprob": -0.12787926, "top_logprobs": []}, {"token": "5", "bytes": [53], "logprob": -0.0036003059, "top_logprobs": []}], [{"token": "11", "bytes": [49, 49], "logprob": -6.704273e-07, "top_logprobs": []}]]}}}


--------------------------------------------------------------------------------