├── .devcontainer ├── Dockerfile └── devcontainer.json ├── .env.sample ├── .github ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── ISSUE_TEMPLATE.md ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yaml └── workflows │ ├── azure-dev.yaml │ ├── bicep-audit.yml │ └── python.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .vscode ├── launch.json └── settings.json ├── LICENSE.md ├── README.md ├── azure.yaml ├── docs ├── screenshot_compare.png └── screenshot_summary.png ├── dontknows.config.json ├── example_config.json ├── example_input ├── prompt_ignoresources.txt ├── prompt_nomarkdownmention.txt ├── prompt_piglatin.txt ├── prompt_refined.txt ├── prompt_refined_trimmed.txt ├── prompt_ungrounded.txt ├── prompt_weak.txt ├── qa.jsonl └── qa_dontknows.jsonl ├── example_results ├── baseline │ ├── config.json │ ├── eval_results.jsonl │ ├── evaluate_parameters.json │ └── summary.json ├── baseline2 │ ├── config.json │ ├── eval_results.jsonl │ ├── evaluate_parameters.json │ └── summary.json ├── prompt_nomarkdownmention │ ├── config.json │ ├── eval_results.jsonl │ ├── evaluate_parameters.json │ └── summary.json └── prompt_nomarkdownmention2 │ ├── config.json │ ├── eval_results.jsonl │ ├── evaluate_parameters.json │ └── summary.json ├── infra ├── core │ ├── ai │ │ └── cognitiveservices.bicep │ └── security │ │ └── role.bicep ├── main.bicep └── main.parameters.json ├── pyproject.toml ├── src └── evaltools │ ├── __init__.py │ ├── __main__.py │ ├── cli.py │ ├── eval │ ├── __init__.py │ ├── evaluate.py │ └── evaluate_metrics │ │ ├── __init__.py │ │ ├── base_metric.py │ │ ├── builtin_metrics.py │ │ ├── code_metrics.py │ │ ├── prompt_metrics.py │ │ └── prompts │ │ ├── dontknowness.prompty │ │ ├── mycoherence.prompty │ │ ├── mygroundedness.prompty │ │ └── myrelevance.prompty │ ├── gen │ ├── __init__.py │ └── generate.py │ ├── review │ ├── __init__.py │ ├── answers.html │ ├── diff_app.py │ ├── diff_app.tcss │ ├── diff_markdown.py │ ├── parameters_screen.tcss │ ├── requirements.txt │ ├── summary_app.py │ ├── summary_app.tcss │ ├── summary_markdown.py │ └── utils.py │ └── service_setup.py └── tests ├── test_evaluate.py └── test_evaluate_metrics.py /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM mcr.microsoft.com/devcontainers/python:3.11-bullseye 2 | 3 | # Install pip for Python 3.11 4 | RUN python -m pip install --upgrade pip 5 | 6 | # Necessary for promptflow keyring to work on Linux with dbus backend 7 | RUN sudo apt-get update 8 | RUN sudo apt-get install -y gcc cmake pkg-config libdbus-1-dev libglib2.0-dev 9 | RUN pip install keyrings.alt dbus-python 10 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "AI RAG Chat Evaluator", 3 | "build": { 4 | "dockerfile": "Dockerfile", 5 | "context": ".." 
6 | }, 7 | "features": { 8 | "ghcr.io/azure/azure-dev/azd:latest": {} 9 | }, 10 | "customizations": { 11 | "vscode": { 12 | "extensions": [ 13 | "ms-azuretools.vscode-bicep", 14 | "ms-python.python" 15 | ] 16 | } 17 | }, 18 | "remoteUser": "vscode", 19 | "hostRequirements": { 20 | "memory": "8gb" 21 | }, 22 | "postCreateCommand": "pip install -e .\"[dev]\"" 23 | } 24 | -------------------------------------------------------------------------------- /.env.sample: -------------------------------------------------------------------------------- 1 | OPENAI_HOST="azure" 2 | OPENAI_GPT_MODEL="gpt-4" 3 | # For Azure OpenAI only: 4 | AZURE_OPENAI_EVAL_DEPLOYMENT="" 5 | AZURE_OPENAI_ENDPOINT="https://.openai.azure.com" 6 | AZURE_OPENAI_KEY="" 7 | AZURE_OPENAI_TENANT_ID="" 8 | # For openai.com only: 9 | OPENAICOM_KEY="" 10 | OPENAICOM_ORGANIZATION="" 11 | # For generating QA based on search index: 12 | AZURE_SEARCH_ENDPOINT="https://.search.windows.net" 13 | AZURE_SEARCH_INDEX="" 14 | AZURE_SEARCH_KEY="" 15 | AZURE_SEARCH_TENANT_ID="" 16 | -------------------------------------------------------------------------------- /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to [project-title] 2 | 3 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 4 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 5 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 6 | 7 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 8 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 9 | provided by the bot. You will only need to do this once across all repos using our CLA. 10 | 11 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 12 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 13 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 14 | 15 | - [Code of Conduct](#coc) 16 | - [Issues and Bugs](#issue) 17 | - [Feature Requests](#feature) 18 | - [Submission Guidelines](#submit) 19 | 20 | ## Code of Conduct 21 | Help us keep this project open and inclusive. Please read and follow our [Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 22 | 23 | ## Found an Issue? 24 | If you find a bug in the source code or a mistake in the documentation, you can help us by 25 | [submitting an issue](#submit-issue) to the GitHub Repository. Even better, you can 26 | [submit a Pull Request](#submit-pr) with a fix. 27 | 28 | ## Want a Feature? 
29 | You can *request* a new feature by [submitting an issue](#submit-issue) to the GitHub
30 | Repository. If you would like to *implement* a new feature, please submit an issue with
31 | a proposal for your work first, to be sure that we can use it.
32 |
33 | * **Small Features** can be crafted and directly [submitted as a Pull Request](#submit-pr).
34 |
35 | ## Submission Guidelines
36 |
37 | ### Submitting an Issue
38 | Before you submit an issue, search the archive; your question may already have been answered.
39 |
40 | If your issue appears to be a bug, and hasn't been reported, open a new issue.
41 | Help us to maximize the effort we can spend fixing issues and adding new
42 | features, by not reporting duplicate issues. Providing the following information will increase the
43 | chances of your issue being dealt with quickly:
44 |
45 | * **Overview of the Issue** - if an error is being thrown a non-minified stack trace helps
46 | * **Version** - what version is affected (e.g. 0.1.2)
47 | * **Motivation for or Use Case** - explain what you are trying to do and why the current behavior is a bug for you
48 | * **Browsers and Operating System** - is this a problem with all browsers?
49 | * **Reproduce the Error** - provide a live example or an unambiguous set of steps
50 | * **Related Issues** - has a similar issue been reported before?
51 | * **Suggest a Fix** - if you can't fix the bug yourself, perhaps you can point to what might be
52 | causing the problem (line of code or commit)
53 |
54 | You can file new issues by providing the above information at the corresponding repository's issues link: https://github.com/[organization-name]/[repository-name]/issues/new.
55 |
56 | ### Submitting a Pull Request (PR)
57 | Before you submit your Pull Request (PR), consider the following guidelines:
58 |
59 | * Search the repository for an open or closed PR that relates to your submission. You don't want to duplicate effort.
60 | * Make your changes in a new git fork
61 | * Install the development tools and pre-commit hooks:
62 |
63 | ```shell
64 | python3 -m pip install -e ."[dev]"
65 | pre-commit install
66 | ```
67 |
68 | * Commit your changes using a descriptive commit message
69 | * Push your branch to GitHub
70 | * In GitHub, create a pull request and request a review
71 |
72 | That's it! Thank you for your contribution!
73 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: --------------------------------------------------------------------------------
1 | 4 | > Please provide us with the following information:
5 | > ---------------------------------------------------------------
6 |
7 | ### This issue is for a: (mark with an `x`)
8 | ```
9 | - [ ] bug report -> please search issues before submitting
10 | - [ ] feature request
11 | - [ ] documentation issue or request
12 | - [ ] regression (a behavior that used to work and stopped in a new release)
13 | ```
14 |
15 | ### Minimal steps to reproduce
16 | >
17 |
18 | ### Any log messages given by the failure
19 | >
20 |
21 | ### Expected/desired behavior
22 | >
23 |
24 | ### OS and Version?
25 | > Windows 7, 8 or 10. Linux (which distribution). macOS (Yosemite? El Capitan? Sierra?)
26 |
27 | ### Versions
28 | >
29 |
30 | ### Mention any other details that might be useful
31 |
32 | > ---------------------------------------------------------------
33 | > Thanks! We'll be in touch soon.
34 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Purpose 2 | 3 | * ... 4 | 5 | ## Does this introduce a breaking change? 6 | 7 | ``` 8 | [ ] Yes 9 | [ ] No 10 | ``` 11 | 12 | ## Pull Request Type 13 | What kind of change does this Pull Request introduce? 14 | 15 | 16 | ``` 17 | [ ] Bugfix 18 | [ ] Feature 19 | [ ] Code style update (formatting, local variables) 20 | [ ] Refactoring (no functional changes, no api changes) 21 | [ ] Documentation content changes 22 | [ ] Other... Please describe: 23 | ``` 24 | 25 | ## How to Test 26 | * Get the code 27 | 28 | ``` 29 | git clone [repo-address] 30 | cd [repo-name] 31 | git checkout [branch-name] 32 | npm install 33 | ``` 34 | 35 | * Test the code 36 | 37 | ``` 38 | ``` 39 | 40 | ## What to Check 41 | Verify that the following are valid 42 | * ... 43 | 44 | ## Other Information 45 | -------------------------------------------------------------------------------- /.github/dependabot.yaml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | 9 | - package-ecosystem: "github-actions" 10 | directory: "/" 11 | schedule: 12 | interval: "weekly" 13 | 14 | - package-ecosystem: "pip" 15 | directory: "/" 16 | schedule: 17 | interval: "weekly" 18 | -------------------------------------------------------------------------------- /.github/workflows/azure-dev.yaml: -------------------------------------------------------------------------------- 1 | name: Test azd deployment 2 | on: 3 | workflow_dispatch: 4 | push: 5 | # Run when commits are pushed to mainline branch (main or master) 6 | # Set this to the mainline branch you are using 7 | branches: 8 | - main 9 | 10 | # GitHub Actions workflow to deploy to Azure using azd 11 | # To configure required secrets for connecting to Azure, simply run `azd pipeline config` 12 | 13 | # Set up permissions for deploying with secretless Azure federated credentials 14 | # https://learn.microsoft.com/en-us/azure/developer/github/connect-from-azure?tabs=azure-portal%2Clinux#set-up-azure-login-with-openid-connect-authentication 15 | permissions: 16 | id-token: write 17 | contents: read 18 | 19 | jobs: 20 | build: 21 | if: github.repository == 'Azure-samples/ai-rag-chat-evaluator' 22 | runs-on: ubuntu-latest 23 | env: 24 | # azd required 25 | AZURE_CLIENT_ID: ${{ vars.AZURE_CLIENT_ID }} 26 | AZURE_TENANT_ID: ${{ vars.AZURE_TENANT_ID }} 27 | AZURE_SUBSCRIPTION_ID: ${{ vars.AZURE_SUBSCRIPTION_ID }} 28 | AZURE_CREDENTIALS: ${{ secrets.AZURE_CREDENTIALS }} 29 | # project specific 30 | OPENAI_HOST: ${{ vars.OPENAI_HOST }} 31 | AZURE_OPENAI_SERVICE: ${{ vars.AZURE_OPENAI_SERVICE }} 32 | AZURE_OPENAI_RESOURCE_GROUP: ${{ vars.AZURE_OPENAI_RESOURCE_GROUP }} 33 | EVAL_GPT_DEPLOYMENT_CAPACITY: ${{ vars.EVAL_GPT_DEPLOYMENT_CAPACITY }} 34 | OPENAI_ORGANIZATION: ${{ vars.OPENAI_ORGANIZATION }} 35 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 36 | steps: 37 | - name: Checkout 38 | uses: actions/checkout@v4 39 | 40 | - name: Install azd 41 | uses: Azure/setup-azd@v2.1.0 42 | 43 | - name: 
Log in with Azure (Federated Credentials) 44 | if: ${{ env.AZURE_CLIENT_ID != '' }} 45 | run: | 46 | azd auth login ` 47 | --client-id "$Env:AZURE_CLIENT_ID" ` 48 | --federated-credential-provider "github" ` 49 | --tenant-id "$Env:AZURE_TENANT_ID" 50 | shell: pwsh 51 | 52 | - name: Log in with Azure (Client Credentials) 53 | if: ${{ env.AZURE_CREDENTIALS != '' }} 54 | run: | 55 | $info = $Env:AZURE_CREDENTIALS | ConvertFrom-Json -AsHashtable; 56 | Write-Host "::add-mask::$($info.clientSecret)" 57 | 58 | azd auth login ` 59 | --client-id "$($info.clientId)" ` 60 | --client-secret "$($info.clientSecret)" ` 61 | --tenant-id "$($info.tenantId)" 62 | shell: pwsh 63 | env: 64 | AZURE_CREDENTIALS: ${{ secrets.AZURE_CREDENTIALS }} 65 | 66 | - name: Provision Infrastructure 67 | run: azd provision --no-prompt 68 | env: 69 | AZURE_ENV_NAME: ${{ vars.AZURE_ENV_NAME }} 70 | AZURE_LOCATION: ${{ vars.AZURE_LOCATION }} 71 | AZURE_SUBSCRIPTION_ID: ${{ vars.AZURE_SUBSCRIPTION_ID }} 72 | 73 | - name: Deploy Application 74 | run: azd deploy --no-prompt 75 | env: 76 | AZURE_ENV_NAME: ${{ vars.AZURE_ENV_NAME }} 77 | AZURE_LOCATION: ${{ vars.AZURE_LOCATION }} 78 | AZURE_SUBSCRIPTION_ID: ${{ vars.AZURE_SUBSCRIPTION_ID }} 79 | 80 | - name: Setup python 81 | uses: actions/setup-python@v5 82 | with: 83 | python-version: 3.11 84 | architecture: x64 85 | 86 | - name: Install PromptFlow dbus dependency 87 | run: | 88 | sudo apt-get update 89 | sudo apt-get install -y gcc cmake pkg-config libdbus-1-dev libglib2.0-dev 90 | python -m pip install --upgrade pip 91 | pip install keyrings.alt dbus-python 92 | 93 | - name: Install dependencies 94 | run: | 95 | python -m pip install --upgrade pip 96 | pip install -e .[dev] 97 | 98 | - name: Run evaluation 99 | run: | 100 | azd env get-values > .env 101 | source .env 102 | python -m evaltools evaluate --config=example_config.json --numquestions=2 --targeturl=${{ env.TARGET_URL }} 103 | env: 104 | TARGET_URL: ${{ secrets.TARGET_URL }} 105 | -------------------------------------------------------------------------------- /.github/workflows/bicep-audit.yml: -------------------------------------------------------------------------------- 1 | name: Validate bicep templates 2 | on: 3 | push: 4 | branches: 5 | - main 6 | paths: 7 | - "**/*.bicep" 8 | pull_request: 9 | branches: 10 | - main 11 | paths: 12 | - "**/*.bicep" 13 | workflow_dispatch: 14 | 15 | jobs: 16 | build: 17 | runs-on: ubuntu-latest 18 | permissions: 19 | security-events: write 20 | steps: 21 | - name: Checkout 22 | uses: actions/checkout@v4 23 | 24 | - name: Run Microsoft Security DevOps Analysis 25 | uses: microsoft/security-devops-action@preview 26 | id: msdo 27 | continue-on-error: true 28 | with: 29 | tools: templateanalyzer 30 | 31 | - name: Upload alerts to Security tab 32 | uses: github/codeql-action/upload-sarif@v3 33 | if: github.repository_owner == 'Azure-Samples' 34 | with: 35 | sarif_file: ${{ steps.msdo.outputs.sarifFile }} 36 | -------------------------------------------------------------------------------- /.github/workflows/python.yaml: -------------------------------------------------------------------------------- 1 | name: Python checks 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | paths-ignore: 7 | - "**.md" 8 | - ".devcontainer/**" 9 | - ".github/**" 10 | - "example_results/**" 11 | - "example_input/**" 12 | pull_request: 13 | branches: [ main ] 14 | paths-ignore: 15 | - "**.md" 16 | - ".devcontainer/**" 17 | - ".github/**" 18 | - "example_results/**" 19 | - "example_input/**" 20 | 
workflow_call: 21 | 22 | jobs: 23 | test_package: 24 | name: Test ${{ matrix.os }} Python ${{ matrix.python_version }} 25 | runs-on: ${{ matrix.os }} 26 | strategy: 27 | fail-fast: false 28 | matrix: 29 | # macos-13 is x84 and macos-14-large is arm64 30 | os: ["ubuntu-latest", "windows-latest", "macos-13", "macos-14-large"] 31 | python_version: ["3.9", "3.10", "3.11", "3.12"] 32 | steps: 33 | - uses: actions/checkout@v4 34 | - name: Setup python 35 | uses: actions/setup-python@v5 36 | with: 37 | python-version: ${{ matrix.python_version }} 38 | architecture: x64 39 | - name: Install dependencies 40 | run: | 41 | python -m pip install --upgrade pip 42 | pip install -e .[dev] 43 | - name: Lint with ruff 44 | run: ruff check . 45 | - name: Check formatting with ruff 46 | run: ruff format . --check 47 | - name: Run Pytest tests 48 | run: python -m pytest 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Azure az webapp deployment details 2 | .azure 3 | *_env 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 102 | __pypackages__/ 103 | 104 | # Celery stuff 105 | celerybeat-schedule 106 | celerybeat.pid 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | 138 | # pytype static type analyzer 139 | .pytype/ 140 | 141 | # Cython debug symbols 142 | cython_debug/ 143 | 144 | # NPM 145 | npm-debug.log* 146 | node_modules 147 | static/ 148 | 149 | # From azure-ai-generative 150 | mlruns/ 151 | 152 | .DS_Store 153 | 154 | # Additional test directories used by maintainer 155 | pamelas_blog_input/ 156 | pamelas_blog_results/ 157 | pamelas_blog_results_dontknows/ 158 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.5.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - repo: https://github.com/astral-sh/ruff-pre-commit 9 | rev: v0.6.2 10 | hooks: 11 | # Run the linter. 12 | - id: ruff 13 | args: [ --fix ] 14 | # Run the formatter. 15 | - id: ruff-format 16 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python Debugger: Current File", 9 | "type": "debugpy", 10 | "request": "launch", 11 | "program": "${file}", 12 | "console": "integratedTerminal", 13 | "justMyCode": false 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.testing.pytestArgs": ["tests"], 3 | "python.testing.unittestEnabled": false, 4 | "python.testing.pytestEnabled": true, 5 | "files.exclude": { 6 | ".coverage": true, 7 | ".pytest_cache": true, 8 | "__pycache__": true, 9 | ".mypy_cache": true 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Evaluating a RAG Chat App 2 | 3 | This repo contains scripts and tools for evaluating a chat app that uses the RAG architecture. 4 | There are many parameters that affect the quality and style of answers generated by the chat app, 5 | such as the system prompt, search parameters, and GPT model parameters. 6 | 7 | Whenever you are making changes to a RAG chat with the goal of improving the answers, you should evaluate the results. 8 | This repository offers tools to make it easier to run evaluations, plus examples of evaluations 9 | that we've run on our [popular RAG chat solution](https://github.com/Azure-Samples/azure-search-openai-demo/). 10 | 11 | [📺 Watch a video overview of this repo](https://www.youtube.com/watch?v=mM8pZAI2C5w) 12 | 13 | Table of contents: 14 | 15 | * [Cost estimation](#cost-estimation) 16 | * [Setting up this project](#setting-up-this-project) 17 | * [Deploying a GPT-4 model](#deploying-a-gpt-4-model) 18 | * [Generating ground truth data](#generating-ground-truth-data) 19 | * [Running an evaluation](#running-an-evaluation) 20 | * [Viewing the results](#viewing-the-results) 21 | * [Measuring app's ability to say "I don't know"](#measuring-apps-ability-to-say-i-dont-know) 22 | 23 | ## Cost estimation 24 | 25 | There are several places where this project can incur costs: 26 | 27 | | Cost | Description | Estimated tokens used | 28 | | --- | --- | --- | 29 | | Generating ground truth data | This is a one-time cost for generating the initial set of questions and answers, and involves pulling data down from your search index and sending it to the GPT model. | 1000 tokens per question generated, which would be 200,000 tokens for the recommended 200 questions. | 30 | | Running evaluations | Each time you run an evaluation, you may choose to use the GPT-based evaluators (groundedness, coherence, etc). For each GPT-evaluator used, you will incur costs for the tokens used by the GPT model. | 1000 tokens per question per evaluator used, which would be 600,000 tokens for the default 200 questions and 3 evaluators. | 31 | 32 | For a full estimate of the costs for your region and model, see the [Azure OpenAI pricing page](https://azure.microsoft.com/pricing/details/cognitive-services/openai-service/) or use the [Azure OpenAI pricing calculator](https://azure.com/e/f0dc5c3acb43437d925209c09c775a6d). 33 | 34 | ## Setting up this project 35 | 36 | If you open this project in a Dev Container or GitHub Codespaces, it will automatically set up the environment for you. 37 | If not, then follow these steps: 38 | 39 | 1. Install Python 3.10 or higher 40 | 2. Create a Python [virtual environment](https://learn.microsoft.com/azure/developer/python/get-started?tabs=cmd#configure-python-virtual-environment). 41 | 3. Inside that virtual environment, install the project: 42 | 43 | ```shell 44 | python -m pip install -e . 
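# Optional: include the dev extras (test and lint tools) as well, matching what the
# dev container and CI workflows in this repo install:
# python -m pip install -e '.[dev]'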
45 | ```
46 |
47 | ## Deploying a GPT-4 model
48 |
49 | It's best to use a GPT-4 model for performing the evaluation, even if your chat app uses GPT-3.5 or another model.
50 | You can either use an Azure OpenAI instance or an openai.com instance.
51 |
52 | ### Using a new Azure OpenAI instance
53 |
54 | To use a new Azure OpenAI instance, you'll need to create a new instance and deploy the app to it.
55 | We've made that easy to deploy with the `azd` CLI tool.
56 |
57 | 1. Install the [Azure Developer CLI](https://aka.ms/azure-dev/install)
58 | 2. Run `azd auth login` to log in to your Azure account
59 | 3. Run `azd up` to deploy a new GPT-4 instance
60 | 4. Create a `.env` file based on `.env.sample`:
61 |
62 | ```shell
63 | cp .env.sample .env
64 | ```
65 |
66 | 5. Run these commands to get the required values for `AZURE_OPENAI_EVAL_DEPLOYMENT` and `AZURE_OPENAI_SERVICE` from your deployed resource group and paste those values into the `.env` file:
67 |
68 | ```shell
69 | azd env get-value AZURE_OPENAI_EVAL_DEPLOYMENT
70 | azd env get-value AZURE_OPENAI_SERVICE
71 | ```
72 |
73 | ### Using an existing Azure OpenAI instance
74 |
75 | If you already have an Azure OpenAI instance, you can use that instead of creating a new one.
76 |
77 | 1. Create `.env` file by copying `.env.sample`
78 | 2. Fill in the values for your instance:
79 |
80 | ```shell
81 | AZURE_OPENAI_EVAL_DEPLOYMENT=""
82 | AZURE_OPENAI_ENDPOINT="https://.openai.azure.com"
83 | ```
84 |
85 | 3. The scripts default to keyless access (via `DefaultAzureCredential`), but you can optionally use a key by setting `AZURE_OPENAI_KEY` in `.env`.
86 |
87 | ### Using an openai.com instance
88 |
89 | If you have an openai.com instance, you can use that instead of an Azure OpenAI instance.
90 |
91 | 1. Create `.env` file by copying `.env.sample`
92 | 2. Change `OPENAI_HOST` to "openai" and fill in the key for your OpenAI account. If you do not have an organization, you can leave that blank.
93 |
94 | ```shell
95 | OPENAI_HOST="openai"
96 | OPENAICOM_KEY=""
97 | OPENAICOM_ORGANIZATION=""
98 | ```
99 |
100 | ## Generating ground truth data
101 |
102 | In order to evaluate new answers, they must be compared to "ground truth" answers: the ideal answer for a particular question. See `example_input/qa.jsonl` for an example of the format; a minimal sketch is also shown after the list below.
103 | We recommend at least 200 QA pairs if possible.
104 |
105 | There are a few ways to get this data:
106 |
107 | 1. Manually curate a set of questions and answers that you consider to be ideal. This is the most accurate, but also the most time-consuming. Make sure your answers include citations in the expected format. This approach requires domain expertise in the data.
108 | 2. Use a generator script to generate a set of questions and answers, and use them directly. This is the fastest, but may also be the least accurate.
109 | 3. Use a generator script to generate a set of questions and answers, and then manually curate them, rewriting any answers that are subpar and adding missing citations. This is a good middle ground, and is what we recommend.
110 |
111 |
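For reference, each line of the ground truth file is a standalone JSON object pairing a question with its ideal answer, using the same `question`/`truth` keys that appear in `example_input/qa_dontknows.jsonl` later in this repo. Here is a minimal sketch of what `qa.jsonl` entries look like; the questions, answers, and cited filenames are made up for illustration:

```jsonl
{"question": "What does the Northwind Health Plus plan cover?", "truth": "Northwind Health Plus covers medical, vision, and dental services. [Benefit_Options.pdf]"}
{"question": "How do employees submit a reimbursement claim?", "truth": "Employees submit claims through the process described in the handbook. [employee_handbook.pdf]"}
```

Note that the truth answers include square-bracket citations, so that metrics like `has_citation` and `citation_match` can be computed against them.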
112 | Additional tips for ground truth data generation
113 |
114 | * Generate more QA pairs than you need, then prune them down manually based on quality and overlap. Remove low-quality answers, and remove questions that are too similar to other questions.
115 | * Be aware of the knowledge distribution in the document set, so you effectively sample questions across the knowledge space.
116 | * Once your chat application is live, continually sample live user questions (in accordance with your privacy policy) to make sure you're representing the sorts of questions that users are asking.
117 |
118 |
119 | 120 | ## Running an evaluation 121 | 122 | We provide a script that loads in the current `azd` environment's variables, installs the requirements for the evaluation, and runs the evaluation against the local app. Run it like this: 123 | 124 | ```shell 125 | python -m evaltools evaluate --config=example_config.json 126 | ``` 127 | 128 | The config.json should contain these fields as a minimum: 129 | 130 | ```json 131 | { 132 | "testdata_path": "example_input/qa.jsonl", 133 | "target_url": "http://localhost:50505/chat", 134 | "requested_metrics": ["groundedness", "relevance", "coherence", "latency", "answer_length"], 135 | "results_dir": "example_results/experiment" 136 | } 137 | ``` 138 | 139 | ### Running against a local container 140 | 141 | If you're running this evaluator in a container and your app is running in a container on the same system, use a URL like this for the `target_url`: 142 | 143 | "target_url": "http://host.docker.internal:50505/chat" 144 | 145 | ### Running against a deployed app 146 | 147 | To run against a deployed endpoint, change the `target_url` to the chat endpoint of the deployed app: 148 | 149 | "target_url": "https://app-backend-j25rgqsibtmlo.azurewebsites.net/chat" 150 | 151 | ### Running on a subset of questions 152 | 153 | It's common to run the evaluation on a subset of the questions, to get a quick sense of how the changes are affecting the answers. To do this, use the `--numquestions` parameter: 154 | 155 | ```shell 156 | python -m evaltools evaluate --config=example_config.json --numquestions=2 157 | ``` 158 | 159 | ### Specifying the evaluate metrics 160 | 161 | The `evaluate` command will use the metrics specified in the `requested_metrics` field of the config JSON. 162 | Some of those metrics are built-in to the evaluation SDK, and the rest are custom metrics that we've added. 163 | 164 | #### Built-in metrics 165 | 166 | These metrics are calculated by sending a call to the GPT model, asking it to provide a 1-5 rating, and storing that rating. 167 | 168 | > [!IMPORTANT] 169 | > The built-in metrics are only intended for use on evaluating English language answers, since they use English-language prompts internally. For non-English languages, you should use the [custom prompt metrics](#prompt-metrics) instead. 170 | 171 | * [`gpt_coherence`](https://learn.microsoft.com/azure/ai-studio/concepts/evaluation-metrics-built-in#ai-assisted-coherence) measures how well the language model can produce output that flows smoothly, reads naturally, and resembles human-like language. 172 | * [`gpt_relevance`](https://learn.microsoft.com/azure/ai-studio/concepts/evaluation-metrics-built-in#ai-assisted-relevance) assesses the ability of answers to capture the key points of the context. 173 | * [`gpt_groundedness`](https://learn.microsoft.com/azure/ai-studio/concepts/evaluation-metrics-built-in#ai-assisted-groundedness) assesses the correspondence between claims in an AI-generated answer and the source context, making sure that these claims are substantiated by the context. 174 | * [`gpt_similarity`](https://learn.microsoft.com/azure/ai-studio/concepts/evaluation-metrics-built-in#ai-assisted-gpt-similarity) measures the similarity between a source data (ground truth) sentence and the generated response by an AI model. 175 | * [`gpt_fluency`](https://learn.microsoft.com/azure/ai-studio/concepts/evaluation-metrics-built-in#ai-assisted-fluency) measures the grammatical proficiency of a generative AI's predicted answer. 
176 | * [`f1_score`](https://learn.microsoft.com/azure/ai-studio/concepts/evaluation-metrics-built-in#traditional-machine-learning-f1-score) measures the ratio of the number of shared words between the model generation and the ground truth answers.
177 |
178 | #### Custom metrics
179 |
180 | ##### Prompt metrics
181 |
182 | The following metrics are implemented very similarly to the built-in metrics, but use a locally stored prompt. They're a great fit if you find that the built-in metrics are not working well for you or if you need to translate the prompt to another language.
183 |
184 | * `mycoherence`: Measures how well the language model can produce output that flows smoothly, reads naturally, and resembles human-like language. Based on `src/evaltools/eval/evaluate_metrics/prompts/mycoherence.prompty`.
185 | * `myrelevance`: Assesses the ability of answers to capture the key points of the context. Based on `src/evaltools/eval/evaluate_metrics/prompts/myrelevance.prompty`.
186 | * `mygroundedness`: Assesses the correspondence between claims in an AI-generated answer and the source context, making sure that these claims are substantiated by the context. Based on `src/evaltools/eval/evaluate_metrics/prompts/mygroundedness.prompty`.
187 |
188 | ##### Code metrics
189 |
190 | These metrics are calculated with some local code based on the results of the chat app, and do not require a call to the GPT model.
191 |
192 | * `latency`: The time it takes for the chat app to generate an answer, in seconds.
193 | * `answer_length`: The length of the generated answer, in characters.
194 | * `has_citation`: Whether the answer contains a correctly formatted citation to a source document, assuming citations are in square brackets.
195 | * `citation_match`: Whether the answer contains at least all of the citations that were in the ground truth answer.
196 |
197 | ### Sending additional parameters to the app
198 |
199 | This repo assumes that your chat app is following the [AI Chat Protocol](https://github.com/microsoft/ai-chat-protocol/tree/main/spec#readme), which means that all POST requests look like this:
200 |
201 | ```json
202 | {"messages": [{"content": "", "role": "user"}],
203 | "context": {...},
204 | }
205 | ```
206 |
207 | Any additional app parameters would be specified in the `context` of that JSON, such as temperature, search settings, prompt overrides, etc. To specify those parameters, add a `target_parameters` key to your config JSON. For example:
208 |
209 | ```json
210 | "target_parameters": {
211 | "overrides": {
212 | "semantic_ranker": false,
213 | "prompt_template": "<READFILE>example_input/prompt_refined.txt"
214 | }
215 | }
216 | ```
217 |
218 | The `overrides` key is the same as the `overrides` key in the `context` of the POST request.
219 | As a convenience, you can use the `<READFILE>` prefix to read in a file and use its contents as the value for the parameter.
220 | That way, you can store potentially long prompts separately from the config JSON file.
221 |
222 | ### Specifying the location of answer and context in response
223 |
224 | The evaluator needs to know where to find the answer and context in the response from the chat app.
225 | If your app returns responses following the recommendations of the [AI Chat Protocol](https://github.com/microsoft/ai-chat-protocol/tree/main/spec#readme), then the answer will be in "message": "content" and the context will be a list of strings in "context": "data_points": "text".
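For example, a response shaped the way the evaluator expects by default would look roughly like this (the answer text and source snippet are made up; only the structure matters):

```json
{
  "message": {
    "content": "Northwind Health Plus covers emergency services. [Benefit_Options.pdf]",
    "role": "assistant"
  },
  "context": {
    "data_points": {
      "text": [
        "Benefit_Options.pdf: Northwind Health Plus offers coverage for emergency services, both in-network and out-of-network."
      ]
    }
  }
}
```

The default JMESPath expressions shown below (`message.content` and `context.data_points.text`) point at exactly these two locations.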
226 |
227 | If your app returns responses in a different format, you can specify the [JMESPath expressions](https://jmespath.org/) to extract the answer and context from the response. For example:
228 |
229 | ```json
230 | "target_response_answer_jmespath": "message.content",
231 | "target_response_context_jmespath": "context.data_points.text"
232 | ```
233 |
234 | ## Viewing the results
235 |
236 | The results of each evaluation are stored in a results folder (defaulting to `example_results`).
237 | Inside each run's folder, you'll find:
238 |
239 | * `eval_results.jsonl`: Each question and answer, along with the GPT metrics for each QA pair.
240 | * `evaluate_parameters.json`: The parameters used for the run, like the overrides.
241 | * `summary.json`: The overall results, like the average GPT metrics.
242 | * `config.json`: The original config used for the run. This is useful for reproducing the run.
243 |
244 | To make it easier to view and compare results across runs, we've built a few tools,
245 | located inside the `src/evaltools/review` folder.
246 |
247 | ### Using the summary tool
248 |
249 | To view a summary across all the runs, use the `summary` command with the path to the results folder:
250 |
251 | ```bash
252 | python -m evaltools summary example_results
253 | ```
254 |
255 | This will display an interactive table with the results for each run, like this:
256 |
257 | ![Screenshot of CLI tool with table of results](docs/screenshot_summary.png)
258 |
259 | To see the parameters used for a particular run, select the folder name.
260 | A modal will appear with the parameters, including any prompt override.
261 |
262 | ### Using the compare tool
263 |
264 | To compare the answers generated for each question across 2 runs, use the `diff` command with 2 paths:
265 |
266 | ```bash
267 | python -m evaltools diff example_results/baseline_1 example_results/baseline_2
268 | ```
269 |
270 | This will display each question, one at a time, with the two generated answers in scrollable panes,
271 | and the GPT metrics below each answer.
272 |
273 | ![Screenshot of CLI tool for comparing a question with 2 answers](docs/screenshot_compare.png)
274 |
275 | Use the buttons at the bottom to navigate to the next question or quit the tool.
276 |
277 | You can also filter to only show questions where the value changed for a particular metric, like this:
278 |
279 | ```bash
280 | python -m evaltools diff example_results/baseline_1 example_results/baseline_2 --changed=has_citation
281 | ```
282 |
283 | ## Measuring app's ability to say "I don't know"
284 |
285 | The evaluation flow described above focused on evaluating a model’s answers for a set of questions that *could* be answered by the data. But what about all those questions that can’t be answered by the data? Does your model know how to say “I don’t know?” The GPT models are trained to try to be helpful, so their tendency is to always give some sort of answer, especially for answers that were in their training data. If you want to ensure your app can say “I don’t know” when it should, you need to evaluate it on a different set of questions with a different metric.
286 |
287 | ### Generating ground truth data for answer-less questions
288 |
289 | For this evaluation, our ground truth data needs to be a set of questions that should provoke an "I don’t know" response, because they cannot be answered from the data.
There are several categories of such questions: 290 | 291 | * **Unknowable**: Questions that are related to the sources but not actually in them (and not public knowledge). 292 | * **Uncitable**: Questions whose answers are well known to the LLM from its training data, but are not in the sources. There are two flavors of these: 293 | * **Related**: Similar topics to sources, so LLM will be particularly tempted to think the sources know. 294 | * **Unrelated**: Completely unrelated to sources, so LLM shouldn’t be as tempted to think the sources know. 295 | * **Nonsensical**: Questions that are non-questions, that a human would scratch their head at and ask for clarification. 296 | 297 | You can write these questions manually, but it’s also possible to generate them using a generator script in this repo, 298 | assuming you already have ground truth data with answerable questions. 299 | 300 | ```shell 301 | python -m evaltools generate-dontknows --input=example_input/qa.jsonl --output=example_input/qa_dontknows.jsonl --numquestions=45 302 | ``` 303 | 304 | That script sends the current questions to the configured GPT-4 model along with prompts to generate questions of each kind. 305 | 306 | When it’s done, you should review and curate the resulting ground truth data. Pay special attention to the "unknowable" questions at the top of the file, since you may decide that some of those are actually knowable, and you may want to reword or rewrite entirely. 307 | 308 | ### Running an evaluation for answer-less questions 309 | 310 | This repo contains a custom GPT metric called "dontknowness" that rates answers from 1-5, where 1 is "answered the question completely with no certainty" and 5 is "said it didn't know and attempted no answer". The goal is for all answers to be rated 4 or 5. 311 | 312 | Here's an example configuration JSON that requests that metric, referencing the new ground truth data and a new output folder: 313 | 314 | ```json 315 | { 316 | "testdata_path": "example_input/qa_dontknows.jsonl", 317 | "results_dir": "example_results_dontknows/baseline", 318 | "requested_metrics": ["dontknowness", "answer_length", "latency", "has_citation"], 319 | "target_url": "http://localhost:50505/chat", 320 | "target_parameters": { 321 | }, 322 | "target_response_answer_jmespath": "message.content", 323 | "target_response_context_jmespath": "context.data_points.text" 324 | } 325 | ``` 326 | 327 | We recommend a separate output folder, as you'll likely want to make multiple runs and easily compare between those runs using the [review tools](#viewing-the-results). 328 | 329 | Run the evaluation like this: 330 | 331 | ```shell 332 | python -m evaltools evaluate --config=dontknows.config.json 333 | ``` 334 | 335 | The results will be stored in the `results_dir` folder, and can be reviewed using the [review tools](#viewing-the-results). 336 | 337 | ### Improving the app's ability to say "I don't know" 338 | 339 | If the app is not saying "I don't know" enough, you can use the `diff` tool to compare the answers for the "dontknows" questions across runs, and see if the answers are improving. Changes you can try: 340 | 341 | * Adjust the prompt to encourage the model to say "I don't know" more often. Remove anything in the prompt that might be distracting or overly encouraging it to answer. 342 | * Try using GPT-4 instead of GPT-3.5. The results will be slower (see the latency column) but it may be more likely to say "I don't know" when it should. 343 | * Adjust the temperature of the model used by your app. 
344 | * Add an additional LLM step in your app after generating the answer, to have the LLM rate its own confidence that the answer is found in the sources. If the confidence is low, the app should say "I don't know". 345 | -------------------------------------------------------------------------------- /azure.yaml: -------------------------------------------------------------------------------- 1 | # yaml-language-server: $schema=https://raw.githubusercontent.com/Azure/azure-dev/main/schemas/v1.0/azure.yaml.json 2 | 3 | name: ai-rag-chat-evaluator 4 | metadata: 5 | template: ai-rag-chat-evaluator@0.0.2 6 | pipeline: 7 | variables: 8 | - OPENAI_HOST 9 | - AZURE_OPENAI_SERVICE 10 | - AZURE_OPENAI_RESOURCE_GROUP 11 | - EVAL_GPT_DEPLOYMENT_CAPACITY 12 | - OPENAI_ORGANIZATION 13 | secrets: 14 | - OPENAI_API_KEY 15 | -------------------------------------------------------------------------------- /docs/screenshot_compare.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/ai-rag-chat-evaluator/655d0c40e2ee92a07064cd5b7ae65d14d2b7c679/docs/screenshot_compare.png -------------------------------------------------------------------------------- /docs/screenshot_summary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/ai-rag-chat-evaluator/655d0c40e2ee92a07064cd5b7ae65d14d2b7c679/docs/screenshot_summary.png -------------------------------------------------------------------------------- /dontknows.config.json: -------------------------------------------------------------------------------- 1 | { 2 | "testdata_path": "example_input/qa_dontknows.jsonl", 3 | "results_dir": "example_results_dontknows/baseline", 4 | "requested_metrics": ["dontknowness", "answer_length", "latency", "has_citation"], 5 | "target_url": "http://localhost:50505/chat", 6 | "target_parameters": { 7 | "overrides": { 8 | "top": 3, 9 | "temperature": 0.3, 10 | "minimum_reranker_score": 0, 11 | "minimum_search_score": 0, 12 | "retrieval_mode": "hybrid", 13 | "semantic_ranker": true, 14 | "semantic_captions": false, 15 | "suggest_followup_questions": false, 16 | "use_oid_security_filter": false, 17 | "use_groups_security_filter": false, 18 | "vector_fields": [ 19 | "embedding" 20 | ], 21 | "use_gpt4v": false, 22 | "gpt4v_input": "textAndImages" 23 | } 24 | }, 25 | "target_response_answer_jmespath": "message.content", 26 | "target_response_context_jmespath": "context.data_points.text" 27 | } 28 | -------------------------------------------------------------------------------- /example_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "testdata_path": "example_input/qa.jsonl", 3 | "results_dir": "example_results/experiment", 4 | "requested_metrics": ["gpt_groundedness", "gpt_relevance", "gpt_coherence", "answer_length", "latency"], 5 | "target_url": "http://localhost:50505/chat", 6 | "target_parameters": { 7 | "overrides": { 8 | "top": 3, 9 | "temperature": 0.3, 10 | "minimum_reranker_score": 0, 11 | "minimum_search_score": 0, 12 | "retrieval_mode": "hybrid", 13 | "semantic_ranker": true, 14 | "semantic_captions": false, 15 | "suggest_followup_questions": false, 16 | "use_oid_security_filter": false, 17 | "use_groups_security_filter": false, 18 | "vector_fields": [ 19 | "embedding" 20 | ], 21 | "use_gpt4v": false, 22 | "gpt4v_input": "textAndImages", 23 | "seed": 1 24 | } 25 | }, 26 | "target_response_answer_jmespath": 
"message.content", 27 | "target_response_context_jmespath": "context.data_points.text" 28 | } 29 | -------------------------------------------------------------------------------- /example_input/prompt_ignoresources.txt: -------------------------------------------------------------------------------- 1 | Your job is to answer questions to the best of your ability. You will be given sources but you should IGNORE them. Be creative! 2 | -------------------------------------------------------------------------------- /example_input/prompt_nomarkdownmention.txt: -------------------------------------------------------------------------------- 1 | Assistant helps the company employees with their healthcare plan questions, and questions about the employee handbook. Be brief in your answers. 2 | Answer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. If asking a clarifying question to the user would help, ask the question. 3 | If the question is not in English, answer in the language used in the question. 4 | Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, for example [info1.txt]. Don't combine sources, list each source separately, for example [info1.txt][info2.pdf]. 5 | -------------------------------------------------------------------------------- /example_input/prompt_piglatin.txt: -------------------------------------------------------------------------------- 1 | Your job is to translate the user's question into Pig Latin. Ignore any sources provided and just translate the question. DO NOT answer the question. 2 | -------------------------------------------------------------------------------- /example_input/prompt_refined.txt: -------------------------------------------------------------------------------- 1 | You are an experienced HR generalist that delights in their role of helping employees with their about their healthcare plan and the employee handbook. 2 | 3 | Give an answer using ONLY with the facts listed in the list of sources below indicated by “Sources:”. 4 | 5 | If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. If asking a clarifying question to the user would help, ask the question. 6 | 7 | Use clear and concise language and write in a confident yet friendly tone. In your answers ensure the employee understands how your response connects to the information in the sources and include all citations necessary to help the employee validate the answer provided. 8 | 9 | For tabular information return it as an html table. Do not return markdown format. If the question is not in English, answer in the language used in the question. 10 | 11 | Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, e.g. [info1.txt]. Don't combine sources, list each source separately, e.g. [info1.txt][info2.pdf]. 12 | -------------------------------------------------------------------------------- /example_input/prompt_refined_trimmed.txt: -------------------------------------------------------------------------------- 1 | You are a helpful assistant. 
2 | 3 | Give an answer using ONLY with the facts listed in the list of sources below indicated by “Sources:”. 4 | 5 | If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. 6 | 7 | In your answers ensure the user understands how your response connects to the information in the sources and include all citations necessary to help the user validate the answer provided. 8 | 9 | Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, e.g. [info1.txt]. Don't combine sources, list each source separately, e.g. [info1.txt][info2.pdf]. 10 | 11 | For tabular information return it as an html table. Do not return markdown format. If the question is not in English, answer in the language used in the question. 12 | -------------------------------------------------------------------------------- /example_input/prompt_ungrounded.txt: -------------------------------------------------------------------------------- 1 | Your job is to answer questions to the best of your ability. You will be given sources that you may use for ideas but you can answer using everything you know about the world. Be creative! 2 | -------------------------------------------------------------------------------- /example_input/prompt_weak.txt: -------------------------------------------------------------------------------- 1 | You are a helpful assistant. 2 | -------------------------------------------------------------------------------- /example_input/qa_dontknows.jsonl: -------------------------------------------------------------------------------- 1 | {"question": "What are the key benefits of the Northwind Platinum plan?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are very related but are not directly answerable by the same sources. Do not simply ask for other examples of the same thing - your question should be standalone."} 2 | {"question": "How does the Northwind Health Plus work in California?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are very related but are not directly answerable by the same sources. Do not simply ask for other examples of the same thing - your question should be standalone."} 3 | {"question": "Should employees stay home when sick?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are very related but are not directly answerable by the same sources. Do not simply ask for other examples of the same thing - your question should be standalone."} 4 | {"question": "Can standing desks be reimbursed?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are very related but are not directly answerable by the same sources. Do not simply ask for other examples of the same thing - your question should be standalone."} 5 | {"question": "Can deductibles be paid by check?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are very related but are not directly answerable by the same sources. Do not simply ask for other examples of the same thing - your question should be standalone."} 6 | {"question": "How does the Northwind Standard plan handle postnatal yoga?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are very related but are not directly answerable by the same sources. 
Do not simply ask for other examples of the same thing - your question should be standalone."} 7 | {"question": "Are domestic partners eligible for the Northwind Health Plus plan?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are very related but are not directly answerable by the same sources. Do not simply ask for other examples of the same thing - your question should be standalone."} 8 | {"question": "Are teenage children covered under the insurance?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are very related but are not directly answerable by the same sources. Do not simply ask for other examples of the same thing - your question should be standalone."} 9 | {"question": "What is the process for submitting a claim requiring evidence of medical necessity to Northwind Health?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are very related but are not directly answerable by the same sources. Do not simply ask for other examples of the same thing - your question should be standalone."} 10 | {"question": "What is the role of the Manager of Technical Product Marketing at Contoso Electronics?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are very related but are not directly answerable by the same sources. Do not simply ask for other examples of the same thing - your question should be standalone."} 11 | {"question": "What is the process for obtaining prior authorization for services under the Northwind Health Plus plan?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are very related but are not directly answerable by the same sources. Do not simply ask for other examples of the same thing - your question should be standalone."} 12 | {"question": "What is the policy for working from home?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are very related but are not directly answerable by the same sources. 
Do not simply ask for other examples of the same thing - your question should be standalone."} 13 | {"question": "How can employees refer new hires?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions with similar keywords that are about publicly known facts."} 14 | {"question": "What are the benefits of choosing an in-network provider with Northwind Standard?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions with similar keywords that are about publicly known facts."} 15 | {"question": "How does Northwind Health Plus handle Coordination of Benefits (COB)?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions with similar keywords that are about publicly known facts."} 16 | {"question": "What is the role of the clinical review process in the Northwind Health Plus plan?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions with similar keywords that are about publicly known facts."} 17 | {"question": "What are the requirements for data security at Contoso Electronics?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions with similar keywords that are about publicly known facts."} 18 | {"question": "What is the role of Contoso Electronics in the aerospace industry?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions with similar keywords that are about publicly known facts."} 19 | {"question": "Is there a plan with no deductible?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions with similar keywords that are about publicly known facts."} 20 | {"question": "What are the cost implications of choosing an out-of-network provider under Northwind Health Plus?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions with similar keywords that are about publicly known facts."} 21 | {"question": "What is the policy for coverage of rehabilitation services under Northwind Health Plus?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions with similar keywords that are about publicly known facts."} 22 | {"question": "What is the role of Northwind Standard in relation to in-network and non-participating providers?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions with similar keywords that are about publicly known facts."} 23 | {"question": "What is the coverage for mental health and substance abuse treatments under the Northwind Health Plus plan?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions with similar keywords that are about publicly known facts."} 24 | {"question": "What is the process for obtaining prior authorization for a medical service or treatment under Northwind Health Plus?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions with similar keywords that are about publicly known facts."} 25 | {"question": "What is the capital of France?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are not related to these topics at all but have well known answers."} 26 | {"question": "Who wrote the novel \"Pride and Prejudice\"?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are not related to these topics at all but have well known answers."} 27 | {"question": "What is the chemical symbol for gold?", "truth": "Generated from this prompt: Given 
these questions, suggest 12 questions that are not related to these topics at all but have well known answers."} 28 | {"question": "What is the tallest mountain in the world?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are not related to these topics at all but have well known answers."} 29 | {"question": "Who painted the Mona Lisa?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are not related to these topics at all but have well known answers."} 30 | {"question": "What is the largest planet in our solar system?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are not related to these topics at all but have well known answers."} 31 | {"question": "What is the square root of 81?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are not related to these topics at all but have well known answers."} 32 | {"question": "Who was the first person to walk on the moon?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are not related to these topics at all but have well known answers."} 33 | {"question": "What is the freezing point of water in Fahrenheit?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are not related to these topics at all but have well known answers."} 34 | {"question": "What is the main ingredient in hummus?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are not related to these topics at all but have well known answers."} 35 | {"question": "Who won the FIFA World Cup in 2018?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are not related to these topics at all but have well known answers."} 36 | {"question": "What is the currency of Japan?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are not related to these topics at all but have well known answers."} 37 | {"question": "How many miles does a rainbow weigh?", "truth": "Generated from this prompt: Suggest 9 questions that are nonsensical, and would result in confusion if you asked it."} 38 | {"question": "Can you smell the color of the wind?", "truth": "Generated from this prompt: Suggest 9 questions that are nonsensical, and would result in confusion if you asked it."} 39 | {"question": "What time does the color blue go to sleep?", "truth": "Generated from this prompt: Suggest 9 questions that are nonsensical, and would result in confusion if you asked it."} 40 | {"question": "How many square feet are in a thought?", "truth": "Generated from this prompt: Suggest 9 questions that are nonsensical, and would result in confusion if you asked it."} 41 | {"question": "Can you taste the sound of a sunrise?", "truth": "Generated from this prompt: Suggest 9 questions that are nonsensical, and would result in confusion if you asked it."} 42 | {"question": "How loud is the smell of a rose?", "truth": "Generated from this prompt: Suggest 9 questions that are nonsensical, and would result in confusion if you asked it."} 43 | {"question": "What's the temperature of a dream?", "truth": "Generated from this prompt: Suggest 9 questions that are nonsensical, and would result in confusion if you asked it."} 44 | {"question": "How many inches tall is a whisper?", "truth": "Generated from this prompt: Suggest 9 questions that are nonsensical, and would result in confusion if you 
asked it."} 45 | {"question": "What flavor is the number seven?", "truth": "Generated from this prompt: Suggest 9 questions that are nonsensical, and would result in confusion if you asked it."} 46 | -------------------------------------------------------------------------------- /example_results/baseline/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "testdata_path": "example_input/qa.jsonl", 3 | "results_dir": "example_results/baseline", 4 | "requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency"], 5 | "target_url": "http://localhost:50505/chat", 6 | "target_parameters": { 7 | "overrides": { 8 | "top": 3, 9 | "temperature": 0.3, 10 | "minimum_reranker_score": 0, 11 | "minimum_search_score": 0, 12 | "retrieval_mode": "hybrid", 13 | "semantic_ranker": true, 14 | "semantic_captions": false, 15 | "suggest_followup_questions": false, 16 | "use_oid_security_filter": false, 17 | "use_groups_security_filter": false, 18 | "vector_fields": [ 19 | "embedding" 20 | ], 21 | "use_gpt4v": false, 22 | "gpt4v_input": "textAndImages" 23 | } 24 | }, 25 | "target_response_answer_jmespath": "message.content", 26 | "target_response_context_jmespath": "context.data_points.text" 27 | } 28 | -------------------------------------------------------------------------------- /example_results/baseline/evaluate_parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "evaluation_gpt_model": "gpt-4", 3 | "evaluation_timestamp": 1724265437, 4 | "testdata_path": "/Users/pamelafox/ai-rag-chat-evaluator/example_input/qa.jsonl", 5 | "target_url": "http://localhost:50505/chat", 6 | "target_parameters": { 7 | "overrides": { 8 | "top": 3, 9 | "temperature": 0.3, 10 | "minimum_reranker_score": 0, 11 | "minimum_search_score": 0, 12 | "retrieval_mode": "hybrid", 13 | "semantic_ranker": true, 14 | "semantic_captions": false, 15 | "suggest_followup_questions": false, 16 | "use_oid_security_filter": false, 17 | "use_groups_security_filter": false, 18 | "vector_fields": [ 19 | "embedding" 20 | ], 21 | "use_gpt4v": false, 22 | "gpt4v_input": "textAndImages" 23 | } 24 | }, 25 | "num_questions": null 26 | } 27 | -------------------------------------------------------------------------------- /example_results/baseline/summary.json: -------------------------------------------------------------------------------- 1 | { 2 | "gpt_groundedness": { 3 | "pass_count": 193, 4 | "pass_rate": 0.96, 5 | "mean_rating": 4.87 6 | }, 7 | "gpt_relevance": { 8 | "pass_count": 197, 9 | "pass_rate": 0.98, 10 | "mean_rating": 4.92 11 | }, 12 | "answer_length": { 13 | "mean": 613.11, 14 | "max": 2210, 15 | "min": 52 16 | }, 17 | "latency": { 18 | "mean": 2.35, 19 | "max": 5.531913, 20 | "min": 1.239641 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /example_results/baseline2/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "testdata_path": "example_input/qa.jsonl", 3 | "results_dir": "example_results/baseline2", 4 | "requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "has_citation", "citation_match"], 5 | "target_url": "http://localhost:50505/chat", 6 | "target_parameters": { 7 | "overrides": { 8 | "top": 3, 9 | "temperature": 0.3, 10 | "minimum_reranker_score": 0, 11 | "minimum_search_score": 0, 12 | "retrieval_mode": "hybrid", 13 | "semantic_ranker": true, 14 | "semantic_captions": 
false, 15 | "suggest_followup_questions": false, 16 | "use_oid_security_filter": false, 17 | "use_groups_security_filter": false, 18 | "vector_fields": [ 19 | "embedding" 20 | ], 21 | "use_gpt4v": false, 22 | "gpt4v_input": "textAndImages" 23 | } 24 | }, 25 | "target_response_answer_jmespath": "message.content", 26 | "target_response_context_jmespath": "context.data_points.text" 27 | } 28 | -------------------------------------------------------------------------------- /example_results/baseline2/evaluate_parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "evaluation_gpt_model": "gpt-4", 3 | "evaluation_timestamp": 1724281405, 4 | "testdata_path": "/Users/pamelafox/ai-rag-chat-evaluator/example_input/qa.jsonl", 5 | "target_url": "http://localhost:50505/chat", 6 | "target_parameters": { 7 | "overrides": { 8 | "top": 3, 9 | "temperature": 0.3, 10 | "minimum_reranker_score": 0, 11 | "minimum_search_score": 0, 12 | "retrieval_mode": "hybrid", 13 | "semantic_ranker": true, 14 | "semantic_captions": false, 15 | "suggest_followup_questions": false, 16 | "use_oid_security_filter": false, 17 | "use_groups_security_filter": false, 18 | "vector_fields": [ 19 | "embedding" 20 | ], 21 | "use_gpt4v": false, 22 | "gpt4v_input": "textAndImages" 23 | } 24 | }, 25 | "num_questions": null 26 | } 27 | -------------------------------------------------------------------------------- /example_results/baseline2/summary.json: -------------------------------------------------------------------------------- 1 | { 2 | "gpt_groundedness": { 3 | "pass_count": 195, 4 | "pass_rate": 0.97, 5 | "mean_rating": 4.91 6 | }, 7 | "gpt_relevance": { 8 | "pass_count": 199, 9 | "pass_rate": 0.99, 10 | "mean_rating": 4.94 11 | }, 12 | "answer_length": { 13 | "mean": 614.39, 14 | "max": 2423, 15 | "min": 87 16 | }, 17 | "latency": { 18 | "mean": 2.22, 19 | "max": 7.607282, 20 | "min": 1.116874 21 | }, 22 | "has_citation": { 23 | "total": 199, 24 | "rate": 1.0 25 | }, 26 | "citation_match": { 27 | "total": 0, 28 | "rate": 0.0 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /example_results/prompt_nomarkdownmention/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "testdata_path": "example_input/qa.jsonl", 3 | "results_dir": "example_results/prompt_nomarkdownmention", 4 | "requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency"], 5 | "target_url": "http://localhost:50505/chat", 6 | "target_parameters": { 7 | "overrides": { 8 | "top": 3, 9 | "temperature": 0.3, 10 | "minimum_reranker_score": 0, 11 | "minimum_search_score": 0, 12 | "retrieval_mode": "hybrid", 13 | "semantic_ranker": true, 14 | "semantic_captions": false, 15 | "suggest_followup_questions": false, 16 | "use_oid_security_filter": false, 17 | "use_groups_security_filter": false, 18 | "vector_fields": [ 19 | "embedding" 20 | ], 21 | "use_gpt4v": false, 22 | "gpt4v_input": "textAndImages", 23 | "prompt_template": "example_input/prompt_nomarkdownmention.txt" 24 | } 25 | }, 26 | "target_response_answer_jmespath": "message.content", 27 | "target_response_context_jmespath": "context.data_points.text" 28 | } 29 | -------------------------------------------------------------------------------- /example_results/prompt_nomarkdownmention/evaluate_parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "evaluation_gpt_model": "gpt-4", 3 | 
"evaluation_timestamp": 1724274502, 4 | "testdata_path": "/Users/pamelafox/ai-rag-chat-evaluator/example_input/qa.jsonl", 5 | "target_url": "http://localhost:50505/chat", 6 | "target_parameters": { 7 | "overrides": { 8 | "top": 3, 9 | "temperature": 0.3, 10 | "minimum_reranker_score": 0, 11 | "minimum_search_score": 0, 12 | "retrieval_mode": "hybrid", 13 | "semantic_ranker": true, 14 | "semantic_captions": false, 15 | "suggest_followup_questions": false, 16 | "use_oid_security_filter": false, 17 | "use_groups_security_filter": false, 18 | "vector_fields": [ 19 | "embedding" 20 | ], 21 | "use_gpt4v": false, 22 | "gpt4v_input": "textAndImages", 23 | "prompt_template": "Assistant helps the company employees with their healthcare plan questions, and questions about the employee handbook. Be brief in your answers.\nAnswer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. If asking a clarifying question to the user would help, ask the question.\nIf the question is not in English, answer in the language used in the question.\nEach source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, for example [info1.txt]. Don't combine sources, list each source separately, for example [info1.txt][info2.pdf]." 24 | } 25 | }, 26 | "num_questions": null 27 | } 28 | -------------------------------------------------------------------------------- /example_results/prompt_nomarkdownmention/summary.json: -------------------------------------------------------------------------------- 1 | { 2 | "gpt_groundedness": { 3 | "pass_count": 194, 4 | "pass_rate": 0.97, 5 | "mean_rating": 4.9 6 | }, 7 | "gpt_relevance": { 8 | "pass_count": 196, 9 | "pass_rate": 0.98, 10 | "mean_rating": 4.9 11 | }, 12 | "answer_length": { 13 | "mean": 620.0, 14 | "max": 2155, 15 | "min": 55 16 | }, 17 | "latency": { 18 | "mean": 2.43, 19 | "max": 6.910127, 20 | "min": 1.188182 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /example_results/prompt_nomarkdownmention2/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "testdata_path": "example_input/qa.jsonl", 3 | "results_dir": "example_results/prompt_nomarkdownmention2", 4 | "requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "has_citation", "citation_match"], 5 | "target_url": "http://localhost:50505/chat", 6 | "target_parameters": { 7 | "overrides": { 8 | "top": 3, 9 | "temperature": 0.3, 10 | "minimum_reranker_score": 0, 11 | "minimum_search_score": 0, 12 | "retrieval_mode": "hybrid", 13 | "semantic_ranker": true, 14 | "semantic_captions": false, 15 | "suggest_followup_questions": false, 16 | "use_oid_security_filter": false, 17 | "use_groups_security_filter": false, 18 | "vector_fields": [ 19 | "embedding" 20 | ], 21 | "use_gpt4v": false, 22 | "gpt4v_input": "textAndImages", 23 | "prompt_template": "example_input/prompt_nomarkdownmention.txt" 24 | } 25 | }, 26 | "target_response_answer_jmespath": "message.content", 27 | "target_response_context_jmespath": "context.data_points.text" 28 | } 29 | -------------------------------------------------------------------------------- /example_results/prompt_nomarkdownmention2/evaluate_parameters.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "evaluation_gpt_model": "gpt-4", 3 | "evaluation_timestamp": 1724277997, 4 | "testdata_path": "/Users/pamelafox/ai-rag-chat-evaluator/example_input/qa.jsonl", 5 | "target_url": "http://localhost:50505/chat", 6 | "target_parameters": { 7 | "overrides": { 8 | "top": 3, 9 | "temperature": 0.3, 10 | "minimum_reranker_score": 0, 11 | "minimum_search_score": 0, 12 | "retrieval_mode": "hybrid", 13 | "semantic_ranker": true, 14 | "semantic_captions": false, 15 | "suggest_followup_questions": false, 16 | "use_oid_security_filter": false, 17 | "use_groups_security_filter": false, 18 | "vector_fields": [ 19 | "embedding" 20 | ], 21 | "use_gpt4v": false, 22 | "gpt4v_input": "textAndImages", 23 | "prompt_template": "Assistant helps the company employees with their healthcare plan questions, and questions about the employee handbook. Be brief in your answers.\nAnswer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. If asking a clarifying question to the user would help, ask the question.\nIf the question is not in English, answer in the language used in the question.\nEach source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, for example [info1.txt]. Don't combine sources, list each source separately, for example [info1.txt][info2.pdf]." 24 | } 25 | }, 26 | "num_questions": null 27 | } 28 | -------------------------------------------------------------------------------- /example_results/prompt_nomarkdownmention2/summary.json: -------------------------------------------------------------------------------- 1 | { 2 | "gpt_groundedness": { 3 | "pass_count": 195, 4 | "pass_rate": 0.97, 5 | "mean_rating": 4.91 6 | }, 7 | "gpt_relevance": { 8 | "pass_count": 197, 9 | "pass_rate": 0.98, 10 | "mean_rating": 4.92 11 | }, 12 | "answer_length": { 13 | "mean": 621.93, 14 | "max": 2177, 15 | "min": 79 16 | }, 17 | "latency": { 18 | "mean": 2.29, 19 | "max": 5.716042, 20 | "min": 1.087221 21 | }, 22 | "has_citation": { 23 | "total": 199, 24 | "rate": 1.0 25 | }, 26 | "citation_match": { 27 | "total": 0, 28 | "rate": 0.0 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /infra/core/ai/cognitiveservices.bicep: -------------------------------------------------------------------------------- 1 | metadata description = 'Creates an Azure Cognitive Services instance.' 2 | param name string 3 | param location string = resourceGroup().location 4 | param tags object = {} 5 | @description('The custom subdomain name used to access the API. Defaults to the value of the name parameter.') 6 | param customSubDomainName string = name 7 | param disableLocalAuth bool = false 8 | param deployments array = [] 9 | param kind string = 'OpenAI' 10 | 11 | @allowed([ 'Enabled', 'Disabled' ]) 12 | param publicNetworkAccess string = 'Enabled' 13 | param sku object = { 14 | name: 'S0' 15 | } 16 | 17 | param allowedIpRules array = [] 18 | param networkAcls object = empty(allowedIpRules) ? 
{ 19 | defaultAction: 'Allow' 20 | } : { 21 | ipRules: allowedIpRules 22 | defaultAction: 'Deny' 23 | } 24 | 25 | resource account 'Microsoft.CognitiveServices/accounts@2023-05-01' = { 26 | name: name 27 | location: location 28 | tags: tags 29 | kind: kind 30 | properties: { 31 | customSubDomainName: customSubDomainName 32 | publicNetworkAccess: publicNetworkAccess 33 | networkAcls: networkAcls 34 | disableLocalAuth: disableLocalAuth 35 | } 36 | sku: sku 37 | } 38 | 39 | @batchSize(1) 40 | resource deployment 'Microsoft.CognitiveServices/accounts/deployments@2023-05-01' = [for deployment in deployments: { 41 | parent: account 42 | name: deployment.name 43 | properties: { 44 | model: deployment.model 45 | raiPolicyName: contains(deployment, 'raiPolicyName') ? deployment.raiPolicyName : null 46 | } 47 | sku: contains(deployment, 'sku') ? deployment.sku : { 48 | name: 'Standard' 49 | capacity: 20 50 | } 51 | }] 52 | 53 | output endpoint string = account.properties.endpoint 54 | output id string = account.id 55 | output name string = account.name 56 | -------------------------------------------------------------------------------- /infra/core/security/role.bicep: -------------------------------------------------------------------------------- 1 | metadata description = 'Creates a role assignment for a service principal.' 2 | param principalId string 3 | 4 | @allowed([ 5 | 'Device' 6 | 'ForeignGroup' 7 | 'Group' 8 | 'ServicePrincipal' 9 | 'User' 10 | ]) 11 | param principalType string = 'ServicePrincipal' 12 | param roleDefinitionId string 13 | 14 | resource role 'Microsoft.Authorization/roleAssignments@2022-04-01' = { 15 | name: guid(subscription().id, resourceGroup().id, principalId, roleDefinitionId) 16 | properties: { 17 | principalId: principalId 18 | principalType: principalType 19 | roleDefinitionId: resourceId('Microsoft.Authorization/roleDefinitions', roleDefinitionId) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /infra/main.bicep: -------------------------------------------------------------------------------- 1 | targetScope = 'subscription' 2 | 3 | @minLength(1) 4 | @maxLength(64) 5 | @description('Name of the environment which is used to generate a short unique hash used in all resources.') 6 | param environmentName string 7 | 8 | @minLength(1) 9 | @description('Location for the OpenAI resource') 10 | @allowed(['australiaeast', 'canadaeast', 'francecentral', 'swedencentral', 'switzerlandnorth']) 11 | @metadata({ 12 | azd: { 13 | type: 'location' 14 | } 15 | }) 16 | param location string 17 | 18 | @allowed(['azure', 'openai']) 19 | param openAiHost string // Set in main.parameters.json 20 | @description('Name of the OpenAI resource group.
If not specified, the resource group name will be generated.') 21 | param openAiResourceGroupName string = '' 22 | 23 | param openAiServiceName string = '' 24 | 25 | param openAiSkuName string = 'S0' 26 | 27 | param openAiApiKey string = '' 28 | param openAiApiOrganization string = '' 29 | 30 | param evalGptDeploymentName string = 'eval' 31 | param evalGptModelName string = 'gpt-4' 32 | param evalGptModelVersion string = '0613' 33 | param evalGptDeploymentCapacity int = 30 34 | 35 | @description('Id of the user or app to assign application roles') 36 | param principalId string = '' 37 | 38 | @description('Whether the deployment is running on GitHub Actions') 39 | param runningOnGh string = '' 40 | 41 | var resourceToken = toLower(uniqueString(subscription().id, environmentName, location)) 42 | var prefix = '${environmentName}${resourceToken}' 43 | var tags = { 'azd-env-name': environmentName } 44 | 45 | // Organize resources in a resource group 46 | resource resourceGroup 'Microsoft.Resources/resourceGroups@2021-04-01' = if (empty(openAiResourceGroupName)) { 47 | name: '${prefix}-rg' 48 | location: location 49 | tags: tags 50 | } 51 | 52 | resource openAiResourceGroup 'Microsoft.Resources/resourceGroups@2021-04-01' existing = if (!empty(openAiResourceGroupName)) { 53 | name: !empty(openAiResourceGroupName) ? openAiResourceGroupName : resourceGroup.name 54 | } 55 | 56 | module openAi 'core/ai/cognitiveservices.bicep' = if (openAiHost == 'azure') { 57 | name: 'openai' 58 | scope: openAiResourceGroup 59 | params: { 60 | name: !empty(openAiServiceName) ? openAiServiceName : '${prefix}-openai' 61 | location: location 62 | tags: tags 63 | sku: { 64 | name: openAiSkuName 65 | } 66 | deployments: [{ 67 | name: evalGptDeploymentName 68 | model: { 69 | format: 'OpenAI' 70 | name: evalGptModelName 71 | version: evalGptModelVersion 72 | } 73 | sku: { 74 | name: 'Standard' 75 | capacity: evalGptDeploymentCapacity 76 | } 77 | }] 78 | disableLocalAuth: true 79 | } 80 | } 81 | 82 | 83 | // USER ROLES 84 | var principalType = empty(runningOnGh) ? 'User' : 'ServicePrincipal' 85 | 86 | module openAiRoleUser 'core/security/role.bicep' = if (openAiHost == 'azure') { 87 | scope: openAiResourceGroup 88 | name: 'openai-role-user' 89 | params: { 90 | principalId: principalId 91 | roleDefinitionId: '5e0bd9bd-7b93-4f28-af87-19fc36ad61bd' 92 | principalType: principalType 93 | } 94 | } 95 | 96 | 97 | output AZURE_LOCATION string = location 98 | output AZURE_TENANT_ID string = tenant().tenantId 99 | output AZURE_RESOURCE_GROUP string = resourceGroup.name 100 | 101 | // Shared by all OpenAI deployments 102 | output OPENAI_HOST string = openAiHost 103 | output OPENAI_GPT_MODEL string = evalGptModelName 104 | // Specific to Azure OpenAI 105 | output AZURE_OPENAI_SERVICE string = (openAiHost == 'azure') ? openAi.outputs.name : '' 106 | output AZURE_OPENAI_RESOURCE_GROUP string = (openAiHost == 'azure') ? openAiResourceGroup.name : '' 107 | output AZURE_OPENAI_EVAL_DEPLOYMENT string = (openAiHost == 'azure') ? evalGptDeploymentName : '' 108 | output AZURE_OPENAI_ENDPOINT string = (openAiHost == 'azure') ? openAi.outputs.endpoint : '' 109 | // Used only with non-Azure OpenAI deployments 110 | output OPENAI_KEY string = (openAiHost == 'openai') ? openAiApiKey : '' 111 | output OPENAI_ORGANIZATION string = (openAiHost == 'openai') ? 
openAiApiOrganization : '' 112 | -------------------------------------------------------------------------------- /infra/main.parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "environmentName": { 6 | "value": "${AZURE_ENV_NAME}" 7 | }, 8 | "location": { 9 | "value": "${AZURE_LOCATION}" 10 | }, 11 | "principalId": { 12 | "value": "${AZURE_PRINCIPAL_ID}" 13 | }, 14 | "openAiHost":{ 15 | "value": "${OPENAI_HOST=azure}" 16 | }, 17 | "openAiServiceName": { 18 | "value": "${AZURE_OPENAI_SERVICE}" 19 | }, 20 | "openAiResourceGroupName": { 21 | "value": "${AZURE_OPENAI_RESOURCE_GROUP}" 22 | }, 23 | "evalGptDeploymentCapacity": { 24 | "value": "${EVAL_GPT_DEPLOYMENT_CAPACITY}" 25 | }, 26 | "openAiSkuName": { 27 | "value": "S0" 28 | }, 29 | "openAiApiKey": { 30 | "value": "${OPENAI_KEY}" 31 | }, 32 | "openAiApiOrganization": { 33 | "value": "${OPENAI_ORGANIZATION}" 34 | }, 35 | "runningOnGh": { 36 | "value": "${GITHUB_ACTIONS}" 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | line-length = 120 3 | target-version = "py39" 4 | lint.isort.known-first-party = ["evaltools"] 5 | lint.select = ["E", "F", "I", "UP"] 6 | lint.ignore = ["D203"] 7 | 8 | [tool.black] 9 | line-length = 120 10 | target-version = ["py39"] 11 | 12 | [tool.pytest.ini_options] 13 | addopts = "-ra" 14 | 15 | [project] 16 | name = "evaltools" 17 | version = "0.1.1" 18 | description = "Evaluate chat applications using Azure OpenAI evaluators" 19 | dependencies = [ 20 | "requests", 21 | "python-dotenv", 22 | "azure-ai-evaluation==1.8.0", 23 | "marshmallow==3.23.2", # Older version required due to promptflow issue with _T import 24 | "azure-search-documents", 25 | "typer", 26 | "openai>=1.56.1", # Includes fix for httpx proxies issues 27 | "pandas", 28 | "rich", 29 | "jmespath", 30 | "textual" 31 | ] 32 | 33 | [project.optional-dependencies] 34 | dev = [ 35 | "pre-commit", 36 | "ruff", 37 | "black", 38 | "pytest" 39 | ] 40 | 41 | [tool.setuptools.package-data] 42 | evaltools = ["review/*.tcss"] 43 | -------------------------------------------------------------------------------- /src/evaltools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/ai-rag-chat-evaluator/655d0c40e2ee92a07064cd5b7ae65d14d2b7c679/src/evaltools/__init__.py -------------------------------------------------------------------------------- /src/evaltools/__main__.py: -------------------------------------------------------------------------------- 1 | """Enables the use of `python -m evaltools` to run the CLI.""" 2 | 3 | from .cli import app 4 | 5 | if __name__ == "__main__": 6 | app() 7 | -------------------------------------------------------------------------------- /src/evaltools/cli.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | 4 | import dotenv 5 | import typer 6 | from rich.logging import RichHandler 7 | 8 | from evaltools import service_setup 9 | from evaltools.eval.evaluate import run_evaluate_from_config 10 | from evaltools.gen.generate import generate_dontknows_qa_data, 
generate_test_qa_data_for_search_index 11 | from evaltools.review import diff_app, diff_markdown, summary_app, summary_markdown 12 | 13 | app = typer.Typer(pretty_exceptions_enable=False) 14 | 15 | logging.basicConfig( 16 | level=logging.WARNING, format="%(message)s", datefmt="[%X]", handlers=[RichHandler(rich_tracebacks=True)] 17 | ) 18 | logger = logging.getLogger("evaltools") 19 | # We only set the level to INFO for our logger, 20 | # to avoid seeing the noisy INFO level logs from the Azure SDKs 21 | logger.setLevel(logging.INFO) 22 | 23 | dotenv.load_dotenv(override=True) 24 | 25 | 26 | def int_or_none(raw: str) -> int | None: 27 | return None if raw == "None" else int(raw) 28 | 29 | 30 | def str_or_none(raw: str) -> str | None: 31 | return None if raw == "None" else raw 32 | 33 | 34 | def path_or_none(raw: str) -> Path | None: 35 | return None if raw == "None" else Path(raw) 36 | 37 | 38 | @app.command() 39 | def generate( 40 | output: Path = typer.Option(exists=False, dir_okay=False, file_okay=True), 41 | numquestions: int = typer.Option(help="Number of questions to generate", default=200), 42 | persource: int = typer.Option(help="Number of questions to generate per source", default=5), 43 | citationfieldname: str = typer.Option(help="Name of citation field in AI Search index", default="sourcepage"), 44 | ): 45 | generate_test_qa_data_for_search_index( 46 | openai_config=service_setup.get_openai_config_dict(), 47 | search_client=service_setup.get_search_client(), 48 | num_questions_total=numquestions, 49 | num_questions_per_source=persource, 50 | output_file=Path.cwd() / output, 51 | citation_field_name=citationfieldname, 52 | ) 53 | 54 | 55 | @app.command() 56 | def generate_dontknows( 57 | input: Path = typer.Option(exists=True, dir_okay=False, file_okay=True), 58 | output: Path = typer.Option(exists=False, dir_okay=False, file_okay=True), 59 | numquestions: int = typer.Option(help="Number of questions to generate", default=40), 60 | ): 61 | generate_dontknows_qa_data( 62 | openai_config=service_setup.get_openai_config(), 63 | num_questions_total=numquestions, 64 | input_file=Path.cwd() / input, 65 | output_file=Path.cwd() / output, 66 | ) 67 | 68 | 69 | @app.command() 70 | def evaluate( 71 | config: Path = typer.Option( 72 | exists=True, dir_okay=False, file_okay=True, help="Path to config.json", default="config.json" 73 | ), 74 | numquestions: int | None = typer.Option( 75 | help="Number of questions to evaluate (defaults to all if not specified).", default=None, parser=int_or_none 76 | ), 77 | targeturl: str | None = typer.Option( 78 | help="URL of the target service to evaluate against (defaults to the one in the config).", 79 | default=None, 80 | parser=str_or_none, 81 | ), 82 | resultsdir: Path = typer.Option( 83 | help="Directory to save the results of the evaluation", default=None, parser=path_or_none 84 | ), 85 | ): 86 | run_evaluate_from_config(Path.cwd(), config, numquestions, targeturl, resultsdir) 87 | 88 | 89 | def str_or_none(value: str) -> str | None: 90 | return value if value != "None" else None 91 | 92 | 93 | @app.command() 94 | def diff( 95 | directory1: Path = typer.Argument(exists=True, dir_okay=True, file_okay=False), 96 | directory2: Path = typer.Argument(default=None, exists=True, dir_okay=True, file_okay=False), 97 | changed: str | None = typer.Option( 98 | help="Show only questions whose values changed for the given column", default=None, parser=str_or_none 99 | ), 100 | output: str | None = typer.Option(help="Output type, can be 'app' or 'markdown'",
default=None, parser=str_or_none), 101 | ): 102 | directories = [directory1] if directory2 is None else [directory1, directory2] 103 | if output == "markdown": 104 | print(diff_markdown.main(directories, changed)) 105 | else: 106 | diff_app.main(directories, changed) 107 | 108 | 109 | @app.command() 110 | def summary( 111 | results_dir: Path = typer.Argument(exists=True, dir_okay=True, file_okay=False), 112 | output: str | None = typer.Option(help="Output type, can be 'app' or 'markdown'", default=None, parser=str_or_none), 113 | highlight: str | None = typer.Option( 114 | help="Highlight a specific run in the summary", default=None, parser=str_or_none 115 | ), 116 | ): 117 | if output == "markdown": 118 | print(summary_markdown.main(results_dir, highlight_run=highlight)) 119 | else: 120 | summary_app.main(results_dir) 121 | 122 | 123 | def cli(): 124 | app() 125 | -------------------------------------------------------------------------------- /src/evaltools/eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/ai-rag-chat-evaluator/655d0c40e2ee92a07064cd5b7ae65d14d2b7c679/src/evaltools/eval/__init__.py -------------------------------------------------------------------------------- /src/evaltools/eval/evaluate.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import time 5 | from pathlib import Path 6 | 7 | import jmespath 8 | import pandas as pd 9 | import requests 10 | from rich.progress import track 11 | 12 | from evaltools import service_setup 13 | 14 | from .evaluate_metrics import metrics_by_name 15 | 16 | logger = logging.getLogger("evaltools") 17 | 18 | 19 | def send_question_to_target( 20 | question: str, 21 | url: str, 22 | parameters: dict = {}, 23 | raise_error=False, 24 | response_answer_jmespath="message.content", 25 | response_context_jmespath="context.data_points.text", 26 | ): 27 | headers = {"Content-Type": "application/json"} 28 | body = { 29 | "messages": [{"content": question, "role": "user"}], 30 | "context": parameters, 31 | } 32 | try: 33 | r = requests.post(url, headers=headers, json=body) 34 | r.encoding = "utf-8" 35 | 36 | latency = r.elapsed.total_seconds() 37 | 38 | try: 39 | response_dict = r.json() 40 | except json.JSONDecodeError: 41 | raise ValueError( 42 | f"Response from target {url} is not valid JSON:\n\n{r.text} \n" 43 | "Make sure that your configuration points at a chat endpoint that returns a single JSON object.\n" 44 | ) 45 | 46 | try: 47 | answer = jmespath.search(response_answer_jmespath, response_dict) 48 | data_points = jmespath.search(response_context_jmespath, response_dict) 49 | if isinstance(data_points, dict): 50 | context = json.dumps(data_points, ensure_ascii=False) 51 | elif isinstance(data_points, list): 52 | context = "\n\n".join(data_points) 53 | elif data_points is not None: 54 | # Hopefully it's a string 55 | context = data_points 56 | else: 57 | raise ValueError("Context is missing") 58 | except Exception: 59 | raise ValueError( 60 | "Response does not adhere to the expected schema. " 61 | f"The answer should be accessible via the JMESPath expression '{response_answer_jmespath}' " 62 | f"and the context should be accessible via the JMESPath expression '{response_context_jmespath}'. 
" 63 | "Either adjust the app response or adjust send_question_to_target() in evaluate.py " 64 | f"to match the actual schema.\nResponse: {response_dict}" 65 | ) 66 | 67 | response_obj = {"answer": answer, "context": context, "latency": latency} 68 | return response_obj 69 | except Exception as e: 70 | if raise_error: 71 | raise e 72 | return { 73 | "answer": str(e), 74 | "context": str(e), 75 | "latency": -1, 76 | } 77 | 78 | 79 | def truncate_for_log(s: str, max_length=50): 80 | return s if len(s) < max_length else s[:max_length] + "..." 81 | 82 | 83 | def load_jsonl(path: Path) -> list[dict]: 84 | with open(path, encoding="utf-8") as f: 85 | return [json.loads(line) for line in f.readlines()] 86 | 87 | 88 | def run_evaluation( 89 | openai_config: dict, 90 | testdata_path: Path, 91 | results_dir: Path, 92 | target_url: str, 93 | target_parameters={}, 94 | requested_metrics=[], 95 | num_questions=None, 96 | target_response_answer_jmespath=None, 97 | target_response_context_jmespath=None, 98 | model=None, 99 | azure_credential=None, 100 | ): 101 | logger.info("Running evaluation using data from %s", testdata_path) 102 | testdata = load_jsonl(testdata_path) 103 | if num_questions: 104 | logger.info("Limiting evaluation to %s questions", num_questions) 105 | testdata = testdata[:num_questions] 106 | 107 | logger.info("Sending a test question to the target to ensure it is running...") 108 | try: 109 | question = "What information is in your knowledge base?" 110 | target_data = send_question_to_target( 111 | question, 112 | target_url, 113 | target_parameters, 114 | raise_error=True, 115 | response_answer_jmespath=target_response_answer_jmespath, 116 | response_context_jmespath=target_response_context_jmespath, 117 | ) 118 | logger.info( 119 | 'Successfully received response from target for question: "%s"\n"answer": "%s"\n"context": "%s"', 120 | truncate_for_log(question), 121 | truncate_for_log(target_data["answer"]), 122 | truncate_for_log(target_data["context"]), 123 | ) 124 | except Exception as e: 125 | logger.error("Failed to send a test question to the target due to error: \n%s", e) 126 | return False 127 | 128 | logger.info("Sending a test chat completion to the GPT deployment to ensure it is running...") 129 | gpt_response = service_setup.get_openai_client(openai_config, azure_credential).chat.completions.create( 130 | model=model, 131 | messages=[{"role": "user", "content": "Hello!"}], 132 | n=1, 133 | ) 134 | logger.info('Successfully received response from GPT: "%s"', gpt_response.choices[0].message.content) 135 | 136 | logger.info("Starting evaluation...") 137 | for metric in requested_metrics: 138 | if metric not in metrics_by_name: 139 | logger.error(f"Requested metric {metric} is not available. 
Available metrics: {metrics_by_name.keys()}") 140 | return False 141 | 142 | requested_metrics = [ 143 | metrics_by_name[metric_name] for metric_name in requested_metrics if metric_name in metrics_by_name 144 | ] 145 | 146 | def evaluate_row(row): 147 | output = {} 148 | output["question"] = row["question"] 149 | output["truth"] = row["truth"] 150 | target_response = send_question_to_target( 151 | question=row["question"], 152 | url=target_url, 153 | parameters=target_parameters, 154 | response_answer_jmespath=target_response_answer_jmespath, 155 | response_context_jmespath=target_response_context_jmespath, 156 | ) 157 | output.update(target_response) 158 | for metric in requested_metrics: 159 | result = metric.evaluator_fn(openai_config=openai_config)( 160 | query=row["question"], 161 | response=output["answer"], 162 | context=output["context"], 163 | ground_truth=row["truth"], 164 | ) 165 | output.update(result) 166 | 167 | return output 168 | 169 | # Run evaluations in serial to avoid rate limiting 170 | questions_with_ratings = [] 171 | for row in track(testdata, description="Processing..."): 172 | questions_with_ratings.append(evaluate_row(row)) 173 | 174 | logger.info("Evaluation calls have completed. Calculating overall metrics now...") 175 | # Make the results directory if it doesn't exist 176 | results_dir.mkdir(parents=True, exist_ok=True) 177 | # Save the results 178 | with open(results_dir / "eval_results.jsonl", "w", encoding="utf-8") as results_file: 179 | for row in questions_with_ratings: 180 | results_file.write(json.dumps(row, ensure_ascii=False) + "\n") 181 | 182 | # Calculate aggregate metrics 183 | df = pd.DataFrame(questions_with_ratings) 184 | summary = {} 185 | for metric in requested_metrics: 186 | summary[metric.METRIC_NAME] = metric.get_aggregate_stats(df) 187 | # add a metric for the number of questions 188 | summary["num_questions"] = {"total": len(df)} 189 | 190 | # summary statistics 191 | with open(results_dir / "summary.json", "w", encoding="utf-8") as summary_file: 192 | summary_file.write(json.dumps(summary, indent=4)) 193 | 194 | with open(results_dir / "evaluate_parameters.json", "w", encoding="utf-8") as parameters_file: 195 | parameters = { 196 | "evaluation_gpt_model": model, 197 | "evaluation_timestamp": int(time.time()), 198 | "testdata_path": str(testdata_path), 199 | "target_url": target_url, 200 | "target_parameters": target_parameters, 201 | "num_questions": num_questions, 202 | } 203 | parameters_file.write(json.dumps(parameters, indent=4)) 204 | logger.info("Evaluation results saved in %s", results_dir) 205 | return True 206 | 207 | 208 | def process_config(obj: dict): 209 | """Replace special markers in a config dict with their values: 210 | * <TIMESTAMP> with current timestamp 211 | * <READFILE> with contents of file 212 | """ 213 | if isinstance(obj, dict): 214 | for key in obj: 215 | if isinstance(obj[key], dict): 216 | process_config(obj[key]) 217 | elif isinstance(obj[key], str) and "<TIMESTAMP>" in obj[key]: 218 | logger.info("Replaced %s in config with timestamp", key) 219 | obj[key] = obj[key].replace("<TIMESTAMP>", str(int(time.time()))) 220 | elif isinstance(obj[key], str) and "<READFILE>" in obj[key]: 221 | with open(obj[key].replace("<READFILE>", ""), encoding="utf-8") as f: 222 | logger.info("Replaced %s in config with contents of %s", key, f.name) 223 | obj[key] = f.read() 224 | 225 | 226 | def run_evaluate_from_config( 227 | working_dir, 228 | config_path, 229 | num_questions=None, 230 | target_url=None, 231 | results_dir=None, 232 | openai_config=None, 233 | model=None, 234 |
azure_credential=None, 235 | ): 236 | config_path = working_dir / Path(config_path) 237 | logger.info("Running evaluation from config %s", config_path) 238 | with open(config_path, encoding="utf-8") as f: 239 | config = json.load(f) 240 | process_config(config) 241 | 242 | if results_dir is None: 243 | results_dir = working_dir / Path(config["results_dir"]) 244 | 245 | evaluation_run_complete = run_evaluation( 246 | openai_config=openai_config or service_setup.get_openai_config(), 247 | testdata_path=working_dir / config["testdata_path"], 248 | results_dir=results_dir, 249 | target_url=target_url or config["target_url"], 250 | target_parameters=config.get("target_parameters", {}), 251 | num_questions=num_questions, 252 | requested_metrics=config.get( 253 | "requested_metrics", 254 | ["gpt_groundedness", "gpt_relevance", "gpt_coherence", "answer_length", "latency"], 255 | ), 256 | target_response_answer_jmespath=config.get("target_response_answer_jmespath", "message.content"), 257 | target_response_context_jmespath=config.get("target_response_context_jmespath", "context.data_points.text"), 258 | model=model or os.environ["OPENAI_GPT_MODEL"], 259 | azure_credential=azure_credential, 260 | ) 261 | 262 | if evaluation_run_complete: 263 | results_config_path = results_dir / "config.json" 264 | logger.info("Saving original config file back to to %s", results_config_path) 265 | with open(config_path, encoding="utf-8") as input_config: 266 | with open(results_config_path, "w", encoding="utf-8") as output_config: 267 | output_config.write(input_config.read()) 268 | else: 269 | logger.error("Evaluation was terminated early due to an error ⬆") 270 | -------------------------------------------------------------------------------- /src/evaltools/eval/evaluate_metrics/__init__.py: -------------------------------------------------------------------------------- 1 | from .builtin_metrics import ( 2 | BuiltinCoherenceMetric, 3 | BuiltinF1ScoreMetric, 4 | BuiltinFluencyMetric, 5 | BuiltinGroundednessMetric, 6 | BuiltinRelevanceMetric, 7 | BuiltinSimilarityMetric, 8 | ) 9 | from .code_metrics import AnswerLengthMetric, CitationMatchMetric, HasCitationMetric, LatencyMetric 10 | from .prompt_metrics import CoherenceMetric, DontKnownessMetric, GroundednessMetric, RelevanceMetric 11 | 12 | metrics = [ 13 | BuiltinCoherenceMetric, 14 | BuiltinRelevanceMetric, 15 | BuiltinGroundednessMetric, 16 | BuiltinSimilarityMetric, 17 | BuiltinFluencyMetric, 18 | BuiltinF1ScoreMetric, 19 | CoherenceMetric, 20 | RelevanceMetric, 21 | GroundednessMetric, 22 | DontKnownessMetric, 23 | LatencyMetric, 24 | AnswerLengthMetric, 25 | HasCitationMetric, 26 | CitationMatchMetric, 27 | ] 28 | 29 | metrics_by_name = {metric.METRIC_NAME: metric for metric in metrics} 30 | 31 | 32 | def register_metric(metric_class): 33 | """Register a new custom metric class.""" 34 | if not hasattr(metric_class, "METRIC_NAME"): 35 | raise ValueError("Metric class must have a METRIC_NAME attribute") 36 | # Check if the metric name is already registered 37 | if metric_class.METRIC_NAME in metrics_by_name: 38 | raise ValueError(f"Metric with name {metric_class.METRIC_NAME} is already registered") 39 | metrics_by_name[metric_class.METRIC_NAME] = metric_class 40 | -------------------------------------------------------------------------------- /src/evaltools/eval/evaluate_metrics/base_metric.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from abc import ABC, abstractmethod 3 | 4 | import pandas as 
pd 5 | 6 | logger = logging.getLogger("evaltools") 7 | 8 | 9 | class BaseMetric(ABC): 10 | METRIC_NAME = "name_of_metric" 11 | 12 | @classmethod 13 | @abstractmethod 14 | def get_aggregate_stats(cls, df): 15 | """Returns a dictionary of aggregate statistics for the metric""" 16 | pass 17 | 18 | @classmethod 19 | def get_aggregate_stats_for_numeric_rating(cls, df, rating_column_name): 20 | # Narrow down dataframe to just the metric 21 | df = df[[rating_column_name]] 22 | 23 | # Drop invalid ratings - strings like "Failed" 24 | rows_before = len(df) 25 | df = df.apply(pd.to_numeric, errors="coerce") 26 | df = df.dropna() 27 | rows_after = len(df) 28 | if rows_before != rows_after: 29 | logger.warning( 30 | "Dropped %d invalid ratings for metric %s", 31 | rows_before - rows_after, 32 | rating_column_name, 33 | ) 34 | 35 | # Count how many ratings passed threshold of 4+ 36 | pass_count = int(df[rating_column_name].apply(lambda rating: rating >= 4).sum()) 37 | 38 | return { 39 | "pass_count": pass_count, 40 | "pass_rate": round(pass_count / rows_before, 2), 41 | "mean_rating": round(df[rating_column_name].mean(), 2), 42 | } 43 | -------------------------------------------------------------------------------- /src/evaltools/eval/evaluate_metrics/builtin_metrics.py: -------------------------------------------------------------------------------- 1 | from azure.ai.evaluation import ( 2 | CoherenceEvaluator, 3 | F1ScoreEvaluator, 4 | FluencyEvaluator, 5 | GroundednessEvaluator, 6 | RelevanceEvaluator, 7 | SimilarityEvaluator, 8 | ) 9 | 10 | from .base_metric import BaseMetric 11 | 12 | 13 | class BuiltinRatingMetric(BaseMetric): 14 | @classmethod 15 | def get_aggregate_stats(cls, df): 16 | return cls.get_aggregate_stats_for_numeric_rating(df, cls.METRIC_NAME) 17 | 18 | 19 | class BuiltinRelevanceMetric(BuiltinRatingMetric): 20 | METRIC_NAME = "gpt_relevance" 21 | 22 | @classmethod 23 | def evaluator_fn(cls, openai_config, **kwargs): 24 | return RelevanceEvaluator(openai_config) 25 | 26 | 27 | class BuiltinCoherenceMetric(BuiltinRatingMetric): 28 | METRIC_NAME = "gpt_coherence" 29 | 30 | @classmethod 31 | def evaluator_fn(cls, openai_config, **kwargs): 32 | return CoherenceEvaluator(openai_config) 33 | 34 | 35 | class BuiltinGroundednessMetric(BuiltinRatingMetric): 36 | METRIC_NAME = "gpt_groundedness" 37 | 38 | @classmethod 39 | def evaluator_fn(cls, openai_config, **kwargs): 40 | return GroundednessEvaluator(openai_config) 41 | 42 | 43 | class BuiltinSimilarityMetric(BuiltinRatingMetric): 44 | METRIC_NAME = "gpt_similarity" 45 | 46 | @classmethod 47 | def evaluator_fn(cls, openai_config, **kwargs): 48 | return SimilarityEvaluator(openai_config) 49 | 50 | 51 | class BuiltinFluencyMetric(BuiltinRatingMetric): 52 | METRIC_NAME = "gpt_fluency" 53 | 54 | @classmethod 55 | def evaluator_fn(cls, openai_config, **kwargs): 56 | return FluencyEvaluator(openai_config) 57 | 58 | 59 | class BuiltinF1ScoreMetric(BaseMetric): 60 | METRIC_NAME = "f1_score" 61 | 62 | @classmethod 63 | def evaluator_fn(cls, **kwargs): 64 | return F1ScoreEvaluator() 65 | 66 | @classmethod 67 | def get_aggregate_stats(cls, df): 68 | return { 69 | "mean": round(df[cls.METRIC_NAME].mean(), 2), 70 | "max": round(df[cls.METRIC_NAME].max(), 2), 71 | "min": round(df[cls.METRIC_NAME].min(), 2), 72 | } 73 | -------------------------------------------------------------------------------- /src/evaltools/eval/evaluate_metrics/code_metrics.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 
import re 3 | 4 | from .base_metric import BaseMetric 5 | 6 | logger = logging.getLogger("evaltools") 7 | 8 | 9 | class AnswerLengthMetric(BaseMetric): 10 | METRIC_NAME = "answer_length" 11 | 12 | @classmethod 13 | def evaluator_fn(cls, **kwargs): 14 | def answer_length(*, response, **kwargs): 15 | if response is None: 16 | logger.warning("Received response of None, can't compute answer_length metric. Setting to -1.") 17 | return {cls.METRIC_NAME: -1} 18 | return {cls.METRIC_NAME: len(response)} 19 | 20 | return answer_length 21 | 22 | @classmethod 23 | def get_aggregate_stats(cls, df): 24 | # remove -1 values from the mean calculation 25 | df = df[df[cls.METRIC_NAME] != -1] 26 | return { 27 | "mean": round(df[cls.METRIC_NAME].mean(), 2), 28 | "max": int(df[cls.METRIC_NAME].max()), 29 | "min": int(df[cls.METRIC_NAME].min()), 30 | } 31 | 32 | 33 | class HasCitationMetric(BaseMetric): 34 | METRIC_NAME = "has_citation" 35 | 36 | @classmethod 37 | def evaluator_fn(cls, **kwargs): 38 | def has_citation(*, response, **kwargs): 39 | if response is None: 40 | logger.warning("Received response of None, can't compute has_citation metric. Setting to -1.") 41 | return {cls.METRIC_NAME: -1} 42 | return {cls.METRIC_NAME: bool(re.search(r"\[[^\]]+\]", response))} 43 | 44 | return has_citation 45 | 46 | @classmethod 47 | def get_aggregate_stats(cls, df): 48 | df = df[df[cls.METRIC_NAME] != -1] 49 | return { 50 | "total": int(df[cls.METRIC_NAME].sum()), 51 | "rate": round(df[cls.METRIC_NAME].mean(), 2), 52 | } 53 | 54 | 55 | class CitationMatchMetric(BaseMetric): 56 | METRIC_NAME = "citation_match" 57 | 58 | @classmethod 59 | def evaluator_fn(cls, **kwargs): 60 | def citation_match(*, response, ground_truth, **kwargs): 61 | if response is None: 62 | logger.warning("Received response of None, can't compute citation_match metric. 
Setting to -1.") 63 | return {cls.METRIC_NAME: -1} 64 | # Return true if all citations in the truth are present in the response 65 | truth_citations = set(re.findall(r"\[([^\]]+)\.\w{3,4}(#page=\d+)*\]", ground_truth)) 66 | response_citations = set(re.findall(r"\[([^\]]+)\.\w{3,4}(#page=\d+)*\]", response)) 67 | citation_match = truth_citations.issubset(response_citations) 68 | return {cls.METRIC_NAME: citation_match} 69 | 70 | return citation_match 71 | 72 | @classmethod 73 | def get_aggregate_stats(cls, df): 74 | df = df[df[cls.METRIC_NAME] != -1] 75 | return { 76 | "total": int(df[cls.METRIC_NAME].sum()), 77 | "rate": round(df[cls.METRIC_NAME].mean(), 2), 78 | } 79 | 80 | 81 | class LatencyMetric(BaseMetric): 82 | METRIC_NAME = "latency" 83 | 84 | @classmethod 85 | def evaluator_fn(cls, **kwargs): 86 | def latency(**kwargs): 87 | # Return no additional data, since latency is already stored in the target response 88 | return {} 89 | 90 | return latency 91 | 92 | @classmethod 93 | def get_aggregate_stats(cls, df): 94 | return { 95 | "mean": round(df[cls.METRIC_NAME].mean(), 2), 96 | "max": df[cls.METRIC_NAME].max(), 97 | "min": df[cls.METRIC_NAME].min(), 98 | } 99 | -------------------------------------------------------------------------------- /src/evaltools/eval/evaluate_metrics/prompt_metrics.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | from promptflow.client import load_flow 7 | 8 | from .base_metric import BaseMetric 9 | 10 | PROMPT_TEMPLATE_DIR = Path(__file__).resolve().parent / "prompts" 11 | 12 | logger = logging.getLogger("evaltools") 13 | 14 | 15 | class PromptBasedEvaluator: 16 | def __init__(self, model_config, path, name): 17 | prompty_model_config = {"configuration": model_config} 18 | self._name = name 19 | self._flow = load_flow(source=path, model=prompty_model_config) 20 | 21 | def __call__(self, **kwargs) -> dict: 22 | llm_output = self._flow(**kwargs) 23 | 24 | score = np.nan 25 | if llm_output: 26 | match = re.search(r"\d", llm_output) 27 | if match: 28 | score = float(match.group()) 29 | else: 30 | logging.warning( 31 | "No score found in answer: %s\nMake sure prompty file is correctly formatted.", llm_output 32 | ) 33 | 34 | output = {} 35 | output[self._name] = float(score) 36 | return output 37 | 38 | 39 | class CustomRatingMetric(BaseMetric): 40 | @classmethod 41 | def evaluator_fn(cls, openai_config, **kwargs): 42 | return PromptBasedEvaluator( 43 | openai_config, path=PROMPT_TEMPLATE_DIR / f"{cls.METRIC_NAME}.prompty", name=cls.METRIC_NAME 44 | ) 45 | 46 | @classmethod 47 | def get_aggregate_stats(cls, df): 48 | return cls.get_aggregate_stats_for_numeric_rating(df, cls.METRIC_NAME) 49 | 50 | 51 | class RelevanceMetric(CustomRatingMetric): 52 | METRIC_NAME = "myrelevance" 53 | 54 | 55 | class CoherenceMetric(CustomRatingMetric): 56 | METRIC_NAME = "mycoherence" 57 | 58 | 59 | class GroundednessMetric(CustomRatingMetric): 60 | METRIC_NAME = "mygroundedness" 61 | 62 | 63 | class DontKnownessMetric(CustomRatingMetric): 64 | METRIC_NAME = "dontknowness" 65 | -------------------------------------------------------------------------------- /src/evaltools/eval/evaluate_metrics/prompts/dontknowness.prompty: -------------------------------------------------------------------------------- 1 | --- 2 | name: DontKnowness Evaluation 3 | description: Evaluates don't-know-ness of an answer 4 | model: 5 | api: chat 6 | configuration: 7 | type: 
azure_openai 8 | azure_deployment: ${env:AZURE_OPENAI_EVAL_DEPLOYMENT} 9 | azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT} 10 | parameters: 11 | temperature: 0.0 12 | max_tokens: 1 13 | top_p: 1.0 14 | presence_penalty: 0 15 | frequency_penalty: 0 16 | response_format: 17 | type: text 18 | 19 | inputs: 20 | question: 21 | type: string 22 | answer: 23 | type: string 24 | 25 | sample: 26 | question: What are the main goals of Perseverance Mars rover mission? 27 | answer: The main goals of the Perseverance Mars rover mission are to search for signs of ancient life and collect rock and soil samples for possible return to Earth. 28 | --- 29 | system: 30 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information. 31 | 32 | user: 33 | The "I don't know"-ness metric is a measure of how much an answer conveys the lack of knowledge or uncertainty, which is useful for making sure a chatbot for a particular domain doesn't answer outside that domain. Score the I-dont-know-ness of the answer between one to five stars using the following rating scale: 34 | One star: the answer completely answers the question and conveys no uncertainty 35 | Two stars: the answer conveys a little uncertainty but mostly attempts to answer the question 36 | Three stars: the answer conveys some uncertainty but still contains some attempt to answer the question 37 | Four stars: the answer conveys uncertainty and makes no attempt to answer the question 38 | Five stars: the answer says straightforwardly that it doesn't know, and makes no attempt to answer the question. 39 | 40 | This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. 41 | 42 | question: What are the main goals of Perseverance Mars rover mission? 43 | answer: The main goals of the Perseverance Mars rover mission are to search for signs of ancient life and collect rock and soil samples for possible return to Earth. 44 | stars: 1 45 | 46 | question: What field did Marie Curie excel in? 47 | answer: I'm not sure, but I think Marie Curie excelled in the field of science. 48 | stars: 2 49 | 50 | question: What are the main components of the Mediterranean diet? 51 | answer: I don't have an answer in my sources but I think the diet has some fats? 52 | stars: 3 53 | 54 | question: What are the main attractions of the Queen's Royal Castle? 55 | answer: I'm not certain. Perhaps try rephrasing the question? 56 | stars: 4 57 | 58 | question: Where were The Beatles formed? 59 | answer: I'm sorry, I don't know, that answer is not in my sources. 
60 | stars: 5 61 | 62 | question: {{query}} 63 | answer: {{response}} 64 | stars: 65 | -------------------------------------------------------------------------------- /src/evaltools/eval/evaluate_metrics/prompts/mycoherence.prompty: -------------------------------------------------------------------------------- 1 | --- 2 | name: Coherence 3 | description: Evaluates coherence score for QA scenario 4 | model: 5 | api: chat 6 | configuration: 7 | type: azure_openai 8 | azure_deployment: ${env:AZURE_DEPLOYMENT} 9 | azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT} 10 | parameters: 11 | temperature: 0.0 12 | max_tokens: 1 13 | top_p: 1.0 14 | presence_penalty: 0 15 | frequency_penalty: 0 16 | response_format: 17 | type: text 18 | 19 | inputs: 20 | query: 21 | type: string 22 | response: 23 | type: string 24 | 25 | sample: 26 | query: What are the main goals of Perseverance Mars rover mission? 27 | response: The main goals of the Perseverance Mars rover mission are to search for signs of ancient life and collect rock and soil samples for possible return to Earth. 28 | --- 29 | system: 30 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information. 31 | 32 | user: 33 | Coherence of an answer is measured by how well all the sentences fit together and sound naturally as a whole. Consider the overall quality of the answer when evaluating coherence. Given the question and answer, score the coherence of answer between one to five stars using the following rating scale: 34 | One star: the answer completely lacks coherence 35 | Two stars: the answer mostly lacks coherence 36 | Three stars: the answer is partially coherent 37 | Four stars: the answer is mostly coherent 38 | Five stars: the answer has perfect coherency 39 | 40 | This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. 41 | 42 | question: What is your favorite indoor activity and why do you enjoy it? 43 | answer: I like pizza. The sun is shining. 44 | stars: 1 45 | 46 | question: Can you describe your favorite movie without giving away any spoilers? 47 | answer: It is a science fiction movie. There are dinosaurs. The actors eat cake. People must stop the villain. 48 | stars: 2 49 | 50 | question: What are some benefits of regular exercise? 51 | answer: Regular exercise improves your mood. A good workout also helps you sleep better. Trees are green. 52 | stars: 3 53 | 54 | question: How do you cope with stress in your daily life? 55 | answer: I usually go for a walk to clear my head. Listening to music helps me relax as well. Stress is a part of life, but we can manage it through some activities. 56 | stars: 4 57 | 58 | question: What can you tell me about climate change and its effects on the environment? 59 | answer: Climate change has far-reaching effects on the environment. Rising temperatures result in the melting of polar ice caps, contributing to sea-level rise. Additionally, more frequent and severe weather events, such as hurricanes and heatwaves, can cause disruption to ecosystems and human societies alike. 
60 | stars: 5 61 | 62 | question: {{query}} 63 | answer: {{response}} 64 | stars: 65 | -------------------------------------------------------------------------------- /src/evaltools/eval/evaluate_metrics/prompts/mygroundedness.prompty: -------------------------------------------------------------------------------- 1 | --- 2 | name: Groundedness 3 | description: Evaluates groundedness score for QA scenario 4 | model: 5 | api: chat 6 | configuration: 7 | type: azure_openai 8 | azure_deployment: ${env:AZURE_DEPLOYMENT} 9 | azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT} 10 | parameters: 11 | temperature: 0.0 12 | max_tokens: 1 13 | top_p: 1.0 14 | presence_penalty: 0 15 | frequency_penalty: 0 16 | response_format: 17 | type: text 18 | 19 | inputs: 20 | response: 21 | type: string 22 | context: 23 | type: string 24 | 25 | sample: 26 | context: The Mars rover, Perseverance, was launched in 2020 with the main goal of searching for signs of ancient life on Mars. The rover also carries an experiment called MOXIE, which aims to generate oxygen from the Martian atmosphere. 27 | response: The main goals of the Perseverance Mars rover mission are to search for signs of ancient life and collect rock and soil samples for possible return to Earth. 28 | --- 29 | system: 30 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information. 31 | user: 32 | You will be presented with a CONTEXT and an ANSWER about that CONTEXT. You need to decide whether the ANSWER is entailed by the CONTEXT by choosing one of the following rating: 33 | 1. 5: The ANSWER follows logically from the information contained in the CONTEXT. 34 | 2. 1: The ANSWER is logically false from the information contained in the CONTEXT. 35 | 3. an integer score between 1 and 5 and if such integer score does not exist, use 1: It is not possible to determine whether the ANSWER is true or false without further information. Read the passage of information thoroughly and select the correct answer from the three answer labels. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails. Note the ANSWER is generated by a computer system, it can contain certain symbols, which should not be a negative factor in the evaluation. 36 | Independent Examples: 37 | ## Example Task #1 Input: 38 | {"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."} 39 | ## Example Task #1 Output: 40 | 1 41 | ## Example Task #2 Input: 42 | {"CONTEXT": "Ten new television shows appeared during the month of September. Five of the shows were sitcoms, three were hourlong dramas, and two were news-magazine shows. By January, only seven of these new shows were still on the air. 
Five of the shows that remained were sitcoms.", "QUESTION": "", "ANSWER": "At least one of the shows that were cancelled was an hourlong drama."} 43 | ## Example Task #2 Output: 44 | 5 45 | ## Example Task #3 Input: 46 | {"CONTEXT": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is neither French nor English.", "QUESTION": "", "ANSWER": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is not French."} 47 | ## Example Task #3 Output: 48 | 5 49 | ## Example Task #4 Input: 50 | {"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."} 51 | ## Example Task #4 Output: 52 | 1 53 | ## Actual Task Input: 54 | {"CONTEXT": {{context}}, "QUESTION": "", "ANSWER": {{response}}} 55 | Reminder: The return values for each task should be correctly formatted as an integer between 1 and 5. Do not repeat the context and question. 56 | Actual Task Output: 57 | -------------------------------------------------------------------------------- /src/evaltools/eval/evaluate_metrics/prompts/myrelevance.prompty: -------------------------------------------------------------------------------- 1 | --- 2 | name: Relevance 3 | description: Evaluates relevance score for QA scenario 4 | model: 5 | api: chat 6 | configuration: 7 | type: azure_openai 8 | azure_deployment: ${env:AZURE_DEPLOYMENT} 9 | azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT} 10 | parameters: 11 | temperature: 0.0 12 | max_tokens: 1 13 | top_p: 1.0 14 | presence_penalty: 0 15 | frequency_penalty: 0 16 | response_format: 17 | type: text 18 | 19 | inputs: 20 | query: 21 | type: string 22 | response: 23 | type: string 24 | context: 25 | type: string 26 | 27 | sample: 28 | question: What are the main goals of Perseverance Mars rover mission? 29 | answer: The main goals of the Perseverance Mars rover mission are to search for signs of ancient life and collect rock and soil samples for possible return to Earth. 30 | context: The Mars rover, Perseverance, was launched in 2020 with the main goal of searching for signs of ancient life on Mars. The rover also carries an experiment called MOXIE, which aims to generate oxygen from the Martian atmosphere. 31 | --- 32 | system: 33 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information. 34 | user: 35 | Relevance measures how well the answer addresses the main aspects of the question, based on the context. Consider whether all and only the important aspects are contained in the answer when evaluating relevance. Given the context and question, score the relevance of the answer between one to five stars using the following rating scale: 36 | One star: the answer completely lacks relevance 37 | Two stars: the answer mostly lacks relevance 38 | Three stars: the answer is partially relevant 39 | Four stars: the answer is mostly relevant 40 | Five stars: the answer has perfect relevance 41 | 42 | This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. 
43 | 44 | context: Marie Curie was a Polish-born physicist and chemist who pioneered research on radioactivity and was the first woman to win a Nobel Prize. 45 | question: What field did Marie Curie excel in? 46 | answer: Marie Curie was a renowned painter who focused mainly on impressionist styles and techniques. 47 | stars: 1 48 | 49 | context: The Beatles were an English rock band formed in Liverpool in 1960, and they are widely regarded as the most influential music band in history. 50 | question: Where were The Beatles formed? 51 | answer: The band The Beatles began their journey in London, England, and they changed the history of music. 52 | stars: 2 53 | 54 | context: The recent Mars rover, Perseverance, was launched in 2020 with the main goal of searching for signs of ancient life on Mars. The rover also carries an experiment called MOXIE, which aims to generate oxygen from the Martian atmosphere. 55 | question: What are the main goals of Perseverance Mars rover mission? 56 | answer: The Perseverance Mars rover mission focuses on searching for signs of ancient life on Mars. 57 | stars: 3 58 | 59 | context: The Mediterranean diet is a commonly recommended dietary plan that emphasizes fruits, vegetables, whole grains, legumes, lean proteins, and healthy fats. Studies have shown that it offers numerous health benefits, including a reduced risk of heart disease and improved cognitive health. 60 | question: What are the main components of the Mediterranean diet? 61 | answer: The Mediterranean diet primarily consists of fruits, vegetables, whole grains, and legumes. 62 | stars: 4 63 | 64 | context: The Queen's Royal Castle is a well-known tourist attraction in the United Kingdom. It spans over 500 acres and contains extensive gardens and parks. The castle was built in the 15th century and has been home to generations of royalty. 65 | question: What are the main attractions of the Queen's Royal Castle? 66 | answer: The main attractions of the Queen's Royal Castle are its expansive 500-acre grounds, extensive gardens, parks, and the historical castle itself, which dates back to the 15th century and has housed generations of royalty. 
67 | stars: 5 68 | 69 | context: {{context}} 70 | question: {{query}} 71 | answer: {{response}} 72 | stars: 73 | -------------------------------------------------------------------------------- /src/evaltools/gen/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/ai-rag-chat-evaluator/655d0c40e2ee92a07064cd5b7ae65d14d2b7c679/src/evaltools/gen/__init__.py -------------------------------------------------------------------------------- /src/evaltools/gen/generate.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import math 4 | import random 5 | from collections.abc import Generator 6 | from pathlib import Path 7 | 8 | from azure.search.documents import SearchClient 9 | 10 | from evaltools import service_setup 11 | 12 | logger = logging.getLogger("evaltools") 13 | 14 | 15 | def generate_test_qa_data( 16 | openai_config: dict, 17 | num_questions_total: int, 18 | num_questions_per_source: int, 19 | output_file: Path, 20 | source_retriever: Generator[dict, None, None], 21 | source_to_text: callable, 22 | answer_formatter: callable, 23 | ): 24 | try: 25 | from azure.ai.generative.synthetic.qa import QADataGenerator, QAType 26 | except ImportError: 27 | logger.error( 28 | "Azure AI Generative package is deprecated and no longer working, so this functionality is disabled." 29 | ) 30 | 31 | logger.info( 32 | "Generating %d questions total, %d per source, based on search results", 33 | num_questions_total, 34 | num_questions_per_source, 35 | ) 36 | qa_generator = QADataGenerator(model_config=openai_config) 37 | 38 | qa: list[dict] = [] 39 | for source in source_retriever(): 40 | if len(qa) > num_questions_total: 41 | logger.info("Generated enough questions already, stopping") 42 | break 43 | result = qa_generator.generate( 44 | text=source_to_text(source), 45 | qa_type=QAType.LONG_ANSWER, 46 | num_questions=num_questions_per_source, 47 | ) 48 | 49 | for question, answer in result["question_answers"]: 50 | qa.append({"question": question, "truth": answer_formatter(answer, source)}) 51 | 52 | logger.info("Writing %d questions to %s", len(qa), output_file) 53 | directory = Path(output_file).parent 54 | if not directory.exists(): 55 | directory.mkdir(parents=True) 56 | with open(output_file, "w", encoding="utf-8") as f: 57 | for item in qa[0:num_questions_total]: 58 | f.write(json.dumps(item) + "\n") 59 | 60 | 61 | def generate_test_qa_data_for_search_index( 62 | openai_config: dict, 63 | num_questions_total: int, 64 | num_questions_per_source: int, 65 | output_file: Path, 66 | search_client: SearchClient, 67 | citation_field_name: str, 68 | ): 69 | def source_retriever() -> Generator[dict, None, None]: 70 | for doc in search_client.search("", top=1000): 71 | logger.info("Processing search document %s", doc[citation_field_name]) 72 | yield doc 73 | 74 | def source_to_text(source) -> str: 75 | return source["content"] 76 | 77 | def answer_formatter(answer, source) -> str: 78 | return f"{answer} [{source[citation_field_name]}]" 79 | 80 | generate_test_qa_data( 81 | openai_config, 82 | num_questions_total, 83 | num_questions_per_source, 84 | output_file, 85 | source_retriever, 86 | source_to_text, 87 | answer_formatter, 88 | ) 89 | 90 | 91 | def generate_based_on_questions(openai_client, model: str, qa: list, num_questions: int, prompt: str): 92 | existing_questions = "" 93 | if qa: 94 | qa = random.sample(qa, len(qa)) # Shuffle questions for 
some randomness 95 | existing_questions = "\n".join([item["question"] for item in qa]) 96 | 97 | gpt_response = openai_client.chat.completions.create( 98 | model=model, 99 | messages=[ 100 | { 101 | "role": "user", 102 | "content": f"{prompt} Only generate {num_questions} TOTAL. Separate each question by a new line. \n{existing_questions}", # noqa: E501 103 | } 104 | ], 105 | n=1, 106 | max_tokens=num_questions * 50, 107 | temperature=0.3, 108 | ) 109 | 110 | qa = [] 111 | for message in gpt_response.choices[0].message.content.split("\n")[0:num_questions]: 112 | qa.append({"question": message, "truth": f"Generated from this prompt: {prompt}"}) 113 | return qa 114 | 115 | 116 | def generate_dontknows_qa_data(openai_config: dict, num_questions_total: int, input_file: Path, output_file: Path): 117 | logger.info("Generating off-topic questions based on %s", input_file) 118 | with open(input_file, encoding="utf-8") as f: 119 | qa = [json.loads(line) for line in f.readlines()] 120 | 121 | openai_client = service_setup.get_openai_client(openai_config) 122 | dontknows_qa = [] 123 | num_questions_each = math.ceil(num_questions_total / 4) 124 | dontknows_qa += generate_based_on_questions( 125 | openai_client, 126 | openai_config.model, 127 | qa, 128 | num_questions_each, 129 | f"Given these questions, suggest {num_questions_each} questions that are very related but are not directly answerable by the same sources. Do not simply ask for other examples of the same thing - your question should be standalone.", # noqa: E501 130 | ) 131 | dontknows_qa += generate_based_on_questions( 132 | openai_client, 133 | openai_config.model, 134 | qa, 135 | num_questions_each, 136 | f"Given these questions, suggest {num_questions_each} questions with similar keywords that are about publicly known facts.", # noqa: E501 137 | ) 138 | dontknows_qa += generate_based_on_questions( 139 | openai_client, 140 | openai_config.model, 141 | qa, 142 | num_questions_each, 143 | f"Given these questions, suggest {num_questions_each} questions that are not related to these topics at all but have well known answers.", # noqa: E501 144 | ) 145 | remaining = num_questions_total - len(dontknows_qa) 146 | dontknows_qa += generate_based_on_questions( 147 | openai_client, 148 | openai_config.model, 149 | qa=None, 150 | num_questions=remaining, 151 | prompt=f"Suggest {remaining} questions that are nonsensical, and would result in confusion if you asked it.", # noqa: E501 152 | ) 153 | 154 | logger.info("Writing %d off-topic questions to %s", len(dontknows_qa), output_file) 155 | directory = Path(output_file).parent 156 | if not directory.exists(): 157 | directory.mkdir(parents=True) 158 | with open(output_file, "w", encoding="utf-8") as f: 159 | for item in dontknows_qa: 160 | f.write(json.dumps(item) + "\n") 161 | -------------------------------------------------------------------------------- /src/evaltools/review/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/ai-rag-chat-evaluator/655d0c40e2ee92a07064cd5b7ae65d14d2b7c679/src/evaltools/review/__init__.py -------------------------------------------------------------------------------- /src/evaltools/review/answers.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Review answers 5 | 9 | 13 | 14 | 15 |
[answers.html markup not preserved in this dump; page title: "Review answers"]
28 | 29 | 69 | 70 | 71 | 76 | 175 | 176 | 177 | -------------------------------------------------------------------------------- /src/evaltools/review/diff_app.py: -------------------------------------------------------------------------------- 1 | # a CLI tool to diff two JSON files 2 | from pathlib import Path 3 | 4 | from textual.app import App, ComposeResult 5 | from textual.containers import Horizontal, Vertical, VerticalScroll 6 | from textual.widgets import Button, DataTable, Markdown, Static 7 | 8 | from .utils import diff_directories 9 | 10 | 11 | class DiffApp(App): 12 | CSS_PATH = "diff_app.tcss" 13 | 14 | def __init__(self, directories: list[Path], changed: str = None): 15 | super().__init__() 16 | # Only include the first directory if the second is not provided 17 | self.directories = directories 18 | self.changed = changed 19 | self.data_dicts = [] # Store dicts keyed by question 20 | self.result_index = 0 # Based on results in the first directory 21 | 22 | def on_mount(self): 23 | self.data_dicts = diff_directories(self.directories) 24 | self.next_question() 25 | 26 | def on_button_pressed(self, event: Button.Pressed) -> None: 27 | if event.button.id == "quit": 28 | self.exit() 29 | else: 30 | self.next_question() 31 | 32 | def compose(self) -> ComposeResult: 33 | with Vertical(): 34 | yield Static(id="question") 35 | with Horizontal(id="sources"): 36 | for directory in self.directories: 37 | yield Static(directory.name, classes="source") 38 | if len(self.directories) == 1: 39 | yield Static("Ground truth answer", classes="source") 40 | with Horizontal(id="answers"): 41 | for ind in range(len(self.directories)): 42 | with VerticalScroll(classes="answer"): 43 | yield Markdown(id=f"answer{ind}") 44 | if len(self.directories) == 1: 45 | with VerticalScroll(classes="answer"): 46 | yield Markdown(id="answer_truth") 47 | with Horizontal(id="metrics"): 48 | for ind in range(len(self.directories)): 49 | yield DataTable(id=f"metrics{ind}", show_cursor=False, cell_padding=1) 50 | with Horizontal(id="buttons"): 51 | yield Button.success("Next question", classes="button") 52 | yield Button.error("Quit", id="quit", classes="button") 53 | 54 | def next_question(self): 55 | if self.result_index >= len(self.data_dicts[0]): 56 | self.exit() 57 | return 58 | question = list(self.data_dicts[0].keys())[self.result_index] 59 | self.query_one("#question", Static).update(question) 60 | 61 | for ind in range(len(self.directories)): 62 | try: 63 | self.query_one(f"#answer{ind}", Markdown).update(self.data_dicts[ind][question]["answer"]) 64 | if len(self.directories) == 1: 65 | self.query_one("#answer_truth", Markdown).update(self.data_dicts[0][question]["truth"]) 66 | except KeyError: 67 | self.query_one(f"#answer{ind}", Markdown).update("No answer found for that question") 68 | continue 69 | 70 | # Find all fields in the result that have numeric values 71 | metric_columns = [] 72 | metric_values = [] 73 | question_results = self.data_dicts[ind][question] 74 | for column, value in question_results.items(): 75 | if isinstance(value, int | float): 76 | metric_columns.append(column) 77 | metric_values.append(round(value, 1) if isinstance(value, float) else value) 78 | datatable = self.query_one(f"#metrics{ind}", DataTable) 79 | datatable.clear(columns=True).add_columns(*metric_columns) 80 | datatable.add_row(*metric_values) 81 | datatable.add_row("" * len(metric_columns)) 82 | 83 | self.result_index += 1 84 | 85 | 86 | def main(directories: list[Path], changed: str | None = None): 87 | app = 
DiffApp(directories, changed) 88 | app.run() 89 | -------------------------------------------------------------------------------- /src/evaltools/review/diff_app.tcss: -------------------------------------------------------------------------------- 1 | Screen { 2 | padding: 1; 3 | } 4 | 5 | #sources { 6 | height: 2 7 | } 8 | 9 | .source { 10 | width: 1fr; 11 | border: solid green; 12 | } 13 | 14 | .answer { 15 | height: 100%; 16 | width: 1fr; 17 | border: solid green; 18 | } 19 | 20 | #metrics { 21 | height: 3 22 | } 23 | 24 | #buttons { 25 | height: 3; 26 | align: center middle; 27 | } 28 | 29 | #quit { 30 | margin-left: 5; 31 | } 32 | --------------------------------------------------------------------------------
/src/evaltools/review/diff_markdown.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any 3 | 4 | from .utils import diff_directories 5 | 6 | 7 | def _round_metric(value: Any) -> Any: 8 | if isinstance(value, float): 9 | return round(value, 1) 10 | return value 11 | 12 | 13 | def main(directories: list[Path], changed: str | None = None): 14 | data_dicts = diff_directories(directories, changed) 15 | 16 | markdown_str = "" 17 | for question in data_dicts[0].keys(): 18 | markdown_str += f"**{question}**\n\n" 19 | # now make an HTML table with the answers 20 | markdown_str += "<table>\n" 21 | markdown_str += ( 22 | "<tr><td></td>" 23 | + "".join([f"<td>{directory.name}</td>" for directory in directories]) 24 | + "<td>ground_truth</td>\n" 25 | ) 26 | markdown_str += ( 27 | "<tr><td>answer</td>" 28 | + "".join([f"<td>{data_dict[question]['answer']}</td>" for data_dict in data_dicts]) 29 | + f"<td>{data_dicts[0][question]['truth']}</td>\n" 30 | ) 31 | 32 | # now make rows for each metric 33 | metrics = {} 34 | question_results = data_dicts[0][question] 35 | for column, value in question_results.items(): 36 | if isinstance(value, int | float): 37 | metrics[column] = [] 38 | for metric_name in metrics.keys(): 39 | first_value = _round_metric(data_dicts[0][question].get(metric_name)) 40 | for ind, data_dict in enumerate(data_dicts): 41 | value = _round_metric(data_dict[question].get(metric_name)) 42 | # Insert arrow emoji based on the difference between metric value and the first data_dict 43 | value_emoji = "" 44 | if value is not None and ind > 0 and value != first_value: 45 | value_emoji = "⬆️" if value > data_dicts[0][question][metric_name] else "⬇️" 46 | metrics[metric_name].append(f"{value} {value_emoji}") 47 | # make a row for each metric 48 | for metric_name, metric_values in metrics.items(): 49 | markdown_str += ( 50 | f"<tr><td>{metric_name}</td>" 51 | + "".join([f"<td>{value}</td>" for value in metric_values]) 52 | + "<td>N/A</td>\n" 53 | ) 54 | markdown_str += "</table>
\n\n" 55 | return markdown_str 56 | -------------------------------------------------------------------------------- /src/evaltools/review/parameters_screen.tcss: -------------------------------------------------------------------------------- 1 | 2 | 3 | ParametersScreen { 4 | align: center middle; 5 | } 6 | 7 | #dialog { 8 | padding: 0 1; 9 | width: 90%; 10 | height: 90%; 11 | border: thick $background 80%; 12 | background: $surface; 13 | } 14 | 15 | #header { 16 | height: 2; 17 | } 18 | 19 | #body { 20 | height: 1fr; 21 | } 22 | 23 | #button { 24 | margin-top: 1; 25 | height: 3; 26 | } 27 | -------------------------------------------------------------------------------- /src/evaltools/review/requirements.txt: -------------------------------------------------------------------------------- 1 | textual 2 | typer 3 | -------------------------------------------------------------------------------- /src/evaltools/review/summary_app.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | from textual.app import App, ComposeResult 5 | from textual.containers import Horizontal, Vertical 6 | from textual.screen import ModalScreen 7 | from textual.widgets import Button, DataTable, Label, TextArea 8 | 9 | from .utils import summarize_results 10 | 11 | 12 | class ParametersScreen(ModalScreen): 13 | CSS_PATH = "parameters_screen.tcss" 14 | 15 | def __init__(self, folder, parameters) -> None: 16 | super().__init__() 17 | self.folder = folder 18 | self.parameters = parameters 19 | 20 | def compose(self) -> ComposeResult: 21 | yield Vertical( 22 | Label(f"Parameters for: {self.folder}", id="header"), 23 | TextArea(json.dumps(self.parameters, indent=4), language="json", id="body"), 24 | Button("Close", variant="primary", id="button"), 25 | id="dialog", 26 | ) 27 | 28 | def on_button_pressed(self, event: Button.Pressed) -> None: 29 | self.app.pop_screen() 30 | 31 | 32 | class TableApp(App): 33 | CSS_PATH = "summary_app.tcss" 34 | 35 | def __init__(self, results_dir: Path) -> None: 36 | super().__init__() 37 | self.rows, self.row_parameters = summarize_results(results_dir) 38 | 39 | def compose(self) -> ComposeResult: 40 | with Vertical(): 41 | yield DataTable(id="table") 42 | with Horizontal(id="buttons"): 43 | yield Button.error("Quit", id="quit", classes="button") 44 | 45 | def on_button_pressed(self, event: Button.Pressed) -> None: 46 | if event.button.id == "quit": 47 | self.exit() 48 | 49 | def on_mount(self) -> None: 50 | table = self.query_one(DataTable) 51 | table.add_columns(*self.rows[0]) 52 | table.add_rows(self.rows[1:]) 53 | 54 | def on_data_table_cell_selected(self, event: DataTable.CellSelected) -> None: 55 | if event.coordinate.column == 0: 56 | folder = event.value 57 | if folder in self.row_parameters: 58 | parameters = self.row_parameters[folder] 59 | self.push_screen(ParametersScreen(folder, parameters)) 60 | 61 | 62 | def main(directory: Path): 63 | app = TableApp(directory) 64 | app.run() 65 | -------------------------------------------------------------------------------- /src/evaltools/review/summary_app.tcss: -------------------------------------------------------------------------------- 1 | #table { 2 | height: 100%; 3 | } 4 | 5 | #buttons { 6 | height: 3; 7 | align: center middle; 8 | } 9 | 10 | #quit { 11 | margin-left: 5; 12 | } 13 | -------------------------------------------------------------------------------- /src/evaltools/review/summary_markdown.py: 
-------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from .utils import summarize_results 4 | 5 | 6 | def main(results_dir: Path, highlight_run: str | None = None) -> str: 7 | rows, row_parameters = summarize_results(results_dir) 8 | # transpose the rows 9 | rows = list(map(list, zip(*rows))) 10 | 11 | # make a markdown table 12 | headers = ["metric", "stat"] + list(row_parameters.keys()) 13 | # find the index of the highlight run 14 | if highlight_run: 15 | highlight_run = highlight_run.strip() 16 | highlight_run_index = headers.index(highlight_run) 17 | else: 18 | highlight_run_index = None 19 | 20 | # put a star and bold the highlight run 21 | if highlight_run: 22 | headers = [f"☞{header}☜" if ind == highlight_run_index else header for ind, header in enumerate(headers)] 23 | 24 | table = "| " + " | ".join(headers) + " |\n" 25 | table += "|" + " |".join(["---"] * len(rows[0])) + " |\n" 26 | for ind, row in enumerate(rows[1:]): 27 | if row[0] == "": 28 | row[0] = "↑" 29 | # stringifying the row 30 | row = [str(cell) for cell in row] 31 | # highlight the cell that corresponds to the highlight run 32 | if highlight_run: 33 | row = [f"**{cell}**" if ind == highlight_run_index else cell for ind, cell in enumerate(row)] 34 | table += "| " + " | ".join(row) + " |\n" 35 | return table 36 | -------------------------------------------------------------------------------- /src/evaltools/review/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import os 4 | from pathlib import Path 5 | 6 | 7 | def summarize_results(results_dir): 8 | run_summaries = {} 9 | # first find the shared metrics across the runs 10 | metric_counts = {} 11 | 12 | folders = [f for f in os.listdir(results_dir) if os.path.isdir(os.path.join(results_dir, f))] 13 | folders.sort() 14 | for folder in folders: 15 | with open(Path(results_dir) / folder / "summary.json", encoding="utf-8") as f: 16 | summary = json.load(f) 17 | run_summaries[folder] = summary 18 | # first find the common parameters across the runs 19 | for metric_name in summary: 20 | metric_counts[metric_name] = metric_counts.get(metric_name, 0) + 1 21 | 22 | # Only show metrics that have shown up at least twice across runs 23 | shared_metric_names = [ 24 | metric_name for metric_name, count in metric_counts.items() if count > 1 or len(run_summaries) == 1 25 | ] 26 | shared_metric_stats = {metric_name: set() for metric_name in shared_metric_names} 27 | 28 | # Now figure out what stat to show about each metric 29 | for folder, summary in run_summaries.items(): 30 | for metric_name in shared_metric_names: 31 | if metric_name in summary: 32 | metric = summary[metric_name] 33 | if "mean_rating" in metric: 34 | shared_metric_stats[metric_name].add("mean_rating") 35 | elif "mean" in metric: 36 | shared_metric_stats[metric_name].add("mean") 37 | if "pass_rate" in metric: 38 | shared_metric_stats[metric_name].add("pass_rate") 39 | elif "rate" in metric: 40 | shared_metric_stats[metric_name].add("rate") 41 | 42 | first_row = ["folder"] 43 | # Build second row 44 | second_row = [""] 45 | for metric_name in shared_metric_names: 46 | # The first row of columns should have metric name followed by blank column for each stat above 1 stat 47 | first_row.append(metric_name) 48 | if len(shared_metric_stats[metric_name]) > 1: 49 | first_row.extend([""] * (len(shared_metric_stats[metric_name]) - 1)) 50 | # The second row of columns should just have 
the stat names 51 | for stat in shared_metric_stats[metric_name]: 52 | second_row.append(stat) 53 | 54 | rows = [first_row, second_row] 55 | row_parameters = {} 56 | # Build rest of the rows 57 | for folder, summary in run_summaries.items(): 58 | run_row = [folder] 59 | for metric_name in shared_metric_names: 60 | for stat in shared_metric_stats[metric_name]: 61 | if stat in summary.get(metric_name, {}): 62 | run_row.append(summary[metric_name][stat]) 63 | else: 64 | run_row.append("?") 65 | with open(Path(results_dir) / folder / "eval_results.jsonl", encoding="utf-8") as f: 66 | run_row.append(sum(1 for _ in f)) 67 | rows.append(run_row) 68 | with open(Path(results_dir) / folder / "evaluate_parameters.json", encoding="utf-8") as f: 69 | row_parameters[folder] = json.load(f) 70 | 71 | return rows, row_parameters 72 | 73 | 74 | def diff_directories(directories: list[Path], changed: str | None = None): 75 | data_dicts = [] 76 | for directory in directories: 77 | with open(directory / "eval_results.jsonl", encoding="utf-8") as f: 78 | data_json = [json.loads(question_json) for question_json in f.readlines()] 79 | data_dicts.append({question["question"]: question for question in data_json}) 80 | if changed: 81 | # filter out questions that have the same value for the given column 82 | for question in list(data_dicts[0].keys()): 83 | # if question isn't in the second directory, skip 84 | if question not in data_dicts[1]: 85 | data_dicts[0].pop(question) 86 | continue 87 | # if either metric is None, skip 88 | if data_dicts[0][question].get(changed) is None or data_dicts[1][question].get(changed) is None: 89 | data_dicts[0].pop(question) 90 | continue 91 | if data_dicts[0][question].get(changed) == data_dicts[1][question].get(changed): 92 | if math.isclose(data_dicts[0][question].get(changed), data_dicts[1][question].get(changed)): 93 | data_dicts[0].pop(question) 94 | data_dicts[1].pop(question) 95 | return data_dicts 96 | -------------------------------------------------------------------------------- /src/evaltools/service_setup.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Union 4 | 5 | import openai 6 | from azure.ai.evaluation import AzureOpenAIModelConfiguration, OpenAIModelConfiguration 7 | from azure.core.credentials import AzureKeyCredential 8 | from azure.identity import AzureDeveloperCliCredential, get_bearer_token_provider 9 | from azure.search.documents import SearchClient 10 | 11 | logger = logging.getLogger("evaltools") 12 | 13 | 14 | def get_azd_credential(tenant_id: Union[str, None]) -> AzureDeveloperCliCredential: 15 | if tenant_id: 16 | logger.info("Using Azure Developer CLI Credential for tenant %s", tenant_id) 17 | return AzureDeveloperCliCredential(tenant_id=tenant_id, process_timeout=60) 18 | logger.info("Using Azure Developer CLI Credential for home tenant") 19 | return AzureDeveloperCliCredential(process_timeout=60) 20 | 21 | 22 | def get_openai_config() -> dict: 23 | if os.environ.get("OPENAI_HOST") == "azure": 24 | azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT") 25 | azure_deployment = os.environ.get("AZURE_OPENAI_EVAL_DEPLOYMENT") 26 | if os.environ.get("AZURE_OPENAI_KEY"): 27 | logger.info("Using Azure OpenAI Service with API Key from AZURE_OPENAI_KEY") 28 | openai_config: AzureOpenAIModelConfiguration = { 29 | "azure_endpoint": azure_endpoint, 30 | "api_key": os.environ["AZURE_OPENAI_KEY"], 31 | "azure_deployment": azure_deployment, 32 | } 33 | else: 34 | 
logger.info("Using Azure OpenAI Service with Azure Developer CLI Credential") 35 | openai_config: AzureOpenAIModelConfiguration = { 36 | "azure_endpoint": azure_endpoint, 37 | "azure_deployment": azure_deployment, 38 | } 39 | # azure-ai-evaluate will call DefaultAzureCredential behind the scenes, 40 | # so we must be logged in to Azure CLI with the correct tenant 41 | else: 42 | logger.info("Using OpenAI Service with API Key from OPENAICOM_KEY") 43 | openai_config: OpenAIModelConfiguration = { 44 | "api_key": os.environ["OPENAICOM_KEY"], 45 | "organization": os.environ["OPENAICOM_ORGANIZATION"], 46 | "model": os.environ["OPENAI_GPT_MODEL"], 47 | } 48 | return openai_config 49 | 50 | 51 | def get_openai_config_dict() -> dict: 52 | """Return a dictionary with OpenAI configuration based on environment variables. 53 | This is only used by azure-ai-generative SDK right now, and should be deprecated once 54 | the generate functionality is available in azure-ai-evaluation SDK. 55 | """ 56 | if os.environ.get("OPENAI_HOST") == "azure": 57 | if os.environ.get("AZURE_OPENAI_KEY"): 58 | logger.info("Using Azure OpenAI Service with API Key from AZURE_OPENAI_KEY") 59 | api_key = os.environ["AZURE_OPENAI_KEY"] 60 | else: 61 | logger.info("Using Azure OpenAI Service with Azure Developer CLI Credential") 62 | azure_credential = get_azd_credential(os.environ.get("AZURE_OPENAI_TENANT_ID")) 63 | api_key = azure_credential.get_token("https://cognitiveservices.azure.com/.default").token 64 | openai_config = { 65 | "api_type": "azure", 66 | "api_base": os.environ["AZURE_OPENAI_ENDPOINT"], 67 | "api_key": api_key, 68 | "api_version": "2024-02-15-preview", 69 | "deployment": os.environ["AZURE_OPENAI_EVAL_DEPLOYMENT"], 70 | "model": os.environ["OPENAI_GPT_MODEL"], 71 | } 72 | else: 73 | logger.info("Using OpenAI Service with API Key from OPENAICOM_KEY") 74 | openai_config = { 75 | "api_type": "openai", 76 | "api_key": os.environ["OPENAICOM_KEY"], 77 | "organization": os.environ["OPENAICOM_ORGANIZATION"], 78 | "model": os.environ["OPENAI_GPT_MODEL"], 79 | "deployment": "none-needed-for-openaicom", 80 | } 81 | return openai_config 82 | 83 | 84 | def get_search_client(): 85 | if api_key := os.environ.get("AZURE_SEARCH_KEY"): 86 | logger.info("Using Azure Search Service with API Key from AZURE_SEARCH_KEY") 87 | azure_credential = AzureKeyCredential(api_key) 88 | else: 89 | logger.info("Using Azure Search Service with Azure Developer CLI Credential") 90 | azure_credential = get_azd_credential(os.environ.get("AZURE_SEARCH_TENANT_ID")) 91 | 92 | return SearchClient( 93 | endpoint=os.environ["AZURE_SEARCH_ENDPOINT"], 94 | index_name=os.environ["AZURE_SEARCH_INDEX"], 95 | credential=azure_credential, 96 | ) 97 | 98 | 99 | def get_openai_client( 100 | oai_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], azure_credential=None 101 | ): 102 | if "azure_deployment" in oai_config: 103 | azure_token_provider = None 104 | 105 | if azure_credential is None and not os.environ.get("AZURE_OPENAI_KEY"): 106 | logger.info("Using Azure OpenAI Service with Azure Developer CLI Credential") 107 | azure_credential = get_azd_credential(os.environ.get("AZURE_OPENAI_TENANT_ID")) 108 | if azure_credential is not None: 109 | azure_token_provider = get_bearer_token_provider( 110 | azure_credential, "https://cognitiveservices.azure.com/.default" 111 | ) 112 | return openai.AzureOpenAI( 113 | api_version="2024-02-15-preview", 114 | azure_endpoint=oai_config["azure_endpoint"], 115 | api_key=oai_config["api_key"] if 
oai_config.get("api_key") else None, 116 | azure_ad_token_provider=azure_token_provider, 117 | azure_deployment=oai_config["azure_deployment"], 118 | ) 119 | elif "organization" in oai_config: 120 | oai_config: OpenAIModelConfiguration = oai_config 121 | return openai.OpenAI(api_key=oai_config["api_key"], organization=oai_config["organization"]) 122 | -------------------------------------------------------------------------------- /tests/test_evaluate.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | import requests 4 | 5 | from evaltools.eval.evaluate import send_question_to_target 6 | 7 | 8 | def test_send_question_to_target_valid(): 9 | # Test case 1: Valid response 10 | response = { 11 | "message": {"content": "This is the answer"}, 12 | "context": {"data_points": {"text": ["Context 1", "Context 2"]}}, 13 | } 14 | requests.post = lambda url, headers, json: MockResponse(response) 15 | result = send_question_to_target("Question 1", "http://example.com") 16 | assert result["answer"] == "This is the answer" 17 | assert result["context"] == "Context 1\n\nContext 2" 18 | assert result["latency"] == 1 19 | 20 | 21 | def test_send_question_to_target_missing_error_store(): 22 | response = {} 23 | requests.post = lambda url, headers, json: MockResponse(response) 24 | result = send_question_to_target("Question", "http://example.com") 25 | assert result["answer"] == ( 26 | "Response does not adhere to the expected schema. " 27 | "The answer should be accessible via the JMESPath expression 'message.content' " 28 | "and the context should be accessible via the JMESPath expression 'context.data_points.text'. " 29 | "Either adjust the app response or adjust send_question_to_target() " 30 | "in evaluate.py to match the actual schema.\n" 31 | "Response: {}" 32 | ) 33 | assert result["context"] == ( 34 | "Response does not adhere to the expected schema. " 35 | "The answer should be accessible via the JMESPath expression 'message.content' " 36 | "and the context should be accessible via the JMESPath expression 'context.data_points.text'. " 37 | "Either adjust the app response or adjust send_question_to_target() " 38 | "in evaluate.py to match the actual schema.\n" 39 | "Response: {}" 40 | ) 41 | 42 | 43 | def test_send_question_to_target_missing_all(): 44 | response = {} 45 | requests.post = lambda url, headers, json: MockResponse(response) 46 | try: 47 | send_question_to_target("Question", "Answer", "http://example.com", raise_error=True) 48 | except Exception as e: 49 | assert str(e) == ( 50 | "Response does not adhere to the expected schema. " 51 | "The answer should be accessible via the JMESPath expression 'message.content' " 52 | "and the context should be accessible via the JMESPath expression 'context.data_points.text'. " 53 | "Either adjust the app response or adjust send_question_to_target() " 54 | "in evaluate.py to match the actual schema.\n" 55 | "Response: {}" 56 | ) 57 | 58 | 59 | def test_send_question_to_target_missing_content(): 60 | response = {"message": {}, "context": {"data_points": {"text": ["Context 1", "Context 2"]}}} 61 | requests.post = lambda url, headers, json: MockResponse(response) 62 | try: 63 | send_question_to_target("Question", "Answer", "http://example.com", raise_error=True) 64 | except Exception as e: 65 | assert str(e) == ( 66 | "Response does not adhere to the expected schema. 
" 67 | "The answer should be accessible via the JMESPath expression 'message.content' " 68 | "and the context should be accessible via the JMESPath expression 'context.data_points.text'. " 69 | "Either adjust the app response or adjust send_question_to_target() " 70 | "in evaluate.py to match the actual schema.\n" 71 | "Response: {'message': {}, 'context': {'data_points': {'text': ['Context 1', 'Context 2']}}}" 72 | ) 73 | 74 | 75 | def test_send_question_to_target_missing_context(): 76 | # Test case 5: Missing 'context' key in response 77 | response = {"message": {"content": "This is the answer"}} 78 | requests.post = lambda url, headers, json: MockResponse(response) 79 | try: 80 | send_question_to_target("Question", "Answer", "http://example.com", raise_error=True) 81 | except Exception as e: 82 | assert str(e) == ( 83 | "Response does not adhere to the expected schema. " 84 | "The answer should be accessible via the JMESPath expression 'message.content' " 85 | "and the context should be accessible via the JMESPath expression 'context.data_points.text'. " 86 | "Either adjust the app response or adjust send_question_to_target() " 87 | "in evaluate.py to match the actual schema.\n" 88 | "Response: {'message': {'content': 'This is the answer'}}" 89 | ) 90 | 91 | 92 | class MockResponse: 93 | def __init__(self, json_data): 94 | self.json_data = json_data 95 | self.elapsed = timedelta(seconds=1) 96 | 97 | def json(self): 98 | return self.json_data 99 | -------------------------------------------------------------------------------- /tests/test_evaluate_metrics.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from evaltools.eval.evaluate_metrics import builtin_metrics, code_metrics, prompt_metrics 4 | 5 | 6 | def test_answer_length(): 7 | metric = code_metrics.AnswerLengthMetric() 8 | metric_function = metric.evaluator_fn() 9 | assert callable(metric_function) 10 | assert metric_function(response="Hello, world!") == {"answer_length": 13} 11 | df = pd.DataFrame([{"answer_length": 20}, {"answer_length": 10}, {"answer_length": 5}]) 12 | assert metric.get_aggregate_stats(df) == {"mean": 11.67, "max": 20, "min": 5} 13 | 14 | 15 | def test_answer_length_new(): 16 | metric = code_metrics.AnswerLengthMetric() 17 | metric_function = metric.evaluator_fn() 18 | assert metric_function(response=None) == {"answer_length": -1} 19 | df = pd.DataFrame([{"answer_length": 20}, {"answer_length": 10}, {"answer_length": 5}, {"answer_length": -1}]) 20 | assert metric.get_aggregate_stats(df) == {"mean": 11.67, "max": 20, "min": 5} 21 | 22 | 23 | def test_has_citation(): 24 | metric = code_metrics.HasCitationMetric() 25 | metric_function = metric.evaluator_fn() 26 | assert callable(metric_function) 27 | assert metric_function(response="Hello, world!") == {"has_citation": False} 28 | assert metric_function(response="Hello, [world.pdf]!") == {"has_citation": True} 29 | 30 | df = pd.DataFrame([{"has_citation": True}, {"has_citation": False}, {"has_citation": True}]) 31 | assert metric.get_aggregate_stats(df) == {"total": 2, "rate": 0.67} 32 | 33 | 34 | def test_has_citation_none(): 35 | metric = code_metrics.HasCitationMetric() 36 | metric_function = metric.evaluator_fn() 37 | assert metric_function(response=None) == {"has_citation": -1} 38 | df = pd.DataFrame([{"has_citation": True}, {"has_citation": False}, {"has_citation": -1}]) 39 | assert metric.get_aggregate_stats(df) == {"total": 1, "rate": 0.5} 40 | 41 | 42 | def test_citation_match(): 43 | 
metric = code_metrics.CitationMatchMetric() 44 | metric_function = metric.evaluator_fn() 45 | assert callable(metric_function) 46 | assert metric_function(ground_truth="answer in [file.pdf]", response="answer in [file2.pdf]") == { 47 | "citation_match": False 48 | } 49 | assert metric_function(ground_truth="answer in [file2.pdf]", response="answer in [file2.pdf]") == { 50 | "citation_match": True 51 | } 52 | assert metric_function(ground_truth="answer in [file2.pdf]", response="answer in [file1.pdf][file2.pdf]") == { 53 | "citation_match": True 54 | } 55 | df = pd.DataFrame([{"citation_match": True}, {"citation_match": False}, {"citation_match": True}]) 56 | assert metric.get_aggregate_stats(df) == {"total": 2, "rate": 0.67} 57 | 58 | 59 | def test_citation_match_filenames_only(): 60 | truth = 'Use settings like "python.linting.enabled": true, "[python]" [best-practices-for-prompting-github.html]' 61 | response = 'Use extension with setting "python.linting.enabled" [best-practices-for-prompting-github.html]' 62 | metric = code_metrics.CitationMatchMetric() 63 | metric_function = metric.evaluator_fn() 64 | assert metric_function(ground_truth=truth, response=response) == {"citation_match": True} 65 | 66 | 67 | def test_citation_match_none(): 68 | metric = code_metrics.CitationMatchMetric() 69 | metric_function = metric.evaluator_fn() 70 | assert metric_function(ground_truth="Answer", response=None) == {"citation_match": -1} 71 | df = pd.DataFrame([{"citation_match": True}, {"citation_match": False}, {"citation_match": -1}]) 72 | assert metric.get_aggregate_stats(df) == {"total": 1, "rate": 0.5} 73 | 74 | 75 | def test_latency(): 76 | metric = code_metrics.LatencyMetric() 77 | metric_function = metric.evaluator_fn() 78 | assert callable(metric_function) 79 | assert metric_function(data={"latency": 20}) == {} 80 | df = pd.DataFrame([{"latency": 20}, {"latency": 10}, {"latency": 5}]) 81 | assert metric.get_aggregate_stats(df) == {"mean": 11.67, "max": 20, "min": 5} 82 | 83 | 84 | def test_custom_relevance(): 85 | metric = prompt_metrics.RelevanceMetric() 86 | 87 | assert callable(metric.evaluator_fn(openai_config=None)) 88 | df = pd.DataFrame([{"myrelevance": 5}, {"myrelevance": 4}, {"myrelevance": 3}]) 89 | assert metric.get_aggregate_stats(df) == {"mean_rating": 4.0, "pass_count": 2, "pass_rate": 0.67} 90 | 91 | 92 | def test_custom_coherence(): 93 | metric = prompt_metrics.CoherenceMetric() 94 | 95 | assert callable(metric.evaluator_fn(openai_config=None)) 96 | df = pd.DataFrame([{"mycoherence": 5}, {"mycoherence": 4}, {"mycoherence": 3}]) 97 | assert metric.get_aggregate_stats(df) == {"mean_rating": 4.0, "pass_count": 2, "pass_rate": 0.67} 98 | 99 | 100 | def test_custom_groundedness(): 101 | metric = prompt_metrics.GroundednessMetric() 102 | 103 | assert callable(metric.evaluator_fn(openai_config=None)) 104 | df = pd.DataFrame([{"mygroundedness": 5}, {"mygroundedness": 4}, {"mygroundedness": 3}]) 105 | assert metric.get_aggregate_stats(df) == {"mean_rating": 4.0, "pass_count": 2, "pass_rate": 0.67} 106 | 107 | 108 | def test_custom_relevance_missing_values(): 109 | metric = prompt_metrics.RelevanceMetric() 110 | 111 | assert callable(metric.evaluator_fn(openai_config=None)) 112 | df = pd.DataFrame([{"myrelevance": 2}, {"myrelevance": 4}, {"myrelevance": "Failed"}]) 113 | assert metric.get_aggregate_stats(df) == {"mean_rating": 3.0, "pass_count": 1, "pass_rate": 0.33} 114 | 115 | 116 | def test_builtin_coherence(): 117 | metric = builtin_metrics.BuiltinCoherenceMetric() 118 | assert 
metric.METRIC_NAME == "gpt_coherence" 119 | df = pd.DataFrame([{"gpt_coherence": 5}, {"gpt_coherence": 4}, {"gpt_coherence": 3}]) 120 | assert metric.get_aggregate_stats(df) == {"mean_rating": 4.0, "pass_count": 2, "pass_rate": 0.67} 121 | 122 | 123 | def test_builtin_relevance(): 124 | metric = builtin_metrics.BuiltinRelevanceMetric() 125 | assert metric.METRIC_NAME == "gpt_relevance" 126 | df = pd.DataFrame([{"gpt_relevance": 5}, {"gpt_relevance": 4}, {"gpt_relevance": 3}]) 127 | assert metric.get_aggregate_stats(df) == {"mean_rating": 4.0, "pass_count": 2, "pass_rate": 0.67} 128 | 129 | 130 | def test_builtin_groundedness(): 131 | metric = builtin_metrics.BuiltinGroundednessMetric() 132 | assert metric.METRIC_NAME == "gpt_groundedness" 133 | df = pd.DataFrame([{"gpt_groundedness": 5}, {"gpt_groundedness": 4}, {"gpt_groundedness": 3}]) 134 | assert metric.get_aggregate_stats(df) == {"mean_rating": 4.0, "pass_count": 2, "pass_rate": 0.67} 135 | 136 | 137 | def test_builtin_fluency(): 138 | metric = builtin_metrics.BuiltinFluencyMetric() 139 | assert metric.METRIC_NAME == "gpt_fluency" 140 | df = pd.DataFrame([{"gpt_fluency": 5}, {"gpt_fluency": 4}, {"gpt_fluency": 3}]) 141 | assert metric.get_aggregate_stats(df) == {"mean_rating": 4.0, "pass_count": 2, "pass_rate": 0.67} 142 | 143 | 144 | def test_builtin_similarity(): 145 | metric = builtin_metrics.BuiltinSimilarityMetric() 146 | assert metric.METRIC_NAME == "gpt_similarity" 147 | df = pd.DataFrame([{"gpt_similarity": 5}, {"gpt_similarity": 4}, {"gpt_similarity": 3}]) 148 | assert metric.get_aggregate_stats(df) == {"mean_rating": 4.0, "pass_count": 2, "pass_rate": 0.67} 149 | 150 | 151 | def test_builtin_f1_score(): 152 | metric = builtin_metrics.BuiltinF1ScoreMetric() 153 | assert metric.METRIC_NAME == "f1_score" 154 | df = pd.DataFrame([{"f1_score": 5}, {"f1_score": 4}, {"f1_score": 3}]) 155 | assert metric.get_aggregate_stats(df) == {"mean": 4.0, "max": 5, "min": 3} 156 | 157 | 158 | def test_builtin_coherence_missing_values(): 159 | metric = builtin_metrics.BuiltinCoherenceMetric() 160 | assert metric.METRIC_NAME == "gpt_coherence" 161 | df = pd.DataFrame([{"gpt_coherence": "Failed"}, {"gpt_coherence": 4}, {"gpt_coherence": 3}]) 162 | assert metric.get_aggregate_stats(df) == {"mean_rating": 3.5, "pass_count": 1, "pass_rate": 0.33} 163 | --------------------------------------------------------------------------------
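
Illustration (not a file from the repository): the metrics in `code_metrics.py` above all follow the same `BaseMetric` shape, so a new code-based metric is mostly boilerplate. The sketch below assumes that pattern and the import path suggested by the package layout; `WordCountMetric`, its word-splitting logic, and the `__main__` demo are hypothetical, and wiring a new metric into `evaluate.py` is not shown here.

```python
# Minimal sketch of a hypothetical code-based metric (not part of evaltools).
# It mirrors the shape used in code_metrics.py: evaluator_fn() returns a callable
# that emits {METRIC_NAME: value} per question, and get_aggregate_stats()
# summarizes the per-question results from a pandas DataFrame.
import pandas as pd

from evaltools.eval.evaluate_metrics.base_metric import BaseMetric


class WordCountMetric(BaseMetric):
    METRIC_NAME = "word_count"  # hypothetical metric name

    @classmethod
    def evaluator_fn(cls, **kwargs):
        def word_count(*, response, **kwargs):
            if response is None:
                # Follow the convention above: -1 marks a missing response and is
                # filtered out before aggregation.
                return {cls.METRIC_NAME: -1}
            return {cls.METRIC_NAME: len(response.split())}

        return word_count

    @classmethod
    def get_aggregate_stats(cls, df):
        df = df[df[cls.METRIC_NAME] != -1]
        return {
            "mean": round(df[cls.METRIC_NAME].mean(), 2),
            "max": int(df[cls.METRIC_NAME].max()),
            "min": int(df[cls.METRIC_NAME].min()),
        }


if __name__ == "__main__":
    fn = WordCountMetric.evaluator_fn()
    print(fn(response="The rover searches for signs of ancient life."))  # {'word_count': 8}
    df = pd.DataFrame([{"word_count": 8}, {"word_count": 12}, {"word_count": -1}])
    print(WordCountMetric.get_aggregate_stats(df))  # {'mean': 10.0, 'max': 12, 'min': 8}
```

A test for such a metric would look just like `test_answer_length` in `tests/test_evaluate_metrics.py`: call `evaluator_fn()`, assert on a sample response, then assert on `get_aggregate_stats` over a small DataFrame.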
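
Likewise for the LLM-graded metrics: `prompt_metrics.py` shows that each one is a two-line subclass of `CustomRatingMetric` whose `METRIC_NAME` matches a `.prompty` template in the `prompts/` directory. The sketch below assumes that pattern; `ConcisenessMetric` and `myconciseness.prompty` are hypothetical names, and the prompty template itself (front matter plus a 1-5 rating prompt, like `mycoherence.prompty` above) would still have to be written.

```python
# Sketch of a hypothetical prompt-based metric (not part of evaltools).
from evaltools.eval.evaluate_metrics.prompt_metrics import CustomRatingMetric


class ConcisenessMetric(CustomRatingMetric):
    # CustomRatingMetric.evaluator_fn() loads prompts/<METRIC_NAME>.prompty and
    # parses the numeric rating out of the model's reply, so a myconciseness.prompty
    # template must exist alongside the other prompty files.
    METRIC_NAME = "myconciseness"
```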