├── .devcontainer ├── Dockerfile └── devcontainer.json ├── .env.sample ├── .github ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── ISSUE_TEMPLATE.md ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yaml └── workflows │ ├── azure-dev.yaml │ ├── bicep-audit.yml │ └── python.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .vscode ├── launch.json └── settings.json ├── LICENSE.md ├── README.md ├── azure.yaml ├── docs ├── screenshot_compare.png └── screenshot_summary.png ├── dontknows.config.json ├── example_config.json ├── example_input ├── prompt_ignoresources.txt ├── prompt_nomarkdownmention.txt ├── prompt_piglatin.txt ├── prompt_refined.txt ├── prompt_refined_trimmed.txt ├── prompt_ungrounded.txt ├── prompt_weak.txt ├── qa.jsonl └── qa_dontknows.jsonl ├── example_results ├── baseline │ ├── config.json │ ├── eval_results.jsonl │ ├── evaluate_parameters.json │ └── summary.json ├── baseline2 │ ├── config.json │ ├── eval_results.jsonl │ ├── evaluate_parameters.json │ └── summary.json ├── prompt_nomarkdownmention │ ├── config.json │ ├── eval_results.jsonl │ ├── evaluate_parameters.json │ └── summary.json └── prompt_nomarkdownmention2 │ ├── config.json │ ├── eval_results.jsonl │ ├── evaluate_parameters.json │ └── summary.json ├── infra ├── core │ ├── ai │ │ └── cognitiveservices.bicep │ └── security │ │ └── role.bicep ├── main.bicep └── main.parameters.json ├── pyproject.toml ├── src └── evaltools │ ├── __init__.py │ ├── __main__.py │ ├── cli.py │ ├── eval │ ├── __init__.py │ ├── evaluate.py │ └── evaluate_metrics │ │ ├── __init__.py │ │ ├── base_metric.py │ │ ├── builtin_metrics.py │ │ ├── code_metrics.py │ │ ├── prompt_metrics.py │ │ └── prompts │ │ ├── dontknowness.prompty │ │ ├── mycoherence.prompty │ │ ├── mygroundedness.prompty │ │ └── myrelevance.prompty │ ├── gen │ ├── __init__.py │ └── generate.py │ ├── review │ ├── __init__.py │ ├── answers.html │ ├── diff_app.py │ ├── diff_app.tcss │ ├── diff_markdown.py │ ├── parameters_screen.tcss │ ├── requirements.txt │ ├── summary_app.py │ ├── summary_app.tcss │ ├── summary_markdown.py │ └── utils.py │ └── service_setup.py └── tests ├── test_evaluate.py └── test_evaluate_metrics.py /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM mcr.microsoft.com/devcontainers/python:3.11-bullseye 2 | 3 | # Install pip for Python 3.11 4 | RUN python -m pip install --upgrade pip 5 | 6 | # Necessary for promptflow keyring to work on Linux with dbus backend 7 | RUN sudo apt-get update 8 | RUN sudo apt-get install -y gcc cmake pkg-config libdbus-1-dev libglib2.0-dev 9 | RUN pip install keyrings.alt dbus-python 10 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "AI RAG Chat Evaluator", 3 | "build": { 4 | "dockerfile": "Dockerfile", 5 | "context": ".." 
6 | }, 7 | "features": { 8 | "ghcr.io/azure/azure-dev/azd:latest": {} 9 | }, 10 | "customizations": { 11 | "vscode": { 12 | "extensions": [ 13 | "ms-azuretools.vscode-bicep", 14 | "ms-python.python" 15 | ] 16 | } 17 | }, 18 | "remoteUser": "vscode", 19 | "hostRequirements": { 20 | "memory": "8gb" 21 | }, 22 | "postCreateCommand": "pip install -e .\"[dev]\"" 23 | } 24 | -------------------------------------------------------------------------------- /.env.sample: -------------------------------------------------------------------------------- 1 | OPENAI_HOST="azure" 2 | OPENAI_GPT_MODEL="gpt-4" 3 | # For Azure OpenAI only: 4 | AZURE_OPENAI_EVAL_DEPLOYMENT="" 5 | AZURE_OPENAI_ENDPOINT="https://.openai.azure.com" 6 | AZURE_OPENAI_KEY="" 7 | AZURE_OPENAI_TENANT_ID="" 8 | # For openai.com only: 9 | OPENAICOM_KEY="" 10 | OPENAICOM_ORGANIZATION="" 11 | # For generating QA based on search index: 12 | AZURE_SEARCH_ENDPOINT="https://.search.windows.net" 13 | AZURE_SEARCH_INDEX="" 14 | AZURE_SEARCH_KEY="" 15 | AZURE_SEARCH_TENANT_ID="" 16 | -------------------------------------------------------------------------------- /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to [project-title] 2 | 3 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 4 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 5 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 6 | 7 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 8 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 9 | provided by the bot. You will only need to do this once across all repos using our CLA. 10 | 11 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 12 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 13 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 14 | 15 | - [Code of Conduct](#coc) 16 | - [Issues and Bugs](#issue) 17 | - [Feature Requests](#feature) 18 | - [Submission Guidelines](#submit) 19 | 20 | ## Code of Conduct 21 | Help us keep this project open and inclusive. Please read and follow our [Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 22 | 23 | ## Found an Issue? 24 | If you find a bug in the source code or a mistake in the documentation, you can help us by 25 | [submitting an issue](#submit-issue) to the GitHub Repository. Even better, you can 26 | [submit a Pull Request](#submit-pr) with a fix. 27 | 28 | ## Want a Feature? 
29 | You can *request* a new feature by [submitting an issue](#submit-issue) to the GitHub
30 | Repository. If you would like to *implement* a new feature, please submit an issue with
31 | a proposal for your work first, to be sure that we can use it.
32 |
33 | * **Small Features** can be crafted and directly [submitted as a Pull Request](#submit-pr).
34 |
35 | ## Submission Guidelines
36 |
37 | ### Submitting an Issue
38 | Before you submit an issue, search the archive; your question may already have been answered.
39 |
40 | If your issue appears to be a bug, and hasn't been reported, open a new issue.
41 | Help us to maximize the effort we can spend fixing issues and adding new
42 | features, by not reporting duplicate issues. Providing the following information will increase the
43 | chances of your issue being dealt with quickly:
44 |
45 | * **Overview of the Issue** - if an error is being thrown a non-minified stack trace helps
46 | * **Version** - what version is affected (e.g. 0.1.2)
47 | * **Motivation for or Use Case** - explain what you are trying to do and why the current behavior is a bug for you
48 | * **Browsers and Operating System** - is this a problem with all browsers?
49 | * **Reproduce the Error** - provide a live example or an unambiguous set of steps
50 | * **Related Issues** - has a similar issue been reported before?
51 | * **Suggest a Fix** - if you can't fix the bug yourself, perhaps you can point to what might be
52 | causing the problem (line of code or commit)
53 |
54 | You can file new issues by providing the above information at the corresponding repository's issues link: https://github.com/[organization-name]/[repository-name]/issues/new.
55 |
56 | ### Submitting a Pull Request (PR)
57 | Before you submit your Pull Request (PR), consider the following guidelines:
58 |
59 | * Search the repository for an open or closed PR that relates to your submission. You don't want to duplicate effort.
60 | * Make your changes in a new git fork
61 | * Install the development tools and pre-commit hooks:
62 |
63 | ```shell
64 | python3 -m pip install -e ."[dev]"
65 | pre-commit install
66 | ```
67 |
68 | * Commit your changes using a descriptive commit message
69 | * Push your branch to GitHub
70 | * In GitHub, create a pull request and request a review
71 |
72 | That's it! Thank you for your contribution!
73 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: --------------------------------------------------------------------------------
1 | 4 | > Please provide us with the following information:
5 | > ---------------------------------------------------------------
6 |
7 | ### This issue is for a: (mark with an `x`)
8 | ```
9 | - [ ] bug report -> please search issues before submitting
10 | - [ ] feature request
11 | - [ ] documentation issue or request
12 | - [ ] regression (a behavior that used to work and stopped in a new release)
13 | ```
14 |
15 | ### Minimal steps to reproduce
16 | >
17 |
18 | ### Any log messages given by the failure
19 | >
20 |
21 | ### Expected/desired behavior
22 | >
23 |
24 | ### OS and Version?
25 | > Windows 7, 8 or 10. Linux (which distribution). macOS (Yosemite? El Capitan? Sierra?)
26 |
27 | ### Versions
28 | >
29 |
30 | ### Mention any other details that might be useful
31 |
32 | > ---------------------------------------------------------------
33 | > Thanks! We'll be in touch soon.
34 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Purpose 2 | 3 | * ... 4 | 5 | ## Does this introduce a breaking change? 6 | 7 | ``` 8 | [ ] Yes 9 | [ ] No 10 | ``` 11 | 12 | ## Pull Request Type 13 | What kind of change does this Pull Request introduce? 14 | 15 | 16 | ``` 17 | [ ] Bugfix 18 | [ ] Feature 19 | [ ] Code style update (formatting, local variables) 20 | [ ] Refactoring (no functional changes, no api changes) 21 | [ ] Documentation content changes 22 | [ ] Other... Please describe: 23 | ``` 24 | 25 | ## How to Test 26 | * Get the code 27 | 28 | ``` 29 | git clone [repo-address] 30 | cd [repo-name] 31 | git checkout [branch-name] 32 | npm install 33 | ``` 34 | 35 | * Test the code 36 | 37 | ``` 38 | ``` 39 | 40 | ## What to Check 41 | Verify that the following are valid 42 | * ... 43 | 44 | ## Other Information 45 | -------------------------------------------------------------------------------- /.github/dependabot.yaml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | 9 | - package-ecosystem: "github-actions" 10 | directory: "/" 11 | schedule: 12 | interval: "weekly" 13 | 14 | - package-ecosystem: "pip" 15 | directory: "/" 16 | schedule: 17 | interval: "weekly" 18 | -------------------------------------------------------------------------------- /.github/workflows/azure-dev.yaml: -------------------------------------------------------------------------------- 1 | name: Test azd deployment 2 | on: 3 | workflow_dispatch: 4 | push: 5 | # Run when commits are pushed to mainline branch (main or master) 6 | # Set this to the mainline branch you are using 7 | branches: 8 | - main 9 | 10 | # GitHub Actions workflow to deploy to Azure using azd 11 | # To configure required secrets for connecting to Azure, simply run `azd pipeline config` 12 | 13 | # Set up permissions for deploying with secretless Azure federated credentials 14 | # https://learn.microsoft.com/en-us/azure/developer/github/connect-from-azure?tabs=azure-portal%2Clinux#set-up-azure-login-with-openid-connect-authentication 15 | permissions: 16 | id-token: write 17 | contents: read 18 | 19 | jobs: 20 | build: 21 | if: github.repository == 'Azure-samples/ai-rag-chat-evaluator' 22 | runs-on: ubuntu-latest 23 | env: 24 | # azd required 25 | AZURE_CLIENT_ID: ${{ vars.AZURE_CLIENT_ID }} 26 | AZURE_TENANT_ID: ${{ vars.AZURE_TENANT_ID }} 27 | AZURE_SUBSCRIPTION_ID: ${{ vars.AZURE_SUBSCRIPTION_ID }} 28 | AZURE_CREDENTIALS: ${{ secrets.AZURE_CREDENTIALS }} 29 | # project specific 30 | OPENAI_HOST: ${{ vars.OPENAI_HOST }} 31 | AZURE_OPENAI_SERVICE: ${{ vars.AZURE_OPENAI_SERVICE }} 32 | AZURE_OPENAI_RESOURCE_GROUP: ${{ vars.AZURE_OPENAI_RESOURCE_GROUP }} 33 | EVAL_GPT_DEPLOYMENT_CAPACITY: ${{ vars.EVAL_GPT_DEPLOYMENT_CAPACITY }} 34 | OPENAI_ORGANIZATION: ${{ vars.OPENAI_ORGANIZATION }} 35 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 36 | steps: 37 | - name: Checkout 38 | uses: actions/checkout@v4 39 | 40 | - name: Install azd 41 | uses: Azure/setup-azd@v2.1.0 42 | 43 | - name: 
Log in with Azure (Federated Credentials) 44 | if: ${{ env.AZURE_CLIENT_ID != '' }} 45 | run: | 46 | azd auth login ` 47 | --client-id "$Env:AZURE_CLIENT_ID" ` 48 | --federated-credential-provider "github" ` 49 | --tenant-id "$Env:AZURE_TENANT_ID" 50 | shell: pwsh 51 | 52 | - name: Log in with Azure (Client Credentials) 53 | if: ${{ env.AZURE_CREDENTIALS != '' }} 54 | run: | 55 | $info = $Env:AZURE_CREDENTIALS | ConvertFrom-Json -AsHashtable; 56 | Write-Host "::add-mask::$($info.clientSecret)" 57 | 58 | azd auth login ` 59 | --client-id "$($info.clientId)" ` 60 | --client-secret "$($info.clientSecret)" ` 61 | --tenant-id "$($info.tenantId)" 62 | shell: pwsh 63 | env: 64 | AZURE_CREDENTIALS: ${{ secrets.AZURE_CREDENTIALS }} 65 | 66 | - name: Provision Infrastructure 67 | run: azd provision --no-prompt 68 | env: 69 | AZURE_ENV_NAME: ${{ vars.AZURE_ENV_NAME }} 70 | AZURE_LOCATION: ${{ vars.AZURE_LOCATION }} 71 | AZURE_SUBSCRIPTION_ID: ${{ vars.AZURE_SUBSCRIPTION_ID }} 72 | 73 | - name: Deploy Application 74 | run: azd deploy --no-prompt 75 | env: 76 | AZURE_ENV_NAME: ${{ vars.AZURE_ENV_NAME }} 77 | AZURE_LOCATION: ${{ vars.AZURE_LOCATION }} 78 | AZURE_SUBSCRIPTION_ID: ${{ vars.AZURE_SUBSCRIPTION_ID }} 79 | 80 | - name: Setup python 81 | uses: actions/setup-python@v5 82 | with: 83 | python-version: 3.11 84 | architecture: x64 85 | 86 | - name: Install PromptFlow dbus dependency 87 | run: | 88 | sudo apt-get update 89 | sudo apt-get install -y gcc cmake pkg-config libdbus-1-dev libglib2.0-dev 90 | python -m pip install --upgrade pip 91 | pip install keyrings.alt dbus-python 92 | 93 | - name: Install dependencies 94 | run: | 95 | python -m pip install --upgrade pip 96 | pip install -e .[dev] 97 | 98 | - name: Run evaluation 99 | run: | 100 | azd env get-values > .env 101 | source .env 102 | python -m evaltools evaluate --config=example_config.json --numquestions=2 --targeturl=${{ env.TARGET_URL }} 103 | env: 104 | TARGET_URL: ${{ secrets.TARGET_URL }} 105 | -------------------------------------------------------------------------------- /.github/workflows/bicep-audit.yml: -------------------------------------------------------------------------------- 1 | name: Validate bicep templates 2 | on: 3 | push: 4 | branches: 5 | - main 6 | paths: 7 | - "**/*.bicep" 8 | pull_request: 9 | branches: 10 | - main 11 | paths: 12 | - "**/*.bicep" 13 | workflow_dispatch: 14 | 15 | jobs: 16 | build: 17 | runs-on: ubuntu-latest 18 | permissions: 19 | security-events: write 20 | steps: 21 | - name: Checkout 22 | uses: actions/checkout@v4 23 | 24 | - name: Run Microsoft Security DevOps Analysis 25 | uses: microsoft/security-devops-action@preview 26 | id: msdo 27 | continue-on-error: true 28 | with: 29 | tools: templateanalyzer 30 | 31 | - name: Upload alerts to Security tab 32 | uses: github/codeql-action/upload-sarif@v3 33 | if: github.repository_owner == 'Azure-Samples' 34 | with: 35 | sarif_file: ${{ steps.msdo.outputs.sarifFile }} 36 | -------------------------------------------------------------------------------- /.github/workflows/python.yaml: -------------------------------------------------------------------------------- 1 | name: Python checks 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | paths-ignore: 7 | - "**.md" 8 | - ".devcontainer/**" 9 | - ".github/**" 10 | - "example_results/**" 11 | - "example_input/**" 12 | pull_request: 13 | branches: [ main ] 14 | paths-ignore: 15 | - "**.md" 16 | - ".devcontainer/**" 17 | - ".github/**" 18 | - "example_results/**" 19 | - "example_input/**" 20 | 
workflow_call: 21 | 22 | jobs: 23 | test_package: 24 | name: Test ${{ matrix.os }} Python ${{ matrix.python_version }} 25 | runs-on: ${{ matrix.os }} 26 | strategy: 27 | fail-fast: false 28 | matrix: 29 | # macos-13 is x84 and macos-14-large is arm64 30 | os: ["ubuntu-latest", "windows-latest", "macos-13", "macos-14-large"] 31 | python_version: ["3.9", "3.10", "3.11", "3.12"] 32 | steps: 33 | - uses: actions/checkout@v4 34 | - name: Setup python 35 | uses: actions/setup-python@v5 36 | with: 37 | python-version: ${{ matrix.python_version }} 38 | architecture: x64 39 | - name: Install dependencies 40 | run: | 41 | python -m pip install --upgrade pip 42 | pip install -e .[dev] 43 | - name: Lint with ruff 44 | run: ruff check . 45 | - name: Check formatting with ruff 46 | run: ruff format . --check 47 | - name: Run Pytest tests 48 | run: python -m pytest 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Azure az webapp deployment details 2 | .azure 3 | *_env 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 102 | __pypackages__/ 103 | 104 | # Celery stuff 105 | celerybeat-schedule 106 | celerybeat.pid 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | 138 | # pytype static type analyzer 139 | .pytype/ 140 | 141 | # Cython debug symbols 142 | cython_debug/ 143 | 144 | # NPM 145 | npm-debug.log* 146 | node_modules 147 | static/ 148 | 149 | # From azure-ai-generative 150 | mlruns/ 151 | 152 | .DS_Store 153 | 154 | # Additional test directories used by maintainer 155 | pamelas_blog_input/ 156 | pamelas_blog_results/ 157 | pamelas_blog_results_dontknows/ 158 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.5.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - repo: https://github.com/astral-sh/ruff-pre-commit 9 | rev: v0.6.2 10 | hooks: 11 | # Run the linter. 12 | - id: ruff 13 | args: [ --fix ] 14 | # Run the formatter. 15 | - id: ruff-format 16 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python Debugger: Current File", 9 | "type": "debugpy", 10 | "request": "launch", 11 | "program": "${file}", 12 | "console": "integratedTerminal", 13 | "justMyCode": false 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.testing.pytestArgs": ["tests"], 3 | "python.testing.unittestEnabled": false, 4 | "python.testing.pytestEnabled": true, 5 | "files.exclude": { 6 | ".coverage": true, 7 | ".pytest_cache": true, 8 | "__pycache__": true, 9 | ".mypy_cache": true 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Evaluating a RAG Chat App 2 | 3 | This repo contains scripts and tools for evaluating a chat app that uses the RAG architecture. 4 | There are many parameters that affect the quality and style of answers generated by the chat app, 5 | such as the system prompt, search parameters, and GPT model parameters. 6 | 7 | Whenever you are making changes to a RAG chat with the goal of improving the answers, you should evaluate the results. 8 | This repository offers tools to make it easier to run evaluations, plus examples of evaluations 9 | that we've run on our [popular RAG chat solution](https://github.com/Azure-Samples/azure-search-openai-demo/). 10 | 11 | [📺 Watch a video overview of this repo](https://www.youtube.com/watch?v=mM8pZAI2C5w) 12 | 13 | Table of contents: 14 | 15 | * [Cost estimation](#cost-estimation) 16 | * [Setting up this project](#setting-up-this-project) 17 | * [Deploying a GPT-4 model](#deploying-a-gpt-4-model) 18 | * [Generating ground truth data](#generating-ground-truth-data) 19 | * [Running an evaluation](#running-an-evaluation) 20 | * [Viewing the results](#viewing-the-results) 21 | * [Measuring app's ability to say "I don't know"](#measuring-apps-ability-to-say-i-dont-know) 22 | 23 | ## Cost estimation 24 | 25 | There are several places where this project can incur costs: 26 | 27 | | Cost | Description | Estimated tokens used | 28 | | --- | --- | --- | 29 | | Generating ground truth data | This is a one-time cost for generating the initial set of questions and answers, and involves pulling data down from your search index and sending it to the GPT model. | 1000 tokens per question generated, which would be 200,000 tokens for the recommended 200 questions. | 30 | | Running evaluations | Each time you run an evaluation, you may choose to use the GPT-based evaluators (groundedness, coherence, etc). For each GPT-evaluator used, you will incur costs for the tokens used by the GPT model. | 1000 tokens per question per evaluator used, which would be 600,000 tokens for the default 200 questions and 3 evaluators. | 31 | 32 | For a full estimate of the costs for your region and model, see the [Azure OpenAI pricing page](https://azure.microsoft.com/pricing/details/cognitive-services/openai-service/) or use the [Azure OpenAI pricing calculator](https://azure.com/e/f0dc5c3acb43437d925209c09c775a6d). 33 | 34 | ## Setting up this project 35 | 36 | If you open this project in a Dev Container or GitHub Codespaces, it will automatically set up the environment for you. 37 | If not, then follow these steps: 38 | 39 | 1. Install Python 3.10 or higher 40 | 2. Create a Python [virtual environment](https://learn.microsoft.com/azure/developer/python/get-started?tabs=cmd#configure-python-virtual-environment). 41 | 3. Inside that virtual environment, install the project: 42 | 43 | ```shell 44 | python -m pip install -e . 
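# Optional: include the dev extras (test and lint tools) as well, matching what the
# dev container and CI workflows in this repo install:
# python -m pip install -e '.[dev]'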
45 | ```
46 |
47 | ## Deploying a GPT-4 model
48 |
49 | It's best to use a GPT-4 model for performing the evaluation, even if your chat app uses GPT-3.5 or another model.
50 | You can either use an Azure OpenAI instance or an openai.com instance.
51 |
52 | ### Using a new Azure OpenAI instance
53 |
54 | To use a new Azure OpenAI instance, you'll need to create a new instance and deploy the app to it.
55 | We've made that easy to deploy with the `azd` CLI tool.
56 |
57 | 1. Install the [Azure Developer CLI](https://aka.ms/azure-dev/install)
58 | 2. Run `azd auth login` to log in to your Azure account
59 | 3. Run `azd up` to deploy a new GPT-4 instance
60 | 4. Create a `.env` file based on `.env.sample`:
61 |
62 | ```shell
63 | cp .env.sample .env
64 | ```
65 |
66 | 5. Run these commands to get the required values for `AZURE_OPENAI_EVAL_DEPLOYMENT` and `AZURE_OPENAI_SERVICE` from your deployed resource group and paste those values into the `.env` file:
67 |
68 | ```shell
69 | azd env get-value AZURE_OPENAI_EVAL_DEPLOYMENT
70 | azd env get-value AZURE_OPENAI_SERVICE
71 | ```
72 |
73 | ### Using an existing Azure OpenAI instance
74 |
75 | If you already have an Azure OpenAI instance, you can use that instead of creating a new one.
76 |
77 | 1. Create `.env` file by copying `.env.sample`
78 | 2. Fill in the values for your instance:
79 |
80 | ```shell
81 | AZURE_OPENAI_EVAL_DEPLOYMENT=""
82 | AZURE_OPENAI_ENDPOINT="https://.openai.azure.com"
83 | ```
84 |
85 | 3. The scripts default to keyless access (via `DefaultAzureCredential`), but you can optionally use a key by setting `AZURE_OPENAI_KEY` in `.env`.
86 |
87 | ### Using an openai.com instance
88 |
89 | If you have an openai.com instance, you can use that instead of an Azure OpenAI instance.
90 |
91 | 1. Create `.env` file by copying `.env.sample`
92 | 2. Change `OPENAI_HOST` to "openai" and fill in the key for your OpenAI account. If you do not have an organization, you can leave that blank.
93 |
94 | ```shell
95 | OPENAI_HOST="openai"
96 | OPENAICOM_KEY=""
97 | OPENAICOM_ORGANIZATION=""
98 | ```
99 |
100 | ## Generating ground truth data
101 |
102 | In order to evaluate new answers, they must be compared to "ground truth" answers: the ideal answer for a particular question. See `example_input/qa.jsonl` for an example of the format; a minimal sketch is also shown after the list below.
103 | We recommend at least 200 QA pairs if possible.
104 |
105 | There are a few ways to get this data:
106 |
107 | 1. Manually curate a set of questions and answers that you consider to be ideal. This is the most accurate, but also the most time-consuming. Make sure your answers include citations in the expected format. This approach requires domain expertise in the data.
108 | 2. Use a generator script to generate a set of questions and answers, and use them directly. This is the fastest, but may also be the least accurate.
109 | 3. Use a generator script to generate a set of questions and answers, and then manually curate them, rewriting any answers that are subpar and adding missing citations. This is a good middle ground, and is what we recommend.
110 |
111 |
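For reference, each line of the ground truth file is a standalone JSON object pairing a question with its ideal answer, using the same `question`/`truth` keys that appear in `example_input/qa_dontknows.jsonl` later in this repo. Here is a minimal sketch of what `qa.jsonl` entries look like; the questions, answers, and cited filenames are made up for illustration:

```jsonl
{"question": "What does the Northwind Health Plus plan cover?", "truth": "Northwind Health Plus covers medical, vision, and dental services. [Benefit_Options.pdf]"}
{"question": "How do employees submit a reimbursement claim?", "truth": "Employees submit claims through the process described in the handbook. [employee_handbook.pdf]"}
```

Note that the truth answers include square-bracket citations, so that metrics like `has_citation` and `citation_match` can be computed against them.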
112 | Additional tips for ground truth data generation
113 |
114 | * Generate more QA pairs than you need, then prune them down manually based on quality and overlap. Remove low-quality answers, and remove questions that are too similar to other questions.
115 | * Be aware of the knowledge distribution in the document set, so you effectively sample questions across the knowledge space.
116 | * Once your chat application is live, continually sample live user questions (in accordance with your privacy policy) to make sure you're representing the sorts of questions that users are asking.
117 |
118 |
119 | 120 | ## Running an evaluation 121 | 122 | We provide a script that loads in the current `azd` environment's variables, installs the requirements for the evaluation, and runs the evaluation against the local app. Run it like this: 123 | 124 | ```shell 125 | python -m evaltools evaluate --config=example_config.json 126 | ``` 127 | 128 | The config.json should contain these fields as a minimum: 129 | 130 | ```json 131 | { 132 | "testdata_path": "example_input/qa.jsonl", 133 | "target_url": "http://localhost:50505/chat", 134 | "requested_metrics": ["groundedness", "relevance", "coherence", "latency", "answer_length"], 135 | "results_dir": "example_results/experiment" 136 | } 137 | ``` 138 | 139 | ### Running against a local container 140 | 141 | If you're running this evaluator in a container and your app is running in a container on the same system, use a URL like this for the `target_url`: 142 | 143 | "target_url": "http://host.docker.internal:50505/chat" 144 | 145 | ### Running against a deployed app 146 | 147 | To run against a deployed endpoint, change the `target_url` to the chat endpoint of the deployed app: 148 | 149 | "target_url": "https://app-backend-j25rgqsibtmlo.azurewebsites.net/chat" 150 | 151 | ### Running on a subset of questions 152 | 153 | It's common to run the evaluation on a subset of the questions, to get a quick sense of how the changes are affecting the answers. To do this, use the `--numquestions` parameter: 154 | 155 | ```shell 156 | python -m evaltools evaluate --config=example_config.json --numquestions=2 157 | ``` 158 | 159 | ### Specifying the evaluate metrics 160 | 161 | The `evaluate` command will use the metrics specified in the `requested_metrics` field of the config JSON. 162 | Some of those metrics are built-in to the evaluation SDK, and the rest are custom metrics that we've added. 163 | 164 | #### Built-in metrics 165 | 166 | These metrics are calculated by sending a call to the GPT model, asking it to provide a 1-5 rating, and storing that rating. 167 | 168 | > [!IMPORTANT] 169 | > The built-in metrics are only intended for use on evaluating English language answers, since they use English-language prompts internally. For non-English languages, you should use the [custom prompt metrics](#prompt-metrics) instead. 170 | 171 | * [`gpt_coherence`](https://learn.microsoft.com/azure/ai-studio/concepts/evaluation-metrics-built-in#ai-assisted-coherence) measures how well the language model can produce output that flows smoothly, reads naturally, and resembles human-like language. 172 | * [`gpt_relevance`](https://learn.microsoft.com/azure/ai-studio/concepts/evaluation-metrics-built-in#ai-assisted-relevance) assesses the ability of answers to capture the key points of the context. 173 | * [`gpt_groundedness`](https://learn.microsoft.com/azure/ai-studio/concepts/evaluation-metrics-built-in#ai-assisted-groundedness) assesses the correspondence between claims in an AI-generated answer and the source context, making sure that these claims are substantiated by the context. 174 | * [`gpt_similarity`](https://learn.microsoft.com/azure/ai-studio/concepts/evaluation-metrics-built-in#ai-assisted-gpt-similarity) measures the similarity between a source data (ground truth) sentence and the generated response by an AI model. 175 | * [`gpt_fluency`](https://learn.microsoft.com/azure/ai-studio/concepts/evaluation-metrics-built-in#ai-assisted-fluency) measures the grammatical proficiency of a generative AI's predicted answer. 
176 | * [`f1_score`](https://learn.microsoft.com/azure/ai-studio/concepts/evaluation-metrics-built-in#traditional-machine-learning-f1-score) measures the ratio of the number of shared words between the model generation and the ground truth answers.
177 |
178 | #### Custom metrics
179 |
180 | ##### Prompt metrics
181 |
182 | The following metrics are implemented very similarly to the built-in metrics, but use a locally stored prompt. They're a great fit if you find that the built-in metrics are not working well for you or if you need to translate the prompt to another language.
183 |
184 | * `mycoherence`: Measures how well the language model can produce output that flows smoothly, reads naturally, and resembles human-like language. Based on `src/evaltools/eval/evaluate_metrics/prompts/mycoherence.prompty`.
185 | * `myrelevance`: Assesses the ability of answers to capture the key points of the context. Based on `src/evaltools/eval/evaluate_metrics/prompts/myrelevance.prompty`.
186 | * `mygroundedness`: Assesses the correspondence between claims in an AI-generated answer and the source context, making sure that these claims are substantiated by the context. Based on `src/evaltools/eval/evaluate_metrics/prompts/mygroundedness.prompty`.
187 |
188 | ##### Code metrics
189 |
190 | These metrics are calculated with some local code based on the results of the chat app, and do not require a call to the GPT model.
191 |
192 | * `latency`: The time it takes for the chat app to generate an answer, in seconds.
193 | * `answer_length`: The length of the generated answer, in characters.
194 | * `has_citation`: Whether the answer contains a correctly formatted citation to a source document, assuming citations are in square brackets.
195 | * `citation_match`: Whether the answer contains at least all of the citations that were in the ground truth answer.
196 |
197 | ### Sending additional parameters to the app
198 |
199 | This repo assumes that your chat app is following the [AI Chat Protocol](https://github.com/microsoft/ai-chat-protocol/tree/main/spec#readme), which means that all POST requests look like this:
200 |
201 | ```json
202 | {"messages": [{"content": "", "role": "user"}],
203 | "context": {...},
204 | }
205 | ```
206 |
207 | Any additional app parameters would be specified in the `context` of that JSON, such as temperature, search settings, prompt overrides, etc. To specify those parameters, add a `target_parameters` key to your config JSON. For example:
208 |
209 | ```json
210 | "target_parameters": {
211 | "overrides": {
212 | "semantic_ranker": false,
213 | "prompt_template": "<READFILE>example_input/prompt_refined.txt"
214 | }
215 | }
216 | ```
217 |
218 | The `overrides` key is the same as the `overrides` key in the `context` of the POST request.
219 | As a convenience, you can use the `<READFILE>` prefix to read in a file and use its contents as the value for the parameter.
220 | That way, you can store potentially long prompts separately from the config JSON file.
221 |
222 | ### Specifying the location of answer and context in response
223 |
224 | The evaluator needs to know where to find the answer and context in the response from the chat app.
225 | If your app returns responses following the recommendations of the [AI Chat Protocol](https://github.com/microsoft/ai-chat-protocol/tree/main/spec#readme), then the answer will be in "message": "content" and the context will be a list of strings in "context": "data_points": "text".
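For example, a response shaped the way the evaluator expects by default would look roughly like this (the answer text and source snippet are made up; only the structure matters):

```json
{
  "message": {
    "content": "Northwind Health Plus covers emergency services. [Benefit_Options.pdf]",
    "role": "assistant"
  },
  "context": {
    "data_points": {
      "text": [
        "Benefit_Options.pdf: Northwind Health Plus offers coverage for emergency services, both in-network and out-of-network."
      ]
    }
  }
}
```

The default JMESPath expressions shown below (`message.content` and `context.data_points.text`) point at exactly these two locations.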
226 |
227 | If your app returns responses in a different format, you can specify the [JMESPath expressions](https://jmespath.org/) to extract the answer and context from the response. For example:
228 |
229 | ```json
230 | "target_response_answer_jmespath": "message.content",
231 | "target_response_context_jmespath": "context.data_points.text"
232 | ```
233 |
234 | ## Viewing the results
235 |
236 | The results of each evaluation are stored in a results folder (defaulting to `example_results`).
237 | Inside each run's folder, you'll find:
238 |
239 | * `eval_results.jsonl`: Each question and answer, along with the GPT metrics for each QA pair.
240 | * `evaluate_parameters.json`: The parameters used for the run, like the overrides.
241 | * `summary.json`: The overall results, like the average GPT metrics.
242 | * `config.json`: The original config used for the run. This is useful for reproducing the run.
243 |
244 | To make it easier to view and compare results across runs, we've built a few tools,
245 | located inside the `src/evaltools/review` folder.
246 |
247 | ### Using the summary tool
248 |
249 | To view a summary across all the runs, use the `summary` command with the path to the results folder:
250 |
251 | ```bash
252 | python -m evaltools summary example_results
253 | ```
254 |
255 | This will display an interactive table with the results for each run, like this:
256 |
257 | ![Screenshot of CLI tool with table of results](docs/screenshot_summary.png)
258 |
259 | To see the parameters used for a particular run, select the folder name.
260 | A modal will appear with the parameters, including any prompt override.
261 |
262 | ### Using the compare tool
263 |
264 | To compare the answers generated for each question across 2 runs, use the `diff` command with 2 paths:
265 |
266 | ```bash
267 | python -m evaltools diff example_results/baseline_1 example_results/baseline_2
268 | ```
269 |
270 | This will display each question, one at a time, with the two generated answers in scrollable panes,
271 | and the GPT metrics below each answer.
272 |
273 | ![Screenshot of CLI tool for comparing a question with 2 answers](docs/screenshot_compare.png)
274 |
275 | Use the buttons at the bottom to navigate to the next question or quit the tool.
276 |
277 | You can also filter to only show questions where the value changed for a particular metric, like this:
278 |
279 | ```bash
280 | python -m evaltools diff example_results/baseline_1 example_results/baseline_2 --changed=has_citation
281 | ```
282 |
283 | ## Measuring app's ability to say "I don't know"
284 |
285 | The evaluation flow described above focused on evaluating a model’s answers for a set of questions that *could* be answered by the data. But what about all those questions that can’t be answered by the data? Does your model know how to say “I don’t know?” The GPT models are trained to try to be helpful, so their tendency is to always give some sort of answer, especially for answers that were in their training data. If you want to ensure your app can say “I don’t know” when it should, you need to evaluate it on a different set of questions with a different metric.
286 |
287 | ### Generating ground truth data for answer-less questions
288 |
289 | For this evaluation, our ground truth data needs to be a set of questions that should provoke an "I don’t know" response, because they cannot be answered from the data.
There are several categories of such questions: 290 | 291 | * **Unknowable**: Questions that are related to the sources but not actually in them (and not public knowledge). 292 | * **Uncitable**: Questions whose answers are well known to the LLM from its training data, but are not in the sources. There are two flavors of these: 293 | * **Related**: Similar topics to sources, so LLM will be particularly tempted to think the sources know. 294 | * **Unrelated**: Completely unrelated to sources, so LLM shouldn’t be as tempted to think the sources know. 295 | * **Nonsensical**: Questions that are non-questions, that a human would scratch their head at and ask for clarification. 296 | 297 | You can write these questions manually, but it’s also possible to generate them using a generator script in this repo, 298 | assuming you already have ground truth data with answerable questions. 299 | 300 | ```shell 301 | python -m evaltools generate-dontknows --input=example_input/qa.jsonl --output=example_input/qa_dontknows.jsonl --numquestions=45 302 | ``` 303 | 304 | That script sends the current questions to the configured GPT-4 model along with prompts to generate questions of each kind. 305 | 306 | When it’s done, you should review and curate the resulting ground truth data. Pay special attention to the "unknowable" questions at the top of the file, since you may decide that some of those are actually knowable, and you may want to reword or rewrite entirely. 307 | 308 | ### Running an evaluation for answer-less questions 309 | 310 | This repo contains a custom GPT metric called "dontknowness" that rates answers from 1-5, where 1 is "answered the question completely with no certainty" and 5 is "said it didn't know and attempted no answer". The goal is for all answers to be rated 4 or 5. 311 | 312 | Here's an example configuration JSON that requests that metric, referencing the new ground truth data and a new output folder: 313 | 314 | ```json 315 | { 316 | "testdata_path": "example_input/qa_dontknows.jsonl", 317 | "results_dir": "example_results_dontknows/baseline", 318 | "requested_metrics": ["dontknowness", "answer_length", "latency", "has_citation"], 319 | "target_url": "http://localhost:50505/chat", 320 | "target_parameters": { 321 | }, 322 | "target_response_answer_jmespath": "message.content", 323 | "target_response_context_jmespath": "context.data_points.text" 324 | } 325 | ``` 326 | 327 | We recommend a separate output folder, as you'll likely want to make multiple runs and easily compare between those runs using the [review tools](#viewing-the-results). 328 | 329 | Run the evaluation like this: 330 | 331 | ```shell 332 | python -m evaltools evaluate --config=dontknows.config.json 333 | ``` 334 | 335 | The results will be stored in the `results_dir` folder, and can be reviewed using the [review tools](#viewing-the-results). 336 | 337 | ### Improving the app's ability to say "I don't know" 338 | 339 | If the app is not saying "I don't know" enough, you can use the `diff` tool to compare the answers for the "dontknows" questions across runs, and see if the answers are improving. Changes you can try: 340 | 341 | * Adjust the prompt to encourage the model to say "I don't know" more often. Remove anything in the prompt that might be distracting or overly encouraging it to answer. 342 | * Try using GPT-4 instead of GPT-3.5. The results will be slower (see the latency column) but it may be more likely to say "I don't know" when it should. 343 | * Adjust the temperature of the model used by your app. 
344 | * Add an additional LLM step in your app after generating the answer, to have the LLM rate its own confidence that the answer is found in the sources. If the confidence is low, the app should say "I don't know". 345 | -------------------------------------------------------------------------------- /azure.yaml: -------------------------------------------------------------------------------- 1 | # yaml-language-server: $schema=https://raw.githubusercontent.com/Azure/azure-dev/main/schemas/v1.0/azure.yaml.json 2 | 3 | name: ai-rag-chat-evaluator 4 | metadata: 5 | template: ai-rag-chat-evaluator@0.0.2 6 | pipeline: 7 | variables: 8 | - OPENAI_HOST 9 | - AZURE_OPENAI_SERVICE 10 | - AZURE_OPENAI_RESOURCE_GROUP 11 | - EVAL_GPT_DEPLOYMENT_CAPACITY 12 | - OPENAI_ORGANIZATION 13 | secrets: 14 | - OPENAI_API_KEY 15 | -------------------------------------------------------------------------------- /docs/screenshot_compare.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/ai-rag-chat-evaluator/655d0c40e2ee92a07064cd5b7ae65d14d2b7c679/docs/screenshot_compare.png -------------------------------------------------------------------------------- /docs/screenshot_summary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/ai-rag-chat-evaluator/655d0c40e2ee92a07064cd5b7ae65d14d2b7c679/docs/screenshot_summary.png -------------------------------------------------------------------------------- /dontknows.config.json: -------------------------------------------------------------------------------- 1 | { 2 | "testdata_path": "example_input/qa_dontknows.jsonl", 3 | "results_dir": "example_results_dontknows/baseline", 4 | "requested_metrics": ["dontknowness", "answer_length", "latency", "has_citation"], 5 | "target_url": "http://localhost:50505/chat", 6 | "target_parameters": { 7 | "overrides": { 8 | "top": 3, 9 | "temperature": 0.3, 10 | "minimum_reranker_score": 0, 11 | "minimum_search_score": 0, 12 | "retrieval_mode": "hybrid", 13 | "semantic_ranker": true, 14 | "semantic_captions": false, 15 | "suggest_followup_questions": false, 16 | "use_oid_security_filter": false, 17 | "use_groups_security_filter": false, 18 | "vector_fields": [ 19 | "embedding" 20 | ], 21 | "use_gpt4v": false, 22 | "gpt4v_input": "textAndImages" 23 | } 24 | }, 25 | "target_response_answer_jmespath": "message.content", 26 | "target_response_context_jmespath": "context.data_points.text" 27 | } 28 | -------------------------------------------------------------------------------- /example_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "testdata_path": "example_input/qa.jsonl", 3 | "results_dir": "example_results/experiment", 4 | "requested_metrics": ["gpt_groundedness", "gpt_relevance", "gpt_coherence", "answer_length", "latency"], 5 | "target_url": "http://localhost:50505/chat", 6 | "target_parameters": { 7 | "overrides": { 8 | "top": 3, 9 | "temperature": 0.3, 10 | "minimum_reranker_score": 0, 11 | "minimum_search_score": 0, 12 | "retrieval_mode": "hybrid", 13 | "semantic_ranker": true, 14 | "semantic_captions": false, 15 | "suggest_followup_questions": false, 16 | "use_oid_security_filter": false, 17 | "use_groups_security_filter": false, 18 | "vector_fields": [ 19 | "embedding" 20 | ], 21 | "use_gpt4v": false, 22 | "gpt4v_input": "textAndImages", 23 | "seed": 1 24 | } 25 | }, 26 | "target_response_answer_jmespath": 
"message.content", 27 | "target_response_context_jmespath": "context.data_points.text" 28 | } 29 | -------------------------------------------------------------------------------- /example_input/prompt_ignoresources.txt: -------------------------------------------------------------------------------- 1 | Your job is to answer questions to the best of your ability. You will be given sources but you should IGNORE them. Be creative! 2 | -------------------------------------------------------------------------------- /example_input/prompt_nomarkdownmention.txt: -------------------------------------------------------------------------------- 1 | Assistant helps the company employees with their healthcare plan questions, and questions about the employee handbook. Be brief in your answers. 2 | Answer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. If asking a clarifying question to the user would help, ask the question. 3 | If the question is not in English, answer in the language used in the question. 4 | Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, for example [info1.txt]. Don't combine sources, list each source separately, for example [info1.txt][info2.pdf]. 5 | -------------------------------------------------------------------------------- /example_input/prompt_piglatin.txt: -------------------------------------------------------------------------------- 1 | Your job is to translate the user's question into Pig Latin. Ignore any sources provided and just translate the question. DO NOT answer the question. 2 | -------------------------------------------------------------------------------- /example_input/prompt_refined.txt: -------------------------------------------------------------------------------- 1 | You are an experienced HR generalist that delights in their role of helping employees with their about their healthcare plan and the employee handbook. 2 | 3 | Give an answer using ONLY with the facts listed in the list of sources below indicated by “Sources:”. 4 | 5 | If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. If asking a clarifying question to the user would help, ask the question. 6 | 7 | Use clear and concise language and write in a confident yet friendly tone. In your answers ensure the employee understands how your response connects to the information in the sources and include all citations necessary to help the employee validate the answer provided. 8 | 9 | For tabular information return it as an html table. Do not return markdown format. If the question is not in English, answer in the language used in the question. 10 | 11 | Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, e.g. [info1.txt]. Don't combine sources, list each source separately, e.g. [info1.txt][info2.pdf]. 12 | -------------------------------------------------------------------------------- /example_input/prompt_refined_trimmed.txt: -------------------------------------------------------------------------------- 1 | You are a helpful assistant. 
2 | 3 | Give an answer using ONLY with the facts listed in the list of sources below indicated by “Sources:”. 4 | 5 | If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. 6 | 7 | In your answers ensure the user understands how your response connects to the information in the sources and include all citations necessary to help the user validate the answer provided. 8 | 9 | Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, e.g. [info1.txt]. Don't combine sources, list each source separately, e.g. [info1.txt][info2.pdf]. 10 | 11 | For tabular information return it as an html table. Do not return markdown format. If the question is not in English, answer in the language used in the question. 12 | -------------------------------------------------------------------------------- /example_input/prompt_ungrounded.txt: -------------------------------------------------------------------------------- 1 | Your job is to answer questions to the best of your ability. You will be given sources that you may use for ideas but you can answer using everything you know about the world. Be creative! 2 | -------------------------------------------------------------------------------- /example_input/prompt_weak.txt: -------------------------------------------------------------------------------- 1 | You are a helpful assistant. 2 | -------------------------------------------------------------------------------- /example_input/qa_dontknows.jsonl: -------------------------------------------------------------------------------- 1 | {"question": "What are the key benefits of the Northwind Platinum plan?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are very related but are not directly answerable by the same sources. Do not simply ask for other examples of the same thing - your question should be standalone."} 2 | {"question": "How does the Northwind Health Plus work in California?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are very related but are not directly answerable by the same sources. Do not simply ask for other examples of the same thing - your question should be standalone."} 3 | {"question": "Should employees stay home when sick?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are very related but are not directly answerable by the same sources. Do not simply ask for other examples of the same thing - your question should be standalone."} 4 | {"question": "Can standing desks be reimbursed?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are very related but are not directly answerable by the same sources. Do not simply ask for other examples of the same thing - your question should be standalone."} 5 | {"question": "Can deductibles be paid by check?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are very related but are not directly answerable by the same sources. Do not simply ask for other examples of the same thing - your question should be standalone."} 6 | {"question": "How does the Northwind Standard plan handle postnatal yoga?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are very related but are not directly answerable by the same sources. 
Do not simply ask for other examples of the same thing - your question should be standalone."} 7 | {"question": "Are domestic partners eligible for the Northwind Health Plus plan?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are very related but are not directly answerable by the same sources. Do not simply ask for other examples of the same thing - your question should be standalone."} 8 | {"question": "Are teenage children covered under the insurance?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are very related but are not directly answerable by the same sources. Do not simply ask for other examples of the same thing - your question should be standalone."} 9 | {"question": "What is the process for submitting a claim requiring evidence of medical necessity to Northwind Health?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are very related but are not directly answerable by the same sources. Do not simply ask for other examples of the same thing - your question should be standalone."} 10 | {"question": "What is the role of the Manager of Technical Product Marketing at Contoso Electronics?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are very related but are not directly answerable by the same sources. Do not simply ask for other examples of the same thing - your question should be standalone."} 11 | {"question": "What is the process for obtaining prior authorization for services under the Northwind Health Plus plan?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are very related but are not directly answerable by the same sources. Do not simply ask for other examples of the same thing - your question should be standalone."} 12 | {"question": "What is the policy for working from home?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are very related but are not directly answerable by the same sources. 
Do not simply ask for other examples of the same thing - your question should be standalone."} 13 | {"question": "How can employees refer new hires?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions with similar keywords that are about publicly known facts."} 14 | {"question": "What are the benefits of choosing an in-network provider with Northwind Standard?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions with similar keywords that are about publicly known facts."} 15 | {"question": "How does Northwind Health Plus handle Coordination of Benefits (COB)?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions with similar keywords that are about publicly known facts."} 16 | {"question": "What is the role of the clinical review process in the Northwind Health Plus plan?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions with similar keywords that are about publicly known facts."} 17 | {"question": "What are the requirements for data security at Contoso Electronics?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions with similar keywords that are about publicly known facts."} 18 | {"question": "What is the role of Contoso Electronics in the aerospace industry?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions with similar keywords that are about publicly known facts."} 19 | {"question": "Is there a plan with no deductible?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions with similar keywords that are about publicly known facts."} 20 | {"question": "What are the cost implications of choosing an out-of-network provider under Northwind Health Plus?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions with similar keywords that are about publicly known facts."} 21 | {"question": "What is the policy for coverage of rehabilitation services under Northwind Health Plus?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions with similar keywords that are about publicly known facts."} 22 | {"question": "What is the role of Northwind Standard in relation to in-network and non-participating providers?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions with similar keywords that are about publicly known facts."} 23 | {"question": "What is the coverage for mental health and substance abuse treatments under the Northwind Health Plus plan?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions with similar keywords that are about publicly known facts."} 24 | {"question": "What is the process for obtaining prior authorization for a medical service or treatment under Northwind Health Plus?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions with similar keywords that are about publicly known facts."} 25 | {"question": "What is the capital of France?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are not related to these topics at all but have well known answers."} 26 | {"question": "Who wrote the novel \"Pride and Prejudice\"?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are not related to these topics at all but have well known answers."} 27 | {"question": "What is the chemical symbol for gold?", "truth": "Generated from this prompt: Given 
these questions, suggest 12 questions that are not related to these topics at all but have well known answers."} 28 | {"question": "What is the tallest mountain in the world?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are not related to these topics at all but have well known answers."} 29 | {"question": "Who painted the Mona Lisa?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are not related to these topics at all but have well known answers."} 30 | {"question": "What is the largest planet in our solar system?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are not related to these topics at all but have well known answers."} 31 | {"question": "What is the square root of 81?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are not related to these topics at all but have well known answers."} 32 | {"question": "Who was the first person to walk on the moon?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are not related to these topics at all but have well known answers."} 33 | {"question": "What is the freezing point of water in Fahrenheit?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are not related to these topics at all but have well known answers."} 34 | {"question": "What is the main ingredient in hummus?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are not related to these topics at all but have well known answers."} 35 | {"question": "Who won the FIFA World Cup in 2018?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are not related to these topics at all but have well known answers."} 36 | {"question": "What is the currency of Japan?", "truth": "Generated from this prompt: Given these questions, suggest 12 questions that are not related to these topics at all but have well known answers."} 37 | {"question": "How many miles does a rainbow weigh?", "truth": "Generated from this prompt: Suggest 9 questions that are nonsensical, and would result in confusion if you asked it."} 38 | {"question": "Can you smell the color of the wind?", "truth": "Generated from this prompt: Suggest 9 questions that are nonsensical, and would result in confusion if you asked it."} 39 | {"question": "What time does the color blue go to sleep?", "truth": "Generated from this prompt: Suggest 9 questions that are nonsensical, and would result in confusion if you asked it."} 40 | {"question": "How many square feet are in a thought?", "truth": "Generated from this prompt: Suggest 9 questions that are nonsensical, and would result in confusion if you asked it."} 41 | {"question": "Can you taste the sound of a sunrise?", "truth": "Generated from this prompt: Suggest 9 questions that are nonsensical, and would result in confusion if you asked it."} 42 | {"question": "How loud is the smell of a rose?", "truth": "Generated from this prompt: Suggest 9 questions that are nonsensical, and would result in confusion if you asked it."} 43 | {"question": "What's the temperature of a dream?", "truth": "Generated from this prompt: Suggest 9 questions that are nonsensical, and would result in confusion if you asked it."} 44 | {"question": "How many inches tall is a whisper?", "truth": "Generated from this prompt: Suggest 9 questions that are nonsensical, and would result in confusion if you 
asked it."} 45 | {"question": "What flavor is the number seven?", "truth": "Generated from this prompt: Suggest 9 questions that are nonsensical, and would result in confusion if you asked it."} 46 | -------------------------------------------------------------------------------- /example_results/baseline/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "testdata_path": "example_input/qa.jsonl", 3 | "results_dir": "example_results/baseline", 4 | "requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency"], 5 | "target_url": "http://localhost:50505/chat", 6 | "target_parameters": { 7 | "overrides": { 8 | "top": 3, 9 | "temperature": 0.3, 10 | "minimum_reranker_score": 0, 11 | "minimum_search_score": 0, 12 | "retrieval_mode": "hybrid", 13 | "semantic_ranker": true, 14 | "semantic_captions": false, 15 | "suggest_followup_questions": false, 16 | "use_oid_security_filter": false, 17 | "use_groups_security_filter": false, 18 | "vector_fields": [ 19 | "embedding" 20 | ], 21 | "use_gpt4v": false, 22 | "gpt4v_input": "textAndImages" 23 | } 24 | }, 25 | "target_response_answer_jmespath": "message.content", 26 | "target_response_context_jmespath": "context.data_points.text" 27 | } 28 | -------------------------------------------------------------------------------- /example_results/baseline/evaluate_parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "evaluation_gpt_model": "gpt-4", 3 | "evaluation_timestamp": 1724265437, 4 | "testdata_path": "/Users/pamelafox/ai-rag-chat-evaluator/example_input/qa.jsonl", 5 | "target_url": "http://localhost:50505/chat", 6 | "target_parameters": { 7 | "overrides": { 8 | "top": 3, 9 | "temperature": 0.3, 10 | "minimum_reranker_score": 0, 11 | "minimum_search_score": 0, 12 | "retrieval_mode": "hybrid", 13 | "semantic_ranker": true, 14 | "semantic_captions": false, 15 | "suggest_followup_questions": false, 16 | "use_oid_security_filter": false, 17 | "use_groups_security_filter": false, 18 | "vector_fields": [ 19 | "embedding" 20 | ], 21 | "use_gpt4v": false, 22 | "gpt4v_input": "textAndImages" 23 | } 24 | }, 25 | "num_questions": null 26 | } 27 | -------------------------------------------------------------------------------- /example_results/baseline/summary.json: -------------------------------------------------------------------------------- 1 | { 2 | "gpt_groundedness": { 3 | "pass_count": 193, 4 | "pass_rate": 0.96, 5 | "mean_rating": 4.87 6 | }, 7 | "gpt_relevance": { 8 | "pass_count": 197, 9 | "pass_rate": 0.98, 10 | "mean_rating": 4.92 11 | }, 12 | "answer_length": { 13 | "mean": 613.11, 14 | "max": 2210, 15 | "min": 52 16 | }, 17 | "latency": { 18 | "mean": 2.35, 19 | "max": 5.531913, 20 | "min": 1.239641 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /example_results/baseline2/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "testdata_path": "example_input/qa.jsonl", 3 | "results_dir": "example_results/baseline2", 4 | "requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "has_citation", "citation_match"], 5 | "target_url": "http://localhost:50505/chat", 6 | "target_parameters": { 7 | "overrides": { 8 | "top": 3, 9 | "temperature": 0.3, 10 | "minimum_reranker_score": 0, 11 | "minimum_search_score": 0, 12 | "retrieval_mode": "hybrid", 13 | "semantic_ranker": true, 14 | "semantic_captions": 
false, 15 | "suggest_followup_questions": false, 16 | "use_oid_security_filter": false, 17 | "use_groups_security_filter": false, 18 | "vector_fields": [ 19 | "embedding" 20 | ], 21 | "use_gpt4v": false, 22 | "gpt4v_input": "textAndImages" 23 | } 24 | }, 25 | "target_response_answer_jmespath": "message.content", 26 | "target_response_context_jmespath": "context.data_points.text" 27 | } 28 | -------------------------------------------------------------------------------- /example_results/baseline2/evaluate_parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "evaluation_gpt_model": "gpt-4", 3 | "evaluation_timestamp": 1724281405, 4 | "testdata_path": "/Users/pamelafox/ai-rag-chat-evaluator/example_input/qa.jsonl", 5 | "target_url": "http://localhost:50505/chat", 6 | "target_parameters": { 7 | "overrides": { 8 | "top": 3, 9 | "temperature": 0.3, 10 | "minimum_reranker_score": 0, 11 | "minimum_search_score": 0, 12 | "retrieval_mode": "hybrid", 13 | "semantic_ranker": true, 14 | "semantic_captions": false, 15 | "suggest_followup_questions": false, 16 | "use_oid_security_filter": false, 17 | "use_groups_security_filter": false, 18 | "vector_fields": [ 19 | "embedding" 20 | ], 21 | "use_gpt4v": false, 22 | "gpt4v_input": "textAndImages" 23 | } 24 | }, 25 | "num_questions": null 26 | } 27 | -------------------------------------------------------------------------------- /example_results/baseline2/summary.json: -------------------------------------------------------------------------------- 1 | { 2 | "gpt_groundedness": { 3 | "pass_count": 195, 4 | "pass_rate": 0.97, 5 | "mean_rating": 4.91 6 | }, 7 | "gpt_relevance": { 8 | "pass_count": 199, 9 | "pass_rate": 0.99, 10 | "mean_rating": 4.94 11 | }, 12 | "answer_length": { 13 | "mean": 614.39, 14 | "max": 2423, 15 | "min": 87 16 | }, 17 | "latency": { 18 | "mean": 2.22, 19 | "max": 7.607282, 20 | "min": 1.116874 21 | }, 22 | "has_citation": { 23 | "total": 199, 24 | "rate": 1.0 25 | }, 26 | "citation_match": { 27 | "total": 0, 28 | "rate": 0.0 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /example_results/prompt_nomarkdownmention/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "testdata_path": "example_input/qa.jsonl", 3 | "results_dir": "example_results/prompt_nomarkdownmention", 4 | "requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency"], 5 | "target_url": "http://localhost:50505/chat", 6 | "target_parameters": { 7 | "overrides": { 8 | "top": 3, 9 | "temperature": 0.3, 10 | "minimum_reranker_score": 0, 11 | "minimum_search_score": 0, 12 | "retrieval_mode": "hybrid", 13 | "semantic_ranker": true, 14 | "semantic_captions": false, 15 | "suggest_followup_questions": false, 16 | "use_oid_security_filter": false, 17 | "use_groups_security_filter": false, 18 | "vector_fields": [ 19 | "embedding" 20 | ], 21 | "use_gpt4v": false, 22 | "gpt4v_input": "textAndImages", 23 | "prompt_template": "example_input/prompt_nomarkdownmention.txt" 24 | } 25 | }, 26 | "target_response_answer_jmespath": "message.content", 27 | "target_response_context_jmespath": "context.data_points.text" 28 | } 29 | -------------------------------------------------------------------------------- /example_results/prompt_nomarkdownmention/evaluate_parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "evaluation_gpt_model": "gpt-4", 3 | 
"evaluation_timestamp": 1724274502, 4 | "testdata_path": "/Users/pamelafox/ai-rag-chat-evaluator/example_input/qa.jsonl", 5 | "target_url": "http://localhost:50505/chat", 6 | "target_parameters": { 7 | "overrides": { 8 | "top": 3, 9 | "temperature": 0.3, 10 | "minimum_reranker_score": 0, 11 | "minimum_search_score": 0, 12 | "retrieval_mode": "hybrid", 13 | "semantic_ranker": true, 14 | "semantic_captions": false, 15 | "suggest_followup_questions": false, 16 | "use_oid_security_filter": false, 17 | "use_groups_security_filter": false, 18 | "vector_fields": [ 19 | "embedding" 20 | ], 21 | "use_gpt4v": false, 22 | "gpt4v_input": "textAndImages", 23 | "prompt_template": "Assistant helps the company employees with their healthcare plan questions, and questions about the employee handbook. Be brief in your answers.\nAnswer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. If asking a clarifying question to the user would help, ask the question.\nIf the question is not in English, answer in the language used in the question.\nEach source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, for example [info1.txt]. Don't combine sources, list each source separately, for example [info1.txt][info2.pdf]." 24 | } 25 | }, 26 | "num_questions": null 27 | } 28 | -------------------------------------------------------------------------------- /example_results/prompt_nomarkdownmention/summary.json: -------------------------------------------------------------------------------- 1 | { 2 | "gpt_groundedness": { 3 | "pass_count": 194, 4 | "pass_rate": 0.97, 5 | "mean_rating": 4.9 6 | }, 7 | "gpt_relevance": { 8 | "pass_count": 196, 9 | "pass_rate": 0.98, 10 | "mean_rating": 4.9 11 | }, 12 | "answer_length": { 13 | "mean": 620.0, 14 | "max": 2155, 15 | "min": 55 16 | }, 17 | "latency": { 18 | "mean": 2.43, 19 | "max": 6.910127, 20 | "min": 1.188182 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /example_results/prompt_nomarkdownmention2/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "testdata_path": "example_input/qa.jsonl", 3 | "results_dir": "example_results/prompt_nomarkdownmention2", 4 | "requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "has_citation", "citation_match"], 5 | "target_url": "http://localhost:50505/chat", 6 | "target_parameters": { 7 | "overrides": { 8 | "top": 3, 9 | "temperature": 0.3, 10 | "minimum_reranker_score": 0, 11 | "minimum_search_score": 0, 12 | "retrieval_mode": "hybrid", 13 | "semantic_ranker": true, 14 | "semantic_captions": false, 15 | "suggest_followup_questions": false, 16 | "use_oid_security_filter": false, 17 | "use_groups_security_filter": false, 18 | "vector_fields": [ 19 | "embedding" 20 | ], 21 | "use_gpt4v": false, 22 | "gpt4v_input": "textAndImages", 23 | "prompt_template": "example_input/prompt_nomarkdownmention.txt" 24 | } 25 | }, 26 | "target_response_answer_jmespath": "message.content", 27 | "target_response_context_jmespath": "context.data_points.text" 28 | } 29 | -------------------------------------------------------------------------------- /example_results/prompt_nomarkdownmention2/evaluate_parameters.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "evaluation_gpt_model": "gpt-4", 3 | "evaluation_timestamp": 1724277997, 4 | "testdata_path": "/Users/pamelafox/ai-rag-chat-evaluator/example_input/qa.jsonl", 5 | "target_url": "http://localhost:50505/chat", 6 | "target_parameters": { 7 | "overrides": { 8 | "top": 3, 9 | "temperature": 0.3, 10 | "minimum_reranker_score": 0, 11 | "minimum_search_score": 0, 12 | "retrieval_mode": "hybrid", 13 | "semantic_ranker": true, 14 | "semantic_captions": false, 15 | "suggest_followup_questions": false, 16 | "use_oid_security_filter": false, 17 | "use_groups_security_filter": false, 18 | "vector_fields": [ 19 | "embedding" 20 | ], 21 | "use_gpt4v": false, 22 | "gpt4v_input": "textAndImages", 23 | "prompt_template": "Assistant helps the company employees with their healthcare plan questions, and questions about the employee handbook. Be brief in your answers.\nAnswer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. If asking a clarifying question to the user would help, ask the question.\nIf the question is not in English, answer in the language used in the question.\nEach source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, for example [info1.txt]. Don't combine sources, list each source separately, for example [info1.txt][info2.pdf]." 24 | } 25 | }, 26 | "num_questions": null 27 | } 28 | -------------------------------------------------------------------------------- /example_results/prompt_nomarkdownmention2/summary.json: -------------------------------------------------------------------------------- 1 | { 2 | "gpt_groundedness": { 3 | "pass_count": 195, 4 | "pass_rate": 0.97, 5 | "mean_rating": 4.91 6 | }, 7 | "gpt_relevance": { 8 | "pass_count": 197, 9 | "pass_rate": 0.98, 10 | "mean_rating": 4.92 11 | }, 12 | "answer_length": { 13 | "mean": 621.93, 14 | "max": 2177, 15 | "min": 79 16 | }, 17 | "latency": { 18 | "mean": 2.29, 19 | "max": 5.716042, 20 | "min": 1.087221 21 | }, 22 | "has_citation": { 23 | "total": 199, 24 | "rate": 1.0 25 | }, 26 | "citation_match": { 27 | "total": 0, 28 | "rate": 0.0 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /infra/core/ai/cognitiveservices.bicep: -------------------------------------------------------------------------------- 1 | metadata description = 'Creates an Azure Cognitive Services instance.' 2 | param name string 3 | param location string = resourceGroup().location 4 | param tags object = {} 5 | @description('The custom subdomain name used to access the API. Defaults to the value of the name parameter.') 6 | param customSubDomainName string = name 7 | param disableLocalAuth bool = false 8 | param deployments array = [] 9 | param kind string = 'OpenAI' 10 | 11 | @allowed([ 'Enabled', 'Disabled' ]) 12 | param publicNetworkAccess string = 'Enabled' 13 | param sku object = { 14 | name: 'S0' 15 | } 16 | 17 | param allowedIpRules array = [] 18 | param networkAcls object = empty(allowedIpRules) ? 
{ 19 | defaultAction: 'Allow' 20 | } : { 21 | ipRules: allowedIpRules 22 | defaultAction: 'Deny' 23 | } 24 | 25 | resource account 'Microsoft.CognitiveServices/accounts@2023-05-01' = { 26 | name: name 27 | location: location 28 | tags: tags 29 | kind: kind 30 | properties: { 31 | customSubDomainName: customSubDomainName 32 | publicNetworkAccess: publicNetworkAccess 33 | networkAcls: networkAcls 34 | disableLocalAuth: disableLocalAuth 35 | } 36 | sku: sku 37 | } 38 | 39 | @batchSize(1) 40 | resource deployment 'Microsoft.CognitiveServices/accounts/deployments@2023-05-01' = [for deployment in deployments: { 41 | parent: account 42 | name: deployment.name 43 | properties: { 44 | model: deployment.model 45 | raiPolicyName: contains(deployment, 'raiPolicyName') ? deployment.raiPolicyName : null 46 | } 47 | sku: contains(deployment, 'sku') ? deployment.sku : { 48 | name: 'Standard' 49 | capacity: 20 50 | } 51 | }] 52 | 53 | output endpoint string = account.properties.endpoint 54 | output id string = account.id 55 | output name string = account.name 56 | -------------------------------------------------------------------------------- /infra/core/security/role.bicep: -------------------------------------------------------------------------------- 1 | metadata description = 'Creates a role assignment for a service principal.' 2 | param principalId string 3 | 4 | @allowed([ 5 | 'Device' 6 | 'ForeignGroup' 7 | 'Group' 8 | 'ServicePrincipal' 9 | 'User' 10 | ]) 11 | param principalType string = 'ServicePrincipal' 12 | param roleDefinitionId string 13 | 14 | resource role 'Microsoft.Authorization/roleAssignments@2022-04-01' = { 15 | name: guid(subscription().id, resourceGroup().id, principalId, roleDefinitionId) 16 | properties: { 17 | principalId: principalId 18 | principalType: principalType 19 | roleDefinitionId: resourceId('Microsoft.Authorization/roleDefinitions', roleDefinitionId) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /infra/main.bicep: -------------------------------------------------------------------------------- 1 | targetScope = 'subscription' 2 | 3 | @minLength(1) 4 | @maxLength(64) 5 | @description('Name of the environment which is used to generate a short unique hash used in all resources.') 6 | param environmentName string 7 | 8 | @minLength(1) 9 | @description('Location for the OpenAI resource') 10 | @allowed(['australiaeast', 'canadaeast', 'francecentral', 'swedencentral', 'switzerlandnorth']) 11 | @metadata({ 12 | azd: { 13 | type: 'location' 14 | } 15 | }) 16 | param location string 17 | 18 | @allowed(['azure', 'openai']) 19 | param openAiHost string // Set in main.parameters.json 20 | @description('Name of the OpenAI resource group.
If not specified, the resource group name will be generated.') 21 | param openAiResourceGroupName string = '' 22 | 23 | param openAiServiceName string = '' 24 | 25 | param openAiSkuName string = 'S0' 26 | 27 | param openAiApiKey string = '' 28 | param openAiApiOrganization string = '' 29 | 30 | param evalGptDeploymentName string = 'eval' 31 | param evalGptModelName string = 'gpt-4' 32 | param evalGptModelVersion string = '0613' 33 | param evalGptDeploymentCapacity int = 30 34 | 35 | @description('Id of the user or app to assign application roles') 36 | param principalId string = '' 37 | 38 | @description('Whether the deployment is running on GitHub Actions') 39 | param runningOnGh string = '' 40 | 41 | var resourceToken = toLower(uniqueString(subscription().id, environmentName, location)) 42 | var prefix = '${environmentName}${resourceToken}' 43 | var tags = { 'azd-env-name': environmentName } 44 | 45 | // Organize resources in a resource group 46 | resource resourceGroup 'Microsoft.Resources/resourceGroups@2021-04-01' = if (empty(openAiResourceGroupName)) { 47 | name: '${prefix}-rg' 48 | location: location 49 | tags: tags 50 | } 51 | 52 | resource openAiResourceGroup 'Microsoft.Resources/resourceGroups@2021-04-01' existing = if (!empty(openAiResourceGroupName)) { 53 | name: !empty(openAiResourceGroupName) ? openAiResourceGroupName : resourceGroup.name 54 | } 55 | 56 | module openAi 'core/ai/cognitiveservices.bicep' = if (openAiHost == 'azure') { 57 | name: 'openai' 58 | scope: openAiResourceGroup 59 | params: { 60 | name: !empty(openAiServiceName) ? openAiServiceName : '${prefix}-openai' 61 | location: location 62 | tags: tags 63 | sku: { 64 | name: openAiSkuName 65 | } 66 | deployments: [{ 67 | name: evalGptDeploymentName 68 | model: { 69 | format: 'OpenAI' 70 | name: evalGptModelName 71 | version: evalGptModelVersion 72 | } 73 | sku: { 74 | name: 'Standard' 75 | capacity: evalGptDeploymentCapacity 76 | } 77 | }] 78 | disableLocalAuth: true 79 | } 80 | } 81 | 82 | 83 | // USER ROLES 84 | var principalType = empty(runningOnGh) ? 'User' : 'ServicePrincipal' 85 | 86 | module openAiRoleUser 'core/security/role.bicep' = if (openAiHost == 'azure') { 87 | scope: openAiResourceGroup 88 | name: 'openai-role-user' 89 | params: { 90 | principalId: principalId 91 | roleDefinitionId: '5e0bd9bd-7b93-4f28-af87-19fc36ad61bd' 92 | principalType: principalType 93 | } 94 | } 95 | 96 | 97 | output AZURE_LOCATION string = location 98 | output AZURE_TENANT_ID string = tenant().tenantId 99 | output AZURE_RESOURCE_GROUP string = resourceGroup.name 100 | 101 | // Shared by all OpenAI deployments 102 | output OPENAI_HOST string = openAiHost 103 | output OPENAI_GPT_MODEL string = evalGptModelName 104 | // Specific to Azure OpenAI 105 | output AZURE_OPENAI_SERVICE string = (openAiHost == 'azure') ? openAi.outputs.name : '' 106 | output AZURE_OPENAI_RESOURCE_GROUP string = (openAiHost == 'azure') ? openAiResourceGroup.name : '' 107 | output AZURE_OPENAI_EVAL_DEPLOYMENT string = (openAiHost == 'azure') ? evalGptDeploymentName : '' 108 | output AZURE_OPENAI_ENDPOINT string = (openAiHost == 'azure') ? openAi.outputs.endpoint : '' 109 | // Used only with non-Azure OpenAI deployments 110 | output OPENAI_KEY string = (openAiHost == 'openai') ? openAiApiKey : '' 111 | output OPENAI_ORGANIZATION string = (openAiHost == 'openai') ? 
openAiApiOrganization : '' 112 | -------------------------------------------------------------------------------- /infra/main.parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "environmentName": { 6 | "value": "${AZURE_ENV_NAME}" 7 | }, 8 | "location": { 9 | "value": "${AZURE_LOCATION}" 10 | }, 11 | "principalId": { 12 | "value": "${AZURE_PRINCIPAL_ID}" 13 | }, 14 | "openAiHost":{ 15 | "value": "${OPENAI_HOST=azure}" 16 | }, 17 | "openAiServiceName": { 18 | "value": "${AZURE_OPENAI_SERVICE}" 19 | }, 20 | "openAiResourceGroupName": { 21 | "value": "${AZURE_OPENAI_RESOURCE_GROUP}" 22 | }, 23 | "evalGptDeploymentCapacity": { 24 | "value": "${EVAL_GPT_DEPLOYMENT_CAPACITY}" 25 | }, 26 | "openAiSkuName": { 27 | "value": "S0" 28 | }, 29 | "openAiApiKey": { 30 | "value": "${OPENAI_KEY}" 31 | }, 32 | "openAiApiOrganization": { 33 | "value": "${OPENAI_ORGANIZATION}" 34 | }, 35 | "runningOnGh": { 36 | "value": "${GITHUB_ACTIONS}" 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | line-length = 120 3 | target-version = "py39" 4 | lint.isort.known-first-party = ["evaltools"] 5 | lint.select = ["E", "F", "I", "UP"] 6 | lint.ignore = ["D203"] 7 | 8 | [tool.black] 9 | line-length = 120 10 | target-version = ["py39"] 11 | 12 | [tool.pytest.ini_options] 13 | addopts = "-ra" 14 | 15 | [project] 16 | name = "evaltools" 17 | version = "0.1.1" 18 | description = "Evaluate chat applications using Azure OpenAI evaluators" 19 | dependencies = [ 20 | "requests", 21 | "python-dotenv", 22 | "azure-ai-evaluation==1.8.0", 23 | "marshmallow==3.23.2", # Older version required due to promptflow issue with _T import 24 | "azure-search-documents", 25 | "typer", 26 | "openai>=1.56.1", # Includes fix for httpx proxies issues 27 | "pandas", 28 | "rich", 29 | "jmespath", 30 | "textual" 31 | ] 32 | 33 | [project.optional-dependencies] 34 | dev = [ 35 | "pre-commit", 36 | "ruff", 37 | "black", 38 | "pytest" 39 | ] 40 | 41 | [tool.setuptools.package-data] 42 | evaltools = ["review/*.tcss"] 43 | -------------------------------------------------------------------------------- /src/evaltools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/ai-rag-chat-evaluator/655d0c40e2ee92a07064cd5b7ae65d14d2b7c679/src/evaltools/__init__.py -------------------------------------------------------------------------------- /src/evaltools/__main__.py: -------------------------------------------------------------------------------- 1 | """Enables the use of `python -m evaltools` to run the CLI.""" 2 | 3 | from .cli import app 4 | 5 | if __name__ == "__main__": 6 | app() 7 | -------------------------------------------------------------------------------- /src/evaltools/cli.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | 4 | import dotenv 5 | import typer 6 | from rich.logging import RichHandler 7 | 8 | from evaltools import service_setup 9 | from evaltools.eval.evaluate import run_evaluate_from_config 10 | from evaltools.gen.generate import generate_dontknows_qa_data, 
generate_test_qa_data_for_search_index 11 | from evaltools.review import diff_app, diff_markdown, summary_app, summary_markdown 12 | 13 | app = typer.Typer(pretty_exceptions_enable=False) 14 | 15 | logging.basicConfig( 16 | level=logging.WARNING, format="%(message)s", datefmt="[%X]", handlers=[RichHandler(rich_tracebacks=True)] 17 | ) 18 | logger = logging.getLogger("evaltools") 19 | # We only set the level to INFO for our logger, 20 | # to avoid seeing the noisy INFO level logs from the Azure SDKs 21 | logger.setLevel(logging.INFO) 22 | 23 | dotenv.load_dotenv(override=True) 24 | 25 | 26 | def int_or_none(raw: str) -> int | None: 27 | return None if raw == "None" else int(raw) 28 | 29 | 30 | def str_or_none(raw: str) -> str | None: 31 | return None if raw == "None" else raw 32 | 33 | 34 | def path_or_none(raw: str) -> Path | None: 35 | return None if raw == "None" else Path(raw) 36 | 37 | 38 | @app.command() 39 | def generate( 40 | output: Path = typer.Option(exists=False, dir_okay=False, file_okay=True), 41 | numquestions: int = typer.Option(help="Number of questions to generate", default=200), 42 | persource: int = typer.Option(help="Number of questions to generate per source", default=5), 43 | citationfieldname: str = typer.Option(help="Name of citation field in AI Search index", default="sourcepage"), 44 | ): 45 | generate_test_qa_data_for_search_index( 46 | openai_config=service_setup.get_openai_config_dict(), 47 | search_client=service_setup.get_search_client(), 48 | num_questions_total=numquestions, 49 | num_questions_per_source=persource, 50 | output_file=Path.cwd() / output, 51 | citation_field_name=citationfieldname, 52 | ) 53 | 54 | 55 | @app.command() 56 | def generate_dontknows( 57 | input: Path = typer.Option(exists=True, dir_okay=False, file_okay=True), 58 | output: Path = typer.Option(exists=False, dir_okay=False, file_okay=True), 59 | numquestions: int = typer.Option(help="Number of questions to generate", default=40), 60 | ): 61 | generate_dontknows_qa_data( 62 | openai_config=service_setup.get_openai_config(), 63 | num_questions_total=numquestions, 64 | input_file=Path.cwd() / input, 65 | output_file=Path.cwd() / output, 66 | ) 67 | 68 | 69 | @app.command() 70 | def evaluate( 71 | config: Path = typer.Option( 72 | exists=True, dir_okay=False, file_okay=True, help="Path to config.json", default="config.json" 73 | ), 74 | numquestions: int | None = typer.Option( 75 | help="Number of questions to evaluate (defaults to all if not specified).", default=None, parser=int_or_none 76 | ), 77 | targeturl: str | None = typer.Option( 78 | help="URL of the target service to evaluate against (defaults to the one in the config).", 79 | default=None, 80 | parser=str_or_none, 81 | ), 82 | resultsdir: Path = typer.Option( 83 | help="Directory to save the results of the evaluation", default=None, parser=path_or_none 84 | ), 85 | ): 86 | run_evaluate_from_config(Path.cwd(), config, numquestions, targeturl, resultsdir) 87 | 88 | 89 | def str_or_none(value: str) -> str | None: 90 | return value if value != "None" else None 91 | 92 | 93 | @app.command() 94 | def diff( 95 | directory1: Path = typer.Argument(exists=True, dir_okay=True, file_okay=False), 96 | directory2: Path = typer.Argument(default=None, exists=True, dir_okay=True, file_okay=False), 97 | changed: str | None = typer.Option( 98 | help="Show only questions whose values changed for the given column", default=None, parser=str_or_none 99 | ), 100 | output: str | None = typer.Option(help="Output type, can be 'app' or 'markdown'",
default=None, parser=str_or_none), 101 | ): 102 | directories = [directory1] if directory2 is None else [directory1, directory2] 103 | if output == "markdown": 104 | print(diff_markdown.main(directories, changed)) 105 | else: 106 | diff_app.main(directories, changed) 107 | 108 | 109 | @app.command() 110 | def summary( 111 | results_dir: Path = typer.Argument(exists=True, dir_okay=True, file_okay=False), 112 | output: str | None = typer.Option(help="Output type, can be 'app' or 'markdown'", default=None, parser=str_or_none), 113 | highlight: str | None = typer.Option( 114 | help="Highlight a specific run in the summary", default=None, parser=str_or_none 115 | ), 116 | ): 117 | if output == "markdown": 118 | print(summary_markdown.main(results_dir, highlight_run=highlight)) 119 | else: 120 | summary_app.main(results_dir) 121 | 122 | 123 | def cli(): 124 | app() 125 | -------------------------------------------------------------------------------- /src/evaltools/eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/ai-rag-chat-evaluator/655d0c40e2ee92a07064cd5b7ae65d14d2b7c679/src/evaltools/eval/__init__.py -------------------------------------------------------------------------------- /src/evaltools/eval/evaluate.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import time 5 | from pathlib import Path 6 | 7 | import jmespath 8 | import pandas as pd 9 | import requests 10 | from rich.progress import track 11 | 12 | from evaltools import service_setup 13 | 14 | from .evaluate_metrics import metrics_by_name 15 | 16 | logger = logging.getLogger("evaltools") 17 | 18 | 19 | def send_question_to_target( 20 | question: str, 21 | url: str, 22 | parameters: dict = {}, 23 | raise_error=False, 24 | response_answer_jmespath="message.content", 25 | response_context_jmespath="context.data_points.text", 26 | ): 27 | headers = {"Content-Type": "application/json"} 28 | body = { 29 | "messages": [{"content": question, "role": "user"}], 30 | "context": parameters, 31 | } 32 | try: 33 | r = requests.post(url, headers=headers, json=body) 34 | r.encoding = "utf-8" 35 | 36 | latency = r.elapsed.total_seconds() 37 | 38 | try: 39 | response_dict = r.json() 40 | except json.JSONDecodeError: 41 | raise ValueError( 42 | f"Response from target {url} is not valid JSON:\n\n{r.text} \n" 43 | "Make sure that your configuration points at a chat endpoint that returns a single JSON object.\n" 44 | ) 45 | 46 | try: 47 | answer = jmespath.search(response_answer_jmespath, response_dict) 48 | data_points = jmespath.search(response_context_jmespath, response_dict) 49 | if isinstance(data_points, dict): 50 | context = json.dumps(data_points, ensure_ascii=False) 51 | elif isinstance(data_points, list): 52 | context = "\n\n".join(data_points) 53 | elif data_points is not None: 54 | # Hopefully it's a string 55 | context = data_points 56 | else: 57 | raise ValueError("Context is missing") 58 | except Exception: 59 | raise ValueError( 60 | "Response does not adhere to the expected schema. " 61 | f"The answer should be accessible via the JMESPath expression '{response_answer_jmespath}' " 62 | f"and the context should be accessible via the JMESPath expression '{response_context_jmespath}'. 
" 63 | "Either adjust the app response or adjust send_question_to_target() in evaluate.py " 64 | f"to match the actual schema.\nResponse: {response_dict}" 65 | ) 66 | 67 | response_obj = {"answer": answer, "context": context, "latency": latency} 68 | return response_obj 69 | except Exception as e: 70 | if raise_error: 71 | raise e 72 | return { 73 | "answer": str(e), 74 | "context": str(e), 75 | "latency": -1, 76 | } 77 | 78 | 79 | def truncate_for_log(s: str, max_length=50): 80 | return s if len(s) < max_length else s[:max_length] + "..." 81 | 82 | 83 | def load_jsonl(path: Path) -> list[dict]: 84 | with open(path, encoding="utf-8") as f: 85 | return [json.loads(line) for line in f.readlines()] 86 | 87 | 88 | def run_evaluation( 89 | openai_config: dict, 90 | testdata_path: Path, 91 | results_dir: Path, 92 | target_url: str, 93 | target_parameters={}, 94 | requested_metrics=[], 95 | num_questions=None, 96 | target_response_answer_jmespath=None, 97 | target_response_context_jmespath=None, 98 | model=None, 99 | azure_credential=None, 100 | ): 101 | logger.info("Running evaluation using data from %s", testdata_path) 102 | testdata = load_jsonl(testdata_path) 103 | if num_questions: 104 | logger.info("Limiting evaluation to %s questions", num_questions) 105 | testdata = testdata[:num_questions] 106 | 107 | logger.info("Sending a test question to the target to ensure it is running...") 108 | try: 109 | question = "What information is in your knowledge base?" 110 | target_data = send_question_to_target( 111 | question, 112 | target_url, 113 | target_parameters, 114 | raise_error=True, 115 | response_answer_jmespath=target_response_answer_jmespath, 116 | response_context_jmespath=target_response_context_jmespath, 117 | ) 118 | logger.info( 119 | 'Successfully received response from target for question: "%s"\n"answer": "%s"\n"context": "%s"', 120 | truncate_for_log(question), 121 | truncate_for_log(target_data["answer"]), 122 | truncate_for_log(target_data["context"]), 123 | ) 124 | except Exception as e: 125 | logger.error("Failed to send a test question to the target due to error: \n%s", e) 126 | return False 127 | 128 | logger.info("Sending a test chat completion to the GPT deployment to ensure it is running...") 129 | gpt_response = service_setup.get_openai_client(openai_config, azure_credential).chat.completions.create( 130 | model=model, 131 | messages=[{"role": "user", "content": "Hello!"}], 132 | n=1, 133 | ) 134 | logger.info('Successfully received response from GPT: "%s"', gpt_response.choices[0].message.content) 135 | 136 | logger.info("Starting evaluation...") 137 | for metric in requested_metrics: 138 | if metric not in metrics_by_name: 139 | logger.error(f"Requested metric {metric} is not available. 
Available metrics: {metrics_by_name.keys()}") 140 | return False 141 | 142 | requested_metrics = [ 143 | metrics_by_name[metric_name] for metric_name in requested_metrics if metric_name in metrics_by_name 144 | ] 145 | 146 | def evaluate_row(row): 147 | output = {} 148 | output["question"] = row["question"] 149 | output["truth"] = row["truth"] 150 | target_response = send_question_to_target( 151 | question=row["question"], 152 | url=target_url, 153 | parameters=target_parameters, 154 | response_answer_jmespath=target_response_answer_jmespath, 155 | response_context_jmespath=target_response_context_jmespath, 156 | ) 157 | output.update(target_response) 158 | for metric in requested_metrics: 159 | result = metric.evaluator_fn(openai_config=openai_config)( 160 | query=row["question"], 161 | response=output["answer"], 162 | context=output["context"], 163 | ground_truth=row["truth"], 164 | ) 165 | output.update(result) 166 | 167 | return output 168 | 169 | # Run evaluations in serial to avoid rate limiting 170 | questions_with_ratings = [] 171 | for row in track(testdata, description="Processing..."): 172 | questions_with_ratings.append(evaluate_row(row)) 173 | 174 | logger.info("Evaluation calls have completed. Calculating overall metrics now...") 175 | # Make the results directory if it doesn't exist 176 | results_dir.mkdir(parents=True, exist_ok=True) 177 | # Save the results 178 | with open(results_dir / "eval_results.jsonl", "w", encoding="utf-8") as results_file: 179 | for row in questions_with_ratings: 180 | results_file.write(json.dumps(row, ensure_ascii=False) + "\n") 181 | 182 | # Calculate aggregate metrics 183 | df = pd.DataFrame(questions_with_ratings) 184 | summary = {} 185 | for metric in requested_metrics: 186 | summary[metric.METRIC_NAME] = metric.get_aggregate_stats(df) 187 | # add a metric for the number of questions 188 | summary["num_questions"] = {"total": len(df)} 189 | 190 | # summary statistics 191 | with open(results_dir / "summary.json", "w", encoding="utf-8") as summary_file: 192 | summary_file.write(json.dumps(summary, indent=4)) 193 | 194 | with open(results_dir / "evaluate_parameters.json", "w", encoding="utf-8") as parameters_file: 195 | parameters = { 196 | "evaluation_gpt_model": model, 197 | "evaluation_timestamp": int(time.time()), 198 | "testdata_path": str(testdata_path), 199 | "target_url": target_url, 200 | "target_parameters": target_parameters, 201 | "num_questions": num_questions, 202 | } 203 | parameters_file.write(json.dumps(parameters, indent=4)) 204 | logger.info("Evaluation results saved in %s", results_dir) 205 | return True 206 | 207 | 208 | def process_config(obj: dict): 209 | """Replace special markers in a config dict with their values: 210 | * <TIMESTAMP> with current timestamp 211 | * <READFILE> with contents of file 212 | """ 213 | if isinstance(obj, dict): 214 | for key in obj: 215 | if isinstance(obj[key], dict): 216 | process_config(obj[key]) 217 | elif isinstance(obj[key], str) and "<TIMESTAMP>" in obj[key]: 218 | logger.info("Replaced %s in config with timestamp", key) 219 | obj[key] = obj[key].replace("<TIMESTAMP>", str(int(time.time()))) 220 | elif isinstance(obj[key], str) and "<READFILE>" in obj[key]: 221 | with open(obj[key].replace("<READFILE>", ""), encoding="utf-8") as f: 222 | logger.info("Replaced %s in config with contents of %s", key, f.name) 223 | obj[key] = f.read() 224 | 225 | 226 | def run_evaluate_from_config( 227 | working_dir, 228 | config_path, 229 | num_questions=None, 230 | target_url=None, 231 | results_dir=None, 232 | openai_config=None, 233 | model=None, 234 |
azure_credential=None, 235 | ): 236 | config_path = working_dir / Path(config_path) 237 | logger.info("Running evaluation from config %s", config_path) 238 | with open(config_path, encoding="utf-8") as f: 239 | config = json.load(f) 240 | process_config(config) 241 | 242 | if results_dir is None: 243 | results_dir = working_dir / Path(config["results_dir"]) 244 | 245 | evaluation_run_complete = run_evaluation( 246 | openai_config=openai_config or service_setup.get_openai_config(), 247 | testdata_path=working_dir / config["testdata_path"], 248 | results_dir=results_dir, 249 | target_url=target_url or config["target_url"], 250 | target_parameters=config.get("target_parameters", {}), 251 | num_questions=num_questions, 252 | requested_metrics=config.get( 253 | "requested_metrics", 254 | ["gpt_groundedness", "gpt_relevance", "gpt_coherence", "answer_length", "latency"], 255 | ), 256 | target_response_answer_jmespath=config.get("target_response_answer_jmespath", "message.content"), 257 | target_response_context_jmespath=config.get("target_response_context_jmespath", "context.data_points.text"), 258 | model=model or os.environ["OPENAI_GPT_MODEL"], 259 | azure_credential=azure_credential, 260 | ) 261 | 262 | if evaluation_run_complete: 263 | results_config_path = results_dir / "config.json" 264 | logger.info("Saving original config file back to to %s", results_config_path) 265 | with open(config_path, encoding="utf-8") as input_config: 266 | with open(results_config_path, "w", encoding="utf-8") as output_config: 267 | output_config.write(input_config.read()) 268 | else: 269 | logger.error("Evaluation was terminated early due to an error ⬆") 270 | -------------------------------------------------------------------------------- /src/evaltools/eval/evaluate_metrics/__init__.py: -------------------------------------------------------------------------------- 1 | from .builtin_metrics import ( 2 | BuiltinCoherenceMetric, 3 | BuiltinF1ScoreMetric, 4 | BuiltinFluencyMetric, 5 | BuiltinGroundednessMetric, 6 | BuiltinRelevanceMetric, 7 | BuiltinSimilarityMetric, 8 | ) 9 | from .code_metrics import AnswerLengthMetric, CitationMatchMetric, HasCitationMetric, LatencyMetric 10 | from .prompt_metrics import CoherenceMetric, DontKnownessMetric, GroundednessMetric, RelevanceMetric 11 | 12 | metrics = [ 13 | BuiltinCoherenceMetric, 14 | BuiltinRelevanceMetric, 15 | BuiltinGroundednessMetric, 16 | BuiltinSimilarityMetric, 17 | BuiltinFluencyMetric, 18 | BuiltinF1ScoreMetric, 19 | CoherenceMetric, 20 | RelevanceMetric, 21 | GroundednessMetric, 22 | DontKnownessMetric, 23 | LatencyMetric, 24 | AnswerLengthMetric, 25 | HasCitationMetric, 26 | CitationMatchMetric, 27 | ] 28 | 29 | metrics_by_name = {metric.METRIC_NAME: metric for metric in metrics} 30 | 31 | 32 | def register_metric(metric_class): 33 | """Register a new custom metric class.""" 34 | if not hasattr(metric_class, "METRIC_NAME"): 35 | raise ValueError("Metric class must have a METRIC_NAME attribute") 36 | # Check if the metric name is already registered 37 | if metric_class.METRIC_NAME in metrics_by_name: 38 | raise ValueError(f"Metric with name {metric_class.METRIC_NAME} is already registered") 39 | metrics_by_name[metric_class.METRIC_NAME] = metric_class 40 | -------------------------------------------------------------------------------- /src/evaltools/eval/evaluate_metrics/base_metric.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from abc import ABC, abstractmethod 3 | 4 | import pandas as 
pd 5 | 6 | logger = logging.getLogger("evaltools") 7 | 8 | 9 | class BaseMetric(ABC): 10 | METRIC_NAME = "name_of_metric" 11 | 12 | @classmethod 13 | @abstractmethod 14 | def get_aggregate_stats(cls, df): 15 | """Returns a dictionary of aggregate statistics for the metric""" 16 | pass 17 | 18 | @classmethod 19 | def get_aggregate_stats_for_numeric_rating(cls, df, rating_column_name): 20 | # Narrow down dataframe to just the metric 21 | df = df[[rating_column_name]] 22 | 23 | # Drop invalid ratings - strings like "Failed" 24 | rows_before = len(df) 25 | df = df.apply(pd.to_numeric, errors="coerce") 26 | df = df.dropna() 27 | rows_after = len(df) 28 | if rows_before != rows_after: 29 | logger.warning( 30 | "Dropped %d invalid ratings for metric %s", 31 | rows_before - rows_after, 32 | rating_column_name, 33 | ) 34 | 35 | # Count how many ratings passed threshold of 4+ 36 | pass_count = int(df[rating_column_name].apply(lambda rating: rating >= 4).sum()) 37 | 38 | return { 39 | "pass_count": pass_count, 40 | "pass_rate": round(pass_count / rows_before, 2), 41 | "mean_rating": round(df[rating_column_name].mean(), 2), 42 | } 43 | -------------------------------------------------------------------------------- /src/evaltools/eval/evaluate_metrics/builtin_metrics.py: -------------------------------------------------------------------------------- 1 | from azure.ai.evaluation import ( 2 | CoherenceEvaluator, 3 | F1ScoreEvaluator, 4 | FluencyEvaluator, 5 | GroundednessEvaluator, 6 | RelevanceEvaluator, 7 | SimilarityEvaluator, 8 | ) 9 | 10 | from .base_metric import BaseMetric 11 | 12 | 13 | class BuiltinRatingMetric(BaseMetric): 14 | @classmethod 15 | def get_aggregate_stats(cls, df): 16 | return cls.get_aggregate_stats_for_numeric_rating(df, cls.METRIC_NAME) 17 | 18 | 19 | class BuiltinRelevanceMetric(BuiltinRatingMetric): 20 | METRIC_NAME = "gpt_relevance" 21 | 22 | @classmethod 23 | def evaluator_fn(cls, openai_config, **kwargs): 24 | return RelevanceEvaluator(openai_config) 25 | 26 | 27 | class BuiltinCoherenceMetric(BuiltinRatingMetric): 28 | METRIC_NAME = "gpt_coherence" 29 | 30 | @classmethod 31 | def evaluator_fn(cls, openai_config, **kwargs): 32 | return CoherenceEvaluator(openai_config) 33 | 34 | 35 | class BuiltinGroundednessMetric(BuiltinRatingMetric): 36 | METRIC_NAME = "gpt_groundedness" 37 | 38 | @classmethod 39 | def evaluator_fn(cls, openai_config, **kwargs): 40 | return GroundednessEvaluator(openai_config) 41 | 42 | 43 | class BuiltinSimilarityMetric(BuiltinRatingMetric): 44 | METRIC_NAME = "gpt_similarity" 45 | 46 | @classmethod 47 | def evaluator_fn(cls, openai_config, **kwargs): 48 | return SimilarityEvaluator(openai_config) 49 | 50 | 51 | class BuiltinFluencyMetric(BuiltinRatingMetric): 52 | METRIC_NAME = "gpt_fluency" 53 | 54 | @classmethod 55 | def evaluator_fn(cls, openai_config, **kwargs): 56 | return FluencyEvaluator(openai_config) 57 | 58 | 59 | class BuiltinF1ScoreMetric(BaseMetric): 60 | METRIC_NAME = "f1_score" 61 | 62 | @classmethod 63 | def evaluator_fn(cls, **kwargs): 64 | return F1ScoreEvaluator() 65 | 66 | @classmethod 67 | def get_aggregate_stats(cls, df): 68 | return { 69 | "mean": round(df[cls.METRIC_NAME].mean(), 2), 70 | "max": round(df[cls.METRIC_NAME].max(), 2), 71 | "min": round(df[cls.METRIC_NAME].min(), 2), 72 | } 73 | -------------------------------------------------------------------------------- /src/evaltools/eval/evaluate_metrics/code_metrics.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 
import re 3 | 4 | from .base_metric import BaseMetric 5 | 6 | logger = logging.getLogger("evaltools") 7 | 8 | 9 | class AnswerLengthMetric(BaseMetric): 10 | METRIC_NAME = "answer_length" 11 | 12 | @classmethod 13 | def evaluator_fn(cls, **kwargs): 14 | def answer_length(*, response, **kwargs): 15 | if response is None: 16 | logger.warning("Received response of None, can't compute answer_length metric. Setting to -1.") 17 | return {cls.METRIC_NAME: -1} 18 | return {cls.METRIC_NAME: len(response)} 19 | 20 | return answer_length 21 | 22 | @classmethod 23 | def get_aggregate_stats(cls, df): 24 | # remove -1 values from the mean calculation 25 | df = df[df[cls.METRIC_NAME] != -1] 26 | return { 27 | "mean": round(df[cls.METRIC_NAME].mean(), 2), 28 | "max": int(df[cls.METRIC_NAME].max()), 29 | "min": int(df[cls.METRIC_NAME].min()), 30 | } 31 | 32 | 33 | class HasCitationMetric(BaseMetric): 34 | METRIC_NAME = "has_citation" 35 | 36 | @classmethod 37 | def evaluator_fn(cls, **kwargs): 38 | def has_citation(*, response, **kwargs): 39 | if response is None: 40 | logger.warning("Received response of None, can't compute has_citation metric. Setting to -1.") 41 | return {cls.METRIC_NAME: -1} 42 | return {cls.METRIC_NAME: bool(re.search(r"\[[^\]]+\]", response))} 43 | 44 | return has_citation 45 | 46 | @classmethod 47 | def get_aggregate_stats(cls, df): 48 | df = df[df[cls.METRIC_NAME] != -1] 49 | return { 50 | "total": int(df[cls.METRIC_NAME].sum()), 51 | "rate": round(df[cls.METRIC_NAME].mean(), 2), 52 | } 53 | 54 | 55 | class CitationMatchMetric(BaseMetric): 56 | METRIC_NAME = "citation_match" 57 | 58 | @classmethod 59 | def evaluator_fn(cls, **kwargs): 60 | def citation_match(*, response, ground_truth, **kwargs): 61 | if response is None: 62 | logger.warning("Received response of None, can't compute citation_match metric. 
Setting to -1.") 63 | return {cls.METRIC_NAME: -1} 64 | # Return true if all citations in the truth are present in the response 65 | truth_citations = set(re.findall(r"\[([^\]]+)\.\w{3,4}(#page=\d+)*\]", ground_truth)) 66 | response_citations = set(re.findall(r"\[([^\]]+)\.\w{3,4}(#page=\d+)*\]", response)) 67 | citation_match = truth_citations.issubset(response_citations) 68 | return {cls.METRIC_NAME: citation_match} 69 | 70 | return citation_match 71 | 72 | @classmethod 73 | def get_aggregate_stats(cls, df): 74 | df = df[df[cls.METRIC_NAME] != -1] 75 | return { 76 | "total": int(df[cls.METRIC_NAME].sum()), 77 | "rate": round(df[cls.METRIC_NAME].mean(), 2), 78 | } 79 | 80 | 81 | class LatencyMetric(BaseMetric): 82 | METRIC_NAME = "latency" 83 | 84 | @classmethod 85 | def evaluator_fn(cls, **kwargs): 86 | def latency(**kwargs): 87 | # Return no additional data, since latency is already stored in the target response 88 | return {} 89 | 90 | return latency 91 | 92 | @classmethod 93 | def get_aggregate_stats(cls, df): 94 | return { 95 | "mean": round(df[cls.METRIC_NAME].mean(), 2), 96 | "max": df[cls.METRIC_NAME].max(), 97 | "min": df[cls.METRIC_NAME].min(), 98 | } 99 | -------------------------------------------------------------------------------- /src/evaltools/eval/evaluate_metrics/prompt_metrics.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | from promptflow.client import load_flow 7 | 8 | from .base_metric import BaseMetric 9 | 10 | PROMPT_TEMPLATE_DIR = Path(__file__).resolve().parent / "prompts" 11 | 12 | logger = logging.getLogger("evaltools") 13 | 14 | 15 | class PromptBasedEvaluator: 16 | def __init__(self, model_config, path, name): 17 | prompty_model_config = {"configuration": model_config} 18 | self._name = name 19 | self._flow = load_flow(source=path, model=prompty_model_config) 20 | 21 | def __call__(self, **kwargs) -> dict: 22 | llm_output = self._flow(**kwargs) 23 | 24 | score = np.nan 25 | if llm_output: 26 | match = re.search(r"\d", llm_output) 27 | if match: 28 | score = float(match.group()) 29 | else: 30 | logging.warning( 31 | "No score found in answer: %s\nMake sure prompty file is correctly formatted.", llm_output 32 | ) 33 | 34 | output = {} 35 | output[self._name] = float(score) 36 | return output 37 | 38 | 39 | class CustomRatingMetric(BaseMetric): 40 | @classmethod 41 | def evaluator_fn(cls, openai_config, **kwargs): 42 | return PromptBasedEvaluator( 43 | openai_config, path=PROMPT_TEMPLATE_DIR / f"{cls.METRIC_NAME}.prompty", name=cls.METRIC_NAME 44 | ) 45 | 46 | @classmethod 47 | def get_aggregate_stats(cls, df): 48 | return cls.get_aggregate_stats_for_numeric_rating(df, cls.METRIC_NAME) 49 | 50 | 51 | class RelevanceMetric(CustomRatingMetric): 52 | METRIC_NAME = "myrelevance" 53 | 54 | 55 | class CoherenceMetric(CustomRatingMetric): 56 | METRIC_NAME = "mycoherence" 57 | 58 | 59 | class GroundednessMetric(CustomRatingMetric): 60 | METRIC_NAME = "mygroundedness" 61 | 62 | 63 | class DontKnownessMetric(CustomRatingMetric): 64 | METRIC_NAME = "dontknowness" 65 | -------------------------------------------------------------------------------- /src/evaltools/eval/evaluate_metrics/prompts/dontknowness.prompty: -------------------------------------------------------------------------------- 1 | --- 2 | name: DontKnowness Evaluation 3 | description: Evaluates don't-know-ness of an answer 4 | model: 5 | api: chat 6 | configuration: 7 | type: 
azure_openai 8 | azure_deployment: ${env:AZURE_OPENAI_EVAL_DEPLOYMENT} 9 | azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT} 10 | parameters: 11 | temperature: 0.0 12 | max_tokens: 1 13 | top_p: 1.0 14 | presence_penalty: 0 15 | frequency_penalty: 0 16 | response_format: 17 | type: text 18 | 19 | inputs: 20 | question: 21 | type: string 22 | answer: 23 | type: string 24 | 25 | sample: 26 | question: What are the main goals of Perseverance Mars rover mission? 27 | answer: The main goals of the Perseverance Mars rover mission are to search for signs of ancient life and collect rock and soil samples for possible return to Earth. 28 | --- 29 | system: 30 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information. 31 | 32 | user: 33 | The "I don't know"-ness metric is a measure of how much an answer conveys the lack of knowledge or uncertainty, which is useful for making sure a chatbot for a particular domain doesn't answer outside that domain. Score the I-dont-know-ness of the answer between one to five stars using the following rating scale: 34 | One star: the answer completely answers the question and conveys no uncertainty 35 | Two stars: the answer conveys a little uncertainty but mostly attempts to answer the question 36 | Three stars: the answer conveys some uncertainty but still contains some attempt to answer the question 37 | Four stars: the answer conveys uncertainty and makes no attempt to answer the question 38 | Five stars: the answer says straightforwardly that it doesn't know, and makes no attempt to answer the question. 39 | 40 | This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. 41 | 42 | question: What are the main goals of Perseverance Mars rover mission? 43 | answer: The main goals of the Perseverance Mars rover mission are to search for signs of ancient life and collect rock and soil samples for possible return to Earth. 44 | stars: 1 45 | 46 | question: What field did Marie Curie excel in? 47 | answer: I'm not sure, but I think Marie Curie excelled in the field of science. 48 | stars: 2 49 | 50 | question: What are the main components of the Mediterranean diet? 51 | answer: I don't have an answer in my sources but I think the diet has some fats? 52 | stars: 3 53 | 54 | question: What are the main attractions of the Queen's Royal Castle? 55 | answer: I'm not certain. Perhaps try rephrasing the question? 56 | stars: 4 57 | 58 | question: Where were The Beatles formed? 59 | answer: I'm sorry, I don't know, that answer is not in my sources. 
60 | stars: 5 61 | 62 | question: {{query}} 63 | answer: {{response}} 64 | stars: 65 | -------------------------------------------------------------------------------- /src/evaltools/eval/evaluate_metrics/prompts/mycoherence.prompty: -------------------------------------------------------------------------------- 1 | --- 2 | name: Coherence 3 | description: Evaluates coherence score for QA scenario 4 | model: 5 | api: chat 6 | configuration: 7 | type: azure_openai 8 | azure_deployment: ${env:AZURE_DEPLOYMENT} 9 | azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT} 10 | parameters: 11 | temperature: 0.0 12 | max_tokens: 1 13 | top_p: 1.0 14 | presence_penalty: 0 15 | frequency_penalty: 0 16 | response_format: 17 | type: text 18 | 19 | inputs: 20 | query: 21 | type: string 22 | response: 23 | type: string 24 | 25 | sample: 26 | query: What are the main goals of Perseverance Mars rover mission? 27 | response: The main goals of the Perseverance Mars rover mission are to search for signs of ancient life and collect rock and soil samples for possible return to Earth. 28 | --- 29 | system: 30 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information. 31 | 32 | user: 33 | Coherence of an answer is measured by how well all the sentences fit together and sound naturally as a whole. Consider the overall quality of the answer when evaluating coherence. Given the question and answer, score the coherence of answer between one to five stars using the following rating scale: 34 | One star: the answer completely lacks coherence 35 | Two stars: the answer mostly lacks coherence 36 | Three stars: the answer is partially coherent 37 | Four stars: the answer is mostly coherent 38 | Five stars: the answer has perfect coherency 39 | 40 | This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. 41 | 42 | question: What is your favorite indoor activity and why do you enjoy it? 43 | answer: I like pizza. The sun is shining. 44 | stars: 1 45 | 46 | question: Can you describe your favorite movie without giving away any spoilers? 47 | answer: It is a science fiction movie. There are dinosaurs. The actors eat cake. People must stop the villain. 48 | stars: 2 49 | 50 | question: What are some benefits of regular exercise? 51 | answer: Regular exercise improves your mood. A good workout also helps you sleep better. Trees are green. 52 | stars: 3 53 | 54 | question: How do you cope with stress in your daily life? 55 | answer: I usually go for a walk to clear my head. Listening to music helps me relax as well. Stress is a part of life, but we can manage it through some activities. 56 | stars: 4 57 | 58 | question: What can you tell me about climate change and its effects on the environment? 59 | answer: Climate change has far-reaching effects on the environment. Rising temperatures result in the melting of polar ice caps, contributing to sea-level rise. Additionally, more frequent and severe weather events, such as hurricanes and heatwaves, can cause disruption to ecosystems and human societies alike. 
60 | stars: 5 61 | 62 | question: {{query}} 63 | answer: {{response}} 64 | stars: 65 | -------------------------------------------------------------------------------- /src/evaltools/eval/evaluate_metrics/prompts/mygroundedness.prompty: -------------------------------------------------------------------------------- 1 | --- 2 | name: Groundedness 3 | description: Evaluates groundedness score for QA scenario 4 | model: 5 | api: chat 6 | configuration: 7 | type: azure_openai 8 | azure_deployment: ${env:AZURE_DEPLOYMENT} 9 | azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT} 10 | parameters: 11 | temperature: 0.0 12 | max_tokens: 1 13 | top_p: 1.0 14 | presence_penalty: 0 15 | frequency_penalty: 0 16 | response_format: 17 | type: text 18 | 19 | inputs: 20 | response: 21 | type: string 22 | context: 23 | type: string 24 | 25 | sample: 26 | context: The Mars rover, Perseverance, was launched in 2020 with the main goal of searching for signs of ancient life on Mars. The rover also carries an experiment called MOXIE, which aims to generate oxygen from the Martian atmosphere. 27 | response: The main goals of the Perseverance Mars rover mission are to search for signs of ancient life and collect rock and soil samples for possible return to Earth. 28 | --- 29 | system: 30 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information. 31 | user: 32 | You will be presented with a CONTEXT and an ANSWER about that CONTEXT. You need to decide whether the ANSWER is entailed by the CONTEXT by choosing one of the following rating: 33 | 1. 5: The ANSWER follows logically from the information contained in the CONTEXT. 34 | 2. 1: The ANSWER is logically false from the information contained in the CONTEXT. 35 | 3. an integer score between 1 and 5 and if such integer score does not exist, use 1: It is not possible to determine whether the ANSWER is true or false without further information. Read the passage of information thoroughly and select the correct answer from the three answer labels. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails. Note the ANSWER is generated by a computer system, it can contain certain symbols, which should not be a negative factor in the evaluation. 36 | Independent Examples: 37 | ## Example Task #1 Input: 38 | {"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."} 39 | ## Example Task #1 Output: 40 | 1 41 | ## Example Task #2 Input: 42 | {"CONTEXT": "Ten new television shows appeared during the month of September. Five of the shows were sitcoms, three were hourlong dramas, and two were news-magazine shows. By January, only seven of these new shows were still on the air. 
Five of the shows that remained were sitcoms.", "QUESTION": "", "ANSWER": "At least one of the shows that were cancelled was an hourlong drama."} 43 | ## Example Task #2 Output: 44 | 5 45 | ## Example Task #3 Input: 46 | {"CONTEXT": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is neither French nor English.", "QUESTION": "", "ANSWER": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is not French."} 47 | ## Example Task #3 Output: 48 | 5 49 | ## Example Task #4 Input: 50 | {"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."} 51 | ## Example Task #4 Output: 52 | 1 53 | ## Actual Task Input: 54 | {"CONTEXT": {{context}}, "QUESTION": "", "ANSWER": {{response}}} 55 | Reminder: The return values for each task should be correctly formatted as an integer between 1 and 5. Do not repeat the context and question. 56 | Actual Task Output: 57 | -------------------------------------------------------------------------------- /src/evaltools/eval/evaluate_metrics/prompts/myrelevance.prompty: -------------------------------------------------------------------------------- 1 | --- 2 | name: Relevance 3 | description: Evaluates relevance score for QA scenario 4 | model: 5 | api: chat 6 | configuration: 7 | type: azure_openai 8 | azure_deployment: ${env:AZURE_DEPLOYMENT} 9 | azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT} 10 | parameters: 11 | temperature: 0.0 12 | max_tokens: 1 13 | top_p: 1.0 14 | presence_penalty: 0 15 | frequency_penalty: 0 16 | response_format: 17 | type: text 18 | 19 | inputs: 20 | query: 21 | type: string 22 | response: 23 | type: string 24 | context: 25 | type: string 26 | 27 | sample: 28 | question: What are the main goals of Perseverance Mars rover mission? 29 | answer: The main goals of the Perseverance Mars rover mission are to search for signs of ancient life and collect rock and soil samples for possible return to Earth. 30 | context: The Mars rover, Perseverance, was launched in 2020 with the main goal of searching for signs of ancient life on Mars. The rover also carries an experiment called MOXIE, which aims to generate oxygen from the Martian atmosphere. 31 | --- 32 | system: 33 | You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information. 34 | user: 35 | Relevance measures how well the answer addresses the main aspects of the question, based on the context. Consider whether all and only the important aspects are contained in the answer when evaluating relevance. Given the context and question, score the relevance of the answer between one to five stars using the following rating scale: 36 | One star: the answer completely lacks relevance 37 | Two stars: the answer mostly lacks relevance 38 | Three stars: the answer is partially relevant 39 | Four stars: the answer is mostly relevant 40 | Five stars: the answer has perfect relevance 41 | 42 | This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. 
43 | 44 | context: Marie Curie was a Polish-born physicist and chemist who pioneered research on radioactivity and was the first woman to win a Nobel Prize. 45 | question: What field did Marie Curie excel in? 46 | answer: Marie Curie was a renowned painter who focused mainly on impressionist styles and techniques. 47 | stars: 1 48 | 49 | context: The Beatles were an English rock band formed in Liverpool in 1960, and they are widely regarded as the most influential music band in history. 50 | question: Where were The Beatles formed? 51 | answer: The band The Beatles began their journey in London, England, and they changed the history of music. 52 | stars: 2 53 | 54 | context: The recent Mars rover, Perseverance, was launched in 2020 with the main goal of searching for signs of ancient life on Mars. The rover also carries an experiment called MOXIE, which aims to generate oxygen from the Martian atmosphere. 55 | question: What are the main goals of Perseverance Mars rover mission? 56 | answer: The Perseverance Mars rover mission focuses on searching for signs of ancient life on Mars. 57 | stars: 3 58 | 59 | context: The Mediterranean diet is a commonly recommended dietary plan that emphasizes fruits, vegetables, whole grains, legumes, lean proteins, and healthy fats. Studies have shown that it offers numerous health benefits, including a reduced risk of heart disease and improved cognitive health. 60 | question: What are the main components of the Mediterranean diet? 61 | answer: The Mediterranean diet primarily consists of fruits, vegetables, whole grains, and legumes. 62 | stars: 4 63 | 64 | context: The Queen's Royal Castle is a well-known tourist attraction in the United Kingdom. It spans over 500 acres and contains extensive gardens and parks. The castle was built in the 15th century and has been home to generations of royalty. 65 | question: What are the main attractions of the Queen's Royal Castle? 66 | answer: The main attractions of the Queen's Royal Castle are its expansive 500-acre grounds, extensive gardens, parks, and the historical castle itself, which dates back to the 15th century and has housed generations of royalty. 
67 | stars: 5 68 | 69 | context: {{context}} 70 | question: {{query}} 71 | answer: {{response}} 72 | stars: 73 | -------------------------------------------------------------------------------- /src/evaltools/gen/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/ai-rag-chat-evaluator/655d0c40e2ee92a07064cd5b7ae65d14d2b7c679/src/evaltools/gen/__init__.py -------------------------------------------------------------------------------- /src/evaltools/gen/generate.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import math 4 | import random 5 | from collections.abc import Generator 6 | from pathlib import Path 7 | 8 | from azure.search.documents import SearchClient 9 | 10 | from evaltools import service_setup 11 | 12 | logger = logging.getLogger("evaltools") 13 | 14 | 15 | def generate_test_qa_data( 16 | openai_config: dict, 17 | num_questions_total: int, 18 | num_questions_per_source: int, 19 | output_file: Path, 20 | source_retriever: Generator[dict, None, None], 21 | source_to_text: callable, 22 | answer_formatter: callable, 23 | ): 24 | try: 25 | from azure.ai.generative.synthetic.qa import QADataGenerator, QAType 26 | except ImportError: 27 | logger.error( 28 | "Azure AI Generative package is deprecated and no longer working, so this functionality is disabled." 29 | ) 30 | 31 | logger.info( 32 | "Generating %d questions total, %d per source, based on search results", 33 | num_questions_total, 34 | num_questions_per_source, 35 | ) 36 | qa_generator = QADataGenerator(model_config=openai_config) 37 | 38 | qa: list[dict] = [] 39 | for source in source_retriever(): 40 | if len(qa) > num_questions_total: 41 | logger.info("Generated enough questions already, stopping") 42 | break 43 | result = qa_generator.generate( 44 | text=source_to_text(source), 45 | qa_type=QAType.LONG_ANSWER, 46 | num_questions=num_questions_per_source, 47 | ) 48 | 49 | for question, answer in result["question_answers"]: 50 | qa.append({"question": question, "truth": answer_formatter(answer, source)}) 51 | 52 | logger.info("Writing %d questions to %s", len(qa), output_file) 53 | directory = Path(output_file).parent 54 | if not directory.exists(): 55 | directory.mkdir(parents=True) 56 | with open(output_file, "w", encoding="utf-8") as f: 57 | for item in qa[0:num_questions_total]: 58 | f.write(json.dumps(item) + "\n") 59 | 60 | 61 | def generate_test_qa_data_for_search_index( 62 | openai_config: dict, 63 | num_questions_total: int, 64 | num_questions_per_source: int, 65 | output_file: Path, 66 | search_client: SearchClient, 67 | citation_field_name: str, 68 | ): 69 | def source_retriever() -> Generator[dict, None, None]: 70 | for doc in search_client.search("", top=1000): 71 | logger.info("Processing search document %s", doc[citation_field_name]) 72 | yield doc 73 | 74 | def source_to_text(source) -> str: 75 | return source["content"] 76 | 77 | def answer_formatter(answer, source) -> str: 78 | return f"{answer} [{source[citation_field_name]}]" 79 | 80 | generate_test_qa_data( 81 | openai_config, 82 | num_questions_total, 83 | num_questions_per_source, 84 | output_file, 85 | source_retriever, 86 | source_to_text, 87 | answer_formatter, 88 | ) 89 | 90 | 91 | def generate_based_on_questions(openai_client, model: str, qa: list, num_questions: int, prompt: str): 92 | existing_questions = "" 93 | if qa: 94 | qa = random.sample(qa, len(qa)) # Shuffle questions for 
some randomness 95 | existing_questions = "\n".join([item["question"] for item in qa]) 96 | 97 | gpt_response = openai_client.chat.completions.create( 98 | model=model, 99 | messages=[ 100 | { 101 | "role": "user", 102 | "content": f"{prompt} Only generate {num_questions} TOTAL. Separate each question by a new line. \n{existing_questions}", # noqa: E501 103 | } 104 | ], 105 | n=1, 106 | max_tokens=num_questions * 50, 107 | temperature=0.3, 108 | ) 109 | 110 | qa = [] 111 | for message in gpt_response.choices[0].message.content.split("\n")[0:num_questions]: 112 | qa.append({"question": message, "truth": f"Generated from this prompt: {prompt}"}) 113 | return qa 114 | 115 | 116 | def generate_dontknows_qa_data(openai_config: dict, num_questions_total: int, input_file: Path, output_file: Path): 117 | logger.info("Generating off-topic questions based on %s", input_file) 118 | with open(input_file, encoding="utf-8") as f: 119 | qa = [json.loads(line) for line in f.readlines()] 120 | 121 | openai_client = service_setup.get_openai_client(openai_config) 122 | dontknows_qa = [] 123 | num_questions_each = math.ceil(num_questions_total / 4) 124 | dontknows_qa += generate_based_on_questions( 125 | openai_client, 126 | openai_config.model, 127 | qa, 128 | num_questions_each, 129 | f"Given these questions, suggest {num_questions_each} questions that are very related but are not directly answerable by the same sources. Do not simply ask for other examples of the same thing - your question should be standalone.", # noqa: E501 130 | ) 131 | dontknows_qa += generate_based_on_questions( 132 | openai_client, 133 | openai_config.model, 134 | qa, 135 | num_questions_each, 136 | f"Given these questions, suggest {num_questions_each} questions with similar keywords that are about publicly known facts.", # noqa: E501 137 | ) 138 | dontknows_qa += generate_based_on_questions( 139 | openai_client, 140 | openai_config.model, 141 | qa, 142 | num_questions_each, 143 | f"Given these questions, suggest {num_questions_each} questions that are not related to these topics at all but have well known answers.", # noqa: E501 144 | ) 145 | remaining = num_questions_total - len(dontknows_qa) 146 | dontknows_qa += generate_based_on_questions( 147 | openai_client, 148 | openai_config.model, 149 | qa=None, 150 | num_questions=remaining, 151 | prompt=f"Suggest {remaining} questions that are nonsensical, and would result in confusion if you asked it.", # noqa: E501 152 | ) 153 | 154 | logger.info("Writing %d off-topic questions to %s", len(dontknows_qa), output_file) 155 | directory = Path(output_file).parent 156 | if not directory.exists(): 157 | directory.mkdir(parents=True) 158 | with open(output_file, "w", encoding="utf-8") as f: 159 | for item in dontknows_qa: 160 | f.write(json.dumps(item) + "\n") 161 | -------------------------------------------------------------------------------- /src/evaltools/review/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/ai-rag-chat-evaluator/655d0c40e2ee92a07064cd5b7ae65d14d2b7c679/src/evaltools/review/__init__.py -------------------------------------------------------------------------------- /src/evaltools/review/answers.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Review answers 5 | 9 | 13 | 14 | 15 |
[answers.html markup not preserved in this dump; page title: "Review answers"]
28 | 29 | 69 | 70 | 71 | 76 | 175 | 176 | 177 | -------------------------------------------------------------------------------- /src/evaltools/review/diff_app.py: -------------------------------------------------------------------------------- 1 | # a CLI tool to diff two JSON files 2 | from pathlib import Path 3 | 4 | from textual.app import App, ComposeResult 5 | from textual.containers import Horizontal, Vertical, VerticalScroll 6 | from textual.widgets import Button, DataTable, Markdown, Static 7 | 8 | from .utils import diff_directories 9 | 10 | 11 | class DiffApp(App): 12 | CSS_PATH = "diff_app.tcss" 13 | 14 | def __init__(self, directories: list[Path], changed: str = None): 15 | super().__init__() 16 | # Only include the first directory if the second is not provided 17 | self.directories = directories 18 | self.changed = changed 19 | self.data_dicts = [] # Store dicts keyed by question 20 | self.result_index = 0 # Based on results in the first directory 21 | 22 | def on_mount(self): 23 | self.data_dicts = diff_directories(self.directories) 24 | self.next_question() 25 | 26 | def on_button_pressed(self, event: Button.Pressed) -> None: 27 | if event.button.id == "quit": 28 | self.exit() 29 | else: 30 | self.next_question() 31 | 32 | def compose(self) -> ComposeResult: 33 | with Vertical(): 34 | yield Static(id="question") 35 | with Horizontal(id="sources"): 36 | for directory in self.directories: 37 | yield Static(directory.name, classes="source") 38 | if len(self.directories) == 1: 39 | yield Static("Ground truth answer", classes="source") 40 | with Horizontal(id="answers"): 41 | for ind in range(len(self.directories)): 42 | with VerticalScroll(classes="answer"): 43 | yield Markdown(id=f"answer{ind}") 44 | if len(self.directories) == 1: 45 | with VerticalScroll(classes="answer"): 46 | yield Markdown(id="answer_truth") 47 | with Horizontal(id="metrics"): 48 | for ind in range(len(self.directories)): 49 | yield DataTable(id=f"metrics{ind}", show_cursor=False, cell_padding=1) 50 | with Horizontal(id="buttons"): 51 | yield Button.success("Next question", classes="button") 52 | yield Button.error("Quit", id="quit", classes="button") 53 | 54 | def next_question(self): 55 | if self.result_index >= len(self.data_dicts[0]): 56 | self.exit() 57 | return 58 | question = list(self.data_dicts[0].keys())[self.result_index] 59 | self.query_one("#question", Static).update(question) 60 | 61 | for ind in range(len(self.directories)): 62 | try: 63 | self.query_one(f"#answer{ind}", Markdown).update(self.data_dicts[ind][question]["answer"]) 64 | if len(self.directories) == 1: 65 | self.query_one("#answer_truth", Markdown).update(self.data_dicts[0][question]["truth"]) 66 | except KeyError: 67 | self.query_one(f"#answer{ind}", Markdown).update("No answer found for that question") 68 | continue 69 | 70 | # Find all fields in the result that have numeric values 71 | metric_columns = [] 72 | metric_values = [] 73 | question_results = self.data_dicts[ind][question] 74 | for column, value in question_results.items(): 75 | if isinstance(value, int | float): 76 | metric_columns.append(column) 77 | metric_values.append(round(value, 1) if isinstance(value, float) else value) 78 | datatable = self.query_one(f"#metrics{ind}", DataTable) 79 | datatable.clear(columns=True).add_columns(*metric_columns) 80 | datatable.add_row(*metric_values) 81 | datatable.add_row("" * len(metric_columns)) 82 | 83 | self.result_index += 1 84 | 85 | 86 | def main(directories: list[Path], changed: str | None = None): 87 | app = 
DiffApp(directories, changed) 88 | app.run() 89 | -------------------------------------------------------------------------------- /src/evaltools/review/diff_app.tcss: -------------------------------------------------------------------------------- 1 | Screen { 2 | padding: 1; 3 | } 4 | 5 | #sources { 6 | height: 2 7 | } 8 | 9 | .source { 10 | width: 1fr; 11 | border: solid green; 12 | } 13 | 14 | .answer { 15 | height: 100%; 16 | width: 1fr; 17 | border: solid green; 18 | } 19 | 20 | #metrics { 21 | height: 3 22 | } 23 | 24 | #buttons { 25 | height: 3; 26 | align: center middle; 27 | } 28 | 29 | #quit { 30 | margin-left: 5; 31 | } 32 | --------------------------------------------------------------------------------
/src/evaltools/review/diff_markdown.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any 3 | 4 | from .utils import diff_directories 5 | 6 | 7 | def _round_metric(value: Any) -> Any: 8 | if isinstance(value, float): 9 | return round(value, 1) 10 | return value 11 | 12 | 13 | def main(directories: list[Path], changed: str | None = None): 14 | data_dicts = diff_directories(directories, changed) 15 | 16 | markdown_str = "" 17 | for question in data_dicts[0].keys(): 18 | markdown_str += f"**{question}**\n\n" 19 | # now make an HTML table with the answers 20 | markdown_str += "<table>\n" 21 | markdown_str += ( 22 | "<tr><td></td>" 23 | + "".join([f"<td>{directory.name}</td>" for directory in directories]) 24 | + "<td>ground_truth</td>\n" 25 | ) 26 | markdown_str += ( 27 | "<tr><td>answer</td>" 28 | + "".join([f"<td>{data_dict[question]['answer']}</td>" for data_dict in data_dicts]) 29 | + f"<td>{data_dicts[0][question]['truth']}</td>\n" 30 | ) 31 | 32 | # now make rows for each metric 33 | metrics = {} 34 | question_results = data_dicts[0][question] 35 | for column, value in question_results.items(): 36 | if isinstance(value, int | float): 37 | metrics[column] = [] 38 | for metric_name in metrics.keys(): 39 | first_value = _round_metric(data_dicts[0][question].get(metric_name)) 40 | for ind, data_dict in enumerate(data_dicts): 41 | value = _round_metric(data_dict[question].get(metric_name)) 42 | # Insert arrow emoji based on the difference between metric value and the first data_dict 43 | value_emoji = "" 44 | if value is not None and ind > 0 and value != first_value: 45 | value_emoji = "⬆️" if value > data_dicts[0][question][metric_name] else "⬇️" 46 | metrics[metric_name].append(f"{value} {value_emoji}") 47 | # make a row for each metric 48 | for metric_name, metric_values in metrics.items(): 49 | markdown_str += ( 50 | f"<tr><td>{metric_name}</td>" 51 | + "".join([f"<td>{value}</td>" for value in metric_values]) 52 | + "<td>N/A</td>\n" 53 | ) 54 | markdown_str += "</table>
\n\n" 55 | return markdown_str 56 | -------------------------------------------------------------------------------- /src/evaltools/review/parameters_screen.tcss: -------------------------------------------------------------------------------- 1 | 2 | 3 | ParametersScreen { 4 | align: center middle; 5 | } 6 | 7 | #dialog { 8 | padding: 0 1; 9 | width: 90%; 10 | height: 90%; 11 | border: thick $background 80%; 12 | background: $surface; 13 | } 14 | 15 | #header { 16 | height: 2; 17 | } 18 | 19 | #body { 20 | height: 1fr; 21 | } 22 | 23 | #button { 24 | margin-top: 1; 25 | height: 3; 26 | } 27 | -------------------------------------------------------------------------------- /src/evaltools/review/requirements.txt: -------------------------------------------------------------------------------- 1 | textual 2 | typer 3 | -------------------------------------------------------------------------------- /src/evaltools/review/summary_app.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | from textual.app import App, ComposeResult 5 | from textual.containers import Horizontal, Vertical 6 | from textual.screen import ModalScreen 7 | from textual.widgets import Button, DataTable, Label, TextArea 8 | 9 | from .utils import summarize_results 10 | 11 | 12 | class ParametersScreen(ModalScreen): 13 | CSS_PATH = "parameters_screen.tcss" 14 | 15 | def __init__(self, folder, parameters) -> None: 16 | super().__init__() 17 | self.folder = folder 18 | self.parameters = parameters 19 | 20 | def compose(self) -> ComposeResult: 21 | yield Vertical( 22 | Label(f"Parameters for: {self.folder}", id="header"), 23 | TextArea(json.dumps(self.parameters, indent=4), language="json", id="body"), 24 | Button("Close", variant="primary", id="button"), 25 | id="dialog", 26 | ) 27 | 28 | def on_button_pressed(self, event: Button.Pressed) -> None: 29 | self.app.pop_screen() 30 | 31 | 32 | class TableApp(App): 33 | CSS_PATH = "summary_app.tcss" 34 | 35 | def __init__(self, results_dir: Path) -> None: 36 | super().__init__() 37 | self.rows, self.row_parameters = summarize_results(results_dir) 38 | 39 | def compose(self) -> ComposeResult: 40 | with Vertical(): 41 | yield DataTable(id="table") 42 | with Horizontal(id="buttons"): 43 | yield Button.error("Quit", id="quit", classes="button") 44 | 45 | def on_button_pressed(self, event: Button.Pressed) -> None: 46 | if event.button.id == "quit": 47 | self.exit() 48 | 49 | def on_mount(self) -> None: 50 | table = self.query_one(DataTable) 51 | table.add_columns(*self.rows[0]) 52 | table.add_rows(self.rows[1:]) 53 | 54 | def on_data_table_cell_selected(self, event: DataTable.CellSelected) -> None: 55 | if event.coordinate.column == 0: 56 | folder = event.value 57 | if folder in self.row_parameters: 58 | parameters = self.row_parameters[folder] 59 | self.push_screen(ParametersScreen(folder, parameters)) 60 | 61 | 62 | def main(directory: Path): 63 | app = TableApp(directory) 64 | app.run() 65 | -------------------------------------------------------------------------------- /src/evaltools/review/summary_app.tcss: -------------------------------------------------------------------------------- 1 | #table { 2 | height: 100%; 3 | } 4 | 5 | #buttons { 6 | height: 3; 7 | align: center middle; 8 | } 9 | 10 | #quit { 11 | margin-left: 5; 12 | } 13 | -------------------------------------------------------------------------------- /src/evaltools/review/summary_markdown.py: 
-------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from .utils import summarize_results 4 | 5 | 6 | def main(results_dir: Path, highlight_run: str | None = None) -> str: 7 | rows, row_parameters = summarize_results(results_dir) 8 | # transpose the rows 9 | rows = list(map(list, zip(*rows))) 10 | 11 | # make a markdown table 12 | headers = ["metric", "stat"] + list(row_parameters.keys()) 13 | # find the index of the highlight run 14 | if highlight_run: 15 | highlight_run = highlight_run.strip() 16 | highlight_run_index = headers.index(highlight_run) 17 | else: 18 | highlight_run_index = None 19 | 20 | # put a star and bold the highlight run 21 | if highlight_run: 22 | headers = [f"☞{header}☜" if ind == highlight_run_index else header for ind, header in enumerate(headers)] 23 | 24 | table = "| " + " | ".join(headers) + " |\n" 25 | table += "|" + " |".join(["---"] * len(rows[0])) + " |\n" 26 | for ind, row in enumerate(rows[1:]): 27 | if row[0] == "": 28 | row[0] = "↑" 29 | # stringifying the row 30 | row = [str(cell) for cell in row] 31 | # highlight the cell that corresponds to the highlight run 32 | if highlight_run: 33 | row = [f"**{cell}**" if ind == highlight_run_index else cell for ind, cell in enumerate(row)] 34 | table += "| " + " | ".join(row) + " |\n" 35 | return table 36 | -------------------------------------------------------------------------------- /src/evaltools/review/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import os 4 | from pathlib import Path 5 | 6 | 7 | def summarize_results(results_dir): 8 | run_summaries = {} 9 | # first find the shared metrics across the runs 10 | metric_counts = {} 11 | 12 | folders = [f for f in os.listdir(results_dir) if os.path.isdir(os.path.join(results_dir, f))] 13 | folders.sort() 14 | for folder in folders: 15 | with open(Path(results_dir) / folder / "summary.json", encoding="utf-8") as f: 16 | summary = json.load(f) 17 | run_summaries[folder] = summary 18 | # first find the common parameters across the runs 19 | for metric_name in summary: 20 | metric_counts[metric_name] = metric_counts.get(metric_name, 0) + 1 21 | 22 | # Only show metrics that have shown up at least twice across runs 23 | shared_metric_names = [ 24 | metric_name for metric_name, count in metric_counts.items() if count > 1 or len(run_summaries) == 1 25 | ] 26 | shared_metric_stats = {metric_name: set() for metric_name in shared_metric_names} 27 | 28 | # Now figure out what stat to show about each metric 29 | for folder, summary in run_summaries.items(): 30 | for metric_name in shared_metric_names: 31 | if metric_name in summary: 32 | metric = summary[metric_name] 33 | if "mean_rating" in metric: 34 | shared_metric_stats[metric_name].add("mean_rating") 35 | elif "mean" in metric: 36 | shared_metric_stats[metric_name].add("mean") 37 | if "pass_rate" in metric: 38 | shared_metric_stats[metric_name].add("pass_rate") 39 | elif "rate" in metric: 40 | shared_metric_stats[metric_name].add("rate") 41 | 42 | first_row = ["folder"] 43 | # Build second row 44 | second_row = [""] 45 | for metric_name in shared_metric_names: 46 | # The first row of columns should have metric name followed by blank column for each stat above 1 stat 47 | first_row.append(metric_name) 48 | if len(shared_metric_stats[metric_name]) > 1: 49 | first_row.extend([""] * (len(shared_metric_stats[metric_name]) - 1)) 50 | # The second row of columns should just have 
the stat names 51 | for stat in shared_metric_stats[metric_name]: 52 | second_row.append(stat) 53 | 54 | rows = [first_row, second_row] 55 | row_parameters = {} 56 | # Build rest of the rows 57 | for folder, summary in run_summaries.items(): 58 | run_row = [folder] 59 | for metric_name in shared_metric_names: 60 | for stat in shared_metric_stats[metric_name]: 61 | if stat in summary.get(metric_name, {}): 62 | run_row.append(summary[metric_name][stat]) 63 | else: 64 | run_row.append("?") 65 | with open(Path(results_dir) / folder / "eval_results.jsonl", encoding="utf-8") as f: 66 | run_row.append(sum(1 for _ in f)) 67 | rows.append(run_row) 68 | with open(Path(results_dir) / folder / "evaluate_parameters.json", encoding="utf-8") as f: 69 | row_parameters[folder] = json.load(f) 70 | 71 | return rows, row_parameters 72 | 73 | 74 | def diff_directories(directories: list[Path], changed: str | None = None): 75 | data_dicts = [] 76 | for directory in directories: 77 | with open(directory / "eval_results.jsonl", encoding="utf-8") as f: 78 | data_json = [json.loads(question_json) for question_json in f.readlines()] 79 | data_dicts.append({question["question"]: question for question in data_json}) 80 | if changed: 81 | # filter out questions that have the same value for the given column 82 | for question in list(data_dicts[0].keys()): 83 | # if question isn't in the second directory, skip 84 | if question not in data_dicts[1]: 85 | data_dicts[0].pop(question) 86 | continue 87 | # if either metric is None, skip 88 | if data_dicts[0][question].get(changed) is None or data_dicts[1][question].get(changed) is None: 89 | data_dicts[0].pop(question) 90 | continue 91 | if data_dicts[0][question].get(changed) == data_dicts[1][question].get(changed): 92 | if math.isclose(data_dicts[0][question].get(changed), data_dicts[1][question].get(changed)): 93 | data_dicts[0].pop(question) 94 | data_dicts[1].pop(question) 95 | return data_dicts 96 | -------------------------------------------------------------------------------- /src/evaltools/service_setup.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Union 4 | 5 | import openai 6 | from azure.ai.evaluation import AzureOpenAIModelConfiguration, OpenAIModelConfiguration 7 | from azure.core.credentials import AzureKeyCredential 8 | from azure.identity import AzureDeveloperCliCredential, get_bearer_token_provider 9 | from azure.search.documents import SearchClient 10 | 11 | logger = logging.getLogger("evaltools") 12 | 13 | 14 | def get_azd_credential(tenant_id: Union[str, None]) -> AzureDeveloperCliCredential: 15 | if tenant_id: 16 | logger.info("Using Azure Developer CLI Credential for tenant %s", tenant_id) 17 | return AzureDeveloperCliCredential(tenant_id=tenant_id, process_timeout=60) 18 | logger.info("Using Azure Developer CLI Credential for home tenant") 19 | return AzureDeveloperCliCredential(process_timeout=60) 20 | 21 | 22 | def get_openai_config() -> dict: 23 | if os.environ.get("OPENAI_HOST") == "azure": 24 | azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT") 25 | azure_deployment = os.environ.get("AZURE_OPENAI_EVAL_DEPLOYMENT") 26 | if os.environ.get("AZURE_OPENAI_KEY"): 27 | logger.info("Using Azure OpenAI Service with API Key from AZURE_OPENAI_KEY") 28 | openai_config: AzureOpenAIModelConfiguration = { 29 | "azure_endpoint": azure_endpoint, 30 | "api_key": os.environ["AZURE_OPENAI_KEY"], 31 | "azure_deployment": azure_deployment, 32 | } 33 | else: 34 | 
logger.info("Using Azure OpenAI Service with Azure Developer CLI Credential") 35 | openai_config: AzureOpenAIModelConfiguration = { 36 | "azure_endpoint": azure_endpoint, 37 | "azure_deployment": azure_deployment, 38 | } 39 | # azure-ai-evaluate will call DefaultAzureCredential behind the scenes, 40 | # so we must be logged in to Azure CLI with the correct tenant 41 | else: 42 | logger.info("Using OpenAI Service with API Key from OPENAICOM_KEY") 43 | openai_config: OpenAIModelConfiguration = { 44 | "api_key": os.environ["OPENAICOM_KEY"], 45 | "organization": os.environ["OPENAICOM_ORGANIZATION"], 46 | "model": os.environ["OPENAI_GPT_MODEL"], 47 | } 48 | return openai_config 49 | 50 | 51 | def get_openai_config_dict() -> dict: 52 | """Return a dictionary with OpenAI configuration based on environment variables. 53 | This is only used by azure-ai-generative SDK right now, and should be deprecated once 54 | the generate functionality is available in azure-ai-evaluation SDK. 55 | """ 56 | if os.environ.get("OPENAI_HOST") == "azure": 57 | if os.environ.get("AZURE_OPENAI_KEY"): 58 | logger.info("Using Azure OpenAI Service with API Key from AZURE_OPENAI_KEY") 59 | api_key = os.environ["AZURE_OPENAI_KEY"] 60 | else: 61 | logger.info("Using Azure OpenAI Service with Azure Developer CLI Credential") 62 | azure_credential = get_azd_credential(os.environ.get("AZURE_OPENAI_TENANT_ID")) 63 | api_key = azure_credential.get_token("https://cognitiveservices.azure.com/.default").token 64 | openai_config = { 65 | "api_type": "azure", 66 | "api_base": os.environ["AZURE_OPENAI_ENDPOINT"], 67 | "api_key": api_key, 68 | "api_version": "2024-02-15-preview", 69 | "deployment": os.environ["AZURE_OPENAI_EVAL_DEPLOYMENT"], 70 | "model": os.environ["OPENAI_GPT_MODEL"], 71 | } 72 | else: 73 | logger.info("Using OpenAI Service with API Key from OPENAICOM_KEY") 74 | openai_config = { 75 | "api_type": "openai", 76 | "api_key": os.environ["OPENAICOM_KEY"], 77 | "organization": os.environ["OPENAICOM_ORGANIZATION"], 78 | "model": os.environ["OPENAI_GPT_MODEL"], 79 | "deployment": "none-needed-for-openaicom", 80 | } 81 | return openai_config 82 | 83 | 84 | def get_search_client(): 85 | if api_key := os.environ.get("AZURE_SEARCH_KEY"): 86 | logger.info("Using Azure Search Service with API Key from AZURE_SEARCH_KEY") 87 | azure_credential = AzureKeyCredential(api_key) 88 | else: 89 | logger.info("Using Azure Search Service with Azure Developer CLI Credential") 90 | azure_credential = get_azd_credential(os.environ.get("AZURE_SEARCH_TENANT_ID")) 91 | 92 | return SearchClient( 93 | endpoint=os.environ["AZURE_SEARCH_ENDPOINT"], 94 | index_name=os.environ["AZURE_SEARCH_INDEX"], 95 | credential=azure_credential, 96 | ) 97 | 98 | 99 | def get_openai_client( 100 | oai_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], azure_credential=None 101 | ): 102 | if "azure_deployment" in oai_config: 103 | azure_token_provider = None 104 | 105 | if azure_credential is None and not os.environ.get("AZURE_OPENAI_KEY"): 106 | logger.info("Using Azure OpenAI Service with Azure Developer CLI Credential") 107 | azure_credential = get_azd_credential(os.environ.get("AZURE_OPENAI_TENANT_ID")) 108 | if azure_credential is not None: 109 | azure_token_provider = get_bearer_token_provider( 110 | azure_credential, "https://cognitiveservices.azure.com/.default" 111 | ) 112 | return openai.AzureOpenAI( 113 | api_version="2024-02-15-preview", 114 | azure_endpoint=oai_config["azure_endpoint"], 115 | api_key=oai_config["api_key"] if 
oai_config.get("api_key") else None, 116 | azure_ad_token_provider=azure_token_provider, 117 | azure_deployment=oai_config["azure_deployment"], 118 | ) 119 | elif "organization" in oai_config: 120 | oai_config: OpenAIModelConfiguration = oai_config 121 | return openai.OpenAI(api_key=oai_config["api_key"], organization=oai_config["organization"]) 122 | -------------------------------------------------------------------------------- /tests/test_evaluate.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | import requests 4 | 5 | from evaltools.eval.evaluate import send_question_to_target 6 | 7 | 8 | def test_send_question_to_target_valid(): 9 | # Test case 1: Valid response 10 | response = { 11 | "message": {"content": "This is the answer"}, 12 | "context": {"data_points": {"text": ["Context 1", "Context 2"]}}, 13 | } 14 | requests.post = lambda url, headers, json: MockResponse(response) 15 | result = send_question_to_target("Question 1", "http://example.com") 16 | assert result["answer"] == "This is the answer" 17 | assert result["context"] == "Context 1\n\nContext 2" 18 | assert result["latency"] == 1 19 | 20 | 21 | def test_send_question_to_target_missing_error_store(): 22 | response = {} 23 | requests.post = lambda url, headers, json: MockResponse(response) 24 | result = send_question_to_target("Question", "http://example.com") 25 | assert result["answer"] == ( 26 | "Response does not adhere to the expected schema. " 27 | "The answer should be accessible via the JMESPath expression 'message.content' " 28 | "and the context should be accessible via the JMESPath expression 'context.data_points.text'. " 29 | "Either adjust the app response or adjust send_question_to_target() " 30 | "in evaluate.py to match the actual schema.\n" 31 | "Response: {}" 32 | ) 33 | assert result["context"] == ( 34 | "Response does not adhere to the expected schema. " 35 | "The answer should be accessible via the JMESPath expression 'message.content' " 36 | "and the context should be accessible via the JMESPath expression 'context.data_points.text'. " 37 | "Either adjust the app response or adjust send_question_to_target() " 38 | "in evaluate.py to match the actual schema.\n" 39 | "Response: {}" 40 | ) 41 | 42 | 43 | def test_send_question_to_target_missing_all(): 44 | response = {} 45 | requests.post = lambda url, headers, json: MockResponse(response) 46 | try: 47 | send_question_to_target("Question", "Answer", "http://example.com", raise_error=True) 48 | except Exception as e: 49 | assert str(e) == ( 50 | "Response does not adhere to the expected schema. " 51 | "The answer should be accessible via the JMESPath expression 'message.content' " 52 | "and the context should be accessible via the JMESPath expression 'context.data_points.text'. " 53 | "Either adjust the app response or adjust send_question_to_target() " 54 | "in evaluate.py to match the actual schema.\n" 55 | "Response: {}" 56 | ) 57 | 58 | 59 | def test_send_question_to_target_missing_content(): 60 | response = {"message": {}, "context": {"data_points": {"text": ["Context 1", "Context 2"]}}} 61 | requests.post = lambda url, headers, json: MockResponse(response) 62 | try: 63 | send_question_to_target("Question", "Answer", "http://example.com", raise_error=True) 64 | except Exception as e: 65 | assert str(e) == ( 66 | "Response does not adhere to the expected schema. 
" 67 | "The answer should be accessible via the JMESPath expression 'message.content' " 68 | "and the context should be accessible via the JMESPath expression 'context.data_points.text'. " 69 | "Either adjust the app response or adjust send_question_to_target() " 70 | "in evaluate.py to match the actual schema.\n" 71 | "Response: {'message': {}, 'context': {'data_points': {'text': ['Context 1', 'Context 2']}}}" 72 | ) 73 | 74 | 75 | def test_send_question_to_target_missing_context(): 76 | # Test case 5: Missing 'context' key in response 77 | response = {"message": {"content": "This is the answer"}} 78 | requests.post = lambda url, headers, json: MockResponse(response) 79 | try: 80 | send_question_to_target("Question", "Answer", "http://example.com", raise_error=True) 81 | except Exception as e: 82 | assert str(e) == ( 83 | "Response does not adhere to the expected schema. " 84 | "The answer should be accessible via the JMESPath expression 'message.content' " 85 | "and the context should be accessible via the JMESPath expression 'context.data_points.text'. " 86 | "Either adjust the app response or adjust send_question_to_target() " 87 | "in evaluate.py to match the actual schema.\n" 88 | "Response: {'message': {'content': 'This is the answer'}}" 89 | ) 90 | 91 | 92 | class MockResponse: 93 | def __init__(self, json_data): 94 | self.json_data = json_data 95 | self.elapsed = timedelta(seconds=1) 96 | 97 | def json(self): 98 | return self.json_data 99 | -------------------------------------------------------------------------------- /tests/test_evaluate_metrics.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from evaltools.eval.evaluate_metrics import builtin_metrics, code_metrics, prompt_metrics 4 | 5 | 6 | def test_answer_length(): 7 | metric = code_metrics.AnswerLengthMetric() 8 | metric_function = metric.evaluator_fn() 9 | assert callable(metric_function) 10 | assert metric_function(response="Hello, world!") == {"answer_length": 13} 11 | df = pd.DataFrame([{"answer_length": 20}, {"answer_length": 10}, {"answer_length": 5}]) 12 | assert metric.get_aggregate_stats(df) == {"mean": 11.67, "max": 20, "min": 5} 13 | 14 | 15 | def test_answer_length_new(): 16 | metric = code_metrics.AnswerLengthMetric() 17 | metric_function = metric.evaluator_fn() 18 | assert metric_function(response=None) == {"answer_length": -1} 19 | df = pd.DataFrame([{"answer_length": 20}, {"answer_length": 10}, {"answer_length": 5}, {"answer_length": -1}]) 20 | assert metric.get_aggregate_stats(df) == {"mean": 11.67, "max": 20, "min": 5} 21 | 22 | 23 | def test_has_citation(): 24 | metric = code_metrics.HasCitationMetric() 25 | metric_function = metric.evaluator_fn() 26 | assert callable(metric_function) 27 | assert metric_function(response="Hello, world!") == {"has_citation": False} 28 | assert metric_function(response="Hello, [world.pdf]!") == {"has_citation": True} 29 | 30 | df = pd.DataFrame([{"has_citation": True}, {"has_citation": False}, {"has_citation": True}]) 31 | assert metric.get_aggregate_stats(df) == {"total": 2, "rate": 0.67} 32 | 33 | 34 | def test_has_citation_none(): 35 | metric = code_metrics.HasCitationMetric() 36 | metric_function = metric.evaluator_fn() 37 | assert metric_function(response=None) == {"has_citation": -1} 38 | df = pd.DataFrame([{"has_citation": True}, {"has_citation": False}, {"has_citation": -1}]) 39 | assert metric.get_aggregate_stats(df) == {"total": 1, "rate": 0.5} 40 | 41 | 42 | def test_citation_match(): 43 | 
metric = code_metrics.CitationMatchMetric() 44 | metric_function = metric.evaluator_fn() 45 | assert callable(metric_function) 46 | assert metric_function(ground_truth="answer in [file.pdf]", response="answer in [file2.pdf]") == { 47 | "citation_match": False 48 | } 49 | assert metric_function(ground_truth="answer in [file2.pdf]", response="answer in [file2.pdf]") == { 50 | "citation_match": True 51 | } 52 | assert metric_function(ground_truth="answer in [file2.pdf]", response="answer in [file1.pdf][file2.pdf]") == { 53 | "citation_match": True 54 | } 55 | df = pd.DataFrame([{"citation_match": True}, {"citation_match": False}, {"citation_match": True}]) 56 | assert metric.get_aggregate_stats(df) == {"total": 2, "rate": 0.67} 57 | 58 | 59 | def test_citation_match_filenames_only(): 60 | truth = 'Use settings like "python.linting.enabled": true, "[python]" [best-practices-for-prompting-github.html]' 61 | response = 'Use extension with setting "python.linting.enabled" [best-practices-for-prompting-github.html]' 62 | metric = code_metrics.CitationMatchMetric() 63 | metric_function = metric.evaluator_fn() 64 | assert metric_function(ground_truth=truth, response=response) == {"citation_match": True} 65 | 66 | 67 | def test_citation_match_none(): 68 | metric = code_metrics.CitationMatchMetric() 69 | metric_function = metric.evaluator_fn() 70 | assert metric_function(ground_truth="Answer", response=None) == {"citation_match": -1} 71 | df = pd.DataFrame([{"citation_match": True}, {"citation_match": False}, {"citation_match": -1}]) 72 | assert metric.get_aggregate_stats(df) == {"total": 1, "rate": 0.5} 73 | 74 | 75 | def test_latency(): 76 | metric = code_metrics.LatencyMetric() 77 | metric_function = metric.evaluator_fn() 78 | assert callable(metric_function) 79 | assert metric_function(data={"latency": 20}) == {} 80 | df = pd.DataFrame([{"latency": 20}, {"latency": 10}, {"latency": 5}]) 81 | assert metric.get_aggregate_stats(df) == {"mean": 11.67, "max": 20, "min": 5} 82 | 83 | 84 | def test_custom_relevance(): 85 | metric = prompt_metrics.RelevanceMetric() 86 | 87 | assert callable(metric.evaluator_fn(openai_config=None)) 88 | df = pd.DataFrame([{"myrelevance": 5}, {"myrelevance": 4}, {"myrelevance": 3}]) 89 | assert metric.get_aggregate_stats(df) == {"mean_rating": 4.0, "pass_count": 2, "pass_rate": 0.67} 90 | 91 | 92 | def test_custom_coherence(): 93 | metric = prompt_metrics.CoherenceMetric() 94 | 95 | assert callable(metric.evaluator_fn(openai_config=None)) 96 | df = pd.DataFrame([{"mycoherence": 5}, {"mycoherence": 4}, {"mycoherence": 3}]) 97 | assert metric.get_aggregate_stats(df) == {"mean_rating": 4.0, "pass_count": 2, "pass_rate": 0.67} 98 | 99 | 100 | def test_custom_groundedness(): 101 | metric = prompt_metrics.GroundednessMetric() 102 | 103 | assert callable(metric.evaluator_fn(openai_config=None)) 104 | df = pd.DataFrame([{"mygroundedness": 5}, {"mygroundedness": 4}, {"mygroundedness": 3}]) 105 | assert metric.get_aggregate_stats(df) == {"mean_rating": 4.0, "pass_count": 2, "pass_rate": 0.67} 106 | 107 | 108 | def test_custom_relevance_missing_values(): 109 | metric = prompt_metrics.RelevanceMetric() 110 | 111 | assert callable(metric.evaluator_fn(openai_config=None)) 112 | df = pd.DataFrame([{"myrelevance": 2}, {"myrelevance": 4}, {"myrelevance": "Failed"}]) 113 | assert metric.get_aggregate_stats(df) == {"mean_rating": 3.0, "pass_count": 1, "pass_rate": 0.33} 114 | 115 | 116 | def test_builtin_coherence(): 117 | metric = builtin_metrics.BuiltinCoherenceMetric() 118 | assert 
metric.METRIC_NAME == "gpt_coherence" 119 | df = pd.DataFrame([{"gpt_coherence": 5}, {"gpt_coherence": 4}, {"gpt_coherence": 3}]) 120 | assert metric.get_aggregate_stats(df) == {"mean_rating": 4.0, "pass_count": 2, "pass_rate": 0.67} 121 | 122 | 123 | def test_builtin_relevance(): 124 | metric = builtin_metrics.BuiltinRelevanceMetric() 125 | assert metric.METRIC_NAME == "gpt_relevance" 126 | df = pd.DataFrame([{"gpt_relevance": 5}, {"gpt_relevance": 4}, {"gpt_relevance": 3}]) 127 | assert metric.get_aggregate_stats(df) == {"mean_rating": 4.0, "pass_count": 2, "pass_rate": 0.67} 128 | 129 | 130 | def test_builtin_groundedness(): 131 | metric = builtin_metrics.BuiltinGroundednessMetric() 132 | assert metric.METRIC_NAME == "gpt_groundedness" 133 | df = pd.DataFrame([{"gpt_groundedness": 5}, {"gpt_groundedness": 4}, {"gpt_groundedness": 3}]) 134 | assert metric.get_aggregate_stats(df) == {"mean_rating": 4.0, "pass_count": 2, "pass_rate": 0.67} 135 | 136 | 137 | def test_builtin_fluency(): 138 | metric = builtin_metrics.BuiltinFluencyMetric() 139 | assert metric.METRIC_NAME == "gpt_fluency" 140 | df = pd.DataFrame([{"gpt_fluency": 5}, {"gpt_fluency": 4}, {"gpt_fluency": 3}]) 141 | assert metric.get_aggregate_stats(df) == {"mean_rating": 4.0, "pass_count": 2, "pass_rate": 0.67} 142 | 143 | 144 | def test_builtin_similarity(): 145 | metric = builtin_metrics.BuiltinSimilarityMetric() 146 | assert metric.METRIC_NAME == "gpt_similarity" 147 | df = pd.DataFrame([{"gpt_similarity": 5}, {"gpt_similarity": 4}, {"gpt_similarity": 3}]) 148 | assert metric.get_aggregate_stats(df) == {"mean_rating": 4.0, "pass_count": 2, "pass_rate": 0.67} 149 | 150 | 151 | def test_builtin_f1_score(): 152 | metric = builtin_metrics.BuiltinF1ScoreMetric() 153 | assert metric.METRIC_NAME == "f1_score" 154 | df = pd.DataFrame([{"f1_score": 5}, {"f1_score": 4}, {"f1_score": 3}]) 155 | assert metric.get_aggregate_stats(df) == {"mean": 4.0, "max": 5, "min": 3} 156 | 157 | 158 | def test_builtin_coherence_missing_values(): 159 | metric = builtin_metrics.BuiltinCoherenceMetric() 160 | assert metric.METRIC_NAME == "gpt_coherence" 161 | df = pd.DataFrame([{"gpt_coherence": "Failed"}, {"gpt_coherence": 4}, {"gpt_coherence": 3}]) 162 | assert metric.get_aggregate_stats(df) == {"mean_rating": 3.5, "pass_count": 1, "pass_rate": 0.33} 163 | --------------------------------------------------------------------------------
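
Illustration (not a file from the repository): the metrics in `code_metrics.py` above all follow the same `BaseMetric` shape, so a new code-based metric is mostly boilerplate. The sketch below assumes that pattern and the import path suggested by the package layout; `WordCountMetric`, its word-splitting logic, and the `__main__` demo are hypothetical, and wiring a new metric into `evaluate.py` is not shown here.

```python
# Minimal sketch of a hypothetical code-based metric (not part of evaltools).
# It mirrors the shape used in code_metrics.py: evaluator_fn() returns a callable
# that emits {METRIC_NAME: value} per question, and get_aggregate_stats()
# summarizes the per-question results from a pandas DataFrame.
import pandas as pd

from evaltools.eval.evaluate_metrics.base_metric import BaseMetric


class WordCountMetric(BaseMetric):
    METRIC_NAME = "word_count"  # hypothetical metric name

    @classmethod
    def evaluator_fn(cls, **kwargs):
        def word_count(*, response, **kwargs):
            if response is None:
                # Follow the convention above: -1 marks a missing response and is
                # filtered out before aggregation.
                return {cls.METRIC_NAME: -1}
            return {cls.METRIC_NAME: len(response.split())}

        return word_count

    @classmethod
    def get_aggregate_stats(cls, df):
        df = df[df[cls.METRIC_NAME] != -1]
        return {
            "mean": round(df[cls.METRIC_NAME].mean(), 2),
            "max": int(df[cls.METRIC_NAME].max()),
            "min": int(df[cls.METRIC_NAME].min()),
        }


if __name__ == "__main__":
    fn = WordCountMetric.evaluator_fn()
    print(fn(response="The rover searches for signs of ancient life."))  # {'word_count': 8}
    df = pd.DataFrame([{"word_count": 8}, {"word_count": 12}, {"word_count": -1}])
    print(WordCountMetric.get_aggregate_stats(df))  # {'mean': 10.0, 'max': 12, 'min': 8}
```

A test for such a metric would look just like `test_answer_length` in `tests/test_evaluate_metrics.py`: call `evaluator_fn()`, assert on a sample response, then assert on `get_aggregate_stats` over a small DataFrame.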
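
Likewise for the LLM-graded metrics: `prompt_metrics.py` shows that each one is a two-line subclass of `CustomRatingMetric` whose `METRIC_NAME` matches a `.prompty` template in the `prompts/` directory. The sketch below assumes that pattern; `ConcisenessMetric` and `myconciseness.prompty` are hypothetical names, and the prompty template itself (front matter plus a 1-5 rating prompt, like `mycoherence.prompty` above) would still have to be written.

```python
# Sketch of a hypothetical prompt-based metric (not part of evaltools).
from evaltools.eval.evaluate_metrics.prompt_metrics import CustomRatingMetric


class ConcisenessMetric(CustomRatingMetric):
    # CustomRatingMetric.evaluator_fn() loads prompts/<METRIC_NAME>.prompty and
    # parses the numeric rating out of the model's reply, so a myconciseness.prompty
    # template must exist alongside the other prompty files.
    METRIC_NAME = "myconciseness"
```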