├── .github └── workflows │ ├── publish.yml │ └── tests.yml ├── .gitignore ├── .vscode └── settings.json ├── LICENSE ├── README.md ├── benchllm ├── __init__.py ├── cache.py ├── cli │ ├── __init__.py │ ├── commands │ │ ├── __init__.py │ │ ├── add_test.py │ │ ├── evaluate.py │ │ ├── list_tests.py │ │ └── run_suite.py │ ├── evaluator │ │ ├── __init__.py │ │ ├── interactive.py │ │ └── web.py │ ├── listener.py │ ├── main.py │ └── utils.py ├── data_types.py ├── evaluator │ ├── __init__.py │ ├── embedding.py │ ├── evaluator.py │ ├── semantic.py │ └── string_match.py ├── input_types.py ├── listener.py ├── similarity.py ├── singleton.py ├── tester.py └── utils.py ├── examples ├── chat │ ├── 1.yml │ ├── 2.yml │ └── eval.py ├── langchain_agent_search_calculator │ ├── 1.yml │ ├── 10.yml │ ├── 2.yml │ ├── 3.yml │ ├── 4.yml │ ├── 5.yml │ ├── 6.yml │ ├── 7.yml │ ├── 8.yml │ ├── 9.yml │ ├── converter.py │ ├── eval.py │ └── script.py ├── openai-evals │ └── eval.py ├── qa │ ├── 1.yml │ ├── 2.yml │ ├── eval.py │ └── script.py ├── similarity │ ├── benchllm.yml │ ├── eval.py │ ├── idempotency.yml │ ├── location.yml │ ├── number_detail.yml │ ├── solarsystem.yml │ ├── v7.yml │ └── water_boiling.yml ├── vector_retrieval │ ├── 1.yml │ ├── 2.yml │ ├── eval.py │ └── utils.py └── weather_functions │ ├── default.yml │ ├── eval.py │ ├── forecast.py │ ├── rainy.yml │ ├── sunny.yml │ └── tomorrow.yml ├── pyproject.toml └── test ├── __init__.py ├── cache ├── test_file_cache.py └── test_memory_cache.py ├── cli ├── test_interactive.py ├── test_list_tests.py └── test_run_suite.py ├── evaulator ├── test_evalutator.py ├── test_semantic.py └── test_string_match.py ├── test_tester.py └── utils.py /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | workflow_dispatch: 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: Set up Python 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: "3.9" 17 | - run: pip install pip --upgrade 18 | - name: Setup Poetry 19 | uses: abatilo/actions-poetry@v2 20 | with: 21 | poetry-version: "1.3.1" 22 | - name: Install dependencies 23 | run: | 24 | poetry install --no-interaction --no-root --all-extras -vvv 25 | poetry build 26 | - name: Publish 27 | env: 28 | POETRY_HTTP_BASIC_PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} 29 | POETRY_HTTP_BASIC_PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 30 | run: | 31 | poetry publish 32 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | 9 | concurrency: 10 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 11 | cancel-in-progress: true 12 | 13 | jobs: 14 | ci: 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | python-version: ["3.10"] 19 | poetry-version: ["1.3.1"] 20 | os: [ubuntu-latest, windows-latest] 21 | runs-on: ${{ matrix.os }} 22 | steps: 23 | - uses: actions/checkout@v2 24 | - uses: actions/setup-python@v2 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Upgrade pip 28 | run: python -m pip install --upgrade pip 29 | - name: Setup Poetry 30 | uses: abatilo/actions-poetry@v2 31 | with: 32 | poetry-version: ${{ matrix.poetry-version }} 33 | - name: Install 
dependencies 34 | run: | 35 | poetry install --no-interaction --no-root --all-extras -vvv 36 | pip install wheel 37 | pip install --upgrade setuptools 38 | pip install --editable ".[test,examples,dev]" 39 | pip install pytest pytest-describe 40 | - name: Run Tests 41 | run: python -m pytest 42 | - name: Run BenchLLM examples 43 | run: bench run examples 44 | env: 45 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | .DS_Store 163 | 164 | # Vector QA Retrieval PDF documents and index shouldn't be tracked 165 | examples/vector_retrieval/example_documents 166 | examples/vector_retrieval/faiss_example_index 167 | 168 | # Don't stage any files in the output directory 169 | output/ 170 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "black-formatter.args": ["--line-length", "120"], 3 | "editor.codeActionsOnSave": { 4 | "source.organizeImports": true 5 | }, 6 | "editor.formatOnSave": true, 7 | "editor.rulers": [120], 8 | "[python]": { 9 | "editor.defaultFormatter": "ms-python.black-formatter" 10 | }, 11 | "[yaml]": { 12 | "editor.defaultFormatter": "esbenp.prettier-vscode" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 V7 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🏋️‍♂️ BenchLLM 🏋️‍♀️ 2 | 3 | 🦾 Continuous Integration for LLM powered applications 🦙🦅🤖 4 | 5 | [![GitHub Repo stars](https://img.shields.io/github/stars/v7labs/BenchLLM?style=social)](https://github.com/v7labs/BenchLLM/stargazers) 6 | [![Twitter Follow](https://img.shields.io/twitter/follow/V7Labs?style=social)](https://twitter.com/V7Labs) 7 | [![Discord Follow](https://dcbadge.vercel.app/api/server/x7ExfHb3bG?style=flat)](https://discord.gg/x7ExfHb3bG) 8 | 9 | [**BenchLLM**](https://benchllm.com/) is a Python-based open-source library that streamlines the testing of Large Language Models (LLMs) and AI-powered applications. It measures the accuracy of your model, agents, or chains by validating responses on any number of tests via LLMs. 10 | 11 | BenchLLM is actively used at [V7](https://www.v7labs.com) for improving our LLM applications and is now Open Sourced under MIT License to share with the wider community 12 | 13 | ## 💡 Get help on [Discord](https://discord.gg/x7ExfHb3bG) or [Tweet at us](https://twitter.com/V7Labs) 14 | 15 |
16 | 17 | Use BenchLLM to: 18 | 19 | - Test the responses of your LLM across any number of prompts. 20 | - Continuous integration for chains like [Langchain](https://github.com/hwchase17/langchain), agents like [AutoGPT](https://github.com/Significant-Gravitas/Auto-GPT), or LLM models like [Llama](https://github.com/facebookresearch/llama) or GPT-4. 21 | - Eliminate flaky chains and create confidence in your code. 22 | - Spot inaccurate responses and hallucinations in your application at every version. 23 | 24 |
25 | 26 | > ⚠️ **NOTE:** BenchLLM is in the early stage of development and will be subject to rapid changes. 27 | > 28 | > For bug reporting, feature requests, or contributions, please open an issue or submit a pull request (PR) on our GitHub page. 29 | 30 | ## 🧪 BenchLLM Testing Methodology 31 | 32 | BenchLLM implements a distinct two-step methodology for validating your machine learning models: 33 | 34 | 1. **Testing**: This stage involves running your code against any number of expected responses and capturing the predictions produced by your model without immediate judgment or comparison. 35 | 36 | 2. **Evaluation**: The recorded predictions are compared against the expected output using LLMs to verify factual similarity (or optionally manually). Detailed comparison reports, including pass/fail status and other metrics, are generated. 37 | 38 | This methodical separation offers a comprehensive view of your model's performance and allows for better control and refinement of each step. 39 | 40 | ## 🚀 Install 41 | 42 | To install BenchLLM, use pip: 43 | 44 | ``` 45 | pip install benchllm 46 | ``` 47 | 48 | ## 💻 Usage 49 | 50 | Start by importing the library and using the `@benchllm.test` decorator to mark the function you'd like to test: 51 | 52 | ```python 53 | import benchllm 54 | 55 | # Your custom model implementation 56 | def run_my_model(input): 57 | # Your model's logic goes here. 58 | return some_result 59 | 60 | @benchllm.test(suite="/path/to/test/suite") # If the tests are in the same directory, just use @benchllm.test. 61 | def invoke_model(input: str): 62 | return run_my_model(input) 63 | ``` 64 | 65 | Next, prepare your tests. These are YAML/JSON files structured as follows: 66 | 67 | ```yml 68 | input: What's 1+1? Be very terse, only numeric output 69 | expected: 70 | - 2 71 | - 2.0 72 | ``` 73 | 74 | In the above example, the `input` is the query or instruction that your model will process, and `expected` contains the potential responses that your model should return. It's important to note that `input` can be a simple `str` or a more complex nested dictionary; BenchLLM will extract the type of the `input` argument in the Python code and load the `input` field from the YAML file accordingly. 75 | 76 | By default, BenchLLM uses OpenAI's GPT-3 model for the `semantic` evaluator. This requires setting the `OPENAI_API_KEY` environment variable: 77 | 78 | ```bash 79 | export OPENAI_API_KEY='your-api-key' 80 | ``` 81 | 82 | Replace 'your-api-key' with your actual OpenAI API key. If you do not want to use this default evaluator, you can specify an alternative one (discussed in further detail below). 83 | 84 | To initiate testing, use the `bench run` command: 85 | 86 | ```bash 87 | $ bench run 88 | ``` 89 | 90 | By default, the `bench run` command looks for Python files implementing the `@benchllm.test` decorator in the current directory. To target a specific file or folder, specify it directly: 91 | 92 | ```bash 93 | $ bench run path/to/my/file.py or/path/to/folder/with/files 94 | ``` 95 | 96 | The `--retry-count` parameter allows BenchLLM to run a test multiple times, useful for models that may have variability in their outputs: 97 | 98 | ```bash 99 | $ bench run --retry-count 5 100 | ``` 101 | 102 | BenchLLM offers multiple evaluation methods to determine if the prediction matches the test case's expected values. 
You can use the `--evaluator` parameter to specify the evaluation method. 103 | 104 | By default, GPT-3 is used to compare the output. 105 | The available evaluators are: 106 | 107 | - `semantic`, checks semantic similarity using language models like GPT-3, GPT-3.5, or GPT-4 (`--model` parameter). Please note that this evaluator requires the `OPENAI_API_KEY` environment variable to be set. 108 | - `embedding`, uses cosine distance between embedded vectors. Please note that this evaluator also requires the `OPENAI_API_KEY` environment variable to be set. 109 | - `string-match`, checks whether the strings match (case-insensitive) 110 | - `interactive`, the user manually accepts or fails tests in the terminal 111 | - `web`, uses pywebio for a simple local web interface 112 | 113 | The non-interactive evaluators also support `--workers N` to run the evaluations in parallel: 114 | 115 | ```bash 116 | $ bench run --evaluator string-match --workers 5 117 | ``` 118 | 119 | To accelerate the evaluation process, BenchLLM uses a cache. If a (prediction, expected) pair has been evaluated in the past and a cache was used, the evaluation output will be saved for future evaluations. There are several types of caches: 120 | 121 | - `memory`, only caches output values during the current run. This is particularly useful when running with `--retry-count N`. 122 | - `file`, stores the cache at the end of the run as a JSON file in `output/cache.json`. This is the default behavior. 123 | - `none`, does not use any cache. 124 | 125 | ```bash 126 | $ bench run examples --cache memory 127 | ``` 128 | 129 | When working on developing chains or training agent models, there may be instances where these models need to interact with external functions, for instance querying a weather forecast or executing an SQL query. In such scenarios, BenchLLM makes it possible to mock these functions. This helps you make your tests more predictable and enables the discovery of unexpected function calls. 130 | 131 | ```yml 132 | input: I live in London, can I expect rain today? 133 | expected: ["no"] 134 | calls: 135 | - name: forecast.get_n_day_weather_forecast 136 | returns: It's sunny in London. 137 | arguments: 138 | location: London 139 | num_days: 1 140 | ``` 141 | 142 | In the example above, the function `get_n_day_weather_forecast` in the `forecast` module is mocked. In other words, every time this function is invoked, the model will receive `"It's sunny in London"`. BenchLLM also provides warnings if the function is invoked with argument values different from `get_n_day_weather_forecast(location=London, num_days=1)`. Note that providing these argument values is optional. 143 | 144 | ### 🧮 Eval 145 | 146 | While _bench run_ runs each test function and then evaluates its output, it can often be beneficial to separate these into two steps, for example if you want a person to do the evaluation manually or if you want to try multiple evaluation methods on the same function. 147 | 148 | ```bash 149 | $ bench run --no-eval 150 | ``` 151 | 152 | This will generate JSON files in `output/latest/predictions`. 153 | You can then evaluate them later with: 154 | 155 | ```bash 156 | $ bench eval output/latest/predictions 157 | ``` 158 | 159 | ## 🔌 API 160 | 161 | For more detailed control, BenchLLM provides an API. 162 | You are not required to add YML/JSON tests to be able to evaluate your model. 
163 | You can instead: 164 | 165 | - Instantiate `Test` objects 166 | - Use a `Tester` object to generate predictions 167 | - Use an `Evaluator` object to evaluate your model 168 | 169 | ```python 170 | from benchllm import StringMatchEvaluator, Test, Tester 171 | 172 | # Instantiate your Test objects 173 | tests = [ 174 | Test(input="What's 1+1?", expected=["2", "It's 2"]), 175 | Test(input="First rule of fight club?", expected=["Do not talk about fight club"]), 176 | ] 177 | 178 | # Use a Tester object to generate predictions using any test functions 179 | tester = Tester(my_test_function) 180 | tester.add_tests(tests) 181 | predictions = tester.run() 182 | 183 | # Use an Evaluator object to evaluate your model 184 | evaluator = StringMatchEvaluator() 185 | evaluator.load(predictions) 186 | results = evaluator.run() 187 | 188 | print(results) 189 | ``` 190 | 191 | If you want to incorporate caching and run multiple parallel evaluation jobs, you can modify your evaluator as follows: 192 | 193 | ```python 194 | from benchllm.cache import FileCache 195 | 196 | ... 197 | 198 | evaluator = FileCache(StringMatchEvaluator(workers=2), Path("path/to/cache.json")) 199 | evaluator.load(predictions) 200 | results = evaluator.run() 201 | ``` 202 | 203 | In this example, `FileCache` is used to enable caching, and the `workers` parameter of `StringMatchEvaluator` is set to `2` to allow for parallel evaluations. The cache results are saved in a file specified by `Path("path/to/cache.json")`. 204 | 205 | ## ☕️ Commands 206 | 207 | - `bench add`: Add a new test to a suite (see the usage sketch at the end of this README). 208 | - `bench tests`: List all tests in a suite. 209 | - `bench run`: Run all or target test suites. 210 | - `bench eval`: Run the evaluation of an existing test run. 211 | 212 | ## 🙌 Contribute 213 | 214 | BenchLLM is developed for Python 3.10, although it may work with other Python versions as well. We recommend using a Python 3.10 environment and pip >= 23. You can use conda or any other environment manager to set up the environment: 215 | 216 | ```bash 217 | $ conda create --name benchllm python=3.10 218 | $ conda activate benchllm 219 | $ pip install -e ".[dev]" 220 | ``` 221 | 222 | To run all the examples, first install the extra dependencies for the examples: 223 | 224 | ```bash 225 | $ pip install -e ".[examples]" 226 | ``` 227 | 228 | Contribution steps: 229 | 230 | 1. Fork the repository. 231 | 2. Create a new branch for your changes. 232 | 3. Make your changes. 233 | 4. Test your changes. 234 | 5. Submit a pull request. 235 | 236 | We adhere to the PEP8 style guide. Please follow this guide when contributing. 237 | 238 | If you need any support, feel free to open an issue on our GitHub page. 
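For quick reference, here is a sketch of how the `bench add` and `bench tests` commands described above could be used from the terminal. The suite directory `./tests` and the prompt values are illustrative, and the suite directory is assumed to already exist:

```bash
$ bench add ./tests --input "What's 1+1? Be very terse, only numeric output" --expected "2" --expected "2.0"
$ bench tests ./tests
```

`bench add` writes the test as a YAML file named after `--name` (a generated UUID when not specified), and `bench tests` prints the tests in the suite as a table.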
239 | -------------------------------------------------------------------------------- /benchllm/__init__.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | from pathlib import Path 3 | from typing import Callable, Type, TypeVar 4 | 5 | from .data_types import Evaluation, Prediction, Test # noqa 6 | from .evaluator import ( # noqa 7 | EmbeddingEvaluator, 8 | Evaluator, 9 | SemanticEvaluator, 10 | StringMatchEvaluator, 11 | ) 12 | from .input_types import ChatInput, SimilarityInput # noqa 13 | from .similarity import semantically_similar # noqa 14 | from .singleton import TestSingleton # noqa 15 | from .tester import Tester # noqa 16 | 17 | T = TypeVar("T") 18 | 19 | __all__ = [ 20 | "test", 21 | "Tester", 22 | "Prediction", 23 | "Test", 24 | "Evaluation", 25 | "StringMatchEvaluator", 26 | "SemanticEvaluator", 27 | "Evaluator", 28 | "EmbeddingEvaluator", 29 | ] 30 | 31 | 32 | def test_wrapper(func: Callable[[T], str], input_type: Type[T], suite: Path) -> None: 33 | test_singleton = TestSingleton() 34 | test_singleton.register(func, input_type=input_type, suite=suite) 35 | 36 | 37 | def test(*, suite: str = ".") -> Callable[[Callable[[T], str]], None]: 38 | def test_decorator(func: Callable[[T], str]) -> None: 39 | suite_path = Path(suite) 40 | if not suite_path.is_absolute(): 41 | suite_path = Path(inspect.getfile(func)).parent / suite 42 | type = func.__annotations__.get("input") 43 | if type is None: 44 | raise Exception("Your test function needs to have an input parameter annotated with the input type") 45 | return test_wrapper(func, type, suite_path) 46 | 47 | return test_decorator 48 | -------------------------------------------------------------------------------- /benchllm/cache.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | from typing import Optional 4 | 5 | from pydantic import BaseModel 6 | 7 | from benchllm.data_types import Evaluation, Prediction 8 | from benchllm.evaluator import Evaluator 9 | from benchllm.input_types import Json 10 | from benchllm.listener import EvaluatorListener 11 | 12 | 13 | class MemoryValue(BaseModel): 14 | passed: bool 15 | score: float 16 | 17 | 18 | class MemoryCache(Evaluator): 19 | """Caches the results of the evaluator in memory""" 20 | 21 | def __init__(self, evaluator: Evaluator): 22 | super().__init__(workers=evaluator.workers) 23 | self._data: dict = {} 24 | self._evaluator = evaluator 25 | self._num_cache_misses = 0 26 | self._num_cache_hits = 0 27 | 28 | def _key(self, answer1: Json, answer2: Json) -> str: 29 | key1, key2 = json.dumps([answer1, answer2]), json.dumps([answer2, answer1]) 30 | return key1 if key1 < key2 else key2 31 | 32 | def lookup(self, answer1: Json, answer2: Json) -> Optional[MemoryValue]: 33 | result = self._data.get(self._key(answer1, answer2), None) 34 | if result: 35 | return MemoryValue(**result) 36 | return None 37 | 38 | def store(self, answer1: Json, answer2: Json, value: MemoryValue) -> None: 39 | key = self._key(answer1, answer2) 40 | self._data[key] = value.dict() 41 | 42 | def evaluate_prediction(self, prediction: Prediction) -> list[Evaluator.Candidate]: 43 | uncached_expectations = [] 44 | candidates = [] 45 | for expected in prediction.test.expected: 46 | lookup = self.lookup(expected, prediction.output) 47 | if lookup is None: 48 | uncached_expectations.append(expected) 49 | else: 50 | candidates.append(Evaluator.Candidate(prediction=prediction.output, 
expected=expected, **lookup.dict())) 51 | 52 | # If any of the cached candidates passed, we return them. 53 | if any([candidate.passed for candidate in candidates]): 54 | self._num_cache_hits += 1 55 | return candidates 56 | 57 | # If all expectations were found in the cache but were negative matches, 58 | # we increment the cache hits counter and return None as there's no match. 59 | if not uncached_expectations: 60 | self._num_cache_hits += 1 61 | return candidates 62 | 63 | self._num_cache_misses += 1 64 | # set prediction.test.expected to only the ones that were not cached 65 | prediction = Prediction(**prediction.dict()) 66 | prediction.test.expected = uncached_expectations 67 | candidates = self._evaluator.evaluate_prediction(prediction) 68 | for candidate in candidates: 69 | self.store(candidate.expected, candidate.prediction, MemoryValue(**candidate.dict())) 70 | return candidates 71 | 72 | @property 73 | def num_cache_hits(self) -> int: 74 | return self._num_cache_hits 75 | 76 | @property 77 | def num_cache_misses(self) -> int: 78 | return self._num_cache_misses 79 | 80 | 81 | class FileCache(MemoryCache, EvaluatorListener): 82 | """Caches the results of the evaluator in a json file""" 83 | 84 | def __init__(self, evaluator: Evaluator, path: Path): 85 | super().__init__(evaluator) 86 | self._path = path 87 | self.add_listener(self) 88 | self._load() 89 | 90 | def _load(self) -> None: 91 | if self._path.exists(): 92 | try: 93 | cache = json.loads(self._path.read_text(encoding="UTF-8"), parse_int=str) 94 | if cache["version"] != "1": 95 | raise ValueError("Unsupported cache version") 96 | self._data = cache["entries"] 97 | except Exception: 98 | print(f"Failed to load cache file {self._path}") 99 | self._data = {} 100 | 101 | def _save(self) -> None: 102 | cache = {"entries": self._data, "version": "1"} 103 | self._path.write_text(json.dumps(cache, indent=4), encoding="UTF-8") 104 | 105 | def evaluate_ended(self, evaluations: list[Evaluation]) -> None: 106 | self._save() 107 | -------------------------------------------------------------------------------- /benchllm/cli/__init__.py: -------------------------------------------------------------------------------- 1 | from .commands.add_test import add_test # noqa 2 | from .commands.evaluate import evaluate_predictions # noqa 3 | from .commands.list_tests import list_tests # noqa 4 | from .commands.run_suite import run_suite # noqa 5 | 6 | __all__ = ["add_test", "evaluate_predictions", "list_tests", "run_suite"] 7 | -------------------------------------------------------------------------------- /benchllm/cli/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/v7labs/benchllm/0b9d133a531538270e447b1e7ee1b8bb710a4061/benchllm/cli/commands/__init__.py -------------------------------------------------------------------------------- /benchllm/cli/commands/add_test.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Optional 3 | 4 | import typer 5 | import yaml 6 | 7 | 8 | def add_test(*, input: str, expected: list[str], name: str, overwrite: bool, suite_path: Optional[Path]) -> None: 9 | if suite_path is None: 10 | typer.secho("No default suite was specified.", fg=typer.colors.RED, bold=True) 11 | raise typer.Exit() 12 | 13 | if not suite_path.exists(): 14 | typer.secho("The specified suite does not exist.", fg=typer.colors.RED, bold=True) 15 | raise typer.Exit() 16 | 17 
| test_path = suite_path / f"{name}.yml" 18 | if test_path.exists() and not overwrite: 19 | typer.secho( 20 | f"The test {test_path} already exists. Use --overwrite to overwrite it.", 21 | fg=typer.colors.RED, 22 | bold=True, 23 | ) 24 | raise typer.Exit() 25 | 26 | with open(test_path, "w") as f: 27 | yaml.safe_dump({"input": input, "expected": expected}, f) 28 | typer.secho(f"{test_path} added successfully!", fg=typer.colors.GREEN, bold=True) 29 | -------------------------------------------------------------------------------- /benchllm/cli/commands/evaluate.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from benchllm.cache import FileCache 4 | from benchllm.cli.listener import ReportListener, RichCliListener 5 | from benchllm.cli.utils import add_cache, get_evaluator 6 | from benchllm.utils import find_json_yml_files, load_prediction_files 7 | 8 | 9 | def evaluate_predictions( 10 | file_or_dir: list[Path], model: str, output_dir: Path, workers: int, evaluator_name: str, cache: str 11 | ) -> bool: 12 | files = find_json_yml_files(file_or_dir) 13 | 14 | cli_listener = RichCliListener(root_dir=Path.cwd(), interactive=evaluator_name == "interactive", eval_only=True) 15 | report_listener = ReportListener(output_dir=output_dir) 16 | 17 | load_prediction_files(file_or_dir) 18 | 19 | evaluator = get_evaluator(evaluator_name, model, workers) 20 | evaluator = add_cache(cache, evaluator, output_dir.parent / "cache.json") 21 | 22 | cli_listener.set_evaulator(evaluator) 23 | 24 | evaluator.add_listener(cli_listener) 25 | evaluator.add_listener(report_listener) 26 | for file in files: 27 | evaluator.load_prediction_file(file) 28 | 29 | evaluator.run() 30 | return not evaluator.failed 31 | -------------------------------------------------------------------------------- /benchllm/cli/commands/list_tests.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | from typing import Optional 4 | 5 | import typer 6 | import yaml 7 | from rich.console import Console 8 | from rich.table import Table 9 | 10 | 11 | def list_tests(*, suite_path: Optional[Path]) -> None: 12 | if suite_path is None: 13 | typer.secho("No default suite was specified.", fg=typer.colors.RED, bold=True) 14 | raise typer.Exit() 15 | 16 | if not suite_path.exists(): 17 | typer.secho("The specified suite does not exist.", fg=typer.colors.RED, bold=True) 18 | raise typer.Exit() 19 | 20 | console = Console() 21 | 22 | table = Table() 23 | table.add_column("Input") 24 | table.add_column("No.", justify="right") 25 | table.add_column("Expected") 26 | 27 | test_paths = list(suite_path.glob("*.yml")) 28 | for test_path in test_paths: 29 | with open(test_path, "r") as f: 30 | example = yaml.safe_load(f) 31 | for i, expected in enumerate(example["expected"], 1): 32 | if i == 1: 33 | input = json.dumps(example["input"]) 34 | else: 35 | input = "" 36 | table.add_row(input, str(i), json.dumps(expected)) 37 | table.add_section() 38 | 39 | if test_paths: 40 | console.print(table) 41 | else: 42 | typer.secho("No tests found in the specified suite directory.", fg=typer.colors.RED, bold=True) 43 | -------------------------------------------------------------------------------- /benchllm/cli/commands/run_suite.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import typer 4 | 5 | from benchllm.cache import FileCache 6 | from 
benchllm.cli.listener import ReportListener, RichCliListener 7 | from benchllm.cli.utils import add_cache, get_evaluator 8 | from benchllm.tester import Tester 9 | from benchllm.utils import find_files 10 | 11 | 12 | def run_suite( 13 | *, 14 | file_search_paths: list[Path], 15 | model: str, 16 | output_dir: Path, 17 | no_eval: bool, 18 | workers: int, 19 | evaluator_name: str, 20 | retry_count: int, 21 | cache: str, 22 | ) -> bool: 23 | files = find_files(file_search_paths) 24 | if not files: 25 | typer.secho( 26 | f"No python files with @benchllm.test found in {', '.join(map(str, file_search_paths))}", 27 | fg=typer.colors.RED, 28 | bold=True, 29 | ) 30 | return False 31 | 32 | cli_listener = RichCliListener(root_dir=Path.cwd(), interactive=evaluator_name == "interactive", test_only=no_eval) 33 | report_listener = ReportListener(output_dir=output_dir) 34 | 35 | tester = Tester(retry_count=retry_count) 36 | tester.add_listener(cli_listener) 37 | tester.add_listener(report_listener) 38 | 39 | # Load the the python files first, then the tests. 40 | for file in files: 41 | tester.load_module(file) 42 | 43 | # Finally, start collecting the predictions. 44 | tester.run() 45 | 46 | if no_eval: 47 | return True 48 | 49 | evaluator = get_evaluator(evaluator_name, model, workers) 50 | evaluator = add_cache(cache, evaluator, output_dir.parent / "cache.json") 51 | 52 | cli_listener.set_evaulator(evaluator) 53 | 54 | evaluator.add_listener(cli_listener) 55 | evaluator.add_listener(report_listener) 56 | evaluator.load(tester.predictions) 57 | 58 | evaluator.run() 59 | return not evaluator.failed 60 | -------------------------------------------------------------------------------- /benchllm/cli/evaluator/__init__.py: -------------------------------------------------------------------------------- 1 | from benchllm.cli.evaluator.interactive import InteractiveEvaluator # noqa 2 | from benchllm.cli.evaluator.web import WebEvaluator # noqa 3 | -------------------------------------------------------------------------------- /benchllm/cli/evaluator/interactive.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import click 4 | import typer 5 | 6 | from benchllm.data_types import Prediction 7 | from benchllm.evaluator import Evaluator 8 | 9 | 10 | class InteractiveEvaluator(Evaluator): 11 | def evaluate_prediction(self, prediction: Prediction) -> list[Evaluator.Candidate]: 12 | header = ( 13 | f'{typer.style("Does ", bold=True)}' 14 | f"{typer.style(prediction.output, fg=typer.colors.BRIGHT_BLUE, bold=True)}" 15 | f'{typer.style(" match any of the following expected prompts?", bold=True)}' 16 | ) 17 | typer.echo("") 18 | typer.echo(header) 19 | 20 | for i, expected in enumerate(prediction.test.expected, start=1): 21 | typer.secho(f"{i}. 
", fg=typer.colors.BRIGHT_BLUE, bold=True, nl=False) 22 | typer.secho(expected, bold=True) 23 | 24 | options = [str(idx) for idx, _ in enumerate(prediction.test.expected, start=1)] + ["n"] 25 | 26 | prompt_string = f"[{typer.style('matching number', fg=typer.colors.GREEN, bold=True)} or {typer.style('n', fg=typer.colors.RED, bold=True)}]" 27 | click_choice = click.Choice(options) 28 | response = typer.prompt(prompt_string, default="n", type=click_choice, show_choices=False).lower() 29 | if response == "n": 30 | return [ 31 | Evaluator.Candidate(prediction=prediction.output, expected=expected, score=0.0, passed=False) 32 | for expected in prediction.test.expected 33 | ] 34 | return [ 35 | Evaluator.Candidate( 36 | prediction=prediction.output, 37 | expected=prediction.test.expected[int(response) - 1], 38 | score=1.0, 39 | passed=True, 40 | ) 41 | ] 42 | -------------------------------------------------------------------------------- /benchllm/cli/evaluator/web.py: -------------------------------------------------------------------------------- 1 | import signal 2 | from typing import Optional 3 | 4 | import typer 5 | from pywebio import session 6 | from pywebio.input import radio 7 | from pywebio.output import put_markdown 8 | 9 | from benchllm.data_types import Prediction 10 | from benchllm.evaluator import Evaluator 11 | 12 | 13 | class WebEvaluator(Evaluator): 14 | def __init__(self) -> None: 15 | super().__init__(workers=1) 16 | 17 | @session.defer_call 18 | def on_close() -> None: 19 | print("shutting down") 20 | typer.secho( 21 | f"The evaluation was interrupted. Run bench eval to start again", fg=typer.colors.RED, bold=True 22 | ) 23 | # sys.exit doesn't work here, so we have to raise a signal to kill the process 24 | signal.raise_signal(signal.SIGINT) 25 | 26 | put_markdown("# BenchLLM Web Evaluator") 27 | 28 | def evaluate_prediction(self, prediction: Prediction) -> list[Evaluator.Candidate]: 29 | test_name = prediction.test.file_path or prediction.test.id 30 | 31 | put_markdown(f"## {test_name}") 32 | put_markdown(f"*Question*: `{prediction.test.input}`") 33 | put_markdown(f"*Prediction*: `{prediction.output}`") 34 | 35 | table = [["Question:", f"{prediction.test.input}", ""], ["Prediction:", prediction.output], ""] 36 | label = f"Question: {prediction.test.input}Prediction: {prediction.output}" 37 | 38 | options: list[dict[str, Optional[int | str]]] = [ 39 | {"label": expected, "value": idx} for idx, expected in enumerate(prediction.test.expected) 40 | ] 41 | options.append({"label": "None", "value": None, "selected": True}) 42 | answer = radio("Pick the matching answer", options=options, required=True) 43 | 44 | if answer and isinstance(answer, int): 45 | return [ 46 | Evaluator.Candidate( 47 | prediction=prediction.output, expected=prediction.test.expected[answer], score=1.0, passed=True 48 | ) 49 | ] 50 | else: 51 | return [ 52 | Evaluator.Candidate(prediction=prediction.output, expected=expected, score=0.0, passed=False) 53 | for expected in prediction.test.expected 54 | ] 55 | -------------------------------------------------------------------------------- /benchllm/cli/listener.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import json 3 | from pathlib import Path 4 | from typing import Optional 5 | 6 | import typer 7 | from rich import print 8 | from rich.console import Console 9 | from rich.markup import render 10 | from rich.table import Table 11 | 12 | from benchllm.cache import MemoryCache 13 | from 
benchllm.data_types import ( 14 | CallErrorType, 15 | Evaluation, 16 | FunctionID, 17 | Prediction, 18 | Test, 19 | TestFunction, 20 | ) 21 | from benchllm.evaluator import Evaluator 22 | from benchllm.listener import EvaluatorListener, TesterListener 23 | from benchllm.utils import collect_call_errors 24 | 25 | 26 | class ReportListener(TesterListener, EvaluatorListener): 27 | def __init__(self, *, output_dir: Path) -> None: 28 | super().__init__() 29 | self.output_dir = output_dir 30 | 31 | def test_ended(self, prediction: Prediction) -> None: 32 | path = self.output_dir / "predictions" / f"{prediction.test.id}.json" 33 | path.parent.mkdir(parents=True, exist_ok=True) 34 | 35 | with open(path, "w") as f: 36 | json.dump(json.loads(prediction.json()), f, indent=2) 37 | 38 | def evaluate_prediction_ended(self, evaluation: Evaluation) -> None: 39 | evaluation_json = json.loads(evaluation.json()) 40 | prediction_json = evaluation_json.pop("prediction") 41 | prediction_json["evaluation"] = evaluation_json 42 | 43 | path = self.output_dir / "evaluations" / f"{evaluation.prediction.test.id}.json" 44 | path.parent.mkdir(parents=True, exist_ok=True) 45 | 46 | with open(path, "w") as f: 47 | json.dump(prediction_json, f, indent=2) 48 | 49 | 50 | class RichCliListener(TesterListener, EvaluatorListener): 51 | def __init__( 52 | self, 53 | root_dir: Path, 54 | *, 55 | interactive: bool, 56 | test_only: bool = False, 57 | eval_only: bool = False, 58 | ) -> None: 59 | super().__init__() 60 | self.root_dir = root_dir 61 | self.interactive = interactive 62 | self._eval_only = eval_only 63 | self._test_only = test_only 64 | self._evaluator: Optional[Evaluator] = None 65 | 66 | def set_evaulator(self, evaluator: Evaluator) -> None: 67 | self._evaluator = evaluator 68 | 69 | def test_run_started(self) -> None: 70 | print_centered(" Run Tests ") 71 | 72 | def test_run_ended(self, predications: list[Prediction]) -> None: 73 | if not self._test_only: 74 | return 75 | total_test_time = sum(prediction.time_elapsed for prediction in predications) or 0.0 76 | tmp = f" [green]{len(predications)} tests[/green], in [blue]{format_time(total_test_time)}[/blue] " 77 | print_centered(tmp) 78 | 79 | def test_function_started(self, test_function: TestFunction) -> None: 80 | typer.echo(f"{test_function.function_id.relative_str(self.root_dir)} ", nl=False) 81 | 82 | def test_function_ended(self) -> None: 83 | typer.echo("") 84 | 85 | def test_started(self, test: Test) -> None: 86 | pass 87 | 88 | def test_ended(self, prediction: Prediction) -> None: 89 | typer.secho(".", fg=typer.colors.GREEN, bold=True, nl=False) 90 | 91 | def test_skipped(self, test: Test, error: bool = False) -> None: 92 | if error: 93 | typer.secho("E", fg=typer.colors.RED, bold=True, nl=False) 94 | else: 95 | typer.secho("s", fg=typer.colors.YELLOW, bold=True, nl=False) 96 | 97 | def evaluate_started(self) -> None: 98 | print_centered(" Evaluate Tests ") 99 | 100 | def evaluate_module_started(self, function_id: FunctionID) -> None: 101 | typer.echo(f"{function_id.relative_str(self.root_dir)} ", nl=False) 102 | 103 | def evaluate_module_ended(self) -> None: 104 | typer.echo("") 105 | 106 | def evaluate_prediction_started(self, prediction: Prediction) -> None: 107 | pass 108 | 109 | def evaluate_prediction_ended(self, evaluation: Evaluation) -> None: 110 | if self.interactive: 111 | return 112 | 113 | if evaluation.passed: 114 | typer.secho(".", fg=typer.colors.GREEN, bold=True, nl=False) 115 | else: 116 | typer.secho("F", fg=typer.colors.RED, bold=True, 
nl=False) 117 | 118 | def handle_call_error(self, evaluations) -> None: 119 | predictions_with_calls = [ 120 | evaluation.prediction for evaluation in evaluations if evaluation.prediction.test.calls 121 | ] 122 | if not predictions_with_calls: 123 | return 124 | 125 | print_centered(" Call Warnings ") 126 | 127 | for prediction in predictions_with_calls: 128 | errors = collect_call_errors(prediction) 129 | if not errors: 130 | continue 131 | relative_path = prediction.function_id.relative_str(self.root_dir) 132 | print_centered(f" [yellow]{relative_path}[/yellow] :: [yellow]{prediction.test.file_path}[/yellow] ", "-") 133 | 134 | for error in errors: 135 | if error.error_type == CallErrorType.MISSING_ARGUMENT: 136 | print( 137 | f'[blue][bold]{error.function_name}[/bold][/blue] was never called with [blue][bold]"{error.argument_name}"[/bold][/blue]' 138 | ) 139 | elif error.error_type == CallErrorType.MISSING_FUNCTION: 140 | print(f"[blue][bold]{error.function_name}[/bold][/blue] was never declared") 141 | elif error.error_type == CallErrorType.VALUE_MISMATCH: 142 | print( 143 | f'[blue][bold]{error.function_name}[/bold][/blue] was called with "{error.argument_name}=[red][bold]{error.actual_value}[/bold][/red]", expected "[green][bold]{error.expected_value}[/bold][/green]"' 144 | ) 145 | 146 | def evaluate_ended(self, evaluations: list[Evaluation]) -> None: 147 | self.handle_call_error(evaluations) 148 | 149 | failed = [evaluation for evaluation in evaluations if not evaluation.passed] 150 | total_test_time = ( 151 | 0.0 if self._eval_only else sum(evaluation.prediction.time_elapsed for evaluation in evaluations) or 0.0 152 | ) 153 | total_eval_time = sum(evaluation.eval_time_elapsed for evaluation in evaluations) or 0.0 154 | if failed: 155 | print_centered(" Failures ") 156 | for failure in failed: 157 | prediction = failure.prediction 158 | relative_path = prediction.function_id.relative_str(self.root_dir) 159 | print_centered( 160 | f" [red]{relative_path}[/red] :: [red]{prediction.test.file_path} ({failure.score:.2f})[/red] ", "-" 161 | ) 162 | 163 | console = Console() 164 | 165 | table = Table(show_header=False, show_lines=True) 166 | table.add_row(f"Input", str(prediction.test.input)) 167 | table.add_row(f"Output", f"[red]{prediction.output}[/red]") 168 | for i, answer in enumerate(prediction.test.expected): 169 | table.add_row(f"Expected #{i+1}", str(answer)) 170 | console.print(table) 171 | 172 | tmp = f" [red]{len(failed)} failed[/red], [green]{len(evaluations) - len(failed)} passed[/green], in [blue]{format_time(total_eval_time + total_test_time)}[/blue] " 173 | if isinstance(self._evaluator, MemoryCache): 174 | tmp += f"(cached hits {self._evaluator.num_cache_hits}, cached misses {self._evaluator.num_cache_misses}) " 175 | 176 | print_centered(tmp) 177 | 178 | 179 | def print_centered(text: str, sep: str = "=") -> None: 180 | console = Console() 181 | terminal_width = console.width 182 | 183 | padding = (terminal_width - len(render(text))) // 2 184 | print(sep * padding, f"[bold]{text}[/bold]", sep * padding, sep="") 185 | 186 | 187 | def format_time(seconds: float) -> str: 188 | delta = datetime.timedelta(seconds=seconds) 189 | if seconds < 1: 190 | milliseconds = int(seconds * 1000) 191 | return f"{milliseconds:.2f}ms" 192 | elif seconds < 60: 193 | return f"{seconds:.2f}s" 194 | else: 195 | return str(delta) 196 | -------------------------------------------------------------------------------- /benchllm/cli/main.py: 
-------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Annotated, Optional 3 | from uuid import uuid4 4 | 5 | import typer 6 | 7 | from benchllm.cli import add_test, evaluate_predictions, list_tests, run_suite 8 | from benchllm.cli.utils import output_dir_factory 9 | 10 | app = typer.Typer(add_completion=False) 11 | 12 | 13 | @app.command(help="Run tests and evaluations.") 14 | def run( 15 | output_dir: Annotated[ 16 | Path, typer.Option(help="Output directory to save evaluation reports into.", default_factory=output_dir_factory) 17 | ], 18 | file_or_dir: Annotated[ 19 | Optional[list[Path]], 20 | typer.Argument( 21 | help="Paths to python files or directories implemented @benchllm.test functions.", 22 | exists=True, 23 | resolve_path=True, 24 | ), 25 | ] = None, 26 | model: Annotated[str, typer.Option(help="Model to use to run the evaluation.")] = "gpt-3", 27 | eval: Annotated[bool, typer.Option(help="Run final evaluation.")] = True, 28 | workers: Annotated[int, typer.Option(help="Number of workers to use to run the evaluation.")] = 1, 29 | retry_count: Annotated[int, typer.Option(help="Rerun tests to spot flaky output")] = 1, 30 | evaluator: Annotated[str, typer.Option(help="Evaluator to use to run the evaluation.")] = "semantic", 31 | cache: Annotated[str, typer.Option(help="Type of cache to use.")] = "file", 32 | ) -> None: 33 | if not file_or_dir: 34 | file_or_dir = [Path.cwd()] 35 | 36 | success = run_suite( 37 | file_search_paths=file_or_dir, 38 | model=model, 39 | output_dir=output_dir, 40 | workers=workers, 41 | evaluator_name=evaluator, 42 | no_eval=not eval, 43 | retry_count=retry_count, 44 | cache=cache, 45 | ) 46 | if not success: 47 | raise typer.Exit(code=1) 48 | 49 | 50 | @app.command(help="Evaluate predictions") 51 | def eval( 52 | file_or_dir: Annotated[ 53 | list[Path], 54 | typer.Argument( 55 | help="Paths to json files or directories containing json files to evaluate.", 56 | exists=True, 57 | resolve_path=True, 58 | ), 59 | ], 60 | output_dir: Annotated[ 61 | Path, typer.Option(help="Output directory to save evaluation reports into.", default_factory=output_dir_factory) 62 | ], 63 | model: Annotated[str, typer.Option(help="Model to use to run the evaluation.")] = "gpt-3", 64 | workers: Annotated[int, typer.Option(help="Number of workers to use to run the evaluation.")] = 1, 65 | evaluator: Annotated[str, typer.Option(help="Evaluator to use to run the evaluation.")] = "semantic", 66 | cache: Annotated[str, typer.Option(help="Type of cache to use.")] = "file", 67 | ) -> None: 68 | success = evaluate_predictions( 69 | file_or_dir=file_or_dir, 70 | model=model, 71 | output_dir=output_dir, 72 | workers=workers, 73 | evaluator_name=evaluator, 74 | cache=cache, 75 | ) 76 | if not success: 77 | raise typer.Exit(code=1) 78 | 79 | 80 | @app.command(help="Add a new test case to a suite.") 81 | def add( 82 | suite_path: Annotated[Optional[Path], typer.Argument(help="Test suite directory.")], 83 | input: Annotated[str, typer.Option(help="Input prompt to send to your model.")], 84 | expected: Annotated[ 85 | list[str], typer.Option(help="Expected output prompt. You can use this option multiple times.") 86 | ], 87 | name: Annotated[ 88 | str, 89 | typer.Option( 90 | help="Name of the test case. 
Generated UUID when not specified.", 91 | default_factory=uuid4, 92 | ), 93 | ], 94 | overwrite: Annotated[bool, typer.Option(help="Overwrite existing test case.")] = False, 95 | ) -> None: 96 | add_test(input=input, expected=expected, name=name, overwrite=overwrite, suite_path=suite_path) 97 | 98 | 99 | @app.command(help="List all tests.") 100 | def tests(suite_path: Annotated[Path, typer.Argument(help="Test suite directory.")]) -> None: 101 | list_tests(suite_path=suite_path) 102 | 103 | 104 | def main() -> None: 105 | app() 106 | 107 | 108 | if __name__ == "__main__": 109 | main() 110 | -------------------------------------------------------------------------------- /benchllm/cli/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from pathlib import Path 3 | 4 | from benchllm.cache import FileCache, MemoryCache 5 | from benchllm.cli.evaluator import InteractiveEvaluator, WebEvaluator 6 | from benchllm.evaluator import ( 7 | EmbeddingEvaluator, 8 | Evaluator, 9 | SemanticEvaluator, 10 | StringMatchEvaluator, 11 | ) 12 | 13 | 14 | def output_dir_factory() -> Path: 15 | timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 16 | output_dir = Path.cwd() / "output" / str(timestamp) 17 | output_dir.mkdir(exist_ok=True, parents=True) 18 | 19 | latest = Path.cwd() / "output" / "latest" 20 | if latest.exists(): 21 | latest.unlink() 22 | latest.symlink_to(output_dir) 23 | return output_dir 24 | 25 | 26 | def get_evaluator(evaluator_name: str, model: str, workers: int) -> Evaluator: 27 | if evaluator_name == "semantic": 28 | return SemanticEvaluator(model=model, workers=workers) 29 | elif evaluator_name == "interactive": 30 | return InteractiveEvaluator() 31 | elif evaluator_name == "string-match": 32 | return StringMatchEvaluator(workers=workers) 33 | elif evaluator_name == "web": 34 | return WebEvaluator() 35 | elif evaluator_name == "embedding": 36 | return EmbeddingEvaluator() 37 | else: 38 | raise ValueError(f"Unknown evaluator {evaluator_name}") 39 | 40 | 41 | def add_cache(cache_name: str, evaluator: Evaluator, cache_path: Path) -> Evaluator: 42 | if cache_name == "file": 43 | return FileCache(evaluator, cache_path) 44 | elif cache_name == "memory": 45 | return MemoryCache(evaluator) 46 | elif cache_name == "none": 47 | return evaluator 48 | else: 49 | raise ValueError(f"Unknown cache {cache_name}, valid values are 'file', 'memory', 'none'") 50 | -------------------------------------------------------------------------------- /benchllm/data_types.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from pathlib import Path 3 | from typing import Any, Callable, Generic, Optional, TypeVar 4 | from uuid import uuid4 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | 9 | class TestCall(BaseModel): 10 | __test__ = False 11 | name: str 12 | arguments: dict[str, Any] 13 | returns: Any 14 | 15 | 16 | class Test(BaseModel): 17 | __test__ = False 18 | id: str = Field(default_factory=lambda: str(uuid4())) 19 | input: Any 20 | expected: list[str] 21 | file_path: Optional[Path] = None 22 | calls: Optional[list[TestCall]] = None 23 | 24 | 25 | class FunctionID(BaseModel): 26 | module_path: Path 27 | line_number: int 28 | name: str 29 | 30 | def __hash__(self) -> int: 31 | return hash((self.module_path, self.line_number)) 32 | 33 | def __str__(self) -> str: 34 | return f"{self.module_path}:{self.line_number} ({self.name})" 35 | 36 | def relative_str(self, root_dir: 
Path) -> str: 37 | try: 38 | return str( 39 | FunctionID( 40 | module_path=self.module_path.relative_to(root_dir), line_number=self.line_number, name=self.name 41 | ) 42 | ) 43 | except ValueError: 44 | # we can't be sure that the module_path loaded from json files is relative to the root_dir 45 | return str(FunctionID(module_path=self.module_path, line_number=self.line_number, name=self.name)) 46 | 47 | @staticmethod 48 | def default() -> "FunctionID": 49 | return FunctionID(module_path=Path(""), line_number=0, name="default") 50 | 51 | 52 | class Prediction(BaseModel): 53 | test: Test 54 | output: str 55 | time_elapsed: float 56 | function_id: FunctionID 57 | calls: dict[str, list[dict[str, Any]]] = {} 58 | 59 | 60 | class CallErrorType(str, Enum): 61 | MISSING_FUNCTION = "Missing function" 62 | MISSING_ARGUMENT = "Missing argument" 63 | VALUE_MISMATCH = "Value mismatch" 64 | 65 | 66 | class CallError(BaseModel): 67 | function_name: str 68 | argument_name: Optional[str] = None 69 | expected_value: Optional[Any] = None 70 | actual_value: Optional[Any] = None 71 | error_type: CallErrorType 72 | 73 | 74 | class Evaluation(BaseModel): 75 | prediction: Prediction 76 | passed: bool 77 | eval_time_elapsed: float 78 | score: float 79 | 80 | 81 | T = TypeVar("T") 82 | 83 | 84 | class TestFunction(BaseModel, Generic[T]): 85 | function: Callable[[T], Any] 86 | function_id: FunctionID 87 | input_type: T 88 | suite: Optional[Path] = None 89 | -------------------------------------------------------------------------------- /benchllm/evaluator/__init__.py: -------------------------------------------------------------------------------- 1 | from benchllm.evaluator.evaluator import Evaluator # noqa 2 | # Adding an empty comment to force import order to avoid circular imports 3 | from benchllm.evaluator.embedding import EmbeddingEvaluator # noqa 4 | from benchllm.evaluator.semantic import SemanticEvaluator # noqa 5 | from benchllm.evaluator.string_match import StringMatchEvaluator # noqa 6 | -------------------------------------------------------------------------------- /benchllm/evaluator/embedding.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import openai 3 | 4 | from benchllm.data_types import Prediction 5 | from benchllm.evaluator import Evaluator 6 | 7 | 8 | class EmbeddingEvaluator(Evaluator): 9 | def __init__(self, *, engine: str = "text-similarity-davinci-001", threshold: float = 0.9, workers: int = 1): 10 | super().__init__(workers=workers) 11 | self._engine = engine 12 | self._threshold = threshold 13 | 14 | def evaluate_prediction(self, prediction: Prediction) -> list[Evaluator.Candidate]: 15 | output_embedding = get_embedding(prediction.output, engine=self._engine) 16 | candidates = [] 17 | for expected in prediction.test.expected: 18 | expected_embedding = get_embedding(expected, engine=self._engine) 19 | similarity = cosine_similarity(output_embedding, expected_embedding) 20 | candidates.append( 21 | Evaluator.Candidate( 22 | prediction=prediction.output, 23 | expected=expected, 24 | score=similarity, 25 | passed=similarity > self._threshold, 26 | ) 27 | ) 28 | return candidates 29 | 30 | 31 | # these also exist in openai.embeddings_utils but have additional dependencies 32 | def get_embedding(text: str, engine: str, **kwargs) -> list[float]: 33 | text = text.replace("\n", " ") 34 | return openai.Embedding.create(input=[text], engine=engine, **kwargs)["data"][0]["embedding"] 35 | 36 | 37 | def cosine_similarity(a, b): 38 | 
return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)) 39 | -------------------------------------------------------------------------------- /benchllm/evaluator/evaluator.py: -------------------------------------------------------------------------------- 1 | import json 2 | from abc import ABC, abstractmethod 3 | from concurrent.futures import ThreadPoolExecutor 4 | from itertools import groupby 5 | from operator import attrgetter 6 | from pathlib import Path 7 | from timeit import default_timer as timer 8 | from typing import Optional 9 | 10 | import yaml 11 | from pydantic import BaseModel 12 | 13 | from benchllm.data_types import Evaluation, FunctionID, Prediction 14 | from benchllm.input_types import Json 15 | from benchllm.listener import EvaluatorListener 16 | 17 | 18 | class Evaluator(ABC): 19 | def __init__(self, workers: int = 1): 20 | self._predictions: list[Prediction] = [] 21 | self._listeners: list[EvaluatorListener] = [] 22 | self._evaluations: list[Evaluation] = [] 23 | self._workers: int = workers 24 | 25 | class Candidate(BaseModel): 26 | prediction: Json 27 | expected: Json 28 | score: float 29 | passed: bool 30 | 31 | def add_listener(self, listener: EvaluatorListener) -> None: 32 | self._listeners.append(listener) 33 | 34 | def load(self, predictions: list[Prediction]) -> None: 35 | self._predictions.extend(predictions) 36 | 37 | def load_prediction_file(self, path: Path) -> None: 38 | if path.suffix == ".yml" or path.suffix == ".yaml": 39 | data = yaml.safe_load(path.read_bytes()) 40 | self.load([Prediction(**data)]) 41 | elif path.suffix == ".json": 42 | data = json.loads(path.read_text(encoding="UTF-8")) 43 | self.load([Prediction(**data)]) 44 | 45 | def run(self) -> list[Evaluation]: 46 | self._broadcast_evaluate_started() 47 | sorted_predictions = sorted(self._predictions, key=lambda x: str(x.function_id)) 48 | grouped_predictions_by_function = [ 49 | (function, list(group)) for function, group in groupby(sorted_predictions, key=attrgetter("function_id")) 50 | ] 51 | with ThreadPoolExecutor(max_workers=self._workers) as executor: 52 | for function, predictions in grouped_predictions_by_function: 53 | self._broadcast_evaluate_module_started(function) 54 | for evaluation in executor.map(self._run_evaluation, predictions): 55 | self._evaluations.append(evaluation) 56 | self._broadcast_evaluate_module_ended() 57 | self._broadcast_evaluate_ended(self._evaluations) 58 | return self._evaluations 59 | 60 | def _run_evaluation(self, prediction: Prediction) -> Evaluation: 61 | self._broadcast_evaluate_prediction_started(prediction) 62 | start = timer() 63 | candidates = self.evaluate_prediction(prediction) 64 | end = timer() 65 | 66 | evaluation = Evaluation( 67 | prediction=prediction, 68 | passed=any([candidate.passed for candidate in candidates]), 69 | eval_time_elapsed=end - start, 70 | score=max([candidate.score for candidate in candidates], default=0.0), 71 | ) 72 | self._broadcast_evaluate_prediction_ended(evaluation) 73 | return evaluation 74 | 75 | @property 76 | def passed(self) -> list[Evaluation]: 77 | return [evaluation for evaluation in self._evaluations if evaluation.passed] 78 | 79 | @property 80 | def failed(self) -> list[Evaluation]: 81 | return [evaluation for evaluation in self._evaluations if not evaluation.passed] 82 | 83 | @property 84 | def evaluations(self) -> list[Evaluation]: 85 | return self._evaluations 86 | 87 | @property 88 | def workers(self) -> int: 89 | return self._workers 90 | 91 | @property 92 | def predictions(self) -> 
list[Prediction]: 93 | return self._predictions 94 | 95 | @abstractmethod 96 | def evaluate_prediction(self, prediction: Prediction) -> list[Candidate]: 97 | """Evaluate a single prediction, return a Match if the prediction matches the expected output.""" 98 | pass 99 | 100 | def max_threads(self) -> int: 101 | return 1 102 | 103 | def _broadcast_evaluate_started(self) -> None: 104 | for listener in self._listeners: 105 | listener.evaluate_started() 106 | 107 | def _broadcast_evaluate_prediction_started(self, prediction: Prediction) -> None: 108 | for listener in self._listeners: 109 | listener.evaluate_prediction_started(prediction) 110 | 111 | def _broadcast_evaluate_prediction_ended(self, evaluation: Evaluation) -> None: 112 | for listener in self._listeners: 113 | listener.evaluate_prediction_ended(evaluation) 114 | 115 | def _broadcast_evaluate_module_started(self, function_id: FunctionID) -> None: 116 | for listener in self._listeners: 117 | listener.evaluate_module_started(function_id) 118 | 119 | def _broadcast_evaluate_module_ended(self) -> None: 120 | for listener in self._listeners: 121 | listener.evaluate_module_ended() 122 | 123 | def _broadcast_evaluate_ended(self, evaluations: list[Evaluation]) -> None: 124 | for listener in self._listeners: 125 | listener.evaluate_ended(evaluations) 126 | -------------------------------------------------------------------------------- /benchllm/evaluator/semantic.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from benchllm.data_types import Prediction 4 | from benchllm.evaluator import Evaluator 5 | from benchllm.similarity import semantically_similar 6 | 7 | 8 | class SemanticEvaluator(Evaluator): 9 | def __init__(self, *, model: str = "gpt-3", workers: int = 1, early_quitting: bool = True): 10 | super().__init__(workers=workers) 11 | self.model = model 12 | self.early_quitting = early_quitting 13 | 14 | def evaluate_prediction(self, prediction: Prediction) -> list[Evaluator.Candidate]: 15 | candidates = [] 16 | for expected in prediction.test.expected: 17 | if semantically_similar(expected, prediction.output, model=self.model): 18 | candidate = Evaluator.Candidate(prediction=prediction.output, expected=expected, score=1.0, passed=True) 19 | if self.early_quitting: 20 | return [candidate] 21 | else: 22 | candidates.append(candidate) 23 | else: 24 | candidates.append( 25 | Evaluator.Candidate(prediction=prediction.output, expected=expected, score=0.0, passed=False) 26 | ) 27 | return candidates 28 | -------------------------------------------------------------------------------- /benchllm/evaluator/string_match.py: -------------------------------------------------------------------------------- 1 | from benchllm.data_types import Prediction 2 | from benchllm.evaluator import Evaluator 3 | 4 | 5 | class StringMatchEvaluator(Evaluator): 6 | def __init__(self, *, case_sensitive: bool = False, fuzzy: bool = False, workers: int = 1): 7 | super().__init__(workers=workers) 8 | 9 | self._case_sensitive = case_sensitive 10 | self._fuzzy = fuzzy 11 | 12 | def match_strings(self, expected: str, output: str) -> bool: 13 | if not self._case_sensitive: 14 | expected = expected.lower() 15 | output = output.lower() 16 | 17 | if self._fuzzy: 18 | return expected in output or output in expected 19 | 20 | return expected == output 21 | 22 | def evaluate_prediction(self, prediction: Prediction) -> list[Evaluator.Candidate]: 23 | output = prediction.output 24 | candidates = [] 25 | 
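# One Candidate is produced per expected answer; Evaluator._run_evaluation marks
# the whole prediction as passed if any single candidate passes. A minimal usage
# sketch (values are hypothetical, mirroring examples/qa/script.py):
#   evaluator = StringMatchEvaluator(fuzzy=True)
#   evaluator.load(predictions)    # predictions produced by Tester.run()
#   evaluations = evaluator.run()  # list[Evaluation]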
for expected in prediction.test.expected: 26 | if self.match_strings(expected, output): 27 | candidates.append(Evaluator.Candidate(prediction=output, expected=expected, score=1.0, passed=True)) 28 | else: 29 | candidates.append(Evaluator.Candidate(prediction=output, expected=expected, score=0.0, passed=False)) 30 | return candidates 31 | -------------------------------------------------------------------------------- /benchllm/input_types.py: -------------------------------------------------------------------------------- 1 | from typing import TypedDict, Union 2 | 3 | from pydantic import BaseModel 4 | 5 | Json = Union[str, bool, list, dict] 6 | 7 | 8 | class ChatInputItem(TypedDict): 9 | role: str 10 | content: str 11 | 12 | 13 | ChatInput = list[ChatInputItem] 14 | 15 | 16 | class SimilarityInput(BaseModel): 17 | prompt_1: str 18 | prompt_2: str 19 | -------------------------------------------------------------------------------- /benchllm/listener.py: -------------------------------------------------------------------------------- 1 | from .data_types import Evaluation, FunctionID, Prediction, Test, TestFunction 2 | 3 | 4 | class TesterListener: 5 | def test_run_started(self) -> None: 6 | pass 7 | 8 | def test_run_ended(self, predications: list[Prediction]) -> None: 9 | pass 10 | 11 | def test_function_started(self, test_function: TestFunction) -> None: 12 | pass 13 | 14 | def test_function_ended(self) -> None: 15 | pass 16 | 17 | def test_started(self, test: Test) -> None: 18 | pass 19 | 20 | def test_ended(self, prediction: Prediction) -> None: 21 | pass 22 | 23 | def test_skipped(self, test: Test, error: bool = False) -> None: 24 | pass 25 | 26 | 27 | class EvaluatorListener: 28 | def evaluate_started(self) -> None: 29 | pass 30 | 31 | def evaluate_prediction_started(self, prediction: Prediction) -> None: 32 | pass 33 | 34 | def evaluate_prediction_ended(self, evaluation: Evaluation) -> None: 35 | pass 36 | 37 | def evaluate_module_started(self, function_id: FunctionID) -> None: 38 | pass 39 | 40 | def evaluate_module_ended(self) -> None: 41 | pass 42 | 43 | def evaluate_ended(self, evaluations: list[Evaluation]) -> None: 44 | pass 45 | -------------------------------------------------------------------------------- /benchllm/similarity.py: -------------------------------------------------------------------------------- 1 | import openai 2 | 3 | 4 | def completion_func(prompt: str) -> str: 5 | response = openai.Completion.create( 6 | prompt=prompt, engine="text-davinci-003", max_tokens=100, temperature=0.7, n=1, stop=None 7 | ) 8 | return response.choices[0].text.strip() 9 | 10 | 11 | def chat_completion_func(prompt: str, *, model: str) -> str: 12 | response = openai.ChatCompletion.create( 13 | model=model, messages=[{"role": "user", "content": prompt}], max_tokens=100, temperature=0.7, n=1, stop=None 14 | ) 15 | return response.choices[0].message.content.strip() 16 | 17 | 18 | def complete_text(prompt: str, *, model: str) -> str: 19 | full_prompt = f""" 20 | You will get two anwsers to a question, you should determine if they are semantically similar or not. 21 | You can only answer "same" or "different", nothing else. 
22 | 23 | input: {{ 24 | "answer_1": "I was created by X", 25 | "answer_2": "X created me" 26 | }} 27 | output: same 28 | 29 | input: {{ 30 | "answer_1": "There are 52 days in a year", 31 | "answer_2": "A year is fairly long" 32 | }} 33 | output: different 34 | 35 | input: {prompt} 36 | output:""" 37 | 38 | model_func = completion_func if model == "gpt-3" else lambda prompt: chat_completion_func(prompt, model=model) 39 | return model_func(prompt=full_prompt) 40 | 41 | 42 | def semantically_similar(answer1: str, answer2: str, model: str = "gpt-3") -> bool: 43 | response = complete_text( 44 | f"""{{ 45 | "answer_1": "{answer1}", 46 | "answer_2": "{answer2}" 47 | }}""", 48 | model=model, 49 | ) 50 | if response not in ["same", "different"]: 51 | raise ValueError(f"Unexpected response: {response}") 52 | return response == "same" 53 | -------------------------------------------------------------------------------- /benchllm/singleton.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any, Callable, Generic, Type, TypeVar 3 | 4 | from pydantic import BaseModel 5 | 6 | T = TypeVar("T") 7 | 8 | 9 | class FunctionRegistry(BaseModel, Generic[T]): 10 | func: Callable[[T], T] 11 | type: Any 12 | suite: Path 13 | 14 | 15 | class TestSingleton(Generic[T]): 16 | _instance = None 17 | functions: list[FunctionRegistry[T]] = [] 18 | 19 | def __new__(cls: Type["TestSingleton"], *args: list, **kwargs: dict) -> "TestSingleton": 20 | if not cls._instance: 21 | cls._instance = super().__new__(cls) 22 | return cls._instance 23 | 24 | def register(self, func: Callable[[T], T], input_type: Type[T], suite: Path) -> None: 25 | self.functions.append(FunctionRegistry(func=func, type=input_type, suite=suite)) 26 | 27 | def clear(self) -> None: 28 | self.functions = [] 29 | -------------------------------------------------------------------------------- /benchllm/tester.py: -------------------------------------------------------------------------------- 1 | import importlib.util 2 | import inspect 3 | import json 4 | import sys 5 | import uuid 6 | from contextlib import contextmanager 7 | from pathlib import Path 8 | from timeit import default_timer as timer 9 | from types import ModuleType 10 | from typing import Any, Callable, Iterator, Optional, Union 11 | 12 | import yaml 13 | from pydantic import ValidationError, parse_obj_as 14 | 15 | from .data_types import FunctionID, Prediction, Test, TestFunction 16 | from .listener import TesterListener 17 | from .singleton import TestSingleton 18 | 19 | CallableTest = Union[TestFunction, Callable[[Any], Any]] 20 | 21 | 22 | class Tester: 23 | __test__ = False 24 | 25 | def __init__(self, test_function: Optional[CallableTest] = None, *, retry_count: int = 1) -> None: 26 | self._tests: dict[FunctionID, list[Test]] = {} 27 | self._test_functions: dict[FunctionID, TestFunction] = {} 28 | self._listeners: list[TesterListener] = [] 29 | self._predictions: list[Prediction] = [] 30 | self._retry_count = retry_count 31 | 32 | if test_function: 33 | self.add_test_function(test_function=test_function) 34 | 35 | def add_listener(self, listener: TesterListener) -> None: 36 | self._listeners.append(listener) 37 | 38 | def add_tests(self, tests: list[Test], function_id: FunctionID = FunctionID.default()) -> None: 39 | self._tests.setdefault(function_id, []).extend(tests) 40 | 41 | def add_test(self, test: Test, function_id: FunctionID = FunctionID.default()) -> None: 42 | 
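# Tests are grouped by FunctionID so that run() can later pair every suite of
# tests with the test function registered under the same id.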
self._tests.setdefault(function_id, []).append(test) 43 | 44 | def add_test_function(self, test_function: CallableTest) -> None: 45 | """Adds a test function to the tester, either a TestFunction or a Callable, Callables will get a default FunctionID""" 46 | if isinstance(test_function, TestFunction): 47 | self._test_functions[test_function.function_id] = test_function 48 | return 49 | self.add_test_function(TestFunction(function=test_function, function_id=FunctionID.default(), input_type=Any)) 50 | 51 | def load_tests(self, suite: Path, function_id: FunctionID) -> None: 52 | if self._test_functions.get(function_id) is None: 53 | raise Exception(f"No test function loaded for module {function_id}") 54 | 55 | for test in load_files(suite): 56 | self.add_test(test, function_id) 57 | 58 | def load_module(self, path: Union[str, Path]) -> None: 59 | path = Path(path) 60 | test_singleton = TestSingleton() 61 | test_singleton.clear() 62 | 63 | import_module_from_file(path) 64 | 65 | if not test_singleton.functions: 66 | raise NoBenchLLMTestFunction() 67 | 68 | for function in test_singleton.functions: 69 | function_id = FunctionID( 70 | module_path=path, 71 | line_number=inspect.getsourcelines(function.func)[1], 72 | name=function.func.__name__, 73 | ) 74 | 75 | self.add_test_function( 76 | TestFunction( 77 | function=function.func, 78 | function_id=function_id, 79 | input_type=function.type, 80 | suite=function.suite, 81 | ) 82 | ) 83 | self.load_tests(function.suite, function_id) 84 | 85 | def run(self) -> list[Prediction]: 86 | """Runs each test through the test function and stores the result""" 87 | 88 | self._broadcast_test_run_started() 89 | 90 | if not self._test_functions: 91 | raise Exception("No function loaded, run load_module() first") 92 | 93 | if not self._tests: 94 | raise Exception("No tests loaded, run load_tests() first") 95 | 96 | for test_function in self._test_functions.values(): 97 | self._broadcast_test_function_started(test_function) 98 | for test in self._tests.get(test_function.function_id, []): 99 | for _ in range(self._retry_count): 100 | # Checks that the arity of the function matches the number of inputs. 101 | if "__annotations__" in dir(test_function.input_type): 102 | if len(test.input) != len(test_function.input_type.__annotations__): 103 | raise Exception( 104 | f"Your test function needs to have an input parameter annotated with the input type, {test.input}\n\n{test_function.input_type.__annotations__}" 105 | ) 106 | 107 | # Now, try to parse the input. If we fail, we will skip the test. 
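# parse_obj_as validates the raw YAML/JSON input against the function's declared
# input type (e.g. str, ChatInput or SimilarityInput); a ValidationError means the
# test file doesn't match that type and the test is skipped with error=True.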
108 | try: 109 | input = parse_obj_as(test_function.input_type, test.input) 110 | except ValidationError: 111 | self._broadcast_test_skipped(test, error=True) 112 | continue 113 | 114 | self._broadcast_test_started(test) 115 | start = timer() 116 | 117 | # set up mock functions for the test calls 118 | calls_made: dict[str, Any] = {} 119 | with setup_mocks(test, calls_made): 120 | output = test_function.function(input) 121 | 122 | end = timer() 123 | prediction = Prediction( 124 | test=test, 125 | output=output, 126 | time_elapsed=end - start, 127 | function_id=test_function.function_id, 128 | calls=calls_made, 129 | ) 130 | self._predictions.append(prediction) 131 | self._broadcast_test_ended(prediction) 132 | self._broadcast_test_function_ended() 133 | self._broadcast_test_run_ended(self._predictions) 134 | return self._predictions 135 | 136 | @property 137 | def predictions(self) -> list[Prediction]: 138 | return self._predictions 139 | 140 | def tests(self, function_id: FunctionID = FunctionID.default()) -> list[Test]: 141 | return self._tests.get(function_id, []) 142 | 143 | def _broadcast_test_run_started(self) -> None: 144 | for listener in self._listeners: 145 | listener.test_run_started() 146 | 147 | def _broadcast_test_run_ended(self, predications: list[Prediction]) -> None: 148 | for listener in self._listeners: 149 | listener.test_run_ended(predications) 150 | 151 | def _broadcast_test_function_started(self, test_function: TestFunction) -> None: 152 | for listener in self._listeners: 153 | listener.test_function_started(test_function) 154 | 155 | def _broadcast_test_function_ended(self) -> None: 156 | for listener in self._listeners: 157 | listener.test_function_ended() 158 | 159 | def _broadcast_test_started(self, test: Test) -> None: 160 | for listener in self._listeners: 161 | listener.test_started(test) 162 | 163 | def _broadcast_test_ended(self, prediction: Prediction) -> None: 164 | for listener in self._listeners: 165 | listener.test_ended(prediction) 166 | 167 | def _broadcast_test_skipped(self, test: Test, error: bool = False) -> None: 168 | for listener in self._listeners: 169 | listener.test_skipped(test, error) 170 | 171 | 172 | def load_files(directory: Union[str, Path]) -> list[Test]: 173 | directory_path = Path(directory) 174 | tests = [] 175 | for file_path in directory_path.rglob("*"): 176 | if not file_path.is_file(): 177 | continue 178 | if file_path.suffix not in {".json", ".yml", ".yaml"}: 179 | continue 180 | with open(file_path, "r") as file: 181 | if file_path.suffix == ".json": 182 | data = json.load(file) 183 | elif file_path.suffix in {".yml", ".yaml"}: 184 | data = yaml.safe_load(file) 185 | try: 186 | test = init_test({**data, **{"file_path": file_path}}) 187 | except ValidationError: 188 | raise TestLoadException(file_path, "failed to parse your test file") from None 189 | tests.append(test) 190 | return tests 191 | 192 | 193 | def init_test(data: dict) -> Test: 194 | if "id" not in data: 195 | data["id"] = str(uuid.uuid4()) 196 | 197 | dump_data = {k: v for k, v in data.items() if k != "file_path"} 198 | with open(data["file_path"], "w") as f: 199 | if data["file_path"].suffix == ".json": 200 | json.dump(dump_data, f, indent=2, sort_keys=True) 201 | elif data["file_path"].suffix in {".yml", ".yaml"}: 202 | yaml.safe_dump(dump_data, f, indent=2) 203 | 204 | return Test(**data) 205 | 206 | 207 | class TestLoadException(Exception): 208 | def __init__(self, file_path: Path, error_message: str) -> None: 209 | self.file_path = file_path 210 | 
self.error_message = error_message 211 | 212 | def __str__(self) -> str: 213 | return f"Failed to load '{self.file_path}'\n{self.error_message}" 214 | 215 | 216 | class NoBenchLLMTestFunction(Exception): 217 | pass 218 | 219 | 220 | def import_module_from_file(file_path: Path) -> ModuleType: 221 | # Make sure the file exists. 222 | if not file_path.exists(): 223 | raise FileNotFoundError(f"File not found: {file_path}") 224 | 225 | # Get the module name from the file path (remove the .py extension). 226 | module_name = file_path.stem 227 | 228 | # Create a module specification from the file path. 229 | spec = importlib.util.spec_from_file_location(module_name, file_path) 230 | 231 | if not spec or not spec.loader: 232 | raise Exception(f"Failed to create module specification from {file_path}") 233 | 234 | # Create a new module based on the spec. 235 | module = importlib.util.module_from_spec(spec) 236 | 237 | if not module: 238 | raise Exception(f"Failed to load module from {file_path}") 239 | 240 | # Temporarly add the directory of the file to the system path so that the module can import other modules. 241 | file_module = file_path.resolve().parent 242 | old_sys_path = sys.path.copy() 243 | sys.path.append(str(file_module)) 244 | 245 | # Execute the module. 246 | spec.loader.exec_module(module) 247 | 248 | # Restore the system path 249 | sys.path = old_sys_path 250 | 251 | # Return the module. 252 | return module 253 | 254 | 255 | @contextmanager 256 | def setup_mocks(test: Test, calls_made: dict[str, Any]) -> Iterator[None]: 257 | """Sets up mock functions for the test calls""" 258 | old_functions = [] 259 | for call in test.calls or []: 260 | mock_name = call.name 261 | module_name, function_name = mock_name.rsplit(".", 1) 262 | # we need to import the module before we can mock the function 263 | module = importlib.import_module(module_name) 264 | old_functions.append((module, function_name, getattr(module, function_name))) 265 | 266 | def mock_function(*args: tuple, **kwargs: dict[str, Any]) -> Any: 267 | assert not args, "Positional arguments are not supported" 268 | if mock_name not in calls_made: 269 | calls_made[mock_name] = [] 270 | calls_made[mock_name].append(kwargs) 271 | return call.returns 272 | 273 | try: 274 | setattr(module, function_name, mock_function) 275 | except AttributeError: 276 | print(f"Function {function_name} doesn't exist in module {module_name}") 277 | 278 | try: 279 | yield 280 | finally: 281 | # restore the old function 282 | for old_function in old_functions: 283 | setattr(old_function[0], old_function[1], old_function[2]) 284 | -------------------------------------------------------------------------------- /benchllm/utils.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import json 3 | from pathlib import Path 4 | 5 | import yaml 6 | 7 | from benchllm.data_types import CallError, CallErrorType, Prediction 8 | 9 | 10 | class DecoratorFinder(ast.NodeVisitor): 11 | def __init__(self) -> None: 12 | self.has_decorator: bool = False 13 | self.module_aliases: list[str] = [] 14 | 15 | def visit_Import(self, node: ast.Import) -> None: 16 | for alias in node.names: 17 | if alias.name == "benchllm": 18 | self.module_aliases.append(alias.asname or alias.name) 19 | self.generic_visit(node) 20 | 21 | def visit_FunctionDef(self, node: ast.FunctionDef) -> None: 22 | for decorator in node.decorator_list: 23 | if isinstance(decorator, ast.Call) and isinstance(decorator.func, ast.Attribute): 24 | decorator = 
decorator.func 25 | if decorator.attr == "test": 26 | if isinstance(decorator.value, ast.Name) and decorator.value.id in self.module_aliases: 27 | self.has_decorator = True 28 | self.generic_visit(node) 29 | 30 | 31 | def check_file(path: Path) -> bool: 32 | with open(path, "r", encoding="utf8") as f: 33 | tree = ast.parse(f.read()) 34 | finder = DecoratorFinder() 35 | finder.visit(tree) 36 | return finder.has_decorator 37 | 38 | 39 | def find_files(paths: list[Path]) -> list[Path]: 40 | python_files = set() 41 | for path in paths: 42 | if path.suffix == ".py" and not path.name.startswith("."): 43 | if check_file(path): 44 | python_files.add(path) 45 | else: 46 | for file in path.rglob("*.py"): 47 | if file.name.startswith("."): 48 | continue 49 | if check_file(file): 50 | python_files.add(file) 51 | return list(python_files) 52 | 53 | 54 | def find_json_yml_files(paths: list[Path]) -> list[Path]: 55 | files = [] 56 | for path in paths: 57 | if path.is_file(): 58 | if path.suffix in (".yml", ".json", ".yaml"): 59 | files.append(path) 60 | else: 61 | continue 62 | else: 63 | for file in path.rglob("*"): 64 | if file.suffix in (".yml", ".json", ".yaml"): 65 | files.append(file) 66 | return list(set(files)) 67 | 68 | 69 | def load_prediction_files(paths: list[Path]) -> list[Prediction]: 70 | predictions = [] 71 | for path in paths: 72 | for file_path in path.rglob("*"): 73 | if not file_path.is_file(): 74 | continue 75 | if file_path.suffix not in {".json", ".yml", ".yaml"}: 76 | continue 77 | with open(file_path, "r") as file: 78 | if file_path.suffix == ".json": 79 | data = json.load(file) 80 | predictions.append(Prediction(**data)) 81 | elif file_path.suffix in {".yml", ".yaml"}: 82 | data = yaml.safe_load(file) 83 | predictions.append(Prediction(**data)) 84 | return predictions 85 | 86 | 87 | def collect_call_errors(prediction: Prediction) -> list[CallError]: 88 | """Assert that the calls in the prediction match the expected calls.""" 89 | if prediction.test.calls is None: 90 | return [] 91 | errors = [] 92 | lookup = {call.name: call for call in prediction.test.calls} 93 | 94 | for function_name, invocations in prediction.calls.items(): 95 | call = lookup[function_name] 96 | if not call: 97 | errors.append(CallError(function_name=function_name, error_type=CallErrorType.MISSING_FUNCTION)) 98 | continue 99 | 100 | for arguments in invocations: 101 | for argument_name, argument_value in call.arguments.items(): 102 | if argument_name not in arguments: 103 | errors.append( 104 | CallError( 105 | function_name=function_name, 106 | argument_name=argument_name, 107 | error_type=CallErrorType.MISSING_ARGUMENT, 108 | ) 109 | ) 110 | for argument_name, argument_value in arguments.items(): 111 | if argument_name in call.arguments and argument_value != call.arguments[argument_name]: 112 | errors.append( 113 | CallError( 114 | function_name=function_name, 115 | argument_name=argument_name, 116 | expected_value=call.arguments[argument_name], 117 | actual_value=argument_value, 118 | error_type=CallErrorType.VALUE_MISMATCH, 119 | ) 120 | ) 121 | return errors 122 | -------------------------------------------------------------------------------- /examples/chat/1.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - Product Lead 3 | - Andrea is a Product Lead at V7 4 | id: 4fbd341e-c489-4e39-8e24-5cfbac039cae 5 | input: 6 | - content: Who is Andrea? 7 | role: user 8 | - content: As an AI language model, I don't have information on who Andrea is. 
9 | role: assistant 10 | - content: Apologies, I should have let you know that Andrea is Product Lead at V7. 11 | role: user 12 | - content: Thanks, duly noted. 13 | role: assistant 14 | - content: What does Andrea do at V7? Be very concise. 15 | role: user 16 | -------------------------------------------------------------------------------- /examples/chat/2.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - Michael Jordan starred in the movie "Space Jam" 3 | id: 31228a9a-d515-463a-9956-d0f235de8357 4 | input: 5 | - content: Who is Michael Jordan? 6 | role: user 7 | - content: Michael Jordan is an ex NBA player. 8 | role: assistant 9 | - content: What was the team that made him famous? 10 | role: user 11 | - content: The Chicago Bulls 12 | role: assistant 13 | - content: He starred in a famous movie. Which one? 14 | role: user 15 | -------------------------------------------------------------------------------- /examples/chat/eval.py: -------------------------------------------------------------------------------- 1 | import openai 2 | 3 | import benchllm 4 | from benchllm.input_types import ChatInput 5 | 6 | 7 | def chat(messages: ChatInput, model="gpt-3.5-turbo"): 8 | response = openai.ChatCompletion.create(model=model, messages=messages) 9 | return response.choices[0].message.content.strip() 10 | 11 | 12 | @benchllm.test(suite=".") 13 | def gpt_3_5(input: ChatInput): 14 | return chat(input) 15 | 16 | 17 | @benchllm.test(suite=".") 18 | def gpt_4(input: ChatInput): 19 | return chat(input, model="gpt-4") 20 | -------------------------------------------------------------------------------- /examples/langchain_agent_search_calculator/1.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - approximately 38,625,801 3 | id: 41006468-536f-452a-9871-f66c456d0f14 4 | input: How many people live in canada as of 2023? 5 | -------------------------------------------------------------------------------- /examples/langchain_agent_search_calculator/10.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - approximately 0.2791714614499425 3 | id: 4913087a-adc2-429b-9c31-96861ec561bf 4 | input: what is 1213 divided by 4345? 5 | -------------------------------------------------------------------------------- /examples/langchain_agent_search_calculator/2.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - her boyfriend is Romain Gravas. his age raised to the .43 power is approximately 3 | 4.9373857399466665 4 | id: cd67b0fd-9a79-4e6f-a92f-64855b0a5d49 5 | input: who is dua lipa's boyfriend? what is his age raised to the .43 power? 6 | -------------------------------------------------------------------------------- /examples/langchain_agent_search_calculator/3.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - her boyfriend is Romain Gravas. his age raised to the .43 power is approximately 3 | 4.9373857399466665 4 | id: 1d745ec5-20d5-43b7-acc5-026f22a30ea0 5 | input: what is dua lipa's boyfriend age raised to the .43 power? 
6 | -------------------------------------------------------------------------------- /examples/langchain_agent_search_calculator/4.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - approximately 3,435 mi 3 | id: d1a8f38d-304c-4c0f-8c0e-57113a680f6a 4 | input: how far is it from paris to boston in miles 5 | -------------------------------------------------------------------------------- /examples/langchain_agent_search_calculator/5.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - approximately 2.682651500990882 3 | id: 01abd22e-f72d-4851-9e30-b38f73af47ca 4 | input: 5 | what was the total number of points scored in the 2023 super bowl? what is 6 | that number raised to the .23 power? 7 | -------------------------------------------------------------------------------- /examples/langchain_agent_search_calculator/6.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - approximately 2.682651500990882 3 | id: bdad58a1-b79b-4de7-9df6-95f5bf32ba04 4 | input: 5 | what was the total number of points scored in the 2023 super bowl raised to 6 | the .23 power? 7 | -------------------------------------------------------------------------------- /examples/langchain_agent_search_calculator/7.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - "30" 3 | id: e4c807cf-047b-41f1-aced-ac268b1d6ba3 4 | input: 5 | how many more points were scored in the 2023 super bowl than in the 2022 super 6 | bowl? 7 | -------------------------------------------------------------------------------- /examples/langchain_agent_search_calculator/8.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - approximately 1.9347796717823205 3 | id: c6b43b02-0d44-4c0b-b648-d7df2a97230f 4 | input: what is 153 raised to .1312 power? 5 | -------------------------------------------------------------------------------- /examples/langchain_agent_search_calculator/9.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - approximately 1.7589107138176394 3 | id: ffb37883-fee7-4e64-9583-4831dde81057 4 | input: 5 | who is kendall jenner's boyfriend? what is his height (in inches) raised to 6 | .13 power? 7 | -------------------------------------------------------------------------------- /examples/langchain_agent_search_calculator/converter.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import yaml 4 | from langchain.evaluation.loading import load_dataset 5 | 6 | SUITE_NAME = "langchain_agent_search_calculator" 7 | suite_path = Path("examples") / SUITE_NAME 8 | suite_path.mkdir(exist_ok=True, parents=True) 9 | 10 | dataset = load_dataset("agent-search-calculator") 11 | 12 | for i, data in enumerate(dataset, start=1): 13 | # { 14 | # 'steps': [{'tool': 'Search', 'tool_input': 'Population of Canada 2023'}], 15 | # 'answer': 'approximately 38,625,801', 16 | # 'question': 'How many people live in canada as of 2023?' 
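# (only 'answer' and 'question' are carried over below, mapped to the BenchLLM
# 'expected' and 'input' fields; the 'steps' tool trace is dropped)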
17 | # } 18 | with open(suite_path / f"{i}.yml", "w") as fp: 19 | benchllm_dict = {"expected": [data["answer"]], "input": data["question"]} 20 | yaml.safe_dump(benchllm_dict, fp) 21 | -------------------------------------------------------------------------------- /examples/langchain_agent_search_calculator/eval.py: -------------------------------------------------------------------------------- 1 | from langchain.agents import AgentType, initialize_agent, load_tools 2 | from langchain.llms import OpenAI 3 | 4 | # import benchllm 5 | 6 | tools = load_tools(["serpapi", "llm-math"], llm=OpenAI(temperature=0)) 7 | agent = initialize_agent(tools, OpenAI(temperature=0), agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True) 8 | 9 | 10 | # @benchllm.test 11 | def run(input: str): 12 | try: 13 | return agent(input)["output"] 14 | except Exception as e: 15 | return str(e) 16 | -------------------------------------------------------------------------------- /examples/langchain_agent_search_calculator/script.py: -------------------------------------------------------------------------------- 1 | from langchain.agents import AgentType, initialize_agent, load_tools 2 | from langchain.llms import OpenAI 3 | 4 | from benchllm import SemanticEvaluator, Test, Tester 5 | 6 | tools = load_tools(["serpapi", "llm-math"], llm=OpenAI(temperature=0)) 7 | agent = initialize_agent(tools, OpenAI(temperature=0), agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True) 8 | 9 | tests = [Test(input="How many people live in canada as of 2023?", expected=["approximately 38,625,801"])] 10 | 11 | tester = Tester(lambda input: agent(input)["output"]) 12 | tester.add_tests(tests) 13 | predictions = tester.run() 14 | 15 | evaluator = SemanticEvaluator() 16 | evaluator.load(predictions) 17 | report = evaluator.run() 18 | 19 | print(report) 20 | -------------------------------------------------------------------------------- /examples/openai-evals/eval.py: -------------------------------------------------------------------------------- 1 | import openai 2 | 3 | import benchllm 4 | from benchllm.input_types import ChatInput 5 | 6 | 7 | def chat(messages: ChatInput): 8 | messages = [{"role": message.role, "content": message.content} for message in messages] 9 | 10 | response = openai.ChatCompletion.create( 11 | model="gpt-3.5-turbo", 12 | messages=messages, 13 | max_tokens=100, 14 | temperature=0.7, 15 | n=1, 16 | stop=None, 17 | ) 18 | 19 | return response.choices[0].message.content.strip() 20 | 21 | 22 | @benchllm.test(suite=".") 23 | def run(input: ChatInput): 24 | value = chat(input) 25 | return value 26 | -------------------------------------------------------------------------------- /examples/qa/1.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - 2 3 | - 2.0 4 | id: 699e66ad-5018-4b2f-98f6-e6f22324fd7b 5 | input: What's 1+1? Be very terse, only numeric output 6 | -------------------------------------------------------------------------------- /examples/qa/2.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - V7 3 | - I was created by V7 4 | id: 7449d32b-6db1-4d93-8df4-5b70558f7bd4 5 | input: Who created you? 
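# A minimal test file only needs `input` and `expected` (a list of acceptable
# answers); a missing `id` is generated and written back by benchllm.tester.init_test.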
6 | -------------------------------------------------------------------------------- /examples/qa/eval.py: -------------------------------------------------------------------------------- 1 | import openai 2 | 3 | import benchllm 4 | 5 | 6 | def complete_text(prompt): 7 | full_prompt = f""" 8 | You are a friendly AI bot created by V7. You are tasked with answering questions about the world. 9 | Q: {prompt} 10 | A:""" 11 | response = openai.Completion.create( 12 | engine="text-davinci-003", 13 | prompt=full_prompt, 14 | max_tokens=100, 15 | temperature=0.7, 16 | n=1, 17 | stop=None, 18 | ) 19 | return response.choices[0].text.strip() 20 | 21 | 22 | @benchllm.test(suite=".") 23 | def run(input: str): 24 | return complete_text(input) 25 | -------------------------------------------------------------------------------- /examples/qa/script.py: -------------------------------------------------------------------------------- 1 | from benchllm import StringMatchEvaluator, Test, Tester 2 | 3 | tests = [ 4 | Test(input="What's 1+1?", expected=["2", "It's 2"]), 5 | Test(input="What's Obama's first name?", expected=["Barack"]), 6 | ] 7 | 8 | tester = Tester(lambda _: 2) 9 | tester.add_tests(tests) 10 | predictions = tester.run() 11 | 12 | evaluator = StringMatchEvaluator() 13 | evaluator.load(predictions) 14 | report = evaluator.run() 15 | 16 | print(report) 17 | -------------------------------------------------------------------------------- /examples/similarity/benchllm.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - false 3 | id: d1df3377-f58a-4737-ae4c-e97443bc47d0 4 | input: 5 | prompt_1: BenchLLM was developed by V7, a company based in London 6 | prompt_2: BenchLLM was developed by Humanloop 7 | -------------------------------------------------------------------------------- /examples/similarity/eval.py: -------------------------------------------------------------------------------- 1 | import benchllm 2 | from benchllm import SimilarityInput 3 | from benchllm.similarity import semantically_similar 4 | 5 | 6 | @benchllm.test(suite=".") 7 | def run(input: SimilarityInput): 8 | return semantically_similar(input.prompt_1, input.prompt_2) 9 | -------------------------------------------------------------------------------- /examples/similarity/idempotency.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - true 3 | id: 3c4fc7dc-60e4-454f-9487-9833873d25e3 4 | input: 5 | prompt_1: V7 6 | prompt_2: V7 7 | -------------------------------------------------------------------------------- /examples/similarity/location.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - false 3 | id: 26ffcec1-71d0-4eeb-a120-066f733e4be0 4 | input: 5 | prompt_1: The Eiffel Tower is located in London. 6 | prompt_2: The Eiffel Tower is located in Paris. 7 | -------------------------------------------------------------------------------- /examples/similarity/number_detail.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - true 3 | id: 236c701d-0ea5-48e8-8b6a-90a8400f3050 4 | input: 5 | prompt_1: The population of Spain is 47 million. 6 | prompt_2: 47.42 million people live in Spain. 
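# Similarity suites exercise benchllm.similarity.semantically_similar itself:
# the input maps onto SimilarityInput (prompt_1/prompt_2) and `expected` is the
# boolean verdict the function should return for the pair.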
7 | -------------------------------------------------------------------------------- /examples/similarity/solarsystem.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - true 3 | id: 5b10574a-ed92-4349-9fae-65bcb1bcbb3a 4 | input: 5 | prompt_1: The Earth revolves around the Sun. 6 | prompt_2: The Sun is at the center of the solar system. 7 | -------------------------------------------------------------------------------- /examples/similarity/v7.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - true 3 | id: 60df4985-7ba9-440a-9dab-0aa3ed986daa 4 | input: 5 | prompt_1: I was created by V7 6 | prompt_2: V7 created me 7 | -------------------------------------------------------------------------------- /examples/similarity/water_boiling.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - true 3 | id: 92d8cf57-b726-42f8-b460-b90fbf4cb752 4 | input: 5 | prompt_1: Water boils at 100 degrees Celsius. 6 | prompt_2: The boiling point of water is 100 degrees Celsius. 7 | -------------------------------------------------------------------------------- /examples/vector_retrieval/1.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - "yes" 3 | id: 78ce604e-dc76-4f21-9514-c48b17948773 4 | input: 5 | Is the Seychelles parakeet extinct? (respond only with "yes", "no" or "don't 6 | know") 7 | -------------------------------------------------------------------------------- /examples/vector_retrieval/2.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - "1893" 3 | id: 9a82f1b1-455a-4321-bc7d-4b9e3cf05144 4 | input: 5 | When was the last year one saw a Seychelles parakeet? 
Respond ONLY with the 6 | year 7 | -------------------------------------------------------------------------------- /examples/vector_retrieval/eval.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | import benchllm 5 | 6 | current_dir = Path(__file__).resolve().parent 7 | 8 | sys.path.append(str(current_dir)) 9 | from utils import initiate_test_faiss 10 | 11 | 12 | @benchllm.test(suite=".") 13 | def run(input: str): 14 | qa = initiate_test_faiss() 15 | resp = qa.run(input) 16 | return resp 17 | -------------------------------------------------------------------------------- /examples/vector_retrieval/utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import requests 4 | from langchain.chains import RetrievalQA 5 | from langchain.document_loaders import DirectoryLoader, PyPDFLoader 6 | from langchain.embeddings.openai import OpenAIEmbeddings 7 | from langchain.llms import OpenAI 8 | from langchain.text_splitter import CharacterTextSplitter 9 | from langchain.vectorstores import FAISS 10 | 11 | embeddings = OpenAIEmbeddings() 12 | current_dir = Path(__file__).resolve().parent 13 | 14 | DB_NAME = Path(current_dir, "faiss_example_index") 15 | TEST_FILE_URLS = [ 16 | "https://en.wikipedia.org/api/rest_v1/page/pdf/Artificial_general_intelligence", 17 | "https://en.wikipedia.org/api/rest_v1/page/pdf/Socrates", 18 | "https://en.wikipedia.org/api/rest_v1/page/pdf/Seychelles_parakeet", 19 | ] 20 | PDF_FOLDER = Path(current_dir, "example_documents") 21 | 22 | 23 | def download_pdf(url: str, dst: Path) -> None: 24 | """Downloads a PDF file from a given URL if it doesn't exist in the destination directory""" 25 | dst_path = dst / f"{url.split('/')[-1].replace(' ', '_')}.pdf" 26 | 27 | if not dst_path.is_file(): 28 | print("Downloading", url) 29 | dst_path.parent.mkdir(parents=True, exist_ok=True) 30 | 31 | response = requests.get(url) 32 | response.raise_for_status() 33 | 34 | with dst_path.open("wb") as f: 35 | f.write(response.content) 36 | 37 | 38 | def download_and_load_documents(pdfs_path: Path, urls: list[str]): 39 | """Downloads PDFs from a list of URLs and loads them into a list of documents""" 40 | for url in urls: 41 | download_pdf(url, pdfs_path) 42 | 43 | loader = DirectoryLoader(str(pdfs_path), glob="*.pdf", loader_cls=PyPDFLoader) 44 | return loader.load() 45 | 46 | 47 | def set_up_faiss_db(db_path: Path, pdfs_path: Path, chunk_size=420, chunk_overlap=30): 48 | """Setups up a Faiss DB by loading documents and creating an index""" 49 | 50 | if db_path.exists(): 51 | try: 52 | return FAISS.load_local(str(db_path), embeddings) 53 | except Exception as e: 54 | print(f"Failed to load local FAISS DB: {e}") 55 | raise 56 | 57 | documents = download_and_load_documents(pdfs_path, TEST_FILE_URLS) 58 | if not documents: 59 | raise ValueError(f"No documents loaded from {pdfs_path}") 60 | 61 | text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) 62 | docs = text_splitter.split_documents(documents) 63 | 64 | db = FAISS.from_documents(docs, embeddings) 65 | db.save_local(str(db_path)) 66 | 67 | return db 68 | 69 | 70 | def initiate_test_faiss(): 71 | """Initiates a Faiss test by creating a RetrievalQA object""" 72 | db = set_up_faiss_db(DB_NAME, PDF_FOLDER) 73 | qa = RetrievalQA.from_chain_type( 74 | llm=OpenAI(temperature=0), chain_type="stuff", retriever=db.as_retriever(search_kwargs={"k": 3}) 75 | ) 76 | 
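# The returned chain is queried directly with each test input, e.g. (hypothetical
# prompt) qa.run("Is the Seychelles parakeet extinct?"), which is what
# examples/vector_retrieval/eval.py does for every test in the suite.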
return qa 77 | -------------------------------------------------------------------------------- /examples/weather_functions/default.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - "Yes" 3 | id: d80b8d57-9c50-44fe-b492-c3cab549d9f3 4 | input: Will it rain tomorrow? 5 | -------------------------------------------------------------------------------- /examples/weather_functions/eval.py: -------------------------------------------------------------------------------- 1 | from forecast import run 2 | 3 | import benchllm 4 | 5 | 6 | @benchllm.test() 7 | def eval(input: str): 8 | return run(input) 9 | -------------------------------------------------------------------------------- /examples/weather_functions/forecast.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import openai 4 | 5 | 6 | def get_n_day_weather_forecast(location: str, num_days: int): 7 | return f"The weather in {location} will be rainy for the next {num_days} days." 8 | 9 | 10 | def chain(prompt: list[dict], functions): 11 | response = openai.ChatCompletion.create( 12 | model="gpt-3.5-turbo-0613", messages=prompt, temperature=0.0, functions=functions 13 | ) 14 | 15 | choice = response["choices"][0] 16 | if choice.get("finish_reason") == "function_call": 17 | function_call = choice["message"]["function_call"] 18 | function_name = function_call["name"] 19 | function_args = json.loads(function_call["arguments"]) 20 | fn = globals()[function_name] 21 | output = fn(**function_args) 22 | prompt.append({"role": "function", "name": function_name, "content": output}) 23 | return chain(prompt, functions) 24 | else: 25 | return response.choices[0].message.content.strip() 26 | 27 | 28 | def run(question: str): 29 | messages = [ 30 | { 31 | "role": "user", 32 | "content": "Only answer questions with 'yes', 'no' or 'unknown', you must not reply with anything else", 33 | }, 34 | {"role": "system", "content": "Use the get_n_day_weather_forecast function for weather questions"}, 35 | {"role": "user", "content": question}, 36 | ] 37 | 38 | functions = [ 39 | { 40 | "name": "get_n_day_weather_forecast", 41 | "description": "Get an N-day weather forecast", 42 | "parameters": { 43 | "type": "object", 44 | "properties": { 45 | "location": { 46 | "type": "string", 47 | "description": "The city and state, e.g. San Francisco, CA", 48 | }, 49 | "num_days": { 50 | "type": "integer", 51 | "description": "The number of days to forecast. E.g. 1 for today, 2 for tomorrow etc", 52 | }, 53 | }, 54 | "required": ["location", "format", "num_days"], 55 | }, 56 | }, 57 | ] 58 | return chain(messages, functions) 59 | -------------------------------------------------------------------------------- /examples/weather_functions/rainy.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - "yes" 3 | id: 4b717b39-da96-4ca3-8aaf-070c7f11449c 4 | input: I'm going to San Francisco today, do I need an umbrella? 5 | calls: 6 | - name: forecast.get_n_day_weather_forecast 7 | returns: It's rainy in San Francisco today. 8 | arguments: 9 | location: San Francisco 10 | num_days: 1 11 | -------------------------------------------------------------------------------- /examples/weather_functions/sunny.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - "no" 3 | id: 4b717b39-da96-4ca3-8aaf-070c7f11449c 4 | input: I live in London, can I expect rain today? 
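# `calls` declares a function that benchllm.tester.setup_mocks patches for this
# test: `name` is the dotted import path, `returns` is the stubbed return value,
# and `arguments` are the keyword arguments the mocked call is expected to receive
# (mismatches are reported by benchllm.utils.collect_call_errors).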
5 | calls: 6 | - name: forecast.get_n_day_weather_forecast 7 | returns: It's sunny today in London today. 8 | arguments: 9 | location: London 10 | num_days: 1 11 | -------------------------------------------------------------------------------- /examples/weather_functions/tomorrow.yml: -------------------------------------------------------------------------------- 1 | expected: 2 | - "yes" 3 | id: 4b717b39-da96-4ca3-8aaf-070c7f11449c 4 | input: Do I need a sun hat tomorrow? 5 | calls: 6 | - name: forecast.get_n_day_weather_forecast 7 | returns: It's sunny in your location tomorrow. 8 | arguments: 9 | num_days: 2 10 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ "poetry-core",] 3 | build-backend = "poetry.core.masonry.api" 4 | 5 | [tool.poetry] 6 | name = "benchllm" 7 | version = "0.3.0" 8 | description = "Tool for testing LLMs" 9 | homepage = "https://github.com/v7labs/benchllm" 10 | authors = [ "Simon Edwardsson ", "Andrea Azzini "] 11 | readme = "README.md" 12 | license = "MIT" 13 | keywords = [] 14 | classifiers = [ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License",] 15 | [[tool.poetry.packages]] 16 | include = "benchllm" 17 | 18 | [tool.isort] 19 | profile = "black" 20 | 21 | [tool.mypy] 22 | plugins = [ "pydantic.mypy",] 23 | follow_imports = "silent" 24 | warn_redundant_casts = true 25 | warn_unused_ignores = true 26 | check_untyped_defs = true 27 | no_implicit_reexport = true 28 | ignore_missing_imports = true 29 | disallow_any_unimported = true 30 | disallow_any_expr = false 31 | disallow_any_decorated = false 32 | disallow_any_explicit = false 33 | disallow_subclassing_any = true 34 | python_version = "3.10" 35 | disallow_untyped_calls = true 36 | disallow_untyped_defs = true 37 | disallow_incomplete_defs = true 38 | disallow_untyped_decorators = true 39 | no_implicit_optional = true 40 | warn_return_any = false 41 | warn_unreachable = true 42 | pretty = true 43 | 44 | [tool.pydantic-mypy] 45 | init_forbid_extra = true 46 | init_typed = true 47 | warn_required_dynamic_aliases = true 48 | warn_untyped_fields = true 49 | 50 | [tool.black] 51 | line-length = 120 52 | 53 | [tool.flake8] 54 | max-line-length = 120 55 | ignore = [ "E203", "W503", "E402",] 56 | 57 | [tool.poetry.dependencies] 58 | python = ">=3.10,<3.12" 59 | pyyaml = ">=5.1" 60 | typer = { version = "*", extras = ["all"] } 61 | pydantic = "^1.10.9" 62 | openai = "*" 63 | langchain = { version = "*", optional = true } 64 | pypdf = { version = "*", optional = true } 65 | tiktoken = { version = "*", optional = true } 66 | faiss-cpu = { version = "*", optional = true } 67 | types-pyyaml = { version = "*", optional = true } 68 | pytest = { version = "*", optional = true } 69 | pywebio = { version = "*", optional = true } 70 | 71 | [tool.poetry.extras] 72 | dev = [ "black", "isort", "flake8", "mypy", "pytest", "types-pyyaml"] 73 | test = [ "pytest"] 74 | examples = ["langchain", "tiktoken", "faiss-cpu", "pypdf"] 75 | 76 | [tool.poetry.scripts] 77 | bench = "benchllm.cli.main:main" 78 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/v7labs/benchllm/0b9d133a531538270e447b1e7ee1b8bb710a4061/test/__init__.py 
-------------------------------------------------------------------------------- /test/cache/test_file_cache.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from pathlib import Path 3 | from unittest.mock import patch 4 | 5 | from benchllm import Prediction, StringMatchEvaluator, Test 6 | from benchllm.cache import FileCache 7 | from benchllm.data_types import FunctionID 8 | 9 | EXAMPLE_PREDICTIONS = [ 10 | Prediction( 11 | test=Test(input="foo", expected=["abc", "def", "ghi"]), 12 | output="no-match", 13 | time_elapsed=0, 14 | function_id=FunctionID.default(), 15 | ), 16 | Prediction( 17 | test=Test(input="foo", expected=["abc", "def", "ghi"]), 18 | output="def", 19 | time_elapsed=0, 20 | function_id=FunctionID.default(), 21 | ), 22 | Prediction( 23 | test=Test(input="foo", expected=["abc", "def", "ghi"]), 24 | output="no-match", 25 | time_elapsed=0, 26 | function_id=FunctionID.default(), 27 | ), 28 | ] 29 | 30 | EXAMPLE_PREDICTIONS_ALL_SAME = [ 31 | Prediction( 32 | test=Test(input="foo", expected=["match"]), 33 | output="match", 34 | time_elapsed=0, 35 | function_id=FunctionID.default(), 36 | ), 37 | Prediction( 38 | test=Test(input="foo", expected=["match"]), 39 | output="match", 40 | time_elapsed=0, 41 | function_id=FunctionID.default(), 42 | ), 43 | ] 44 | 45 | 46 | def test_file_writes_at_end(): 47 | with patch.object( 48 | StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction 49 | ) as mock_method: 50 | with tempfile.TemporaryDirectory() as temp_dir: 51 | cache_path = Path(temp_dir, "cache.json") 52 | evaluator = FileCache(StringMatchEvaluator(), cache_path) 53 | evaluator.load(EXAMPLE_PREDICTIONS) 54 | 55 | evaluations = evaluator.run() 56 | assert cache_path.exists() 57 | assert not evaluations[0].passed 58 | assert evaluations[1].passed 59 | assert not evaluations[2].passed 60 | assert mock_method.call_count == 2 61 | assert evaluator.num_cache_hits == 1 62 | mock_method.reset_mock() 63 | 64 | # second run will use cache 65 | evaluator = FileCache(StringMatchEvaluator(), cache_path) 66 | evaluator.load(EXAMPLE_PREDICTIONS) 67 | 68 | evaluations = evaluator.run() 69 | assert cache_path.exists() 70 | assert not evaluations[0].passed 71 | assert evaluations[1].passed 72 | assert not evaluations[2].passed 73 | assert mock_method.call_count == 0 74 | assert evaluator.num_cache_hits == 3 75 | -------------------------------------------------------------------------------- /test/cache/test_memory_cache.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | from benchllm import Prediction, StringMatchEvaluator, Test 4 | from benchllm.cache import MemoryCache 5 | from benchllm.data_types import FunctionID 6 | 7 | EXAMPLE_PREDICTIONS = [ 8 | Prediction( 9 | test=Test(input="foo", expected=["abc", "def", "ghi"]), 10 | output="no-match", 11 | time_elapsed=0, 12 | function_id=FunctionID.default(), 13 | ), 14 | Prediction( 15 | test=Test(input="foo", expected=["abc", "def", "ghi"]), 16 | output="def", 17 | time_elapsed=0, 18 | function_id=FunctionID.default(), 19 | ), 20 | Prediction( 21 | test=Test(input="foo", expected=["abc", "def", "ghi"]), 22 | output="no-match", 23 | time_elapsed=0, 24 | function_id=FunctionID.default(), 25 | ), 26 | ] 27 | 28 | EXAMPLE_PREDICTIONS_ALL_SAME = [ 29 | Prediction( 30 | test=Test(input="foo", expected=["match"]), 31 | output="match", 32 | time_elapsed=0, 33 | 
function_id=FunctionID.default(), 34 | ), 35 | Prediction( 36 | test=Test(input="foo", expected=["match"]), 37 | output="match", 38 | time_elapsed=0, 39 | function_id=FunctionID.default(), 40 | ), 41 | ] 42 | 43 | 44 | EXAMPLE_PREDICTIONS_CACHING_NEGATIVE = [ 45 | Prediction( 46 | test=Test(input="foo", expected=["no-match"]), 47 | output="match", 48 | time_elapsed=0, 49 | function_id=FunctionID.default(), 50 | ), 51 | Prediction( 52 | test=Test(input="foo", expected=["no-match", "match"]), 53 | output="match", 54 | time_elapsed=0, 55 | function_id=FunctionID.default(), 56 | ), 57 | Prediction( 58 | test=Test(input="foo", expected=["match", "no-match"]), 59 | output="match", 60 | time_elapsed=0, 61 | function_id=FunctionID.default(), 62 | ), 63 | ] 64 | 65 | 66 | def test_memory_cache_will_prevent_calls_to_evaluate_prediction_on_second_run(): 67 | with patch.object( 68 | StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction 69 | ) as mock_method: 70 | evaluator = MemoryCache(StringMatchEvaluator()) 71 | evaluator.load(EXAMPLE_PREDICTIONS) 72 | evaluations = evaluator.run() 73 | assert not evaluations[0].passed 74 | assert evaluations[1].passed 75 | assert not evaluations[2].passed 76 | assert mock_method.call_count == 2 77 | assert evaluator.num_cache_hits == 1 78 | mock_method.reset_mock() 79 | 80 | # second run will use cache 81 | evaluations = evaluator.run() 82 | assert not evaluations[0].passed 83 | assert evaluations[1].passed 84 | assert not evaluations[2].passed 85 | assert evaluator.num_cache_hits == 4 86 | assert mock_method.call_count == 0 87 | 88 | 89 | def test_memory_cache_caches_during_run(): 90 | with patch.object( 91 | StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction 92 | ) as mock_method: 93 | evaluator = MemoryCache(StringMatchEvaluator()) 94 | evaluator.load(EXAMPLE_PREDICTIONS_ALL_SAME) 95 | 96 | evaluations = evaluator.run() 97 | assert evaluations[0].passed 98 | assert evaluations[1].passed 99 | assert mock_method.call_count == 1 100 | assert evaluator.num_cache_hits == 1 101 | 102 | 103 | def test_memory_cache_caches_always_tries_to_pass(): 104 | with patch.object( 105 | StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction 106 | ) as mock_method: 107 | evaluator = MemoryCache(StringMatchEvaluator()) 108 | evaluator.load(EXAMPLE_PREDICTIONS_CACHING_NEGATIVE) 109 | 110 | evaluations = evaluator.run() 111 | assert not evaluations[0].passed 112 | assert evaluations[1].passed 113 | assert evaluations[2].passed 114 | assert mock_method.call_count == 2 115 | assert evaluator.num_cache_hits == 1 116 | 117 | 118 | def test_memory_cache_does_not_pass_on_cached_negatives(): 119 | with patch.object( 120 | StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction 121 | ) as mock_method: 122 | evaluator = MemoryCache(StringMatchEvaluator()) 123 | evaluator.load(EXAMPLE_PREDICTIONS_CACHING_NEGATIVE) 124 | 125 | evaluator.run() 126 | assert mock_method.call_count == 2 127 | assert mock_method.call_args_list.pop(0).args[0].test.expected == ["no-match"] 128 | assert mock_method.call_args_list.pop(0).args[0].test.expected == ["match"] 129 | 130 | 131 | def test_memory_cache_supports_numbers(): 132 | with patch.object( 133 | StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction 134 | ) as mock_method: 135 | evaluator = MemoryCache(StringMatchEvaluator()) 136 
| evaluator.load( 137 | [ 138 | Prediction( 139 | test=Test(input="foo", expected=["42"]), 140 | output="42", 141 | time_elapsed=0, 142 | function_id=FunctionID.default(), 143 | ), 144 | Prediction( 145 | test=Test(input="foo", expected=["42"]), 146 | output="42", 147 | time_elapsed=0, 148 | function_id=FunctionID.default(), 149 | ), 150 | Prediction( 151 | test=Test(input="foo", expected=["42"]), 152 | output="24", 153 | time_elapsed=0, 154 | function_id=FunctionID.default(), 155 | ), 156 | ] 157 | ) 158 | evaluations = evaluator.run() 159 | assert evaluations[0].passed 160 | assert evaluations[1].passed 161 | assert not evaluations[2].passed 162 | assert mock_method.call_count == 2 163 | assert evaluator.num_cache_hits == 1 164 | -------------------------------------------------------------------------------- /test/cli/test_interactive.py: -------------------------------------------------------------------------------- 1 | from unittest import mock 2 | 3 | import typer 4 | 5 | from benchllm.cli.evaluator import InteractiveEvaluator 6 | from benchllm.data_types import FunctionID, Prediction, Test 7 | 8 | TEST_PREDICTION = [ 9 | Prediction( 10 | test=Test(input="Who are you?", expected=["Yoda I am.", "Yoda"]), 11 | output="I am Yoda.", 12 | time_elapsed=0, 13 | function_id=FunctionID.default(), 14 | ) 15 | ] 16 | 17 | 18 | def test_interactive_press_y_passes(): 19 | evalautor = InteractiveEvaluator() 20 | evalautor.load(TEST_PREDICTION) 21 | with mock.patch.object(typer, "prompt", lambda *args, **kwargs: "1"): 22 | result = evalautor.run() 23 | assert result[0].passed 24 | 25 | with mock.patch.object(typer, "prompt", lambda *args, **kwargs: "2"): 26 | result = evalautor.run() 27 | assert result[0].passed 28 | 29 | 30 | def test_interactive_press_n_fails(): 31 | evalautor = InteractiveEvaluator() 32 | evalautor.load(TEST_PREDICTION) 33 | with mock.patch.object(typer, "prompt", lambda *args, **kwargs: "n"): 34 | result = evalautor.run() 35 | assert not result[0].passed 36 | -------------------------------------------------------------------------------- /test/cli/test_list_tests.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from typer.testing import CliRunner 4 | 5 | from benchllm.cli.main import app 6 | 7 | runner = CliRunner() 8 | 9 | 10 | def test_list_tests(): 11 | result = runner.invoke(app, ["tests", str(Path.cwd() / "examples/qa")]) 12 | assert "Input" in result.stdout 13 | assert "No." 
in result.stdout 14 | assert "Expected" in result.stdout 15 | -------------------------------------------------------------------------------- /test/cli/test_run_suite.py: -------------------------------------------------------------------------------- 1 | from test.utils import create_openai_object 2 | from unittest.mock import MagicMock, patch 3 | 4 | from typer.testing import CliRunner 5 | 6 | from benchllm.cli.main import app 7 | 8 | runner = CliRunner() 9 | 10 | 11 | @patch("openai.Completion.create", return_value=create_openai_object("Hello, user!")) 12 | def test_run_multiple_suites(completion_mock: MagicMock): 13 | runner.invoke(app, ["run", "examples/qa", "examples/similarity"]) 14 | completion_mock.assert_called() 15 | 16 | 17 | @patch("openai.Completion.create", return_value=create_openai_object("Hello, user!")) 18 | def test_run_target_suite(completion_mock: MagicMock): 19 | runner.invoke(app, ["run", "examples/qa"]) 20 | completion_mock.assert_called() 21 | -------------------------------------------------------------------------------- /test/evaulator/test_evalutator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tempfile 3 | from pathlib import Path 4 | from test.utils import create_openai_object 5 | from unittest.mock import MagicMock, Mock, call, patch 6 | 7 | from benchllm import Prediction, SemanticEvaluator, StringMatchEvaluator, Test 8 | from benchllm.cache import MemoryCache 9 | from benchllm.data_types import FunctionID 10 | from benchllm.evaluator import Evaluator 11 | 12 | 13 | class NoopEvaluator(Evaluator): 14 | def evaluate_prediction(self, prediction: Prediction) -> list[Evaluator.Candidate]: 15 | return [Evaluator.Candidate(prediction=prediction.output, expected=prediction.output, score=1.0, passed=True)] 16 | 17 | 18 | def test_evaluator_can_load_prediction_file(): 19 | prediction = { 20 | "output": "42", 21 | "test": {"input": "1+1", "expected": ["2"]}, 22 | "time_elapsed": 0, 23 | "function_id": {"module_path": "test", "line_number": 1, "name": "test"}, 24 | } 25 | with tempfile.TemporaryDirectory() as tmpdir: 26 | prediction_path = Path(tmpdir, "prediction.json") 27 | prediction_path.write_bytes(json.dumps(prediction).encode()) 28 | 29 | evaluator = NoopEvaluator() 30 | evaluator.load_prediction_file(prediction_path) 31 | 32 | assert evaluator.predictions[0].output == "42" 33 | assert evaluator.predictions[0].test.input == "1+1" 34 | assert evaluator.predictions[0].test.expected == ["2"] 35 | -------------------------------------------------------------------------------- /test/evaulator/test_semantic.py: -------------------------------------------------------------------------------- 1 | from test.utils import create_openai_object 2 | from unittest.mock import MagicMock, patch 3 | 4 | from benchllm import Prediction, SemanticEvaluator, Test 5 | from benchllm.data_types import FunctionID 6 | 7 | 8 | @patch("openai.Completion.create", return_value=create_openai_object("same")) 9 | def test_semantic_passes_if_output_is_equal(completion_mock: MagicMock): 10 | evaluator = SemanticEvaluator(model="gpt-3") 11 | evaluator.load( 12 | [ 13 | Prediction( 14 | test=Test(input="Who are you?", expected=["Yoda I am."]), 15 | output="I am Yoda.", 16 | time_elapsed=0, 17 | function_id=FunctionID.default(), 18 | ) 19 | ] 20 | ) 21 | evaluations = evaluator.run() 22 | completion_mock.assert_called_once() 23 | assert evaluations[0].passed 24 | 25 | 26 | @patch("openai.Completion.create",
return_value=create_openai_object("different")) 27 | def test_semantic_fails_if_output_is_unequal(completion_mock: MagicMock): 28 | evaluator = SemanticEvaluator(model="gpt-3") 29 | evaluator.load( 30 | [ 31 | Prediction( 32 | test=Test(input="What are you?", expected=["Everything"]), 33 | output="Nothing", 34 | time_elapsed=0, 35 | function_id=FunctionID.default(), 36 | ), 37 | ] 38 | ) 39 | evaluations = evaluator.run() 40 | completion_mock.assert_called_once() 41 | assert not evaluations[0].passed 42 | 43 | 44 | @patch("openai.Completion.create", return_value=create_openai_object("same")) 45 | def test_semantic_passes_if_output_is_equal_multiple_workers(completion_mock: MagicMock): 46 | evaluator = SemanticEvaluator(model="gpt-3", workers=10) 47 | evaluator.load( 48 | [ 49 | Prediction( 50 | test=Test(input="Who are you?", expected=["Yoda I am."]), 51 | output="I am Yoda.", 52 | time_elapsed=0, 53 | function_id=FunctionID.default(), 54 | ) 55 | for _ in range(100) 56 | ] 57 | ) 58 | evaluations = evaluator.run() 59 | assert completion_mock.call_count == 100 60 | assert all([evaluation.passed for evaluation in evaluations]) 61 | -------------------------------------------------------------------------------- /test/evaulator/test_string_match.py: -------------------------------------------------------------------------------- 1 | from benchllm import Prediction, StringMatchEvaluator, Test 2 | from benchllm.data_types import FunctionID 3 | 4 | 5 | def test_string_match_passes_if_output_is_equal_to_expected(): 6 | evaluator = StringMatchEvaluator() 7 | evaluator.load( 8 | [ 9 | Prediction( 10 | test=Test(input="foo", expected=["bar"]), output="42", time_elapsed=0, function_id=FunctionID.default() 11 | ), 12 | Prediction( 13 | test=Test(input="foo", expected=["42"]), output="42", time_elapsed=0, function_id=FunctionID.default() 14 | ), 15 | Prediction( 16 | test=Test(input="foo", expected=["BAR"]), output="bar", time_elapsed=0, function_id=FunctionID.default() 17 | ), 18 | ] 19 | ) 20 | evaluations = evaluator.run() 21 | assert not evaluations[0].passed 22 | assert evaluations[1].passed 23 | assert evaluations[2].passed 24 | 25 | 26 | def test_string_match_passes_if_output_is_equal_to_expected_case_sensitive(): 27 | evaluator = StringMatchEvaluator(case_sensitive=True) 28 | evaluator.load( 29 | [ 30 | Prediction( 31 | test=Test(input="foo", expected=["BAR"]), output="BAR", time_elapsed=0, function_id=FunctionID.default() 32 | ), 33 | Prediction( 34 | test=Test(input="foo", expected=["BAR"]), output="bar", time_elapsed=0, function_id=FunctionID.default() 35 | ), 36 | ] 37 | ) 38 | evaluations = evaluator.run() 39 | assert evaluations[0].passed 40 | assert not evaluations[1].passed 41 | 42 | 43 | def test_string_match_passes_if_output_is_equal_to_expected_fuzzy(): 44 | evaluator = StringMatchEvaluator(fuzzy=True) 45 | evaluator.load( 46 | [ 47 | Prediction( 48 | test=Test(input="foo", expected=["abc def ghi"]), 49 | output="def", 50 | time_elapsed=0, 51 | function_id=FunctionID.default(), 52 | ), 53 | Prediction( 54 | test=Test(input="foo", expected=["abc def ghi"]), 55 | output="adg", 56 | time_elapsed=0, 57 | function_id=FunctionID.default(), 58 | ), 59 | ] 60 | ) 61 | evaluations = evaluator.run() 62 | assert evaluations[0].passed 63 | assert not evaluations[1].passed 64 | -------------------------------------------------------------------------------- /test/test_tester.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from pathlib
import Path 3 | from unittest.mock import Mock, call 4 | 5 | from benchllm import Test, Tester 6 | 7 | 8 | def test_tester_run_through_each_test_once(): 9 | test_function = Mock(return_value="42") 10 | test = Tester(test_function=test_function) 11 | test.add_test(Test(input="1+1", expected=["2"])) 12 | test.add_test(Test(input="2+2", expected=["4"])) 13 | predictions = test.run() 14 | 15 | assert test_function.call_count == 2 16 | 17 | print(test_function.call_args_list) 18 | test_function.assert_has_calls([call("1+1"), call("2+2")]) 19 | assert predictions[0].output == "42" 20 | assert predictions[1].output == "42" 21 | 22 | 23 | def test_tester_parses_yml_correctly(): 24 | python_code = """ 25 | import benchllm 26 | 27 | @benchllm.test(suite=".") 28 | def test(input: str): 29 | return "42" 30 | """ 31 | test_case = """ 32 | input: 1+1 33 | expected: [2] 34 | """ 35 | with tempfile.TemporaryDirectory() as temp_dir: 36 | temp_dir = Path(temp_dir) 37 | with open(temp_dir / "test.py", "w") as f: 38 | f.write(python_code) 39 | with open(temp_dir / "test.yaml", "w") as f: 40 | f.write(test_case) 41 | 42 | test = Tester() 43 | test.load_module(temp_dir / "test.py") 44 | predictions = test.run() 45 | 46 | assert predictions[0].output == "42" 47 | assert predictions[0].test.input == "1+1" 48 | assert predictions[0].test.expected == ["2"] 49 | -------------------------------------------------------------------------------- /test/utils.py: -------------------------------------------------------------------------------- 1 | from openai.openai_object import OpenAIObject 2 | 3 | 4 | def create_openai_object(text): 5 | obj = OpenAIObject() 6 | message = OpenAIObject() 7 | message.text = text 8 | obj.choices = [message] 9 | return obj 10 | --------------------------------------------------------------------------------
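Note on the mocking pattern shared by these tests: the create_openai_object helper in test/utils.py wraps a completion string in a minimal OpenAIObject, so any code path that calls openai.Completion.create can be exercised offline by patching that function. The sketch below is illustrative only, not a file in the repository; the test name and inputs are invented for the example, and it assumes the same pre-1.0 openai SDK that the rest of the suite targets.

from test.utils import create_openai_object
from unittest.mock import MagicMock, patch

from benchllm import Prediction, SemanticEvaluator, Test
from benchllm.data_types import FunctionID


# Patch the OpenAI completion endpoint so SemanticEvaluator receives a canned
# "same" verdict instead of making a network call.
@patch("openai.Completion.create", return_value=create_openai_object("same"))
def test_semantic_evaluator_offline(completion_mock: MagicMock):  # hypothetical test name
    evaluator = SemanticEvaluator(model="gpt-3")
    evaluator.load(
        [
            Prediction(
                test=Test(input="2+2", expected=["4"]),
                output="4",
                time_elapsed=0,
                function_id=FunctionID.default(),
            )
        ]
    )
    evaluations = evaluator.run()
    completion_mock.assert_called_once()
    assert evaluations[0].passed

The same patch-and-assert shape appears in test_run_suite.py and test_semantic.py above; it keeps evaluator tests deterministic and free of real API keys.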