├── .github
└── workflows
│ ├── publish.yml
│ └── tests.yml
├── .gitignore
├── .vscode
└── settings.json
├── LICENSE
├── README.md
├── benchllm
├── __init__.py
├── cache.py
├── cli
│ ├── __init__.py
│ ├── commands
│ │ ├── __init__.py
│ │ ├── add_test.py
│ │ ├── evaluate.py
│ │ ├── list_tests.py
│ │ └── run_suite.py
│ ├── evaluator
│ │ ├── __init__.py
│ │ ├── interactive.py
│ │ └── web.py
│ ├── listener.py
│ ├── main.py
│ └── utils.py
├── data_types.py
├── evaluator
│ ├── __init__.py
│ ├── embedding.py
│ ├── evaluator.py
│ ├── semantic.py
│ └── string_match.py
├── input_types.py
├── listener.py
├── similarity.py
├── singleton.py
├── tester.py
└── utils.py
├── examples
├── chat
│ ├── 1.yml
│ ├── 2.yml
│ └── eval.py
├── langchain_agent_search_calculator
│ ├── 1.yml
│ ├── 10.yml
│ ├── 2.yml
│ ├── 3.yml
│ ├── 4.yml
│ ├── 5.yml
│ ├── 6.yml
│ ├── 7.yml
│ ├── 8.yml
│ ├── 9.yml
│ ├── converter.py
│ ├── eval.py
│ └── script.py
├── openai-evals
│ └── eval.py
├── qa
│ ├── 1.yml
│ ├── 2.yml
│ ├── eval.py
│ └── script.py
├── similarity
│ ├── benchllm.yml
│ ├── eval.py
│ ├── idempotency.yml
│ ├── location.yml
│ ├── number_detail.yml
│ ├── solarsystem.yml
│ ├── v7.yml
│ └── water_boiling.yml
├── vector_retrieval
│ ├── 1.yml
│ ├── 2.yml
│ ├── eval.py
│ └── utils.py
└── weather_functions
│ ├── default.yml
│ ├── eval.py
│ ├── forecast.py
│ ├── rainy.yml
│ ├── sunny.yml
│ └── tomorrow.yml
├── pyproject.toml
└── test
├── __init__.py
├── cache
├── test_file_cache.py
└── test_memory_cache.py
├── cli
├── test_interactive.py
├── test_list_tests.py
└── test_run_suite.py
├── evaulator
├── test_evalutator.py
├── test_semantic.py
└── test_string_match.py
├── test_tester.py
└── utils.py
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Upload Python Package
2 |
3 | on:
4 | release:
5 | types: [created]
6 | workflow_dispatch:
7 |
8 | jobs:
9 | deploy:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - uses: actions/checkout@v2
13 | - name: Set up Python
14 | uses: actions/setup-python@v2
15 | with:
16 | python-version: "3.9"
17 | - run: pip install pip --upgrade
18 | - name: Setup Poetry
19 | uses: abatilo/actions-poetry@v2
20 | with:
21 | poetry-version: "1.3.1"
22 | - name: Install dependencies
23 | run: |
24 | poetry install --no-interaction --no-root --all-extras -vvv
25 | poetry build
26 | - name: Publish
27 | env:
28 | POETRY_HTTP_BASIC_PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }}
29 | POETRY_HTTP_BASIC_PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
30 | run: |
31 | poetry publish
32 |
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: Test
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 |
9 | concurrency:
10 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
11 | cancel-in-progress: true
12 |
13 | jobs:
14 | ci:
15 | strategy:
16 | fail-fast: false
17 | matrix:
18 | python-version: ["3.10"]
19 | poetry-version: ["1.3.1"]
20 | os: [ubuntu-latest, windows-latest]
21 | runs-on: ${{ matrix.os }}
22 | steps:
23 | - uses: actions/checkout@v2
24 | - uses: actions/setup-python@v2
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 | - name: Upgrade pip
28 | run: python -m pip install --upgrade pip
29 | - name: Setup Poetry
30 | uses: abatilo/actions-poetry@v2
31 | with:
32 | poetry-version: ${{ matrix.poetry-version }}
33 | - name: Install dependencies
34 | run: |
35 | poetry install --no-interaction --no-root --all-extras -vvv
36 | pip install wheel
37 | pip install --upgrade setuptools
38 | pip install --editable ".[test,examples,dev]"
39 | pip install pytest pytest-describe
40 | - name: Run Tests
41 | run: python -m pytest
42 | - name: Run BenchLLM examples
43 | run: bench run examples
44 | env:
45 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
46 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
162 | .DS_Store
163 |
164 | # Vector QA Retrieval PDF documents and index shouldn't be tracked
165 | examples/vector_retrieval/example_documents
166 | examples/vector_retrieval/faiss_example_index
167 |
168 | # Don't stage any files in the output directory
169 | output/
170 |
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "black-formatter.args": ["--line-length", "120"],
3 | "editor.codeActionsOnSave": {
4 | "source.organizeImports": true
5 | },
6 | "editor.formatOnSave": true,
7 | "editor.rulers": [120],
8 | "[python]": {
9 | "editor.defaultFormatter": "ms-python.black-formatter"
10 | },
11 | "[yaml]": {
12 | "editor.defaultFormatter": "esbenp.prettier-vscode"
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 V7
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 🏋️♂️ BenchLLM 🏋️♀️
2 |
3 | 🦾 Continuous Integration for LLM powered applications 🦙🦅🤖
4 |
5 | [](https://github.com/v7labs/BenchLLM/stargazers)
6 | [](https://twitter.com/V7Labs)
7 | [](https://discord.gg/x7ExfHb3bG)
8 |
9 | [**BenchLLM**](https://benchllm.com/) is a Python-based open-source library that streamlines the testing of Large Language Models (LLMs) and AI-powered applications. It measures the accuracy of your model, agents, or chains by validating responses on any number of tests via LLMs.
10 |
11 | BenchLLM is actively used at [V7](https://www.v7labs.com) for improving our LLM applications and is now Open Sourced under MIT License to share with the wider community
12 |
13 | ## 💡 Get help on [Discord](https://discord.gg/x7ExfHb3bG) or [Tweet at us](https://twitter.com/V7Labs)
14 |
15 |
16 |
17 | Use BenchLLM to:
18 |
19 | - Test the responses of your LLM across any number of prompts.
20 | - Continuous integration for chains like [Langchain](https://github.com/hwchase17/langchain), agents like [AutoGPT](https://github.com/Significant-Gravitas/Auto-GPT), or LLM models like [Llama](https://github.com/facebookresearch/llama) or GPT-4.
21 | - Eliminate flaky chains and create confidence in your code.
22 | - Spot inaccurate responses and hallucinations in your application at every version.
23 |
24 |
25 |
26 | > ⚠️ **NOTE:** BenchLLM is in the early stage of development and will be subject to rapid changes.
27 | >
28 | > For bug reporting, feature requests, or contributions, please open an issue or submit a pull request (PR) on our GitHub page.
29 |
30 | ## 🧪 BenchLLM Testing Methodology
31 |
32 | BenchLLM implements a distinct two-step methodology for validating your machine learning models:
33 |
34 | 1. **Testing**: This stage involves running your code against any number of expected responses and capturing the predictions produced by your model without immediate judgment or comparison.
35 |
36 | 2. **Evaluation**: The recorded predictions are compared against the expected output using LLMs to verify factual similarity (or optionally manually). Detailed comparison reports, including pass/fail status and other metrics, are generated.
37 |
38 | This methodical separation offers a comprehensive view of your model's performance and allows for better control and refinement of each step.
39 |
40 | ## 🚀 Install
41 |
42 | To install BenchLLM we use pip
43 |
44 | ```
45 | pip install benchllm
46 | ```
47 |
48 | ## 💻 Usage
49 |
50 | Start by importing the library and use the @benchllm.test decorator to mark the function you'd like to test:
51 |
52 | ```python
53 | import benchllm
54 |
55 | # Your custom model implementation
56 | def run_my_model(input):
57 | # Your model's logic goes here.
58 | return some_result
59 |
60 | @benchllm.test(suite="/path/to/test/suite") # If the tests are in the same directory, just use @benchllm.test.
61 | def invoke_model(input: str):
62 | return run_my_model(input)
63 | ```
64 |
65 | Next, prepare your tests. These are YAML/JSON files structured as follows:
66 |
67 | ```yml
68 | input: What's 1+1? Be very terse, only numeric output
69 | expected:
70 | - 2
71 | - 2.0
72 | ```
73 |
74 | In the above example, the `input` is the query or instruction that your model will process, and `expected` contains the potential responses that your model should return. It's important to note that `input` can be a simple `str` or a more complex nested dictionary; BenchLLM will extract the type of the `input` argument in the Python code and load the `input` field from the YAML file accordingly.
75 |
76 | By default, BenchLLM uses OpenAI's GPT-3 model for the `semantic` evaluator. This requires setting the `OPENAI_API_KEY` environment variable. If you do not want to use this default evaluator, you can specify an alternative one (discussed in further detail below):
77 |
78 | ```bash
79 | export OPENAI_API_KEY='your-api-key'
80 | ```
81 |
82 | Replace 'your-api-key' with your actual OpenAI API key.
83 |
84 | To initiate testing, use the `bench run` command:
85 |
86 | ```bash
87 | $ bench run
88 | ```
89 |
90 | By default, the bench run command looks for Python files implementing the @test decorator in the current directory. To target a specific file or folder, specify it directly:
91 |
92 | ```bash
93 | $ bench run path/to/my/file.py or/path/to/folder/with/files
94 | ```
95 |
96 | The `--retry-count` parameter allows BenchLLM to run a test multiple times, useful for models that may have variability in their outputs:
97 |
98 | ```bash
99 | $ bench run --retry-count 5
100 | ```
101 |
102 | BenchLLM offers multiple evaluation methods to determine if the prediction matches the test case's expected values. You can use the `--evaluator` parameter to specify the evaluation method:
103 |
104 | There are multiple ways to evaluate if the test functions prediction matches the test cases expected values.
105 | By default GPT-3 is used to compare the output. You can use `--evaluator` to use a different method
106 |
107 | - `semantic`, checks semantic similarity using language models like GPT-3, GPT-3.5, or GPT-4 (`--model` parameter). Please note, for this evaluator, you need to set the `OPENAI_API_KEY` environment variable.
108 | - `embedding`, uses cosine distance between embedded vectors. Please note, for this evaluator, you need to set the `OPENAI_API_KEY` environment variable.
109 | - `string-match`, checks if the strings are matching (case insensitive)
110 | - `interactive`, user manually accepts or fails tests in the terminal
111 | - `web`, uses pywebio fora simple local web interface
112 |
113 | The non interactive evaluators also supports `--workers N` to run in the evaluations in parallel
114 |
115 | ```bash
116 | $ bench run --evaluator string-match --workers 5
117 | ```
118 |
119 | To accelerate the evaluation process, BenchLLM uses a cache. If a (prediction, expected) pair has been evaluated in the past and a cache was used, the evaluation output will be saved for future evaluations. There are several types of caches:
120 |
121 | - `memory`, only caches output values during the current run. This is particularly useful when running with `--retry-count N`
122 | - `file`, stores the cache at the end of the run as a JSON file in output/cache.json. This is the default behavior.
123 | - `none`, does not use any cache.
124 |
125 | ```bash
126 | $ bench run examples --cache memory
127 | ```
128 |
129 | When working on developing chains or training agent models, there may be instances where these models need to interact with external functions — for instance, querying a weather forecast or executing an SQL query. In such scenarios, BenchLLM facilitates the ability to mock these functions. This helps you make your tests more predictable and enables the discovery of unexpected function calls.
130 |
131 | ```yml
132 | input: I live in London, can I expect rain today?
133 | expected: ["no"]
134 | calls:
135 | - name: forecast.get_n_day_weather_forecast
136 | returns: It's sunny in London.
137 | arguments:
138 | location: London
139 | num_days: 1
140 | ```
141 |
142 | In the example above, the function `get_n_day_weather_forecast` in the `forecast` module is mocked. In other words, every time this function is invoked, the model will receive `"It's sunny in London"`. BenchLLM also provides warnings if the function is invoked with argument values different from `get_n_day_weather_forecast(location=London, num_days=1)`. Please note, the provision of these argument parameters is optional.
143 |
144 | ### 🧮 Eval
145 |
146 | While _bench run_ runs each test function and then evaluates their output, it can often be beneficial to separate these into two steps. For example, if you want a person to manually do the evaluation or if you want to try multiple evaluation methods on the same function.
147 |
148 | ```bash
149 | $ bench run --no-eval
150 | ```
151 |
152 | This will generate json files in `output/latest/predictions`
153 | Then later you can evaluate them with
154 |
155 | ```bash
156 | $ bench eval output/latest/predictions
157 | ```
158 |
159 | ## 🔌 API
160 |
161 | For more detailed control, BenchLLM provides an API.
162 | You are not required to add YML/JSON tests to be able to evaluate your model.
163 | You can instead:
164 |
165 | - Instantiate `Test` objects
166 | - Use a `Tester` object to generate predictions
167 | - Use an `Evaluator` object to evaluate your model
168 |
169 | ```python
170 | from benchllm import StringMatchEvaluator, Test, Tester
171 |
172 | # Instantiate your Test objects
173 | tests = [
174 | Test(input="What's 1+1?", expected=["2", "It's 2"]),
175 | Test(input="First rule of fight club?", expected=["Do not talk about fight club"]),
176 | ]
177 |
178 | # Use a Tester object to generate predictions using any test functions
179 | tester = Tester(my_test_function)
180 | tester.add_tests(tests)
181 | predictions = tester.run()
182 |
183 | # Use an Evaluator object to evaluate your model
184 | evaluator = StringMatchEvaluator()
185 | evaluator.load(predictions)
186 | results = evaluator.run()
187 |
188 | print(results)
189 | ```
190 |
191 | If you want to incorporate caching and run multiple parallel evaluation jobs, you can modify your evaluator as follows:
192 |
193 | ```python
194 | from benchllm.cache import FileCache
195 |
196 | ...
197 |
198 | evaluator = FileCache(StringMatchEvaluator(workers=2), Path("path/to/cache.json"))
199 | evaluator.load(predictions)
200 | results = evaluator.run()
201 | ```
202 |
203 | In this example, `FileCache` is used to enable caching, and the `workers` parameter of `StringMatchEvaluator` is set to `2` to allow for parallel evaluations. The cache results are saved in a file specified by `Path("path/to/cache.json")`.
204 |
205 | ## ☕️ Commands
206 |
207 | - `bench add`: Add a new test to a suite.
208 | - `bench tests`: List all tests in a suite.
209 | - `bench run`: Run all or target test suites.
210 | - `bench eval`: Runs the evaluation of an existing test run.
211 |
212 | ## 🙌 Contribute
213 |
214 | BenchLLM is developed for Python 3.10, although it may work with other Python versions as well. We recommend using a Python 3.10 environment and pip >= 23. You can use conda or any other environment manager to set up the environment:
215 |
216 | ```bash
217 | $ conda create --name benchllm python=3.10
218 | $ conda activate benchllm
219 | $ pip install -e ".[dev]"
220 | ```
221 |
222 | To run all the examples first install the examples extra dependencies
223 |
224 | ```bash
225 | $ pip install -e ".[examples]"
226 | ```
227 |
228 | Contribution steps:
229 |
230 | 1. Fork the repository.
231 | 2. Create a new branch for your changes.
232 | 3. Make your changes.
233 | 4. Test your changes.
234 | 5. Submit a pull request.
235 |
236 | We adhere to the PEP8 style guide. Please follow this guide when contributing.
237 |
238 | If you need any support, feel free to open an issue on our GitHub page.
239 |
--------------------------------------------------------------------------------
/benchllm/__init__.py:
--------------------------------------------------------------------------------
1 | import inspect
2 | from pathlib import Path
3 | from typing import Callable, Type, TypeVar
4 |
5 | from .data_types import Evaluation, Prediction, Test # noqa
6 | from .evaluator import ( # noqa
7 | EmbeddingEvaluator,
8 | Evaluator,
9 | SemanticEvaluator,
10 | StringMatchEvaluator,
11 | )
12 | from .input_types import ChatInput, SimilarityInput # noqa
13 | from .similarity import semantically_similar # noqa
14 | from .singleton import TestSingleton # noqa
15 | from .tester import Tester # noqa
16 |
17 | T = TypeVar("T")
18 |
19 | __all__ = [
20 | "test",
21 | "Tester",
22 | "Prediction",
23 | "Test",
24 | "Evaluation",
25 | "StringMatchEvaluator",
26 | "SemanticEvaluator",
27 | "Evaluator",
28 | "EmbeddingEvaluator",
29 | ]
30 |
31 |
32 | def test_wrapper(func: Callable[[T], str], input_type: Type[T], suite: Path) -> None:
33 | test_singleton = TestSingleton()
34 | test_singleton.register(func, input_type=input_type, suite=suite)
35 |
36 |
37 | def test(*, suite: str = ".") -> Callable[[Callable[[T], str]], None]:
38 | def test_decorator(func: Callable[[T], str]) -> None:
39 | suite_path = Path(suite)
40 | if not suite_path.is_absolute():
41 | suite_path = Path(inspect.getfile(func)).parent / suite
42 | type = func.__annotations__.get("input")
43 | if type is None:
44 | raise Exception("Your test function needs to have an input parameter annotated with the input type")
45 | return test_wrapper(func, type, suite_path)
46 |
47 | return test_decorator
48 |
--------------------------------------------------------------------------------
/benchllm/cache.py:
--------------------------------------------------------------------------------
1 | import json
2 | from pathlib import Path
3 | from typing import Optional
4 |
5 | from pydantic import BaseModel
6 |
7 | from benchllm.data_types import Evaluation, Prediction
8 | from benchllm.evaluator import Evaluator
9 | from benchllm.input_types import Json
10 | from benchllm.listener import EvaluatorListener
11 |
12 |
13 | class MemoryValue(BaseModel):
14 | passed: bool
15 | score: float
16 |
17 |
18 | class MemoryCache(Evaluator):
19 | """Caches the results of the evaluator in memory"""
20 |
21 | def __init__(self, evaluator: Evaluator):
22 | super().__init__(workers=evaluator.workers)
23 | self._data: dict = {}
24 | self._evaluator = evaluator
25 | self._num_cache_misses = 0
26 | self._num_cache_hits = 0
27 |
28 | def _key(self, answer1: Json, answer2: Json) -> str:
29 | key1, key2 = json.dumps([answer1, answer2]), json.dumps([answer2, answer1])
30 | return key1 if key1 < key2 else key2
31 |
32 | def lookup(self, answer1: Json, answer2: Json) -> Optional[MemoryValue]:
33 | result = self._data.get(self._key(answer1, answer2), None)
34 | if result:
35 | return MemoryValue(**result)
36 | return None
37 |
38 | def store(self, answer1: Json, answer2: Json, value: MemoryValue) -> None:
39 | key = self._key(answer1, answer2)
40 | self._data[key] = value.dict()
41 |
42 | def evaluate_prediction(self, prediction: Prediction) -> list[Evaluator.Candidate]:
43 | uncached_expectations = []
44 | candidates = []
45 | for expected in prediction.test.expected:
46 | lookup = self.lookup(expected, prediction.output)
47 | if lookup is None:
48 | uncached_expectations.append(expected)
49 | else:
50 | candidates.append(Evaluator.Candidate(prediction=prediction.output, expected=expected, **lookup.dict()))
51 |
52 | # If any of the cached candidates passed, we return them.
53 | if any([candidate.passed for candidate in candidates]):
54 | self._num_cache_hits += 1
55 | return candidates
56 |
57 | # If all expectations were found in the cache but were negative matches,
58 | # we increment the cache hits counter and return None as there's no match.
59 | if not uncached_expectations:
60 | self._num_cache_hits += 1
61 | return candidates
62 |
63 | self._num_cache_misses += 1
64 | # set prediction.test.expected to only the ones that were not cached
65 | prediction = Prediction(**prediction.dict())
66 | prediction.test.expected = uncached_expectations
67 | candidates = self._evaluator.evaluate_prediction(prediction)
68 | for candidate in candidates:
69 | self.store(candidate.expected, candidate.prediction, MemoryValue(**candidate.dict()))
70 | return candidates
71 |
72 | @property
73 | def num_cache_hits(self) -> int:
74 | return self._num_cache_hits
75 |
76 | @property
77 | def num_cache_misses(self) -> int:
78 | return self._num_cache_misses
79 |
80 |
81 | class FileCache(MemoryCache, EvaluatorListener):
82 | """Caches the results of the evaluator in a json file"""
83 |
84 | def __init__(self, evaluator: Evaluator, path: Path):
85 | super().__init__(evaluator)
86 | self._path = path
87 | self.add_listener(self)
88 | self._load()
89 |
90 | def _load(self) -> None:
91 | if self._path.exists():
92 | try:
93 | cache = json.loads(self._path.read_text(encoding="UTF-8"), parse_int=str)
94 | if cache["version"] != "1":
95 | raise ValueError("Unsupported cache version")
96 | self._data = cache["entries"]
97 | except Exception:
98 | print(f"Failed to load cache file {self._path}")
99 | self._data = {}
100 |
101 | def _save(self) -> None:
102 | cache = {"entries": self._data, "version": "1"}
103 | self._path.write_text(json.dumps(cache, indent=4), encoding="UTF-8")
104 |
105 | def evaluate_ended(self, evaluations: list[Evaluation]) -> None:
106 | self._save()
107 |
--------------------------------------------------------------------------------
/benchllm/cli/__init__.py:
--------------------------------------------------------------------------------
1 | from .commands.add_test import add_test # noqa
2 | from .commands.evaluate import evaluate_predictions # noqa
3 | from .commands.list_tests import list_tests # noqa
4 | from .commands.run_suite import run_suite # noqa
5 |
6 | __all__ = ["add_test", "evaluate_predictions", "list_tests", "run_suite"]
7 |
--------------------------------------------------------------------------------
/benchllm/cli/commands/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/v7labs/benchllm/0b9d133a531538270e447b1e7ee1b8bb710a4061/benchllm/cli/commands/__init__.py
--------------------------------------------------------------------------------
/benchllm/cli/commands/add_test.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from typing import Optional
3 |
4 | import typer
5 | import yaml
6 |
7 |
8 | def add_test(*, input: str, expected: list[str], name: str, overwrite: bool, suite_path: Optional[Path]) -> None:
9 | if suite_path is None:
10 | typer.secho("No default suite was specified.", fg=typer.colors.RED, bold=True)
11 | raise typer.Exit()
12 |
13 | if not suite_path.exists():
14 | typer.secho("The specified suite does not exist.", fg=typer.colors.RED, bold=True)
15 | raise typer.Exit()
16 |
17 | test_path = suite_path / f"{name}.yml"
18 | if test_path.exists() and not overwrite:
19 | typer.secho(
20 | f"The test {test_path} already exists. Use --overwrite to overwrite it.",
21 | fg=typer.colors.RED,
22 | bold=True,
23 | )
24 | raise typer.Exit()
25 |
26 | with open(test_path, "w") as f:
27 | yaml.safe_dump({"input": input, "expected": expected}, f)
28 | typer.secho(f"{test_path} added successfully!", fg=typer.colors.GREEN, bold=True)
29 |
--------------------------------------------------------------------------------
/benchllm/cli/commands/evaluate.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | from benchllm.cache import FileCache
4 | from benchllm.cli.listener import ReportListener, RichCliListener
5 | from benchllm.cli.utils import add_cache, get_evaluator
6 | from benchllm.utils import find_json_yml_files, load_prediction_files
7 |
8 |
9 | def evaluate_predictions(
10 | file_or_dir: list[Path], model: str, output_dir: Path, workers: int, evaluator_name: str, cache: str
11 | ) -> bool:
12 | files = find_json_yml_files(file_or_dir)
13 |
14 | cli_listener = RichCliListener(root_dir=Path.cwd(), interactive=evaluator_name == "interactive", eval_only=True)
15 | report_listener = ReportListener(output_dir=output_dir)
16 |
17 | load_prediction_files(file_or_dir)
18 |
19 | evaluator = get_evaluator(evaluator_name, model, workers)
20 | evaluator = add_cache(cache, evaluator, output_dir.parent / "cache.json")
21 |
22 | cli_listener.set_evaulator(evaluator)
23 |
24 | evaluator.add_listener(cli_listener)
25 | evaluator.add_listener(report_listener)
26 | for file in files:
27 | evaluator.load_prediction_file(file)
28 |
29 | evaluator.run()
30 | return not evaluator.failed
31 |
--------------------------------------------------------------------------------
/benchllm/cli/commands/list_tests.py:
--------------------------------------------------------------------------------
1 | import json
2 | from pathlib import Path
3 | from typing import Optional
4 |
5 | import typer
6 | import yaml
7 | from rich.console import Console
8 | from rich.table import Table
9 |
10 |
11 | def list_tests(*, suite_path: Optional[Path]) -> None:
12 | if suite_path is None:
13 | typer.secho("No default suite was specified.", fg=typer.colors.RED, bold=True)
14 | raise typer.Exit()
15 |
16 | if not suite_path.exists():
17 | typer.secho("The specified suite does not exist.", fg=typer.colors.RED, bold=True)
18 | raise typer.Exit()
19 |
20 | console = Console()
21 |
22 | table = Table()
23 | table.add_column("Input")
24 | table.add_column("No.", justify="right")
25 | table.add_column("Expected")
26 |
27 | test_paths = list(suite_path.glob("*.yml"))
28 | for test_path in test_paths:
29 | with open(test_path, "r") as f:
30 | example = yaml.safe_load(f)
31 | for i, expected in enumerate(example["expected"], 1):
32 | if i == 1:
33 | input = json.dumps(example["input"])
34 | else:
35 | input = ""
36 | table.add_row(input, str(i), json.dumps(expected))
37 | table.add_section()
38 |
39 | if test_paths:
40 | console.print(table)
41 | else:
42 | typer.secho("No tests found in the specified suite directory.", fg=typer.colors.RED, bold=True)
43 |
--------------------------------------------------------------------------------
/benchllm/cli/commands/run_suite.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import typer
4 |
5 | from benchllm.cache import FileCache
6 | from benchllm.cli.listener import ReportListener, RichCliListener
7 | from benchllm.cli.utils import add_cache, get_evaluator
8 | from benchllm.tester import Tester
9 | from benchllm.utils import find_files
10 |
11 |
12 | def run_suite(
13 | *,
14 | file_search_paths: list[Path],
15 | model: str,
16 | output_dir: Path,
17 | no_eval: bool,
18 | workers: int,
19 | evaluator_name: str,
20 | retry_count: int,
21 | cache: str,
22 | ) -> bool:
23 | files = find_files(file_search_paths)
24 | if not files:
25 | typer.secho(
26 | f"No python files with @benchllm.test found in {', '.join(map(str, file_search_paths))}",
27 | fg=typer.colors.RED,
28 | bold=True,
29 | )
30 | return False
31 |
32 | cli_listener = RichCliListener(root_dir=Path.cwd(), interactive=evaluator_name == "interactive", test_only=no_eval)
33 | report_listener = ReportListener(output_dir=output_dir)
34 |
35 | tester = Tester(retry_count=retry_count)
36 | tester.add_listener(cli_listener)
37 | tester.add_listener(report_listener)
38 |
39 | # Load the the python files first, then the tests.
40 | for file in files:
41 | tester.load_module(file)
42 |
43 | # Finally, start collecting the predictions.
44 | tester.run()
45 |
46 | if no_eval:
47 | return True
48 |
49 | evaluator = get_evaluator(evaluator_name, model, workers)
50 | evaluator = add_cache(cache, evaluator, output_dir.parent / "cache.json")
51 |
52 | cli_listener.set_evaulator(evaluator)
53 |
54 | evaluator.add_listener(cli_listener)
55 | evaluator.add_listener(report_listener)
56 | evaluator.load(tester.predictions)
57 |
58 | evaluator.run()
59 | return not evaluator.failed
60 |
--------------------------------------------------------------------------------
/benchllm/cli/evaluator/__init__.py:
--------------------------------------------------------------------------------
1 | from benchllm.cli.evaluator.interactive import InteractiveEvaluator # noqa
2 | from benchllm.cli.evaluator.web import WebEvaluator # noqa
3 |
--------------------------------------------------------------------------------
/benchllm/cli/evaluator/interactive.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | import click
4 | import typer
5 |
6 | from benchllm.data_types import Prediction
7 | from benchllm.evaluator import Evaluator
8 |
9 |
10 | class InteractiveEvaluator(Evaluator):
11 | def evaluate_prediction(self, prediction: Prediction) -> list[Evaluator.Candidate]:
12 | header = (
13 | f'{typer.style("Does ", bold=True)}'
14 | f"{typer.style(prediction.output, fg=typer.colors.BRIGHT_BLUE, bold=True)}"
15 | f'{typer.style(" match any of the following expected prompts?", bold=True)}'
16 | )
17 | typer.echo("")
18 | typer.echo(header)
19 |
20 | for i, expected in enumerate(prediction.test.expected, start=1):
21 | typer.secho(f"{i}. ", fg=typer.colors.BRIGHT_BLUE, bold=True, nl=False)
22 | typer.secho(expected, bold=True)
23 |
24 | options = [str(idx) for idx, _ in enumerate(prediction.test.expected, start=1)] + ["n"]
25 |
26 | prompt_string = f"[{typer.style('matching number', fg=typer.colors.GREEN, bold=True)} or {typer.style('n', fg=typer.colors.RED, bold=True)}]"
27 | click_choice = click.Choice(options)
28 | response = typer.prompt(prompt_string, default="n", type=click_choice, show_choices=False).lower()
29 | if response == "n":
30 | return [
31 | Evaluator.Candidate(prediction=prediction.output, expected=expected, score=0.0, passed=False)
32 | for expected in prediction.test.expected
33 | ]
34 | return [
35 | Evaluator.Candidate(
36 | prediction=prediction.output,
37 | expected=prediction.test.expected[int(response) - 1],
38 | score=1.0,
39 | passed=True,
40 | )
41 | ]
42 |
--------------------------------------------------------------------------------
/benchllm/cli/evaluator/web.py:
--------------------------------------------------------------------------------
1 | import signal
2 | from typing import Optional
3 |
4 | import typer
5 | from pywebio import session
6 | from pywebio.input import radio
7 | from pywebio.output import put_markdown
8 |
9 | from benchllm.data_types import Prediction
10 | from benchllm.evaluator import Evaluator
11 |
12 |
13 | class WebEvaluator(Evaluator):
14 | def __init__(self) -> None:
15 | super().__init__(workers=1)
16 |
17 | @session.defer_call
18 | def on_close() -> None:
19 | print("shutting down")
20 | typer.secho(
21 | f"The evaluation was interrupted. Run bench eval to start again", fg=typer.colors.RED, bold=True
22 | )
23 | # sys.exit doesn't work here, so we have to raise a signal to kill the process
24 | signal.raise_signal(signal.SIGINT)
25 |
26 | put_markdown("# BenchLLM Web Evaluator")
27 |
28 | def evaluate_prediction(self, prediction: Prediction) -> list[Evaluator.Candidate]:
29 | test_name = prediction.test.file_path or prediction.test.id
30 |
31 | put_markdown(f"## {test_name}")
32 | put_markdown(f"*Question*: `{prediction.test.input}`")
33 | put_markdown(f"*Prediction*: `{prediction.output}`")
34 |
35 | table = [["Question:", f"{prediction.test.input}", ""], ["Prediction:", prediction.output], ""]
36 | label = f"Question: {prediction.test.input}Prediction: {prediction.output}"
37 |
38 | options: list[dict[str, Optional[int | str]]] = [
39 | {"label": expected, "value": idx} for idx, expected in enumerate(prediction.test.expected)
40 | ]
41 | options.append({"label": "None", "value": None, "selected": True})
42 | answer = radio("Pick the matching answer", options=options, required=True)
43 |
44 | if answer and isinstance(answer, int):
45 | return [
46 | Evaluator.Candidate(
47 | prediction=prediction.output, expected=prediction.test.expected[answer], score=1.0, passed=True
48 | )
49 | ]
50 | else:
51 | return [
52 | Evaluator.Candidate(prediction=prediction.output, expected=expected, score=0.0, passed=False)
53 | for expected in prediction.test.expected
54 | ]
55 |
--------------------------------------------------------------------------------
/benchllm/cli/listener.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import json
3 | from pathlib import Path
4 | from typing import Optional
5 |
6 | import typer
7 | from rich import print
8 | from rich.console import Console
9 | from rich.markup import render
10 | from rich.table import Table
11 |
12 | from benchllm.cache import MemoryCache
13 | from benchllm.data_types import (
14 | CallErrorType,
15 | Evaluation,
16 | FunctionID,
17 | Prediction,
18 | Test,
19 | TestFunction,
20 | )
21 | from benchllm.evaluator import Evaluator
22 | from benchllm.listener import EvaluatorListener, TesterListener
23 | from benchllm.utils import collect_call_errors
24 |
25 |
26 | class ReportListener(TesterListener, EvaluatorListener):
27 | def __init__(self, *, output_dir: Path) -> None:
28 | super().__init__()
29 | self.output_dir = output_dir
30 |
31 | def test_ended(self, prediction: Prediction) -> None:
32 | path = self.output_dir / "predictions" / f"{prediction.test.id}.json"
33 | path.parent.mkdir(parents=True, exist_ok=True)
34 |
35 | with open(path, "w") as f:
36 | json.dump(json.loads(prediction.json()), f, indent=2)
37 |
38 | def evaluate_prediction_ended(self, evaluation: Evaluation) -> None:
39 | evaluation_json = json.loads(evaluation.json())
40 | prediction_json = evaluation_json.pop("prediction")
41 | prediction_json["evaluation"] = evaluation_json
42 |
43 | path = self.output_dir / "evaluations" / f"{evaluation.prediction.test.id}.json"
44 | path.parent.mkdir(parents=True, exist_ok=True)
45 |
46 | with open(path, "w") as f:
47 | json.dump(prediction_json, f, indent=2)
48 |
49 |
50 | class RichCliListener(TesterListener, EvaluatorListener):
51 | def __init__(
52 | self,
53 | root_dir: Path,
54 | *,
55 | interactive: bool,
56 | test_only: bool = False,
57 | eval_only: bool = False,
58 | ) -> None:
59 | super().__init__()
60 | self.root_dir = root_dir
61 | self.interactive = interactive
62 | self._eval_only = eval_only
63 | self._test_only = test_only
64 | self._evaluator: Optional[Evaluator] = None
65 |
66 | def set_evaulator(self, evaluator: Evaluator) -> None:
67 | self._evaluator = evaluator
68 |
69 | def test_run_started(self) -> None:
70 | print_centered(" Run Tests ")
71 |
72 | def test_run_ended(self, predications: list[Prediction]) -> None:
73 | if not self._test_only:
74 | return
75 | total_test_time = sum(prediction.time_elapsed for prediction in predications) or 0.0
76 | tmp = f" [green]{len(predications)} tests[/green], in [blue]{format_time(total_test_time)}[/blue] "
77 | print_centered(tmp)
78 |
79 | def test_function_started(self, test_function: TestFunction) -> None:
80 | typer.echo(f"{test_function.function_id.relative_str(self.root_dir)} ", nl=False)
81 |
82 | def test_function_ended(self) -> None:
83 | typer.echo("")
84 |
85 | def test_started(self, test: Test) -> None:
86 | pass
87 |
88 | def test_ended(self, prediction: Prediction) -> None:
89 | typer.secho(".", fg=typer.colors.GREEN, bold=True, nl=False)
90 |
91 | def test_skipped(self, test: Test, error: bool = False) -> None:
92 | if error:
93 | typer.secho("E", fg=typer.colors.RED, bold=True, nl=False)
94 | else:
95 | typer.secho("s", fg=typer.colors.YELLOW, bold=True, nl=False)
96 |
97 | def evaluate_started(self) -> None:
98 | print_centered(" Evaluate Tests ")
99 |
100 | def evaluate_module_started(self, function_id: FunctionID) -> None:
101 | typer.echo(f"{function_id.relative_str(self.root_dir)} ", nl=False)
102 |
103 | def evaluate_module_ended(self) -> None:
104 | typer.echo("")
105 |
106 | def evaluate_prediction_started(self, prediction: Prediction) -> None:
107 | pass
108 |
109 | def evaluate_prediction_ended(self, evaluation: Evaluation) -> None:
110 | if self.interactive:
111 | return
112 |
113 | if evaluation.passed:
114 | typer.secho(".", fg=typer.colors.GREEN, bold=True, nl=False)
115 | else:
116 | typer.secho("F", fg=typer.colors.RED, bold=True, nl=False)
117 |
118 | def handle_call_error(self, evaluations) -> None:
119 | predictions_with_calls = [
120 | evaluation.prediction for evaluation in evaluations if evaluation.prediction.test.calls
121 | ]
122 | if not predictions_with_calls:
123 | return
124 |
125 | print_centered(" Call Warnings ")
126 |
127 | for prediction in predictions_with_calls:
128 | errors = collect_call_errors(prediction)
129 | if not errors:
130 | continue
131 | relative_path = prediction.function_id.relative_str(self.root_dir)
132 | print_centered(f" [yellow]{relative_path}[/yellow] :: [yellow]{prediction.test.file_path}[/yellow] ", "-")
133 |
134 | for error in errors:
135 | if error.error_type == CallErrorType.MISSING_ARGUMENT:
136 | print(
137 | f'[blue][bold]{error.function_name}[/bold][/blue] was never called with [blue][bold]"{error.argument_name}"[/bold][/blue]'
138 | )
139 | elif error.error_type == CallErrorType.MISSING_FUNCTION:
140 | print(f"[blue][bold]{error.function_name}[/bold][/blue] was never declared")
141 | elif error.error_type == CallErrorType.VALUE_MISMATCH:
142 | print(
143 | f'[blue][bold]{error.function_name}[/bold][/blue] was called with "{error.argument_name}=[red][bold]{error.actual_value}[/bold][/red]", expected "[green][bold]{error.expected_value}[/bold][/green]"'
144 | )
145 |
146 | def evaluate_ended(self, evaluations: list[Evaluation]) -> None:
147 | self.handle_call_error(evaluations)
148 |
149 | failed = [evaluation for evaluation in evaluations if not evaluation.passed]
150 | total_test_time = (
151 | 0.0 if self._eval_only else sum(evaluation.prediction.time_elapsed for evaluation in evaluations) or 0.0
152 | )
153 | total_eval_time = sum(evaluation.eval_time_elapsed for evaluation in evaluations) or 0.0
154 | if failed:
155 | print_centered(" Failures ")
156 | for failure in failed:
157 | prediction = failure.prediction
158 | relative_path = prediction.function_id.relative_str(self.root_dir)
159 | print_centered(
160 | f" [red]{relative_path}[/red] :: [red]{prediction.test.file_path} ({failure.score:.2f})[/red] ", "-"
161 | )
162 |
163 | console = Console()
164 |
165 | table = Table(show_header=False, show_lines=True)
166 | table.add_row(f"Input", str(prediction.test.input))
167 | table.add_row(f"Output", f"[red]{prediction.output}[/red]")
168 | for i, answer in enumerate(prediction.test.expected):
169 | table.add_row(f"Expected #{i+1}", str(answer))
170 | console.print(table)
171 |
172 | tmp = f" [red]{len(failed)} failed[/red], [green]{len(evaluations) - len(failed)} passed[/green], in [blue]{format_time(total_eval_time + total_test_time)}[/blue] "
173 | if isinstance(self._evaluator, MemoryCache):
174 | tmp += f"(cached hits {self._evaluator.num_cache_hits}, cached misses {self._evaluator.num_cache_misses}) "
175 |
176 | print_centered(tmp)
177 |
178 |
179 | def print_centered(text: str, sep: str = "=") -> None:
180 | console = Console()
181 | terminal_width = console.width
182 |
183 | padding = (terminal_width - len(render(text))) // 2
184 | print(sep * padding, f"[bold]{text}[/bold]", sep * padding, sep="")
185 |
186 |
187 | def format_time(seconds: float) -> str:
188 | delta = datetime.timedelta(seconds=seconds)
189 | if seconds < 1:
190 | milliseconds = int(seconds * 1000)
191 | return f"{milliseconds:.2f}ms"
192 | elif seconds < 60:
193 | return f"{seconds:.2f}s"
194 | else:
195 | return str(delta)
196 |
--------------------------------------------------------------------------------
/benchllm/cli/main.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from typing import Annotated, Optional
3 | from uuid import uuid4
4 |
5 | import typer
6 |
7 | from benchllm.cli import add_test, evaluate_predictions, list_tests, run_suite
8 | from benchllm.cli.utils import output_dir_factory
9 |
10 | app = typer.Typer(add_completion=False)
11 |
12 |
13 | @app.command(help="Run tests and evaluations.")
14 | def run(
15 | output_dir: Annotated[
16 | Path, typer.Option(help="Output directory to save evaluation reports into.", default_factory=output_dir_factory)
17 | ],
18 | file_or_dir: Annotated[
19 | Optional[list[Path]],
20 | typer.Argument(
21 | help="Paths to python files or directories implemented @benchllm.test functions.",
22 | exists=True,
23 | resolve_path=True,
24 | ),
25 | ] = None,
26 | model: Annotated[str, typer.Option(help="Model to use to run the evaluation.")] = "gpt-3",
27 | eval: Annotated[bool, typer.Option(help="Run final evaluation.")] = True,
28 | workers: Annotated[int, typer.Option(help="Number of workers to use to run the evaluation.")] = 1,
29 | retry_count: Annotated[int, typer.Option(help="Rerun tests to spot flaky output")] = 1,
30 | evaluator: Annotated[str, typer.Option(help="Evaluator to use to run the evaluation.")] = "semantic",
31 | cache: Annotated[str, typer.Option(help="Type of cache to use.")] = "file",
32 | ) -> None:
33 | if not file_or_dir:
34 | file_or_dir = [Path.cwd()]
35 |
36 | success = run_suite(
37 | file_search_paths=file_or_dir,
38 | model=model,
39 | output_dir=output_dir,
40 | workers=workers,
41 | evaluator_name=evaluator,
42 | no_eval=not eval,
43 | retry_count=retry_count,
44 | cache=cache,
45 | )
46 | if not success:
47 | raise typer.Exit(code=1)
48 |
49 |
50 | @app.command(help="Evaluate predictions")
51 | def eval(
52 | file_or_dir: Annotated[
53 | list[Path],
54 | typer.Argument(
55 | help="Paths to json files or directories containing json files to evaluate.",
56 | exists=True,
57 | resolve_path=True,
58 | ),
59 | ],
60 | output_dir: Annotated[
61 | Path, typer.Option(help="Output directory to save evaluation reports into.", default_factory=output_dir_factory)
62 | ],
63 | model: Annotated[str, typer.Option(help="Model to use to run the evaluation.")] = "gpt-3",
64 | workers: Annotated[int, typer.Option(help="Number of workers to use to run the evaluation.")] = 1,
65 | evaluator: Annotated[str, typer.Option(help="Evaluator to use to run the evaluation.")] = "semantic",
66 | cache: Annotated[str, typer.Option(help="Type of cache to use.")] = "file",
67 | ) -> None:
68 | success = evaluate_predictions(
69 | file_or_dir=file_or_dir,
70 | model=model,
71 | output_dir=output_dir,
72 | workers=workers,
73 | evaluator_name=evaluator,
74 | cache=cache,
75 | )
76 | if not success:
77 | raise typer.Exit(code=1)
78 |
79 |
80 | @app.command(help="Add a new test case to a suite.")
81 | def add(
82 | suite_path: Annotated[Optional[Path], typer.Argument(help="Test suite directory.")],
83 | input: Annotated[str, typer.Option(help="Input prompt to send to your model.")],
84 | expected: Annotated[
85 | list[str], typer.Option(help="Expected output prompt. You can use this option multiple times.")
86 | ],
87 | name: Annotated[
88 | str,
89 | typer.Option(
90 | help="Name of the test case. Generated UUID when not specified.",
91 | default_factory=uuid4,
92 | ),
93 | ],
94 | overwrite: Annotated[bool, typer.Option(help="Overwrite existing test case.")] = False,
95 | ) -> None:
96 | add_test(input=input, expected=expected, name=name, overwrite=overwrite, suite_path=suite_path)
97 |
98 |
99 | @app.command(help="List all tests.")
100 | def tests(suite_path: Annotated[Path, typer.Argument(help="Test suite directory.")]) -> None:
101 | list_tests(suite_path=suite_path)
102 |
103 |
104 | def main() -> None:
105 | app()
106 |
107 |
108 | if __name__ == "__main__":
109 | main()
110 |
--------------------------------------------------------------------------------
/benchllm/cli/utils.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | from pathlib import Path
3 |
4 | from benchllm.cache import FileCache, MemoryCache
5 | from benchllm.cli.evaluator import InteractiveEvaluator, WebEvaluator
6 | from benchllm.evaluator import (
7 | EmbeddingEvaluator,
8 | Evaluator,
9 | SemanticEvaluator,
10 | StringMatchEvaluator,
11 | )
12 |
13 |
14 | def output_dir_factory() -> Path:
15 | timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
16 | output_dir = Path.cwd() / "output" / str(timestamp)
17 | output_dir.mkdir(exist_ok=True, parents=True)
18 |
19 | latest = Path.cwd() / "output" / "latest"
20 | if latest.exists():
21 | latest.unlink()
22 | latest.symlink_to(output_dir)
23 | return output_dir
24 |
25 |
26 | def get_evaluator(evaluator_name: str, model: str, workers: int) -> Evaluator:
27 | if evaluator_name == "semantic":
28 | return SemanticEvaluator(model=model, workers=workers)
29 | elif evaluator_name == "interactive":
30 | return InteractiveEvaluator()
31 | elif evaluator_name == "string-match":
32 | return StringMatchEvaluator(workers=workers)
33 | elif evaluator_name == "web":
34 | return WebEvaluator()
35 | elif evaluator_name == "embedding":
36 | return EmbeddingEvaluator()
37 | else:
38 | raise ValueError(f"Unknown evaluator {evaluator_name}")
39 |
40 |
41 | def add_cache(cache_name: str, evaluator: Evaluator, cache_path: Path) -> Evaluator:
42 | if cache_name == "file":
43 | return FileCache(evaluator, cache_path)
44 | elif cache_name == "memory":
45 | return MemoryCache(evaluator)
46 | elif cache_name == "none":
47 | return evaluator
48 | else:
49 | raise ValueError(f"Unknown cache {cache_name}, valid values are 'file', 'memory', 'none'")
50 |
--------------------------------------------------------------------------------
/benchllm/data_types.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | from pathlib import Path
3 | from typing import Any, Callable, Generic, Optional, TypeVar
4 | from uuid import uuid4
5 |
6 | from pydantic import BaseModel, Field
7 |
8 |
9 | class TestCall(BaseModel):
10 | __test__ = False
11 | name: str
12 | arguments: dict[str, Any]
13 | returns: Any
14 |
15 |
16 | class Test(BaseModel):
17 | __test__ = False
18 | id: str = Field(default_factory=lambda: str(uuid4()))
19 | input: Any
20 | expected: list[str]
21 | file_path: Optional[Path] = None
22 | calls: Optional[list[TestCall]] = None
23 |
24 |
25 | class FunctionID(BaseModel):
26 | module_path: Path
27 | line_number: int
28 | name: str
29 |
30 | def __hash__(self) -> int:
31 | return hash((self.module_path, self.line_number))
32 |
33 | def __str__(self) -> str:
34 | return f"{self.module_path}:{self.line_number} ({self.name})"
35 |
36 | def relative_str(self, root_dir: Path) -> str:
37 | try:
38 | return str(
39 | FunctionID(
40 | module_path=self.module_path.relative_to(root_dir), line_number=self.line_number, name=self.name
41 | )
42 | )
43 | except ValueError:
44 | # we can't be sure that the module_path loaded from json files is relative to the root_dir
45 | return str(FunctionID(module_path=self.module_path, line_number=self.line_number, name=self.name))
46 |
47 | @staticmethod
48 | def default() -> "FunctionID":
49 | return FunctionID(module_path=Path(""), line_number=0, name="default")
50 |
51 |
52 | class Prediction(BaseModel):
53 | test: Test
54 | output: str
55 | time_elapsed: float
56 | function_id: FunctionID
57 | calls: dict[str, list[dict[str, Any]]] = {}
58 |
59 |
60 | class CallErrorType(str, Enum):
61 | MISSING_FUNCTION = "Missing function"
62 | MISSING_ARGUMENT = "Missing argument"
63 | VALUE_MISMATCH = "Value mismatch"
64 |
65 |
66 | class CallError(BaseModel):
67 | function_name: str
68 | argument_name: Optional[str] = None
69 | expected_value: Optional[Any] = None
70 | actual_value: Optional[Any] = None
71 | error_type: CallErrorType
72 |
73 |
74 | class Evaluation(BaseModel):
75 | prediction: Prediction
76 | passed: bool
77 | eval_time_elapsed: float
78 | score: float
79 |
80 |
81 | T = TypeVar("T")
82 |
83 |
84 | class TestFunction(BaseModel, Generic[T]):
85 | function: Callable[[T], Any]
86 | function_id: FunctionID
87 | input_type: T
88 | suite: Optional[Path] = None
89 |
--------------------------------------------------------------------------------
/benchllm/evaluator/__init__.py:
--------------------------------------------------------------------------------
1 | from benchllm.evaluator.evaluator import Evaluator # noqa
2 | # Adding an empty comment to force import order to avoid circular imports
3 | from benchllm.evaluator.embedding import EmbeddingEvaluator # noqa
4 | from benchllm.evaluator.semantic import SemanticEvaluator # noqa
5 | from benchllm.evaluator.string_match import StringMatchEvaluator # noqa
6 |
--------------------------------------------------------------------------------
/benchllm/evaluator/embedding.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import openai
3 |
4 | from benchllm.data_types import Prediction
5 | from benchllm.evaluator import Evaluator
6 |
7 |
8 | class EmbeddingEvaluator(Evaluator):
9 | def __init__(self, *, engine: str = "text-similarity-davinci-001", threshold: float = 0.9, workers: int = 1):
10 | super().__init__(workers=workers)
11 | self._engine = engine
12 | self._threshold = threshold
13 |
14 | def evaluate_prediction(self, prediction: Prediction) -> list[Evaluator.Candidate]:
15 | output_embedding = get_embedding(prediction.output, engine=self._engine)
16 | candidates = []
17 | for expected in prediction.test.expected:
18 | expected_embedding = get_embedding(expected, engine=self._engine)
19 | similarity = cosine_similarity(output_embedding, expected_embedding)
20 | candidates.append(
21 | Evaluator.Candidate(
22 | prediction=prediction.output,
23 | expected=expected,
24 | score=similarity,
25 | passed=similarity > self._threshold,
26 | )
27 | )
28 | return candidates
29 |
30 |
31 | # these also exist in openai.embeddings_utils but have additional dependencies
32 | def get_embedding(text: str, engine: str, **kwargs) -> list[float]:
33 | text = text.replace("\n", " ")
34 | return openai.Embedding.create(input=[text], engine=engine, **kwargs)["data"][0]["embedding"]
35 |
36 |
37 | def cosine_similarity(a, b):
38 | return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
39 |
--------------------------------------------------------------------------------
/benchllm/evaluator/evaluator.py:
--------------------------------------------------------------------------------
1 | import json
2 | from abc import ABC, abstractmethod
3 | from concurrent.futures import ThreadPoolExecutor
4 | from itertools import groupby
5 | from operator import attrgetter
6 | from pathlib import Path
7 | from timeit import default_timer as timer
8 | from typing import Optional
9 |
10 | import yaml
11 | from pydantic import BaseModel
12 |
13 | from benchllm.data_types import Evaluation, FunctionID, Prediction
14 | from benchllm.input_types import Json
15 | from benchllm.listener import EvaluatorListener
16 |
17 |
18 | class Evaluator(ABC):
19 | def __init__(self, workers: int = 1):
20 | self._predictions: list[Prediction] = []
21 | self._listeners: list[EvaluatorListener] = []
22 | self._evaluations: list[Evaluation] = []
23 | self._workers: int = workers
24 |
25 | class Candidate(BaseModel):
26 | prediction: Json
27 | expected: Json
28 | score: float
29 | passed: bool
30 |
31 | def add_listener(self, listener: EvaluatorListener) -> None:
32 | self._listeners.append(listener)
33 |
34 | def load(self, predictions: list[Prediction]) -> None:
35 | self._predictions.extend(predictions)
36 |
37 | def load_prediction_file(self, path: Path) -> None:
38 | if path.suffix == ".yml" or path.suffix == ".yaml":
39 | data = yaml.safe_load(path.read_bytes())
40 | self.load([Prediction(**data)])
41 | elif path.suffix == ".json":
42 | data = json.loads(path.read_text(encoding="UTF-8"))
43 | self.load([Prediction(**data)])
44 |
45 | def run(self) -> list[Evaluation]:
46 | self._broadcast_evaluate_started()
47 | sorted_predictions = sorted(self._predictions, key=lambda x: str(x.function_id))
48 | grouped_predictions_by_function = [
49 | (function, list(group)) for function, group in groupby(sorted_predictions, key=attrgetter("function_id"))
50 | ]
51 | with ThreadPoolExecutor(max_workers=self._workers) as executor:
52 | for function, predictions in grouped_predictions_by_function:
53 | self._broadcast_evaluate_module_started(function)
54 | for evaluation in executor.map(self._run_evaluation, predictions):
55 | self._evaluations.append(evaluation)
56 | self._broadcast_evaluate_module_ended()
57 | self._broadcast_evaluate_ended(self._evaluations)
58 | return self._evaluations
59 |
60 | def _run_evaluation(self, prediction: Prediction) -> Evaluation:
61 | self._broadcast_evaluate_prediction_started(prediction)
62 | start = timer()
63 | candidates = self.evaluate_prediction(prediction)
64 | end = timer()
65 |
66 | evaluation = Evaluation(
67 | prediction=prediction,
68 | passed=any([candidate.passed for candidate in candidates]),
69 | eval_time_elapsed=end - start,
70 | score=max([candidate.score for candidate in candidates], default=0.0),
71 | )
72 | self._broadcast_evaluate_prediction_ended(evaluation)
73 | return evaluation
74 |
75 | @property
76 | def passed(self) -> list[Evaluation]:
77 | return [evaluation for evaluation in self._evaluations if evaluation.passed]
78 |
79 | @property
80 | def failed(self) -> list[Evaluation]:
81 | return [evaluation for evaluation in self._evaluations if not evaluation.passed]
82 |
83 | @property
84 | def evaluations(self) -> list[Evaluation]:
85 | return self._evaluations
86 |
87 | @property
88 | def workers(self) -> int:
89 | return self._workers
90 |
91 | @property
92 | def predictions(self) -> list[Prediction]:
93 | return self._predictions
94 |
95 | @abstractmethod
96 | def evaluate_prediction(self, prediction: Prediction) -> list[Candidate]:
97 | """Evaluate a single prediction, return a Match if the prediction matches the expected output."""
98 | pass
99 |
100 | def max_threads(self) -> int:
101 | return 1
102 |
103 | def _broadcast_evaluate_started(self) -> None:
104 | for listener in self._listeners:
105 | listener.evaluate_started()
106 |
107 | def _broadcast_evaluate_prediction_started(self, prediction: Prediction) -> None:
108 | for listener in self._listeners:
109 | listener.evaluate_prediction_started(prediction)
110 |
111 | def _broadcast_evaluate_prediction_ended(self, evaluation: Evaluation) -> None:
112 | for listener in self._listeners:
113 | listener.evaluate_prediction_ended(evaluation)
114 |
115 | def _broadcast_evaluate_module_started(self, function_id: FunctionID) -> None:
116 | for listener in self._listeners:
117 | listener.evaluate_module_started(function_id)
118 |
119 | def _broadcast_evaluate_module_ended(self) -> None:
120 | for listener in self._listeners:
121 | listener.evaluate_module_ended()
122 |
123 | def _broadcast_evaluate_ended(self, evaluations: list[Evaluation]) -> None:
124 | for listener in self._listeners:
125 | listener.evaluate_ended(evaluations)
126 |
--------------------------------------------------------------------------------
/benchllm/evaluator/semantic.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | from benchllm.data_types import Prediction
4 | from benchllm.evaluator import Evaluator
5 | from benchllm.similarity import semantically_similar
6 |
7 |
8 | class SemanticEvaluator(Evaluator):
9 | def __init__(self, *, model: str = "gpt-3", workers: int = 1, early_quitting: bool = True):
10 | super().__init__(workers=workers)
11 | self.model = model
12 | self.early_quitting = early_quitting
13 |
14 | def evaluate_prediction(self, prediction: Prediction) -> list[Evaluator.Candidate]:
15 | candidates = []
16 | for expected in prediction.test.expected:
17 | if semantically_similar(expected, prediction.output, model=self.model):
18 | candidate = Evaluator.Candidate(prediction=prediction.output, expected=expected, score=1.0, passed=True)
19 | if self.early_quitting:
20 | return [candidate]
21 | else:
22 | candidates.append(candidate)
23 | else:
24 | candidates.append(
25 | Evaluator.Candidate(prediction=prediction.output, expected=expected, score=0.0, passed=False)
26 | )
27 | return candidates
28 |
--------------------------------------------------------------------------------
/benchllm/evaluator/string_match.py:
--------------------------------------------------------------------------------
1 | from benchllm.data_types import Prediction
2 | from benchllm.evaluator import Evaluator
3 |
4 |
5 | class StringMatchEvaluator(Evaluator):
6 | def __init__(self, *, case_sensitive: bool = False, fuzzy: bool = False, workers: int = 1):
7 | super().__init__(workers=workers)
8 |
9 | self._case_sensitive = case_sensitive
10 | self._fuzzy = fuzzy
11 |
12 | def match_strings(self, expected: str, output: str) -> bool:
13 | if not self._case_sensitive:
14 | expected = expected.lower()
15 | output = output.lower()
16 |
17 | if self._fuzzy:
18 | return expected in output or output in expected
19 |
20 | return expected == output
21 |
22 | def evaluate_prediction(self, prediction: Prediction) -> list[Evaluator.Candidate]:
23 | output = prediction.output
24 | candidates = []
25 | for expected in prediction.test.expected:
26 | if self.match_strings(expected, output):
27 | candidates.append(Evaluator.Candidate(prediction=output, expected=expected, score=1.0, passed=True))
28 | else:
29 | candidates.append(Evaluator.Candidate(prediction=output, expected=expected, score=0.0, passed=False))
30 | return candidates
31 |
--------------------------------------------------------------------------------
/benchllm/input_types.py:
--------------------------------------------------------------------------------
1 | from typing import TypedDict, Union
2 |
3 | from pydantic import BaseModel
4 |
5 | Json = Union[str, bool, list, dict]
6 |
7 |
8 | class ChatInputItem(TypedDict):
9 | role: str
10 | content: str
11 |
12 |
13 | ChatInput = list[ChatInputItem]
14 |
15 |
16 | class SimilarityInput(BaseModel):
17 | prompt_1: str
18 | prompt_2: str
19 |
--------------------------------------------------------------------------------
/benchllm/listener.py:
--------------------------------------------------------------------------------
1 | from .data_types import Evaluation, FunctionID, Prediction, Test, TestFunction
2 |
3 |
4 | class TesterListener:
5 | def test_run_started(self) -> None:
6 | pass
7 |
8 | def test_run_ended(self, predications: list[Prediction]) -> None:
9 | pass
10 |
11 | def test_function_started(self, test_function: TestFunction) -> None:
12 | pass
13 |
14 | def test_function_ended(self) -> None:
15 | pass
16 |
17 | def test_started(self, test: Test) -> None:
18 | pass
19 |
20 | def test_ended(self, prediction: Prediction) -> None:
21 | pass
22 |
23 | def test_skipped(self, test: Test, error: bool = False) -> None:
24 | pass
25 |
26 |
27 | class EvaluatorListener:
28 | def evaluate_started(self) -> None:
29 | pass
30 |
31 | def evaluate_prediction_started(self, prediction: Prediction) -> None:
32 | pass
33 |
34 | def evaluate_prediction_ended(self, evaluation: Evaluation) -> None:
35 | pass
36 |
37 | def evaluate_module_started(self, function_id: FunctionID) -> None:
38 | pass
39 |
40 | def evaluate_module_ended(self) -> None:
41 | pass
42 |
43 | def evaluate_ended(self, evaluations: list[Evaluation]) -> None:
44 | pass
45 |
--------------------------------------------------------------------------------
/benchllm/similarity.py:
--------------------------------------------------------------------------------
1 | import openai
2 |
3 |
4 | def completion_func(prompt: str) -> str:
5 | response = openai.Completion.create(
6 | prompt=prompt, engine="text-davinci-003", max_tokens=100, temperature=0.7, n=1, stop=None
7 | )
8 | return response.choices[0].text.strip()
9 |
10 |
11 | def chat_completion_func(prompt: str, *, model: str) -> str:
12 | response = openai.ChatCompletion.create(
13 | model=model, messages=[{"role": "user", "content": prompt}], max_tokens=100, temperature=0.7, n=1, stop=None
14 | )
15 | return response.choices[0].message.content.strip()
16 |
17 |
18 | def complete_text(prompt: str, *, model: str) -> str:
19 | full_prompt = f"""
20 | You will get two anwsers to a question, you should determine if they are semantically similar or not.
21 | You can only answer "same" or "different", nothing else.
22 |
23 | input: {{
24 | "answer_1": "I was created by X",
25 | "answer_2": "X created me"
26 | }}
27 | output: same
28 |
29 | input: {{
30 | "answer_1": "There are 52 days in a year",
31 | "answer_2": "A year is fairly long"
32 | }}
33 | output: different
34 |
35 | input: {prompt}
36 | output:"""
37 |
38 | model_func = completion_func if model == "gpt-3" else lambda prompt: chat_completion_func(prompt, model=model)
39 | return model_func(prompt=full_prompt)
40 |
41 |
42 | def semantically_similar(answer1: str, answer2: str, model: str = "gpt-3") -> bool:
43 | response = complete_text(
44 | f"""{{
45 | "answer_1": "{answer1}",
46 | "answer_2": "{answer2}"
47 | }}""",
48 | model=model,
49 | )
50 | if response not in ["same", "different"]:
51 | raise ValueError(f"Unexpected response: {response}")
52 | return response == "same"
53 |
--------------------------------------------------------------------------------
/benchllm/singleton.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from typing import Any, Callable, Generic, Type, TypeVar
3 |
4 | from pydantic import BaseModel
5 |
6 | T = TypeVar("T")
7 |
8 |
9 | class FunctionRegistry(BaseModel, Generic[T]):
10 | func: Callable[[T], T]
11 | type: Any
12 | suite: Path
13 |
14 |
15 | class TestSingleton(Generic[T]):
16 | _instance = None
17 | functions: list[FunctionRegistry[T]] = []
18 |
19 | def __new__(cls: Type["TestSingleton"], *args: list, **kwargs: dict) -> "TestSingleton":
20 | if not cls._instance:
21 | cls._instance = super().__new__(cls)
22 | return cls._instance
23 |
24 | def register(self, func: Callable[[T], T], input_type: Type[T], suite: Path) -> None:
25 | self.functions.append(FunctionRegistry(func=func, type=input_type, suite=suite))
26 |
27 | def clear(self) -> None:
28 | self.functions = []
29 |
--------------------------------------------------------------------------------
/benchllm/tester.py:
--------------------------------------------------------------------------------
1 | import importlib.util
2 | import inspect
3 | import json
4 | import sys
5 | import uuid
6 | from contextlib import contextmanager
7 | from pathlib import Path
8 | from timeit import default_timer as timer
9 | from types import ModuleType
10 | from typing import Any, Callable, Iterator, Optional, Union
11 |
12 | import yaml
13 | from pydantic import ValidationError, parse_obj_as
14 |
15 | from .data_types import FunctionID, Prediction, Test, TestFunction
16 | from .listener import TesterListener
17 | from .singleton import TestSingleton
18 |
19 | CallableTest = Union[TestFunction, Callable[[Any], Any]]
20 |
21 |
22 | class Tester:
23 | __test__ = False
24 |
25 | def __init__(self, test_function: Optional[CallableTest] = None, *, retry_count: int = 1) -> None:
26 | self._tests: dict[FunctionID, list[Test]] = {}
27 | self._test_functions: dict[FunctionID, TestFunction] = {}
28 | self._listeners: list[TesterListener] = []
29 | self._predictions: list[Prediction] = []
30 | self._retry_count = retry_count
31 |
32 | if test_function:
33 | self.add_test_function(test_function=test_function)
34 |
35 | def add_listener(self, listener: TesterListener) -> None:
36 | self._listeners.append(listener)
37 |
38 | def add_tests(self, tests: list[Test], function_id: FunctionID = FunctionID.default()) -> None:
39 | self._tests.setdefault(function_id, []).extend(tests)
40 |
41 | def add_test(self, test: Test, function_id: FunctionID = FunctionID.default()) -> None:
42 | self._tests.setdefault(function_id, []).append(test)
43 |
44 | def add_test_function(self, test_function: CallableTest) -> None:
45 | """Adds a test function to the tester, either a TestFunction or a Callable, Callables will get a default FunctionID"""
46 | if isinstance(test_function, TestFunction):
47 | self._test_functions[test_function.function_id] = test_function
48 | return
49 | self.add_test_function(TestFunction(function=test_function, function_id=FunctionID.default(), input_type=Any))
50 |
51 | def load_tests(self, suite: Path, function_id: FunctionID) -> None:
52 | if self._test_functions.get(function_id) is None:
53 | raise Exception(f"No test function loaded for module {function_id}")
54 |
55 | for test in load_files(suite):
56 | self.add_test(test, function_id)
57 |
58 | def load_module(self, path: Union[str, Path]) -> None:
59 | path = Path(path)
60 | test_singleton = TestSingleton()
61 | test_singleton.clear()
62 |
63 | import_module_from_file(path)
64 |
65 | if not test_singleton.functions:
66 | raise NoBenchLLMTestFunction()
67 |
68 | for function in test_singleton.functions:
69 | function_id = FunctionID(
70 | module_path=path,
71 | line_number=inspect.getsourcelines(function.func)[1],
72 | name=function.func.__name__,
73 | )
74 |
75 | self.add_test_function(
76 | TestFunction(
77 | function=function.func,
78 | function_id=function_id,
79 | input_type=function.type,
80 | suite=function.suite,
81 | )
82 | )
83 | self.load_tests(function.suite, function_id)
84 |
85 | def run(self) -> list[Prediction]:
86 | """Runs each test through the test function and stores the result"""
87 |
88 | self._broadcast_test_run_started()
89 |
90 | if not self._test_functions:
91 | raise Exception("No function loaded, run load_module() first")
92 |
93 | if not self._tests:
94 | raise Exception("No tests loaded, run load_tests() first")
95 |
96 | for test_function in self._test_functions.values():
97 | self._broadcast_test_function_started(test_function)
98 | for test in self._tests.get(test_function.function_id, []):
99 | for _ in range(self._retry_count):
100 | # Checks that the arity of the function matches the number of inputs.
101 | if "__annotations__" in dir(test_function.input_type):
102 | if len(test.input) != len(test_function.input_type.__annotations__):
103 | raise Exception(
104 | f"Your test function needs to have an input parameter annotated with the input type, {test.input}\n\n{test_function.input_type.__annotations__}"
105 | )
106 |
107 | # Now, try to parse the input. If we fail, we will skip the test.
108 | try:
109 | input = parse_obj_as(test_function.input_type, test.input)
110 | except ValidationError:
111 | self._broadcast_test_skipped(test, error=True)
112 | continue
113 |
114 | self._broadcast_test_started(test)
115 | start = timer()
116 |
117 | # set up mock functions for the test calls
118 | calls_made: dict[str, Any] = {}
119 | with setup_mocks(test, calls_made):
120 | output = test_function.function(input)
121 |
122 | end = timer()
123 | prediction = Prediction(
124 | test=test,
125 | output=output,
126 | time_elapsed=end - start,
127 | function_id=test_function.function_id,
128 | calls=calls_made,
129 | )
130 | self._predictions.append(prediction)
131 | self._broadcast_test_ended(prediction)
132 | self._broadcast_test_function_ended()
133 | self._broadcast_test_run_ended(self._predictions)
134 | return self._predictions
135 |
136 | @property
137 | def predictions(self) -> list[Prediction]:
138 | return self._predictions
139 |
140 | def tests(self, function_id: FunctionID = FunctionID.default()) -> list[Test]:
141 | return self._tests.get(function_id, [])
142 |
143 | def _broadcast_test_run_started(self) -> None:
144 | for listener in self._listeners:
145 | listener.test_run_started()
146 |
147 | def _broadcast_test_run_ended(self, predications: list[Prediction]) -> None:
148 | for listener in self._listeners:
149 | listener.test_run_ended(predications)
150 |
151 | def _broadcast_test_function_started(self, test_function: TestFunction) -> None:
152 | for listener in self._listeners:
153 | listener.test_function_started(test_function)
154 |
155 | def _broadcast_test_function_ended(self) -> None:
156 | for listener in self._listeners:
157 | listener.test_function_ended()
158 |
159 | def _broadcast_test_started(self, test: Test) -> None:
160 | for listener in self._listeners:
161 | listener.test_started(test)
162 |
163 | def _broadcast_test_ended(self, prediction: Prediction) -> None:
164 | for listener in self._listeners:
165 | listener.test_ended(prediction)
166 |
167 | def _broadcast_test_skipped(self, test: Test, error: bool = False) -> None:
168 | for listener in self._listeners:
169 | listener.test_skipped(test, error)
170 |
171 |
172 | def load_files(directory: Union[str, Path]) -> list[Test]:
173 | directory_path = Path(directory)
174 | tests = []
175 | for file_path in directory_path.rglob("*"):
176 | if not file_path.is_file():
177 | continue
178 | if file_path.suffix not in {".json", ".yml", ".yaml"}:
179 | continue
180 | with open(file_path, "r") as file:
181 | if file_path.suffix == ".json":
182 | data = json.load(file)
183 | elif file_path.suffix in {".yml", ".yaml"}:
184 | data = yaml.safe_load(file)
185 | try:
186 | test = init_test({**data, **{"file_path": file_path}})
187 | except ValidationError:
188 | raise TestLoadException(file_path, "failed to parse your test file") from None
189 | tests.append(test)
190 | return tests
191 |
192 |
193 | def init_test(data: dict) -> Test:
194 | if "id" not in data:
195 | data["id"] = str(uuid.uuid4())
196 |
197 | dump_data = {k: v for k, v in data.items() if k != "file_path"}
198 | with open(data["file_path"], "w") as f:
199 | if data["file_path"].suffix == ".json":
200 | json.dump(dump_data, f, indent=2, sort_keys=True)
201 | elif data["file_path"].suffix in {".yml", ".yaml"}:
202 | yaml.safe_dump(dump_data, f, indent=2)
203 |
204 | return Test(**data)
205 |
206 |
207 | class TestLoadException(Exception):
208 | def __init__(self, file_path: Path, error_message: str) -> None:
209 | self.file_path = file_path
210 | self.error_message = error_message
211 |
212 | def __str__(self) -> str:
213 | return f"Failed to load '{self.file_path}'\n{self.error_message}"
214 |
215 |
216 | class NoBenchLLMTestFunction(Exception):
217 | pass
218 |
219 |
220 | def import_module_from_file(file_path: Path) -> ModuleType:
221 | # Make sure the file exists.
222 | if not file_path.exists():
223 | raise FileNotFoundError(f"File not found: {file_path}")
224 |
225 | # Get the module name from the file path (remove the .py extension).
226 | module_name = file_path.stem
227 |
228 | # Create a module specification from the file path.
229 | spec = importlib.util.spec_from_file_location(module_name, file_path)
230 |
231 | if not spec or not spec.loader:
232 | raise Exception(f"Failed to create module specification from {file_path}")
233 |
234 | # Create a new module based on the spec.
235 | module = importlib.util.module_from_spec(spec)
236 |
237 | if not module:
238 | raise Exception(f"Failed to load module from {file_path}")
239 |
240 | # Temporarly add the directory of the file to the system path so that the module can import other modules.
241 | file_module = file_path.resolve().parent
242 | old_sys_path = sys.path.copy()
243 | sys.path.append(str(file_module))
244 |
245 | # Execute the module.
246 | spec.loader.exec_module(module)
247 |
248 | # Restore the system path
249 | sys.path = old_sys_path
250 |
251 | # Return the module.
252 | return module
253 |
254 |
255 | @contextmanager
256 | def setup_mocks(test: Test, calls_made: dict[str, Any]) -> Iterator[None]:
257 | """Sets up mock functions for the test calls"""
258 | old_functions = []
259 | for call in test.calls or []:
260 | mock_name = call.name
261 | module_name, function_name = mock_name.rsplit(".", 1)
262 | # we need to import the module before we can mock the function
263 | module = importlib.import_module(module_name)
264 | old_functions.append((module, function_name, getattr(module, function_name)))
265 |
266 | def mock_function(*args: tuple, **kwargs: dict[str, Any]) -> Any:
267 | assert not args, "Positional arguments are not supported"
268 | if mock_name not in calls_made:
269 | calls_made[mock_name] = []
270 | calls_made[mock_name].append(kwargs)
271 | return call.returns
272 |
273 | try:
274 | setattr(module, function_name, mock_function)
275 | except AttributeError:
276 | print(f"Function {function_name} doesn't exist in module {module_name}")
277 |
278 | try:
279 | yield
280 | finally:
281 | # restore the old function
282 | for old_function in old_functions:
283 | setattr(old_function[0], old_function[1], old_function[2])
284 |
--------------------------------------------------------------------------------
/benchllm/utils.py:
--------------------------------------------------------------------------------
1 | import ast
2 | import json
3 | from pathlib import Path
4 |
5 | import yaml
6 |
7 | from benchllm.data_types import CallError, CallErrorType, Prediction
8 |
9 |
10 | class DecoratorFinder(ast.NodeVisitor):
11 | def __init__(self) -> None:
12 | self.has_decorator: bool = False
13 | self.module_aliases: list[str] = []
14 |
15 | def visit_Import(self, node: ast.Import) -> None:
16 | for alias in node.names:
17 | if alias.name == "benchllm":
18 | self.module_aliases.append(alias.asname or alias.name)
19 | self.generic_visit(node)
20 |
21 | def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
22 | for decorator in node.decorator_list:
23 | if isinstance(decorator, ast.Call) and isinstance(decorator.func, ast.Attribute):
24 | decorator = decorator.func
25 | if decorator.attr == "test":
26 | if isinstance(decorator.value, ast.Name) and decorator.value.id in self.module_aliases:
27 | self.has_decorator = True
28 | self.generic_visit(node)
29 |
30 |
31 | def check_file(path: Path) -> bool:
32 | with open(path, "r", encoding="utf8") as f:
33 | tree = ast.parse(f.read())
34 | finder = DecoratorFinder()
35 | finder.visit(tree)
36 | return finder.has_decorator
37 |
38 |
39 | def find_files(paths: list[Path]) -> list[Path]:
40 | python_files = set()
41 | for path in paths:
42 | if path.suffix == ".py" and not path.name.startswith("."):
43 | if check_file(path):
44 | python_files.add(path)
45 | else:
46 | for file in path.rglob("*.py"):
47 | if file.name.startswith("."):
48 | continue
49 | if check_file(file):
50 | python_files.add(file)
51 | return list(python_files)
52 |
53 |
54 | def find_json_yml_files(paths: list[Path]) -> list[Path]:
55 | files = []
56 | for path in paths:
57 | if path.is_file():
58 | if path.suffix in (".yml", ".json", ".yaml"):
59 | files.append(path)
60 | else:
61 | continue
62 | else:
63 | for file in path.rglob("*"):
64 | if file.suffix in (".yml", ".json", ".yaml"):
65 | files.append(file)
66 | return list(set(files))
67 |
68 |
69 | def load_prediction_files(paths: list[Path]) -> list[Prediction]:
70 | predictions = []
71 | for path in paths:
72 | for file_path in path.rglob("*"):
73 | if not file_path.is_file():
74 | continue
75 | if file_path.suffix not in {".json", ".yml", ".yaml"}:
76 | continue
77 | with open(file_path, "r") as file:
78 | if file_path.suffix == ".json":
79 | data = json.load(file)
80 | predictions.append(Prediction(**data))
81 | elif file_path.suffix in {".yml", ".yaml"}:
82 | data = yaml.safe_load(file)
83 | predictions.append(Prediction(**data))
84 | return predictions
85 |
86 |
87 | def collect_call_errors(prediction: Prediction) -> list[CallError]:
88 | """Assert that the calls in the prediction match the expected calls."""
89 | if prediction.test.calls is None:
90 | return []
91 | errors = []
92 | lookup = {call.name: call for call in prediction.test.calls}
93 |
94 | for function_name, invocations in prediction.calls.items():
95 | call = lookup[function_name]
96 | if not call:
97 | errors.append(CallError(function_name=function_name, error_type=CallErrorType.MISSING_FUNCTION))
98 | continue
99 |
100 | for arguments in invocations:
101 | for argument_name, argument_value in call.arguments.items():
102 | if argument_name not in arguments:
103 | errors.append(
104 | CallError(
105 | function_name=function_name,
106 | argument_name=argument_name,
107 | error_type=CallErrorType.MISSING_ARGUMENT,
108 | )
109 | )
110 | for argument_name, argument_value in arguments.items():
111 | if argument_name in call.arguments and argument_value != call.arguments[argument_name]:
112 | errors.append(
113 | CallError(
114 | function_name=function_name,
115 | argument_name=argument_name,
116 | expected_value=call.arguments[argument_name],
117 | actual_value=argument_value,
118 | error_type=CallErrorType.VALUE_MISMATCH,
119 | )
120 | )
121 | return errors
122 |
--------------------------------------------------------------------------------
/examples/chat/1.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - Product Lead
3 | - Andrea is a Product Lead at V7
4 | id: 4fbd341e-c489-4e39-8e24-5cfbac039cae
5 | input:
6 | - content: Who is Andrea?
7 | role: user
8 | - content: As an AI language model, I don't have information on who Andrea is.
9 | role: assistant
10 | - content: Apologies, I should have let you know that Andrea is Product Lead at V7.
11 | role: user
12 | - content: Thanks, duly noted.
13 | role: assistant
14 | - content: What does Andrea do at V7? Be very concise.
15 | role: user
16 |
--------------------------------------------------------------------------------
/examples/chat/2.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - Michael Jordan starred in the movie "Space Jam"
3 | id: 31228a9a-d515-463a-9956-d0f235de8357
4 | input:
5 | - content: Who is Michael Jordan?
6 | role: user
7 | - content: Michael Jordan is an ex NBA player.
8 | role: assistant
9 | - content: What was the team that made him famous?
10 | role: user
11 | - content: The Chicago Bulls
12 | role: assistant
13 | - content: He starred in a famous movie. Which one?
14 | role: user
15 |
--------------------------------------------------------------------------------
/examples/chat/eval.py:
--------------------------------------------------------------------------------
1 | import openai
2 |
3 | import benchllm
4 | from benchllm.input_types import ChatInput
5 |
6 |
7 | def chat(messages: ChatInput, model="gpt-3.5-turbo"):
8 | response = openai.ChatCompletion.create(model=model, messages=messages)
9 | return response.choices[0].message.content.strip()
10 |
11 |
12 | @benchllm.test(suite=".")
13 | def gpt_3_5(input: ChatInput):
14 | return chat(input)
15 |
16 |
17 | @benchllm.test(suite=".")
18 | def gpt_4(input: ChatInput):
19 | return chat(input, model="gpt-4")
20 |
--------------------------------------------------------------------------------
/examples/langchain_agent_search_calculator/1.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - approximately 38,625,801
3 | id: 41006468-536f-452a-9871-f66c456d0f14
4 | input: How many people live in canada as of 2023?
5 |
--------------------------------------------------------------------------------
/examples/langchain_agent_search_calculator/10.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - approximately 0.2791714614499425
3 | id: 4913087a-adc2-429b-9c31-96861ec561bf
4 | input: what is 1213 divided by 4345?
5 |
--------------------------------------------------------------------------------
/examples/langchain_agent_search_calculator/2.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - her boyfriend is Romain Gravas. his age raised to the .43 power is approximately
3 | 4.9373857399466665
4 | id: cd67b0fd-9a79-4e6f-a92f-64855b0a5d49
5 | input: who is dua lipa's boyfriend? what is his age raised to the .43 power?
6 |
--------------------------------------------------------------------------------
/examples/langchain_agent_search_calculator/3.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - her boyfriend is Romain Gravas. his age raised to the .43 power is approximately
3 | 4.9373857399466665
4 | id: 1d745ec5-20d5-43b7-acc5-026f22a30ea0
5 | input: what is dua lipa's boyfriend age raised to the .43 power?
6 |
--------------------------------------------------------------------------------
/examples/langchain_agent_search_calculator/4.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - approximately 3,435 mi
3 | id: d1a8f38d-304c-4c0f-8c0e-57113a680f6a
4 | input: how far is it from paris to boston in miles
5 |
--------------------------------------------------------------------------------
/examples/langchain_agent_search_calculator/5.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - approximately 2.682651500990882
3 | id: 01abd22e-f72d-4851-9e30-b38f73af47ca
4 | input:
5 | what was the total number of points scored in the 2023 super bowl? what is
6 | that number raised to the .23 power?
7 |
--------------------------------------------------------------------------------
/examples/langchain_agent_search_calculator/6.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - approximately 2.682651500990882
3 | id: bdad58a1-b79b-4de7-9df6-95f5bf32ba04
4 | input:
5 | what was the total number of points scored in the 2023 super bowl raised to
6 | the .23 power?
7 |
--------------------------------------------------------------------------------
/examples/langchain_agent_search_calculator/7.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - "30"
3 | id: e4c807cf-047b-41f1-aced-ac268b1d6ba3
4 | input:
5 | how many more points were scored in the 2023 super bowl than in the 2022 super
6 | bowl?
7 |
--------------------------------------------------------------------------------
/examples/langchain_agent_search_calculator/8.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - approximately 1.9347796717823205
3 | id: c6b43b02-0d44-4c0b-b648-d7df2a97230f
4 | input: what is 153 raised to .1312 power?
5 |
--------------------------------------------------------------------------------
/examples/langchain_agent_search_calculator/9.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - approximately 1.7589107138176394
3 | id: ffb37883-fee7-4e64-9583-4831dde81057
4 | input:
5 | who is kendall jenner's boyfriend? what is his height (in inches) raised to
6 | .13 power?
7 |
--------------------------------------------------------------------------------
/examples/langchain_agent_search_calculator/converter.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import yaml
4 | from langchain.evaluation.loading import load_dataset
5 |
6 | SUITE_NAME = "langchain_agent_search_calculator"
7 | suite_path = Path("examples") / SUITE_NAME
8 | suite_path.mkdir(exist_ok=True, parents=True)
9 |
10 | dataset = load_dataset("agent-search-calculator")
11 |
12 | for i, data in enumerate(dataset, start=1):
13 | # {
14 | # 'steps': [{'tool': 'Search', 'tool_input': 'Population of Canada 2023'}],
15 | # 'answer': 'approximately 38,625,801',
16 | # 'question': 'How many people live in canada as of 2023?'
17 | # }
18 | with open(suite_path / f"{i}.yml", "w") as fp:
19 | benchllm_dict = {"expected": [data["answer"]], "input": data["question"]}
20 | yaml.safe_dump(benchllm_dict, fp)
21 |
--------------------------------------------------------------------------------
/examples/langchain_agent_search_calculator/eval.py:
--------------------------------------------------------------------------------
1 | from langchain.agents import AgentType, initialize_agent, load_tools
2 | from langchain.llms import OpenAI
3 |
4 | # import benchllm
5 |
6 | tools = load_tools(["serpapi", "llm-math"], llm=OpenAI(temperature=0))
7 | agent = initialize_agent(tools, OpenAI(temperature=0), agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)
8 |
9 |
10 | # @benchllm.test
11 | def run(input: str):
12 | try:
13 | return agent(input)["output"]
14 | except Exception as e:
15 | return str(e)
16 |
--------------------------------------------------------------------------------
/examples/langchain_agent_search_calculator/script.py:
--------------------------------------------------------------------------------
1 | from langchain.agents import AgentType, initialize_agent, load_tools
2 | from langchain.llms import OpenAI
3 |
4 | from benchllm import SemanticEvaluator, Test, Tester
5 |
6 | tools = load_tools(["serpapi", "llm-math"], llm=OpenAI(temperature=0))
7 | agent = initialize_agent(tools, OpenAI(temperature=0), agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)
8 |
9 | tests = [Test(input="How many people live in canada as of 2023?", expected=["approximately 38,625,801"])]
10 |
11 | tester = Tester(lambda input: agent(input)["output"])
12 | tester.add_tests(tests)
13 | predictions = tester.run()
14 |
15 | evaluator = SemanticEvaluator()
16 | evaluator.load(predictions)
17 | report = evaluator.run()
18 |
19 | print(report)
20 |
--------------------------------------------------------------------------------
/examples/openai-evals/eval.py:
--------------------------------------------------------------------------------
1 | import openai
2 |
3 | import benchllm
4 | from benchllm.input_types import ChatInput
5 |
6 |
7 | def chat(messages: ChatInput):
8 | messages = [{"role": message.role, "content": message.content} for message in messages]
9 |
10 | response = openai.ChatCompletion.create(
11 | model="gpt-3.5-turbo",
12 | messages=messages,
13 | max_tokens=100,
14 | temperature=0.7,
15 | n=1,
16 | stop=None,
17 | )
18 |
19 | return response.choices[0].message.content.strip()
20 |
21 |
22 | @benchllm.test(suite=".")
23 | def run(input: ChatInput):
24 | value = chat(input)
25 | return value
26 |
--------------------------------------------------------------------------------
/examples/qa/1.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - 2
3 | - 2.0
4 | id: 699e66ad-5018-4b2f-98f6-e6f22324fd7b
5 | input: What's 1+1? Be very terse, only numeric output
6 |
--------------------------------------------------------------------------------
/examples/qa/2.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - V7
3 | - I was created by V7
4 | id: 7449d32b-6db1-4d93-8df4-5b70558f7bd4
5 | input: Who created you?
6 |
--------------------------------------------------------------------------------
/examples/qa/eval.py:
--------------------------------------------------------------------------------
1 | import openai
2 |
3 | import benchllm
4 |
5 |
6 | def complete_text(prompt):
7 | full_prompt = f"""
8 | You are a friendly AI bot created by V7. You are tasked with answering questions about the world.
9 | Q: {prompt}
10 | A:"""
11 | response = openai.Completion.create(
12 | engine="text-davinci-003",
13 | prompt=full_prompt,
14 | max_tokens=100,
15 | temperature=0.7,
16 | n=1,
17 | stop=None,
18 | )
19 | return response.choices[0].text.strip()
20 |
21 |
22 | @benchllm.test(suite=".")
23 | def run(input: str):
24 | return complete_text(input)
25 |
--------------------------------------------------------------------------------
/examples/qa/script.py:
--------------------------------------------------------------------------------
1 | from benchllm import StringMatchEvaluator, Test, Tester
2 |
3 | tests = [
4 | Test(input="What's 1+1?", expected=["2", "It's 2"]),
5 | Test(input="What's Obama's first name?", expected=["Barack"]),
6 | ]
7 |
8 | tester = Tester(lambda _: 2)
9 | tester.add_tests(tests)
10 | predictions = tester.run()
11 |
12 | evaluator = StringMatchEvaluator()
13 | evaluator.load(predictions)
14 | report = evaluator.run()
15 |
16 | print(report)
17 |
--------------------------------------------------------------------------------
/examples/similarity/benchllm.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - false
3 | id: d1df3377-f58a-4737-ae4c-e97443bc47d0
4 | input:
5 | prompt_1: BenchLLM was developed by V7, a company based in London
6 | prompt_2: BenchLLM was developed by Humanloop
7 |
--------------------------------------------------------------------------------
/examples/similarity/eval.py:
--------------------------------------------------------------------------------
1 | import benchllm
2 | from benchllm import SimilarityInput
3 | from benchllm.similarity import semantically_similar
4 |
5 |
6 | @benchllm.test(suite=".")
7 | def run(input: SimilarityInput):
8 | return semantically_similar(input.prompt_1, input.prompt_2)
9 |
--------------------------------------------------------------------------------
/examples/similarity/idempotency.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - true
3 | id: 3c4fc7dc-60e4-454f-9487-9833873d25e3
4 | input:
5 | prompt_1: V7
6 | prompt_2: V7
7 |
--------------------------------------------------------------------------------
/examples/similarity/location.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - false
3 | id: 26ffcec1-71d0-4eeb-a120-066f733e4be0
4 | input:
5 | prompt_1: The Eiffel Tower is located in London.
6 | prompt_2: The Eiffel Tower is located in Paris.
7 |
--------------------------------------------------------------------------------
/examples/similarity/number_detail.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - true
3 | id: 236c701d-0ea5-48e8-8b6a-90a8400f3050
4 | input:
5 | prompt_1: The population of Spain is 47 million.
6 | prompt_2: 47.42 million people live in Spain.
7 |
--------------------------------------------------------------------------------
/examples/similarity/solarsystem.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - true
3 | id: 5b10574a-ed92-4349-9fae-65bcb1bcbb3a
4 | input:
5 | prompt_1: The Earth revolves around the Sun.
6 | prompt_2: The Sun is at the center of the solar system.
7 |
--------------------------------------------------------------------------------
/examples/similarity/v7.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - true
3 | id: 60df4985-7ba9-440a-9dab-0aa3ed986daa
4 | input:
5 | prompt_1: I was created by V7
6 | prompt_2: V7 created me
7 |
--------------------------------------------------------------------------------
/examples/similarity/water_boiling.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - true
3 | id: 92d8cf57-b726-42f8-b460-b90fbf4cb752
4 | input:
5 | prompt_1: Water boils at 100 degrees Celsius.
6 | prompt_2: The boiling point of water is 100 degrees Celsius.
7 |
--------------------------------------------------------------------------------
/examples/vector_retrieval/1.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - "yes"
3 | id: 78ce604e-dc76-4f21-9514-c48b17948773
4 | input:
5 | Is the Seychelles parakeet extinct? (respond only with "yes", "no" or "don't
6 | know")
7 |
--------------------------------------------------------------------------------
/examples/vector_retrieval/2.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - "1893"
3 | id: 9a82f1b1-455a-4321-bc7d-4b9e3cf05144
4 | input:
5 | When was the last year one saw a Seychelles parakeet? Respond ONLY with the
6 | year
7 |
--------------------------------------------------------------------------------
/examples/vector_retrieval/eval.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from pathlib import Path
3 |
4 | import benchllm
5 |
6 | current_dir = Path(__file__).resolve().parent
7 |
8 | sys.path.append(str(current_dir))
9 | from utils import initiate_test_faiss
10 |
11 |
12 | @benchllm.test(suite=".")
13 | def run(input: str):
14 | qa = initiate_test_faiss()
15 | resp = qa.run(input)
16 | return resp
17 |
--------------------------------------------------------------------------------
/examples/vector_retrieval/utils.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import requests
4 | from langchain.chains import RetrievalQA
5 | from langchain.document_loaders import DirectoryLoader, PyPDFLoader
6 | from langchain.embeddings.openai import OpenAIEmbeddings
7 | from langchain.llms import OpenAI
8 | from langchain.text_splitter import CharacterTextSplitter
9 | from langchain.vectorstores import FAISS
10 |
11 | embeddings = OpenAIEmbeddings()
12 | current_dir = Path(__file__).resolve().parent
13 |
14 | DB_NAME = Path(current_dir, "faiss_example_index")
15 | TEST_FILE_URLS = [
16 | "https://en.wikipedia.org/api/rest_v1/page/pdf/Artificial_general_intelligence",
17 | "https://en.wikipedia.org/api/rest_v1/page/pdf/Socrates",
18 | "https://en.wikipedia.org/api/rest_v1/page/pdf/Seychelles_parakeet",
19 | ]
20 | PDF_FOLDER = Path(current_dir, "example_documents")
21 |
22 |
23 | def download_pdf(url: str, dst: Path) -> None:
24 | """Downloads a PDF file from a given URL if it doesn't exist in the destination directory"""
25 | dst_path = dst / f"{url.split('/')[-1].replace(' ', '_')}.pdf"
26 |
27 | if not dst_path.is_file():
28 | print("Downloading", url)
29 | dst_path.parent.mkdir(parents=True, exist_ok=True)
30 |
31 | response = requests.get(url)
32 | response.raise_for_status()
33 |
34 | with dst_path.open("wb") as f:
35 | f.write(response.content)
36 |
37 |
38 | def download_and_load_documents(pdfs_path: Path, urls: list[str]):
39 | """Downloads PDFs from a list of URLs and loads them into a list of documents"""
40 | for url in urls:
41 | download_pdf(url, pdfs_path)
42 |
43 | loader = DirectoryLoader(str(pdfs_path), glob="*.pdf", loader_cls=PyPDFLoader)
44 | return loader.load()
45 |
46 |
47 | def set_up_faiss_db(db_path: Path, pdfs_path: Path, chunk_size=420, chunk_overlap=30):
48 | """Setups up a Faiss DB by loading documents and creating an index"""
49 |
50 | if db_path.exists():
51 | try:
52 | return FAISS.load_local(str(db_path), embeddings)
53 | except Exception as e:
54 | print(f"Failed to load local FAISS DB: {e}")
55 | raise
56 |
57 | documents = download_and_load_documents(pdfs_path, TEST_FILE_URLS)
58 | if not documents:
59 | raise ValueError(f"No documents loaded from {pdfs_path}")
60 |
61 | text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
62 | docs = text_splitter.split_documents(documents)
63 |
64 | db = FAISS.from_documents(docs, embeddings)
65 | db.save_local(str(db_path))
66 |
67 | return db
68 |
69 |
70 | def initiate_test_faiss():
71 | """Initiates a Faiss test by creating a RetrievalQA object"""
72 | db = set_up_faiss_db(DB_NAME, PDF_FOLDER)
73 | qa = RetrievalQA.from_chain_type(
74 | llm=OpenAI(temperature=0), chain_type="stuff", retriever=db.as_retriever(search_kwargs={"k": 3})
75 | )
76 | return qa
77 |
--------------------------------------------------------------------------------
/examples/weather_functions/default.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - "Yes"
3 | id: d80b8d57-9c50-44fe-b492-c3cab549d9f3
4 | input: Will it rain tomorrow?
5 |
--------------------------------------------------------------------------------
/examples/weather_functions/eval.py:
--------------------------------------------------------------------------------
1 | from forecast import run
2 |
3 | import benchllm
4 |
5 |
6 | @benchllm.test()
7 | def eval(input: str):
8 | return run(input)
9 |
--------------------------------------------------------------------------------
/examples/weather_functions/forecast.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import openai
4 |
5 |
6 | def get_n_day_weather_forecast(location: str, num_days: int):
7 | return f"The weather in {location} will be rainy for the next {num_days} days."
8 |
9 |
10 | def chain(prompt: list[dict], functions):
11 | response = openai.ChatCompletion.create(
12 | model="gpt-3.5-turbo-0613", messages=prompt, temperature=0.0, functions=functions
13 | )
14 |
15 | choice = response["choices"][0]
16 | if choice.get("finish_reason") == "function_call":
17 | function_call = choice["message"]["function_call"]
18 | function_name = function_call["name"]
19 | function_args = json.loads(function_call["arguments"])
20 | fn = globals()[function_name]
21 | output = fn(**function_args)
22 | prompt.append({"role": "function", "name": function_name, "content": output})
23 | return chain(prompt, functions)
24 | else:
25 | return response.choices[0].message.content.strip()
26 |
27 |
28 | def run(question: str):
29 | messages = [
30 | {
31 | "role": "user",
32 | "content": "Only answer questions with 'yes', 'no' or 'unknown', you must not reply with anything else",
33 | },
34 | {"role": "system", "content": "Use the get_n_day_weather_forecast function for weather questions"},
35 | {"role": "user", "content": question},
36 | ]
37 |
38 | functions = [
39 | {
40 | "name": "get_n_day_weather_forecast",
41 | "description": "Get an N-day weather forecast",
42 | "parameters": {
43 | "type": "object",
44 | "properties": {
45 | "location": {
46 | "type": "string",
47 | "description": "The city and state, e.g. San Francisco, CA",
48 | },
49 | "num_days": {
50 | "type": "integer",
51 | "description": "The number of days to forecast. E.g. 1 for today, 2 for tomorrow etc",
52 | },
53 | },
54 | "required": ["location", "format", "num_days"],
55 | },
56 | },
57 | ]
58 | return chain(messages, functions)
59 |
--------------------------------------------------------------------------------
/examples/weather_functions/rainy.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - "yes"
3 | id: 4b717b39-da96-4ca3-8aaf-070c7f11449c
4 | input: I'm going to San Francisco today, do I need an umbrella?
5 | calls:
6 | - name: forecast.get_n_day_weather_forecast
7 | returns: It's rainy in San Francisco today.
8 | arguments:
9 | location: San Francisco
10 | num_days: 1
11 |
--------------------------------------------------------------------------------
/examples/weather_functions/sunny.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - "no"
3 | id: 4b717b39-da96-4ca3-8aaf-070c7f11449c
4 | input: I live in London, can I expect rain today?
5 | calls:
6 | - name: forecast.get_n_day_weather_forecast
7 | returns: It's sunny today in London today.
8 | arguments:
9 | location: London
10 | num_days: 1
11 |
--------------------------------------------------------------------------------
/examples/weather_functions/tomorrow.yml:
--------------------------------------------------------------------------------
1 | expected:
2 | - "yes"
3 | id: 4b717b39-da96-4ca3-8aaf-070c7f11449c
4 | input: Do I need a sun hat tomorrow?
5 | calls:
6 | - name: forecast.get_n_day_weather_forecast
7 | returns: It's sunny in your location tomorrow.
8 | arguments:
9 | num_days: 2
10 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [ "poetry-core",]
3 | build-backend = "poetry.core.masonry.api"
4 |
5 | [tool.poetry]
6 | name = "benchllm"
7 | version = "0.3.0"
8 | description = "Tool for testing LLMs"
9 | homepage = "https://github.com/v7labs/benchllm"
10 | authors = [ "Simon Edwardsson ", "Andrea Azzini "]
11 | readme = "README.md"
12 | license = "MIT"
13 | keywords = []
14 | classifiers = [ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License",]
15 | [[tool.poetry.packages]]
16 | include = "benchllm"
17 |
18 | [tool.isort]
19 | profile = "black"
20 |
21 | [tool.mypy]
22 | plugins = [ "pydantic.mypy",]
23 | follow_imports = "silent"
24 | warn_redundant_casts = true
25 | warn_unused_ignores = true
26 | check_untyped_defs = true
27 | no_implicit_reexport = true
28 | ignore_missing_imports = true
29 | disallow_any_unimported = true
30 | disallow_any_expr = false
31 | disallow_any_decorated = false
32 | disallow_any_explicit = false
33 | disallow_subclassing_any = true
34 | python_version = "3.10"
35 | disallow_untyped_calls = true
36 | disallow_untyped_defs = true
37 | disallow_incomplete_defs = true
38 | disallow_untyped_decorators = true
39 | no_implicit_optional = true
40 | warn_return_any = false
41 | warn_unreachable = true
42 | pretty = true
43 |
44 | [tool.pydantic-mypy]
45 | init_forbid_extra = true
46 | init_typed = true
47 | warn_required_dynamic_aliases = true
48 | warn_untyped_fields = true
49 |
50 | [tool.black]
51 | line-length = 120
52 |
53 | [tool.flake8]
54 | max-line-length = 120
55 | ignore = [ "E203", "W503", "E402",]
56 |
57 | [tool.poetry.dependencies]
58 | python = ">=3.10,<3.12"
59 | pyyaml = ">=5.1"
60 | typer = { version = "*", extras = ["all"] }
61 | pydantic = "^1.10.9"
62 | openai = "*"
63 | langchain = { version = "*", optional = true }
64 | pypdf = { version = "*", optional = true }
65 | tiktoken = { version = "*", optional = true }
66 | faiss-cpu = { version = "*", optional = true }
67 | types-pyyaml = { version = "*", optional = true }
68 | pytest = { version = "*", optional = true }
69 | pywebio = { version = "*", optional = true }
70 |
71 | [tool.poetry.extras]
72 | dev = [ "black", "isort", "flake8", "mypy", "pytest", "types-pyyaml"]
73 | test = [ "pytest"]
74 | examples = ["langchain", "tiktoken", "faiss-cpu", "pypdf"]
75 |
76 | [tool.poetry.scripts]
77 | bench = "benchllm.cli.main:main"
78 |
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/v7labs/benchllm/0b9d133a531538270e447b1e7ee1b8bb710a4061/test/__init__.py
--------------------------------------------------------------------------------
/test/cache/test_file_cache.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 | from pathlib import Path
3 | from unittest.mock import patch
4 |
5 | from benchllm import Prediction, StringMatchEvaluator, Test
6 | from benchllm.cache import FileCache
7 | from benchllm.data_types import FunctionID
8 |
9 | EXAMPLE_PREDICTIONS = [
10 | Prediction(
11 | test=Test(input="foo", expected=["abc", "def", "ghi"]),
12 | output="no-match",
13 | time_elapsed=0,
14 | function_id=FunctionID.default(),
15 | ),
16 | Prediction(
17 | test=Test(input="foo", expected=["abc", "def", "ghi"]),
18 | output="def",
19 | time_elapsed=0,
20 | function_id=FunctionID.default(),
21 | ),
22 | Prediction(
23 | test=Test(input="foo", expected=["abc", "def", "ghi"]),
24 | output="no-match",
25 | time_elapsed=0,
26 | function_id=FunctionID.default(),
27 | ),
28 | ]
29 |
30 | EXAMPLE_PREDICTIONS_ALL_SAME = [
31 | Prediction(
32 | test=Test(input="foo", expected=["match"]),
33 | output="match",
34 | time_elapsed=0,
35 | function_id=FunctionID.default(),
36 | ),
37 | Prediction(
38 | test=Test(input="foo", expected=["match"]),
39 | output="match",
40 | time_elapsed=0,
41 | function_id=FunctionID.default(),
42 | ),
43 | ]
44 |
45 |
46 | def test_file_writes_at_end():
47 | with patch.object(
48 | StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction
49 | ) as mock_method:
50 | with tempfile.TemporaryDirectory() as temp_dir:
51 | cache_path = Path(temp_dir, "cache.json")
52 | evaluator = FileCache(StringMatchEvaluator(), cache_path)
53 | evaluator.load(EXAMPLE_PREDICTIONS)
54 |
55 | evaluations = evaluator.run()
56 | assert cache_path.exists()
57 | assert not evaluations[0].passed
58 | assert evaluations[1].passed
59 | assert not evaluations[2].passed
60 | assert mock_method.call_count == 2
61 | assert evaluator.num_cache_hits == 1
62 | mock_method.reset_mock()
63 |
64 | # second run will use cache
65 | evaluator = FileCache(StringMatchEvaluator(), cache_path)
66 | evaluator.load(EXAMPLE_PREDICTIONS)
67 |
68 | evaluations = evaluator.run()
69 | assert cache_path.exists()
70 | assert not evaluations[0].passed
71 | assert evaluations[1].passed
72 | assert not evaluations[2].passed
73 | assert mock_method.call_count == 0
74 | assert evaluator.num_cache_hits == 3
75 |
--------------------------------------------------------------------------------
/test/cache/test_memory_cache.py:
--------------------------------------------------------------------------------
1 | from unittest.mock import patch
2 |
3 | from benchllm import Prediction, StringMatchEvaluator, Test
4 | from benchllm.cache import MemoryCache
5 | from benchllm.data_types import FunctionID
6 |
7 | EXAMPLE_PREDICTIONS = [
8 | Prediction(
9 | test=Test(input="foo", expected=["abc", "def", "ghi"]),
10 | output="no-match",
11 | time_elapsed=0,
12 | function_id=FunctionID.default(),
13 | ),
14 | Prediction(
15 | test=Test(input="foo", expected=["abc", "def", "ghi"]),
16 | output="def",
17 | time_elapsed=0,
18 | function_id=FunctionID.default(),
19 | ),
20 | Prediction(
21 | test=Test(input="foo", expected=["abc", "def", "ghi"]),
22 | output="no-match",
23 | time_elapsed=0,
24 | function_id=FunctionID.default(),
25 | ),
26 | ]
27 |
28 | EXAMPLE_PREDICTIONS_ALL_SAME = [
29 | Prediction(
30 | test=Test(input="foo", expected=["match"]),
31 | output="match",
32 | time_elapsed=0,
33 | function_id=FunctionID.default(),
34 | ),
35 | Prediction(
36 | test=Test(input="foo", expected=["match"]),
37 | output="match",
38 | time_elapsed=0,
39 | function_id=FunctionID.default(),
40 | ),
41 | ]
42 |
43 |
44 | EXAMPLE_PREDICTIONS_CACHING_NEGATIVE = [
45 | Prediction(
46 | test=Test(input="foo", expected=["no-match"]),
47 | output="match",
48 | time_elapsed=0,
49 | function_id=FunctionID.default(),
50 | ),
51 | Prediction(
52 | test=Test(input="foo", expected=["no-match", "match"]),
53 | output="match",
54 | time_elapsed=0,
55 | function_id=FunctionID.default(),
56 | ),
57 | Prediction(
58 | test=Test(input="foo", expected=["match", "no-match"]),
59 | output="match",
60 | time_elapsed=0,
61 | function_id=FunctionID.default(),
62 | ),
63 | ]
64 |
65 |
66 | def test_memory_cache_will_prevent_calls_to_evaluate_prediction_on_second_run():
67 | with patch.object(
68 | StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction
69 | ) as mock_method:
70 | evaluator = MemoryCache(StringMatchEvaluator())
71 | evaluator.load(EXAMPLE_PREDICTIONS)
72 | evaluations = evaluator.run()
73 | assert not evaluations[0].passed
74 | assert evaluations[1].passed
75 | assert not evaluations[2].passed
76 | assert mock_method.call_count == 2
77 | assert evaluator.num_cache_hits == 1
78 | mock_method.reset_mock()
79 |
80 | # second run will use cache
81 | evaluations = evaluator.run()
82 | assert not evaluations[0].passed
83 | assert evaluations[1].passed
84 | assert not evaluations[2].passed
85 | assert evaluator.num_cache_hits == 4
86 | assert mock_method.call_count == 0
87 |
88 |
89 | def test_memory_cache_caches_during_run():
90 | with patch.object(
91 | StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction
92 | ) as mock_method:
93 | evaluator = MemoryCache(StringMatchEvaluator())
94 | evaluator.load(EXAMPLE_PREDICTIONS_ALL_SAME)
95 |
96 | evaluations = evaluator.run()
97 | assert evaluations[0].passed
98 | assert evaluations[1].passed
99 | assert mock_method.call_count == 1
100 | assert evaluator.num_cache_hits == 1
101 |
102 |
103 | def test_memory_cache_caches_always_tries_to_pass():
104 | with patch.object(
105 | StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction
106 | ) as mock_method:
107 | evaluator = MemoryCache(StringMatchEvaluator())
108 | evaluator.load(EXAMPLE_PREDICTIONS_CACHING_NEGATIVE)
109 |
110 | evaluations = evaluator.run()
111 | assert not evaluations[0].passed
112 | assert evaluations[1].passed
113 | assert evaluations[2].passed
114 | assert mock_method.call_count == 2
115 | assert evaluator.num_cache_hits == 1
116 |
117 |
118 | def test_memory_cache_does_not_pass_on_cached_negatives():
119 | with patch.object(
120 | StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction
121 | ) as mock_method:
122 | evaluator = MemoryCache(StringMatchEvaluator())
123 | evaluator.load(EXAMPLE_PREDICTIONS_CACHING_NEGATIVE)
124 |
125 | evaluator.run()
126 | assert mock_method.call_count == 2
127 | assert mock_method.call_args_list.pop(0).args[0].test.expected == ["no-match"]
128 | assert mock_method.call_args_list.pop(0).args[0].test.expected == ["match"]
129 |
130 |
131 | def test_memory_cache_supports_numbers():
132 | with patch.object(
133 | StringMatchEvaluator, "evaluate_prediction", side_effect=StringMatchEvaluator().evaluate_prediction
134 | ) as mock_method:
135 | evaluator = MemoryCache(StringMatchEvaluator())
136 | evaluator.load(
137 | [
138 | Prediction(
139 | test=Test(input="foo", expected=["42"]),
140 | output="42",
141 | time_elapsed=0,
142 | function_id=FunctionID.default(),
143 | ),
144 | Prediction(
145 | test=Test(input="foo", expected=["42"]),
146 | output="42",
147 | time_elapsed=0,
148 | function_id=FunctionID.default(),
149 | ),
150 | Prediction(
151 | test=Test(input="foo", expected=["42"]),
152 | output="24",
153 | time_elapsed=0,
154 | function_id=FunctionID.default(),
155 | ),
156 | ]
157 | )
158 | evaluations = evaluator.run()
159 | assert evaluations[0].passed
160 | assert evaluations[1].passed
161 | assert not evaluations[2].passed
162 | assert mock_method.call_count == 2
163 | assert evaluator.num_cache_hits == 1
164 |
--------------------------------------------------------------------------------
/test/cli/test_interactive.py:
--------------------------------------------------------------------------------
1 | from unittest import mock
2 |
3 | import typer
4 |
5 | from benchllm.cli.evaluator import InteractiveEvaluator
6 | from benchllm.data_types import FunctionID, Prediction, Test
7 |
8 | TEST_PREDICTION = [
9 | Prediction(
10 | test=Test(input="Who are you?", expected=["Yoda I am.", "Yoda"]),
11 | output="I am Yoda.",
12 | time_elapsed=0,
13 | function_id=FunctionID.default(),
14 | )
15 | ]
16 |
17 |
18 | def test_interactive_press_y_passes():
19 | evalautor = InteractiveEvaluator()
20 | evalautor.load(TEST_PREDICTION)
21 | with mock.patch.object(typer, "prompt", lambda *args, **kwargs: "1"):
22 | result = evalautor.run()
23 | assert result[0].passed
24 |
25 | with mock.patch.object(typer, "prompt", lambda *args, **kwargs: "2"):
26 | result = evalautor.run()
27 | assert result[0].passed
28 |
29 |
30 | def test_interactive_press_n_fails():
31 | evalautor = InteractiveEvaluator()
32 | evalautor.load(TEST_PREDICTION)
33 | with mock.patch.object(typer, "prompt", lambda *args, **kwargs: "n"):
34 | result = evalautor.run()
35 | assert not result[0].passed
36 |
--------------------------------------------------------------------------------
/test/cli/test_list_tests.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | from typer.testing import CliRunner
4 |
5 | from benchllm.cli.main import app
6 |
7 | runner = CliRunner()
8 |
9 |
10 | def test_list_tests():
11 | result = runner.invoke(app, ["tests", str(Path.cwd() / "examples/qa")])
12 | assert "Input" in result.stdout
13 | assert "No." in result.stdout
14 | assert "Expected" in result.stdout
15 |
--------------------------------------------------------------------------------
/test/cli/test_run_suite.py:
--------------------------------------------------------------------------------
1 | from test.utils import create_openai_object
2 | from unittest.mock import MagicMock, patch
3 |
4 | from typer.testing import CliRunner
5 |
6 | from benchllm.cli.main import app
7 |
8 | runner = CliRunner()
9 |
10 |
11 | @patch("openai.Completion.create", return_value=create_openai_object("Hello, user!"))
12 | def test_run_multiple_suites(completion_mock: MagicMock):
13 | runner.invoke(app, ["run", "examples/qa", "examples/similarity"])
14 | completion_mock.assert_called()
15 |
16 |
17 | @patch("openai.Completion.create", return_value=create_openai_object("Hello, user!"))
18 | def test_run_target_suite(completion_mock: MagicMock):
19 | runner.invoke(app, ["run", "examples/qa"])
20 | completion_mock.assert_called()
21 |
--------------------------------------------------------------------------------
/test/evaulator/test_evalutator.py:
--------------------------------------------------------------------------------
1 | import json
2 | import tempfile
3 | from pathlib import Path
4 | from test.utils import create_openai_object
5 | from unittest.mock import MagicMock, Mock, call, patch
6 |
7 | from benchllm import Prediction, SemanticEvaluator, StringMatchEvaluator, Test
8 | from benchllm.cache import MemoryCache
9 | from benchllm.data_types import FunctionID
10 | from benchllm.evaluator import Evaluator
11 |
12 |
13 | class NoopEvaluator(Evaluator):
14 | def evaluate_prediction(self, prediction: Prediction) -> list[Evaluator.Candidate]:
15 | return [Evaluator.Candidate(prediction=prediction.output, expected=prediction.output, score=1.0, passed=True)]
16 |
17 |
18 | def test_evaluator_can_load_prediction_file():
19 | prediction = {
20 | "output": "42",
21 | "test": {"input": "1+1", "expected": ["2"]},
22 | "time_elapsed": 0,
23 | "function_id": {"module_path": "test", "line_number": 1, "name": "test"},
24 | }
25 | with tempfile.TemporaryDirectory() as tmpdir:
26 | prediction_path = Path(tmpdir, "prediction.json")
27 | prediction_path.write_bytes(json.dumps(prediction).encode())
28 |
29 | evaluator = NoopEvaluator()
30 | evaluator.load_prediction_file(prediction_path)
31 |
32 | assert evaluator.predictions[0].output == "42"
33 | assert evaluator.predictions[0].test.input == "1+1"
34 | assert evaluator.predictions[0].test.expected == ["2"]
35 |
--------------------------------------------------------------------------------
/test/evaulator/test_semantic.py:
--------------------------------------------------------------------------------
1 | from test.utils import create_openai_object
2 | from unittest.mock import MagicMock, patch
3 |
4 | from benchllm import Prediction, SemanticEvaluator, Test
5 | from benchllm.data_types import FunctionID
6 |
7 |
8 | @patch("openai.Completion.create", return_value=create_openai_object("same"))
9 | def test_semantic_passes_if_output_is_equal(completion_mock: MagicMock):
10 | evaluator = SemanticEvaluator(model="gpt-3")
11 | evaluator.load(
12 | [
13 | Prediction(
14 | test=Test(input="Who are you?", expected=["Yoda I am."]),
15 | output="I am Yoda.",
16 | time_elapsed=0,
17 | function_id=FunctionID.default(),
18 | )
19 | ]
20 | )
21 | evaluations = evaluator.run()
22 | completion_mock.assert_called_once()
23 | assert evaluations[0].passed
24 |
25 |
26 | @patch("openai.Completion.create", return_value=create_openai_object("different"))
27 | def test_semantic_fails_if_output_is_unequal(completion_mock: MagicMock):
28 | evaluator = SemanticEvaluator(model="gpt-3")
29 | evaluator.load(
30 | [
31 | Prediction(
32 | test=Test(input="What are you?", expected=["Everything"]),
33 | output="Nothing",
34 | time_elapsed=0,
35 | function_id=FunctionID.default(),
36 | ),
37 | ]
38 | )
39 | evaluations = evaluator.run()
40 | completion_mock.assert_called_once()
41 | assert not evaluations[0].passed
42 |
43 |
44 | @patch("openai.Completion.create", return_value=create_openai_object("same"))
45 | def test_semantic_passes_if_output_is_equal_multiple_workers(completion_mock: MagicMock):
46 | evaluator = SemanticEvaluator(model="gpt-3", workers=10)
47 | evaluator.load(
48 | [
49 | Prediction(
50 | test=Test(input="Who are you?", expected=["Yoda I am."]),
51 | output="I am Yoda.",
52 | time_elapsed=0,
53 | function_id=FunctionID.default(),
54 | )
55 | for _ in range(100)
56 | ]
57 | )
58 | evaluations = evaluator.run()
59 | assert completion_mock.call_count == 100
60 | assert all([evaluation.passed for evaluation in evaluations])
61 |
--------------------------------------------------------------------------------
/test/evaulator/test_string_match.py:
--------------------------------------------------------------------------------
1 | from benchllm import Prediction, StringMatchEvaluator, Test
2 | from benchllm.data_types import FunctionID
3 |
4 |
5 | def test_string_match_passes_if_output_is_equal_to_expected():
6 | evaluator = StringMatchEvaluator()
7 | evaluator.load(
8 | [
9 | Prediction(
10 | test=Test(input="foo", expected=["bar"]), output="42", time_elapsed=0, function_id=FunctionID.default()
11 | ),
12 | Prediction(
13 | test=Test(input="foo", expected=["42"]), output="42", time_elapsed=0, function_id=FunctionID.default()
14 | ),
15 | Prediction(
16 | test=Test(input="foo", expected=["BAR"]), output="bar", time_elapsed=0, function_id=FunctionID.default()
17 | ),
18 | ]
19 | )
20 | evaluations = evaluator.run()
21 | assert not evaluations[0].passed
22 | assert evaluations[1].passed
23 | assert evaluations[2].passed
24 |
25 |
26 | def test_string_match_passes_if_output_is_equal_to_expected_case_sensitive():
27 | evaluator = StringMatchEvaluator(case_sensitive=True)
28 | evaluator.load(
29 | [
30 | Prediction(
31 | test=Test(input="foo", expected=["BAR"]), output="BAR", time_elapsed=0, function_id=FunctionID.default()
32 | ),
33 | Prediction(
34 | test=Test(input="foo", expected=["BAR"]), output="bar", time_elapsed=0, function_id=FunctionID.default()
35 | ),
36 | ]
37 | )
38 | evaluations = evaluator.run()
39 | assert evaluations[0].passed
40 | assert not evaluations[1].passed
41 |
42 |
43 | def test_string_match_passes_if_output_is_equal_to_expected_fuzzy():
44 | evaluator = StringMatchEvaluator(fuzzy=True)
45 | evaluator.load(
46 | [
47 | Prediction(
48 | test=Test(input="foo", expected=["abc def ghi"]),
49 | output="def",
50 | time_elapsed=0,
51 | function_id=FunctionID.default(),
52 | ),
53 | Prediction(
54 | test=Test(input="foo", expected=["abc def ghi"]),
55 | output="adg",
56 | time_elapsed=0,
57 | function_id=FunctionID.default(),
58 | ),
59 | ]
60 | )
61 | evaluations = evaluator.run()
62 | assert evaluations[0].passed
63 | assert not evaluations[1].passed
64 |
--------------------------------------------------------------------------------
/test/test_tester.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 | from pathlib import Path
3 | from unittest.mock import Mock, call
4 |
5 | from benchllm import Test, Tester
6 |
7 |
8 | def test_tester_run_through_each_test_once():
9 | test_function = Mock(return_value="42")
10 | test = Tester(test_function=test_function)
11 | test.add_test(Test(input="1+1", expected=["2"]))
12 | test.add_test(Test(input="2+2", expected=["4"]))
13 | predictions = test.run()
14 |
15 | assert test_function.call_count == 2
16 |
17 | print(test_function.call_args_list)
18 | test_function.assert_has_calls([call("1+1"), call("2+2")])
19 | assert predictions[0].output == "42"
20 | assert predictions[1].output == "42"
21 |
22 |
23 | def test_tester_parses_yml_correctly():
24 | python_code = """
25 | import benchllm
26 |
27 | @benchllm.test(suite=".")
28 | def test(input: str):
29 | return "42"
30 | """
31 | test_case = """
32 | input: 1+1
33 | expected: [2]
34 | """
35 | with tempfile.TemporaryDirectory() as temp_dir:
36 | temp_dir = Path(temp_dir)
37 | with open(temp_dir / "test.py", "w") as f:
38 | f.write(python_code)
39 | with open(temp_dir / "test.yaml", "w") as f:
40 | f.write(test_case)
41 |
42 | test = Tester()
43 | test.load_module(temp_dir / "test.py")
44 | predictions = test.run()
45 |
46 | assert predictions[0].output == "42"
47 | assert predictions[0].test.input == "1+1"
48 | assert predictions[0].test.expected == ["2"]
49 |
--------------------------------------------------------------------------------
/test/utils.py:
--------------------------------------------------------------------------------
1 | from openai.openai_object import OpenAIObject
2 |
3 |
4 | def create_openai_object(text):
5 | obj = OpenAIObject()
6 | message = OpenAIObject()
7 | message.text = text
8 | obj.choices = [message]
9 | return obj
10 |
--------------------------------------------------------------------------------