├── tests ├── __init__.py ├── conftest.py ├── show_coverage.py ├── test_json_encoder.py └── test_plugin.py ├── .python-version ├── .gitignore ├── src └── pytest_evals │ ├── __init__.py │ ├── models.py │ ├── json_encoder.py │ ├── ipython_extension.py │ └── plugin.py ├── .pre-commit-config.yaml ├── example ├── t.ipynb ├── example_test.py ├── example_judge_test.py ├── example_notebook.ipynb └── example_notebook_advanced.ipynb ├── LICENSE ├── .github └── workflows │ ├── test.yaml │ ├── pr-triage.yaml │ └── publish.yaml ├── pyproject.toml ├── CHANGELOG.md ├── CONTRIBUTING.md └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.11 2 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | pytest_plugins = ["pytester"] 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python-generated files 2 | __pycache__/ 3 | *.py[oc] 4 | build/ 5 | dist/ 6 | wheels/ 7 | *.egg-info 8 | 9 | coverage/ 10 | .coverage 11 | *.py,cover 12 | coverage.xml 13 | junit/ 14 | test-out/ 15 | 16 | .ipynb_checkpoints/ 17 | 18 | # Virtual environments 19 | .venv 20 | 21 | # IDE and system files 22 | .idea/ 23 | .vscode/ 24 | .DS_Store 25 | 26 | 27 | .env 28 | example/experiment_results.csv -------------------------------------------------------------------------------- /src/pytest_evals/__init__.py: -------------------------------------------------------------------------------- 1 | """A pytest plugin for running and analyzing LLM evaluation tests.""" 2 | 3 | from .plugin import ( 4 | eval_bag, 
5 | eval_bag_results, 6 | eval_results, 7 | eval_analysis_marker, 8 | eval_marker, 9 | out_path, 10 | ) 11 | from .models import EvalResults, EvalBag 12 | from .ipython_extension import load_ipython_extension 13 | 14 | __all__ = [ 15 | # Core functionality 16 | "EvalResults", 17 | "EvalBag", 18 | "eval_bag", 19 | "eval_bag_results", 20 | "eval_results", 21 | "out_path", 22 | # Marker utilities 23 | "eval_analysis_marker", 24 | "eval_marker", 25 | # Extensions 26 | "load_ipython_extension", 27 | ] 28 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | rev: v0.3.0 4 | hooks: 5 | - id: ruff 6 | args: [--fix] 7 | - id: ruff-format 8 | 9 | - repo: https://github.com/RobertCraigie/pyright-python 10 | rev: v1.1.392.post0 11 | hooks: 12 | - id: pyright 13 | 14 | - repo: local 15 | hooks: 16 | - id: uv-sync 17 | name: UV sync dependencies 18 | entry: uv sync --all-extras --dev 19 | language: system 20 | pass_filenames: false 21 | 22 | - id: pytest 23 | name: Run tests with coverage 24 | entry: coverage run --source=pytest_evals -m pytest 25 | language: system 26 | pass_filenames: false 27 | stages: [pre-commit] 28 | types: [python] 29 | 30 | default_install_hook_types: [pre-commit, commit-msg] 31 | default_stages: [pre-commit, pre-push] -------------------------------------------------------------------------------- /example/t.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": {}, 5 | "cell_type": "code", 6 | "outputs": [], 7 | "execution_count": null, 8 | "source": "%load_ext pytest_evals", 9 | "id": "ec80ea2a86a4bd55" 10 | }, 11 | { 12 | "metadata": {}, 13 | "cell_type": "code", 14 | "outputs": [], 15 | "execution_count": null, 16 | "source": [ 17 | "%%ipytest_evals --run-eval\n", 18 | 
"""Data models used by the pytest-evals plugin."""

import dataclasses
from typing import Any, Mapping

from pytest_harvest import ResultsBag


class EvalBag(ResultsBag):
    """`ResultsBag` subclass; the type stored in `EvalResults.result`."""


@dataclasses.dataclass
class EvalResults:
    """One evaluation record, built from a single result-bag item."""

    eval_name: str
    status: str
    duration_ms: float
    test_params: dict[str, Any]
    test_name: str
    result: EvalBag

    @classmethod
    def from_result_bag(cls, item: Mapping[str, Any]) -> "EvalResults":
        """Build an `EvalResults` from a result-bag mapping.

        Reads the keys ``eval_name``, ``status``, ``duration_ms``, ``params``,
        ``pytest_obj_name`` and ``fixtures`` from *item*. When the ``fixtures``
        mapping has no ``eval_bag`` entry, ``result`` is an empty `EvalBag`.
        """
        # Keys are read in the same order as the dataclass fields so that a
        # missing key surfaces the same `KeyError` as before.
        values = {
            "eval_name": item["eval_name"],
            "status": item["status"],
            "duration_ms": item["duration_ms"],
            "test_params": item["params"],
            "test_name": item["pytest_obj_name"],
        }

        captured = item["fixtures"]
        if "eval_bag" in captured:
            values["result"] = EvalBag(captured["eval_bag"])
        else:
            values["result"] = EvalBag()

        return cls(**values)
"Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | pull_request: 5 | paths: 6 | - '**.py' 7 | workflow_call: 8 | 9 | permissions: 10 | contents: write 11 | issues: read 12 | checks: write 13 | pull-requests: write 14 | 15 | jobs: 16 | tests: 17 | name: "Run tests" 18 | runs-on: ubuntu-latest 19 | steps: 20 | - name: Checkout repository 21 | uses: actions/checkout@v4 22 | - name: Install uv 23 | uses: astral-sh/setup-uv@v5 24 | with: 25 | enable-cache: true 26 | - name: "Set up Python" 27 | uses: actions/setup-python@v5 28 | with: 29 | python-version-file: ".python-version" 30 | - name: Install the project 31 | run: uv sync --all-extras --dev 32 | - name: Test with pytest 33 | run: | 34 | uv run coverage run --source=pytest_evals -m pytest --junitxml=junit/test-results.xml 35 | uv run coverage xml 36 | - name: Get Cover 37 | uses: orgoro/coverage@v3 38 | 
"""JSON encoder that tolerates the non-JSON-native values found in eval results."""

import dataclasses
import json
from enum import Enum
from typing import Callable

try:
    from pydantic import BaseModel

    HAVE_PYDANTIC = True
except ImportError:
    HAVE_PYDANTIC = False
    BaseModel = type(None)  # Create a dummy type that won't match anything

try:
    import pandas as pd

    def is_series(obj):  # pyright: ignore [reportRedeclaration]
        """Return True when *obj* is a pandas ``Series``."""
        return isinstance(obj, pd.Series)

    def is_dataframe(obj):  # pyright: ignore [reportRedeclaration]
        """Return True when *obj* is a pandas ``DataFrame``."""
        return isinstance(obj, pd.DataFrame)

    HAVE_PANDAS = True
except ImportError:
    HAVE_PANDAS = False

    def is_series(obj):
        """pandas is not installed, so nothing is a ``Series``."""
        return False

    def is_dataframe(obj):
        """pandas is not installed, so nothing is a ``DataFrame``."""
        return False


class AdvancedJsonEncoder(json.JSONEncoder):
    """JSON encoder that handles Pydantic models (if installed) and other special types."""

    # noinspection PyBroadException
    def default(self, o):
        """Convert *o* into something the base encoder can serialize.

        Handled, in order: Pydantic models, dataclass *instances*, ``Enum``
        members (by value), callables (rendered as ``"<module.name>"``),
        ``None``, pandas ``Series`` / ``DataFrame``, and finally anything else
        via ``repr``.
        """
        if HAVE_PYDANTIC and isinstance(o, BaseModel):
            return json.loads(o.model_dump_json())  # type: ignore
        # `is_dataclass` is also true for dataclass *classes*, but `asdict`
        # only accepts instances and raises TypeError otherwise. Classes are
        # left to the callable branch below.
        if dataclasses.is_dataclass(o) and not isinstance(o, type):
            return dataclasses.asdict(o)  # type: ignore
        if isinstance(o, Enum):
            return o.value
        if isinstance(o, Callable):
            # Functions and classes expose `__name__`; callable instances
            # usually do not, so fall back to their class name, then to repr.
            try:
                return f"<{o.__module__}.{o.__name__}>"
            except Exception:
                try:
                    return f"<{o.__module__}.{o.__class__.__name__}>"
                except Exception:
                    return repr(o)
        if isinstance(o, type(None)):
            return None
        if HAVE_PANDAS and is_series(o):
            return o.to_dict()
        if HAVE_PANDAS and is_dataframe(o):
            return o.to_dict(orient="records")
        # Every object inherits `__repr__`, so this is the effective
        # catch-all; the `super()` call is kept only as a defensive fallback.
        if hasattr(o, "__repr__"):
            return repr(o)
        return super().default(o)
Someone reading the PR description without clicking any issue links should be able to roughly understand what's going on." 45 | # exit 1 46 | # fi 47 | - uses: amannn/action-semantic-pull-request@v5 48 | env: 49 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 50 | with: 51 | disallowScopes: | 52 | release -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "pytest-evals" 3 | version = "0.0.0-dev" 4 | description = "A pytest plugin for running and analyzing LLM evaluation tests" 5 | authors = [ 6 | { name = "Almog Baku", email = "almog.baku@gmail.com" }, 7 | ] 8 | dependencies = [ 9 | "pytest>=7.0.0", 10 | "pytest-harvest>=1.0.0", 11 | "cloudpickle>=2.0.0", 12 | ] 13 | requires-python = ">=3.9" 14 | readme = "README.md" 15 | license = "MIT" 16 | keywords = ["pytest", "evaluations", "llm", "eval", "openai", "anthropic", "gpt", "pytest-evals"] 17 | 18 | [project.urls] 19 | Homepage = "https://github.com/AlmogBaku/pytest-evals" 20 | Repository = "https://github.com/AlmogBaku/pytest-evals" 21 | Issues = "https://github.com/AlmogBaku/pytest-evals/issues" 22 | 23 | [build-system] 24 | requires = ["hatchling"] 25 | build-backend = "hatchling.build" 26 | 27 | [project.entry-points.pytest11] 28 | evals = "pytest_evals.plugin" 29 | 30 | [tool.hatch.build.targets.wheel] 31 | packages = ["src/pytest_evals"] 32 | 33 | [tool.pytest.ini_options] 34 | addopts = "-ra -q --cov-report=term-missing" 35 | testpaths = ["tests"] 36 | 37 | filterwarnings = [ 38 | "ignore::pytest.PytestDeprecationWarning:pytest_harvest.*", 39 | ] 40 | 41 | [dependency-groups] 42 | dev = [ 43 | "ipytest>=0.14.2", 44 | "matplotlib>=3.9.4", 45 | "notebook>=7.3.2", 46 | "openai>=1.59.6", 47 | "pandas>=2.2.3", 48 | "pytest-cov>=6.0.0", 49 | "pytest-xdist>=3.6.1", 50 | "seaborn>=0.13.2", 51 | ] 52 | 53 | [tool.coverage.report] 54 | exclude_also = [ 55 | "def 
"""Example: evaluating a tiny LLM classifier with pytest-evals."""

import openai
import pytest

# Simple test data
TEST_DATA = [
    {"text": "I need to debug this Python code", "label": True},
    {"text": "The cat jumped over the lazy dog", "label": False},
    {"text": "My monitor keeps flickering", "label": True},
]


@pytest.fixture
def classifier():
    """Provide a callable that asks the model whether a text is computer-related."""

    def _classify(text: str) -> bool:
        """Simple LLM agent that classifies text as computer-related or not."""
        resp = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    # Adjacent literals are concatenated as-is, so the space
                    # after the question mark has to be written explicitly.
                    "content": "Is this text about a computer-related subject? "
                    "Reply ONLY with either true or false.",
                },
                {"role": "user", "content": text},
            ],
        )
        # `content` may be None, and replies often carry stray whitespace or a
        # trailing newline; normalise before comparing.
        reply = resp.choices[0].message.content or ""
        return reply.strip().lower() == "true"

    return _classify


@pytest.mark.eval(name="computer_classifier")
@pytest.mark.parametrize("case", TEST_DATA)
def test_classifier(case: dict, eval_bag, classifier) -> None:
    """Run one case through the classifier and record it in the eval bag."""
    # Store input and prediction for analysis
    eval_bag.input_text = case["text"]
    eval_bag.label = case["label"]
    eval_bag.prediction = classifier(case["text"])

    # Basic assertion
    assert eval_bag.prediction == eval_bag.label


@pytest.mark.eval_analysis(name="computer_classifier")
def test_analysis(eval_results):
    """Aggregate the recorded cases into accuracy, precision, recall and F1."""
    # Calculate true positives, false positives, and false negatives
    true_positives = sum(
        1 for r in eval_results if r.result.prediction and r.result.label
    )
    false_positives = sum(
        1 for r in eval_results if r.result.prediction and not r.result.label
    )
    false_negatives = sum(
        1 for r in eval_results if not r.result.prediction and r.result.label
    )

    total_predictions = len(eval_results)
    correct_predictions = sum(
        1 for r in eval_results if r.result.prediction == r.result.label
    )

    # Calculate metrics; every ratio falls back to 0 on an empty denominator
    accuracy = correct_predictions / total_predictions if total_predictions else 0
    precision = (
        true_positives / (true_positives + false_positives)
        if (true_positives + false_positives)
        else 0
    )
    recall = (
        true_positives / (true_positives + false_negatives)
        if (true_positives + false_negatives)
        else 0
    )
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0

    print(f"Accuracy: {accuracy:.2%}")
    print(f"Precision: {precision:.2%}")
    print(f"Recall: {recall:.2%}")
    print(f"F1: {f1:.2%}")

    assert f1 >= 0.7
# pragma: exclude file
import shlex
from pathlib import Path

try:
    from IPython.core.magic import Magics, magics_class, cell_magic  # type: ignore
except ImportError:
    # IPython is optional. These stand-ins keep the module importable without
    # it. The decorators must return what they decorate; returning nothing
    # would rebind `EvalsMagics` and `ipytest_evals` to `None`.

    def magics_class(cls):
        return cls

    class Magics:
        def __init__(self, shell):
            pass

    def cell_magic(func):
        return func


@magics_class
class EvalsMagics(Magics):
    """IPython magics that run pytest-evals tests from a notebook cell."""

    def __init__(self, shell):
        """Configure `ipytest` for notebook use.

        Raises:
            ImportError: if `ipytest` is not installed.
        """
        super().__init__(shell)
        # Only the import is guarded, so an ImportError raised from inside
        # `autoconfig` is not mislabelled as "ipytest is missing".
        try:
            import ipytest
        except ImportError as e:
            raise ImportError(
                "⚠️ `ipytest` is required to use `pytest-evals` in notebooks.\n"
                " ↳ Please install it with: `pip install ipytest`"
            ) from e

        ipytest.autoconfig(
            run_in_thread=True,  # pyright: ignore [reportArgumentType]
            addopts=[  # pyright: ignore [reportArgumentType]
                "--assert=plain",
                "-s",  # Don't capture output
                "--log-cli-level=ERROR",
            ],
        )

    # noinspection PyProtectedMember
    @staticmethod
    def cleanup_ipytest_env():
        """Remove ipytest's leftover module file (if any) and call `ipytest.clean()`.

        NOTE(review): `_impl.random_module_path._filename` is private ipytest
        state; presumably it is the temporary module file of the previous
        run - confirm against the pinned ipytest version.
        """
        import ipytest

        filename = getattr(ipytest._impl.random_module_path, "_filename", None)
        if filename and Path(filename).exists():
            try:
                Path(filename).unlink()
                del ipytest._impl.random_module_path._filename  # pyright: ignore [reportFunctionMemberAccess]
            except Exception:
                # Deliberately best-effort: a stale file must not block the run.
                pass
        ipytest.clean()

    @cell_magic
    def ipytest_evals(self, line, cell):
        """
        Execute pytest evaluations in the current IPython cell.

        Usage:
            %%ipytest_evals [optional arguments]
            def test_something():
                assert True

        When neither `--run-eval` nor `--run-eval-analysis` is given, both
        are added.

        Raises:
            ValueError: if `-n` is passed.
            RuntimeError: if the shell's `run_cell` rejects the cell with a
                `TypeError` mentioning `raw_cell`.
        """
        # Force reload to ensure fresh test environment
        from pytest_harvest import FIXTURE_STORE
        from IPython.core.getipython import get_ipython
        import ipytest

        FIXTURE_STORE.clear()

        run_args = shlex.split(line)

        if "--run-eval" not in run_args and "--run-eval-analysis" not in run_args:
            run_args.append("--run-eval")
            run_args.append("--run-eval-analysis")

        if "-n" in run_args:
            raise ValueError(
                "The `-n` flag is not supported with `ipytest` (in notebooks)."
            )

        self.cleanup_ipytest_env()

        try:
            get_ipython().run_cell(cell)  # pyright: ignore [reportOptionalMemberAccess]

        except TypeError as e:
            if "raw_cell" in str(e):
                raise RuntimeError(
                    "The ipytest magic cannot evaluate the cell. Most likely you "
                    "are running a modified ipython version. Consider using "
                    "`ipytest.run` and `ipytest.clean` directly.",
                ) from e

            raise

        try:
            ipytest.run(*run_args)
        except KeyboardInterrupt:
            # Leave a clean environment behind when the user interrupts a run.
            self.cleanup_ipytest_env()
            raise


def load_ipython_extension(ipython):
    """
    Register the magic when the extension loads.
    """
    ipython.register_magics(EvalsMagics)
repository 64 | uses: actions/checkout@v4 65 | - name: Install uv 66 | uses: astral-sh/setup-uv@v5 67 | with: 68 | enable-cache: true 69 | - name: "Set up Python" 70 | uses: actions/setup-python@v5 71 | with: 72 | python-version-file: ".python-version" 73 | - name: Set version 74 | env: 75 | BUILD_VERSION: "${{ needs.version.outputs.version }}" 76 | run: sed -i "s/version = \"0.0.0-dev\"/version = \"${BUILD_VERSION}\"/g" pyproject.toml 77 | - name: Update changelog 78 | shell: bash 79 | run: | 80 | touch CHANGELOG.md 81 | echo -e "${{ needs.version.outputs.changelog }}\n\n$(cat CHANGELOG.md)" > CHANGELOG.md 82 | - name: Build 83 | run: uv build 84 | - name: Publish 85 | run: uv publish 86 | - name: Update changelog 87 | shell: bash 88 | run: | 89 | git config user.name github-actions 90 | git config user.email github-actions@github.com 91 | 92 | git add CHANGELOG.md 93 | git commit -m "chore(release): ${{ needs.version.outputs.version }}" CHANGELOG.md 94 | git push 95 | - name: Tag 96 | uses: actions/github-script@v7 97 | with: 98 | script: | 99 | github.rest.git.createRef({ 100 | owner: context.repo.owner, 101 | repo: context.repo.repo, 102 | ref: 'refs/tags/${{ needs.version.outputs.tag }}', 103 | sha: context.sha 104 | }) 105 | - name: Release on GitHub 106 | uses: softprops/action-gh-release@v1 107 | with: 108 | tag_name: ${{ needs.version.outputs.tag }} 109 | files: dist/* 110 | body: | 111 | Released to https://pypi.org/project/pytest-evals/${{ needs.version.outputs.version }}/ 112 | --- 113 | ${{ needs.version.outputs.clean_changelog }} 114 | prerelease: ${{ inputs.prerelease }} 115 | name: Version ${{ needs.version.outputs.version }} 116 | generate_release_notes: false -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## [0.3.4](https://github.com/AlmogBaku/pytest-evals/compare/v0.3.3...v0.3.4) (2025-02-02) 2 | 3 | 4 | ### Bug 
Fixes 5 | 6 | * minor bug with model ([ea01570](https://github.com/AlmogBaku/pytest-evals/commit/ea015706e6ba7f9e3e9761f6c289417f34dfd217)) 7 | 8 | 9 | 10 | 11 | 12 | ## [0.3.3](https://github.com/AlmogBaku/pytest-evals/compare/v0.3.2...v0.3.3) (2025-01-29) 13 | 14 | 15 | ### Bug Fixes 16 | 17 | * minor bug ([8bfa9ae](https://github.com/AlmogBaku/pytest-evals/commit/8bfa9aed673e2133949d24333d4ad49635c2596b)) 18 | 19 | 20 | 21 | 22 | 23 | ## [0.3.2](https://github.com/AlmogBaku/pytest-evals/compare/v0.3.1...v0.3.2) (2025-01-28) 24 | 25 | 26 | ### Bug Fixes 27 | 28 | * rare notebook race condition ([b56a3a7](https://github.com/AlmogBaku/pytest-evals/commit/b56a3a73758bd27428f5959f6a296c622b4ae5e6)) 29 | 30 | 31 | 32 | 33 | 34 | ## [0.3.1](https://github.com/AlmogBaku/pytest-evals/compare/v0.3.0...v0.3.1) (2025-01-27) 35 | 36 | 37 | ### Bug Fixes 38 | 39 | * ipytest bugs recovery ([0e47bfd](https://github.com/AlmogBaku/pytest-evals/commit/0e47bfd5716f660b8061e546bca4ef90ee544c3d)) 40 | * json encoder bu ([5df0582](https://github.com/AlmogBaku/pytest-evals/commit/5df05822b53d6218c5fbac077fa6b3042375c6e4)) 41 | 42 | 43 | 44 | 45 | 46 | # [0.3.0](https://github.com/AlmogBaku/pytest-evals/compare/v0.2.7...v0.3.0) (2025-01-21) 47 | 48 | 49 | ### Bug Fixes 50 | 51 | * more tests for encoding pd ([281d3f2](https://github.com/AlmogBaku/pytest-evals/commit/281d3f2a0989698136989fdc4b3a6095ed039d8b)) 52 | 53 | 54 | ### Features 55 | 56 | * save eval cases as csv ([23e5c6c](https://github.com/AlmogBaku/pytest-evals/commit/23e5c6c7cce8b1a4214a1ca5f8969c7da9d06005)) 57 | 58 | 59 | 60 | 61 | 62 | ## [0.2.7](https://github.com/AlmogBaku/pytest-evals/compare/v0.2.6...v0.2.7) (2025-01-21) 63 | 64 | 65 | ### Bug Fixes 66 | 67 | * allow splitting tests in notebooks ([ba4b450](https://github.com/AlmogBaku/pytest-evals/commit/ba4b450ded4a7123ceb943bb6876d5d82d3454a3)) 68 | 69 | 70 | 71 | 72 | 73 | ## [0.2.6](https://github.com/AlmogBaku/pytest-evals/compare/v0.2.5...v0.2.6) (2025-01-20) 74 
| 75 | 76 | ### Bug Fixes 77 | 78 | * json encoding of some objects ([46d017e](https://github.com/AlmogBaku/pytest-evals/commit/46d017ef1aa2c1b3fbca3e4fdddb5911b1268bf7)) 79 | * use EvalBag type ([5537e4d](https://github.com/AlmogBaku/pytest-evals/commit/5537e4d0a4f92b130013b596c2f19b3796265bbe)) 80 | 81 | 82 | 83 | 84 | 85 | ## [0.2.5](https://github.com/AlmogBaku/pytest-evals/compare/v0.2.4...v0.2.5) (2025-01-16) 86 | 87 | 88 | ### Bug Fixes 89 | 90 | * add comment ([faa48a4](https://github.com/AlmogBaku/pytest-evals/commit/faa48a4fbd5affe1fb21c13461a2330632dc969a)) 91 | 92 | 93 | 94 | 95 | 96 | ## [0.2.4](https://github.com/AlmogBaku/pytest-evals/compare/v0.2.3...v0.2.4) (2025-01-15) 97 | 98 | 99 | ### Bug Fixes 100 | 101 | * better import and error ([4b981a8](https://github.com/AlmogBaku/pytest-evals/commit/4b981a8654f429b09292426986697feb8eeed72a)) 102 | 103 | 104 | 105 | 106 | 107 | ## [0.2.3](https://github.com/AlmogBaku/pytest-evals/compare/v0.2.2...v0.2.3) (2025-01-14) 108 | 109 | 110 | ### Bug Fixes 111 | 112 | * add custom ipython extension to make it work smoothly in notebooks :) ([9e5c897](https://github.com/AlmogBaku/pytest-evals/commit/9e5c897a47971e36ca9b1c41c89674301de995fe)) 113 | * better tests + coverage ([f87d99f](https://github.com/AlmogBaku/pytest-evals/commit/f87d99f7a50a2630a421a39e3e9927b65a75a2e4)) 114 | 115 | 116 | 117 | 118 | 119 | ## [0.2.2](https://github.com/AlmogBaku/pytest-evals/compare/v0.2.1...v0.2.2) (2025-01-14) 120 | 121 | 122 | ### Bug Fixes 123 | 124 | * add tests, and fix bugs :) ([7d29898](https://github.com/AlmogBaku/pytest-evals/commit/7d2989838a8f0010f4f97c58e9ad3b0f5735c1fc)) 125 | 126 | 127 | 128 | 129 | 130 | ## [0.2.1](https://github.com/AlmogBaku/pytest-evals/compare/v0.2.0...v0.2.1) (2025-01-13) 131 | 132 | 133 | ### Bug Fixes 134 | 135 | * pypi links and metadata ([61adaaa](https://github.com/AlmogBaku/pytest-evals/commit/61adaaaeb8487a68609374f7cc9a77b16d9727e6)) 136 | 137 | 138 | 139 | 140 | 141 | # 
"""Example: evaluating an LLM summarizer with an LLM-as-a-judge."""

import openai
import pytest

# Simple test data
TEST_DATA = [
    {
        "text": "I am experiencing a frustrating issue with my Python code where the variables keep returning undefined values and the loops aren't terminating properly. I need to debug this to find the root cause.",
        "label": "debugging Python code with undefined variables and non-terminating loops",
    },
    {
        "text": "In a display of remarkable agility, the swift orange cat gracefully propelled itself over the sleeping brown dog, who remained completely undisturbed by this acrobatic feat.",
        "label": "agile orange cat jumping over a sleeping brown dog",
    },
    {
        "text": "The LCD display on my desktop computer has been exhibiting concerning behavior lately - the screen keeps flickering intermittently and displaying random artifacts, making it very difficult to work.",
        "label": "LCD monitor displaying intermittent flickering and artifacts",
    },
]


@pytest.fixture
def summarizer():
    """Provide a callable that asks the model for a concise summary."""

    def _summarize(text: str) -> str:
        """Simple LLM agent that summarizes text"""
        res = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "Write a concise summary of the text.",
                },
                {"role": "user", "content": text},
            ],
        )
        # `content` may be None; return a string so the length analysis in
        # `test_analysis` can always call `.split()` on the prediction.
        return res.choices[0].message.content or ""

    return _summarize


@pytest.fixture
def judge():
    """Provide a callable that asks a second model to grade a summary."""

    def _judge(text, summary, main_subject) -> bool:
        """LLM as a judge that determines if the summary is about the main subject"""
        resp = openai.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": "Decide whether the summary is about the main subject. "
                    "Reply ONLY with either true or false.",
                },
                {
                    "role": "user",
                    "content": f"Original Text: {text}\nSummary: {summary}\nMain Subject: {main_subject}",
                },
            ],
        )
        # Tolerate a None reply and surrounding whitespace/newlines.
        reply = resp.choices[0].message.content or ""
        return reply.strip().lower() == "true"

    return _judge


@pytest.mark.eval(name="summary")
@pytest.mark.parametrize("case", TEST_DATA)
def test_classifier(case: dict, eval_bag, summarizer, judge) -> None:
    """Summarize one case, have the judge grade it, and record everything."""
    # Store input and prediction for analysis
    eval_bag.input_text = case["text"]
    eval_bag.label = case["label"]  # the label is the main subject of the text
    eval_bag.prediction = summarizer(case["text"])
    eval_bag.judgement = judge(eval_bag.input_text, eval_bag.prediction, eval_bag.label)

    # Basic assertion
    assert eval_bag.judgement  # Assert that the summary is about the main subject


@pytest.mark.eval_analysis(name="summary")
def test_analysis(eval_results):
    """Aggregate judge verdicts and summary lengths across all recorded cases."""
    # Calculate various metrics
    total_samples = len(eval_results)

    # Subject relevance (based on judge's assessment)
    relevant_summaries = sum(1 for r in eval_results if r.result.judgement)
    subject_accuracy = relevant_summaries / total_samples if total_samples else 0

    # Length analysis (in whitespace-separated words)
    avg_summary_length = (
        sum(len(r.result.prediction.split()) for r in eval_results) / total_samples
        if total_samples
        else 0
    )
    avg_input_length = (
        sum(len(r.result.input_text.split()) for r in eval_results) / total_samples
        if total_samples
        else 0
    )
    compression_ratio = avg_summary_length / avg_input_length if avg_input_length else 0

    # Print metrics
    print(f"Subject Accuracy: {subject_accuracy:.2%}")
    print(f"Average Summary Length: {avg_summary_length:.1f} words")
    print(f"Average Input Length: {avg_input_length:.1f} words")
    print(f"Compression Ratio: {compression_ratio:.2f}")

    # Basic quality assertions
    assert subject_accuracy >= 0.7, "Subject accuracy below threshold"
    assert 0.2 <= compression_ratio <= 0.8, "Compression ratio outside acceptable range"
Run pre-commit hooks: 38 | ```bash 39 | pre-commit run --all-files 40 | ``` 41 | 42 | 2. Run tests with coverage: 43 | ```bash 44 | coverage run --source=pytest_evals -m pytest 45 | coverage report 46 | ``` 47 | 48 | ## Testing Guidelines 49 | 50 | We value testing to keep pytest-evals reliable and maintainable. When adding new features or fixing bugs: 51 | 52 | - Include tests that cover the new functionality or reproduce the bug 53 | - Aim for clear, readable test cases that help document behavior 54 | - Consider edge cases and error conditions 55 | - Use the existing test suite as a guide for style and structure 56 | 57 | If you need help with testing, feel free to ask in your PR - we're here to help! 58 | 59 | To run the test suite: 60 | 61 | ```bash 62 | # Run tests with coverage reporting 63 | coverage run --source=pytest_evals -m pytest 64 | coverage report 65 | ``` 66 | 67 | Remember: if you're adding new functionality, including tests helps everyone understand how your code works and ensures 68 | it keeps working as the project evolves. If you're stuck with testing, don't hesitate to ask for help in your PR - we're 69 | here to help! 70 | 71 | ## PR Process 72 | 73 | Individual commits should not be tagged separately, but will generally be assumed to match the PR. For instance, if you 74 | have a bugfix in with a breaking change, it's generally encouraged to submit the bugfix separately, but if you must put 75 | them in one PR, mark the commit separately. 76 | 77 | ### Commit Message Format 78 | 79 | We are using [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/) to standardize our commit messages. 80 | This allows us to automatically generate changelogs and release notes, to create a more readable git history and to 81 | automatically trigger semantic versioning. 
82 | 83 | Please use the following format when writing commit messages and PR titles: 84 | 85 | ``` 86 | <type>(<scope>): <summary> 87 | │ │ │ 88 | │ │ └─⫸ Summary in present tense 89 | │ │ 90 | │ └─⫸ [optional] Commit Scope: ipython, eval, analysis, etc. 91 | │ 92 | └─⫸ Commit Type: build|chore|ci|docs|feat|fix|perf|refactor|style|test 93 | ``` 94 | 95 | We support the following types: 96 | 97 | | Type | Description | 98 | |------------|-----------------------------------------------------------------------| 99 | | `feat` | A new feature (correlates with `MINOR` in semantic versioning) | 100 | | `fix` | A bug fix | 101 | | `docs` | Documentation only changes | 102 | | `style` | Changes that do not affect code meaning (whitespace, formatting, etc) | 103 | | `refactor` | Code change that neither fixes a bug nor adds a feature | 104 | | `perf` | Code change that improves performance | 105 | | `test` | Adding or correcting tests | 106 | | `build` | Changes affecting build system or dependencies | 107 | | `ci` | Changes to CI configuration | 108 | | `chore` | Other changes that don't modify src or test files | 109 | 110 | Examples: 111 | 112 | ``` 113 | fix: correct metric calculation in eval_results 114 | feat(core): add support for parallel evaluation runs 115 | refactor!: change the evaluation API 116 | docs(readme): clarify usage instructions 117 | ``` 118 | 119 | ### Breaking changes 120 | 121 | Breaking changes should be marked with a `!` after the type/scope. This will trigger a `MAJOR` version bump when the 122 | commit is merged. For example: 123 | 124 | ``` 125 | refactor!: change the evaluation API 126 | ``` 127 | 128 | Breaking changes should be avoided if possible. 
@dataclass
class CoverageCase:
    """Represents a continuous block of uncovered code"""

    start_line: int  # 1-based number of the first line in the block
    end_line: int  # 1-based number of the last line in the block (inclusive)
    code_lines: List[str]  # source text of every line in the block, in order
    context: str = ""  # Can be used to store function/class name


class CoverageReport:
    """Render the uncovered lines listed in a ``coverage xml`` report as markdown.

    Attributes:
        coverage_xml: Path of the XML report to read.
        files_with_uncovered: Number of files with at least one uncovered,
            non-blank line in the most recent ``format_markdown`` call.
        total_uncovered_lines: Number of uncovered, non-blank lines found in
            the most recent ``format_markdown`` call.
    """

    def __init__(self, coverage_xml: str = "coverage.xml"):
        self.coverage_xml = coverage_xml
        self.files_with_uncovered = 0
        self.total_uncovered_lines = 0

    def _group_continuous_lines(
        self, lines: List[Tuple[int, str]]
    ) -> List[CoverageCase]:
        """Group continuous line numbers into cases.

        Args:
            lines: ``(line_number, code)`` pairs in ascending line order.

        Returns:
            One ``CoverageCase`` per run of consecutive line numbers; an empty
            list when ``lines`` is empty.
        """
        cases: List[CoverageCase] = []
        current_case = None

        for line_num, code in lines:
            if current_case is not None and line_num == current_case.end_line + 1:
                # Directly follows the previous line: extend the open case.
                current_case.end_line = line_num
                current_case.code_lines.append(code)
                continue

            # A gap (or the very first line): close the open case, start a new one.
            if current_case is not None:
                cases.append(current_case)
            current_case = CoverageCase(line_num, line_num, [code])

        if current_case is not None:
            cases.append(current_case)

        return cases

    def _detect_context(self, lines: List[str], start_line: int) -> str:
        """Try to detect the context (function/class) for a block of code.

        Args:
            lines: All lines of the source file.
            start_line: 1-based number of the first uncovered line.

        Returns:
            The nearest ``def``/``async def``/``class`` header (without its
            parameter list or trailing colon) found on the first uncovered
            line or the four lines above it, or ``""`` when there is none.
        """
        # Clamp to the file so a start_line past the end cannot raise IndexError.
        last = min(start_line, len(lines))
        first = max(0, start_line - 5)
        for i in range(last - 1, first - 1, -1):
            line = lines[i].strip()
            if line.startswith(("def ", "async def ", "class ")):
                # "def f(x):" -> "def f"; "class Foo:" -> "class Foo"
                return line.split("(")[0].split(":")[0].strip()
        return ""

    def format_markdown(self) -> str:
        """Format the coverage report in markdown with grouped cases.

        Returns:
            The markdown report, or a one-line ``❌ Error: ...`` message when
            the XML report is missing or anything else goes wrong while
            building it. Source files that cannot be found are reported
            inline and skipped.
        """
        # Start from zero on every call; otherwise rendering the same report
        # twice would double the totals printed in the summary.
        self.files_with_uncovered = 0
        self.total_uncovered_lines = 0

        try:
            root = ET.parse(self.coverage_xml).getroot()
            output = ["# Coverage Report\n"]

            files_report = defaultdict(list)

            for class_elem in root.findall(".//class"):
                filename = class_elem.attrib["filename"]

                try:
                    with open(filename, "r", encoding="utf-8") as f:
                        file_lines = f.readlines()
                except FileNotFoundError:
                    output.append(f"⚠️ Could not find source file: {filename}\n")
                    continue

                # Get uncovered lines with their code (blank lines are ignored)
                uncovered_lines = []
                for line in class_elem.findall('./lines/line[@hits="0"]'):
                    number = int(line.attrib["number"])
                    text = file_lines[number - 1]
                    if text.strip():
                        uncovered_lines.append((number, text.rstrip()))

                if not uncovered_lines:
                    continue

                self.files_with_uncovered += 1
                self.total_uncovered_lines += len(uncovered_lines)

                # Group into cases
                cases = self._group_continuous_lines(uncovered_lines)

                # Add context to each case
                for case in cases:
                    case.context = self._detect_context(file_lines, case.start_line)

                files_report[filename].extend(cases)

            # Format the report
            for filename, cases in files_report.items():
                output.append(f"## {filename}\n")

                for i, case in enumerate(cases, 1):
                    context = f" ({case.context})" if case.context else ""
                    output.append(f"### Case {i}{context}\n")

                    if case.start_line == case.end_line:
                        output.append(f"Line {case.start_line}:\n")
                    else:
                        output.append(f"Lines {case.start_line}-{case.end_line}:\n")

                    output.append("```python")
                    for line_num, code in zip(
                        range(case.start_line, case.end_line + 1), case.code_lines
                    ):
                        output.append(f"{line_num}: {code}")
                    output.append("```\n")

            # Add summary
            output.extend(
                [
                    "## Summary\n",
                    f"- Files with uncovered lines: {self.files_with_uncovered}",
                    f"- Total uncovered lines: {self.total_uncovered_lines}",
                    f"- Total cases: {sum(len(cases) for cases in files_report.values())}",
                ]
            )

            return "\n".join(output)

        except FileNotFoundError:
            # Only the XML report itself can get here; missing source files
            # are handled per file above.
            return f"❌ Error: {self.coverage_xml} not found. Run coverage xml first."
        except Exception as e:
            return f"❌ Error: {str(e)}"


def main():
    """Print the markdown coverage report for ``coverage.xml`` in the current directory."""
    report = CoverageReport()
    print(report.format_markdown())


if __name__ == "__main__":
    main()
def test_series_encoding():
    """Test Series encoding"""
    assert (
        json.dumps(pd.Series([1, 2, 3]), cls=AdvancedJsonEncoder)
        == '{"0": 1, "1": 2, "2": 3}'
    )


def test_none_encoding():
    """Test None type encoding"""
    data = {"null_value": None}
    encoded = json.dumps(data, cls=AdvancedJsonEncoder)
    assert json.loads(encoded) == {"null_value": None}


def test_unsupported_type_fallback():
    """Test fallback to default encoder for unsupported types"""

    class UnsupportedType:
        pass

    assert ".UnsupportedType object" in json.dumps(
        UnsupportedType(), cls=AdvancedJsonEncoder
    )


# Test for json_encoder.py ImportError case
def test_pydantic_import_error():
    """Test the JSON encoder when pydantic is not available"""
    import importlib
    import pytest_evals.json_encoder

    try:
        with patch.dict(sys.modules, {"pydantic": None}):
            # Force reload of the module to trigger ImportError
            importlib.reload(pytest_evals.json_encoder)

            assert not pytest_evals.json_encoder.HAVE_PYDANTIC
            assert pytest_evals.json_encoder.BaseModel is type(None)
    finally:
        # patch.dict only restores sys.modules; the reloaded module would keep
        # its "pydantic missing" globals and leak into every later test.
        # Reload once more now that the real pydantic is importable again.
        importlib.reload(pytest_evals.json_encoder)


def test_pandas_import_error():
    """Test the JSON encoder when pandas is not available"""
    import importlib
    import pytest_evals.json_encoder

    try:
        with patch.dict(sys.modules, {"pandas": None}):
            # Force reload of the module to trigger ImportError
            importlib.reload(pytest_evals.json_encoder)

            # Verify pandas-related flags and functions
            assert not pytest_evals.json_encoder.HAVE_PANDAS

            # Test is_series function
            class MockObject:
                pass

            mock_obj = MockObject()
            assert not pytest_evals.json_encoder.is_series(mock_obj)

            # Test is_dataframe function
            assert not pytest_evals.json_encoder.is_dataframe(mock_obj)
    finally:
        # Restore the module to its normal, pandas-enabled state so the
        # outcome of other tests does not depend on execution order.
        importlib.reload(pytest_evals.json_encoder)


def test_none_type_variations():
    """Test different scenarios involving None type"""
    # Test None in different contexts
    test_cases = [
        {"direct_none": None},
        {"nested_none": {"key": None}},
        {"none_in_list": [1, None, 3]},
        {"multiple_nones": [None, None]},
        None,
    ]

    for case in test_cases:
        encoded = json.dumps(case, cls=AdvancedJsonEncoder)
        decoded = json.loads(encoded)
        assert decoded == case


def test_mixed_none_with_other_types():
    """Test None combined with other supported types"""

    @dataclass
    class DataWithNone:
        value: None
        name: str

    data = DataWithNone(value=None, name="test")
    encoded = json.dumps(data, cls=AdvancedJsonEncoder)
    decoded = json.loads(encoded)

    assert decoded == {"value": None, "name": "test"}

    # Test with enum
    class StatusEnum(Enum):
        NONE = None
        ACTIVE = "active"

    data = {"status": StatusEnum.NONE}
    encoded = json.dumps(data, cls=AdvancedJsonEncoder)
    decoded = json.loads(encoded)

    assert decoded == {"status": None}


def test_explicit_none_handling():
    """Test the explicit None handling in the default method of AdvancedJsonEncoder"""
    # Call default() directly: json.dumps serialises None itself and would
    # never hand it to the encoder's default() hook.
    encoder = AdvancedJsonEncoder()
    result = encoder.default(
        type(None)()
    )  # This explicitly calls default() with None type

    assert result is None

    # Test in context
    data = {"null_value": type(None)()}
    encoded = json.dumps(data, cls=AdvancedJsonEncoder)
    decoded = json.loads(encoded)

    assert decoded == {"null_value": None}


def test_callable_encoding_edge_cases():
    """Test various edge cases in callable encoding"""

    def simple_callable():
        pass

    encoded = json.dumps(simple_callable, cls=AdvancedJsonEncoder)
    # NOTE(review): the expected literals below look truncated in this copy
    # of the file; confirm them against the encoder's callable representation.
    assert '""' == encoded

    # Test case for when o.__module__ exists but o.__name__ raises an exception
    class ComplexCallable:
        def __call__(self, *args, **kwargs):
            pass

    complex_callable = ComplexCallable()
    encoded = json.dumps(complex_callable, cls=AdvancedJsonEncoder)
    assert '""' == encoded
The classifier returns a boolean value:\n", 39 | "- `True`: Text is computer-related\n", 40 | "- `False`: Text is not computer-related" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "id": "c957edc789a1fda2", 46 | "metadata": { 47 | "ExecuteTime": { 48 | "end_time": "2025-01-21T08:33:24.345743Z", 49 | "start_time": "2025-01-21T08:33:24.129214Z" 50 | } 51 | }, 52 | "source": [ 53 | "import openai\n", 54 | "\n", 55 | "\n", 56 | "def classify(text: str) -> bool:\n", 57 | " \"\"\"Classify text as computer-related or not using GPT-4.\n", 58 | "\n", 59 | " Args:\n", 60 | " text (str): The input text to classify\n", 61 | "\n", 62 | " Returns:\n", 63 | " bool: True if the text is computer-related, False otherwise\n", 64 | " \"\"\"\n", 65 | " resp = openai.chat.completions.create(\n", 66 | " model=\"gpt-4o-mini\",\n", 67 | " messages=[\n", 68 | " {\n", 69 | " \"role\": \"system\",\n", 70 | " \"content\": \"Is this text about a computer-related subject? \"\n", 71 | " \"Reply ONLY with either true or false.\",\n", 72 | " },\n", 73 | " {\"role\": \"user\", \"content\": text},\n", 74 | " ],\n", 75 | " )\n", 76 | " return resp.choices[0].message.content.lower() == \"true\"" 77 | ], 78 | "outputs": [], 79 | "execution_count": 2 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "id": "132870eda457b817", 84 | "metadata": {}, 85 | "source": [ 86 | "## Test Data\n", 87 | "\n", 88 | "We define a set of test cases to evaluate our classifier. 
Each test case contains:\n", 89 | "- `text`: The input text to classify\n", 90 | "- `label`: The expected classification (True for computer-related, False otherwise)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "id": "f071596bb3a7d1d5", 96 | "metadata": { 97 | "ExecuteTime": { 98 | "end_time": "2025-01-21T08:33:25.289251Z", 99 | "start_time": "2025-01-21T08:33:25.286883Z" 100 | } 101 | }, 102 | "source": [ 103 | "TEST_DATA = [\n", 104 | " {\"text\": \"I need to debug this Python code\", \"label\": True},\n", 105 | " {\"text\": \"The cat jumped over the lazy dog\", \"label\": False},\n", 106 | " {\"text\": \"My monitor keeps flickering\", \"label\": True},\n", 107 | "]" 108 | ], 109 | "outputs": [], 110 | "execution_count": 3 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "id": "3f5ff914f7ba5295", 115 | "metadata": {}, 116 | "source": [ 117 | "## Evaluation Tests\n", 118 | "\n", 119 | "We use pytest-evals to:\n", 120 | "1. Run individual test cases and collect results\n", 121 | "2. 
Analyze the overall performance of our classifier\n", 122 | "\n", 123 | "The evaluation requires:\n", 124 | "- Accuracy >= 70%\n", 125 | "- All test cases must match their expected labels" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "id": "4caa983f934e9d8f", 131 | "metadata": { 132 | "ExecuteTime": { 133 | "end_time": "2025-01-21T08:33:30.840853Z", 134 | "start_time": "2025-01-21T08:33:29.078515Z" 135 | } 136 | }, 137 | "source": [ 138 | "%%ipytest_evals\n", 139 | "import pytest\n", 140 | "\n", 141 | "@pytest.mark.eval(name=\"computer_classifier\")\n", 142 | "@pytest.mark.parametrize(\"case\", TEST_DATA)\n", 143 | "def test_classifier(case: dict, eval_bag):\n", 144 | " \"\"\"Test individual classification cases.\n", 145 | " \n", 146 | " Args:\n", 147 | " case (dict): Test case containing text and expected label\n", 148 | " eval_bag: Container for test results\n", 149 | " \"\"\"\n", 150 | " # Store inputs and results in eval_bag for analysis\n", 151 | " eval_bag.input_text = case[\"text\"]\n", 152 | " eval_bag.label = case[\"label\"]\n", 153 | " eval_bag.prediction = classify(case[\"text\"])\n", 154 | "\n", 155 | " # Log results for visibility\n", 156 | " print(f\"Input: {eval_bag.input_text}\")\n", 157 | " print(f\"Prediction: {eval_bag.prediction}\")\n", 158 | "\n", 159 | " assert eval_bag.prediction == eval_bag.label\n", 160 | "\n", 161 | "\n", 162 | "@pytest.mark.eval_analysis(name=\"computer_classifier\")\n", 163 | "def test_analysis(eval_results):\n", 164 | " \"\"\"Analyze overall classifier performance.\n", 165 | " \n", 166 | " Args:\n", 167 | " eval_results: Collection of all test results\n", 168 | " \"\"\"\n", 169 | " total = len(eval_results)\n", 170 | " correct = sum(1 for r in eval_results if r.result.prediction == r.result.label)\n", 171 | " accuracy = correct / total\n", 172 | "\n", 173 | " print(f\"Accuracy: {accuracy:.2%}\")\n", 174 | " assert accuracy >= 0.7 # Require at least 70% accuracy" 175 | ], 176 | "outputs": [ 177 | { 178 | 
"name": "stdout", 179 | "output_type": "stream", 180 | "text": [ 181 | "\n", 182 | "t_fe596c0d68894784969f18775cec634a.py::test_classifier[case0] Input: I need to debug this Python code\n", 183 | "Prediction: True\n", 184 | "\u001B[32mPASSED\u001B[0m\n", 185 | "t_fe596c0d68894784969f18775cec634a.py::test_classifier[case1] Input: The cat jumped over the lazy dog\n", 186 | "Prediction: False\n", 187 | "\u001B[32mPASSED\u001B[0m\n", 188 | "t_fe596c0d68894784969f18775cec634a.py::test_classifier[case2] Input: My monitor keeps flickering\n", 189 | "Prediction: True\n", 190 | "\u001B[32mPASSED\u001B[0m\n", 191 | "t_fe596c0d68894784969f18775cec634a.py::test_analysis Accuracy: 100.00%\n", 192 | "\u001B[32mPASSED\u001B[0m\n", 193 | "\n", 194 | "\u001B[32m======================================== \u001B[32m\u001B[1m4 passed\u001B[0m\u001B[32m in 1.64s\u001B[0m\u001B[32m =========================================\u001B[0m\n" 195 | ] 196 | } 197 | ], 198 | "execution_count": 4 199 | } 200 | ], 201 | "metadata": { 202 | "kernelspec": { 203 | "display_name": "Python 3", 204 | "language": "python", 205 | "name": "python3" 206 | }, 207 | "language_info": { 208 | "codemirror_mode": { 209 | "name": "ipython", 210 | "version": 2 211 | }, 212 | "file_extension": ".py", 213 | "mimetype": "text/x-python", 214 | "name": "python", 215 | "nbconvert_exporter": "python", 216 | "pygments_lexer": "ipython2", 217 | "version": "2.7.6" 218 | } 219 | }, 220 | "nbformat": 4, 221 | "nbformat_minor": 5 222 | } 223 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # `pytest-evals` 🚀 4 | 5 | Test your LLM outputs against examples - no more manual checking! A (minimalistic) pytest plugin that helps you to 6 | evaluate that your LLM is giving good answers. 7 | 8 | [![PyPI version](https://img.shields.io/pypi/v/pytest-evals.svg)](https://pypi.org/p/pytest-evals) 9 | [![License](https://img.shields.io/github/license/AlmogBaku/pytest-evals.svg)](https://github.com/AlmogBaku/pytest-evals/blob/main/LICENSE) 10 | [![Issues](https://img.shields.io/github/issues/AlmogBaku/pytest-evals.svg)](https://github.com/AlmogBaku/pytest-evals/issues) 11 | [![Stars](https://img.shields.io/github/stars/AlmogBaku/pytest-evals.svg)](https://github.com/AlmogBaku/pytest-evals/stargazers) 12 | 13 | # 🧐 Why pytest-evals? 14 | 15 | Building LLM applications is exciting, but how do you know they're actually working well? `pytest-evals` helps you: 16 | 17 | - 🎯 **Test & Evaluate:** Run your LLM prompt against many cases 18 | - 📈 **Track & Measure:** Collect metrics and analyze the overall performance 19 | - 🔄 **Integrate Easily:** Works with pytest, Jupyter notebooks, and CI/CD pipelines 20 | - ✨ **Scale Up:** Run tests in parallel with [`pytest-xdist`](https://pytest-xdist.readthedocs.io/) and 21 | asynchronously with [`pytest-asyncio`](https://pytest-asyncio.readthedocs.io/). 22 | 23 | # 🚀 Getting Started 24 | 25 | To get started, install `pytest-evals` and write your tests: 26 | 27 | ```bash 28 | pip install pytest-evals 29 | ``` 30 | 31 | #### ⚡️ Quick Example 32 | 33 | For example, say you're building a support ticket classifier. You want to test cases like: 34 | 35 | | Input Text | Expected Classification | 36 | |--------------------------------------------------------|-------------------------| 37 | | My login isn't working and I need to access my account | account_access | 38 | | Can I get a refund for my last order? | billing | 39 | | How do I change my notification settings? 
| settings | 40 | 41 | `pytest-evals` helps you automatically test how your LLM performs against these cases, track accuracy, and ensure it 42 | keeps working as expected over time. 43 | 44 | ```python 45 | # Predict the LLM performance for each case 46 | @pytest.mark.eval(name="my_classifier") 47 | @pytest.mark.parametrize("case", TEST_DATA) 48 | def test_classifier(case: dict, eval_bag, classifier): 49 | # Run predictions and store results 50 | eval_bag.prediction = classifier(case["Input Text"]) 51 | eval_bag.expected = case["Expected Classification"] 52 | eval_bag.accuracy = eval_bag.prediction == eval_bag.expected 53 | 54 | 55 | # Now let's see how our app is performing across all cases... 56 | @pytest.mark.eval_analysis(name="my_classifier") 57 | def test_analysis(eval_results): 58 | accuracy = sum([result.accuracy for result in eval_results]) / len(eval_results) 59 | print(f"Accuracy: {accuracy:.2%}") 60 | assert accuracy >= 0.7 # Ensure our performance is not degrading 🫢 61 | ``` 62 | 63 | Then, run your evaluation tests: 64 | 65 | ```bash 66 | # Run test cases 67 | pytest --run-eval 68 | 69 | # Analyze results 70 | pytest --run-eval-analysis 71 | ``` 72 | 73 | ## 😵‍💫 Why Another Eval Tool? 74 | 75 | **Evaluations are just tests.** No need for complex frameworks or DSLs. `pytest-evals` is minimalistic by design: 76 | 77 | - Use `pytest` - the tool you already know 78 | - Keep tests and evaluations together 79 | - Focus on logic, not infrastructure 80 | 81 | It just collects your results and lets you analyze them as a whole. Nothing more, nothing less. 82 |

(back to top)

83 | 84 | # 📚 User Guide 85 | 86 | Check out our detailed guides and examples: 87 | 88 | - [Basic evaluation](example/example_test.py) 89 | - [Basic of LLM as a judge evaluation](example/example_judge_test.py) 90 | - [Notebook example](example/example_notebook.ipynb) 91 | - [Advanced notebook example](example/example_notebook_advanced.ipynb) 92 | 93 | ## 🤔 How It Works 94 | 95 | Built on top of [pytest-harvest](https://smarie.github.io/python-pytest-harvest/), `pytest-evals` splits evaluation into 96 | two phases: 97 | 98 | 1. **Evaluation Phase**: Run all test cases, collecting results and metrics in `eval_bag`. The results are saved in a 99 | temporary file to allow the analysis phase to access them. 100 | 2. **Analysis Phase**: Process all results at once through `eval_results` to calculate final metrics 101 | 102 | This split allows you to: 103 | 104 | - Run evaluations in parallel (since the analysis test MUST run after all cases are done, we must run them separately) 105 | - Make pass/fail decisions on the overall evaluation results instead of individual test failures (by passing the 106 | `--supress-failed-exit-code --run-eval` flags) 107 | - Collect comprehensive metrics 108 | 109 | **Note**: When running evaluation tests, the rest of your test suite will not run. This is by design to keep the results 110 | clean and focused. 111 | 112 | ## 💾 Saving case results 113 | By default, `pytest-evals` saves the results of each case in a json file to allow the analysis phase to access them. 114 | However, this might not be a friendly format for deeper analysis. To save the results in a more friendly format, as a 115 | CSV file, use the `--save-evals-csv` flag: 116 | 117 | ```bash 118 | pytest --run-eval --save-evals-csv 119 | ``` 120 | 121 | ## 📝 Working with a notebook 122 | 123 | It's also possible to run evaluations from a notebook. 
To do that, simply 124 | install [ipytest](https://github.com/chmp/ipytest), and load the extension: 125 | 126 | ```python 127 | %load_ext pytest_evals 128 | ``` 129 | 130 | Then, use the magic commands `%%ipytest_eval` in your cell to run evaluations. This will run the evaluation phase and 131 | then the analysis phase. By default, using this magic will run both `--run-eval` and `--run-eval-analysis`, but you can 132 | specify your own flags by passing arguments right after the magic command (e.g., `%%ipytest_eval --run-eval`). 133 | 134 | ```python 135 | %%ipytest_eval 136 | import pytest 137 | 138 | 139 | @pytest.mark.eval(name="my_eval") 140 | def test_agent(eval_bag): 141 | eval_bag.prediction = agent.run(case["input"]) 142 | 143 | 144 | @pytest.mark.eval_analysis(name="my_eval") 145 | def test_analysis(eval_results): 146 | print(f"F1 Score: {calculate_f1(eval_results):.2%}") 147 | ``` 148 | 149 | You can see an example of this in the [`example/example_notebook.ipynb`](example/example_notebook.ipynb) notebook. Or 150 | look at the [advanced example](example/example_notebook_advanced.ipynb) for a more complex example that tracks multiple 151 | experiments. 152 |

(back to top)

153 | 154 | ## 🏗️ Production Use 155 | 156 | ### 📚 Managing Test Data (Evaluation Set) 157 | 158 | It's recommended to use a CSV file to store test data. This makes it easier to manage large datasets and allows you to 159 | communicate with non-technical stakeholders. 160 | 161 | To do this, you can use `pandas` to read the CSV file and pass the test cases as parameters to your tests using 162 | `@pytest.mark.parametrize` 🙃 : 163 | 164 | ```python 165 | import pandas as pd 166 | import pytest 167 | 168 | test_data = pd.read_csv("tests/testdata.csv") 169 | 170 | 171 | @pytest.mark.eval(name="my_eval") 172 | @pytest.mark.parametrize("case", test_data.to_dict(orient="records")) 173 | def test_agent(case, eval_bag, agent): 174 | eval_bag.prediction = agent.run(case["input"]) 175 | ``` 176 | 177 | In case you need to select a subset of the test data (e.g., a golden set), you can simply define an environment variable 178 | to indicate that, and filter the data with `pandas`. 179 | 180 | ### 🔀 CI Integration 181 | 182 | Run tests and analysis as separate steps: 183 | 184 | ```yaml 185 | evaluate: 186 | steps: 187 | - run: pytest --run-eval -n auto --supress-failed-exit-code # Run cases in parallel 188 | - run: pytest --run-eval-analysis # Analyze results 189 | ``` 190 | 191 | Use `--supress-failed-exit-code` with `--run-eval` - let the analysis phase determine success/failure. **If all your 192 | cases pass, your evaluation set is probably too small!** 193 | 194 | ### ⚡️ Parallel Testing 195 | 196 | As your evaluation set grows, you may want to run your test cases in parallel. To do this, install 197 | [`pytest-xdist`](https://pytest-xdist.readthedocs.io/). `pytest-evals` will support that out of the box 🚀. 198 | 199 | ```bash 200 | run: pytest --run-eval -n auto 201 | ``` 202 | 203 |

(back to top)

204 | 205 | # 👷 Contributing 206 | 207 | Contributions make the open-source community a fantastic place to learn, inspire, and create. Any contributions you make 208 | are **greatly appreciated** (not only code! but also documenting, blogging, or giving us feedback) 😍. 209 | 210 | Please fork the repo and create a pull request if you have a suggestion. You can also simply open an issue to give us 211 | some feedback. 212 | 213 | **Don't forget to give the project [a star](#top)! ⭐️** 214 | 215 | For more information about contributing code to the project, read the [CONTRIBUTING.md](CONTRIBUTING.md) guide. 216 | 217 | # 📃 License 218 | 219 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 220 |

(back to top)

-------------------------------------------------------------------------------- /src/pytest_evals/plugin.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from collections import defaultdict 4 | from os.path import isabs 5 | from pathlib import Path 6 | from typing import Any, List, Mapping, cast 7 | 8 | import cloudpickle 9 | import pytest 10 | from pytest_harvest import create_results_bag_fixture, get_session_results_dct 11 | 12 | from .json_encoder import AdvancedJsonEncoder 13 | from .models import EvalResults, EvalBag 14 | 15 | # Constants 16 | EVAL_MARK_NAME = "eval" # pragma: no cover 17 | EVAL_ANALYSIS_MARK_NAME = "eval_analysis" # pragma: no cover 18 | 19 | # Fixtures 20 | eval_bag = create_results_bag_fixture( 21 | "fixture_store", name="eval_bag", bag_type=EvalBag 22 | ) # pragma: no cover 23 | 24 | 25 | @pytest.fixture(scope="function") 26 | def eval_bag_results(request, out_path) -> Mapping[str, Mapping[str, Any]]: 27 | """Fixture that provides access to evaluation results.""" 28 | ret = cast(dict, simple_eval_results(request.session)) 29 | 30 | if not request.session.config.getoption("--run-eval"): 31 | raw = out_path / "eval-results-raw.json" 32 | if raw.exists(): 33 | with open(raw, "r") as f: 34 | ret.update(json.load(f)) 35 | return ret 36 | 37 | 38 | @pytest.fixture(scope="function") 39 | def eval_results(request, eval_bag_results) -> List[EvalResults]: 40 | """Fixture that provides access to evaluation results as EvalResult objects.""" 41 | marker = eval_analysis_marker(request.node.own_markers) 42 | if not marker: 43 | raise ValueError( 44 | f"Only tests marked with {EVAL_ANALYSIS_MARK_NAME} can use the eval_results fixture" 45 | ) 46 | 47 | return [ 48 | EvalResults.from_result_bag(v) 49 | for k, v in eval_bag_results.items() 50 | if v["eval_name"] == marker.kwargs["name"] 51 | ] 52 | 53 | 54 | def pytest_addoption(parser, pluginmanager): 55 | """Add options to the 
pytest CLI.""" 56 | group = parser.getgroup("Evals", "Evals configuration") 57 | group.addoption( 58 | "--out-path", 59 | action="store", 60 | default="./test-out/", 61 | help="Path to store test artifacts", 62 | ) 63 | group.addoption( 64 | "--supress-failed-exit-code", 65 | action="store_true", 66 | default=False, 67 | help="Supress failed exit code. Useful for CI/CD with a separate step for test analysis", 68 | ) 69 | group.addoption( 70 | "--run-eval", 71 | action="store_true", 72 | default=False, 73 | help="Run evaluation tests(mark with @pytest.mark.eval)", 74 | ) 75 | group.addoption( 76 | "--run-eval-analysis", 77 | action="store_true", 78 | default=False, 79 | help="Run evaluation analysis tests(mark with @pytest.mark.eval_analysis)", 80 | ) 81 | group.addoption( 82 | "--save-evals-csv", 83 | action="store_true", 84 | default=False, 85 | help="Save evaluation cases results to a CSV file", 86 | ) 87 | 88 | 89 | def pytest_configure(config): 90 | """Configure the pytest session with the options.""" 91 | config.addinivalue_line( 92 | "markers", 93 | "eval: mark test as evaluation test. Evaluation tests will only run when --run-eval is passed", 94 | ) 95 | config.addinivalue_line( 96 | "markers", 97 | "eval_analysis: mark test as an evaluation analysis. Analysis tests MUST run after all other tests. Analysis tests will only run when --run_eval-analysis is passed", 98 | ) 99 | 100 | out_path = Path(config.getoption("--out-path")) 101 | if not isabs(out_path): 102 | out_path = Path(config.invocation_dir / out_path) 103 | config.out_path = out_path 104 | config.out_path.mkdir(exist_ok=True) 105 | 106 | if config.getoption("--save-evals-csv") and not config.getoption("--run-eval"): 107 | raise ValueError( 108 | "The --save-evals-csv option can only be used with the --run-eval option" 109 | ) 110 | 111 | 112 | @pytest.fixture 113 | def out_path(request) -> Path: 114 | """Get the output storage path. 
This is useful for storing test artifacts such as results.""" 115 | return request.config.out_path 116 | 117 | 118 | def is_xdist_session(config): 119 | """Check if the session is a xdist session.""" 120 | return ( 121 | hasattr(config, "workerinput") 122 | or hasattr(config, "workerid") 123 | or config.getoption("dist", "no") != "no" 124 | ) 125 | 126 | 127 | def eval_analysis_marker(markers: list[pytest.Mark]) -> pytest.Mark | None: 128 | """Get the eval_analysis marker if present.""" 129 | m = next((m for m in markers if m.name == EVAL_ANALYSIS_MARK_NAME), None) 130 | if m and "name" not in m.kwargs: 131 | raise ValueError( 132 | f"Marker {EVAL_ANALYSIS_MARK_NAME} must have a 'name' argument" 133 | ) 134 | return m 135 | 136 | 137 | def eval_marker(markers: list[pytest.Mark]) -> pytest.Mark | None: 138 | """Get the eval marker if present.""" 139 | m = next((m for m in markers if m.name == EVAL_MARK_NAME), None) 140 | if m and "name" not in m.kwargs: 141 | raise ValueError(f"Marker {EVAL_MARK_NAME} must have a 'name' argument") 142 | return m 143 | 144 | 145 | def pytest_collection_modifyitems(config, items): 146 | """Modify the collection of items.""" 147 | if ( 148 | is_xdist_session(config) 149 | and config.getoption("--run-eval") 150 | and config.getoption("--run-eval-analysis") 151 | ): 152 | raise ValueError( 153 | "In xdist sessions, evaluation analysis must run after the evaluation tests " 154 | "(as a separated execution). 
Therefore, --run-eval and --run-eval-analysis " 155 | "cannot be used together" 156 | ) 157 | 158 | run_eval = config.getoption("--run-eval") 159 | run_analysis = config.getoption("--run-eval-analysis") 160 | skip_eval = pytest.mark.skip(reason="need --run-eval option to run") 161 | skip_analysis = pytest.mark.skip(reason="need --run-eval-analysis option to run") 162 | 163 | for item in items[:]: 164 | is_eval = eval_marker(item.own_markers) is not None 165 | is_analysis = eval_analysis_marker(item.own_markers) is not None 166 | 167 | if is_analysis and is_eval: 168 | raise ValueError( 169 | f"{item.nodeid} is marked as both `{EVAL_MARK_NAME}` and " 170 | f"`{EVAL_ANALYSIS_MARK_NAME}`." 171 | ) 172 | 173 | if run_eval or run_analysis: 174 | if is_eval and not run_eval: 175 | item.add_marker(skip_eval) 176 | elif is_analysis and not run_analysis: 177 | item.add_marker(skip_analysis) 178 | elif not is_eval and not is_analysis: 179 | items.remove(item) 180 | else: 181 | if is_eval: 182 | item.add_marker(skip_eval) # pragma: no cover 183 | if is_analysis: 184 | item.add_marker(skip_analysis) # pragma: no cover 185 | 186 | 187 | def pytest_sessionfinish(session): 188 | """Handle session finish.""" 189 | orig_exitstatus = getattr(session, "exitstatus", 0) 190 | if ( 191 | session.config.getoption("--supress-failed-exit-code") 192 | and orig_exitstatus != pytest.ExitCode.INTERNAL_ERROR 193 | ): 194 | session.exitstatus = 0 195 | 196 | if hasattr(session.config, "workerinput"): 197 | return 198 | 199 | if ( 200 | session.config.getoption("--run-eval") 201 | and orig_exitstatus != pytest.ExitCode.INTERNAL_ERROR 202 | ): 203 | res = simple_eval_results(session) 204 | with open(session.config.out_path / "eval-results-raw.json", "w") as f: 205 | json.dump(res, f, cls=AdvancedJsonEncoder) # noqa: ignore 206 | 207 | if session.config.getoption("--save-evals-csv"): 208 | try: 209 | import pandas as pd 210 | except ImportError: 211 | raise ImportError( 212 | "The --save-evals-csv 
option requires the pandas library" 213 | ) 214 | 215 | results_df = pd.json_normalize( 216 | [ 217 | { 218 | "test_id": name, 219 | "status": data["status"], 220 | "duration_ms": data["duration_ms"], 221 | "pytest_obj_name": data["pytest_obj_name"], 222 | "eval_name": data["eval_name"], 223 | "params": json.loads( 224 | json.dumps(data["params"], cls=AdvancedJsonEncoder) 225 | ), 226 | "eval_bag": json.loads( 227 | json.dumps( 228 | data["fixtures"].get("eval_bag", {}), 229 | cls=AdvancedJsonEncoder, 230 | ) 231 | ), 232 | } 233 | for name, data in res.items() 234 | ] 235 | ) 236 | if not results_df.empty: 237 | results_df = results_df.set_index("test_id") 238 | results_df.to_csv(session.config.out_path / "eval-results-raw.csv") 239 | 240 | 241 | def simple_eval_results(session) -> Mapping[str, Mapping[str, Any]]: 242 | """Get simple evaluation results from the session.""" 243 | res = get_session_results_dct(session, results_bag_fixture_name="eval_bag") 244 | 245 | ret = defaultdict(dict) 246 | for k, v in res.items(): 247 | obj = v.get("pytest_obj", None) 248 | if not obj or not hasattr(obj, "pytestmark"): 249 | continue # pragma: no cover 250 | 251 | e_marker = eval_marker(obj.pytestmark) 252 | if not e_marker: 253 | continue # pragma: no cover 254 | 255 | ret[k] = {k1: v1 for k1, v1 in v.items() if k1 != "pytest_obj"} 256 | ret[k]["pytest_obj_name"] = v["pytest_obj"].__name__ 257 | ret[k]["eval_name"] = e_marker.kwargs["name"] 258 | 259 | return ret 260 | 261 | 262 | # no cover: start 263 | 264 | # XDist harvesting configuration 265 | XDIST_HARVESTED_PATH = Path("./.xdist_harvested/") 266 | 267 | 268 | def pytest_harvest_xdist_worker_dump(worker_id, session_items, fixture_store) -> bool: 269 | """Dump worker results using cloudpickle.""" 270 | with open(XDIST_HARVESTED_PATH / f"{worker_id}.pkl", "wb") as f: 271 | try: 272 | cloudpickle.dump((session_items, fixture_store), f) 273 | except Exception as e: 274 | logging.warning( 275 | f"Error while pickling worker 
{worker_id}'s harvested results: [{e.__class__}] {e}" 276 | ) 277 | return True 278 | 279 | 280 | def pytest_harvest_xdist_load(): 281 | """Load worker results using cloudpickle.""" 282 | workers_saved_material = dict() 283 | for pkl_file in XDIST_HARVESTED_PATH.glob("*.pkl"): 284 | wid = pkl_file.stem 285 | with pkl_file.open("rb") as f: 286 | workers_saved_material[wid] = cloudpickle.load(f) 287 | return workers_saved_material 288 | 289 | 290 | # no cover: stop 291 | -------------------------------------------------------------------------------- /tests/test_plugin.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | from unittest.mock import Mock, patch 4 | 5 | import pytest 6 | 7 | from pytest_evals import eval_analysis_marker 8 | from pytest_evals import plugin 9 | 10 | 11 | def test_eval_marker_configuration(pytester): 12 | """Test basic eval marker functionality 13 | 14 | Verifies that a test with properly configured eval marker: 15 | - Is collected when --run-eval is used 16 | - Successfully executes and passes 17 | """ 18 | pytester.makepyfile(""" 19 | import pytest 20 | 21 | @pytest.mark.eval(name="test_eval") 22 | def test_simple(): 23 | assert True 24 | """) 25 | 26 | result = pytester.runpytest("--run-eval") 27 | result.assert_outcomes(passed=1) 28 | 29 | 30 | def test_eval_analysis_marker_configuration(pytester): 31 | """Test that tests are properly selected/skipped based on eval/eval-analysis markers 32 | 33 | Verifies: 34 | - With --run-eval: eval_analysis tests are skipped 35 | - With --run-eval-analysis: eval tests are skipped 36 | """ 37 | 38 | pytester.makepyfile(""" 39 | import pytest 40 | 41 | @pytest.mark.eval_analysis(name="test_eval") 42 | def test_analysis(eval_results): 43 | assert len(eval_results) == 0 44 | """) 45 | 46 | result = pytester.runpytest("--run-eval-analysis") 47 | result.assert_outcomes(passed=1) 48 | 49 | 50 | def 
test_missing_name_in_eval_marker(pytester): 51 | """Test that eval marker requires name parameter""" 52 | pytester.makepyfile(""" 53 | import pytest 54 | 55 | @pytest.mark.eval 56 | def test_simple(): 57 | assert True 58 | """) 59 | 60 | result = pytester.runpytest("--run-eval") 61 | assert result.ret != 0 62 | 63 | 64 | # Comprehensive workflow test 65 | def test_complete_evaluation_workflow(pytester): 66 | """Test complete evaluation workflow including fixture behavior""" 67 | pytester.makepyfile(""" 68 | import pytest 69 | 70 | TEST_DATA = [ 71 | {"input": "test1", "expected": True}, 72 | {"input": "test2", "expected": False}, 73 | ] 74 | 75 | @pytest.fixture 76 | def mock_classifier(): 77 | def classify(text: str) -> bool: 78 | return "test1" in text 79 | return classify 80 | 81 | # Evaluation phase with fixture usage 82 | @pytest.mark.eval(name="test_classifier") 83 | @pytest.mark.parametrize("case", TEST_DATA) 84 | def test_classifier(case, eval_bag, mock_classifier): 85 | eval_bag.input = case["input"] 86 | eval_bag.expected = case["expected"] 87 | eval_bag.prediction = mock_classifier(case["input"]) 88 | eval_bag.metadata = {"test_type": "classification"} 89 | assert eval_bag.prediction == case["expected"] 90 | 91 | # Analysis phase with enhanced fixture verification 92 | @pytest.mark.eval_analysis(name="test_classifier") 93 | def test_analysis(eval_results): 94 | assert len(eval_results) == 2 95 | 96 | # Verify fixture data preservation 97 | for result in eval_results: 98 | assert hasattr(result.result, "metadata") 99 | assert result.result.metadata["test_type"] == "classification" 100 | 101 | # Verify analysis results 102 | correct = sum(1 for r in eval_results 103 | if r.result.prediction == r.result.expected) 104 | accuracy = correct / len(eval_results) 105 | assert accuracy == 1.0 106 | """) 107 | 108 | # Run evaluation phase 109 | result_eval = pytester.runpytest("--run-eval") 110 | result_eval.assert_outcomes(passed=2, skipped=1) 111 | 112 | # Run 
analysis phase 113 | result_analysis = pytester.runpytest("--run-eval-analysis") 114 | result_analysis.assert_outcomes(passed=1, skipped=2) 115 | 116 | 117 | def test_output_file_creation(pytester, tmp_path): 118 | """Test that results are properly saved to output file""" 119 | out_dir = tmp_path / "test-output" 120 | out_dir.mkdir(exist_ok=True) 121 | 122 | pytester.makepyfile(""" 123 | import pytest 124 | 125 | @pytest.mark.eval(name="test_eval") 126 | def test_simple(eval_bag): 127 | eval_bag.result = "test_value" 128 | assert True 129 | """) 130 | 131 | result = pytester.runpytest("--run-eval", f"--out-path={out_dir}", "-v") 132 | result.assert_outcomes(passed=1) 133 | 134 | results_file = Path(out_dir) / "eval-results-raw.json" 135 | assert results_file.exists() 136 | 137 | with open(results_file) as f: 138 | results = json.load(f) 139 | assert any( 140 | "test_value" in str(v.get("fixtures").get("eval_bag")) 141 | for v in results.values() 142 | ) 143 | 144 | 145 | def test_eval_marker_collection_scenarios(pytester): 146 | """Test different scenarios for eval marker collection""" 147 | pytester.makepyfile(""" 148 | import pytest 149 | from pytest_harvest import get_session_results_dct 150 | 151 | # Case 1: No pytestmark attribute 152 | def test_no_pytestmark(): 153 | assert True 154 | 155 | # Case 2: Has pytestmark but not the eval mark 156 | @pytest.mark.skip 157 | def test_other_mark(): 158 | assert True 159 | 160 | # Case 3: Class without pytestmark 161 | class TestClass: 162 | def test_method(self): 163 | assert True 164 | 165 | # Case 4: Class with non-eval pytestmark 166 | class TestClassWithMark: 167 | pytestmark = [pytest.mark.skip] 168 | def test_no_eval_marker(self): 169 | assert True 170 | 171 | # Case 5: Test with eval mark (should be included) 172 | @pytest.mark.eval(name="test") 173 | def test_with_eval(eval_bag): 174 | eval_bag.value = 42 175 | assert True 176 | """) 177 | 178 | result = pytester.runpytest("--run-eval") 179 | 
result.assert_outcomes(passed=1) 180 | 181 | 182 | @pytest.mark.parametrize( 183 | "scenario", 184 | [ 185 | # Empty file scenario - expect empty results 186 | ("empty_file", {}), 187 | # Valid data scenario - expect one result with specific values 188 | ( 189 | "valid_data", 190 | { 191 | "test_1": { 192 | "eval_name": "sample_eval", 193 | "fixtures": {"eval_bag": {"value": 42}}, 194 | } 195 | }, 196 | ), 197 | # Missing file scenario - expect empty results 198 | ("missing_file", None), 199 | ], 200 | ) 201 | def test_eval_bag_results_scenarios(pytester, tmp_path, scenario): 202 | """Test eval_bag_results behavior with different results file states 203 | 204 | Parameters: 205 | scenario: Tuple of (scenario_name, file_content) where: 206 | - empty_file: Results file exists but is empty ({}) 207 | - valid_data: Results file exists with valid test data 208 | - missing_file: Results file does not exist (None) 209 | 210 | Each scenario should handle the case gracefully and provide appropriate results. 
211 | """ 212 | scenario_name, file_content = scenario 213 | out_dir = tmp_path / "test-out" 214 | out_dir.mkdir(parents=True) 215 | results_file = out_dir / "eval-results-raw.json" 216 | 217 | if file_content is not None: 218 | results_file.parent.mkdir(exist_ok=True) 219 | results_file.write_text(json.dumps(file_content)) 220 | 221 | pytester.makepyfile(f""" 222 | def test_results(eval_bag_results): 223 | if "{scenario_name}" == "empty_file": 224 | assert len(eval_bag_results) == 0 225 | elif "{scenario_name}" == "valid_data": 226 | assert len(eval_bag_results) == 1 227 | assert "test_1" in eval_bag_results 228 | assert eval_bag_results["test_1"]["eval_name"] == "sample_eval" 229 | else: # missing_file 230 | assert len(eval_bag_results) == 0 231 | """) 232 | 233 | result = pytester.runpytest(f"--out-path={out_dir}") 234 | result.assert_outcomes(passed=1) 235 | 236 | 237 | # Error handling and configuration tests 238 | def test_invalid_marker_combination(pytester): 239 | """Test that a test cannot have both eval and eval_analysis markers""" 240 | pytester.makepyfile(""" 241 | import pytest 242 | 243 | @pytest.mark.eval(name="test") 244 | @pytest.mark.eval_analysis(name="test") 245 | def test_invalid(): 246 | assert True 247 | """) 248 | 249 | result = pytester.runpytest("--run-eval") 250 | assert result.ret != 0 251 | 252 | 253 | def test_suppress_failed_exit_code_scenarios(pytester): 254 | """Test all scenarios related to suppressing failed exit codes""" 255 | pytester.makepyfile(""" 256 | import pytest 257 | 258 | @pytest.mark.eval(name="test_eval") 259 | def test_failing(): 260 | assert False 261 | 262 | @pytest.mark.eval(name="test_eval") 263 | def test_internal_error(): 264 | raise pytest.UsageError("Internal error") 265 | """) 266 | 267 | # Case 1: Without suppress flag - should fail with non-zero exit code 268 | result1 = pytester.runpytest("--run-eval") 269 | result1.assert_outcomes(failed=2) 270 | assert result1.ret != 0 271 | 272 | # Case 2: With 
suppress flag - expect zero exit code despite failures 273 | result2 = pytester.runpytest("--run-eval", "--supress-failed-exit-code") 274 | result2.assert_outcomes(failed=2) 275 | assert result2.ret == 0 276 | 277 | 278 | def test_xdist_eval_flags_unit(): 279 | """Unit test for xdist session with both eval and eval-analysis flags""" 280 | config = Mock() 281 | config.getoption.side_effect = lambda x: x in ["--run-eval", "--run-eval-analysis"] 282 | 283 | with patch.object(plugin, "is_xdist_session", return_value=True): 284 | with pytest.raises(ValueError, match="cannot be used together"): 285 | plugin.pytest_collection_modifyitems(config, []) 286 | 287 | 288 | def test_xdist_eval_flags_integration(pytester): 289 | """Integration test for xdist compatibility with eval flags 290 | 291 | Verifies that attempting to run eval and eval-analysis tests together 292 | in distributed mode raises appropriate error with explanation message 293 | """ 294 | pytester.makepyfile(""" 295 | import pytest 296 | 297 | @pytest.mark.eval(name="test") 298 | def test_eval(): 299 | assert True 300 | 301 | @pytest.mark.eval_analysis(name="test") 302 | def test_analysis(eval_results): 303 | assert True 304 | """) 305 | 306 | result = pytester.runpytest("--run-eval", "--run-eval-analysis", "-n", "2") 307 | assert result.ret != 0 308 | result.stdout.fnmatch_lines( 309 | "*evaluation analysis must run after the evaluation tests*" 310 | ) 311 | 312 | 313 | def test_marker_basic_cases(): 314 | """Test eval_analysis_marker validation logic 315 | 316 | Tests multiple marker scenarios: 317 | - Valid marker with name parameter 318 | - Invalid marker missing name parameter 319 | - No markers present 320 | - Other unrelated markers 321 | - Mixed markers (eval with other marks) 322 | 323 | Verifies proper marker validation and selection in each case. 
324 | """ 325 | # Valid marker 326 | valid = pytest.mark.eval_analysis(name="test") 327 | assert eval_analysis_marker([valid.mark]) == valid.mark 328 | 329 | # Missing name param 330 | invalid = pytest.mark.eval_analysis() 331 | with pytest.raises(ValueError, match="must have a 'name' argument"): 332 | eval_analysis_marker([invalid.mark]) 333 | 334 | # No markers 335 | assert eval_analysis_marker([]) is None 336 | 337 | # Other markers 338 | other = pytest.mark.skip(reason="skip") 339 | assert eval_analysis_marker([other.mark]) is None 340 | 341 | # Mixed markers 342 | mixed = [other.mark, valid.mark] 343 | assert eval_analysis_marker(mixed) == valid.mark 344 | 345 | 346 | def test_eval_analysis_marker_selection(pytester): 347 | """Test marker skipping behavior""" 348 | pytester.makepyfile(""" 349 | import pytest 350 | 351 | @pytest.mark.eval(name="test") 352 | def test_eval(): 353 | pass 354 | 355 | @pytest.mark.eval_analysis(name="test") 356 | def test_analysis(): 357 | pass 358 | """) 359 | 360 | # Test with run_eval 361 | result1 = pytester.runpytest("--run-eval") 362 | result1.assert_outcomes(skipped=1, passed=1) 363 | 364 | # Test with run_eval_analysis 365 | result2 = pytester.runpytest("--run-eval-analysis") 366 | result2.assert_outcomes(skipped=1, passed=1) 367 | 368 | 369 | def test_worker_session_finish(pytestconfig): 370 | """Test worker session finish handling""" 371 | 372 | class WorkerSession: 373 | class Config: 374 | workerinput = {} 375 | 376 | def getoption(self, *args, **kwargs): 377 | return False 378 | 379 | config = Config() 380 | exitstatus = 0 381 | 382 | assert plugin.pytest_sessionfinish(WorkerSession()) is None 383 | 384 | 385 | def test_save_evals_csv_option(pytester, tmp_path): 386 | """Test the --save-evals-csv option with various scenarios""" 387 | out_dir = tmp_path / "test-output" 388 | out_dir.mkdir(exist_ok=True) 389 | 390 | # Create test file with evaluation test 391 | pytester.makepyfile(""" 392 | import pytest 393 | 394 | 
@pytest.mark.eval(name="test_eval") 395 | def test_simple(eval_bag): 396 | eval_bag.result = "test_value" 397 | eval_bag.metadata = {"key": "value"} 398 | assert True 399 | """) 400 | 401 | result1 = pytester.runpytest(f"--out-path={out_dir}", "--save-evals-csv") 402 | assert result1.ret != 0 403 | result1.stderr.fnmatch_lines( 404 | "*--save-evals-csv option can only be used with the --run-eval option*" 405 | ) 406 | 407 | # Case 2: Test with both flags and verify CSV creation 408 | result2 = pytester.runpytest( 409 | "--run-eval", f"--out-path={out_dir}", "--save-evals-csv", "-v" 410 | ) 411 | result2.assert_outcomes(passed=1) 412 | 413 | # Verify both JSON and CSV files exist 414 | csv_file = out_dir / "eval-results-raw.csv" 415 | json_file = out_dir / "eval-results-raw.json" 416 | assert csv_file.exists() 417 | assert json_file.exists() 418 | 419 | 420 | def test_save_evals_csv_missing_pandas(pytester, tmp_path, monkeypatch): 421 | """Test handling of missing pandas when --save-evals-csv is used""" 422 | out_dir = tmp_path / "test-output" 423 | out_dir.mkdir(exist_ok=True) 424 | 425 | # Mock pandas to raise ImportError 426 | import sys 427 | 428 | with patch.dict(sys.modules, {"pandas": None}): 429 | pytester.makepyfile(""" 430 | import pytest 431 | 432 | @pytest.mark.eval(name="test_eval") 433 | def test_simple(): 434 | assert True 435 | """) 436 | 437 | result = pytester.runpytest( 438 | "--run-eval", f"--out-path={out_dir}", "--save-evals-csv" 439 | ) 440 | assert result.ret != 0 441 | result.stderr.fnmatch_lines( 442 | "*The --save-evals-csv option requires the pandas library*" 443 | ) 444 | 445 | 446 | def test_csv_data_normalization(pytester, tmp_path): 447 | """Test that complex data structures are properly normalized in CSV output""" 448 | out_dir = tmp_path / "test-output" 449 | out_dir.mkdir(exist_ok=True) 450 | 451 | pytester.makepyfile(""" 452 | import pytest 453 | from datetime import datetime 454 | 455 | TEST_DATA = [ 456 | {"input": "test1", 
"expected": True}, 457 | {"input": "test2", "expected": False} 458 | ] 459 | 460 | @pytest.mark.eval(name="test_eval") 461 | @pytest.mark.parametrize("case", TEST_DATA) 462 | def test_complex_data(eval_bag, case): 463 | eval_bag.nested_data = { 464 | "list": [1, 2, 3], 465 | "dict": {"a": 1, "b": 2}, 466 | "date": str(datetime.now()), 467 | "case": case 468 | } 469 | assert True 470 | """) 471 | 472 | result = pytester.runpytest( 473 | "--run-eval", f"--out-path={out_dir}", "--save-evals-csv", "-v" 474 | ) 475 | result.assert_outcomes(passed=2) # Two test cases due to parametrize 476 | 477 | # Verify CSV was created with normalized data 478 | csv_file = out_dir / "eval-results-raw.csv" 479 | assert csv_file.exists() 480 | 481 | # Read the CSV content to verify structure (if pandas is available) 482 | try: 483 | import pandas as pd 484 | 485 | df = pd.read_csv(csv_file) 486 | assert not df.empty 487 | assert "eval_bag.nested_data.date" in df.columns 488 | except ImportError: 489 | pass # Skip detailed verification if pandas isn't available 490 | -------------------------------------------------------------------------------- /example/example_notebook_advanced.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "a13d6843ef4301", 6 | "metadata": {}, 7 | "source": [ 8 | "# Advanced LLM Evaluation with pytest-evals\n", 9 | "\n", 10 | "This notebook demonstrates advanced techniques for evaluating LLM performance:\n", 11 | "1. Running parallel evaluations across multiple models\n", 12 | "2. Tracking and comparing results across different runs\n", 13 | "3. Visualizing performance trends and model comparisons\n", 14 | "4. Statistical analysis of model performance\n", 15 | "\n", 16 | "## Setup\n", 17 | "First, let's load required extensions and configure our environment." 
18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "id": "cefdc9561d3ab77a", 23 | "metadata": { 24 | "ExecuteTime": { 25 | "end_time": "2025-01-21T08:33:47.364267Z", 26 | "start_time": "2025-01-21T08:33:47.278694Z" 27 | } 28 | }, 29 | "source": [ 30 | "%load_ext pytest_evals" 31 | ], 32 | "outputs": [], 33 | "execution_count": 1 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "id": "80ea828ddde1c26a", 38 | "metadata": {}, 39 | "source": [ 40 | "## Model Implementation\n", 41 | "\n", 42 | "Define our classifier that leverages different LLM models to determine if text is computer-related." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "id": "6e55ed323e3cc980", 48 | "metadata": { 49 | "ExecuteTime": { 50 | "end_time": "2025-01-21T08:33:49.412049Z", 51 | "start_time": "2025-01-21T08:33:49.211224Z" 52 | } 53 | }, 54 | "source": [ 55 | "import openai\n", 56 | "\n", 57 | "\n", 58 | "def classify(text: str, model=\"gpt-4o-mini\") -> bool:\n", 59 | " \"\"\"Classify text as computer-related or not using specified LLM model.\n", 60 | "\n", 61 | " Args:\n", 62 | " text (str): Input text to classify\n", 63 | " model (str): Model identifier (e.g., \"gpt-4o\", \"gpt-4o-mini\")\n", 64 | "\n", 65 | " Returns:\n", 66 | " bool: True if text is computer-related, False otherwise\n", 67 | " \"\"\"\n", 68 | " resp = openai.chat.completions.create(\n", 69 | " model=model,\n", 70 | " messages=[\n", 71 | " {\n", 72 | " \"role\": \"system\",\n", 73 | " \"content\": \"Is this text about a computer-related subject? 
\"\n", 74 | " \"Reply ONLY with either true or false.\",\n", 75 | " },\n", 76 | " {\"role\": \"user\", \"content\": text},\n", 77 | " ],\n", 78 | " )\n", 79 | " return resp.choices[0].message.content.lower() == \"true\"" 80 | ], 81 | "outputs": [], 82 | "execution_count": 2 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "id": "51c2d42c6a10f861", 87 | "metadata": {}, 88 | "source": [ 89 | "## Test Data and Results Tracking\n", 90 | "\n", 91 | "Set up our test cases and initialize our experiment tracking DataFrame." 92 | ] 93 | }, 94 | { 95 | "metadata": { 96 | "ExecuteTime": { 97 | "end_time": "2025-01-21T08:33:50.609927Z", 98 | "start_time": "2025-01-21T08:33:50.605050Z" 99 | } 100 | }, 101 | "cell_type": "code", 102 | "source": [ 103 | "import pandas as pd\n", 104 | "\n", 105 | "# Define test cases\n", 106 | "TEST_DATA = [\n", 107 | " {\"text\": \"I need to debug this Python code\", \"label\": True},\n", 108 | " {\"text\": \"The cat jumped over the lazy dog\", \"label\": False},\n", 109 | " {\"text\": \"My monitor keeps flickering\", \"label\": True},\n", 110 | " {\"text\": \"The weather is nice today\", \"label\": False},\n", 111 | " {\"text\": \"Updating system drivers fixed the issue\", \"label\": True},\n", 112 | " {\"text\": \"the new llama can understand bizzare nuanced slang\", \"label\": True},\n", 113 | "]\n", 114 | "\n", 115 | "# Initialize experiment tracking\n", 116 | "experiments_df = pd.DataFrame()" 117 | ], 118 | "id": "58c440f913c3d97", 119 | "outputs": [], 120 | "execution_count": 3 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "id": "959af2e2c4f8670", 125 | "metadata": {}, 126 | "source": [ 127 | "## Run Model Evaluations\n", 128 | "\n", 129 | "Execute evaluations across different models and collect detailed metrics." 
130 | ] 131 | }, 132 | { 133 | "metadata": { 134 | "ExecuteTime": { 135 | "end_time": "2025-01-21T08:34:02.226687Z", 136 | "start_time": "2025-01-21T08:33:52.406707Z" 137 | } 138 | }, 139 | "cell_type": "code", 140 | "source": [ 141 | "%%ipytest_evals --run-eval\n", 142 | "import pytest\n", 143 | "from collections import defaultdict\n", 144 | "\n", 145 | "@pytest.mark.eval(name=\"computer_classifier\")\n", 146 | "@pytest.mark.parametrize(\"case\", TEST_DATA)\n", 147 | "@pytest.mark.parametrize(\"model\", [\"gpt-4o\", \"gpt-4o-mini\", \"gpt-3.5-turbo\"])\n", 148 | "def test_classifier(case: dict, model, eval_bag):\n", 149 | " \"\"\"Test individual classification cases across different models.\n", 150 | " \n", 151 | " Args:\n", 152 | " case (dict): Test case with text and expected label\n", 153 | " eval_bag: Container for test results\n", 154 | " model: Model identifier\n", 155 | " \"\"\"\n", 156 | " eval_bag.input_text = case[\"text\"]\n", 157 | " eval_bag.label = case[\"label\"]\n", 158 | " eval_bag.prediction = classify(case[\"text\"], model)\n", 159 | " eval_bag.precision = 1 if eval_bag.prediction == eval_bag.label else 0\n", 160 | "\n", 161 | " print(f\"Model: {model}\")\n", 162 | " print(f\"Input: {eval_bag.input_text}\")\n", 163 | " print(f\"Expected: {eval_bag.label}, Predicted: {eval_bag.prediction}\\n\")\n", 164 | "\n", 165 | " assert eval_bag.prediction == eval_bag.label" 166 | ], 167 | "id": "4c8575999b45c4b0", 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "\n", 174 | "t_4373bcce5bc844c49ab1aee5ce828d18.py::test_classifier[gpt-4o-case0] Model: gpt-4o\n", 175 | "Input: I need to debug this Python code\n", 176 | "Expected: True, Predicted: True\n", 177 | "\n", 178 | "\u001B[32mPASSED\u001B[0m\n", 179 | "t_4373bcce5bc844c49ab1aee5ce828d18.py::test_classifier[gpt-4o-case1] Model: gpt-4o\n", 180 | "Input: The cat jumped over the lazy dog\n", 181 | "Expected: False, Predicted: False\n", 182 | "\n", 183 
| "\u001B[32mPASSED\u001B[0m\n", 184 | "t_4373bcce5bc844c49ab1aee5ce828d18.py::test_classifier[gpt-4o-case2] Model: gpt-4o\n", 185 | "Input: My monitor keeps flickering\n", 186 | "Expected: True, Predicted: True\n", 187 | "\n", 188 | "\u001B[32mPASSED\u001B[0m\n", 189 | "t_4373bcce5bc844c49ab1aee5ce828d18.py::test_classifier[gpt-4o-case3] Model: gpt-4o\n", 190 | "Input: The weather is nice today\n", 191 | "Expected: False, Predicted: False\n", 192 | "\n", 193 | "\u001B[32mPASSED\u001B[0m\n", 194 | "t_4373bcce5bc844c49ab1aee5ce828d18.py::test_classifier[gpt-4o-case4] Model: gpt-4o\n", 195 | "Input: Updating system drivers fixed the issue\n", 196 | "Expected: True, Predicted: True\n", 197 | "\n", 198 | "\u001B[32mPASSED\u001B[0m\n", 199 | "t_4373bcce5bc844c49ab1aee5ce828d18.py::test_classifier[gpt-4o-case5] Model: gpt-4o\n", 200 | "Input: the new llama can understand bizzare nuanced slang\n", 201 | "Expected: True, Predicted: True\n", 202 | "\n", 203 | "\u001B[32mPASSED\u001B[0m\n", 204 | "t_4373bcce5bc844c49ab1aee5ce828d18.py::test_classifier[gpt-4o-mini-case0] Model: gpt-4o-mini\n", 205 | "Input: I need to debug this Python code\n", 206 | "Expected: True, Predicted: True\n", 207 | "\n", 208 | "\u001B[32mPASSED\u001B[0m\n", 209 | "t_4373bcce5bc844c49ab1aee5ce828d18.py::test_classifier[gpt-4o-mini-case1] Model: gpt-4o-mini\n", 210 | "Input: The cat jumped over the lazy dog\n", 211 | "Expected: False, Predicted: False\n", 212 | "\n", 213 | "\u001B[32mPASSED\u001B[0m\n", 214 | "t_4373bcce5bc844c49ab1aee5ce828d18.py::test_classifier[gpt-4o-mini-case2] Model: gpt-4o-mini\n", 215 | "Input: My monitor keeps flickering\n", 216 | "Expected: True, Predicted: True\n", 217 | "\n", 218 | "\u001B[32mPASSED\u001B[0m\n", 219 | "t_4373bcce5bc844c49ab1aee5ce828d18.py::test_classifier[gpt-4o-mini-case3] Model: gpt-4o-mini\n", 220 | "Input: The weather is nice today\n", 221 | "Expected: False, Predicted: False\n", 222 | "\n", 223 | "\u001B[32mPASSED\u001B[0m\n", 224 | 
"t_4373bcce5bc844c49ab1aee5ce828d18.py::test_classifier[gpt-4o-mini-case4] Model: gpt-4o-mini\n", 225 | "Input: Updating system drivers fixed the issue\n", 226 | "Expected: True, Predicted: True\n", 227 | "\n", 228 | "\u001B[32mPASSED\u001B[0m\n", 229 | "t_4373bcce5bc844c49ab1aee5ce828d18.py::test_classifier[gpt-4o-mini-case5] Model: gpt-4o-mini\n", 230 | "Input: the new llama can understand bizzare nuanced slang\n", 231 | "Expected: True, Predicted: False\n", 232 | "\n", 233 | "\u001B[31mFAILED\u001B[0m\n", 234 | "t_4373bcce5bc844c49ab1aee5ce828d18.py::test_classifier[gpt-3.5-turbo-case0] Model: gpt-3.5-turbo\n", 235 | "Input: I need to debug this Python code\n", 236 | "Expected: True, Predicted: True\n", 237 | "\n", 238 | "\u001B[32mPASSED\u001B[0m\n", 239 | "t_4373bcce5bc844c49ab1aee5ce828d18.py::test_classifier[gpt-3.5-turbo-case1] Model: gpt-3.5-turbo\n", 240 | "Input: The cat jumped over the lazy dog\n", 241 | "Expected: False, Predicted: False\n", 242 | "\n", 243 | "\u001B[32mPASSED\u001B[0m\n", 244 | "t_4373bcce5bc844c49ab1aee5ce828d18.py::test_classifier[gpt-3.5-turbo-case2] Model: gpt-3.5-turbo\n", 245 | "Input: My monitor keeps flickering\n", 246 | "Expected: True, Predicted: False\n", 247 | "\n", 248 | "\u001B[31mFAILED\u001B[0m\n", 249 | "t_4373bcce5bc844c49ab1aee5ce828d18.py::test_classifier[gpt-3.5-turbo-case3] Model: gpt-3.5-turbo\n", 250 | "Input: The weather is nice today\n", 251 | "Expected: False, Predicted: False\n", 252 | "\n", 253 | "\u001B[32mPASSED\u001B[0m\n", 254 | "t_4373bcce5bc844c49ab1aee5ce828d18.py::test_classifier[gpt-3.5-turbo-case4] Model: gpt-3.5-turbo\n", 255 | "Input: Updating system drivers fixed the issue\n", 256 | "Expected: True, Predicted: True\n", 257 | "\n", 258 | "\u001B[32mPASSED\u001B[0m\n", 259 | "t_4373bcce5bc844c49ab1aee5ce828d18.py::test_classifier[gpt-3.5-turbo-case5] Model: gpt-3.5-turbo\n", 260 | "Input: the new llama can understand bizzare nuanced slang\n", 261 | "Expected: True, Predicted: False\n", 262 | 
"\n", 263 | "\u001B[31mFAILED\u001B[0m\n", 264 | "\n", 265 | "============================================= FAILURES =============================================\n", 266 | "\u001B[31m\u001B[1m________________________________ test_classifier[gpt-4o-mini-case5] ________________________________\u001B[0m\n", 267 | "\n", 268 | "case = {'label': True, 'text': 'the new llama can understand bizzare nuanced slang'}\n", 269 | "model = 'gpt-4o-mini'\n", 270 | "eval_bag = ResultsBag:\n", 271 | "{'input_text': 'the new llama can understand bizzare nuanced slang', 'label': True, 'prediction': False, 'precision': 0}\n", 272 | "\n", 273 | " \u001B[0m\u001B[37m@pytest\u001B[39;49;00m.mark.eval(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mcomputer_classifier\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m)\u001B[90m\u001B[39;49;00m\n", 274 | " \u001B[37m@pytest\u001B[39;49;00m.mark.parametrize(\u001B[33m\"\u001B[39;49;00m\u001B[33mcase\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, TEST_DATA)\u001B[90m\u001B[39;49;00m\n", 275 | " \u001B[37m@pytest\u001B[39;49;00m.mark.parametrize(\u001B[33m\"\u001B[39;49;00m\u001B[33mmodel\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, [\u001B[33m\"\u001B[39;49;00m\u001B[33mgpt-4o\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, \u001B[33m\"\u001B[39;49;00m\u001B[33mgpt-4o-mini\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, \u001B[33m\"\u001B[39;49;00m\u001B[33mgpt-3.5-turbo\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m])\u001B[90m\u001B[39;49;00m\n", 276 | " \u001B[94mdef\u001B[39;49;00m\u001B[90m \u001B[39;49;00m\u001B[92mtest_classifier\u001B[39;49;00m(case: \u001B[96mdict\u001B[39;49;00m, model, eval_bag):\u001B[90m\u001B[39;49;00m\n", 277 | " \u001B[90m \u001B[39;49;00m\u001B[33m\"\"\"Test individual classification cases across different models.\u001B[39;49;00m\n", 278 | " \u001B[33m\u001B[39;49;00m\n", 279 | " \u001B[33m Args:\u001B[39;49;00m\n", 280 | " \u001B[33m case (dict): Test case with text and expected label\u001B[39;49;00m\n", 281 | " \u001B[33m 
eval_bag: Container for test results\u001B[39;49;00m\n", 282 | " \u001B[33m classifier: Classification function\u001B[39;49;00m\n", 283 | " \u001B[33m model: Model identifier\u001B[39;49;00m\n", 284 | " \u001B[33m \"\"\"\u001B[39;49;00m\u001B[90m\u001B[39;49;00m\n", 285 | " eval_bag.input_text = case[\u001B[33m\"\u001B[39;49;00m\u001B[33mtext\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m]\u001B[90m\u001B[39;49;00m\n", 286 | " eval_bag.label = case[\u001B[33m\"\u001B[39;49;00m\u001B[33mlabel\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m]\u001B[90m\u001B[39;49;00m\n", 287 | " eval_bag.prediction = classify(case[\u001B[33m\"\u001B[39;49;00m\u001B[33mtext\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m], model)\u001B[90m\u001B[39;49;00m\n", 288 | " eval_bag.precision = \u001B[94m1\u001B[39;49;00m \u001B[94mif\u001B[39;49;00m eval_bag.prediction == eval_bag.label \u001B[94melse\u001B[39;49;00m \u001B[94m0\u001B[39;49;00m\u001B[90m\u001B[39;49;00m\n", 289 | " \u001B[90m\u001B[39;49;00m\n", 290 | " \u001B[96mprint\u001B[39;49;00m(\u001B[33mf\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m\u001B[33mModel: \u001B[39;49;00m\u001B[33m{\u001B[39;49;00mmodel\u001B[33m}\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m)\u001B[90m\u001B[39;49;00m\n", 291 | " \u001B[96mprint\u001B[39;49;00m(\u001B[33mf\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m\u001B[33mInput: \u001B[39;49;00m\u001B[33m{\u001B[39;49;00meval_bag.input_text\u001B[33m}\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m)\u001B[90m\u001B[39;49;00m\n", 292 | " \u001B[96mprint\u001B[39;49;00m(\u001B[33mf\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m\u001B[33mExpected: \u001B[39;49;00m\u001B[33m{\u001B[39;49;00meval_bag.label\u001B[33m}\u001B[39;49;00m\u001B[33m, Predicted: \u001B[39;49;00m\u001B[33m{\u001B[39;49;00meval_bag.prediction\u001B[33m}\u001B[39;49;00m\u001B[33m\\n\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m)\u001B[90m\u001B[39;49;00m\n", 293 | " \u001B[90m\u001B[39;49;00m\n", 294 | "> \u001B[94massert\u001B[39;49;00m eval_bag.prediction 
== eval_bag.label\u001B[90m\u001B[39;49;00m\n", 295 | "\u001B[1m\u001B[31mE AssertionError: assert False == True\u001B[0m\n", 296 | "\u001B[1m\u001B[31mE + where False = ResultsBag:\\n{'input_text': 'the new llama can understand bizzare nuanced slang', 'label': True, 'prediction': False, 'precision': 0}.prediction\u001B[0m\n", 297 | "\u001B[1m\u001B[31mE + and True = ResultsBag:\\n{'input_text': 'the new llama can understand bizzare nuanced slang', 'label': True, 'prediction': False, 'precision': 0}.label\u001B[0m\n", 298 | "\n", 299 | "\u001B[1m\u001B[31m/var/folders/cn/zpwdtbhd7ylgt032s2t3tpdw0000gn/T/ipykernel_2511/2061077205.py\u001B[0m:25: AssertionError\n", 300 | "\u001B[31m\u001B[1m_______________________________ test_classifier[gpt-3.5-turbo-case2] _______________________________\u001B[0m\n", 301 | "\n", 302 | "case = {'label': True, 'text': 'My monitor keeps flickering'}, model = 'gpt-3.5-turbo'\n", 303 | "eval_bag = ResultsBag:\n", 304 | "{'input_text': 'My monitor keeps flickering', 'label': True, 'prediction': False, 'precision': 0}\n", 305 | "\n", 306 | " \u001B[0m\u001B[37m@pytest\u001B[39;49;00m.mark.eval(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mcomputer_classifier\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m)\u001B[90m\u001B[39;49;00m\n", 307 | " \u001B[37m@pytest\u001B[39;49;00m.mark.parametrize(\u001B[33m\"\u001B[39;49;00m\u001B[33mcase\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, TEST_DATA)\u001B[90m\u001B[39;49;00m\n", 308 | " \u001B[37m@pytest\u001B[39;49;00m.mark.parametrize(\u001B[33m\"\u001B[39;49;00m\u001B[33mmodel\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, [\u001B[33m\"\u001B[39;49;00m\u001B[33mgpt-4o\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, \u001B[33m\"\u001B[39;49;00m\u001B[33mgpt-4o-mini\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, \u001B[33m\"\u001B[39;49;00m\u001B[33mgpt-3.5-turbo\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m])\u001B[90m\u001B[39;49;00m\n", 309 | " \u001B[94mdef\u001B[39;49;00m\u001B[90m 
\u001B[39;49;00m\u001B[92mtest_classifier\u001B[39;49;00m(case: \u001B[96mdict\u001B[39;49;00m, model, eval_bag):\u001B[90m\u001B[39;49;00m\n", 310 | " \u001B[90m \u001B[39;49;00m\u001B[33m\"\"\"Test individual classification cases across different models.\u001B[39;49;00m\n", 311 | " \u001B[33m\u001B[39;49;00m\n", 312 | " \u001B[33m Args:\u001B[39;49;00m\n", 313 | " \u001B[33m case (dict): Test case with text and expected label\u001B[39;49;00m\n", 314 | " \u001B[33m eval_bag: Container for test results\u001B[39;49;00m\n", 315 | " \u001B[33m classifier: Classification function\u001B[39;49;00m\n", 316 | " \u001B[33m model: Model identifier\u001B[39;49;00m\n", 317 | " \u001B[33m \"\"\"\u001B[39;49;00m\u001B[90m\u001B[39;49;00m\n", 318 | " eval_bag.input_text = case[\u001B[33m\"\u001B[39;49;00m\u001B[33mtext\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m]\u001B[90m\u001B[39;49;00m\n", 319 | " eval_bag.label = case[\u001B[33m\"\u001B[39;49;00m\u001B[33mlabel\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m]\u001B[90m\u001B[39;49;00m\n", 320 | " eval_bag.prediction = classify(case[\u001B[33m\"\u001B[39;49;00m\u001B[33mtext\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m], model)\u001B[90m\u001B[39;49;00m\n", 321 | " eval_bag.precision = \u001B[94m1\u001B[39;49;00m \u001B[94mif\u001B[39;49;00m eval_bag.prediction == eval_bag.label \u001B[94melse\u001B[39;49;00m \u001B[94m0\u001B[39;49;00m\u001B[90m\u001B[39;49;00m\n", 322 | " \u001B[90m\u001B[39;49;00m\n", 323 | " \u001B[96mprint\u001B[39;49;00m(\u001B[33mf\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m\u001B[33mModel: \u001B[39;49;00m\u001B[33m{\u001B[39;49;00mmodel\u001B[33m}\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m)\u001B[90m\u001B[39;49;00m\n", 324 | " \u001B[96mprint\u001B[39;49;00m(\u001B[33mf\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m\u001B[33mInput: \u001B[39;49;00m\u001B[33m{\u001B[39;49;00meval_bag.input_text\u001B[33m}\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m)\u001B[90m\u001B[39;49;00m\n", 325 | " 
\u001B[96mprint\u001B[39;49;00m(\u001B[33mf\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m\u001B[33mExpected: \u001B[39;49;00m\u001B[33m{\u001B[39;49;00meval_bag.label\u001B[33m}\u001B[39;49;00m\u001B[33m, Predicted: \u001B[39;49;00m\u001B[33m{\u001B[39;49;00meval_bag.prediction\u001B[33m}\u001B[39;49;00m\u001B[33m\\n\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m)\u001B[90m\u001B[39;49;00m\n", 326 | " \u001B[90m\u001B[39;49;00m\n", 327 | "> \u001B[94massert\u001B[39;49;00m eval_bag.prediction == eval_bag.label\u001B[90m\u001B[39;49;00m\n", 328 | "\u001B[1m\u001B[31mE AssertionError: assert False == True\u001B[0m\n", 329 | "\u001B[1m\u001B[31mE + where False = ResultsBag:\\n{'input_text': 'My monitor keeps flickering', 'label': True, 'prediction': False, 'precision': 0}.prediction\u001B[0m\n", 330 | "\u001B[1m\u001B[31mE + and True = ResultsBag:\\n{'input_text': 'My monitor keeps flickering', 'label': True, 'prediction': False, 'precision': 0}.label\u001B[0m\n", 331 | "\n", 332 | "\u001B[1m\u001B[31m/var/folders/cn/zpwdtbhd7ylgt032s2t3tpdw0000gn/T/ipykernel_2511/2061077205.py\u001B[0m:25: AssertionError\n", 333 | "\u001B[31m\u001B[1m_______________________________ test_classifier[gpt-3.5-turbo-case5] _______________________________\u001B[0m\n", 334 | "\n", 335 | "case = {'label': True, 'text': 'the new llama can understand bizzare nuanced slang'}\n", 336 | "model = 'gpt-3.5-turbo'\n", 337 | "eval_bag = ResultsBag:\n", 338 | "{'input_text': 'the new llama can understand bizzare nuanced slang', 'label': True, 'prediction': False, 'precision': 0}\n", 339 | "\n", 340 | " \u001B[0m\u001B[37m@pytest\u001B[39;49;00m.mark.eval(name=\u001B[33m\"\u001B[39;49;00m\u001B[33mcomputer_classifier\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m)\u001B[90m\u001B[39;49;00m\n", 341 | " \u001B[37m@pytest\u001B[39;49;00m.mark.parametrize(\u001B[33m\"\u001B[39;49;00m\u001B[33mcase\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, TEST_DATA)\u001B[90m\u001B[39;49;00m\n", 342 | " 
\u001B[37m@pytest\u001B[39;49;00m.mark.parametrize(\u001B[33m\"\u001B[39;49;00m\u001B[33mmodel\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, [\u001B[33m\"\u001B[39;49;00m\u001B[33mgpt-4o\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, \u001B[33m\"\u001B[39;49;00m\u001B[33mgpt-4o-mini\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m, \u001B[33m\"\u001B[39;49;00m\u001B[33mgpt-3.5-turbo\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m])\u001B[90m\u001B[39;49;00m\n", 343 | " \u001B[94mdef\u001B[39;49;00m\u001B[90m \u001B[39;49;00m\u001B[92mtest_classifier\u001B[39;49;00m(case: \u001B[96mdict\u001B[39;49;00m, model, eval_bag):\u001B[90m\u001B[39;49;00m\n", 344 | " \u001B[90m \u001B[39;49;00m\u001B[33m\"\"\"Test individual classification cases across different models.\u001B[39;49;00m\n", 345 | " \u001B[33m\u001B[39;49;00m\n", 346 | " \u001B[33m Args:\u001B[39;49;00m\n", 347 | " \u001B[33m case (dict): Test case with text and expected label\u001B[39;49;00m\n", 348 | " \u001B[33m eval_bag: Container for test results\u001B[39;49;00m\n", 349 | " \u001B[33m classifier: Classification function\u001B[39;49;00m\n", 350 | " \u001B[33m model: Model identifier\u001B[39;49;00m\n", 351 | " \u001B[33m \"\"\"\u001B[39;49;00m\u001B[90m\u001B[39;49;00m\n", 352 | " eval_bag.input_text = case[\u001B[33m\"\u001B[39;49;00m\u001B[33mtext\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m]\u001B[90m\u001B[39;49;00m\n", 353 | " eval_bag.label = case[\u001B[33m\"\u001B[39;49;00m\u001B[33mlabel\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m]\u001B[90m\u001B[39;49;00m\n", 354 | " eval_bag.prediction = classify(case[\u001B[33m\"\u001B[39;49;00m\u001B[33mtext\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m], model)\u001B[90m\u001B[39;49;00m\n", 355 | " eval_bag.precision = \u001B[94m1\u001B[39;49;00m \u001B[94mif\u001B[39;49;00m eval_bag.prediction == eval_bag.label \u001B[94melse\u001B[39;49;00m \u001B[94m0\u001B[39;49;00m\u001B[90m\u001B[39;49;00m\n", 356 | " \u001B[90m\u001B[39;49;00m\n", 357 | " 
\u001B[96mprint\u001B[39;49;00m(\u001B[33mf\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m\u001B[33mModel: \u001B[39;49;00m\u001B[33m{\u001B[39;49;00mmodel\u001B[33m}\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m)\u001B[90m\u001B[39;49;00m\n", 358 | " \u001B[96mprint\u001B[39;49;00m(\u001B[33mf\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m\u001B[33mInput: \u001B[39;49;00m\u001B[33m{\u001B[39;49;00meval_bag.input_text\u001B[33m}\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m)\u001B[90m\u001B[39;49;00m\n", 359 | " \u001B[96mprint\u001B[39;49;00m(\u001B[33mf\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m\u001B[33mExpected: \u001B[39;49;00m\u001B[33m{\u001B[39;49;00meval_bag.label\u001B[33m}\u001B[39;49;00m\u001B[33m, Predicted: \u001B[39;49;00m\u001B[33m{\u001B[39;49;00meval_bag.prediction\u001B[33m}\u001B[39;49;00m\u001B[33m\\n\u001B[39;49;00m\u001B[33m\"\u001B[39;49;00m)\u001B[90m\u001B[39;49;00m\n", 360 | " \u001B[90m\u001B[39;49;00m\n", 361 | "> \u001B[94massert\u001B[39;49;00m eval_bag.prediction == eval_bag.label\u001B[90m\u001B[39;49;00m\n", 362 | "\u001B[1m\u001B[31mE AssertionError: assert False == True\u001B[0m\n", 363 | "\u001B[1m\u001B[31mE + where False = ResultsBag:\\n{'input_text': 'the new llama can understand bizzare nuanced slang', 'label': True, 'prediction': False, 'precision': 0}.prediction\u001B[0m\n", 364 | "\u001B[1m\u001B[31mE + and True = ResultsBag:\\n{'input_text': 'the new llama can understand bizzare nuanced slang', 'label': True, 'prediction': False, 'precision': 0}.label\u001B[0m\n", 365 | "\n", 366 | "\u001B[1m\u001B[31m/var/folders/cn/zpwdtbhd7ylgt032s2t3tpdw0000gn/T/ipykernel_2511/2061077205.py\u001B[0m:25: AssertionError\n", 367 | "\u001B[36m\u001B[1m===================================== short test summary info ======================================\u001B[0m\n", 368 | "\u001B[31mFAILED\u001B[0m t_4373bcce5bc844c49ab1aee5ce828d18.py::\u001B[1mtest_classifier[gpt-4o-mini-case5]\u001B[0m - AssertionError: assert False == True\n", 369 | 
"\u001B[31mFAILED\u001B[0m t_4373bcce5bc844c49ab1aee5ce828d18.py::\u001B[1mtest_classifier[gpt-3.5-turbo-case2]\u001B[0m - AssertionError: assert False == True\n", 370 | "\u001B[31mFAILED\u001B[0m t_4373bcce5bc844c49ab1aee5ce828d18.py::\u001B[1mtest_classifier[gpt-3.5-turbo-case5]\u001B[0m - AssertionError: assert False == True\n", 371 | "\u001B[31m=================================== \u001B[31m\u001B[1m3 failed\u001B[0m, \u001B[32m15 passed\u001B[0m\u001B[31m in 9.70s\u001B[0m\u001B[31m ===================================\u001B[0m\n" 372 | ] 373 | } 374 | ], 375 | "execution_count": 4 376 | }, 377 | { 378 | "metadata": { 379 | "ExecuteTime": { 380 | "end_time": "2025-01-21T08:34:02.276647Z", 381 | "start_time": "2025-01-21T08:34:02.250710Z" 382 | } 383 | }, 384 | "cell_type": "code", 385 | "source": [ 386 | "%%ipytest_evals --run-eval-analysis\n", 387 | "@pytest.mark.eval_analysis(name=\"computer_classifier\")\n", 388 | "def test_analysis(eval_results):\n", 389 | " \"\"\"Analyze results across all models and compute detailed metrics.\n", 390 | "\n", 391 | " Args:\n", 392 | " eval_results: Collection of all test results\n", 393 | " \"\"\"\n", 394 | " # Group results by model\n", 395 | " res = defaultdict(list)\n", 396 | " for r in eval_results:\n", 397 | " res[r.test_params['model']].append(r)\n", 398 | "\n", 399 | " global experiments_df\n", 400 | "\n", 401 | " # Calculate metrics for each model\n", 402 | " for model, results in res.items():\n", 403 | " tp = sum(1 for r in results if r.result.prediction == r.result.label and r.result.label)\n", 404 | " fp = sum(1 for r in results if r.result.prediction != r.result.label and not r.result.label)\n", 405 | " tn = sum(1 for r in results if r.result.prediction == r.result.label and not r.result.label)\n", 406 | " fn = sum(1 for r in results if r.result.prediction != r.result.label and r.result.label)\n", 407 | "\n", 408 | " # Calculate metrics\n", 409 | " precision = tp / (tp + fp) if (tp + fp) > 0 else 0\n", 410 | " 
recall = tp / (tp + fn) if (tp + fn) > 0 else 0\n", 411 | " f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0\n", 412 | " accuracy = (tp + tn) / len(results)\n", 413 | "\n", 414 | " experiments_df = pd.concat([experiments_df, pd.DataFrame([{\n", 415 | " 'timestamp': pd.Timestamp.now(),\n", 416 | " 'model': model,\n", 417 | " 'accuracy': accuracy,\n", 418 | " 'precision': precision,\n", 419 | " 'recall': recall,\n", 420 | " 'f1_score': f1,\n", 421 | " 'true_positives': tp,\n", 422 | " 'false_positives': fp,\n", 423 | " 'true_negatives': tn,\n", 424 | " 'false_negatives': fn,\n", 425 | " 'total_samples': len(results),\n", 426 | " }])], ignore_index=True)\n", 427 | "\n", 428 | " assert any(model_results['accuracy'].iloc[-1] >= 0.7\n", 429 | " for _, model_results in experiments_df.groupby('model'))" 430 | ], 431 | "id": "ca31a4d09acc7dcd", 432 | "outputs": [ 433 | { 434 | "name": "stdout", 435 | "output_type": "stream", 436 | "text": [ 437 | "\n", 438 | "t_4373bcce5bc844c49ab1aee5ce828d18.py::test_analysis \u001B[32mPASSED\u001B[0m\n", 439 | "\n", 440 | "\u001B[32m======================================== \u001B[32m\u001B[1m1 passed\u001B[0m\u001B[32m in 0.01s\u001B[0m\u001B[32m =========================================\u001B[0m\n" 441 | ] 442 | } 443 | ], 444 | "execution_count": 5 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "id": "b1d45def36cb41b8", 449 | "metadata": {}, 450 | "source": [ 451 | "## Analyze Results\n", 452 | "\n", 453 | "Examine performance metrics and visualize trends across models and runs." 
454 | ] 455 | }, 456 | { 457 | "metadata": { 458 | "ExecuteTime": { 459 | "end_time": "2025-01-21T08:34:08.110500Z", 460 | "start_time": "2025-01-21T08:34:07.915693Z" 461 | } 462 | }, 463 | "cell_type": "code", 464 | "source": [ 465 | "import matplotlib.pyplot as plt\n", 466 | "import seaborn as sns\n", 467 | "\n", 468 | "# Set plotting style\n", 469 | "sns.set_palette(\"husl\")\n", 470 | "\n", 471 | "\n", 472 | "def plot_performance_comparison():\n", 473 | " \"\"\"Create comprehensive performance visualization comparing models.\"\"\"\n", 474 | " if len(experiments_df) > 1:\n", 475 | " fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))\n", 476 | "\n", 477 | " # Plot accuracy trends\n", 478 | " sns.lineplot(\n", 479 | " data=experiments_df,\n", 480 | " x=\"timestamp\",\n", 481 | " y=\"accuracy\",\n", 482 | " hue=\"model\",\n", 483 | " marker=\"o\",\n", 484 | " ax=ax1,\n", 485 | " )\n", 486 | " ax1.set_title(\"Model Accuracy Over Time\")\n", 487 | " ax1.axhline(\n", 488 | " y=0.7, color=\"r\", linestyle=\"--\", alpha=0.5, label=\"Minimum Threshold\"\n", 489 | " )\n", 490 | " ax1.set_ylim(0.5, 1.0)\n", 491 | "\n", 492 | " # Plot error metrics\n", 493 | " error_data = experiments_df.melt(\n", 494 | " id_vars=[\"model\"],\n", 495 | " value_vars=[\"false_positives\", \"false_negatives\"],\n", 496 | " var_name=\"metric\",\n", 497 | " value_name=\"count\",\n", 498 | " )\n", 499 | " sns.barplot(data=error_data, x=\"model\", y=\"count\", hue=\"metric\", ax=ax2)\n", 500 | " ax2.set_title(\"Error Analysis by Model\")\n", 501 | "\n", 502 | " plt.tight_layout()\n", 503 | " plt.show()\n", 504 | "\n", 505 | " # Show summary statistics\n", 506 | " print(\"\\nPerformance Statistics by Model:\")\n", 507 | " display(\n", 508 | " experiments_df.groupby(\"model\")[\n", 509 | " [\"accuracy\", \"true_positives\", \"false_positives\", \"false_negatives\"]\n", 510 | " ].describe()\n", 511 | " )\n", 512 | "\n", 513 | "\n", 514 | "plot_performance_comparison()" 515 | ], 516 | "id": 
"d4cc96b376881d9a", 517 | "outputs": [ 518 | { 519 | "data": { 520 | "text/plain": [ 521 | "
" 522 | ], 523 | "image/png": "" 524 | }, 525 | "metadata": {}, 526 | "output_type": "display_data" 527 | }, 528 | { 529 | "name": "stdout", 530 | "output_type": "stream", 531 | "text": [ 532 | "\n", 533 | "Performance Statistics by Model:\n" 534 | ] 535 | }, 536 | { 537 | "data": { 538 | "text/plain": [ 539 | " accuracy \\\n", 540 | " count mean std min 25% 50% 75% \n", 541 | "model \n", 542 | "gpt-3.5-turbo 1.0 0.666667 NaN 0.666667 0.666667 0.666667 0.666667 \n", 543 | "gpt-4o 1.0 1.000000 NaN 1.000000 1.000000 1.000000 1.000000 \n", 544 | "gpt-4o-mini 1.0 0.833333 NaN 0.833333 0.833333 0.833333 0.833333 \n", 545 | "\n", 546 | " true_positives ... false_positives \\\n", 547 | " max count mean ... 75% max \n", 548 | "model ... \n", 549 | "gpt-3.5-turbo 0.666667 1.0 2.0 ... 0.0 0.0 \n", 550 | "gpt-4o 1.000000 1.0 4.0 ... 0.0 0.0 \n", 551 | "gpt-4o-mini 0.833333 1.0 3.0 ... 0.0 0.0 \n", 552 | "\n", 553 | " false_negatives \n", 554 | " count mean std min 25% 50% 75% max \n", 555 | "model \n", 556 | "gpt-3.5-turbo 1.0 2.0 NaN 2.0 2.0 2.0 2.0 2.0 \n", 557 | "gpt-4o 1.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 \n", 558 | "gpt-4o-mini 1.0 1.0 NaN 1.0 1.0 1.0 1.0 1.0 \n", 559 | "\n", 560 | "[3 rows x 32 columns]" 561 | ], 562 | "text/html": [ 563 | "
\n", 564 | "\n", 581 | "\n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | "
accuracytrue_positives...false_positivesfalse_negatives
countmeanstdmin25%50%75%maxcountmean...75%maxcountmeanstdmin25%50%75%max
model
gpt-3.5-turbo1.00.666667NaN0.6666670.6666670.6666670.6666670.6666671.02.0...0.00.01.02.0NaN2.02.02.02.02.0
gpt-4o1.01.000000NaN1.0000001.0000001.0000001.0000001.0000001.04.0...0.00.01.00.0NaN0.00.00.00.00.0
gpt-4o-mini1.00.833333NaN0.8333330.8333330.8333330.8333330.8333331.03.0...0.00.01.01.0NaN1.01.01.01.01.0
\n", 715 | "

3 rows × 32 columns

\n", 716 | "
" 717 | ] 718 | }, 719 | "metadata": {}, 720 | "output_type": "display_data" 721 | } 722 | ], 723 | "execution_count": 6 724 | }, 725 | { 726 | "cell_type": "markdown", 727 | "id": "999f79285821072a", 728 | "metadata": {}, 729 | "source": [ 730 | "## Save Results\n", 731 | "\n", 732 | "Optionally save experiment results for future analysis." 733 | ] 734 | }, 735 | { 736 | "metadata": { 737 | "ExecuteTime": { 738 | "end_time": "2025-01-21T08:34:10.119140Z", 739 | "start_time": "2025-01-21T08:34:10.113540Z" 740 | } 741 | }, 742 | "cell_type": "code", 743 | "source": [ 744 | "# Save experiment results\n", 745 | "experiments_df.to_csv(\"experiment_results.csv\", index=False)\n", 746 | "print(\"Results saved to 'experiment_results.csv'\")" 747 | ], 748 | "id": "bc5e706cfa63d8cd", 749 | "outputs": [ 750 | { 751 | "name": "stdout", 752 | "output_type": "stream", 753 | "text": [ 754 | "Results saved to 'experiment_results.csv'\n" 755 | ] 756 | } 757 | ], 758 | "execution_count": 7 759 | } 760 | ], 761 | "metadata": { 762 | "kernelspec": { 763 | "display_name": "Python 3", 764 | "language": "python", 765 | "name": "python3" 766 | }, 767 | "language_info": { 768 | "codemirror_mode": { 769 | "name": "ipython", 770 | "version": 2 771 | }, 772 | "file_extension": ".py", 773 | "mimetype": "text/x-python", 774 | "name": "python", 775 | "nbconvert_exporter": "python", 776 | "pygments_lexer": "ipython2", 777 | "version": "2.7.6" 778 | } 779 | }, 780 | "nbformat": 4, 781 | "nbformat_minor": 5 782 | } 783 | --------------------------------------------------------------------------------