├── src └── zeroeval │ ├── observability │ ├── integrations │ │ ├── __init__.py │ │ ├── gemini │ │ │ └── __init__.py │ │ ├── openai │ │ │ └── __init__.py │ │ ├── langchain │ │ │ └── __init__.py │ │ ├── httpx │ │ │ ├── __init__.py │ │ │ └── README.md │ │ ├── vocode │ │ │ ├── __init__.py │ │ │ └── streaming_tracker.py │ │ ├── utils.py │ │ ├── registry.py │ │ └── base.py │ ├── __init__.py │ ├── signals.py │ ├── utils.py │ ├── span.py │ └── choice.py │ ├── cli │ ├── __init__.py │ ├── main.py │ ├── runner.py │ ├── utils.py │ └── setup.py │ ├── pyproject.toml │ ├── core │ ├── __init__.py │ ├── decorators.py │ ├── task.py │ ├── metrics.py │ ├── evaluator_class.py │ ├── run_collection.py │ ├── evaluation.py │ └── reader.py │ ├── errors.py │ ├── utils │ └── hash.py │ ├── cache.py │ ├── template.py │ ├── types.py │ └── providers.py ├── .vscode ├── extensions.json └── settings.json ├── .gitignore ├── pytest.ini ├── examples_v2 ├── .env.example ├── tracing │ ├── openai_basic.py │ └── openai_with_spans.py ├── README.md ├── tuning │ ├── README.md │ ├── auto_prompt_optimization.py │ ├── bookstore_agent.py │ ├── bookstore_agent_with_feedback.py │ └── bookstore_agent_with_api_feedback.py └── ab_testing │ └── openai_ab_test.py ├── .github └── workflows │ ├── gitleaks.yml │ └── ci.yml ├── tox.ini ├── pyproject.toml ├── tests ├── conftest.py ├── core │ ├── test_decorator.py │ └── test_tracer.py ├── README.md ├── performance │ └── test_span_performance.py ├── test_gemini_compatibility.py ├── test_client_feedback.py ├── test_choice.py ├── test_httpx_integration.py └── test_gemini_integration.py ├── .cursor └── rules │ ├── testing.mdc │ └── documentation.mdc └── INTEGRATIONS.md /src/zeroeval/observability/integrations/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/zeroeval/observability/integrations/gemini/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/zeroeval/observability/integrations/openai/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/zeroeval/observability/integrations/langchain/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/zeroeval/cli/__init__.py: -------------------------------------------------------------------------------- 1 | from .main import main 2 | 3 | __all__ = ["main"] -------------------------------------------------------------------------------- /src/zeroeval/observability/integrations/httpx/__init__.py: -------------------------------------------------------------------------------- 1 | from .integration import HttpxIntegration 2 | 3 | __all__ = ["HttpxIntegration"] 4 | -------------------------------------------------------------------------------- /src/zeroeval/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "zeroeval" 3 | version = "0.6.123" 4 | description = "ZeroEval SDK" 5 | 6 | [project.scripts] 7 | zeroeval = "zeroeval.cli.main:main" 8 | -------------------------------------------------------------------------------- /.vscode/extensions.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "ms-python.python", 4 | "charliermarsh.ruff", 5 | "ms-python.mypy-type-checker", 6 | "ms-python.debugpy" 7 | ] 8 | } -------------------------------------------------------------------------------- /src/zeroeval/observability/__init__.py: -------------------------------------------------------------------------------- 1 | # Observability package initialization 2 | from .decorators import span 3 | from .tracer import tracer 4 | from .integrations.openai.integration import zeroeval_prompt 5 | 6 | __all__ = ["tracer", "span", "zeroeval_prompt"] -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__/ 3 | 4 | .DS_Store 5 | 6 | .venv/ 7 | venv/ 8 | *venv/* 9 | *.venv/* 10 | 11 | dist/ 12 | dist/* 13 | .pytest_cache/ 14 | .ruff_cache/ 15 | .mypy_cache/ 16 | 17 | # Build and test directories 18 | .tox/ 19 | 20 | examples/ 21 | *.env* -------------------------------------------------------------------------------- /src/zeroeval/observability/integrations/vocode/__init__.py: -------------------------------------------------------------------------------- 1 | """ZeroEval integration for Vocode voice SDK.""" 2 | 3 | from .integration import VocodeIntegration 4 | from .streaming_tracker import StreamingSpanTracker, SynthesisResultWrapper 5 | 6 | __all__ = ["VocodeIntegration", "StreamingSpanTracker", "SynthesisResultWrapper"] -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | core: Core functionality tests 4 | performance: Performance tests 5 | 6 | testpaths = tests 7 | python_files = test_*.py 8 | python_classes = Test* 9 | python_functions = test_* 10 | 11 | addopts = 12 | -v 13 | --strict-markers 14 | 15 | filterwarnings = 16 | ignore::DeprecationWarning -------------------------------------------------------------------------------- /src/zeroeval/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset_class import Dataset 2 | from .decorators import experiment 3 | from .experiment_class import Experiment 4 | from .init import init 5 | from .task import task 6 | from .run import Run 7 | from .evaluation import evaluation 8 | 9 | __all__ = ["experiment", "Dataset", "Experiment", "init", "task", "Run", "evaluation"] -------------------------------------------------------------------------------- /examples_v2/.env.example: -------------------------------------------------------------------------------- 1 | # ZeroEval Configuration 2 | ZEROEVAL_API_KEY=your-zeroeval-api-key-here 3 | ZEROEVAL_API_URL=http://localhost:8000 4 | 5 | # OpenAI Configuration 6 | OPENAI_API_KEY=your-openai-api-key-here 7 | 8 | # Other API Keys (for future examples) 9 | # ANTHROPIC_API_KEY=your-anthropic-api-key-here 10 | # GEMINI_API_KEY=your-gemini-api-key-here -------------------------------------------------------------------------------- /.github/workflows/gitleaks.yml: -------------------------------------------------------------------------------- 1 | name: gitleaks 2 | on: 3 | pull_request: 4 | push: 5 | workflow_dispatch: 6 | permissions: 7 | contents: read 8 | pull-requests: write 9 | jobs: 10 | scan: 11 | name: gitleaks 12 | runs-on: ubuntu-latest 
13 | steps: 14 | - uses: actions/checkout@v4 15 | with: 16 | fetch-depth: 0 17 | - uses: gitleaks/gitleaks-action@v2 18 | env: 19 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 20 | GITLEAKS_LICENSE: ${{ secrets.GITLEAKS_LICENSE }} 21 | -------------------------------------------------------------------------------- /src/zeroeval/errors.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Optional 4 | 5 | 6 | class PromptNotFoundError(Exception): 7 | def __init__(self, slug: str, version: Optional[int], tag: Optional[str]): 8 | self.slug = slug 9 | self.version = version 10 | self.tag = tag 11 | super().__init__(f"Prompt not found: {slug} (version={version}, tag={tag})") 12 | 13 | 14 | class PromptRequestError(Exception): 15 | def __init__(self, message: str, status: Optional[int] = None): 16 | self.status = status 17 | super().__init__(message) 18 | 19 | 20 | -------------------------------------------------------------------------------- /src/zeroeval/observability/integrations/utils.py: -------------------------------------------------------------------------------- 1 | 2 | from .base import Integration 3 | 4 | 5 | def discover_integrations() -> dict[str, type[Integration]]: 6 | """ 7 | Discover all available integrations. 8 | This can be expanded to use entry points for plugin-style discovery. 9 | """ 10 | from .langchain.integration import LangChainIntegration 11 | from .langgraph.integration import LangGraphIntegration 12 | from .openai.integration import OpenAIIntegration 13 | from .livekit.integration import LiveKitIntegration 14 | 15 | return { 16 | "openai": OpenAIIntegration, 17 | "langchain": LangChainIntegration, 18 | "langgraph": LangGraphIntegration, 19 | "livekit": LiveKitIntegration, 20 | } -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py{37,38,39,310,311,312,313}-{core,perf} 3 | isolated_build = true 4 | 5 | [testenv] 6 | deps = 7 | pytest>=8.2.2 8 | pytest-asyncio>=0.23.7 9 | commands = pytest -q 10 | 11 | [testenv:py{37,38,39,310,311,312,313}-core] 12 | description = Core tests on {envname} 13 | commands = pytest tests/core -m core {posargs} 14 | 15 | [testenv:py{37,38,39,310,311,312,313}-perf] 16 | description = Performance tests on {envname} 17 | commands = pytest tests/performance --runperformance {posargs} 18 | 19 | # Quick aliases for current interpreter 20 | [testenv:core] 21 | commands = pytest tests/core -m core {posargs} 22 | 23 | [testenv:perf] 24 | commands = pytest tests/performance --runperformance {posargs} 25 | 26 | [testenv:all] 27 | commands = pytest tests/ --runperformance {posargs} 28 | 29 | -------------------------------------------------------------------------------- /src/zeroeval/cli/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from .runner import run_script 4 | from .setup import setup 5 | 6 | 7 | def main(): 8 | parser = argparse.ArgumentParser(description="zeroeval CLI") 9 | subparsers = parser.add_subparsers(dest="command", help="zeroeval command") 10 | 11 | # 'run' command: 12 | run_parser = subparsers.add_parser("run", help="Run a Python script with zeroeval experiments") 13 | run_parser.add_argument("script", help="Path to the Python script you'd like to run") 14 | 15 | # 'setup' command: 16 | subparsers.add_parser("setup", 
help="Setup tokens for zeroeval") 17 | 18 | args = parser.parse_args() 19 | 20 | if args.command == "run": 21 | run_script(args.script) 22 | elif args.command == "setup": 23 | setup() 24 | else: 25 | parser.print_help() -------------------------------------------------------------------------------- /src/zeroeval/cli/runner.py: -------------------------------------------------------------------------------- 1 | import importlib.util 2 | import os 3 | import sys 4 | 5 | 6 | def run_script(script_path: str): 7 | """ 8 | Imports and executes the user script. 9 | With the new API, users explicitly call dataset.run() in their scripts, 10 | so we just need to execute the script. 11 | """ 12 | # Add the script's directory to sys.path so imports work 13 | script_dir = os.path.dirname(os.path.abspath(script_path)) 14 | if script_dir not in sys.path: 15 | sys.path.insert(0, script_dir) 16 | 17 | # Dynamically load and execute the script 18 | module_name = os.path.splitext(os.path.basename(script_path))[0] 19 | spec = importlib.util.spec_from_file_location(module_name, script_path) 20 | 21 | if spec is None or spec.loader is None: 22 | raise ImportError(f"Could not load script: {script_path}") 23 | 24 | module = importlib.util.module_from_spec(spec) 25 | sys.modules[module_name] = module 26 | 27 | try: 28 | spec.loader.exec_module(module) 29 | except Exception as e: 30 | print(f"Error running script: {e}") 31 | raise -------------------------------------------------------------------------------- /src/zeroeval/utils/hash.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import hashlib 4 | 5 | 6 | def _normalize_newlines(text: str) -> str: 7 | # Convert CRLF and CR to LF 8 | if "\r" not in text: 9 | return text 10 | return text.replace("\r\n", "\n").replace("\r", "\n") 11 | 12 | 13 | def _strip_trailing_whitespace(text: str) -> str: 14 | # Remove trailing whitespace on each line 15 | return "\n".join(line.rstrip() for line in text.split("\n")) 16 | 17 | 18 | def normalize_prompt_text(text: str) -> str: 19 | """ 20 | Normalize prompt content prior to hashing. 21 | 22 | Rules: 23 | - Convert CRLF/CR to LF 24 | - Strip trailing whitespace on each line 25 | - Strip leading/trailing whitespace overall 26 | - Do not modify {{variable}} tokens 27 | """ 28 | if not isinstance(text, str): 29 | text = str(text) 30 | normalized = _normalize_newlines(text) 31 | normalized = _strip_trailing_whitespace(normalized) 32 | normalized = normalized.strip() 33 | return normalized 34 | 35 | 36 | def sha256_hex(text: str) -> str: 37 | """Return lowercase hex SHA-256 of the normalized text.""" 38 | normalized = normalize_prompt_text(text) 39 | return hashlib.sha256(normalized.encode("utf-8")).hexdigest() 40 | 41 | 42 | -------------------------------------------------------------------------------- /examples_v2/tracing/openai_basic.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Simple OpenAI Tracing Example 4 | ============================ 5 | 6 | This example shows how to trace OpenAI API calls with ZeroEval. 7 | The tracing happens automatically once you initialize the SDK. 
8 | """ 9 | 10 | import os 11 | from pathlib import Path 12 | 13 | from dotenv import load_dotenv 14 | 15 | # Load environment variables BEFORE importing zeroeval 16 | env_path = Path(__file__).parent.parent / ".env" 17 | load_dotenv(env_path) 18 | 19 | import openai 20 | import zeroeval as ze 21 | 22 | def main(): 23 | # Initialize ZeroEval (explicitly passing values to ensure they're used) 24 | ze.init( 25 | ) 26 | 27 | # Initialize OpenAI client 28 | client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY")) 29 | 30 | # Make a simple OpenAI call - this will be automatically traced 31 | response = client.chat.completions.create( 32 | model="gpt-4o-mini", 33 | messages=[ 34 | {"role": "user", "content": "What is the capital of France?"} 35 | ], 36 | temperature=0.7, 37 | max_tokens=100 38 | ) 39 | 40 | print("Response:", response.choices[0].message.content) 41 | print("✅ OpenAI call completed and automatically traced!") 42 | 43 | if __name__ == "__main__": 44 | main() -------------------------------------------------------------------------------- /src/zeroeval/core/decorators.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import types 3 | 4 | # This global registry will store metadata about each decorated function. 5 | registered_experiments = [] 6 | 7 | def experiment(dataset=None, model=None): 8 | """ 9 | A decorator that attaches the specified dataset and model to the function. 10 | Also optionally copies the function globals if needed. 11 | """ 12 | def decorator(fn): 13 | # (Optional) Copy the original function's globals if you truly need a new global context. 14 | new_globals = dict(fn.__globals__) 15 | new_globals["dataset"] = dataset 16 | new_globals["model"] = model 17 | 18 | # Create a new function object with updated globals 19 | new_fn = types.FunctionType( 20 | fn.__code__, 21 | new_globals, 22 | fn.__name__, 23 | fn.__defaults__, 24 | fn.__closure__ 25 | ) 26 | 27 | # Maintain function metadata 28 | new_fn.__kwdefaults__ = fn.__kwdefaults__ 29 | new_fn.__annotations__ = fn.__annotations__ 30 | functools.update_wrapper(new_fn, fn) 31 | 32 | # Add an attribute or store in a global registry 33 | new_fn._exp_metadata = {"dataset": dataset, "model": model} 34 | registered_experiments.append(new_fn) 35 | return new_fn 36 | return decorator -------------------------------------------------------------------------------- /src/zeroeval/cache.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import time 4 | from collections import OrderedDict 5 | from threading import Lock 6 | from typing import Generic, Hashable, Optional, Tuple, TypeVar 7 | 8 | K = TypeVar("K", bound=Hashable) 9 | V = TypeVar("V") 10 | 11 | 12 | class TTLCache(Generic[K, V]): 13 | def __init__(self, ttl_seconds: float = 60.0, maxsize: int = 512) -> None: 14 | self._ttl = float(ttl_seconds) 15 | self._maxsize = int(maxsize) 16 | self._data: "OrderedDict[K, Tuple[float, V]]" = OrderedDict() 17 | self._lock = Lock() 18 | 19 | def get(self, key: K) -> Optional[V]: 20 | now = time.time() 21 | with self._lock: 22 | item = self._data.get(key) 23 | if not item: 24 | return None 25 | ts, value = item 26 | if now - ts > self._ttl: 27 | # Expired 28 | self._data.pop(key, None) 29 | return None 30 | # Move to end (LRU) 31 | self._data.move_to_end(key) 32 | return value 33 | 34 | def set(self, key: K, value: V) -> None: 35 | with self._lock: 36 | self._data[key] = (time.time(), value) 
37 | self._data.move_to_end(key) 38 | if len(self._data) > self._maxsize: 39 | self._data.popitem(last=False) 40 | 41 | def clear(self) -> None: 42 | with self._lock: 43 | self._data.clear() 44 | 45 | 46 | -------------------------------------------------------------------------------- /src/zeroeval/template.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import re 4 | from typing import Any, Dict, Set 5 | 6 | _IDENTIFIER_RE = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$") 7 | 8 | 9 | def render_template(content: str, variables: Dict[str, Any], *, missing: str = "error") -> str: 10 | if missing not in {"error", "leave"}: 11 | raise ValueError("missing must be 'error' or 'leave'") 12 | 13 | # Validate variable keys early 14 | for key in variables.keys(): 15 | if not _IDENTIFIER_RE.match(key): 16 | raise ValueError(f"Invalid variable name: {key}") 17 | 18 | # Handle escaped braces: \{{ and \}} 19 | ESC_L = "__ZE_ESC_L__" 20 | ESC_R = "__ZE_ESC_R__" 21 | tmp = content.replace(r"\{{", ESC_L).replace(r"\}}", ESC_R) 22 | 23 | def repl(match: re.Match[str]) -> str: 24 | name = match.group(1) 25 | if name in variables: 26 | return str(variables[name]) 27 | if missing == "error": 28 | from .errors import PromptRequestError 29 | 30 | raise PromptRequestError(f"Missing variable: {name}", status=None) 31 | return "{{" + name + "}}" 32 | 33 | rendered = re.sub(r"\{\{\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*\}\}", repl, tmp) 34 | return rendered.replace(ESC_L, "{{").replace(ESC_R, "}}") 35 | 36 | 37 | def extract_variables(content: str) -> Set[str]: 38 | names: Set[str] = set() 39 | # Temporarily remove escaped braces 40 | tmp = content.replace(r"\{{", "").replace(r"\}}", "") 41 | for m in re.finditer(r"\{\{\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*\}\}", tmp): 42 | names.add(m.group(1)) 43 | return names 44 | 45 | 46 | -------------------------------------------------------------------------------- /src/zeroeval/cli/utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | 4 | from rich.console import Console 5 | from rich.panel import Panel 6 | from rich.progress import Progress, SpinnerColumn 7 | from rich.text import Text 8 | from rich.theme import Theme 9 | 10 | # Custom theme for our brand 11 | THEME = Theme({ 12 | "info": "cyan", 13 | "warning": "yellow", 14 | "error": "red", 15 | "success": "green", 16 | "brand": "#BAEd00", # Adjust this to match your brand color 17 | }) 18 | 19 | console = Console(theme=THEME) 20 | 21 | def brand_print(message: str, style: str = "brand") -> None: 22 | """Print with brand styling.""" 23 | console.print(f"● {message}", style=style) 24 | 25 | def animate_dots(message: str, duration: float = 2.0) -> None: 26 | """Animate loading dots.""" 27 | frames = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"] 28 | end = time.time() + duration 29 | 30 | while time.time() < end: 31 | for frame in frames: 32 | sys.stdout.write(f"\r{frame} {message}") 33 | sys.stdout.flush() 34 | time.sleep(0.1) 35 | sys.stdout.write("\r") 36 | sys.stdout.flush() 37 | 38 | def show_welcome_box() -> None: 39 | """Show a beautiful welcome message.""" 40 | message = Text.assemble( 41 | ("Welcome to ", "white"), 42 | ("ZeroEval", "brand"), 43 | ("\nLet's get you set up with something magical ✨", "white") 44 | ) 45 | console.print(Panel(message, border_style="brand")) 46 | 47 | def spinner(message: str) -> Progress: 48 | """Create a spinner with message.""" 49 | return Progress( 
50 | SpinnerColumn("dots", style="brand"), 51 | *Progress.get_default_columns(), 52 | console=console, 53 | transient=True, 54 | ) 55 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | // Python Configuration - Let uv handle automatic detection 3 | "python.defaultInterpreterPath": ".venv/bin/python", 4 | "python.terminal.activateEnvironment": true, 5 | "python.envFile": "${workspaceFolder}/.env", 6 | 7 | // Ignore global PYTHONPATH - use project-specific environment 8 | "terminal.integrated.env.osx": { 9 | "PYTHONPATH": "" 10 | }, 11 | 12 | // Type Checking with mypy 13 | "python.analysis.typeCheckingMode": "basic", 14 | "mypy-type-checker.importStrategy": "fromEnvironment", 15 | "mypy-type-checker.preferDaemon": true, 16 | 17 | // Ruff Configuration (Linting & Formatting) 18 | "python.linting.enabled": true, 19 | "python.linting.ruffEnabled": true, 20 | "python.linting.pylintEnabled": false, 21 | "python.linting.flake8Enabled": false, 22 | "python.linting.mypyEnabled": false, 23 | 24 | // Formatting 25 | "python.formatting.provider": "none", 26 | "[python]": { 27 | "editor.defaultFormatter": "charliermarsh.ruff", 28 | "editor.formatOnSave": false, 29 | "editor.codeActionsOnSave": { 30 | "source.organizeImports": "explicit" 31 | } 32 | }, 33 | 34 | // Editor Settings 35 | "editor.rulers": [88], 36 | "files.trimTrailingWhitespace": true, 37 | "files.insertFinalNewline": true, 38 | 39 | // Terminal Configuration 40 | "terminal.integrated.defaultProfile.osx": "zsh", 41 | "terminal.integrated.cwd": "${workspaceFolder}", 42 | 43 | // Hide generated files 44 | "files.exclude": { 45 | "**/__pycache__": true, 46 | "**/.pytest_cache": true, 47 | "**/.ruff_cache": true, 48 | "**/.mypy_cache": true, 49 | "**/*.pyc": true 50 | } 51 | } -------------------------------------------------------------------------------- /examples_v2/README.md: -------------------------------------------------------------------------------- 1 | # ZeroEval Examples v2 2 | 3 | This directory contains organized, focused examples for ZeroEval SDK features. 4 | 5 | ## Directory Structure 6 | 7 | - **`tracing/`** - Examples for observability and tracing 8 | - Basic OpenAI tracing 9 | - Custom span creation 10 | - Advanced tracing patterns 11 | 12 | - **`ab_testing/`** - Examples for A/B testing with ze.choose() 13 | - Model comparison testing 14 | - Weighted variant selection 15 | - Automatic choice tracking 16 | 17 | - **`tuning/`** - Examples for Prompt Tuning and Optimization 18 | - Customer support agent with feedback loop 19 | - Prompt versioning with ze.prompt() 20 | 21 | ## Getting Started 22 | 23 | 1. **Install dependencies**: 24 | 25 | ```bash 26 | # Using pip 27 | pip install zeroeval openai python-dotenv 28 | 29 | # Or using uv (recommended for py-sdk development) 30 | uv add zeroeval openai python-dotenv 31 | ``` 32 | 33 | 2. **Set up your environment**: 34 | 35 | Copy the example environment file and add your API keys: 36 | ```bash 37 | cp .env.example .env 38 | # Edit .env with your actual API keys 39 | ``` 40 | 41 | 3. **Explore the examples**: 42 | ```bash 43 | # Tracing examples 44 | cd tracing/ 45 | uv run python openai_basic.py 46 | 47 | # A/B testing examples 48 | cd ab_testing/ 49 | uv run python openai_ab_test.py 50 | ``` 51 | 52 | Each subdirectory contains its own README with specific setup instructions and explanations. 
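For orientation, the tracing examples boil down to the pattern sketched below. This is a condensed version of `tracing/openai_basic.py`; it assumes the keys from `.env.example` are present in your `.env`:

```python
import os

from dotenv import load_dotenv

load_dotenv()  # load ZEROEVAL_* and OPENAI_API_KEY before importing zeroeval

import openai
import zeroeval as ze

ze.init()  # as in openai_basic.py; picks up the values loaded above

client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# After ze.init(), this call is traced automatically — no extra wrapping needed.
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
)
print(response.choices[0].message.content)
```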
53 | 54 | ## Philosophy 55 | 56 | These examples follow these principles: 57 | 58 | - **Simple and focused**: Each example demonstrates one clear concept 59 | - **Well-documented**: Extensive comments and explanations 60 | - **Production-ready**: Code patterns you can use in real applications 61 | - **Organized**: Grouped by feature area for easy discovery 62 | -------------------------------------------------------------------------------- /src/zeroeval/types.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dataclasses import dataclass 4 | from typing import Any, Optional, Dict 5 | 6 | 7 | @dataclass 8 | class Prompt: 9 | content: str 10 | version: Optional[int] 11 | version_id: Optional[str] 12 | tag: Optional[str] 13 | is_latest: bool 14 | model: Optional[str] 15 | created_by: Optional[str] 16 | updated_by: Optional[str] 17 | created_at: Optional[str] 18 | updated_at: Optional[str] 19 | metadata: Dict[str, Any] 20 | source: str # Literal["server", "fallback"] but keep simple for py39 compatibility 21 | 22 | @staticmethod 23 | def from_response(data: Dict[str, Any]) -> "Prompt": 24 | model_value = data.get("model_id") or data.get("model") 25 | if isinstance(model_value, str) and model_value: 26 | model_value = f"zeroeval/{model_value}" 27 | # Normalize version_id from payload or nested metadata 28 | version_id_value = data.get("version_id") 29 | if not version_id_value: 30 | meta = data.get("metadata") or {} 31 | if isinstance(meta, dict): 32 | version_id_value = meta.get("version_id") or meta.get("prompt_version_id") 33 | return Prompt( 34 | content=str(data.get("content", "")), 35 | version=data.get("version"), 36 | version_id=version_id_value, 37 | tag=data.get("tag"), 38 | is_latest=bool(data.get("is_latest", False)), 39 | model=model_value, 40 | created_by=data.get("created_by"), 41 | updated_by=data.get("updated_by"), 42 | created_at=data.get("created_at"), 43 | updated_at=data.get("updated_at"), 44 | metadata=data.get("metadata", {}) or {}, 45 | source="server", 46 | ) 47 | 48 | 49 | -------------------------------------------------------------------------------- /src/zeroeval/core/task.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import inspect 3 | from typing import Any, Callable, Optional 4 | 5 | 6 | def task(outputs: list[str], name: Optional[str] = None) -> Callable: 7 | """ 8 | Decorator to mark a function as a task that can be run on a dataset. 
9 | 10 | Args: 11 | outputs: List of output column names this task produces 12 | name: Optional custom name for the task (defaults to function name) 13 | 14 | Example: 15 | @task(outputs=["pred"]) 16 | def solve(row): 17 | return {"pred": llm_answer(row.question)} 18 | 19 | @task(outputs=["answer"], name="gpt4_solver") 20 | def solve_with_gpt4(row): 21 | return {"answer": gpt4_call(row.question)} 22 | """ 23 | def decorator(func: Callable) -> Callable: 24 | # Store metadata on the function 25 | func._is_task = True 26 | func._outputs = outputs 27 | func._task_name = name or func.__name__ 28 | func._task_code = inspect.getsource(func) 29 | 30 | @functools.wraps(func) 31 | def wrapper(*args, **kwargs): 32 | result = func(*args, **kwargs) 33 | 34 | # Validate outputs 35 | if not isinstance(result, dict): 36 | raise TypeError(f"Task {func.__name__} must return a dictionary, got {type(result)}") 37 | 38 | missing_outputs = set(outputs) - set(result.keys()) 39 | if missing_outputs: 40 | raise ValueError(f"Task {func.__name__} missing outputs: {missing_outputs}") 41 | 42 | return result 43 | 44 | # Preserve the metadata on the wrapper 45 | wrapper._is_task = True 46 | wrapper._outputs = outputs 47 | wrapper._task_name = name or func.__name__ 48 | wrapper._task_code = func._task_code 49 | 50 | return wrapper 51 | 52 | return decorator -------------------------------------------------------------------------------- /src/zeroeval/core/metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Column and Run Metrics System 3 | 4 | Separate from row evaluations to avoid schema conflicts. 5 | Simple, clean decorators for aggregate metrics. 6 | """ 7 | 8 | from typing import Callable, Dict, Any, List 9 | from functools import wraps 10 | 11 | # Global registries for metrics 12 | _column_metrics = {} 13 | _run_metrics = {} 14 | 15 | 16 | class ColumnMetric: 17 | def __init__(self, name: str, func: Callable, outputs: List[str]): 18 | self.name = name 19 | self.func = func 20 | self.outputs = outputs 21 | 22 | def __call__(self, dataset): 23 | return self.func(dataset) 24 | 25 | 26 | class RunMetric: 27 | def __init__(self, name: str, func: Callable, outputs: List[str]): 28 | self.name = name 29 | self.func = func 30 | self.outputs = outputs 31 | 32 | def __call__(self, runs): 33 | return self.func(runs) 34 | 35 | 36 | def column_metric(outputs: List[str]): 37 | """Decorator for column-level metrics that operate on entire dataset.""" 38 | def decorator(func: Callable): 39 | metric = ColumnMetric( 40 | name=func.__name__, 41 | func=func, 42 | outputs=outputs 43 | ) 44 | _column_metrics[func.__name__] = metric 45 | return metric 46 | return decorator 47 | 48 | 49 | def run_metric(outputs: List[str]): 50 | """Decorator for run-level metrics that operate across multiple runs.""" 51 | def decorator(func: Callable): 52 | metric = RunMetric( 53 | name=func.__name__, 54 | func=func, 55 | outputs=outputs 56 | ) 57 | _run_metrics[func.__name__] = metric 58 | return metric 59 | return decorator 60 | 61 | 62 | def get_column_metric(name: str) -> ColumnMetric: 63 | """Get a column metric by name.""" 64 | return _column_metrics.get(name) 65 | 66 | 67 | def get_run_metric(name: str) -> RunMetric: 68 | """Get a run metric by name.""" 69 | return _run_metrics.get(name) -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = 
"zeroeval" 3 | version = "0.6.123" 4 | description = "ZeroEval SDK" 5 | readme = "README.md" 6 | authors = [ 7 | {name = "Sebastian Crossa", email = "seb@zeroeval.com"}, 8 | {name = "Jonathan Chavez", email = "jona@zeroeval.com"} 9 | ] 10 | requires-python = ">=3.9,<4.0" 11 | dependencies = [ 12 | "rich >=13.7.1", 13 | "opentelemetry-api >=1.26.0", 14 | "opentelemetry-sdk >=1.26.0", 15 | "opentelemetry-exporter-otlp-proto-http>=1.26.0", 16 | "requests >=2.32.2", 17 | "python-dotenv>=1.0.0", 18 | ] 19 | keywords = ["evaluation", "LLM", "observability"] 20 | 21 | [project.urls] 22 | Repository = "https://github.com/zeroeval" 23 | 24 | [project.optional-dependencies] 25 | openai = ["openai>=1.59.6"] 26 | gemini = ["google-genai>=1.21.1"] 27 | langchain = ["langchain-core>=0.1.0"] 28 | langgraph = ["langgraph>=0.0.20"] 29 | all = ["openai>=1.59.6", "google-genai>=1.21.1", "langchain-core>=0.1.0", "langgraph>=0.0.20"] 30 | 31 | [project.scripts] 32 | zeroeval = "zeroeval.cli:main" 33 | 34 | [dependency-groups] 35 | dev = [ 36 | "mypy>=1.16.1", 37 | "pytest>=8.2.2,<9.0.0", 38 | "pytest-asyncio >=0.23.7, <1.0.0", 39 | "ruff >=0.12.2, <1.0.0", 40 | "tox >=4.0.0, <5.0.0", 41 | "google-genai==1.21.1" 42 | ] 43 | 44 | [build-system] 45 | requires = ["hatchling"] 46 | build-backend = "hatchling.build" 47 | 48 | [tool.hatch.build.targets.wheel] 49 | packages = ["src/zeroeval"] 50 | 51 | [tool.uv] 52 | default-groups = [] 53 | 54 | [tool.uv.sources] 55 | 56 | [tool.pytest.ini_options] 57 | pythonpath = ["src"] 58 | 59 | [tool.ruff] 60 | # Set target Python version 61 | target-version = "py39" 62 | line-length = 88 63 | 64 | [tool.ruff.lint] 65 | # Enable Flake8 `E` and `F` codes by default + some additional rules 66 | select = [ 67 | "E", # pycodestyle errors 68 | "F", # pyflakes 69 | "UP", # pyupgrade 70 | "B", # flake8-bugbear 71 | "SIM", # flake8-simplify 72 | "I", # isort 73 | ] 74 | ignore = [ 75 | "E501", # line too long, handled by formatter 76 | ] 77 | 78 | [tool.ruff.lint.isort] 79 | known-first-party = ["zeroeval"] 80 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Global test configuration for ZeroEval SDK tests.""" 2 | 3 | import sys 4 | from typing import Any, Dict, List 5 | 6 | import pytest 7 | 8 | from zeroeval.observability.tracer import Tracer 9 | from zeroeval.observability.writer import SpanWriter 10 | 11 | 12 | class MockSpanWriter(SpanWriter): 13 | """A mock writer that stores spans for testing.""" 14 | 15 | def __init__(self): 16 | self.spans = [] 17 | 18 | def write(self, spans: List[Dict[str, Any]]) -> None: 19 | self.spans.extend(spans) 20 | 21 | def clear(self): 22 | self.spans.clear() 23 | 24 | 25 | @pytest.fixture 26 | def tracer(): 27 | """Fixture for a clean tracer instance.""" 28 | t = Tracer() 29 | mock_writer = MockSpanWriter() 30 | 31 | # Store original writer 32 | original_writer = t._writer 33 | t._writer = mock_writer 34 | 35 | # Clean up state 36 | t._spans.clear() 37 | t._traces.clear() 38 | t._active_spans_ctx.set([]) 39 | 40 | yield t 41 | 42 | # Cleanup 43 | t.flush() 44 | mock_writer.clear() 45 | t._writer = original_writer 46 | t._shutdown_called = False 47 | 48 | 49 | @pytest.fixture 50 | def python_version(): 51 | """Current Python version as tuple.""" 52 | return sys.version_info[:2] 53 | 54 | 55 | def pytest_configure(config): 56 | """Configure pytest markers.""" 57 | config.addinivalue_line("markers", "core: Core 
functionality tests") 58 | config.addinivalue_line("markers", "performance: Performance tests") 59 | 60 | 61 | def pytest_addoption(parser): 62 | """Add command line options.""" 63 | parser.addoption( 64 | "--runperformance", 65 | action="store_true", 66 | default=False, 67 | help="Run performance tests (skipped by default)", 68 | ) 69 | 70 | 71 | def pytest_collection_modifyitems(config, items): 72 | """Skip performance tests unless --runperformance is given.""" 73 | if not config.getoption("--runperformance"): 74 | skip_perf = pytest.mark.skip( 75 | reason="Performance tests skipped. Use --runperformance to run." 76 | ) 77 | for item in items: 78 | if "performance" in item.keywords: 79 | item.add_marker(skip_perf) 80 | -------------------------------------------------------------------------------- /tests/core/test_decorator.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from zeroeval.observability import span 4 | from zeroeval.observability.tracer import Tracer 5 | 6 | 7 | @pytest.mark.core 8 | def test_decorator_success(tracer: Tracer): 9 | """Tests that the @span decorator wraps a function and records a span.""" 10 | 11 | @span(name="my_decorated_function") 12 | def my_func(a, b): 13 | return a + b 14 | 15 | result = my_func(1, 2) 16 | 17 | tracer.flush() 18 | 19 | assert result == 3 20 | mock_writer = tracer._writer 21 | assert len(mock_writer.spans) == 1 22 | 23 | s = mock_writer.spans[0] 24 | assert s["name"] == "my_decorated_function" 25 | assert s["status"] == "ok" 26 | assert '"a": "1"' in s["input_data"] 27 | assert '"b": "2"' in s["input_data"] 28 | assert s["output_data"] == "3" 29 | 30 | 31 | @pytest.mark.core 32 | def test_decorator_exception(tracer: Tracer): 33 | """Tests that the @span decorator correctly records an exception.""" 34 | 35 | @span(name="my_failing_function") 36 | def my_func(): 37 | raise ValueError("This is a test error") 38 | 39 | with pytest.raises(ValueError, match="This is a test error"): 40 | my_func() 41 | 42 | tracer.flush() 43 | 44 | mock_writer = tracer._writer 45 | assert len(mock_writer.spans) == 1 46 | 47 | s = mock_writer.spans[0] 48 | assert s["name"] == "my_failing_function" 49 | assert s["status"] == "error" 50 | assert s["error_code"] == "ValueError" 51 | assert s["error_message"] == "This is a test error" 52 | assert "Traceback" in s["error_stack"] 53 | 54 | 55 | @pytest.mark.core 56 | @pytest.mark.asyncio 57 | async def test_decorator_async_success(tracer: Tracer): 58 | """Tests that the @span decorator correctly wraps an async function.""" 59 | 60 | @span(name="my_async_function") 61 | async def my_async_func(a, b): 62 | return a + b 63 | 64 | result = await my_async_func(3, 4) 65 | 66 | tracer.flush() 67 | 68 | assert result == 7 69 | mock_writer = tracer._writer 70 | assert len(mock_writer.spans) == 1 71 | 72 | s = mock_writer.spans[0] 73 | assert s["name"] == "my_async_function" 74 | assert s["status"] == "ok" 75 | assert s["output_data"] == "7" 76 | -------------------------------------------------------------------------------- /examples_v2/tracing/openai_with_spans.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | OpenAI Tracing with Manual Spans 4 | =============================== 5 | 6 | This example shows how to add custom spans around OpenAI calls 7 | for better observability and context. 
8 | """ 9 | 10 | import os 11 | from pathlib import Path 12 | 13 | from dotenv import load_dotenv 14 | 15 | # Load environment variables BEFORE importing zeroeval 16 | env_path = Path(__file__).parent.parent / ".env" 17 | load_dotenv(env_path) 18 | 19 | import openai 20 | import zeroeval as ze 21 | 22 | def main(): 23 | # Initialize ZeroEval (explicitly passing values to ensure they're used) 24 | ze.init( 25 | api_key=os.getenv("ZEROEVAL_API_KEY"), 26 | api_url=os.getenv("ZEROEVAL_API_URL", "http://localhost:8000") 27 | ) 28 | 29 | # Initialize OpenAI client 30 | client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY")) 31 | 32 | # Create a span for the entire conversation 33 | with ze.span("user_question_answering", tags={"feature": "qa_system"}): 34 | 35 | # Add a span for preprocessing 36 | with ze.span("question_preprocessing"): 37 | question = "What is the capital of France?" 38 | processed_question = question.strip().lower() 39 | print(f"Original question: {question}") 40 | print(f"Processed question: {processed_question}") 41 | 42 | # Add a span for the LLM call (this will have nested OpenAI spans automatically) 43 | with ze.span("llm_generation", tags={"model": "gpt-3.5-turbo"}): 44 | response = client.chat.completions.create( 45 | model="gpt-3.5-turbo", 46 | messages=[ 47 | {"role": "system", "content": "You are a helpful geography assistant."}, 48 | {"role": "user", "content": question} 49 | ], 50 | temperature=0.1, 51 | max_tokens=50 52 | ) 53 | 54 | # Add a span for post-processing 55 | with ze.span("response_postprocessing"): 56 | answer = response.choices[0].message.content 57 | formatted_answer = f"Answer: {answer}" 58 | print(formatted_answer) 59 | 60 | print("✅ Complete conversation traced with custom spans!") 61 | 62 | if __name__ == "__main__": 63 | main() -------------------------------------------------------------------------------- /src/zeroeval/core/evaluator_class.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional 2 | 3 | from .writer import EvaluatorBackendWriter, EvaluatorWriter 4 | 5 | 6 | class Evaluator: 7 | """ 8 | A class that defines an evaluator for an experiment. 9 | """ 10 | def __init__(self, name: str, code: str, description: str, experiment_id: str, evaluation_mode: str = "row"): 11 | self.name = name 12 | self.code = code 13 | self.description = description 14 | self.experiment_id = experiment_id 15 | self.evaluation_mode = evaluation_mode 16 | self._backend_id = None 17 | # Default to console writer for now, can be changed to backend writer later 18 | self._writer: EvaluatorWriter = EvaluatorBackendWriter() 19 | 20 | def _write(self) -> Optional[str]: 21 | """Writes the evaluator to the writer if it hasn't been written yet.""" 22 | print(f"[DEBUG] Evaluator._write called - current _backend_id={self._backend_id}") 23 | if not self._backend_id: 24 | print(f"[DEBUG] Calling writer._write for evaluator...") 25 | assigned_id = self._writer._write(self) 26 | print(f"[DEBUG] Writer returned assigned_id={assigned_id}") 27 | if assigned_id: 28 | self._backend_id = assigned_id 29 | print(f"[DEBUG] Set evaluator _backend_id to {self._backend_id}") 30 | else: 31 | print(f"[DEBUG] Evaluator already has _backend_id, skipping write") 32 | return self._backend_id 33 | 34 | class Evaluation: 35 | """ 36 | A class that defines the result of an evaluator. 
37 | """ 38 | def __init__(self, evaluator: Evaluator, result: Any, experiment_result_id: str, dataset_row_id: str): 39 | self.evaluator = evaluator 40 | self.result = result 41 | self.experiment_result_id = experiment_result_id 42 | self.dataset_row_id = dataset_row_id 43 | # Use the same writer type as the parent evaluator 44 | self._writer = evaluator._writer 45 | 46 | def _write(self) -> None: 47 | """Write this Evaluation to the writer.""" 48 | print(f"[DEBUG] Evaluation._write called - evaluator._backend_id={self.evaluator._backend_id}") 49 | result = self._writer._write(self) 50 | print(f"[DEBUG] Evaluation._write completed - writer returned: {result}") 51 | return result 52 | -------------------------------------------------------------------------------- /.cursor/rules/testing.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: Testing requirements for Python SDK - test after modifications and new features 3 | globs: ["**/*.py"] 4 | alwaysApply: true 5 | --- 6 | 7 | # Testing Guidelines 8 | 9 | ## Test After Every Modification 10 | 11 | **After each modification, double check the tests and run them.** 12 | 13 | ## New Feature Testing 14 | 15 | **If it's a new feature, it should be tested.** 16 | 17 | ## Implementation Standards 18 | 19 | **Implementations should be concise and well designed. Simple.** 20 | 21 | ## Test Structure 22 | 23 | Tests are located in: 24 | 25 | - `/tests/` - Main test directory 26 | - `/tests/core/` - Core functionality tests 27 | - `/tests/performance/` - Performance tests 28 | 29 | ## Running Tests 30 | 31 | ```bash 32 | # Run all tests 33 | pytest 34 | 35 | # Run specific test file 36 | pytest tests/core/test_decorator.py 37 | 38 | # Run with coverage 39 | pytest --cov=src/zeroeval 40 | ``` 41 | 42 | ## Test Requirements 43 | 44 | ### For Modifications 45 | 46 | 1. **Run existing tests** to ensure nothing breaks 47 | 2. **Update tests** if behavior changes 48 | 3. **Verify test coverage** remains adequate 49 | 50 | ### For New Features 51 | 52 | 1. **Write unit tests** for core functionality 53 | 2. **Add integration tests** if needed 54 | 3. **Include edge case tests** 55 | 4. **Test error conditions** 56 | 57 | ## Example Test Structure 58 | 59 | ```python 60 | # tests/core/test_new_feature.py 61 | import pytest 62 | from zeroeval import NewFeature 63 | 64 | class TestNewFeature: 65 | def test_basic_functionality(self): 66 | """Test the core functionality works as expected.""" 67 | feature = NewFeature() 68 | result = feature.execute() 69 | assert result is not None 70 | 71 | def test_edge_cases(self): 72 | """Test edge cases and error conditions.""" 73 | feature = NewFeature() 74 | with pytest.raises(ValueError): 75 | feature.execute(invalid_input=True) 76 | 77 | def test_integration(self): 78 | """Test integration with other components.""" 79 | # Integration test logic 80 | pass 81 | ``` 82 | 83 | ## Checklist 84 | 85 | Before submitting changes: 86 | 87 | - [ ] All existing tests pass 88 | - [ ] New functionality is tested 89 | - [ ] Tests are simple and focused 90 | - [ ] Edge cases are covered 91 | - [ ] Error conditions are tested 92 | -------------------------------------------------------------------------------- /examples_v2/tuning/README.md: -------------------------------------------------------------------------------- 1 | # Prompt Tuning Examples 2 | 3 | This directory contains examples demonstrating ZeroEval's prompt tuning and optimization features. 
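The feedback loop described under Core Concepts below has roughly the following shape. This is only an illustrative sketch: the prompt slug, the `run_agent` stub, and the keyword arguments passed to `ze.send_feedback()` are placeholders rather than the SDK's real signatures — see `bookstore_agent_with_feedback.py` for working code.

```python
# Illustrative sketch only: the slug, run_agent stub, and send_feedback keyword
# arguments are placeholders, not the actual SDK signatures.
import zeroeval as ze

ze.init()

# 1. Register/fetch a managed, versioned prompt (slug is hypothetical)
system_prompt = ze.prompt("bookstore-agent")


# 2. Run the agent; the SDK traces inputs and outputs automatically
def run_agent(prompt, question):
    ...  # your LLM call goes here


answer = run_agent(system_prompt, "Do you carry signed first editions?")

# 3. Grade the answer (human or LLM evaluator) and report it back
ze.send_feedback(thumbs_up=True, reason="Accurate and on-topic")

# 4. ZeroEval aggregates this feedback to propose improved prompt versions
```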
4 | 5 | ## Core Concepts 6 | 7 | Prompt tuning in ZeroEval works through a feedback loop: 8 | 9 | 1. **Define Prompt**: Use `ze.prompt()` to register a prompt and bind variables. 10 | 2. **Trace Execution**: Run your agent; the SDK automatically traces the inputs and outputs. 11 | 3. **Send Feedback**: Use `ze.send_feedback()` (or the direct API) to signal what was good or bad about the completion. 12 | 4. **Optimize**: ZeroEval (and integrated optimizers like DSPy) uses this feedback to generate better prompt versions. 13 | 14 | ## Examples 15 | 16 | ### 1. Customer Support Agent (`customer_support_agent.py`) 17 | 18 | A simple example of a support agent that uses `ze.prompt()` for versioned, managed prompts. This demonstrates the basic setup without the automated feedback loop. 19 | 20 | ### 2. Customer Support Agent with SDK Feedback (`bookstore_agent_with_feedback.py`) 21 | 22 | An advanced example that implements a complete automated feedback loop using the ZeroEval SDK. 23 | 24 | **Key Features:** 25 | 26 | - **Automated Evaluator**: Uses a powerful model (GPT-4o) to grade the agent's responses. 27 | - **Feedback Submission**: Uses `ze.send_feedback()` to programmatically submit the evaluator's scores (thumbs up/down) and reasoning. 28 | - **Metadata Tracking**: Attaches metadata (like scores and evaluator model) to the feedback. 29 | 30 | **Run it:** 31 | 32 | ```bash 33 | python tuning/bookstore_agent_with_feedback.py 34 | ``` 35 | 36 | ### 3. Customer Support Agent with API Feedback (`bookstore_agent_with_api_feedback.py`) 37 | 38 | Demonstrates how to submit feedback using direct HTTP calls to the ZeroEval API, bypassing the SDK's `ze.send_feedback` helper. This is useful for frontend applications or systems where the SDK cannot be installed. 39 | 40 | **Key Features:** 41 | 42 | - **Direct API Integration**: Uses `requests` to hit the `/v1/prompts/{slug}/completions/{id}/feedback` endpoint. 43 | - **Payload Structure**: Shows exactly what JSON payload the backend expects. 44 | - **Flexible Integration**: Ideal for custom pipelines or non-Python environments. 45 | 46 | **Run it:** 47 | 48 | ```bash 49 | python tuning/bookstore_agent_with_api_feedback.py 50 | ``` 51 | 52 | ## Setup 53 | 54 | Ensure you have your `.env` file set up in the parent directory with: 55 | 56 | - `ZEROEVAL_API_KEY`: Your ZeroEval API key (required, starts with `sk_ze_...`) 57 | - `OPENAI_API_KEY`: Your OpenAI API key (required) 58 | - `ZEROEVAL_API_URL`: (Optional) URL of your ZeroEval instance (default: `http://localhost:8000`) 59 | 60 | **Important**: All examples now pull credentials from environment variables. Never commit hardcoded API keys to version control. 61 | -------------------------------------------------------------------------------- /src/zeroeval/core/run_collection.py: -------------------------------------------------------------------------------- 1 | """RunCollection class for elegant multi-run operations.""" 2 | from typing import List, Union, Callable, Optional, Any 3 | 4 | 5 | class RunCollection: 6 | """A collection of runs that provides a fluent interface for batch operations.""" 7 | 8 | def __init__(self, runs: List["Run"]): 9 | """Initialize with a list of Run objects. 10 | 11 | Args: 12 | runs: List of Run objects to manage 13 | """ 14 | if not runs: 15 | raise ValueError("RunCollection requires at least one run") 16 | self.runs = runs 17 | 18 | def eval(self, evaluators: List[Union[Callable, Any]]) -> "RunCollection": 19 | """Apply evaluators to all runs in the collection. 
20 | 21 | Args: 22 | evaluators: List of evaluators to apply 23 | 24 | Returns: 25 | Self for method chaining 26 | """ 27 | for run in self.runs: 28 | run.eval(evaluators) 29 | return self 30 | 31 | def column_metrics(self, metrics: List[Union[Callable, Any]]) -> "RunCollection": 32 | """Apply column metrics to all runs in the collection. 33 | 34 | Args: 35 | metrics: List of column metrics to apply 36 | 37 | Returns: 38 | Self for method chaining 39 | """ 40 | for run in self.runs: 41 | run.column_metrics(metrics) 42 | return self 43 | 44 | def run_metrics(self, metrics: List[Union[Callable, Any]]) -> "RunCollection": 45 | """Apply run metrics across all runs in the collection. 46 | 47 | Args: 48 | metrics: List of run metrics to apply 49 | 50 | Returns: 51 | Self for method chaining 52 | """ 53 | # Run metrics need access to all runs, so we pass them to the first run 54 | if self.runs: 55 | self.runs[0].run_metrics(metrics, self.runs) 56 | return self 57 | 58 | def __len__(self) -> int: 59 | """Return the number of runs in the collection.""" 60 | return len(self.runs) 61 | 62 | def __getitem__(self, index: int) -> "Run": 63 | """Get a specific run by index.""" 64 | return self.runs[index] 65 | 66 | def __iter__(self): 67 | """Iterate over runs in the collection.""" 68 | return iter(self.runs) 69 | 70 | def __repr__(self) -> str: 71 | """String representation of the collection.""" 72 | return f"RunCollection({len(self.runs)} runs)" 73 | 74 | @property 75 | def first(self) -> "Run": 76 | """Get the first run in the collection.""" 77 | return self.runs[0] if self.runs else None 78 | 79 | @property 80 | def last(self) -> "Run": 81 | """Get the last run in the collection.""" 82 | return self.runs[-1] if self.runs else None 83 | 84 | def to_list(self) -> List["Run"]: 85 | """Convert back to a list of runs if needed.""" 86 | return self.runs 87 | -------------------------------------------------------------------------------- /tests/core/test_tracer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from zeroeval.observability import span 4 | from zeroeval.observability.tracer import Tracer 5 | 6 | 7 | @pytest.mark.core 8 | def test_create_simple_trace(tracer: Tracer): 9 | """Tests that a simple parent-child trace is created and flushed correctly.""" 10 | # Act 11 | with span(name="parent"): 12 | with span(name="child"): 13 | pass 14 | 15 | tracer.flush() 16 | 17 | # Assert 18 | mock_writer = tracer._writer 19 | assert len(mock_writer.spans) == 2 20 | 21 | parent = next(s for s in mock_writer.spans if s["name"] == "parent") 22 | child = next(s for s in mock_writer.spans if s["name"] == "child") 23 | 24 | assert parent["parent_id"] is None 25 | assert child["parent_id"] == parent["span_id"] 26 | assert child["trace_id"] == parent["trace_id"] 27 | 28 | 29 | @pytest.mark.core 30 | def test_tracer_is_thread_safe(tracer: Tracer): 31 | """Tests that the tracer handles spans from multiple threads correctly.""" 32 | import threading 33 | 34 | def create_trace(name: str): 35 | with span(name=f"parent-{name}"): 36 | with span(name=f"child-{name}"): 37 | pass 38 | 39 | threads = [] 40 | for i in range(5): 41 | thread = threading.Thread(target=create_trace, args=(f"thread-{i}",)) 42 | threads.append(thread) 43 | thread.start() 44 | 45 | for thread in threads: 46 | thread.join() 47 | 48 | tracer.flush() 49 | 50 | mock_writer = tracer._writer 51 | assert len(mock_writer.spans) == 10 # 5 threads * 2 spans 52 | 53 | # Check that each trace is 
consistent 54 | for i in range(5): 55 | parent = next(s for s in mock_writer.spans if s["name"] == f"parent-thread-{i}") 56 | child = next(s for s in mock_writer.spans if s["name"] == f"child-thread-{i}") 57 | assert child["parent_id"] == parent["span_id"] 58 | assert child["trace_id"] == parent["trace_id"] 59 | 60 | 61 | @pytest.mark.core 62 | def test_tracer_shutdown(tracer: Tracer): 63 | """Tests that the tracer stops accepting spans after shutdown.""" 64 | with span(name="span_before_shutdown"): 65 | pass 66 | 67 | tracer.shutdown() 68 | 69 | # This span should be ignored and return a no-op span 70 | with span(name="span_after_shutdown") as s: 71 | assert s.name == "noop_span" 72 | 73 | # The flush in shutdown should have sent the first span. 74 | mock_writer = tracer._writer 75 | assert len(mock_writer.spans) == 1 76 | assert mock_writer.spans[0]["name"] == "span_before_shutdown" 77 | 78 | 79 | @pytest.mark.core 80 | def test_auto_flush_on_max_spans(tracer: Tracer): 81 | """Tests that the buffer is flushed automatically when it reaches max capacity.""" 82 | tracer._max_spans = 5 # Set a low limit for testing 83 | 84 | for i in range(5): 85 | # Each trace is 1 span, so it shouldn't trigger the flush until the 5th one. 86 | with span(name=f"span-{i}"): 87 | pass 88 | 89 | # The 5th span should trigger a flush 90 | mock_writer = tracer._writer 91 | assert len(mock_writer.spans) == 5 92 | 93 | # Another span should not be in the buffer yet 94 | with span(name="one_more"): 95 | pass 96 | 97 | assert len(mock_writer.spans) == 5 98 | tracer.flush() 99 | assert len(mock_writer.spans) == 6 100 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # ZeroEval SDK Tests 2 | 3 | Simple test structure for the ZeroEval SDK across Python 3.9-3.13. 4 | 5 | ## Test Categories 6 | 7 | - **Core**: Essential functionality tests (`tests/core/`) 8 | - **Performance**: CPU usage and memory leak detection (`tests/performance/`) 9 | 10 | ## Setup 11 | 12 | **Important setup steps:** 13 | 14 | 1. **Unset conflicting PYTHONPATH** (critical for proper isolation): 15 | 16 | ```bash 17 | unset PYTHONPATH 18 | ``` 19 | 20 | 2. **Install dev dependencies** (includes tox for multi-version testing): 21 | 22 | ```bash 23 | uv sync --group dev 24 | ``` 25 | 26 | ## Running Tests 27 | 28 | ### Current Python Version (uv) 29 | 30 | ```bash 31 | # Run only core tests 32 | uv run pytest tests/core/ 33 | 34 | # Run performance tests 35 | uv run pytest tests/performance/ --runperformance 36 | 37 | # Run all tests (core + performance) 38 | uv run pytest tests/ --runperformance 39 | ``` 40 | 41 | ### Multiple Python Versions (tox) 42 | 43 | ```bash 44 | # Test core functionality across all available Python versions 45 | uv run tox -e py{39,310,311,312,313}-core 46 | 47 | # Test performance across all available Python versions 48 | uv run tox -e py{39,310,311,312,313}-perf 49 | 50 | # Test everything on all available versions 51 | uv run tox 52 | ``` 53 | 54 | **Note**: Python 3.7 and 3.8 will be skipped if not installed. The SDK has been tested on Python 3.9-3.13. 55 | 56 | ### Quick aliases 57 | 58 | ```bash 59 | # Current Python version only 60 | uv run tox -e core # Core tests 61 | uv run tox -e perf # Performance tests 62 | uv run tox -e all # Core + performance 63 | ``` 64 | 65 | ## Performance Tests 66 | 67 | Performance tests are **skipped by default** to keep normal test runs fast. 
Enable them with `--runperformance`. 68 | 69 | These tests check for: 70 | 71 | - CPU performance regressions (>500 spans/sec) 72 | - Memory leaks (<2100 object growth, varies by Python version) 73 | - Concurrent access efficiency 74 | - Buffer management efficiency 75 | - Deep nesting performance 76 | 77 | ## Project Isolation 78 | 79 | The SDK is properly isolated from the backend using: 80 | 81 | - uv project management 82 | - Isolated virtual environments 83 | - Proper PYTHONPATH configuration in `pyproject.toml` 84 | - Independent dependency management 85 | 86 | ## Directory Layout 87 | 88 | ``` 89 | tests/ 90 | ├── conftest.py # Shared fixtures and configuration 91 | ├── core/ # Essential functionality 92 | │ ├── test_tracer.py # Tracer singleton, spans, flushing 93 | │ └── test_decorator.py # @span decorator, context manager 94 | └── performance/ # Performance and memory 95 | └── test_span_performance.py # CPU, memory, concurrency tests 96 | ``` 97 | 98 | ## Example Output 99 | 100 | ```bash 101 | $ uv sync --group dev # First time setup 102 | $ uv run pytest tests/ --runperformance -v 103 | ============== 13 passed in 4.14s ============== 104 | 105 | 7 core tests + 6 performance tests ✓ 106 | 107 | $ uv run tox # Multi-version testing 108 | ============== Results ============== 109 | ✓ py39-core: 7 passed 110 | ✓ py39-perf: 6 passed 111 | ✓ py310-core: 7 passed 112 | ✓ py310-perf: 6 passed 113 | ✓ py311-core: 7 passed 114 | ✓ py311-perf: 6 passed 115 | ✓ py312-core: 7 passed 116 | ✓ py312-perf: 6 passed 117 | ✓ py313-core: 7 passed 118 | ✓ py313-perf: 6 passed 119 | ``` 120 | -------------------------------------------------------------------------------- /.cursor/rules/documentation.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: Documentation requirements for Python SDK changes - update docs with examples 3 | globs: ["**/*.py"] 4 | alwaysApply: true 5 | --- 6 | 7 | # Documentation Guidelines 8 | 9 | ## Documentation Requirement 10 | 11 | **For every change in the Python SDK, check the @docs/ folder and document the changes.** 12 | 13 | ## Documentation Location 14 | 15 | SDK documentation is located at: 16 | 17 | - `/docs/tracing/sdks/python/` - Main Python SDK docs 18 | - `/docs/tracing/sdks/python/setup.mdx` - Setup instructions 19 | - `/docs/tracing/sdks/python/reference.mdx` - API reference 20 | - `/docs/tracing/sdks/python/integrations.mdx` - Integration guides 21 | 22 | ## Documentation Standards 23 | 24 | **The documentation should be simple to understand, detailed, and with a mini example. Always readable and concise.** 25 | 26 | ## Documentation Structure 27 | 28 | ### For New Features 29 | 30 | ````mdx 31 | ## Feature Name 32 | 33 | Brief description of what this feature does and why it's useful. 
34 | 35 | ### Setup 36 | 37 | ```python 38 | from zeroeval import FeatureName 39 | 40 | # Basic setup 41 | feature = FeatureName(api_key="your_key") 42 | ``` 43 | ```` 44 | 45 | ### Usage 46 | 47 | ```python 48 | # Simple example 49 | result = feature.execute( 50 | input_data="example", 51 | options={"param": "value"} 52 | ) 53 | 54 | print(result) 55 | ``` 56 | 57 | ### Parameters 58 | 59 | | Parameter | Type | Required | Description | 60 | | ------------ | ------ | -------- | ---------------------------- | 61 | | `input_data` | `str` | Yes | Description of the parameter | 62 | | `options` | `dict` | No | Optional configuration | 63 | 64 | ### Returns 65 | 66 | Returns a `FeatureResult` object with the following properties: 67 | 68 | - `data`: The processed result 69 | - `metadata`: Additional information about the operation 70 | 71 | ```` 72 | 73 | ### For API Changes 74 | - Update the API reference section 75 | - Add deprecation warnings if needed 76 | - Update version information 77 | - Include migration examples 78 | 79 | ### For Integration Changes 80 | - Update relevant integration guide 81 | - Add new integration examples 82 | - Update compatibility matrix 83 | 84 | ## Example Documentation Update 85 | 86 | When adding a new tracing feature: 87 | 88 | ```mdx 89 | ## Manual Span Creation 90 | 91 | Create custom spans to track specific operations in your application. 92 | 93 | ### Basic Usage 94 | 95 | ```python 96 | from zeroeval import tracer 97 | 98 | # Create a custom span 99 | with tracer.span("custom_operation", tags={"user_id": "123"}) as span: 100 | # Your operation here 101 | result = perform_operation() 102 | span.set_attribute("result_count", len(result)) 103 | ```` 104 | 105 | ### Advanced Usage 106 | 107 | ```python 108 | # Nested spans with custom attributes 109 | with tracer.span("parent_operation") as parent: 110 | parent.set_attribute("operation_type", "batch_process") 111 | 112 | with tracer.span("child_operation", parent=parent) as child: 113 | child.set_attribute("item_count", 10) 114 | process_items() 115 | ``` 116 | 117 | ``` 118 | 119 | ## Checklist 120 | 121 | When making SDK changes: 122 | - [ ] Identify affected documentation sections 123 | - [ ] Update relevant .mdx files 124 | - [ ] Add mini examples for new features 125 | - [ ] Ensure examples are tested and work 126 | - [ ] Keep language simple and clear 127 | - [ ] Include parameter descriptions 128 | - [ ] Add return value documentation 129 | ``` 130 | -------------------------------------------------------------------------------- /src/zeroeval/observability/integrations/httpx/README.md: -------------------------------------------------------------------------------- 1 | # HttpX Integration 2 | 3 | This integration provides network-level tracing for HTTP requests made through the `httpx` library, with special support for LLM provider APIs. 4 | 5 | ## Features 6 | 7 | - **Automatic Request Interception**: Patches httpx Client and AsyncClient to intercept all HTTP requests 8 | - **Selective Tracing**: Only traces requests to supported LLM provider endpoints 9 | - **Gemini API Support**: Full support for Google Gemini REST API including: 10 | - generateContent endpoint 11 | - streamGenerateContent endpoint (with SSE streaming support) 12 | - Function/tool calling detection 13 | - Usage metadata extraction 14 | - Safety ratings capture 15 | 16 | ## How It Works 17 | 18 | The integration works by: 19 | 20 | 1. Patching httpx's `request` method on both sync and async clients 21 | 2. 
Filtering requests by URL pattern to identify LLM API calls 22 | 3. Parsing request/response payloads to extract relevant information 23 | 4. Creating LLM spans with appropriate attributes 24 | 5. Creating child tool spans for function calls 25 | 26 | ## Supported Endpoints 27 | 28 | Currently supports: 29 | 30 | - **Google Gemini API**: `https://generativelanguage.googleapis.com/v*/models/*:generateContent` 31 | - **Google Gemini Streaming**: `https://generativelanguage.googleapis.com/v*/models/*:streamGenerateContent` 32 | 33 | ## Usage 34 | 35 | The integration is automatically enabled when httpx is installed. You can also explicitly enable it: 36 | 37 | ```python 38 | import ze 39 | ze.init( 40 | api_key="your_api_key", 41 | integrations=["HttpxIntegration"] 42 | ) 43 | ``` 44 | 45 | Or disable it: 46 | 47 | ```python 48 | ze.init( 49 | api_key="your_api_key", 50 | disabled_integrations=["httpx"] 51 | ) 52 | ``` 53 | 54 | ## Example 55 | 56 | ```python 57 | import httpx 58 | import ze 59 | 60 | ze.init(api_key="your_key") 61 | 62 | # Make a direct API call to Gemini 63 | with httpx.Client() as client: 64 | response = client.post( 65 | "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent", 66 | headers={"x-goog-api-key": "your_gemini_key"}, 67 | json={ 68 | "contents": [{"parts": [{"text": "Hello"}]}], 69 | "generationConfig": {"temperature": 0.7} 70 | } 71 | ) 72 | ``` 73 | 74 | ## Captured Attributes 75 | 76 | For Gemini API calls, the integration captures: 77 | 78 | **Request Attributes:** 79 | 80 | - Model name 81 | - Contents (messages) 82 | - Generation config (temperature, max_output_tokens, etc.) 83 | - Tools and tool configuration 84 | - System instruction 85 | - Cached content reference 86 | 87 | **Response Attributes:** 88 | 89 | - Output text or function calls 90 | - Token usage (input, output, total) 91 | - Finish reason 92 | - Safety ratings 93 | - Model version 94 | - Response ID 95 | - Throughput (chars/second) 96 | 97 | ## Streaming Support 98 | 99 | The integration fully supports streaming responses through Server-Sent Events (SSE). It accumulates chunks and creates the span once streaming completes, including total token usage. 100 | 101 | ## Extensibility 102 | 103 | The integration is designed to be easily extensible to support other LLM providers. To add support for a new provider: 104 | 105 | 1. Add URL pattern matching in `_should_trace_request()` 106 | 2. Add request parsing in a new method like `_parse_provider_request()` 107 | 3. Add response parsing in a new method like `_parse_provider_response()` 108 | 4. Update the wrapper methods to route to appropriate parsers based on URL 109 | -------------------------------------------------------------------------------- /src/zeroeval/observability/integrations/registry.py: -------------------------------------------------------------------------------- 1 | """ 2 | Integration registry for lazy loading. 3 | 4 | This module provides a registry system for integrations that avoids importing 5 | all integration modules at startup, reducing initialization overhead. 
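Integrations are looked up by name in INTEGRATION_REGISTRY and are only imported after a cheap importlib.util.find_spec() check confirms that the required third-party package is installed.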
6 | """ 7 | 8 | from typing import Dict, Type, Callable, Optional 9 | import importlib 10 | import logging 11 | 12 | from .base import Integration 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | # Registry of integration name -> (module_path, class_name, package_name) 18 | INTEGRATION_REGISTRY: Dict[str, tuple[str, str, str]] = { 19 | "OpenAIIntegration": ( 20 | "zeroeval.observability.integrations.openai.integration", 21 | "OpenAIIntegration", 22 | "openai" 23 | ), 24 | "LangChainIntegration": ( 25 | "zeroeval.observability.integrations.langchain.integration", 26 | "LangChainIntegration", 27 | "langchain_core" 28 | ), 29 | "LangGraphIntegration": ( 30 | "zeroeval.observability.integrations.langgraph.integration", 31 | "LangGraphIntegration", 32 | "langgraph" 33 | ), 34 | "GeminiIntegration": ( 35 | "zeroeval.observability.integrations.gemini.integration", 36 | "GeminiIntegration", 37 | "google.genai" 38 | ), 39 | "HttpxIntegration": ( 40 | "zeroeval.observability.integrations.httpx.integration", 41 | "HttpxIntegration", 42 | "httpx" 43 | ), 44 | "VocodeIntegration": ( 45 | "zeroeval.observability.integrations.vocode.integration", 46 | "VocodeIntegration", 47 | "vocode" 48 | ), 49 | 50 | } 51 | 52 | 53 | def is_package_available(package_name: str) -> bool: 54 | """Check if a package is available without importing it fully.""" 55 | try: 56 | # Use importlib.util to check without actually importing 57 | import importlib.util 58 | spec = importlib.util.find_spec(package_name.split('.')[0]) 59 | return spec is not None 60 | except (ImportError, AttributeError, ValueError): 61 | return False 62 | 63 | 64 | def get_integration_class(integration_name: str) -> Optional[Type[Integration]]: 65 | """ 66 | Lazily load and return an integration class. 67 | 68 | This avoids importing integration modules until they're actually needed. 69 | """ 70 | if integration_name not in INTEGRATION_REGISTRY: 71 | logger.warning(f"Unknown integration: {integration_name}") 72 | return None 73 | 74 | module_path, class_name, package_name = INTEGRATION_REGISTRY[integration_name] 75 | 76 | # First check if the required package is available 77 | if not is_package_available(package_name): 78 | logger.debug(f"Package {package_name} not available for {integration_name}") 79 | return None 80 | 81 | try: 82 | # Only import the integration module if the package is available 83 | module = importlib.import_module(module_path) 84 | integration_class = getattr(module, class_name) 85 | return integration_class 86 | except Exception as e: 87 | logger.debug(f"Failed to load integration {integration_name}: {e}") 88 | return None 89 | 90 | 91 | def get_available_integrations() -> Dict[str, Type[Integration]]: 92 | """ 93 | Get all available integration classes. 94 | 95 | This lazily checks and loads only the integrations whose packages are installed. 
96 | """ 97 | available = {} 98 | for integration_name in INTEGRATION_REGISTRY: 99 | integration_class = get_integration_class(integration_name) 100 | if integration_class: 101 | available[integration_name] = integration_class 102 | return available -------------------------------------------------------------------------------- /src/zeroeval/core/evaluation.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import inspect 3 | from typing import Any, Callable, Optional, Union, List, Dict 4 | from enum import Enum 5 | 6 | 7 | class EvaluationMode(Enum): 8 | ROW = "row" 9 | COLUMN = "column" 10 | RUN = "run" 11 | 12 | 13 | class Evaluation: 14 | """Represents a registered evaluation function with metadata.""" 15 | 16 | def __init__( 17 | self, 18 | func: Callable, 19 | mode: EvaluationMode, 20 | outputs: List[str], 21 | name: Optional[str] = None, 22 | description: Optional[str] = None 23 | ): 24 | self.func = func 25 | self.mode = mode 26 | self.outputs = outputs 27 | self.name = name or func.__name__ 28 | self.description = description or func.__doc__ or "" 29 | self._code = inspect.getsource(func) 30 | 31 | def __call__(self, *args, **kwargs): 32 | """Make the evaluation callable.""" 33 | return self.func(*args, **kwargs) 34 | 35 | def __repr__(self): 36 | return f"Evaluation({self.name}, mode={self.mode.value})" 37 | 38 | 39 | # Global registry for evaluations 40 | _registered_evaluations: Dict[str, Evaluation] = {} 41 | 42 | 43 | def evaluation( 44 | mode: str = "row", 45 | outputs: Optional[List[str]] = None, 46 | name: Optional[str] = None 47 | ) -> Callable: 48 | """ 49 | Decorator to register an evaluation function. 50 | 51 | Args: 52 | mode: Evaluation mode - "row", "column", or "run" 53 | outputs: List of output field names this evaluation produces 54 | name: Optional custom name for the evaluation 55 | 56 | Examples: 57 | # Row evaluation - gets full row access 58 | @evaluation(mode="row", outputs=["exact_match"]) 59 | def exact_match(row): 60 | return {"exact_match": int(row["prediction"] == row["answer"])} 61 | 62 | # Column evaluation - gets all rows 63 | @evaluation(mode="column", outputs=["f1_score"]) 64 | def f1_score(dataset): 65 | predictions = [row["prediction"] for row in dataset] 66 | labels = [row["label"] for row in dataset] 67 | return {"f1_score": calculate_f1(predictions, labels)} 68 | 69 | # Run evaluation - gets all runs 70 | @evaluation(mode="run", outputs=["pass_at_3"]) 71 | def pass_at_k(runs, k=3): 72 | # Evaluate across multiple runs 73 | return {"pass_at_3": calculate_pass_at_k(runs, k)} 74 | """ 75 | if outputs is None: 76 | outputs = [] 77 | 78 | def decorator(func: Callable) -> Evaluation: 79 | eval_mode = EvaluationMode(mode) 80 | 81 | # Create evaluation wrapper 82 | eval_obj = Evaluation( 83 | func=func, 84 | mode=eval_mode, 85 | outputs=outputs, 86 | name=name or func.__name__ 87 | ) 88 | 89 | # Register in global registry 90 | _registered_evaluations[eval_obj.name] = eval_obj 91 | 92 | # Add metadata to the function 93 | func._is_evaluation = True 94 | func._evaluation_mode = eval_mode 95 | func._outputs = outputs 96 | 97 | return eval_obj 98 | 99 | return decorator 100 | 101 | 102 | def get_evaluation(name: str) -> Optional[Evaluation]: 103 | """Get a registered evaluation by name.""" 104 | return _registered_evaluations.get(name) 105 | 106 | 107 | def get_all_evaluations() -> Dict[str, Evaluation]: 108 | """Get all registered evaluations.""" 109 | return _registered_evaluations.copy() 110 
| 111 | 112 | def clear_evaluations(): 113 | """Clear all registered evaluations (useful for testing).""" 114 | _registered_evaluations.clear() -------------------------------------------------------------------------------- /src/zeroeval/observability/integrations/base.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | from abc import ABC, abstractmethod 3 | from typing import Any, Callable, Optional 4 | 5 | 6 | class Integration(ABC): 7 | """Base class for all tracing integrations.""" 8 | 9 | # Required package for this integration 10 | PACKAGE_NAME: str = None 11 | 12 | def __init__(self, tracer): 13 | self.tracer = tracer 14 | self._original_functions: dict[str, Callable] = {} 15 | self._setup_attempted = False 16 | self._setup_successful = False 17 | self._setup_error = None 18 | 19 | @classmethod 20 | def is_available(cls) -> bool: 21 | """Check if the required package is installed.""" 22 | if cls.PACKAGE_NAME is None: 23 | return False 24 | try: 25 | importlib.import_module(cls.PACKAGE_NAME) 26 | return True 27 | except ImportError: 28 | return False 29 | 30 | @abstractmethod 31 | def setup(self) -> None: 32 | """Setup the integration by applying all necessary patches.""" 33 | pass 34 | 35 | def safe_setup(self) -> bool: 36 | """Safely attempt to setup the integration, catching and storing any errors.""" 37 | if self._setup_attempted: 38 | return self._setup_successful 39 | 40 | self._setup_attempted = True 41 | try: 42 | self.setup() 43 | self._setup_successful = True 44 | return True 45 | except Exception as exc: 46 | self._setup_error = exc 47 | self._setup_successful = False 48 | return False 49 | 50 | def get_setup_error(self) -> Optional[Exception]: 51 | """Get the error that occurred during setup, if any.""" 52 | return self._setup_error 53 | 54 | def teardown(self) -> None: 55 | """Teardown the integration by removing all patches.""" 56 | for key, original_func in self._original_functions.items(): 57 | obj_name, method_name = key.rsplit('.', 1) 58 | try: 59 | obj = self._get_object_by_path(obj_name) 60 | setattr(obj, method_name, original_func) 61 | except: 62 | pass 63 | self._original_functions.clear() 64 | 65 | def _patch_method(self, target_object: Any, method_name: str, wrapper: Callable) -> None: 66 | """Helper method to patch an object's method.""" 67 | original = getattr(target_object, method_name) 68 | 69 | # Skip if already patched by ZeroEval 70 | if getattr(original, "__ze_patched__", False): 71 | return 72 | 73 | # Identify the patched object name in a readable / unique way. 
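        # The resulting "<ObjectName>.<method_name>" key is what teardown() and
        # _unpatch_method() later use to look up and restore the original callable.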
74 | if isinstance(target_object, type): 75 | obj_name = target_object.__name__ # class name 76 | else: 77 | obj_name = target_object.__class__.__name__ # instance name 78 | 79 | self._original_functions[f"{obj_name}.{method_name}"] = original 80 | 81 | patched = wrapper(original) 82 | # Mark so we can recognise it later and avoid double wrapping 83 | patched.__ze_patched__ = True 84 | 85 | setattr(target_object, method_name, patched) 86 | 87 | def _unpatch_method(self, target_object: Any, method_name: str) -> None: 88 | """Helper method to restore an object's original method.""" 89 | key = f"{target_object.__class__.__name__}.{method_name}" 90 | if key in self._original_functions: 91 | setattr(target_object, method_name, self._original_functions[key]) 92 | del self._original_functions[key] 93 | 94 | def _get_object_by_path(self, obj_path: str) -> Any: 95 | """Helper to get an object by its module path.""" 96 | module_path, obj_name = obj_path.rsplit('.', 1) 97 | module = importlib.import_module(module_path) 98 | return getattr(module, obj_name) -------------------------------------------------------------------------------- /src/zeroeval/observability/signals.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Union 4 | 5 | import requests 6 | 7 | from .span import Span 8 | from .tracer import tracer 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def _send_signals_immediately( 14 | entity_type: str, entity_id: str, signals: dict[str, Union[str, bool, int, float]] 15 | ) -> bool: 16 | """ 17 | Private helper to send a batch of signals for a single entity immediately. 18 | """ 19 | # Get configuration from environment 20 | api_url = os.getenv("ZEROEVAL_API_URL", "https://api.zeroeval.com") 21 | api_key = os.getenv("ZEROEVAL_API_KEY") 22 | 23 | if not all([api_key, api_url]): 24 | logger.warning( 25 | "Cannot send signals. Missing ZEROEVAL_API_KEY or ZEROEVAL_API_URL." 26 | ) 27 | return False 28 | 29 | # Prepare payload 30 | endpoint = f"{api_url}/signals/bulk" 31 | headers = { 32 | "Authorization": f"Bearer {api_key}", 33 | "Content-Type": "application/json", 34 | } 35 | api_payloads = [ 36 | { 37 | "entity_type": entity_type, 38 | "entity_id": entity_id, 39 | "name": name, 40 | "value": value, 41 | "signal_type": "numerical" 42 | if isinstance(value, (int, float)) 43 | else "boolean", 44 | } 45 | for name, value in signals.items() 46 | ] 47 | payload = {"signals": api_payloads} 48 | 49 | # Send immediately and log status 50 | try: 51 | logger.info( 52 | f"Sending {len(signals)} signals for {entity_type}:{entity_id} immediately..." 53 | ) 54 | response = requests.post(endpoint, json=payload, headers=headers, timeout=5.0) 55 | logger.info( 56 | f"Response for {entity_type}:{entity_id}: HTTP {response.status_code}" 57 | ) 58 | response.raise_for_status() # Raise exception for 4xx/5xx errors 59 | return True 60 | except requests.exceptions.RequestException as e: 61 | logger.error(f"Failed to send signals for {entity_type}:{entity_id}: {e}") 62 | return False 63 | 64 | 65 | def set_signal( 66 | target: Union[Span, str], signals: dict[str, Union[str, bool, int, float]] 67 | ) -> bool: 68 | """ 69 | Send signals immediately for a given span, trace, or session. 70 | 71 | This is a fire-and-forget operation that sends signals directly to the 72 | ZeroEval backend, independent of the span flushing mechanism. 
73 | 74 | For Span targets, signals are ALSO attached to the span object itself 75 | so they get sent together with the span payload, enabling linkage to AB choices. 76 | 77 | Args: 78 | target: The entity to attach signals to. Can be a `Span` object, 79 | a `trace_id` string, or a `session_id` string. 80 | signals: A dictionary of signal names to values. 81 | 82 | Returns: 83 | True if the signals were sent successfully, False otherwise. 84 | """ 85 | if not isinstance(signals, dict) or not signals: 86 | logger.warning("No signals provided, nothing to send.") 87 | return True 88 | 89 | # Determine entity type and ID from the target 90 | entity_type = "session" # Default assumption 91 | entity_id = None 92 | 93 | if isinstance(target, Span): 94 | entity_type = "span" 95 | entity_id = target.span_id 96 | 97 | # CRITICAL: Also attach signals to the span object itself 98 | # This ensures signals are sent WITH the span payload for AB choice linkage 99 | for signal_name, signal_value in signals.items(): 100 | target.set_signal(signal_name, signal_value) 101 | 102 | logger.debug(f"Attached {len(signals)} signals to span {entity_id} for AB choice linkage") 103 | 104 | elif isinstance(target, str): 105 | entity_id = target 106 | if tracer.is_active_trace(target): 107 | entity_type = "trace" 108 | else: 109 | raise TypeError( 110 | f"Unsupported target type '{type(target).__name__}' for signal. Must be Span or str." 111 | ) 112 | 113 | return _send_signals_immediately(entity_type, entity_id, signals) 114 | -------------------------------------------------------------------------------- /examples_v2/ab_testing/openai_ab_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | OpenAI A/B Testing Example 4 | ========================== 5 | 6 | This example shows how to use ze.choose() to A/B test between different 7 | OpenAI models with timeboxed experiments and signal tracking. 
8 | 9 | Features: 10 | - Timeboxed experiment with duration_days 11 | - Model selection with weighted distribution 12 | - Signal tracking for response quality 13 | - Automatic choice recording 14 | """ 15 | 16 | import os 17 | from pathlib import Path 18 | 19 | from dotenv import load_dotenv 20 | 21 | # Load environment variables BEFORE importing zeroeval 22 | env_path = Path(__file__).parent.parent / ".env" 23 | load_dotenv(env_path) 24 | 25 | import openai 26 | import zeroeval as ze 27 | 28 | def main(): 29 | 30 | # Initialize ZeroEval (automatically picks up environment variables) 31 | ze.init() 32 | 33 | # Initialize OpenAI client 34 | client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY")) 35 | 36 | # IMPORTANT: ze.choose() must be called within a span context 37 | # Signals attached to this span will be linked to the AB test variant 38 | with ze.span("model_ab_test", tags={"feature": "model_comparison"}) as test_span: 39 | 40 | # Use ze.choose() to select between two models 41 | # This attaches the ab_choice_id to test_span for signal linkage 42 | # 70% chance for gpt-4o-mini (faster/cheaper), 30% for gpt-4o (more capable) 43 | # The experiment runs for 7 days and automatically stops accepting new choices 44 | selected_model = ze.choose( 45 | name="model_selection", 46 | variants={ 47 | "mini": "gpt-4o-mini", 48 | "full": "gpt-4o" 49 | }, 50 | weights={ 51 | "mini": 0.7, # 70% chance 52 | "full": 0.3 # 30% chance 53 | }, 54 | duration_days=7, # Run experiment for 1 week 55 | default_variant="mini" # Use mini as fallback after experiment ends 56 | ) 57 | 58 | print(f"🤖 Selected model: {selected_model}") 59 | 60 | # Make the API call with the selected model 61 | with ze.span("llm_call", tags={"selected_model": selected_model}): 62 | response = client.chat.completions.create( 63 | model=selected_model, 64 | messages=[ 65 | {"role": "system", "content": "You are a helpful assistant that explains things concisely."}, 66 | {"role": "user", "content": "Explain the concept of A/B testing in one paragraph."} 67 | ], 68 | temperature=0.1, 69 | max_tokens=150 70 | ) 71 | 72 | # Display the results 73 | answer = response.choices[0].message.content 74 | print(f"\n📝 Response:\n{answer}\n") 75 | 76 | # Track response quality with signals on the test_span 77 | # CRITICAL: Attach signals to the SAME SPAN where ze.choose() was called 78 | # The backend will automatically link these signals to the AB test variant 79 | # via the span_ab_choices junction table for aggregated analytics 80 | response_length = len(answer) 81 | is_concise = response_length <= 200 # Good responses should be concise 82 | has_good_length = 100 <= response_length <= 200 83 | 84 | ze.set_signal(test_span, { 85 | "response_quality": is_concise, 86 | "appropriate_length": has_good_length, 87 | "highly_effective": is_concise and has_good_length # Both conditions met 88 | }) 89 | 90 | print(f"\n🎯 Signals Tracked:") 91 | print(f" ✓ response_quality: {is_concise} (≤200 chars)") 92 | print(f" ✓ appropriate_length: {has_good_length} (100-200 chars)") 93 | print(f" ✓ highly_effective: {is_concise and has_good_length}") 94 | print(f"\n Length: {response_length} chars") 95 | print(f" 💡 These signals are automatically linked to the AB test variant!") 96 | print(f"\n📊 View the Signal Distribution chart in the ZeroEval dashboard!") 97 | print(f" Dashboard path: Monitoring → A/B Testing → model_selection") 98 | 99 | if __name__ == "__main__": 100 | main() -------------------------------------------------------------------------------- 
/src/zeroeval/observability/utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions for the observability module.""" 2 | 3 | import json 4 | import uuid 5 | from datetime import datetime, date 6 | from decimal import Decimal 7 | from typing import Any 8 | import logging 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class SafeJSONEncoder(json.JSONEncoder): 14 | """ 15 | Custom JSON encoder that handles non-serializable types gracefully. 16 | 17 | This encoder converts common non-serializable types to their string representations 18 | to prevent serialization errors during span flushing. 19 | """ 20 | 21 | def default(self, obj: Any) -> Any: 22 | """Convert non-serializable objects to serializable formats.""" 23 | try: 24 | # Handle UUID objects 25 | if isinstance(obj, uuid.UUID): 26 | return str(obj) 27 | 28 | # Handle datetime objects 29 | elif isinstance(obj, (datetime, date)): 30 | return obj.isoformat() 31 | 32 | # Handle Decimal objects 33 | elif isinstance(obj, Decimal): 34 | return float(obj) 35 | 36 | # Handle bytes 37 | elif isinstance(obj, bytes): 38 | try: 39 | return obj.decode('utf-8') 40 | except UnicodeDecodeError: 41 | # If not valid UTF-8, encode as base64 42 | import base64 43 | return base64.b64encode(obj).decode('utf-8') 44 | 45 | # Handle sets by converting to lists 46 | elif isinstance(obj, set): 47 | return list(obj) 48 | 49 | # Handle any object with a __dict__ attribute 50 | elif hasattr(obj, '__dict__'): 51 | return { 52 | '_type': obj.__class__.__name__, 53 | '_module': obj.__class__.__module__, 54 | 'data': obj.__dict__ 55 | } 56 | 57 | # Handle any object with a __str__ method 58 | elif hasattr(obj, '__str__'): 59 | return str(obj) 60 | 61 | # Let the base class raise the TypeError 62 | return super().default(obj) 63 | 64 | except Exception as e: 65 | # If all else fails, return a string representation 66 | logger.warning(f"Failed to serialize object of type {type(obj).__name__}: {e}") 67 | return f"<{type(obj).__name__} object>" 68 | 69 | 70 | def safe_json_dumps(obj: Any, **kwargs) -> str: 71 | """ 72 | Safely serialize an object to JSON, handling non-serializable types. 73 | 74 | Args: 75 | obj: The object to serialize 76 | **kwargs: Additional arguments to pass to json.dumps 77 | 78 | Returns: 79 | JSON string representation of the object 80 | """ 81 | # Use our custom encoder by default 82 | kwargs.setdefault('cls', SafeJSONEncoder) 83 | return json.dumps(obj, **kwargs) 84 | 85 | 86 | def sanitize_for_json(obj: Any) -> Any: 87 | """ 88 | Recursively sanitize an object to ensure it's JSON serializable. 89 | 90 | This function walks through nested structures and converts non-serializable 91 | objects to serializable formats. 
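    Unlike SafeJSONEncoder, which converts unrecognized types lazily at encode time,
    this helper pre-processes the entire structure up front so the result can be
    serialized by any standard JSON encoder.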
92 | 93 | Args: 94 | obj: The object to sanitize 95 | 96 | Returns: 97 | A JSON-serializable version of the object 98 | """ 99 | if obj is None or isinstance(obj, (str, int, float, bool)): 100 | return obj 101 | 102 | elif isinstance(obj, uuid.UUID): 103 | return str(obj) 104 | 105 | elif isinstance(obj, (datetime, date)): 106 | return obj.isoformat() 107 | 108 | elif isinstance(obj, Decimal): 109 | return float(obj) 110 | 111 | elif isinstance(obj, bytes): 112 | try: 113 | return obj.decode('utf-8') 114 | except UnicodeDecodeError: 115 | import base64 116 | return base64.b64encode(obj).decode('utf-8') 117 | 118 | elif isinstance(obj, dict): 119 | return {k: sanitize_for_json(v) for k, v in obj.items()} 120 | 121 | elif isinstance(obj, (list, tuple)): 122 | return [sanitize_for_json(item) for item in obj] 123 | 124 | elif isinstance(obj, set): 125 | return [sanitize_for_json(item) for item in obj] 126 | 127 | elif hasattr(obj, '__dict__'): 128 | return { 129 | '_type': obj.__class__.__name__, 130 | '_module': obj.__class__.__module__, 131 | 'data': sanitize_for_json(obj.__dict__) 132 | } 133 | 134 | else: 135 | # For any other type, convert to string 136 | try: 137 | return str(obj) 138 | except Exception: 139 | return f"<{type(obj).__name__} object>" 140 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: 🚀 SDK CI 2 | 3 | on: 4 | push: 5 | branches: [main, develop] 6 | pull_request: 7 | branches: [main, develop] 8 | 9 | jobs: 10 | quality: 11 | name: 🔍 Code Quality 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - name: 📥 Checkout code 16 | uses: actions/checkout@v4 17 | 18 | - name: 🐍 Install uv 19 | uses: astral-sh/setup-uv@v3 20 | with: 21 | version: "latest" 22 | 23 | - name: 🔧 Set up Python 24 | run: uv python install 3.12 25 | 26 | - name: 📦 Install dependencies 27 | run: uv sync --group dev 28 | 29 | - name: 🎨 Check Formatting 30 | run: | 31 | echo "::group::🎨 Ruff Formatting" 32 | uv run ruff format --check --diff . 33 | echo "::endgroup::" 34 | 35 | - name: 🔍 Run Linting 36 | run: | 37 | echo "::group::🔍 Ruff Linting" 38 | uv run ruff check . --output-format=github 39 | echo "::endgroup::" 40 | 41 | - name: 🔬 Type Checking 42 | run: | 43 | echo "::group::🔬 MyPy Type Checking" 44 | uv run mypy src/zeroeval --show-error-codes 45 | echo "::endgroup::" 46 | 47 | - name: ✅ Quality Summary 48 | if: success() 49 | run: | 50 | echo "::notice::✅ All code quality checks passed!" 
51 | echo "- Formatting: PASSED" 52 | echo "- Linting: PASSED" 53 | echo "- Type checking: PASSED" 54 | 55 | test: 56 | name: 🧪 Test Python ${{ matrix.python-version }} 57 | runs-on: ubuntu-latest 58 | strategy: 59 | fail-fast: false 60 | matrix: 61 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 62 | 63 | steps: 64 | - name: 📥 Checkout code 65 | uses: actions/checkout@v4 66 | 67 | - name: 🐍 Install uv 68 | uses: astral-sh/setup-uv@v3 69 | with: 70 | version: "latest" 71 | 72 | - name: 🔧 Set up Python ${{ matrix.python-version }} 73 | run: uv python install ${{ matrix.python-version }} 74 | 75 | - name: 📦 Install dependencies 76 | run: uv sync --group dev 77 | 78 | - name: 🧪 Run Core Tests 79 | run: | 80 | echo "::group::🧪 Core Tests" 81 | uv run pytest tests/core/ -v --tb=short 82 | echo "::endgroup::" 83 | 84 | - name: ⚡ Run Performance Tests 85 | run: | 86 | echo "::group::⚡ Performance Tests" 87 | uv run pytest tests/performance/ --runperformance -v --tb=short 88 | echo "::endgroup::" 89 | 90 | - name: 📊 Test Summary 91 | if: success() 92 | run: | 93 | echo "::notice::✅ All tests passed on Python ${{ matrix.python-version }}!" 94 | echo "- Core tests: PASSED" 95 | echo "- Performance tests: PASSED" 96 | 97 | test-matrix: 98 | name: 🎯 Multi-Python Matrix 99 | runs-on: ubuntu-latest 100 | 101 | steps: 102 | - name: 📥 Checkout code 103 | uses: actions/checkout@v4 104 | 105 | - name: 🐍 Install uv 106 | uses: astral-sh/setup-uv@v3 107 | with: 108 | version: "latest" 109 | 110 | - name: 🔧 Set up Python versions 111 | run: uv python install 3.9 3.10 3.11 3.12 3.13 112 | 113 | - name: 📦 Install dependencies 114 | run: uv sync --group dev 115 | 116 | - name: 🎯 Run Tox Matrix 117 | run: | 118 | echo "::group::🎯 Tox Multi-Python Testing" 119 | uv run tox --parallel auto 120 | echo "::endgroup::" 121 | 122 | - name: 📈 Matrix Summary 123 | if: success() 124 | run: | 125 | echo "::notice::✅ Multi-Python matrix testing completed!" 126 | echo "- Tested across Python 3.9-3.13" 127 | echo "- Core + Performance tests: PASSED" 128 | 129 | all-checks: 130 | name: ✅ All Checks 131 | runs-on: ubuntu-latest 132 | needs: [quality, test, test-matrix] 133 | if: always() 134 | 135 | steps: 136 | - name: 🎉 Success Summary 137 | if: ${{ needs.quality.result == 'success' && needs.test.result == 'success' && needs.test-matrix.result == 'success' }} 138 | run: | 139 | echo "::notice::🎉 All CI checks passed successfully!" 140 | echo "✅ Code Quality: PASSED" 141 | echo "✅ Individual Tests: PASSED" 142 | echo "✅ Matrix Tests: PASSED" 143 | echo "" 144 | echo "🚀 Ready to merge!" 145 | 146 | - name: ❌ Failure Summary 147 | if: ${{ needs.quality.result == 'failure' || needs.test.result == 'failure' || needs.test-matrix.result == 'failure' }} 148 | run: | 149 | echo "::error::❌ Some CI checks failed" 150 | echo "🔍 Code Quality: ${{ needs.quality.result }}" 151 | echo "🧪 Individual Tests: ${{ needs.test.result }}" 152 | echo "🎯 Matrix Tests: ${{ needs.test-matrix.result }}" 153 | echo "" 154 | echo "Please fix the failing checks before merging." 
155 | exit 1 156 | -------------------------------------------------------------------------------- /tests/performance/test_span_performance.py: -------------------------------------------------------------------------------- 1 | """Performance tests for span operations.""" 2 | 3 | import gc 4 | import threading 5 | import time 6 | 7 | import pytest 8 | 9 | from zeroeval.observability import span 10 | from zeroeval.observability.tracer import Tracer 11 | 12 | 13 | @pytest.mark.performance 14 | def test_many_spans_cpu_performance(tracer: Tracer): 15 | """Test CPU performance with many spans.""" 16 | num_spans = 1000 17 | 18 | start_time = time.time() 19 | 20 | for i in range(num_spans): 21 | with span(name=f"span_{i}"): 22 | pass 23 | 24 | tracer.flush() 25 | duration = time.time() - start_time 26 | 27 | # Should create 1000 spans in under 5 seconds 28 | assert duration < 5.0, f"Too slow: {duration:.2f}s for {num_spans} spans" 29 | assert len(tracer._writer.spans) == num_spans 30 | 31 | # Should achieve at least 200 spans/second 32 | spans_per_second = num_spans / duration 33 | assert spans_per_second > 200, f"Too slow: {spans_per_second:.1f} spans/sec" 34 | 35 | 36 | @pytest.mark.performance 37 | def test_memory_leak_detection(tracer: Tracer): 38 | """Test for memory leaks in span operations.""" 39 | 40 | def create_spans(): 41 | for i in range(100): 42 | with span(name=f"leak_test_{i}"): 43 | pass 44 | tracer.flush() 45 | 46 | # Run multiple batches and check for memory leaks 47 | gc.collect() 48 | initial_objects = len(gc.get_objects()) 49 | 50 | # Create 5 batches of 100 spans each 51 | for _ in range(5): 52 | create_spans() 53 | gc.collect() 54 | 55 | final_objects = len(gc.get_objects()) 56 | object_growth = final_objects - initial_objects 57 | 58 | # Adjusted threshold - allow for more growth due to test infrastructure and Python version differences 59 | assert object_growth < 2100, f"Memory leak detected: {object_growth} objects" 60 | assert len(tracer._writer.spans) == 500 61 | 62 | 63 | @pytest.mark.performance 64 | def test_concurrent_spans_performance(tracer: Tracer): 65 | """Test performance with concurrent spans.""" 66 | num_threads = 5 67 | spans_per_thread = 50 68 | 69 | def create_spans(thread_id): 70 | for i in range(spans_per_thread): 71 | with span(name=f"thread_{thread_id}_span_{i}"): 72 | pass 73 | 74 | start_time = time.time() 75 | 76 | threads = [] 77 | for i in range(num_threads): 78 | thread = threading.Thread(target=create_spans, args=(i,)) 79 | threads.append(thread) 80 | thread.start() 81 | 82 | for thread in threads: 83 | thread.join() 84 | 85 | tracer.flush() 86 | duration = time.time() - start_time 87 | 88 | total_spans = num_threads * spans_per_thread 89 | assert duration < 10.0, f"Concurrent spans too slow: {duration:.2f}s" 90 | assert len(tracer._writer.spans) == total_spans 91 | 92 | 93 | @pytest.mark.performance 94 | def test_span_creation_speed(tracer: Tracer): 95 | """Test that spans can be created at reasonable speed.""" 96 | iterations = 1000 97 | 98 | # Test span creation speed 99 | @span(name="speed_test") 100 | def traced_func(): 101 | return 42 102 | 103 | start = time.time() 104 | for _ in range(iterations): 105 | traced_func() 106 | duration = time.time() - start 107 | 108 | tracer.flush() 109 | 110 | # Should create spans at reasonable speed 111 | spans_per_second = iterations / duration 112 | assert spans_per_second > 500, ( 113 | f"Span creation too slow: {spans_per_second:.1f} spans/sec" 114 | ) 115 | assert duration < 5.0, ( 116 | f"Span 
creation took too long: {duration:.2f}s for {iterations} spans" 117 | ) 118 | assert len(tracer._writer.spans) == iterations 119 | 120 | 121 | @pytest.mark.performance 122 | def test_buffer_efficiency(tracer: Tracer): 123 | """Test buffer management efficiency.""" 124 | original_max = tracer._max_spans 125 | tracer._max_spans = 50 # Small buffer for testing 126 | 127 | try: 128 | # Create more spans than buffer size 129 | for i in range(200): 130 | with span(name=f"buffer_test_{i}"): 131 | pass 132 | 133 | tracer.flush() 134 | 135 | # Should have all spans despite small buffer 136 | assert len(tracer._writer.spans) == 200 137 | 138 | finally: 139 | tracer._max_spans = original_max 140 | 141 | 142 | @pytest.mark.performance 143 | def test_deep_nesting_performance(tracer: Tracer): 144 | """Test performance with deep nesting.""" 145 | depth = 100 146 | 147 | def create_nested(current_depth): 148 | if current_depth <= 0: 149 | return 150 | 151 | with span(name=f"nested_{current_depth}"): 152 | create_nested(current_depth - 1) 153 | 154 | start_time = time.time() 155 | create_nested(depth) 156 | duration = time.time() - start_time 157 | 158 | tracer.flush() 159 | 160 | # Should handle deep nesting efficiently 161 | assert duration < 2.0, f"Deep nesting too slow: {duration:.2f}s" 162 | assert len(tracer._writer.spans) == depth 163 | -------------------------------------------------------------------------------- /examples_v2/tuning/auto_prompt_optimization.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example demonstrating automatic prompt optimization with ze.prompt() 3 | 4 | This example shows how ZeroEval automatically uses optimized prompt versions 5 | from your dashboard while keeping a fallback in your code. 6 | """ 7 | 8 | import zeroeval as ze 9 | from openai import OpenAI 10 | 11 | # Initialize ZeroEval 12 | ze.init() 13 | client = OpenAI() 14 | 15 | def example_1_auto_optimization(): 16 | """ 17 | When you provide content to ze.prompt(), ZeroEval automatically: 18 | 1. Checks for an optimized version in your dashboard 19 | 2. Uses the optimized version if one exists 20 | 3. Falls back to your provided content if no optimization exists yet 21 | 22 | This means your prompts improve automatically without code changes! 23 | """ 24 | print("=== Example 1: Auto-optimization ===\n") 25 | 26 | # This will use the latest optimized version if available in your dashboard 27 | # Otherwise, it uses the content you provide here 28 | system_prompt = ze.prompt( 29 | name="customer-support", 30 | content="You are a helpful customer support agent." 31 | ) 32 | 33 | response = client.chat.completions.create( 34 | model="gpt-4", 35 | messages=[ 36 | {"role": "system", "content": system_prompt}, 37 | {"role": "user", "content": "How do I reset my password?"} 38 | ] 39 | ) 40 | 41 | print(f"Response: {response.choices[0].message.content}\n") 42 | 43 | 44 | def example_2_explicit_content(): 45 | """ 46 | Use from_="explicit" to always use the hardcoded content, bypassing 47 | auto-optimization. Useful for testing, debugging, or A/B tests. 48 | """ 49 | print("=== Example 2: Explicit content (bypass optimization) ===\n") 50 | 51 | # This ALWAYS uses the hardcoded content, ignoring any optimized versions 52 | system_prompt = ze.prompt( 53 | name="customer-support", 54 | from_="explicit", 55 | content="You are a helpful customer support agent." 
56 | ) 57 | 58 | response = client.chat.completions.create( 59 | model="gpt-4", 60 | messages=[ 61 | {"role": "system", "content": system_prompt}, 62 | {"role": "user", "content": "How do I reset my password?"} 63 | ] 64 | ) 65 | 66 | print(f"Response: {response.choices[0].message.content}\n") 67 | 68 | 69 | def example_3_explicit_latest(): 70 | """ 71 | If you want to explicitly require an optimized version to exist, 72 | use from_="latest". This will fail if no optimized versions exist yet. 73 | """ 74 | print("=== Example 3: Explicit latest (requires optimization) ===\n") 75 | 76 | try: 77 | # This REQUIRES an optimized version to exist 78 | system_prompt = ze.prompt( 79 | name="customer-support", 80 | from_="latest" 81 | ) 82 | 83 | response = client.chat.completions.create( 84 | model="gpt-4", 85 | messages=[ 86 | {"role": "system", "content": system_prompt}, 87 | {"role": "user", "content": "How do I reset my password?"} 88 | ] 89 | ) 90 | 91 | print(f"Response: {response.choices[0].message.content}\n") 92 | except Exception as e: 93 | print(f"Error: {e}") 94 | print("This means no optimized versions exist yet. Use content= for fallback.\n") 95 | 96 | 97 | def example_4_with_variables(): 98 | """ 99 | Variables work seamlessly with all modes. 100 | """ 101 | print("=== Example 4: Variables with auto-optimization ===\n") 102 | 103 | system_prompt = ze.prompt( 104 | name="company-support", 105 | content="You are a customer support agent for {{company}}. Be helpful and professional.", 106 | variables={"company": "TechCorp"} 107 | ) 108 | 109 | response = client.chat.completions.create( 110 | model="gpt-4", 111 | messages=[ 112 | {"role": "system", "content": system_prompt}, 113 | {"role": "user", "content": "I need help with billing"} 114 | ] 115 | ) 116 | 117 | print(f"Response: {response.choices[0].message.content}\n") 118 | 119 | 120 | def example_5_error_handling(): 121 | """ 122 | Demonstrate error handling for invalid usage. 
123 | """ 124 | print("=== Example 5: Error handling ===\n") 125 | 126 | try: 127 | # This will fail: from_="explicit" requires content 128 | system_prompt = ze.prompt( 129 | name="customer-support", 130 | from_="explicit" 131 | ) 132 | except ValueError as e: 133 | print(f"✓ Expected error caught: {e}\n") 134 | 135 | 136 | if __name__ == "__main__": 137 | print("\n" + "="*60) 138 | print("ZeroEval Auto-Optimization Example") 139 | print("="*60 + "\n") 140 | 141 | example_1_auto_optimization() 142 | example_2_explicit_content() 143 | example_3_explicit_latest() 144 | example_4_with_variables() 145 | example_5_error_handling() 146 | 147 | print("\n" + "="*60) 148 | print("Summary:") 149 | print("- Use content= for automatic optimization with fallback (RECOMMENDED)") 150 | print("- Use from_='explicit' to always use hardcoded content") 151 | print("- Use from_='latest' to require an optimized version") 152 | print("- Variables work seamlessly with all approaches") 153 | print("="*60 + "\n") 154 | 155 | -------------------------------------------------------------------------------- /src/zeroeval/core/reader.py: -------------------------------------------------------------------------------- 1 | import os 2 | from abc import ABC, abstractmethod 3 | from typing import TYPE_CHECKING, Optional 4 | 5 | import requests 6 | 7 | from .init import _validate_init 8 | 9 | if TYPE_CHECKING: 10 | from .dataset_class import Dataset 11 | 12 | 13 | 14 | class DatasetReader(ABC): 15 | """Interface for reading datasets from different sources.""" 16 | 17 | @abstractmethod 18 | def pull_by_name(self, dataset_name: str, version_number: Optional[int] = None) -> "Dataset": 19 | """ 20 | Pull a dataset from a destination using its dataset name. 21 | Project is automatically resolved from API key. 22 | """ 23 | pass 24 | 25 | 26 | class DatasetBackendReader(DatasetReader): 27 | """ 28 | Reads datasets from the ZeroEval backend API using the new v1 endpoints. 29 | """ 30 | def __init__(self): 31 | """ 32 | Initialize with a base URL, falling back to localhost if not set. 33 | """ 34 | self.base_url = os.environ.get("ZEROEVAL_API_URL", "https://api.zeroeval.com") 35 | self._api_key = None 36 | self._headers = None 37 | 38 | def _ensure_auth_setup(self): 39 | """Ensure API key is set and headers are configured.""" 40 | if self._api_key is None: 41 | self._api_key = os.environ.get("ZEROEVAL_API_KEY") 42 | if not self._api_key: 43 | raise ValueError("ZEROEVAL_API_KEY environment variable not set") 44 | 45 | if self._headers is None: 46 | self._headers = {"Authorization": f"Bearer {self._api_key}"} 47 | 48 | def pull_by_name(self, dataset_name: str, version_number: Optional[int] = None) -> "Dataset": 49 | """ 50 | Pull a dataset by dataset name using the v1 API. 51 | Project is automatically resolved from API key. 52 | """ 53 | if not _validate_init(): 54 | raise ValueError( 55 | "ZeroEval SDK not initialized. Please call ze.init(api_key='YOUR_API_KEY') first.\n" 56 | "You can set the API key in one of these ways:\n" 57 | "1. Pass it directly: ze.init(api_key='sk_ze_...')\n" 58 | "2. Set environment variable: export ZEROEVAL_API_KEY='sk_ze_...'\n" 59 | "3. Create a .env file with: ZEROEVAL_API_KEY=sk_ze_..." 
60 | ) 61 | self._ensure_auth_setup() 62 | 63 | from .dataset_class import Dataset 64 | 65 | # 1) Fetch dataset metadata using v1 API 66 | info_url = f"{self.base_url}/v1/datasets/{dataset_name}" 67 | try: 68 | info_resp = requests.get(info_url, headers=self._headers) 69 | info_resp.raise_for_status() 70 | dataset_info = info_resp.json() 71 | except requests.RequestException as e: 72 | # If we received a 404, raise a more friendly error indicating that the dataset was not found 73 | if e.response is not None and e.response.status_code == 404: 74 | raise ValueError( 75 | f"Dataset '{dataset_name}' not found in your project. " 76 | "Verify that the dataset name is correct." 77 | ) from e 78 | # Otherwise, raise a generic runtime error 79 | raise RuntimeError(f"Failed to fetch dataset info by name: {e}") from e 80 | 81 | dataset_id = dataset_info["id"] 82 | 83 | # 2) Fetch rows + version using v1 API 84 | data_url = f"{self.base_url}/v1/datasets/{dataset_name}/data" 85 | params = {} 86 | if version_number is not None: 87 | params["version_number"] = version_number 88 | # Set a high limit to get all rows (v1 API supports up to 10000) 89 | params["limit"] = 10000 90 | params["offset"] = 0 91 | 92 | all_rows = [] 93 | total_rows = None 94 | 95 | # Paginate through all rows 96 | while True: 97 | try: 98 | data_resp = requests.get(data_url, params=params, headers=self._headers) 99 | data_resp.raise_for_status() 100 | data_json = data_resp.json() 101 | except requests.RequestException as e: 102 | if e.response is not None and e.response.status_code == 404: 103 | raise ValueError( 104 | f"No data found for dataset '{dataset_name}' in your project " 105 | f"(version: {version_number if version_number else 'latest'})." 106 | ) from e 107 | raise RuntimeError(f"Failed to fetch dataset rows by name: {e}") from e 108 | 109 | # Collect rows 110 | all_rows.extend(data_json["rows"]) 111 | 112 | # Check if we have all rows 113 | if total_rows is None and "totalRows" in data_json: 114 | total_rows = data_json["totalRows"] 115 | 116 | # If we have all rows or no total_count, break 117 | if total_rows is None or len(all_rows) >= total_rows: 118 | break 119 | 120 | # Otherwise, fetch next page 121 | params["offset"] = len(all_rows) 122 | 123 | # Create dataset with all rows 124 | dataset = Dataset( 125 | dataset_info["name"], 126 | data=all_rows, 127 | description=dataset_info.get("description") 128 | ) 129 | 130 | # Set backend metadata 131 | dataset._backend_id = dataset_id 132 | dataset._version_id = data_json["version"]["id"] 133 | dataset._version_number = data_json["version"]["version_number"] 134 | 135 | return dataset 136 | -------------------------------------------------------------------------------- /tests/test_gemini_compatibility.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test to verify Gemini integration compatibility with google-genai 1.21.1. 3 | 4 | This test file checks that our integration correctly handles the API structure 5 | of google-genai 1.21.1, which is the minimum supported version. 
6 | """ 7 | 8 | import pytest 9 | from unittest.mock import Mock, MagicMock 10 | from zeroeval.observability.integrations.gemini.integration import GeminiIntegration 11 | 12 | 13 | class TestGeminiCompatibility: 14 | """Test suite for google-genai 1.21.1 compatibility.""" 15 | 16 | def test_function_declaration_compatibility(self): 17 | """Test that the integration handles function declarations correctly for 1.21.1.""" 18 | integration = GeminiIntegration(Mock()) 19 | 20 | # Mock a function declaration as it appears in 1.21.1 21 | mock_func_decl = Mock() 22 | mock_func_decl.name = "get_weather" 23 | mock_func_decl.description = "Get weather for a location" 24 | # In 1.21.1, parameters use types.Schema 25 | mock_func_decl.parameters = Mock() 26 | mock_func_decl.parameters.type = "OBJECT" 27 | mock_func_decl.parameters.properties = { 28 | "location": {"type": "STRING", "description": "City and state"} 29 | } 30 | mock_func_decl.parameters.required = ["location"] 31 | 32 | # Mock tool with function declarations 33 | mock_tool = Mock() 34 | mock_tool.function_declarations = [mock_func_decl] 35 | 36 | # Mock config with tools 37 | mock_config = Mock() 38 | mock_config.tools = [mock_tool] 39 | 40 | # Extract attributes 41 | result = integration._extract_config_attributes(mock_config) 42 | 43 | # Verify tools were extracted correctly 44 | assert "tools" in result 45 | assert len(result["tools"]) == 1 46 | assert result["tools"][0]["name"] == "get_weather" 47 | assert result["tools"][0]["description"] == "Get weather for a location" 48 | 49 | def test_response_structure_compatibility(self): 50 | """Test that the integration handles response structure correctly for 1.21.1.""" 51 | integration = GeminiIntegration(Mock()) 52 | 53 | # Mock response structure as it appears in 1.21.1 54 | mock_response = Mock() 55 | mock_candidate = Mock() 56 | mock_content = Mock() 57 | mock_part = Mock() 58 | 59 | # Text response 60 | mock_part.text = "Test response" 61 | mock_part.function_call = None # No function call 62 | 63 | mock_content.parts = [mock_part] 64 | mock_content.role = "model" 65 | 66 | mock_candidate.content = mock_content 67 | mock_candidate.finish_reason = "STOP" 68 | mock_candidate.safety_ratings = [] 69 | 70 | mock_response.candidates = [mock_candidate] 71 | 72 | # Usage metadata format in 1.21.1 73 | mock_usage = Mock() 74 | mock_usage.prompt_token_count = 10 75 | mock_usage.candidates_token_count = 20 76 | mock_usage.total_token_count = 30 77 | mock_response.usage_metadata = mock_usage 78 | 79 | # This should be handled correctly by our integration 80 | # Just verify the structure is as expected 81 | assert hasattr(mock_response, 'candidates') 82 | assert hasattr(mock_response.candidates[0], 'content') 83 | assert hasattr(mock_response.candidates[0].content, 'parts') 84 | assert hasattr(mock_response.usage_metadata, 'prompt_token_count') 85 | 86 | def test_function_call_response_compatibility(self): 87 | """Test handling of function call responses in 1.21.1 format.""" 88 | integration = GeminiIntegration(Mock()) 89 | 90 | # Mock function call response 91 | mock_fc = Mock() 92 | mock_fc.name = "get_weather" 93 | mock_fc.args = {"location": "San Francisco, CA"} 94 | 95 | # In 1.21.1, function calls are in parts 96 | mock_part = Mock() 97 | mock_part.text = None 98 | mock_part.function_call = mock_fc 99 | 100 | # Test extraction logic 101 | # The integration should handle this correctly 102 | assert hasattr(mock_part, 'function_call') 103 | assert mock_part.function_call.name == "get_weather" 
104 | assert mock_part.function_call.args == {"location": "San Francisco, CA"} 105 | 106 | def test_environment_variable_compatibility(self): 107 | """Test that the integration works with both GOOGLE_API_KEY and GEMINI_API_KEY.""" 108 | # This is handled by the google-genai client itself, not our integration 109 | # Our integration just patches the client after it's created 110 | # So we don't need to handle the environment variable differences 111 | pass 112 | 113 | def test_config_attributes_with_1_21_1_types(self): 114 | """Test config attribute extraction with 1.21.1 type names.""" 115 | integration = GeminiIntegration(Mock()) 116 | 117 | # Mock config as it appears in 1.21.1 118 | mock_config = Mock() 119 | mock_config.temperature = 0.7 120 | mock_config.max_output_tokens = 1000 121 | mock_config.top_p = 0.9 122 | mock_config.top_k = 40 123 | mock_config.stop_sequences = ["END"] 124 | mock_config.response_mime_type = "application/json" 125 | 126 | # Test response schema (Pydantic model) 127 | class TestModel: 128 | pass 129 | mock_config.response_schema = TestModel 130 | 131 | # Extract attributes 132 | result = integration._extract_config_attributes(mock_config) 133 | 134 | # Verify all attributes were extracted 135 | assert result["temperature"] == 0.7 136 | assert result["max_output_tokens"] == 1000 137 | assert result["top_p"] == 0.9 138 | assert result["top_k"] == 40 139 | assert result["stop_sequences"] == ["END"] 140 | assert result["response_mime_type"] == "application/json" 141 | assert "response_schema" in result -------------------------------------------------------------------------------- /src/zeroeval/cli/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import subprocess 4 | import time 5 | import webbrowser 6 | from getpass import getpass 7 | from pathlib import Path 8 | 9 | from .utils import animate_dots, brand_print, console, show_welcome_box, spinner 10 | 11 | 12 | def get_shell_config_file(): 13 | """Get the appropriate shell configuration file for the current system.""" 14 | system = platform.system() 15 | 16 | if system == "Windows": 17 | # Windows: Use PowerShell profile or set system environment variable 18 | return None # We'll handle Windows differently 19 | 20 | # Unix-like systems (macOS, Linux) 21 | shell = os.environ.get('SHELL', '').lower() 22 | home = Path.home() 23 | 24 | if 'zsh' in shell: 25 | return home / '.zshrc' 26 | elif 'bash' in shell: 27 | # Check for .bashrc first, then .bash_profile 28 | if (home / '.bashrc').exists(): 29 | return home / '.bashrc' 30 | else: 31 | return home / '.bash_profile' 32 | elif 'fish' in shell: 33 | return home / '.config' / 'fish' / 'config.fish' 34 | else: 35 | # Default to .bashrc for unknown shells 36 | return home / '.bashrc' 37 | 38 | def save_to_shell_config(token): 39 | """Save the API key to the appropriate shell configuration file.""" 40 | try: 41 | system = platform.system() 42 | 43 | if system == "Windows": 44 | # Use setx command to set persistent environment variable on Windows 45 | result = subprocess.run( 46 | ['setx', 'ZEROEVAL_API_KEY', token], 47 | capture_output=True, 48 | text=True, 49 | shell=True 50 | ) 51 | return result.returncode == 0, "Windows Registry (System Environment Variables)" 52 | else: 53 | # Unix-like systems 54 | config_file = get_shell_config_file() 55 | if not config_file: 56 | return False, None 57 | 58 | # Check if the export already exists 59 | export_line = f'export ZEROEVAL_API_KEY="{token}"' 
60 | 61 | if config_file.exists(): 62 | content = config_file.read_text() 63 | if 'ZEROEVAL_API_KEY' in content: 64 | # Update existing entry 65 | lines = content.splitlines() 66 | for i, line in enumerate(lines): 67 | if 'export ZEROEVAL_API_KEY=' in line and not line.strip().startswith('#'): 68 | lines[i] = export_line 69 | break 70 | config_file.write_text('\n'.join(lines) + '\n') 71 | else: 72 | # Append new entry 73 | with open(config_file, 'a') as f: 74 | f.write(f'\n# ZeroEval API Key\n{export_line}\n') 75 | else: 76 | # Create new file with the export 77 | config_file.write_text(f'# ZeroEval API Key\n{export_line}\n') 78 | 79 | # Also set it in the current session 80 | os.environ['ZEROEVAL_API_KEY'] = token 81 | 82 | return True, str(config_file) 83 | except Exception as e: 84 | console.print(f"[warning]Warning: Could not automatically save to shell config: {e}[/warning]") 85 | return False, None 86 | 87 | def setup(): 88 | """Launch the browser to the tokens page for user setup with a magical experience.""" 89 | # Clear screen and show welcome 90 | console.clear() 91 | show_welcome_box() 92 | time.sleep(1) 93 | 94 | # Preparing message 95 | brand_print("Preparing your development environment...") 96 | time.sleep(0.5) 97 | 98 | # Simulate initialization with spinner 99 | with spinner("Initializing ZeroEval") as progress: 100 | progress.add_task("", total=None) 101 | time.sleep(2) 102 | 103 | # Launch browser with animation 104 | brand_print("Opening secure token generation page...") 105 | animate_dots("Launching browser", 1.5) 106 | webbrowser.open("https://app.zeroeval.com/settings?section=api-keys") 107 | 108 | # Final message 109 | console.print("\n✨ [success]Browser opened! Complete the setup in your browser[/success]") 110 | console.print("Once you've generated your API key, please enter it below:\n") 111 | 112 | # Get token from user 113 | token = getpass("API Key: ") 114 | 115 | if token: 116 | # Save to shell configuration 117 | with spinner("Saving your API key") as progress: 118 | progress.add_task("", total=None) 119 | success, location = save_to_shell_config(token) 120 | time.sleep(1) 121 | 122 | if success: 123 | console.print("\n🔐 [success]API Key saved successfully![/success]") 124 | if location: 125 | if platform.system() == "Windows": 126 | console.print(f" Saved to: {location}") 127 | console.print(" [info]Note: You may need to restart your terminal for changes to take effect[/info]") 128 | else: 129 | console.print(f" Saved to: {location}") 130 | console.print(f" [info]Run 'source {location}' or restart your terminal to use it[/info]") 131 | console.print("\n💡 [tip]Best practice: Also store this in a .env file in your project root[/tip]") 132 | console.print(" Create a .env file and add:") 133 | console.print(" ZEROEVAL_API_KEY=...\n") 134 | else: 135 | # Fallback to manual instructions 136 | console.print("\n⚠️ [warning]Could not automatically save API key[/warning]") 137 | console.print("\nTo use your API key, set it as an environment variable:\n") 138 | console.print(f"[info]export ZEROEVAL_API_KEY=\"{token}\"[/info]\n") 139 | console.print("💡 [tip]Best practice: Store this in a .env file in your project root[/tip]") 140 | console.print(" Create a .env file and add:") 141 | console.print(" ZEROEVAL_API_KEY=...\n") 142 | 143 | console.print("Happy building!\n") 144 | else: 145 | console.print("\n⚠️ [warning]No token provided[/warning]") 146 | console.print("You can run setup again later when you have your token.\n") 
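# --- Usage sketch (illustrative only; mirrors the examples_v2 scripts) ---
# Once the key is saved, application code typically loads it from the
# environment or a .env file and calls ze.init() with no arguments:
#
#     from dotenv import load_dotenv   # assumes python-dotenv is installed
#     import zeroeval as ze
#
#     load_dotenv()  # reads ZEROEVAL_API_KEY from .env
#     ze.init()      # picks the key up from the environment automatically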
-------------------------------------------------------------------------------- /examples_v2/tuning/bookstore_agent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Customer Support Agent with Tuning 4 | ================================= 5 | 6 | This example demonstrates how to build a customer support agent using ZeroEval's 7 | tuning features. It uses `ze.prompt()` to manage the prompt and `ze.send_feedback()` 8 | to provide signals for optimization. 9 | 10 | Key concepts: 11 | 1. `ze.prompt()`: Defines the prompt and binds variables for interpolation 12 | 2. Automatic Tracing: The SDK automatically traces OpenAI calls 13 | 3. Interactive Mode: You can chat with the agent and see how it responds 14 | """ 15 | 16 | import os 17 | import uuid 18 | from pathlib import Path 19 | 20 | from dotenv import load_dotenv 21 | 22 | # Load environment variables BEFORE importing zeroeval 23 | env_path = Path(__file__).parent.parent / ".env" 24 | load_dotenv(env_path) 25 | 26 | import openai 27 | import zeroeval as ze 28 | 29 | # 1. Initialize ZeroEval 30 | # Ensure you have ZEROEVAL_API_KEY and ZEROEVAL_API_URL set in your environment 31 | ze.init( 32 | api_key=os.getenv("ZEROEVAL_API_KEY"), 33 | api_url=os.getenv("ZEROEVAL_API_URL", "http://localhost:8000") 34 | ) 35 | 36 | def customer_support_agent(user_query: str, user_context: dict = None, conversation_history: list = None): 37 | """ 38 | A simple customer support agent that uses a managed prompt and maintains conversation history. 39 | """ 40 | if user_context is None: 41 | user_context = {} 42 | if conversation_history is None: 43 | conversation_history = [] 44 | 45 | # 2. Define the prompt using ze.prompt() 46 | # This registers the prompt with ZeroEval (if not exists) and allows for versioning. 47 | # The 'content' is your base prompt. You can use {{variable}} syntax. 48 | # 'variables' are passed for interpolation and tracking. 49 | 50 | prompt_name = "bookstore-support-agent" 51 | 52 | system_instruction = ze.prompt( 53 | name=prompt_name, 54 | content="""You are Elena, a passionate book enthusiast and customer support specialist at Bibliophile Books. You've worked in the bookstore for 5 years and genuinely love helping people discover their next great read. 55 | 56 | Your personality: 57 | - Warm and personable, like chatting with a knowledgeable friend at a bookshop 58 | - Enthusiastic about books and reading 59 | - Patient and empathetic when customers have issues 60 | - Professional but not overly formal 61 | - You use the customer's name naturally in conversation 62 | 63 | Customer Information: 64 | - Name: {{user_name}} 65 | - Membership Level: {{membership}} 66 | 67 | Guidelines: 68 | 1. Address {{user_name}} directly and warmly (but don't say "Hi {{user_name}}" in every message if you're in an ongoing conversation) 69 | 2. For Gold members: Remember they have free shipping, priority support, and 15% off all purchases 70 | 3. For Standard members: Offer helpful service while mentioning Gold membership benefits when relevant 71 | 4. Keep responses concise but friendly (2-4 sentences for simple queries) 72 | 5. If you don't know something or can't help, offer to connect them with a specialist 73 | 6. Never use placeholder text like "[Your Name]" - you are Elena 74 | 7. End naturally without formal sign-offs unless it's clearly the end of the conversation 75 | 8. 
IMPORTANT: Remember information from the conversation history and don't ask for things the customer already told you 76 | 77 | Respond directly to their query in a helpful, personable way.""", 78 | variables={ 79 | "user_name": user_context.get("name", "there"), 80 | "membership": user_context.get("membership", "Standard") 81 | } 82 | ) 83 | 84 | # Initialize OpenAI client (ZeroEval automatically instruments this) 85 | client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY")) 86 | 87 | print(f"\n--- Sending Request to AI ({prompt_name}) ---") 88 | 89 | # Build messages with conversation history 90 | messages = [{"role": "system", "content": system_instruction}] 91 | messages.extend(conversation_history) 92 | messages.append({"role": "user", "content": user_query}) 93 | 94 | # 3. Call the Model 95 | # The SDK intercepts this call: 96 | # - Detects the metadata from ze.prompt() 97 | # - Interpolates variables into the content 98 | # - Traces the execution 99 | response = client.chat.completions.create( 100 | model="gpt-4o-mini", # Use a cost-effective model 101 | messages=messages, 102 | temperature=0.7 103 | ) 104 | 105 | completion_text = response.choices[0].message.content 106 | completion_id = response.id 107 | 108 | return completion_text, completion_id, prompt_name 109 | 110 | def main(): 111 | # Example interaction 112 | print("\n=== Bookstore Support Agent (Type 'exit' to quit) ===") 113 | 114 | # We'll assume a fixed user context for this session 115 | user_context = { 116 | "name": "Alice", 117 | "membership": "Gold" # VIP customer 118 | } 119 | print(f"Context: User={user_context['name']}, Membership={user_context['membership']}\n") 120 | 121 | # Initialize conversation history 122 | conversation_history = [] 123 | 124 | # Agent introduces itself 125 | intro_query = "Hello! Please introduce yourself and ask how you can help me today." 126 | response_text, _, _ = customer_support_agent(intro_query, user_context, conversation_history) 127 | print(f"Elena: {response_text}\n") 128 | 129 | # Add intro to history 130 | conversation_history.append({"role": "user", "content": intro_query}) 131 | conversation_history.append({"role": "assistant", "content": response_text}) 132 | 133 | while True: 134 | try: 135 | user_query = input("\nEnter your query: ").strip() 136 | if not user_query: 137 | continue 138 | 139 | if user_query.lower() in ('exit', 'quit'): 140 | print("Goodbye!") 141 | break 142 | 143 | response_text, completion_id, prompt_slug = customer_support_agent(user_query, user_context, conversation_history) 144 | 145 | print(f"\nElena: {response_text}") 146 | 147 | # Add to conversation history 148 | conversation_history.append({"role": "user", "content": user_query}) 149 | conversation_history.append({"role": "assistant", "content": response_text}) 150 | 151 | except KeyboardInterrupt: 152 | print("\nGoodbye!") 153 | break 154 | except Exception as e: 155 | print(f"\nError: {e}") 156 | print("Check your ZEROEVAL_API_KEY and OPENAI_API_KEY.") 157 | break 158 | 159 | if __name__ == "__main__": 160 | main() 161 | -------------------------------------------------------------------------------- /src/zeroeval/providers.py: -------------------------------------------------------------------------------- 1 | """ 2 | OpenTelemetry providers for ZeroEval. 3 | 4 | This module provides TracerProviders optimized for sending traces to ZeroEval, 5 | with support for different integration scenarios. 
6 | """ 7 | 8 | import logging 9 | import os 10 | from typing import Any, Optional 11 | 12 | from opentelemetry import trace as otel_trace_api 13 | from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter 14 | from opentelemetry.sdk.resources import Resource 15 | from opentelemetry.sdk.trace import SpanProcessor, TracerProvider 16 | from opentelemetry.sdk.trace.export import BatchSpanProcessor 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | class ZeroEvalOTLPProvider(TracerProvider): 22 | """ 23 | Standard OpenTelemetry TracerProvider configured for ZeroEval. 24 | 25 | This provider sets up OTLP export to ZeroEval with proper authentication 26 | and resource attributes. It works like any standard TracerProvider and 27 | allows multiple span processors. 28 | 29 | Args: 30 | api_key: ZeroEval API key. If not provided, reads from ZEROEVAL_API_KEY env var. 31 | api_url: ZeroEval API URL. Defaults to ZEROEVAL_API_URL env var or https://api.zeroeval.com. 32 | service_name: Service name for traces. Defaults to "zeroeval-app". 33 | 34 | Example: 35 | # Basic usage with environment variables 36 | provider = ZeroEvalOTLPProvider() 37 | trace.set_tracer_provider(provider) 38 | 39 | # Explicit configuration 40 | provider = ZeroEvalOTLPProvider( 41 | api_key="sk_ze_...", 42 | api_url="https://api.zeroeval.com", 43 | service_name="my-service" 44 | ) 45 | """ 46 | 47 | def __init__( 48 | self, 49 | api_key: Optional[str] = None, 50 | api_url: Optional[str] = None, 51 | service_name: str = "zeroeval-app" 52 | ): 53 | # Get configuration 54 | api_key = api_key or os.getenv("ZEROEVAL_API_KEY") 55 | if not api_key: 56 | raise ValueError( 57 | "Missing ZEROEVAL_API_KEY. Set environment variable or pass api_key parameter." 58 | ) 59 | 60 | api_url = api_url or os.getenv("ZEROEVAL_API_URL", "https://api.zeroeval.com") 61 | endpoint = f"{api_url.rstrip('/')}/v1/traces" 62 | 63 | # Create resource with service information 64 | resource = Resource.create({ 65 | "service.name": service_name, 66 | "service.version": os.getenv("SERVICE_VERSION", "0.1.0"), 67 | "deployment.environment": os.getenv("DEPLOYMENT_ENV", "production"), 68 | }) 69 | 70 | # Initialize parent TracerProvider 71 | super().__init__(resource=resource) 72 | 73 | # Configure OTLP exporter to ZeroEval 74 | exporter = OTLPSpanExporter( 75 | endpoint=endpoint, 76 | headers={"Authorization": f"Bearer {api_key}"} 77 | ) 78 | 79 | # Add batch processor for efficient span export 80 | self.add_span_processor(BatchSpanProcessor(exporter)) 81 | 82 | # Attach a processor that injects ZeroEval session attributes onto every OTEL span. 83 | # This ensures backend mapping links traces to the correct session instead of the default. 84 | try: 85 | session_id = os.getenv("ZEROEVAL_SESSION_ID") 86 | if not session_id: 87 | import uuid as _uuid 88 | session_id = str(_uuid.uuid4()) 89 | os.environ["ZEROEVAL_SESSION_ID"] = session_id 90 | session_name = os.getenv("ZEROEVAL_SESSION_NAME") 91 | self.add_span_processor(_SessionAttributeProcessor(session_id=session_id, session_name=session_name)) 92 | except Exception: 93 | # Non-fatal: if we fail to attach, OTEL mapping will still fallback to default behavior 94 | logger.debug("Failed to attach session attribute processor", exc_info=True) 95 | 96 | logger.debug(f"Initialized ZeroEvalOTLPProvider with endpoint: {endpoint}") 97 | 98 | 99 | class _SessionAttributeProcessor(SpanProcessor): 100 | """Span processor that stamps `zeroeval.session.*` attributes on every OTEL span. 
101 | 102 | The backend OTEL mapper will read these attributes (if present) to associate 103 | spans/traces with the correct session created for this execution. 104 | """ 105 | 106 | def __init__(self, session_id: str, session_name: Optional[str] = None) -> None: 107 | self._session_id = session_id 108 | self._session_name = session_name 109 | 110 | def on_start(self, span: Any, parent_context: Any) -> None: # type: ignore[override] 111 | try: 112 | span.set_attribute("zeroeval.session.id", self._session_id) 113 | if self._session_name: 114 | span.set_attribute("zeroeval.session.name", self._session_name) 115 | except Exception: 116 | # Best-effort only 117 | pass 118 | 119 | # No-ops for other hooks 120 | def on_end(self, span: Any) -> None: # type: ignore[override] 121 | return 122 | 123 | def shutdown(self) -> None: # type: ignore[override] 124 | return 125 | 126 | def force_flush(self, timeout_millis: int = 30000) -> bool: # type: ignore[override] 127 | return True 128 | 129 | 130 | class SingleProcessorProvider(ZeroEvalOTLPProvider): 131 | """ 132 | A ZeroEval provider that only accepts one span processor. 133 | 134 | This is useful when integrating with libraries that automatically add their 135 | own span processors (like other observability tools) but you want to ensure 136 | traces only go to ZeroEval. The first processor added (ZeroEval's) is kept, 137 | and subsequent processors are silently ignored. 138 | 139 | This solves the common "401 Unauthorized" error when using dummy credentials 140 | with auto-instrumenting libraries. 141 | 142 | Example: 143 | # With third-party libraries - prevents duplicate exports 144 | provider = SingleProcessorProvider() 145 | 146 | # Third-party processors are ignored, only ZeroEval receives traces 147 | """ 148 | 149 | def __init__(self, **kwargs): 150 | super().__init__(**kwargs) 151 | self._processor_locked = True # Lock after parent adds ZeroEval processor 152 | 153 | def add_span_processor(self, span_processor: SpanProcessor) -> None: 154 | """Add a span processor, but only if we haven't locked yet.""" 155 | if hasattr(self, '_processor_locked') and self._processor_locked: 156 | processor_name = type(span_processor).__name__ 157 | logger.debug( 158 | f"Ignoring additional span processor '{processor_name}'. " 159 | "SingleProcessorProvider only accepts one processor." 
160 | ) 161 | return 162 | 163 | super().add_span_processor(span_processor) 164 | 165 | 166 | -------------------------------------------------------------------------------- /src/zeroeval/observability/span.py: -------------------------------------------------------------------------------- 1 | import time 2 | import traceback 3 | import uuid 4 | from dataclasses import dataclass, field 5 | from typing import Any, Optional, Union 6 | 7 | 8 | @dataclass 9 | class Signal: 10 | """Represents a signal that can be attached to entities.""" 11 | name: str 12 | value: Union[str, bool, int, float] 13 | signal_type: str = "boolean" # "boolean" or "numerical" 14 | 15 | def __post_init__(self): 16 | # Auto-detect signal type and normalize value 17 | if isinstance(self.value, bool): 18 | self.signal_type = "boolean" 19 | self.value = "true" if self.value else "false" 20 | elif isinstance(self.value, (int, float)): 21 | self.signal_type = "numerical" 22 | self.value = str(self.value) 23 | else: 24 | # For string values, try to detect boolean 25 | str_val = str(self.value).lower() 26 | if str_val in ("true", "false"): 27 | self.signal_type = "boolean" 28 | self.value = str_val 29 | else: 30 | # Default to boolean for string values 31 | self.signal_type = "boolean" 32 | self.value = str(self.value) 33 | 34 | 35 | @dataclass 36 | class Span: 37 | """ 38 | Represents a traced operation with OpenTelemetry-compatible attributes. 39 | """ 40 | # Required fields first 41 | name: str 42 | 43 | # Optional fields with defaults 44 | kind: str = "generic" # Type of span: generic, llm, tts, http, database, vector_store, etc. 45 | session_id: Optional[str] = None 46 | session_name: Optional[str] = None 47 | trace_id: str = field(default_factory=lambda: str(uuid.uuid4())) 48 | span_id: str = field(default_factory=lambda: str(uuid.uuid4())) 49 | parent_id: Optional[str] = None 50 | start_time: float = field(default_factory=time.time) 51 | end_time: Optional[float] = None 52 | attributes: dict[str, Any] = field(default_factory=dict) 53 | # Fields for tracking execution 54 | input_data: Optional[str] = None 55 | output_data: Optional[str] = None 56 | code: Optional[str] = None # Added code field 57 | code_filepath: Optional[str] = None 58 | code_lineno: Optional[int] = None 59 | error_code: Optional[str] = None 60 | error_message: Optional[str] = None 61 | error_stack: Optional[str] = None 62 | status: str = "ok" 63 | tags: dict[str, str] = field(default_factory=dict) 64 | # Optional tags that should be applied to the owning trace and/or session when this 65 | # span is ingested. These will be processed by the backend ingestion service. 
66 | trace_tags: dict[str, str] = field(default_factory=dict) 67 | session_tags: dict[str, str] = field(default_factory=dict) 68 | # Signals attached to this span 69 | signals: dict[str, Any] = field(default_factory=dict) 70 | # AB test choices made in this span's context 71 | ab_choices: list[dict[str, Any]] = field(default_factory=list) 72 | 73 | def end(self) -> None: 74 | """Mark the span as completed with the current timestamp.""" 75 | self.end_time = time.time() 76 | 77 | @property 78 | def duration_ms(self) -> Optional[float]: 79 | """Get the span duration in milliseconds, if completed.""" 80 | if self.end_time is None: 81 | return None 82 | 83 | return (self.end_time - self.start_time) * 1000 84 | 85 | def set_error(self, code: str, message: str, stack: Optional[str] = None) -> None: 86 | """Set error information for the span.""" 87 | self.error_code = code 88 | self.error_message = message 89 | self.error_stack = stack 90 | self.status = 'error' 91 | 92 | def set_io(self, input_data: Optional[str] = None, output_data: Optional[str] = None) -> None: 93 | """Set input/output data for the span, without overwriting existing values.""" 94 | if input_data is not None: 95 | self.input_data = input_data 96 | if output_data is not None: 97 | self.output_data = output_data 98 | 99 | def set_code(self, code: str) -> None: 100 | """Set the code that was executed in this span.""" 101 | self.code = code 102 | 103 | def set_code_context(self, filepath: str, lineno: int) -> None: 104 | """Set the file path and line number for the span's execution context.""" 105 | self.code_filepath = filepath 106 | self.code_lineno = lineno 107 | 108 | def set_signal(self, name: str, value: Union[str, bool, int, float]) -> None: 109 | """Set a signal for this span.""" 110 | self.signals[name] = Signal(name=name, value=value) 111 | 112 | def to_dict(self) -> dict[str, Any]: 113 | """Convert the span to a dictionary representation.""" 114 | # Convert signals to a serializable format 115 | signals_dict = {} 116 | for name, signal in self.signals.items(): 117 | if hasattr(signal, 'value') and hasattr(signal, 'signal_type'): 118 | signals_dict[name] = { 119 | 'name': signal.name, 120 | 'value': signal.value, 121 | 'type': signal.signal_type 122 | } 123 | else: 124 | # Handle legacy signal format or simple values 125 | signals_dict[name] = signal 126 | 127 | span_dict = { 128 | "name": self.name, 129 | "kind": self.kind, 130 | "session_id": self.session_id, 131 | "session_name": self.session_name, 132 | "trace_id": self.trace_id, 133 | "span_id": self.span_id, 134 | "parent_id": self.parent_id, 135 | "start_time": self.start_time, 136 | "end_time": self.end_time, 137 | "duration_ms": self.duration_ms, 138 | "kind": self.kind, # Added kind field 139 | "attributes": self.attributes, 140 | "tags": self.tags, 141 | "trace_tags": self.trace_tags, 142 | "session_tags": self.session_tags, 143 | "signals": signals_dict, 144 | "ab_choices": self.ab_choices, 145 | "input_data": self.input_data, 146 | "output_data": self.output_data, 147 | "code": self.code, # Added code field 148 | "code_filepath": self.code_filepath, 149 | "code_lineno": self.code_lineno, 150 | "error_code": self.error_code, 151 | "error_message": self.error_message, 152 | "error_stack": self.error_stack, 153 | "status": self.status 154 | } 155 | 156 | return span_dict 157 | 158 | def end(self, error: Optional[Exception] = None) -> None: 159 | """End the span, calculating duration and capturing errors.""" 160 | if self.end_time: 161 | return # Span already 
ended 162 | 163 | self.end_time = time.time() 164 | 165 | if error: 166 | self.status = "error" 167 | self.error_code = type(error).__name__ 168 | self.error_message = str(error) 169 | self.error_stack = "".join(traceback.format_exception(error)) -------------------------------------------------------------------------------- /tests/test_client_feedback.py: -------------------------------------------------------------------------------- 1 | """Tests for ZeroEval client feedback functionality.""" 2 | 3 | import json 4 | from unittest.mock import Mock, patch 5 | 6 | import pytest 7 | 8 | from zeroeval.client import ZeroEval 9 | from zeroeval.errors import PromptRequestError 10 | 11 | 12 | @pytest.fixture 13 | def client(): 14 | """Create a ZeroEval client for testing.""" 15 | return ZeroEval(api_key="test-api-key", base_url="https://api.test.com") 16 | 17 | 18 | @patch("zeroeval.client.requests.post") 19 | def test_send_feedback_success(mock_post, client): 20 | """Test successful feedback submission.""" 21 | mock_response = Mock() 22 | mock_response.status_code = 200 23 | mock_response.json.return_value = { 24 | "id": "feedback-123", 25 | "completion_id": "completion-456", 26 | "prompt_id": "prompt-789", 27 | "prompt_version_id": "version-abc", 28 | "project_id": "project-def", 29 | "thumbs_up": True, 30 | "reason": "Great response", 31 | "expected_output": None, 32 | "metadata": {}, 33 | "created_by": "user-123", 34 | "created_at": "2025-01-01T00:00:00Z", 35 | "updated_at": "2025-01-01T00:00:00Z", 36 | } 37 | mock_post.return_value = mock_response 38 | 39 | result = client.send_feedback( 40 | prompt_slug="test-prompt", 41 | completion_id="completion-456", 42 | thumbs_up=True, 43 | reason="Great response", 44 | ) 45 | 46 | # Verify the request was made correctly 47 | mock_post.assert_called_once() 48 | call_args = mock_post.call_args 49 | 50 | # Check URL 51 | assert call_args[0][0] == "https://api.test.com/v1/prompts/test-prompt/completions/completion-456/feedback" 52 | 53 | # Check headers 54 | headers = call_args[1]["headers"] 55 | assert headers["Authorization"] == "Bearer test-api-key" 56 | assert headers["Content-Type"] == "application/json" 57 | 58 | # Check payload 59 | payload = call_args[1]["json"] 60 | assert payload["thumbs_up"] is True 61 | assert payload["reason"] == "Great response" 62 | assert "expected_output" not in payload # Not included when None 63 | assert "metadata" not in payload # Not included when None 64 | 65 | # Check response 66 | assert result["id"] == "feedback-123" 67 | assert result["thumbs_up"] is True 68 | 69 | 70 | @patch("zeroeval.client.requests.post") 71 | def test_send_feedback_negative_with_expected_output(mock_post, client): 72 | """Test negative feedback with expected output.""" 73 | mock_response = Mock() 74 | mock_response.status_code = 200 75 | mock_response.json.return_value = { 76 | "id": "feedback-456", 77 | "completion_id": "completion-789", 78 | "thumbs_up": False, 79 | "reason": "Incorrect format", 80 | "expected_output": "Should be JSON", 81 | } 82 | mock_post.return_value = mock_response 83 | 84 | result = client.send_feedback( 85 | prompt_slug="test-prompt", 86 | completion_id="completion-789", 87 | thumbs_up=False, 88 | reason="Incorrect format", 89 | expected_output="Should be JSON", 90 | ) 91 | 92 | # Check payload includes all fields 93 | payload = mock_post.call_args[1]["json"] 94 | assert payload["thumbs_up"] is False 95 | assert payload["reason"] == "Incorrect format" 96 | assert payload["expected_output"] == "Should be JSON" 97 | 
98 | assert result["id"] == "feedback-456" 99 | 100 | 101 | @patch("zeroeval.client.requests.post") 102 | def test_send_feedback_with_metadata(mock_post, client): 103 | """Test feedback submission with custom metadata.""" 104 | mock_response = Mock() 105 | mock_response.status_code = 200 106 | mock_response.json.return_value = { 107 | "id": "feedback-789", 108 | "thumbs_up": True, 109 | "metadata": {"source": "automated", "version": "1.0"}, 110 | } 111 | mock_post.return_value = mock_response 112 | 113 | result = client.send_feedback( 114 | prompt_slug="test-prompt", 115 | completion_id="completion-abc", 116 | thumbs_up=True, 117 | metadata={"source": "automated", "version": "1.0"}, 118 | ) 119 | 120 | # Check metadata is included 121 | payload = mock_post.call_args[1]["json"] 122 | assert payload["metadata"] == {"source": "automated", "version": "1.0"} 123 | 124 | assert result["metadata"]["source"] == "automated" 125 | 126 | 127 | @patch("zeroeval.client.requests.post") 128 | def test_send_feedback_minimal(mock_post, client): 129 | """Test feedback with only required fields.""" 130 | mock_response = Mock() 131 | mock_response.status_code = 200 132 | mock_response.json.return_value = { 133 | "id": "feedback-minimal", 134 | "thumbs_up": True, 135 | } 136 | mock_post.return_value = mock_response 137 | 138 | result = client.send_feedback( 139 | prompt_slug="test-prompt", 140 | completion_id="completion-xyz", 141 | thumbs_up=True, 142 | ) 143 | 144 | # Check only thumbs_up is in payload 145 | payload = mock_post.call_args[1]["json"] 146 | assert payload == {"thumbs_up": True} 147 | 148 | assert result["id"] == "feedback-minimal" 149 | 150 | 151 | @patch("zeroeval.client.requests.post") 152 | def test_send_feedback_404_error(mock_post, client): 153 | """Test feedback submission when completion not found.""" 154 | mock_response = Mock() 155 | mock_response.status_code = 404 156 | mock_response.text = "Completion not found" 157 | mock_post.return_value = mock_response 158 | 159 | with pytest.raises(PromptRequestError) as exc_info: 160 | client.send_feedback( 161 | prompt_slug="test-prompt", 162 | completion_id="nonexistent", 163 | thumbs_up=True, 164 | ) 165 | 166 | assert "send_feedback failed" in str(exc_info.value) 167 | assert "404" in str(exc_info.value.status) 168 | 169 | 170 | @patch("zeroeval.client.requests.post") 171 | def test_send_feedback_500_error(mock_post, client): 172 | """Test feedback submission with server error.""" 173 | mock_response = Mock() 174 | mock_response.status_code = 500 175 | mock_response.text = "Internal server error" 176 | mock_post.return_value = mock_response 177 | 178 | with pytest.raises(PromptRequestError) as exc_info: 179 | client.send_feedback( 180 | prompt_slug="test-prompt", 181 | completion_id="completion-123", 182 | thumbs_up=False, 183 | reason="Test", 184 | ) 185 | 186 | assert "send_feedback failed" in str(exc_info.value) 187 | assert "500" in str(exc_info.value.status) 188 | 189 | 190 | @patch("zeroeval.client.requests.post") 191 | def test_send_feedback_timeout(mock_post, client): 192 | """Test feedback submission handles timeout correctly.""" 193 | mock_post.side_effect = Exception("Connection timeout") 194 | 195 | with pytest.raises(Exception) as exc_info: 196 | client.send_feedback( 197 | prompt_slug="test-prompt", 198 | completion_id="completion-123", 199 | thumbs_up=True, 200 | ) 201 | 202 | assert "timeout" in str(exc_info.value).lower() 203 | 204 | -------------------------------------------------------------------------------- 
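The tests above pin down the client-side call shape for prompt feedback. A minimal usage sketch, assuming a valid API key and an existing completion (the slug and IDs below are placeholders; beyond the prompt slug and completion id, only thumbs_up is required and the remaining fields are optional):

    from zeroeval.client import ZeroEval

    client = ZeroEval(api_key="sk_ze_...", base_url="https://api.zeroeval.com")

    client.send_feedback(
        prompt_slug="bookstore-support-agent",   # placeholder prompt slug
        completion_id="chatcmpl-123",            # placeholder completion id
        thumbs_up=False,
        reason="Incorrect format",
        expected_output="Should be JSON",
        metadata={"source": "manual-review"},
    )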
/tests/test_choice.py: -------------------------------------------------------------------------------- 1 | """Tests for A/B testing choice functionality.""" 2 | 3 | import pytest 4 | from unittest.mock import MagicMock, patch 5 | from zeroeval.observability.choice import choose, clear_choice_cache, _choice_cache 6 | from zeroeval.observability.tracer import tracer 7 | 8 | 9 | class TestChoiceDefaultVariant: 10 | """Test that choose() falls back to default variant when test is completed.""" 11 | 12 | def setup_method(self): 13 | """Clear cache before each test.""" 14 | clear_choice_cache() 15 | 16 | def test_choose_returns_default_when_test_completed(self): 17 | """Test that choose returns default variant when backend reports completed status.""" 18 | variants = {"control": "gpt-4", "variant_a": "claude-3"} 19 | weights = {"control": 0.5, "variant_a": 0.5} 20 | 21 | # Mock the backend response to indicate test is completed 22 | mock_response = { 23 | "test_status": "completed", 24 | "message": "Test has ended", 25 | "ab_choice_id": None 26 | } 27 | 28 | # Create a mock span context 29 | with tracer.span("test_span") as span: 30 | with patch('zeroeval.observability.choice._send_choice_data', return_value=mock_response): 31 | result = choose( 32 | name="test_experiment", 33 | variants=variants, 34 | weights=weights, 35 | duration_days=14, 36 | default_variant="control" 37 | ) 38 | 39 | # Should return the default variant value 40 | assert result == "gpt-4" 41 | 42 | def test_choose_caches_default_variant_when_completed(self): 43 | """Test that default variant is cached when test is completed.""" 44 | variants = {"control": "gpt-4", "variant_a": "claude-3"} 45 | weights = {"control": 0.5, "variant_a": 0.5} 46 | 47 | mock_response = { 48 | "test_status": "completed", 49 | "message": "Test has ended" 50 | } 51 | 52 | with tracer.span("test_span") as span: 53 | with patch('zeroeval.observability.choice._send_choice_data', return_value=mock_response): 54 | # First call 55 | result1 = choose( 56 | name="test_experiment", 57 | variants=variants, 58 | weights=weights, 59 | duration_days=14, 60 | default_variant="control" 61 | ) 62 | 63 | # Second call - should use cached value 64 | result2 = choose( 65 | name="test_experiment", 66 | variants=variants, 67 | weights=weights, 68 | duration_days=14, 69 | default_variant="control" 70 | ) 71 | 72 | assert result1 == "gpt-4" 73 | assert result2 == "gpt-4" 74 | 75 | # Verify cache contains default variant key 76 | cache_key = f"span:{span.span_id}:test_experiment" 77 | assert cache_key in _choice_cache 78 | assert _choice_cache[cache_key] == "control" 79 | 80 | def test_choose_caches_selection_when_test_running(self): 81 | """Test that random selection is cached when test is running.""" 82 | variants = {"control": "gpt-4", "variant_a": "claude-3"} 83 | weights = {"control": 1.0, "variant_a": 0.0} # Force control selection 84 | 85 | mock_response = { 86 | "test_status": "running", 87 | "ab_choice_id": "test-choice-id" 88 | } 89 | 90 | with tracer.span("test_span") as span: 91 | with patch('zeroeval.observability.choice._send_choice_data', return_value=mock_response): 92 | result = choose( 93 | name="test_experiment", 94 | variants=variants, 95 | weights=weights, 96 | duration_days=14, 97 | default_variant="control" 98 | ) 99 | 100 | assert result == "gpt-4" 101 | 102 | # Verify cache contains the selected variant key 103 | cache_key = f"span:{span.span_id}:test_experiment" 104 | assert cache_key in _choice_cache 105 | assert _choice_cache[cache_key] == 
"control" 106 | 107 | def test_choose_uses_first_variant_as_default_when_not_specified(self): 108 | """Test that first variant is used as default when default_variant not specified.""" 109 | variants = {"control": "gpt-4", "variant_a": "claude-3"} 110 | weights = {"control": 0.5, "variant_a": 0.5} 111 | 112 | mock_response = { 113 | "test_status": "completed", 114 | "message": "Test has ended" 115 | } 116 | 117 | with tracer.span("test_span"): 118 | with patch('zeroeval.observability.choice._send_choice_data', return_value=mock_response): 119 | result = choose( 120 | name="test_experiment", 121 | variants=variants, 122 | weights=weights, 123 | duration_days=14 124 | # default_variant not specified 125 | ) 126 | 127 | # Should use first variant key as default 128 | assert result == "gpt-4" 129 | 130 | def test_choose_caches_on_api_failure(self): 131 | """Test that selection is cached even when API call fails.""" 132 | variants = {"control": "gpt-4", "variant_a": "claude-3"} 133 | weights = {"control": 1.0, "variant_a": 0.0} # Force control selection 134 | 135 | with tracer.span("test_span") as span: 136 | with patch('zeroeval.observability.choice._send_choice_data', side_effect=Exception("API error")): 137 | result = choose( 138 | name="test_experiment", 139 | variants=variants, 140 | weights=weights, 141 | duration_days=14, 142 | default_variant="control" 143 | ) 144 | 145 | assert result == "gpt-4" 146 | 147 | # Should still cache the selection 148 | cache_key = f"span:{span.span_id}:test_experiment" 149 | assert cache_key in _choice_cache 150 | assert _choice_cache[cache_key] == "control" 151 | 152 | def test_choose_validates_default_variant(self): 153 | """Test that choose raises ValueError when default_variant not in variants.""" 154 | variants = {"control": "gpt-4", "variant_a": "claude-3"} 155 | weights = {"control": 0.5, "variant_a": 0.5} 156 | 157 | with tracer.span("test_span"): 158 | with pytest.raises(ValueError, match="default_variant 'invalid' not found in variants"): 159 | choose( 160 | name="test_experiment", 161 | variants=variants, 162 | weights=weights, 163 | duration_days=14, 164 | default_variant="invalid" # Not in variants 165 | ) 166 | 167 | -------------------------------------------------------------------------------- /src/zeroeval/observability/integrations/vocode/streaming_tracker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Streaming tracker for Vocode operations. 3 | 4 | Provides a clean way to track TTFB and duration for streaming operations 5 | without requiring granular instrumentation. 6 | """ 7 | 8 | import logging 9 | import time 10 | from collections.abc import AsyncGenerator 11 | from typing import Any, Callable, Optional 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class StreamingSpanTracker: 17 | """ 18 | Wraps a streaming generator to track TTFB and total duration. 19 | Updates the span with timing metrics as the stream progresses. 20 | """ 21 | 22 | def __init__(self, 23 | generator: AsyncGenerator, 24 | span: Any, 25 | tracer: Any, 26 | operation_type: str = "streaming"): 27 | """ 28 | Initialize the streaming tracker. 29 | 30 | Args: 31 | generator: The async generator to wrap 32 | span: The ZeroEval span to update 33 | tracer: The ZeroEval tracer 34 | operation_type: Type of operation (tts, stt, etc.) 
35 | """ 36 | self.generator = generator 37 | self.span = span 38 | self.tracer = tracer 39 | self.operation_type = operation_type 40 | self.start_time = time.perf_counter() 41 | self.first_chunk_time: Optional[float] = None 42 | self.chunk_count = 0 43 | self.total_bytes = 0 44 | self.finished = False 45 | 46 | async def __aiter__(self): 47 | """Async iteration that tracks timing.""" 48 | try: 49 | async for item in self.generator: 50 | # Track first chunk (TTFB) 51 | if self.first_chunk_time is None: 52 | self.first_chunk_time = time.perf_counter() 53 | ttfb_ms = (self.first_chunk_time - self.start_time) * 1000 54 | 55 | # Update span with TTFB 56 | self.span.attributes[f"{self.operation_type}.ttfb_ms"] = round(ttfb_ms, 1) 57 | logger.debug(f"[{self.operation_type}] TTFB: {ttfb_ms:.1f}ms") 58 | 59 | # Track chunk metrics 60 | self.chunk_count += 1 61 | if hasattr(item, 'chunk') and item.chunk: 62 | self.total_bytes += len(item.chunk) 63 | 64 | yield item 65 | 66 | finally: 67 | # Calculate total duration when stream ends 68 | if not self.finished: 69 | self.finished = True 70 | total_duration_ms = (time.perf_counter() - self.start_time) * 1000 71 | 72 | # Update span with final metrics 73 | self.span.attributes[f"{self.operation_type}.duration_ms"] = round(total_duration_ms, 1) 74 | self.span.attributes[f"{self.operation_type}.chunk_count"] = self.chunk_count 75 | 76 | if self.total_bytes > 0: 77 | self.span.attributes[f"{self.operation_type}.total_bytes"] = self.total_bytes 78 | 79 | # Calculate streaming duration (time after first chunk) 80 | if self.first_chunk_time: 81 | streaming_duration_ms = (time.perf_counter() - self.first_chunk_time) * 1000 82 | self.span.attributes[f"{self.operation_type}.streaming_duration_ms"] = round(streaming_duration_ms, 1) 83 | 84 | logger.debug( 85 | f"[{self.operation_type}] Completed: " 86 | f"{self.chunk_count} chunks, " 87 | f"{total_duration_ms:.1f}ms total" 88 | ) 89 | 90 | # End the span 91 | self.tracer.end_span(self.span) 92 | 93 | 94 | class SynthesisResultWrapper: 95 | """ 96 | Wraps a Vocode SynthesisResult to track TTS streaming metrics. 97 | Proxies all attributes to the original result while tracking streaming. 98 | """ 99 | 100 | def __init__(self, synthesis_result: Any, span: Any, tracer: Any, text: str): 101 | """ 102 | Initialize the wrapper. 103 | 104 | Args: 105 | synthesis_result: The original SynthesisResult from Vocode 106 | span: The ZeroEval span for this TTS operation 107 | tracer: The ZeroEval tracer 108 | text: The text being synthesized 109 | """ 110 | # Store original result for attribute proxying 111 | self._synthesis_result = synthesis_result 112 | self._span = span 113 | self._tracer = tracer 114 | self._text = text 115 | 116 | # Set input/output on span 117 | self._span.set_io( 118 | input_data=text, 119 | output_data=f"Audio stream ({len(text)} chars)" 120 | ) 121 | 122 | # Wrap the chunk generator with tracking 123 | self.chunk_generator = StreamingSpanTracker( 124 | synthesis_result.chunk_generator, 125 | span, 126 | tracer, 127 | operation_type="tts" 128 | ) 129 | 130 | # Explicitly copy key attributes 131 | self.get_message_up_to = synthesis_result.get_message_up_to 132 | if hasattr(synthesis_result, 'cached'): 133 | self.cached = synthesis_result.cached 134 | span.attributes["tts.cached"] = synthesis_result.cached 135 | 136 | def __getattr__(self, name): 137 | """ 138 | Proxy any unknown attributes to the original synthesis result. 139 | This ensures compatibility with Vocode's expectations. 
140 |         """
141 |         # Avoid infinite recursion when accessing _synthesis_result
142 |         if name.startswith('_'):
143 |             raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
144 | 
145 |         # Proxy to the original synthesis result
146 |         return getattr(self._synthesis_result, name)
147 | 
148 |     def __setattr__(self, name, value):
149 |         """
150 |         Handle attribute setting properly.
151 |         """
152 |         # Set our own private attributes
153 |         if name.startswith('_') or name in ('chunk_generator', 'get_message_up_to', 'cached'):
154 |             object.__setattr__(self, name, value)
155 |         else:
156 |             # Proxy to the original synthesis result
157 |             if hasattr(self, '_synthesis_result'):
158 |                 setattr(self._synthesis_result, name, value)
159 |             else:
160 |                 object.__setattr__(self, name, value)
161 | 
162 | 
163 | def track_tts_streaming(span: Any, tracer: Any, text: str) -> Callable:
164 |     """
165 |     Returns a decorator that wraps TTS results with streaming tracking.
166 | 
167 |     Args:
168 |         span: The ZeroEval span for the TTS operation
169 |         tracer: The ZeroEval tracer
170 |         text: The text being synthesized
171 | 
172 |     Returns:
173 |         A function that wraps SynthesisResult objects
174 |     """
175 |     def wrap_result(synthesis_result):
176 |         """Wrap the synthesis result with tracking."""
177 |         if synthesis_result is None:
178 |             tracer.end_span(span)
179 |             return synthesis_result
180 | 
181 |         return SynthesisResultWrapper(synthesis_result, span, tracer, text)
182 | 
183 |     return wrap_result
184 | 
--------------------------------------------------------------------------------
/examples_v2/tuning/bookstore_agent_with_feedback.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Customer Support Agent with Tuning and Feedback Loop
4 | ===================================================
5 | 
6 | This example is an enhanced version of `bookstore_agent.py` that adds an
7 | automated feedback loop. It demonstrates how to:
8 | 
9 | 1. Use `ze.prompt()` to manage prompts
10 | 2. Automatically trace OpenAI calls
11 | 3. **New**: Use a powerful model (evaluator) to critique the agent's responses
12 | 4. **New**: Submit this feedback using `ze.send_feedback()` to improve the prompt over time
13 | 
14 | Key concepts:
15 | - `ze.send_feedback()`: Submits programmatic feedback (thumbs up/down, reason) associated with a completion
16 | - Automated Evaluation: Using a stronger model to grade a faster/cheaper model
17 | """
18 | 
19 | import os
20 | from pathlib import Path
21 | import json
22 | 
23 | from dotenv import load_dotenv
24 | 
25 | # Load environment variables BEFORE importing zeroeval
26 | env_path = Path(__file__).parent.parent / ".env"
27 | load_dotenv(env_path)
28 | 
29 | import openai
30 | import zeroeval as ze
31 | 
32 | # 1. Initialize ZeroEval
33 | # Ensure you have ZEROEVAL_API_KEY and ZEROEVAL_API_URL set in your environment
34 | ze.init(
35 |     api_key=os.getenv("ZEROEVAL_API_KEY"),
36 |     api_url=os.getenv("ZEROEVAL_API_URL", "http://localhost:8000"),
37 | )
38 | 
39 | # Initialize OpenAI client
40 | client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
41 | 
42 | def customer_support_agent(user_query: str, user_context: dict = None, conversation_history: list = None):
43 |     """
44 |     A simple customer support agent that uses a managed prompt and maintains conversation history.
45 |     """
46 |     if user_context is None:
47 |         user_context = {}
48 |     if conversation_history is None:
49 |         conversation_history = []
50 | 
51 |     # 2.
Define the prompt using ze.prompt() 52 | prompt_name = "bookstore-support-agent-with-sdk-feedback" 53 | 54 | system_instruction = ze.prompt( 55 | name=prompt_name, 56 | content="""You are Elena, a passionate book enthusiast and customer support specialist at Bibliophile Books. You've worked in the bookstore for 5 years and genuinely love helping people discover their next great read. 57 | 58 | Your personality: 59 | - Warm and personable, like chatting with a knowledgeable friend at a bookshop 60 | - Enthusiastic about books and reading 61 | - Patient and empathetic when customers have issues 62 | - Professional but not overly formal 63 | - You use the customer's name naturally in conversation 64 | 65 | Customer Information: 66 | - Name: {{user_name}} 67 | - Membership Level: {{membership}} 68 | 69 | Guidelines: 70 | - Address {{user_name}} directly and warmly 71 | - For Gold members: Remember they have free shipping, priority support, and 15% off all purchases 72 | - For Standard members: Offer helpful service while mentioning Gold membership benefits when relevant 73 | - Keep responses concise but friendly 74 | - If you don't know something or can't help, offer to connect them with a specialist 75 | - Never use placeholder text like "[Your Name]" - you are Elena 76 | 77 | Respond directly to their query in a helpful, personable way.""", 78 | variables={ 79 | "user_name": user_context.get("name", "there"), 80 | "membership": user_context.get("membership", "Standard") 81 | } 82 | ) 83 | 84 | print(f"\n--- Sending Request to AI ({prompt_name}) ---") 85 | 86 | # Build messages with conversation history 87 | messages = [{"role": "system", "content": system_instruction}] 88 | messages.extend(conversation_history) 89 | messages.append({"role": "user", "content": user_query}) 90 | 91 | # 3. Call the Model 92 | # The SDK intercepts this call and tracks the completion_id 93 | response = client.chat.completions.create( 94 | model="gpt-4o-mini", # Use a cost-effective model for the agent 95 | messages=messages, 96 | temperature=0.7 97 | ) 98 | 99 | completion_text = response.choices[0].message.content 100 | completion_id = response.id 101 | 102 | return completion_text, completion_id, prompt_name 103 | 104 | def evaluate_response(user_query: str, agent_response: str): 105 | """ 106 | Uses a powerful model (Evaluator) to grade the agent's response. 107 | Returns (is_good: bool, reason: str) 108 | """ 109 | print("\n--- Running Evaluator (GPT-4o) ---") 110 | 111 | eval_prompt = f"""You are an expert customer support quality assurance specialist. 112 | Your job is to evaluate a customer support response. 113 | 114 | User Query: "{user_query}" 115 | Agent Response: "{agent_response}" 116 | 117 | Criteria: 118 | 1. Is the tone warm and professional? 119 | 2. Is the information accurate and helpful? 120 | 3. Does it address the user's specific query? 
121 | 122 | Output strictly in JSON format with these fields: 123 | - "score": 1 to 5 (5 being perfect) 124 | - "reason": A brief explanation of the score 125 | - "thumbs_up": true if score >= 4, else false 126 | """ 127 | 128 | response = client.chat.completions.create( 129 | model="gpt-4o", # Use a powerful model for evaluation 130 | messages=[{"role": "user", "content": eval_prompt}], 131 | temperature=0, 132 | response_format={"type": "json_object"} 133 | ) 134 | 135 | try: 136 | result = json.loads(response.choices[0].message.content) 137 | return result 138 | except Exception as e: 139 | print(f"Error parsing evaluation: {e}") 140 | return {"thumbs_up": True, "reason": "Failed to parse evaluation", "score": 5} 141 | 142 | def main(): 143 | # Example interaction 144 | print("\n=== Bookstore Support Agent with Feedback Loop (Type 'exit' to quit) ===") 145 | 146 | user_context = { 147 | "name": "Alice", 148 | "membership": "Gold" # VIP customer 149 | } 150 | print(f"Context: User={user_context['name']}, Membership={user_context['membership']}\n") 151 | 152 | conversation_history = [] 153 | 154 | while True: 155 | try: 156 | user_query = input("\nEnter your query: ").strip() 157 | if not user_query: 158 | continue 159 | 160 | if user_query.lower() in ('exit', 'quit'): 161 | print("Goodbye!") 162 | break 163 | 164 | # 1. Get response from the agent 165 | response_text, completion_id, prompt_slug = customer_support_agent( 166 | user_query, 167 | user_context, 168 | conversation_history 169 | ) 170 | 171 | print(f"\nElena: {response_text}") 172 | print(f"\n[DEBUG] OpenAI completion_id: {completion_id}") 173 | print(f"[DEBUG] Prompt slug: {prompt_slug}") 174 | 175 | # 2. Generate feedback using a powerful model 176 | # In a real system, this might happen asynchronously or be sampled 177 | eval_result = evaluate_response(user_query, response_text) 178 | 179 | print(f"\n[Evaluator] Score: {eval_result.get('score')}/5") 180 | print(f"[Evaluator] Reason: {eval_result.get('reason')}") 181 | print(f"[Evaluator] Verdict: {'👍 Thumbs Up' if eval_result.get('thumbs_up') else '👎 Thumbs Down'}") 182 | 183 | # 3. Submit feedback to ZeroEval 184 | # This signals to the optimizer which responses were good/bad 185 | ze.send_feedback( 186 | prompt_slug=prompt_slug, 187 | completion_id=completion_id, 188 | thumbs_up=eval_result.get("thumbs_up", True), 189 | reason=eval_result.get("reason"), 190 | metadata={ 191 | "score": eval_result.get("score"), 192 | "evaluator_model": "gpt-4o" 193 | } 194 | ) 195 | print("✓ Feedback submitted to ZeroEval") 196 | 197 | # Add to conversation history 198 | conversation_history.append({"role": "user", "content": user_query}) 199 | conversation_history.append({"role": "assistant", "content": response_text}) 200 | 201 | except KeyboardInterrupt: 202 | print("\nGoodbye!") 203 | break 204 | except Exception as e: 205 | print(f"\nError: {e}") 206 | import traceback 207 | traceback.print_exc() 208 | break 209 | 210 | if __name__ == "__main__": 211 | main() 212 | 213 | -------------------------------------------------------------------------------- /tests/test_httpx_integration.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the httpx integration with mock Gemini API responses. 
3 | """ 4 | 5 | import json 6 | import pytest 7 | import httpx 8 | from unittest.mock import patch, MagicMock 9 | from zeroeval import ze 10 | from zeroeval.observability.integrations.httpx.integration import HttpxIntegration 11 | 12 | 13 | def test_httpx_integration_setup(): 14 | """Test that httpx integration can be set up.""" 15 | # Create a mock tracer 16 | mock_tracer = MagicMock() 17 | 18 | # Create and setup the integration 19 | integration = HttpxIntegration(mock_tracer) 20 | integration.setup() 21 | 22 | # Verify that httpx methods are patched 23 | assert hasattr(httpx.Client.request, "__ze_patched__") 24 | assert hasattr(httpx.AsyncClient.request, "__ze_patched__") 25 | 26 | 27 | def test_gemini_url_pattern_matching(): 28 | """Test that the Gemini URL pattern correctly identifies API endpoints.""" 29 | mock_tracer = MagicMock() 30 | integration = HttpxIntegration(mock_tracer) 31 | 32 | # Valid Gemini URLs 33 | valid_urls = [ 34 | "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent", 35 | "https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent", 36 | "https://generativelanguage.googleapis.com/v1/models/gemini-1.5-pro:generateContent", 37 | "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:streamGenerateContent", 38 | ] 39 | 40 | for url in valid_urls: 41 | assert integration._should_trace_request(url), f"Should trace: {url}" 42 | 43 | # Invalid URLs that should not be traced 44 | invalid_urls = [ 45 | "https://api.openai.com/v1/chat/completions", 46 | "https://example.com/api", 47 | "https://generativelanguage.googleapis.com/v1beta/models", 48 | "https://google.com", 49 | ] 50 | 51 | for url in invalid_urls: 52 | assert not integration._should_trace_request(url), f"Should not trace: {url}" 53 | 54 | 55 | def test_model_extraction_from_url(): 56 | """Test extracting model name from Gemini URL.""" 57 | mock_tracer = MagicMock() 58 | integration = HttpxIntegration(mock_tracer) 59 | 60 | test_cases = [ 61 | ("https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent", "gemini-1.5-flash"), 62 | ("https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:streamGenerateContent", "gemini-pro"), 63 | ("https://generativelanguage.googleapis.com/v1/models/gemini-1.5-pro-latest:generateContent", "gemini-1.5-pro-latest"), 64 | ] 65 | 66 | for url, expected_model in test_cases: 67 | assert integration._extract_model_from_url(url) == expected_model 68 | 69 | 70 | def test_operation_extraction_from_url(): 71 | """Test extracting operation name from Gemini URL.""" 72 | mock_tracer = MagicMock() 73 | integration = HttpxIntegration(mock_tracer) 74 | 75 | test_cases = [ 76 | ("https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent", "generateContent"), 77 | ("https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:streamGenerateContent", "streamGenerateContent"), 78 | ] 79 | 80 | for url, expected_operation in test_cases: 81 | assert integration._extract_operation_from_url(url) == expected_operation 82 | 83 | 84 | def test_gemini_request_parsing(): 85 | """Test parsing of Gemini API request payloads.""" 86 | mock_tracer = MagicMock() 87 | integration = HttpxIntegration(mock_tracer) 88 | 89 | # Test basic request 90 | request_data = { 91 | "contents": [{"parts": [{"text": "Hello"}]}], 92 | "generationConfig": { 93 | "temperature": 0.7, 94 | "maxOutputTokens": 100, 95 | "topP": 0.9, 96 | "topK": 40 97 | } 98 | } 99 | 100 | attrs = 
integration._parse_gemini_request(request_data) 101 | assert attrs["contents"] == request_data["contents"] 102 | assert attrs["temperature"] == 0.7 103 | assert attrs["max_output_tokens"] == 100 104 | assert attrs["top_p"] == 0.9 105 | assert attrs["top_k"] == 40 106 | 107 | # Test request with tools 108 | request_with_tools = { 109 | "contents": [{"parts": [{"text": "Hello"}]}], 110 | "tools": [{ 111 | "functionDeclarations": [{ 112 | "name": "get_weather", 113 | "description": "Get weather information" 114 | }] 115 | }], 116 | "toolConfig": { 117 | "functionCallingConfig": { 118 | "mode": "AUTO" 119 | } 120 | } 121 | } 122 | 123 | attrs = integration._parse_gemini_request(request_with_tools) 124 | assert "tools" in attrs 125 | assert len(attrs["tools"]) == 1 126 | assert attrs["tools"][0]["name"] == "get_weather" 127 | assert attrs["tool_calling_mode"] == "AUTO" 128 | 129 | 130 | def test_gemini_response_parsing(): 131 | """Test parsing of Gemini API response payloads.""" 132 | mock_tracer = MagicMock() 133 | integration = HttpxIntegration(mock_tracer) 134 | 135 | # Test basic response 136 | response_data = { 137 | "candidates": [{ 138 | "content": { 139 | "parts": [{"text": "Hello, world!"}] 140 | }, 141 | "finishReason": "STOP", 142 | "safetyRatings": [ 143 | {"category": "HARM_CATEGORY_HARASSMENT", "probability": "NEGLIGIBLE"} 144 | ] 145 | }], 146 | "usageMetadata": { 147 | "promptTokenCount": 10, 148 | "candidatesTokenCount": 20, 149 | "totalTokenCount": 30 150 | }, 151 | "modelVersion": "gemini-1.5-flash-001", 152 | "responseId": "abc123" 153 | } 154 | 155 | attrs, output = integration._parse_gemini_response(response_data) 156 | assert output == "Hello, world!" 157 | assert attrs["finish_reason"] == "STOP" 158 | assert attrs["inputTokens"] == 10 159 | assert attrs["outputTokens"] == 20 160 | assert attrs["totalTokens"] == 30 161 | assert attrs["model_version"] == "gemini-1.5-flash-001" 162 | assert attrs["response_id"] == "abc123" 163 | assert len(attrs["safety_ratings"]) == 1 164 | 165 | # Test response with function call 166 | response_with_function = { 167 | "candidates": [{ 168 | "content": { 169 | "parts": [{ 170 | "functionCall": { 171 | "name": "get_weather", 172 | "args": {"location": "San Francisco"} 173 | } 174 | }] 175 | } 176 | }], 177 | "usageMetadata": { 178 | "promptTokenCount": 15, 179 | "candidatesTokenCount": 5, 180 | "totalTokenCount": 20 181 | } 182 | } 183 | 184 | attrs, output = integration._parse_gemini_response(response_with_function) 185 | assert "function_calls" in attrs 186 | assert len(attrs["function_calls"]) == 1 187 | assert attrs["function_calls"][0]["name"] == "get_weather" 188 | assert attrs["function_calls"][0]["args"]["location"] == "San Francisco" 189 | # When there's only a function call, output should be the JSON representation 190 | assert "get_weather" in output 191 | 192 | 193 | @pytest.mark.asyncio 194 | async def test_async_request_tracing(): 195 | """Test that async requests are properly traced.""" 196 | mock_tracer = MagicMock() 197 | mock_span = MagicMock() 198 | mock_tracer.start_span.return_value = mock_span 199 | 200 | integration = HttpxIntegration(mock_tracer) 201 | integration.setup() 202 | 203 | # Mock response 204 | mock_response = MagicMock() 205 | mock_response.status_code = 200 206 | mock_response.text = json.dumps({ 207 | "candidates": [{ 208 | "content": {"parts": [{"text": "Test response"}]} 209 | }], 210 | "usageMetadata": { 211 | "promptTokenCount": 5, 212 | "candidatesTokenCount": 3, 213 | "totalTokenCount": 8 214 | } 
215 | }) 216 | 217 | # Patch the original httpx request 218 | with patch("httpx.AsyncClient.request", new=MagicMock(return_value=mock_response)): 219 | # Re-apply our wrapper 220 | original_method = httpx.AsyncClient.request 221 | wrapped_method = integration._wrap_request_async(original_method) 222 | 223 | # Make a request that should be traced 224 | url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent" 225 | response = await wrapped_method( 226 | None, # self (client instance) 227 | "POST", 228 | url, 229 | json={"contents": [{"parts": [{"text": "Hello"}]}]} 230 | ) 231 | 232 | # Verify span was created 233 | mock_tracer.start_span.assert_called_once() 234 | call_args = mock_tracer.start_span.call_args 235 | assert call_args[0][0] == "gemini.models.generateContent" 236 | assert call_args[1]["kind"] == "llm" 237 | assert call_args[1]["attributes"]["model"] == "gemini-1.5-flash" 238 | 239 | # Verify span was ended 240 | mock_tracer.end_span.assert_called_once_with(mock_span) 241 | 242 | 243 | if __name__ == "__main__": 244 | # Run tests 245 | pytest.main([__file__, "-v"]) 246 | -------------------------------------------------------------------------------- /examples_v2/tuning/bookstore_agent_with_api_feedback.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Customer Support Agent with API Feedback Loop 4 | =================================================== 5 | 6 | This example demonstrates how to submit feedback using the ZeroEval API directly, 7 | bypassing the SDK's `ze.send_feedback` helper. This is useful for: 8 | 1. Frontend applications calling the backend directly 9 | 2. Systems where the SDK is not installed 10 | 3. Custom integrations 11 | 12 | Key concepts: 13 | - `POST /v1/prompts/{slug}/completions/{id}/feedback`: The feedback endpoint 14 | - Direct API interaction 15 | """ 16 | 17 | import os 18 | import json 19 | import requests 20 | from pathlib import Path 21 | from dotenv import load_dotenv 22 | 23 | # Load environment variables BEFORE importing zeroeval 24 | env_path = Path(__file__).parent.parent / ".env" 25 | load_dotenv(env_path) 26 | 27 | import openai 28 | import zeroeval as ze 29 | 30 | # Configuration 31 | API_URL = os.getenv("ZEROEVAL_API_URL", "http://localhost:8000") 32 | API_KEY = os.getenv("ZEROEVAL_API_KEY") # Use your ZeroEval API Key 33 | 34 | # 1. Initialize ZeroEval 35 | ze.init( 36 | api_key=API_KEY, 37 | api_url=API_URL, 38 | ) 39 | 40 | # Initialize OpenAI client 41 | client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY")) 42 | 43 | def customer_support_agent(user_query: str, user_context: dict = None, conversation_history: list = None): 44 | """ 45 | A simple customer support agent that uses a managed prompt and maintains conversation history. 46 | """ 47 | if user_context is None: 48 | user_context = {} 49 | if conversation_history is None: 50 | conversation_history = [] 51 | 52 | # 2. Define the prompt using ze.prompt() 53 | prompt_name = "bookstore-support-agent-with-api-feedback" 54 | 55 | system_instruction = ze.prompt( 56 | name=prompt_name, 57 | content="""You are Elena, a passionate book enthusiast and customer support specialist at Bibliophile Books. You've worked in the bookstore for 5 years and genuinely love helping people discover their next great read. 
58 | 59 | Your personality: 60 | - Warm and personable, like chatting with a knowledgeable friend at a bookshop 61 | - Enthusiastic about books and reading 62 | - Patient and empathetic when customers have issues 63 | - Professional but not overly formal 64 | - You use the customer's name naturally in conversation 65 | 66 | Customer Information: 67 | - Name: {{user_name}} 68 | - Membership Level: {{membership}} 69 | 70 | Guidelines: 71 | - Address {{user_name}} directly and warmly 72 | - For Gold members: Remember they have free shipping, priority support, and 15% off all purchases 73 | - For Standard members: Offer helpful service while mentioning Gold membership benefits when relevant 74 | - Keep responses concise but friendly 75 | - If you don't know something or can't help, offer to connect them with a specialist 76 | - Never use placeholder text like "[Your Name]" - you are Elena 77 | 78 | Respond directly to their query in a helpful, personable way.""", 79 | variables={ 80 | "user_name": user_context.get("name", "there"), 81 | "membership": user_context.get("membership", "Standard") 82 | } 83 | ) 84 | 85 | print(f"\n--- Sending Request to AI ({prompt_name}) ---") 86 | 87 | # Build messages with conversation history 88 | messages = [{"role": "system", "content": system_instruction}] 89 | messages.extend(conversation_history) 90 | messages.append({"role": "user", "content": user_query}) 91 | 92 | # 3. Call the Model 93 | # The SDK intercepts this call and tracks the completion_id 94 | response = client.chat.completions.create( 95 | model="gpt-4o-mini", # Use a cost-effective model for the agent 96 | messages=messages, 97 | temperature=0.7 98 | ) 99 | 100 | completion_text = response.choices[0].message.content 101 | completion_id = response.id 102 | 103 | return completion_text, completion_id, prompt_name 104 | 105 | def evaluate_response(user_query: str, agent_response: str): 106 | """ 107 | Uses a powerful model (Evaluator) to grade the agent's response. 108 | Returns (is_good: bool, reason: str) 109 | """ 110 | print("\n--- Running Evaluator (GPT-4o) ---") 111 | 112 | eval_prompt = f"""You are an expert customer support quality assurance specialist. 113 | Your job is to evaluate a customer support response. 114 | 115 | User Query: "{user_query}" 116 | Agent Response: "{agent_response}" 117 | 118 | Criteria: 119 | 1. Is the tone warm and professional? 120 | 2. Is the information accurate and helpful? 121 | 3. Does it address the user's specific query? 122 | 123 | Output strictly in JSON format with these fields: 124 | - "score": 1 to 5 (5 being perfect) 125 | - "reason": A brief explanation of the score 126 | - "thumbs_up": true if score >= 4, else false 127 | """ 128 | 129 | response = client.chat.completions.create( 130 | model="gpt-4o", # Use a powerful model for evaluation 131 | messages=[{"role": "user", "content": eval_prompt}], 132 | temperature=0, 133 | response_format={"type": "json_object"} 134 | ) 135 | 136 | try: 137 | result = json.loads(response.choices[0].message.content) 138 | return result 139 | except Exception as e: 140 | print(f"Error parsing evaluation: {e}") 141 | return {"thumbs_up": True, "reason": "Failed to parse evaluation", "score": 5} 142 | 143 | def send_feedback_via_api(prompt_slug, completion_id, thumbs_up, reason=None, expected_output=None, metadata=None): 144 | """ 145 | Sends feedback directly using requests.post to the ZeroEval API. 
146 | """ 147 | url = f"{API_URL}/v1/prompts/{prompt_slug}/completions/{completion_id}/feedback" 148 | 149 | payload = { 150 | "thumbs_up": thumbs_up, 151 | "reason": reason, 152 | "expected_output": expected_output, 153 | "metadata": metadata or {} 154 | } 155 | 156 | headers = { 157 | "Authorization": f"Bearer {API_KEY}", 158 | "Content-Type": "application/json" 159 | } 160 | 161 | try: 162 | print(f"\n[API] POST {url}") 163 | resp = requests.post(url, json=payload, headers=headers) 164 | resp.raise_for_status() 165 | print("✓ API Feedback submitted successfully") 166 | return resp.json() 167 | except requests.exceptions.HTTPError as e: 168 | print(f"❌ API Request failed: {e}") 169 | print(f"Response: {e.response.text}") 170 | return None 171 | except Exception as e: 172 | print(f"❌ Error sending feedback: {e}") 173 | return None 174 | 175 | def main(): 176 | # Example interaction 177 | print("\n=== Bookstore Support Agent with API Feedback (Type 'exit' to quit) ===") 178 | 179 | user_context = { 180 | "name": "Alice", 181 | "membership": "Gold" # VIP customer 182 | } 183 | print(f"Context: User={user_context['name']}, Membership={user_context['membership']}\n") 184 | 185 | conversation_history = [] 186 | 187 | while True: 188 | try: 189 | user_query = input("\nEnter your query: ").strip() 190 | if not user_query: 191 | continue 192 | 193 | if user_query.lower() in ('exit', 'quit'): 194 | print("Goodbye!") 195 | break 196 | 197 | # 1. Get response from the agent 198 | response_text, completion_id, prompt_slug = customer_support_agent( 199 | user_query, 200 | user_context, 201 | conversation_history 202 | ) 203 | 204 | print(f"\nElena: {response_text}") 205 | print(f"\n[DEBUG] OpenAI completion_id: {completion_id}") 206 | print(f"[DEBUG] Prompt slug: {prompt_slug}") 207 | 208 | # 2. Generate feedback using a powerful model 209 | # In a real system, this might happen asynchronously or be sampled 210 | eval_result = evaluate_response(user_query, response_text) 211 | 212 | print(f"\n[Evaluator] Score: {eval_result.get('score')}/5") 213 | print(f"[Evaluator] Reason: {eval_result.get('reason')}") 214 | print(f"[Evaluator] Verdict: {'👍 Thumbs Up' if eval_result.get('thumbs_up') else '👎 Thumbs Down'}") 215 | 216 | # 3. Submit feedback via API directly 217 | send_feedback_via_api( 218 | prompt_slug=prompt_slug, 219 | completion_id=completion_id, 220 | thumbs_up=eval_result.get("thumbs_up", True), 221 | reason=eval_result.get("reason"), 222 | metadata={ 223 | "score": eval_result.get("score"), 224 | "evaluator_model": "gpt-4o", 225 | "source": "direct_api" 226 | } 227 | ) 228 | 229 | # Add to conversation history 230 | conversation_history.append({"role": "user", "content": user_query}) 231 | conversation_history.append({"role": "assistant", "content": response_text}) 232 | 233 | except KeyboardInterrupt: 234 | print("\nGoodbye!") 235 | break 236 | except Exception as e: 237 | print(f"\nError: {e}") 238 | import traceback 239 | traceback.print_exc() 240 | break 241 | 242 | if __name__ == "__main__": 243 | main() 244 | 245 | -------------------------------------------------------------------------------- /INTEGRATIONS.md: -------------------------------------------------------------------------------- 1 | # ZeroEval Integrations 2 | 3 | ZeroEval automatically instruments popular AI/ML frameworks to provide comprehensive observability without manual instrumentation. When you initialize the ZeroEval tracer, it automatically detects and patches supported libraries. 
4 | 5 | ## Overview 6 | 7 | All integrations are automatically enabled when available. No additional configuration is required beyond initializing ZeroEval: 8 | 9 | ```python 10 | import zeroeval as ze 11 | ze.init(api_key="YOUR_API_KEY") 12 | ``` 13 | 14 | ## Supported Integrations 15 | 16 | ### 1. OpenAI 17 | 18 | Automatically traces all OpenAI API calls including: 19 | 20 | - Chat completions (streaming and non-streaming) 21 | - Responses API (for GPT-5 and newer models) 22 | - Token usage tracking 23 | - Input/output capture 24 | - Error handling 25 | - OpenAI-compatible response methods 26 | 27 | **Traced Operations:** 28 | 29 | - `client.chat.completions.create()` 30 | - `client.responses.create()` (GPT-5+ models) 31 | - Streaming responses with automatic buffering 32 | 33 | **Response Methods Support:** 34 | 35 | ZeroEval now provides OpenAI-compatible response methods for both streaming and non-streaming responses, including responses from OpenAI-compatible APIs that return plain dictionaries: 36 | 37 | - `response.to_dict()` - Convert response to dictionary 38 | - `response.to_json()` - Convert response to JSON string 39 | - `response.model_dump()` - Pydantic v2 alias for `to_dict()` 40 | - `response.model_dump_json()` - Pydantic v2 alias for `to_json()` 41 | - `response.dict()` - Deprecated Pydantic v1 alias 42 | - `response.json()` - Deprecated Pydantic v1 alias 43 | 44 | **Example:** 45 | 46 | ```python 47 | import openai 48 | client = openai.OpenAI() 49 | 50 | # This call is automatically traced 51 | response = client.chat.completions.create( 52 | model="gpt-4", 53 | messages=[{"role": "user", "content": "Hello!"}] 54 | ) 55 | 56 | # Response methods work for both OpenAI and compatible APIs 57 | data = response.to_dict() # Get dictionary representation 58 | json_str = response.to_json(indent=2) # Get formatted JSON 59 | 60 | # Works with streaming too 61 | stream = client.chat.completions.create( 62 | model="gpt-4", 63 | messages=[{"role": "user", "content": "Hello!"}], 64 | stream=True 65 | ) 66 | 67 | # After streaming completes, response methods are available 68 | for chunk in stream: 69 | print(chunk) 70 | 71 | # Now you can use response methods on the stream 72 | final_data = stream.to_dict() 73 | ``` 74 | 75 | **Responses API (GPT-5+ Models):** 76 | 77 | The new `responses.create()` endpoint for GPT-5 and newer models is also automatically traced: 78 | 79 | ```python 80 | # Using the responses API 81 | response = client.responses.create( 82 | model="gpt-5", 83 | input=[{"role": "user", "content": "Hello!"}], 84 | tools=[{ 85 | "type": "function", 86 | "function": { 87 | "name": "get_weather", 88 | "description": "Get the weather" 89 | } 90 | }], 91 | reasoning={ 92 | "effort": "low", 93 | "summary": "detailed" 94 | } 95 | ) 96 | 97 | # Access response data 98 | print(response.output_text) 99 | print(response.usage.input_tokens) 100 | 101 | # Use response methods 102 | data = response.to_dict() 103 | json_str = response.to_json() 104 | ``` 105 | 106 | The integration captures: 107 | 108 | - Input data and tools 109 | - Output text and tool calls 110 | - Reasoning traces 111 | - Token usage (input_tokens/output_tokens) 112 | - Response methods work on all response types 113 | 114 | ### 2. 
LangChain 115 | 116 | Comprehensive tracing for all LangChain components: 117 | 118 | **Traced Components:** 119 | 120 | - **Runnables**: `invoke`, `ainvoke`, `stream`, `astream`, `batch`, `abatch` 121 | - **LLMs**: All language model calls via `BaseLanguageModel` 122 | - **Tools**: `BaseTool.run()` and `arun()` 123 | - **Retrievers**: Document retrieval operations 124 | - **Chains**: Sequential and parallel chain execution 125 | 126 | **Example:** 127 | 128 | ```python 129 | from langchain_openai import ChatOpenAI 130 | from langchain_core.prompts import ChatPromptTemplate 131 | 132 | # All components are automatically traced 133 | model = ChatOpenAI() 134 | prompt = ChatPromptTemplate.from_template("Tell me about {topic}") 135 | chain = prompt | model 136 | 137 | # This creates a trace with spans for prompt + model 138 | response = chain.invoke({"topic": "AI"}) 139 | ``` 140 | 141 | ### 3. LangGraph (Enhanced) 142 | 143 | Our most comprehensive integration, tracing the full agentic workflow. 144 | 145 | **Important:** You will see BOTH `langgraph.*` and `langchain.*` spans when using LangGraph: 146 | 147 | - `langgraph.invoke`, `langgraph.stream` - High-level graph execution spans 148 | - `langchain.*` - Individual node and component execution spans 149 | 150 | This is expected behavior as LangGraph builds on top of LangChain components. 151 | 152 | **Traced Operations:** 153 | 154 | #### Graph Execution 155 | 156 | - `invoke`, `ainvoke` - Full graph runs with metadata 157 | - `stream`, `astream` - Streaming with node sequence tracking 158 | - Graph structure metadata (nodes, edges, conditionals) 159 | 160 | #### Node-Level Tracing 161 | 162 | - **Individual node executions** - Each node gets its own span 163 | - **State transformations** - Input/output state for each node 164 | - **Execution timing** - Latency per node 165 | 166 | #### Conditional Logic 167 | 168 | - **Conditional edges** - Traces routing decisions 169 | - **Dynamic flow** - Captures actual execution path 170 | 171 | #### Tool Integration 172 | 173 | - **Tool calls within nodes** - Integrated with LangChain tool tracing 174 | - **Multi-step reasoning** - Full visibility into agent decision-making 175 | 176 | #### Checkpointing (if enabled) 177 | 178 | - **State persistence** - Save/load operations 179 | - **Recovery points** - Checkpoint timing and size 180 | 181 | **Enhanced Attributes:** 182 | 183 | - `node_count` - Number of nodes in the graph 184 | - `edge_count` - Number of edges 185 | - `has_conditionals` - Whether graph has conditional routing 186 | - `nodes` - List of node names 187 | - `node_sequence` - Execution order during streaming 188 | - `time_to_first_chunk` - Streaming latency metrics 189 | 190 | **Example:** 191 | 192 | ```python 193 | from langgraph.graph import StateGraph, START, END 194 | from langchain_core.messages import HumanMessage 195 | 196 | # Define a multi-node graph 197 | workflow = StateGraph(AgentState) 198 | workflow.add_node("reasoning", reasoning_node) 199 | workflow.add_node("agent", agent_node) 200 | workflow.add_node("tools", tool_node) 201 | 202 | # Add conditional routing 203 | workflow.add_conditional_edges( 204 | "agent", 205 | should_continue, 206 | {"tools": "tools", "end": END} 207 | ) 208 | 209 | app = workflow.compile() 210 | 211 | # This creates a comprehensive trace hierarchy: 212 | # - Parent span for full graph execution 213 | # - Child spans for each node (reasoning, agent, tools) 214 | # - Metadata about graph structure and routing 215 | result = 
app.invoke({"messages": [HumanMessage(content="Help me plan a trip")]}) 216 | ``` 217 | 218 | **Trace Hierarchy Example:** 219 | 220 | ``` 221 | langgraph.invoke (500ms) 222 | ├── langgraph.node.reasoning (50ms) 223 | ├── langgraph.node.agent (200ms) 224 | ├── langchain.invoke (180ms) [LLM call within agent] 225 | ├── langgraph.node.tools (150ms) 226 | └── langchain.tool.run (140ms) [Tool execution] 227 | ``` 228 | 229 | ## Auto-Instrumentation Details 230 | 231 | The ZeroEval tracer automatically: 232 | 233 | 1. **Detects installed packages** - Only patches libraries that are available 234 | 2. **Preserves functionality** - All original behavior is maintained 235 | 3. **Handles errors gracefully** - Tracing failures don't break your application 236 | 4. **Supports async operations** - Full async/await support 237 | 5. **Manages trace hierarchy** - Automatic parent-child span relationships 238 | 239 | ## Disabling Integrations 240 | 241 | While integrations are automatic, you can disable specific ones if needed: 242 | 243 | ```python 244 | import zeroeval as ze 245 | 246 | # Method 1: Disable during initialization (recommended) 247 | ze.init( 248 | api_key="YOUR_API_KEY", 249 | disabled_integrations=["openai", "langgraph"] # Disable specific integrations 250 | ) 251 | 252 | # Method 2: Via environment variable 253 | # Set ZEROEVAL_DISABLED_INTEGRATIONS=openai,langgraph before running 254 | 255 | # Method 3: Configure after initialization 256 | from zeroeval.observability.tracer import tracer 257 | tracer.configure(integrations={"openai": False, "langgraph": False}) 258 | ``` 259 | 260 | **Common Use Cases for Disabling:** 261 | 262 | - **LiveKit Users**: Disable `openai` to prevent conflicts with LiveKit's OpenAI plugin 263 | - **Custom Instrumentation**: Disable auto-instrumentation when you have custom tracing 264 | - **Performance**: Disable integrations you're not using to reduce overhead 265 | 266 | ## Performance Impact 267 | 268 | All integrations are designed for minimal overhead: 269 | 270 | - Trace data is buffered and sent asynchronously 271 | - Sampling can be configured for high-volume applications 272 | - Serialization happens outside the critical path 273 | 274 | ## Coming Soon 275 | 276 | - **Anthropic** - Claude API tracing 277 | - **Cohere** - Full Cohere platform support 278 | - **HuggingFace** - Transformers and Inference API 279 | - **LlamaIndex** - Document processing and retrieval 280 | - **Custom Integrations** - SDK for building your own integrations 281 | 282 | ## Troubleshooting 283 | 284 | If traces aren't appearing: 285 | 286 | 1. Check that ZeroEval is initialized before importing frameworks: 287 | 288 | ```python 289 | import zeroeval as ze 290 | ze.init() # Must come first 291 | 292 | # Then import frameworks 293 | import openai 294 | import langchain 295 | ``` 296 | 297 | 2. Verify integrations are loaded: 298 | 299 | ```python 300 | from zeroeval.observability.tracer import tracer 301 | print(tracer._integrations.keys()) 302 | # Should show: ['OpenAIIntegration', 'LangChainIntegration', 'LangGraphIntegration'] 303 | ``` 304 | 305 | 3. 
Check for errors in initialization: 306 | ```bash 307 | export ZEROEVAL_LOG_LEVEL=DEBUG 308 | python your_script.py 309 | ``` 310 | 311 | For more help, contact us at [founders@zeroeval.com](mailto:founders@zeroeval.com) 312 | -------------------------------------------------------------------------------- /src/zeroeval/observability/choice.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import random 4 | from typing import Dict, Any, Union 5 | 6 | import requests 7 | 8 | from .span import Span 9 | from .tracer import tracer 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | # Cache for choices made within the same context to ensure consistency 14 | _choice_cache: Dict[str, str] = {} 15 | 16 | 17 | def choose( 18 | name: str, 19 | variants: Dict[str, Any], 20 | weights: Dict[str, float], 21 | duration_days: int, 22 | default_variant: str | None = None 23 | ) -> Any: 24 | """ 25 | Make an A/B test choice using weighted random selection with experiment timeboxing. 26 | 27 | This function automatically attaches the choice to the current span, trace, or session 28 | context. Choices are cached per entity to ensure consistency within the same context. 29 | 30 | The experiment runs for the specified duration_days, after which the backend automatically 31 | stops accepting new choices. This ensures experiments are timebound and conclusions can be 32 | drawn from a fixed observation window. 33 | 34 | Args: 35 | name: Name of the choice/test (e.g., "model_selection", "ui_variant") 36 | variants: Dictionary mapping variant keys to their values 37 | e.g., {"control": "gpt-4", "variant_a": "claude-3"} 38 | weights: Dictionary mapping variant keys to their selection weights 39 | e.g., {"control": 0.5, "variant_a": 0.5} 40 | duration_days: How many days the experiment should run (required for timeboxing) 41 | default_variant: Fallback variant key to use if the test has completed 42 | (defaults to first variant key if not specified) 43 | 44 | Returns: 45 | The value from the selected variant 46 | 47 | Example: 48 | model = ze.choose( 49 | name="model_test", 50 | variants={"control": "gpt-4", "variant_a": "claude-3"}, 51 | weights={"control": 0.5, "variant_a": 0.5}, 52 | duration_days=14, 53 | default_variant="control" 54 | ) 55 | # Returns either "gpt-4" or "claude-3" for 14 days, then defaults to "gpt-4" 56 | 57 | Note: 58 | Use ze.set_signal() to attach boolean success/failure signals to the same entity 59 | to enable signal-based analytics in the ZeroEval dashboard. 
60 | 61 | Raises: 62 | ValueError: If variants and weights don't have matching keys, or duration_days <= 0 63 | RuntimeError: If unable to determine current context 64 | """ 65 | if not variants: 66 | raise ValueError("variants dictionary cannot be empty") 67 | 68 | if not weights: 69 | raise ValueError("weights dictionary cannot be empty") 70 | 71 | if duration_days <= 0: 72 | raise ValueError("duration_days must be greater than 0") 73 | 74 | # Validate that variants and weights have matching keys 75 | variant_keys = set(variants.keys()) 76 | weight_keys = set(weights.keys()) 77 | 78 | if variant_keys != weight_keys: 79 | raise ValueError( 80 | f"Variant keys {variant_keys} must match weight keys {weight_keys}" 81 | ) 82 | 83 | # Validate that weights sum to a reasonable value (allow some floating point tolerance) 84 | weight_sum = sum(weights.values()) 85 | if not (0.95 <= weight_sum <= 1.05): 86 | logger.warning( 87 | f"Weights for choice '{name}' sum to {weight_sum:.3f}, not 1.0. " 88 | "This may cause unexpected probability distributions." 89 | ) 90 | 91 | # Set default variant to first key if not specified 92 | if default_variant is None: 93 | default_variant = list(variants.keys())[0] 94 | elif default_variant not in variants: 95 | raise ValueError(f"default_variant '{default_variant}' not found in variants") 96 | 97 | # Determine current context (similar to signals) 98 | current_span = tracer.get_current_span() 99 | current_trace = tracer.get_current_trace() 100 | current_session = tracer.get_current_session() 101 | 102 | entity_type = None 103 | entity_id = None 104 | 105 | # Prioritize span, then trace, then session (most specific to least specific) 106 | if current_span: 107 | entity_type = "span" 108 | entity_id = current_span.span_id 109 | elif current_trace: 110 | entity_type = "trace" 111 | entity_id = current_trace 112 | elif current_session: 113 | entity_type = "session" 114 | entity_id = current_session 115 | else: 116 | raise RuntimeError( 117 | "ze.choose() must be called within an active span, trace, or session context. " 118 | "Make sure you're calling it within a @ze.span decorated function or ze.span() context manager." 
119 | ) 120 | 121 | # Create cache key for this entity + choice combination 122 | cache_key = f"{entity_type}:{entity_id}:{name}" 123 | 124 | # Check if we've already made this choice for this entity 125 | if cache_key in _choice_cache: 126 | selected_key = _choice_cache[cache_key] 127 | logger.debug( 128 | f"Using cached choice for {name}: {selected_key} -> {variants[selected_key]}" 129 | ) 130 | return variants[selected_key] 131 | 132 | # Make weighted random selection 133 | variant_keys_list = list(variants.keys()) 134 | variant_weights = [weights[key] for key in variant_keys_list] 135 | 136 | selected_key = random.choices(variant_keys_list, weights=variant_weights, k=1)[0] 137 | selected_value = variants[selected_key] 138 | 139 | logger.info( 140 | f"Made choice '{name}' for {entity_type}:{entity_id}: " 141 | f"{selected_key} -> {selected_value}" 142 | ) 143 | 144 | # Send choice data to backend 145 | try: 146 | response_data = _send_choice_data( 147 | entity_type=entity_type, 148 | entity_id=entity_id, 149 | choice_name=name, 150 | variant_key=selected_key, 151 | variant_value=str(selected_value), 152 | variants=variants, 153 | weights=weights, 154 | duration_days=duration_days 155 | ) 156 | 157 | # Check if test has completed or been manually ended 158 | if response_data and response_data.get("test_status") == "completed": 159 | logger.warning( 160 | f"A/B test '{name}' has completed. Using default variant '{default_variant}'. " 161 | f"Message: {response_data.get('message', 'Test ended')}" 162 | ) 163 | # Cache the default variant for consistency within this context 164 | _choice_cache[cache_key] = default_variant 165 | return variants[default_variant] 166 | 167 | # Test is running - cache the random selection 168 | _choice_cache[cache_key] = selected_key 169 | 170 | # Attach AB choice to the current span for linkage 171 | if response_data and response_data.get("ab_choice_id") and current_span: 172 | ab_choice_metadata = { 173 | "ab_choice_id": response_data["ab_choice_id"], 174 | "choice_name": name, 175 | "variant_key": selected_key, 176 | "variant_value": str(selected_value) 177 | } 178 | current_span.ab_choices.append(ab_choice_metadata) 179 | logger.debug(f"Attached AB choice {response_data['ab_choice_id']} to span {current_span.span_id}") 180 | 181 | except Exception as e: 182 | logger.warning(f"Failed to send choice data for {name}: {e}") 183 | # Cache the selection even if API call failed to ensure consistency 184 | _choice_cache[cache_key] = selected_key 185 | # Don't raise - choice selection should still work even if logging fails 186 | 187 | return selected_value 188 | 189 | 190 | def _send_choice_data( 191 | entity_type: str, 192 | entity_id: str, 193 | choice_name: str, 194 | variant_key: str, 195 | variant_value: str, 196 | variants: Dict[str, Any], 197 | weights: Dict[str, float], 198 | duration_days: int 199 | ) -> Dict[str, Any] | None: 200 | """ 201 | Send choice data to the backend immediately. 202 | 203 | Returns: 204 | Response data dict if successful, None otherwise 205 | """ 206 | # Get configuration from environment 207 | api_url = os.getenv("ZEROEVAL_API_URL", "https://api.zeroeval.com") 208 | api_key = os.getenv("ZEROEVAL_API_KEY") 209 | 210 | if not all([api_key, api_url]): 211 | logger.warning( 212 | "Cannot send choice data. Missing ZEROEVAL_API_KEY or ZEROEVAL_API_URL." 
213 | ) 214 | return None 215 | 216 | # Prepare payload with new fields 217 | endpoint = f"{api_url}/ab-choices" 218 | headers = { 219 | "Authorization": f"Bearer {api_key}", 220 | "Content-Type": "application/json", 221 | } 222 | 223 | # Convert variants to string values for serialization 224 | serialized_variants = {k: str(v) for k, v in variants.items()} 225 | 226 | payload = { 227 | "entity_type": entity_type, 228 | "entity_id": entity_id, 229 | "choice_name": choice_name, 230 | "variant_key": variant_key, 231 | "variant_value": variant_value, 232 | "variants": serialized_variants, 233 | "weights": weights, 234 | "duration_days": duration_days 235 | } 236 | 237 | try: 238 | logger.debug(f"Sending choice data for {choice_name} to {endpoint}") 239 | response = requests.post(endpoint, json=payload, headers=headers, timeout=5.0) 240 | response.raise_for_status() 241 | logger.debug(f"Choice data sent successfully: HTTP {response.status_code}") 242 | 243 | # Parse and return response 244 | return response.json() 245 | except requests.exceptions.RequestException as e: 246 | logger.error(f"Failed to send choice data: {e}") 247 | return None 248 | 249 | 250 | def clear_choice_cache() -> None: 251 | """ 252 | Clear the choice cache. 253 | 254 | This is mainly useful for testing or when you want to force new choices 255 | to be made for the same entities. 256 | """ 257 | global _choice_cache 258 | _choice_cache.clear() 259 | logger.debug("Choice cache cleared") -------------------------------------------------------------------------------- /tests/test_gemini_integration.py: -------------------------------------------------------------------------------- 1 | """Tests for the Gemini integration.""" 2 | 3 | import pytest 4 | from unittest.mock import Mock, patch, MagicMock 5 | from zeroeval.observability.integrations.gemini.integration import GeminiIntegration 6 | 7 | 8 | class TestGeminiIntegration: 9 | """Test suite for GeminiIntegration.""" 10 | 11 | def test_is_available_when_package_exists(self): 12 | """Test that is_available returns True when google.genai is installed.""" 13 | with patch('importlib.import_module') as mock_import: 14 | mock_import.return_value = Mock() 15 | assert GeminiIntegration.is_available() is True 16 | mock_import.assert_called_once_with('google.genai') 17 | 18 | def test_is_available_when_package_missing(self): 19 | """Test that is_available returns False when google.genai is not installed.""" 20 | with patch('importlib.import_module') as mock_import: 21 | mock_import.side_effect = ImportError() 22 | assert GeminiIntegration.is_available() is False 23 | 24 | def test_setup_patches_client_init(self): 25 | """Test that setup correctly patches the Client.__init__ method.""" 26 | # Mock the google.genai module 27 | mock_genai = Mock() 28 | mock_client_class = Mock() 29 | mock_genai.Client = mock_client_class 30 | 31 | with patch.dict('sys.modules', {'google': Mock(genai=mock_genai), 'google.genai': mock_genai}): 32 | # Create integration with a mock tracer 33 | mock_tracer = Mock() 34 | integration = GeminiIntegration(mock_tracer) 35 | 36 | # Run setup 37 | integration.setup() 38 | 39 | # Verify that Client.__init__ was patched 40 | assert mock_client_class.__init__ in integration._original_functions.values() 41 | 42 | def test_serialize_contents_handles_strings(self): 43 | """Test that _serialize_contents handles string inputs correctly.""" 44 | integration = GeminiIntegration(Mock()) 45 | 46 | result = integration._serialize_contents("Hello, world!") 47 | assert result 
== "Hello, world!" 48 | 49 | def test_serialize_contents_handles_lists(self): 50 | """Test that _serialize_contents handles list inputs correctly.""" 51 | integration = GeminiIntegration(Mock()) 52 | 53 | # Test with list of strings 54 | result = integration._serialize_contents(["Hello", "World"]) 55 | assert result == ["Hello", "World"] 56 | 57 | # Test with mixed list 58 | mock_obj = Mock() 59 | mock_obj.__dict__ = {"text": "test", "role": "user"} 60 | result = integration._serialize_contents(["Hello", mock_obj]) 61 | assert len(result) == 2 62 | assert result[0] == "Hello" 63 | assert isinstance(result[1], dict) or isinstance(result[1], str) 64 | 65 | def test_extract_config_attributes(self): 66 | """Test that _extract_config_attributes correctly extracts configuration.""" 67 | integration = GeminiIntegration(Mock()) 68 | 69 | # Mock config object 70 | mock_config = Mock() 71 | mock_config.temperature = 0.7 72 | mock_config.max_output_tokens = 100 73 | mock_config.top_p = 0.9 74 | mock_config.response_mime_type = "application/json" 75 | 76 | result = integration._extract_config_attributes(mock_config) 77 | 78 | assert result['temperature'] == 0.7 79 | assert result['max_output_tokens'] == 100 80 | assert result['top_p'] == 0.9 81 | assert result['response_mime_type'] == "application/json" 82 | 83 | def test_extract_config_attributes_with_tools(self): 84 | """Test that _extract_config_attributes correctly extracts tool information.""" 85 | integration = GeminiIntegration(Mock()) 86 | 87 | # Mock function declaration 88 | mock_func_decl = Mock() 89 | mock_func_decl.name = "get_weather" 90 | mock_func_decl.description = "Get weather information" 91 | 92 | # Mock tool with function declarations 93 | mock_tool = Mock() 94 | mock_tool.function_declarations = [mock_func_decl] 95 | 96 | # Mock config with tools 97 | mock_config = Mock() 98 | mock_config.tools = [mock_tool] 99 | 100 | # Also test with a callable 101 | def test_function(): 102 | """Test function docstring""" 103 | pass 104 | 105 | mock_config.tools.append(test_function) 106 | 107 | result = integration._extract_config_attributes(mock_config) 108 | 109 | assert 'tools' in result 110 | assert len(result['tools']) == 2 111 | assert result['tools'][0]['name'] == 'get_weather' 112 | assert result['tools'][0]['description'] == 'Get weather information' 113 | assert result['tools'][1]['name'] == 'test_function' 114 | assert result['tools'][1]['description'] == 'Test function docstring' 115 | 116 | def test_object_to_dict_with_to_dict_method(self): 117 | """Test _object_to_dict when object has to_dict method.""" 118 | integration = GeminiIntegration(Mock()) 119 | 120 | mock_obj = Mock() 121 | mock_obj.to_dict.return_value = {"key": "value"} 122 | 123 | result = integration._object_to_dict(mock_obj) 124 | assert result == {"key": "value"} 125 | mock_obj.to_dict.assert_called_once() 126 | 127 | def test_object_to_dict_with_dict_attribute(self): 128 | """Test _object_to_dict when object has __dict__ attribute.""" 129 | integration = GeminiIntegration(Mock()) 130 | 131 | class TestObj: 132 | def __init__(self): 133 | self.public_attr = "public" 134 | self._private_attr = "private" 135 | self.nested = Mock() 136 | self.nested.__dict__ = {"inner": "value"} 137 | 138 | obj = TestObj() 139 | result = integration._object_to_dict(obj) 140 | 141 | assert "public_attr" in result 142 | assert result["public_attr"] == "public" 143 | assert "_private_attr" not in result # Private attrs excluded 144 | assert "nested" in result 145 | assert 
isinstance(result["nested"], dict) 146 | 147 | @patch('time.time') 148 | def test_wrap_generate_content_success(self, mock_time): 149 | """Test successful wrapping of generate_content method.""" 150 | mock_time.side_effect = [1000, 1001] # Start and end time 151 | 152 | # Setup mocks 153 | mock_tracer = Mock() 154 | mock_span = Mock() 155 | mock_tracer.start_span.return_value = mock_span 156 | 157 | integration = GeminiIntegration(mock_tracer) 158 | 159 | # Mock response 160 | mock_response = Mock() 161 | mock_candidate = Mock() 162 | mock_content = Mock() 163 | mock_part = Mock() 164 | mock_part.text = "The sky is blue." 165 | mock_content.parts = [mock_part] 166 | mock_candidate.content = mock_content 167 | mock_candidate.finish_reason = "STOP" 168 | mock_response.candidates = [mock_candidate] 169 | 170 | mock_usage = Mock() 171 | mock_usage.prompt_token_count = 10 172 | mock_usage.candidates_token_count = 20 173 | mock_usage.total_token_count = 30 174 | mock_response.usage_metadata = mock_usage 175 | 176 | # Mock original function 177 | original_func = Mock(return_value=mock_response) 178 | 179 | # Wrap the function 180 | wrapped_func = integration._wrap_generate_content(original_func) 181 | 182 | # Call wrapped function 183 | result = wrapped_func( 184 | model="gemini-2.0-flash-001", 185 | contents="Why is the sky blue?", 186 | config=None 187 | ) 188 | 189 | # Verify span was created with correct attributes 190 | mock_tracer.start_span.assert_called_once() 191 | call_args = mock_tracer.start_span.call_args 192 | assert call_args[1]["name"] == "gemini.models.generate_content" 193 | assert call_args[1]["kind"] == "llm" 194 | assert call_args[1]["attributes"]["model"] == "gemini-2.0-flash-001" 195 | assert call_args[1]["attributes"]["streaming"] is False 196 | 197 | # Verify span attributes were set 198 | assert mock_span.attributes["inputTokens"] == 10 199 | assert mock_span.attributes["outputTokens"] == 20 200 | assert mock_span.attributes["totalTokens"] == 30 201 | assert mock_span.attributes["finish_reason"] == "STOP" 202 | assert "throughput" in mock_span.attributes 203 | 204 | # Verify span IO was set 205 | mock_span.set_io.assert_called_once() 206 | 207 | # Verify span was ended 208 | mock_tracer.end_span.assert_called_once_with(mock_span) 209 | 210 | # Verify result is returned unchanged 211 | assert result == mock_response 212 | 213 | def test_wrap_generate_content_error_handling(self): 214 | """Test error handling in wrapped generate_content method.""" 215 | # Setup mocks 216 | mock_tracer = Mock() 217 | mock_span = Mock() 218 | mock_tracer.start_span.return_value = mock_span 219 | 220 | integration = GeminiIntegration(mock_tracer) 221 | 222 | # Mock original function that raises an error 223 | original_func = Mock(side_effect=ValueError("Test error")) 224 | 225 | # Wrap the function 226 | wrapped_func = integration._wrap_generate_content(original_func) 227 | 228 | # Call wrapped function and expect error 229 | with pytest.raises(ValueError, match="Test error"): 230 | wrapped_func( 231 | model="gemini-2.0-flash-001", 232 | contents="Test content" 233 | ) 234 | 235 | # Verify error was recorded in span 236 | mock_span.set_error.assert_called_once() 237 | error_call = mock_span.set_error.call_args 238 | assert error_call[1]["code"] == "ValueError" 239 | assert error_call[1]["message"] == "Test error" 240 | 241 | # Verify span was ended even with error 242 | mock_tracer.end_span.assert_called_once_with(mock_span) 
--------------------------------------------------------------------------------