├── models
│   ├── __init__.py
│   ├── text
│   │   ├── __init__.py
│   │   ├── gpt5.py
│   │   └── gpt4o.py
│   ├── voice
│   │   └── __init__.py
│   ├── realtime
│   │   ├── __init__.py
│   │   ├── liveanswer
│   │   │   ├── utils.py
│   │   │   ├── main.py
│   │   │   ├── mrcr_context.py
│   │   │   ├── stt_service.py
│   │   │   ├── audio_to_answer.py
│   │   │   ├── audio.py
│   │   │   └── explain.py
│   │   ├── freeze_omni.py
│   │   └── moshi.py
│   └── shared
│       ├── __init__.py
│       ├── base_adapter.py
│       └── timing_utils.py
├── tests
│   ├── __init__.py
│   ├── README.md
│   └── test_models.py
├── assets
│   └── vera.png
├── test_voice_episodes
│   └── audio
│       ├── vera_aime_0a923d23.wav
│       ├── vera_mrcr_00c44580.wav
│       ├── vera_simpleqa_0a9d56e1.wav
│       ├── vera_browsecomp_9c79d2a8.wav
│       └── vera_gpqadiamond_fa834623.wav
├── data
│   ├── download.txt
│   └── README.md
├── LICENSES
│   ├── GPQA.CC-BY-4.0.txt
│   ├── MRCR.txt
│   ├── AIME-2025.MIT.txt
│   ├── SimpleQA.MIT.txt
│   ├── BrowseComp.MIT.txt
│   ├── Boson-Higgs-Audio-2-Community-License.txt
│   └── Meta-Llama-3-Community-License.txt
├── utils
│   ├── __init__.py
│   └── web_search.py
├── NOTICE.txt
├── ATTRIBUTIONS.md
├── .env.template
├── evaluation
│   ├── grader
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── prompts.py
│   │   ├── wer_calculator.py
│   │   ├── llm_grader.py
│   │   ├── voice_grader.py
│   │   └── run_grader.py
│   └── text
│       ├── batch_evaluate.py
│       └── run_evaluation.py
├── .gitignore
├── LICENSE
├── pyproject.toml
└── README.md
/models/__init__.py:
--------------------------------------------------------------------------------
1 | """VERA Model Adapters"""
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | """VERA Model Tests Package"""
2 |
--------------------------------------------------------------------------------
/models/text/__init__.py:
--------------------------------------------------------------------------------
1 | """Text model adapters for VERA"""
--------------------------------------------------------------------------------
/models/voice/__init__.py:
--------------------------------------------------------------------------------
1 | """Voice model adapters for VERA"""
--------------------------------------------------------------------------------
/models/realtime/__init__.py:
--------------------------------------------------------------------------------
1 | """Realtime model adapters for VERA"""
--------------------------------------------------------------------------------
/models/shared/__init__.py:
--------------------------------------------------------------------------------
1 | """Shared utilities for model adapters"""
--------------------------------------------------------------------------------
/assets/vera.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linyueqian/VERA/HEAD/assets/vera.png
--------------------------------------------------------------------------------
/test_voice_episodes/audio/vera_aime_0a923d23.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linyueqian/VERA/HEAD/test_voice_episodes/audio/vera_aime_0a923d23.wav
--------------------------------------------------------------------------------
/test_voice_episodes/audio/vera_mrcr_00c44580.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linyueqian/VERA/HEAD/test_voice_episodes/audio/vera_mrcr_00c44580.wav
--------------------------------------------------------------------------------
/test_voice_episodes/audio/vera_simpleqa_0a9d56e1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linyueqian/VERA/HEAD/test_voice_episodes/audio/vera_simpleqa_0a9d56e1.wav
--------------------------------------------------------------------------------
/test_voice_episodes/audio/vera_browsecomp_9c79d2a8.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linyueqian/VERA/HEAD/test_voice_episodes/audio/vera_browsecomp_9c79d2a8.wav
--------------------------------------------------------------------------------
/test_voice_episodes/audio/vera_gpqadiamond_fa834623.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linyueqian/VERA/HEAD/test_voice_episodes/audio/vera_gpqadiamond_fa834623.wav
--------------------------------------------------------------------------------
/data/download.txt:
--------------------------------------------------------------------------------
1 | VERA Dataset - Download Instructions
2 |
3 | Download the complete VERA dataset from Google Drive:
4 |
5 | Download URL: https://drive.google.com/file/d/1k0b4qXfQ16fVqe-hMn_GSsfc0exCLgSq/view?usp=sharing
6 |
--------------------------------------------------------------------------------
/LICENSES/GPQA.CC-BY-4.0.txt:
--------------------------------------------------------------------------------
1 | GPQA (c) by Irving David Rein
2 |
3 | GPQA is licensed under a Creative Commons Attribution 4.0 International License.
4 |
5 | You should have received a copy of the license along with this
 6 | work. If not, see <http://creativecommons.org/licenses/by/4.0/>.
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Utility helpers for the VERA benchmark.
3 |
4 | The voice evaluators expect a handful of shared helpers under ``utils``.
5 | Historically these lived out-of-tree, which meant the packaged release was
6 | missing the module entirely. This package re-introduces the helpers so that
7 | legacy imports like ``from utils.web_search import is_browsecomp_episode`` resolve
8 | at runtime.
9 | """
10 |
11 | from .web_search import is_browsecomp_episode
12 |
13 | __all__ = ["is_browsecomp_episode"]
14 |
--------------------------------------------------------------------------------
/NOTICE.txt:
--------------------------------------------------------------------------------
1 | Text JSON: upstream licenses (see ATTRIBUTIONS.md). No text edits.
2 | Audio: generated with Boson Higgs Audio 2; subject to the Boson Higgs Audio 2 Community License.
3 |
4 | "Built with Higgs Materials licensed from Boson AI USA, Inc., Copyright © Boson AI USA, Inc., All Rights Reserved and Meta Llama 3 licensed under the Meta Llama 3 Community License, Copyright © Meta Platforms, Inc., All Rights Reserved."
5 |
6 | Restriction: do not use the audio outputs to improve any other large language model.
7 |
8 | License texts: see ./LICENSES/
--------------------------------------------------------------------------------
/models/realtime/liveanswer/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import Optional
3 | from pathlib import Path
4 |
5 | # Load environment variables from .env if available
6 | from dotenv import load_dotenv # type: ignore
7 |
8 | # Load from the project root .env file
 9 | project_root = Path(__file__).resolve().parents[3]  # repo root (liveanswer -> realtime -> models -> root)
10 | env_path = project_root / ".env"
11 | load_dotenv(env_path)
12 |
13 |
14 | def _env(name: str, default: Optional[str] = None) -> Optional[str]:
15 | value = os.environ.get(name, default)
16 | return value
17 |
18 |
19 |
--------------------------------------------------------------------------------
/ATTRIBUTIONS.md:
--------------------------------------------------------------------------------
1 | SimpleQA — Source: https://github.com/openai/simple-evals — License: MIT — Change: audio added via Higgs; no text edits.
2 | BrowseComp — Source: https://github.com/openai/simple-evals — License: MIT — Change: audio added via Higgs; no text edits.
3 | MRCR — Source: https://huggingface.co/datasets/openai/mrcr — License: MIT — Change: audio added via Higgs; no text edits.
4 | GPQA‑Diamond — Source: https://huggingface.co/datasets/Idavidrein/gpqa — License: CC BY 4.0 — Note: HF gate forbids public example release.
5 | Audio synthesis — Model: Boson Higgs Audio 2 — See NOTICE.txt and ./LICENSES/.
--------------------------------------------------------------------------------
/.env.template:
--------------------------------------------------------------------------------
1 | # OpenAI API Keys
2 | OPENAI_API_KEY=your_openai_api_key_here
3 | OPENAI_BASE_URL=https://api.openai.com/v1
4 |
5 | # Google/Gemini API Keys
6 | GOOGLE_API_KEY=your_google_api_key_here
7 |
8 | # Anthropic API Keys
9 | ANTHROPIC_API_KEY=your_anthropic_api_key_here
10 |
11 | # Azure Speech Services (for ASR/TTS)
12 | AZURE_SPEECH_KEY=your_azure_speech_key_here
13 | AZURE_SPEECH_REGION=your_azure_region_here
14 |
15 | # Azure AI Inference (for Phi-4)
16 | PHI4_CHAT_COMPLETIONS_URL=https://your-phi4-endpoint.services.ai.azure.com/models/chat/completions?api-version=2024-05-01-preview
17 | PHI4_API_KEY=your_phi4_api_key_here
18 | PHI4_MODEL=Phi-4-multimodal-instruct
19 |
20 | # Groq API (optional)
21 | GROQ_API_KEY=your_groq_api_key_here
22 |
23 | # HuggingFace (for local models)
24 | HF_TOKEN=your_hf_token_here
--------------------------------------------------------------------------------
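The adapters read these values through `python-dotenv` at runtime. A minimal sketch of how a script might consume the template, assuming `.env.template` has been copied to `.env`; the specific keys checked below are examples, not an exhaustive list:

```python
# Minimal sketch: loading keys from a .env created from this template.
import os

from dotenv import load_dotenv

load_dotenv()  # looks for a .env file in the current working directory

openai_key = os.environ.get("OPENAI_API_KEY")
azure_region = os.environ.get("AZURE_SPEECH_REGION")

if not openai_key:
    raise RuntimeError("OPENAI_API_KEY is not set; copy .env.template to .env and fill it in")
```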
/evaluation/grader/__init__.py:
--------------------------------------------------------------------------------
1 | """Grader package: accuracy-first grading utilities.
2 |
3 | This module provides:
4 | - Prompt templates per benchmark
5 | - Grader base classes and result types
6 | - Heuristic and LLM-backed graders
7 | - Voice evaluation with ASR and WER calculation
8 | - A small CLI for grading single triplets or batch outputs
9 | """
10 |
11 | from .base import GradeResult, GradeLabel
12 | from .prompts import get_accuracy_prompt
13 | from .llm_grader import LLMAccuracyGrader
14 | from .voice_grader import VoiceAccuracyGrader
15 | from .asr_processor import ASRProcessor
16 | from .wer_calculator import WERCalculator
17 |
18 | __all__ = [
19 | "GradeResult",
20 | "GradeLabel",
21 | "get_accuracy_prompt",
22 | "LLMAccuracyGrader",
23 | "VoiceAccuracyGrader",
24 | "ASRProcessor",
25 | "WERCalculator",
26 | ]
27 |
--------------------------------------------------------------------------------
/utils/web_search.py:
--------------------------------------------------------------------------------
1 | """
2 | Stubs for legacy web search utilities referenced by voice evaluators.
3 |
4 | The project no longer exposes live web search capabilities, but older releases
5 | still import ``utils.web_search.is_browsecomp_episode`` to decide whether the
6 | BrowseComp tooling should run. We keep a minimal shim so those imports resolve
7 | without pulling in unavailable dependencies.
8 | """
9 |
10 | from __future__ import annotations
11 |
12 | from typing import Any, Dict, Optional
13 |
14 |
15 | def is_browsecomp_episode(episode_data: Optional[Dict[str, Any]]) -> bool:
16 | """Return True if this episode should enable web search tooling.
17 |
18 | Strategy: enable only for browsecomp benchmark by id/track hints.
19 | """
20 | episode_id = (episode_data or {}).get("id", "").lower()
21 | track = (episode_data or {}).get("track", "").lower()
22 | return "browsecomp" in episode_id or track == "browsecomp"
23 |
--------------------------------------------------------------------------------
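Illustrative use of the shim above; the episode dictionaries are made up but mirror the `id`/`track` fields described in `data/README.md`:

```python
from utils.web_search import is_browsecomp_episode

print(is_browsecomp_episode({"id": "vera_browsecomp_9c79d2a8"}))   # True (id hint)
print(is_browsecomp_episode({"id": "vera_aime_0a923d23",
                             "track": "mathematical_reasoning"}))  # False
print(is_browsecomp_episode(None))                                 # False (no episode data)
```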
/.gitignore:
--------------------------------------------------------------------------------
1 | # macOS
2 | .DS_Store
3 | .DS_Store?
4 | ._*
5 | .Spotlight-V100
6 | .Trashes
7 | ehthumbs.db
8 | Thumbs.db
9 |
10 | # Python
11 | __pycache__/
12 | *.py[cod]
13 | *$py.class
14 | *.so
15 | .Python
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | wheels/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | MANIFEST
32 |
33 | # Virtual environments
34 | .env
35 | .venv
36 | env/
37 | venv/
38 | ENV/
39 | env.bak/
40 | venv.bak/
41 |
42 | # IDE
43 | .vscode/
44 | .idea/
45 | *.swp
46 | *.swo
47 | *~
48 |
49 | # Jupyter Notebook
50 | .ipynb_checkpoints
51 |
52 | # pytest
53 | .pytest_cache/
54 |
55 | # Coverage reports
56 | htmlcov/
57 | .coverage
58 | .coverage.*
59 | coverage.xml
60 | *.cover
61 | .hypothesis/
62 |
63 | # mypy
64 | .mypy_cache/
65 | .dmypy.json
66 | dmypy.json
67 |
68 | # Results and logs
69 | results/
70 | logs/
71 | *.log
72 |
73 | # Large dataset files (should be downloaded separately)
74 | data/final_dataset/
75 |
76 | .claude
--------------------------------------------------------------------------------
/LICENSES/MRCR.txt:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
--------------------------------------------------------------------------------
/LICENSES/AIME-2025.MIT.txt:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 Adobe
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/LICENSES/SimpleQA.MIT.txt:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 OpenAI
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/LICENSES/BrowseComp.MIT.txt:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 OpenAI
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/evaluation/grader/base.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from dataclasses import dataclass
4 | from enum import Enum
5 | from typing import Optional, Dict, Any
6 |
7 |
8 | class GradeLabel(str, Enum):
9 | CORRECT = "CORRECT"
10 | INCORRECT = "INCORRECT"
11 | NOT_ATTEMPTED = "NOT_ATTEMPTED"
12 |
13 |
14 | @dataclass
15 | class GradeResult:
16 | label: GradeLabel
17 | extracted_final_answer: Optional[str] = None
18 | reasoning: Optional[str] = None
19 | correct_flag: Optional[bool] = None
20 | confidence: Optional[float] = None # 0-100
21 | raw_model_output: Optional[str] = None
22 | metadata: Optional[Dict[str, Any]] = None
23 |
24 |
25 | class BaseAccuracyGrader:
26 | """Interface for accuracy graders.
27 |
28 | Implementations should focus on judging whether a predicted response
29 | answers the question correctly with respect to a gold target.
30 | """
31 |
32 | def grade(
33 | self,
34 | question: str,
35 | gold_target: str,
36 | predicted_answer: str,
37 | benchmark: Optional[str] = None,
38 | mode: str = "triad", # "triad" -> A/B/C, "binary" -> yes/no
39 | ) -> GradeResult:
40 | raise NotImplementedError
41 |
42 |
--------------------------------------------------------------------------------
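A minimal sketch of a concrete grader implementing `BaseAccuracyGrader`. The exact-match rule here is illustrative only; the repository's real graders (`llm_grader.py`, `voice_grader.py`) are LLM- and ASR-backed.

```python
from evaluation.grader.base import BaseAccuracyGrader, GradeLabel, GradeResult


class ExactMatchGrader(BaseAccuracyGrader):
    """Toy grader: CORRECT when the gold target appears verbatim in the prediction."""

    def grade(self, question, gold_target, predicted_answer,
              benchmark=None, mode="triad") -> GradeResult:
        if not predicted_answer.strip():
            return GradeResult(label=GradeLabel.NOT_ATTEMPTED)
        hit = gold_target.strip().lower() in predicted_answer.lower()
        return GradeResult(
            label=GradeLabel.CORRECT if hit else GradeLabel.INCORRECT,
            extracted_final_answer=predicted_answer.strip(),
            correct_flag=hit,
        )


grader = ExactMatchGrader()
print(grader.grade("Capital of France?", "Paris", "It is Paris.").label)  # GradeLabel.CORRECT
```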
/models/realtime/liveanswer/main.py:
--------------------------------------------------------------------------------
1 | import threading
2 | from pathlib import Path
3 | from typing import List, Optional
4 |
5 | from .explain import ExplainSynthesizer
6 | from .audio import AudioGenerator
7 | from .solver_standard import StandardProblemSolver as ProblemSolver
8 |
9 |
10 | def main_request(request: str, audio_file_path: Optional[str] = None) -> tuple[bytes, float, str, str]:
11 | """
12 | Orchestrates the pipeline:
13 | - Create ExplainSynthesizer
14 | - Run AudioGenerator and ProblemSolver concurrently
15 | - Return resulting MP3 bytes, time to first response, GPT-5 response, and Groq explanation
16 | """
17 |
18 | explainer = ExplainSynthesizer(request=request)
19 | audio_gen = AudioGenerator(explainer=explainer)
20 | solver = ProblemSolver(explainer=explainer, audio_file_path=audio_file_path)
21 |
22 | audio_bytes_holder: List[bytes] = []
23 | time_to_first_response_holder: List[float] = []
24 |
25 | def run_audio():
26 | audio_bytes, time_to_first_response = audio_gen.start()
27 | audio_bytes_holder.append(audio_bytes)
28 | time_to_first_response_holder.append(time_to_first_response)
29 |
30 | t_audio = threading.Thread(target=run_audio, name="audio_gen")
31 | t_solver = threading.Thread(target=lambda: solver.start(request), name="problem_solver")
32 |
33 | t_audio.start()
34 | t_solver.start()
35 |
36 | t_audio.join()
37 | t_solver.join()
38 |
39 | audio_bytes = audio_bytes_holder[0] if audio_bytes_holder else b""
40 | time_to_first_response = time_to_first_response_holder[0] if time_to_first_response_holder else 0.0
41 |
42 | # Get both the GPT-5 response and the Groq explanation
43 | gpt5_response = getattr(explainer, 'gpt5_response', 'GPT-5 response not captured')
44 | groq_explanation = getattr(explainer, 'spoken_explanation', 'Groq explanation not captured')
45 |
46 | if audio_bytes:
47 | output_path = Path(__file__).resolve().parents[1] / "liveanswer-output.mp3"
48 | output_path.write_bytes(audio_bytes)
49 |
50 | return audio_bytes, time_to_first_response, gpt5_response, groq_explanation
51 |
52 |
--------------------------------------------------------------------------------
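A hypothetical invocation of `main_request`, based only on the signature above; the question text and output filename are illustrative.

```python
from models.realtime.liveanswer.main import main_request

audio_bytes, ttfr, gpt5_response, groq_explanation = main_request(
    "What is the derivative of x squared?",  # spoken request as text
    audio_file_path=None,                    # or a path to a .wav prompt
)
print(f"time to first response: {ttfr:.2f}s")
with open("liveanswer_demo.mp3", "wb") as f:
    f.write(audio_bytes)
```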
/tests/README.md:
--------------------------------------------------------------------------------
1 | # VERA Model Tests
2 |
3 | This directory contains tests for VERA model adapters.
4 |
5 | ## Running Tests
6 |
7 | ### Quick Smoke Tests (No Dependencies)
8 |
9 | Run the basic smoke tests without installing pytest:
10 |
11 | ```bash
12 | python tests/test_models.py
13 | ```
14 |
15 | This will test:
16 | - Base adapter classes
17 | - Text model imports (GPT-4o, Gemini, etc.)
18 | - Voice model imports (Qwen2-Audio, Ultravox, etc.)
19 | - Realtime model imports (GPT Realtime, Gemini Realtime, Moshi)
20 | - Configuration utilities
21 | - MRCR context parsing
22 |
23 | ### Full Test Suite (with pytest)
24 |
25 | If you have pytest installed, run comprehensive tests:
26 |
27 | ```bash
28 | # Install pytest if needed
29 | pip install pytest
30 |
31 | # Run all tests
32 | pytest tests/test_models.py
33 |
34 | # Run with verbose output
35 | pytest tests/test_models.py -v
36 |
37 | # Run specific test class
38 | pytest tests/test_models.py::TestGPT4oAdapter -v
39 | ```
40 |
41 | ## Test Coverage
42 |
43 | ### Base Classes (`TestBaseAdapter`)
44 | - ✓ ModelConfig creation
45 | - ✓ BaseAdapter initialization
46 |
47 | ### Text Models
48 | - ✓ GPT-4o adapter (`TestGPT4oAdapter`)
49 | - ✓ Gemini 2.5 Pro adapter (`TestGemini25ProAdapter`)
50 | - ✓ GPT-5 adapter (`TestGPT5Adapter`)
51 |
52 | ### Voice Models
53 | - ✓ Qwen2-Audio adapter (`TestQwen2AudioAdapter`)
54 | - ✓ Ultravox adapter (`TestUltravoxAdapter`)
55 |
56 | ### Realtime Models
57 | - ✓ GPT Realtime adapter (`TestGPTRealtimeAdapter`)
58 | - ✓ Gemini Realtime adapter (`TestGeminiRealtimeAdapter`)
59 | - ✓ Moshi adapter (`TestMoshiAdapter`)
60 |
61 | ### Utilities
62 | - ✓ Timing utilities (`TestTimingUtils`)
63 |
64 | ## Test Output
65 |
66 | ### Success
67 | ```
68 | ✓ All required tests passed!
69 | ```
70 |
71 | ### Skipped Tests
72 | Some tests may be skipped if optional dependencies aren't installed:
73 | ```
74 | ⊘ Voice models skipped: No module named 'librosa'
75 | ```
76 |
77 | This is expected and won't affect the core functionality tests.
78 |
79 | ## Adding New Tests
80 |
81 | To add tests for a new model:
82 |
83 | 1. Import the model adapter
84 | 2. Create a test class (e.g., `TestMyNewAdapter`)
85 | 3. Add test methods starting with `test_`
86 | 4. Update the smoke tests in `run_smoke_tests()` if needed
87 |
88 | Example:
89 |
90 | ```python
91 | class TestMyNewAdapter:
92 | """Test my new adapter"""
93 |
94 | def test_adapter_initialization(self):
95 | """Test adapter can be initialized"""
96 | from models.mytype.mynew import MyNewAdapter
97 |
98 | adapter = MyNewAdapter(api_key="test-key")
99 | assert adapter.model_name == "my-new-model"
100 | ```
101 |
102 | ## Notes
103 |
104 | - Tests use mocking to avoid requiring API keys or making real API calls
105 | - Voice model tests may require additional dependencies (librosa, torch, vllm)
106 | - Realtime model tests check module imports and basic functionality
107 | - The test suite is designed to run quickly and not require model downloads
108 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.0", "wheel"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "vera-benchmark"
7 | version = "1.0.0"
 8 | description = "VERA: Voice Evaluation of Reasoning Ability - a benchmark for diagnosing the modality-induced performance gap in voice-interactive systems"
9 | readme = "README.md"
10 | requires-python = ">=3.9"
11 | license = {text = "MIT"}
12 | authors = [
13 | {name = "VERA Team", email = "vera@example.com"}
14 | ]
15 | keywords = ["voice", "reasoning", "benchmark", "ai", "evaluation"]
16 | classifiers = [
17 | "Development Status :: 4 - Beta",
18 | "Intended Audience :: Developers",
19 | "Intended Audience :: Science/Research",
20 | "License :: OSI Approved :: MIT License",
21 | "Operating System :: OS Independent",
22 | "Programming Language :: Python :: 3",
23 | "Programming Language :: Python :: 3.9",
24 | "Programming Language :: Python :: 3.10",
25 | "Programming Language :: Python :: 3.11",
26 | "Topic :: Scientific/Engineering :: Artificial Intelligence",
27 | "Topic :: Multimedia :: Sound/Audio :: Speech",
28 | ]
29 |
30 | dependencies = [
31 | "pydantic>=2.0.0",
32 | "pyyaml>=6.0",
33 | "click>=8.0.0",
34 | "numpy>=1.26.0",
35 | "scipy>=1.7.0",
36 | "openai>=1.0.0",
37 | "anthropic>=0.7.0",
38 | "datasets>=2.0.0",
39 | "librosa>=0.9.0",
40 | "soundfile>=0.12.0",
41 | "webrtcvad>=2.0.10",
42 | "websockets>=12.0",
43 | "azure-cognitiveservices-speech>=1.30.0",
44 | "requests>=2.25.0",
45 | "httpx>=0.24.0",
46 | "tqdm>=4.60.0",
47 | "jsonlines>=3.0.0",
48 | "tiktoken>=0.4.0",
49 | "python-dotenv>=0.19.0",
50 | "rich>=12.0.0",
51 | "matplotlib>=3.5.0",
52 | "seaborn>=0.11.0",
53 | "pandas>=1.3.0",
54 | "pytest>=8.3.5",
55 | "pytest-cov>=5.0.0",
56 | "pydub>=0.25.1",
57 | "nemo-toolkit[asr]>=1.23.0",
58 | "google-genai>=1.36.0",
59 | "websocket-client>=1.8.0",
60 | "vllm>=0.1.2",
61 | "groq>=0.31.1",
62 | "sphn>=0.2.0",
63 | "python-socketio[client]>=5.13.0",
64 | "dotenv>=0.9.9",
65 | ]
66 |
67 | [project.optional-dependencies]
68 | dev = [
69 | "pytest>=7.0.0",
70 | "pytest-cov>=4.0.0",
71 | "black>=22.0.0",
72 | "isort>=5.10.0",
73 | "flake8>=4.0.0",
74 | "mypy>=0.991",
75 | "pre-commit>=2.17.0",
76 | ]
77 |
78 | [project.urls]
79 | Homepage = "https://github.com/linyueqian/VERA"
80 | Documentation = "https://github.com/linyueqian/VERA"
81 | Repository = "https://github.com/linyueqian/VERA.git"
82 | "Bug Tracker" = "https://github.com/linyueqian/VERA/issues"
83 |
84 | ## No console scripts are exposed by this repository
85 |
86 | [tool.setuptools.packages.find]
87 | where = ["."]
88 | exclude = ["tests*", "*.tests*", "models*"]
89 |
90 | ## No package data declared; this repo is primarily a benchmark codebase
91 |
92 | [tool.black]
93 | line-length = 88
94 | target-version = ['py39']
95 | include = '\.pyi?$'
96 | extend-exclude = '''
97 | /(
98 | \.eggs
99 | | \.git
100 | | \.hg
101 | | \.mypy_cache
102 | | \.tox
103 | | \.venv
104 | | _build
105 | | buck-out
106 | | build
107 | | dist
108 | )/
109 | '''
110 |
111 | [tool.isort]
112 | profile = "black"
113 | multi_line_output = 3
114 | include_trailing_comma = true
115 | force_grid_wrap = 0
116 | use_parentheses = true
117 | ensure_newline_before_comments = true
118 | line_length = 88
119 |
120 | [tool.mypy]
121 | python_version = "3.9"
122 | warn_return_any = true
123 | warn_unused_configs = true
124 | disallow_untyped_defs = true
125 | ignore_missing_imports = true
126 |
127 | [tool.pytest.ini_options]
128 | testpaths = ["tests"]
129 | python_files = ["test_*.py"]
130 | python_classes = ["Test*"]
131 | python_functions = ["test_*"]
132 | addopts = "--cov=models --cov=evaluation --cov-report=term-missing"
133 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # VERA: Voice Evaluation of Reasoning Ability
2 |
 3 | [License: MIT](https://opensource.org/licenses/MIT) [arXiv:2509.26542](https://arxiv.org/abs/2509.26542)
4 |
5 | **Voice Evaluation of Reasoning Ability: Diagnosing the Modality-Induced Performance Gap**
6 |
 7 | ![VERA overview](assets/vera.png)
8 |
9 | We present Voice Evaluation of Reasoning Ability (VERA), a benchmark for evaluating reasoning ability in voice-interactive systems under real-time conversational constraints. VERA comprises 2,931 voice-native episodes derived from established text benchmarks and organized into five tracks (Math, Web, Science, Long-Context, Factual). Each item is adapted for speech interaction while preserving reasoning difficulty.
10 |
11 | ## Installation
12 |
13 | ```bash
14 | # Clone the repository
15 | git clone https://github.com/linyueqian/VERA.git
16 | cd VERA
17 |
18 | # Install uv if you haven't already
19 | curl -LsSf https://astral.sh/uv/install.sh | sh
20 |
21 | # Install dependencies (handles virtual environment automatically)
22 | uv sync
23 | ```
24 |
25 | ## Dataset
26 |
27 | The VERA dataset contains 2,931 voice-native episodes across five tracks. Questions and answers are encrypted with an XOR cipher to prevent memorization. See [data/README.md](data/README.md) for complete details on structure, encryption, and decryption.
28 |
29 | ### Download
30 |
31 | Download the complete dataset from Google Drive: https://drive.google.com/file/d/1k0b4qXfQ16fVqe-hMn_GSsfc0exCLgSq/view?usp=sharing
32 |
33 | ### Sample Data
34 |
35 | Check `test_voice_episodes/` for unencrypted examples:
36 |
37 | ```bash
38 | # View sample episode structure
39 | cat test_voice_episodes/test.json
40 |
41 | # Listen to sample audio
42 | ls test_voice_episodes/audio/
43 | ```
44 |
45 | ## Quick Start
46 |
47 | ### 1. Set up API keys
48 |
49 | ```bash
50 | cp .env.template .env
51 | # Edit .env with your API keys
52 | ```
53 |
54 | ### 2. Run evaluation
55 |
56 | ```bash
57 | # Evaluate voice models
58 | uv run python evaluation/voice/batch_evaluate.py
59 |
60 | # Evaluate text models (for comparison)
61 | uv run python evaluation/text/batch_evaluate.py
62 |
63 | # Evaluate realtime models
64 | uv run python evaluation/realtime/batch_evaluate.py
65 | ```
66 |
67 | ### 3. View results
68 |
69 | Results will be saved in the specified output directory with performance metrics and analysis.
70 |
71 | ## Acknowledgements
72 |
73 | We thank the [Full-Duplex-Bench](https://github.com/DanielLin94144/Full-Duplex-Bench) project for their implementations of several realtime models, including Freeze-Omni, Moshi, and Sonic, which we adapted for use in VERA.
74 |
75 | ## Citation
76 |
77 | If you use VERA in your research, please cite our paper:
78 |
79 | ```bibtex
80 | @misc{lin2025vera,
81 | title={Voice Evaluation of Reasoning Ability: Diagnosing the Modality-Induced Performance Gap},
82 | author={Lin, Yueqian and Hu, Zhengmian and Wang, Qinsi and Liu, Yudong and Zhang, Hengfan and Subramanian, Jayakumar and Vlassis, Nikos and Li, Hai Helen and Chen, Yiran},
83 | year={2025},
84 | eprint={2509.26542},
85 | archivePrefix={arXiv},
86 | primaryClass={eess.AS},
87 | url={https://arxiv.org/abs/2509.26542}
88 | }
89 | ```
90 |
91 | ## License
92 |
93 | This project uses a dual licensing structure:
94 |
95 | - **Code**: MIT License (see [LICENSE](LICENSE))
96 | - **Data**: The text data follows upstream licenses (MIT for SimpleQA, BrowseComp, MRCR; CC BY 4.0 for GPQA-Diamond). The audio data is generated with Boson Higgs Audio 2 and is subject to the Boson Higgs Audio 2 Community License.
97 |
98 | For complete licensing details, attribution information, and restrictions, please see:
99 | - [ATTRIBUTIONS.md](ATTRIBUTIONS.md) for data source attributions
100 | - [NOTICE.txt](NOTICE.txt) for audio generation licensing and restrictions
101 | - [LICENSES/](LICENSES/) for full license texts
102 |
--------------------------------------------------------------------------------
/evaluation/text/batch_evaluate.py:
--------------------------------------------------------------------------------
1 | """
2 | Batch evaluation script to run multiple models on multiple datasets
3 | Convenient wrapper around run_evaluation.py
4 | """
5 |
6 | import os
7 | import sys
8 | import asyncio
9 | import argparse
10 | from pathlib import Path
11 | from datetime import datetime
12 | from dotenv import load_dotenv
13 | from evaluation.text.run_evaluation import TextModelEvaluator
14 |
15 | # Load environment variables from .env file
16 | load_dotenv()
17 |
18 | def get_available_datasets():
19 | """Get list of available dataset files"""
20 | dataset_dir = Path(__file__).parent.parent.parent / 'data' / 'final_dataset' / 'text'
21 | if not dataset_dir.exists():
22 | return []
23 | return [f.stem.replace('_voice_episodes', '') for f in dataset_dir.glob('*_voice_episodes.json')]
24 |
25 | def get_dataset_path(dataset_name):
26 | """Get full path to dataset file"""
27 | dataset_dir = Path(__file__).parent.parent.parent / 'data' / 'final_dataset' / 'text'
28 | return dataset_dir / f'{dataset_name}_voice_episodes.json'
29 |
30 | async def main():
31 | parser = argparse.ArgumentParser(description='Batch evaluate text models on VERA datasets')
32 | parser.add_argument('--models', nargs='+',
33 | choices=['gpt4o', 'gpt5-instant', 'gpt5-thinking', 'gemini-2.5-pro', 'gemini-2.5-flash'],
34 | default=['gpt4o', 'gpt5-instant', 'gpt5-thinking', 'gemini-2.5-pro', 'gemini-2.5-flash'],
35 | help='Models to evaluate')
36 | parser.add_argument('--datasets', nargs='+',
37 | choices=get_available_datasets(),
38 | default=get_available_datasets(),
39 | help='Datasets to evaluate on')
40 | parser.add_argument('--max-episodes', type=int,
41 | help='Maximum episodes per dataset')
42 | parser.add_argument('--max-concurrent', type=int, default=16,
43 | help='Maximum concurrent requests')
44 | parser.add_argument('--sequential', action='store_true',
45 | help='Run evaluations sequentially instead of in parallel')
46 | parser.add_argument('--resume-from', type=str,
47 | help='Resume from existing output directory (e.g., test_output/gemini-2.5-pro_browsecomp_20250917_215054)')
48 |
49 | args = parser.parse_args()
50 |
51 | print("Available datasets:", get_available_datasets())
52 | print(f"Selected models: {args.models}")
53 | print(f"Selected datasets: {args.datasets}")
54 |
55 | evaluator = TextModelEvaluator()
56 |
57 | # Create combinations of model and dataset
58 | tasks = []
59 | for model in args.models:
60 | for dataset in args.datasets:
61 | dataset_path = get_dataset_path(dataset)
62 | if not dataset_path.exists():
63 | print(f"Warning: Dataset not found: {dataset_path}")
64 | continue
65 |
66 | print(f"Queuing: {model} on {dataset}")
67 | task = evaluator.run_evaluation(
68 | model, str(dataset_path),
69 | args.max_episodes, args.max_concurrent,
70 | resume_from=args.resume_from
71 | )
72 | tasks.append((model, dataset, task))
73 |
74 | if not tasks:
75 | print("No valid model/dataset combinations found")
76 | return 1
77 |
78 | print(f"\nStarting {len(tasks)} evaluation tasks...")
79 | start_time = datetime.now()
80 |
81 | if args.sequential:
82 | # Run sequentially
83 | for model, dataset, task in tasks:
84 | print(f"\n--- Running {model} on {dataset} ---")
85 | try:
86 | await task
87 | print(f"✓ Completed {model} on {dataset}")
88 | except Exception as e:
89 | print(f"✗ Failed {model} on {dataset}: {e}")
90 | else:
91 | # Run in parallel
92 | results = await asyncio.gather(*[task for _, _, task in tasks], return_exceptions=True)
93 |
94 | # Print results
95 | for i, (model, dataset, _) in enumerate(tasks):
96 | if isinstance(results[i], Exception):
97 | print(f"✗ Failed {model} on {dataset}: {results[i]}")
98 | else:
99 | print(f"✓ Completed {model} on {dataset}")
100 |
101 | end_time = datetime.now()
102 | total_duration = (end_time - start_time).total_seconds()
103 |
104 | print(f"\nBatch evaluation completed in {total_duration:.2f} seconds")
105 | print(f"Results saved to test_output/ directory")
106 |
107 | if __name__ == "__main__":
108 | exit(asyncio.run(main()))
109 |
--------------------------------------------------------------------------------
/models/shared/base_adapter.py:
--------------------------------------------------------------------------------
1 | """
2 | Base adapter interface for VERA model implementations
3 | """
4 |
5 | from abc import ABC, abstractmethod
6 | from typing import Dict, Any, List, Optional
7 | from dataclasses import dataclass
8 | from pathlib import Path
9 | import asyncio
10 |
11 |
12 | @dataclass
13 | class ModelConfig:
14 | """Base configuration for model adapters"""
15 | model_name: str
16 | temperature: float = 0.0
17 | max_tokens: int = 4096
18 | timeout: float = 300.0
19 | max_concurrent: int = 16
20 |
21 |
22 | class BaseAdapter(ABC):
23 | """Base class for all VERA model adapters"""
24 |
25 | def __init__(self, config: ModelConfig):
26 | self.config = config
27 | self.model_name = config.model_name
28 |
29 | @abstractmethod
30 | def process_episode(self, episode: Dict[str, Any], output_dir: str) -> Dict[str, Any]:
31 | """
32 | Process a single episode.
33 |
34 | Args:
35 | episode: Episode data containing turns and metadata
36 | output_dir: Directory to save outputs
37 |
38 | Returns:
39 | Standardized episode result
40 | """
41 | pass
42 |
43 | async def process_episodes_batch(
44 | self,
45 | episodes: List[Dict[str, Any]],
46 | output_dir: str,
47 | max_concurrent: Optional[int] = None
48 | ) -> Dict[str, Any]:
49 | """
50 | Process multiple episodes concurrently.
51 |
52 | Args:
53 | episodes: List of episodes to process
54 | output_dir: Directory to save outputs
55 | max_concurrent: Maximum concurrent episodes (uses config default if None)
56 |
57 | Returns:
58 | Standardized batch result
59 | """
60 | from .timing_utils import create_standardized_batch_result
61 | import time
62 |
63 | max_concurrent = max_concurrent or self.config.max_concurrent
64 | print(f"[{self.model_name}] Batch processing {len(episodes)} episodes (max {max_concurrent} concurrent)")
65 |
66 | output_path = Path(output_dir)
67 | output_path.mkdir(parents=True, exist_ok=True)
68 |
69 | start_time = time.time()
70 | semaphore = asyncio.Semaphore(max_concurrent)
71 |
72 | async def process_one(episode):
73 | async with semaphore:
74 | return await asyncio.to_thread(self.process_episode, episode, output_dir)
75 |
76 | tasks = [process_one(ep) for ep in episodes]
77 | results = await asyncio.gather(*tasks, return_exceptions=True)
78 |
79 | processed_results = []
80 | for i, result in enumerate(results):
81 | if isinstance(result, Exception):
82 | processed_results.append({
83 | "episode_id": episodes[i].get("id", f"episode_{i}"),
84 | "turns": [],
85 | "total_time": 0,
86 | "success": False,
87 | "error": str(result),
88 | "metadata": {}
89 | })
90 | else:
91 | processed_results.append(result)
92 |
93 | total_time = time.time() - start_time
94 | return create_standardized_batch_result(
95 | episodes=processed_results,
96 | total_time=total_time,
97 | model_name=self.model_name
98 | )
99 |
100 |
101 | class TextAdapter(BaseAdapter):
102 | """Base class for text-based model adapters"""
103 |
104 | def __init__(self, config: ModelConfig, api_key: str):
105 | super().__init__(config)
106 | self.api_key = api_key
107 |
108 | @abstractmethod
109 | def _make_api_request(self, messages: List[Dict[str, str]], **kwargs) -> str:
110 | """Make API request to text model"""
111 | pass
112 |
113 |
114 | class VoiceAdapter(BaseAdapter):
115 | """Base class for voice model adapters"""
116 |
117 | def __init__(self, config: ModelConfig):
118 | super().__init__(config)
119 |
120 | @abstractmethod
121 | def _process_audio_input(self, audio_path: str, text_prompt: str) -> str:
122 | """Process audio input with text prompt"""
123 | pass
124 |
125 |
126 | class RealtimeAdapter(BaseAdapter):
127 | """Base class for realtime model adapters"""
128 |
129 | def __init__(self, config: ModelConfig):
130 | super().__init__(config)
131 |
132 | @abstractmethod
133 | def _establish_connection(self) -> Any:
134 | """Establish connection to realtime model"""
135 | pass
136 |
137 | @abstractmethod
138 | def _send_audio_chunk(self, connection: Any, audio_data: bytes) -> None:
139 | """Send audio chunk to model"""
140 | pass
141 |
142 | @abstractmethod
143 | def _receive_response(self, connection: Any) -> str:
144 | """Receive response from model"""
145 | pass
--------------------------------------------------------------------------------
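A minimal sketch of a concrete adapter built on the interfaces above. `EchoAdapter` and its canned response are illustrative, not part of the repository; real adapters live under `models/text`, `models/voice`, and `models/realtime`.

```python
import asyncio
import time

from models.shared.base_adapter import ModelConfig, TextAdapter


class EchoAdapter(TextAdapter):
    """Toy adapter that echoes the prompt instead of calling a real API."""

    def _make_api_request(self, messages, **kwargs) -> str:
        return f"echo: {messages[-1]['content']}"

    def process_episode(self, episode, output_dir):
        start = time.time()
        prompt = episode["turns"][0].get("text_content", "")
        response = self._make_api_request([{"role": "user", "content": prompt}])
        return {
            "episode_id": episode.get("id", "unknown"),
            "turns": [{"turn_index": 0, "prompt": prompt, "response": response,
                       "success": True, "error": None, "metadata": {}}],
            "total_time": time.time() - start,
            "success": True, "error": None, "metadata": {},
            "num_turns": 1, "successful_turns": 1,
        }


config = ModelConfig(model_name="echo-model", max_concurrent=4)
adapter = EchoAdapter(config, api_key="not-a-real-key")
batch = asyncio.run(adapter.process_episodes_batch(
    [{"id": "ep_0", "turns": [{"text_content": "hello"}]}],
    output_dir="/tmp/vera_echo_out",
))
print(batch["summary"])  # aggregated by create_standardized_batch_result
```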
/models/shared/timing_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Shared timing utilities for model adapters
3 | """
4 |
5 | import time
6 | import json
7 | from typing import Dict, Any, List, Optional
8 | from pathlib import Path
9 |
10 |
11 | def make_timed_api_request(request_func, *args, **kwargs) -> Dict[str, Any]:
12 | """
13 | Execute an API request with timing information.
14 |
15 | Args:
16 | request_func: The function to call
17 | *args, **kwargs: Arguments to pass to the function
18 |
19 | Returns:
20 | Dict containing timing info and result
21 | """
22 | start_time = time.time()
23 | try:
24 | result = request_func(*args, **kwargs)
25 | end_time = time.time()
26 | return {
27 | "result": result,
28 | "timing": {
29 | "start_time": start_time,
30 | "end_time": end_time,
31 | "duration": end_time - start_time
32 | },
33 | "success": True
34 | }
35 | except Exception as e:
36 | end_time = time.time()
37 | return {
38 | "result": None,
39 | "timing": {
40 | "start_time": start_time,
41 | "end_time": end_time,
42 | "duration": end_time - start_time
43 | },
44 | "success": False,
45 | "error": str(e)
46 | }
47 |
48 |
49 | def create_turn_result(
50 | turn_index: int,
51 | prompt: str,
52 | response: str,
53 | timing: Dict[str, float],
54 | success: bool = True,
55 | error: Optional[str] = None,
56 | metadata: Optional[Dict[str, Any]] = None
57 | ) -> Dict[str, Any]:
58 | """
59 | Create a standardized turn result.
60 |
61 | Args:
62 | turn_index: Index of the turn
63 | prompt: The input prompt
64 | response: The model response
65 | timing: Timing information
66 | success: Whether the turn was successful
67 | error: Error message if any
68 | metadata: Additional metadata
69 |
70 | Returns:
71 | Standardized turn result dict
72 | """
73 | return {
74 | "turn_index": turn_index,
75 | "prompt": prompt,
76 | "response": response,
77 | "timing": timing,
78 | "success": success,
79 | "error": error,
80 | "metadata": metadata or {}
81 | }
82 |
83 |
84 | def create_standardized_episode_result(
85 | episode_id: str,
86 | turns: List[Dict[str, Any]],
87 | total_time: float,
88 | success: bool = True,
89 | error: Optional[str] = None,
90 | metadata: Optional[Dict[str, Any]] = None
91 | ) -> Dict[str, Any]:
92 | """
93 | Create a standardized episode result.
94 |
95 | Args:
96 | episode_id: Unique episode identifier
97 | turns: List of turn results
98 | total_time: Total processing time
99 | success: Whether the episode was successful
100 | error: Error message if any
101 | metadata: Additional metadata
102 |
103 | Returns:
104 | Standardized episode result dict
105 | """
106 | return {
107 | "episode_id": episode_id,
108 | "turns": turns,
109 | "total_time": total_time,
110 | "success": success,
111 | "error": error,
112 | "metadata": metadata or {},
113 | "num_turns": len(turns),
114 | "successful_turns": sum(1 for turn in turns if turn.get("success", True))
115 | }
116 |
117 |
118 | def create_standardized_batch_result(
119 | episodes: List[Dict[str, Any]],
120 | total_time: float,
121 | model_name: str,
122 | metadata: Optional[Dict[str, Any]] = None
123 | ) -> Dict[str, Any]:
124 | """
125 | Create a standardized batch result.
126 |
127 | Args:
128 | episodes: List of episode results
129 | total_time: Total batch processing time
130 | model_name: Name of the model used
131 | metadata: Additional metadata
132 |
133 | Returns:
134 | Standardized batch result dict
135 | """
136 | successful_episodes = sum(1 for ep in episodes if ep.get("success", True))
137 | total_turns = sum(ep.get("num_turns", 0) for ep in episodes)
138 | successful_turns = sum(ep.get("successful_turns", 0) for ep in episodes)
139 |
140 | return {
141 | "model_name": model_name,
142 | "episodes": episodes,
143 | "summary": {
144 | "total_episodes": len(episodes),
145 | "successful_episodes": successful_episodes,
146 | "total_turns": total_turns,
147 | "successful_turns": successful_turns,
148 | "success_rate": successful_episodes / len(episodes) if episodes else 0,
149 | "turn_success_rate": successful_turns / total_turns if total_turns > 0 else 0
150 | },
151 | "total_time": total_time,
152 | "average_time_per_episode": total_time / len(episodes) if episodes else 0,
153 | "metadata": metadata or {},
154 | "timestamp": time.time()
155 | }
156 |
157 |
158 | def save_results(results: Dict[str, Any], output_path: Path, indent: int = 2) -> None:
159 | """
160 | Save results to JSON file.
161 |
162 | Args:
163 | results: Results dictionary to save
164 | output_path: Path to save the results
165 | indent: JSON indentation level
166 | """
167 | output_path.parent.mkdir(parents=True, exist_ok=True)
168 | with open(output_path, 'w', encoding='utf-8') as f:
169 | json.dump(results, f, indent=indent, ensure_ascii=False)
--------------------------------------------------------------------------------
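Example composition of the helpers above, with a stand-in `fake_completion` in place of a real API call; the function name and output path are illustrative.

```python
from pathlib import Path

from models.shared.timing_utils import (
    create_standardized_episode_result,
    create_turn_result,
    make_timed_api_request,
    save_results,
)


def fake_completion(prompt: str) -> str:
    return f"answer to: {prompt}"


timed = make_timed_api_request(fake_completion, "What is 2 + 2?")
turn = create_turn_result(
    turn_index=0,
    prompt="What is 2 + 2?",
    response=timed["result"] or "",
    timing=timed["timing"],
    success=timed["success"],
    error=timed.get("error"),
)
episode = create_standardized_episode_result(
    episode_id="demo_episode",
    turns=[turn],
    total_time=timed["timing"]["duration"],
)
save_results(episode, Path("/tmp/demo_episode.json"))
```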
/evaluation/grader/prompts.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import Optional
4 |
5 |
6 | # Triad accuracy template (A/B/C)
7 | GRADER_TEMPLATE_TRIAD = (
8 | """
9 | Your job is to look at a question, a ground truth answer, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
10 | First, examples for calibration are provided. Then, only return A/B/C for the new case.
11 |
12 | The following are examples of CORRECT predicted answers.
13 | ```
14 | Question: What are the names of Barack Obama's children?
15 | Ground truth: Malia Obama and Sasha Obama
16 | Predicted answer 1: sasha and malia obama
17 | Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
18 | Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
19 | ```
20 | These predicted answers are all CORRECT because:
21 | - They fully contain the important information in the ground truth.
22 | - They do not contain any information that contradicts the ground truth.
23 | - Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
24 | - Hedging and guessing are permissible, provided the response fully includes the ground truth and contains no contradictions.
25 |
26 | The following are examples of INCORRECT predicted answers.
27 | ```
28 | Question: What are the names of Barack Obama's children?
29 | Ground truth: Malia and Sasha
30 | Predicted answer 1: Malia.
31 | Predicted answer 2: Malia, Sasha, and Susan.
32 | Predicted answer 3: Barack Obama does not have any children.
33 | Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
34 | Predicted answer 5: While I don't know their exact names, I can tell you that Barack Obama has three children.
35 | Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm with updated references.
36 | ```
37 | These predicted answers are all INCORRECT because:
38 | - A factual statement contradicts the ground truth. Hedged incorrect statements are still incorrect.
39 |
40 | The following are examples of NOT_ATTEMPTED predicted answers.
41 | ```
42 | Question: What are the names of Barack Obama's children?
43 | Ground truth: Malia and Sasha
44 | Predicted answer 1: I don't know.
45 | Predicted answer 2: I need more context about which Obama you are talking about.
46 | Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
47 | Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
48 | ```
49 | These are NOT_ATTEMPTED because:
50 | - The important information in the ground truth is not included, and there is no direct contradiction.
51 |
52 | Additional guidance:
53 | - For numerical ground truths, accept answers correct to the last significant figure (e.g., 120k ~ 115k–124k).
54 | - The ground truth may contain more information than the question; the predicted answer only needs to cover what's asked.
55 | - Obvious inferences are allowed (e.g., omit "California" when "San Francisco, California" is the target for "What city...").
56 | - Reasonable typos in names are acceptable if clearly the same entity.
57 |
58 | Here is a new example. Reply with A/B/C only, no extra text.
59 | ```
60 | Ground truth: {ground_truth}
61 | Predicted answer: {predicted_answer}
62 | ```
63 |
64 | Grade the predicted answer as one of:
65 | A: CORRECT
66 | B: INCORRECT
67 | C: NOT_ATTEMPTED
68 |
69 | Just return the single letter: "A", "B", or "C".
70 | """
71 | .strip()
72 | )
73 |
74 |
75 | def _benchmark_header(benchmark: Optional[str]) -> str:
76 | if not benchmark:
77 | return ""
78 | # Short, domain-aware header if desired
79 | bm = benchmark.lower()
80 | if "aime" in bm:
81 | return (
82 | "Mathematical QA: judge strictly on the final numeric/value answer. "
83 | "Allow only minimal rounding consistent with the ground truth; reject paraphrases."
84 | )
85 | if "browsecomp" in bm:
86 | return (
87 | "Web QA: prioritize factual exactness over citations; judge only whether the predicted answer matches the ground truth."
88 | )
89 | if "gpqa" in bm:
90 | return (
91 | "Graduate-level science QA: semantic equivalence is acceptable if factually identical; contradictions are incorrect."
92 | )
93 | if "mrcr" in bm:
94 | return (
95 | "Long-context needle retrieval: mark CORRECT only if the predicted answer contains the exact ground truth string "
96 | "as a contiguous span (case-insensitive). Paraphrases, substitutions, or partial matches are INCORRECT. "
97 | "Ignore surrounding commentary; focus solely on inclusion of the exact phrase."
98 | )
99 | if "simpleqa" in bm:
100 | return (
101 | "Simple factual recall: require the predicted answer to match the ground truth entity/value. "
102 | "Minor spelling variations are acceptable only if clearly the same name."
103 | )
104 | return ""
105 |
106 |
107 | def get_accuracy_prompt(
108 | question: Optional[str],
109 | ground_truth: str,
110 | predicted_answer: str,
111 | benchmark: Optional[str] = None,
112 | ) -> str:
113 | """Return a benchmark-aware accuracy grading prompt (triad A/B/C)."""
114 | header = _benchmark_header(benchmark)
115 |
116 | # Triad prompt (A/B/C)
117 | core = GRADER_TEMPLATE_TRIAD.format(
118 | ground_truth=ground_truth,
119 | predicted_answer=predicted_answer,
120 | )
121 | return f"{header}\n\n{core}".strip() if header else core
122 |
--------------------------------------------------------------------------------
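A quick check of the prompt builder above; the question/answer strings are illustrative. The returned string is what gets sent to an LLM judge, which is expected to reply with a single letter (A/B/C).

```python
from evaluation.grader.prompts import get_accuracy_prompt

prompt = get_accuracy_prompt(
    question="What are the names of Barack Obama's children?",
    ground_truth="Malia and Sasha",
    predicted_answer="Malia Obama and Sasha Obama",
    benchmark="simpleqa",
)
print(prompt.splitlines()[0])  # the benchmark-specific header comes first
```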
/data/README.md:
--------------------------------------------------------------------------------
1 | # VERA Dataset
2 |
3 | This directory contains the VERA (Voice Evaluation of Reasoning Ability) dataset.
4 |
5 | ## Download
6 |
7 | The complete dataset is available on Google Drive:
8 |
9 | **Download URL:** https://drive.google.com/file/d/1k0b4qXfQ16fVqe-hMn_GSsfc0exCLgSq/view?usp=sharing
10 |
11 | ## Dataset Overview
12 |
13 | The VERA dataset contains **2,931 voice-native episodes** organized into five tracks:
14 |
15 | | Track | Episodes | Source | Description |
16 | |-------|----------|--------|-------------|
17 | | **Math** | 115 | AIME 2025 | Competition mathematics problems |
18 | | **Web** | 1,107 | BrowseComp | Web browsing and research tasks |
19 | | **Science** | 161 | GPQA Diamond | Graduate-level science questions |
20 | | **Long-Context** | 548 | MRCR | Multi-turn reading comprehension |
21 | | **Factual** | 1,000 | SimpleQA | Factual recall questions |
22 |
23 | ## Dataset Structure
24 |
25 | Each episode contains:
26 |
27 | - **`id`**: Unique identifier (e.g., `vera_aime_58789fd1`)
28 | - **`track`**: Category (`mathematical_reasoning`, `web`, `science`, `long_context`, `factual`)
29 | - **`turns`**: Array of conversation turns with:
30 | - `role`: Speaker role (`user`)
31 | - `text_content`: Encrypted question text (base64-encoded)
32 | - `audio_file`: Path to corresponding audio file
33 | - `prefix_text`: Optional prefix (usually null)
34 | - `postfix_text`: Optional postfix (usually null)
35 | - **`context_documents`**: Additional context materials (if any)
36 | - **`interruptions`**: Interruption events (if any)
37 | - **`metadata`**: Contains encrypted `expected_answer`
38 | - **`canary`**: Unique decryption key for this episode
39 |
40 | ### Example Episode Structure
41 |
42 | ```json
43 | {
44 | "id": "vera_aime_58789fd1",
45 | "track": "mathematical_reasoning",
46 | "turns": [
47 | {
48 | "role": "user",
49 | "text_content": "ayDyHIziBKCtUXnstgrT...",
50 | "audio_file": "aime_voice_episodes_audio/vera_aime_58789fd1.wav",
51 | "prefix_text": null,
52 | "postfix_text": null
53 | }
54 | ],
55 | "context_documents": [],
56 | "interruptions": [],
57 | "metadata": {
58 | "expected_answer": "EnS9"
59 | },
60 | "canary": "04a8d78a8fe43328c0a9936731ed47fd"
61 | }
62 | ```
63 |
64 | ## Encryption
65 |
 66 | To prevent LLM memorization and ensure evaluation integrity, all questions (`text_content`) and answers (`expected_answer`) are encrypted with an XOR cipher using SHA256-derived keys, following the methodology used in OpenAI's BrowseComp benchmark.
67 |
68 | ### Decryption
69 |
70 | To decrypt the questions and answers, use the following Python code:
71 |
72 | ```python
73 | import base64
74 | import hashlib
75 |
76 | def derive_key(password: str, length: int) -> bytes:
77 | """Derive a fixed-length key from the password using SHA256."""
78 | hasher = hashlib.sha256()
79 | hasher.update(password.encode())
80 | key = hasher.digest()
81 | return key * (length // len(key)) + key[: length % len(key)]
82 |
83 | def decrypt(ciphertext_b64: str, password: str) -> str:
84 | """Decrypt base64-encoded ciphertext with XOR."""
85 | encrypted = base64.b64decode(ciphertext_b64)
86 | key = derive_key(password, len(encrypted))
87 | decrypted = bytes(a ^ b for a, b in zip(encrypted, key))
88 | return decrypted.decode()
89 |
90 | # Example usage:
91 | import json
92 |
93 | with open('voice_episodes.json', 'r') as f:
94 | data = json.load(f)
95 |
96 | # Decrypt the first episode
97 | episode = data['episodes'][0]
98 | canary = episode['canary']
99 |
100 | # Decrypt question
101 | question = decrypt(episode['turns'][0]['text_content'], canary)
102 | print(f"Question: {question}")
103 |
104 | # Decrypt answer
105 | answer = decrypt(episode['metadata']['expected_answer'], canary)
106 | print(f"Expected Answer: {answer}")
107 | ```
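
Because XOR with the same key stream is its own inverse, encryption is the same operation run in the other direction. The helper below is a minimal illustrative sketch (not part of the released tooling) that reuses `derive_key` and `decrypt` from the snippet above:

```python
def encrypt(plaintext: str, password: str) -> str:
    """Inverse of decrypt(): XOR with the SHA256-derived key, then base64-encode."""
    data = plaintext.encode()
    key = derive_key(password, len(data))
    return base64.b64encode(bytes(a ^ b for a, b in zip(data, key))).decode()

# Round-trip sanity check (illustrative values only):
canary = "example-canary-value"  # any string works as the password
assert decrypt(encrypt("hello world", canary), canary) == "hello world"
```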
108 |
109 | ## Audio Files
110 |
111 | Audio files are organized in the following directories:
112 | - `aime_voice_episodes_audio/` - Math problems (115 files)
113 | - `browsecomp_voice_episodes_audio/` - Web tasks (1,107 files)
114 | - `gpqa_diamond_voice_episodes_audio/` - Science questions (161 files)
115 | - `mrcr_voice_episodes_audio/` - Long-context tasks (548 files)
116 | - `simpleqa_voice_episodes_audio/` - Factual questions (1,000 files)
117 |
118 | Each `audio_file` field in the dataset references the relative path to the corresponding audio file.
119 |
120 | All audio is synthesized using **Boson Higgs Audio 2** for consistent, high-quality speech generation.
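
As a rough loading sketch (assuming the audio directories above sit next to the episode JSON in the extracted download; the `dataset_root` path and the `soundfile` dependency are illustrative choices, not requirements):

```python
from pathlib import Path
import soundfile as sf  # any WAV reader will do

dataset_root = Path("path/to/extracted/vera_dataset")
episode = data['episodes'][0]  # `data` loaded as in the decryption example above

wav_path = dataset_root / episode['turns'][0]['audio_file']
audio, sample_rate = sf.read(wav_path)
print(f"{wav_path.name}: {len(audio) / sample_rate:.1f}s at {sample_rate} Hz")
```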
121 |
122 | ## Sample Data
123 |
124 | A small sample of the dataset (with unencrypted text for easier inspection) is available in the `test_voice_episodes/` directory at the repository root:
125 |
126 | ```bash
127 | # View sample episodes
128 | cat test_voice_episodes/test.json
129 |
130 | # Listen to sample audio
131 | ls test_voice_episodes/audio/
132 | ```
133 |
134 | ## License and Attribution
135 |
136 | The dataset follows upstream licenses:
137 |
138 | - **SimpleQA, BrowseComp, MRCR, AIME 2025**: MIT License
139 | - **GPQA Diamond**: CC BY 4.0
140 | - **Audio**: Boson Higgs Audio 2 Community License (with usage restrictions)
141 |
142 | **Important restriction**: Do not use the audio outputs to improve any other large language model.
143 |
144 | See [ATTRIBUTIONS.md](../ATTRIBUTIONS.md) and [NOTICE.txt](../NOTICE.txt) in the repository root for complete attribution and licensing details.
145 |
146 | ## Citation
147 |
148 | If you use this dataset, please cite:
149 |
150 | ```bibtex
151 | @misc{lin2025vera,
152 | title={Voice Evaluation of Reasoning Ability: Diagnosing the Modality-Induced Performance Gap},
153 | author={Lin, Yueqian and Hu, Zhengmian and Wang, Qinsi and Liu, Yudong and Zhang, Hengfan and Subramanian, Jayakumar and Vlassis, Nikos and Li, Hai Helen and Chen, Yiran},
154 | year={2025},
155 | eprint={2509.26542},
156 | archivePrefix={arXiv},
157 | primaryClass={eess.AS},
158 | url={https://arxiv.org/abs/2509.26542}
159 | }
160 | ```
161 |
--------------------------------------------------------------------------------
/evaluation/grader/wer_calculator.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import List, Tuple
4 | import re
5 |
6 |
7 | class WERCalculator:
8 | """Calculate Word Error Rate (WER) between reference and hypothesis text."""
9 |
10 | @staticmethod
11 | def normalize_text(text: str) -> str:
12 | """Normalize text for WER calculation."""
13 | text = text.lower()
14 | text = re.sub(r'[^\w\s]', '', text)
15 | text = re.sub(r'\s+', ' ', text)
16 | return text.strip()
17 |
18 | @staticmethod
19 | def tokenize(text: str) -> List[str]:
20 | """Tokenize text into words."""
21 | normalized = WERCalculator.normalize_text(text)
22 | return normalized.split() if normalized else []
23 |
24 | @staticmethod
25 | def edit_distance(ref_words: List[str], hyp_words: List[str]) -> Tuple[int, List[List[int]]]:
26 | """
27 | Calculate edit distance using dynamic programming.
28 | Returns (distance, dp_matrix) for traceback.
29 | """
30 | m, n = len(ref_words), len(hyp_words)
31 |
32 | dp = [[0] * (n + 1) for _ in range(m + 1)]
33 |
34 | for i in range(m + 1):
35 | dp[i][0] = i
36 | for j in range(n + 1):
37 | dp[0][j] = j
38 |
39 | for i in range(1, m + 1):
40 | for j in range(1, n + 1):
41 | if ref_words[i-1] == hyp_words[j-1]:
42 | dp[i][j] = dp[i-1][j-1]
43 | else:
44 | dp[i][j] = 1 + min(
45 | dp[i-1][j],
46 | dp[i][j-1],
47 | dp[i-1][j-1]
48 | )
49 |
50 | return dp[m][n], dp
51 |
52 | @staticmethod
53 | def get_alignment(ref_words: List[str], hyp_words: List[str], dp_matrix: List[List[int]]) -> List[Tuple[str, str, str]]:
54 | """
55 | Get alignment between reference and hypothesis using traceback.
56 | Returns list of (ref_word, hyp_word, operation).
57 | """
58 | m, n = len(ref_words), len(hyp_words)
59 | alignment = []
60 |
61 | i, j = m, n
62 | while i > 0 or j > 0:
63 | if i > 0 and j > 0:
64 | if ref_words[i-1] == hyp_words[j-1]:
65 | alignment.append((ref_words[i-1], hyp_words[j-1], "MATCH"))
66 | i -= 1
67 | j -= 1
68 | elif dp_matrix[i][j] == dp_matrix[i-1][j-1] + 1:
69 | alignment.append((ref_words[i-1], hyp_words[j-1], "SUB"))
70 | i -= 1
71 | j -= 1
72 | elif dp_matrix[i][j] == dp_matrix[i-1][j] + 1:
73 | alignment.append((ref_words[i-1], "*", "DEL"))
74 | i -= 1
75 | else:
76 | alignment.append(("*", hyp_words[j-1], "INS"))
77 | j -= 1
78 | elif i > 0:
79 | alignment.append((ref_words[i-1], "*", "DEL"))
80 | i -= 1
81 | else:
82 | alignment.append(("*", hyp_words[j-1], "INS"))
83 | j -= 1
84 |
85 | return list(reversed(alignment))
86 |
87 | @classmethod
88 | def calculate_wer(cls, reference: str, hypothesis: str, return_details: bool = False) -> dict:
89 | """
90 | Calculate Word Error Rate between reference and hypothesis.
91 |
92 | Args:
93 | reference: Ground truth text
94 | hypothesis: Predicted text (e.g., from ASR)
95 | return_details: If True, return detailed alignment information
96 |
97 | Returns:
98 | Dictionary with WER metrics and optionally alignment details
99 | """
100 | ref_words = cls.tokenize(reference)
101 | hyp_words = cls.tokenize(hypothesis)
102 |
103 | if len(ref_words) == 0:
104 | if len(hyp_words) == 0:
105 | result = {
106 | "wer": 0.0,
107 | "substitutions": 0,
108 | "deletions": 0,
109 | "insertions": 0,
110 | "total_words": 0,
111 | "reference_length": 0,
112 | "hypothesis_length": 0
113 | }
114 | else:
115 | result = {
116 | "wer": float('inf'),
117 | "substitutions": 0,
118 | "deletions": 0,
119 | "insertions": len(hyp_words),
120 | "total_words": len(hyp_words),
121 | "reference_length": 0,
122 | "hypothesis_length": len(hyp_words)
123 | }
124 | else:
125 | edit_dist, dp_matrix = cls.edit_distance(ref_words, hyp_words)
126 |
127 | if return_details:
128 | alignment = cls.get_alignment(ref_words, hyp_words, dp_matrix)
129 | substitutions = sum(1 for _, _, op in alignment if op == "SUB")
130 | deletions = sum(1 for _, _, op in alignment if op == "DEL")
131 | insertions = sum(1 for _, _, op in alignment if op == "INS")
132 | else:
133 | alignment = None
134 | substitutions = 0
135 | deletions = 0
136 | insertions = 0
137 |
138 | wer = edit_dist / len(ref_words)
139 |
140 | result = {
141 | "wer": wer,
142 | "substitutions": substitutions,
143 | "deletions": deletions,
144 | "insertions": insertions,
145 |                 "total_words": edit_dist,  # NOTE: holds the edit distance (S + D + I), not the reference word count
146 | "reference_length": len(ref_words),
147 | "hypothesis_length": len(hyp_words)
148 | }
149 |
150 | if return_details:
151 | result["alignment"] = alignment
152 | result["reference_words"] = ref_words
153 | result["hypothesis_words"] = hyp_words
154 |
155 | return result
156 |
157 | @classmethod
158 | def batch_calculate_wer(cls, pairs: List[Tuple[str, str]], return_details: bool = False) -> List[dict]:
159 | """Calculate WER for multiple reference-hypothesis pairs."""
160 | return [cls.calculate_wer(ref, hyp, return_details) for ref, hyp in pairs]
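161 | 
162 | 
163 | if __name__ == "__main__":
164 |     # Minimal usage sketch (illustrative strings, not benchmark data).
165 |     # WER = (substitutions + deletions + insertions) / reference word count.
166 |     demo = WERCalculator.calculate_wer(
167 |         reference="the cat sat on the mat",
168 |         hypothesis="the cat sat on mat",
169 |         return_details=True,
170 |     )
171 |     print(f"WER: {demo['wer']:.3f}")  # one deletion over six reference words ~= 0.167
172 |     for ref_word, hyp_word, op in demo["alignment"]:
173 |         print(f"{op:>5}: {ref_word} -> {hyp_word}")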
--------------------------------------------------------------------------------
/models/realtime/liveanswer/mrcr_context.py:
--------------------------------------------------------------------------------
1 | """
2 | MRCR context handling for LiveAnswer.
3 | Based on azure_gpt_realtime approach.
4 | """
5 |
6 | import json
7 | from typing import List, Dict, Any, Optional
8 | from pathlib import Path
9 |
10 |
11 | def parse_mrcr_context(context: str) -> List[Dict[str, str]]:
12 | """Parse MRCR context document into conversation messages"""
13 | messages = []
14 |
15 | # Split by User: and Assistant: markers
16 | lines = context.split('\n')
17 | current_role = None
18 | current_content = []
19 |
20 | for line in lines:
21 | if line.startswith('User:'):
22 | if current_role and current_content:
23 | messages.append({"role": current_role, "content": '\n'.join(current_content).strip()})
24 | current_role = "user"
25 | current_content = [line[5:].strip()] # Remove 'User:' prefix
26 | elif line.startswith('Assistant:'):
27 | if current_role and current_content:
28 | messages.append({"role": current_role, "content": '\n'.join(current_content).strip()})
29 | current_role = "assistant"
30 | current_content = [line[10:].strip()] # Remove 'Assistant:' prefix
31 | else:
32 | if current_content is not None:
33 | current_content.append(line)
34 |
35 | # Add the last message
36 | if current_role and current_content:
37 | messages.append({"role": current_role, "content": '\n'.join(current_content).strip()})
38 |
39 | return messages
40 |
41 |
42 | def load_context_documents_from_audio_file(audio_file_path: str) -> List[Dict[str, Any]]:
43 | """
44 | Load context documents from episode JSON based on audio file path.
45 | Follows the same pattern as azure_gpt_realtime.
46 | """
47 | audio_path = Path(audio_file_path)
48 |
49 | # Try to find corresponding episode JSON
50 | episode_json_candidates = [
51 | # Same directory, replace .wav with _episode.json
52 | audio_path.parent / f"{audio_path.stem}_episode.json",
53 | # test_voice_episodes directory structure
54 | audio_path.parent.parent / "episodes" / f"{audio_path.stem}_episode.json",
55 | # Current directory test_voice_episodes
56 | Path.cwd() / "test_voice_episodes" / "episodes" / f"{audio_path.stem}_episode.json",
57 | ]
58 |
59 | # Add test_voice_episodes direct files based on audio file type
60 | audio_stem = audio_path.stem.lower()
61 | if "mrcr" in audio_stem:
62 | episode_json_candidates.append(Path.cwd() / "test_voice_episodes" / "test_mrcr_episode.json")
63 | elif "browsecomp" in audio_stem:
64 | episode_json_candidates.append(Path.cwd() / "test_voice_episodes" / "test_browsecomp_episode.json")
65 | elif "aime" in audio_stem:
66 | episode_json_candidates.append(Path.cwd() / "test_voice_episodes" / "test_aime_episode.json")
67 |
68 | episode_json = None
69 | print(f"!!!MRCR: Looking for episode JSON for audio file: {audio_file_path}")
70 | for candidate in episode_json_candidates:
71 | print(f"!!!MRCR: Checking candidate: {candidate}")
72 | if candidate.exists():
73 | episode_json = candidate
74 | print(f"!!!MRCR: Found episode JSON: {episode_json}")
75 | break
76 |
77 | if not episode_json:
78 | print(f"!!!MRCR: No episode JSON found for audio file: {audio_file_path}")
79 | print(f"!!!MRCR: Tried candidates: {episode_json_candidates}")
80 | return []
81 |
82 | try:
83 | episode_data = json.loads(episode_json.read_text())
84 | if episode_data.get("episodes"):
85 | first_episode = episode_data["episodes"][0]
86 | context_documents = first_episode.get("context_documents", [])
87 | print(f"!!!MRCR: Found {len(context_documents)} context documents from {episode_json}")
88 | if context_documents:
89 | print(f"!!!MRCR: First context document has {len(context_documents[0].get('content', ''))} characters")
90 | return context_documents
91 | except Exception as e:
92 | print(f"Error loading context documents from {episode_json}: {e}")
93 |
94 | return []
95 |
96 |
97 | def inject_mrcr_context_into_messages(
98 | messages: List[Dict[str, str]],
99 | context_documents: List[Dict[str, Any]],
100 | episode_id: Optional[str] = None
101 | ) -> List[Dict[str, str]]:
102 | """
103 | Inject MRCR context documents into message history.
104 | Based on azure_gpt_realtime approach.
105 | """
106 | if not context_documents:
107 | return messages
108 |
109 | print(f"Injecting {len(context_documents)} context documents...")
110 |
111 | # Determine if this is MRCR
112 | is_mrcr = False
113 | if episode_id:
114 | is_mrcr = "mrcr" in episode_id.lower()
115 |
116 | # Insert context documents before the conversation
117 | context_messages = []
118 |
119 | for i, doc in enumerate(context_documents):
120 | content = doc.get("content", "")
121 | if content and is_mrcr:
122 | # For MRCR, inject the full conversation as a system message
123 | print(f"Injecting MRCR conversation context from document {i+1}")
124 | parsed_messages = parse_mrcr_context(content)
125 |
126 | # Convert conversation to system context
127 | context_text = "Previous conversation:\n\n"
128 | for msg in parsed_messages:
129 | role = msg["role"].title()
130 | context_text += f"{role}: {msg['content']}\n\n"
131 |
132 | context_messages.append({
133 | "role": "system",
134 | "content": f"You have access to the following conversation history:\n\n{context_text.strip()}"
135 | })
136 | elif content:
137 | # For non-MRCR, add as single assistant message
138 | context_messages.append({
139 | "role": "assistant",
140 | "content": f"Previous context: {content}"
141 | })
142 |
143 | print(f"Context injection complete.")
144 |
145 | # Return context messages + original messages
146 | return context_messages + messages
147 |
148 |
149 | def is_mrcr_episode(audio_file_path: str) -> bool:
150 | """Check if this is an MRCR episode based on file path."""
151 | path_str = str(audio_file_path).lower()
152 | return "mrcr" in path_str
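153 | 
154 | 
155 | if __name__ == "__main__":
156 |     # Illustrative check of the User:/Assistant: splitting (not benchmark data).
157 |     sample = "User: What is the capital of France?\nAssistant: Paris.\nUser: Thanks!"
158 |     for msg in parse_mrcr_context(sample):
159 |         print(f"{msg['role']}: {msg['content']}")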
--------------------------------------------------------------------------------
/models/realtime/liveanswer/stt_service.py:
--------------------------------------------------------------------------------
1 | import os
2 | import azure.cognitiveservices.speech as speechsdk
3 | from typing import Optional, Tuple
4 | from pathlib import Path
5 |
6 |
7 | class AzureSTTService:
8 | """Azure Speech-to-Text service for audio file transcription."""
9 |
10 | def __init__(self,
11 | speech_key: Optional[str] = None,
12 | speech_region: Optional[str] = None):
13 | """
14 | Initialize Azure STT service.
15 |
16 | Args:
17 | speech_key: Azure Speech API key (defaults to env var)
18 | speech_region: Azure region (defaults to env var)
19 | """
20 | self.speech_key = speech_key or os.environ.get("AZURE_SPEECH_API_KEY")
21 | self.speech_region = speech_region or os.environ.get("AZURE_SPEECH_REGION")
22 |
23 | if not self.speech_key or not self.speech_region:
24 | raise ValueError(
25 | "Azure Speech credentials not found. "
26 | "Set AZURE_SPEECH_API_KEY and AZURE_SPEECH_REGION environment variables."
27 | )
28 |
29 | self.speech_config = speechsdk.SpeechConfig(
30 | subscription=self.speech_key,
31 | region=self.speech_region
32 | )
33 |
34 | # Set recognition language (can be made configurable)
35 | self.speech_config.speech_recognition_language = "en-US"
36 |
37 | # Enable detailed recognition results
38 | self.speech_config.request_word_level_timestamps()
39 |
40 | def transcribe_file(self, audio_file_path: str) -> Tuple[str, dict]:
41 | """
42 | Transcribe audio file to text.
43 |
44 | Args:
45 | audio_file_path: Path to audio file (WAV, MP3, etc.)
46 |
47 | Returns:
48 | Tuple of (transcript, metadata dict with timing info)
49 | """
50 | audio_path = Path(audio_file_path)
51 | if not audio_path.exists():
52 | raise FileNotFoundError(f"Audio file not found: {audio_file_path}")
53 |
54 | # Create audio config from file
55 | audio_config = speechsdk.audio.AudioConfig(filename=str(audio_path))
56 |
57 | # Create recognizer
58 | recognizer = speechsdk.SpeechRecognizer(
59 | speech_config=self.speech_config,
60 | audio_config=audio_config
61 | )
62 |
63 | # Collect all results
64 | all_results = []
65 | done = False
66 |
67 | def handle_recognized(evt):
68 | if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
69 | all_results.append({
70 | 'text': evt.result.text,
71 | 'offset': evt.result.offset,
72 | 'duration': evt.result.duration
73 | })
74 |
75 | def stop_continuous(evt):
76 | nonlocal done
77 | done = True
78 |
79 | # Connect callbacks
80 | recognizer.recognized.connect(handle_recognized)
81 | recognizer.session_stopped.connect(stop_continuous)
82 | recognizer.canceled.connect(stop_continuous)
83 |
84 | # Start continuous recognition
85 | recognizer.start_continuous_recognition()
86 |
87 | # Wait for completion
88 | import time
89 | while not done:
90 | time.sleep(0.5)
91 |
92 | recognizer.stop_continuous_recognition()
93 |
94 | # Combine results
95 | full_transcript = ' '.join(r['text'] for r in all_results)
96 |
97 | metadata = {
98 | 'segments': all_results,
99 | 'total_segments': len(all_results),
100 | 'file_path': str(audio_path),
101 | 'language': self.speech_config.speech_recognition_language
102 | }
103 |
104 | return full_transcript.strip(), metadata
105 |
106 | def transcribe_with_diarization(self, audio_file_path: str) -> Tuple[str, dict]:
107 | """
108 | Transcribe audio with speaker diarization (who said what).
109 |
110 | Args:
111 | audio_file_path: Path to audio file
112 |
113 | Returns:
114 | Tuple of (transcript with speaker labels, metadata)
115 | """
116 | audio_path = Path(audio_file_path)
117 | if not audio_path.exists():
118 | raise FileNotFoundError(f"Audio file not found: {audio_file_path}")
119 |
120 | # Create audio config
121 | audio_config = speechsdk.audio.AudioConfig(filename=str(audio_path))
122 |
123 |         # Continuous language identification mode; speaker separation itself comes from the ConversationTranscriber below
124 | self.speech_config.set_property(
125 | speechsdk.PropertyId.SpeechServiceConnection_LanguageIdMode, "Continuous"
126 | )
127 |
128 | # Create conversation transcriber
129 | conversation_transcriber = speechsdk.transcription.ConversationTranscriber(
130 | speech_config=self.speech_config,
131 | audio_config=audio_config
132 | )
133 |
134 | transcription_results = []
135 | done = False
136 |
137 | def handle_transcribed(evt):
138 | if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
139 | transcription_results.append({
140 | 'speaker_id': evt.result.speaker_id or 'Unknown',
141 | 'text': evt.result.text,
142 | 'offset': evt.result.offset,
143 | 'duration': evt.result.duration
144 | })
145 |
146 | def stop_cb(evt):
147 | nonlocal done
148 | done = True
149 |
150 | # Connect callbacks
151 | conversation_transcriber.transcribed.connect(handle_transcribed)
152 | conversation_transcriber.session_stopped.connect(stop_cb)
153 | conversation_transcriber.canceled.connect(stop_cb)
154 |
155 | # Start transcription
156 | conversation_transcriber.start_transcribing_async()
157 |
158 | # Wait for completion
159 | import time
160 | while not done:
161 | time.sleep(0.5)
162 |
163 | conversation_transcriber.stop_transcribing_async()
164 |
165 | # Format output with speaker labels
166 | formatted_transcript = []
167 | current_speaker = None
168 |
169 | for segment in transcription_results:
170 | speaker = segment['speaker_id']
171 | if speaker != current_speaker:
172 | formatted_transcript.append(f"\n[Speaker {speaker}]: {segment['text']}")
173 | current_speaker = speaker
174 | else:
175 | formatted_transcript.append(segment['text'])
176 |
177 | full_transcript = ' '.join(formatted_transcript).strip()
178 |
179 | metadata = {
180 | 'segments': transcription_results,
181 | 'total_segments': len(transcription_results),
182 | 'speakers': list(set(s['speaker_id'] for s in transcription_results)),
183 | 'file_path': str(audio_path),
184 | 'language': self.speech_config.speech_recognition_language
185 | }
186 |
187 | return full_transcript, metadata
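188 | 
189 | 
190 | if __name__ == "__main__":
191 |     # Usage sketch: transcribe one file given on the command line. Requires
192 |     # AZURE_SPEECH_API_KEY and AZURE_SPEECH_REGION in the environment; the
193 |     # argument path is whatever WAV you want to test with.
194 |     import sys
195 | 
196 |     service = AzureSTTService()
197 |     text, info = service.transcribe_file(sys.argv[1])
198 |     print(f"{info['total_segments']} segment(s): {text}")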
--------------------------------------------------------------------------------
/models/realtime/freeze_omni.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 |
4 | import argparse
5 | import asyncio
6 | import json
7 | import queue
8 | import sys
9 | import time
10 | from pathlib import Path
11 | from typing import List
12 |
13 | import numpy as np
14 | import soundfile as sf
15 | import socketio
16 | import torch
17 | import torchaudio.functional as AF
18 | from glob import glob
19 |
20 | ### Configuration ###
21 | root_dir_path = "YOUR_ROOT_DIRECTORY_PATH"
22 | tasks = [
23 | "YOUR_TASK_NAME",
24 | ]
25 | prefix = "" # "" or "clean_": the prefix for input wav files
26 | overwrite = True # Whether to overwrite existing output files
27 | #####################
28 |
29 | all_wav_files = []
30 | for task in tasks:
31 | root_dir = f"{root_dir_path}/{task}/"
32 | root_file_dir = f"{root_dir}/*/{prefix}input.wav"
33 | wav_files = sorted(glob(root_file_dir))
34 | all_wav_files.extend(wav_files)
35 |
36 | FRAME_MS = 30
37 | SEND_SR = 16_000
38 | RECV_SR = 24_000
39 | TX_SAMP = int(SEND_SR * FRAME_MS / 1000)
40 | RX_SAMP = int(RECV_SR * FRAME_MS / 1000)
41 | RX_BYTES = RX_SAMP * 2
42 |
43 |
44 | def _mono(sig: np.ndarray) -> np.ndarray:
45 | return sig if sig.ndim == 1 else sig.mean(axis=1)
46 |
47 |
48 | def _resample(sig: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
49 | if orig_sr == target_sr:
50 | return sig
51 | wav = torch.from_numpy(sig.astype(np.float32) / 32768).unsqueeze(0)
52 | wav_rs = AF.resample(wav, orig_sr, target_sr)
53 | return (wav_rs.squeeze().numpy() * 32768).astype(np.int16)
54 |
55 |
56 | def _chunk(sig: np.ndarray, frame_len: int) -> List[np.ndarray]:
57 | pad = (-len(sig)) % frame_len
58 | if pad:
59 | sig = np.concatenate([sig, np.zeros(pad, dtype=sig.dtype)])
60 | return [sig[i : i + frame_len] for i in range(0, len(sig), frame_len)]
61 |
62 |
63 | def _compact_json(obj):
64 | return json.dumps(obj, separators=(",", ":"))
65 |
66 |
67 | class FreezeOmniClient:
68 | def __init__(self, server_ip: str, inp: Path, out: Path):
69 | self.server_ip = server_ip
70 | self.inp = inp
71 | self.out = out
72 | self.audio_q = queue.Queue()
73 | self.pending = bytearray()
74 | self.muted = False # true after stop_tts until next audio
75 |
76 | self.sio = socketio.Client(
77 | ssl_verify=False,
78 | reconnection=True,
79 | reconnection_attempts=0,
80 | reconnection_delay=2,
81 | reconnection_delay_max=30,
82 | randomization_factor=0.2,
83 | )
84 |
85 | self.sio.on("connect", self._on_connect)
86 | self.sio.on("disconnect", self._on_disconnect)
87 | self.sio.on("audio", self._on_audio)
88 | self.sio.on("stop_tts", self._on_stop_tts)
89 | self.sio.on("too_many_users", self._on_too_many)
90 |
91 | def _on_connect(self):
92 | print("[SIO] ✅ Connected", flush=True)
93 | asyncio.run(self._stream())
94 |
95 | def _on_disconnect(self):
96 | print("[SIO] 🔌 Disconnected", flush=True)
97 |
98 | def _on_audio(self, data: bytes):
99 | self.audio_q.put(data)
100 | self.muted = False # new audio resumes output
101 |
102 | def _on_stop_tts(self):
103 | print("[SIO] ⏹️ stop_tts → mute", flush=True)
104 | self.pending.clear() # discard any buffered TTS
105 | self.muted = True
106 |
107 | def _on_too_many(self, *_, **__):
108 | print("[SIO] ❌ Too many users", file=sys.stderr)
109 | self.sio.disconnect()
110 |
111 | async def _stream(self):
112 | wav, sr = sf.read(self.inp, dtype="int16")
113 | wav = _mono(wav)
114 | wav = _resample(wav, sr, SEND_SR)
115 | tx_frames = _chunk(wav, TX_SAMP)
116 | total_frames = len(tx_frames)
117 | frames_written = 0
118 |
119 | with sf.SoundFile(
120 | self.out, "w", samplerate=RECV_SR, channels=1, subtype="PCM_16"
121 | ) as fout:
122 | self.sio.emit("recording-started")
123 | frame_dur = FRAME_MS / 1000.0
124 |
125 | for frame in tx_frames:
126 | self.sio.emit(
127 | "audio",
128 | _compact_json(
129 | {"audio": list(frame.tobytes()), "sample_rate": SEND_SR}
130 | ),
131 | )
132 |
133 | while not self.audio_q.empty():
134 | self.pending.extend(self.audio_q.get())
135 |
136 | if self.muted:
137 | chunk = b""
138 | else:
139 | chunk = self.pending[:RX_BYTES]
140 | self.pending = self.pending[RX_BYTES:]
141 |
142 | if len(chunk) < RX_BYTES:
143 | chunk += b"\x00" * (RX_BYTES - len(chunk))
144 | fout.write(np.frombuffer(chunk, dtype=np.int16))
145 | frames_written += 1
146 |
147 | await asyncio.sleep(frame_dur)
148 |
149 | self.sio.emit("recording-stopped")
150 | flush_until = time.time() + 1.0
151 | while time.time() < flush_until and frames_written < total_frames:
152 | while not self.audio_q.empty():
153 | self.pending.extend(self.audio_q.get())
154 | chunk = b"" if self.muted else self.pending[:RX_BYTES]
155 | self.pending = self.pending[RX_BYTES:]
156 | if len(chunk) < RX_BYTES:
157 | chunk += b"\x00" * (RX_BYTES - len(chunk))
158 | fout.write(np.frombuffer(chunk, dtype=np.int16))
159 | frames_written += 1
160 | await asyncio.sleep(frame_dur)
161 |
162 | while frames_written < total_frames:
163 | fout.write(np.zeros(RX_SAMP, dtype=np.int16))
164 | frames_written += 1
165 |
166 | self.sio.disconnect()
167 | print(
168 | f"[DONE] input len = {len(wav) / SEND_SR:.2f}s | output len = {sf.info(self.out).duration:.2f}s"
169 | )
170 |
171 | def run(self):
172 | url = f"https://{self.server_ip}"
173 | try:
174 | self.sio.connect(url, transports=["websocket"], wait_timeout=10)
175 | self.sio.wait()
176 | if self.sio.connected:
177 | self.sio.disconnect()
178 | except KeyboardInterrupt:
179 | self.sio.disconnect()
180 | except Exception as e:
181 | print(f"[ERR] {e}", file=sys.stderr)
182 | self.sio.disconnect()
183 |
184 |
185 | def main():
186 | ap = argparse.ArgumentParser(
187 | description="Freeze-Omni streaming client with instant stop_tts mute"
188 | )
189 | ap.add_argument("--server_ip", required=True)
190 | args = ap.parse_args()
191 |
192 | for inp in all_wav_files:
193 | args.input = Path(inp)
194 | args.output = Path(inp.replace("input.wav", "output.wav"))
195 | if not overwrite and args.output.exists():
196 | print(f"[SKIP] {args.output} already exists, skipping...")
197 | continue
198 | print(f"[RUN] {args.input} → {args.output}")
199 | FreezeOmniClient(args.server_ip, args.input, args.output).run()
200 |
201 |
202 | if __name__ == "__main__":
203 | main()
204 |
205 |
--------------------------------------------------------------------------------
/models/realtime/moshi.py:
--------------------------------------------------------------------------------
1 |
2 | from __future__ import annotations
3 |
4 | import argparse
5 | import asyncio
6 | from glob import glob
7 | from pathlib import Path
8 | from typing import List
9 |
10 | import numpy as np
11 | import soundfile as sf
12 | import sphn
13 | import torch
14 | import torchaudio.functional as AF
15 | import websockets
16 | import websockets.exceptions as wsex
17 |
18 |
19 | ### Configuration ###
20 | root_dir_path = Path("YOUR_ROOT_DIRECTORY_PATH")
21 | tasks = [
22 | "YOUR_TASK_NAME",
23 | ]
24 | prefix = "" # "" or "clean_": the prefix for input wav files
25 | overwrite = True # Whether to overwrite existing output files
26 | #####################
27 |
28 |
29 | SEND_SR = 24_000
30 | FRAME_SMP = 1_920
31 | SKIP_FRAMES = 1
32 | FRAME_SEC = FRAME_SMP / SEND_SR
33 |
34 |
35 | def _patch_sphn():
36 | if not hasattr(sphn.OpusStreamWriter, "read_bytes"):
37 | for alt in ("get_bytes", "flush_bytes", "read_data"):
38 | if hasattr(sphn.OpusStreamWriter, alt):
39 | setattr(
40 | sphn.OpusStreamWriter,
41 | "read_bytes",
42 | getattr(sphn.OpusStreamWriter, alt),
43 | )
44 | break
45 | else:
46 | setattr(sphn.OpusStreamWriter, "read_bytes", lambda self: b"")
47 | if not hasattr(sphn.OpusStreamReader, "read_pcm"):
48 | for alt in ("get_pcm", "receive_pcm", "read_float"):
49 | if hasattr(sphn.OpusStreamReader, alt):
50 | setattr(
51 | sphn.OpusStreamReader,
52 | "read_pcm",
53 | getattr(sphn.OpusStreamReader, alt),
54 | )
55 | break
56 | else:
57 | setattr(
58 | sphn.OpusStreamReader, "read_pcm", lambda self: np.empty(0, np.float32)
59 | )
60 |
61 |
62 | _patch_sphn()
63 |
64 |
65 | def _mono(x: np.ndarray) -> np.ndarray:
66 | return x if x.ndim == 1 else x.mean(axis=1)
67 |
68 |
69 | def _resample(x: np.ndarray, sr: int, tgt: int) -> np.ndarray:
70 | if sr == tgt:
71 | return x
72 | y = torch.from_numpy(x.astype(np.float32) / 32768).unsqueeze(0)
73 | y = AF.resample(y, sr, tgt)[0].numpy()
74 | return (y * 32768).astype(np.int16)
75 |
76 |
77 | def _chunk(sig: np.ndarray) -> List[np.ndarray]:
78 | pad = (-len(sig)) % FRAME_SMP
79 | if pad:
80 | sig = np.concatenate([sig, np.zeros(pad, sig.dtype)])
81 | return [sig[i : i + FRAME_SMP] for i in range(0, len(sig), FRAME_SMP)]
82 |
83 |
84 | class MoshiFileClient:
85 | def __init__(self, ws_url: str, inp: Path, out: Path):
86 | self.url, self.inp, self.out = ws_url, inp, out
87 |
88 | sig16, sr = sf.read(inp, dtype="int16")
89 | self.sig24 = _resample(_mono(sig16), sr, SEND_SR)
90 | self.max_samples = len(self.sig24)
91 |
92 | self.writer = sphn.OpusStreamWriter(SEND_SR)
93 | self.reader = sphn.OpusStreamReader(SEND_SR)
94 |
95 | async def _send(self, ws):
96 | for frame in _chunk(self.sig24):
97 | pkt0 = self.writer.append_pcm(frame.astype(np.float32) / 32768)
98 | if isinstance(pkt0, (bytes, bytearray)):
99 | await ws.send(b"\x01" + pkt0)
100 | queued = self.writer.read_bytes()
101 | if queued:
102 | await ws.send(b"\x01" + queued)
103 | await asyncio.sleep(FRAME_SEC)
104 |
105 | queued = self.writer.read_bytes()
106 | if queued:
107 | await ws.send(b"\x01" + queued)
108 | await asyncio.sleep(0.5)
109 | await ws.close()
110 |
111 | async def _recv(self, ws):
112 | samples_written = 0
113 | first_pcm_seen = False
114 |
115 | with sf.SoundFile(
116 | self.out, "w", samplerate=SEND_SR, channels=1, subtype="PCM_16"
117 | ) as fout:
118 | try:
119 | async for msg in ws:
120 | if not msg or msg[0] not in (1, 2):
121 | continue
122 | kind, payload = msg[0], msg[1:]
123 |
124 | if kind == 1: # audio bytes
125 | self.reader.append_bytes(payload)
126 | while True:
127 | pcm = self.reader.read_pcm()
128 | if pcm.size == 0:
129 | break
130 | if not first_pcm_seen:
131 | pad = min(SKIP_FRAMES * FRAME_SMP, self.max_samples)
132 | fout.write(np.zeros(pad, dtype=np.int16))
133 | samples_written += pad
134 | first_pcm_seen = True
135 | remain = self.max_samples - samples_written
136 | if remain <= 0:
137 | continue
138 | n_write = min(pcm.size, remain)
139 | fout.write((pcm[:n_write] * 32768).astype(np.int16))
140 | samples_written += n_write
141 | else:
142 | print("[TEXT]", payload.decode(errors="ignore"))
143 |
144 | except wsex.ConnectionClosedError:
145 | pass
146 |
147 | if samples_written < self.max_samples:
148 | fout.write(np.zeros(self.max_samples - samples_written, dtype=np.int16))
149 |
150 | async def _run(self):
151 | async with websockets.connect(self.url, max_size=None) as ws:
152 | try:
153 | first = await asyncio.wait_for(ws.recv(), timeout=1.0)
154 | if not (isinstance(first, (bytes, bytearray)) and first[:1] == b"\x00"):
155 | ws._put_message(first)
156 | except Exception:
157 | pass
158 | await asyncio.gather(self._send(ws), self._recv(ws))
159 | print("[DONE]", self.inp)
160 |
161 | def run(self):
162 | try:
163 | asyncio.run(self._run())
164 | except wsex.ConnectionClosedError:
165 | pass
166 |
167 |
168 | def _ws_url(addr: str) -> str:
169 | if "://" in addr:
170 | proto, rest = addr.split("://", 1)
171 | proto = "ws" if proto in {"http", "ws"} else "wss"
172 | return f"{proto}://{rest.rstrip('/')}/api/chat"
173 | if ":" not in addr:
174 | addr += ":8998"
175 | return f"ws://{addr}/api/chat"
176 |
177 |
178 | def _input_files() -> List[Path]:
179 | files: List[Path] = []
180 | for t in tasks:
181 | pattern = root_dir_path / f"{t}/*/{prefix}input.wav"
182 | files += [Path(p) for p in sorted(glob(str(pattern)))]
183 | return files
184 |
185 |
186 | def main():
187 | ap = argparse.ArgumentParser("moshi_batch_client")
188 | ap.add_argument("--server_ip", required=True, help="host[:port] or http(s):// URL")
189 | args = ap.parse_args()
190 |
191 | url = _ws_url(args.server_ip)
192 | for inp in _input_files():
193 | out = inp.with_name(inp.name.replace("input.wav", "output.wav"))
194 | if not overwrite and out.exists():
195 | print("[SKIP]", out)
196 | continue
197 | out.parent.mkdir(parents=True, exist_ok=True)
198 | print("[RUN]", inp)
199 | MoshiFileClient(url, inp, out).run()
200 |
201 |
202 | if __name__ == "__main__":
203 | main()
204 |
205 |
--------------------------------------------------------------------------------
/evaluation/grader/llm_grader.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import os
4 | import httpx
5 | import asyncio
6 | import time
7 | import random
8 | from typing import Optional, Tuple
9 |
10 | from .base import BaseAccuracyGrader, GradeLabel, GradeResult
11 | from .prompts import get_accuracy_prompt
12 |
13 |
14 | class LLMAccuracyGrader(BaseAccuracyGrader):
15 | """LLM-backed accuracy grader using Azure OpenAI chat completions.
16 |
17 | Notes:
18 | - Requires environment variables: AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_KEY
19 | - By default uses deployment "gpt-4o" and api-version "2024-10-21"
20 | - Does not stream; single-turn prompt per grading task
21 | """
22 |
23 | def __init__(
24 | self,
25 | deployment_name: str = "gpt-4o",
26 | api_version: str = "2024-10-21",
27 | temperature: float = 0.0,
28 | max_retries: int = 3,
29 | base_delay: float = 1.0,
30 | ) -> None:
31 | self.azure_endpoint = (os.getenv("AZURE_OPENAI_ENDPOINT") or "").rstrip("/")
32 | self.api_key = os.getenv("AZURE_OPENAI_API_KEY") or os.getenv("AZURE_API_KEY")
33 | self.deployment_name = deployment_name
34 | self.api_version = api_version
35 | self.temperature = temperature
36 | self.max_retries = max_retries
37 | self.base_delay = base_delay
38 |
39 | def _ensure_env(self) -> None:
40 | if not self.azure_endpoint or not self.api_key:
41 | raise RuntimeError(
42 | "LLMAccuracyGrader requires AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_KEY"
43 | )
44 |
45 | async def _chat(self, system: str, user: str) -> str:
46 | self._ensure_env()
47 | url = f"{self.azure_endpoint}/openai/deployments/{self.deployment_name}/chat/completions"
48 | headers = {"Content-Type": "application/json", "api-key": self.api_key}
49 | params = {"api-version": self.api_version}
50 | payload = {
51 | "messages": [
52 | {"role": "system", "content": system},
53 | {"role": "user", "content": user},
54 | ],
55 | "temperature": self.temperature,
56 | "max_tokens": 512,
57 | }
58 |
59 | last_exception = None
60 |
61 | for attempt in range(self.max_retries + 1):
62 | try:
63 | async with httpx.AsyncClient(timeout=60.0) as client:
64 | r = await client.post(url, headers=headers, params=params, json=payload)
65 | r.raise_for_status()
66 | data = r.json()
67 | return data["choices"][0]["message"]["content"].strip()
68 |
69 | except (httpx.ConnectError, httpx.TimeoutException, httpx.HTTPStatusError) as e:
70 | last_exception = e
71 |
72 | if attempt == self.max_retries:
73 | # Last attempt failed, re-raise the exception
74 | break
75 |
76 | # Calculate delay with exponential backoff and jitter
77 | delay = self.base_delay * (2 ** attempt) + random.uniform(0, 1)
78 |
79 | # Special handling for rate limits (429)
80 | if isinstance(e, httpx.HTTPStatusError) and e.response.status_code == 429:
81 | # For rate limits, wait longer
82 | delay = max(delay, 5.0 + random.uniform(0, 5))
83 | print(f"Rate limit hit, retrying in {delay:.1f}s (attempt {attempt + 1}/{self.max_retries + 1})")
84 | elif isinstance(e, httpx.ConnectError):
85 | print(f"Connection error, retrying in {delay:.1f}s (attempt {attempt + 1}/{self.max_retries + 1})")
86 | elif isinstance(e, httpx.TimeoutException):
87 | print(f"Timeout error, retrying in {delay:.1f}s (attempt {attempt + 1}/{self.max_retries + 1})")
88 | else:
89 | print(f"HTTP error {e.response.status_code if hasattr(e, 'response') else 'unknown'}, retrying in {delay:.1f}s (attempt {attempt + 1}/{self.max_retries + 1})")
90 |
91 | await asyncio.sleep(delay)
92 |
93 | # If we get here, all retries failed
94 | raise last_exception
95 |
96 | def _parse_binary(self, content: str) -> Tuple[Optional[str], Optional[bool], Optional[float], Optional[str]]:
97 | # Very light parsing for fields we care about
98 | extracted = None
99 | correct_flag = None
100 | confidence = None
101 | reasoning = None
102 | for line in content.splitlines():
103 | l = line.strip()
104 | if l.lower().startswith("extracted_final_answer:"):
105 | extracted = l.split(":", 1)[1].strip()
106 | extracted = None if extracted.lower() == "none" else extracted
107 | elif l.lower().startswith("correct:"):
108 | v = l.split(":", 1)[1].strip().lower()
109 | if v in {"yes", "no"}:
110 | correct_flag = v == "yes"
111 | elif l.lower().startswith("confidence:"):
112 | v = l.split(":", 1)[1].strip().replace("%", "")
113 | try:
114 | confidence = float(v)
115 | except Exception:
116 | confidence = None
117 | elif l.lower().startswith("reasoning:"):
118 | reasoning = l.split(":", 1)[1].strip()
119 | return extracted, correct_flag, confidence, reasoning
120 |
121 | def _parse_triad(self, content: str) -> GradeLabel:
122 | c = content.strip().upper()
123 | if c.startswith("A"):
124 | return GradeLabel.CORRECT
125 | if c.startswith("B"):
126 | return GradeLabel.INCORRECT
127 | if c.startswith("C"):
128 | return GradeLabel.NOT_ATTEMPTED
129 | # default fallback if model deviates
130 | return GradeLabel.INCORRECT
131 |
132 | async def grade_async(
133 | self,
134 | question: str,
135 | ground_truth: str,
136 | predicted_answer: str,
137 | benchmark: Optional[str] = None,
138 | ) -> GradeResult:
139 | prompt = get_accuracy_prompt(
140 | question=question, ground_truth=ground_truth, predicted_answer=predicted_answer, benchmark=benchmark
141 | )
142 |
143 | # Constrain output: A/B/C only
144 | system = (
145 | "You are an academic grader. Return only a single capital letter (A/B/C) per instructions."
146 | )
147 |
148 | content = await self._chat(system=system, user=prompt)
149 |
150 | # triad mode
151 | label = self._parse_triad(content)
152 | return GradeResult(
153 | label=label,
154 | extracted_final_answer=None,
155 | reasoning=None,
156 | correct_flag=None,
157 | confidence=None,
158 | raw_model_output=content,
159 | )
160 |
161 | def grade(
162 | self,
163 | question: str,
164 | ground_truth: str,
165 | predicted_answer: str,
166 | benchmark: Optional[str] = None,
167 | ) -> GradeResult:
168 | async def _run():
169 | return await self.grade_async(question, ground_truth, predicted_answer, benchmark)
170 | try:
171 | return asyncio.run(_run())
172 | except RuntimeError:
173 | # If already inside an event loop
174 | loop = asyncio.get_event_loop() # type: ignore
175 | return loop.run_until_complete(_run())
176 |
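177 | 
178 | if __name__ == "__main__":
179 |     # Usage sketch (illustrative values only). Requires AZURE_OPENAI_ENDPOINT and
180 |     # AZURE_OPENAI_API_KEY to be set, plus a "gpt-4o" chat deployment.
181 |     grader = LLMAccuracyGrader()
182 |     result = grader.grade(
183 |         question="What is 2 + 2?",
184 |         ground_truth="4",
185 |         predicted_answer="The answer is 4.",
186 |     )
187 |     print(result.label, "|", result.raw_model_output)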
--------------------------------------------------------------------------------
/models/text/gpt5.py:
--------------------------------------------------------------------------------
1 | """
2 | GPT-5 OpenAI Browse Adapter for VERA
3 | Uses OpenAI Responses API with web_search_preview tool for browsecomp benchmark
4 | """
5 |
6 | import os
7 | import json
8 | import time
9 | import httpx
10 | from typing import Dict, Any, List
11 | from pathlib import Path
12 |
13 | from ..shared.timing_utils import (
14 | create_turn_result,
15 | create_standardized_episode_result,
16 | create_standardized_batch_result,
17 | )
18 |
19 |
20 | class GPT5OpenAIBrowseAdapter:
21 | """OpenAI GPT-5 adapter using web_search_preview for browsecomp."""
22 |
23 | def __init__(self, api_key: str, api_base: str = "https://api.openai.com", api_version: str = "2025-02-01-preview", reasoning_effort: str = "high", reasoning_summary: str = "detailed"):
24 | self.api_key = api_key
25 | self.api_base = api_base.rstrip('/')
26 | self.api_version = api_version
27 | self.model_name = "gpt-5"
28 | self.reasoning_effort = reasoning_effort
29 | self.reasoning_summary = reasoning_summary
30 |
31 | async def process_episodes_batch(self, episodes: List[Dict[str, Any]], output_dir: str, max_concurrent: int = 16) -> Dict[str, Any]:
32 | print(f"[GPT-5 OpenAI Browse] Batch processing {len(episodes)} episodes (max {max_concurrent} concurrent)")
33 | output_path = Path(output_dir)
34 | output_path.mkdir(parents=True, exist_ok=True)
35 |
36 | start = time.time()
37 | import asyncio
38 | semaphore = asyncio.Semaphore(max_concurrent)
39 |
40 | async def run_one(ep):
41 | async with semaphore:
42 | import asyncio
43 | return await asyncio.to_thread(self.process_episode, ep, output_dir)
44 |
45 | tasks = [run_one(ep) for ep in episodes]
46 | results = await asyncio.gather(*tasks, return_exceptions=True)
47 | processed = []
48 | for i, r in enumerate(results):
49 | if isinstance(r, Exception):
50 | processed.append({
51 | 'episode_id': episodes[i].get('id', f'episode_{i}'),
52 | 'error': str(r),
53 | 'success': False
54 | })
55 | else:
56 | processed.append(r)
57 |
58 | duration = time.time() - start
59 | batch = create_standardized_batch_result(
60 | episodes=processed,
61 | total_time=duration,
62 | model_name=f"{self.model_name}_openai_browse_{self.reasoning_effort}",
63 | metadata={"max_concurrent": max_concurrent},
64 | )
65 | batch_file = output_path / f"gpt5_openai_browse_batch_{int(time.time())}.json"
66 | with open(batch_file, 'w') as f:
67 | json.dump(batch, f, indent=2)
68 | print(
69 | f"[GPT-5 OpenAI Browse] Batch completed: "
70 | f"{batch['summary']['successful_episodes']}/{batch['summary']['total_episodes']} successful"
71 | )
72 | return batch
73 |
74 | def process_episode(self, episode_data: Dict[str, Any], output_dir: str) -> Dict[str, Any]:
75 | episode_id = episode_data.get('id', 'unknown')
76 | output_path = Path(output_dir)
77 | output_path.mkdir(parents=True, exist_ok=True)
78 |
79 | session_start = time.time()
80 | turns_results: List[Dict[str, Any]] = []
81 | total_tokens = 0
82 |
83 | for turn_idx, turn in enumerate(episode_data.get('turns', [])):
84 | if turn.get('role') != 'user':
85 | continue
86 | turn_start = time.time()
87 | prompt = self._prepare_prompt(turn, episode_data, turn_idx)
88 | response_data = self._call_openai_responses(prompt)
89 | turn_end = time.time()
90 | timing = {
91 | "start_time": turn_start,
92 | "end_time": turn_end,
93 | "duration": turn_end - turn_start,
94 | }
95 |
96 | model_metadata = {
97 | 'model': self.model_name,
98 | 'provider': 'openai',
99 | 'response_id': response_data.get('id', '')
100 | }
101 |
102 | error = response_data.get('error') if 'error' in response_data else None
103 | response_text = (
104 | response_data.get('output', {}).get('content')
105 | if isinstance(response_data.get('output'), dict)
106 | else response_data.get('output', '')
107 | ) or response_data.get('text', '') or ''
108 |
109 | turn_result = create_turn_result(
110 | turn_index=turn_idx,
111 | prompt=prompt,
112 | response=response_text,
113 | timing=timing,
114 | success=(error is None),
115 | error=error,
116 | metadata=model_metadata,
117 | )
118 | turns_results.append(turn_result)
119 | if not error:
120 | total_tokens += response_data.get('usage', {}).get('total_tokens', 0)
121 |
122 | session_duration = time.time() - session_start
123 | success = all(t.get('success', True) for t in turns_results)
124 | return create_standardized_episode_result(
125 | episode_id=episode_id,
126 | turns=turns_results,
127 | total_time=session_duration,
128 | success=success,
129 | metadata={
130 | "model_name": f"{self.model_name}_openai_browse",
131 | "total_tokens": total_tokens,
132 | },
133 | )
134 |
135 | def _prepare_prompt(self, turn: Dict[str, Any], episode_data: Dict[str, Any], turn_idx: int) -> str:
136 | user_speech = turn.get('text_content', '')
137 | context_docs = episode_data.get('context_documents', [])
138 | parts: List[str] = []
139 | if context_docs:
140 | parts.append("Context Documents:")
141 | for i, doc in enumerate(context_docs):
142 | parts.append(f"Document {i+1}: {doc.get('content','')}")
143 | parts.append("")
144 | if turn_idx > 0:
145 | parts.append("Previous conversation:")
146 | for prev_idx in range(turn_idx):
147 | pt = episode_data['turns'][prev_idx]
148 | role = pt.get('role')
149 | if role == 'user':
150 | parts.append(f"User: {pt.get('text_content','')}")
151 | elif role == 'assistant':
152 | parts.append(f"Assistant: {pt.get('response','')}")
153 | parts.append("")
154 | parts.append(f"User: {user_speech}")
155 | return "\n".join(parts)
156 |
157 | def _call_openai_responses(self, prompt: str) -> Dict[str, Any]:
158 | url = f"{self.api_base}/v1/responses"
159 | headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
160 | params = {"api-version": self.api_version}
161 | payload = {
162 | "input": [{"role": "user", "content": prompt}],
163 | "model": self.model_name,
164 | "tools": [{"type": "web_search_preview", "search_context_size": "high"}],
165 | "truncation": "auto",
166 | "reasoning": {"effort": self.reasoning_effort, "summary": self.reasoning_summary},
167 | "max_output_tokens": 16384
168 | }
169 | try:
170 | with httpx.Client(timeout=180.0) as client:
171 | resp = client.post(url, headers=headers, params=params, json=payload)
172 | resp.raise_for_status()
173 | return resp.json()
174 | except httpx.HTTPStatusError as e:
175 | return {"error": f"HTTP {e.response.status_code}: {e.response.text}", "status_code": e.response.status_code}
176 | except httpx.TimeoutException:
177 | return {"error": "Request timed out"}
178 | except Exception as e:
179 | return {"error": f"Unexpected error: {e}"}
180 |
181 | # Backward-compatible alias for tests and external code
182 | GPT5Adapter = GPT5OpenAIBrowseAdapter
183 |
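184 | 
185 | if __name__ == "__main__":
186 |     # Usage sketch (illustrative only): assumes OPENAI_API_KEY is set and that the
187 |     # episode dict follows the VERA episode layout with decrypted text_content.
188 |     adapter = GPT5OpenAIBrowseAdapter(api_key=os.environ["OPENAI_API_KEY"])
189 |     episode = {
190 |         "id": "example_episode",
191 |         "turns": [{"role": "user", "text_content": "Who wrote 'On Computable Numbers'?"}],
192 |     }
193 |     result = adapter.process_episode(episode, output_dir="./browse_results")
194 |     print(json.dumps(result, indent=2)[:500])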
--------------------------------------------------------------------------------
/models/realtime/liveanswer/audio_to_answer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import time
4 | import json
5 | from pathlib import Path
6 | from typing import Tuple, Optional
7 |
8 | from .main import main_request
9 | from .stt_service import AzureSTTService
10 |
11 |
12 | class AudioToAnswer:
13 | """Process audio input to generate audio answer using Azure STT + LiveAnswer."""
14 |
15 | def __init__(self,
16 | speech_key: Optional[str] = None,
17 | speech_region: Optional[str] = None,
18 | enable_diarization: bool = False):
19 | """
20 | Initialize the audio-to-answer pipeline.
21 |
22 | Args:
23 | speech_key: Azure Speech API key
24 | speech_region: Azure region
25 | enable_diarization: Whether to use speaker diarization
26 | """
27 | self.stt_service = AzureSTTService(speech_key, speech_region)
28 | self.enable_diarization = enable_diarization
29 |
30 | def process_audio_file(self,
31 | audio_file_path: str,
32 | output_dir: Optional[str] = None,
33 | verbose: bool = True) -> Tuple[str, bytes, dict]:
34 | """
35 | Process audio file: STT -> LiveAnswer -> TTS.
36 |
37 | Args:
38 | audio_file_path: Path to input audio file
39 | output_dir: Directory for output files (defaults to current dir)
40 | verbose: Print progress messages
41 |
42 | Returns:
43 | Tuple of (transcript, answer_audio_bytes, metadata)
44 | """
45 | start_time = time.time()
46 |
47 | # Step 1: Transcribe audio
48 | if verbose:
49 | print(f"[1/3] Transcribing audio file: {audio_file_path}")
50 |
51 | if self.enable_diarization:
52 | transcript, stt_metadata = self.stt_service.transcribe_with_diarization(audio_file_path)
53 | else:
54 | transcript, stt_metadata = self.stt_service.transcribe_file(audio_file_path)
55 |
56 | transcription_time = time.time() - start_time
57 |
58 | if verbose:
59 | print(f" Transcription: '{transcript[:100]}{'...' if len(transcript) > 100 else ''}'")
60 | print(f" Time taken: {transcription_time:.2f}s")
61 |
62 | # Step 2: Generate answer
63 | if verbose:
64 | print(f"[2/3] Generating answer...")
65 |
66 | answer_start = time.time()
67 | answer_audio_bytes, time_to_first_response, gpt5_response, groq_explanation = main_request(transcript, audio_file_path)
68 | answer_time = time.time() - answer_start
69 |
70 | if verbose:
71 | print(f" Time to first response: {time_to_first_response:.2f}s")
72 | print(f" Total generation time: {answer_time:.2f}s")
73 |
74 | # Step 3: Save outputs
75 | if output_dir is None:
76 | output_dir = os.getcwd()
77 | else:
78 | os.makedirs(output_dir, exist_ok=True)
79 |
80 | # Save answer audio
81 | timestamp = time.strftime("%Y%m%d_%H%M%S")
82 | answer_path = Path(output_dir) / f"answer_{timestamp}.mp3"
83 | answer_path.write_bytes(answer_audio_bytes)
84 |
85 | # Save transcript
86 | transcript_path = Path(output_dir) / f"transcript_{timestamp}.txt"
87 | transcript_path.write_text(transcript)
88 |
89 | # Save GPT-5 response (raw solver output)
90 | gpt5_response_path = Path(output_dir) / f"gpt5_response_{timestamp}.txt"
91 | gpt5_response_path.write_text(gpt5_response)
92 |
93 | # Save Groq explanation (what was spoken)
94 | groq_explanation_path = Path(output_dir) / f"groq_explanation_{timestamp}.txt"
95 | groq_explanation_path.write_text(groq_explanation)
96 |
97 | # Save detailed timing info
98 | timing_path = Path(output_dir) / f"timing_{timestamp}.json"
99 | timing_data = {
100 | 'time_to_first_audio_chunk': time_to_first_response,
101 | 'transcription_time': transcription_time,
102 | 'answer_generation_time': answer_time,
103 | 'total_processing_time': time.time() - start_time,
104 | 'transcript_length_chars': len(transcript),
105 | 'audio_output_size_bytes': len(answer_audio_bytes),
106 | 'timestamp': timestamp
107 | }
108 | timing_path.write_text(json.dumps(timing_data, indent=2))
109 |
110 | if verbose:
111 | print(f"[3/3] Outputs saved:")
112 | print(f" Answer audio: {answer_path}")
113 | print(f" Transcript: {transcript_path}")
114 | print(f" GPT-5 response: {gpt5_response_path}")
115 | print(f" Groq explanation: {groq_explanation_path}")
116 | print(f" Timing data: {timing_path}")
117 |
118 | # Compile metadata
119 | metadata = {
120 | 'input_audio': audio_file_path,
121 | 'transcript': transcript,
122 | 'gpt5_response': gpt5_response,
123 | 'groq_explanation': groq_explanation,
124 | 'transcript_length': len(transcript),
125 | 'stt_metadata': stt_metadata,
126 | 'answer_audio_path': str(answer_path),
127 | 'transcript_path': str(transcript_path),
128 | 'gpt5_response_path': str(gpt5_response_path),
129 | 'groq_explanation_path': str(groq_explanation_path),
130 | 'timing_path': str(timing_path),
131 | 'timings': {
132 | 'transcription_time': transcription_time,
133 | 'answer_generation_time': answer_time,
134 | 'time_to_first_response': time_to_first_response,
135 | 'total_time': time.time() - start_time
136 | }
137 | }
138 |
139 | return transcript, answer_audio_bytes, metadata
140 |
141 | def process_audio_stream(self, audio_stream):
142 | """
143 | Future: Process audio stream in real-time.
144 | Currently not implemented - placeholder for future enhancement.
145 | """
146 | raise NotImplementedError(
147 | "Real-time audio streaming not yet implemented. "
148 | "Use process_audio_file() for file-based processing."
149 | )
150 |
151 |
152 | def main():
153 | """CLI entry point for audio-to-answer processing."""
154 | import argparse
155 | import json
156 |
157 | parser = argparse.ArgumentParser(description="Process audio to generate answer")
158 | parser.add_argument("audio_file", help="Path to input audio file")
159 | parser.add_argument("--output-dir", default=None, help="Output directory")
160 | parser.add_argument("--diarization", action="store_true", help="Enable speaker diarization")
161 | parser.add_argument("--save-metadata", action="store_true", help="Save metadata JSON")
162 | parser.add_argument("--quiet", action="store_true", help="Suppress verbose output")
163 |
164 | args = parser.parse_args()
165 |
166 | try:
167 | # Initialize processor
168 | processor = AudioToAnswer(enable_diarization=args.diarization)
169 |
170 | # Process audio
171 | transcript, audio_bytes, metadata = processor.process_audio_file(
172 | audio_file_path=args.audio_file,
173 | output_dir=args.output_dir,
174 | verbose=not args.quiet
175 | )
176 |
177 | # Optionally save metadata
178 | if args.save_metadata:
179 | metadata_path = Path(args.output_dir or os.getcwd()) / "metadata.json"
180 | # Convert metadata to JSON-serializable format
181 | json_metadata = {
182 |                 k: v if not isinstance(v, bytes) else f"<{len(v)} bytes omitted>"
183 | for k, v in metadata.items()
184 | }
185 | metadata_path.write_text(json.dumps(json_metadata, indent=2))
186 | if not args.quiet:
187 | print(f" Metadata: {metadata_path}")
188 |
189 | if not args.quiet:
190 | print(f"\nProcessing complete! Total time: {metadata['timings']['total_time']:.2f}s")
191 |
192 | except Exception as e:
193 | print(f"Error: {e}", file=sys.stderr)
194 | sys.exit(1)
195 |
196 |
197 | if __name__ == "__main__":
198 | main()
--------------------------------------------------------------------------------
/models/realtime/liveanswer/audio.py:
--------------------------------------------------------------------------------
1 | import re
2 | import time
3 | import threading
4 |
5 | import azure.cognitiveservices.speech as speechsdk # type: ignore
6 |
7 | from .utils import _env
8 | from .explain import ExplainSynthesizer
9 |
10 |
11 | class AzureSpeechClient:
12 | def __init__(self) -> None:
13 | self.key = _env("AZURE_SPEECH_KEY")
14 | self.region = _env("AZURE_SPEECH_REGION")
15 | self.voice = _env("AZURE_SPEECH_VOICE", "en-US-JennyNeural")
16 | self.output_format_name = _env("AZURE_SPEECH_FORMAT", "Audio24Khz160KBitRateMonoMp3")
17 | self._sdk_ready = bool(self.key and self.region and speechsdk is not None)
18 |
19 | if self._sdk_ready:
20 | self.speech_config = speechsdk.SpeechConfig(subscription=self.key, region=self.region)
21 | self.speech_config.speech_synthesis_voice_name = self.voice
22 | fmt = getattr(speechsdk.SpeechSynthesisOutputFormat, self.output_format_name)
23 | self.speech_config.set_speech_synthesis_output_format(fmt)
24 | else:
25 | self.speech_config = None
26 |
27 |
28 | class AudioGenerator:
29 | def __init__(self, explainer: ExplainSynthesizer):
30 | self.explainer = explainer
31 | self.azure = AzureSpeechClient()
32 | self.all_sound = bytearray()
33 | self._stop_event = threading.Event()
34 | self.request_start_time: float = time.monotonic()
35 | self.start_time: float = 0.0
36 | self._stream_req = None
37 | self._generated_seconds = 0.0
38 | self._bitrate_bps = self._guess_bitrate(self.azure.output_format_name)
39 |
40 | @staticmethod
41 | def _guess_bitrate(format_name: str) -> float:
42 | if not format_name:
43 | return 160_000.0
44 | match = re.search(r"(\d+)KBitRate", format_name)
45 | if match:
46 | return float(match.group(1)) * 1000.0
47 | sr_match = re.search(r"Audio(\d+)Khz", format_name)
48 | if sr_match:
49 | sample_rate = float(sr_match.group(1)) * 1000.0
50 | bit_depth = 16.0
51 | if "8Bit" in format_name:
52 | bit_depth = 8.0
53 | elif "24Bit" in format_name:
54 | bit_depth = 24.0
55 | return sample_rate * bit_depth
56 | return 160_000.0
57 |
58 | def _update_generated_seconds(self, byte_count: int) -> None:
59 | if byte_count <= 0:
60 | return
61 | bitrate = self._bitrate_bps or 160_000.0
62 | self._generated_seconds += (byte_count * 8.0) / bitrate
63 |
64 | def _watch_need_more_explanation(self) -> None:
65 | if not self.explainer.spoken_explanation:
66 | # first = self.explainer.pop_more_explanation(max_token=80)
67 | # first = self.explainer.pop_more_explanation(max_token=64)
68 | # first = self.explainer.pop_more_explanation(max_token=32)
69 | first = self.explainer.pop_more_explanation()
70 | print(f"!!!AudioGen: First chunk from explainer: '{first[:100]}...' ({len(first) if first else 0} chars)")
71 | if first and self._stream_req is not None:
72 | print(f"!!!AudioGen: Writing first chunk to TTS stream")
73 | self._stream_req.input_stream.write(first)
74 |
75 | # time_margin = 10.0
76 | time_margin = 10.0
77 | while not self._stop_event.is_set():
78 | if self.start_time == 0.0:
79 | elapsed = 0.0
80 | else:
81 | elapsed = time.monotonic() - self.start_time
82 | total_estimated = self._generated_seconds
83 | remaining = total_estimated - elapsed
84 | print(f"!!!total_estimated: {total_estimated}, elapsed: {elapsed}, remaining: {remaining}")
85 |
86 | if remaining <= time_margin:
87 | more = self.explainer.pop_more_explanation()
88 | print(f"!!!AudioGen: Got more chunk: '{more[:100] if more else None}...' ({len(more) if more else 0} chars)")
89 | if more is not None:
90 | if more and self._stream_req is not None:
91 | print(f"!!!AudioGen: Writing more chunk to TTS stream")
92 | self._stream_req.input_stream.write(more)
93 | else:
94 | print(f"!!!AudioGen: No more chunks, closing TTS stream")
95 | if self._stream_req is not None:
96 | self._stream_req.input_stream.close()
97 | return
98 |
99 | time.sleep(max(0.5, remaining - time_margin))
100 |
101 | def start(self) -> tuple[bytes, float]:
102 | self.all_sound.clear()
103 | self._generated_seconds = 0.0
104 | self._stop_event.clear()
105 | self.start_time = 0.0 # Reset start time
106 |
107 | if not getattr(self.azure, "_sdk_ready", False):
108 | raise RuntimeError("Azure Speech SDK or credentials not available for streaming synthesis")
109 |
110 | try:
111 | region = self.azure.region
112 | key = self.azure.key
113 | voice = self.azure.voice
114 |
115 | tts_endpoint = f"wss://{region}.tts.speech.microsoft.com/cognitiveservices/websocket/v2"
116 | cfg = speechsdk.SpeechConfig(endpoint=tts_endpoint, subscription=key)
117 | cfg.speech_synthesis_voice_name = voice
118 | fmt = getattr(speechsdk.SpeechSynthesisOutputFormat, self.azure.output_format_name)
119 | cfg.set_speech_synthesis_output_format(fmt)
120 | cfg.set_property(speechsdk.PropertyId.SpeechSynthesis_RtfTimeoutThreshold, "4")
121 | cfg.set_property(speechsdk.PropertyId.SpeechSynthesis_FrameTimeoutInterval, str(int(60*1000))) # 60s
122 |
123 | req = speechsdk.SpeechSynthesisRequest(speechsdk.SpeechSynthesisRequestInputType.TextStream)
124 | self._stream_req = req
125 | synth = speechsdk.SpeechSynthesizer(speech_config=cfg, audio_config=None)
126 |
127 | def on_synthesizing(evt):
128 | if self.start_time == 0.0:
129 | self.start_time = time.monotonic()
130 | data_bytes = evt.result.audio_data
131 | if data_bytes:
132 | self.all_sound.extend(data_bytes)
133 | self._update_generated_seconds(len(data_bytes))
134 |
135 | def on_synthesis_started(evt):
136 | print(f"!!!TTS: synthesis started")
137 | def on_synthesis_completed(evt):
138 | print(f"!!!TTS: synthesis completed")
139 | def on_synthesis_canceled(evt):
140 | print(f"!!!TTS: synthesis canceled - {evt}")
141 | def on_synthesis_error(evt):
142 | print(f"!!!TTS: synthesis error - {evt}")
143 |
144 | synth.synthesizing.connect(on_synthesizing)
145 | synth.synthesis_started.connect(on_synthesis_started)
146 | synth.synthesis_completed.connect(on_synthesis_completed)
147 | synth.synthesis_canceled.connect(on_synthesis_canceled)
148 | # Note: synthesis_error might not exist in all SDK versions
149 |
150 | fut = synth.speak_async(req)
151 |
152 | t_watcher = threading.Thread(target=self._watch_need_more_explanation, name="watcher", daemon=True)
153 | t_watcher.start()
154 |
155 | r = fut.get()
156 | if r.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
157 | print(f"!!!synthesis completed")
158 | elif r.reason == speechsdk.ResultReason.Canceled:
159 | print(f"!!!synthesis canceled: {r.cancellation_details.reason}, {r.cancellation_details.error_details}")
160 | else:
161 | print(f"!!!synthesis failed: {r.reason}")
162 | t_watcher.join()
163 | finally:
164 | self._stop_event.set()
165 | self._stream_req = None
166 |
167 | # Calculate time to first response, with fallback if TTS never started
168 | if self.start_time > 0.0:
169 | time_to_first_response = self.start_time - self.request_start_time
170 | else:
171 | # TTS never started, use current time as fallback
172 | time_to_first_response = time.monotonic() - self.request_start_time
173 | print(f"!!!AudioGen: TTS never started, using fallback timing: {time_to_first_response:.2f}s")
174 |
175 | return bytes(self.all_sound), time_to_first_response
176 |
--------------------------------------------------------------------------------
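Editor's note: the pacing loop above estimates how many seconds of audio have been synthesized (from the output format's bitrate) and asks the explainer for more text whenever the buffered lead shrinks to the ten-second margin. Below is a minimal driver sketch, assuming this AudioGenerator lives in models/realtime/liveanswer/audio.py as the tree suggests, that ExplainSynthesizer comes from explain.py later in this dump, and that the Azure Speech and Groq credentials read by AzureSpeechClient and _env are present in the environment.

from models.realtime.liveanswer.explain import ExplainSynthesizer
from models.realtime.liveanswer.audio import AudioGenerator  # hypothetical module name

# Feed the explainer a request plus solver thoughts; a trailing None means "solver is done".
explainer = ExplainSynthesizer("What is the sum of the first ten positive integers?")
explainer.push_thought("Pair the terms: 1+10, 2+9, ... gives five pairs of eleven, so the sum is 55.")
explainer.push_thought(None)

generator = AudioGenerator(explainer)
audio_bytes, time_to_first_response = generator.start()  # blocks until synthesis finishes

print(f"{len(audio_bytes)} bytes of audio, first audio after {time_to_first_response:.2f}s")
with open("answer_audio.bin", "wb") as f:  # container/encoding depends on the configured output format
    f.write(audio_bytes)

In the full pipeline the solver pushes thoughts incrementally while start() is running, which is exactly the gap the ten-second margin is buffering against.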
/LICENSES/Boson-Higgs-Audio-2-Community-License.txt:
--------------------------------------------------------------------------------
1 | BOSON HIGGS AUDIO 2 COMMUNITY LICENSE AGREEMENT
2 |
3 | Boson Higgs Audio 2 Version Release Date: June 20, 2025
4 |
5 | This License Agreement (the “Agreement”) is entered into by and between Licensee (as defined below) and Boson AI USA, Inc. (“Boson”) and is based upon the Meta Llama 3 Community License Agreement as of April 18, 2024 (the “Meta License Agreement”), which can be found at https://llama.meta.com/llama3/license/. The terms and conditions of the Meta License Agreement are hereby incorporated herein by reference and Unless stated otherwise below, its terms apply. The Higgs Audio 2 model developed by Boson AI USA, Inc. (“Higgs Materials”) is an audio model derived from Meta Llama 3 software and algorithms.
6 |
7 | “Agreement” means the terms and conditions for use, reproduction, distribution and modification of the Higgs Materials set forth herein and the Meta License Agreement.
8 |
9 | “Licensee” or “you” means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity’s behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering into this Agreement on their behalf.
10 |
11 | “Higgs Audio 2” means the foundational large audio language models and software and algorithms, including machine-learning model code, trained model weights, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing developed by Boson AI distributed at https://github.com/boson-ai/boson-multimodal or otherwise.
12 | “Higgs Materials” means, collectively, Boson’s proprietary modification of Meta Llama 3 and Documentation (and any portion thereof) made available under this Agreement.
13 |
14 | “Boson” or “we” means Boson AI USA, Inc.
15 |
16 | By clicking “I Accept” below or by using or distributing any portion or element of the Higgs Materials, you agree to be bound by this Agreement.
17 |
18 | 1. License Rights and Redistribution.
19 | a. Grant of Rights. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Boson’s intellectual property or other rights owned by Boson embodied in the Higgs Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Higgs Materials.
20 | b. Redistribution and Use.
21 | i. If you distribute or make available the Higgs Materials (or any derivative works thereof), or a product or service that uses any of them, including another AI model, you shall (A) provide a copy of this Agreement and the of Meta License ’s Llama 3 agreement with any such Higgs Materials; and (B) prominently display “Built with Higgs Materials licensed from Boson AI USA, Inc., Copyright Boson AI USA, Inc., All Rights Reserved and Meta Llama 3 licensed under the Meta Llama 3 Community License, Copyright Meta Platforms, Inc., All Right Reserved". based on Meta Llama 3” on a related website, user interface, blogpost, about page, or product documentation. If you use the Higgs Materials to create, modify, enhance, train, fine tune, or otherwise improve an AI model or similar software, which is distributed or made available, you shall also include “Higgs Audio 2” at the beginning of any such AI model or software name.
22 | ii. Even if you receive Higgs Materials, or any modifications, enhancements or derivative works thereof, from a Licensee as part of an integrated end user product, then Section 2 of this Agreement will apply to you.
23 | iii. You must retain in all copies of the Llama Materials that you distribute and as set forth above, include the following attribution notice within a “Notice” text file distributed as a part of such copies:
24 | “Meta Llama 3 is licensed under the Meta Llama 3 Community License, Copyright © Meta Platforms, Inc. All Rights Reserved.”
25 | “Boson Higgs Audio 2 is licensed under the Boson Community License, Copyright © Boson AI USA, Inc. All Rights Reserved.”
26 | iv. Your use of the Higgs Materials must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Llama Materials (available at https://llama.meta.com/llama3/use-policy), which is hereby incorporated by reference into this Agreement.
27 | v. You will not use the Higgs Materials or any output or results of the Higgs Materials to improve any other large language model (excluding Boson Higgs Audio 2 or derivative works thereof).
28 | vi. You hereby acknowledge that Boson is the owner of the Higgs Materials and under no circumstance shall you bring any legal action, claim, charge, demand challenging such ownership rights of Boson.
29 |
30 | 2. Additional Commercial Terms. If the annual active users of the products or services made available by or for Licensee, or Licensee’s affiliates, is greater than 100,000 annual active users in the preceding calendar year, you must request an expanded license from Boson AI, which Boson AI may grant to you in its sole discretion, and you are not authorized to exercise any of the rights under this Agreement unless or until Boson AI otherwise expressly grants you such rights.
31 |
32 | 3. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE Higgs Materials AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN “AS IS” BASIS, WITH ALL FAULTS, WITHOUT WARRANTIES OF ANY KIND EXPRESS, IMPLIED, BASED UPON CUSTOM AND USAGE OR COURSE OF DEALING, AND BOSON AI DISCLAIMS ALL WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE HIGGS MATERIALS AND ASSUME ANY AND ALL RISKS ASSOCIATED WITH YOUR USE OF THE HIGGS MATERIALS AND ANY OUTPUT AND RESULTS.
33 |
34 | 4. Limitation of Liability. IN NO EVENT WILL BOSON AI OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF BOSON, META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
35 |
36 | 5. Intellectual Property.
37 | a. No trademark licenses are granted under this Agreement, or in connection with the Higgs Materials., nNeither Boson nor Licensee may use any name or mark owned by, or associated with, the other party hereto or any of its affiliates, except as required for reasonable and customary use in describing and redistributing the Higgs Materials or as set forth in this Section 5(a). Boson hereby grants you a license to use “Higgs Audio 2” (the “Mark”) solely as required to comply with the last sentence of Section 1.b.i. All goodwill arising out of your use of the Mark will inure to the benefit of Meta and Boson AI.
38 | b. Subject to Boson’s ownership of the Higgs Materials and derivatives made by or for Boson AI, with respect to any derivative works and modifications of the Higgs Materials that are made by you, as between you and Boson AI, you are and will be the owner of such derivative works and modifications.
39 | c. If you institute litigation or other proceedings against Boson AI, Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Higgs Materials or Boson Higgs Audio 2 outputs or results, or any portion thereof any of the foregoing, constitutes infringement of the intellectual property or other rights owned or licensable by you, then any licenses granted to you hereunder this Agreement shall immediately terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Boson AI from and against any claim, charge, demand, cause of action by any third party arising out of or related to your use or distribution of the Higgs Materials.
40 |
41 | 6. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Higgs Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Boson AI may terminate this Agreement if you are in breach of any term or condition of this Agreement by providing you with written notice. Upon your receipt of written notice of termination of this Agreement, you shall delete the Higgs Materials from any computer, server or IT device and cease use of the Higgs Materials in all respects. Sections 1(b)(vi), 3, 4 and 7 shall survive the termination of this Agreement.
42 |
43 | 7. Governing Law and Jurisdiction. This Agreement will be governed and construed under the laws of the State of California without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The federal courts in the Northern District of California and the state courts in Santa Clara County, California shall have exclusive jurisdiction of any dispute arising out of this Agreement.
44 |
--------------------------------------------------------------------------------
/models/text/gpt4o.py:
--------------------------------------------------------------------------------
1 | """
2 | GPT-4o OpenAI Browse Adapter for VERA
3 | Uses OpenAI Responses API with web_search_preview tool for browsecomp benchmark
4 | """
5 |
6 | import os
7 | import json
8 | import time
9 | import httpx
10 | from typing import Dict, Any, List
11 | from pathlib import Path
12 |
13 | from ..shared.timing_utils import (
14 | create_turn_result,
15 | create_standardized_episode_result,
16 | create_standardized_batch_result,
17 | )
18 | from ..shared.base_adapter import TextAdapter, ModelConfig
19 |
20 |
21 | class GPT4oOpenAIBrowseAdapter(TextAdapter):
22 | """OpenAI GPT-4o adapter using web_search_preview for browsecomp."""
23 |
24 | def __init__(self, api_key: str, api_base: str = "https://api.openai.com", api_version: str = "2025-02-01-preview"):
25 | config = ModelConfig(model_name="gpt-4o")
26 | super().__init__(config, api_key)
27 | self.api_base = api_base.rstrip('/')
28 | self.api_version = api_version
29 |
30 | async def process_episodes_batch(self, episodes: List[Dict[str, Any]], output_dir: str, max_concurrent: int = 16) -> Dict[str, Any]:
31 | """Batch process episodes concurrently."""
32 | print(f"[GPT-4o OpenAI Browse] Batch processing {len(episodes)} episodes (max {max_concurrent} concurrent)")
33 | output_path = Path(output_dir)
34 | output_path.mkdir(parents=True, exist_ok=True)
35 |
36 | start = time.time()
37 | import asyncio
38 | semaphore = asyncio.Semaphore(max_concurrent)
39 |
40 | async def run_one(ep):
41 | async with semaphore:
42 | # asyncio is already imported above; run the blocking episode call in a worker thread.
43 | return await asyncio.to_thread(self.process_episode, ep, output_dir)
44 |
45 | tasks = [run_one(ep) for ep in episodes]
46 | results = await asyncio.gather(*tasks, return_exceptions=True)
47 | processed = []
48 | for i, r in enumerate(results):
49 | if isinstance(r, Exception):
50 | processed.append({
51 | 'episode_id': episodes[i].get('id', f'episode_{i}'),
52 | 'error': str(r),
53 | 'success': False
54 | })
55 | else:
56 | processed.append(r)
57 |
58 | duration = time.time() - start
59 | batch = create_standardized_batch_result(
60 | episodes=processed,
61 | total_time=duration,
62 | model_name=f"{self.model_name}_openai_browse",
63 | metadata={"max_concurrent": max_concurrent},
64 | )
65 | batch_file = output_path / f"gpt4o_openai_browse_batch_{int(time.time())}.json"
66 | with open(batch_file, 'w') as f:
67 | json.dump(batch, f, indent=2)
68 | print(
69 | f"[GPT-4o OpenAI Browse] Batch completed: "
70 | f"{batch['summary']['successful_episodes']}/{batch['summary']['total_episodes']} successful"
71 | )
72 | return batch
73 |
74 | def process_episode(self, episode_data: Dict[str, Any], output_dir: str) -> Dict[str, Any]:
75 | episode_id = episode_data.get('id', 'unknown')
76 | output_path = Path(output_dir)
77 | output_path.mkdir(parents=True, exist_ok=True)
78 |
79 | session_start = time.time()
80 | turns_results: List[Dict[str, Any]] = []
81 | total_tokens = 0
82 |
83 | for turn_idx, turn in enumerate(episode_data.get('turns', [])):
84 | if turn.get('role') != 'user':
85 | continue
86 | turn_start = time.time()
87 | prompt = self._prepare_prompt(turn, episode_data, turn_idx)
88 | response_data = self._call_openai_responses(prompt)
89 | turn_end = time.time()
90 | timing = {
91 | "start_time": turn_start,
92 | "end_time": turn_end,
93 | "duration": turn_end - turn_start,
94 | }
95 |
96 | model_metadata = {
97 | 'model': self.model_name,
98 | 'provider': 'openai',
99 | 'response_id': response_data.get('id', '')
100 | }
101 |
102 | error = response_data.get('error')
103 | # Extract a plain text response if available
104 | response_text = (
105 | response_data.get('output', {}).get('content')
106 | if isinstance(response_data.get('output'), dict)
107 | else response_data.get('output', '')
108 | ) or response_data.get('text', '') or ''
109 |
110 | turn_result = create_turn_result(
111 | turn_index=turn_idx,
112 | prompt=prompt,
113 | response=response_text,
114 | timing=timing,
115 | success=(error is None),
116 | error=error,
117 | metadata=model_metadata,
118 | )
119 | turns_results.append(turn_result)
120 | if not error:
121 | total_tokens += response_data.get('usage', {}).get('total_tokens', 0)
122 |
123 | session_duration = time.time() - session_start
124 | success = all(t.get('success', True) for t in turns_results)
125 | return create_standardized_episode_result(
126 | episode_id=episode_id,
127 | turns=turns_results,
128 | total_time=session_duration,
129 | success=success,
130 | metadata={
131 | "model_name": f"{self.model_name}_openai_browse",
132 | "total_tokens": total_tokens,
133 | },
134 | )
135 |
136 | def _prepare_prompt(self, turn: Dict[str, Any], episode_data: Dict[str, Any], turn_idx: int) -> str:
137 | user_speech = turn.get('text_content', '')
138 | context_docs = episode_data.get('context_documents', [])
139 | parts: List[str] = []
140 | if context_docs:
141 | parts.append("Context Documents:")
142 | for i, doc in enumerate(context_docs):
143 | parts.append(f"Document {i+1}: {doc.get('content','')}")
144 | parts.append("")
145 | if turn_idx > 0:
146 | parts.append("Previous conversation:")
147 | for prev_idx in range(turn_idx):
148 | pt = episode_data['turns'][prev_idx]
149 | role = pt.get('role')
150 | if role == 'user':
151 | parts.append(f"User: {pt.get('text_content','')}")
152 | elif role == 'assistant':
153 | parts.append(f"Assistant: {pt.get('response','')}")
154 | parts.append("")
155 | parts.append(f"User: {user_speech}")
156 | return "\n".join(parts)
157 |
158 | def _make_api_request(self, messages: List[Dict[str, str]], **kwargs) -> str:
159 | """Make API request to OpenAI GPT-4o"""
160 | if len(messages) == 1 and messages[0].get("role") == "user":
161 | prompt = messages[0]["content"]
162 | else:
163 | # Convert messages to prompt format
164 | prompt_parts = []
165 | for msg in messages:
166 | role = msg.get("role", "user")
167 | content = msg.get("content", "")
168 | if role == "user":
169 | prompt_parts.append(f"User: {content}")
170 | elif role == "assistant":
171 | prompt_parts.append(f"Assistant: {content}")
172 | prompt = "\n".join(prompt_parts)
173 |
174 | response_data = self._call_openai_responses(prompt)
175 | if "error" in response_data:
176 | raise Exception(response_data["error"])
177 |
178 | return response_data.get("output", {}).get("content", "")
179 |
180 | def _call_openai_responses(self, prompt: str) -> Dict[str, Any]:
181 | url = f"{self.api_base}/v1/responses"
182 | headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
183 | params = {"api-version": self.api_version}
184 | payload = {
185 | "input": [{"role": "user", "content": prompt}],
186 | "model": self.model_name,
187 | "tools": [{"type": "web_search_preview", "search_context_size": "high"}],
188 | "truncation": "auto",
189 | "max_output_tokens": 8192
190 | }
191 | try:
192 | with httpx.Client(timeout=120.0) as client:
193 | resp = client.post(url, headers=headers, params=params, json=payload)
194 | resp.raise_for_status()
195 | return resp.json()
196 | except httpx.HTTPStatusError as e:
197 | return {"error": f"HTTP {e.response.status_code}: {e.response.text}", "status_code": e.response.status_code}
198 | except httpx.TimeoutException:
199 | return {"error": "Request timed out"}
200 | except Exception as e:
201 | return {"error": f"Unexpected error: {e}"}
202 |
--------------------------------------------------------------------------------
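Editor's note: a sketch of driving this adapter on a single episode. The episode fields (id, turns with role/text_content, optional context_documents) mirror what process_episode and _prepare_prompt read above; the episode values and output path are made up for illustration, and a valid OPENAI_API_KEY is assumed.

import os
from models.text.gpt4o import GPT4oOpenAIBrowseAdapter

adapter = GPT4oOpenAIBrowseAdapter(api_key=os.environ["OPENAI_API_KEY"])

episode = {
    "id": "demo_episode_000",  # hypothetical episode id
    "context_documents": [{"content": "Background text the model may consult."}],
    "turns": [
        {"role": "user", "text_content": "In one sentence, summarize the background text."},
    ],
}

result = adapter.process_episode(episode, output_dir="test_output/gpt4o_demo")
print(result)  # standardized episode result built by create_standardized_episode_result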
/models/realtime/liveanswer/explain.py:
--------------------------------------------------------------------------------
1 | import json
2 | from typing import Optional, List
3 |
4 | import requests # type: ignore
5 |
6 | from .utils import _env
7 |
8 |
9 | class ExplainSynthesizer:
10 | """
11 | Incrementally produces spoken explanation text by calling Groq for continuation
12 | (assistant prefill style).
13 | """
14 |
15 | def __init__(self, request: str):
16 | self.request: str = request
17 | self.all_thought: List[Optional[str]] = []
18 | self.spoken_explanation: str = ""
19 | self.finished: bool = False
20 |
21 | # Track consecutive dummy explanations to prevent infinite loops
22 | # Give generous tolerance since thinking model may be slow
23 | self._consecutive_dummy_count: int = 0
24 | self._max_consecutive_dummy: int = 10 # Allow up to 10 dummy responses (thinking time)
25 |
26 | self._last_groq_messages: List[dict] = []
27 | self._last_groq_response: Optional[str] = None
28 |
29 | def push_thought(self, s: Optional[str]) -> None:
30 | self.all_thought.append(s)
31 |
32 | def _groq_chat_completion(self, messages: List[dict], max_tokens: int) -> str:
33 | if self._last_groq_messages == messages:
34 | return self._last_groq_response
35 | self._last_groq_messages = messages
36 | self._last_groq_response = self.__groq_chat_completion(messages, max_tokens)
37 | return self._last_groq_response
38 |
39 | def __groq_chat_completion(self, messages: List[dict], max_tokens: int) -> str:
40 | api_key = _env("GROQ_API_KEY")
41 | if not api_key:
42 | raise RuntimeError("GROQ_API_KEY missing")
43 |
44 | groq_base = _env("GROQ_ENDPOINT", "https://api.groq.com/openai/v1")
45 | # Build the chat completions endpoint from the configured Groq base URL.
46 | url = f"{groq_base.rstrip('/')}/chat/completions"
47 | model = _env("GROQ_MODEL", "llama-3.3-70b-versatile")
48 | print(f"!!!Groq API: model={model}, max_tokens={max_tokens}")
49 |
50 | headers = {
51 | "Authorization": f"Bearer {api_key}",
52 | "Content-Type": "application/json",
53 | }
54 | # Increase temperature for Template3 to encourage more generation
55 | temperature = 0.9 if max_tokens > 1000 else 0.7
56 |
57 | payload = {
58 | "model": model,
59 | "messages": messages,
60 | "temperature": temperature,
61 | "max_completion_tokens": max_tokens,
62 | }
63 |
64 | # Increase timeout for longer responses
65 | timeout = 60 if max_tokens > 1000 else 30
66 | try:
67 | resp = requests.post(url, headers=headers, data=json.dumps(payload), timeout=timeout)
68 | resp.raise_for_status()
69 | data = resp.json()
70 | content = data["choices"][0]["message"]["content"].strip()
71 | finish_reason = data["choices"][0].get("finish_reason", "unknown")
72 | print(f"!!!Groq API response: finish_reason={finish_reason}, content_length={len(content)}")
73 | return content
74 | except Exception as e:
75 | print(f"!!!Groq API error: {e}")
76 | raise
77 |
78 | def pop_more_explanation(self, max_token: int = 32) -> Optional[str]:
79 | if self.finished:
80 | print(f"!!!pop_more_explanation: Already finished, returning None")
81 | return None
82 |
83 | has_any_thought = len(self.all_thought) > 0
84 | last_thought = self.all_thought[-1] if has_any_thought else None
85 | non_none_thoughts = [t for t in self.all_thought if t is not None]
86 | all_thought_text = (" ".join(non_none_thoughts)).strip()
87 |
88 | print(f"!!!pop_more_explanation: has_any_thought={has_any_thought}, last_thought={'None' if last_thought is None else 'Some'}, len(all_thought)={len(self.all_thought)}")
89 |
90 | # Use different system prompt for Template3 (finalization) to encourage comprehensive response
91 | if last_thought is None and has_any_thought:
92 | # Template3: Need comprehensive final explanation
93 | system_prompt = (
94 | "You are a thorough, clear explainer providing a complete final explanation. "
95 | "Generate natural spoken-style text that fully explains the solution. "
96 | "Write exactly as if spoken aloud. Avoid symbols, equations, code fences, or special characters; "
97 | "use plain words instead. Express relations in words (e.g., x=y -> 'x equals y'). "
98 | "Provide a COMPLETE and COMPREHENSIVE explanation. Do not be too concise - be thorough."
99 | )
100 | else:
101 | # Template1 and Template2: Regular concise style
102 | system_prompt = (
103 | "You are a concise, clear explainer. Generate natural spoken-style text. "
104 | "Avoid lists unless necessary. Keep continuity with the prior assistant text. "
105 | "Write exactly as if spoken aloud. Avoid symbols, equations, code fences, or special characters; "
106 | "use plain words instead. Express relations in words (e.g., x=y -> 'x equals y'). Keep punctuation minimal and natural. "
107 | "Use short sentences and phrases if possible. Avoid long sentences and paragraphs."
108 | )
109 |
110 | assistant_prefill = self.spoken_explanation.strip()
111 | final_answer = False
112 | if not has_any_thought:
113 | # Template1: no solver thoughts yet → confirm + typically how to proceed
114 | print(f"!!!Using Template1: No solver thoughts yet")
115 | user_template = (
116 | "Begin the spoken explanation. Start with a very brief rephrase of the user's "
117 | "request (one short sentence) to confirm understanding, then briefly state what you would "
118 | "typically do to approach it, and continue naturally. Do not include any disclaimers about inability or limitations. "
119 | "Avoid lists unless necessary; keep it concise and fluid.\n\n"
120 | f"User request: {self.request}"
121 | )
122 | elif last_thought is None:
123 | # Template3: finalization
124 | max_token = 2048 # Increase token budget for comprehensive explanation
125 | final_answer = True
126 | print(f"!!!Using Template3: Finalization with max_token={max_token}")
127 | print(f"!!!All solver thoughts collected: {len(non_none_thoughts)} thoughts, {len(all_thought_text)} chars")
128 | user_template = (
129 | "The conversation is concluding. Please provide a COMPREHENSIVE and DETAILED final explanation that:\n"
130 | "1. Fully explains the solution approach and reasoning\n"
131 | "2. Clearly states the final answer\n"
132 | "3. Explains WHY this answer is correct\n"
133 | "4. Should be at least 3-4 paragraphs long for completeness\n"
134 | "Continue from where you left off, but ensure the explanation is thorough and complete. "
135 | "Do not stop until you have fully explained the solution.\n\n"
136 | + (f"All solver thoughts so far: {all_thought_text}\n\n" if all_thought_text else "")
137 | + f"User request: {self.request}"
138 | )
139 | else:
140 | # Template2: ongoing with accumulated thoughts
141 | print(f"!!!Using Template2: Ongoing with {len(non_none_thoughts)} thoughts")
142 | user_template = (
143 | "Continue the spoken explanation naturally. Keep it fluid and avoid abrupt topic jumps. "
144 | "Be sure to include all latest updates from the accumulated reasoning (all_thought_text) as quickly as possible.\n\n"
145 | + (f"Use the overall reasoning so far: {all_thought_text}\n\n" if all_thought_text else "")
146 | + f"User request: {self.request}"
147 | )
148 |
149 | messages = [
150 | {"role": "system", "content": system_prompt},
151 | {"role": "user", "content": user_template},
152 | ]
153 |
154 | # Use TTS-friendly filler text instead of newlines
155 | dumb_explanation = "I'm still thinking about this problem. Let me work through the details."
156 | def remove_dumb_words(s: str) -> str:
157 | return s.replace(dumb_explanation, "")
158 |
159 | if assistant_prefill:
160 | messages.append({"role": "assistant", "content": remove_dumb_words(assistant_prefill)})
161 |
162 | print(f"!!!max_token: {max_token}")
163 | print(f"!!!Calling Groq with messages count: {len(messages)}")
164 | chunk = self._groq_chat_completion(messages=messages, max_tokens=max_token)
165 | print(f"!!!Groq returned chunk length: {len(chunk)} chars")
166 | if has_any_thought and last_thought is None:
167 | print(f"!!!Setting finished=True (Template3 completed)")
168 | self.finished = True
169 | if not chunk.strip():
170 | if not final_answer:
171 | self._consecutive_dummy_count += 1
172 | print(f"!!!Empty chunk returned ({self._consecutive_dummy_count}/{self._max_consecutive_dummy}), using dumb explanation")
173 |
174 | # Only stop if we've exceeded the maximum dummy responses
175 | if self._consecutive_dummy_count >= self._max_consecutive_dummy:
176 | print(f"!!!Too many consecutive dummy responses ({self._max_consecutive_dummy}), ending conversation")
177 | self.finished = True
178 | return None
179 |
180 | chunk = dumb_explanation
181 | else:
182 | # For final answer (Template3), if we get empty response, just end cleanly
183 | print(f"!!!Empty chunk in final answer - ending conversation")
184 | self.finished = True
185 | return None # Signal end of conversation
186 | else:
187 | # Reset counter when we get a real response
188 | if self._consecutive_dummy_count > 0:
189 | print(f"!!!Got real response, resetting dummy counter from {self._consecutive_dummy_count}")
190 | self._consecutive_dummy_count = 0
191 |
192 | print(f"!!!chunk: {chunk[:200]}..." if len(chunk) > 200 else f"!!!chunk: {chunk}")
193 |
194 | # if self.spoken_explanation and not self.spoken_explanation.endswith(" "):
195 | # self.spoken_explanation += " "
196 | self.spoken_explanation += chunk
197 | return chunk
198 |
--------------------------------------------------------------------------------
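Editor's note: pop_more_explanation picks its prompt template from the state of all_thought: an empty list selects the opening template (restate the request and the intended approach), a trailing non-None thought selects the continuation template, and a trailing None after at least one thought selects the finalization template, which raises the token budget to 2048 and marks the synthesizer finished. A short sketch of that state machine, assuming GROQ_API_KEY (and optionally GROQ_ENDPOINT / GROQ_MODEL) are set for _env:

from models.realtime.liveanswer.explain import ExplainSynthesizer

ex = ExplainSynthesizer("Explain why the square root of two is irrational.")

opener = ex.pop_more_explanation()   # Template1: no solver thoughts yet
ex.push_thought("Assume sqrt(2) = p/q in lowest terms; then p squared equals two q squared.")
middle = ex.pop_more_explanation()   # Template2: continue using the accumulated reasoning
ex.push_thought(None)                # trailing None marks the solver as finished
closing = ex.pop_more_explanation()  # Template3: token budget is raised to 2048 internally

print(ex.finished)                   # True once the finalization template has run
print(len(ex.spoken_explanation))    # everything generated so far, concatenated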
/LICENSES/Meta-Llama-3-Community-License.txt:
--------------------------------------------------------------------------------
1 | META LLAMA 3 COMMUNITY LICENSE AGREEMENT
2 |
3 | Meta Llama 3 Version Release Date: April 18, 2024
4 | “Agreement” means the terms and conditions for use, reproduction, distribution and modification of the Llama Materials set forth herein.
5 |
6 | “Documentation” means the specifications, manuals and documentation accompanying Meta Llama 3 distributed by Meta at https://llama.meta.com/get-started/.
7 |
8 | “Licensee” or “you” means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity’s behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf.
9 |
10 | “Meta Llama 3” means the foundational large language models and software and algorithms, including machine-learning model code, trained model weights, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing distributed by Meta at https://llama.meta.com/llama-downloads.
11 |
12 | “Llama Materials” means, collectively, Meta’s proprietary Meta Llama 3 and Documentation (and any portion thereof) made available under this Agreement.
13 |
14 | “Meta” or “we” means Meta Platforms Ireland Limited (if you are located in or, if you are an entity, your principal place of business is in the EEA or Switzerland) and Meta Platforms, Inc. (if you are located outside of the EEA or Switzerland).
15 |
16 | By clicking “I Accept” below or by using or distributing any portion or element of the Llama Materials, you agree to be bound by this Agreement.
17 |
18 | 1. License Rights and Redistribution.
19 |
20 | a. Grant of Rights. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Meta’s intellectual property or other rights owned by Meta embodied in the Llama Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Llama Materials.
21 | b. Redistribution and Use.
22 | i. If you distribute or make available the Llama Materials (or any derivative works thereof), or a product or service that uses any of them, including another AI model, you shall (A) provide a copy of this Agreement with any such Llama Materials; and (B) prominently display “Built with Meta Llama 3” on a related website, user interface, blogpost, about page, or product documentation. If you use the Llama Materials to create, train, fine tune, or otherwise improve an AI model, which is distributed or made available, you shall also include “Llama 3” at the beginning of any such AI model name.
23 | ii. If you receive Llama Materials, or any derivative works thereof, from a Licensee as part of an integrated end user product, then Section 2 of this Agreement will not apply to you.
24 | iii. You must retain in all copies of the Llama Materials that you distribute the following attribution notice within a “Notice” text file distributed as a part of such copies: “Meta Llama 3 is licensed under the Meta Llama 3 Community License, Copyright © Meta Platforms, Inc. All Rights Reserved.”
25 | iv. Your use of the Llama Materials must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Llama Materials (available at https://llama.meta.com/llama3/use-policy), which is hereby incorporated by reference into this Agreement.
26 | v. You will not use the Llama Materials or any output or results of the Llama Materials to improve any other large language model (excluding Meta Llama 3 or derivative works thereof).
27 |
28 | 2. Additional Commercial Terms. If, on the Meta Llama 3 version release date, the monthly active users of the products or services made available by or for Licensee, or Licensee’s affiliates, is greater than 700 million monthly active users in the preceding calendar month, you must request a license from Meta, which Meta may grant to you in its sole discretion, and you are not authorized to exercise any of the rights under this Agreement unless or until Meta otherwise expressly grants you such rights.
29 |
30 | 3. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE LLAMA MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, AND META DISCLAIMS ALL WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE LLAMA MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE LLAMA MATERIALS AND ANY OUTPUT AND RESULTS.
31 |
32 | 4. Limitation of Liability. IN NO EVENT WILL META OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
33 |
34 | 5. Intellectual Property.
35 | a. No trademark licenses are granted under this Agreement, and in connection with the Llama Materials, neither Meta nor Licensee may use any name or mark owned by or associated with the other or any of its affiliates, except as required for reasonable and customary use in describing and redistributing the Llama Materials or as set forth in this Section 5(a). Meta hereby grants you a license to use “Llama 3” (the “Mark”) solely as required to comply with the last sentence of Section 1.b.i. You will comply with Meta’s brand guidelines (currently accessible at https://about.meta.com/brand/resources/meta/company-brand/ ). All goodwill arising out of your use of the Mark will inure to the benefit of Meta.
36 | b. Subject to Meta’s ownership of Llama Materials and derivatives made by or for Meta, with respect to any derivative works and modifications of the Llama Materials that are made by you, as between you and Meta, you are and will be the owner of such derivative works and modifications.
37 | c. If you institute litigation or other proceedings against Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Llama Materials or Meta Llama 3 outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Meta from and against any claim by any third party arising out of or related to your use or distribution of the Llama Materials.
38 |
39 | 6. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Llama Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Meta may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the Llama Materials. Sections 3, 4 and 7 shall survive the termination of this Agreement.
40 |
41 | 7. Governing Law and Jurisdiction. This Agreement will be governed and construed under the laws of the State of California without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The courts of California shall have exclusive jurisdiction of any dispute arising out of this Agreement.
42 |
43 |
44 | Meta Llama 3 Acceptable Use Policy
45 | Meta is committed to promoting safe and fair use of its tools and features, including Meta Llama 3. If you access or use Meta Llama 3, you agree to this Acceptable Use Policy (“Policy”). The most recent copy of this policy can be found at https://llama.meta.com/llama3/use-policy
46 | Prohibited Uses
47 | We want everyone to use Meta Llama 3 safely and responsibly. You agree you will not use, or allow others to use, Meta Llama 3 to:
48 | 1. Violate the law or others’ rights, including to:
49 | a. Engage in, promote, generate, contribute to, encourage, plan, incite, or further illegal or unlawful activity or content, such as:
50 | i. Violence or terrorism
51 | ii. Exploitation or harm to children, including the solicitation, creation, acquisition, or dissemination of child exploitative content or failure to report Child Sexual Abuse Material
52 | iii. Human trafficking, exploitation, and sexual violence
53 | iv. The illegal distribution of information or materials to minors, including obscene materials, or failure to employ legally required age-gating in connection with such information or materials.
54 | v. Sexual solicitation
55 | vi. Any other criminal activity
56 | b. Engage in, promote, incite, or facilitate the harassment, abuse, threatening, or bullying of individuals or groups of individuals
57 | c. Engage in, promote, incite, or facilitate discrimination or other unlawful or harmful conduct in the provision of employment, employment benefits, credit, housing, other economic benefits, or other essential goods and services
58 | d. Engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or related professional practices
59 | e. Collect, process, disclose, generate, or infer health, demographic, or other sensitive personal or private information about individuals without rights and consents required by applicable laws
60 | f. Engage in or facilitate any action or generate any content that infringes, misappropriates, or otherwise violates any third-party rights, including the outputs or results of any products or services using the Llama Materials
61 | g. Create, generate, or facilitate the creation of malicious code, malware, computer viruses or do anything else that could disable, overburden, interfere with or impair the proper working, integrity, operation or appearance of a website or computer system
62 |
63 | 2. Engage in, promote, incite, facilitate, or assist in the planning or development of activities that present a risk of death or bodily harm to individuals, including use of Meta Llama 3 related to the following:
64 | a. Military, warfare, nuclear industries or applications, espionage, use for materials or activities that are subject to the International Traffic Arms Regulations (ITAR) maintained by the United States Department of State
65 | b. Guns and illegal weapons (including weapon development)
66 | c. Illegal drugs and regulated/controlled substances
67 | d. Operation of critical infrastructure, transportation technologies, or heavy machinery
68 | e. Self-harm or harm to others, including suicide, cutting, and eating disorders
69 | f. Any content intended to incite or promote violence, abuse, or any infliction of bodily harm to an individual
70 |
71 | 3. Intentionally deceive or mislead others, including use of Meta Llama 3 related to the following:
72 | a. Generating, promoting, or furthering fraud or the creation or promotion of disinformation
73 | b. Generating, promoting, or furthering defamatory content, including the creation of defamatory statements, images, or other content
74 | c. Generating, promoting, or further distributing spam
75 | d. Impersonating another individual without consent, authorization, or legal right
76 | e. Representing that the use of Meta Llama 3 or outputs are human-generated
77 | f. Generating or facilitating false online engagement, including fake reviews and other means of fake online engagement
78 | g. Fail to appropriately disclose to end users any known dangers of your AI system
79 |
80 | Please report any violation of this Policy, software “bug,” or other problems that could lead to a violation of this Policy through one of the following means:
81 | * Reporting issues with the model: https://github.com/meta-llama/llama3
82 | * Reporting risky content generated by the model: developers.facebook.com/llama_output_feedback
83 | * Reporting bugs and security concerns: facebook.com/whitehat/info
84 | * Reporting violations of the Acceptable Use Policy or unlicensed uses of Meta Llama 3: LlamaUseReport@meta.com
85 |
--------------------------------------------------------------------------------
/evaluation/grader/voice_grader.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import os
4 | import asyncio
5 | from typing import Optional, Dict, Any
6 | from pathlib import Path
7 |
8 | from .base import BaseAccuracyGrader, GradeLabel, GradeResult
9 | from .llm_grader import LLMAccuracyGrader
10 | from .asr_processor import ASRProcessor
11 | from .wer_calculator import WERCalculator
12 |
13 |
14 | class VoiceAccuracyGrader(BaseAccuracyGrader):
15 | """
16 | Voice accuracy grader that processes audio files through ASR then uses LLM grading.
17 |
18 | Pipeline:
19 | 1. Audio file → ASR → transcript
20 | 2. Transcript → LLM grader → accuracy grade
21 | 3. Also calculates WER between ASR transcript and expected text
22 | """
23 |
24 | def __init__(
25 | self,
26 | asr_provider: str = "azure",
27 | llm_deployment_name: str = "gpt-4o",
28 | llm_api_version: str = "2024-10-21",
29 | llm_temperature: float = 0.0,
30 | max_retries: int = 3,
31 | base_delay: float = 1.0,
32 | azure_speech_key: Optional[str] = None,
33 | azure_speech_region: Optional[str] = None,
34 | openai_api_key: Optional[str] = None,
35 | openai_base_url: Optional[str] = None,
36 | ):
37 | # Initialize ASR processor
38 | self.asr_processor = ASRProcessor(
39 | provider=asr_provider,
40 | azure_speech_key=azure_speech_key,
41 | azure_speech_region=azure_speech_region,
42 | openai_api_key=openai_api_key,
43 | openai_base_url=openai_base_url,
44 | )
45 |
46 | # Initialize LLM grader for semantic evaluation
47 | self.llm_grader = LLMAccuracyGrader(
48 | deployment_name=llm_deployment_name,
49 | api_version=llm_api_version,
50 | temperature=llm_temperature,
51 | max_retries=max_retries,
52 | base_delay=base_delay,
53 | )
54 |
55 | # WER calculator for transcript quality
56 | self.wer_calculator = WERCalculator()
57 |
58 | def _extract_audio_path_from_response(self, response_data: Dict[str, Any]) -> Optional[str]:
59 | """Extract audio file path from voice output response data."""
60 | if isinstance(response_data, dict):
61 | # Check audio_info section
62 | if "audio_info" in response_data and "output_file" in response_data["audio_info"]:
63 | return response_data["audio_info"]["output_file"]
64 |
65 | # Check for direct audio path
66 | if "output_audio_path" in response_data:
67 | return response_data["output_audio_path"]
68 |
69 | # Check conversation transcript for audio response
70 | if "conversation_transcript" in response_data:
71 | for turn in response_data["conversation_transcript"]:
72 | if turn.get("type") == "audio_response" and "audio_file" in turn:
73 | return turn["audio_file"]
74 |
75 | return None
76 |
77 | async def grade_voice_response_async(
78 | self,
79 | question: str,
80 | ground_truth: str,
81 | voice_response_path_or_data: str | Dict[str, Any],
82 | expected_transcript: Optional[str] = None,
83 | benchmark: Optional[str] = None,
84 | calculate_wer: bool = True,
85 | ) -> Dict[str, Any]:
86 | """
87 | Grade a voice response (audio file or response data structure).
88 |
89 | Args:
90 | question: The question asked
91 | ground_truth: Ground truth answer
92 | voice_response_path_or_data: Path to audio file OR response data dict
93 | expected_transcript: Expected transcript text (for WER calculation)
94 | benchmark: Benchmark name for grading context
95 | calculate_wer: Whether to calculate WER metrics
96 |
97 | Returns:
98 | Dictionary containing grading results and ASR/WER metrics
99 | """
100 | # Extract audio file path
101 | if isinstance(voice_response_path_or_data, str):
102 | audio_path = voice_response_path_or_data
103 | response_data = None
104 | else:
105 | response_data = voice_response_path_or_data
106 | audio_path = self._extract_audio_path_from_response(response_data)
107 |
108 | if not audio_path:
109 | return {
110 | "success": False,
111 | "error": "Could not find audio file path in response data",
112 | "asr_result": None,
113 | "llm_grade": None,
114 | "wer_metrics": None
115 | }
116 |
117 | # Ensure audio file exists
118 | audio_path = Path(audio_path)
119 | if not audio_path.exists():
120 | return {
121 | "success": False,
122 | "error": f"Audio file not found: {audio_path}",
123 | "asr_result": None,
124 | "llm_grade": None,
125 | "wer_metrics": None
126 | }
127 |
128 | # Step 1: Transcribe audio
129 | asr_result = await self.asr_processor.transcribe_async(str(audio_path))
130 |
131 | if not asr_result["success"]:
132 | return {
133 | "success": False,
134 | "error": f"ASR failed: {asr_result['error']}",
135 | "asr_result": asr_result,
136 | "llm_grade": None,
137 | "wer_metrics": None
138 | }
139 |
140 | transcript = asr_result["text"]
141 |
142 | # Step 2: Grade transcript using LLM
143 | try:
144 | llm_grade = await self.llm_grader.grade_async(
145 | question=question,
146 | ground_truth=ground_truth,
147 | predicted_answer=transcript,
148 | benchmark=benchmark
149 | )
150 | except Exception as e:
151 | return {
152 | "success": False,
153 | "error": f"LLM grading failed: {str(e)}",
154 | "asr_result": asr_result,
155 | "llm_grade": None,
156 | "wer_metrics": None
157 | }
158 |
159 | # Step 3: Calculate WER if expected transcript provided
160 | wer_metrics = None
161 | if calculate_wer and expected_transcript:
162 | wer_metrics = self.wer_calculator.calculate_wer(
163 | reference=expected_transcript,
164 | hypothesis=transcript,
165 | return_details=True
166 | )
167 |
168 | return {
169 | "success": True,
170 | "error": None,
171 | "asr_result": asr_result,
172 | "llm_grade": llm_grade,
173 | "wer_metrics": wer_metrics,
174 | "transcript": transcript,
175 | "audio_path": str(audio_path)
176 | }
177 |
178 | def grade_voice_response(
179 | self,
180 | question: str,
181 | ground_truth: str,
182 | voice_response_path_or_data: str | Dict[str, Any],
183 | expected_transcript: Optional[str] = None,
184 | benchmark: Optional[str] = None,
185 | calculate_wer: bool = True,
186 | ) -> Dict[str, Any]:
187 | """Sync wrapper for voice response grading."""
188 | async def _run():
189 | return await self.grade_voice_response_async(
190 | question, ground_truth, voice_response_path_or_data,
191 | expected_transcript, benchmark, calculate_wer
192 | )
193 |
194 | try:
195 | return asyncio.run(_run())
196 | except RuntimeError:
197 | # If already inside an event loop
198 | loop = asyncio.get_event_loop()
199 | return loop.run_until_complete(_run())
200 |
201 | def batch_grade_voice_responses(
202 | self,
203 | grading_tasks: list[Dict[str, Any]],
204 | ) -> list[Dict[str, Any]]:
205 | """
206 | Grade multiple voice responses in batch.
207 |
208 | Args:
209 | grading_tasks: List of dicts with keys:
210 | - question: str
211 | - ground_truth: str
212 | - voice_response_path_or_data: str | Dict
213 | - expected_transcript: Optional[str]
214 | - benchmark: Optional[str]
215 | - calculate_wer: Optional[bool] = True
216 | """
217 | async def _batch_grade():
218 | tasks = []
219 | for task in grading_tasks:
220 | tasks.append(self.grade_voice_response_async(
221 | question=task["question"],
222 | ground_truth=task["ground_truth"],
223 | voice_response_path_or_data=task["voice_response_path_or_data"],
224 | expected_transcript=task.get("expected_transcript"),
225 | benchmark=task.get("benchmark"),
226 | calculate_wer=task.get("calculate_wer", True)
227 | ))
228 |
229 | return await asyncio.gather(*tasks, return_exceptions=True)
230 |
231 | try:
232 | results = asyncio.run(_batch_grade())
233 | except RuntimeError:
234 | loop = asyncio.get_event_loop()
235 | results = loop.run_until_complete(_batch_grade())
236 |
237 | # Handle exceptions in results
238 | processed_results = []
239 | for i, result in enumerate(results):
240 | if isinstance(result, Exception):
241 | processed_results.append({
242 | "success": False,
243 | "error": f"Exception in task {i}: {str(result)}",
244 | "asr_result": None,
245 | "llm_grade": None,
246 | "wer_metrics": None
247 | })
248 | else:
249 | processed_results.append(result)
250 |
251 | return processed_results
252 |
253 | # Implement base class interface for compatibility
254 | def grade(
255 | self,
256 | question: str,
257 | ground_truth: str,
258 | predicted_answer: str | Dict[str, Any], # Can be transcript or voice response data
259 | benchmark: Optional[str] = None,
260 | ) -> GradeResult:
261 | """
262 | Grade method for base class compatibility.
263 |
264 | If predicted_answer is a string, treat as transcript and grade directly.
265 | If predicted_answer is a dict, treat as voice response data and process through ASR.
266 | """
267 | if isinstance(predicted_answer, str):
268 | # Direct transcript grading
269 | return self.llm_grader.grade(
270 | question=question,
271 | ground_truth=ground_truth,
272 | predicted_answer=predicted_answer,
273 | benchmark=benchmark
274 | )
275 | elif isinstance(predicted_answer, dict):
276 | # Voice response grading
277 | result = self.grade_voice_response(
278 | question=question,
279 | ground_truth=ground_truth,
280 | voice_response_path_or_data=predicted_answer,
281 | benchmark=benchmark
282 | )
283 |
284 | if result["success"]:
285 | return result["llm_grade"]
286 | else:
287 | # Return error as incorrect grade
288 | return GradeResult(
289 | label=GradeLabel.INCORRECT,
290 | extracted_final_answer=None,
291 | reasoning=result["error"],
292 | correct_flag=False,
293 | confidence=None,
294 | raw_model_output=None,
295 | metadata={"voice_grading_error": result["error"]}
296 | )
297 | else:
298 | raise ValueError(f"Invalid predicted_answer type: {type(predicted_answer)}")
--------------------------------------------------------------------------------
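Editor's note: a sketch of calling the grader directly. The asr_processor module imported above is not part of this listing, so its credential handling is assumed: Azure Speech or OpenAI keys are either passed to the constructor or read from the environment by ASRProcessor and LLMAccuracyGrader. The audio path and response dict below are hypothetical.

from evaluation.grader.voice_grader import VoiceAccuracyGrader

grader = VoiceAccuracyGrader(asr_provider="azure", llm_deployment_name="gpt-4o")

# Grade a raw audio file produced by a voice model (hypothetical path).
result = grader.grade_voice_response(
    question="What is the capital of Australia?",
    ground_truth="Canberra",
    voice_response_path_or_data="outputs/simpleqa_demo_response.wav",
    expected_transcript="The capital of Australia is Canberra.",
    benchmark="simpleqa",
)
print(result.get("success"), result.get("transcript"), result.get("wer_metrics"))

# The same call also accepts a response dict; any location checked by
# _extract_audio_path_from_response works, e.g. audio_info.output_file.
result = grader.grade_voice_response(
    question="What is the capital of Australia?",
    ground_truth="Canberra",
    voice_response_path_or_data={"audio_info": {"output_file": "outputs/simpleqa_demo_response.wav"}},
)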
/evaluation/text/run_evaluation.py:
--------------------------------------------------------------------------------
1 | """
2 | General Text Model Evaluation Script for VERA Datasets
3 | Supports GPT-4o, GPT-5 Instant, GPT-5 Thinking with async processing by default
4 | """
5 |
6 | import os
7 | import sys
8 | import json
9 | import time
10 | import asyncio
11 | import argparse
12 | from pathlib import Path
13 | from typing import Dict, Any, List, Optional
14 | from datetime import datetime
15 | from dotenv import load_dotenv
16 | import yaml
17 |
18 | # Load environment variables from .env file
19 | load_dotenv()
20 |
21 | # Use explicit package imports for adapters present in this repository
22 | from models.text.gpt4o import GPT4oOpenAIBrowseAdapter
23 | from models.text.gpt5 import GPT5OpenAIBrowseAdapter
24 | from models.text.gemini25_pro import Gemini25ProBrowseAdapter
25 | from models.text.gemini25_flash import Gemini25FlashBrowseAdapter
26 |
27 |
28 | class TextModelEvaluator:
29 | """General evaluator that can work with different text model adapters"""
30 |
31 | def __init__(self):
32 | # Load canonical config.yaml and overlay with .env for secrets
33 | self.config = self._load_config()
34 | self.models = {
35 | 'gpt4o': self._create_gpt4o_adapter,
36 | 'gpt5-instant': self._create_gpt5_instant_adapter,
37 | 'gpt5-thinking': self._create_gpt5_thinking_adapter,
38 | 'gemini-2.5-pro': self._create_gemini_25_pro_adapter,
39 | 'gemini-2.5-flash': self._create_gemini_25_flash_adapter
40 | }
41 | self._current_dataset_name = None
42 |
43 | def set_dataset_context(self, dataset_name: str):
44 | self._current_dataset_name = dataset_name
45 |
46 | def _load_config(self) -> Dict[str, Any]:
47 | """Load config.yaml from project root; return empty dict if missing."""
48 | cfg_path = Path(__file__).parent.parent.parent / 'config.yaml'
49 | if not cfg_path.exists():
50 | return {}
51 | try:
52 | with open(cfg_path, 'r', encoding='utf-8') as f:
53 | return yaml.safe_load(f) or {}
54 | except Exception:
55 | return {}
56 |
57 | # --- Config helpers with .env overlay ---
58 | def _get_openai_api_key(self) -> Optional[str]:
59 | return os.getenv('OPENAI_API_KEY') or (self.config.get('api_keys', {}) or {}).get('openai_api_key')
60 |
61 | # Azure variants are not wired in this repository's adapters; OpenAI browse is used instead.
62 |
63 | def _get_gemini_api_key(self) -> Optional[str]:
64 | return os.getenv('GEMINI_API_KEY') or (self.config.get('api_keys', {}) or {}).get('gemini_api_key')
65 |
66 | def _create_gpt4o_adapter(self):
67 | """Create GPT-4o OpenAI browse adapter (used for all tracks)."""
68 | openai_key = self._get_openai_api_key()
69 | if not openai_key:
70 | raise ValueError("GPT-4o requires OPENAI_API_KEY for OpenAI browse adapter")
71 | return GPT4oOpenAIBrowseAdapter(api_key=openai_key)
72 |
73 | def _create_gpt5_instant_adapter(self):
74 | """Create GPT-5 Instant (OpenAI browse) with low reasoning effort."""
75 | openai_key = self._get_openai_api_key()
76 | if not openai_key:
77 | raise ValueError("GPT-5 Instant requires OPENAI_API_KEY for OpenAI browse adapter")
78 | return GPT5OpenAIBrowseAdapter(api_key=openai_key, reasoning_effort='low', reasoning_summary='auto')
79 |
80 | def _create_gpt5_thinking_adapter(self):
81 | """Create GPT-5 Thinking (OpenAI browse) with high reasoning effort."""
82 | openai_key = self._get_openai_api_key()
83 | if not openai_key:
84 | raise ValueError("GPT-5 Thinking requires OPENAI_API_KEY for OpenAI browse adapter")
85 | return GPT5OpenAIBrowseAdapter(api_key=openai_key, reasoning_effort='high', reasoning_summary='detailed')
86 |
87 | def _create_gemini_25_pro_adapter(self):
88 | """Create Gemini 2.5 Pro adapter with browse support"""
89 | api_key = self._get_gemini_api_key()
90 | if not api_key:
91 | raise ValueError("Gemini 2.5 Pro requires GEMINI_API_KEY environment variable")
92 | return Gemini25ProBrowseAdapter(api_key=api_key)
93 |
94 | def _create_gemini_25_flash_adapter(self):
95 | """Create Gemini 2.5 Flash adapter with browse support"""
96 | api_key = self._get_gemini_api_key()
97 | if not api_key:
98 | raise ValueError("Gemini 2.5 Flash requires GEMINI_API_KEY environment variable")
99 | return Gemini25FlashBrowseAdapter(api_key=api_key)
100 |
101 | def load_dataset(self, dataset_path: str) -> Dict[str, Any]:
102 | """Load a VERA dataset JSON file"""
103 | with open(dataset_path, 'r', encoding='utf-8') as f:
104 | return json.load(f)
105 |
106 | def create_output_dir(self, model_name: str, dataset_name: str) -> str:
107 | """Create timestamped output directory"""
108 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
109 | output_dir = f"test_output/{model_name}_{dataset_name}_{timestamp}"
110 | Path(output_dir).mkdir(parents=True, exist_ok=True)
111 | return output_dir
112 |
113 | def get_completed_episodes(self, output_dir: str) -> set:
114 | """Get set of episode IDs that have been completed in output directory"""
115 | completed = set()
116 | output_path = Path(output_dir)
117 |
118 | if not output_path.exists():
119 | return completed
120 |
121 | # Look for JSON result files
122 | for json_file in output_path.glob("*.json"):
123 | try:
124 |                 with open(json_file, 'r', encoding='utf-8') as f:
125 | data = json.load(f)
126 |
127 | # Check if it's an individual episode result
128 | if 'episode_id' in data:
129 | completed.add(data['episode_id'])
130 |
131 | # Check if it's a batch result with individual episodes
132 | elif 'results' in data:
133 | for result in data['results']:
134 | if isinstance(result, dict) and 'episode_id' in result:
135 | completed.add(result['episode_id'])
136 |
137 | except (json.JSONDecodeError, KeyError):
138 | continue
139 |
140 | return completed
141 |
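    # Recognized result-file shapes (keys mirror the checks above; values are placeholders):
    #   per-episode file: {"episode_id": "...", ...}
    #   batch file:       {"results": [{"episode_id": "..."}, ...], ...}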
142 | def filter_episodes_for_resume(self, episodes: List[Dict[str, Any]], output_dir: str) -> List[Dict[str, Any]]:
143 | """Filter episodes to skip already completed ones"""
144 | completed_ids = self.get_completed_episodes(output_dir)
145 |
146 | if not completed_ids:
147 | print("No completed episodes found, processing all episodes")
148 | return episodes
149 |
150 | print(f"Found {len(completed_ids)} completed episodes, skipping them")
151 |
152 | remaining_episodes = []
153 | for episode in episodes:
154 | episode_id = episode.get('id', '')
155 | if episode_id not in completed_ids:
156 | remaining_episodes.append(episode)
157 | else:
158 | print(f"Skipping completed episode: {episode_id}")
159 |
160 | print(f"Remaining episodes to process: {len(remaining_episodes)}/{len(episodes)}")
161 | return remaining_episodes
162 |
163 | async def run_evaluation(self, model_name: str, dataset_path: str,
164 | max_episodes: Optional[int] = None,
165 | max_concurrent: int = 16,
166 | resume_from: Optional[str] = None) -> Dict[str, Any]:
167 | """Run evaluation with async processing by default"""
168 | print(f"Loading dataset: {dataset_path}")
169 | dataset = self.load_dataset(dataset_path)
170 | episodes = dataset.get('episodes', [])
171 |
172 | dataset_name = Path(dataset_path).stem.replace('_voice_episodes', '')
173 |
174 | # Handle resume functionality
175 | if resume_from:
176 | if not Path(resume_from).exists():
177 | raise ValueError(f"Resume directory does not exist: {resume_from}")
178 |
179 | print(f"Resuming from: {resume_from}")
180 | output_dir = resume_from
181 |
182 | # Filter out already completed episodes
183 | episodes = self.filter_episodes_for_resume(episodes, output_dir)
184 |
185 | if not episodes:
186 | print("All episodes already completed!")
187 | return {'message': 'All episodes already completed', 'skipped': True}
188 |
189 | else:
190 | # Create new output directory
191 | output_dir = self.create_output_dir(model_name, dataset_name)
192 |
193 | if max_episodes:
194 | episodes = episodes[:max_episodes]
195 | print(f"Limited to {max_episodes} episodes")
196 |
197 | print(f"Creating model adapter: {model_name}")
198 | self.set_dataset_context(dataset_name)
199 | adapter = self.models[model_name]()
200 |
201 | print(f"Starting async evaluation with {len(episodes)} episodes")
202 | start_time = time.time()
203 |
204 |         # All adapters implement async batch processing and return a standardized batch result
205 | results = await adapter.process_episodes_batch(episodes, output_dir, max_concurrent)
206 |
207 | end_time = time.time()
208 | duration = end_time - start_time
209 |
210 |         # Derive basic counters from the standardized batch result,
211 |         # then build and save the evaluation summary below
212 | summary_counts = results.get('summary', {}) if isinstance(results, dict) else {}
213 | total_episodes = summary_counts.get('total_episodes', len(episodes))
214 | successful = summary_counts.get('successful_episodes', 0)
215 | failed = total_episodes - successful
216 |
217 | summary = {
218 | 'model': model_name,
219 | 'dataset': dataset_name,
220 | 'dataset_path': dataset_path,
221 | 'output_directory': output_dir,
222 | 'total_episodes': total_episodes,
223 | 'processed': total_episodes,
224 | 'successful': successful,
225 | 'failed': failed,
226 | 'duration_seconds': duration,
227 | 'episodes_per_second': total_episodes / duration if duration > 0 else 0,
228 | 'timestamp': datetime.now().isoformat(),
229 | 'max_concurrent': max_concurrent,
230 | 'async_processing': True
231 | }
232 |
233 | summary_path = Path(output_dir) / 'evaluation_summary.json'
234 | with open(summary_path, 'w', encoding='utf-8') as f:
235 | json.dump(summary, f, indent=2)
236 |
237 |     print("\nEvaluation completed!")
238 | print(f"Model: {model_name}")
239 | print(f"Dataset: {dataset_name}")
240 | print(f"Episodes: {summary['successful']}/{summary['total_episodes']} successful")
241 | print(f"Duration: {duration:.2f}s ({summary['episodes_per_second']:.2f} episodes/sec)")
242 | print(f"Output: {output_dir}")
243 |
244 | return summary
245 |
246 |
247 | def main():
248 | parser = argparse.ArgumentParser(description='Evaluate text models on VERA datasets')
249 | parser.add_argument('model', choices=['gpt4o', 'gpt5-instant', 'gpt5-thinking', 'gemini-2.5-pro', 'gemini-2.5-flash'],
250 | help='Text model to evaluate')
251 | parser.add_argument('dataset', help='Path to dataset JSON file')
252 | parser.add_argument('--max-episodes', type=int, help='Maximum number of episodes to process')
253 | parser.add_argument('--max-concurrent', type=int, default=16,
254 | help='Maximum concurrent requests')
255 |
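    # Example invocation (dataset path shown is illustrative):
    #   uv run python evaluation/text/run_evaluation.py gpt5-thinking \
    #       data/final_dataset/text/simpleqa_voice_episodes.json --max-episodes 5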
256 | args = parser.parse_args()
257 |
258 | # Validate dataset path
259 | if not Path(args.dataset).exists():
260 | print(f"Error: Dataset file not found: {args.dataset}")
261 | return 1
262 |
263 | evaluator = TextModelEvaluator()
264 |
265 | try:
266 | summary = asyncio.run(evaluator.run_evaluation(
267 | args.model, args.dataset, args.max_episodes, args.max_concurrent
268 | ))
269 | return 0
270 |
271 | except Exception as e:
272 | print(f"Error during evaluation: {e}")
273 | import traceback
274 | traceback.print_exc()
275 | return 1
276 |
277 |
278 | if __name__ == "__main__":
279 |     raise SystemExit(main())
280 |
--------------------------------------------------------------------------------
/evaluation/grader/run_grader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | CLI for grading accuracy of model responses.
4 |
5 | Usage:
6 | uv run python evaluation/grader/run_grader.py single \
7 | --question "..." --ground-truth "..." --pred "..." [--benchmark simpleqa]
8 |
9 | uv run python evaluation/grader/run_grader.py batch \
10 | --dataset data/final_dataset/text/simpleqa_voice_episodes.json \
11 | --results test_output/gpt4o_simpleqa_*/gpt4o_openai_browse_batch_*.json \
12 | [--benchmark simpleqa]
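
    uv run python evaluation/grader/run_grader.py latest \
    [--models gpt4o gpt5-thinking ...] [--benchmarks simpleqa mrcr ...] \
    [--out-dir OUT_DIR] [--max-concurrent 16]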
13 | """
14 |
15 | from __future__ import annotations
16 |
17 | import argparse
18 | import glob
19 | import json
20 | from pathlib import Path
21 | import asyncio
22 | from typing import Dict, Any, List, Optional
23 |
24 | from evaluation.grader.base import GradeLabel
25 | from evaluation.grader.llm_grader import LLMAccuracyGrader
26 |
27 |
28 | def _load_dataset_questions(dataset_path: str) -> Dict[str, Dict[str, str]]:
29 | """Map episode_id -> {question, ground_truth} from dataset."""
30 | with open(dataset_path, "r", encoding="utf-8") as f:
31 | data = json.load(f)
32 | mapping: Dict[str, Dict[str, str]] = {}
33 | for ep in data.get("episodes", []):
34 | eid = ep.get("id")
35 | turns = ep.get("turns", [])
36 | q = ""
37 | if turns:
38 | q = turns[0].get("text_content", "")
39 | # expected can be under turn.metadata or ep.metadata
40 | target = (
41 | (turns[0].get("metadata", {}) or {}).get("expected_answer")
42 | or ep.get("metadata", {}).get("expected_answer")
43 | or ""
44 | )
45 | if eid:
46 | mapping[eid] = {"question": q, "ground_truth": target}
47 | return mapping
48 |
49 |
50 | def _load_results(results_glob: str) -> List[Dict[str, Any]]:
51 | files = sorted(glob.glob(results_glob))
52 | episodes: List[Dict[str, Any]] = []
53 | for fp in files:
54 | with open(fp, "r", encoding="utf-8") as f:
55 | data = json.load(f)
56 | # Standardized batch format stores per-episode in 'episodes' or directly as list
57 | eps = data.get("episodes") or data.get("results") or []
58 | if isinstance(eps, list) and eps:
59 | episodes.extend(eps)
60 | else:
61 | # Some adapters save single-episode results
62 | if "episode_id" in data and "turn_results" in data:
63 | episodes.append(data)
64 | return episodes
65 |
66 |
67 | def _extract_predicted_answer(episode_result: Dict[str, Any]) -> Optional[str]:
68 | """Extract the assistant response text from per-episode result.
69 |
70 | Supports both legacy fields (turn_results/model_response) and
71 | standardized fields (turns/response).
72 | """
73 | # Legacy shape
74 | turns = episode_result.get("turn_results")
75 | if isinstance(turns, list) and turns:
76 | return turns[-1].get("model_response")
77 |
78 | # Standardized shape
79 | turns = episode_result.get("turns")
80 | if isinstance(turns, list) and turns:
81 | return turns[-1].get("response")
82 |
83 | return None
84 |
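# Illustrative result shapes handled above (field names mirror the two branches; values are placeholders):
#   legacy:       {"episode_id": "...", "turn_results": [{"model_response": "<answer text>"}]}
#   standardized: {"episode_id": "...", "turns": [{"response": "<answer text>"}]}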
85 |
86 | def _summarize_counts(labels: List[GradeLabel]) -> Dict[str, Any]:
87 | total = len(labels)
88 |     c = sum(1 for lbl in labels if lbl == GradeLabel.CORRECT)
89 |     i = sum(1 for lbl in labels if lbl == GradeLabel.INCORRECT)
90 |     n = sum(1 for lbl in labels if lbl == GradeLabel.NOT_ATTEMPTED)
91 | acc = (c / total) if total else 0.0
92 | return {"total": total, "correct": c, "incorrect": i, "not_attempted": n, "accuracy": acc}
93 |
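# e.g. four graded episodes with labels [CORRECT, CORRECT, INCORRECT, NOT_ATTEMPTED] summarize to
# {"total": 4, "correct": 2, "incorrect": 1, "not_attempted": 1, "accuracy": 0.5}.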
94 |
95 | def main():
96 | # Try loading .env if available for Azure creds
97 | try:
98 | from dotenv import load_dotenv # type: ignore
99 | load_dotenv()
100 | except Exception:
101 | pass
102 | parser = argparse.ArgumentParser(description="Accuracy grader")
103 | sub = parser.add_subparsers(dest="cmd", required=True)
104 |
105 | p_single = sub.add_parser("single", help="Grade a single triplet")
106 | p_single.add_argument("--question", required=False, default="")
107 | p_single.add_argument("--ground-truth", required=False, dest="ground_truth")
108 | p_single.add_argument("--pred", required=True)
109 | p_single.add_argument("--benchmark", required=False)
110 | # LLM-only, triad mode
111 |
112 | p_batch = sub.add_parser("batch", help="Grade a batch of results vs. a dataset")
113 | p_batch.add_argument("--dataset", required=True)
114 | p_batch.add_argument("--results", required=True, help="Glob to batch result JSON(s)")
115 | p_batch.add_argument("--benchmark", required=False)
116 | p_batch.add_argument("--out", required=False, help="Optional path to write detailed grades JSON")
117 | p_batch.add_argument("--max-concurrent", type=int, default=16, help="Max concurrent grading requests")
118 |
119 | p_latest = sub.add_parser("latest", help="Auto-find latest results per model/benchmark under test_output and grade them")
120 | p_latest.add_argument("--models", nargs="*", default=["gpt4o", "gpt5-instant", "gpt5-thinking", "gemini-2.5-pro", "gemini-2.5-flash"], help="Models to include")
121 | p_latest.add_argument(
122 | "--benchmarks",
123 | nargs="*",
124 | default=["aime", "browsecomp", "gpqa_diamond", "mrcr", "simpleqa"],
125 | help="Benchmarks/datasets to include",
126 | )
127 | p_latest.add_argument("--out-dir", default="", help="Optional directory to also write an aggregate summary")
128 | p_latest.add_argument("--max-concurrent", type=int, default=16, help="Max concurrent grading requests")
129 |
130 | args = parser.parse_args()
131 |
132 | grader = LLMAccuracyGrader()
133 |
134 | if args.cmd == "single":
135 | if not args.ground_truth:
136 | parser.error("--ground-truth is required")
137 | res = grader.grade(
138 | question=args.question,
139 | ground_truth=args.ground_truth,
140 | predicted_answer=args.pred,
141 | benchmark=args.benchmark,
142 | )
143 | print(json.dumps({
144 | "label": res.label,
145 | "question": args.question,
146 | "ground_truth": args.ground_truth,
147 | "extracted_final_answer": res.extracted_final_answer,
148 | "confidence": res.confidence,
149 | "reasoning": res.reasoning,
150 | }, default=str, indent=2))
151 | return 0
152 |
153 | if args.cmd == "batch":
154 | # batch
155 | ep_map = _load_dataset_questions(args.dataset)
156 | results = _load_results(args.results)
157 |
158 | async def _grade_all():
159 | sem = asyncio.Semaphore(max(1, args.max_concurrent))
160 | detailed_local = []
161 | labels_local: List[GradeLabel] = []
162 |
163 | async def _one(ep: Dict[str, Any]):
164 | eid = ep.get("episode_id")
165 | if not eid or eid not in ep_map:
166 | return None
167 | qa = ep_map[eid]
168 | pred = _extract_predicted_answer(ep) or ""
169 | async with sem:
170 | gres = await grader.grade_async(
171 | question=qa["question"],
172 | ground_truth=qa["ground_truth"],
173 | predicted_answer=pred,
174 | benchmark=args.benchmark,
175 | )
176 | labels_local.append(gres.label)
177 | detailed_local.append({
178 | "episode_id": eid,
179 | "question": qa["question"],
180 | "ground_truth": qa["ground_truth"],
181 | "predicted_answer": pred,
182 | "label": gres.label,
183 | "confidence": gres.confidence,
184 | "extracted_final_answer": gres.extracted_final_answer,
185 | })
186 |
187 | tasks = [
188 | _one(ep) for ep in results
189 | ]
190 | await asyncio.gather(*tasks)
191 | return labels_local, detailed_local
192 |
193 | labels, detailed = asyncio.run(_grade_all())
194 |
195 | summary = _summarize_counts(labels)
196 | out = {
197 | "summary": summary,
198 | "grades": detailed,
199 | }
200 |
201 | print(json.dumps(out, indent=2))
202 | if args.out:
203 | Path(args.out).parent.mkdir(parents=True, exist_ok=True)
204 | with open(args.out, "w", encoding="utf-8") as f:
205 | json.dump(out, f, indent=2)
206 | return 0
207 |
208 | if args.cmd == "latest":
209 | # Build dataset path resolver
210 | dataset_dir = Path(__file__).parent.parent.parent / 'data' / 'final_dataset' / 'text'
211 | def dataset_path(ds: str) -> Path:
212 | return dataset_dir / f"{ds}_voice_episodes.json"
213 |
214 | # Map model -> batch filename pattern within the run folder (new standardized adapters)
215 | batch_prefix = {
216 | 'gpt4o': 'gpt4o_openai_browse_batch_',
217 | 'gpt5-instant': 'gpt5_openai_browse_batch_',
218 | 'gpt5-thinking': 'gpt5_openai_browse_batch_',
219 | 'gemini-2.5-pro': 'gemini_25_pro_browse_batch_',
220 | 'gemini-2.5-flash': 'gemini_25_flash_browse_batch_',
221 | }
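        # Example of the layout this resolver scans (run-dir name comes from create_output_dir
        # in run_evaluation.py; the timestamp and batch-file suffix here are placeholders):
        #   test_output/gpt4o_simpleqa_20250101_120000/gpt4o_openai_browse_batch_<...>.json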
222 |
223 | base = Path('test_output')
224 | # Also check text_output for Gemini results
225 | text_output_base = Path('text_output')
226 | out_dir = Path(args.out_dir) if args.out_dir else None
227 | if out_dir:
228 | out_dir.mkdir(parents=True, exist_ok=True)
229 |
230 | overall = []
231 | summary_rows = []
232 |
233 | for model in args.models:
234 | for ds in args.benchmarks:
235 | run_dirs = sorted(base.glob(f"{model}_{ds}_*"))
236 |
237 | # For Gemini models, also check text_output structure
238 | if model in ["gemini-2.5-pro", "gemini-2.5-flash"] and text_output_base.exists():
239 | gemini_folder = "gemini_2.5_pro" if model == "gemini-2.5-pro" else "gemini_2.5_flash"
240 | gemini_dir = text_output_base / gemini_folder
241 | if gemini_dir.exists():
242 | # Look for dataset subdirectories
243 | gemini_ds_dirs = sorted(gemini_dir.glob(f"*{ds}*"))
244 | run_dirs.extend(gemini_ds_dirs)
245 |
246 | if not run_dirs:
247 | continue
248 | # pick most recent directory that actually contains results
249 | latest_dir = None
250 | for cand in reversed(run_dirs):
251 | # any batch or per-episode results inside?
252 | prefix = batch_prefix.get(model)
253 | # Backward-compatible per-episode prefix patterns (legacy + current)
254 | per_prefix = {
255 | 'gpt4o': None,
256 | 'gpt5-instant': None,
257 | 'gpt5-thinking': None,
258 | 'gemini-2.5-pro': 'gemini_25_pro_browse_',
259 | 'gemini-2.5-flash': 'gemini_25_flash_browse_',
260 | }.get(model)
261 |                     if list(cand.glob(f"{prefix}*.json")) or (per_prefix and list(cand.glob(f"{per_prefix}*.json"))):
262 | latest_dir = cand
263 | break
264 | if latest_dir is None:
265 | continue
266 | # find batch file
267 | prefix = batch_prefix.get(model)
268 | if not prefix:
269 | continue
270 | batch_files = sorted(latest_dir.glob(f"{prefix}*.json"))
271 | results_glob: Optional[str] = None
272 | if batch_files:
273 | batch_file = str(batch_files[-1])
274 | results_glob = batch_file
275 | else:
276 | # Fallback to per-episode results if no batch file is present
277 | per_prefix = {
278 | 'gpt4o': None,
279 | 'gpt5-instant': None,
280 | 'gpt5-thinking': None,
281 | 'gemini-2.5-pro': 'gemini_25_pro_browse_',
282 | 'gemini-2.5-flash': 'gemini_25_flash_browse_',
283 | }.get(model)
284 | if per_prefix:
285 | per_files = sorted(latest_dir.glob(f"{per_prefix}*.json"))
286 | if per_files:
287 | results_glob = str(latest_dir / f"{per_prefix}*.json")
288 | if not results_glob:
289 | continue
290 |
291 | ds_path = dataset_path(ds)
292 | if not ds_path.exists():
293 | continue
294 |
295 | # Reuse batch grading pipeline
296 | ep_map = _load_dataset_questions(str(ds_path))
297 | results = _load_results(results_glob)
298 |
299 | async def _grade_all_latest():
300 | sem = asyncio.Semaphore(max(1, args.max_concurrent))
301 | detailed_local = []
302 | labels_local: List[GradeLabel] = []
303 |
304 | async def _one(ep: Dict[str, Any]):
305 | eid = ep.get("episode_id")
306 | if not eid or eid not in ep_map:
307 | return None
308 | qa = ep_map[eid]
309 | pred = _extract_predicted_answer(ep) or ""
310 | async with sem:
311 | gres = await grader.grade_async(
312 | question=qa["question"],
313 | ground_truth=qa["ground_truth"],
314 | predicted_answer=pred,
315 | benchmark=ds,
316 | )
317 | labels_local.append(gres.label)
318 | detailed_local.append({
319 | "episode_id": eid,
320 | "question": qa["question"],
321 | "ground_truth": qa["ground_truth"],
322 | "predicted_answer": pred,
323 | "label": gres.label,
324 | "confidence": gres.confidence,
325 | "extracted_final_answer": gres.extracted_final_answer,
326 | "model": model,
327 | "dataset": ds,
328 | "results_file": results_glob,
329 | })
330 |
331 |                 await asyncio.gather(*[_one(ep) for ep in results])
332 | return labels_local, detailed_local
333 |
334 | labels, detailed = asyncio.run(_grade_all_latest())
335 | overall.extend(detailed)
336 | summary = _summarize_counts(labels)
337 | results_file_for_summary = results_glob or ""
338 | summary_rows.append({
339 | "model": model,
340 | "dataset": ds,
341 | **summary,
342 | "results_dir": str(latest_dir),
343 | "results_file": results_file_for_summary,
344 | })
345 |
346 | # write per-pair file into the corresponding run folder
347 | pair_out_inplace = Path(latest_dir) / "llm_grades.json"
348 | with open(pair_out_inplace, 'w', encoding='utf-8') as f:
349 | json.dump({"summary": summary, "grades": detailed}, f, indent=2)
350 |
351 | # optionally also write to central out-dir if provided
352 | if out_dir:
353 | pair_out = out_dir / f"{model}_{ds}_grades_llm.json"
354 | with open(pair_out, 'w', encoding='utf-8') as f:
355 | json.dump({"summary": summary, "grades": detailed}, f, indent=2)
356 |
357 | # write aggregate
358 | agg = {
359 | "pairs": summary_rows,
360 | "total_pairs": len(summary_rows),
361 | }
362 | print(json.dumps(agg, indent=2))
363 | if out_dir:
364 | with open(out_dir / "summary_latest_grades.json", 'w', encoding='utf-8') as f:
365 | json.dump(agg, f, indent=2)
366 | return 0
367 |
368 |
369 | if __name__ == "__main__":
370 | raise SystemExit(main())
371 |
--------------------------------------------------------------------------------
/tests/test_models.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Test module for VERA model adapters
4 | Tests basic functionality of each model to ensure they work correctly
5 | """
6 |
7 | import os
8 | import sys
9 | import json
10 | import tempfile
11 | from pathlib import Path
12 | from unittest.mock import Mock, patch, MagicMock
13 |
14 | # Try to import pytest, but make it optional
15 | try:
16 | import pytest
17 | PYTEST_AVAILABLE = True
18 | except ImportError:
19 | PYTEST_AVAILABLE = False
20 | # Define minimal pytest decorators for standalone mode
21 | class pytest:
22 | class fixture:
23 | def __init__(self, *args, **kwargs):
24 | pass
25 | def __call__(self, func):
26 | return func
27 | fixture = fixture()
28 | @staticmethod
29 | def skip(msg):
30 | pass
31 |
32 | # Add project root to path
33 | project_root = Path(__file__).parent.parent
34 | sys.path.insert(0, str(project_root))
35 |
36 | from models.shared.base_adapter import ModelConfig, BaseAdapter, TextAdapter, VoiceAdapter, RealtimeAdapter
37 |
38 |
39 | # ============================================================================
40 | # Test Fixtures
41 | # ============================================================================
42 |
43 | @pytest.fixture
44 | def sample_episode():
45 | """Sample episode data for testing"""
46 | return {
47 | "id": "test_episode_001",
48 | "track": "standard",
49 | "turns": [
50 | {
51 | "role": "user",
52 | "text_content": "What is 2+2?",
53 | "audio_file": None
54 | }
55 | ],
56 | "context_documents": []
57 | }
58 |
59 |
60 | @pytest.fixture
61 | def sample_mrcr_episode():
62 | """Sample MRCR episode with context"""
63 | return {
64 | "id": "test_mrcr_001",
65 | "track": "long_context",
66 | "turns": [
67 | {
68 | "role": "user",
69 | "text_content": "What was discussed earlier?",
70 | "audio_file": None
71 | }
72 | ],
73 | "context_documents": [
74 | {
75 | "content": "User: Hello\nAssistant: Hi there!\nUser: What's the weather?\nAssistant: It's sunny today."
76 | }
77 | ]
78 | }
79 |
80 |
81 | @pytest.fixture
82 | def temp_output_dir():
83 | """Temporary output directory"""
84 | with tempfile.TemporaryDirectory() as tmpdir:
85 | yield tmpdir
86 |
87 |
88 | # ============================================================================
89 | # Test Base Classes
90 | # ============================================================================
91 |
92 | class TestBaseAdapter:
93 | """Test base adapter functionality"""
94 |
95 | def test_model_config_creation(self):
96 | """Test ModelConfig dataclass"""
97 | config = ModelConfig(model_name="test-model")
98 | assert config.model_name == "test-model"
99 | assert config.temperature == 0.0
100 | assert config.max_tokens == 4096
101 | assert config.timeout == 300.0
102 | assert config.max_concurrent == 16
103 |
104 | def test_base_adapter_initialization(self):
105 | """Test BaseAdapter initialization"""
106 | config = ModelConfig(model_name="test-model")
107 |
108 | # Create concrete implementation for testing
109 | class TestAdapter(BaseAdapter):
110 | def process_episode(self, episode, output_dir):
111 | return {"episode_id": episode["id"], "success": True}
112 |
113 | adapter = TestAdapter(config)
114 | assert adapter.config == config
115 | assert adapter.model_name == "test-model"
116 |
117 |
118 | # ============================================================================
119 | # Test Text Models
120 | # ============================================================================
121 |
122 | class TestGPT4oAdapter:
123 | """Test GPT-4o text adapter"""
124 |
125 | @patch('httpx.Client')
126 | def test_adapter_initialization(self, mock_client):
127 | """Test GPT-4o adapter can be initialized"""
128 | from models.text.gpt4o import GPT4oOpenAIBrowseAdapter
129 |
130 | adapter = GPT4oOpenAIBrowseAdapter(api_key="test-key")
131 | assert adapter.model_name == "gpt-4o"
132 | assert adapter.api_key == "test-key"
133 |
134 | @patch('httpx.Client')
135 | def test_prepare_prompt(self, mock_client):
136 | """Test prompt preparation"""
137 | from models.text.gpt4o import GPT4oOpenAIBrowseAdapter
138 |
139 | adapter = GPT4oOpenAIBrowseAdapter(api_key="test-key")
140 |
141 | episode = {
142 | "id": "test_001",
143 | "turns": [
144 | {"role": "user", "text_content": "Hello"}
145 | ],
146 | "context_documents": []
147 | }
148 |
149 | turn = episode["turns"][0]
150 | prompt = adapter._prepare_prompt(turn, episode, 0)
151 |
152 | assert "Hello" in prompt
153 |
154 | @patch('httpx.Client')
155 | def test_make_api_request_simple_message(self, mock_client):
156 | """Test API request with simple message"""
157 | from models.text.gpt4o import GPT4oOpenAIBrowseAdapter
158 |
159 | # Mock the API response
160 | mock_response = Mock()
161 | mock_response.json.return_value = {
162 | "output": {"content": "Test response"},
163 | "usage": {"total_tokens": 10}
164 | }
165 | mock_response.raise_for_status = Mock()
166 |
167 | mock_client_instance = Mock()
168 | mock_client_instance.post.return_value = mock_response
169 | mock_client.return_value.__enter__.return_value = mock_client_instance
170 |
171 | adapter = GPT4oOpenAIBrowseAdapter(api_key="test-key")
172 |
173 | messages = [{"role": "user", "content": "Hello"}]
174 | response = adapter._make_api_request(messages)
175 |
176 | assert response == "Test response"
177 |
178 |
179 | class TestGemini25ProAdapter:
180 | """Test Gemini 2.5 Pro adapter"""
181 |
182 | @patch.dict(os.environ, {"GOOGLE_API_KEY": "test-key"})
183 | def test_adapter_can_be_imported(self):
184 | """Test that Gemini adapter can be imported"""
185 | try:
186 | from models.text.gemini25_pro import Gemini25ProAdapter
187 | assert True
188 | except ImportError as e:
189 | pytest.skip(f"Gemini dependencies not available: {e}")
190 |
191 |
192 | class TestGPT5Adapter:
193 | """Test GPT-5 adapter"""
194 |
195 | def test_adapter_can_be_imported(self):
196 | """Test that GPT-5 adapter can be imported"""
197 | try:
198 | from models.text.gpt5 import GPT5Adapter
199 | assert True
200 | except ImportError as e:
201 | pytest.skip(f"GPT-5 dependencies not available: {e}")
202 |
203 |
204 | # ============================================================================
205 | # Test Voice Models
206 | # ============================================================================
207 |
208 | class TestQwen2AudioAdapter:
209 | """Test Qwen2-Audio voice adapter"""
210 |
211 | def test_adapter_can_be_imported(self):
212 | """Test that Qwen2-Audio adapter can be imported"""
213 | try:
214 | from models.voice.qwen2_audio import Qwen2AudioAdaptiveEvaluator, EvaluationConfig
215 |
216 | config = EvaluationConfig()
217 | assert config.model_name == "Qwen/Qwen2-Audio-7B-Instruct"
218 | assert config.temperature == 0.7
219 | except ImportError as e:
220 | pytest.skip(f"Qwen2-Audio dependencies not available: {e}")
221 |
222 | def test_task_type_detection(self):
223 | """Test task type detection logic"""
224 | try:
225 | from models.voice.qwen2_audio import Qwen2AudioAdaptiveEvaluator, EvaluationConfig
226 | except ImportError:
227 | pytest.skip("Qwen2-Audio dependencies not available")
228 |
229 | config = EvaluationConfig()
230 |
231 | # Mock the LLM initialization to avoid loading the model
232 | with patch('models.voice.qwen2_audio.LLM'):
233 | evaluator = Qwen2AudioAdaptiveEvaluator(config)
234 |
235 | # Test MRCR detection
236 | mrcr_episode = {
237 | "id": "test_mrcr_001",
238 | "track": "long_context",
239 | "context_documents": [{"content": "test"}]
240 | }
241 | assert evaluator.detect_task_type(mrcr_episode) == "mrcr"
242 |
243 | # Test standard detection
244 | standard_episode = {
245 | "id": "test_standard_001",
246 | "track": "standard",
247 | "context_documents": []
248 | }
249 | assert evaluator.detect_task_type(standard_episode) == "standard"
250 |
251 |
252 | class TestUltravoxAdapter:
253 | """Test Ultravox voice adapter"""
254 |
255 | def test_adapter_can_be_imported(self):
256 | """Test that Ultravox adapter can be imported"""
257 | try:
258 | from models.voice.ultravox import UltravoxAdapter
259 | assert True
260 | except ImportError as e:
261 | pytest.skip(f"Ultravox dependencies not available: {e}")
262 |
263 |
264 | # ============================================================================
265 | # Test Realtime Models
266 | # ============================================================================
267 |
268 | class TestGPTRealtimeAdapter:
269 | """Test GPT Realtime adapter"""
270 |
271 | def test_module_can_be_imported(self):
272 | """Test that GPT Realtime module can be imported"""
273 | try:
274 | from models.realtime import gpt_realtime
275 | assert hasattr(gpt_realtime, 'main')
276 | assert hasattr(gpt_realtime, 'parse_mrcr_context')
277 | except ImportError as e:
278 | pytest.skip(f"GPT Realtime dependencies not available: {e}")
279 |
280 | def test_parse_mrcr_context(self):
281 | """Test MRCR context parsing"""
282 | try:
283 | from models.realtime.gpt_realtime import parse_mrcr_context
284 | except ImportError:
285 | pytest.skip("GPT Realtime dependencies not available")
286 |
287 | context = "User: Hello\nAssistant: Hi there!\nUser: How are you?\nAssistant: I'm doing well!"
288 | messages = parse_mrcr_context(context)
289 |
290 | assert len(messages) == 4
291 | assert messages[0]["role"] == "user"
292 | assert messages[0]["content"] == "Hello"
293 | assert messages[1]["role"] == "assistant"
294 | assert messages[1]["content"] == "Hi there!"
295 |
296 |
297 | class TestGeminiRealtimeAdapter:
298 | """Test Gemini Realtime adapter"""
299 |
300 | def test_adapter_can_be_imported(self):
301 | """Test that Gemini Realtime adapter can be imported"""
302 | try:
303 | from models.realtime import gemini
304 | assert True
305 | except ImportError as e:
306 | pytest.skip(f"Gemini Realtime dependencies not available: {e}")
307 |
308 |
309 | class TestMoshiAdapter:
310 | """Test Moshi adapter"""
311 |
312 | def test_adapter_can_be_imported(self):
313 | """Test that Moshi adapter can be imported"""
314 | try:
315 | from models.realtime import moshi
316 | assert True
317 | except ImportError as e:
318 | pytest.skip(f"Moshi dependencies not available: {e}")
319 |
320 |
321 | # ============================================================================
322 | # Integration Tests
323 | # ============================================================================
324 |
325 | class TestModelIntegration:
326 | """Integration tests for model adapters"""
327 |
328 | @patch('httpx.Client')
329 | def test_text_model_episode_processing(self, mock_client, sample_episode, temp_output_dir):
330 | """Test that a text model can process an episode"""
331 | from models.text.gpt4o import GPT4oOpenAIBrowseAdapter
332 |
333 | # Mock successful API response
334 | mock_response = Mock()
335 | mock_response.json.return_value = {
336 | "id": "test-response",
337 | "output": {"content": "4"},
338 | "usage": {
339 | "total_tokens": 10,
340 | "prompt_tokens": 5,
341 | "completion_tokens": 5
342 | }
343 | }
344 | mock_response.raise_for_status = Mock()
345 |
346 | mock_client_instance = Mock()
347 | mock_client_instance.post.return_value = mock_response
348 | mock_client.return_value.__enter__.return_value = mock_client_instance
349 |
350 | adapter = GPT4oOpenAIBrowseAdapter(api_key="test-key")
351 |
352 | result = adapter.process_episode(sample_episode, temp_output_dir)
353 |
354 | assert "episode_id" in result
355 | assert result["episode_id"] == "test_episode_001"
356 | assert "turn_results" in result
357 |
358 | def test_model_config_variations(self):
359 | """Test different model configurations"""
360 | configs = [
361 | ModelConfig(model_name="test-1", temperature=0.0),
362 | ModelConfig(model_name="test-2", temperature=0.7, max_tokens=2048),
363 | ModelConfig(model_name="test-3", max_concurrent=8)
364 | ]
365 |
366 | for config in configs:
367 | assert config.model_name.startswith("test-")
368 | assert 0.0 <= config.temperature <= 1.0
369 | assert config.max_tokens > 0
370 |
371 |
372 | # ============================================================================
373 | # Utility Tests
374 | # ============================================================================
375 |
376 | class TestTimingUtils:
377 | """Test timing utilities"""
378 |
379 | def test_timing_utils_can_be_imported(self):
380 | """Test that timing utilities can be imported"""
381 | try:
382 | from models.shared.timing_utils import (
383 | create_turn_result,
384 | create_standardized_episode_result,
385 | create_standardized_batch_result
386 | )
387 | assert True
388 | except ImportError as e:
389 | pytest.skip(f"Timing utilities not available: {e}")
390 |
391 |
392 | # ============================================================================
393 | # Main Test Runner
394 | # ============================================================================
395 |
396 | def run_smoke_tests():
397 | """Run basic smoke tests without pytest"""
398 | print("=" * 70)
399 | print("VERA Model Smoke Tests")
400 | print("=" * 70)
401 |
402 | passed = 0
403 | failed = 0
404 | skipped = 0
405 |
406 | # Test 1: Import base classes
407 | print("\n[1/8] Testing base classes...")
408 | try:
409 | from models.shared.base_adapter import ModelConfig, BaseAdapter
410 | config = ModelConfig(model_name="test")
411 | print("✓ Base classes work")
412 | passed += 1
413 | except Exception as e:
414 | print(f"✗ Base classes failed: {e}")
415 | failed += 1
416 |
417 | # Test 2: Import text models
418 | print("\n[2/8] Testing text model imports...")
419 | try:
420 | from models.text.gpt4o import GPT4oOpenAIBrowseAdapter
421 | print("✓ Text models can be imported")
422 | passed += 1
423 | except Exception as e:
424 | print(f"✗ Text models failed: {e}")
425 | failed += 1
426 |
427 | # Test 3: Import voice models
428 | print("\n[3/8] Testing voice model imports...")
429 | try:
430 | from models.voice.qwen2_audio import EvaluationConfig
431 | print("✓ Voice models can be imported")
432 | passed += 1
433 | except Exception as e:
434 | print(f"⊘ Voice models skipped: {e}")
435 | skipped += 1
436 |
437 | # Test 4: Import realtime models
438 | print("\n[4/8] Testing realtime model imports...")
439 | try:
440 | from models.realtime import gpt_realtime
441 | print("✓ Realtime models can be imported")
442 | passed += 1
443 | except Exception as e:
444 | print(f"⊘ Realtime models skipped: {e}")
445 | skipped += 1
446 |
447 | # Test 5: Test ModelConfig
448 | print("\n[5/8] Testing ModelConfig...")
449 | try:
450 | config = ModelConfig(
451 | model_name="test-model",
452 | temperature=0.5,
453 | max_tokens=2048
454 | )
455 | assert config.model_name == "test-model"
456 | assert config.temperature == 0.5
457 | print("✓ ModelConfig works")
458 | passed += 1
459 | except Exception as e:
460 | print(f"✗ ModelConfig failed: {e}")
461 | failed += 1
462 |
463 | # Test 6: Test timing utilities
464 | print("\n[6/8] Testing timing utilities...")
465 | try:
466 | from models.shared.timing_utils import create_turn_result
467 | print("✓ Timing utilities can be imported")
468 | passed += 1
469 | except Exception as e:
470 | print(f"⊘ Timing utilities skipped: {e}")
471 | skipped += 1
472 |
473 | # Test 7: Test GPT-4o adapter initialization
474 | print("\n[7/8] Testing GPT-4o adapter initialization...")
475 | try:
476 | from models.text.gpt4o import GPT4oOpenAIBrowseAdapter
477 | adapter = GPT4oOpenAIBrowseAdapter(api_key="test-key")
478 | assert adapter.model_name == "gpt-4o"
479 | print("✓ GPT-4o adapter initializes")
480 | passed += 1
481 | except Exception as e:
482 | print(f"✗ GPT-4o adapter failed: {e}")
483 | failed += 1
484 |
485 | # Test 8: Test MRCR context parsing
486 | print("\n[8/8] Testing MRCR context parsing...")
487 | try:
488 | from models.realtime.gpt_realtime import parse_mrcr_context
489 | context = "User: Hello\nAssistant: Hi!"
490 | messages = parse_mrcr_context(context)
491 | assert len(messages) == 2
492 | print("✓ MRCR parsing works")
493 | passed += 1
494 | except Exception as e:
495 | print(f"⊘ MRCR parsing skipped: {e}")
496 | skipped += 1
497 |
498 | # Summary
499 | print("\n" + "=" * 70)
500 | print("Summary")
501 | print("=" * 70)
502 | print(f"✓ Passed: {passed}")
503 | print(f"✗ Failed: {failed}")
504 | print(f"⊘ Skipped: {skipped}")
505 | print(f"Total: {passed + failed + skipped}")
506 |
507 | if failed == 0:
508 | print("\n✓ All required tests passed!")
509 | return 0
510 | else:
511 | print(f"\n✗ {failed} test(s) failed")
512 | return 1
513 |
514 |
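# Typical invocations (illustrative):
#   python tests/test_models.py        # standalone smoke tests, no pytest required
#   pytest tests/test_models.py -v     # run the full test classes under pytest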
515 | if __name__ == "__main__":
516 | # If run directly, execute smoke tests
517 | # If run with pytest, pytest will discover and run the test classes
518 |     if len(sys.argv) == 1 or not PYTEST_AVAILABLE:
519 |         # No CLI arguments (or pytest missing): run the standalone smoke tests.
520 |         sys.exit(run_smoke_tests())
521 |     else:
522 |         # PYTEST_AVAILABLE is True here by the condition above; hand the remaining
523 |         # arguments to pytest and propagate its exit status.
524 |         sys.exit(pytest.main([__file__] + sys.argv[1:]))
527 |
--------------------------------------------------------------------------------