├── models ├── __init__.py ├── text │ ├── __init__.py │ ├── gpt5.py │ └── gpt4o.py ├── voice │ └── __init__.py ├── realtime │ ├── __init__.py │ ├── liveanswer │ │ ├── utils.py │ │ ├── main.py │ │ ├── mrcr_context.py │ │ ├── stt_service.py │ │ ├── audio_to_answer.py │ │ ├── audio.py │ │ └── explain.py │ ├── freeze_omni.py │ └── moshi.py └── shared │ ├── __init__.py │ ├── base_adapter.py │ └── timing_utils.py ├── tests ├── __init__.py ├── README.md └── test_models.py ├── assets └── vera.png ├── test_voice_episodes └── audio │ ├── vera_aime_0a923d23.wav │ ├── vera_mrcr_00c44580.wav │ ├── vera_simpleqa_0a9d56e1.wav │ ├── vera_browsecomp_9c79d2a8.wav │ └── vera_gpqadiamond_fa834623.wav ├── data ├── download.txt └── README.md ├── LICENSES ├── GPQA.CC-BY-4.0.txt ├── MRCR.txt ├── AIME-2025.MIT.txt ├── SimpleQA.MIT.txt ├── BrowseComp.MIT.txt ├── Boson-Higgs-Audio-2-Community-License.txt └── Meta-Llama-3-Community-License.txt ├── utils ├── __init__.py └── web_search.py ├── NOTICE.txt ├── ATTRIBUTIONS.md ├── .env.template ├── evaluation ├── grader │ ├── __init__.py │ ├── base.py │ ├── prompts.py │ ├── wer_calculator.py │ ├── llm_grader.py │ ├── voice_grader.py │ └── run_grader.py └── text │ ├── batch_evaluate.py │ └── run_evaluation.py ├── .gitignore ├── LICENSE ├── pyproject.toml └── README.md /models/__init__.py: -------------------------------------------------------------------------------- 1 | """VERA Model Adapters""" -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """VERA Model Tests Package""" 2 | -------------------------------------------------------------------------------- /models/text/__init__.py: -------------------------------------------------------------------------------- 1 | """Text model adapters for VERA""" -------------------------------------------------------------------------------- /models/voice/__init__.py: -------------------------------------------------------------------------------- 1 | """Voice model adapters for VERA""" -------------------------------------------------------------------------------- /models/realtime/__init__.py: -------------------------------------------------------------------------------- 1 | """Realtime model adapters for VERA""" -------------------------------------------------------------------------------- /models/shared/__init__.py: -------------------------------------------------------------------------------- 1 | """Shared utilities for model adapters""" -------------------------------------------------------------------------------- /assets/vera.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linyueqian/VERA/HEAD/assets/vera.png -------------------------------------------------------------------------------- /test_voice_episodes/audio/vera_aime_0a923d23.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linyueqian/VERA/HEAD/test_voice_episodes/audio/vera_aime_0a923d23.wav -------------------------------------------------------------------------------- /test_voice_episodes/audio/vera_mrcr_00c44580.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linyueqian/VERA/HEAD/test_voice_episodes/audio/vera_mrcr_00c44580.wav -------------------------------------------------------------------------------- 
/test_voice_episodes/audio/vera_simpleqa_0a9d56e1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linyueqian/VERA/HEAD/test_voice_episodes/audio/vera_simpleqa_0a9d56e1.wav -------------------------------------------------------------------------------- /test_voice_episodes/audio/vera_browsecomp_9c79d2a8.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linyueqian/VERA/HEAD/test_voice_episodes/audio/vera_browsecomp_9c79d2a8.wav -------------------------------------------------------------------------------- /test_voice_episodes/audio/vera_gpqadiamond_fa834623.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linyueqian/VERA/HEAD/test_voice_episodes/audio/vera_gpqadiamond_fa834623.wav -------------------------------------------------------------------------------- /data/download.txt: -------------------------------------------------------------------------------- 1 | VERA Dataset - Download Instructions 2 | 3 | Download the complete VERA dataset from Google Drive: 4 | 5 | Download URL: https://drive.google.com/file/d/1k0b4qXfQ16fVqe-hMn_GSsfc0exCLgSq/view?usp=sharing 6 | -------------------------------------------------------------------------------- /LICENSES/GPQA.CC-BY-4.0.txt: -------------------------------------------------------------------------------- 1 | GPQA (c) by Irving David Rein 2 | 3 | GPQA is licensed under a Creative Commons Attribution 4.0 International License. 4 | 5 | You should have received a copy of the license along with this 6 | work. If not, see https://creativecommons.org/licenses/by/4.0/. -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility helpers for the VERA benchmark. 3 | 4 | The voice evaluators expect a handful of shared helpers under ``utils``. 5 | Historically these lived out-of-tree, which meant the packaged release was 6 | missing the module entirely. This package re-introduces the helpers so that 7 | legacy imports like ``from utils.web_search import is_browsecomp_episode`` resolve 8 | at runtime. 9 | """ 10 | 11 | from .web_search import is_browsecomp_episode 12 | 13 | __all__ = ["is_browsecomp_episode"] 14 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | Text JSON: upstream licenses (see ATTRIBUTIONS.md). No text edits. 2 | Audio: generated with Boson Higgs Audio 2; subject to the Boson Higgs Audio 2 Community License. 3 | 4 | "Built with Higgs Materials licensed from Boson AI USA, Inc., Copyright © Boson AI USA, Inc., All Rights Reserved and Meta Llama 3 licensed under the Meta Llama 3 Community License, Copyright © Meta Platforms, Inc., All Rights Reserved." 5 | 6 | Restriction: do not use the audio outputs to improve any other large language model.
7 | 8 | License texts: see ./LICENSES/ -------------------------------------------------------------------------------- /models/realtime/liveanswer/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Optional 3 | from pathlib import Path 4 | 5 | # Load environment variables from .env if available 6 | from dotenv import load_dotenv # type: ignore 7 | 8 | # Load from the project root .env file 9 | project_root = Path(__file__).resolve().parents[4] # Go up 4 levels to vera_dev 10 | env_path = project_root / ".env" 11 | load_dotenv(env_path) 12 | 13 | 14 | def _env(name: str, default: Optional[str] = None) -> Optional[str]: 15 | value = os.environ.get(name, default) 16 | return value 17 | 18 | 19 | -------------------------------------------------------------------------------- /ATTRIBUTIONS.md: -------------------------------------------------------------------------------- 1 | SimpleQA — Source: https://github.com/openai/simple-evals — License: MIT — Change: audio added via Higgs; no text edits. 2 | BrowseComp — Source: https://github.com/openai/simple-evals — License: MIT — Change: audio added via Higgs; no text edits. 3 | MRCR — Source: https://huggingface.co/datasets/openai/mrcr — License: MIT — Change: audio added via Higgs; no text edits. 4 | GPQA‑Diamond — Source: https://huggingface.co/datasets/Idavidrein/gpqa — License: CC BY 4.0 — Note: HF gate forbids public example release. 5 | Audio synthesis — Model: Boson Higgs Audio 2 — See NOTICE.txt and ./LICENSES/. -------------------------------------------------------------------------------- /.env.template: -------------------------------------------------------------------------------- 1 | # OpenAI API Keys 2 | OPENAI_API_KEY=your_openai_api_key_here 3 | OPENAI_BASE_URL=https://api.openai.com/v1 4 | 5 | # Google/Gemini API Keys 6 | GOOGLE_API_KEY=your_google_api_key_here 7 | 8 | # Anthropic API Keys 9 | ANTHROPIC_API_KEY=your_anthropic_api_key_here 10 | 11 | # Azure Speech Services (for ASR/TTS) 12 | AZURE_SPEECH_KEY=your_azure_speech_key_here 13 | AZURE_SPEECH_REGION=your_azure_region_here 14 | 15 | # Azure AI Inference (for Phi-4) 16 | PHI4_CHAT_COMPLETIONS_URL=https://your-phi4-endpoint.services.ai.azure.com/models/chat/completions?api-version=2024-05-01-preview 17 | PHI4_API_KEY=your_phi4_api_key_here 18 | PHI4_MODEL=Phi-4-multimodal-instruct 19 | 20 | # Groq API (optional) 21 | GROQ_API_KEY=your_groq_api_key_here 22 | 23 | # HuggingFace (for local models) 24 | HF_TOKEN=your_hf_token_here -------------------------------------------------------------------------------- /evaluation/grader/__init__.py: -------------------------------------------------------------------------------- 1 | """Grader package: accuracy-first grading utilities. 
2 | 3 | This module provides: 4 | - Prompt templates per benchmark 5 | - Grader base classes and result types 6 | - Heuristic and LLM-backed graders 7 | - Voice evaluation with ASR and WER calculation 8 | - A small CLI for grading single triplets or batch outputs 9 | """ 10 | 11 | from .base import GradeResult, GradeLabel 12 | from .prompts import get_accuracy_prompt 13 | from .llm_grader import LLMAccuracyGrader 14 | from .voice_grader import VoiceAccuracyGrader 15 | from .asr_processor import ASRProcessor 16 | from .wer_calculator import WERCalculator 17 | 18 | __all__ = [ 19 | "GradeResult", 20 | "GradeLabel", 21 | "get_accuracy_prompt", 22 | "LLMAccuracyGrader", 23 | "VoiceAccuracyGrader", 24 | "ASRProcessor", 25 | "WERCalculator", 26 | ] 27 | -------------------------------------------------------------------------------- /utils/web_search.py: -------------------------------------------------------------------------------- 1 | """ 2 | Stubs for legacy web search utilities referenced by voice evaluators. 3 | 4 | The project no longer exposes live web search capabilities, but older releases 5 | still import ``utils.web_search.is_browsecomp_episode`` to decide whether the 6 | BrowseComp tooling should run. We keep a minimal shim so those imports resolve 7 | without pulling in unavailable dependencies. 8 | """ 9 | 10 | from __future__ import annotations 11 | 12 | from typing import Any, Dict, Optional 13 | 14 | 15 | def is_browsecomp_episode(episode_data: Optional[Dict[str, Any]]) -> bool: 16 | """Return True if this episode should enable web search tooling. 17 | 18 | Strategy: enable only for browsecomp benchmark by id/track hints. 19 | """ 20 | episode_id = (episode_data or {}).get("id", "").lower() 21 | track = (episode_data or {}).get("track", "").lower() 22 | return "browsecomp" in episode_id or track == "browsecomp" 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # macOS 2 | .DS_Store 3 | .DS_Store? 
4 | ._* 5 | .Spotlight-V100 6 | .Trashes 7 | ehthumbs.db 8 | Thumbs.db 9 | 10 | # Python 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | *.so 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # Virtual environments 34 | .env 35 | .venv 36 | env/ 37 | venv/ 38 | ENV/ 39 | env.bak/ 40 | venv.bak/ 41 | 42 | # IDE 43 | .vscode/ 44 | .idea/ 45 | *.swp 46 | *.swo 47 | *~ 48 | 49 | # Jupyter Notebook 50 | .ipynb_checkpoints 51 | 52 | # pytest 53 | .pytest_cache/ 54 | 55 | # Coverage reports 56 | htmlcov/ 57 | .coverage 58 | .coverage.* 59 | coverage.xml 60 | *.cover 61 | .hypothesis/ 62 | 63 | # mypy 64 | .mypy_cache/ 65 | .dmypy.json 66 | dmypy.json 67 | 68 | # Results and logs 69 | results/ 70 | logs/ 71 | *.log 72 | 73 | # Large dataset files (should be downloaded separately) 74 | data/final_dataset/ 75 | 76 | .claude -------------------------------------------------------------------------------- /LICENSES/MRCR.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. -------------------------------------------------------------------------------- /LICENSES/AIME-2025.MIT.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Adobe 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /LICENSES/SimpleQA.MIT.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 OpenAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /LICENSES/BrowseComp.MIT.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 OpenAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /evaluation/grader/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dataclasses import dataclass 4 | from enum import Enum 5 | from typing import Optional, Dict, Any 6 | 7 | 8 | class GradeLabel(str, Enum): 9 | CORRECT = "CORRECT" 10 | INCORRECT = "INCORRECT" 11 | NOT_ATTEMPTED = "NOT_ATTEMPTED" 12 | 13 | 14 | @dataclass 15 | class GradeResult: 16 | label: GradeLabel 17 | extracted_final_answer: Optional[str] = None 18 | reasoning: Optional[str] = None 19 | correct_flag: Optional[bool] = None 20 | confidence: Optional[float] = None # 0-100 21 | raw_model_output: Optional[str] = None 22 | metadata: Optional[Dict[str, Any]] = None 23 | 24 | 25 | class BaseAccuracyGrader: 26 | """Interface for accuracy graders. 27 | 28 | Implementations should focus on judging whether a predicted response 29 | answers the question correctly with respect to a gold target. 
30 | """ 31 | 32 | def grade( 33 | self, 34 | question: str, 35 | gold_target: str, 36 | predicted_answer: str, 37 | benchmark: Optional[str] = None, 38 | mode: str = "triad", # "triad" -> A/B/C, "binary" -> yes/no 39 | ) -> GradeResult: 40 | raise NotImplementedError 41 | 42 | -------------------------------------------------------------------------------- /models/realtime/liveanswer/main.py: -------------------------------------------------------------------------------- 1 | import threading 2 | from pathlib import Path 3 | from typing import List, Optional 4 | 5 | from .explain import ExplainSynthesizer 6 | from .audio import AudioGenerator 7 | from .solver_standard import StandardProblemSolver as ProblemSolver 8 | 9 | 10 | def main_request(request: str, audio_file_path: Optional[str] = None) -> tuple[bytes, float, str, str]: 11 | """ 12 | Orchestrates the pipeline: 13 | - Create ExplainSynthesizer 14 | - Run AudioGenerator and ProblemSolver concurrently 15 | - Return resulting MP3 bytes, time to first response, GPT-5 response, and Groq explanation 16 | """ 17 | 18 | explainer = ExplainSynthesizer(request=request) 19 | audio_gen = AudioGenerator(explainer=explainer) 20 | solver = ProblemSolver(explainer=explainer, audio_file_path=audio_file_path) 21 | 22 | audio_bytes_holder: List[bytes] = [] 23 | time_to_first_response_holder: List[float] = [] 24 | 25 | def run_audio(): 26 | audio_bytes, time_to_first_response = audio_gen.start() 27 | audio_bytes_holder.append(audio_bytes) 28 | time_to_first_response_holder.append(time_to_first_response) 29 | 30 | t_audio = threading.Thread(target=run_audio, name="audio_gen") 31 | t_solver = threading.Thread(target=lambda: solver.start(request), name="problem_solver") 32 | 33 | t_audio.start() 34 | t_solver.start() 35 | 36 | t_audio.join() 37 | t_solver.join() 38 | 39 | audio_bytes = audio_bytes_holder[0] if audio_bytes_holder else b"" 40 | time_to_first_response = time_to_first_response_holder[0] if time_to_first_response_holder else 0.0 41 | 42 | # Get both the GPT-5 response and the Groq explanation 43 | gpt5_response = getattr(explainer, 'gpt5_response', 'GPT-5 response not captured') 44 | groq_explanation = getattr(explainer, 'spoken_explanation', 'Groq explanation not captured') 45 | 46 | if audio_bytes: 47 | output_path = Path(__file__).resolve().parents[1] / "liveanswer-output.mp3" 48 | output_path.write_bytes(audio_bytes) 49 | 50 | return audio_bytes, time_to_first_response, gpt5_response, groq_explanation 51 | 52 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # VERA Model Tests 2 | 3 | This directory contains tests for VERA model adapters. 4 | 5 | ## Running Tests 6 | 7 | ### Quick Smoke Tests (No Dependencies) 8 | 9 | Run the basic smoke tests without installing pytest: 10 | 11 | ```bash 12 | python tests/test_models.py 13 | ``` 14 | 15 | This will test: 16 | - Base adapter classes 17 | - Text model imports (GPT-4o, Gemini, etc.) 18 | - Voice model imports (Qwen2-Audio, Ultravox, etc.) 
19 | - Realtime model imports (GPT Realtime, Gemini Realtime, Moshi) 20 | - Configuration utilities 21 | - MRCR context parsing 22 | 23 | ### Full Test Suite (with pytest) 24 | 25 | If you have pytest installed, run comprehensive tests: 26 | 27 | ```bash 28 | # Install pytest if needed 29 | pip install pytest 30 | 31 | # Run all tests 32 | pytest tests/test_models.py 33 | 34 | # Run with verbose output 35 | pytest tests/test_models.py -v 36 | 37 | # Run specific test class 38 | pytest tests/test_models.py::TestGPT4oAdapter -v 39 | ``` 40 | 41 | ## Test Coverage 42 | 43 | ### Base Classes (`TestBaseAdapter`) 44 | - ✓ ModelConfig creation 45 | - ✓ BaseAdapter initialization 46 | 47 | ### Text Models 48 | - ✓ GPT-4o adapter (`TestGPT4oAdapter`) 49 | - ✓ Gemini 2.5 Pro adapter (`TestGemini25ProAdapter`) 50 | - ✓ GPT-5 adapter (`TestGPT5Adapter`) 51 | 52 | ### Voice Models 53 | - ✓ Qwen2-Audio adapter (`TestQwen2AudioAdapter`) 54 | - ✓ Ultravox adapter (`TestUltravoxAdapter`) 55 | 56 | ### Realtime Models 57 | - ✓ GPT Realtime adapter (`TestGPTRealtimeAdapter`) 58 | - ✓ Gemini Realtime adapter (`TestGeminiRealtimeAdapter`) 59 | - ✓ Moshi adapter (`TestMoshiAdapter`) 60 | 61 | ### Utilities 62 | - ✓ Timing utilities (`TestTimingUtils`) 63 | 64 | ## Test Output 65 | 66 | ### Success 67 | ``` 68 | ✓ All required tests passed! 69 | ``` 70 | 71 | ### Skipped Tests 72 | Some tests may be skipped if optional dependencies aren't installed: 73 | ``` 74 | ⊘ Voice models skipped: No module named 'librosa' 75 | ``` 76 | 77 | This is expected and won't affect the core functionality tests. 78 | 79 | ## Adding New Tests 80 | 81 | To add tests for a new model: 82 | 83 | 1. Import the model adapter 84 | 2. Create a test class (e.g., `TestMyNewAdapter`) 85 | 3. Add test methods starting with `test_` 86 | 4. 
Update the smoke tests in `run_smoke_tests()` if needed 87 | 88 | Example: 89 | 90 | ```python 91 | class TestMyNewAdapter: 92 | """Test my new adapter""" 93 | 94 | def test_adapter_initialization(self): 95 | """Test adapter can be initialized""" 96 | from models.mytype.mynew import MyNewAdapter 97 | 98 | adapter = MyNewAdapter(api_key="test-key") 99 | assert adapter.model_name == "my-new-model" 100 | ``` 101 | 102 | ## Notes 103 | 104 | - Tests use mocking to avoid requiring API keys or making real API calls 105 | - Voice model tests may require additional dependencies (librosa, torch, vllm) 106 | - Realtime model tests check module imports and basic functionality 107 | - The test suite is designed to run quickly and not require model downloads 108 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "vera-benchmark" 7 | version = "1.0.0" 8 | description = "VERA: Voice-Enabled Reasoning Assessment Benchmark for Evaluating Reasoning Resilience in Voice Agents" 9 | readme = "README.md" 10 | requires-python = ">=3.9" 11 | license = {text = "MIT"} 12 | authors = [ 13 | {name = "VERA Team", email = "vera@example.com"} 14 | ] 15 | keywords = ["voice", "reasoning", "benchmark", "ai", "evaluation"] 16 | classifiers = [ 17 | "Development Status :: 4 - Beta", 18 | "Intended Audience :: Developers", 19 | "Intended Audience :: Science/Research", 20 | "License :: OSI Approved :: MIT License", 21 | "Operating System :: OS Independent", 22 | "Programming Language :: Python :: 3", 23 | "Programming Language :: Python :: 3.9", 24 | "Programming Language :: Python :: 3.10", 25 | "Programming Language :: Python :: 3.11", 26 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 27 | "Topic :: Multimedia :: Sound/Audio :: Speech", 28 | ] 29 | 30 | dependencies = [ 31 | "pydantic>=2.0.0", 32 | "pyyaml>=6.0", 33 | "click>=8.0.0", 34 | "numpy>=1.26.0", 35 | "scipy>=1.7.0", 36 | "openai>=1.0.0", 37 | "anthropic>=0.7.0", 38 | "datasets>=2.0.0", 39 | "librosa>=0.9.0", 40 | "soundfile>=0.12.0", 41 | "webrtcvad>=2.0.10", 42 | "websockets>=12.0", 43 | "azure-cognitiveservices-speech>=1.30.0", 44 | "requests>=2.25.0", 45 | "httpx>=0.24.0", 46 | "tqdm>=4.60.0", 47 | "jsonlines>=3.0.0", 48 | "tiktoken>=0.4.0", 49 | "python-dotenv>=0.19.0", 50 | "rich>=12.0.0", 51 | "matplotlib>=3.5.0", 52 | "seaborn>=0.11.0", 53 | "pandas>=1.3.0", 54 | "pytest>=8.3.5", 55 | "pytest-cov>=5.0.0", 56 | "pydub>=0.25.1", 57 | "nemo-toolkit[asr]>=1.23.0", 58 | "google-genai>=1.36.0", 59 | "websocket-client>=1.8.0", 60 | "vllm>=0.1.2", 61 | "groq>=0.31.1", 62 | "sphn>=0.2.0", 63 | "python-socketio[client]>=5.13.0", 64 | "dotenv>=0.9.9", 65 | ] 66 | 67 | [project.optional-dependencies] 68 | dev = [ 69 | "pytest>=7.0.0", 70 | "pytest-cov>=4.0.0", 71 | "black>=22.0.0", 72 | "isort>=5.10.0", 73 | "flake8>=4.0.0", 74 | "mypy>=0.991", 75 | "pre-commit>=2.17.0", 76 | ] 77 | 78 | [project.urls] 79 | Homepage = "https://github.com/linyueqian/VERA" 80 | Documentation = "https://github.com/linyueqian/VERA" 81 | Repository = "https://github.com/linyueqian/VERA.git" 82 | "Bug Tracker" = "https://github.com/linyueqian/VERA/issues" 83 | 84 | ## No console scripts are exposed by this repository 85 | 86 | [tool.setuptools.packages.find] 87 | where = ["."] 88 | exclude = ["tests*", 
"*.tests*", "models*"] 89 | 90 | ## No package data declared; this repo is primarily a benchmark codebase 91 | 92 | [tool.black] 93 | line-length = 88 94 | target-version = ['py39'] 95 | include = '\.pyi?$' 96 | extend-exclude = ''' 97 | /( 98 | \.eggs 99 | | \.git 100 | | \.hg 101 | | \.mypy_cache 102 | | \.tox 103 | | \.venv 104 | | _build 105 | | buck-out 106 | | build 107 | | dist 108 | )/ 109 | ''' 110 | 111 | [tool.isort] 112 | profile = "black" 113 | multi_line_output = 3 114 | include_trailing_comma = true 115 | force_grid_wrap = 0 116 | use_parentheses = true 117 | ensure_newline_before_comments = true 118 | line_length = 88 119 | 120 | [tool.mypy] 121 | python_version = "3.9" 122 | warn_return_any = true 123 | warn_unused_configs = true 124 | disallow_untyped_defs = true 125 | ignore_missing_imports = true 126 | 127 | [tool.pytest.ini_options] 128 | testpaths = ["tests"] 129 | python_files = ["test_*.py"] 130 | python_classes = ["Test*"] 131 | python_functions = ["test_*"] 132 | addopts = "--cov=models --cov=evaluation --cov-report=term-missing" 133 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VERA: Voice Evaluation of Reasoning Ability 2 | 3 | [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![arXiv](https://img.shields.io/badge/arXiv-2509.26542-b31b1b.svg)](https://arxiv.org/abs/2509.26542) 4 | 5 | **Voice Evaluation of Reasoning Ability: Diagnosing the Modality-Induced Performance Gap** 6 | 7 | ![VERA overview](assets/vera.png) 8 | 9 | We present Voice Evaluation of Reasoning Ability (VERA), a benchmark for evaluating reasoning ability in voice-interactive systems under real-time conversational constraints. VERA comprises 2,931 voice-native episodes derived from established text benchmarks and organized into five tracks (Math, Web, Science, Long-Context, Factual). Each item is adapted for speech interaction while preserving reasoning difficulty. 10 | 11 | ## Installation 12 | 13 | ```bash 14 | # Clone the repository 15 | git clone https://github.com/linyueqian/VERA.git 16 | cd VERA 17 | 18 | # Install uv if you haven't already 19 | curl -LsSf https://astral.sh/uv/install.sh | sh 20 | 21 | # Install dependencies (handles virtual environment automatically) 22 | uv sync 23 | ``` 24 | 25 | ## Dataset 26 | 27 | The VERA dataset contains 2,931 voice-native episodes across five tracks. Questions and answers are encrypted using XOR cipher to prevent memorization. See [data/README.md](data/README.md) for complete details on structure, encryption, and decryption. 28 | 29 | ### Download 30 | 31 | Download the complete dataset from Google Drive: https://drive.google.com/file/d/1k0b4qXfQ16fVqe-hMn_GSsfc0exCLgSq/view?usp=sharing 32 | 33 | ### Sample Data 34 | 35 | Check `test_voice_episodes/` for unencrypted examples: 36 | 37 | ```bash 38 | # View sample episode structure 39 | cat test_voice_episodes/test.json 40 | 41 | # Listen to sample audio 42 | ls test_voice_episodes/audio/ 43 | ``` 44 | 45 | ## Quick Start 46 | 47 | ### 1. Set up API keys 48 | 49 | ```bash 50 | cp .env.template .env 51 | # Edit .env with your API keys 52 | ``` 53 | 54 | ### 2. 
Run evaluation 55 | 56 | ```bash 57 | # Evaluate voice models 58 | uv run python evaluation/voice/batch_evaluate.py 59 | 60 | # Evaluate text models (for comparison) 61 | uv run python evaluation/text/batch_evaluate.py 62 | 63 | # Evaluate realtime models 64 | uv run python evaluation/realtime/batch_evaluate.py 65 | ``` 66 | 67 | ### 3. View results 68 | 69 | Results will be saved in the specified output directory with performance metrics and analysis. 70 | 71 | ## Acknowledgements 72 | 73 | We thank the [Full-Duplex-Bench](https://github.com/DanielLin94144/Full-Duplex-Bench) project for their implementations of several realtime models, including Freeze-Omni, Moshi, and Sonic, which we adapted for use in VERA. 74 | 75 | ## Citation 76 | 77 | If you use VERA in your research, please cite our paper: 78 | 79 | ```bibtex 80 | @misc{lin2025vera, 81 | title={Voice Evaluation of Reasoning Ability: Diagnosing the Modality-Induced Performance Gap}, 82 | author={Lin, Yueqian and Hu, Zhengmian and Wang, Qinsi and Liu, Yudong and Zhang, Hengfan and Subramanian, Jayakumar and Vlassis, Nikos and Li, Hai Helen and Chen, Yiran}, 83 | year={2025}, 84 | eprint={2509.26542}, 85 | archivePrefix={arXiv}, 86 | primaryClass={eess.AS}, 87 | url={https://arxiv.org/abs/2509.26542} 88 | } 89 | ``` 90 | 91 | ## License 92 | 93 | This project uses a dual licensing structure: 94 | 95 | - **Code**: MIT License (see [LICENSE](LICENSE)) 96 | - **Data**: The text data follows upstream licenses (MIT for SimpleQA, BrowseComp, MRCR; CC BY 4.0 for GPQA-Diamond). The audio data is generated with Boson Higgs Audio 2 and is subject to the Boson Higgs Audio 2 Community License. 97 | 98 | For complete licensing details, attribution information, and restrictions, please see: 99 | - [ATTRIBUTIONS.md](ATTRIBUTIONS.md) for data source attributions 100 | - [NOTICE.txt](NOTICE.txt) for audio generation licensing and restrictions 101 | - [LICENSES/](LICENSES/) for full license texts 102 | -------------------------------------------------------------------------------- /evaluation/text/batch_evaluate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Batch evaluation script to run multiple models on multiple datasets 3 | Convenient wrapper around run_evaluation.py 4 | """ 5 | 6 | import os 7 | import sys 8 | import asyncio 9 | import argparse 10 | from pathlib import Path 11 | from datetime import datetime 12 | from dotenv import load_dotenv 13 | from evaluation.text.run_evaluation import TextModelEvaluator 14 | 15 | # Load environment variables from .env file 16 | load_dotenv() 17 | 18 | def get_available_datasets(): 19 | """Get list of available dataset files""" 20 | dataset_dir = Path(__file__).parent.parent.parent / 'data' / 'final_dataset' / 'text' 21 | if not dataset_dir.exists(): 22 | return [] 23 | return [f.stem.replace('_voice_episodes', '') for f in dataset_dir.glob('*_voice_episodes.json')] 24 | 25 | def get_dataset_path(dataset_name): 26 | """Get full path to dataset file""" 27 | dataset_dir = Path(__file__).parent.parent.parent / 'data' / 'final_dataset' / 'text' 28 | return dataset_dir / f'{dataset_name}_voice_episodes.json' 29 | 30 | async def main(): 31 | parser = argparse.ArgumentParser(description='Batch evaluate text models on VERA datasets') 32 | parser.add_argument('--models', nargs='+', 33 | choices=['gpt4o', 'gpt5-instant', 'gpt5-thinking', 'gemini-2.5-pro', 'gemini-2.5-flash'], 34 | default=['gpt4o', 'gpt5-instant', 'gpt5-thinking', 'gemini-2.5-pro', 
'gemini-2.5-flash'], 35 | help='Models to evaluate') 36 | parser.add_argument('--datasets', nargs='+', 37 | choices=get_available_datasets(), 38 | default=get_available_datasets(), 39 | help='Datasets to evaluate on') 40 | parser.add_argument('--max-episodes', type=int, 41 | help='Maximum episodes per dataset') 42 | parser.add_argument('--max-concurrent', type=int, default=16, 43 | help='Maximum concurrent requests') 44 | parser.add_argument('--sequential', action='store_true', 45 | help='Run evaluations sequentially instead of in parallel') 46 | parser.add_argument('--resume-from', type=str, 47 | help='Resume from existing output directory (e.g., test_output/gemini-2.5-pro_browsecomp_20250917_215054)') 48 | 49 | args = parser.parse_args() 50 | 51 | print("Available datasets:", get_available_datasets()) 52 | print(f"Selected models: {args.models}") 53 | print(f"Selected datasets: {args.datasets}") 54 | 55 | evaluator = TextModelEvaluator() 56 | 57 | # Create combinations of model and dataset 58 | tasks = [] 59 | for model in args.models: 60 | for dataset in args.datasets: 61 | dataset_path = get_dataset_path(dataset) 62 | if not dataset_path.exists(): 63 | print(f"Warning: Dataset not found: {dataset_path}") 64 | continue 65 | 66 | print(f"Queuing: {model} on {dataset}") 67 | task = evaluator.run_evaluation( 68 | model, str(dataset_path), 69 | args.max_episodes, args.max_concurrent, 70 | resume_from=args.resume_from 71 | ) 72 | tasks.append((model, dataset, task)) 73 | 74 | if not tasks: 75 | print("No valid model/dataset combinations found") 76 | return 1 77 | 78 | print(f"\nStarting {len(tasks)} evaluation tasks...") 79 | start_time = datetime.now() 80 | 81 | if args.sequential: 82 | # Run sequentially 83 | for model, dataset, task in tasks: 84 | print(f"\n--- Running {model} on {dataset} ---") 85 | try: 86 | await task 87 | print(f"✓ Completed {model} on {dataset}") 88 | except Exception as e: 89 | print(f"✗ Failed {model} on {dataset}: {e}") 90 | else: 91 | # Run in parallel 92 | results = await asyncio.gather(*[task for _, _, task in tasks], return_exceptions=True) 93 | 94 | # Print results 95 | for i, (model, dataset, _) in enumerate(tasks): 96 | if isinstance(results[i], Exception): 97 | print(f"✗ Failed {model} on {dataset}: {results[i]}") 98 | else: 99 | print(f"✓ Completed {model} on {dataset}") 100 | 101 | end_time = datetime.now() 102 | total_duration = (end_time - start_time).total_seconds() 103 | 104 | print(f"\nBatch evaluation completed in {total_duration:.2f} seconds") 105 | print(f"Results saved to test_output/ directory") 106 | 107 | if __name__ == "__main__": 108 | exit(asyncio.run(main())) 109 | -------------------------------------------------------------------------------- /models/shared/base_adapter.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base adapter interface for VERA model implementations 3 | """ 4 | 5 | from abc import ABC, abstractmethod 6 | from typing import Dict, Any, List, Optional 7 | from dataclasses import dataclass 8 | from pathlib import Path 9 | import asyncio 10 | 11 | 12 | @dataclass 13 | class ModelConfig: 14 | """Base configuration for model adapters""" 15 | model_name: str 16 | temperature: float = 0.0 17 | max_tokens: int = 4096 18 | timeout: float = 300.0 19 | max_concurrent: int = 16 20 | 21 | 22 | class BaseAdapter(ABC): 23 | """Base class for all VERA model adapters""" 24 | 25 | def __init__(self, config: ModelConfig): 26 | self.config = config 27 | self.model_name = config.model_name 
28 | 29 | @abstractmethod 30 | def process_episode(self, episode: Dict[str, Any], output_dir: str) -> Dict[str, Any]: 31 | """ 32 | Process a single episode. 33 | 34 | Args: 35 | episode: Episode data containing turns and metadata 36 | output_dir: Directory to save outputs 37 | 38 | Returns: 39 | Standardized episode result 40 | """ 41 | pass 42 | 43 | async def process_episodes_batch( 44 | self, 45 | episodes: List[Dict[str, Any]], 46 | output_dir: str, 47 | max_concurrent: Optional[int] = None 48 | ) -> Dict[str, Any]: 49 | """ 50 | Process multiple episodes concurrently. 51 | 52 | Args: 53 | episodes: List of episodes to process 54 | output_dir: Directory to save outputs 55 | max_concurrent: Maximum concurrent episodes (uses config default if None) 56 | 57 | Returns: 58 | Standardized batch result 59 | """ 60 | from .timing_utils import create_standardized_batch_result 61 | import time 62 | 63 | max_concurrent = max_concurrent or self.config.max_concurrent 64 | print(f"[{self.model_name}] Batch processing {len(episodes)} episodes (max {max_concurrent} concurrent)") 65 | 66 | output_path = Path(output_dir) 67 | output_path.mkdir(parents=True, exist_ok=True) 68 | 69 | start_time = time.time() 70 | semaphore = asyncio.Semaphore(max_concurrent) 71 | 72 | async def process_one(episode): 73 | async with semaphore: 74 | return await asyncio.to_thread(self.process_episode, episode, output_dir) 75 | 76 | tasks = [process_one(ep) for ep in episodes] 77 | results = await asyncio.gather(*tasks, return_exceptions=True) 78 | 79 | processed_results = [] 80 | for i, result in enumerate(results): 81 | if isinstance(result, Exception): 82 | processed_results.append({ 83 | "episode_id": episodes[i].get("id", f"episode_{i}"), 84 | "turns": [], 85 | "total_time": 0, 86 | "success": False, 87 | "error": str(result), 88 | "metadata": {} 89 | }) 90 | else: 91 | processed_results.append(result) 92 | 93 | total_time = time.time() - start_time 94 | return create_standardized_batch_result( 95 | episodes=processed_results, 96 | total_time=total_time, 97 | model_name=self.model_name 98 | ) 99 | 100 | 101 | class TextAdapter(BaseAdapter): 102 | """Base class for text-based model adapters""" 103 | 104 | def __init__(self, config: ModelConfig, api_key: str): 105 | super().__init__(config) 106 | self.api_key = api_key 107 | 108 | @abstractmethod 109 | def _make_api_request(self, messages: List[Dict[str, str]], **kwargs) -> str: 110 | """Make API request to text model""" 111 | pass 112 | 113 | 114 | class VoiceAdapter(BaseAdapter): 115 | """Base class for voice model adapters""" 116 | 117 | def __init__(self, config: ModelConfig): 118 | super().__init__(config) 119 | 120 | @abstractmethod 121 | def _process_audio_input(self, audio_path: str, text_prompt: str) -> str: 122 | """Process audio input with text prompt""" 123 | pass 124 | 125 | 126 | class RealtimeAdapter(BaseAdapter): 127 | """Base class for realtime model adapters""" 128 | 129 | def __init__(self, config: ModelConfig): 130 | super().__init__(config) 131 | 132 | @abstractmethod 133 | def _establish_connection(self) -> Any: 134 | """Establish connection to realtime model""" 135 | pass 136 | 137 | @abstractmethod 138 | def _send_audio_chunk(self, connection: Any, audio_data: bytes) -> None: 139 | """Send audio chunk to model""" 140 | pass 141 | 142 | @abstractmethod 143 | def _receive_response(self, connection: Any) -> str: 144 | """Receive response from model""" 145 | pass -------------------------------------------------------------------------------- 
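For illustration, a minimal concrete adapter built on this interface might look like the sketch below. The `EchoTextAdapter` class, its echo-style response, and the demo episode are hypothetical and not part of the repository; a real adapter would replace `_make_api_request` with an actual provider call authenticated via `self.api_key`. The sketch assumes the repository root is on `PYTHONPATH` so that `models.shared` is importable.

```python
# Hypothetical example (not part of the repository): a minimal TextAdapter subclass.
import time
from typing import Any, Dict, List

from models.shared.base_adapter import ModelConfig, TextAdapter
from models.shared.timing_utils import (
    create_standardized_episode_result,
    create_turn_result,
    make_timed_api_request,
)


class EchoTextAdapter(TextAdapter):
    """Toy adapter that answers by echoing the prompt back."""

    def _make_api_request(self, messages: List[Dict[str, str]], **kwargs) -> str:
        # A real adapter would call its provider API here using self.api_key.
        return f"echo: {messages[-1]['content']}"

    def process_episode(self, episode: Dict[str, Any], output_dir: str) -> Dict[str, Any]:
        start = time.time()
        turns = []
        for i, turn in enumerate(episode.get("turns", [])):
            prompt = turn.get("text_content", "")
            # make_timed_api_request wraps the call and records duration/success.
            timed = make_timed_api_request(
                self._make_api_request, [{"role": "user", "content": prompt}]
            )
            turns.append(
                create_turn_result(
                    turn_index=i,
                    prompt=prompt,
                    response=timed["result"] or "",
                    timing=timed["timing"],
                    success=timed["success"],
                    error=timed.get("error"),
                )
            )
        return create_standardized_episode_result(
            episode_id=episode.get("id", "unknown"),
            turns=turns,
            total_time=time.time() - start,
            success=all(t["success"] for t in turns),
        )


if __name__ == "__main__":
    adapter = EchoTextAdapter(ModelConfig(model_name="echo-demo"), api_key="unused")
    result = adapter.process_episode(
        {"id": "demo", "turns": [{"text_content": "What is 2 + 2?"}]}, "out/"
    )
    print(result["turns"][0]["response"])
```

Because `process_episodes_batch` dispatches `process_episode` through `asyncio.to_thread` with a semaphore, a subclass like this gets concurrent batch processing without any additional code.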
/models/shared/timing_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Shared timing utilities for model adapters 3 | """ 4 | 5 | import time 6 | import json 7 | from typing import Dict, Any, List, Optional 8 | from pathlib import Path 9 | 10 | 11 | def make_timed_api_request(request_func, *args, **kwargs) -> Dict[str, Any]: 12 | """ 13 | Execute an API request with timing information. 14 | 15 | Args: 16 | request_func: The function to call 17 | *args, **kwargs: Arguments to pass to the function 18 | 19 | Returns: 20 | Dict containing timing info and result 21 | """ 22 | start_time = time.time() 23 | try: 24 | result = request_func(*args, **kwargs) 25 | end_time = time.time() 26 | return { 27 | "result": result, 28 | "timing": { 29 | "start_time": start_time, 30 | "end_time": end_time, 31 | "duration": end_time - start_time 32 | }, 33 | "success": True 34 | } 35 | except Exception as e: 36 | end_time = time.time() 37 | return { 38 | "result": None, 39 | "timing": { 40 | "start_time": start_time, 41 | "end_time": end_time, 42 | "duration": end_time - start_time 43 | }, 44 | "success": False, 45 | "error": str(e) 46 | } 47 | 48 | 49 | def create_turn_result( 50 | turn_index: int, 51 | prompt: str, 52 | response: str, 53 | timing: Dict[str, float], 54 | success: bool = True, 55 | error: Optional[str] = None, 56 | metadata: Optional[Dict[str, Any]] = None 57 | ) -> Dict[str, Any]: 58 | """ 59 | Create a standardized turn result. 60 | 61 | Args: 62 | turn_index: Index of the turn 63 | prompt: The input prompt 64 | response: The model response 65 | timing: Timing information 66 | success: Whether the turn was successful 67 | error: Error message if any 68 | metadata: Additional metadata 69 | 70 | Returns: 71 | Standardized turn result dict 72 | """ 73 | return { 74 | "turn_index": turn_index, 75 | "prompt": prompt, 76 | "response": response, 77 | "timing": timing, 78 | "success": success, 79 | "error": error, 80 | "metadata": metadata or {} 81 | } 82 | 83 | 84 | def create_standardized_episode_result( 85 | episode_id: str, 86 | turns: List[Dict[str, Any]], 87 | total_time: float, 88 | success: bool = True, 89 | error: Optional[str] = None, 90 | metadata: Optional[Dict[str, Any]] = None 91 | ) -> Dict[str, Any]: 92 | """ 93 | Create a standardized episode result. 94 | 95 | Args: 96 | episode_id: Unique episode identifier 97 | turns: List of turn results 98 | total_time: Total processing time 99 | success: Whether the episode was successful 100 | error: Error message if any 101 | metadata: Additional metadata 102 | 103 | Returns: 104 | Standardized episode result dict 105 | """ 106 | return { 107 | "episode_id": episode_id, 108 | "turns": turns, 109 | "total_time": total_time, 110 | "success": success, 111 | "error": error, 112 | "metadata": metadata or {}, 113 | "num_turns": len(turns), 114 | "successful_turns": sum(1 for turn in turns if turn.get("success", True)) 115 | } 116 | 117 | 118 | def create_standardized_batch_result( 119 | episodes: List[Dict[str, Any]], 120 | total_time: float, 121 | model_name: str, 122 | metadata: Optional[Dict[str, Any]] = None 123 | ) -> Dict[str, Any]: 124 | """ 125 | Create a standardized batch result. 
126 | 127 | Args: 128 | episodes: List of episode results 129 | total_time: Total batch processing time 130 | model_name: Name of the model used 131 | metadata: Additional metadata 132 | 133 | Returns: 134 | Standardized batch result dict 135 | """ 136 | successful_episodes = sum(1 for ep in episodes if ep.get("success", True)) 137 | total_turns = sum(ep.get("num_turns", 0) for ep in episodes) 138 | successful_turns = sum(ep.get("successful_turns", 0) for ep in episodes) 139 | 140 | return { 141 | "model_name": model_name, 142 | "episodes": episodes, 143 | "summary": { 144 | "total_episodes": len(episodes), 145 | "successful_episodes": successful_episodes, 146 | "total_turns": total_turns, 147 | "successful_turns": successful_turns, 148 | "success_rate": successful_episodes / len(episodes) if episodes else 0, 149 | "turn_success_rate": successful_turns / total_turns if total_turns > 0 else 0 150 | }, 151 | "total_time": total_time, 152 | "average_time_per_episode": total_time / len(episodes) if episodes else 0, 153 | "metadata": metadata or {}, 154 | "timestamp": time.time() 155 | } 156 | 157 | 158 | def save_results(results: Dict[str, Any], output_path: Path, indent: int = 2) -> None: 159 | """ 160 | Save results to JSON file. 161 | 162 | Args: 163 | results: Results dictionary to save 164 | output_path: Path to save the results 165 | indent: JSON indentation level 166 | """ 167 | output_path.parent.mkdir(parents=True, exist_ok=True) 168 | with open(output_path, 'w', encoding='utf-8') as f: 169 | json.dump(results, f, indent=indent, ensure_ascii=False) -------------------------------------------------------------------------------- /evaluation/grader/prompts.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Optional 4 | 5 | 6 | # Triad accuracy template (A/B/C) 7 | GRADER_TEMPLATE_TRIAD = ( 8 | """ 9 | Your job is to look at a question, a ground truth answer, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"]. 10 | First, examples for calibration are provided. Then, only return A/B/C for the new case. 11 | 12 | The following are examples of CORRECT predicted answers. 13 | ``` 14 | Question: What are the names of Barack Obama's children? 15 | Ground truth: Malia Obama and Sasha Obama 16 | Predicted answer 1: sasha and malia obama 17 | Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check 18 | Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001. 19 | ``` 20 | These predicted answers are all CORRECT because: 21 | - They fully contain the important information in the ground truth. 22 | - They do not contain any information that contradicts the ground truth. 23 | - Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter. 24 | - Hedging and guessing are permissible, provided the response fully includes the ground truth and contains no contradictions. 25 | 26 | The following are examples of INCORRECT predicted answers. 27 | ``` 28 | Question: What are the names of Barack Obama's children? 29 | Ground truth: Malia and Sasha 30 | Predicted answer 1: Malia. 31 | Predicted answer 2: Malia, Sasha, and Susan. 32 | Predicted answer 3: Barack Obama does not have any children. 
33 | Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia. 34 | Predicted answer 5: While I don't know their exact names, I can tell you that Barack Obama has three children. 35 | Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm with updated references. 36 | ``` 37 | These predicted answers are all INCORRECT because: 38 | - A factual statement contradicts the ground truth. Hedged incorrect statements are still incorrect. 39 | 40 | The following are examples of NOT_ATTEMPTED predicted answers. 41 | ``` 42 | Question: What are the names of Barack Obama's children? 43 | Ground truth: Malia and Sasha 44 | Predicted answer 1: I don't know. 45 | Predicted answer 2: I need more context about which Obama you are talking about. 46 | Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children. 47 | Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one. 48 | ``` 49 | These are NOT_ATTEMPTED because: 50 | - The important information in the ground truth is not included, and there is no direct contradiction. 51 | 52 | Additional guidance: 53 | - For numerical ground truths, accept answers correct to the last significant figure (e.g., 120k ~ 115k–124k). 54 | - The ground truth may contain more information than the question; the predicted answer only needs to cover what's asked. 55 | - Obvious inferences are allowed (e.g., omit "California" when "San Francisco, California" is the target for "What city..."). 56 | - Reasonable typos in names are acceptable if clearly the same entity. 57 | 58 | Here is a new example. Reply with A/B/C only, no extra text. 59 | ``` 60 | Ground truth: {ground_truth} 61 | Predicted answer: {predicted_answer} 62 | ``` 63 | 64 | Grade the predicted answer as one of: 65 | A: CORRECT 66 | B: INCORRECT 67 | C: NOT_ATTEMPTED 68 | 69 | Just return the single letter: "A", "B", or "C". 70 | """ 71 | .strip() 72 | ) 73 | 74 | 75 | def _benchmark_header(benchmark: Optional[str]) -> str: 76 | if not benchmark: 77 | return "" 78 | # Short, domain-aware header if desired 79 | bm = benchmark.lower() 80 | if "aime" in bm: 81 | return ( 82 | "Mathematical QA: judge strictly on the final numeric/value answer. " 83 | "Allow only minimal rounding consistent with the ground truth; reject paraphrases." 84 | ) 85 | if "browsecomp" in bm: 86 | return ( 87 | "Web QA: prioritize factual exactness over citations; judge only whether the predicted answer matches the ground truth." 88 | ) 89 | if "gpqa" in bm: 90 | return ( 91 | "Graduate-level science QA: semantic equivalence is acceptable if factually identical; contradictions are incorrect." 92 | ) 93 | if "mrcr" in bm: 94 | return ( 95 | "Long-context needle retrieval: mark CORRECT only if the predicted answer contains the exact ground truth string " 96 | "as a contiguous span (case-insensitive). Paraphrases, substitutions, or partial matches are INCORRECT. " 97 | "Ignore surrounding commentary; focus solely on inclusion of the exact phrase." 98 | ) 99 | if "simpleqa" in bm: 100 | return ( 101 | "Simple factual recall: require the predicted answer to match the ground truth entity/value. " 102 | "Minor spelling variations are acceptable only if clearly the same name." 
103 | ) 104 | return "" 105 | 106 | 107 | def get_accuracy_prompt( 108 | question: Optional[str], 109 | ground_truth: str, 110 | predicted_answer: str, 111 | benchmark: Optional[str] = None, 112 | ) -> str: 113 | """Return a benchmark-aware accuracy grading prompt (triad A/B/C).""" 114 | header = _benchmark_header(benchmark) 115 | 116 | # Triad prompt (A/B/C) 117 | core = GRADER_TEMPLATE_TRIAD.format( 118 | ground_truth=ground_truth, 119 | predicted_answer=predicted_answer, 120 | ) 121 | return f"{header}\n\n{core}".strip() if header else core 122 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | # VERA Dataset 2 | 3 | This directory contains the VERA (Voice Evaluation of Reasoning Ability) dataset. 4 | 5 | ## Download 6 | 7 | The complete dataset is available on Google Drive: 8 | 9 | **Download URL:** https://drive.google.com/file/d/1k0b4qXfQ16fVqe-hMn_GSsfc0exCLgSq/view?usp=sharing 10 | 11 | ## Dataset Overview 12 | 13 | The VERA dataset contains **2,931 voice-native episodes** organized into five tracks: 14 | 15 | | Track | Episodes | Source | Description | 16 | |-------|----------|--------|-------------| 17 | | **Math** | 115 | AIME 2025 | Competition mathematics problems | 18 | | **Web** | 1,107 | BrowseComp | Web browsing and research tasks | 19 | | **Science** | 161 | GPQA Diamond | Graduate-level science questions | 20 | | **Long-Context** | 548 | MRCR | Multi-turn reading comprehension | 21 | | **Factual** | 1,000 | SimpleQA | Factual recall questions | 22 | 23 | ## Dataset Structure 24 | 25 | Each episode contains: 26 | 27 | - **`id`**: Unique identifier (e.g., `vera_aime_58789fd1`) 28 | - **`track`**: Category (`mathematical_reasoning`, `web`, `science`, `long_context`, `factual`) 29 | - **`turns`**: Array of conversation turns with: 30 | - `role`: Speaker role (`user`) 31 | - `text_content`: Encrypted question text (base64-encoded) 32 | - `audio_file`: Path to corresponding audio file 33 | - `prefix_text`: Optional prefix (usually null) 34 | - `postfix_text`: Optional postfix (usually null) 35 | - **`context_documents`**: Additional context materials (if any) 36 | - **`interruptions`**: Interruption events (if any) 37 | - **`metadata`**: Contains encrypted `expected_answer` 38 | - **`canary`**: Unique decryption key for this episode 39 | 40 | ### Example Episode Structure 41 | 42 | ```json 43 | { 44 | "id": "vera_aime_58789fd1", 45 | "track": "mathematical_reasoning", 46 | "turns": [ 47 | { 48 | "role": "user", 49 | "text_content": "ayDyHIziBKCtUXnstgrT...", 50 | "audio_file": "aime_voice_episodes_audio/vera_aime_58789fd1.wav", 51 | "prefix_text": null, 52 | "postfix_text": null 53 | } 54 | ], 55 | "context_documents": [], 56 | "interruptions": [], 57 | "metadata": { 58 | "expected_answer": "EnS9" 59 | }, 60 | "canary": "04a8d78a8fe43328c0a9936731ed47fd" 61 | } 62 | ``` 63 | 64 | ## Encryption 65 | 66 | To prevent LLM memorization and ensure evaluation integrity, all questions (`text_content`) and answers (`expected_answer`) are encrypted using XOR cipher with SHA256-derived keys, following the methodology used in OpenAI's BrowseComp benchmark. 
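Since XOR with the same SHA256-derived key both encrypts and decrypts, the encryption side is the mirror image of the decryption helper shown in the next section. A minimal sketch is given below for reference only; the `encrypt` helper is illustrative and not shipped with the dataset tooling.

```python
import base64
import hashlib

def derive_key(password: str, length: int) -> bytes:
    """Derive a fixed-length key from the password using SHA256."""
    hasher = hashlib.sha256()
    hasher.update(password.encode())
    key = hasher.digest()
    return key * (length // len(key)) + key[: length % len(key)]

def encrypt(plaintext: str, password: str) -> str:
    """XOR the plaintext with the derived key and return base64 ciphertext."""
    raw = plaintext.encode()
    key = derive_key(password, len(raw))
    encrypted = bytes(a ^ b for a, b in zip(raw, key))
    return base64.b64encode(encrypted).decode()
```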
67 | 68 | ### Decryption 69 | 70 | To decrypt the questions and answers, use the following Python code: 71 | 72 | ```python 73 | import base64 74 | import hashlib 75 | 76 | def derive_key(password: str, length: int) -> bytes: 77 | """Derive a fixed-length key from the password using SHA256.""" 78 | hasher = hashlib.sha256() 79 | hasher.update(password.encode()) 80 | key = hasher.digest() 81 | return key * (length // len(key)) + key[: length % len(key)] 82 | 83 | def decrypt(ciphertext_b64: str, password: str) -> str: 84 | """Decrypt base64-encoded ciphertext with XOR.""" 85 | encrypted = base64.b64decode(ciphertext_b64) 86 | key = derive_key(password, len(encrypted)) 87 | decrypted = bytes(a ^ b for a, b in zip(encrypted, key)) 88 | return decrypted.decode() 89 | 90 | # Example usage: 91 | import json 92 | 93 | with open('voice_episodes.json', 'r') as f: 94 | data = json.load(f) 95 | 96 | # Decrypt the first episode 97 | episode = data['episodes'][0] 98 | canary = episode['canary'] 99 | 100 | # Decrypt question 101 | question = decrypt(episode['turns'][0]['text_content'], canary) 102 | print(f"Question: {question}") 103 | 104 | # Decrypt answer 105 | answer = decrypt(episode['metadata']['expected_answer'], canary) 106 | print(f"Expected Answer: {answer}") 107 | ``` 108 | 109 | ## Audio Files 110 | 111 | Audio files are organized in the following directories: 112 | - `aime_voice_episodes_audio/` - Math problems (115 files) 113 | - `browsecomp_voice_episodes_audio/` - Web tasks (1,107 files) 114 | - `gpqa_diamond_voice_episodes_audio/` - Science questions (161 files) 115 | - `mrcr_voice_episodes_audio/` - Long-context tasks (548 files) 116 | - `simpleqa_voice_episodes_audio/` - Factual questions (1,000 files) 117 | 118 | Each `audio_file` field in the dataset references the relative path to the corresponding audio file. 119 | 120 | All audio is synthesized using **Boson Higgs Audio 2** for consistent, high-quality speech generation. 121 | 122 | ## Sample Data 123 | 124 | A small sample of the dataset (with unencrypted text for easier inspection) is available in the `test_voice_episodes/` directory at the repository root: 125 | 126 | ```bash 127 | # View sample episodes 128 | cat test_voice_episodes/test.json 129 | 130 | # Listen to sample audio 131 | ls test_voice_episodes/audio/ 132 | ``` 133 | 134 | ## License and Attribution 135 | 136 | The dataset follows upstream licenses: 137 | 138 | - **SimpleQA, BrowseComp, MRCR**: MIT License 139 | - **GPQA Diamond**: CC BY 4.0 140 | - **Audio**: Boson Higgs Audio 2 Community License (with usage restrictions) 141 | 142 | **Important restriction**: Do not use the audio outputs to improve any other large language model. 143 | 144 | See [ATTRIBUTIONS.md](../ATTRIBUTIONS.md) and [NOTICE.txt](../NOTICE.txt) in the repository root for complete attribution and licensing details. 
145 | 146 | ## Citation 147 | 148 | If you use this dataset, please cite: 149 | 150 | ```bibtex 151 | @misc{lin2025vera, 152 | title={Voice Evaluation of Reasoning Ability: Diagnosing the Modality-Induced Performance Gap}, 153 | author={Lin, Yueqian and Hu, Zhengmian and Wang, Qinsi and Liu, Yudong and Zhang, Hengfan and Subramanian, Jayakumar and Vlassis, Nikos and Li, Hai Helen and Chen, Yiran}, 154 | year={2025}, 155 | eprint={2509.26542}, 156 | archivePrefix={arXiv}, 157 | primaryClass={eess.AS}, 158 | url={https://arxiv.org/abs/2509.26542} 159 | } 160 | ``` 161 | -------------------------------------------------------------------------------- /evaluation/grader/wer_calculator.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import List, Tuple 4 | import re 5 | 6 | 7 | class WERCalculator: 8 | """Calculate Word Error Rate (WER) between reference and hypothesis text.""" 9 | 10 | @staticmethod 11 | def normalize_text(text: str) -> str: 12 | """Normalize text for WER calculation.""" 13 | text = text.lower() 14 | text = re.sub(r'[^\w\s]', '', text) 15 | text = re.sub(r'\s+', ' ', text) 16 | return text.strip() 17 | 18 | @staticmethod 19 | def tokenize(text: str) -> List[str]: 20 | """Tokenize text into words.""" 21 | normalized = WERCalculator.normalize_text(text) 22 | return normalized.split() if normalized else [] 23 | 24 | @staticmethod 25 | def edit_distance(ref_words: List[str], hyp_words: List[str]) -> Tuple[int, List[List[int]]]: 26 | """ 27 | Calculate edit distance using dynamic programming. 28 | Returns (distance, dp_matrix) for traceback. 29 | """ 30 | m, n = len(ref_words), len(hyp_words) 31 | 32 | dp = [[0] * (n + 1) for _ in range(m + 1)] 33 | 34 | for i in range(m + 1): 35 | dp[i][0] = i 36 | for j in range(n + 1): 37 | dp[0][j] = j 38 | 39 | for i in range(1, m + 1): 40 | for j in range(1, n + 1): 41 | if ref_words[i-1] == hyp_words[j-1]: 42 | dp[i][j] = dp[i-1][j-1] 43 | else: 44 | dp[i][j] = 1 + min( 45 | dp[i-1][j], 46 | dp[i][j-1], 47 | dp[i-1][j-1] 48 | ) 49 | 50 | return dp[m][n], dp 51 | 52 | @staticmethod 53 | def get_alignment(ref_words: List[str], hyp_words: List[str], dp_matrix: List[List[int]]) -> List[Tuple[str, str, str]]: 54 | """ 55 | Get alignment between reference and hypothesis using traceback. 56 | Returns list of (ref_word, hyp_word, operation). 57 | """ 58 | m, n = len(ref_words), len(hyp_words) 59 | alignment = [] 60 | 61 | i, j = m, n 62 | while i > 0 or j > 0: 63 | if i > 0 and j > 0: 64 | if ref_words[i-1] == hyp_words[j-1]: 65 | alignment.append((ref_words[i-1], hyp_words[j-1], "MATCH")) 66 | i -= 1 67 | j -= 1 68 | elif dp_matrix[i][j] == dp_matrix[i-1][j-1] + 1: 69 | alignment.append((ref_words[i-1], hyp_words[j-1], "SUB")) 70 | i -= 1 71 | j -= 1 72 | elif dp_matrix[i][j] == dp_matrix[i-1][j] + 1: 73 | alignment.append((ref_words[i-1], "*", "DEL")) 74 | i -= 1 75 | else: 76 | alignment.append(("*", hyp_words[j-1], "INS")) 77 | j -= 1 78 | elif i > 0: 79 | alignment.append((ref_words[i-1], "*", "DEL")) 80 | i -= 1 81 | else: 82 | alignment.append(("*", hyp_words[j-1], "INS")) 83 | j -= 1 84 | 85 | return list(reversed(alignment)) 86 | 87 | @classmethod 88 | def calculate_wer(cls, reference: str, hypothesis: str, return_details: bool = False) -> dict: 89 | """ 90 | Calculate Word Error Rate between reference and hypothesis. 
91 | 92 | Args: 93 | reference: Ground truth text 94 | hypothesis: Predicted text (e.g., from ASR) 95 | return_details: If True, return detailed alignment information 96 | 97 | Returns: 98 | Dictionary with WER metrics and optionally alignment details 99 | """ 100 | ref_words = cls.tokenize(reference) 101 | hyp_words = cls.tokenize(hypothesis) 102 | 103 | if len(ref_words) == 0: 104 | if len(hyp_words) == 0: 105 | result = { 106 | "wer": 0.0, 107 | "substitutions": 0, 108 | "deletions": 0, 109 | "insertions": 0, 110 | "total_words": 0, 111 | "reference_length": 0, 112 | "hypothesis_length": 0 113 | } 114 | else: 115 | result = { 116 | "wer": float('inf'), 117 | "substitutions": 0, 118 | "deletions": 0, 119 | "insertions": len(hyp_words), 120 | "total_words": len(hyp_words), 121 | "reference_length": 0, 122 | "hypothesis_length": len(hyp_words) 123 | } 124 | else: 125 | edit_dist, dp_matrix = cls.edit_distance(ref_words, hyp_words) 126 | 127 | if return_details: 128 | alignment = cls.get_alignment(ref_words, hyp_words, dp_matrix) 129 | substitutions = sum(1 for _, _, op in alignment if op == "SUB") 130 | deletions = sum(1 for _, _, op in alignment if op == "DEL") 131 | insertions = sum(1 for _, _, op in alignment if op == "INS") 132 | else: 133 | alignment = None 134 | substitutions = 0 135 | deletions = 0 136 | insertions = 0 137 | 138 | wer = edit_dist / len(ref_words) 139 | 140 | result = { 141 | "wer": wer, 142 | "substitutions": substitutions, 143 | "deletions": deletions, 144 | "insertions": insertions, 145 | "total_words": edit_dist, 146 | "reference_length": len(ref_words), 147 | "hypothesis_length": len(hyp_words) 148 | } 149 | 150 | if return_details: 151 | result["alignment"] = alignment 152 | result["reference_words"] = ref_words 153 | result["hypothesis_words"] = hyp_words 154 | 155 | return result 156 | 157 | @classmethod 158 | def batch_calculate_wer(cls, pairs: List[Tuple[str, str]], return_details: bool = False) -> List[dict]: 159 | """Calculate WER for multiple reference-hypothesis pairs.""" 160 | return [cls.calculate_wer(ref, hyp, return_details) for ref, hyp in pairs] -------------------------------------------------------------------------------- /models/realtime/liveanswer/mrcr_context.py: -------------------------------------------------------------------------------- 1 | """ 2 | MRCR context handling for LiveAnswer. 3 | Based on azure_gpt_realtime approach. 
4 | """ 5 | 6 | import json 7 | from typing import List, Dict, Any, Optional 8 | from pathlib import Path 9 | 10 | 11 | def parse_mrcr_context(context: str) -> List[Dict[str, str]]: 12 | """Parse MRCR context document into conversation messages""" 13 | messages = [] 14 | 15 | # Split by User: and Assistant: markers 16 | lines = context.split('\n') 17 | current_role = None 18 | current_content = [] 19 | 20 | for line in lines: 21 | if line.startswith('User:'): 22 | if current_role and current_content: 23 | messages.append({"role": current_role, "content": '\n'.join(current_content).strip()}) 24 | current_role = "user" 25 | current_content = [line[5:].strip()] # Remove 'User:' prefix 26 | elif line.startswith('Assistant:'): 27 | if current_role and current_content: 28 | messages.append({"role": current_role, "content": '\n'.join(current_content).strip()}) 29 | current_role = "assistant" 30 | current_content = [line[10:].strip()] # Remove 'Assistant:' prefix 31 | else: 32 | if current_content is not None: 33 | current_content.append(line) 34 | 35 | # Add the last message 36 | if current_role and current_content: 37 | messages.append({"role": current_role, "content": '\n'.join(current_content).strip()}) 38 | 39 | return messages 40 | 41 | 42 | def load_context_documents_from_audio_file(audio_file_path: str) -> List[Dict[str, Any]]: 43 | """ 44 | Load context documents from episode JSON based on audio file path. 45 | Follows the same pattern as azure_gpt_realtime. 46 | """ 47 | audio_path = Path(audio_file_path) 48 | 49 | # Try to find corresponding episode JSON 50 | episode_json_candidates = [ 51 | # Same directory, replace .wav with _episode.json 52 | audio_path.parent / f"{audio_path.stem}_episode.json", 53 | # test_voice_episodes directory structure 54 | audio_path.parent.parent / "episodes" / f"{audio_path.stem}_episode.json", 55 | # Current directory test_voice_episodes 56 | Path.cwd() / "test_voice_episodes" / "episodes" / f"{audio_path.stem}_episode.json", 57 | ] 58 | 59 | # Add test_voice_episodes direct files based on audio file type 60 | audio_stem = audio_path.stem.lower() 61 | if "mrcr" in audio_stem: 62 | episode_json_candidates.append(Path.cwd() / "test_voice_episodes" / "test_mrcr_episode.json") 63 | elif "browsecomp" in audio_stem: 64 | episode_json_candidates.append(Path.cwd() / "test_voice_episodes" / "test_browsecomp_episode.json") 65 | elif "aime" in audio_stem: 66 | episode_json_candidates.append(Path.cwd() / "test_voice_episodes" / "test_aime_episode.json") 67 | 68 | episode_json = None 69 | print(f"!!!MRCR: Looking for episode JSON for audio file: {audio_file_path}") 70 | for candidate in episode_json_candidates: 71 | print(f"!!!MRCR: Checking candidate: {candidate}") 72 | if candidate.exists(): 73 | episode_json = candidate 74 | print(f"!!!MRCR: Found episode JSON: {episode_json}") 75 | break 76 | 77 | if not episode_json: 78 | print(f"!!!MRCR: No episode JSON found for audio file: {audio_file_path}") 79 | print(f"!!!MRCR: Tried candidates: {episode_json_candidates}") 80 | return [] 81 | 82 | try: 83 | episode_data = json.loads(episode_json.read_text()) 84 | if episode_data.get("episodes"): 85 | first_episode = episode_data["episodes"][0] 86 | context_documents = first_episode.get("context_documents", []) 87 | print(f"!!!MRCR: Found {len(context_documents)} context documents from {episode_json}") 88 | if context_documents: 89 | print(f"!!!MRCR: First context document has {len(context_documents[0].get('content', ''))} characters") 90 | return context_documents 91 | 
except Exception as e: 92 | print(f"Error loading context documents from {episode_json}: {e}") 93 | 94 | return [] 95 | 96 | 97 | def inject_mrcr_context_into_messages( 98 | messages: List[Dict[str, str]], 99 | context_documents: List[Dict[str, Any]], 100 | episode_id: Optional[str] = None 101 | ) -> List[Dict[str, str]]: 102 | """ 103 | Inject MRCR context documents into message history. 104 | Based on azure_gpt_realtime approach. 105 | """ 106 | if not context_documents: 107 | return messages 108 | 109 | print(f"Injecting {len(context_documents)} context documents...") 110 | 111 | # Determine if this is MRCR 112 | is_mrcr = False 113 | if episode_id: 114 | is_mrcr = "mrcr" in episode_id.lower() 115 | 116 | # Insert context documents before the conversation 117 | context_messages = [] 118 | 119 | for i, doc in enumerate(context_documents): 120 | content = doc.get("content", "") 121 | if content and is_mrcr: 122 | # For MRCR, inject the full conversation as a system message 123 | print(f"Injecting MRCR conversation context from document {i+1}") 124 | parsed_messages = parse_mrcr_context(content) 125 | 126 | # Convert conversation to system context 127 | context_text = "Previous conversation:\n\n" 128 | for msg in parsed_messages: 129 | role = msg["role"].title() 130 | context_text += f"{role}: {msg['content']}\n\n" 131 | 132 | context_messages.append({ 133 | "role": "system", 134 | "content": f"You have access to the following conversation history:\n\n{context_text.strip()}" 135 | }) 136 | elif content: 137 | # For non-MRCR, add as single assistant message 138 | context_messages.append({ 139 | "role": "assistant", 140 | "content": f"Previous context: {content}" 141 | }) 142 | 143 | print(f"Context injection complete.") 144 | 145 | # Return context messages + original messages 146 | return context_messages + messages 147 | 148 | 149 | def is_mrcr_episode(audio_file_path: str) -> bool: 150 | """Check if this is an MRCR episode based on file path.""" 151 | path_str = str(audio_file_path).lower() 152 | return "mrcr" in path_str -------------------------------------------------------------------------------- /models/realtime/liveanswer/stt_service.py: -------------------------------------------------------------------------------- 1 | import os 2 | import azure.cognitiveservices.speech as speechsdk 3 | from typing import Optional, Tuple 4 | from pathlib import Path 5 | 6 | 7 | class AzureSTTService: 8 | """Azure Speech-to-Text service for audio file transcription.""" 9 | 10 | def __init__(self, 11 | speech_key: Optional[str] = None, 12 | speech_region: Optional[str] = None): 13 | """ 14 | Initialize Azure STT service. 15 | 16 | Args: 17 | speech_key: Azure Speech API key (defaults to env var) 18 | speech_region: Azure region (defaults to env var) 19 | """ 20 | self.speech_key = speech_key or os.environ.get("AZURE_SPEECH_API_KEY") 21 | self.speech_region = speech_region or os.environ.get("AZURE_SPEECH_REGION") 22 | 23 | if not self.speech_key or not self.speech_region: 24 | raise ValueError( 25 | "Azure Speech credentials not found. " 26 | "Set AZURE_SPEECH_API_KEY and AZURE_SPEECH_REGION environment variables." 
27 | ) 28 | 29 | self.speech_config = speechsdk.SpeechConfig( 30 | subscription=self.speech_key, 31 | region=self.speech_region 32 | ) 33 | 34 | # Set recognition language (can be made configurable) 35 | self.speech_config.speech_recognition_language = "en-US" 36 | 37 | # Enable detailed recognition results 38 | self.speech_config.request_word_level_timestamps() 39 | 40 | def transcribe_file(self, audio_file_path: str) -> Tuple[str, dict]: 41 | """ 42 | Transcribe audio file to text. 43 | 44 | Args: 45 | audio_file_path: Path to audio file (WAV, MP3, etc.) 46 | 47 | Returns: 48 | Tuple of (transcript, metadata dict with timing info) 49 | """ 50 | audio_path = Path(audio_file_path) 51 | if not audio_path.exists(): 52 | raise FileNotFoundError(f"Audio file not found: {audio_file_path}") 53 | 54 | # Create audio config from file 55 | audio_config = speechsdk.audio.AudioConfig(filename=str(audio_path)) 56 | 57 | # Create recognizer 58 | recognizer = speechsdk.SpeechRecognizer( 59 | speech_config=self.speech_config, 60 | audio_config=audio_config 61 | ) 62 | 63 | # Collect all results 64 | all_results = [] 65 | done = False 66 | 67 | def handle_recognized(evt): 68 | if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech: 69 | all_results.append({ 70 | 'text': evt.result.text, 71 | 'offset': evt.result.offset, 72 | 'duration': evt.result.duration 73 | }) 74 | 75 | def stop_continuous(evt): 76 | nonlocal done 77 | done = True 78 | 79 | # Connect callbacks 80 | recognizer.recognized.connect(handle_recognized) 81 | recognizer.session_stopped.connect(stop_continuous) 82 | recognizer.canceled.connect(stop_continuous) 83 | 84 | # Start continuous recognition 85 | recognizer.start_continuous_recognition() 86 | 87 | # Wait for completion 88 | import time 89 | while not done: 90 | time.sleep(0.5) 91 | 92 | recognizer.stop_continuous_recognition() 93 | 94 | # Combine results 95 | full_transcript = ' '.join(r['text'] for r in all_results) 96 | 97 | metadata = { 98 | 'segments': all_results, 99 | 'total_segments': len(all_results), 100 | 'file_path': str(audio_path), 101 | 'language': self.speech_config.speech_recognition_language 102 | } 103 | 104 | return full_transcript.strip(), metadata 105 | 106 | def transcribe_with_diarization(self, audio_file_path: str) -> Tuple[str, dict]: 107 | """ 108 | Transcribe audio with speaker diarization (who said what). 
109 | 110 | Args: 111 | audio_file_path: Path to audio file 112 | 113 | Returns: 114 | Tuple of (transcript with speaker labels, metadata) 115 | """ 116 | audio_path = Path(audio_file_path) 117 | if not audio_path.exists(): 118 | raise FileNotFoundError(f"Audio file not found: {audio_file_path}") 119 | 120 | # Create audio config 121 | audio_config = speechsdk.audio.AudioConfig(filename=str(audio_path)) 122 | 123 | # Enable diarization 124 | self.speech_config.set_property( 125 | speechsdk.PropertyId.SpeechServiceConnection_LanguageIdMode, "Continuous" 126 | ) 127 | 128 | # Create conversation transcriber 129 | conversation_transcriber = speechsdk.transcription.ConversationTranscriber( 130 | speech_config=self.speech_config, 131 | audio_config=audio_config 132 | ) 133 | 134 | transcription_results = [] 135 | done = False 136 | 137 | def handle_transcribed(evt): 138 | if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech: 139 | transcription_results.append({ 140 | 'speaker_id': evt.result.speaker_id or 'Unknown', 141 | 'text': evt.result.text, 142 | 'offset': evt.result.offset, 143 | 'duration': evt.result.duration 144 | }) 145 | 146 | def stop_cb(evt): 147 | nonlocal done 148 | done = True 149 | 150 | # Connect callbacks 151 | conversation_transcriber.transcribed.connect(handle_transcribed) 152 | conversation_transcriber.session_stopped.connect(stop_cb) 153 | conversation_transcriber.canceled.connect(stop_cb) 154 | 155 | # Start transcription 156 | conversation_transcriber.start_transcribing_async() 157 | 158 | # Wait for completion 159 | import time 160 | while not done: 161 | time.sleep(0.5) 162 | 163 | conversation_transcriber.stop_transcribing_async() 164 | 165 | # Format output with speaker labels 166 | formatted_transcript = [] 167 | current_speaker = None 168 | 169 | for segment in transcription_results: 170 | speaker = segment['speaker_id'] 171 | if speaker != current_speaker: 172 | formatted_transcript.append(f"\n[Speaker {speaker}]: {segment['text']}") 173 | current_speaker = speaker 174 | else: 175 | formatted_transcript.append(segment['text']) 176 | 177 | full_transcript = ' '.join(formatted_transcript).strip() 178 | 179 | metadata = { 180 | 'segments': transcription_results, 181 | 'total_segments': len(transcription_results), 182 | 'speakers': list(set(s['speaker_id'] for s in transcription_results)), 183 | 'file_path': str(audio_path), 184 | 'language': self.speech_config.speech_recognition_language 185 | } 186 | 187 | return full_transcript, metadata -------------------------------------------------------------------------------- /models/realtime/freeze_omni.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | 4 | import argparse 5 | import asyncio 6 | import json 7 | import queue 8 | import sys 9 | import time 10 | from pathlib import Path 11 | from typing import List 12 | 13 | import numpy as np 14 | import soundfile as sf 15 | import socketio 16 | import torch 17 | import torchaudio.functional as AF 18 | from glob import glob 19 | 20 | ### Configuration ### 21 | root_dir_path = "YOUR_ROOT_DIRECTORY_PATH" 22 | tasks = [ 23 | "YOUR_TASK_NAME", 24 | ] 25 | prefix = "" # "" or "clean_": the prefix for input wav files 26 | overwrite = True # Whether to overwrite existing output files 27 | ##################### 28 | 29 | all_wav_files = [] 30 | for task in tasks: 31 | root_dir = f"{root_dir_path}/{task}/" 32 | root_file_dir = f"{root_dir}/*/{prefix}input.wav" 33 | wav_files = 
sorted(glob(root_file_dir)) 34 | all_wav_files.extend(wav_files) 35 | 36 | FRAME_MS = 30 37 | SEND_SR = 16_000 38 | RECV_SR = 24_000 39 | TX_SAMP = int(SEND_SR * FRAME_MS / 1000) 40 | RX_SAMP = int(RECV_SR * FRAME_MS / 1000) 41 | RX_BYTES = RX_SAMP * 2 42 | 43 | 44 | def _mono(sig: np.ndarray) -> np.ndarray: 45 | return sig if sig.ndim == 1 else sig.mean(axis=1) 46 | 47 | 48 | def _resample(sig: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray: 49 | if orig_sr == target_sr: 50 | return sig 51 | wav = torch.from_numpy(sig.astype(np.float32) / 32768).unsqueeze(0) 52 | wav_rs = AF.resample(wav, orig_sr, target_sr) 53 | return (wav_rs.squeeze().numpy() * 32768).astype(np.int16) 54 | 55 | 56 | def _chunk(sig: np.ndarray, frame_len: int) -> List[np.ndarray]: 57 | pad = (-len(sig)) % frame_len 58 | if pad: 59 | sig = np.concatenate([sig, np.zeros(pad, dtype=sig.dtype)]) 60 | return [sig[i : i + frame_len] for i in range(0, len(sig), frame_len)] 61 | 62 | 63 | def _compact_json(obj): 64 | return json.dumps(obj, separators=(",", ":")) 65 | 66 | 67 | class FreezeOmniClient: 68 | def __init__(self, server_ip: str, inp: Path, out: Path): 69 | self.server_ip = server_ip 70 | self.inp = inp 71 | self.out = out 72 | self.audio_q = queue.Queue() 73 | self.pending = bytearray() 74 | self.muted = False # true after stop_tts until next audio 75 | 76 | self.sio = socketio.Client( 77 | ssl_verify=False, 78 | reconnection=True, 79 | reconnection_attempts=0, 80 | reconnection_delay=2, 81 | reconnection_delay_max=30, 82 | randomization_factor=0.2, 83 | ) 84 | 85 | self.sio.on("connect", self._on_connect) 86 | self.sio.on("disconnect", self._on_disconnect) 87 | self.sio.on("audio", self._on_audio) 88 | self.sio.on("stop_tts", self._on_stop_tts) 89 | self.sio.on("too_many_users", self._on_too_many) 90 | 91 | def _on_connect(self): 92 | print("[SIO] ✅ Connected", flush=True) 93 | asyncio.run(self._stream()) 94 | 95 | def _on_disconnect(self): 96 | print("[SIO] 🔌 Disconnected", flush=True) 97 | 98 | def _on_audio(self, data: bytes): 99 | self.audio_q.put(data) 100 | self.muted = False # new audio resumes output 101 | 102 | def _on_stop_tts(self): 103 | print("[SIO] ⏹️ stop_tts → mute", flush=True) 104 | self.pending.clear() # discard any buffered TTS 105 | self.muted = True 106 | 107 | def _on_too_many(self, *_, **__): 108 | print("[SIO] ❌ Too many users", file=sys.stderr) 109 | self.sio.disconnect() 110 | 111 | async def _stream(self): 112 | wav, sr = sf.read(self.inp, dtype="int16") 113 | wav = _mono(wav) 114 | wav = _resample(wav, sr, SEND_SR) 115 | tx_frames = _chunk(wav, TX_SAMP) 116 | total_frames = len(tx_frames) 117 | frames_written = 0 118 | 119 | with sf.SoundFile( 120 | self.out, "w", samplerate=RECV_SR, channels=1, subtype="PCM_16" 121 | ) as fout: 122 | self.sio.emit("recording-started") 123 | frame_dur = FRAME_MS / 1000.0 124 | 125 | for frame in tx_frames: 126 | self.sio.emit( 127 | "audio", 128 | _compact_json( 129 | {"audio": list(frame.tobytes()), "sample_rate": SEND_SR} 130 | ), 131 | ) 132 | 133 | while not self.audio_q.empty(): 134 | self.pending.extend(self.audio_q.get()) 135 | 136 | if self.muted: 137 | chunk = b"" 138 | else: 139 | chunk = self.pending[:RX_BYTES] 140 | self.pending = self.pending[RX_BYTES:] 141 | 142 | if len(chunk) < RX_BYTES: 143 | chunk += b"\x00" * (RX_BYTES - len(chunk)) 144 | fout.write(np.frombuffer(chunk, dtype=np.int16)) 145 | frames_written += 1 146 | 147 | await asyncio.sleep(frame_dur) 148 | 149 | self.sio.emit("recording-stopped") 150 | flush_until = 
time.time() + 1.0 151 | while time.time() < flush_until and frames_written < total_frames: 152 | while not self.audio_q.empty(): 153 | self.pending.extend(self.audio_q.get()) 154 | chunk = b"" if self.muted else self.pending[:RX_BYTES] 155 | self.pending = self.pending[RX_BYTES:] 156 | if len(chunk) < RX_BYTES: 157 | chunk += b"\x00" * (RX_BYTES - len(chunk)) 158 | fout.write(np.frombuffer(chunk, dtype=np.int16)) 159 | frames_written += 1 160 | await asyncio.sleep(frame_dur) 161 | 162 | while frames_written < total_frames: 163 | fout.write(np.zeros(RX_SAMP, dtype=np.int16)) 164 | frames_written += 1 165 | 166 | self.sio.disconnect() 167 | print( 168 | f"[DONE] input len = {len(wav) / SEND_SR:.2f}s | output len = {sf.info(self.out).duration:.2f}s" 169 | ) 170 | 171 | def run(self): 172 | url = f"https://{self.server_ip}" 173 | try: 174 | self.sio.connect(url, transports=["websocket"], wait_timeout=10) 175 | self.sio.wait() 176 | if self.sio.connected: 177 | self.sio.disconnect() 178 | except KeyboardInterrupt: 179 | self.sio.disconnect() 180 | except Exception as e: 181 | print(f"[ERR] {e}", file=sys.stderr) 182 | self.sio.disconnect() 183 | 184 | 185 | def main(): 186 | ap = argparse.ArgumentParser( 187 | description="Freeze-Omni streaming client with instant stop_tts mute" 188 | ) 189 | ap.add_argument("--server_ip", required=True) 190 | args = ap.parse_args() 191 | 192 | for inp in all_wav_files: 193 | args.input = Path(inp) 194 | args.output = Path(inp.replace("input.wav", "output.wav")) 195 | if not overwrite and args.output.exists(): 196 | print(f"[SKIP] {args.output} already exists, skipping...") 197 | continue 198 | print(f"[RUN] {args.input} → {args.output}") 199 | FreezeOmniClient(args.server_ip, args.input, args.output).run() 200 | 201 | 202 | if __name__ == "__main__": 203 | main() 204 | 205 | -------------------------------------------------------------------------------- /models/realtime/moshi.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import annotations 3 | 4 | import argparse 5 | import asyncio 6 | from glob import glob 7 | from pathlib import Path 8 | from typing import List 9 | 10 | import numpy as np 11 | import soundfile as sf 12 | import sphn 13 | import torch 14 | import torchaudio.functional as AF 15 | import websockets 16 | import websockets.exceptions as wsex 17 | 18 | 19 | ### Configuration ### 20 | root_dir_path = Path("YOUR_ROOT_DIRECTORY_PATH") 21 | tasks = [ 22 | "YOUR_TASK_NAME", 23 | ] 24 | prefix = "" # "" or "clean_": the prefix for input wav files 25 | overwrite = True # Whether to overwrite existing output files 26 | ##################### 27 | 28 | 29 | SEND_SR = 24_000 30 | FRAME_SMP = 1_920 31 | SKIP_FRAMES = 1 32 | FRAME_SEC = FRAME_SMP / SEND_SR 33 | 34 | 35 | def _patch_sphn(): 36 | if not hasattr(sphn.OpusStreamWriter, "read_bytes"): 37 | for alt in ("get_bytes", "flush_bytes", "read_data"): 38 | if hasattr(sphn.OpusStreamWriter, alt): 39 | setattr( 40 | sphn.OpusStreamWriter, 41 | "read_bytes", 42 | getattr(sphn.OpusStreamWriter, alt), 43 | ) 44 | break 45 | else: 46 | setattr(sphn.OpusStreamWriter, "read_bytes", lambda self: b"") 47 | if not hasattr(sphn.OpusStreamReader, "read_pcm"): 48 | for alt in ("get_pcm", "receive_pcm", "read_float"): 49 | if hasattr(sphn.OpusStreamReader, alt): 50 | setattr( 51 | sphn.OpusStreamReader, 52 | "read_pcm", 53 | getattr(sphn.OpusStreamReader, alt), 54 | ) 55 | break 56 | else: 57 | setattr( 58 | sphn.OpusStreamReader, "read_pcm", lambda self: 
np.empty(0, np.float32) 59 | ) 60 | 61 | 62 | _patch_sphn() 63 | 64 | 65 | def _mono(x: np.ndarray) -> np.ndarray: 66 | return x if x.ndim == 1 else x.mean(axis=1) 67 | 68 | 69 | def _resample(x: np.ndarray, sr: int, tgt: int) -> np.ndarray: 70 | if sr == tgt: 71 | return x 72 | y = torch.from_numpy(x.astype(np.float32) / 32768).unsqueeze(0) 73 | y = AF.resample(y, sr, tgt)[0].numpy() 74 | return (y * 32768).astype(np.int16) 75 | 76 | 77 | def _chunk(sig: np.ndarray) -> List[np.ndarray]: 78 | pad = (-len(sig)) % FRAME_SMP 79 | if pad: 80 | sig = np.concatenate([sig, np.zeros(pad, sig.dtype)]) 81 | return [sig[i : i + FRAME_SMP] for i in range(0, len(sig), FRAME_SMP)] 82 | 83 | 84 | class MoshiFileClient: 85 | def __init__(self, ws_url: str, inp: Path, out: Path): 86 | self.url, self.inp, self.out = ws_url, inp, out 87 | 88 | sig16, sr = sf.read(inp, dtype="int16") 89 | self.sig24 = _resample(_mono(sig16), sr, SEND_SR) 90 | self.max_samples = len(self.sig24) 91 | 92 | self.writer = sphn.OpusStreamWriter(SEND_SR) 93 | self.reader = sphn.OpusStreamReader(SEND_SR) 94 | 95 | async def _send(self, ws): 96 | for frame in _chunk(self.sig24): 97 | pkt0 = self.writer.append_pcm(frame.astype(np.float32) / 32768) 98 | if isinstance(pkt0, (bytes, bytearray)): 99 | await ws.send(b"\x01" + pkt0) 100 | queued = self.writer.read_bytes() 101 | if queued: 102 | await ws.send(b"\x01" + queued) 103 | await asyncio.sleep(FRAME_SEC) 104 | 105 | queued = self.writer.read_bytes() 106 | if queued: 107 | await ws.send(b"\x01" + queued) 108 | await asyncio.sleep(0.5) 109 | await ws.close() 110 | 111 | async def _recv(self, ws): 112 | samples_written = 0 113 | first_pcm_seen = False 114 | 115 | with sf.SoundFile( 116 | self.out, "w", samplerate=SEND_SR, channels=1, subtype="PCM_16" 117 | ) as fout: 118 | try: 119 | async for msg in ws: 120 | if not msg or msg[0] not in (1, 2): 121 | continue 122 | kind, payload = msg[0], msg[1:] 123 | 124 | if kind == 1: # audio bytes 125 | self.reader.append_bytes(payload) 126 | while True: 127 | pcm = self.reader.read_pcm() 128 | if pcm.size == 0: 129 | break 130 | if not first_pcm_seen: 131 | pad = min(SKIP_FRAMES * FRAME_SMP, self.max_samples) 132 | fout.write(np.zeros(pad, dtype=np.int16)) 133 | samples_written += pad 134 | first_pcm_seen = True 135 | remain = self.max_samples - samples_written 136 | if remain <= 0: 137 | continue 138 | n_write = min(pcm.size, remain) 139 | fout.write((pcm[:n_write] * 32768).astype(np.int16)) 140 | samples_written += n_write 141 | else: 142 | print("[TEXT]", payload.decode(errors="ignore")) 143 | 144 | except wsex.ConnectionClosedError: 145 | pass 146 | 147 | if samples_written < self.max_samples: 148 | fout.write(np.zeros(self.max_samples - samples_written, dtype=np.int16)) 149 | 150 | async def _run(self): 151 | async with websockets.connect(self.url, max_size=None) as ws: 152 | try: 153 | first = await asyncio.wait_for(ws.recv(), timeout=1.0) 154 | if not (isinstance(first, (bytes, bytearray)) and first[:1] == b"\x00"): 155 | ws._put_message(first) 156 | except Exception: 157 | pass 158 | await asyncio.gather(self._send(ws), self._recv(ws)) 159 | print("[DONE]", self.inp) 160 | 161 | def run(self): 162 | try: 163 | asyncio.run(self._run()) 164 | except wsex.ConnectionClosedError: 165 | pass 166 | 167 | 168 | def _ws_url(addr: str) -> str: 169 | if "://" in addr: 170 | proto, rest = addr.split("://", 1) 171 | proto = "ws" if proto in {"http", "ws"} else "wss" 172 | return f"{proto}://{rest.rstrip('/')}/api/chat" 173 | if ":" not in addr: 174 | 
addr += ":8998" 175 | return f"ws://{addr}/api/chat" 176 | 177 | 178 | def _input_files() -> List[Path]: 179 | files: List[Path] = [] 180 | for t in tasks: 181 | pattern = root_dir_path / f"{t}/*/{prefix}input.wav" 182 | files += [Path(p) for p in sorted(glob(str(pattern)))] 183 | return files 184 | 185 | 186 | def main(): 187 | ap = argparse.ArgumentParser("moshi_batch_client") 188 | ap.add_argument("--server_ip", required=True, help="host[:port] or http(s):// URL") 189 | args = ap.parse_args() 190 | 191 | url = _ws_url(args.server_ip) 192 | for inp in _input_files(): 193 | out = inp.with_name(inp.name.replace("input.wav", "output.wav")) 194 | if not overwrite and out.exists(): 195 | print("[SKIP]", out) 196 | continue 197 | out.parent.mkdir(parents=True, exist_ok=True) 198 | print("[RUN]", inp) 199 | MoshiFileClient(url, inp, out).run() 200 | 201 | 202 | if __name__ == "__main__": 203 | main() 204 | 205 | -------------------------------------------------------------------------------- /evaluation/grader/llm_grader.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import httpx 5 | import asyncio 6 | import time 7 | import random 8 | from typing import Optional, Tuple 9 | 10 | from .base import BaseAccuracyGrader, GradeLabel, GradeResult 11 | from .prompts import get_accuracy_prompt 12 | 13 | 14 | class LLMAccuracyGrader(BaseAccuracyGrader): 15 | """LLM-backed accuracy grader using Azure OpenAI chat completions. 16 | 17 | Notes: 18 | - Requires environment variables: AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_KEY 19 | - By default uses deployment "gpt-4o" and api-version "2024-10-21" 20 | - Does not stream; single-turn prompt per grading task 21 | """ 22 | 23 | def __init__( 24 | self, 25 | deployment_name: str = "gpt-4o", 26 | api_version: str = "2024-10-21", 27 | temperature: float = 0.0, 28 | max_retries: int = 3, 29 | base_delay: float = 1.0, 30 | ) -> None: 31 | self.azure_endpoint = (os.getenv("AZURE_OPENAI_ENDPOINT") or "").rstrip("/") 32 | self.api_key = os.getenv("AZURE_OPENAI_API_KEY") or os.getenv("AZURE_API_KEY") 33 | self.deployment_name = deployment_name 34 | self.api_version = api_version 35 | self.temperature = temperature 36 | self.max_retries = max_retries 37 | self.base_delay = base_delay 38 | 39 | def _ensure_env(self) -> None: 40 | if not self.azure_endpoint or not self.api_key: 41 | raise RuntimeError( 42 | "LLMAccuracyGrader requires AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_KEY" 43 | ) 44 | 45 | async def _chat(self, system: str, user: str) -> str: 46 | self._ensure_env() 47 | url = f"{self.azure_endpoint}/openai/deployments/{self.deployment_name}/chat/completions" 48 | headers = {"Content-Type": "application/json", "api-key": self.api_key} 49 | params = {"api-version": self.api_version} 50 | payload = { 51 | "messages": [ 52 | {"role": "system", "content": system}, 53 | {"role": "user", "content": user}, 54 | ], 55 | "temperature": self.temperature, 56 | "max_tokens": 512, 57 | } 58 | 59 | last_exception = None 60 | 61 | for attempt in range(self.max_retries + 1): 62 | try: 63 | async with httpx.AsyncClient(timeout=60.0) as client: 64 | r = await client.post(url, headers=headers, params=params, json=payload) 65 | r.raise_for_status() 66 | data = r.json() 67 | return data["choices"][0]["message"]["content"].strip() 68 | 69 | except (httpx.ConnectError, httpx.TimeoutException, httpx.HTTPStatusError) as e: 70 | last_exception = e 71 | 72 | if attempt == self.max_retries: 73 | 
# Last attempt failed, re-raise the exception 74 | break 75 | 76 | # Calculate delay with exponential backoff and jitter 77 | delay = self.base_delay * (2 ** attempt) + random.uniform(0, 1) 78 | 79 | # Special handling for rate limits (429) 80 | if isinstance(e, httpx.HTTPStatusError) and e.response.status_code == 429: 81 | # For rate limits, wait longer 82 | delay = max(delay, 5.0 + random.uniform(0, 5)) 83 | print(f"Rate limit hit, retrying in {delay:.1f}s (attempt {attempt + 1}/{self.max_retries + 1})") 84 | elif isinstance(e, httpx.ConnectError): 85 | print(f"Connection error, retrying in {delay:.1f}s (attempt {attempt + 1}/{self.max_retries + 1})") 86 | elif isinstance(e, httpx.TimeoutException): 87 | print(f"Timeout error, retrying in {delay:.1f}s (attempt {attempt + 1}/{self.max_retries + 1})") 88 | else: 89 | print(f"HTTP error {e.response.status_code if hasattr(e, 'response') else 'unknown'}, retrying in {delay:.1f}s (attempt {attempt + 1}/{self.max_retries + 1})") 90 | 91 | await asyncio.sleep(delay) 92 | 93 | # If we get here, all retries failed 94 | raise last_exception 95 | 96 | def _parse_binary(self, content: str) -> Tuple[Optional[str], Optional[bool], Optional[float], Optional[str]]: 97 | # Very light parsing for fields we care about 98 | extracted = None 99 | correct_flag = None 100 | confidence = None 101 | reasoning = None 102 | for line in content.splitlines(): 103 | l = line.strip() 104 | if l.lower().startswith("extracted_final_answer:"): 105 | extracted = l.split(":", 1)[1].strip() 106 | extracted = None if extracted.lower() == "none" else extracted 107 | elif l.lower().startswith("correct:"): 108 | v = l.split(":", 1)[1].strip().lower() 109 | if v in {"yes", "no"}: 110 | correct_flag = v == "yes" 111 | elif l.lower().startswith("confidence:"): 112 | v = l.split(":", 1)[1].strip().replace("%", "") 113 | try: 114 | confidence = float(v) 115 | except Exception: 116 | confidence = None 117 | elif l.lower().startswith("reasoning:"): 118 | reasoning = l.split(":", 1)[1].strip() 119 | return extracted, correct_flag, confidence, reasoning 120 | 121 | def _parse_triad(self, content: str) -> GradeLabel: 122 | c = content.strip().upper() 123 | if c.startswith("A"): 124 | return GradeLabel.CORRECT 125 | if c.startswith("B"): 126 | return GradeLabel.INCORRECT 127 | if c.startswith("C"): 128 | return GradeLabel.NOT_ATTEMPTED 129 | # default fallback if model deviates 130 | return GradeLabel.INCORRECT 131 | 132 | async def grade_async( 133 | self, 134 | question: str, 135 | ground_truth: str, 136 | predicted_answer: str, 137 | benchmark: Optional[str] = None, 138 | ) -> GradeResult: 139 | prompt = get_accuracy_prompt( 140 | question=question, ground_truth=ground_truth, predicted_answer=predicted_answer, benchmark=benchmark 141 | ) 142 | 143 | # Constrain output: A/B/C only 144 | system = ( 145 | "You are an academic grader. Return only a single capital letter (A/B/C) per instructions." 
146 | ) 147 | 148 | content = await self._chat(system=system, user=prompt) 149 | 150 | # triad mode 151 | label = self._parse_triad(content) 152 | return GradeResult( 153 | label=label, 154 | extracted_final_answer=None, 155 | reasoning=None, 156 | correct_flag=None, 157 | confidence=None, 158 | raw_model_output=content, 159 | ) 160 | 161 | def grade( 162 | self, 163 | question: str, 164 | ground_truth: str, 165 | predicted_answer: str, 166 | benchmark: Optional[str] = None, 167 | ) -> GradeResult: 168 | async def _run(): 169 | return await self.grade_async(question, ground_truth, predicted_answer, benchmark) 170 | try: 171 | return asyncio.run(_run()) 172 | except RuntimeError: 173 | # If already inside an event loop 174 | loop = asyncio.get_event_loop() # type: ignore 175 | return loop.run_until_complete(_run()) 176 | -------------------------------------------------------------------------------- /models/text/gpt5.py: -------------------------------------------------------------------------------- 1 | """ 2 | GPT-5 OpenAI Browse Adapter for VERA 3 | Uses OpenAI Responses API with web_search_preview tool for browsecomp benchmark 4 | """ 5 | 6 | import os 7 | import json 8 | import time 9 | import httpx 10 | from typing import Dict, Any, List 11 | from pathlib import Path 12 | 13 | from ..shared.timing_utils import ( 14 | create_turn_result, 15 | create_standardized_episode_result, 16 | create_standardized_batch_result, 17 | ) 18 | 19 | 20 | class GPT5OpenAIBrowseAdapter: 21 | """OpenAI GPT-5 adapter using web_search_preview for browsecomp.""" 22 | 23 | def __init__(self, api_key: str, api_base: str = "https://api.openai.com", api_version: str = "2025-02-01-preview", reasoning_effort: str = "high", reasoning_summary: str = "detailed"): 24 | self.api_key = api_key 25 | self.api_base = api_base.rstrip('/') 26 | self.api_version = api_version 27 | self.model_name = "gpt-5" 28 | self.reasoning_effort = reasoning_effort 29 | self.reasoning_summary = reasoning_summary 30 | 31 | async def process_episodes_batch(self, episodes: List[Dict[str, Any]], output_dir: str, max_concurrent: int = 16) -> Dict[str, Any]: 32 | print(f"[GPT-5 OpenAI Browse] Batch processing {len(episodes)} episodes (max {max_concurrent} concurrent)") 33 | output_path = Path(output_dir) 34 | output_path.mkdir(parents=True, exist_ok=True) 35 | 36 | start = time.time() 37 | import asyncio 38 | semaphore = asyncio.Semaphore(max_concurrent) 39 | 40 | async def run_one(ep): 41 | async with semaphore: 42 | import asyncio 43 | return await asyncio.to_thread(self.process_episode, ep, output_dir) 44 | 45 | tasks = [run_one(ep) for ep in episodes] 46 | results = await asyncio.gather(*tasks, return_exceptions=True) 47 | processed = [] 48 | for i, r in enumerate(results): 49 | if isinstance(r, Exception): 50 | processed.append({ 51 | 'episode_id': episodes[i].get('id', f'episode_{i}'), 52 | 'error': str(r), 53 | 'success': False 54 | }) 55 | else: 56 | processed.append(r) 57 | 58 | duration = time.time() - start 59 | batch = create_standardized_batch_result( 60 | episodes=processed, 61 | total_time=duration, 62 | model_name=f"{self.model_name}_openai_browse_{self.reasoning_effort}", 63 | metadata={"max_concurrent": max_concurrent}, 64 | ) 65 | batch_file = output_path / f"gpt5_openai_browse_batch_{int(time.time())}.json" 66 | with open(batch_file, 'w') as f: 67 | json.dump(batch, f, indent=2) 68 | print( 69 | f"[GPT-5 OpenAI Browse] Batch completed: " 70 | f"{batch['summary']['successful_episodes']}/{batch['summary']['total_episodes']} 
successful" 71 | ) 72 | return batch 73 | 74 | def process_episode(self, episode_data: Dict[str, Any], output_dir: str) -> Dict[str, Any]: 75 | episode_id = episode_data.get('id', 'unknown') 76 | output_path = Path(output_dir) 77 | output_path.mkdir(parents=True, exist_ok=True) 78 | 79 | session_start = time.time() 80 | turns_results: List[Dict[str, Any]] = [] 81 | total_tokens = 0 82 | 83 | for turn_idx, turn in enumerate(episode_data.get('turns', [])): 84 | if turn.get('role') != 'user': 85 | continue 86 | turn_start = time.time() 87 | prompt = self._prepare_prompt(turn, episode_data, turn_idx) 88 | response_data = self._call_openai_responses(prompt) 89 | turn_end = time.time() 90 | timing = { 91 | "start_time": turn_start, 92 | "end_time": turn_end, 93 | "duration": turn_end - turn_start, 94 | } 95 | 96 | model_metadata = { 97 | 'model': self.model_name, 98 | 'provider': 'openai', 99 | 'response_id': response_data.get('id', '') 100 | } 101 | 102 | error = response_data.get('error') if 'error' in response_data else None 103 | response_text = ( 104 | response_data.get('output', {}).get('content') 105 | if isinstance(response_data.get('output'), dict) 106 | else response_data.get('output', '') 107 | ) or response_data.get('text', '') or '' 108 | 109 | turn_result = create_turn_result( 110 | turn_index=turn_idx, 111 | prompt=prompt, 112 | response=response_text, 113 | timing=timing, 114 | success=(error is None), 115 | error=error, 116 | metadata=model_metadata, 117 | ) 118 | turns_results.append(turn_result) 119 | if not error: 120 | total_tokens += response_data.get('usage', {}).get('total_tokens', 0) 121 | 122 | session_duration = time.time() - session_start 123 | success = all(t.get('success', True) for t in turns_results) 124 | return create_standardized_episode_result( 125 | episode_id=episode_id, 126 | turns=turns_results, 127 | total_time=session_duration, 128 | success=success, 129 | metadata={ 130 | "model_name": f"{self.model_name}_openai_browse", 131 | "total_tokens": total_tokens, 132 | }, 133 | ) 134 | 135 | def _prepare_prompt(self, turn: Dict[str, Any], episode_data: Dict[str, Any], turn_idx: int) -> str: 136 | user_speech = turn.get('text_content', '') 137 | context_docs = episode_data.get('context_documents', []) 138 | parts: List[str] = [] 139 | if context_docs: 140 | parts.append("Context Documents:") 141 | for i, doc in enumerate(context_docs): 142 | parts.append(f"Document {i+1}: {doc.get('content','')}") 143 | parts.append("") 144 | if turn_idx > 0: 145 | parts.append("Previous conversation:") 146 | for prev_idx in range(turn_idx): 147 | pt = episode_data['turns'][prev_idx] 148 | role = pt.get('role') 149 | if role == 'user': 150 | parts.append(f"User: {pt.get('text_content','')}") 151 | elif role == 'assistant': 152 | parts.append(f"Assistant: {pt.get('response','')}") 153 | parts.append("") 154 | parts.append(f"User: {user_speech}") 155 | return "\n".join(parts) 156 | 157 | def _call_openai_responses(self, prompt: str) -> Dict[str, Any]: 158 | url = f"{self.api_base}/v1/responses" 159 | headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"} 160 | params = {"api-version": self.api_version} 161 | payload = { 162 | "input": [{"role": "user", "content": prompt}], 163 | "model": self.model_name, 164 | "tools": [{"type": "web_search_preview", "search_context_size": "high"}], 165 | "truncation": "auto", 166 | "reasoning": {"effort": self.reasoning_effort, "summary": self.reasoning_summary}, 167 | "max_output_tokens": 16384 168 | } 169 | 
try: 170 | with httpx.Client(timeout=180.0) as client: 171 | resp = client.post(url, headers=headers, params=params, json=payload) 172 | resp.raise_for_status() 173 | return resp.json() 174 | except httpx.HTTPStatusError as e: 175 | return {"error": f"HTTP {e.response.status_code}: {e.response.text}", "status_code": e.response.status_code} 176 | except httpx.TimeoutException: 177 | return {"error": "Request timed out"} 178 | except Exception as e: 179 | return {"error": f"Unexpected error: {e}"} 180 | 181 | # Backward-compatible alias for tests and external code 182 | GPT5Adapter = GPT5OpenAIBrowseAdapter 183 | -------------------------------------------------------------------------------- /models/realtime/liveanswer/audio_to_answer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import json 5 | from pathlib import Path 6 | from typing import Tuple, Optional 7 | 8 | from .main import main_request 9 | from .stt_service import AzureSTTService 10 | 11 | 12 | class AudioToAnswer: 13 | """Process audio input to generate audio answer using Azure STT + LiveAnswer.""" 14 | 15 | def __init__(self, 16 | speech_key: Optional[str] = None, 17 | speech_region: Optional[str] = None, 18 | enable_diarization: bool = False): 19 | """ 20 | Initialize the audio-to-answer pipeline. 21 | 22 | Args: 23 | speech_key: Azure Speech API key 24 | speech_region: Azure region 25 | enable_diarization: Whether to use speaker diarization 26 | """ 27 | self.stt_service = AzureSTTService(speech_key, speech_region) 28 | self.enable_diarization = enable_diarization 29 | 30 | def process_audio_file(self, 31 | audio_file_path: str, 32 | output_dir: Optional[str] = None, 33 | verbose: bool = True) -> Tuple[str, bytes, dict]: 34 | """ 35 | Process audio file: STT -> LiveAnswer -> TTS. 36 | 37 | Args: 38 | audio_file_path: Path to input audio file 39 | output_dir: Directory for output files (defaults to current dir) 40 | verbose: Print progress messages 41 | 42 | Returns: 43 | Tuple of (transcript, answer_audio_bytes, metadata) 44 | """ 45 | start_time = time.time() 46 | 47 | # Step 1: Transcribe audio 48 | if verbose: 49 | print(f"[1/3] Transcribing audio file: {audio_file_path}") 50 | 51 | if self.enable_diarization: 52 | transcript, stt_metadata = self.stt_service.transcribe_with_diarization(audio_file_path) 53 | else: 54 | transcript, stt_metadata = self.stt_service.transcribe_file(audio_file_path) 55 | 56 | transcription_time = time.time() - start_time 57 | 58 | if verbose: 59 | print(f" Transcription: '{transcript[:100]}{'...' 
if len(transcript) > 100 else ''}'") 60 | print(f" Time taken: {transcription_time:.2f}s") 61 | 62 | # Step 2: Generate answer 63 | if verbose: 64 | print(f"[2/3] Generating answer...") 65 | 66 | answer_start = time.time() 67 | answer_audio_bytes, time_to_first_response, gpt5_response, groq_explanation = main_request(transcript, audio_file_path) 68 | answer_time = time.time() - answer_start 69 | 70 | if verbose: 71 | print(f" Time to first response: {time_to_first_response:.2f}s") 72 | print(f" Total generation time: {answer_time:.2f}s") 73 | 74 | # Step 3: Save outputs 75 | if output_dir is None: 76 | output_dir = os.getcwd() 77 | else: 78 | os.makedirs(output_dir, exist_ok=True) 79 | 80 | # Save answer audio 81 | timestamp = time.strftime("%Y%m%d_%H%M%S") 82 | answer_path = Path(output_dir) / f"answer_{timestamp}.mp3" 83 | answer_path.write_bytes(answer_audio_bytes) 84 | 85 | # Save transcript 86 | transcript_path = Path(output_dir) / f"transcript_{timestamp}.txt" 87 | transcript_path.write_text(transcript) 88 | 89 | # Save GPT-5 response (raw solver output) 90 | gpt5_response_path = Path(output_dir) / f"gpt5_response_{timestamp}.txt" 91 | gpt5_response_path.write_text(gpt5_response) 92 | 93 | # Save Groq explanation (what was spoken) 94 | groq_explanation_path = Path(output_dir) / f"groq_explanation_{timestamp}.txt" 95 | groq_explanation_path.write_text(groq_explanation) 96 | 97 | # Save detailed timing info 98 | timing_path = Path(output_dir) / f"timing_{timestamp}.json" 99 | timing_data = { 100 | 'time_to_first_audio_chunk': time_to_first_response, 101 | 'transcription_time': transcription_time, 102 | 'answer_generation_time': answer_time, 103 | 'total_processing_time': time.time() - start_time, 104 | 'transcript_length_chars': len(transcript), 105 | 'audio_output_size_bytes': len(answer_audio_bytes), 106 | 'timestamp': timestamp 107 | } 108 | timing_path.write_text(json.dumps(timing_data, indent=2)) 109 | 110 | if verbose: 111 | print(f"[3/3] Outputs saved:") 112 | print(f" Answer audio: {answer_path}") 113 | print(f" Transcript: {transcript_path}") 114 | print(f" GPT-5 response: {gpt5_response_path}") 115 | print(f" Groq explanation: {groq_explanation_path}") 116 | print(f" Timing data: {timing_path}") 117 | 118 | # Compile metadata 119 | metadata = { 120 | 'input_audio': audio_file_path, 121 | 'transcript': transcript, 122 | 'gpt5_response': gpt5_response, 123 | 'groq_explanation': groq_explanation, 124 | 'transcript_length': len(transcript), 125 | 'stt_metadata': stt_metadata, 126 | 'answer_audio_path': str(answer_path), 127 | 'transcript_path': str(transcript_path), 128 | 'gpt5_response_path': str(gpt5_response_path), 129 | 'groq_explanation_path': str(groq_explanation_path), 130 | 'timing_path': str(timing_path), 131 | 'timings': { 132 | 'transcription_time': transcription_time, 133 | 'answer_generation_time': answer_time, 134 | 'time_to_first_response': time_to_first_response, 135 | 'total_time': time.time() - start_time 136 | } 137 | } 138 | 139 | return transcript, answer_audio_bytes, metadata 140 | 141 | def process_audio_stream(self, audio_stream): 142 | """ 143 | Future: Process audio stream in real-time. 144 | Currently not implemented - placeholder for future enhancement. 145 | """ 146 | raise NotImplementedError( 147 | "Real-time audio streaming not yet implemented. " 148 | "Use process_audio_file() for file-based processing." 
149 | ) 150 | 151 | 152 | def main(): 153 | """CLI entry point for audio-to-answer processing.""" 154 | import argparse 155 | import json 156 | 157 | parser = argparse.ArgumentParser(description="Process audio to generate answer") 158 | parser.add_argument("audio_file", help="Path to input audio file") 159 | parser.add_argument("--output-dir", default=None, help="Output directory") 160 | parser.add_argument("--diarization", action="store_true", help="Enable speaker diarization") 161 | parser.add_argument("--save-metadata", action="store_true", help="Save metadata JSON") 162 | parser.add_argument("--quiet", action="store_true", help="Suppress verbose output") 163 | 164 | args = parser.parse_args() 165 | 166 | try: 167 | # Initialize processor 168 | processor = AudioToAnswer(enable_diarization=args.diarization) 169 | 170 | # Process audio 171 | transcript, audio_bytes, metadata = processor.process_audio_file( 172 | audio_file_path=args.audio_file, 173 | output_dir=args.output_dir, 174 | verbose=not args.quiet 175 | ) 176 | 177 | # Optionally save metadata 178 | if args.save_metadata: 179 | metadata_path = Path(args.output_dir or os.getcwd()) / "metadata.json" 180 | # Convert metadata to JSON-serializable format 181 | json_metadata = { 182 | k: v if not isinstance(v, bytes) else f"" 183 | for k, v in metadata.items() 184 | } 185 | metadata_path.write_text(json.dumps(json_metadata, indent=2)) 186 | if not args.quiet: 187 | print(f" Metadata: {metadata_path}") 188 | 189 | if not args.quiet: 190 | print(f"\nProcessing complete! Total time: {metadata['timings']['total_time']:.2f}s") 191 | 192 | except Exception as e: 193 | print(f"Error: {e}", file=sys.stderr) 194 | sys.exit(1) 195 | 196 | 197 | if __name__ == "__main__": 198 | main() -------------------------------------------------------------------------------- /models/realtime/liveanswer/audio.py: -------------------------------------------------------------------------------- 1 | import re 2 | import time 3 | import threading 4 | 5 | import azure.cognitiveservices.speech as speechsdk # type: ignore 6 | 7 | from .utils import _env 8 | from .explain import ExplainSynthesizer 9 | 10 | 11 | class AzureSpeechClient: 12 | def __init__(self) -> None: 13 | self.key = _env("AZURE_SPEECH_KEY") 14 | self.region = _env("AZURE_SPEECH_REGION") 15 | self.voice = _env("AZURE_SPEECH_VOICE", "en-US-JennyNeural") 16 | self.output_format_name = _env("AZURE_SPEECH_FORMAT", "Audio24Khz160KBitRateMonoMp3") 17 | self._sdk_ready = bool(self.key and self.region and speechsdk is not None) 18 | 19 | if self._sdk_ready: 20 | self.speech_config = speechsdk.SpeechConfig(subscription=self.key, region=self.region) 21 | self.speech_config.speech_synthesis_voice_name = self.voice 22 | fmt = getattr(speechsdk.SpeechSynthesisOutputFormat, self.output_format_name) 23 | self.speech_config.set_speech_synthesis_output_format(fmt) 24 | else: 25 | self.speech_config = None 26 | 27 | 28 | class AudioGenerator: 29 | def __init__(self, explainer: ExplainSynthesizer): 30 | self.explainer = explainer 31 | self.azure = AzureSpeechClient() 32 | self.all_sound = bytearray() 33 | self._stop_event = threading.Event() 34 | self.request_start_time: float = time.monotonic() 35 | self.start_time: float = 0.0 36 | self._stream_req = None 37 | self._generated_seconds = 0.0 38 | self._bitrate_bps = self._guess_bitrate(self.azure.output_format_name) 39 | 40 | @staticmethod 41 | def _guess_bitrate(format_name: str) -> float: 42 | if not format_name: 43 | return 160_000.0 44 | match = 
re.search(r"(\d+)KBitRate", format_name) 45 | if match: 46 | return float(match.group(1)) * 1000.0 47 | sr_match = re.search(r"Audio(\d+)Khz", format_name) 48 | if sr_match: 49 | sample_rate = float(sr_match.group(1)) * 1000.0 50 | bit_depth = 16.0 51 | if "8Bit" in format_name: 52 | bit_depth = 8.0 53 | elif "24Bit" in format_name: 54 | bit_depth = 24.0 55 | return sample_rate * bit_depth 56 | return 160_000.0 57 | 58 | def _update_generated_seconds(self, byte_count: int) -> None: 59 | if byte_count <= 0: 60 | return 61 | bitrate = self._bitrate_bps or 160_000.0 62 | self._generated_seconds += (byte_count * 8.0) / bitrate 63 | 64 | def _watch_need_more_explanation(self) -> None: 65 | if not self.explainer.spoken_explanation: 66 | # first = self.explainer.pop_more_explanation(max_token=80) 67 | # first = self.explainer.pop_more_explanation(max_token=64) 68 | # first = self.explainer.pop_more_explanation(max_token=32) 69 | first = self.explainer.pop_more_explanation() 70 | print(f"!!!AudioGen: First chunk from explainer: '{first[:100]}...' ({len(first) if first else 0} chars)") 71 | if first and self._stream_req is not None: 72 | print(f"!!!AudioGen: Writing first chunk to TTS stream") 73 | self._stream_req.input_stream.write(first) 74 | 75 | # time_margin = 10.0 76 | time_margin = 10.0 77 | while not self._stop_event.is_set(): 78 | if self.start_time == 0.0: 79 | elapsed = 0.0 80 | else: 81 | elapsed = time.monotonic() - self.start_time 82 | total_estimated = self._generated_seconds 83 | remaining = total_estimated - elapsed 84 | print(f"!!!total_estimated: {total_estimated}, elapsed: {elapsed}, remaining: {remaining}") 85 | 86 | if remaining <= time_margin: 87 | more = self.explainer.pop_more_explanation() 88 | print(f"!!!AudioGen: Got more chunk: '{more[:100] if more else None}...' 
({len(more) if more else 0} chars)") 89 | if more is not None: 90 | if more and self._stream_req is not None: 91 | print(f"!!!AudioGen: Writing more chunk to TTS stream") 92 | self._stream_req.input_stream.write(more) 93 | else: 94 | print(f"!!!AudioGen: No more chunks, closing TTS stream") 95 | if self._stream_req is not None: 96 | self._stream_req.input_stream.close() 97 | return 98 | 99 | time.sleep(max(0.5, remaining - time_margin)) 100 | 101 | def start(self) -> tuple[bytes, float]: 102 | self.all_sound.clear() 103 | self._generated_seconds = 0.0 104 | self._stop_event.clear() 105 | self.start_time = 0.0 # Reset start time 106 | 107 | if not getattr(self.azure, "_sdk_ready", False): 108 | raise RuntimeError("Azure Speech SDK or credentials not available for streaming synthesis") 109 | 110 | try: 111 | region = self.azure.region 112 | key = self.azure.key 113 | voice = self.azure.voice 114 | 115 | tts_endpoint = f"wss://{region}.tts.speech.microsoft.com/cognitiveservices/websocket/v2" 116 | cfg = speechsdk.SpeechConfig(endpoint=tts_endpoint, subscription=key) 117 | cfg.speech_synthesis_voice_name = voice 118 | fmt = getattr(speechsdk.SpeechSynthesisOutputFormat, self.azure.output_format_name) 119 | cfg.set_speech_synthesis_output_format(fmt) 120 | cfg.set_property(speechsdk.PropertyId.SpeechSynthesis_RtfTimeoutThreshold, "4") 121 | cfg.set_property(speechsdk.PropertyId.SpeechSynthesis_FrameTimeoutInterval, str(int(60*1000))) # 60s 122 | 123 | req = speechsdk.SpeechSynthesisRequest(speechsdk.SpeechSynthesisRequestInputType.TextStream) 124 | self._stream_req = req 125 | synth = speechsdk.SpeechSynthesizer(speech_config=cfg, audio_config=None) 126 | 127 | def on_synthesizing(evt): 128 | if self.start_time == 0.0: 129 | self.start_time = time.monotonic() 130 | data_bytes = evt.result.audio_data 131 | if data_bytes: 132 | self.all_sound.extend(data_bytes) 133 | self._update_generated_seconds(len(data_bytes)) 134 | 135 | def on_synthesis_started(evt): 136 | print(f"!!!TTS: synthesis started") 137 | def on_synthesis_completed(evt): 138 | print(f"!!!TTS: synthesis completed") 139 | def on_synthesis_canceled(evt): 140 | print(f"!!!TTS: synthesis canceled - {evt}") 141 | def on_synthesis_error(evt): 142 | print(f"!!!TTS: synthesis error - {evt}") 143 | 144 | synth.synthesizing.connect(on_synthesizing) 145 | synth.synthesis_started.connect(on_synthesis_started) 146 | synth.synthesis_completed.connect(on_synthesis_completed) 147 | synth.synthesis_canceled.connect(on_synthesis_canceled) 148 | # Note: synthesis_error might not exist in all SDK versions 149 | 150 | fut = synth.speak_async(req) 151 | 152 | t_watcher = threading.Thread(target=self._watch_need_more_explanation, name="watcher", daemon=True) 153 | t_watcher.start() 154 | 155 | r = fut.get() 156 | if r.reason == speechsdk.ResultReason.SynthesizingAudioCompleted: 157 | print(f"!!!synthesis completed") 158 | elif r.reason == speechsdk.ResultReason.Canceled: 159 | print(f"!!!synthesis canceled: {r.cancellation_details.reason}, {r.cancellation_details.error_details}") 160 | else: 161 | print(f"!!!synthesis failed: {r.reason}") 162 | t_watcher.join() 163 | finally: 164 | self._stop_event.set() 165 | self._stream_req = None 166 | 167 | # Calculate time to first response, with fallback if TTS never started 168 | if self.start_time > 0.0: 169 | time_to_first_response = self.start_time - self.request_start_time 170 | else: 171 | # TTS never started, use current time as fallback 172 | time_to_first_response = time.monotonic() - 
self.request_start_time 173 | print(f"!!!AudioGen: TTS never started, using fallback timing: {time_to_first_response:.2f}s") 174 | 175 | return bytes(self.all_sound), time_to_first_response 176 | -------------------------------------------------------------------------------- /LICENSES/Boson-Higgs-Audio-2-Community-License.txt: -------------------------------------------------------------------------------- 1 | BOSON HIGGS AUDIO 2 COMMUNITY LICENSE AGREEMENT 2 | 3 | Boson Higgs Audio 2 Version Release Date: June 20, 2025 4 | 5 | This License Agreement (the “Agreement”) is entered into by and between Licensee (as defined below) and Boson AI USA, Inc. (“Boson”) and is based upon the Meta Llama 3 Community License Agreement as of April 18, 2024 (the “Meta License Agreement”), which can be found at https://llama.meta.com/llama3/license/. The terms and conditions of the Meta License Agreement are hereby incorporated herein by reference and Unless stated otherwise below, its terms apply. The Higgs Audio 2 model developed by Boson AI USA, Inc. (“Higgs Materials”) is an audio model derived from Meta Llama 3 software and algorithms. 6 | 7 | “Agreement” means the terms and conditions for use, reproduction, distribution and modification of the Higgs Materials set forth herein and the Meta License Agreement. 8 | 9 | “Licensee” or “you” means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity’s behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering into this Agreement on their behalf. 10 | 11 | “Higgs Audio 2” means the foundational large audio language models and software and algorithms, including machine-learning model code, trained model weights, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing developed by Boson AI distributed at https://github.com/boson-ai/boson-multimodal or otherwise. 12 | “Higgs Materials” means, collectively, Boson’s proprietary modification of Meta Llama 3 and Documentation (and any portion thereof) made available under this Agreement. 13 | 14 | “Boson” or “we” means Boson AI USA, Inc. 15 | 16 | By clicking “I Accept” below or by using or distributing any portion or element of the Higgs Materials, you agree to be bound by this Agreement. 17 | 18 | 1. License Rights and Redistribution. 19 | a. Grant of Rights. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Boson’s intellectual property or other rights owned by Boson embodied in the Higgs Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Higgs Materials. 20 | b. Redistribution and Use. 21 | i. If you distribute or make available the Higgs Materials (or any derivative works thereof), or a product or service that uses any of them, including another AI model, you shall (A) provide a copy of this Agreement and the of Meta License ’s Llama 3 agreement with any such Higgs Materials; and (B) prominently display “Built with Higgs Materials licensed from Boson AI USA, Inc., Copyright Boson AI USA, Inc., All Rights Reserved and Meta Llama 3 licensed under the Meta Llama 3 Community License, Copyright Meta Platforms, Inc., All Right Reserved". based on Meta Llama 3” on a related website, user interface, blogpost, about page, or product documentation. 
If you use the Higgs Materials to create, modify, enhance, train, fine tune, or otherwise improve an AI model or similar software, which is distributed or made available, you shall also include “Higgs Audio 2” at the beginning of any such AI model or software name. 22 | ii. Even if you receive Higgs Materials, or any modifications, enhancements or derivative works thereof, from a Licensee as part of an integrated end user product, then Section 2 of this Agreement will apply to you. 23 | iii. You must retain in all copies of the Llama Materials that you distribute and as set forth above, include the following attribution notice within a “Notice” text file distributed as a part of such copies: 24 | “Meta Llama 3 is licensed under the Meta Llama 3 Community License, Copyright © Meta Platforms, Inc. All Rights Reserved.” 25 | “Boson Higgs Audio 2 is licensed under the Boson Community License, Copyright © Boson AI USA, Inc. All Rights Reserved.” 26 | iv. Your use of the Higgs Materials must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Llama Materials (available at https://llama.meta.com/llama3/use-policy), which is hereby incorporated by reference into this Agreement. 27 | v. You will not use the Higgs Materials or any output or results of the Higgs Materials to improve any other large language model (excluding Boson Higgs Audio 2 or derivative works thereof). 28 | vi. You hereby acknowledge that Boson is the owner of the Higgs Materials and under no circumstance shall you bring any legal action, claim, charge, demand challenging such ownership rights of Boson. 29 | 30 | 2. Additional Commercial Terms. If the annual active users of the products or services made available by or for Licensee, or Licensee’s affiliates, is greater than 100,000 annual active users in the preceding calendar year, you must request an expanded license from Boson AI, which Boson AI may grant to you in its sole discretion, and you are not authorized to exercise any of the rights under this Agreement unless or until Boson AI otherwise expressly grants you such rights. 31 | 32 | 3. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE Higgs Materials AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN “AS IS” BASIS, WITH ALL FAULTS, WITHOUT WARRANTIES OF ANY KIND EXPRESS, IMPLIED, BASED UPON CUSTOM AND USAGE OR COURSE OF DEALING, AND BOSON AI DISCLAIMS ALL WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE HIGGS MATERIALS AND ASSUME ANY AND ALL RISKS ASSOCIATED WITH YOUR USE OF THE HIGGS MATERIALS AND ANY OUTPUT AND RESULTS. 33 | 34 | 4. Limitation of Liability. IN NO EVENT WILL BOSON AI OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF BOSON, META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING. 35 | 36 | 5. Intellectual Property. 37 | a. 
No trademark licenses are granted under this Agreement, or in connection with the Higgs Materials., nNeither Boson nor Licensee may use any name or mark owned by, or associated with, the other party hereto or any of its affiliates, except as required for reasonable and customary use in describing and redistributing the Higgs Materials or as set forth in this Section 5(a). Boson hereby grants you a license to use “Higgs Audio 2” (the “Mark”) solely as required to comply with the last sentence of Section 1.b.i. All goodwill arising out of your use of the Mark will inure to the benefit of Meta and Boson AI. 38 | b. Subject to Boson’s ownership of the Higgs Materials and derivatives made by or for Boson AI, with respect to any derivative works and modifications of the Higgs Materials that are made by you, as between you and Boson AI, you are and will be the owner of such derivative works and modifications. 39 | c. If you institute litigation or other proceedings against Boson AI, Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Higgs Materials or Boson Higgs Audio 2 outputs or results, or any portion thereof any of the foregoing, constitutes infringement of the intellectual property or other rights owned or licensable by you, then any licenses granted to you hereunder this Agreement shall immediately terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Boson AI from and against any claim, charge, demand, cause of action by any third party arising out of or related to your use or distribution of the Higgs Materials. 40 | 41 | 6. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Higgs Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Boson AI may terminate this Agreement if you are in breach of any term or condition of this Agreement by providing you with written notice. Upon your receipt of written notice of termination of this Agreement, you shall delete the Higgs Materials from any computer, server or IT device and cease use of the Higgs Materials in all respects. Sections 1(b)(vi), 3, 4 and 7 shall survive the termination of this Agreement. 42 | 43 | 7. Governing Law and Jurisdiction. This Agreement will be governed and construed under the laws of the State of California without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The federal courts in the Northern District of California and the state courts in Santa Clara County, California shall have exclusive jurisdiction of any dispute arising out of this Agreement. 
44 | -------------------------------------------------------------------------------- /models/text/gpt4o.py: -------------------------------------------------------------------------------- 1 | """ 2 | GPT-4o OpenAI Browse Adapter for VERA 3 | Uses OpenAI Responses API with web_search_preview tool for browsecomp benchmark 4 | """ 5 | 6 | import os 7 | import json 8 | import time 9 | import httpx 10 | from typing import Dict, Any, List 11 | from pathlib import Path 12 | 13 | from ..shared.timing_utils import ( 14 | create_turn_result, 15 | create_standardized_episode_result, 16 | create_standardized_batch_result, 17 | ) 18 | from ..shared.base_adapter import TextAdapter, ModelConfig 19 | 20 | 21 | class GPT4oOpenAIBrowseAdapter(TextAdapter): 22 | """OpenAI GPT-4o adapter using web_search_preview for browsecomp.""" 23 | 24 | def __init__(self, api_key: str, api_base: str = "https://api.openai.com", api_version: str = "2025-02-01-preview"): 25 | config = ModelConfig(model_name="gpt-4o") 26 | super().__init__(config, api_key) 27 | self.api_base = api_base.rstrip('/') 28 | self.api_version = api_version 29 | 30 | async def process_episodes_batch(self, episodes: List[Dict[str, Any]], output_dir: str, max_concurrent: int = 16) -> Dict[str, Any]: 31 | """Batch process episodes concurrently.""" 32 | print(f"[GPT-4o OpenAI Browse] Batch processing {len(episodes)} episodes (max {max_concurrent} concurrent)") 33 | output_path = Path(output_dir) 34 | output_path.mkdir(parents=True, exist_ok=True) 35 | 36 | start = time.time() 37 | import asyncio 38 | semaphore = asyncio.Semaphore(max_concurrent) 39 | 40 | async def run_one(ep): 41 | async with semaphore: 42 | import asyncio 43 | return await asyncio.to_thread(self.process_episode, ep, output_dir) 44 | 45 | tasks = [run_one(ep) for ep in episodes] 46 | results = await asyncio.gather(*tasks, return_exceptions=True) 47 | processed = [] 48 | for i, r in enumerate(results): 49 | if isinstance(r, Exception): 50 | processed.append({ 51 | 'episode_id': episodes[i].get('id', f'episode_{i}'), 52 | 'error': str(r), 53 | 'success': False 54 | }) 55 | else: 56 | processed.append(r) 57 | 58 | duration = time.time() - start 59 | batch = create_standardized_batch_result( 60 | episodes=processed, 61 | total_time=duration, 62 | model_name=f"{self.model_name}_openai_browse", 63 | metadata={"max_concurrent": max_concurrent}, 64 | ) 65 | batch_file = output_path / f"gpt4o_openai_browse_batch_{int(time.time())}.json" 66 | with open(batch_file, 'w') as f: 67 | json.dump(batch, f, indent=2) 68 | print( 69 | f"[GPT-4o OpenAI Browse] Batch completed: " 70 | f"{batch['summary']['successful_episodes']}/{batch['summary']['total_episodes']} successful" 71 | ) 72 | return batch 73 | 74 | def process_episode(self, episode_data: Dict[str, Any], output_dir: str) -> Dict[str, Any]: 75 | episode_id = episode_data.get('id', 'unknown') 76 | output_path = Path(output_dir) 77 | output_path.mkdir(parents=True, exist_ok=True) 78 | 79 | session_start = time.time() 80 | turns_results: List[Dict[str, Any]] = [] 81 | total_tokens = 0 82 | 83 | for turn_idx, turn in enumerate(episode_data.get('turns', [])): 84 | if turn.get('role') != 'user': 85 | continue 86 | turn_start = time.time() 87 | prompt = self._prepare_prompt(turn, episode_data, turn_idx) 88 | response_data = self._call_openai_responses(prompt) 89 | turn_end = time.time() 90 | timing = { 91 | "start_time": turn_start, 92 | "end_time": turn_end, 93 | "duration": turn_end - turn_start, 94 | } 95 | 96 | model_metadata = { 97 | 'model': 
self.model_name, 98 | 'provider': 'openai', 99 | 'response_id': response_data.get('id', '') 100 | } 101 | 102 | error = response_data.get('error') if 'error' in response_data else None 103 | # Extract a plain text response if available 104 | response_text = ( 105 | response_data.get('output', {}).get('content') 106 | if isinstance(response_data.get('output'), dict) 107 | else response_data.get('output', '') 108 | ) or response_data.get('text', '') or '' 109 | 110 | turn_result = create_turn_result( 111 | turn_index=turn_idx, 112 | prompt=prompt, 113 | response=response_text, 114 | timing=timing, 115 | success=(error is None), 116 | error=error, 117 | metadata=model_metadata, 118 | ) 119 | turns_results.append(turn_result) 120 | if not error: 121 | total_tokens += response_data.get('usage', {}).get('total_tokens', 0) 122 | 123 | session_duration = time.time() - session_start 124 | success = all(t.get('success', True) for t in turns_results) 125 | return create_standardized_episode_result( 126 | episode_id=episode_id, 127 | turns=turns_results, 128 | total_time=session_duration, 129 | success=success, 130 | metadata={ 131 | "model_name": f"{self.model_name}_openai_browse", 132 | "total_tokens": total_tokens, 133 | }, 134 | ) 135 | 136 | def _prepare_prompt(self, turn: Dict[str, Any], episode_data: Dict[str, Any], turn_idx: int) -> str: 137 | user_speech = turn.get('text_content', '') 138 | context_docs = episode_data.get('context_documents', []) 139 | parts: List[str] = [] 140 | if context_docs: 141 | parts.append("Context Documents:") 142 | for i, doc in enumerate(context_docs): 143 | parts.append(f"Document {i+1}: {doc.get('content','')}") 144 | parts.append("") 145 | if turn_idx > 0: 146 | parts.append("Previous conversation:") 147 | for prev_idx in range(turn_idx): 148 | pt = episode_data['turns'][prev_idx] 149 | role = pt.get('role') 150 | if role == 'user': 151 | parts.append(f"User: {pt.get('text_content','')}") 152 | elif role == 'assistant': 153 | parts.append(f"Assistant: {pt.get('response','')}") 154 | parts.append("") 155 | parts.append(f"User: {user_speech}") 156 | return "\n".join(parts) 157 | 158 | def _make_api_request(self, messages: List[Dict[str, str]], **kwargs) -> str: 159 | """Make API request to OpenAI GPT-4o""" 160 | if len(messages) == 1 and messages[0].get("role") == "user": 161 | prompt = messages[0]["content"] 162 | else: 163 | # Convert messages to prompt format 164 | prompt_parts = [] 165 | for msg in messages: 166 | role = msg.get("role", "user") 167 | content = msg.get("content", "") 168 | if role == "user": 169 | prompt_parts.append(f"User: {content}") 170 | elif role == "assistant": 171 | prompt_parts.append(f"Assistant: {content}") 172 | prompt = "\n".join(prompt_parts) 173 | 174 | response_data = self._call_openai_responses(prompt) 175 | if "error" in response_data: 176 | raise Exception(response_data["error"]) 177 | 178 | return response_data.get("output", {}).get("content", "") 179 | 180 | def _call_openai_responses(self, prompt: str) -> Dict[str, Any]: 181 | url = f"{self.api_base}/v1/responses" 182 | headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"} 183 | params = {"api-version": self.api_version} 184 | payload = { 185 | "input": [{"role": "user", "content": prompt}], 186 | "model": self.model_name, 187 | "tools": [{"type": "web_search_preview", "search_context_size": "high"}], 188 | "truncation": "auto", 189 | "max_output_tokens": 8192 190 | } 191 | try: 192 | with httpx.Client(timeout=120.0) as client: 193 | 
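# Note: callers (process_episode, _make_api_request) only read a handful of fields
# from the JSON returned below -- 'id', 'output' (a dict with 'content', or raw text),
# 'text', 'usage.total_tokens', and an 'error' key added by the except branches.
# A minimal illustrative payload satisfying those reads (assumed shape, not the full
# Responses API schema) would be:
#   {"id": "resp_...", "output": {"content": "..."}, "usage": {"total_tokens": 123}}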
resp = client.post(url, headers=headers, params=params, json=payload) 194 | resp.raise_for_status() 195 | return resp.json() 196 | except httpx.HTTPStatusError as e: 197 | return {"error": f"HTTP {e.response.status_code}: {e.response.text}", "status_code": e.response.status_code} 198 | except httpx.TimeoutException: 199 | return {"error": "Request timed out"} 200 | except Exception as e: 201 | return {"error": f"Unexpected error: {e}"} 202 | -------------------------------------------------------------------------------- /models/realtime/liveanswer/explain.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Optional, List 3 | 4 | import requests # type: ignore 5 | 6 | from .utils import _env 7 | 8 | 9 | class ExplainSynthesizer: 10 | """ 11 | Incrementally produces spoken explanation text by calling Groq for continuation 12 | (assistant prefill style). 13 | """ 14 | 15 | def __init__(self, request: str): 16 | self.request: str = request 17 | self.all_thought: List[Optional[str]] = [] 18 | self.spoken_explanation: str = "" 19 | self.finished: bool = False 20 | 21 | # Track consecutive dummy explanations to prevent infinite loops 22 | # Give generous tolerance since thinking model may be slow 23 | self._consecutive_dummy_count: int = 0 24 | self._max_consecutive_dummy: int = 10 # Allow up to 10 dummy responses (thinking time) 25 | 26 | self._last_groq_messages: List[dict] = [] 27 | self._last_groq_response: Optional[str] = None 28 | 29 | def push_thought(self, s: Optional[str]) -> None: 30 | self.all_thought.append(s) 31 | 32 | def _groq_chat_completion(self, messages: List[dict], max_tokens: int) -> str: 33 | if self._last_groq_messages == messages: 34 | return self._last_groq_response 35 | self._last_groq_messages = messages 36 | self._last_groq_response = self.__groq_chat_completion(messages, max_tokens) 37 | return self._last_groq_response 38 | 39 | def __groq_chat_completion(self, messages: List[dict], max_tokens: int) -> str: 40 | api_key = _env("GROQ_API_KEY") 41 | if not api_key: 42 | raise RuntimeError("GROQ_API_KEY missing") 43 | 44 | groq_base = _env("GROQ_ENDPOINT", "https://api.groq.com/openai/v1") 45 | url = f"{groq_base.rstrip('/')}" \ 46 | f"/chat/completions" 47 | model = _env("GROQ_MODEL", "llama-3.3-70b-versatile") 48 | print(f"!!!Groq API: model={model}, max_tokens={max_tokens}") 49 | 50 | headers = { 51 | "Authorization": f"Bearer {api_key}", 52 | "Content-Type": "application/json", 53 | } 54 | # Increase temperature for Template3 to encourage more generation 55 | temperature = 0.9 if max_tokens > 1000 else 0.7 56 | 57 | payload = { 58 | "model": model, 59 | "messages": messages, 60 | "temperature": temperature, 61 | "max_completion_tokens": max_tokens, 62 | } 63 | 64 | # Increase timeout for longer responses 65 | timeout = 60 if max_tokens > 1000 else 30 66 | try: 67 | resp = requests.post(url, headers=headers, data=json.dumps(payload), timeout=timeout) 68 | resp.raise_for_status() 69 | data = resp.json() 70 | content = data["choices"][0]["message"]["content"].strip() 71 | finish_reason = data["choices"][0].get("finish_reason", "unknown") 72 | print(f"!!!Groq API response: finish_reason={finish_reason}, content_length={len(content)}") 73 | return content 74 | except Exception as e: 75 | print(f"!!!Groq API error: {e}") 76 | raise 77 | 78 | def pop_more_explanation(self, max_token: int = 32) -> Optional[str]: 79 | if self.finished: 80 | print(f"!!!pop_more_explanation: Already finished, returning 
None") 81 | return None 82 | 83 | has_any_thought = len(self.all_thought) > 0 84 | last_thought = self.all_thought[-1] if has_any_thought else None 85 | non_none_thoughts = [t for t in self.all_thought if t is not None] 86 | all_thought_text = (" ".join(non_none_thoughts)).strip() 87 | 88 | print(f"!!!pop_more_explanation: has_any_thought={has_any_thought}, last_thought={'None' if last_thought is None else 'Some'}, len(all_thought)={len(self.all_thought)}") 89 | 90 | # Use different system prompt for Template3 (finalization) to encourage comprehensive response 91 | if last_thought is None and has_any_thought: 92 | # Template3: Need comprehensive final explanation 93 | system_prompt = ( 94 | "You are a thorough, clear explainer providing a complete final explanation. " 95 | "Generate natural spoken-style text that fully explains the solution. " 96 | "Write exactly as if spoken aloud. Avoid symbols, equations, code fences, or special characters; " 97 | "use plain words instead. Express relations in words (e.g., x=y -> 'x equals y'). " 98 | "Provide a COMPLETE and COMPREHENSIVE explanation. Do not be too concise - be thorough." 99 | ) 100 | else: 101 | # Template1 and Template2: Regular concise style 102 | system_prompt = ( 103 | "You are a concise, clear explainer. Generate natural spoken-style text. " 104 | "Avoid lists unless necessary. Keep continuity with the prior assistant text. " 105 | "Write exactly as if spoken aloud. Avoid symbols, equations, code fences, or special characters; " 106 | "use plain words instead. Express relations in words (e.g., x=y -> 'x equals y'). Keep punctuation minimal and natural." 107 | "Use short sentences and phrases if possible. Avoid long sentences and paragraphs." 108 | ) 109 | 110 | assistant_prefill = self.spoken_explanation.strip() 111 | final_answer = False 112 | if not has_any_thought: 113 | # Template1: no solver thoughts yet → confirm + typically how to proceed 114 | print(f"!!!Using Template1: No solver thoughts yet") 115 | user_template = ( 116 | "Begin the spoken explanation. Start with a very brief rephrase of the user's " 117 | "request (one short sentence) to confirm understanding, then briefly state what you would " 118 | "typically do to approach it, and continue naturally. Do not include any disclaimers about inability or limitations. " 119 | "Avoid lists unless necessary; keep it concise and fluid.\n\n" 120 | f"User request: {self.request}" 121 | ) 122 | elif last_thought is None: 123 | # Template3: finalization 124 | max_token = 2048 # Increase token budget for comprehensive explanation 125 | final_answer = True 126 | print(f"!!!Using Template3: Finalization with max_token={max_token}") 127 | print(f"!!!All solver thoughts collected: {len(non_none_thoughts)} thoughts, {len(all_thought_text)} chars") 128 | user_template = ( 129 | "The conversation is concluding. Please provide a COMPREHENSIVE and DETAILED final explanation that:\n" 130 | "1. Fully explains the solution approach and reasoning\n" 131 | "2. Clearly states the final answer\n" 132 | "3. Explains WHY this answer is correct\n" 133 | "4. Should be at least 3-4 paragraphs long for completeness\n" 134 | "Continue from where you left off, but ensure the explanation is thorough and complete. 
" 135 | "Do not stop until you have fully explained the solution.\n\n" 136 | + (f"All solver thoughts so far: {all_thought_text}\n\n" if all_thought_text else "") 137 | + f"User request: {self.request}" 138 | ) 139 | else: 140 | # Template2: ongoing with accumulated thoughts 141 | print(f"!!!Using Template2: Ongoing with {len(non_none_thoughts)} thoughts") 142 | user_template = ( 143 | "Continue the spoken explanation naturally. Keep it fluid and avoid abrupt topic jumps. " 144 | "Be sure to include all latest updates from the accumulated reasoning (all_thought_text) as quickly as possible.\n\n" 145 | + (f"Use the overall reasoning so far: {all_thought_text}\n\n" if all_thought_text else "") 146 | + f"User request: {self.request}" 147 | ) 148 | 149 | messages = [ 150 | {"role": "system", "content": system_prompt}, 151 | {"role": "user", "content": user_template}, 152 | ] 153 | 154 | # Use TTS-friendly filler text instead of newlines 155 | dumb_explanation = "I'm still thinking about this problem. Let me work through the details." 156 | def remove_dumb_words(s: str) -> str: 157 | return s.replace(dumb_explanation, "") 158 | 159 | if assistant_prefill: 160 | messages.append({"role": "assistant", "content": remove_dumb_words(assistant_prefill)}) 161 | 162 | print(f"!!!max_token: {max_token}") 163 | print(f"!!!Calling Groq with messages count: {len(messages)}") 164 | chunk = self._groq_chat_completion(messages=messages, max_tokens=max_token) 165 | print(f"!!!Groq returned chunk length: {len(chunk)} chars") 166 | if has_any_thought and last_thought is None: 167 | print(f"!!!Setting finished=True (Template3 completed)") 168 | self.finished = True 169 | if not chunk.strip(): 170 | if not final_answer: 171 | self._consecutive_dummy_count += 1 172 | print(f"!!!Empty chunk returned ({self._consecutive_dummy_count}/{self._max_consecutive_dummy}), using dumb explanation") 173 | 174 | # Only stop if we've exceeded the maximum dummy responses 175 | if self._consecutive_dummy_count >= self._max_consecutive_dummy: 176 | print(f"!!!Too many consecutive dummy responses ({self._max_consecutive_dummy}), ending conversation") 177 | self.finished = True 178 | return None 179 | 180 | chunk = dumb_explanation 181 | else: 182 | # For final answer (Template3), if we get empty response, just end cleanly 183 | print(f"!!!Empty chunk in final answer - ending conversation") 184 | self.finished = True 185 | return None # Signal end of conversation 186 | else: 187 | # Reset counter when we get a real response 188 | if self._consecutive_dummy_count > 0: 189 | print(f"!!!Got real response, resetting dummy counter from {self._consecutive_dummy_count}") 190 | self._consecutive_dummy_count = 0 191 | 192 | print(f"!!!chunk: {chunk[:200]}..." if len(chunk) > 200 else f"!!!chunk: {chunk}") 193 | 194 | # if self.spoken_explanation and not self.spoken_explanation.endswith(" "): 195 | # self.spoken_explanation += " " 196 | self.spoken_explanation += chunk 197 | return chunk 198 | -------------------------------------------------------------------------------- /LICENSES/Meta-Llama-3-Community-License.txt: -------------------------------------------------------------------------------- 1 | META LLAMA 3 COMMUNITY LICENSE AGREEMENT 2 | 3 | Meta Llama 3 Version Release Date: April 18, 2024 4 | “Agreement” means the terms and conditions for use, reproduction, distribution and modification of the Llama Materials set forth herein. 
5 | 6 | “Documentation” means the specifications, manuals and documentation accompanying Meta Llama 3 distributed by Meta at https://llama.meta.com/get-started/. 7 | 8 | “Licensee” or “you” means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity’s behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf. 9 | 10 | “Meta Llama 3” means the foundational large language models and software and algorithms, including machine-learning model code, trained model weights, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing distributed by Meta at https://llama.meta.com/llama-downloads. 11 | 12 | “Llama Materials” means, collectively, Meta’s proprietary Meta Llama 3 and Documentation (and any portion thereof) made available under this Agreement. 13 | 14 | “Meta” or “we” means Meta Platforms Ireland Limited (if you are located in or, if you are an entity, your principal place of business is in the EEA or Switzerland) and Meta Platforms, Inc. (if you are located outside of the EEA or Switzerland). 15 | 16 | By clicking “I Accept” below or by using or distributing any portion or element of the Llama Materials, you agree to be bound by this Agreement. 17 | 18 | 1. License Rights and Redistribution. 19 | 20 | a. Grant of Rights. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Meta’s intellectual property or other rights owned by Meta embodied in the Llama Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Llama Materials. 21 | b. Redistribution and Use. 22 | i. If you distribute or make available the Llama Materials (or any derivative works thereof), or a product or service that uses any of them, including another AI model, you shall (A) provide a copy of this Agreement with any such Llama Materials; and (B) prominently display “Built with Meta Llama 3” on a related website, user interface, blogpost, about page, or product documentation. If you use the Llama Materials to create, train, fine tune, or otherwise improve an AI model, which is distributed or made available, you shall also include “Llama 3” at the beginning of any such AI model name. 23 | ii. If you receive Llama Materials, or any derivative works thereof, from a Licensee as part of an integrated end user product, then Section 2 of this Agreement will not apply to you. 24 | iii. You must retain in all copies of the Llama Materials that you distribute the following attribution notice within a “Notice” text file distributed as a part of such copies: “Meta Llama 3 is licensed under the Meta Llama 3 Community License, Copyright © Meta Platforms, Inc. All Rights Reserved.” 25 | iv. Your use of the Llama Materials must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Llama Materials (available at https://llama.meta.com/llama3/use-policy), which is hereby incorporated by reference into this Agreement. 26 | v. You will not use the Llama Materials or any output or results of the Llama Materials to improve any other large language model (excluding Meta Llama 3 or derivative works thereof). 27 | 28 | 2. Additional Commercial Terms. 
If, on the Meta Llama 3 version release date, the monthly active users of the products or services made available by or for Licensee, or Licensee’s affiliates, is greater than 700 million monthly active users in the preceding calendar month, you must request a license from Meta, which Meta may grant to you in its sole discretion, and you are not authorized to exercise any of the rights under this Agreement unless or until Meta otherwise expressly grants you such rights. 29 | 30 | 3. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE LLAMA MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, AND META DISCLAIMS ALL WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE LLAMA MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE LLAMA MATERIALS AND ANY OUTPUT AND RESULTS. 31 | 32 | 4. Limitation of Liability. IN NO EVENT WILL META OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING. 33 | 34 | 5. Intellectual Property. 35 | a. No trademark licenses are granted under this Agreement, and in connection with the Llama Materials, neither Meta nor Licensee may use any name or mark owned by or associated with the other or any of its affiliates, except as required for reasonable and customary use in describing and redistributing the Llama Materials or as set forth in this Section 5(a). Meta hereby grants you a license to use “Llama 3” (the “Mark”) solely as required to comply with the last sentence of Section 1.b.i. You will comply with Meta’s brand guidelines (currently accessible at https://about.meta.com/brand/resources/meta/company-brand/ ). All goodwill arising out of your use of the Mark will inure to the benefit of Meta. 36 | b. Subject to Meta’s ownership of Llama Materials and derivatives made by or for Meta, with respect to any derivative works and modifications of the Llama Materials that are made by you, as between you and Meta, you are and will be the owner of such derivative works and modifications. 37 | c. If you institute litigation or other proceedings against Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Llama Materials or Meta Llama 3 outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Meta from and against any claim by any third party arising out of or related to your use or distribution of the Llama Materials. 38 | 39 | 6. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Llama Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Meta may terminate this Agreement if you are in breach of any term or condition of this Agreement. 
Upon termination of this Agreement, you shall delete and cease use of the Llama Materials. Sections 3, 4 and 7 shall survive the termination of this Agreement. 40 | 41 | 7. Governing Law and Jurisdiction. This Agreement will be governed and construed under the laws of the State of California without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The courts of California shall have exclusive jurisdiction of any dispute arising out of this Agreement. 42 | 43 | 44 | Meta Llama 3 Acceptable Use Policy 45 | Meta is committed to promoting safe and fair use of its tools and features, including Meta Llama 3. If you access or use Meta Llama 3, you agree to this Acceptable Use Policy (“Policy”). The most recent copy of this policy can be found at https://llama.meta.com/llama3/use-policy 46 | Prohibited Uses 47 | We want everyone to use Meta Llama 3 safely and responsibly. You agree you will not use, or allow others to use, Meta Llama 3 to: 48 | 1. Violate the law or others’ rights, including to: 49 | a. Engage in, promote, generate, contribute to, encourage, plan, incite, or further illegal or unlawful activity or content, such as: 50 | i. Violence or terrorism 51 | ii. Exploitation or harm to children, including the solicitation, creation, acquisition, or dissemination of child exploitative content or failure to report Child Sexual Abuse Material 52 | iii. Human trafficking, exploitation, and sexual violence 53 | iv. The illegal distribution of information or materials to minors, including obscene materials, or failure to employ legally required age-gating in connection with such information or materials. 54 | v. Sexual solicitation 55 | vi. Any other criminal activity 56 | b. Engage in, promote, incite, or facilitate the harassment, abuse, threatening, or bullying of individuals or groups of individuals 57 | c. Engage in, promote, incite, or facilitate discrimination or other unlawful or harmful conduct in the provision of employment, employment benefits, credit, housing, other economic benefits, or other essential goods and services 58 | d. Engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or related professional practices 59 | e. Collect, process, disclose, generate, or infer health, demographic, or other sensitive personal or private information about individuals without rights and consents required by applicable laws 60 | f. Engage in or facilitate any action or generate any content that infringes, misappropriates, or otherwise violates any third-party rights, including the outputs or results of any products or services using the Llama Materials 61 | g. Create, generate, or facilitate the creation of malicious code, malware, computer viruses or do anything else that could disable, overburden, interfere with or impair the proper working, integrity, operation or appearance of a website or computer system 62 | 63 | 2. Engage in, promote, incite, facilitate, or assist in the planning or development of activities that present a risk of death or bodily harm to individuals, including use of Meta Llama 3 related to the following: 64 | a. Military, warfare, nuclear industries or applications, espionage, use for materials or activities that are subject to the International Traffic Arms Regulations (ITAR) maintained by the United States Department of State 65 | b. Guns and illegal weapons (including weapon development) 66 | c. 
Illegal drugs and regulated/controlled substances 67 | d. Operation of critical infrastructure, transportation technologies, or heavy machinery 68 | e. Self-harm or harm to others, including suicide, cutting, and eating disorders 69 | f. Any content intended to incite or promote violence, abuse, or any infliction of bodily harm to an individual 70 | 71 | 3. Intentionally deceive or mislead others, including use of Meta Llama 3 related to the following: 72 | a. Generating, promoting, or furthering fraud or the creation or promotion of disinformation 73 | b. Generating, promoting, or furthering defamatory content, including the creation of defamatory statements, images, or other content 74 | c. Generating, promoting, or further distributing spam 75 | d. Impersonating another individual without consent, authorization, or legal right 76 | e. Representing that the use of Meta Llama 3 or outputs are human-generated 77 | f. Generating or facilitating false online engagement, including fake reviews and other means of fake online engagement 78 | g. Fail to appropriately disclose to end users any known dangers of your AI system 79 | 80 | Please report any violation of this Policy, software “bug,” or other problems that could lead to a violation of this Policy through one of the following means: 81 | * Reporting issues with the model: https://github.com/meta-llama/llama3 82 | * Reporting risky content generated by the model: developers.facebook.com/llama_output_feedback 83 | * Reporting bugs and security concerns: facebook.com/whitehat/info 84 | * Reporting violations of the Acceptable Use Policy or unlicensed uses of Meta Llama 3: LlamaUseReport@meta.com 85 | -------------------------------------------------------------------------------- /evaluation/grader/voice_grader.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import asyncio 5 | from typing import Optional, Dict, Any 6 | from pathlib import Path 7 | 8 | from .base import BaseAccuracyGrader, GradeLabel, GradeResult 9 | from .llm_grader import LLMAccuracyGrader 10 | from .asr_processor import ASRProcessor 11 | from .wer_calculator import WERCalculator 12 | 13 | 14 | class VoiceAccuracyGrader(BaseAccuracyGrader): 15 | """ 16 | Voice accuracy grader that processes audio files through ASR then uses LLM grading. 17 | 18 | Pipeline: 19 | 1. Audio file → ASR → transcript 20 | 2. Transcript → LLM grader → accuracy grade 21 | 3. 
Also calculates WER between ASR transcript and expected text 22 | """ 23 | 24 | def __init__( 25 | self, 26 | asr_provider: str = "azure", 27 | llm_deployment_name: str = "gpt-4o", 28 | llm_api_version: str = "2024-10-21", 29 | llm_temperature: float = 0.0, 30 | max_retries: int = 3, 31 | base_delay: float = 1.0, 32 | azure_speech_key: Optional[str] = None, 33 | azure_speech_region: Optional[str] = None, 34 | openai_api_key: Optional[str] = None, 35 | openai_base_url: Optional[str] = None, 36 | ): 37 | # Initialize ASR processor 38 | self.asr_processor = ASRProcessor( 39 | provider=asr_provider, 40 | azure_speech_key=azure_speech_key, 41 | azure_speech_region=azure_speech_region, 42 | openai_api_key=openai_api_key, 43 | openai_base_url=openai_base_url, 44 | ) 45 | 46 | # Initialize LLM grader for semantic evaluation 47 | self.llm_grader = LLMAccuracyGrader( 48 | deployment_name=llm_deployment_name, 49 | api_version=llm_api_version, 50 | temperature=llm_temperature, 51 | max_retries=max_retries, 52 | base_delay=base_delay, 53 | ) 54 | 55 | # WER calculator for transcript quality 56 | self.wer_calculator = WERCalculator() 57 | 58 | def _extract_audio_path_from_response(self, response_data: Dict[str, Any]) -> Optional[str]: 59 | """Extract audio file path from voice output response data.""" 60 | if isinstance(response_data, dict): 61 | # Check audio_info section 62 | if "audio_info" in response_data and "output_file" in response_data["audio_info"]: 63 | return response_data["audio_info"]["output_file"] 64 | 65 | # Check for direct audio path 66 | if "output_audio_path" in response_data: 67 | return response_data["output_audio_path"] 68 | 69 | # Check conversation transcript for audio response 70 | if "conversation_transcript" in response_data: 71 | for turn in response_data["conversation_transcript"]: 72 | if turn.get("type") == "audio_response" and "audio_file" in turn: 73 | return turn["audio_file"] 74 | 75 | return None 76 | 77 | async def grade_voice_response_async( 78 | self, 79 | question: str, 80 | ground_truth: str, 81 | voice_response_path_or_data: str | Dict[str, Any], 82 | expected_transcript: Optional[str] = None, 83 | benchmark: Optional[str] = None, 84 | calculate_wer: bool = True, 85 | ) -> Dict[str, Any]: 86 | """ 87 | Grade a voice response (audio file or response data structure). 
88 | 89 | Args: 90 | question: The question asked 91 | ground_truth: Ground truth answer 92 | voice_response_path_or_data: Path to audio file OR response data dict 93 | expected_transcript: Expected transcript text (for WER calculation) 94 | benchmark: Benchmark name for grading context 95 | calculate_wer: Whether to calculate WER metrics 96 | 97 | Returns: 98 | Dictionary containing grading results and ASR/WER metrics 99 | """ 100 | # Extract audio file path 101 | if isinstance(voice_response_path_or_data, str): 102 | audio_path = voice_response_path_or_data 103 | response_data = None 104 | else: 105 | response_data = voice_response_path_or_data 106 | audio_path = self._extract_audio_path_from_response(response_data) 107 | 108 | if not audio_path: 109 | return { 110 | "success": False, 111 | "error": "Could not find audio file path in response data", 112 | "asr_result": None, 113 | "llm_grade": None, 114 | "wer_metrics": None 115 | } 116 | 117 | # Ensure audio file exists 118 | audio_path = Path(audio_path) 119 | if not audio_path.exists(): 120 | return { 121 | "success": False, 122 | "error": f"Audio file not found: {audio_path}", 123 | "asr_result": None, 124 | "llm_grade": None, 125 | "wer_metrics": None 126 | } 127 | 128 | # Step 1: Transcribe audio 129 | asr_result = await self.asr_processor.transcribe_async(str(audio_path)) 130 | 131 | if not asr_result["success"]: 132 | return { 133 | "success": False, 134 | "error": f"ASR failed: {asr_result['error']}", 135 | "asr_result": asr_result, 136 | "llm_grade": None, 137 | "wer_metrics": None 138 | } 139 | 140 | transcript = asr_result["text"] 141 | 142 | # Step 2: Grade transcript using LLM 143 | try: 144 | llm_grade = await self.llm_grader.grade_async( 145 | question=question, 146 | ground_truth=ground_truth, 147 | predicted_answer=transcript, 148 | benchmark=benchmark 149 | ) 150 | except Exception as e: 151 | return { 152 | "success": False, 153 | "error": f"LLM grading failed: {str(e)}", 154 | "asr_result": asr_result, 155 | "llm_grade": None, 156 | "wer_metrics": None 157 | } 158 | 159 | # Step 3: Calculate WER if expected transcript provided 160 | wer_metrics = None 161 | if calculate_wer and expected_transcript: 162 | wer_metrics = self.wer_calculator.calculate_wer( 163 | reference=expected_transcript, 164 | hypothesis=transcript, 165 | return_details=True 166 | ) 167 | 168 | return { 169 | "success": True, 170 | "error": None, 171 | "asr_result": asr_result, 172 | "llm_grade": llm_grade, 173 | "wer_metrics": wer_metrics, 174 | "transcript": transcript, 175 | "audio_path": str(audio_path) 176 | } 177 | 178 | def grade_voice_response( 179 | self, 180 | question: str, 181 | ground_truth: str, 182 | voice_response_path_or_data: str | Dict[str, Any], 183 | expected_transcript: Optional[str] = None, 184 | benchmark: Optional[str] = None, 185 | calculate_wer: bool = True, 186 | ) -> Dict[str, Any]: 187 | """Sync wrapper for voice response grading.""" 188 | async def _run(): 189 | return await self.grade_voice_response_async( 190 | question, ground_truth, voice_response_path_or_data, 191 | expected_transcript, benchmark, calculate_wer 192 | ) 193 | 194 | try: 195 | return asyncio.run(_run()) 196 | except RuntimeError: 197 | # If already inside an event loop 198 | loop = asyncio.get_event_loop() 199 | return loop.run_until_complete(_run()) 200 | 201 | def batch_grade_voice_responses( 202 | self, 203 | grading_tasks: list[Dict[str, Any]], 204 | ) -> list[Dict[str, Any]]: 205 | """ 206 | Grade multiple voice responses in batch. 
207 | 208 | Args: 209 | grading_tasks: List of dicts with keys: 210 | - question: str 211 | - ground_truth: str 212 | - voice_response_path_or_data: str | Dict 213 | - expected_transcript: Optional[str] 214 | - benchmark: Optional[str] 215 | - calculate_wer: Optional[bool] = True 216 | """ 217 | async def _batch_grade(): 218 | tasks = [] 219 | for task in grading_tasks: 220 | tasks.append(self.grade_voice_response_async( 221 | question=task["question"], 222 | ground_truth=task["ground_truth"], 223 | voice_response_path_or_data=task["voice_response_path_or_data"], 224 | expected_transcript=task.get("expected_transcript"), 225 | benchmark=task.get("benchmark"), 226 | calculate_wer=task.get("calculate_wer", True) 227 | )) 228 | 229 | return await asyncio.gather(*tasks, return_exceptions=True) 230 | 231 | try: 232 | results = asyncio.run(_batch_grade()) 233 | except RuntimeError: 234 | loop = asyncio.get_event_loop() 235 | results = loop.run_until_complete(_batch_grade()) 236 | 237 | # Handle exceptions in results 238 | processed_results = [] 239 | for i, result in enumerate(results): 240 | if isinstance(result, Exception): 241 | processed_results.append({ 242 | "success": False, 243 | "error": f"Exception in task {i}: {str(result)}", 244 | "asr_result": None, 245 | "llm_grade": None, 246 | "wer_metrics": None 247 | }) 248 | else: 249 | processed_results.append(result) 250 | 251 | return processed_results 252 | 253 | # Implement base class interface for compatibility 254 | def grade( 255 | self, 256 | question: str, 257 | ground_truth: str, 258 | predicted_answer: str | Dict[str, Any], # Can be transcript or voice response data 259 | benchmark: Optional[str] = None, 260 | ) -> GradeResult: 261 | """ 262 | Grade method for base class compatibility. 263 | 264 | If predicted_answer is a string, treat as transcript and grade directly. 265 | If predicted_answer is a dict, treat as voice response data and process through ASR. 
266 | """ 267 | if isinstance(predicted_answer, str): 268 | # Direct transcript grading 269 | return self.llm_grader.grade( 270 | question=question, 271 | ground_truth=ground_truth, 272 | predicted_answer=predicted_answer, 273 | benchmark=benchmark 274 | ) 275 | elif isinstance(predicted_answer, dict): 276 | # Voice response grading 277 | result = self.grade_voice_response( 278 | question=question, 279 | ground_truth=ground_truth, 280 | voice_response_path_or_data=predicted_answer, 281 | benchmark=benchmark 282 | ) 283 | 284 | if result["success"]: 285 | return result["llm_grade"] 286 | else: 287 | # Return error as incorrect grade 288 | return GradeResult( 289 | label=GradeLabel.INCORRECT, 290 | extracted_final_answer=None, 291 | reasoning=result["error"], 292 | correct_flag=False, 293 | confidence=None, 294 | raw_model_output=None, 295 | metadata={"voice_grading_error": result["error"]} 296 | ) 297 | else: 298 | raise ValueError(f"Invalid predicted_answer type: {type(predicted_answer)}") -------------------------------------------------------------------------------- /evaluation/text/run_evaluation.py: -------------------------------------------------------------------------------- 1 | """ 2 | General Text Model Evaluation Script for VERA Datasets 3 | Supports GPT-4o, GPT-5 Instant, GPT-5 Thinking with async processing by default 4 | """ 5 | 6 | import os 7 | import sys 8 | import json 9 | import time 10 | import asyncio 11 | import argparse 12 | from pathlib import Path 13 | from typing import Dict, Any, List, Optional 14 | from datetime import datetime 15 | from dotenv import load_dotenv 16 | import yaml 17 | 18 | # Load environment variables from .env file 19 | load_dotenv() 20 | 21 | # Use explicit package imports for adapters present in this repository 22 | from models.text.gpt4o import GPT4oOpenAIBrowseAdapter 23 | from models.text.gpt5 import GPT5OpenAIBrowseAdapter 24 | from models.text.gemini25_pro import Gemini25ProBrowseAdapter 25 | from models.text.gemini25_flash import Gemini25FlashBrowseAdapter 26 | 27 | 28 | class TextModelEvaluator: 29 | """General evaluator that can work with different text model adapters""" 30 | 31 | def __init__(self): 32 | # Load canonical config.yaml and overlay with .env for secrets 33 | self.config = self._load_config() 34 | self.models = { 35 | 'gpt4o': self._create_gpt4o_adapter, 36 | 'gpt5-instant': self._create_gpt5_instant_adapter, 37 | 'gpt5-thinking': self._create_gpt5_thinking_adapter, 38 | 'gemini-2.5-pro': self._create_gemini_25_pro_adapter, 39 | 'gemini-2.5-flash': self._create_gemini_25_flash_adapter 40 | } 41 | self._current_dataset_name = None 42 | 43 | def set_dataset_context(self, dataset_name: str): 44 | self._current_dataset_name = dataset_name 45 | 46 | def _load_config(self) -> Dict[str, Any]: 47 | """Load config.yaml from project root; return empty dict if missing.""" 48 | cfg_path = Path(__file__).parent.parent.parent / 'config.yaml' 49 | if not cfg_path.exists(): 50 | return {} 51 | try: 52 | with open(cfg_path, 'r', encoding='utf-8') as f: 53 | return yaml.safe_load(f) or {} 54 | except Exception: 55 | return {} 56 | 57 | # --- Config helpers with .env overlay --- 58 | def _get_openai_api_key(self) -> Optional[str]: 59 | return os.getenv('OPENAI_API_KEY') or (self.config.get('api_keys', {}) or {}).get('openai_api_key') 60 | 61 | # Azure variants are not wired in this repository's adapters; OpenAI browse is used instead. 
62 | 63 | def _get_gemini_api_key(self) -> Optional[str]: 64 | return os.getenv('GEMINI_API_KEY') or (self.config.get('api_keys', {}) or {}).get('gemini_api_key') 65 | 66 | def _create_gpt4o_adapter(self): 67 | """Create GPT-4o OpenAI browse adapter (used for all tracks).""" 68 | openai_key = self._get_openai_api_key() 69 | if not openai_key: 70 | raise ValueError("GPT-4o requires OPENAI_API_KEY for OpenAI browse adapter") 71 | return GPT4oOpenAIBrowseAdapter(api_key=openai_key) 72 | 73 | def _create_gpt5_instant_adapter(self): 74 | """Create GPT-5 Instant (OpenAI browse) with low reasoning effort.""" 75 | openai_key = self._get_openai_api_key() 76 | if not openai_key: 77 | raise ValueError("GPT-5 Instant requires OPENAI_API_KEY for OpenAI browse adapter") 78 | return GPT5OpenAIBrowseAdapter(api_key=openai_key, reasoning_effort='low', reasoning_summary='auto') 79 | 80 | def _create_gpt5_thinking_adapter(self): 81 | """Create GPT-5 Thinking (OpenAI browse) with high reasoning effort.""" 82 | openai_key = self._get_openai_api_key() 83 | if not openai_key: 84 | raise ValueError("GPT-5 Thinking requires OPENAI_API_KEY for OpenAI browse adapter") 85 | return GPT5OpenAIBrowseAdapter(api_key=openai_key, reasoning_effort='high', reasoning_summary='detailed') 86 | 87 | def _create_gemini_25_pro_adapter(self): 88 | """Create Gemini 2.5 Pro adapter with browse support""" 89 | api_key = self._get_gemini_api_key() 90 | if not api_key: 91 | raise ValueError("Gemini 2.5 Pro requires GEMINI_API_KEY environment variable") 92 | return Gemini25ProBrowseAdapter(api_key=api_key) 93 | 94 | def _create_gemini_25_flash_adapter(self): 95 | """Create Gemini 2.5 Flash adapter with browse support""" 96 | api_key = self._get_gemini_api_key() 97 | if not api_key: 98 | raise ValueError("Gemini 2.5 Flash requires GEMINI_API_KEY environment variable") 99 | return Gemini25FlashBrowseAdapter(api_key=api_key) 100 | 101 | def load_dataset(self, dataset_path: str) -> Dict[str, Any]: 102 | """Load a VERA dataset JSON file""" 103 | with open(dataset_path, 'r', encoding='utf-8') as f: 104 | return json.load(f) 105 | 106 | def create_output_dir(self, model_name: str, dataset_name: str) -> str: 107 | """Create timestamped output directory""" 108 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 109 | output_dir = f"test_output/{model_name}_{dataset_name}_{timestamp}" 110 | Path(output_dir).mkdir(parents=True, exist_ok=True) 111 | return output_dir 112 | 113 | def get_completed_episodes(self, output_dir: str) -> set: 114 | """Get set of episode IDs that have been completed in output directory""" 115 | completed = set() 116 | output_path = Path(output_dir) 117 | 118 | if not output_path.exists(): 119 | return completed 120 | 121 | # Look for JSON result files 122 | for json_file in output_path.glob("*.json"): 123 | try: 124 | with open(json_file, 'r') as f: 125 | data = json.load(f) 126 | 127 | # Check if it's an individual episode result 128 | if 'episode_id' in data: 129 | completed.add(data['episode_id']) 130 | 131 | # Check if it's a batch result with individual episodes 132 | elif 'results' in data: 133 | for result in data['results']: 134 | if isinstance(result, dict) and 'episode_id' in result: 135 | completed.add(result['episode_id']) 136 | 137 | except (json.JSONDecodeError, KeyError): 138 | continue 139 | 140 | return completed 141 | 142 | def filter_episodes_for_resume(self, episodes: List[Dict[str, Any]], output_dir: str) -> List[Dict[str, Any]]: 143 | """Filter episodes to skip already completed ones""" 144 | 
completed_ids = self.get_completed_episodes(output_dir) 145 | 146 | if not completed_ids: 147 | print("No completed episodes found, processing all episodes") 148 | return episodes 149 | 150 | print(f"Found {len(completed_ids)} completed episodes, skipping them") 151 | 152 | remaining_episodes = [] 153 | for episode in episodes: 154 | episode_id = episode.get('id', '') 155 | if episode_id not in completed_ids: 156 | remaining_episodes.append(episode) 157 | else: 158 | print(f"Skipping completed episode: {episode_id}") 159 | 160 | print(f"Remaining episodes to process: {len(remaining_episodes)}/{len(episodes)}") 161 | return remaining_episodes 162 | 163 | async def run_evaluation(self, model_name: str, dataset_path: str, 164 | max_episodes: Optional[int] = None, 165 | max_concurrent: int = 16, 166 | resume_from: Optional[str] = None) -> Dict[str, Any]: 167 | """Run evaluation with async processing by default""" 168 | print(f"Loading dataset: {dataset_path}") 169 | dataset = self.load_dataset(dataset_path) 170 | episodes = dataset.get('episodes', []) 171 | 172 | dataset_name = Path(dataset_path).stem.replace('_voice_episodes', '') 173 | 174 | # Handle resume functionality 175 | if resume_from: 176 | if not Path(resume_from).exists(): 177 | raise ValueError(f"Resume directory does not exist: {resume_from}") 178 | 179 | print(f"Resuming from: {resume_from}") 180 | output_dir = resume_from 181 | 182 | # Filter out already completed episodes 183 | episodes = self.filter_episodes_for_resume(episodes, output_dir) 184 | 185 | if not episodes: 186 | print("All episodes already completed!") 187 | return {'message': 'All episodes already completed', 'skipped': True} 188 | 189 | else: 190 | # Create new output directory 191 | output_dir = self.create_output_dir(model_name, dataset_name) 192 | 193 | if max_episodes: 194 | episodes = episodes[:max_episodes] 195 | print(f"Limited to {max_episodes} episodes") 196 | 197 | print(f"Creating model adapter: {model_name}") 198 | self.set_dataset_context(dataset_name) 199 | adapter = self.models[model_name]() 200 | 201 | print(f"Starting async evaluation with {len(episodes)} episodes") 202 | start_time = time.time() 203 | 204 | # All adapters implement async batch processing that returns standardized batch result 205 | results = await adapter.process_episodes_batch(episodes, output_dir, max_concurrent) 206 | 207 | end_time = time.time() 208 | duration = end_time - start_time 209 | 210 | # Save summary 211 | # Derive basic counters from standardized batch result 212 | summary_counts = results.get('summary', {}) if isinstance(results, dict) else {} 213 | total_episodes = summary_counts.get('total_episodes', len(episodes)) 214 | successful = summary_counts.get('successful_episodes', 0) 215 | failed = total_episodes - successful 216 | 217 | summary = { 218 | 'model': model_name, 219 | 'dataset': dataset_name, 220 | 'dataset_path': dataset_path, 221 | 'output_directory': output_dir, 222 | 'total_episodes': total_episodes, 223 | 'processed': total_episodes, 224 | 'successful': successful, 225 | 'failed': failed, 226 | 'duration_seconds': duration, 227 | 'episodes_per_second': total_episodes / duration if duration > 0 else 0, 228 | 'timestamp': datetime.now().isoformat(), 229 | 'max_concurrent': max_concurrent, 230 | 'async_processing': True 231 | } 232 | 233 | summary_path = Path(output_dir) / 'evaluation_summary.json' 234 | with open(summary_path, 'w', encoding='utf-8') as f: 235 | json.dump(summary, f, indent=2) 236 | 237 | print(f"\nEvaluation completed!") 238 | 
print(f"Model: {model_name}") 239 | print(f"Dataset: {dataset_name}") 240 | print(f"Episodes: {summary['successful']}/{summary['total_episodes']} successful") 241 | print(f"Duration: {duration:.2f}s ({summary['episodes_per_second']:.2f} episodes/sec)") 242 | print(f"Output: {output_dir}") 243 | 244 | return summary 245 | 246 | 247 | def main(): 248 | parser = argparse.ArgumentParser(description='Evaluate text models on VERA datasets') 249 | parser.add_argument('model', choices=['gpt4o', 'gpt5-instant', 'gpt5-thinking', 'gemini-2.5-pro', 'gemini-2.5-flash'], 250 | help='Text model to evaluate') 251 | parser.add_argument('dataset', help='Path to dataset JSON file') 252 | parser.add_argument('--max-episodes', type=int, help='Maximum number of episodes to process') 253 | parser.add_argument('--max-concurrent', type=int, default=16, 254 | help='Maximum concurrent requests') 255 | 256 | args = parser.parse_args() 257 | 258 | # Validate dataset path 259 | if not Path(args.dataset).exists(): 260 | print(f"Error: Dataset file not found: {args.dataset}") 261 | return 1 262 | 263 | evaluator = TextModelEvaluator() 264 | 265 | try: 266 | summary = asyncio.run(evaluator.run_evaluation( 267 | args.model, args.dataset, args.max_episodes, args.max_concurrent 268 | )) 269 | return 0 270 | 271 | except Exception as e: 272 | print(f"Error during evaluation: {e}") 273 | import traceback 274 | traceback.print_exc() 275 | return 1 276 | 277 | 278 | if __name__ == "__main__": 279 | exit(main()) 280 | -------------------------------------------------------------------------------- /evaluation/grader/run_grader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | CLI for grading accuracy of model responses. 4 | 5 | Usage: 6 | uv run python evaluation/grader/run_grader.py single \ 7 | --question "..." --ground-truth "..." --pred "..." 
[--benchmark simpleqa] 8 | 9 | uv run python evaluation/grader/run_grader.py batch \ 10 | --dataset data/final_dataset/text/simpleqa_voice_episodes.json \ 11 | --results test_output/gpt4o_simpleqa_*/gpt4o_openai_browse_batch_*.json \ 12 | [--benchmark simpleqa] 13 | """ 14 | 15 | from __future__ import annotations 16 | 17 | import argparse 18 | import glob 19 | import json 20 | from pathlib import Path 21 | import asyncio 22 | from typing import Dict, Any, List, Optional 23 | 24 | from evaluation.grader.base import GradeLabel 25 | from evaluation.grader.llm_grader import LLMAccuracyGrader 26 | 27 | 28 | def _load_dataset_questions(dataset_path: str) -> Dict[str, Dict[str, str]]: 29 | """Map episode_id -> {question, ground_truth} from dataset.""" 30 | with open(dataset_path, "r", encoding="utf-8") as f: 31 | data = json.load(f) 32 | mapping: Dict[str, Dict[str, str]] = {} 33 | for ep in data.get("episodes", []): 34 | eid = ep.get("id") 35 | turns = ep.get("turns", []) 36 | q = "" 37 | if turns: 38 | q = turns[0].get("text_content", "") 39 | # expected can be under turn.metadata or ep.metadata 40 | target = ( 41 | (turns[0].get("metadata", {}) or {}).get("expected_answer") 42 | or ep.get("metadata", {}).get("expected_answer") 43 | or "" 44 | ) 45 | if eid: 46 | mapping[eid] = {"question": q, "ground_truth": target} 47 | return mapping 48 | 49 | 50 | def _load_results(results_glob: str) -> List[Dict[str, Any]]: 51 | files = sorted(glob.glob(results_glob)) 52 | episodes: List[Dict[str, Any]] = [] 53 | for fp in files: 54 | with open(fp, "r", encoding="utf-8") as f: 55 | data = json.load(f) 56 | # Standardized batch format stores per-episode in 'episodes' or directly as list 57 | eps = data.get("episodes") or data.get("results") or [] 58 | if isinstance(eps, list) and eps: 59 | episodes.extend(eps) 60 | else: 61 | # Some adapters save single-episode results 62 | if "episode_id" in data and "turn_results" in data: 63 | episodes.append(data) 64 | return episodes 65 | 66 | 67 | def _extract_predicted_answer(episode_result: Dict[str, Any]) -> Optional[str]: 68 | """Extract the assistant response text from per-episode result. 69 | 70 | Supports both legacy fields (turn_results/model_response) and 71 | standardized fields (turns/response). 
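For example, {"turn_results": [{"model_response": "4"}]} (legacy) and {"turns": [{"response": "4"}]} (standardized) both yield "4"; the most recent turn wins.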
72 | """ 73 | # Legacy shape 74 | turns = episode_result.get("turn_results") 75 | if isinstance(turns, list) and turns: 76 | return turns[-1].get("model_response") 77 | 78 | # Standardized shape 79 | turns = episode_result.get("turns") 80 | if isinstance(turns, list) and turns: 81 | return turns[-1].get("response") 82 | 83 | return None 84 | 85 | 86 | def _summarize_counts(labels: List[GradeLabel]) -> Dict[str, Any]: 87 | total = len(labels) 88 | c = sum(1 for l in labels if l == GradeLabel.CORRECT) 89 | i = sum(1 for l in labels if l == GradeLabel.INCORRECT) 90 | n = sum(1 for l in labels if l == GradeLabel.NOT_ATTEMPTED) 91 | acc = (c / total) if total else 0.0 92 | return {"total": total, "correct": c, "incorrect": i, "not_attempted": n, "accuracy": acc} 93 | 94 | 95 | def main(): 96 | # Try loading .env if available for Azure creds 97 | try: 98 | from dotenv import load_dotenv # type: ignore 99 | load_dotenv() 100 | except Exception: 101 | pass 102 | parser = argparse.ArgumentParser(description="Accuracy grader") 103 | sub = parser.add_subparsers(dest="cmd", required=True) 104 | 105 | p_single = sub.add_parser("single", help="Grade a single triplet") 106 | p_single.add_argument("--question", required=False, default="") 107 | p_single.add_argument("--ground-truth", required=False, dest="ground_truth") 108 | p_single.add_argument("--pred", required=True) 109 | p_single.add_argument("--benchmark", required=False) 110 | # LLM-only, triad mode 111 | 112 | p_batch = sub.add_parser("batch", help="Grade a batch of results vs. a dataset") 113 | p_batch.add_argument("--dataset", required=True) 114 | p_batch.add_argument("--results", required=True, help="Glob to batch result JSON(s)") 115 | p_batch.add_argument("--benchmark", required=False) 116 | p_batch.add_argument("--out", required=False, help="Optional path to write detailed grades JSON") 117 | p_batch.add_argument("--max-concurrent", type=int, default=16, help="Max concurrent grading requests") 118 | 119 | p_latest = sub.add_parser("latest", help="Auto-find latest results per model/benchmark under test_output and grade them") 120 | p_latest.add_argument("--models", nargs="*", default=["gpt4o", "gpt5-instant", "gpt5-thinking", "gemini-2.5-pro", "gemini-2.5-flash"], help="Models to include") 121 | p_latest.add_argument( 122 | "--benchmarks", 123 | nargs="*", 124 | default=["aime", "browsecomp", "gpqa_diamond", "mrcr", "simpleqa"], 125 | help="Benchmarks/datasets to include", 126 | ) 127 | p_latest.add_argument("--out-dir", default="", help="Optional directory to also write an aggregate summary") 128 | p_latest.add_argument("--max-concurrent", type=int, default=16, help="Max concurrent grading requests") 129 | 130 | args = parser.parse_args() 131 | 132 | grader = LLMAccuracyGrader() 133 | 134 | if args.cmd == "single": 135 | if not args.ground_truth: 136 | parser.error("--ground-truth is required") 137 | res = grader.grade( 138 | question=args.question, 139 | ground_truth=args.ground_truth, 140 | predicted_answer=args.pred, 141 | benchmark=args.benchmark, 142 | ) 143 | print(json.dumps({ 144 | "label": res.label, 145 | "question": args.question, 146 | "ground_truth": args.ground_truth, 147 | "extracted_final_answer": res.extracted_final_answer, 148 | "confidence": res.confidence, 149 | "reasoning": res.reasoning, 150 | }, default=str, indent=2)) 151 | return 0 152 | 153 | if args.cmd == "batch": 154 | # batch 155 | ep_map = _load_dataset_questions(args.dataset) 156 | results = _load_results(args.results) 157 | 158 | async def _grade_all(): 159 
| sem = asyncio.Semaphore(max(1, args.max_concurrent)) 160 | detailed_local = [] 161 | labels_local: List[GradeLabel] = [] 162 | 163 | async def _one(ep: Dict[str, Any]): 164 | eid = ep.get("episode_id") 165 | if not eid or eid not in ep_map: 166 | return None 167 | qa = ep_map[eid] 168 | pred = _extract_predicted_answer(ep) or "" 169 | async with sem: 170 | gres = await grader.grade_async( 171 | question=qa["question"], 172 | ground_truth=qa["ground_truth"], 173 | predicted_answer=pred, 174 | benchmark=args.benchmark, 175 | ) 176 | labels_local.append(gres.label) 177 | detailed_local.append({ 178 | "episode_id": eid, 179 | "question": qa["question"], 180 | "ground_truth": qa["ground_truth"], 181 | "predicted_answer": pred, 182 | "label": gres.label, 183 | "confidence": gres.confidence, 184 | "extracted_final_answer": gres.extracted_final_answer, 185 | }) 186 | 187 | tasks = [ 188 | _one(ep) for ep in results 189 | ] 190 | await asyncio.gather(*tasks) 191 | return labels_local, detailed_local 192 | 193 | labels, detailed = asyncio.run(_grade_all()) 194 | 195 | summary = _summarize_counts(labels) 196 | out = { 197 | "summary": summary, 198 | "grades": detailed, 199 | } 200 | 201 | print(json.dumps(out, indent=2)) 202 | if args.out: 203 | Path(args.out).parent.mkdir(parents=True, exist_ok=True) 204 | with open(args.out, "w", encoding="utf-8") as f: 205 | json.dump(out, f, indent=2) 206 | return 0 207 | 208 | if args.cmd == "latest": 209 | # Build dataset path resolver 210 | dataset_dir = Path(__file__).parent.parent.parent / 'data' / 'final_dataset' / 'text' 211 | def dataset_path(ds: str) -> Path: 212 | return dataset_dir / f"{ds}_voice_episodes.json" 213 | 214 | # Map model -> batch filename pattern within the run folder (new standardized adapters) 215 | batch_prefix = { 216 | 'gpt4o': 'gpt4o_openai_browse_batch_', 217 | 'gpt5-instant': 'gpt5_openai_browse_batch_', 218 | 'gpt5-thinking': 'gpt5_openai_browse_batch_', 219 | 'gemini-2.5-pro': 'gemini_25_pro_browse_batch_', 220 | 'gemini-2.5-flash': 'gemini_25_flash_browse_batch_', 221 | } 222 | 223 | base = Path('test_output') 224 | # Also check text_output for Gemini results 225 | text_output_base = Path('text_output') 226 | out_dir = Path(args.out_dir) if args.out_dir else None 227 | if out_dir: 228 | out_dir.mkdir(parents=True, exist_ok=True) 229 | 230 | overall = [] 231 | summary_rows = [] 232 | 233 | for model in args.models: 234 | for ds in args.benchmarks: 235 | run_dirs = sorted(base.glob(f"{model}_{ds}_*")) 236 | 237 | # For Gemini models, also check text_output structure 238 | if model in ["gemini-2.5-pro", "gemini-2.5-flash"] and text_output_base.exists(): 239 | gemini_folder = "gemini_2.5_pro" if model == "gemini-2.5-pro" else "gemini_2.5_flash" 240 | gemini_dir = text_output_base / gemini_folder 241 | if gemini_dir.exists(): 242 | # Look for dataset subdirectories 243 | gemini_ds_dirs = sorted(gemini_dir.glob(f"*{ds}*")) 244 | run_dirs.extend(gemini_ds_dirs) 245 | 246 | if not run_dirs: 247 | continue 248 | # pick most recent directory that actually contains results 249 | latest_dir = None 250 | for cand in reversed(run_dirs): 251 | # any batch or per-episode results inside? 
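# (a standardized batch file is preferred; per-episode files are only a legacy fallback, and only the Gemini adapters define a per-episode prefix)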
252 | prefix = batch_prefix.get(model) 253 | # Backward-compatible per-episode prefix patterns (legacy + current) 254 | per_prefix = { 255 | 'gpt4o': None, 256 | 'gpt5-instant': None, 257 | 'gpt5-thinking': None, 258 | 'gemini-2.5-pro': 'gemini_25_pro_browse_', 259 | 'gemini-2.5-flash': 'gemini_25_flash_browse_', 260 | }.get(model) 261 | if list(cand.glob(f"{prefix}*.json")) or list(cand.glob(f"{per_prefix}*.json")): 262 | latest_dir = cand 263 | break 264 | if latest_dir is None: 265 | continue 266 | # find batch file 267 | prefix = batch_prefix.get(model) 268 | if not prefix: 269 | continue 270 | batch_files = sorted(latest_dir.glob(f"{prefix}*.json")) 271 | results_glob: Optional[str] = None 272 | if batch_files: 273 | batch_file = str(batch_files[-1]) 274 | results_glob = batch_file 275 | else: 276 | # Fallback to per-episode results if no batch file is present 277 | per_prefix = { 278 | 'gpt4o': None, 279 | 'gpt5-instant': None, 280 | 'gpt5-thinking': None, 281 | 'gemini-2.5-pro': 'gemini_25_pro_browse_', 282 | 'gemini-2.5-flash': 'gemini_25_flash_browse_', 283 | }.get(model) 284 | if per_prefix: 285 | per_files = sorted(latest_dir.glob(f"{per_prefix}*.json")) 286 | if per_files: 287 | results_glob = str(latest_dir / f"{per_prefix}*.json") 288 | if not results_glob: 289 | continue 290 | 291 | ds_path = dataset_path(ds) 292 | if not ds_path.exists(): 293 | continue 294 | 295 | # Reuse batch grading pipeline 296 | ep_map = _load_dataset_questions(str(ds_path)) 297 | results = _load_results(results_glob) 298 | 299 | async def _grade_all_latest(): 300 | sem = asyncio.Semaphore(max(1, args.max_concurrent)) 301 | detailed_local = [] 302 | labels_local: List[GradeLabel] = [] 303 | 304 | async def _one(ep: Dict[str, Any]): 305 | eid = ep.get("episode_id") 306 | if not eid or eid not in ep_map: 307 | return None 308 | qa = ep_map[eid] 309 | pred = _extract_predicted_answer(ep) or "" 310 | async with sem: 311 | gres = await grader.grade_async( 312 | question=qa["question"], 313 | ground_truth=qa["ground_truth"], 314 | predicted_answer=pred, 315 | benchmark=ds, 316 | ) 317 | labels_local.append(gres.label) 318 | detailed_local.append({ 319 | "episode_id": eid, 320 | "question": qa["question"], 321 | "ground_truth": qa["ground_truth"], 322 | "predicted_answer": pred, 323 | "label": gres.label, 324 | "confidence": gres.confidence, 325 | "extracted_final_answer": gres.extracted_final_answer, 326 | "model": model, 327 | "dataset": ds, 328 | "results_file": results_glob, 329 | }) 330 | 331 | await asyncio.gather(*[ _one(ep) for ep in results ]) 332 | return labels_local, detailed_local 333 | 334 | labels, detailed = asyncio.run(_grade_all_latest()) 335 | overall.extend(detailed) 336 | summary = _summarize_counts(labels) 337 | results_file_for_summary = results_glob or "" 338 | summary_rows.append({ 339 | "model": model, 340 | "dataset": ds, 341 | **summary, 342 | "results_dir": str(latest_dir), 343 | "results_file": results_file_for_summary, 344 | }) 345 | 346 | # write per-pair file into the corresponding run folder 347 | pair_out_inplace = Path(latest_dir) / "llm_grades.json" 348 | with open(pair_out_inplace, 'w', encoding='utf-8') as f: 349 | json.dump({"summary": summary, "grades": detailed}, f, indent=2) 350 | 351 | # optionally also write to central out-dir if provided 352 | if out_dir: 353 | pair_out = out_dir / f"{model}_{ds}_grades_llm.json" 354 | with open(pair_out, 'w', encoding='utf-8') as f: 355 | json.dump({"summary": summary, "grades": detailed}, f, indent=2) 356 | 357 | # write 
aggregate 358 | agg = { 359 | "pairs": summary_rows, 360 | "total_pairs": len(summary_rows), 361 | } 362 | print(json.dumps(agg, indent=2)) 363 | if out_dir: 364 | with open(out_dir / "summary_latest_grades.json", 'w', encoding='utf-8') as f: 365 | json.dump(agg, f, indent=2) 366 | return 0 367 | 368 | 369 | if __name__ == "__main__": 370 | raise SystemExit(main()) 371 | -------------------------------------------------------------------------------- /tests/test_models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Test module for VERA model adapters 4 | Tests basic functionality of each model to ensure they work correctly 5 | """ 6 | 7 | import os 8 | import sys 9 | import json 10 | import tempfile 11 | from pathlib import Path 12 | from unittest.mock import Mock, patch, MagicMock 13 | 14 | # Try to import pytest, but make it optional 15 | try: 16 | import pytest 17 | PYTEST_AVAILABLE = True 18 | except ImportError: 19 | PYTEST_AVAILABLE = False 20 | # Define minimal pytest decorators for standalone mode 21 | class pytest: 22 | class fixture: 23 | def __init__(self, *args, **kwargs): 24 | pass 25 | def __call__(self, func): 26 | return func 27 | fixture = fixture() 28 | @staticmethod 29 | def skip(msg): 30 | pass 31 | 32 | # Add project root to path 33 | project_root = Path(__file__).parent.parent 34 | sys.path.insert(0, str(project_root)) 35 | 36 | from models.shared.base_adapter import ModelConfig, BaseAdapter, TextAdapter, VoiceAdapter, RealtimeAdapter 37 | 38 | 39 | # ============================================================================ 40 | # Test Fixtures 41 | # ============================================================================ 42 | 43 | @pytest.fixture 44 | def sample_episode(): 45 | """Sample episode data for testing""" 46 | return { 47 | "id": "test_episode_001", 48 | "track": "standard", 49 | "turns": [ 50 | { 51 | "role": "user", 52 | "text_content": "What is 2+2?", 53 | "audio_file": None 54 | } 55 | ], 56 | "context_documents": [] 57 | } 58 | 59 | 60 | @pytest.fixture 61 | def sample_mrcr_episode(): 62 | """Sample MRCR episode with context""" 63 | return { 64 | "id": "test_mrcr_001", 65 | "track": "long_context", 66 | "turns": [ 67 | { 68 | "role": "user", 69 | "text_content": "What was discussed earlier?", 70 | "audio_file": None 71 | } 72 | ], 73 | "context_documents": [ 74 | { 75 | "content": "User: Hello\nAssistant: Hi there!\nUser: What's the weather?\nAssistant: It's sunny today." 
76 | } 77 | ] 78 | } 79 | 80 | 81 | @pytest.fixture 82 | def temp_output_dir(): 83 | """Temporary output directory""" 84 | with tempfile.TemporaryDirectory() as tmpdir: 85 | yield tmpdir 86 | 87 | 88 | # ============================================================================ 89 | # Test Base Classes 90 | # ============================================================================ 91 | 92 | class TestBaseAdapter: 93 | """Test base adapter functionality""" 94 | 95 | def test_model_config_creation(self): 96 | """Test ModelConfig dataclass""" 97 | config = ModelConfig(model_name="test-model") 98 | assert config.model_name == "test-model" 99 | assert config.temperature == 0.0 100 | assert config.max_tokens == 4096 101 | assert config.timeout == 300.0 102 | assert config.max_concurrent == 16 103 | 104 | def test_base_adapter_initialization(self): 105 | """Test BaseAdapter initialization""" 106 | config = ModelConfig(model_name="test-model") 107 | 108 | # Create concrete implementation for testing 109 | class TestAdapter(BaseAdapter): 110 | def process_episode(self, episode, output_dir): 111 | return {"episode_id": episode["id"], "success": True} 112 | 113 | adapter = TestAdapter(config) 114 | assert adapter.config == config 115 | assert adapter.model_name == "test-model" 116 | 117 | 118 | # ============================================================================ 119 | # Test Text Models 120 | # ============================================================================ 121 | 122 | class TestGPT4oAdapter: 123 | """Test GPT-4o text adapter""" 124 | 125 | @patch('httpx.Client') 126 | def test_adapter_initialization(self, mock_client): 127 | """Test GPT-4o adapter can be initialized""" 128 | from models.text.gpt4o import GPT4oOpenAIBrowseAdapter 129 | 130 | adapter = GPT4oOpenAIBrowseAdapter(api_key="test-key") 131 | assert adapter.model_name == "gpt-4o" 132 | assert adapter.api_key == "test-key" 133 | 134 | @patch('httpx.Client') 135 | def test_prepare_prompt(self, mock_client): 136 | """Test prompt preparation""" 137 | from models.text.gpt4o import GPT4oOpenAIBrowseAdapter 138 | 139 | adapter = GPT4oOpenAIBrowseAdapter(api_key="test-key") 140 | 141 | episode = { 142 | "id": "test_001", 143 | "turns": [ 144 | {"role": "user", "text_content": "Hello"} 145 | ], 146 | "context_documents": [] 147 | } 148 | 149 | turn = episode["turns"][0] 150 | prompt = adapter._prepare_prompt(turn, episode, 0) 151 | 152 | assert "Hello" in prompt 153 | 154 | @patch('httpx.Client') 155 | def test_make_api_request_simple_message(self, mock_client): 156 | """Test API request with simple message""" 157 | from models.text.gpt4o import GPT4oOpenAIBrowseAdapter 158 | 159 | # Mock the API response 160 | mock_response = Mock() 161 | mock_response.json.return_value = { 162 | "output": {"content": "Test response"}, 163 | "usage": {"total_tokens": 10} 164 | } 165 | mock_response.raise_for_status = Mock() 166 | 167 | mock_client_instance = Mock() 168 | mock_client_instance.post.return_value = mock_response 169 | mock_client.return_value.__enter__.return_value = mock_client_instance 170 | 171 | adapter = GPT4oOpenAIBrowseAdapter(api_key="test-key") 172 | 173 | messages = [{"role": "user", "content": "Hello"}] 174 | response = adapter._make_api_request(messages) 175 | 176 | assert response == "Test response" 177 | 178 | 179 | class TestGemini25ProAdapter: 180 | """Test Gemini 2.5 Pro adapter""" 181 | 182 | @patch.dict(os.environ, {"GOOGLE_API_KEY": "test-key"}) 183 | def test_adapter_can_be_imported(self): 184 | """Test 
that Gemini adapter can be imported""" 185 | try: 186 | from models.text.gemini25_pro import Gemini25ProAdapter 187 | assert True 188 | except ImportError as e: 189 | pytest.skip(f"Gemini dependencies not available: {e}") 190 | 191 | 192 | class TestGPT5Adapter: 193 | """Test GPT-5 adapter""" 194 | 195 | def test_adapter_can_be_imported(self): 196 | """Test that GPT-5 adapter can be imported""" 197 | try: 198 | from models.text.gpt5 import GPT5Adapter 199 | assert True 200 | except ImportError as e: 201 | pytest.skip(f"GPT-5 dependencies not available: {e}") 202 | 203 | 204 | # ============================================================================ 205 | # Test Voice Models 206 | # ============================================================================ 207 | 208 | class TestQwen2AudioAdapter: 209 | """Test Qwen2-Audio voice adapter""" 210 | 211 | def test_adapter_can_be_imported(self): 212 | """Test that Qwen2-Audio adapter can be imported""" 213 | try: 214 | from models.voice.qwen2_audio import Qwen2AudioAdaptiveEvaluator, EvaluationConfig 215 | 216 | config = EvaluationConfig() 217 | assert config.model_name == "Qwen/Qwen2-Audio-7B-Instruct" 218 | assert config.temperature == 0.7 219 | except ImportError as e: 220 | pytest.skip(f"Qwen2-Audio dependencies not available: {e}") 221 | 222 | def test_task_type_detection(self): 223 | """Test task type detection logic""" 224 | try: 225 | from models.voice.qwen2_audio import Qwen2AudioAdaptiveEvaluator, EvaluationConfig 226 | except ImportError: 227 | pytest.skip("Qwen2-Audio dependencies not available") 228 | 229 | config = EvaluationConfig() 230 | 231 | # Mock the LLM initialization to avoid loading the model 232 | with patch('models.voice.qwen2_audio.LLM'): 233 | evaluator = Qwen2AudioAdaptiveEvaluator(config) 234 | 235 | # Test MRCR detection 236 | mrcr_episode = { 237 | "id": "test_mrcr_001", 238 | "track": "long_context", 239 | "context_documents": [{"content": "test"}] 240 | } 241 | assert evaluator.detect_task_type(mrcr_episode) == "mrcr" 242 | 243 | # Test standard detection 244 | standard_episode = { 245 | "id": "test_standard_001", 246 | "track": "standard", 247 | "context_documents": [] 248 | } 249 | assert evaluator.detect_task_type(standard_episode) == "standard" 250 | 251 | 252 | class TestUltravoxAdapter: 253 | """Test Ultravox voice adapter""" 254 | 255 | def test_adapter_can_be_imported(self): 256 | """Test that Ultravox adapter can be imported""" 257 | try: 258 | from models.voice.ultravox import UltravoxAdapter 259 | assert True 260 | except ImportError as e: 261 | pytest.skip(f"Ultravox dependencies not available: {e}") 262 | 263 | 264 | # ============================================================================ 265 | # Test Realtime Models 266 | # ============================================================================ 267 | 268 | class TestGPTRealtimeAdapter: 269 | """Test GPT Realtime adapter""" 270 | 271 | def test_module_can_be_imported(self): 272 | """Test that GPT Realtime module can be imported""" 273 | try: 274 | from models.realtime import gpt_realtime 275 | assert hasattr(gpt_realtime, 'main') 276 | assert hasattr(gpt_realtime, 'parse_mrcr_context') 277 | except ImportError as e: 278 | pytest.skip(f"GPT Realtime dependencies not available: {e}") 279 | 280 | def test_parse_mrcr_context(self): 281 | """Test MRCR context parsing""" 282 | try: 283 | from models.realtime.gpt_realtime import parse_mrcr_context 284 | except ImportError: 285 | pytest.skip("GPT Realtime dependencies not available") 286 | 
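# MRCR contexts are plain transcripts with alternating "User:"/"Assistant:" prefixes, one turn per line; parse_mrcr_context should return one role/content message per turn.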
287 | context = "User: Hello\nAssistant: Hi there!\nUser: How are you?\nAssistant: I'm doing well!" 288 | messages = parse_mrcr_context(context) 289 | 290 | assert len(messages) == 4 291 | assert messages[0]["role"] == "user" 292 | assert messages[0]["content"] == "Hello" 293 | assert messages[1]["role"] == "assistant" 294 | assert messages[1]["content"] == "Hi there!" 295 | 296 | 297 | class TestGeminiRealtimeAdapter: 298 | """Test Gemini Realtime adapter""" 299 | 300 | def test_adapter_can_be_imported(self): 301 | """Test that Gemini Realtime adapter can be imported""" 302 | try: 303 | from models.realtime import gemini 304 | assert True 305 | except ImportError as e: 306 | pytest.skip(f"Gemini Realtime dependencies not available: {e}") 307 | 308 | 309 | class TestMoshiAdapter: 310 | """Test Moshi adapter""" 311 | 312 | def test_adapter_can_be_imported(self): 313 | """Test that Moshi adapter can be imported""" 314 | try: 315 | from models.realtime import moshi 316 | assert True 317 | except ImportError as e: 318 | pytest.skip(f"Moshi dependencies not available: {e}") 319 | 320 | 321 | # ============================================================================ 322 | # Integration Tests 323 | # ============================================================================ 324 | 325 | class TestModelIntegration: 326 | """Integration tests for model adapters""" 327 | 328 | @patch('httpx.Client') 329 | def test_text_model_episode_processing(self, mock_client, sample_episode, temp_output_dir): 330 | """Test that a text model can process an episode""" 331 | from models.text.gpt4o import GPT4oOpenAIBrowseAdapter 332 | 333 | # Mock successful API response 334 | mock_response = Mock() 335 | mock_response.json.return_value = { 336 | "id": "test-response", 337 | "output": {"content": "4"}, 338 | "usage": { 339 | "total_tokens": 10, 340 | "prompt_tokens": 5, 341 | "completion_tokens": 5 342 | } 343 | } 344 | mock_response.raise_for_status = Mock() 345 | 346 | mock_client_instance = Mock() 347 | mock_client_instance.post.return_value = mock_response 348 | mock_client.return_value.__enter__.return_value = mock_client_instance 349 | 350 | adapter = GPT4oOpenAIBrowseAdapter(api_key="test-key") 351 | 352 | result = adapter.process_episode(sample_episode, temp_output_dir) 353 | 354 | assert "episode_id" in result 355 | assert result["episode_id"] == "test_episode_001" 356 | assert "turn_results" in result 357 | 358 | def test_model_config_variations(self): 359 | """Test different model configurations""" 360 | configs = [ 361 | ModelConfig(model_name="test-1", temperature=0.0), 362 | ModelConfig(model_name="test-2", temperature=0.7, max_tokens=2048), 363 | ModelConfig(model_name="test-3", max_concurrent=8) 364 | ] 365 | 366 | for config in configs: 367 | assert config.model_name.startswith("test-") 368 | assert 0.0 <= config.temperature <= 1.0 369 | assert config.max_tokens > 0 370 | 371 | 372 | # ============================================================================ 373 | # Utility Tests 374 | # ============================================================================ 375 | 376 | class TestTimingUtils: 377 | """Test timing utilities""" 378 | 379 | def test_timing_utils_can_be_imported(self): 380 | """Test that timing utilities can be imported""" 381 | try: 382 | from models.shared.timing_utils import ( 383 | create_turn_result, 384 | create_standardized_episode_result, 385 | create_standardized_batch_result 386 | ) 387 | assert True 388 | except ImportError as e: 389 | pytest.skip(f"Timing 
utilities not available: {e}") 390 | 391 | 392 | # ============================================================================ 393 | # Main Test Runner 394 | # ============================================================================ 395 | 396 | def run_smoke_tests(): 397 | """Run basic smoke tests without pytest""" 398 | print("=" * 70) 399 | print("VERA Model Smoke Tests") 400 | print("=" * 70) 401 | 402 | passed = 0 403 | failed = 0 404 | skipped = 0 405 | 406 | # Test 1: Import base classes 407 | print("\n[1/8] Testing base classes...") 408 | try: 409 | from models.shared.base_adapter import ModelConfig, BaseAdapter 410 | config = ModelConfig(model_name="test") 411 | print("✓ Base classes work") 412 | passed += 1 413 | except Exception as e: 414 | print(f"✗ Base classes failed: {e}") 415 | failed += 1 416 | 417 | # Test 2: Import text models 418 | print("\n[2/8] Testing text model imports...") 419 | try: 420 | from models.text.gpt4o import GPT4oOpenAIBrowseAdapter 421 | print("✓ Text models can be imported") 422 | passed += 1 423 | except Exception as e: 424 | print(f"✗ Text models failed: {e}") 425 | failed += 1 426 | 427 | # Test 3: Import voice models 428 | print("\n[3/8] Testing voice model imports...") 429 | try: 430 | from models.voice.qwen2_audio import EvaluationConfig 431 | print("✓ Voice models can be imported") 432 | passed += 1 433 | except Exception as e: 434 | print(f"⊘ Voice models skipped: {e}") 435 | skipped += 1 436 | 437 | # Test 4: Import realtime models 438 | print("\n[4/8] Testing realtime model imports...") 439 | try: 440 | from models.realtime import gpt_realtime 441 | print("✓ Realtime models can be imported") 442 | passed += 1 443 | except Exception as e: 444 | print(f"⊘ Realtime models skipped: {e}") 445 | skipped += 1 446 | 447 | # Test 5: Test ModelConfig 448 | print("\n[5/8] Testing ModelConfig...") 449 | try: 450 | config = ModelConfig( 451 | model_name="test-model", 452 | temperature=0.5, 453 | max_tokens=2048 454 | ) 455 | assert config.model_name == "test-model" 456 | assert config.temperature == 0.5 457 | print("✓ ModelConfig works") 458 | passed += 1 459 | except Exception as e: 460 | print(f"✗ ModelConfig failed: {e}") 461 | failed += 1 462 | 463 | # Test 6: Test timing utilities 464 | print("\n[6/8] Testing timing utilities...") 465 | try: 466 | from models.shared.timing_utils import create_turn_result 467 | print("✓ Timing utilities can be imported") 468 | passed += 1 469 | except Exception as e: 470 | print(f"⊘ Timing utilities skipped: {e}") 471 | skipped += 1 472 | 473 | # Test 7: Test GPT-4o adapter initialization 474 | print("\n[7/8] Testing GPT-4o adapter initialization...") 475 | try: 476 | from models.text.gpt4o import GPT4oOpenAIBrowseAdapter 477 | adapter = GPT4oOpenAIBrowseAdapter(api_key="test-key") 478 | assert adapter.model_name == "gpt-4o" 479 | print("✓ GPT-4o adapter initializes") 480 | passed += 1 481 | except Exception as e: 482 | print(f"✗ GPT-4o adapter failed: {e}") 483 | failed += 1 484 | 485 | # Test 8: Test MRCR context parsing 486 | print("\n[8/8] Testing MRCR context parsing...") 487 | try: 488 | from models.realtime.gpt_realtime import parse_mrcr_context 489 | context = "User: Hello\nAssistant: Hi!" 
490 | messages = parse_mrcr_context(context) 491 | assert len(messages) == 2 492 | print("✓ MRCR parsing works") 493 | passed += 1 494 | except Exception as e: 495 | print(f"⊘ MRCR parsing skipped: {e}") 496 | skipped += 1 497 | 498 | # Summary 499 | print("\n" + "=" * 70) 500 | print("Summary") 501 | print("=" * 70) 502 | print(f"✓ Passed: {passed}") 503 | print(f"✗ Failed: {failed}") 504 | print(f"⊘ Skipped: {skipped}") 505 | print(f"Total: {passed + failed + skipped}") 506 | 507 | if failed == 0: 508 | print("\n✓ All required tests passed!") 509 | return 0 510 | else: 511 | print(f"\n✗ {failed} test(s) failed") 512 | return 1 513 | 514 | 515 | if __name__ == "__main__": 516 | # If run directly (no arguments) or pytest is unavailable, run the smoke tests; 517 | # otherwise hand the extra arguments to pytest so it discovers and runs the test classes 518 | if len(sys.argv) == 1 or not PYTEST_AVAILABLE: 519 | sys.exit(run_smoke_tests()) 520 | else: 521 | sys.exit(pytest.main([__file__] + sys.argv[1:])) 522 | --------------------------------------------------------------------------------
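As a quick orientation to the grading pipeline dumped above, here is a minimal sketch that exercises the same objects run_grader.py wires up, but programmatically instead of through the CLI. It assumes the grader's LLM credentials are already configured in the environment (or a .env file, which main() loads when python-dotenv is installed); the question/answer strings below are illustrative only, not taken from any dataset.

```python
# Minimal sketch: grade one prediction with the same grader the CLI uses.
# Assumes LLM credentials are configured in the environment / .env.
from evaluation.grader.base import GradeLabel
from evaluation.grader.llm_grader import LLMAccuracyGrader

grader = LLMAccuracyGrader()
result = grader.grade(
    question="What is 2 + 2?",            # illustrative values only
    ground_truth="4",
    predicted_answer="The answer is 4.",
    benchmark="simpleqa",
)

# The grader returns one of three labels; batch accuracy is simply correct / total.
assert result.label in (GradeLabel.CORRECT, GradeLabel.INCORRECT, GradeLabel.NOT_ATTEMPTED)
print(result.label, result.extracted_final_answer, result.confidence)
```

For whole result directories, the batch and latest subcommands shown above apply the same grading via grade_async under an asyncio.Semaphore, so per-episode requests run concurrently up to --max-concurrent.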