├── .github └── workflows │ ├── README.md │ ├── sdk-tests-lean.yml │ └── sdk-tests.yml ├── .gitignore ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── VERSION ├── VERSIONING.md ├── cli ├── .gitignore ├── README.md ├── poetry.lock ├── pyproject.toml └── trainloop_cli │ ├── __init__.py │ ├── __main__.py │ ├── commands │ ├── __init__.py │ ├── add.py │ ├── eval.py │ ├── init.py │ ├── studio.py │ └── utils.py │ ├── eval_core │ ├── __init__.py │ ├── _trace_helpers.py │ ├── helpers.py │ ├── judge.py │ ├── runner.py │ └── types.py │ └── scaffold │ └── trainloop │ ├── .gitignore │ ├── README.md │ ├── eval │ ├── __init__.py │ ├── metrics │ │ ├── __init__.py │ │ ├── always_pass.py │ │ └── is_helpful.py │ └── suites │ │ ├── __init__.py │ │ ├── always_passes.py │ │ └── is_helpful.py │ └── trainloop.config.yaml ├── images └── drake_evals.png ├── infra ├── .gitignore ├── Pulumi.trainloop.yaml ├── Pulumi.yaml ├── README.md ├── index.ts ├── package-lock.json ├── package.json └── tsconfig.json ├── package.json ├── poetry.lock ├── pyproject.toml ├── pytest.ini ├── registry ├── README.md ├── __init__.py ├── config_types.py ├── helpers.py ├── judge.py ├── metrics │ ├── __init__.py │ ├── always_pass │ │ ├── always_pass.py │ │ └── config.py │ ├── index.py │ └── is_helpful │ │ ├── config.py │ │ └── is_helpful.py ├── metrics_registry.py ├── suites │ ├── index.py │ ├── is_helpful │ │ ├── config.py │ │ └── is_helpful.py │ └── sample │ │ ├── config.py │ │ └── sample.py └── types.py ├── releases ├── 0.5.0.md └── 0.6.0.md ├── runner ├── .gitignore ├── README.md ├── bin │ └── run.js ├── package-lock.json └── package.json ├── scripts ├── build.py ├── build │ ├── build_docker.py │ └── build_studio.py ├── bump │ ├── README.md │ ├── bump_pulumi.py │ └── bump_version.py ├── publish.py └── publish │ ├── publish_cli.py │ ├── publish_sdk.py │ └── publish_studio.py ├── sdk ├── .gitignore ├── TESTING.md ├── go │ └── trainloop-llm-logging │ │ ├── .gitignore │ │ ├── README.md │ │ ├── go.mod │ │ ├── go.sum │ │ ├── instrumentation │ │ ├── http.go │ │ └── instrumentation.go │ │ ├── internal │ │ ├── config │ │ │ └── config.go │ │ ├── exporter │ │ │ └── exporter.go │ │ ├── logger │ │ │ └── logger.go │ │ ├── store │ │ │ └── store.go │ │ ├── types │ │ │ └── types.go │ │ └── utils │ │ │ └── utils.go │ │ └── trainloop_llm_logging.go ├── python │ ├── .gitignore │ ├── README.md │ ├── poetry.lock │ ├── pyproject.toml │ ├── pytest.ini │ ├── tests │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── edge_cases │ │ │ └── __init__.py │ │ ├── integration │ │ │ └── __init__.py │ │ └── unit │ │ │ ├── __init__.py │ │ │ ├── test_config.py │ │ │ ├── test_exporter.py │ │ │ ├── test_instrumentation.py │ │ │ ├── test_logger.py │ │ │ ├── test_register.py │ │ │ └── test_store.py │ └── trainloop_llm_logging │ │ ├── __init__.py │ │ ├── config.py │ │ ├── exporter.py │ │ ├── instrumentation │ │ ├── __init__.py │ │ ├── http_client_lib.py │ │ ├── httpx_lib.py │ │ ├── requests_lib.py │ │ └── utils.py │ │ ├── logger.py │ │ ├── register.py │ │ ├── store.py │ │ └── types.py └── typescript │ ├── .gitignore │ ├── README.md │ ├── jest.config.js │ ├── package-lock.json │ ├── package.json │ ├── src │ ├── config.ts │ ├── constants.ts │ ├── exporter.ts │ ├── index.ts │ ├── instrumentation │ │ ├── fetch.ts │ │ ├── http.ts │ │ ├── index.ts │ │ └── utils.ts │ ├── logger.ts │ ├── store.ts │ └── types │ │ ├── global.d.ts │ │ └── shared.d.ts │ ├── tests │ ├── integration │ │ └── sdk-initialization.test.ts │ ├── setup.ts │ ├── test-utils.ts │ └── unit │ │ ├── 
config.test.ts │ │ ├── exporter.test.ts │ │ ├── fetch.test.ts │ │ ├── http.test.ts │ │ ├── logger.test.ts │ │ ├── register.test.ts │ │ ├── store.test.ts │ │ └── utils.test.ts │ └── tsconfig.json ├── tests ├── .gitignore ├── README.md ├── __init__.py ├── conftest.py ├── helpers │ ├── __init__.py │ └── mock_llm.py ├── integration │ ├── __init__.py │ └── init_flow │ │ ├── __init__.py │ │ └── test_init_command.py └── unit │ ├── __init__.py │ └── judge │ ├── __init__.py │ └── test_judge_basic.py └── ui ├── .env.example ├── .eslintrc.json ├── .gitignore ├── Dockerfile ├── README.md ├── app ├── api │ ├── collected-data │ │ ├── GET.ts │ │ └── route.ts │ ├── dashboard │ │ └── route.ts │ ├── events │ │ ├── [id] │ │ │ └── route.ts │ │ └── route.ts │ ├── groq │ │ └── route.ts │ └── results │ │ ├── [id] │ │ └── route.ts │ │ └── route.ts ├── events │ ├── [id] │ │ └── page.tsx │ └── page.tsx ├── experiments │ ├── [id] │ │ └── page.tsx │ └── new │ │ └── page.tsx ├── globals.css ├── layout.tsx ├── page.tsx ├── results │ ├── [id] │ │ └── page.tsx │ └── page.tsx └── timeline │ └── page.tsx ├── components.json ├── components ├── charts │ ├── metrics-trend-chart.tsx │ ├── model-comparison-chart.tsx │ ├── progress-timeline-chart.tsx │ └── radar-comparison-chart.tsx ├── dashboard-header.tsx ├── dashboard-shell.tsx ├── dashboard │ ├── dashboard-charts.tsx │ ├── dashboard-content.tsx │ ├── recent-events.tsx │ └── recent-results.tsx ├── events-table.tsx ├── experiment-metrics.tsx ├── experiment-responses.tsx ├── experiments-list.tsx ├── flow-editor │ ├── flow-canvas.tsx │ ├── flow-nodes.tsx │ ├── node-selector.tsx │ ├── simple-canvas.tsx │ └── simple-node.tsx ├── groq-evaluator.tsx ├── results-list.tsx ├── theme-provider.tsx └── ui │ ├── accordion.tsx │ ├── alert-dialog.tsx │ ├── alert.tsx │ ├── aspect-ratio.tsx │ ├── avatar.tsx │ ├── badge.tsx │ ├── breadcrumb.tsx │ ├── button.tsx │ ├── calendar.tsx │ ├── card.tsx │ ├── carousel.tsx │ ├── chart.tsx │ ├── checkbox.tsx │ ├── collapsible.tsx │ ├── command.tsx │ ├── context-menu.tsx │ ├── dialog.tsx │ ├── drawer.tsx │ ├── dropdown-menu.tsx │ ├── form.tsx │ ├── hover-card.tsx │ ├── input-otp.tsx │ ├── input.tsx │ ├── label.tsx │ ├── menubar.tsx │ ├── navigation-menu.tsx │ ├── pagination.tsx │ ├── popover.tsx │ ├── progress.tsx │ ├── radio-group.tsx │ ├── resizable.tsx │ ├── scroll-area.tsx │ ├── select.tsx │ ├── separator.tsx │ ├── sheet.tsx │ ├── sidebar.tsx │ ├── skeleton.tsx │ ├── slider.tsx │ ├── sonner.tsx │ ├── switch.tsx │ ├── table.tsx │ ├── tabs.tsx │ ├── textarea.tsx │ ├── toast.tsx │ ├── toaster.tsx │ ├── toggle-group.tsx │ ├── toggle.tsx │ ├── tooltip.tsx │ ├── use-mobile.tsx │ └── use-toast.ts ├── database ├── duckdb.ts ├── events.ts ├── registry.ts ├── results.ts ├── schema.ts └── utils.ts ├── hooks ├── use-mobile.tsx └── use-toast.ts ├── lib └── utils.ts ├── next.config.mjs ├── package-lock.json ├── package.json ├── postcss.config.mjs ├── public ├── placeholder-logo.png ├── placeholder-logo.svg ├── placeholder-user.jpg ├── placeholder.jpg └── placeholder.svg ├── styles └── globals.css ├── tailwind.config.ts ├── tsconfig.json └── utils └── json-helpers.ts /.github/workflows/README.md: -------------------------------------------------------------------------------- 1 | # GitHub Actions Workflows 2 | 3 | This directory contains GitHub Actions workflows for the TrainLoop Evals project. 
4 | 5 | ## SDK Test Workflows 6 | 7 | ### sdk-tests-lean.yml (Primary - Runs Automatically) 8 | The main test suite that runs on every PR, PR sync, and push to main: 9 | - **Python SDK**: Tests on Python 3.11 on Ubuntu 10 | - **TypeScript SDK**: Tests on Node.js 20.x on Ubuntu 11 | - Includes linting checks for both SDKs 12 | - Optimized for fast feedback (~2-3 minutes) 13 | 14 | ### sdk-tests.yml (Full Matrix - Manual/Scheduled) 15 | Comprehensive test suite for cross-platform compatibility: 16 | - **Python SDK**: Tests on Python 3.8-3.12 across Ubuntu, macOS, and Windows 17 | - **TypeScript SDK**: Tests on Node.js 18.x, 20.x, and 22.x across Ubuntu, macOS, and Windows 18 | - **Triggers**: 19 | - Manual dispatch via GitHub Actions UI 20 | - Daily schedule (midnight UTC) 21 | - Pushes to release branches and main 22 | 23 | ## When Tests Run 24 | 25 | - **Every PR**: `sdk-tests-lean.yml` runs automatically 26 | - **Daily**: `sdk-tests.yml` runs the full matrix to catch compatibility issues 27 | - **On Demand**: Run `sdk-tests.yml` manually from the Actions tab 28 | - **Release Branches**: Both workflows run to ensure quality 29 | 30 | ## Workflow Features 31 | 32 | Both workflows include: 33 | - Automatic triggering on PRs and pushes to main 34 | - Path filtering (only runs when SDK files change) 35 | - Dependency caching for faster runs 36 | - Coverage reporting (currently generates reports but doesn't upload them) 37 | - Parallel execution of Python and TypeScript tests 38 | 39 | ## Running Tests Locally 40 | 41 | Before pushing, you can run tests locally: 42 | 43 | ```bash 44 | # Python SDK 45 | cd sdk/python 46 | poetry run pytest 47 | 48 | # TypeScript SDK 49 | cd sdk/typescript 50 | npm test 51 | ``` 52 | 53 | ## Customization 54 | 55 | To adjust the test matrix or add new checks: 56 | 1. Edit the `matrix` section in the workflow files 57 | 2. Add new steps under the respective job 58 | 3. 
Update path filters if testing new directories 59 | 60 | ## Troubleshooting 61 | 62 | - **Tests failing on Windows**: Check for path separator issues 63 | - **Cache misses**: Ensure lock files are committed 64 | - **Slow runs**: Consider using the lean workflow for routine changes 65 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | node_modules/ 3 | .venv/ 4 | venv/ 5 | env/ 6 | 7 | # Build outputs 8 | dist/ 9 | build/ 10 | *.tgz 11 | *.egg-info/ 12 | 13 | # IDE and OS files 14 | .DS_Store 15 | .idea/ 16 | .vscode/ 17 | *.swp 18 | *.swo 19 | *~ 20 | 21 | # Python 22 | __pycache__/ 23 | *.py[cod] 24 | *$py.class 25 | *.so 26 | .Python 27 | 28 | # Test coverage 29 | .coverage 30 | .coverage.* 31 | htmlcov/ 32 | coverage/ 33 | *.cover 34 | *.py,cover 35 | .hypothesis/ 36 | .pytest_cache/ 37 | coverage.xml 38 | *.coverage 39 | .nyc_output/ 40 | 41 | # Environment files 42 | .env.local 43 | .env.development.local 44 | .env.test.local 45 | .env.production.local 46 | 47 | # Logs 48 | logs/ 49 | *.log 50 | npm-debug.log* 51 | yarn-debug.log* 52 | yarn-error.log* 53 | lerna-debug.log* 54 | .pnpm-debug.log* 55 | 56 | # Testing 57 | test-results/ 58 | junit.xml 59 | 60 | # Temporary files 61 | *.tmp 62 | *.temp 63 | .cache/ 64 | 65 | # TrainLoop specific (keep data for testing) 66 | # trainloop/data/ -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 0.6.0 (2025-06-03) 4 | [Release Notes](releases/0.6.0.md) 5 | 6 | 7 | ## 0.5.0 (2025-05-27) 8 | [Release Notes](releases/0.5.0.md) 9 | 10 | ## 0.4.0 (2025-05-22) 11 | • Public Release 12 | 13 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guide 2 | 3 | ## Setup 4 | 5 | ### 1. Install pipx 6 | 7 | First, make sure you have pipx installed: 8 | 9 | ```bash 10 | python -m pip install --user pipx 11 | pipx ensurepath 12 | ``` 13 | 14 | ### 2. Install Dependencies 15 | 16 | ```bash 17 | # Dependencies installation instructions will go here 18 | ``` 19 | 20 | ### 3. Development Workflow 21 | 22 | When contributing to this project, please follow the standard GitHub workflow: 23 | 24 | 1. Fork the repository 25 | 2. Create a feature branch 26 | 3. Make your changes 27 | 4. Run tests 28 | 5. Submit a pull request 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 TrainLoop 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 0.6.0 2 | -------------------------------------------------------------------------------- /VERSIONING.md: -------------------------------------------------------------------------------- 1 | # TrainLoop Evals - Versioning 2 | 3 | TrainLoop Evals follows [Semantic Versioning](https://semver.org/) (MAJOR.MINOR.PATCH) with a simple workflow for versioning, building, and publishing packages. 4 | 5 | ## Quick Reference 6 | 7 | ```bash 8 | # Bump version (updates VERSION file, package.json files, and changelog) 9 | npm run bump patch "Bug fix description" 10 | npm run bump minor "New feature description" 11 | npm run bump major "Breaking change description" 12 | 13 | # Build packages 14 | npm run build # Build all components 15 | npm run build:docker # Build only Docker image 16 | npm run build:studio # Build only Studio package 17 | 18 | # Publish packages 19 | npm run publish # Publish all components 20 | npm run publish:sdk # Publish only SDK packages 21 | npm run publish:cli # Publish only CLI package 22 | npm run publish:studio # Publish only Studio package 23 | 24 | # Pulumi (infrastructure) 25 | npm run pulumi:bump # Update Pulumi config with new version 26 | ``` 27 | 28 | ## How Versioning Works 29 | 30 | The system uses a central `VERSION` file as the source of truth. When you run the bump command, it: 31 | 32 | 1. Updates the version in the `VERSION` file 33 | 2. Updates all package.json files and pyproject.toml files 34 | 3. Updates the CHANGELOG.md with your provided message 35 | 4. Commits changes, creates a git tag, and pushes to the main branch 36 | 37 | ## Workflow Examples 38 | 39 | ### Typical Release Process 40 | 41 | ```bash 42 | # 1. Bump version with changelog entry 43 | npm run bump minor "Added new evaluation metrics" 44 | 45 | # 2. Build all components 46 | npm run build 47 | 48 | # 3. Publish packages 49 | npm run publish 50 | 51 | # 4. Update infrastructure (if needed) 52 | npm run pulumi:bump 53 | cd infra && pulumi up 54 | ``` 55 | 56 | ### Partial Updates 57 | 58 | If you only need to update specific components: 59 | 60 | ```bash 61 | # Only publish the SDK 62 | npm run publish:sdk 63 | 64 | # Only build Docker image 65 | npm run build:docker 66 | ``` 67 | 68 | ## Technical Details 69 | 70 | - The `VERSION` file contains the current semantic version 71 | - Python scripts in the `scripts/` directory handle the version management 72 | - All components (UI, SDK, CLI, etc.) are versioned together 73 | - The system automatically updates package-lock.json files where needed 74 | - Docker images are tagged with the version number 75 | 76 | For detailed information on the implementation, see the scripts in the `scripts/` directory. 
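In essence, the bump step reads `VERSION`, increments the requested part, and rewrites everything that references the version. A minimal sketch of that flow, for orientation only (the actual `scripts/bump/bump_version.py` also updates the `package.json` and `pyproject.toml` files, regenerates lock files, and performs the git commit, tag, and push):

```python
"""Illustrative sketch of the version-bump flow; not the real bump script."""
from datetime import date
from pathlib import Path


def bump(part: str) -> str:
    # VERSION is the single source of truth for the semantic version.
    major, minor, patch = (int(x) for x in Path("VERSION").read_text().strip().split("."))
    if part == "major":
        major, minor, patch = major + 1, 0, 0
    elif part == "minor":
        minor, patch = minor + 1, 0
    else:
        patch += 1
    new_version = f"{major}.{minor}.{patch}"
    Path("VERSION").write_text(new_version + "\n")

    # CHANGELOG.md entries simply link to the matching file under releases/.
    changelog = Path("CHANGELOG.md")
    lines = changelog.read_text().splitlines()
    entry = [
        f"## {new_version} ({date.today():%Y-%m-%d})",
        f"[Release Notes](releases/{new_version}.md)",
        "",
    ]
    changelog.write_text("\n".join(lines[:2] + entry + lines[2:]) + "\n")
    return new_version
```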
77 | -------------------------------------------------------------------------------- /cli/.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | build 3 | *.egg-info 4 | __pycache__ 5 | ui -------------------------------------------------------------------------------- /cli/README.md: -------------------------------------------------------------------------------- 1 | # TrainLoop CLI 2 | 3 | The CLI bootstraps a `trainloop/` workspace, runs test suites and launches the Studio UI. 4 | 5 | ## Install 6 | 7 | ```bash 8 | pip install trainloop-cli 9 | # or via pipx 10 | pipx run trainloop-cli --help 11 | ``` 12 | 13 | ## Commands 14 | 15 | ```bash 16 | trainloop init # scaffold workspace 17 | trainloop eval # run suites in eval/suites 18 | trainloop studio # open the web studio 19 | ``` 20 | 21 | See the [project README](../README.md) for the full workflow. 22 | -------------------------------------------------------------------------------- /cli/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "trainloop-cli" 3 | version = "0.6.0" 4 | description = "TrainLoop command-line interface" 5 | authors = ["Mason Pierce "] 6 | readme = "README.md" 7 | 8 | # Include trainloop_cli package and UI files 9 | packages = [{ include = "trainloop_cli" }] 10 | 11 | [tool.poetry.dependencies] 12 | python = "^3.9" 13 | click = "^8.1.0" 14 | pyyaml = "^6.0" 15 | packaging = "^23.0" 16 | tomli = "^2.0" 17 | litellm = "^1.72.0" 18 | 19 | [tool.poetry.group.dev.dependencies] 20 | pytest = "^7.0" 21 | black = "^23.3.0" 22 | flake8 = "^6.0" 23 | 24 | 25 | [tool.poetry.scripts] 26 | # exposes a `trainloop` CLI entrypoint: 27 | trainloop = "trainloop_cli.__main__:main" 28 | 29 | [build-system] 30 | requires = ["poetry-core>=1.0.0"] 31 | build-backend = "poetry.core.masonry.api" 32 | -------------------------------------------------------------------------------- /cli/trainloop_cli/__init__.py: -------------------------------------------------------------------------------- 1 | """TrainLoop Evaluations CLI package.""" 2 | -------------------------------------------------------------------------------- /cli/trainloop_cli/commands/__init__.py: -------------------------------------------------------------------------------- 1 | """TrainLoop Evaluations CLI commands.""" 2 | 3 | from .init import init_command as init_cmd 4 | from .eval import eval_command as eval_cmd 5 | from .studio import studio_command as studio_cmd 6 | from .add import add_command as add_cmd 7 | 8 | __all__ = ["init_cmd", "eval_cmd", "studio_cmd", "add_cmd"] 9 | -------------------------------------------------------------------------------- /cli/trainloop_cli/commands/utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import os 3 | import yaml 4 | 5 | 6 | def find_root(silent_on_error: bool = False) -> Path | None: 7 | """Walk upward until we hit trainloop.config.yaml; error if missing.""" 8 | cur = Path.cwd() 9 | for p in [cur, *cur.parents]: 10 | if (p / "trainloop.config.yaml").exists(): 11 | return p 12 | 13 | if silent_on_error: 14 | return None 15 | 16 | raise RuntimeError( 17 | "❌ trainloop.config.yaml not found. " 18 | "Run this command inside the trainloop folder " 19 | "or create one with `trainloop init`." 
20 | ) 21 | 22 | 23 | def resolve_data_folder_path(data_folder: str, config_path: Path) -> str: 24 | """ 25 | Resolves the data folder path to an absolute path. 26 | 27 | Args: 28 | data_folder: The data folder path from config 29 | config_path: The path to the config file 30 | 31 | Returns: 32 | The resolved absolute data folder path as a string 33 | """ 34 | if not data_folder: 35 | return "" 36 | 37 | data_folder_path = Path(data_folder) 38 | if data_folder_path.is_absolute(): 39 | # If it's an absolute path, use it directly 40 | return str(data_folder_path.absolute()) 41 | 42 | # If it's relative, make it relative to config directory and convert to absolute 43 | config_dir = Path(config_path).parent 44 | return str((config_dir / data_folder_path).absolute()) 45 | 46 | 47 | def load_config_for_cli(root_path: Path) -> None: 48 | """Parse YAML and export env-vars exactly like the JS SDK.""" 49 | trainloop_config_path = root_path / "trainloop.config.yaml" 50 | if not trainloop_config_path.exists(): 51 | return 52 | 53 | config = yaml.safe_load(trainloop_config_path.read_text()) or {} 54 | trainloop_config = config.get("trainloop", {}) 55 | data_folder = trainloop_config.get("data_folder", "") 56 | resolved_path = resolve_data_folder_path(data_folder, trainloop_config_path) 57 | 58 | if "data_folder" in trainloop_config: # required 59 | os.environ["TRAINLOOP_DATA_FOLDER"] = resolved_path 60 | if "log_level" in trainloop_config: # optional 61 | os.environ["TRAINLOOP_LOG_LEVEL"] = str( 62 | trainloop_config.get("log_level", "info").upper() 63 | ) 64 | -------------------------------------------------------------------------------- /cli/trainloop_cli/eval_core/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /cli/trainloop_cli/eval_core/_trace_helpers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper functions for LLM Judge tracing. 3 | """ 4 | 5 | from __future__ import annotations 6 | import os 7 | import json 8 | import logging 9 | from pathlib import Path 10 | from typing import List, Dict, Optional, Any 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def ensure_trace_dir() -> Optional[Path]: 16 | """ 17 | Ensure the judge_traces directory exists inside the TRAINLOOP_DATA_FOLDER 18 | and return its path. Returns None if TRAINLOOP_DATA_FOLDER is not set. 19 | """ 20 | data_folder_path_str = os.getenv("TRAINLOOP_DATA_FOLDER") 21 | 22 | if not data_folder_path_str: 23 | logger.error( 24 | "TRAINLOOP_DATA_FOLDER environment variable is not set. " 25 | "Cannot save judge traces." 26 | ) 27 | return None 28 | 29 | data_folder_path = Path(data_folder_path_str) 30 | if not data_folder_path.is_dir(): 31 | logger.error( 32 | f"TRAINLOOP_DATA_FOLDER ('{data_folder_path_str}') does not exist or is not a directory. " 33 | "Cannot save judge traces." 
34 | ) 35 | return None 36 | 37 | trace_dir = data_folder_path / "judge_traces" 38 | 39 | try: 40 | trace_dir.mkdir(parents=True, exist_ok=True) 41 | return trace_dir 42 | except OSError as e: 43 | logger.error(f"Could not create trace directory {trace_dir}: {e}") 44 | return None 45 | 46 | 47 | def write_trace_log( 48 | trace_id: str, trace_events: List[Dict[str, Any]], trace_dir: Optional[Path] 49 | ): 50 | """Write the trace events to a .jsonl file.""" 51 | if not trace_dir: 52 | logger.info( 53 | "Trace directory not available (TRAINLOOP_DATA_FOLDER likely not set). Skipping writing trace log." 54 | ) 55 | return 56 | 57 | if not trace_events: 58 | logger.info(f"No trace events to write for trace_id {trace_id}.") 59 | return 60 | 61 | trace_file_path = trace_dir / f"{trace_id}.jsonl" 62 | try: 63 | with open(trace_file_path, "w", encoding="utf-8") as f: 64 | for event in trace_events: 65 | json.dump(event, f) 66 | f.write("\n") 67 | logger.info(f"Judge trace written to: {trace_file_path}") 68 | except IOError as e: 69 | logger.error(f"Failed to write trace log to {trace_file_path}: {e}") 70 | -------------------------------------------------------------------------------- /cli/trainloop_cli/eval_core/types.py: -------------------------------------------------------------------------------- 1 | """ 2 | TrainLoop evaluation types - core data classes for samples and results. 3 | """ 4 | 5 | from __future__ import annotations 6 | from dataclasses import dataclass 7 | from typing import Any, Literal 8 | from typing import TypedDict, List, Dict 9 | 10 | 11 | class CollectedSampleDict(TypedDict, total=False): 12 | durationMs: int 13 | tag: str 14 | input: List[Dict[str, str]] 15 | output: Dict[Literal["content"], str] 16 | model: str 17 | modelParams: Dict[str, Any] 18 | startTimeMs: int 19 | endTimeMs: int 20 | url: str 21 | location: Dict[Literal["tag", "lineNumber"], str] 22 | 23 | 24 | @dataclass(slots=True, frozen=True) 25 | class Sample: 26 | duration_ms: int # Duration of the request in milliseconds 27 | tag: str # The tag of the event 28 | input: List[Dict[str, str]] # Input(s) to the model 29 | output: Dict[Literal["content"], str] # Output(s) from the model 30 | model: str # The model used to generate the response 31 | model_params: Dict[str, Any] # Model parameters 32 | start_time_ms: int # Start time in milliseconds since epoch 33 | end_time_ms: int # End time in milliseconds since epoch 34 | url: str # The request URL 35 | location: Dict[ 36 | Literal["tag", "lineNumber"], str 37 | ] # Location information (tag, lineNumber) 38 | 39 | 40 | @dataclass(slots=True, frozen=False) 41 | class Result: 42 | metric: str # The name of the metric 43 | sample: Sample # The sample that was evaluated 44 | passed: int # 1 or 0 45 | reason: str | None = None # The reason for the failure (if any) 46 | -------------------------------------------------------------------------------- /cli/trainloop_cli/scaffold/trainloop/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore data directory (events and results) 2 | data/ 3 | 4 | # Python artifacts 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | *.so 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # Virtual environment 28 | venv/ 29 | .venv/ 30 | .env 31 | 32 | # Editor directories and files 33 | .idea/ 34 | .vscode/ 35 | 
*.swp 36 | *.swo 37 | *~ 38 | -------------------------------------------------------------------------------- /cli/trainloop_cli/scaffold/trainloop/eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TrainLoop/evals/2e6776b49539d12d82443e573419931a3c24458f/cli/trainloop_cli/scaffold/trainloop/eval/__init__.py -------------------------------------------------------------------------------- /cli/trainloop_cli/scaffold/trainloop/eval/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | import pkgutil 2 | import inspect 3 | from importlib import import_module 4 | 5 | _funcs = {} 6 | for mod in pkgutil.walk_packages(__path__, __name__ + "."): 7 | m = import_module(mod.name) 8 | for _, obj in inspect.getmembers(m, inspect.isfunction): 9 | if obj.__module__ == m.__name__: 10 | _funcs[obj.__name__] = obj 11 | globals().update(_funcs) # so users can `from ... import does_compile` 12 | -------------------------------------------------------------------------------- /cli/trainloop_cli/scaffold/trainloop/eval/metrics/always_pass.py: -------------------------------------------------------------------------------- 1 | from trainloop_cli.eval_core.types import Sample 2 | 3 | 4 | def always_pass(_: Sample) -> int: # 1 = pass, 0 = fail 5 | return 1 6 | -------------------------------------------------------------------------------- /cli/trainloop_cli/scaffold/trainloop/eval/suites/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TrainLoop/evals/2e6776b49539d12d82443e573419931a3c24458f/cli/trainloop_cli/scaffold/trainloop/eval/suites/__init__.py -------------------------------------------------------------------------------- /cli/trainloop_cli/scaffold/trainloop/eval/suites/always_passes.py: -------------------------------------------------------------------------------- 1 | from trainloop_cli.eval_core.helpers import tag 2 | from ..metrics import always_pass 3 | 4 | # You can define as many metrics as you like to test against and chain them here. These will run on every sample matching "my-tag". 5 | results = tag("my-tag").check(always_pass, always_pass) 6 | -------------------------------------------------------------------------------- /cli/trainloop_cli/scaffold/trainloop/eval/suites/is_helpful.py: -------------------------------------------------------------------------------- 1 | from trainloop_cli.eval_core.helpers import tag 2 | from ..metrics import is_helpful 3 | 4 | # You can define as many metrics as you like to test against and chain them here. These will run on every sample matching "my-tag". 
5 | results = tag("my-tag").check(is_helpful) 6 | -------------------------------------------------------------------------------- /cli/trainloop_cli/scaffold/trainloop/trainloop.config.yaml: -------------------------------------------------------------------------------- 1 | # TrainLoop Configuration 2 | trainloop: 3 | data_folder: data # Relative to this config file 4 | host_allowlist: 5 | - api.openai.com 6 | - api.anthropic.com 7 | log_level: warn 8 | # Judge Configuration 9 | judge: 10 | env_path: "../.env.judge" # Optional: Path to a .env file to load for the judge (e.g., for API keys) 11 | models: 12 | - openai/gpt-4.1-2025-04-14 13 | - anthropic/claude-sonnet-4-20250514 14 | calls_per_model_per_claim: 3 15 | temperature: 0.7 16 | -------------------------------------------------------------------------------- /images/drake_evals.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TrainLoop/evals/2e6776b49539d12d82443e573419931a3c24458f/images/drake_evals.png -------------------------------------------------------------------------------- /infra/.gitignore: -------------------------------------------------------------------------------- 1 | /bin/ 2 | /node_modules/ 3 | -------------------------------------------------------------------------------- /infra/Pulumi.trainloop.yaml: -------------------------------------------------------------------------------- 1 | config: 2 | aws:region: us-west-2 3 | appImage: ghcr.io/trainloop/evals 4 | appVersion: 0.4.0 5 | # Change these to your desired values 6 | subdomain: evals 7 | hostedZoneDomain: trainloop.ai 8 | # S3 bucket configuration 9 | s3MountEnabled: true 10 | s3MountPath: /mnt/s3data 11 | -------------------------------------------------------------------------------- /infra/Pulumi.yaml: -------------------------------------------------------------------------------- 1 | name: evals 2 | description: A simple deployment for the TrainLoop Evaluation UI 3 | runtime: 4 | name: nodejs 5 | options: 6 | packagemanager: npm 7 | -------------------------------------------------------------------------------- /infra/README.md: -------------------------------------------------------------------------------- 1 | # TrainLoop Infra 2 | 3 | Pulumi configuration for deploying the Studio as a small demo on AWS. 4 | 5 | ## Usage 6 | 7 | ```bash 8 | pulumi preview 9 | pulumi up # deploy 10 | ``` 11 | 12 | See `Pulumi.trainloop.yaml` for configurable options. 
13 | -------------------------------------------------------------------------------- /infra/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "evals", 3 | "main": "index.ts", 4 | "devDependencies": { 5 | "@types/node": "^18", 6 | "typescript": "^5.0.0" 7 | }, 8 | "dependencies": { 9 | "@pulumi/aws": "^6.0.0", 10 | "@pulumi/awsx": "^2.0.2", 11 | "@pulumi/pulumi": "^3.113.0" 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /infra/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "strict": true, 4 | "outDir": "bin", 5 | "target": "es2020", 6 | "module": "commonjs", 7 | "moduleResolution": "node", 8 | "sourceMap": true, 9 | "experimentalDecorators": true, 10 | "pretty": true, 11 | "noFallthroughCasesInSwitch": true, 12 | "noImplicitReturns": true, 13 | "forceConsistentCasingInFileNames": true 14 | }, 15 | "files": [ 16 | "index.ts" 17 | ] 18 | } 19 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "trainloop-evals", 3 | "private": false, 4 | "description": "TrainLoop Evaluation Framework", 5 | "repository": { 6 | "type": "git", 7 | "url": "https://github.com/trainloop/evals" 8 | }, 9 | "scripts": { 10 | "bump": "pipx run scripts/bump/bump_version.py", 11 | "build": "pipx run scripts/build.py", 12 | "build:docker": "pipx run scripts/build.py --skip-studio", 13 | "build:studio": "pipx run scripts/build.py --skip-docker", 14 | "publish": "pipx run scripts/publish.py", 15 | "publish:sdk": "pipx run scripts/publish.py --skip-cli --skip-studio", 16 | "publish:cli": "pipx run scripts/publish.py --skip-sdk --skip-studio", 17 | "publish:studio": "pipx run scripts/publish.py --skip-sdk --skip-cli", 18 | "pulumi:bump": "pipx run scripts/bump/bump_pulumi.py", 19 | "pulumi:destroy": "cd infra && pulumi destroy", 20 | "dev": "cd ui && npm run dev" 21 | }, 22 | "engines": { 23 | "node": ">=20.0.0" 24 | } 25 | } -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. 2 | 3 | [[package]] 4 | name = "click" 5 | version = "8.1.8" 6 | description = "Composable command line interface toolkit" 7 | optional = false 8 | python-versions = ">=3.7" 9 | groups = ["main"] 10 | files = [ 11 | {file = "click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2"}, 12 | {file = "click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a"}, 13 | ] 14 | 15 | [package.dependencies] 16 | colorama = {version = "*", markers = "platform_system == \"Windows\""} 17 | 18 | [[package]] 19 | name = "colorama" 20 | version = "0.4.6" 21 | description = "Cross-platform colored terminal text." 
22 | optional = false 23 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" 24 | groups = ["main"] 25 | markers = "platform_system == \"Windows\"" 26 | files = [ 27 | {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, 28 | {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, 29 | ] 30 | 31 | [[package]] 32 | name = "trainloop-cli" 33 | version = "0.1.0" 34 | description = "TrainLoop command‑line interface" 35 | optional = false 36 | python-versions = "^3.9" 37 | groups = ["main"] 38 | files = [] 39 | develop = true 40 | 41 | [package.dependencies] 42 | click = "^8.1.0" 43 | 44 | [package.source] 45 | type = "directory" 46 | url = "cli" 47 | 48 | [metadata] 49 | lock-version = "2.1" 50 | python-versions = "^3.9" 51 | content-hash = "ecd9e8c0cea4a9e147b72fe40ac3e15d186d1026af50431a074b5536d54bf3be" 52 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "trainloop-monorepo" 3 | version = "0.3.0" 4 | description = "TrainLoop monorepo (CLI + infra + UI)" 5 | authors = ["Mason Pierce "] 6 | readme = "README.md" 7 | 8 | package-mode = false 9 | 10 | [tool.poetry.dependencies] 11 | python = "^3.9" 12 | trainloop-cli = { path = "cli", develop = true } 13 | trainloop-llm-logging = { path = "sdk/python", develop = true } 14 | 15 | [build-system] 16 | requires = ["poetry-core>=1.0.0"] 17 | build-backend = "poetry.core.masonry.api" 18 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = tests 3 | python_files = test_*.py 4 | python_classes = Test* 5 | python_functions = test_* 6 | 7 | # Test markers for categorization 8 | markers = 9 | unit: Fast unit tests 10 | integration: End-to-end integration tests 11 | slow: Tests that take longer to run 12 | judge: Tests that involve LLM judge functionality 13 | cli: Tests for CLI commands 14 | scaffold: Tests for scaffold template functionality 15 | registry: Tests for registry components 16 | 17 | # Output and reporting 18 | addopts = 19 | -v 20 | --tb=short 21 | --strict-markers 22 | 23 | # Ignore warnings from dependencies 24 | filterwarnings = 25 | ignore::DeprecationWarning 26 | ignore::PendingDeprecationWarning 27 | 28 | # Test discovery 29 | norecursedirs = .git .tox dist build *.egg __pycache__ .venv venv 30 | -------------------------------------------------------------------------------- /registry/README.md: -------------------------------------------------------------------------------- 1 | # TrainLoop Registry 2 | 3 | This directory contains pre-built metrics and suites that can be installed using the `trainloop add` command. 
4 | 5 | ## Structure 6 | 7 | ``` 8 | registry/ 9 | ├── config_types.py # Type definitions for configs 10 | ├── metrics/ 11 | │ ├── index.py # Auto-generated list of available metrics 12 | │ └── {metric_name}/ 13 | │ ├── config.py # Metric metadata and configuration 14 | │ └── {metric_name}.py # Metric implementation 15 | └── suites/ 16 | ├── index.py # Auto-generated list of available suites 17 | └── {suite_name}/ 18 | ├── config.py # Suite metadata and dependencies 19 | └── {suite_name}.py # Suite implementation 20 | ``` 21 | 22 | ## Adding a New Metric 23 | 24 | 1. Create a new directory under `registry/metrics/` with your metric name 25 | 2. Add `config.py`: 26 | ```python 27 | from registry.config_types import MetricConfig 28 | 29 | config = MetricConfig( 30 | name="your_metric_name", 31 | description="What this metric does", 32 | min_version="0.5.0", # Minimum CLI version required 33 | dependencies=[], # Other metrics this depends on 34 | author="Your Name", 35 | tags=["category", "purpose"], 36 | ) 37 | ``` 38 | 3. Add `{metric_name}.py` with your implementation 39 | 4. The metric will automatically appear in the index (no manual updates needed!) 40 | 41 | ## Adding a New Suite 42 | 43 | 1. Create a new directory under `registry/suites/` with your suite name 44 | 2. Add `config.py`: 45 | ```python 46 | from registry.config_types import SuiteConfig 47 | 48 | config = SuiteConfig( 49 | name="your_suite_name", 50 | description="What this suite tests", 51 | min_version="0.5.0", 52 | dependencies=["metric_one", "metric_two"], # Required metrics 53 | author="Your Name", 54 | tags=["category", "use_case"], 55 | ) 56 | ``` 57 | 3. Add `{suite_name}.py` with your suite definition 58 | 4. The suite will automatically appear in the index (no manual updates needed!) 59 | 60 | ## Usage 61 | 62 | Users can install components using: 63 | 64 | ```bash 65 | # Install a metric 66 | trainloop add metric always_pass 67 | 68 | # Install a suite (and its dependencies) 69 | trainloop add suite sample 70 | 71 | # List available components 72 | trainloop add metric --list 73 | trainloop add suite --list 74 | 75 | # Force overwrite existing components 76 | trainloop add metric always_pass --force 77 | 78 | # Use a specific version 79 | trainloop add metric always_pass --version 0.4.0 80 | ``` 81 | 82 | ## Version Compatibility 83 | 84 | The `min_version` field in config ensures that components are only installed on compatible CLI versions. This prevents users from installing components that rely on features not available in their version. 85 | 86 | ## Best Practices 87 | 88 | 1. Keep metrics focused on a single concern 89 | 2. Metrics must return a reward between 0 and 1, where 1 is a pass and 0 is a fail 90 | 3. Suites must return an array of Result objects 91 | 4. Include meaningful error messages 92 | 5. Use descriptive names that indicate what the metric/suite tests. Check for existing names to avoid conflicts. 93 | 6. 
Use type hints for better IDE support and validation 94 | -------------------------------------------------------------------------------- /registry/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /registry/config_types.py: -------------------------------------------------------------------------------- 1 | """Configuration types for registry components.""" 2 | 3 | from dataclasses import dataclass 4 | from typing import List 5 | 6 | 7 | @dataclass 8 | class MetricConfig: 9 | """Configuration for a metric.""" 10 | 11 | name: str 12 | description: str 13 | min_version: str 14 | dependencies: List[str] = None 15 | author: str = "TrainLoop Team" 16 | tags: List[str] = None 17 | 18 | def __post_init__(self): 19 | if self.dependencies is None: 20 | self.dependencies = [] 21 | if self.tags is None: 22 | self.tags = [] 23 | 24 | 25 | @dataclass 26 | class SuiteConfig: 27 | """Configuration for a suite.""" 28 | 29 | name: str 30 | description: str 31 | min_version: str 32 | dependencies: List[str] # Required for suites - these are metric names 33 | author: str = "TrainLoop Team" 34 | tags: List[str] = None 35 | 36 | def __post_init__(self): 37 | if self.tags is None: 38 | self.tags = [] 39 | -------------------------------------------------------------------------------- /registry/helpers.py: -------------------------------------------------------------------------------- 1 | """Helper stubs for TrainLoop registry components.""" 2 | 3 | from __future__ import annotations 4 | from typing import Callable, List 5 | from .types import Sample, Result 6 | 7 | 8 | class Tag: 9 | """Stub for tag-based metric checking.""" 10 | 11 | def __init__(self, samples: List[Sample]): 12 | self.samples = samples 13 | 14 | def check( 15 | self, *metrics: Callable[[Sample], int], workers: int | None = None 16 | ) -> List[Result]: 17 | """Apply metrics to samples matching the tag.""" 18 | # This is a stub - actual implementation would run metrics 19 | _ = (metrics, workers) # Mark as intentionally unused 20 | return [] 21 | 22 | 23 | def tag(name: str, raw: bool = False) -> Tag | List[Sample]: 24 | """Create a tag-based checker for applying metrics.""" 25 | # This is a stub - actual implementation would load samples 26 | _ = (name, raw) # Mark as intentionally unused 27 | return Tag([]) 28 | -------------------------------------------------------------------------------- /registry/judge.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Dict, Any 2 | 3 | def assert_true( 4 | positive_claim: str, 5 | negative_claim: str, 6 | cfg: Optional[Dict[str, Any]] = None 7 | ) -> int: 8 | """ 9 | Stub for the assert_true function from the TrainLoop judge. 10 | This is for type checking and local registry development. 11 | The actual implementation is in the main CLI package. 12 | """ 13 | # In a real scenario, this would involve LLM calls. 14 | # For a stub, we can return a default value like 1 or 0, 15 | # or raise NotImplementedError. Returning an int allows 16 | # type checking of arithmetic operations if the result is used that way. 
17 | return 1 18 | -------------------------------------------------------------------------------- /registry/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TrainLoop/evals/2e6776b49539d12d82443e573419931a3c24458f/registry/metrics/__init__.py -------------------------------------------------------------------------------- /registry/metrics/always_pass/always_pass.py: -------------------------------------------------------------------------------- 1 | from registry.types import Sample 2 | 3 | 4 | def always_pass(_: Sample) -> int: # 1 = pass, 0 = fail 5 | return 1 6 | -------------------------------------------------------------------------------- /registry/metrics/always_pass/config.py: -------------------------------------------------------------------------------- 1 | """Configuration for the always_pass metric.""" 2 | 3 | from registry.config_types import MetricConfig 4 | 5 | config = MetricConfig( 6 | name="always_pass", 7 | description="A simple metric that always returns a passing verdict", 8 | min_version="0.5.0", 9 | dependencies=[], 10 | author="TrainLoop Team", 11 | tags=["testing", "basic"], 12 | ) 13 | -------------------------------------------------------------------------------- /registry/metrics/index.py: -------------------------------------------------------------------------------- 1 | """Index of available metrics in the TrainLoop registry.""" 2 | 3 | from pathlib import Path 4 | from importlib import import_module 5 | import sys 6 | 7 | # Add registry to path so we can import configs 8 | registry_path = Path(__file__).parent.parent 9 | sys.path.insert(0, str(registry_path)) 10 | 11 | # Dynamically discover all metrics 12 | components = [] 13 | metrics_dir = Path(__file__).parent 14 | 15 | for metric_dir in metrics_dir.iterdir(): 16 | if metric_dir.is_dir() and not metric_dir.name.startswith("_"): 17 | config_file = metric_dir / "config.py" 18 | if config_file.exists(): 19 | try: 20 | # Import the config module 21 | module = import_module(f"metrics.{metric_dir.name}.config") 22 | if hasattr(module, "config"): 23 | components.append({ 24 | "name": module.config.name, 25 | "description": module.config.description, 26 | "tags": module.config.tags, 27 | }) 28 | except ImportError as e: 29 | print(f"Failed to import metrics.{metric_dir.name}.config: {e}") 30 | except Exception as e: 31 | print(f"Error processing {metric_dir.name}: {e}") 32 | 33 | # Clean up sys.path 34 | sys.path.pop(0) 35 | -------------------------------------------------------------------------------- /registry/metrics/is_helpful/config.py: -------------------------------------------------------------------------------- 1 | from registry.config_types import MetricConfig 2 | 3 | config = MetricConfig( 4 | name="is_helpful", 5 | description="Metric using the TrainLoop judge to evaluate response helpfulness.", 6 | min_version="0.1.0", 7 | dependencies=[], 8 | author="TrainLoop Team", 9 | tags=["judge", "helpfulness"], 10 | ) 11 | -------------------------------------------------------------------------------- /registry/metrics_registry.py: -------------------------------------------------------------------------------- 1 | """Registry metrics module - imports all available metrics.""" 2 | 3 | from registry.metrics.always_pass import always_pass 4 | from registry.metrics.is_helpful import is_helpful 5 | 6 | # Export all metrics 7 | __all__ = [ 8 | "always_pass", 9 | "is_helpful", 10 | ] 11 | 
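For orientation, a judge-backed metric pairs the `assert_true` helper stubbed above with the fields available on a `Sample`. A hypothetical sketch of such a metric (this is not the shipped `registry/metrics/is_helpful/is_helpful.py`; the real implementation and its claim wording may differ):

```python
from registry.judge import assert_true
from registry.types import Sample


def is_helpful_sketch(sample: Sample) -> int:  # 1 = pass, 0 = fail
    # Frame the check as a positive/negative claim pair for the LLM judge.
    response = sample.output.get("content", "")
    positive = f"This response is helpful: {response}"
    negative = f"This response is unhelpful: {response}"
    return assert_true(positive, negative)
```

Like `always_pass`, the metric returns an integer verdict, so it can be chained in a suite via `tag("my-tag").check(...)`.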
-------------------------------------------------------------------------------- /registry/suites/index.py: -------------------------------------------------------------------------------- 1 | """Index of available suites in the TrainLoop registry.""" 2 | 3 | from pathlib import Path 4 | from importlib import import_module 5 | import sys 6 | 7 | # Add registry to path so we can import configs 8 | registry_path = Path(__file__).parent.parent 9 | sys.path.insert(0, str(registry_path)) 10 | 11 | # Dynamically discover all suites 12 | components = [] 13 | suites_dir = Path(__file__).parent 14 | 15 | for suite_dir in suites_dir.iterdir(): 16 | if suite_dir.is_dir() and not suite_dir.name.startswith("_"): 17 | config_file = suite_dir / "config.py" 18 | if config_file.exists(): 19 | try: 20 | # Import the config module 21 | module = import_module(f"suites.{suite_dir.name}.config") 22 | if hasattr(module, "config"): 23 | components.append({ 24 | "name": module.config.name, 25 | "description": module.config.description, 26 | "dependencies": module.config.dependencies, 27 | "tags": module.config.tags, 28 | }) 29 | except ImportError: 30 | pass # Skip suites that can't be imported 31 | 32 | # Clean up sys.path 33 | sys.path.pop(0) 34 | -------------------------------------------------------------------------------- /registry/suites/is_helpful/config.py: -------------------------------------------------------------------------------- 1 | from registry.config_types import SuiteConfig 2 | 3 | config = SuiteConfig( 4 | name="is_helpful", 5 | description="A suite that uses the is_helpful metric to evaluate responses tagged with 'my-tag'.", 6 | min_version="0.1.0", 7 | dependencies=["is_helpful"], # Refers to the 'is_helpful' metric 8 | author="TrainLoop Team", 9 | tags=["helpfulness", "example"], 10 | ) 11 | -------------------------------------------------------------------------------- /registry/suites/is_helpful/is_helpful.py: -------------------------------------------------------------------------------- 1 | from registry.metrics_registry import is_helpful 2 | from registry.helpers import tag 3 | 4 | # You can define as many metrics as you like to test against and chain them here. These will run on every sample matching "my-tag". 5 | results = tag("my-tag").check(is_helpful) 6 | -------------------------------------------------------------------------------- /registry/suites/sample/config.py: -------------------------------------------------------------------------------- 1 | """Configuration for the sample suite.""" 2 | 3 | from registry.config_types import SuiteConfig 4 | 5 | config = SuiteConfig( 6 | name="sample", 7 | description="A sample evaluation suite demonstrating how to test LLM behavior", 8 | min_version="0.5.0", 9 | dependencies=["always_pass"], 10 | author="TrainLoop Team", 11 | tags=["example", "starter"], 12 | ) 13 | -------------------------------------------------------------------------------- /registry/suites/sample/sample.py: -------------------------------------------------------------------------------- 1 | from registry.metrics_registry import always_pass 2 | from registry.helpers import tag 3 | 4 | # You can define as many metrics as you like to test against and chain them here. These will run on every sample matching "my-tag". 
5 | results = tag("my-tag").check(always_pass, always_pass) 6 | -------------------------------------------------------------------------------- /registry/types.py: -------------------------------------------------------------------------------- 1 | """Type stubs for TrainLoop registry components.""" 2 | 3 | from __future__ import annotations 4 | from dataclasses import dataclass 5 | from typing import Any, List, Dict, Literal 6 | 7 | 8 | @dataclass(slots=True, frozen=True) 9 | class Sample: 10 | """Represents a sample for evaluation. 11 | 12 | This matches the actual Sample type from trainloop eval types. 13 | """ 14 | duration_ms: int # Duration of the request in milliseconds 15 | tag: str # The tag of the event 16 | input: List[Dict[str, str]] # Input(s) to the model 17 | output: Dict[Literal["content"], str] # Output(s) from the model 18 | model: str # The model used to generate the response 19 | model_params: Dict[str, Any] # Model parameters 20 | start_time_ms: int # Start time in milliseconds since epoch 21 | end_time_ms: int # End time in milliseconds since epoch 22 | url: str # The request URL 23 | location: Dict[Literal["tag", "lineNumber"], str] # Location information 24 | 25 | 26 | @dataclass(slots=True, frozen=False) 27 | class Result: 28 | """Represents a metric evaluation result.""" 29 | metric: str # The name of the metric 30 | sample: Sample # The sample that was evaluated 31 | passed: int # 1 or 0 32 | reason: str | None = None # The reason for the failure (if any) 33 | -------------------------------------------------------------------------------- /releases/0.5.0.md: -------------------------------------------------------------------------------- 1 | Summary: Major SDK refactoring with comprehensive testing infrastructure 2 | 3 | **Major SDK Refactoring and Testing Infrastructure** 4 | 5 | The changes in this release include: 6 | 7 | ### 📁 SDK Restructuring 8 | - Renamed SDK from "evals-sdk" to "sdk" 9 | - Renamed packages from "trainloop_evals" to "trainloop_llm_logging" 10 | - Improved config functions to handle environment variable fallbacks 11 | 12 | ### 🧪 Testing Infrastructure 13 | - Added comprehensive test suites with over 3000 lines of tests 14 | - Implemented pytest configuration for Python SDK 15 | - Implemented Jest configuration for TypeScript SDK 16 | - Created test directory structure: unit/, integration/, edge-cases/ 17 | - Added test utilities and fixtures for both SDKs 18 | 19 | ### 🔄 CI/CD Workflows 20 | - Added GitHub Actions workflows for automated testing 21 | - Created lean workflow for fast PR feedback (~2-3 minutes) 22 | - Created full matrix workflow for comprehensive OS/version testing 23 | - Configured path filtering to only run when SDK files change 24 | 25 | ### 🐛 Bug Fixes 26 | - Fixed UI database initialization to handle empty directories gracefully 27 | - Fixed concurrency issues in DuckDB 28 | - Minor bug fixes in instrumentation 29 | 30 | ### 🛠️ Enhancements 31 | - Enhanced studio CLI with `--config` and `--local` arguments 32 | - Added comprehensive testing documentation in `sdk/TESTING.md` 33 | - Updated README with Drake meme for better engagement 34 | -------------------------------------------------------------------------------- /releases/0.6.0.md: -------------------------------------------------------------------------------- 1 | Summary: Go SDK release and Registry system with enhanced CLI 2 | 3 | **Go SDK and Registry System** 4 | 5 | The changes in this release include: 6 | 7 | ### 🚀 New Go SDK 8 | - Released v1 of the 
TrainLoop Go SDK for automatic LLM call logging 9 | - Supports instrumentation of standard `net/http` package 10 | - Captures requests to OpenAI, Anthropic, and other LLM providers 11 | - Features automatic tagging, location tracking, and data export 12 | - Includes comprehensive documentation and examples 13 | 14 | ### 📦 Registry System 15 | - Created a new registry system for sharing metrics and evaluation suites 16 | - Added `trainloop add` command for installing metrics and suites from the registry 17 | - Supports both online GitHub-based registry and local development with `--registry` flag 18 | - Implemented dynamic index loading for automatic component discovery 19 | - Added typed Python configuration with dataclasses for better IDE support 20 | 21 | ### 🛠️ CLI Enhancements 22 | - Enhanced `trainloop add` command with flexible options: 23 | - `trainloop add --list` to view all available components 24 | - `trainloop add metric ` to install specific metrics 25 | - `trainloop add suite ` to install suites with dependencies 26 | - Added `--registry` flag for local registry development and testing 27 | - Improved error handling and user feedback 28 | 29 | ### 📚 Documentation Updates 30 | - Updated TypeScript SDK README to clarify JavaScript compatibility 31 | - Added Go SDK reference to the main README 32 | - Improved installation instructions for all SDKs 33 | 34 | ### 🐛 Bug Fixes 35 | - Fixed import path handling in the `trainloop add` command 36 | - Fixed issue where untagged calls were incorrectly labeled 37 | - Corrected Go SDK import paths to match GitHub repository structure -------------------------------------------------------------------------------- /runner/.gitignore: -------------------------------------------------------------------------------- 1 | _bundle -------------------------------------------------------------------------------- /runner/README.md: -------------------------------------------------------------------------------- 1 | # TrainLoop Studio Runner 2 | 3 | Simple wrapper used by the CLI to serve the prebuilt Studio UI. 4 | 5 | ```bash 6 | node bin/run.js 7 | ``` 8 | 9 | It expects `TRAINLOOP_DATA_FOLDER` to point to the folder where the SDKs write their files. 
10 | -------------------------------------------------------------------------------- /runner/bin/run.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | const path = require('path'); 3 | const fs = require('fs'); 4 | 5 | const ROOT = path.join(__dirname, '..', '_bundle'); 6 | const PORT = process.env.PORT || '8888'; 7 | 8 | console.log(`🚀 TrainLoop Studio → http://localhost:${PORT}`); 9 | 10 | // Set environment variables 11 | process.env.PORT = PORT; 12 | process.env.NODE_ENV = 'production'; 13 | 14 | // Ensure TRAINLOOP_DATA_FOLDER is set 15 | if (!process.env.TRAINLOOP_DATA_FOLDER) { 16 | console.error('Error: TRAINLOOP_DATA_FOLDER environment variable is not set'); 17 | console.error('Please set this variable to point to your TrainLoop data directory'); 18 | process.exit(1); 19 | } 20 | 21 | // Verify that the data folder exists 22 | const dataFolder = process.env.TRAINLOOP_DATA_FOLDER; 23 | if (!fs.existsSync(dataFolder)) { 24 | console.error(`Error: Data folder does not exist: ${dataFolder}`); 25 | console.error('Please create this directory or specify a different TRAINLOOP_DATA_FOLDER'); 26 | process.exit(1); 27 | } 28 | 29 | console.log(`Using data folder: ${dataFolder}`); 30 | 31 | 32 | // Change working directory to the bundle directory 33 | process.chdir(ROOT); 34 | 35 | // Directly require the server file instead of spawning a process 36 | try { 37 | require(path.join(ROOT, 'server.js')); 38 | } catch (error) { 39 | console.error('Error starting TrainLoop Studio:', error); 40 | process.exit(1); 41 | } 42 | 43 | // Handle termination signals 44 | ['SIGINT', 'SIGTERM'].forEach(sig => { 45 | process.on(sig, () => process.exit(0)); 46 | }); 47 | -------------------------------------------------------------------------------- /runner/package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "trainloop-studio-runner", 3 | "version": "0.6.0", 4 | "lockfileVersion": 3, 5 | "requires": true, 6 | "packages": { 7 | "": { 8 | "name": "trainloop-studio-runner", 9 | "version": "0.6.0", 10 | "bin": { 11 | "trainloop-studio": "bin/run.js" 12 | }, 13 | "engines": { 14 | "node": ">=18" 15 | } 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /runner/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "trainloop-studio-runner", 3 | "version": "0.6.0", 4 | "private": true, 5 | "description": "Bundled TrainLoop Studio runner", 6 | "bin": { 7 | "trainloop-studio": "bin/run.js" 8 | }, 9 | "engines": { 10 | "node": ">=18" 11 | }, 12 | "files": [ 13 | "bin/", 14 | "_bundle/" 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /scripts/bump/README.md: -------------------------------------------------------------------------------- 1 | # Bump Version Script 2 | 3 | This script handles version bumping, changelog updates, git commits, tagging, and pushing. 4 | 5 | ## Usage 6 | 7 | ### Step 1: Create Release Notes 8 | Before bumping the version, create a release notes file in the `releases/` directory. 9 | The file **must** start with a `Summary:` line that briefly describes the release. 
10 | 11 | ```bash 12 | # For example, if bumping to 0.5.0, create releases/0.5.0.md 13 | cat > releases/0.5.0.md << EOF 14 | Summary: Major SDK refactoring with comprehensive testing infrastructure 15 | 16 | **Major SDK Refactoring and Testing Infrastructure** 17 | 18 | The changes in this release include: 19 | 20 | ### 📁 SDK Restructuring 21 | - Renamed SDK from "evals-sdk" to "sdk" 22 | - Renamed packages from "trainloop_evals" to "trainloop_llm_logging" 23 | - Improved config functions to handle environment variable fallbacks 24 | 25 | ### 🧪 Testing Infrastructure 26 | - Added comprehensive test suites with over 3000 lines of tests 27 | - Implemented pytest configuration for Python SDK 28 | - Implemented Jest configuration for TypeScript SDK 29 | 30 | ### 🐛 Bug Fixes 31 | - Fixed UI database initialization to handle empty directories gracefully 32 | - Fixed concurrency issues in DuckDB 33 | EOF 34 | ``` 35 | 36 | ### Step 2: Run the Bump Script 37 | ```bash 38 | # Bump patch version 39 | npm run bump patch 40 | # or 41 | python scripts/bump/bump_version.py patch 42 | 43 | # Bump minor version 44 | npm run bump minor 45 | 46 | # Bump major version 47 | npm run bump major 48 | ``` 49 | 50 | ## How It Works 51 | 52 | 1. **Pre-check**: The script calculates the new version and checks if `releases/<version>.md` exists 53 | 2. **Summary Extraction**: Reads the `Summary:` line from the release file for the commit message 54 | 3. **Version Update**: Updates all package.json and pyproject.toml files 55 | 4. **Lock Files**: Regenerates npm lock files 56 | 5. **Changelog**: Adds an entry to CHANGELOG.md that links to the release notes file 57 | 6. **Git Operations**: Commits all changes including the release file, tags, and pushes 58 | 59 | ## Release File Format 60 | 61 | The release file **must** start with a Summary line: 62 | ```markdown 63 | Summary: Brief description of the release 64 | 65 | [Rest of your detailed release notes in markdown format] 66 | ``` 67 | 68 | ## Example Changelog Output 69 | 70 | The changelog will contain links to the full release notes: 71 | 72 | ```markdown 73 | # Changelog 74 | 75 | ## 0.5.0 (2025-05-27) 76 | [Release Notes](releases/0.5.0.md) 77 | 78 | ## 0.4.0 (2025-05-22) 79 | • Public Release 80 | ``` 81 | 82 | ## Important Notes 83 | 84 | - The release file **must** exist before running the bump script 85 | - The release file **must** start with `Summary: ` 86 | - The summary is used for the git commit message and tag 87 | - Release files are committed as part of the version bump 88 | - Release files should be markdown formatted for best readability 89 | -------------------------------------------------------------------------------- /scripts/bump/bump_pulumi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | trainloop-pulumi - update docker tag in Pulumi YAML and (optionally) deploy.
4 | 5 | usage: 6 | python scripts/pulumi.py # just rewrites YAML 7 | python scripts/pulumi.py --up # rewrite YAML, then `pulumi up --yes` 8 | python scripts/pulumi.py --preview # rewrite YAML, then `pulumi preview` 9 | """ 10 | from __future__ import annotations 11 | import argparse 12 | import os 13 | import pathlib 14 | import re 15 | import subprocess 16 | 17 | ROOT = pathlib.Path(__file__).resolve().parents[2] 18 | VER = ROOT.joinpath("VERSION").read_text(encoding="utf-8").strip() 19 | REG = os.getenv("REGISTRY", "ghcr.io/trainloop") 20 | IMAGE = f"{REG}/evals:{VER}" 21 | 22 | 23 | def sh(cmd: str, cwd: pathlib.Path): 24 | subprocess.run(cmd, shell=True, check=True, cwd=cwd) 25 | 26 | 27 | def update_yaml(yaml: pathlib.Path) -> None: 28 | text = yaml.read_text() 29 | text = re.sub(r"^\s*appImage:.*$", f" appImage: {REG}/evals", text, flags=re.M) 30 | text = re.sub(r"^\s*appVersion:.*$", f" appVersion: {VER}", text, flags=re.M) 31 | yaml.write_text(text) 32 | print(f"📝 updated {yaml.name}") 33 | 34 | 35 | def main() -> None: 36 | ap = argparse.ArgumentParser() 37 | g = ap.add_mutually_exclusive_group() 38 | g.add_argument("--up", action="store_true", help="run pulumi up --yes") 39 | g.add_argument("--preview", action="store_true", help="run pulumi preview") 40 | args = ap.parse_args() 41 | 42 | for yaml in (ROOT / "infra").glob("Pulumi.*.yaml"): 43 | update_yaml(yaml) 44 | 45 | if args.up: 46 | sh("pulumi up --yes", ROOT / "infra") 47 | elif args.preview: 48 | sh("pulumi preview", ROOT / "infra") 49 | else: 50 | print("🔔 YAML updated - no pulumi command executed") 51 | 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /sdk/.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | -------------------------------------------------------------------------------- /sdk/go/trainloop-llm-logging/.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, built with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | 14 | # Dependency directories (remove the comment below to include it) 15 | # vendor/ 16 | 17 | # Go workspace file 18 | go.work 19 | 20 | # IDE specific files 21 | .idea/ 22 | .vscode/ 23 | *.swp 24 | *.swo 25 | *~ 26 | 27 | # OS specific files 28 | .DS_Store 29 | Thumbs.db 30 | 31 | # TrainLoop data (for examples) 32 | trainloop/ 33 | -------------------------------------------------------------------------------- /sdk/go/trainloop-llm-logging/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/TrainLoop/evals/sdk/go/trainloop-llm-logging 2 | 3 | go 1.20 4 | 5 | require gopkg.in/yaml.v3 v3.0.1 6 | -------------------------------------------------------------------------------- /sdk/go/trainloop-llm-logging/go.sum: -------------------------------------------------------------------------------- 1 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= 2 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 3 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 4 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 5 | 
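For reference, after a bump to 0.6.0 with the default registry, the two keys that the `update_yaml` helper in `scripts/bump/bump_pulumi.py` above rewrites in `infra/Pulumi.*.yaml` would read roughly as below. Only `appImage` and `appVersion` are taken from the script's regexes; the indentation and any surrounding keys are assumptions about the stack file's layout:

```yaml
# Illustrative excerpt only - the real Pulumi stack file may nest these keys differently.
  appImage: ghcr.io/trainloop/evals   # from the REGISTRY env var (default ghcr.io/trainloop)
  appVersion: 0.6.0                   # from the top-level VERSION file
```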
-------------------------------------------------------------------------------- /sdk/go/trainloop-llm-logging/instrumentation/instrumentation.go: -------------------------------------------------------------------------------- 1 | package instrumentation 2 | 3 | import ( 4 | "sync" 5 | 6 | "github.com/TrainLoop/evals/sdk/go/trainloop-llm-logging/internal/exporter" 7 | "github.com/TrainLoop/evals/sdk/go/trainloop-llm-logging/internal/logger" 8 | ) 9 | 10 | var tlLog = logger.CreateLogger("trainloop-instrumentation") 11 | var once sync.Once 12 | var installed bool = false 13 | 14 | // InstallAllPatches initializes all available instrumentations. 15 | func InstallAllPatches(exp *exporter.FileExporter, hostAllowlist []string) { 16 | once.Do(func() { 17 | InstallPatches(exp, hostAllowlist) 18 | // Add other instrumentations here if any (e.g., gRPC) 19 | installed = true 20 | tlLog.Info("All instrumentations installed.") 21 | }) 22 | if installed { 23 | tlLog.Debug("Instrumentation already installed.") 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /sdk/go/trainloop-llm-logging/internal/logger/logger.go: -------------------------------------------------------------------------------- 1 | package logger 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "strings" 8 | "time" 9 | ) 10 | 11 | // LogLevel defines the severity of a log message. 12 | type LogLevel int 13 | 14 | const ( 15 | // DEBUG level 16 | DEBUG LogLevel = iota 17 | // INFO level 18 | INFO 19 | // WARN level 20 | WARN 21 | // ERROR level 22 | ERROR 23 | ) 24 | 25 | var logLevelNames = map[LogLevel]string{ 26 | DEBUG: "DEBUG", 27 | INFO: "INFO", 28 | WARN: "WARN", 29 | ERROR: "ERROR", 30 | } 31 | 32 | var currentLogLevel = WARN // Default log level 33 | 34 | // Logger is a simple logger instance. 35 | type Logger struct { 36 | scope string 37 | } 38 | 39 | func init() { 40 | SetLogLevel(os.Getenv("TRAINLOOP_LOG_LEVEL")) 41 | } 42 | 43 | // SetLogLevel sets the global logging level. 44 | func SetLogLevel(levelStr string) { 45 | levelStr = strings.ToUpper(levelStr) 46 | switch levelStr { 47 | case "DEBUG": 48 | currentLogLevel = DEBUG 49 | case "INFO": 50 | currentLogLevel = INFO 51 | case "WARN": 52 | currentLogLevel = WARN 53 | case "ERROR": 54 | currentLogLevel = ERROR 55 | default: 56 | if levelStr != "" { 57 | log.Printf("[WARN] [logger] Invalid TRAINLOOP_LOG_LEVEL '%s', defaulting to WARN\n", levelStr) 58 | } 59 | currentLogLevel = WARN // Default if invalid or not set 60 | } 61 | } 62 | 63 | // CreateLogger creates a new logger with a given scope. 64 | func CreateLogger(scope string) *Logger { 65 | return &Logger{scope: scope} 66 | } 67 | 68 | func (l *Logger) log(level LogLevel, format string, args ...any) { 69 | if level < currentLogLevel { 70 | return 71 | } 72 | timestamp := time.Now().Format("2006-01-02T15:04:05Z07:00") 73 | prefix := fmt.Sprintf("[%s] [%s] [%s] ", logLevelNames[level], timestamp, l.scope) 74 | log.Printf(prefix+format+"\n", args...) 75 | } 76 | 77 | // Debug logs a message at DEBUG level. 78 | func (l *Logger) Debug(format string, args ...any) { 79 | l.log(DEBUG, format, args...) 80 | } 81 | 82 | // Info logs a message at INFO level. 83 | func (l *Logger) Info(format string, args ...any) { 84 | l.log(INFO, format, args...) 85 | } 86 | 87 | // Warn logs a message at WARN level. 88 | func (l *Logger) Warn(format string, args ...any) { 89 | l.log(WARN, format, args...) 90 | } 91 | 92 | // Error logs a message at ERROR level. 
93 | func (l *Logger) Error(format string, args ...any) { 94 | l.log(ERROR, format, args...) 95 | } 96 | -------------------------------------------------------------------------------- /sdk/go/trainloop-llm-logging/internal/types/types.go: -------------------------------------------------------------------------------- 1 | package types 2 | 3 | import "time" 4 | 5 | // LLMCallLocation stores the file and line number of an LLM call. 6 | type LLMCallLocation struct { 7 | File string `json:"file"` 8 | LineNumber string `json:"lineNumber"` 9 | } 10 | 11 | // ParsedResponseBody represents the structured content of an LLM response. 12 | type ParsedResponseBody struct { 13 | Content string `json:"content"` 14 | } 15 | 16 | // ParsedRequestBody represents the structured content of an LLM request. 17 | type ParsedRequestBody struct { 18 | Messages []map[string]string `json:"messages"` 19 | Model string `json:"model"` 20 | ModelParams map[string]any `json:"modelParams"` 21 | } 22 | 23 | // CollectedSample is the structure for storing a single LLM call's data. 24 | type CollectedSample struct { 25 | DurationMs int64 `json:"durationMs"` 26 | Tag string `json:"tag"` 27 | Input []map[string]string `json:"input"` // This is ParsedRequestBody.Messages 28 | Output *ParsedResponseBody `json:"output"` 29 | Model string `json:"model"` 30 | ModelParams map[string]any `json:"modelParams"` 31 | StartTimeMs int64 `json:"startTimeMs"` // Unix timestamp 32 | EndTimeMs int64 `json:"endTimeMs"` // Unix timestamp 33 | URL string `json:"url"` 34 | Location LLMCallLocation `json:"location"` 35 | } 36 | 37 | // LLMCallData is used to pass raw call data to the exporter. 38 | type LLMCallData struct { 39 | RequestBodyStr string 40 | ResponseBodyStr string 41 | URL string 42 | Tag string 43 | Location LLMCallLocation 44 | StartTime time.Time 45 | EndTime time.Time 46 | IsLLMRequest bool 47 | Headers map[string]string 48 | Status int 49 | } 50 | 51 | // TrainloopConfigObject represents the 'trainloop' section of the config file. 52 | type TrainloopConfigObject struct { 53 | DataFolder string `yaml:"data_folder"` 54 | HostAllowlist []string `yaml:"host_allowlist"` 55 | LogLevel string `yaml:"log_level"` 56 | } 57 | 58 | // TrainloopConfig is the top-level structure for the config file. 59 | type TrainloopConfig struct { 60 | Trainloop TrainloopConfigObject `yaml:"trainloop"` 61 | } 62 | 63 | // RegistryEntry stores metadata for a call site in the registry. 64 | type RegistryEntry struct { 65 | LineNumber string `json:"lineNumber"` 66 | Tag string `json:"tag"` 67 | FirstSeen string `json:"firstSeen"` // ISO-8601 UTC 68 | LastSeen string `json:"lastSeen"` 69 | Count int `json:"count"` 70 | } 71 | 72 | // Registry is the structure for the _registry.json file. 
73 | type Registry struct { 74 | Schema int `json:"schema"` // always 1 for now 75 | Files map[string]map[string]RegistryEntry `json:"files"` // file → line → entry 76 | } 77 | -------------------------------------------------------------------------------- /sdk/python/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | *$py.class 4 | 5 | # Distribution / packaging 6 | .Python 7 | build/ 8 | develop-eggs/ 9 | dist/ 10 | downloads/ 11 | eggs/ 12 | .eggs/ 13 | lib/ 14 | lib64/ 15 | parts/ 16 | sdist/ 17 | var/ 18 | wheels/ 19 | *.egg-info/ 20 | .installed.cfg 21 | *.egg 22 | MANIFEST 23 | 24 | # Testing and coverage 25 | .tox/ 26 | .coverage 27 | .coverage.* 28 | .cache 29 | nosetests.xml 30 | coverage.xml 31 | *.cover 32 | .hypothesis/ 33 | .pytest_cache/ 34 | htmlcov/ 35 | 36 | # Virtual environments 37 | venv/ 38 | ENV/ 39 | env/ 40 | .venv 41 | 42 | # IDE 43 | .idea/ 44 | .vscode/ 45 | *.swp 46 | *.swo 47 | 48 | # OS 49 | .DS_Store 50 | Thumbs.db -------------------------------------------------------------------------------- /sdk/python/README.md: -------------------------------------------------------------------------------- 1 | # TrainLoop Evals SDK (Python) 2 | 3 | Automatically capture LLM calls from Python apps so they can be graded later. 4 | 5 | ## Install 6 | 7 | ```bash 8 | pip install trainloop-llm-logging 9 | ``` 10 | 11 | ## Quick example 12 | 13 | ```python 14 | from trainloop_llm_logging import collect, trainloop_tag 15 | collect() # patch HTTP clients 16 | openai.chat.completions.create(..., trainloop_tag("my-tag")) 17 | ``` 18 | 19 | Set `TRAINLOOP_DATA_FOLDER` to choose where event files are written or set `data_folder` in your `trainloop.config.yaml` file. 20 | 21 | See the [project README](../../README.md) for more details. 
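As a slightly fuller sketch of the quick example above (the OpenAI client, the model name, and the `extra_headers` pass-through are assumptions about your calling code, not part of this SDK):

```python
import os

from openai import OpenAI  # assumed client; any patched HTTP library works
from trainloop_llm_logging import collect, trainloop_tag

os.environ.setdefault("TRAINLOOP_DATA_FOLDER", "./trainloop/data")  # or set data_folder in trainloop.config.yaml
collect()  # patch requests/httpx/http.client before making LLM calls

client = OpenAI()
client.chat.completions.create(
    model="gpt-4o",  # illustrative model name
    messages=[{"role": "user", "content": "Hello!"}],
    # trainloop_tag("my-tag") returns {"X-Trainloop-Tag": "my-tag"}
    extra_headers=trainloop_tag("my-tag"),
)
```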
22 | -------------------------------------------------------------------------------- /sdk/python/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "trainloop-llm-logging" 3 | version = "0.6.0" 4 | description = "TrainLoop LLM Logging SDK for data collection" 5 | authors = ["Mason Pierce "] 6 | readme = "README.md" 7 | packages = [{ include = "trainloop_llm_logging" }] 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.9" 11 | requests = "^2.31.0" 12 | pyyaml = "^6.0" 13 | 14 | [tool.poetry.group.dev.dependencies] 15 | pytest = "^7.0" 16 | pytest-cov = "^4.1.0" 17 | pytest-mock = "^3.12.0" 18 | pytest-asyncio = "^0.21.1" 19 | pytest-timeout = "^2.2.0" 20 | pytest-xdist = "^3.5.0" 21 | freezegun = "^1.4.0" 22 | responses = "^0.24.1" 23 | black = "^23.3.0" 24 | mypy = "^1.5.0" 25 | httpx = "^0.28.1" 26 | requests = "^2.32.3" 27 | 28 | [build-system] 29 | requires = ["poetry-core>=1.0.0"] 30 | build-backend = "poetry.core.masonry.api" 31 | -------------------------------------------------------------------------------- /sdk/python/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | minversion = 7.0 3 | testpaths = tests 4 | python_files = test_*.py 5 | python_classes = Test* 6 | python_functions = test_* 7 | addopts = 8 | -v 9 | --strict-markers 10 | --tb=short 11 | --capture=no 12 | --cov=trainloop_llm_logging 13 | --cov-report=html 14 | --cov-report=term-missing 15 | --cov-branch 16 | markers = 17 | unit: Unit tests 18 | integration: Integration tests 19 | slow: Slow tests 20 | edge_case: Edge case tests 21 | requires_network: Tests that require network access 22 | requires_fs: Tests that require filesystem access 23 | -------------------------------------------------------------------------------- /sdk/python/tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sdk/python/tests/edge_cases/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sdk/python/tests/integration/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sdk/python/tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sdk/python/tests/unit/test_register.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit tests for the register module. 
3 | """ 4 | import pytest 5 | from trainloop_llm_logging import HEADER_NAME, trainloop_tag 6 | 7 | 8 | class TestRegisterModule: 9 | """Test the public API functions.""" 10 | 11 | @pytest.mark.unit 12 | def test_header_name_constant(self): 13 | """Test that HEADER_NAME is correctly defined.""" 14 | assert HEADER_NAME == "X-Trainloop-Tag" 15 | 16 | @pytest.mark.unit 17 | def test_trainloop_tag_returns_dict(self): 18 | """Test that trainloop_tag returns a dictionary with the correct header.""" 19 | tag = "test-tag" 20 | result = trainloop_tag(tag) 21 | 22 | assert isinstance(result, dict) 23 | assert HEADER_NAME in result 24 | assert result[HEADER_NAME] == tag 25 | 26 | @pytest.mark.unit 27 | def test_trainloop_tag_with_empty_string(self): 28 | """Test trainloop_tag with empty string.""" 29 | result = trainloop_tag("") 30 | 31 | assert result == {HEADER_NAME: ""} 32 | 33 | @pytest.mark.unit 34 | @pytest.mark.edge_case 35 | def test_trainloop_tag_with_special_characters(self): 36 | """Test trainloop_tag with special characters.""" 37 | special_tag = "test-tag-αβγ-🚀-@#$%" 38 | result = trainloop_tag(special_tag) 39 | 40 | assert result[HEADER_NAME] == special_tag 41 | 42 | @pytest.mark.unit 43 | @pytest.mark.edge_case 44 | def test_trainloop_tag_with_very_long_string(self): 45 | """Test trainloop_tag with very long string.""" 46 | long_tag = "x" * 1000 47 | result = trainloop_tag(long_tag) 48 | 49 | assert result[HEADER_NAME] == long_tag 50 | assert len(result[HEADER_NAME]) == 1000 51 | -------------------------------------------------------------------------------- /sdk/python/trainloop_llm_logging/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | TrainLoop LLM Logging SDK 3 | ------------------- 4 | Public surface: 5 | 6 | • HEADER_NAME 7 | • trainloop_tag(tag) → {"X-Trainloop-Tag": tag} 8 | • collect() → bootstrap + auto-patch 9 | 10 | Import this once, early in your program: 11 | 12 | import trainloop_llm_logging as tl 13 | tl.collect() 14 | 15 | Everything else happens automatically. 16 | """ 17 | 18 | from .register import HEADER_NAME, trainloop_tag, collect 19 | 20 | __all__ = ["HEADER_NAME", "trainloop_tag", "collect"] 21 | -------------------------------------------------------------------------------- /sdk/python/trainloop_llm_logging/instrumentation/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Central patch installer. Each sub-module exposes install(sink). 3 | """ 4 | 5 | from __future__ import annotations 6 | from ..exporter import FileExporter 7 | 8 | TRAINLOOP_INSTRUMENTATION_INSTALLED = False 9 | 10 | 11 | def install_patches(exporter: FileExporter) -> None: 12 | global TRAINLOOP_INSTRUMENTATION_INSTALLED 13 | if TRAINLOOP_INSTRUMENTATION_INSTALLED: 14 | return 15 | TRAINLOOP_INSTRUMENTATION_INSTALLED = True 16 | 17 | # pylint: disable=import-outside-toplevel 18 | from . 
import ( 19 | http_client_lib, 20 | requests_lib, 21 | httpx_lib, 22 | ) 23 | 24 | http_client_lib.install(exporter) 25 | requests_lib.install(exporter) 26 | httpx_lib.install(exporter) 27 | -------------------------------------------------------------------------------- /sdk/python/trainloop_llm_logging/logger.py: -------------------------------------------------------------------------------- 1 | # logger.py 2 | from __future__ import annotations 3 | 4 | import logging 5 | import os 6 | 7 | _LEVELS = {"ERROR": 40, "WARN": 30, "INFO": 20, "DEBUG": 10} 8 | _DEFAULT = "WARN" 9 | 10 | 11 | def _configure_root_once() -> None: 12 | """ 13 | Initialise the *root* logger exactly once, replacing any handlers that 14 | Uvicorn or another library may have installed. 15 | 16 | Call this early (e.g. in `main.py` **before** you import code that logs). 17 | """ 18 | if getattr(_configure_root_once, "_done", False): 19 | return 20 | 21 | lvl_name = os.getenv("TRAINLOOP_LOG_LEVEL", _DEFAULT).upper() 22 | lvl = _LEVELS.get(lvl_name, logging.INFO) 23 | 24 | # 'force=True' clears anything set up by uvicorn, avoiding duplicate handlers 25 | logging.basicConfig( 26 | level=lvl, 27 | format="[%(levelname)s] [%(asctime)s] [%(name)s] %(message)s", 28 | force=True, 29 | ) 30 | _configure_root_once._done = True 31 | 32 | 33 | def create_logger(scope: str) -> logging.Logger: 34 | """ 35 | Return a named logger that inherits the single root handler. 36 | 37 | >>> log = create_logger("trainloop-exporter") 38 | >>> log.info("hello") # ➜ [INFO] [...] [trainloop-exporter] hello 39 | """ 40 | _configure_root_once() # make sure root is ready 41 | logger = logging.getLogger(scope) 42 | return logger 43 | -------------------------------------------------------------------------------- /sdk/python/trainloop_llm_logging/register.py: -------------------------------------------------------------------------------- 1 | """ 2 | Entry point - mirrors TS `src/index.ts`. 3 | 4 | • Loads YAML / env config 5 | • Spins up FileExporter 6 | • Installs outbound-HTTP patches (requests, httpx, …) 7 | """ 8 | 9 | from __future__ import annotations 10 | 11 | import os 12 | 13 | from .config import load_config_into_env 14 | from .exporter import FileExporter 15 | from .instrumentation import install_patches 16 | from .logger import create_logger 17 | from .instrumentation.utils import HEADER_NAME 18 | 19 | _log = create_logger("trainloop-register") 20 | 21 | 22 | def trainloop_tag(tag: str) -> dict[str, str]: 23 | """Helper to merge into headers: >>> headers |= trainloop_tag("checkout")""" 24 | return {HEADER_NAME: tag} 25 | 26 | 27 | _IS_INIT = False 28 | 29 | 30 | def collect(trainloop_config_path: str | None = None) -> None: 31 | """ 32 | Initialize the SDK (idempotent). Does nothing unless 33 | TRAINLOOP_DATA_FOLDER is set. 34 | """ 35 | global _IS_INIT 36 | if _IS_INIT: 37 | return 38 | 39 | load_config_into_env(trainloop_config_path) 40 | if "TRAINLOOP_DATA_FOLDER" not in os.environ: 41 | _log.warning("TRAINLOOP_DATA_FOLDER not set - SDK disabled") 42 | return 43 | exporter = FileExporter() # flushes every 10 s or 5 items 44 | install_patches(exporter) # monkey-patch outbound HTTP 45 | 46 | _IS_INIT = True 47 | _log.info("TrainLoop Evals SDK initialized") 48 | -------------------------------------------------------------------------------- /sdk/python/trainloop_llm_logging/store.py: -------------------------------------------------------------------------------- 1 | """ 2 | Filesystem helpers - JSONL shards + _registry.json. 
3 | 4 | Path layout identical to the Node SDK. 5 | """ 6 | 7 | from __future__ import annotations 8 | 9 | import json 10 | import time 11 | from datetime import datetime, timezone 12 | from pathlib import Path 13 | 14 | from .logger import create_logger 15 | from .types import CollectedSample, LLMCallLocation, Registry, RegistryEntry 16 | 17 | _log = create_logger("trainloop-store") 18 | 19 | 20 | def _now_iso() -> str: 21 | return datetime.now(timezone.utc).isoformat(timespec="seconds") 22 | 23 | 24 | def update_registry(data_dir: str, loc: LLMCallLocation, tag: str) -> None: 25 | """ 26 | Persist (file, line) → {tag, firstSeen, lastSeen, count} 27 | Never duplicates; tag can be overwritten in place. 28 | """ 29 | path = Path(data_dir) / "_registry.json" 30 | _log.debug("Updating registry at %s", path) 31 | 32 | if path.exists(): 33 | try: 34 | reg: Registry = json.loads(path.read_text()) 35 | # If reg is an empty object, initialize it 36 | if reg == {}: 37 | reg = {"schema": 1, "files": {}} 38 | except Exception: 39 | _log.error("Corrupt registry - recreating") 40 | reg = {"schema": 1, "files": {}} 41 | else: 42 | reg = {"schema": 1, "files": {}} 43 | 44 | files = reg["files"].setdefault(loc["file"], {}) 45 | now = _now_iso() 46 | 47 | entry: RegistryEntry 48 | if loc["lineNumber"] in files: # already seen this line 49 | entry = files[loc["lineNumber"]] 50 | if entry["tag"] != tag: # tag changed in source 51 | entry["tag"] = tag 52 | entry["lastSeen"] = now 53 | entry["count"] += 1 54 | else: # first time 55 | entry = files[loc["lineNumber"]] = RegistryEntry( 56 | lineNumber=loc["lineNumber"], 57 | tag=tag, 58 | firstSeen=now, 59 | lastSeen=now, 60 | count=1, 61 | ) 62 | 63 | path.parent.mkdir(parents=True, exist_ok=True) 64 | path.write_text(json.dumps(reg, indent=2)) 65 | _log.debug( 66 | "Registry written - %s:%s = %s (count=%d)", 67 | loc["file"], 68 | loc["lineNumber"], 69 | entry["tag"], 70 | entry["count"], 71 | ) 72 | 73 | 74 | def save_samples(data_dir: str, samples: list[CollectedSample]) -> None: 75 | if not samples: 76 | return 77 | event_dir = Path(data_dir) / "events" 78 | event_dir.mkdir(parents=True, exist_ok=True) 79 | 80 | now = int(time.time() * 1000) 81 | window = 10 * 60 * 1000 82 | latest = max([int(f.stem) for f in event_dir.glob("*.jsonl")] + [0]) 83 | ts = latest if now - latest < window else now 84 | 85 | with (event_dir / f"{ts}.jsonl").open("a", encoding="utf-8") as f: 86 | for s in samples: 87 | f.write(json.dumps(s, ensure_ascii=False) + "\n") 88 | -------------------------------------------------------------------------------- /sdk/python/trainloop_llm_logging/types.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import TypedDict, Dict, List, Optional, Any 3 | 4 | 5 | class LLMCallLocation(TypedDict): 6 | file: str 7 | lineNumber: str 8 | 9 | 10 | class ParsedResponseBody(TypedDict): 11 | content: str 12 | 13 | 14 | class ExpectedRequestBody(TypedDict, total=False): 15 | messages: List[Dict[str, str]] 16 | model: str 17 | 18 | 19 | class ParsedRequestBody(TypedDict): 20 | messages: List[Dict[str, str]] 21 | model: str 22 | modelParams: Dict[str, Any] 23 | 24 | 25 | class CollectedSample(TypedDict): 26 | durationMs: int 27 | tag: str 28 | input: List[Dict[str, str]] # This is ParsedRequestBody["messages"] 29 | output: ParsedResponseBody 30 | model: str 31 | modelParams: Dict[str, Any] 32 | startTimeMs: int 33 | endTimeMs: int 34 | url: str 35 | location: 
LLMCallLocation 36 | 37 | 38 | class LLMCallData(TypedDict, total=False): 39 | requestBodyStr: str 40 | responseBodyStr: str 41 | url: str 42 | tag: str 43 | location: LLMCallLocation 44 | startTimeMs: int 45 | endTimeMs: int 46 | durationMs: int 47 | isLLMRequest: bool 48 | headers: Dict[str, str] 49 | status: int 50 | 51 | 52 | class TrainLoopConfigObject(TypedDict): 53 | data_folder: Optional[str] 54 | host_allowlist: Optional[List[str]] 55 | log_level: Optional[str] 56 | 57 | 58 | class TrainloopConfig(TypedDict): 59 | trainloop: TrainLoopConfigObject 60 | 61 | 62 | class RegistryEntry(TypedDict): 63 | lineNumber: str 64 | tag: str 65 | firstSeen: str # ISO-8601 UTC 66 | lastSeen: str 67 | count: int 68 | 69 | 70 | class Registry(TypedDict): 71 | schema: int # always 1 for now 72 | files: Dict[str, Dict[str, RegistryEntry]] # file → line → entry 73 | -------------------------------------------------------------------------------- /sdk/typescript/.gitignore: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | node_modules/ 3 | .pnp 4 | .pnp.js 5 | 6 | # Testing 7 | coverage/ 8 | .nyc_output/ 9 | junit.xml 10 | test-results/ 11 | 12 | # Production 13 | dist/ 14 | build/ 15 | lib/ 16 | *.tsbuildinfo 17 | 18 | # Debug 19 | npm-debug.log* 20 | yarn-debug.log* 21 | yarn-error.log* 22 | lerna-debug.log* 23 | .pnpm-debug.log* 24 | 25 | # Environment 26 | .env.local 27 | .env.development.local 28 | .env.test.local 29 | .env.production.local 30 | 31 | # IDE 32 | .idea/ 33 | .vscode/ 34 | *.swp 35 | *.swo 36 | 37 | # OS 38 | .DS_Store 39 | Thumbs.db 40 | 41 | # Temporary 42 | *.tmp 43 | *.temp 44 | .cache/ 45 | -------------------------------------------------------------------------------- /sdk/typescript/README.md: -------------------------------------------------------------------------------- 1 | # TrainLoop Evals SDK (TypeScript/JavaScript) 2 | 3 | Patches Node HTTP libraries so every LLM request is logged for evaluation. Works with both TypeScript and JavaScript projects. 4 | 5 | ## Install 6 | 7 | ```bash 8 | npm install trainloop-llm-logging 9 | ``` 10 | 11 | ## Usage 12 | 13 | ```bash 14 | export TRAINLOOP_DATA_FOLDER=./trainloop/data # optional, otherwise will use the path at trainloop/trainloop.config.yaml 15 | NODE_OPTIONS="--require=trainloop-llm-logging" next dev 16 | ``` 17 | 18 | Tag individual calls when needed: 19 | 20 | **TypeScript:** 21 | ```ts 22 | import { trainloopTag } from 'trainloop-llm-logging'; 23 | 24 | openai.chat.completions.create(..., { headers: { ...trainloopTag("checkout") } }) 25 | ``` 26 | 27 | **JavaScript:** 28 | ```js 29 | const { trainloopTag } = require('trainloop-llm-logging'); 30 | 31 | openai.chat.completions.create(..., { headers: { ...trainloopTag("checkout") } }) 32 | ``` 33 | 34 | Logs are written under `$TRAINLOOP_DATA_FOLDER`. 35 | 36 | See the [project README](../../README.md) for context. 
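The same helper also works with a raw `fetch` call when the SDK has been loaded via `NODE_OPTIONS` as above; the endpoint, auth header, and body below are placeholders, and the `X-Trainloop-Tag` header is stripped by the SDK before the request is actually sent:

```ts
import { trainloopTag } from "trainloop-llm-logging";

// Illustrative call to a host on the default allowlist (api.openai.com).
const res = await fetch("https://api.openai.com/v1/chat/completions", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.OPENAI_API_KEY}`, // placeholder auth
    ...trainloopTag("checkout"), // adds X-Trainloop-Tag; removed before the request goes out
  },
  body: JSON.stringify({
    model: "gpt-4o", // placeholder model
    messages: [{ role: "user", content: "Hello!" }],
  }),
});
```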
37 | -------------------------------------------------------------------------------- /sdk/typescript/jest.config.js: -------------------------------------------------------------------------------- 1 | /** @type {import('jest').Config} */ 2 | module.exports = { 3 | preset: 'ts-jest', 4 | testEnvironment: 'node', 5 | roots: ['/src', '/tests'], 6 | testMatch: [ 7 | '**/__tests__/**/*.+(ts|tsx|js)', 8 | '**/?(*.)+(spec|test).+(ts|tsx|js)' 9 | ], 10 | transform: { 11 | '^.+\\.(ts|tsx)$': ['ts-jest', { 12 | tsconfig: { 13 | allowJs: true, 14 | esModuleInterop: true, 15 | }, 16 | }], 17 | }, 18 | collectCoverageFrom: [ 19 | 'src/**/*.{js,ts}', 20 | '!src/**/*.d.ts', 21 | '!src/**/index.ts', 22 | '!src/**/__tests__/**', 23 | ], 24 | coverageThreshold: { 25 | global: { 26 | branches: 80, 27 | functions: 80, 28 | lines: 80, 29 | statements: 80, 30 | }, 31 | }, 32 | coverageReporters: ['text', 'lcov', 'html'], 33 | moduleFileExtensions: ['ts', 'tsx', 'js', 'jsx', 'json', 'node'], 34 | verbose: true, 35 | testTimeout: 10000, 36 | setupFilesAfterEnv: ['/tests/setup.ts'], 37 | moduleNameMapper: { 38 | '^@/(.*)$': '/src/$1', 39 | }, 40 | }; 41 | -------------------------------------------------------------------------------- /sdk/typescript/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "trainloop-llm-logging", 3 | "version": "0.6.0", 4 | "description": "TrainLoop Evaluations - header-based request tagging and zero-touch collection", 5 | "main": "dist/index.js", 6 | "types": "dist/index.d.ts", 7 | "exports": { 8 | ".": { 9 | "require": "./dist/index.js", 10 | "import": "./dist/index.js", 11 | "types": "./dist/index.d.ts" 12 | }, 13 | "./register": { 14 | "require": "./dist/register.js", 15 | "import": "./dist/register.js", 16 | "types": "./dist/register.d.ts" 17 | } 18 | }, 19 | "files": [ 20 | "dist", 21 | "README.md", 22 | "LICENSE" 23 | ], 24 | "scripts": { 25 | "build": "tsc -p tsconfig.json", 26 | "prepublishOnly": "npm run build", 27 | "dev": "concurrently -k -n TSC,YALC \"npm run build:watch\" \"chokidar dist -c 'yalc push'\"", 28 | "build:watch": "tsc -w -p tsconfig.json", 29 | "test": "jest", 30 | "test:watch": "jest --watch", 31 | "test:coverage": "jest --coverage", 32 | "test:unit": "jest --testPathPattern=tests/unit", 33 | "test:integration": "jest --testPathPattern=tests/integration", 34 | "test:edge": "jest --testPathPattern=tests/edge-cases" 35 | }, 36 | "keywords": [ 37 | "llm", 38 | "evaluation", 39 | "trainloop", 40 | "openai", 41 | "anthropic", 42 | "observability" 43 | ], 44 | "author": "TrainLoop ", 45 | "license": "MIT", 46 | "engines": { 47 | "node": ">=20.18.1" 48 | }, 49 | "devDependencies": { 50 | "@types/js-yaml": "^4.0.9", 51 | "@types/node": "^22.15.17", 52 | "@types/jest": "^29.5.11", 53 | "chokidar-cli": "^3.0.0", 54 | "concurrently": "^9.1.2", 55 | "typescript": "^5.4.5", 56 | "yalc": "^1.0.0-pre.53", 57 | "jest": "^29.7.0", 58 | "ts-jest": "^29.1.1", 59 | "@jest/globals": "^29.7.0", 60 | "nock": "^13.5.0", 61 | "timekeeper": "^2.3.1" 62 | }, 63 | "dependencies": { 64 | "js-yaml": "^4.1.0" 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /sdk/typescript/src/constants.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Shared constants for the TrainLoop SDK 3 | */ 4 | 5 | export const HEADER_NAME = "X-Trainloop-Tag"; 6 | 7 | export const DEFAULT_HOST_ALLOWLIST = ["api.openai.com", "api.anthropic.com"]; 8 | 
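If your LLM traffic goes through hosts other than the defaults above, the allowlist can be overridden with a comma-separated environment variable (the extra gateway hostname below is just an example), or via `host_allowlist` in `trainloop.config.yaml`:

```bash
# Defaults to api.openai.com,api.anthropic.com when unset.
export TRAINLOOP_HOST_ALLOWLIST="api.openai.com,api.anthropic.com,my-llm-gateway.internal"
```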
-------------------------------------------------------------------------------- /sdk/typescript/src/index.ts: -------------------------------------------------------------------------------- 1 | import { patchHttp } from "./instrumentation/http"; 2 | import http from "http"; 3 | import https from "https"; 4 | import { patchFetch } from "./instrumentation/fetch"; 5 | import { FileExporter } from "./exporter" 6 | import { loadConfig } from "./config"; 7 | import { HEADER_NAME, DEFAULT_HOST_ALLOWLIST } from "./constants"; 8 | 9 | /** 10 | * Public surface: 11 | * • HEADER_NAME - constant header string 12 | * • trainloopTag(tag) - helper to add the trainloop tag to the header 13 | * • shutdown() - graceful shutdown 14 | */ 15 | 16 | export { HEADER_NAME, DEFAULT_HOST_ALLOWLIST }; 17 | 18 | /** 19 | * Convenience helper - merge into your fetch/OpenAI headers 20 | */ 21 | export function trainloopTag(tag: string): Record { 22 | return { [HEADER_NAME]: tag }; 23 | } 24 | 25 | // Global exporter instance 26 | let globalExporter: FileExporter | null = null; 27 | 28 | const init = async () => { 29 | // First load the config from the trainloop folder if available 30 | loadConfig(); 31 | await import("./instrumentation"); 32 | globalExporter = new FileExporter(); 33 | patchHttp(http, globalExporter); 34 | patchHttp(https, globalExporter); 35 | patchFetch(globalExporter); 36 | } 37 | 38 | // Initialize the SDK 39 | init(); 40 | 41 | /** 42 | * Graceful shutdown - exports remaining data and cleans up resources 43 | */ 44 | export async function shutdown(): Promise { 45 | if (globalExporter) { 46 | globalExporter.shutdown(); 47 | globalExporter = null; 48 | } 49 | } 50 | 51 | // host allow‑list 52 | export const EXPECTED_LLM_PROVIDER_URLS = (process.env.TRAINLOOP_HOST_ALLOWLIST ?? DEFAULT_HOST_ALLOWLIST.join(",")) 53 | .split(",") 54 | .map((s) => s.trim()) 55 | .filter(Boolean); 56 | -------------------------------------------------------------------------------- /sdk/typescript/src/instrumentation/fetch.ts: -------------------------------------------------------------------------------- 1 | import { EXPECTED_LLM_PROVIDER_URLS, HEADER_NAME } from "../index"; 2 | import { FileExporter } from "../exporter"; 3 | import { getAndRemoveHeader, getCallerSite, getCallerStack, getFetchHost, cloneResponseForLogging } from "./utils"; 4 | import { createLogger } from "../logger"; 5 | 6 | const logger = createLogger("trainloop-fetch"); 7 | 8 | /* ------------- patch fetch ------------- */ 9 | 10 | export function patchFetch(exporter: FileExporter): void { 11 | if (typeof globalThis.fetch === "function") { 12 | const origFetch = globalThis.fetch; 13 | 14 | globalThis.fetch = (async function patchedFetch( 15 | resource: RequestInfo | URL, 16 | init: RequestInit = {}, 17 | ): Promise { 18 | const t0 = Date.now(); 19 | // Remove and retrieve the X-Trainloop-Tag header 20 | let tagValue: string | undefined; 21 | if (init && init.headers) { 22 | tagValue = getAndRemoveHeader(init.headers, HEADER_NAME); 23 | } 24 | 25 | const location = getCallerSite(getCallerStack()); 26 | 27 | const reqBody = 28 | init.body && typeof init.body !== "string" ? "[stream]" : init.body ?? 
""; 29 | 30 | // Real fetch 31 | const res = await origFetch(resource, init); 32 | 33 | // Fire-and-forget logging 34 | (async () => { 35 | const host = getFetchHost(resource); 36 | if (host && EXPECTED_LLM_PROVIDER_URLS.includes(host)) { 37 | // Clone response without blocking 38 | const resBody = await cloneResponseForLogging(res); 39 | const t1 = Date.now(); 40 | const ms = t1 - t0; 41 | 42 | logger.info("------- START FETCH CALL -------"); 43 | logger.info(`Method: ${init.method ?? "GET"}`); 44 | logger.info(`Resource: ${resource}`); 45 | logger.info(`Request Body: ${reqBody}`); 46 | logger.info(`Status: ${res.status}`); 47 | logger.info(`Response Body: ${resBody}`); 48 | logger.info(`Duration: ${ms} ms`); 49 | logger.info(`Location: ${location}`); 50 | logger.info(`Tag: ${tagValue}`); 51 | logger.info("------- END FETCH CALL -------"); 52 | 53 | exporter.recordLLMCall({ 54 | requestBodyStr: reqBody, 55 | responseBodyStr: resBody, 56 | durationMs: Math.round(ms), 57 | url: resource.toString(), 58 | endTimeMs: Math.round(t1), 59 | startTimeMs: Math.round(t0), 60 | isLLMRequest: true, 61 | location, 62 | status: res.status, 63 | tag: tagValue, 64 | 65 | }) 66 | } 67 | })().catch(() => { 68 | /* swallow logging errors so they never affect the main flow */ 69 | }); 70 | // Caller receives stream immediately 71 | return res; 72 | }) as typeof fetch; 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /sdk/typescript/src/instrumentation/index.ts: -------------------------------------------------------------------------------- 1 | import "./http"; 2 | import "./fetch"; -------------------------------------------------------------------------------- /sdk/typescript/src/logger.ts: -------------------------------------------------------------------------------- 1 | type Level = "error" | "warn" | "info" | "debug"; 2 | const ORDER: Record = { error: 0, warn: 1, info: 2, debug: 3 }; 3 | const DEFAULT_LEVEL: Level = "warn"; 4 | 5 | export function createLogger(scope: string) { 6 | const env = (process.env.TRAINLOOP_LOG_LEVEL || DEFAULT_LEVEL).toLowerCase() as Level; 7 | const threshold = ORDER[env] ?? 
ORDER.info; 8 | 9 | function log(level: Level, msg: string): void { 10 | if (ORDER[level] > threshold) return; 11 | const ts = new Date().toISOString(); 12 | // eslint-disable-next-line no-console 13 | console[level](`[${level.toUpperCase()}] [${ts}] [${scope}] ${msg}`); 14 | } 15 | 16 | return { 17 | error: (m: string) => log("error", m), 18 | warn: (m: string) => log("warn", m), 19 | info: (m: string) => log("info", m), 20 | debug: (m: string) => log("debug", m) 21 | }; 22 | } -------------------------------------------------------------------------------- /sdk/typescript/src/types/global.d.ts: -------------------------------------------------------------------------------- 1 | /* Enable auto‑completion for trainloopTag usage */ 2 | 3 | declare global { 4 | interface Headers { 5 | /** TrainLoop tag header */ 6 | ["X-Trainloop-Tag"]?: string; 7 | } 8 | } 9 | 10 | export { }; 11 | -------------------------------------------------------------------------------- /sdk/typescript/src/types/shared.d.ts: -------------------------------------------------------------------------------- 1 | export type CollectedSample = { 2 | durationMs: number; 3 | tag: string; 4 | input: ParsedRequestBody["messages"]; 5 | output: ParsedResponseBody; 6 | model: string; 7 | modelParams: Record; 8 | startTimeMs: number; // unix timestamp 9 | endTimeMs: number; // unix timestamp 10 | url: string; 11 | location: LLMCallLocation; 12 | } 13 | 14 | export type ParsedResponseBody = Record<"content", string>; 15 | 16 | export type ExpectedRequestBody = { 17 | messages: Record[]; 18 | model: string; 19 | [key: string]: unknown; 20 | } 21 | 22 | export type ParsedRequestBody = { 23 | messages: Record[]; 24 | model: string; 25 | modelParams: Record; 26 | } 27 | 28 | export type TrainloopConfig = { 29 | trainloop: { 30 | data_folder: string; 31 | host_allowlist: string[]; 32 | log_level: string; 33 | } 34 | } 35 | 36 | export type RegistryEntry = { 37 | lineNumber: string; 38 | tag: string; 39 | firstSeen: string; // ISO-8601 UTC 40 | lastSeen: string; 41 | count: number; 42 | }; 43 | 44 | export type Registry = { 45 | schema: number; 46 | files: { 47 | [file: string]: { 48 | [line: string]: RegistryEntry; 49 | }; 50 | }; 51 | }; 52 | 53 | 54 | export type LLMCallLocation = { 55 | file: string; 56 | lineNumber: string; 57 | }; 58 | 59 | export type LLMCallData = { 60 | requestBodyStr?: string; 61 | responseBodyStr?: string; 62 | url?: string; 63 | tag?: string; 64 | location?: LLMCallLocation; 65 | startTimeMs?: number; 66 | endTimeMs?: number; 67 | durationMs?: number; 68 | isLLMRequest?: boolean; 69 | headers?: Record; 70 | status?: number; 71 | }; -------------------------------------------------------------------------------- /sdk/typescript/tests/integration/sdk-initialization.test.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Integration test to verify SDK initialization behavior 3 | */ 4 | import { shutdown } from '../../src'; 5 | 6 | describe('SDK Initialization', () => { 7 | it('should allow multiple shutdown calls without error', async () => { 8 | // First shutdown 9 | await expect(shutdown()).resolves.toBeUndefined(); 10 | 11 | // Second shutdown should also work 12 | await expect(shutdown()).resolves.toBeUndefined(); 13 | }); 14 | 15 | it('should not have real FileExporter due to mocking', () => { 16 | // Verify that the FileExporter is mocked 17 | const { FileExporter } = require('../../src/exporter'); 18 | expect(jest.isMockFunction(FileExporter)).toBe(true); 
19 | }); 20 | 21 | it('should have mocked exporter methods', () => { 22 | const { FileExporter } = require('../../src/exporter'); 23 | const mockInstance = new FileExporter(); 24 | 25 | expect(mockInstance.recordLLMCall).toBeDefined(); 26 | expect(mockInstance.shutdown).toBeDefined(); 27 | expect(mockInstance.clear).toBeDefined(); 28 | 29 | // Verify they are mock functions 30 | expect(jest.isMockFunction(mockInstance.recordLLMCall)).toBe(true); 31 | expect(jest.isMockFunction(mockInstance.shutdown)).toBe(true); 32 | expect(jest.isMockFunction(mockInstance.clear)).toBe(true); 33 | }); 34 | }); 35 | -------------------------------------------------------------------------------- /sdk/typescript/tests/setup.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Jest setup file - runs before all tests 3 | */ 4 | import { TextEncoder, TextDecoder } from 'util'; 5 | import * as os from 'os'; 6 | import * as path from 'path'; 7 | 8 | // Polyfill for Node.js environments that don't have these globals 9 | global.TextEncoder = TextEncoder; 10 | global.TextDecoder = TextDecoder as any; 11 | 12 | // Set test environment variables before any modules are imported 13 | process.env.NODE_ENV = 'test'; 14 | process.env.TRAINLOOP_DATA_FOLDER = path.join(os.tmpdir(), 'trainloop-test-data'); 15 | process.env.TRAINLOOP_HOST_ALLOWLIST = 'api.openai.com,api.anthropic.com'; 16 | process.env.TRAINLOOP_LOG_LEVEL = 'ERROR'; // Suppress logs during tests 17 | 18 | // Mock the FileExporter to prevent background timer in tests 19 | jest.mock('../src/exporter', () => ({ 20 | FileExporter: jest.fn().mockImplementation(() => ({ 21 | recordLLMCall: jest.fn(), 22 | shutdown: jest.fn(), 23 | clear: jest.fn() 24 | })) 25 | })); 26 | 27 | // Suppress console output during tests unless explicitly needed 28 | const originalConsole = { ...console }; 29 | beforeAll(() => { 30 | console.log = jest.fn(); 31 | console.info = jest.fn(); 32 | console.warn = jest.fn(); 33 | console.error = jest.fn(); 34 | console.debug = jest.fn(); 35 | }); 36 | 37 | afterAll(async () => { 38 | console.log = originalConsole.log; 39 | console.info = originalConsole.info; 40 | console.warn = originalConsole.warn; 41 | console.error = originalConsole.error; 42 | console.debug = originalConsole.debug; 43 | 44 | // Clean up SDK resources 45 | const { shutdown } = await import('../src/index'); 46 | await shutdown(); 47 | }); 48 | 49 | // Clear all mocks between tests 50 | afterEach(() => { 51 | jest.clearAllMocks(); 52 | jest.restoreAllMocks(); 53 | }); 54 | 55 | // Increase timeout for async operations 56 | jest.setTimeout(10000); 57 | -------------------------------------------------------------------------------- /sdk/typescript/tests/unit/register.test.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Unit tests for the register/public API functions 3 | */ 4 | import { HEADER_NAME, trainloopTag } from '../../src'; 5 | 6 | describe('Register Module', () => { 7 | describe('HEADER_NAME', () => { 8 | it('should be correctly defined', () => { 9 | expect(HEADER_NAME).toBe('X-Trainloop-Tag'); 10 | }); 11 | }); 12 | 13 | describe('trainloopTag', () => { 14 | it('should return object with correct header', () => { 15 | const tag = 'test-tag'; 16 | const result = trainloopTag(tag); 17 | 18 | expect(typeof result).toBe('object'); 19 | expect(result[HEADER_NAME]).toBe(tag); 20 | }); 21 | 22 | it('should handle empty string', () => { 23 | const result = trainloopTag(''); 24 | 
25 | expect(result).toEqual({ [HEADER_NAME]: '' }); 26 | }); 27 | 28 | it('should handle special characters', () => { 29 | const specialTag = 'test-tag-αβγ-🚀-@#$%'; 30 | const result = trainloopTag(specialTag); 31 | 32 | expect(result[HEADER_NAME]).toBe(specialTag); 33 | }); 34 | 35 | it('should handle very long strings', () => { 36 | const longTag = 'x'.repeat(1000); 37 | const result = trainloopTag(longTag); 38 | 39 | expect(result[HEADER_NAME]).toBe(longTag); 40 | expect(result[HEADER_NAME].length).toBe(1000); 41 | }); 42 | 43 | it('should handle unicode characters', () => { 44 | const unicodeTag = '测试标签-テストタグ-테스트태그'; 45 | const result = trainloopTag(unicodeTag); 46 | 47 | expect(result[HEADER_NAME]).toBe(unicodeTag); 48 | }); 49 | 50 | it('should create new object each time', () => { 51 | const tag = 'test'; 52 | const result1 = trainloopTag(tag); 53 | const result2 = trainloopTag(tag); 54 | 55 | expect(result1).not.toBe(result2); // Different object references 56 | expect(result1).toEqual(result2); // But same content 57 | }); 58 | }); 59 | }); 60 | -------------------------------------------------------------------------------- /sdk/typescript/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es2020", 4 | "module": "commonjs", 5 | "declaration": true, 6 | "outDir": "./dist", 7 | "rootDir": "./src", 8 | "strict": true, 9 | "esModuleInterop": true, 10 | "skipLibCheck": true, 11 | "forceConsistentCasingInFileNames": true, 12 | "lib": [ 13 | "es2020", 14 | "dom" 15 | ] 16 | }, 17 | "include": [ 18 | "src/**/*" 19 | ] 20 | } -------------------------------------------------------------------------------- /tests/.gitignore: -------------------------------------------------------------------------------- 1 | .env* 2 | !.env.example -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # TrainLoop Tests 2 | 3 | This directory contains comprehensive tests for the TrainLoop ecosystem, including CLI functionality, metrics, and integration testing. 
4 | 5 | ## Test Structure 6 | 7 | ``` 8 | tests/ 9 | ├── unit/ # Fast unit tests 10 | │ ├── cli/ # CLI command tests 11 | │ ├── metrics/ # Metric function tests 12 | │ └── scaffold/ # Scaffold template tests 13 | ├── integration/ # End-to-end integration tests 14 | │ ├── init_flow/ # Test `trainloop init` workflow 15 | │ ├── eval_flow/ # Test `trainloop eval` workflow 16 | │ ├── add_flow/ # Test `trainloop add` workflow 17 | │ └── judge_flow/ # Test judge functionality 18 | ├── fixtures/ # Test data and sample projects 19 | │ ├── sample_project/ # Mock project structure 20 | │ ├── sample_data/ # JSONL test data 21 | │ └── configs/ # Test configuration files 22 | ├── helpers/ # Test utilities and mocks 23 | │ ├── mock_llm.py # Mock LLM responses for judge tests 24 | │ ├── temp_project.py # Temporary project creation 25 | │ └── cli_runner.py # CLI testing utilities 26 | └── conftest.py # Pytest configuration and fixtures 27 | 28 | ``` 29 | 30 | ## Test Categories 31 | 32 | ### Unit Tests (`tests/unit/`) 33 | - **CLI Commands**: Test individual CLI commands in isolation 34 | - **Metrics**: Test metric functions from registry and scaffold 35 | - **Scaffold**: Test template generation and validation 36 | 37 | ### Integration Tests (`tests/integration/`) 38 | - **Init Flow**: Test complete project initialization 39 | - **Eval Flow**: Test data collection → evaluation → results 40 | - **Add Flow**: Test adding metrics/suites from registry 41 | - **Judge Flow**: Test LLM judge with mocked responses 42 | 43 | ## Running Tests 44 | 45 | ```bash 46 | # Run all tests 47 | poetry run pytest tests/ 48 | 49 | # Run only unit tests 50 | poetry run pytest tests/unit/ 51 | 52 | # Run only integration tests 53 | poetry run pytest tests/integration/ 54 | 55 | # Run with coverage 56 | poetry run pytest tests/ --cov=trainloop_cli --cov-report=html 57 | 58 | # Run specific test category 59 | poetry run pytest tests/unit/cli/ -v 60 | ``` 61 | 62 | ## Key Benefits 63 | 64 | 1. **Separation**: Tests are separate from production code 65 | 2. **Comprehensive**: Covers CLI, metrics, scaffold, and integration 66 | 3. **Fast Feedback**: Unit tests run quickly for development 67 | 4. **Real Testing**: Integration tests use actual scaffold templates 68 | 5. 
**No Shipping**: Tests don't get included in CLI distribution 69 | 70 | ## Test Philosophy 71 | 72 | - Use real scaffold templates in integration tests 73 | - Mock external dependencies (LLM APIs, file system when appropriate) 74 | - Test the actual user workflows end-to-end 75 | - Maintain test isolation with temporary directories 76 | - Validate that scaffold components work correctly when installed 77 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/integration/init_flow/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/unit/judge/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /ui/.env.example: -------------------------------------------------------------------------------- 1 | TRAINLOOP_DATA_FOLDER="./.trainloop" 2 | NEXT_PUBLIC_VERSION="0.0.0" 3 | NODE_ENV="development" -------------------------------------------------------------------------------- /ui/.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "next" 3 | } 4 | -------------------------------------------------------------------------------- /ui/.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 
2 | 3 | # dependencies 4 | /node_modules 5 | 6 | # next.js 7 | /.next/ 8 | /out/ 9 | 10 | # production 11 | /build 12 | 13 | # debug 14 | npm-debug.log* 15 | yarn-debug.log* 16 | yarn-error.log* 17 | .pnpm-debug.log* 18 | 19 | # env files 20 | .env* 21 | !.env.example 22 | 23 | 24 | # typescript 25 | *.tsbuildinfo 26 | next-env.d.ts -------------------------------------------------------------------------------- /ui/Dockerfile: -------------------------------------------------------------------------------- 1 | # Single-stage runtime for TrainLoop Evals UI 2 | FROM node:20-slim 3 | WORKDIR /app 4 | 5 | # pass VERSION at build-time 6 | ARG VERSION 7 | RUN test -n "$VERSION" || (echo "ERROR: version argument is required" && false) 8 | 9 | # Set environment variables 10 | ENV NODE_ENV=production \ 11 | NEXT_TELEMETRY_DISABLED=1 \ 12 | PORT=3000 \ 13 | VERSION=${VERSION} 14 | 15 | # Add labels 16 | LABEL org.opencontainers.image.version=$VERSION \ 17 | org.opencontainers.image.title="TrainLoop Evals UI" \ 18 | org.opencontainers.image.source="https://github.com/trainloop/evals" 19 | 20 | # Create non-root user 21 | RUN addgroup --system --gid 1001 nodejs && \ 22 | adduser --system --uid 1001 nextjs 23 | 24 | # Copy only runtime files (assumes app is built locally) 25 | COPY package.json ./ 26 | COPY next.config.mjs ./ 27 | COPY .next/ ./.next/ 28 | COPY public/ ./public/ 29 | COPY node_modules/ ./node_modules/ 30 | # Handle .env.production file - will be handled differently in build.py 31 | 32 | # Switch to non-root user for runtime 33 | USER nextjs 34 | 35 | EXPOSE 3000 36 | HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \ 37 | CMD wget -qO- http://localhost:3000/ || exit 1 38 | 39 | CMD ["npm", "run", "start"] 40 | -------------------------------------------------------------------------------- /ui/README.md: -------------------------------------------------------------------------------- 1 | # TrainLoop Studio UI 2 | 3 | A Next.js interface for exploring collected events and evaluation results. 4 | 5 | ## Prerequisites 6 | - Node.js 20+ 7 | - `TRAINLOOP_DATA_FOLDER` pointing to your data directory 8 | 9 | ## Dev server 10 | 11 | ```bash 12 | npm install 13 | npm run dev 14 | ``` 15 | 16 | The CLI's `trainloop studio` command uses the bundled version from the `runner/` package. 17 | -------------------------------------------------------------------------------- /ui/app/api/collected-data/GET.ts: -------------------------------------------------------------------------------- 1 | import { NextResponse } from 'next/server'; 2 | import { select } from '@/database/utils'; 3 | 4 | export const GET = async () => { 5 | try { 6 | const rows = await select("events", { 7 | tag: true, 8 | durationMs: true, 9 | input: true, 10 | output: true, 11 | modelParams: true, 12 | url: true, 13 | location: true, 14 | endTimeMs: true, 15 | model: true, 16 | startTimeMs: true, 17 | }) 18 | 19 | return new NextResponse(JSON.stringify(rows), { 20 | headers: { 'Content-Type': 'application/json' } 21 | }); 22 | } catch (error) { 23 | console.error("Error in /api/collected-data:", error); 24 | return NextResponse.json( 25 | { error: 'Failed to retrieve data', message: error instanceof Error ? 
error.message : String(error) }, 26 | { status: 500 } 27 | ); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /ui/app/api/collected-data/route.ts: -------------------------------------------------------------------------------- 1 | export { GET } from "./GET"; -------------------------------------------------------------------------------- /ui/app/api/events/[id]/route.ts: -------------------------------------------------------------------------------- 1 | import { NextResponse, NextRequest } from 'next/server' 2 | import { getEvent } from '@/database/events' 3 | import { convertBigIntsToNumbers } from '@/utils/json-helpers' 4 | 5 | export async function GET( 6 | request: NextRequest, 7 | { params }: { params: Promise<{ id: string }> } 8 | ) { 9 | const { id } = await params 10 | const row = await getEvent(Number(id)); 11 | if (!row) return NextResponse.json({ error: 'not found' }, { status: 404 }); 12 | 13 | // Convert BigInt values to string for serialization 14 | const safeRow = convertBigIntsToNumbers(row); 15 | return NextResponse.json(safeRow); 16 | } 17 | -------------------------------------------------------------------------------- /ui/app/api/events/route.ts: -------------------------------------------------------------------------------- 1 | import { NextResponse } from 'next/server' 2 | import { listEvents } from '@/database/events' 3 | import { convertBigIntsToNumbers } from '@/utils/json-helpers' 4 | 5 | export async function GET(req: Request) { 6 | const { searchParams } = new URL(req.url) 7 | const offset = parseInt(searchParams.get('offset') ?? '0', 10) 8 | const limit = parseInt(searchParams.get('limit') ?? '50', 10) 9 | const from = searchParams.get('from') ? Number(searchParams.get('from')) : undefined 10 | const to = searchParams.get('to') ? Number(searchParams.get('to')) : undefined 11 | const tags = searchParams.getAll('tag') 12 | const models = searchParams.getAll('model') 13 | const durationLt = searchParams.get('durationLt') ? 
Number(searchParams.get('durationLt')) : undefined 14 | 15 | const rows = await listEvents({ offset, limit, filters: { from, to, tags, models, durationLt } }) 16 | 17 | // Convert any BigInt values to numbers before serializing to JSON 18 | const safeRows = convertBigIntsToNumbers(rows); 19 | 20 | return NextResponse.json(safeRows) 21 | } 22 | -------------------------------------------------------------------------------- /ui/app/api/groq/route.ts: -------------------------------------------------------------------------------- 1 | import Groq from 'groq-sdk'; 2 | 3 | export async function POST(request: Request) { 4 | const groq = new Groq() 5 | 6 | try { 7 | const { prompt, model = "llama3-8b-8192" } = await request.json() 8 | 9 | if (!prompt) { 10 | return Response.json({ error: "Prompt is required" }, { status: 400 }) 11 | } 12 | 13 | const completion = await groq.chat.completions.create({ 14 | messages: [ 15 | { 16 | role: "user", 17 | content: prompt, 18 | }, 19 | ], 20 | model: model, 21 | }) 22 | 23 | return Response.json({ 24 | response: completion.choices[0].message.content, 25 | model: model, 26 | usage: completion.usage, 27 | }) 28 | } catch (error) { 29 | console.error("Error calling Groq API:", error) 30 | return Response.json({ error: "Failed to get response from Groq" }, { status: 500 }) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /ui/app/api/results/[id]/route.ts: -------------------------------------------------------------------------------- 1 | import { NextResponse } from 'next/server' 2 | import { getResult } from '@/database/results' 3 | 4 | export async function GET(_: Request, { params }: { params: { id: string } }) { 5 | const row = await getResult(params.id) 6 | if (!row) return NextResponse.json({ error: 'not found' }, { status: 404 }) 7 | return NextResponse.json(row) 8 | } 9 | -------------------------------------------------------------------------------- /ui/app/api/results/route.ts: -------------------------------------------------------------------------------- 1 | import { NextResponse } from 'next/server' 2 | import { listResults } from '@/database/results' 3 | import { convertBigIntsToNumbers } from '@/utils/json-helpers' 4 | 5 | export async function GET(req: Request) { 6 | const { searchParams } = new URL(req.url) 7 | const offset = parseInt(searchParams.get('offset') ?? '0', 10) 8 | const limit = parseInt(searchParams.get('limit') ?? 
'50', 10) 9 | const ts = searchParams.get('ts') || '' 10 | const suite = searchParams.get('suite') || '' 11 | const rows = await listResults({ ts, suite, offset, limit }) 12 | 13 | const safeRows = convertBigIntsToNumbers(rows) 14 | return NextResponse.json(safeRows) 15 | } 16 | -------------------------------------------------------------------------------- /ui/app/events/page.tsx: -------------------------------------------------------------------------------- 1 | import { EventsTable } from '@/components/events-table' 2 | 3 | export default function EventsPage() { 4 | return 5 | } 6 | -------------------------------------------------------------------------------- /ui/app/globals.css: -------------------------------------------------------------------------------- 1 | @tailwind base; 2 | @tailwind components; 3 | @tailwind utilities; 4 | 5 | body { 6 | font-family: Arial, Helvetica, sans-serif; 7 | } 8 | 9 | @layer utilities { 10 | .text-balance { 11 | text-wrap: balance; 12 | } 13 | } 14 | 15 | @layer base { 16 | :root { 17 | --background: 0 0% 100%; 18 | --foreground: 0 0% 3.9%; 19 | --card: 0 0% 100%; 20 | --card-foreground: 0 0% 3.9%; 21 | --popover: 0 0% 100%; 22 | --popover-foreground: 0 0% 3.9%; 23 | --primary: 0 0% 9%; 24 | --primary-foreground: 0 0% 98%; 25 | --secondary: 0 0% 96.1%; 26 | --secondary-foreground: 0 0% 9%; 27 | --muted: 0 0% 96.1%; 28 | --muted-foreground: 0 0% 45.1%; 29 | --accent: 0 0% 96.1%; 30 | --accent-foreground: 0 0% 9%; 31 | --destructive: 0 84.2% 60.2%; 32 | --destructive-foreground: 0 0% 98%; 33 | --border: 0 0% 89.8%; 34 | --input: 0 0% 89.8%; 35 | --ring: 0 0% 3.9%; 36 | --chart-1: 12 76% 61%; 37 | --chart-2: 173 58% 39%; 38 | --chart-3: 197 37% 24%; 39 | --chart-4: 43 74% 66%; 40 | --chart-5: 27 87% 67%; 41 | --radius: 0.5rem; 42 | --sidebar-background: 0 0% 98%; 43 | --sidebar-foreground: 240 5.3% 26.1%; 44 | --sidebar-primary: 240 5.9% 10%; 45 | --sidebar-primary-foreground: 0 0% 98%; 46 | --sidebar-accent: 240 4.8% 95.9%; 47 | --sidebar-accent-foreground: 240 5.9% 10%; 48 | --sidebar-border: 220 13% 91%; 49 | --sidebar-ring: 217.2 91.2% 59.8%; 50 | } 51 | .dark { 52 | --background: 0 0% 3.9%; 53 | --foreground: 0 0% 98%; 54 | --card: 0 0% 3.9%; 55 | --card-foreground: 0 0% 98%; 56 | --popover: 0 0% 3.9%; 57 | --popover-foreground: 0 0% 98%; 58 | --primary: 0 0% 98%; 59 | --primary-foreground: 0 0% 9%; 60 | --secondary: 0 0% 14.9%; 61 | --secondary-foreground: 0 0% 98%; 62 | --muted: 0 0% 14.9%; 63 | --muted-foreground: 0 0% 63.9%; 64 | --accent: 0 0% 14.9%; 65 | --accent-foreground: 0 0% 98%; 66 | --destructive: 0 62.8% 30.6%; 67 | --destructive-foreground: 0 0% 98%; 68 | --border: 0 0% 14.9%; 69 | --input: 0 0% 14.9%; 70 | --ring: 0 0% 83.1%; 71 | --chart-1: 220 70% 50%; 72 | --chart-2: 160 60% 45%; 73 | --chart-3: 30 80% 55%; 74 | --chart-4: 280 65% 60%; 75 | --chart-5: 340 75% 55%; 76 | --sidebar-background: 240 5.9% 10%; 77 | --sidebar-foreground: 240 4.8% 95.9%; 78 | --sidebar-primary: 224.3 76.3% 48%; 79 | --sidebar-primary-foreground: 0 0% 100%; 80 | --sidebar-accent: 240 3.7% 15.9%; 81 | --sidebar-accent-foreground: 240 4.8% 95.9%; 82 | --sidebar-border: 240 3.7% 15.9%; 83 | --sidebar-ring: 217.2 91.2% 59.8%; 84 | } 85 | } 86 | 87 | @layer base { 88 | * { 89 | @apply border-border; 90 | } 91 | body { 92 | @apply bg-background text-foreground; 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /ui/app/layout.tsx: 
-------------------------------------------------------------------------------- 1 | import type { Metadata } from 'next' 2 | import './globals.css' 3 | import { ThemeProvider } from '@/components/theme-provider' 4 | import { DashboardShell } from '@/components/dashboard-shell' 5 | 6 | export const metadata: Metadata = { 7 | title: 'TrainLoop Evals', 8 | description: 'TrainLoop Evaluation Platform', 9 | generator: 'Next.js', 10 | } 11 | 12 | export default function RootLayout({ 13 | children, 14 | }: Readonly<{ 15 | children: React.ReactNode 16 | }>) { 17 | return ( 18 | 19 | 20 | 21 |
22 | 23 |
{children}
24 |
25 |
26 |
27 | 28 | 29 | ) 30 | } 31 | -------------------------------------------------------------------------------- /ui/app/page.tsx: -------------------------------------------------------------------------------- 1 | import DashboardContent from '@/components/dashboard/dashboard-content' 2 | import { Card, CardContent, CardHeader, CardTitle, CardDescription } from '@/components/ui/card' 3 | import { RecentEvents } from '@/components/dashboard/recent-events' 4 | import { RecentResults } from '@/components/dashboard/recent-results' 5 | 6 | async function getData() { 7 | // This would ideally come from an API but for now we'll use placeholder data 8 | return { 9 | totalEvents: 328, 10 | totalSuites: 5, 11 | totalMetrics: 12, 12 | passRate: 92.4, 13 | recentActivity: true 14 | } 15 | } 16 | 17 | export default async function DashboardPage() { 18 | const data = await getData() 19 | 20 | return ( 21 | <> 22 |
23 |

TrainLoop Evaluations Dashboard

24 |
25 | 26 | {/* New Dashboard Data Visualization */} 27 |
28 | 29 |
30 | 31 |
32 | 33 | 34 | Recent Events 35 | Latest model interactions collected 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | Evaluation Results 45 | Recent model evaluation metrics 46 | 47 | 48 | 49 | 50 | 51 |
52 | 53 | ) 54 | } 55 | -------------------------------------------------------------------------------- /ui/app/results/page.tsx: -------------------------------------------------------------------------------- 1 | 'use client' 2 | import { useState } from 'react' 3 | import { ResultsList } from '@/components/results-list' 4 | 5 | export default function ResultsPage() { 6 | const [ts, setTs] = useState('') 7 | const [suite, setSuite] = useState('') 8 | return ( 9 |
10 |
11 | setTs(e.target.value)} className="w-full border px-2 py-1 text-sm" /> 12 | setSuite(e.target.value)} className="w-full border px-2 py-1 text-sm" /> 13 |
14 |
15 | 16 |
17 |
18 | ) 19 | } 20 | -------------------------------------------------------------------------------- /ui/components.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://ui.shadcn.com/schema.json", 3 | "style": "default", 4 | "rsc": true, 5 | "tsx": true, 6 | "tailwind": { 7 | "config": "tailwind.config.ts", 8 | "css": "app/globals.css", 9 | "baseColor": "neutral", 10 | "cssVariables": true, 11 | "prefix": "" 12 | }, 13 | "aliases": { 14 | "components": "@/components", 15 | "utils": "@/lib/utils", 16 | "ui": "@/components/ui", 17 | "lib": "@/lib", 18 | "hooks": "@/hooks" 19 | }, 20 | "iconLibrary": "lucide" 21 | } -------------------------------------------------------------------------------- /ui/components/charts/metrics-trend-chart.tsx: -------------------------------------------------------------------------------- 1 | "use client" 2 | 3 | import { LineChart, Line, XAxis, YAxis, CartesianGrid, Legend, ResponsiveContainer } from "recharts" 4 | import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card" 5 | import { ChartContainer, ChartTooltip, ChartTooltipContent } from "@/components/ui/chart" 6 | 7 | interface MetricsTrendChartProps { 8 | title: string 9 | description?: string 10 | data: Array<{ 11 | date: string 12 | [key: string]: string | number 13 | }> 14 | metrics: string[] 15 | className?: string 16 | } 17 | 18 | export function MetricsTrendChart({ title, description, data, metrics, className }: MetricsTrendChartProps) { 19 | // Generate a config object for ChartContainer with dynamic metric colors 20 | const generateConfig = () => { 21 | const colors = [ 22 | "hsl(var(--chart-1))", 23 | "hsl(var(--chart-2))", 24 | "hsl(var(--chart-3))", 25 | "hsl(var(--chart-4))", 26 | "hsl(var(--chart-5))", 27 | ] 28 | 29 | return metrics.reduce( 30 | (config, metric, index) => { 31 | config[metric] = { 32 | label: metric, 33 | color: colors[index % colors.length], 34 | } 35 | return config 36 | }, 37 | {} as Record, 38 | ) 39 | } 40 | 41 | return ( 42 | 43 | 44 | {title} 45 | {description && {description}} 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | `${value}%`} /> 54 | } /> 55 | 56 | {metrics.map((metric) => ( 57 | 65 | ))} 66 | 67 | 68 | 69 | 70 | 71 | ) 72 | } 73 | -------------------------------------------------------------------------------- /ui/components/charts/model-comparison-chart.tsx: -------------------------------------------------------------------------------- 1 | "use client" 2 | 3 | import { BarChart, Bar, XAxis, YAxis, CartesianGrid, Legend, ResponsiveContainer } from "recharts" 4 | import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card" 5 | import { ChartContainer, ChartTooltip, ChartTooltipContent } from "@/components/ui/chart" 6 | 7 | interface ModelComparisonChartProps { 8 | title: string 9 | description?: string 10 | data: Array<{ 11 | metric: string 12 | [key: string]: string | number 13 | }> 14 | models: string[] 15 | className?: string 16 | } 17 | 18 | export function ModelComparisonChart({ title, description, data, models, className }: ModelComparisonChartProps) { 19 | // Generate a config object for ChartContainer with dynamic model colors 20 | const generateConfig = () => { 21 | const colors = [ 22 | "hsl(var(--chart-1))", 23 | "hsl(var(--chart-2))", 24 | "hsl(var(--chart-3))", 25 | "hsl(var(--chart-4))", 26 | "hsl(var(--chart-5))", 27 | ] 28 | 29 | return models.reduce( 30 | (config, model, index) => { 31 | config[model] = { 32 
| label: model, 33 | color: colors[index % colors.length], 34 | } 35 | return config 36 | }, 37 | {} as Record, 38 | ) 39 | } 40 | 41 | return ( 42 | 43 | 44 | {title} 45 | {description && {description}} 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | `${value}%`} /> 54 | } /> 55 | 56 | {models.map((model, index) => ( 57 | 64 | ))} 65 | 66 | 67 | 68 | 69 | 70 | ) 71 | } 72 | -------------------------------------------------------------------------------- /ui/components/charts/radar-comparison-chart.tsx: -------------------------------------------------------------------------------- 1 | "use client" 2 | 3 | import { Radar, RadarChart, PolarGrid, PolarAngleAxis, PolarRadiusAxis, ResponsiveContainer, Legend } from "recharts" 4 | import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card" 5 | import { ChartContainer } from "@/components/ui/chart" 6 | 7 | interface RadarComparisonChartProps { 8 | title: string 9 | description?: string 10 | data: Array<{ 11 | metric: string 12 | [key: string]: string | number 13 | }> 14 | models: string[] 15 | className?: string 16 | } 17 | 18 | export function RadarComparisonChart({ title, description, data, models, className }: RadarComparisonChartProps) { 19 | // Generate a config object for ChartContainer with dynamic model colors 20 | const generateConfig = () => { 21 | const colors = [ 22 | "hsl(var(--chart-1))", 23 | "hsl(var(--chart-2))", 24 | "hsl(var(--chart-3))", 25 | "hsl(var(--chart-4))", 26 | "hsl(var(--chart-5))", 27 | ] 28 | 29 | return models.reduce( 30 | (config, model, index) => { 31 | config[model] = { 32 | label: model, 33 | color: colors[index % colors.length], 34 | } 35 | return config 36 | }, 37 | {} as Record, 38 | ) 39 | } 40 | 41 | return ( 42 | 43 | 44 | {title} 45 | {description && {description}} 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | `${value}%`} /> 54 | {models.map((model) => ( 55 | 63 | ))} 64 | 65 | 66 | 67 | 68 | 69 | 70 | ) 71 | } 72 | -------------------------------------------------------------------------------- /ui/components/dashboard-header.tsx: -------------------------------------------------------------------------------- 1 | import type React from "react" 2 | interface DashboardHeaderProps { 3 | heading: string 4 | text?: string 5 | children?: React.ReactNode 6 | } 7 | 8 | export function DashboardHeader({ heading, text, children }: DashboardHeaderProps) { 9 | return ( 10 |
11 |
12 |

{heading}

13 | {text &&

{text}

} 14 |
15 | {children} 16 |
17 | ) 18 | } 19 | -------------------------------------------------------------------------------- /ui/components/dashboard-shell.tsx: -------------------------------------------------------------------------------- 1 | "use client" 2 | 3 | import type React from "react" 4 | import Link from "next/link" 5 | import { Button } from "@/components/ui/button" 6 | import { BarChart3, Home, Activity, Trophy, Award, ListChecks } from "lucide-react" 7 | import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip" 8 | 9 | interface DashboardShellProps { 10 | children: React.ReactNode 11 | } 12 | 13 | export function DashboardShell({ children }: DashboardShellProps) { 14 | return ( 15 |
16 |
17 |
18 | 19 | 20 | LLM Evals 21 | 22 | 62 |
63 |
64 |
{children}
65 |
66 | ) 67 | } 68 | -------------------------------------------------------------------------------- /ui/components/flow-editor/node-selector.tsx: -------------------------------------------------------------------------------- 1 | "use client" 2 | 3 | import { Card, CardContent } from "@/components/ui/card" 4 | import { Button } from "@/components/ui/button" 5 | import { Database, MessageSquare, BarChart3, X } from "lucide-react" 6 | 7 | interface NodeSelectorProps { 8 | position: { x: number; y: number } 9 | onSelect: (nodeType: string) => void 10 | onClose: () => void 11 | } 12 | 13 | export function NodeSelector({ position, onSelect, onClose }: NodeSelectorProps) { 14 | const nodeTypes = [ 15 | { 16 | type: "dataNode", 17 | label: "Data Source", 18 | icon: , 19 | description: "Input data for evaluation", 20 | }, 21 | { 22 | type: "callNode", 23 | label: "Model Call", 24 | icon: , 25 | description: "Call an LLM API (Groq, GPT-4, etc.)", 26 | }, 27 | { 28 | type: "toolNode", 29 | label: "Evaluation Tool", 30 | icon: , 31 | description: "Analyze and score responses", 32 | }, 33 | ] 34 | 35 | return ( 36 |
e.stopPropagation()} 43 | > 44 | 45 |
46 |

Add Node

47 | 50 |
51 | 52 |
53 | {nodeTypes.map((node) => ( 54 | 68 | ))} 69 |
70 |
71 |
72 |
73 | ) 74 | } 75 | -------------------------------------------------------------------------------- /ui/components/groq-evaluator.tsx: -------------------------------------------------------------------------------- 1 | "use client" 2 | 3 | import { useState } from "react" 4 | import { Button } from "@/components/ui/button" 5 | import { Card, CardContent, CardDescription, CardFooter, CardHeader, CardTitle } from "@/components/ui/card" 6 | import { Textarea } from "@/components/ui/textarea" 7 | import { Label } from "@/components/ui/label" 8 | import { callGroqModel } from "@/lib/utils" 9 | import { Loader2 } from "lucide-react" 10 | 11 | interface GroqEvaluatorProps { 12 | referenceText?: string 13 | } 14 | 15 | export function GroqEvaluator({ referenceText }: GroqEvaluatorProps) { 16 | const [prompt, setPrompt] = useState("") 17 | const [response, setResponse] = useState("") 18 | const [loading, setLoading] = useState(false) 19 | const [error, setError] = useState("") 20 | 21 | const handleEvaluate = async () => { 22 | if (!prompt) return 23 | 24 | setLoading(true) 25 | setError("") 26 | 27 | try { 28 | const result = await callGroqModel(prompt) 29 | setResponse(result.response) 30 | } catch (err) { 31 | setError("Failed to get response from Groq") 32 | console.error(err) 33 | } finally { 34 | setLoading(false) 35 | } 36 | } 37 | 38 | return ( 39 | 40 | 41 | Groq Model Evaluation 42 | Test your prompts with Groq's LLM models 43 | 44 | 45 |
46 | 47 |