├── .gitattributes ├── tests ├── tiny_prompts.csv ├── README.md ├── test_sloprank.py └── test_smoke.py ├── sloprank ├── utils │ ├── __init__.py │ ├── confidence.py │ ├── commands.py │ ├── categorization.py │ ├── dashboard.py │ └── visualization.py ├── __main__.py ├── __init__.py ├── config.py ├── pandas_backend.py ├── parse.py └── collect.py ├── docs ├── visualizations │ ├── endorsement_graph.png │ └── endorsement_graph.gml └── README.md ├── results ├── visualizations │ ├── endorsement_graph.png │ └── endorsement_graph.gml ├── rankings.json ├── confidence_stats.json ├── category_analysis.csv ├── endorsement_graph.gml └── category_rankings.json ├── results-openrouter ├── visualizations │ ├── endorsement_graph.png │ └── endorsement_graph.gml └── rankings.json ├── examples ├── generate_dashboard.py ├── README.md ├── generate_visualization.py ├── compute_confidence.py ├── prompt_categorization.py └── dashboard.py ├── requirements.txt ├── scripts ├── bump_version.py └── create_github_release.py ├── Makefile ├── pyproject.toml ├── .gitignore ├── CHANGELOG.md ├── COMPATIBILITY_NOTES.md ├── LICENSE ├── prompts.csv └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /tests/tiny_prompts.csv: -------------------------------------------------------------------------------- 1 | Questions 2 | What is the capital of France? 3 | Name three primary colors 4 | -------------------------------------------------------------------------------- /sloprank/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | SlopRank utilities for visualization, confidence calculation, and dashboard generation. 
3 | """ -------------------------------------------------------------------------------- /docs/visualizations/endorsement_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strangeloopcanon/LLMRank/HEAD/docs/visualizations/endorsement_graph.png -------------------------------------------------------------------------------- /results/visualizations/endorsement_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strangeloopcanon/LLMRank/HEAD/results/visualizations/endorsement_graph.png -------------------------------------------------------------------------------- /results-openrouter/visualizations/endorsement_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strangeloopcanon/LLMRank/HEAD/results-openrouter/visualizations/endorsement_graph.png -------------------------------------------------------------------------------- /sloprank/__main__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Main module entry point for running sloprank as a module 3 | """ 4 | 5 | import sys 6 | from .cli import main 7 | 8 | if __name__ == "__main__": 9 | sys.exit(main()) -------------------------------------------------------------------------------- /examples/generate_dashboard.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from dashboard import generate_html 3 | 4 | if __name__ == "__main__": 5 | dashboard_path = generate_html() 6 | print(f"Dashboard HTML generated at {dashboard_path}") 7 | print("You can open this file in a web browser to view the dashboard.") -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bodo>=2024.0.0 2 | click==8.1.7 3 | dash==2.18.2 4 | dash_core_components==2.0.0 5 | dash_html_components==2.0.0 6 | llm>=0.23 7 | matplotlib>=3.10.1 8 | networkx>=3.4.2 9 | numpy>=2.2.4 10 | plotly==6.0.0 11 | Requests>=2.32.5 12 | 13 | # Optional: Install for compatibility mode (if you specifically need regular pandas) 14 | # pandas>=2.2.3 15 | -------------------------------------------------------------------------------- /sloprank/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | SlopRank Package 3 | ---------------- 4 | 5 | Peer-based cross-evaluation of LLMs with PageRank-based scoring. 
6 | 7 | Key features: 8 | - Peer-based evaluation where models score each other 9 | - Graph visualization of model endorsements 10 | - Confidence intervals and statistical significance tests 11 | - Category-based evaluation and ranking 12 | - Web dashboard for interactive exploration 13 | """ 14 | 15 | from .config import ( 16 | EvalConfig, 17 | VisualizationConfig, 18 | ConfidenceConfig, 19 | WebDashboardConfig, 20 | DEFAULT_CONFIG 21 | ) 22 | 23 | __version__ = "0.3.15" 24 | __all__ = [ 25 | "EvalConfig", 26 | "VisualizationConfig", 27 | "ConfidenceConfig", 28 | "WebDashboardConfig", 29 | "DEFAULT_CONFIG" 30 | ] 31 | -------------------------------------------------------------------------------- /results/rankings.json: -------------------------------------------------------------------------------- 1 | { 2 | "rankings": [ 3 | [ 4 | "o1-preview", 5 | 0.17940361409787733 6 | ], 7 | [ 8 | "gpt-4o", 9 | 0.17830451744580658 10 | ], 11 | [ 12 | "deepseek-chat", 13 | 0.1671054138317305 14 | ], 15 | [ 16 | "gemini-2.0-flash-thinking-exp-1219", 17 | 0.16473186403675355 18 | ], 19 | [ 20 | "claude-3-5-sonnet-latest", 21 | 0.15557086205954448 22 | ], 23 | [ 24 | "gemini-exp-1206", 25 | 0.15488372852828722 26 | ] 27 | ], 28 | "metadata": { 29 | "evaluation_method": 1, 30 | "timestamp": "2025-01-14T10:21:14.432767" 31 | } 32 | } -------------------------------------------------------------------------------- /scripts/bump_version.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | from pathlib import Path 4 | 5 | path = Path("pyproject.toml") 6 | text = path.read_text() 7 | match = re.search(r'version\s*=\s*"(\d+)\.(\d+)\.(\d+)"', text) 8 | 9 | if not match: 10 | raise ValueError("Version not found in pyproject.toml") 11 | 12 | major, minor, patch = map(int, match.groups()) 13 | arg = sys.argv[1] if len(sys.argv) > 1 else "patch" 14 | 15 | if arg == "patch": 16 | patch += 1 17 | elif arg == "minor": 18 | minor += 1 19 | patch = 0 20 | elif arg == "major": 21 | major += 1 22 | minor = patch = 0 23 | else: 24 | raise ValueError("Expected patch, minor, or major") 25 | 26 | new_version = f'{major}.{minor}.{patch}' 27 | new_text = re.sub( 28 | r'version\s*=\s*"\d+\.\d+\.\d+"', 29 | f'version = \"{new_version}\"', 30 | text 31 | ) 32 | 33 | path.write_text(new_text) 34 | print(f"Bumped version to {new_version}") 35 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # SlopRank Dashboard 2 | 3 | This directory contains the interactive dashboard for SlopRank LLM evaluation framework. 4 | 5 | ## Files 6 | 7 | - `index.html` - The main dashboard file 8 | - `visualizations/` - Directory containing graph visualizations and images 9 | 10 | ## How to Use 11 | 12 | 1. Open `index.html` in any modern web browser 13 | 2. Explore the model rankings, category performance, and graph visualizations 14 | 15 | ## Hosting on GitHub Pages 16 | 17 | This directory is configured to be used with GitHub Pages. When GitHub Pages is enabled for this repo with the 'docs' folder as the source, the dashboard will be available at: 18 | 19 | https://yourusername.github.io/llmrank/ 20 | 21 | ## Updating the Dashboard 22 | 23 | To update this dashboard with new evaluation results: 24 | 25 | 1. Run the SlopRank tool with the `--dashboard` option 26 | 2. Copy the resulting dashboard.html to this directory as index.html 27 | 3. 
Update the image paths if necessary 28 | 4. Commit and push the changes -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PYPROJECT = pyproject.toml 2 | VERSION = $(shell grep '^version' $(PYPROJECT) | sed -E 's/.*"([0-9]+\.[0-9]+\.[0-9]+)"/\1/') 3 | 4 | .PHONY: clean build check upload bump-patch bump-minor bump-major git-release publish 5 | 6 | clean: 7 | rm -rf build dist *.egg-info 8 | 9 | build: clean 10 | python -m build 11 | 12 | check: 13 | twine check dist/* 14 | 15 | upload: 16 | twine upload dist/* 17 | 18 | bump-patch: 19 | @python scripts/bump_version.py patch 20 | 21 | bump-minor: 22 | @python scripts/bump_version.py minor 23 | 24 | bump-major: 25 | @python scripts/bump_version.py major 26 | 27 | git-release: 28 | git add -A 29 | git commit -m "Release v$(VERSION)" || echo "Nothing to commit" 30 | @if git rev-parse "v$(VERSION)" >/dev/null 2>&1; then \ 31 | echo "⚠️ Tag v$(VERSION) already exists. Skipping tag creation."; \ 32 | else \ 33 | git tag v$(VERSION); \ 34 | fi 35 | git push 36 | git push --tags 37 | @python scripts/create_github_release.py v$(VERSION) 38 | 39 | BUMP ?= patch 40 | 41 | publish: 42 | @make bump-$(BUMP) 43 | @make build 44 | @make check 45 | @make upload 46 | @make git-release 47 | -------------------------------------------------------------------------------- /scripts/create_github_release.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import requests 4 | 5 | TAG = sys.argv[1] 6 | REPO = "strangeloopcanon/LLMRank" 7 | TOKEN = os.getenv("GITHUB_TOKEN") 8 | 9 | if not TOKEN: 10 | raise RuntimeError("GITHUB_TOKEN environment variable not set") 11 | 12 | BASE_URL = f"https://api.github.com/repos/{REPO}" 13 | RELEASE_URL = f"{BASE_URL}/releases" 14 | HEADERS = { 15 | "Authorization": f"Bearer {TOKEN}", 16 | "Accept": "application/vnd.github+json" 17 | } 18 | 19 | # Check if release already exists 20 | r = requests.get(f"{BASE_URL}/releases/tags/{TAG}", headers=HEADERS) 21 | if r.status_code == 200: 22 | print(f"⚠️ GitHub release for tag {TAG} already exists. Skipping.") 23 | sys.exit(0) 24 | elif r.status_code != 404: 25 | print(f"GitHub release check failed:\n{r.status_code}\n{r.text}") 26 | sys.exit(1) 27 | 28 | # Create release 29 | payload = { 30 | "tag_name": TAG, 31 | "name": f"Release {TAG}", 32 | "body": f"Auto-published release for version {TAG}", 33 | "draft": False, 34 | "prerelease": False 35 | } 36 | 37 | r = requests.post(RELEASE_URL, headers=HEADERS, json=payload) 38 | if r.status_code >= 300: 39 | print(f"GitHub release creation failed:\n{r.status_code}\n{r.text}") 40 | sys.exit(1) 41 | 42 | print(f"✅ GitHub release {TAG} created.") 43 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # SlopRank Tests 2 | 3 | This directory contains test files for the SlopRank library. 
4 | 5 | ## Test Files 6 | 7 | | File | Description | 8 | |------|-------------| 9 | | `test_sloprank.py` | Simple end-to-end test for the SlopRank library | 10 | | `tiny_prompts.csv` | Minimal test prompts with just 2 simple questions | 11 | | `mini_prompts.csv` | Small test prompts with 3 more comprehensive questions | 12 | 13 | ## Running Tests 14 | 15 | To run the basic test: 16 | 17 | ```bash 18 | python test_sloprank.py 19 | ``` 20 | 21 | ### Test Process 22 | 23 | The test will automatically: 24 | 1. Create a test output directory (`test_results/`) 25 | 2. Collect responses from configured models 26 | 3. Collect evaluations between models 27 | 4. Parse evaluations 28 | 5. Build the endorsement graph 29 | 6. Compute the PageRank scores 30 | 7. Output the final rankings 31 | 32 | > **Note:** The full test may take several minutes to complete due to the time required for API calls to language models. 33 | 34 | ## Test Configuration 35 | 36 | The test script uses a simple configuration with: 37 | - 3 models: deepseek-reasoner, claude-3.7-sonnet, and chatgpt-4o 38 | - Simple factual questions to ensure fast responses 39 | - Full evaluation (all models evaluate each other) 40 | 41 | You can modify the test script to use different models, prompts, or evaluation settings. -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=42", "wheel", "build", "twine"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.setuptools] 6 | license-files = [] # override the default behavior 7 | 8 | [project] 9 | name = "sloprank" 10 | version = "0.3.17" 11 | description = "Peer-based LLM cross-evaluation system" 12 | authors = [ 13 | { name = "Rohit Krishnan", email = "rohit.krishnan@gmail.com" } 14 | ] 15 | license = { file = "LICENSE" } 16 | readme = "README.md" 17 | requires-python = ">=3.9" 18 | 19 | dependencies = [ 20 | "bodo>=2024.0.0", 21 | "openpyxl>=3.0.10", 22 | "networkx>=2.8", 23 | "python-dotenv>=0.21.0", 24 | "click>=8.0.0", 25 | "numpy>=1.20.0", 26 | "matplotlib>=3.5.0", 27 | "llm>=0.13.0" 28 | ] 29 | 30 | [project.optional-dependencies] 31 | pandas = [ 32 | "pandas>=1.5.0" 33 | ] 34 | vis = [ 35 | "plotly>=5.5.0", 36 | "kaleido>=0.2.1" # For static image export with plotly 37 | ] 38 | dashboard = [ 39 | "dash>=2.0.0", 40 | "dash-bootstrap-components>=1.0.0" 41 | ] 42 | full = [ 43 | "plotly>=5.5.0", 44 | "kaleido>=0.2.1", 45 | "dash>=2.0.0", 46 | "dash-bootstrap-components>=1.0.0", 47 | "parallm>=0.1.3" 48 | ] 49 | 50 | [project.urls] 51 | Homepage = "https://github.com/strangeloopcanon/LLMRank" 52 | 53 | [tool.setuptools.packages.find] 54 | where = ["."] 55 | include = ["sloprank*"] 56 | exclude = ["results", "results.*"] 57 | 58 | [project.scripts] 59 | sloprank = "sloprank.cli:main" 60 | -------------------------------------------------------------------------------- /results/confidence_stats.json: -------------------------------------------------------------------------------- 1 | { 2 | "confidence_intervals": { 3 | "o1-preview": { 4 | "mean": 0.17906422978195008, 5 | "lower_bound": 0.15586134755557632, 6 | "upper_bound": 0.20028596105851823, 7 | "std_dev": 0.011390744131633145 8 | }, 9 | "claude-3-5-sonnet-latest": { 10 | "mean": 0.1559034710506988, 11 | "lower_bound": 0.1338431787122791, 12 | "upper_bound": 0.17700336456568294, 13 | "std_dev": 0.011074319360773228 14 | }, 15 | "deepseek-chat": { 16 | "mean": 
0.16688348762576946, 17 | "lower_bound": 0.14471972554662413, 18 | "upper_bound": 0.19175975218761088, 19 | "std_dev": 0.012124035815348115 20 | }, 21 | "gpt-4o": { 22 | "mean": 0.17819819894678382, 23 | "lower_bound": 0.15655283702964287, 24 | "upper_bound": 0.2005852449712515, 25 | "std_dev": 0.010975986032101367 26 | }, 27 | "gemini-exp-1206": { 28 | "mean": 0.1549362213590768, 29 | "lower_bound": 0.1336108623981094, 30 | "upper_bound": 0.17961769528814694, 31 | "std_dev": 0.01173552363968152 32 | }, 33 | "gemini-2.0-flash-thinking-exp-1219": { 34 | "mean": 0.16501439123572084, 35 | "lower_bound": 0.14205363291625536, 36 | "upper_bound": 0.18732921920572762, 37 | "std_dev": 0.011653527254343038 38 | } 39 | }, 40 | "significance": { 41 | "o1-preview_vs_gpt-4o": false, 42 | "gpt-4o_vs_deepseek-chat": false, 43 | "deepseek-chat_vs_gemini-2.0-flash-thinking-exp-1219": false, 44 | "gemini-2.0-flash-thinking-exp-1219_vs_claude-3-5-sonnet-latest": false, 45 | "claude-3-5-sonnet-latest_vs_gemini-exp-1206": false 46 | } 47 | } -------------------------------------------------------------------------------- /results/category_analysis.csv: -------------------------------------------------------------------------------- 1 | category,model,average_score,evaluations_count 2 | Reasoning,o1-preview,8.8,30 3 | Reasoning,deepseek-chat,8.766666666666667,30 4 | Reasoning,claude-3-5-sonnet-latest,6.9655172413793105,29 5 | Reasoning,gemini-2.0-flash-thinking-exp-1219,8.206896551724139,29 6 | Reasoning,gemini-exp-1206,8.61111111111111,18 7 | Reasoning,gpt-4o,8.212121212121213,33 8 | Creativity,gpt-4o,7.923076923076923,13 9 | Creativity,gemini-exp-1206,8.833333333333334,6 10 | Creativity,deepseek-chat,8.5,14 11 | Creativity,o1-preview,8.857142857142858,14 12 | Creativity,claude-3-5-sonnet-latest,6.857142857142857,7 13 | Creativity,gemini-2.0-flash-thinking-exp-1219,8.045454545454545,11 14 | Economic,o1-preview,7.5,4 15 | Economic,deepseek-chat,8.0,5 16 | Economic,gemini-exp-1206,8.0,7 17 | Economic,gpt-4o,8.333333333333334,6 18 | Economic,claude-3-5-sonnet-latest,7.888888888888889,9 19 | Economic,gemini-2.0-flash-thinking-exp-1219,7.75,4 20 | Knowledge,deepseek-chat,4.333333333333333,3 21 | Knowledge,gemini-exp-1206,6.571428571428571,7 22 | Knowledge,claude-3-5-sonnet-latest,6.857142857142857,7 23 | Knowledge,gpt-4o,6.166666666666667,6 24 | Knowledge,o1-preview,5.833333333333333,6 25 | Knowledge,gemini-2.0-flash-thinking-exp-1219,7.0,4 26 | Technical,claude-3-5-sonnet-latest,8.0,1 27 | Technical,gemini-2.0-flash-thinking-exp-1219,7.333333333333333,3 28 | Technical,deepseek-chat,8.5,2 29 | Technical,o1-preview,8.666666666666666,3 30 | Technical,gemini-exp-1206,9.25,4 31 | Technical,gpt-4o,7.0,1 32 | Medical,o1-preview,6.2,5 33 | Medical,deepseek-chat,7.166666666666667,6 34 | Medical,gemini-exp-1206,6.714285714285714,7 35 | Medical,claude-3-5-sonnet-latest,5.0,6 36 | Medical,gemini-2.0-flash-thinking-exp-1219,6.142857142857143,7 37 | Medical,gpt-4o,8.5,4 38 | -------------------------------------------------------------------------------- /tests/test_sloprank.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple test script for SlopRank 3 | """ 4 | import pandas as pd 5 | import json 6 | from pathlib import Path 7 | from sloprank.config import EvalConfig, VisualizationConfig 8 | from sloprank.collect import collect_responses, collect_raw_evaluations 9 | from sloprank.parse import parse_evaluation_rows 10 | from sloprank.rank import build_endorsement_graph, 
compute_pagerank, finalize_rankings 11 | 12 | # Use existing tiny_prompts.csv file 13 | prompts_file = Path(__file__).parent / "tiny_prompts.csv" 14 | test_df = pd.read_csv(prompts_file) 15 | prompts = test_df["Questions"].tolist() 16 | 17 | # Define a simple test configuration 18 | config = EvalConfig( 19 | model_names=["deepseek-chat", "claude-3.5-haiku", "gpt-4o"], 20 | evaluation_method=1, # numeric 21 | use_subset_evaluation=False, # All models evaluate each other 22 | evaluators_subset_size=2, # This will be ignored since subset_evaluation is False 23 | output_dir=Path(__file__).parent / "test_results", 24 | request_delay=0.0 25 | ) 26 | 27 | # Create output directory 28 | config.output_dir.mkdir(exist_ok=True) 29 | 30 | # Create prompt pairs (prompt, answer_key) 31 | prompt_pairs = [(prompt, "") for prompt in prompts] 32 | 33 | # Collect responses 34 | print(f"Collecting responses from {len(config.model_names)} models for {len(prompts)} prompts...") 35 | responses_df = collect_responses(prompt_pairs, config) 36 | responses_df.to_csv(config.output_dir / "responses.csv", index=False) 37 | print(f"Saved responses to {config.output_dir}/responses.csv") 38 | 39 | # Collect evaluations 40 | print("Collecting evaluations...") 41 | raw_evaluations_df = collect_raw_evaluations(responses_df, config) 42 | raw_evaluations_df.to_csv(config.output_dir / "raw_evaluations.csv", index=False) 43 | print(f"Saved raw evaluations to {config.output_dir}/raw_evaluations.csv") 44 | 45 | # Parse evaluations 46 | print("Parsing evaluations...") 47 | evaluations_df = parse_evaluation_rows(raw_evaluations_df, config) 48 | evaluations_df.to_csv(config.output_dir / "evaluations.csv", index=False) 49 | print(f"Saved parsed evaluations to {config.output_dir}/evaluations.csv") 50 | 51 | # Build graph and compute rankings 52 | print("Building graph and computing rankings...") 53 | G = build_endorsement_graph(evaluations_df, config) 54 | pagerank_scores = compute_pagerank(G) 55 | rankings = finalize_rankings(pagerank_scores, config, G, evaluations_df) 56 | 57 | # Save rankings to file 58 | rankings_file = config.output_dir / "rankings.json" 59 | with open(rankings_file, "w") as f: 60 | json.dump(rankings, f, indent=4) 61 | print(f"Saved rankings to {rankings_file}") 62 | 63 | print("Test completed successfully!") -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # SlopRank Example Scripts 2 | 3 | This directory contains standalone scripts that demonstrate each of the advanced features of SlopRank. These scripts can be run individually after running the main SlopRank tool. 4 | 5 | ## Available Scripts 6 | 7 | ### 1. Graph Visualization (`generate_visualization.py`) 8 | 9 | Creates visual representations of the model endorsement network: 10 | 11 | ```bash 12 | python examples/generate_visualization.py 13 | ``` 14 | 15 | **Outputs:** 16 | - Static PNG visualization: `results/visualizations/endorsement_graph.png` 17 | - GraphML file: `results/visualizations/endorsement_graph.gml` 18 | 19 | ### 2. 
Confidence Intervals (`compute_confidence.py`) 20 | 21 | Uses bootstrap resampling to estimate statistical reliability: 22 | 23 | ```bash 24 | python examples/compute_confidence.py 25 | ``` 26 | 27 | **Outputs:** 28 | - `results/confidence_stats.json` containing: 29 | - Confidence intervals for each model's PageRank score 30 | - Statistical significance tests between adjacent ranks 31 | 32 | ### 3. Prompt Categorization (`prompt_categorization.py`) 33 | 34 | Automatically categorizes prompts and provides per-category rankings: 35 | 36 | ```bash 37 | python examples/prompt_categorization.py 38 | ``` 39 | 40 | **Outputs:** 41 | - Categorized version of your prompts file 42 | - Per-category rankings in `results/category_rankings.json` 43 | - CSV analysis in `results/category_analysis.csv` 44 | 45 | ### 4. Interactive Dashboard 46 | 47 | #### Dashboard Generation (`generate_dashboard.py`) 48 | Creates an HTML dashboard from all the results: 49 | 50 | ```bash 51 | python examples/generate_dashboard.py 52 | ``` 53 | 54 | #### Dashboard Server (`dashboard.py`) 55 | Starts a local server to view the dashboard: 56 | 57 | ```bash 58 | python examples/dashboard.py 59 | ``` 60 | 61 | ## Recommended Workflow 62 | 63 | For the best experience, run the tools in this order: 64 | 65 | 1. Run SlopRank: `sloprank --prompts prompts.csv --output-dir results` 66 | 2. Generate visualizations: `python examples/generate_visualization.py` 67 | 3. Compute confidence intervals: `python examples/compute_confidence.py` 68 | 4. Analyze categories: `python examples/prompt_categorization.py` 69 | 5. Generate dashboard: `python examples/generate_dashboard.py` 70 | 6. View the dashboard: `python examples/dashboard.py` 71 | 72 | ## Integrated CLI Alternative 73 | 74 | All these features are now integrated into the main `sloprank` CLI tool: 75 | 76 | ```bash 77 | sloprank run --prompts prompts.csv --output-dir results --visualize --confidence --categories --dashboard 78 | ``` 79 | 80 | These standalone example scripts are provided for educational purposes and for users who want to use each feature independently. 
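
For readers who prefer to drive the pipeline from Python rather than through the CLI or the scripts above, here is a minimal sketch of the same flow, based on the API exercised in `tests/test_sloprank.py`. The model list is illustrative, and the prompts file is assumed to use a `Questions` column as in `tests/tiny_prompts.csv`:

```python
import json
from pathlib import Path

import pandas as pd

from sloprank.config import EvalConfig
from sloprank.collect import collect_responses, collect_raw_evaluations
from sloprank.parse import parse_evaluation_rows
from sloprank.rank import build_endorsement_graph, compute_pagerank, finalize_rankings

# Illustrative configuration; all five leading fields are required by EvalConfig.
config = EvalConfig(
    model_names=["gpt-4o", "claude-3-5-sonnet-latest", "deepseek-chat"],
    evaluation_method=1,          # 1 => numeric ratings
    use_subset_evaluation=False,  # every model judges every other model
    evaluators_subset_size=2,     # ignored when use_subset_evaluation is False
    output_dir=Path("results"),
)

# Assumes a "Questions" column, matching tests/tiny_prompts.csv.
prompts = pd.read_csv("prompts.csv")["Questions"].tolist()
prompt_pairs = [(prompt, "") for prompt in prompts]  # (prompt, answer_key) pairs

# Collect responses, have the models judge each other, then parse the judgments.
responses_df = collect_responses(prompt_pairs, config)
raw_evals_df = collect_raw_evaluations(responses_df, config)
evals_df = parse_evaluation_rows(raw_evals_df, config)

# Build the endorsement graph, run PageRank, and write out the rankings.
G = build_endorsement_graph(evals_df, config)
rankings = finalize_rankings(compute_pagerank(G), config, G, evals_df)

with open(config.output_dir / "rankings.json", "w") as f:
    json.dump(rankings, f, indent=2)
```

Visualization, confidence intervals, categorization, and the dashboard can then be layered on top of these outputs, exactly as the standalone scripts above do.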
-------------------------------------------------------------------------------- /results/endorsement_graph.gml: -------------------------------------------------------------------------------- 1 | graph [ 2 | directed 1 3 | node [ 4 | id 0 5 | label "gemini-2.0-flash-thinking-exp-1219" 6 | ] 7 | node [ 8 | id 1 9 | label "gemini-exp-1206" 10 | ] 11 | node [ 12 | id 2 13 | label "claude-3-5-sonnet-latest" 14 | ] 15 | node [ 16 | id 3 17 | label "o1-preview" 18 | ] 19 | node [ 20 | id 4 21 | label "gpt-4o" 22 | ] 23 | node [ 24 | id 5 25 | label "deepseek-chat" 26 | ] 27 | edge [ 28 | source 0 29 | target 3 30 | weight 138.0 31 | ] 32 | edge [ 33 | source 0 34 | target 5 35 | weight 173.0 36 | ] 37 | edge [ 38 | source 0 39 | target 2 40 | weight 113.0 41 | ] 42 | edge [ 43 | source 0 44 | target 1 45 | weight 89.0 46 | ] 47 | edge [ 48 | source 0 49 | target 4 50 | weight 130.0 51 | ] 52 | edge [ 53 | source 1 54 | target 2 55 | weight 129.0 56 | ] 57 | edge [ 58 | source 1 59 | target 0 60 | weight 188.0 61 | ] 62 | edge [ 63 | source 1 64 | target 5 65 | weight 183.0 66 | ] 67 | edge [ 68 | source 1 69 | target 4 70 | weight 180.0 71 | ] 72 | edge [ 73 | source 1 74 | target 3 75 | weight 148.0 76 | ] 77 | edge [ 78 | source 2 79 | target 3 80 | weight 248.0 81 | ] 82 | edge [ 83 | source 2 84 | target 0 85 | weight 162.0 86 | ] 87 | edge [ 88 | source 2 89 | target 1 90 | weight 160.0 91 | ] 92 | edge [ 93 | source 2 94 | target 4 95 | weight 166.0 96 | ] 97 | edge [ 98 | source 2 99 | target 5 100 | weight 104.0 101 | ] 102 | edge [ 103 | source 3 104 | target 0 105 | weight 131.0 106 | ] 107 | edge [ 108 | source 3 109 | target 5 110 | weight 129.0 111 | ] 112 | edge [ 113 | source 3 114 | target 1 115 | weight 144.0 116 | ] 117 | edge [ 118 | source 3 119 | target 4 120 | weight 157.0 121 | ] 122 | edge [ 123 | source 3 124 | target 2 125 | weight 139.0 126 | ] 127 | edge [ 128 | source 4 129 | target 0 130 | weight 155.0 131 | ] 132 | edge [ 133 | source 4 134 | target 5 135 | weight 146.0 136 | ] 137 | edge [ 138 | source 4 139 | target 2 140 | weight 146.0 141 | ] 142 | edge [ 143 | source 4 144 | target 3 145 | weight 129.0 146 | ] 147 | edge [ 148 | source 4 149 | target 1 150 | weight 141.0 151 | ] 152 | edge [ 153 | source 5 154 | target 4 155 | weight 212.0 156 | ] 157 | edge [ 158 | source 5 159 | target 0 160 | weight 135.5 161 | ] 162 | edge [ 163 | source 5 164 | target 3 165 | weight 203.0 166 | ] 167 | edge [ 168 | source 5 169 | target 1 170 | weight 142.0 171 | ] 172 | edge [ 173 | source 5 174 | target 2 175 | weight 143.0 176 | ] 177 | ] 178 | -------------------------------------------------------------------------------- /examples/generate_visualization.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import json 3 | import pandas as pd 4 | import numpy as np 5 | import networkx as nx 6 | import matplotlib.pyplot as plt 7 | from pathlib import Path 8 | 9 | def generate_visualization(): 10 | # Create visualization directory if it doesn't exist 11 | vis_dir = Path("results/visualizations") 12 | vis_dir.mkdir(parents=True, exist_ok=True) 13 | 14 | # Load rankings 15 | rankings_path = Path("results/rankings.json") 16 | with open(rankings_path, 'r') as f: 17 | rankings_data = json.load(f) 18 | 19 | # Load evaluations data 20 | evals_path = Path("results/evaluations.csv") 21 | evals_df = pd.read_csv(evals_path) 22 | 23 | # Filter out failed evaluations 24 | evals_df = evals_df[evals_df["parse_failed"] == 
False] 25 | 26 | # Build graph 27 | G = nx.DiGraph() 28 | 29 | # Add nodes from rankings 30 | for model_entry in rankings_data["rankings"]: 31 | model = model_entry[0] 32 | score = model_entry[1] 33 | G.add_node(model, pagerank=score) 34 | 35 | # Add edges from evaluations 36 | for _, row in evals_df.iterrows(): 37 | judge = row["judge_model"] 38 | rated = row["rated_model"] 39 | score = float(row["score"]) 40 | 41 | if G.has_edge(judge, rated): 42 | G[judge][rated]["weight"] += score 43 | else: 44 | G.add_edge(judge, rated, weight=score) 45 | 46 | # Normalize edge weights for visualization 47 | max_weight = max([G[u][v]["weight"] for u, v in G.edges()]) 48 | for u, v in G.edges(): 49 | G[u][v]["normalized_weight"] = G[u][v]["weight"] / max_weight 50 | 51 | # Create visualizations 52 | 53 | # 1. Static graph visualization 54 | plt.figure(figsize=(12, 10)) 55 | 56 | # Calculate position using spring layout 57 | pos = nx.spring_layout(G, seed=42) 58 | 59 | # Get pagerank scores 60 | pagerank_scores = {node: G.nodes[node].get('pagerank', 0.1) for node in G.nodes()} 61 | 62 | # Draw nodes 63 | node_sizes = [pagerank_scores[node] * 5000 for node in G.nodes()] 64 | node_colors = list(pagerank_scores.values()) 65 | 66 | nx.draw_networkx_nodes( 67 | G, pos, 68 | node_size=node_sizes, 69 | node_color=node_colors, 70 | cmap=plt.cm.viridis, 71 | alpha=0.8 72 | ) 73 | 74 | # Draw edges 75 | edge_widths = [G[u][v].get('normalized_weight', 0.1) * 5 for u, v in G.edges()] 76 | 77 | nx.draw_networkx_edges( 78 | G, pos, 79 | width=edge_widths, 80 | alpha=0.6, 81 | edge_color='gray', 82 | arrows=True, 83 | arrowstyle='-|>', 84 | arrowsize=20 85 | ) 86 | 87 | # Draw labels 88 | nx.draw_networkx_labels( 89 | G, pos, 90 | font_size=12, 91 | font_weight='bold' 92 | ) 93 | 94 | # Add title 95 | plt.title("LLM Endorsement Graph (Node size = PageRank score, Edge width = Endorsement strength)") 96 | plt.axis('off') 97 | 98 | # Save the figure 99 | plt.tight_layout() 100 | plt.savefig(vis_dir / "endorsement_graph.png", dpi=300, bbox_inches='tight') 101 | plt.close() 102 | 103 | # 2. 
Save graph in GML format 104 | nx.write_gml(G, vis_dir / "endorsement_graph.gml") 105 | 106 | print(f"Visualizations saved to {vis_dir}") 107 | 108 | if __name__ == "__main__": 109 | generate_visualization() -------------------------------------------------------------------------------- /results/category_rankings.json: -------------------------------------------------------------------------------- 1 | { 2 | "Creativity": [ 3 | { 4 | "model": "o1-preview", 5 | "score": 8.857142857142858 6 | }, 7 | { 8 | "model": "gemini-exp-1206", 9 | "score": 8.833333333333334 10 | }, 11 | { 12 | "model": "deepseek-chat", 13 | "score": 8.5 14 | }, 15 | { 16 | "model": "gemini-2.0-flash-thinking-exp-1219", 17 | "score": 8.045454545454545 18 | }, 19 | { 20 | "model": "gpt-4o", 21 | "score": 7.923076923076923 22 | }, 23 | { 24 | "model": "claude-3-5-sonnet-latest", 25 | "score": 6.857142857142857 26 | } 27 | ], 28 | "Economic": [ 29 | { 30 | "model": "gpt-4o", 31 | "score": 8.333333333333334 32 | }, 33 | { 34 | "model": "deepseek-chat", 35 | "score": 8.0 36 | }, 37 | { 38 | "model": "gemini-exp-1206", 39 | "score": 8.0 40 | }, 41 | { 42 | "model": "claude-3-5-sonnet-latest", 43 | "score": 7.888888888888889 44 | }, 45 | { 46 | "model": "gemini-2.0-flash-thinking-exp-1219", 47 | "score": 7.75 48 | }, 49 | { 50 | "model": "o1-preview", 51 | "score": 7.5 52 | } 53 | ], 54 | "Knowledge": [ 55 | { 56 | "model": "gemini-2.0-flash-thinking-exp-1219", 57 | "score": 7.0 58 | }, 59 | { 60 | "model": "claude-3-5-sonnet-latest", 61 | "score": 6.857142857142857 62 | }, 63 | { 64 | "model": "gemini-exp-1206", 65 | "score": 6.571428571428571 66 | }, 67 | { 68 | "model": "gpt-4o", 69 | "score": 6.166666666666667 70 | }, 71 | { 72 | "model": "o1-preview", 73 | "score": 5.833333333333333 74 | }, 75 | { 76 | "model": "deepseek-chat", 77 | "score": 4.333333333333333 78 | } 79 | ], 80 | "Medical": [ 81 | { 82 | "model": "gpt-4o", 83 | "score": 8.5 84 | }, 85 | { 86 | "model": "deepseek-chat", 87 | "score": 7.166666666666667 88 | }, 89 | { 90 | "model": "gemini-exp-1206", 91 | "score": 6.714285714285714 92 | }, 93 | { 94 | "model": "o1-preview", 95 | "score": 6.2 96 | }, 97 | { 98 | "model": "gemini-2.0-flash-thinking-exp-1219", 99 | "score": 6.142857142857143 100 | }, 101 | { 102 | "model": "claude-3-5-sonnet-latest", 103 | "score": 5.0 104 | } 105 | ], 106 | "Reasoning": [ 107 | { 108 | "model": "o1-preview", 109 | "score": 8.8 110 | }, 111 | { 112 | "model": "deepseek-chat", 113 | "score": 8.766666666666667 114 | }, 115 | { 116 | "model": "gemini-exp-1206", 117 | "score": 8.61111111111111 118 | }, 119 | { 120 | "model": "gpt-4o", 121 | "score": 8.212121212121213 122 | }, 123 | { 124 | "model": "gemini-2.0-flash-thinking-exp-1219", 125 | "score": 8.206896551724139 126 | }, 127 | { 128 | "model": "claude-3-5-sonnet-latest", 129 | "score": 6.9655172413793105 130 | } 131 | ], 132 | "Technical": [ 133 | { 134 | "model": "gemini-exp-1206", 135 | "score": 9.25 136 | }, 137 | { 138 | "model": "o1-preview", 139 | "score": 8.666666666666666 140 | }, 141 | { 142 | "model": "deepseek-chat", 143 | "score": 8.5 144 | }, 145 | { 146 | "model": "claude-3-5-sonnet-latest", 147 | "score": 8.0 148 | }, 149 | { 150 | "model": "gemini-2.0-flash-thinking-exp-1219", 151 | "score": 7.333333333333333 152 | }, 153 | { 154 | "model": "gpt-4o", 155 | "score": 7.0 156 | } 157 | ] 158 | } -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | *.pyc 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 106 | __pypackages__/ 107 | 108 | # Celery stuff 109 | celerybeat-schedule 110 | celerybeat.pid 111 | 112 | # SageMath parsed files 113 | *.sage.py 114 | 115 | # Environments 116 | .env 117 | .venv 118 | env/ 119 | venv/ 120 | ENV/ 121 | env.bak/ 122 | venv.bak/ 123 | .venv_llm/ 124 | 125 | # Spyder project settings 126 | .spyderproject 127 | .spyproject 128 | 129 | # Rope project settings 130 | .ropeproject 131 | 132 | # mkdocs documentation 133 | /site 134 | 135 | # mypy 136 | .mypy_cache/ 137 | .dmypy.json 138 | dmypy.json 139 | 140 | # Pyre type checker 141 | .pyre/ 142 | 143 | # pytype static type analyzer 144 | .pytype/ 145 | 146 | # Cython debug symbols 147 | cython_debug/ 148 | 149 | # PyCharm 150 | # JetBrains specific template is maintainted in a separate JetBrains.gitignore that can 151 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 152 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear 153 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 154 | #.idea/ 155 | .DS_Store 156 | \# older_scripts 157 | \#Archive/* 158 | 159 | # Ignore private PyPI config 160 | .pypirc 161 | 162 | # Ignore Claude's reference file 163 | CLAUDE.md 164 | 165 | # Ignore test output files 166 | tests/test_results/ 167 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to SlopRank will be documented in this file. 4 | 5 | ## [0.3.15] - 2025-09-10 6 | 7 | ### 🚀 **MAJOR PERFORMANCE UPGRADE: Bodo-First Architecture** 8 | 9 | ### Added 10 | - **Bodo-First Installation**: Bodo is now included by default in standard installation 11 | - **Switchable Backend System**: Environment variable control for backend selection 12 | - **Automatic Performance**: 3-5x speedup by default, no configuration required 13 | - **High-Performance Processing**: JIT compilation and parallel DataFrame operations 14 | - **Memory Optimization**: 50-70% reduction in memory usage for large datasets 15 | - **Enhanced Error Handling**: Robust fallback mechanisms for both backends 16 | - **CLI Backend Command**: `sloprank backend` to check and manage backend configuration 17 | 18 | ### Changed 19 | - **BREAKING**: Bodo is now the default dependency (included in `pip install sloprank`) 20 | - **Installation Model**: `pip install sloprank` → includes Bodo, `pip install sloprank[pandas]` → compatibility mode 21 | - **Dependencies**: Bodo moved from optional to core dependency for maximum performance by default 22 | - **Backend Priority**: Auto-detection now prefers Bodo (included) over pandas (fallback) 23 | - **Core Processing**: Intelligent backend-aware DataFrame operations 24 | 25 | ### Removed 26 | - **ParaLLM dependency**: Eliminated intermediate layer for better performance and reliability 27 | - **Performance barriers**: Users no longer need to know about Bodo to get maximum performance 28 | 29 | ### Migration Notes 30 | - **New Users**: Get 3-5x performance automatically with `pip install sloprank` 31 | - **Existing Users**: Upgrading provides automatic performance improvements 32 | - **Compatibility**: Use `SLOPRANK_USE_BODO=false` or `pip install sloprank[pandas]` for pandas-only mode 33 | - **No API Changes**: All existing commands and workflows remain the same 34 | 35 | ## [0.3.11] - 2025-09-09 36 | 37 | ### Changed 38 | - Updated default model list to latest: `gpt-5`, `claude-4-sonnet`, `gemini-2.5-pro`, `deepseek-chat`. 39 | - Relaxed `llm` dependency to `llm>=0.23` to support latest providers. 40 | - Rankings JSON now includes package `version` metadata. 41 | - README updated with latest model examples and `llm`/parallm notes. 42 | 43 | ### Added 44 | - Offline smoke test (`tests/test_smoke.py`) to exercise parsing, graph, and ranking without network calls. 45 | 46 | ### Fixed 47 | - Resolved version mismatch between `pyproject.toml` and `sloprank/__init__.py`. 
48 | 49 | ## [0.2.3] - 2025-02-28 50 | 51 | ### Added 52 | - Tests directory with simple test scripts and example prompts 53 | - Test README with documentation on how to run tests 54 | 55 | ### Fixed 56 | - Improved error handling for subset evaluation configuration 57 | - Automatic adjustment of evaluators_subset_size when too large for the number of models 58 | - Added support for new model versions (Claude-3.7-Sonnet, ChatGPT-4o, Deepseek-Reasoner) 59 | 60 | ## [0.2.2] - 2025-01-14 61 | 62 | ### Added 63 | - Support for graph visualization of model endorsements 64 | - Confidence interval calculations for rankings 65 | - Category analysis for prompt-specific performance 66 | 67 | ### Changed 68 | - Improved API error handling 69 | - Enhanced CLI interface with additional options 70 | 71 | ## [0.2.1] - 2025-01-03 72 | 73 | ### Added 74 | - Dashboard features for interactive exploration 75 | - Visualization improvements 76 | 77 | ### Fixed 78 | - Bug fixes in PageRank calculation 79 | - Better error handling for API timeouts 80 | 81 | ## [0.2.0] - 2024-12-20 82 | 83 | ### Added 84 | - Complete rewrite with modular architecture 85 | - Support for multiple evaluation methods 86 | - Export options for results 87 | 88 | ## [0.1.0] - 2024-12-01 89 | 90 | ### Added 91 | - Initial release 92 | - Basic implementation of peer-based LLM evaluation 93 | - PageRank algorithm for ranking models 94 | -------------------------------------------------------------------------------- /tests/test_smoke.py: -------------------------------------------------------------------------------- 1 | """ 2 | Smoke test that exercises the core pipeline without network calls. 3 | 4 | It fabricates raw evaluations with valid JSON so parsing, graph 5 | construction, PageRank, and finalization can run entirely offline. 6 | """ 7 | from pathlib import Path 8 | import json as _json 9 | 10 | import pandas as pd 11 | 12 | from sloprank.config import EvalConfig, VisualizationConfig, ConfidenceConfig 13 | from sloprank.parse import parse_evaluation_rows 14 | from sloprank.rank import build_endorsement_graph, compute_pagerank, finalize_rankings 15 | from sloprank import __version__ as PKG_VERSION 16 | 17 | 18 | def _make_mock_raw_evaluations(models, prompts): 19 | """Create a DataFrame that mimics raw_evaluations.csv rows. 20 | 21 | Each judge rates the other two models for each prompt with JSON like 22 | {"Model_1": 9, "Model_2": 7} mapped consistently via model_mapping. 
23 | """ 24 | rows = [] 25 | 26 | # Preferred ranking for determinism 27 | model_preference = { 28 | "gpt-5": 9.0, 29 | "claude-4-sonnet": 7.0, 30 | "gemini-2.5-pro": 5.0, 31 | } 32 | 33 | for prompt in prompts: 34 | for judge in models: 35 | others = [m for m in models if m != judge] 36 | # Keep stable order for mapping 37 | others_sorted = others # already in order of models list passed in 38 | mapping = {m: f"Model_{i+1}" for i, m in enumerate(others_sorted)} 39 | # Build anon score dict using preferences 40 | anon_scores = {mapping[m]: float(model_preference.get(m, 6.0)) for m in others_sorted} 41 | raw_json = _json.dumps(anon_scores) 42 | 43 | rows.append({ 44 | "prompt": prompt, 45 | "judge_model": judge, 46 | "raw_judgment": raw_json, 47 | "model_mapping": _json.dumps(mapping, sort_keys=True), 48 | "raw_judgment_token_count": len(raw_json.split()) 49 | }) 50 | 51 | return pd.DataFrame(rows) 52 | 53 | 54 | def test_smoke_offline(tmp_path: Path = None): 55 | # Test config and output directory 56 | out_dir = tmp_path if tmp_path else Path(__file__).parent / "smoke_results" 57 | out_dir.mkdir(parents=True, exist_ok=True) 58 | 59 | models = ["gpt-5", "claude-4-sonnet", "gemini-2.5-pro"] 60 | prompts = [ 61 | "What is 2+2?", 62 | "Name a primary color.", 63 | ] 64 | 65 | # Build minimal config with heavy features disabled 66 | config = EvalConfig( 67 | model_names=models, 68 | evaluation_method=1, 69 | use_subset_evaluation=False, 70 | evaluators_subset_size=2, 71 | output_dir=out_dir, 72 | visualization=VisualizationConfig(enabled=False), 73 | confidence=ConfidenceConfig(enabled=False), 74 | ) 75 | 76 | # Fabricate raw evaluations and parse them 77 | raw_eval_df = _make_mock_raw_evaluations(models, prompts) 78 | parsed_df = parse_evaluation_rows(raw_eval_df, config) 79 | 80 | # Ensure parsing yielded rows for all judged pairs 81 | assert not parsed_df.empty 82 | assert set(parsed_df.columns) >= {"prompt", "judge_model", "rated_model", "score", "parse_failed"} 83 | 84 | # Build graph and compute rankings 85 | G = build_endorsement_graph(parsed_df, config) 86 | pr = compute_pagerank(G) 87 | assert pr and all(m in pr for m in models) 88 | 89 | # Finalize (writes JSON file) 90 | finalize_rankings(pr, config, G=G, evaluations_df=parsed_df) 91 | 92 | rankings_path = out_dir / "rankings.json" 93 | assert rankings_path.exists() 94 | data = _json.loads(rankings_path.read_text()) 95 | assert "rankings" in data and len(data["rankings"]) == len(models) 96 | assert data.get("metadata", {}).get("version") == PKG_VERSION 97 | 98 | 99 | if __name__ == "__main__": 100 | # Run directly 101 | test_smoke_offline() 102 | print("Smoke test completed successfully.") 103 | 104 | -------------------------------------------------------------------------------- /COMPATIBILITY_NOTES.md: -------------------------------------------------------------------------------- 1 | # SlopRank Compatibility Notes 2 | 3 | ## Migration to Bodo-Direct Approach 4 | 5 | ### Issue Description 6 | Recent updates to [ParaLLM](https://github.com/strangeloopcanon/ParaLLM) included a dependency on Bodo 2025.8.2, which had compatibility issues with certain Python environments. The error manifested as: 7 | 8 | ``` 9 | AttributeError: module 'dis' has no attribute 'hasarg' 10 | ``` 11 | 12 | This occurred when Bodo tried to access Python bytecode inspection features that have changed in recent Python versions. 
13 | 14 | ### Resolution Implemented 15 | 16 | SlopRank v0.3.15+ has been updated to use **Bodo directly** instead of relying on ParaLLM as an intermediary. This provides: 17 | 18 | 1. **Direct Bodo Integration**: Uses `bodo.pandas` throughout the codebase for DataFrame operations 19 | 2. **Parallel Processing**: Leverages Bodo's native parallel processing capabilities 20 | 3. **Simplified Dependencies**: Removes ParaLLM dependency complexity 21 | 4. **Better Performance**: Direct Bodo usage with optimized operations 22 | 5. **LLM Library Integration**: Uses Simon Willison's `llm` library for model access 23 | 24 | ### Code Changes Made 25 | 26 | #### 1. Direct Bodo Integration (`sloprank/collect.py`, `sloprank/cli.py`) 27 | - Replaced `import pandas as pd` with `import bodo.pandas as pd` 28 | - Removed all ParaLLM dependencies and imports 29 | - Added hybrid pandas/Bodo approach for complex operations (filtering, concatenation) 30 | - Implemented proper Bodo DataFrame type handling with schema specifications 31 | 32 | #### 2. Dependency Simplification (`pyproject.toml`) 33 | - Removed `parallm>=0.1.3` dependency 34 | - Added `bodo>=2024.0.0` as core dependency 35 | - Added `llm>=0.13.0` for model access 36 | - Updated minimum Python version to 3.9+ (required for Bodo) 37 | 38 | #### 3. LLM Integration 39 | - Direct integration with Simon Willison's `llm` library 40 | - Simplified model querying without ParaLLM wrapper 41 | - Better error handling for unknown models 42 | 43 | ### Alternative Solutions 44 | 45 | If you continue experiencing issues, try these approaches: 46 | 47 | #### Option 1: Downgrade Bodo (Recommended for ParaLLM users) 48 | ```bash 49 | pip install "bodo<2025.0.0" 50 | ``` 51 | 52 | #### Option 2: Use ParaLLM without Bodo features 53 | The current implementation automatically handles this fallback. 54 | 55 | #### Option 3: Use llm library directly 56 | Install and configure Simon Willison's llm library: 57 | ```bash 58 | pip install llm 59 | llm keys set openai 60 | llm keys set anthropic 61 | ``` 62 | 63 | ### Verification 64 | 65 | After the fixes, SlopRank should work correctly: 66 | 67 | ```bash 68 | # Test basic functionality 69 | sloprank --help 70 | 71 | # Test with a small prompt set 72 | sloprank run --prompts prompts.csv --output-dir test_results --models "mock-model" 73 | ``` 74 | 75 | ### Expected Behavior 76 | 77 | When Bodo compatibility issues occur: 78 | 1. SlopRank logs an error message about ParaLLM batch processing 79 | 2. Automatically falls back to individual model queries 80 | 3. Continues normal operation with slightly reduced performance 81 | 4. All core features (ranking, visualization, confidence intervals) remain functional 82 | 83 | ### Performance Impact 84 | 85 | The fallback approach has minimal performance impact: 86 | - **With ParaLLM**: Parallel batch processing (fastest) 87 | - **Fallback mode**: Sequential individual queries (slightly slower, but still functional) 88 | - **Mock mode**: Uses placeholder responses for testing 89 | 90 | ### Reporting Issues 91 | 92 | If you encounter related issues: 93 | 1. Check that you're using SlopRank v0.3.15+ 94 | 2. Include the full error traceback 95 | 3. Mention your Python version and OS 96 | 4. Note which ParaLLM version you're using 97 | 98 | ### Future Improvements 99 | 100 | Planned enhancements: 101 | 1. Optional ParaLLM dependency 102 | 2. Native parallel processing without Bodo 103 | 3. Enhanced model provider support 104 | 4. 
Better fallback performance optimization 105 | -------------------------------------------------------------------------------- /docs/visualizations/endorsement_graph.gml: -------------------------------------------------------------------------------- 1 | graph [ 2 | directed 1 3 | node [ 4 | id 0 5 | label "o1-preview" 6 | pagerank 0.17940361409787733 7 | ] 8 | node [ 9 | id 1 10 | label "gpt-4o" 11 | pagerank 0.17830451744580658 12 | ] 13 | node [ 14 | id 2 15 | label "deepseek-chat" 16 | pagerank 0.1671054138317305 17 | ] 18 | node [ 19 | id 3 20 | label "gemini-2.0-flash-thinking-exp-1219" 21 | pagerank 0.16473186403675355 22 | ] 23 | node [ 24 | id 4 25 | label "claude-3-5-sonnet-latest" 26 | pagerank 0.15557086205954448 27 | ] 28 | node [ 29 | id 5 30 | label "gemini-exp-1206" 31 | pagerank 0.15488372852828722 32 | ] 33 | edge [ 34 | source 0 35 | target 3 36 | weight 131.0 37 | normalized_weight 0.5282258064516129 38 | ] 39 | edge [ 40 | source 0 41 | target 2 42 | weight 129.0 43 | normalized_weight 0.5201612903225806 44 | ] 45 | edge [ 46 | source 0 47 | target 5 48 | weight 144.0 49 | normalized_weight 0.5806451612903226 50 | ] 51 | edge [ 52 | source 0 53 | target 1 54 | weight 157.0 55 | normalized_weight 0.6330645161290323 56 | ] 57 | edge [ 58 | source 0 59 | target 4 60 | weight 139.0 61 | normalized_weight 0.5604838709677419 62 | ] 63 | edge [ 64 | source 1 65 | target 3 66 | weight 155.0 67 | normalized_weight 0.625 68 | ] 69 | edge [ 70 | source 1 71 | target 2 72 | weight 146.0 73 | normalized_weight 0.5887096774193549 74 | ] 75 | edge [ 76 | source 1 77 | target 4 78 | weight 146.0 79 | normalized_weight 0.5887096774193549 80 | ] 81 | edge [ 82 | source 1 83 | target 0 84 | weight 129.0 85 | normalized_weight 0.5201612903225806 86 | ] 87 | edge [ 88 | source 1 89 | target 5 90 | weight 141.0 91 | normalized_weight 0.5685483870967742 92 | ] 93 | edge [ 94 | source 2 95 | target 1 96 | weight 212.0 97 | normalized_weight 0.8548387096774194 98 | ] 99 | edge [ 100 | source 2 101 | target 3 102 | weight 135.5 103 | normalized_weight 0.5463709677419355 104 | ] 105 | edge [ 106 | source 2 107 | target 0 108 | weight 203.0 109 | normalized_weight 0.8185483870967742 110 | ] 111 | edge [ 112 | source 2 113 | target 5 114 | weight 142.0 115 | normalized_weight 0.5725806451612904 116 | ] 117 | edge [ 118 | source 2 119 | target 4 120 | weight 143.0 121 | normalized_weight 0.5766129032258065 122 | ] 123 | edge [ 124 | source 3 125 | target 0 126 | weight 138.0 127 | normalized_weight 0.5564516129032258 128 | ] 129 | edge [ 130 | source 3 131 | target 2 132 | weight 173.0 133 | normalized_weight 0.6975806451612904 134 | ] 135 | edge [ 136 | source 3 137 | target 4 138 | weight 113.0 139 | normalized_weight 0.45564516129032256 140 | ] 141 | edge [ 142 | source 3 143 | target 5 144 | weight 89.0 145 | normalized_weight 0.3588709677419355 146 | ] 147 | edge [ 148 | source 3 149 | target 1 150 | weight 130.0 151 | normalized_weight 0.5241935483870968 152 | ] 153 | edge [ 154 | source 4 155 | target 0 156 | weight 248.0 157 | normalized_weight 1.0 158 | ] 159 | edge [ 160 | source 4 161 | target 3 162 | weight 162.0 163 | normalized_weight 0.6532258064516129 164 | ] 165 | edge [ 166 | source 4 167 | target 5 168 | weight 160.0 169 | normalized_weight 0.6451612903225806 170 | ] 171 | edge [ 172 | source 4 173 | target 1 174 | weight 166.0 175 | normalized_weight 0.6693548387096774 176 | ] 177 | edge [ 178 | source 4 179 | target 2 180 | weight 104.0 181 | normalized_weight 
0.41935483870967744 182 | ] 183 | edge [ 184 | source 5 185 | target 4 186 | weight 129.0 187 | normalized_weight 0.5201612903225806 188 | ] 189 | edge [ 190 | source 5 191 | target 3 192 | weight 188.0 193 | normalized_weight 0.7580645161290323 194 | ] 195 | edge [ 196 | source 5 197 | target 2 198 | weight 183.0 199 | normalized_weight 0.7379032258064516 200 | ] 201 | edge [ 202 | source 5 203 | target 1 204 | weight 180.0 205 | normalized_weight 0.7258064516129032 206 | ] 207 | edge [ 208 | source 5 209 | target 0 210 | weight 148.0 211 | normalized_weight 0.5967741935483871 212 | ] 213 | ] 214 | -------------------------------------------------------------------------------- /results/visualizations/endorsement_graph.gml: -------------------------------------------------------------------------------- 1 | graph [ 2 | directed 1 3 | node [ 4 | id 0 5 | label "o1-preview" 6 | pagerank 0.17940361409787733 7 | ] 8 | node [ 9 | id 1 10 | label "gpt-4o" 11 | pagerank 0.17830451744580658 12 | ] 13 | node [ 14 | id 2 15 | label "deepseek-chat" 16 | pagerank 0.1671054138317305 17 | ] 18 | node [ 19 | id 3 20 | label "gemini-2.0-flash-thinking-exp-1219" 21 | pagerank 0.16473186403675355 22 | ] 23 | node [ 24 | id 4 25 | label "claude-3-5-sonnet-latest" 26 | pagerank 0.15557086205954448 27 | ] 28 | node [ 29 | id 5 30 | label "gemini-exp-1206" 31 | pagerank 0.15488372852828722 32 | ] 33 | edge [ 34 | source 0 35 | target 3 36 | weight 131.0 37 | normalized_weight 0.5282258064516129 38 | ] 39 | edge [ 40 | source 0 41 | target 2 42 | weight 129.0 43 | normalized_weight 0.5201612903225806 44 | ] 45 | edge [ 46 | source 0 47 | target 5 48 | weight 144.0 49 | normalized_weight 0.5806451612903226 50 | ] 51 | edge [ 52 | source 0 53 | target 1 54 | weight 157.0 55 | normalized_weight 0.6330645161290323 56 | ] 57 | edge [ 58 | source 0 59 | target 4 60 | weight 139.0 61 | normalized_weight 0.5604838709677419 62 | ] 63 | edge [ 64 | source 1 65 | target 3 66 | weight 155.0 67 | normalized_weight 0.625 68 | ] 69 | edge [ 70 | source 1 71 | target 2 72 | weight 146.0 73 | normalized_weight 0.5887096774193549 74 | ] 75 | edge [ 76 | source 1 77 | target 4 78 | weight 146.0 79 | normalized_weight 0.5887096774193549 80 | ] 81 | edge [ 82 | source 1 83 | target 0 84 | weight 129.0 85 | normalized_weight 0.5201612903225806 86 | ] 87 | edge [ 88 | source 1 89 | target 5 90 | weight 141.0 91 | normalized_weight 0.5685483870967742 92 | ] 93 | edge [ 94 | source 2 95 | target 1 96 | weight 212.0 97 | normalized_weight 0.8548387096774194 98 | ] 99 | edge [ 100 | source 2 101 | target 3 102 | weight 135.5 103 | normalized_weight 0.5463709677419355 104 | ] 105 | edge [ 106 | source 2 107 | target 0 108 | weight 203.0 109 | normalized_weight 0.8185483870967742 110 | ] 111 | edge [ 112 | source 2 113 | target 5 114 | weight 142.0 115 | normalized_weight 0.5725806451612904 116 | ] 117 | edge [ 118 | source 2 119 | target 4 120 | weight 143.0 121 | normalized_weight 0.5766129032258065 122 | ] 123 | edge [ 124 | source 3 125 | target 0 126 | weight 138.0 127 | normalized_weight 0.5564516129032258 128 | ] 129 | edge [ 130 | source 3 131 | target 2 132 | weight 173.0 133 | normalized_weight 0.6975806451612904 134 | ] 135 | edge [ 136 | source 3 137 | target 4 138 | weight 113.0 139 | normalized_weight 0.45564516129032256 140 | ] 141 | edge [ 142 | source 3 143 | target 5 144 | weight 89.0 145 | normalized_weight 0.3588709677419355 146 | ] 147 | edge [ 148 | source 3 149 | target 1 150 | weight 130.0 151 | normalized_weight 
0.5241935483870968 152 | ] 153 | edge [ 154 | source 4 155 | target 0 156 | weight 248.0 157 | normalized_weight 1.0 158 | ] 159 | edge [ 160 | source 4 161 | target 3 162 | weight 162.0 163 | normalized_weight 0.6532258064516129 164 | ] 165 | edge [ 166 | source 4 167 | target 5 168 | weight 160.0 169 | normalized_weight 0.6451612903225806 170 | ] 171 | edge [ 172 | source 4 173 | target 1 174 | weight 166.0 175 | normalized_weight 0.6693548387096774 176 | ] 177 | edge [ 178 | source 4 179 | target 2 180 | weight 104.0 181 | normalized_weight 0.41935483870967744 182 | ] 183 | edge [ 184 | source 5 185 | target 4 186 | weight 129.0 187 | normalized_weight 0.5201612903225806 188 | ] 189 | edge [ 190 | source 5 191 | target 3 192 | weight 188.0 193 | normalized_weight 0.7580645161290323 194 | ] 195 | edge [ 196 | source 5 197 | target 2 198 | weight 183.0 199 | normalized_weight 0.7379032258064516 200 | ] 201 | edge [ 202 | source 5 203 | target 1 204 | weight 180.0 205 | normalized_weight 0.7258064516129032 206 | ] 207 | edge [ 208 | source 5 209 | target 0 210 | weight 148.0 211 | normalized_weight 0.5967741935483871 212 | ] 213 | ] 214 | -------------------------------------------------------------------------------- /sloprank/config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from dataclasses import dataclass, field 3 | from pathlib import Path 4 | from typing import List, Dict, Optional, Union, Any 5 | 6 | logging.basicConfig( 7 | level=logging.INFO, 8 | format="%(asctime)s - %(levelname)s - %(message)s" 9 | ) 10 | logger = logging.getLogger("SlopRankLogger") 11 | 12 | @dataclass 13 | class VisualizationConfig: 14 | """Configuration for graph visualization options.""" 15 | enabled: bool = True 16 | save_formats: List[str] = field(default_factory=lambda: ["png", "html", "gml"]) 17 | node_size_factor: float = 2000 18 | edge_width_factor: float = 2.0 19 | layout: str = "spring" # Options: spring, circular, kamada_kawai, spectral 20 | node_colormap: str = "viridis" 21 | edge_colormap: str = "plasma" 22 | interactive: bool = True 23 | 24 | @dataclass 25 | class ConfidenceConfig: 26 | """Configuration for confidence interval calculations.""" 27 | enabled: bool = True 28 | bootstrap_iterations: int = 1000 29 | confidence_level: float = 0.95 # e.g., 0.95 for 95% confidence interval 30 | significance_threshold: float = 0.05 # p-value threshold for significance 31 | 32 | @dataclass 33 | class WebDashboardConfig: 34 | """Configuration for the web dashboard.""" 35 | enabled: bool = False # Default to disabled 36 | host: str = "127.0.0.1" 37 | port: int = 8050 38 | debug: bool = False 39 | auto_open_browser: bool = True 40 | 41 | @dataclass 42 | class EvalConfig: 43 | """Configuration for the SlopRank evaluation system.""" 44 | # Core configuration 45 | model_names: List[str] 46 | evaluation_method: int # 1 => numeric rating, 2 => up/down (example usage) 47 | use_subset_evaluation: bool 48 | evaluators_subset_size: int 49 | output_dir: Path 50 | request_delay: float = 0.0 51 | 52 | # New features 53 | prompt_categories: Dict[str, List[str]] = field(default_factory=dict) 54 | visualization: VisualizationConfig = field(default_factory=VisualizationConfig) 55 | confidence: ConfidenceConfig = field(default_factory=ConfidenceConfig) 56 | web_dashboard: WebDashboardConfig = field(default_factory=WebDashboardConfig) 57 | 58 | # Optional metadata fields 59 | metadata: Dict[str, Any] = field(default_factory=dict) 60 | 61 | def 
__post_init__(self): 62 | self.output_dir.mkdir(parents=True, exist_ok=True) 63 | 64 | # Strip any whitespace from model names 65 | self.model_names = [model.strip() for model in self.model_names] 66 | 67 | if self.evaluation_method not in {1, 2}: 68 | raise ValueError("evaluation_method must be 1 or 2") 69 | if self.use_subset_evaluation and self.evaluators_subset_size >= len(self.model_names): 70 | # Automatically adjust the subset size if needed 71 | self.evaluators_subset_size = len(self.model_names) - 1 if len(self.model_names) > 1 else 1 72 | logger.warning(f"Adjusted evaluators_subset_size to {self.evaluators_subset_size}") 73 | 74 | # Create visualization directory if needed 75 | if self.visualization.enabled: 76 | vis_dir = self.output_dir / "visualizations" 77 | vis_dir.mkdir(parents=True, exist_ok=True) 78 | 79 | DEFAULT_CONFIG = EvalConfig( 80 | model_names=[ 81 | "gpt-5", 82 | "claude-4-sonnet", 83 | "gemini-2.5-pro", 84 | "deepseek-chat" 85 | ], 86 | # model_names=[ 87 | # "gemini-2.5-pro-exp-03-25", 88 | # "claude-3.7-sonnet-latest", 89 | # "o1", 90 | # "deepseek-reasoner" 91 | # ], 92 | evaluation_method=1, # numeric 93 | use_subset_evaluation=True, 94 | evaluators_subset_size=3, 95 | output_dir=Path("results"), 96 | request_delay=0.0, 97 | # Default prompt categories (empty) 98 | prompt_categories={}, 99 | # Default visualization configuration 100 | visualization=VisualizationConfig( 101 | enabled=True, 102 | save_formats=["png", "html", "gml"], 103 | node_size_factor=2000, 104 | edge_width_factor=2.0, 105 | layout="spring", 106 | node_colormap="viridis", 107 | edge_colormap="plasma", 108 | interactive=True 109 | ), 110 | # Default confidence configuration 111 | confidence=ConfidenceConfig( 112 | enabled=True, 113 | bootstrap_iterations=1000, 114 | confidence_level=0.95, 115 | significance_threshold=0.05 116 | ), 117 | # Default web dashboard configuration (disabled by default) 118 | web_dashboard=WebDashboardConfig( 119 | enabled=False, 120 | host="127.0.0.1", 121 | port=8050, 122 | debug=False, 123 | auto_open_browser=True 124 | ) 125 | ) 126 | -------------------------------------------------------------------------------- /examples/compute_confidence.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import json 3 | import random 4 | import pandas as pd 5 | import numpy as np 6 | import networkx as nx 7 | from pathlib import Path 8 | 9 | def compute_confidence_intervals(iterations=1000): 10 | """ 11 | Compute confidence intervals for model rankings using bootstrap resampling. 
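The EvalConfig dataclass above validates its inputs in __post_init__; a minimal sketch of constructing one directly (the model names and output directory below are illustrative, not package defaults):

    from pathlib import Path
    from sloprank.config import EvalConfig

    cfg = EvalConfig(
        model_names=["gpt-4o ", "claude-3-5-sonnet-latest", "deepseek-chat"],
        evaluation_method=1,           # 1 => numeric ratings, 2 => up/down
        use_subset_evaluation=True,
        evaluators_subset_size=5,      # larger than the model pool, so it is clamped to 2
        output_dir=Path("results-demo"),
    )
    print(cfg.model_names)             # whitespace stripped: ['gpt-4o', ...]
    print(cfg.evaluators_subset_size)  # 2, with a warning logged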
12 | """ 13 | print("Computing confidence intervals...") 14 | 15 | # Load evaluations data 16 | evals_path = Path("results/evaluations.csv") 17 | evals_df = pd.read_csv(evals_path) 18 | 19 | # Filter out failed evaluations 20 | evals_df = evals_df[evals_df["parse_failed"] == False] 21 | 22 | # Get unique models 23 | models = list(set(evals_df["judge_model"].unique()) | set(evals_df["rated_model"].unique())) 24 | 25 | # Store bootstrap results 26 | bootstrap_results = {model: [] for model in models} 27 | 28 | # Run bootstrap iterations 29 | for i in range(iterations): 30 | if i % 100 == 0: 31 | print(f"Bootstrap iteration {i}/{iterations}...") 32 | 33 | # Resample evaluations with replacement 34 | sampled_evals = evals_df.sample(frac=1.0, replace=True) 35 | 36 | # Build graph from resampled data 37 | G = nx.DiGraph() 38 | G.add_nodes_from(models) 39 | 40 | for _, row in sampled_evals.iterrows(): 41 | judge = row["judge_model"] 42 | rated = row["rated_model"] 43 | score = float(row["score"]) 44 | 45 | if G.has_edge(judge, rated): 46 | G[judge][rated]["weight"] += score 47 | else: 48 | G.add_edge(judge, rated, weight=score) 49 | 50 | # Compute PageRank 51 | if len(G.edges) > 0: 52 | scores = nx.pagerank(G, weight="weight") 53 | 54 | # Store scores 55 | for model, score in scores.items(): 56 | bootstrap_results[model].append(score) 57 | 58 | # Calculate confidence intervals (95%) 59 | confidence_stats = {} 60 | 61 | for model in models: 62 | if not bootstrap_results[model]: 63 | confidence_stats[model] = { 64 | "mean": 0.0, 65 | "lower_bound": 0.0, 66 | "upper_bound": 0.0, 67 | "std_dev": 0.0 68 | } 69 | continue 70 | 71 | sorted_scores = sorted(bootstrap_results[model]) 72 | lower_idx = int(0.025 * len(sorted_scores)) 73 | upper_idx = int(0.975 * len(sorted_scores)) 74 | 75 | confidence_stats[model] = { 76 | "mean": np.mean(sorted_scores), 77 | "lower_bound": sorted_scores[lower_idx], 78 | "upper_bound": sorted_scores[upper_idx], 79 | "std_dev": np.std(sorted_scores) 80 | } 81 | 82 | # Test statistical significance 83 | significance_results = {} 84 | 85 | # Create sorted list of models by mean score 86 | models_by_score = sorted( 87 | [(model, stats["mean"]) for model, stats in confidence_stats.items()], 88 | key=lambda x: x[1], 89 | reverse=True 90 | ) 91 | 92 | # Compare each adjacent pair in the ranking 93 | for i in range(len(models_by_score) - 1): 94 | model1, _ = models_by_score[i] 95 | model2, _ = models_by_score[i + 1] 96 | 97 | # Determine if significant based on confidence intervals 98 | is_significant = ( 99 | confidence_stats[model1]["lower_bound"] > confidence_stats[model2]["upper_bound"] or 100 | confidence_stats[model2]["lower_bound"] > confidence_stats[model1]["upper_bound"] 101 | ) 102 | 103 | significance_results[f"{model1}_vs_{model2}"] = is_significant 104 | 105 | # Save results 106 | results = { 107 | "confidence_intervals": confidence_stats, 108 | "significance": significance_results 109 | } 110 | 111 | outfile = Path("results/confidence_stats.json") 112 | with open(outfile, "w") as f: 113 | json.dump(results, f, indent=2) 114 | 115 | # Print summary 116 | print("\n=== Confidence Intervals (95%) ===") 117 | for model, stats in sorted(confidence_stats.items(), key=lambda x: x[1]["mean"], reverse=True): 118 | print(f"{model}: {stats['mean']:.6f} [{stats['lower_bound']:.6f}, {stats['upper_bound']:.6f}]") 119 | 120 | print("\n=== Statistical Significance ===") 121 | for pair, is_significant in significance_results.items(): 122 | significance_str = "Significant" if 
is_significant else "Not significant" 123 | print(f"{pair}: {significance_str}") 124 | 125 | print(f"\nResults saved to {outfile}") 126 | 127 | if __name__ == "__main__": 128 | compute_confidence_intervals(iterations=500) # Lower for faster execution -------------------------------------------------------------------------------- /sloprank/pandas_backend.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pandas Backend Configuration 3 | ============================ 4 | 5 | This module provides a configurable pandas backend that can switch between 6 | regular pandas and bodo.pandas based on environment variables or configuration. 7 | 8 | Environment Variables: 9 | - SLOPRANK_USE_BODO: Set to "true" to use Bodo, "false" for regular pandas 10 | - SLOPRANK_PANDAS_BACKEND: Set to "bodo" or "pandas" 11 | 12 | Default: Uses Bodo if available, falls back to regular pandas 13 | """ 14 | 15 | import os 16 | import logging 17 | from typing import Any 18 | 19 | logger = logging.getLogger("SlopRank.PandasBackend") 20 | 21 | # Configuration flags 22 | _USE_BODO = None 23 | _pandas_module = None 24 | 25 | def _determine_backend(): 26 | """Determine which pandas backend to use based on configuration and availability.""" 27 | global _USE_BODO 28 | 29 | if _USE_BODO is not None: 30 | return _USE_BODO 31 | 32 | # Check environment variables 33 | env_use_bodo = os.getenv("SLOPRANK_USE_BODO", "").lower() 34 | env_backend = os.getenv("SLOPRANK_PANDAS_BACKEND", "").lower() 35 | 36 | # Explicit configuration via environment 37 | if env_use_bodo in ("true", "1", "yes"): 38 | _USE_BODO = True 39 | return True 40 | elif env_use_bodo in ("false", "0", "no"): 41 | _USE_BODO = False 42 | return False 43 | elif env_backend == "bodo": 44 | _USE_BODO = True 45 | return True 46 | elif env_backend == "pandas": 47 | _USE_BODO = False 48 | return False 49 | 50 | # Auto-detection: prefer Bodo if available (now the default!) 51 | try: 52 | import bodo.pandas 53 | _USE_BODO = True 54 | logger.info("Auto-detected: Using Bodo pandas backend (default high-performance mode)") 55 | return True 56 | except ImportError: 57 | # Fallback to regular pandas 58 | try: 59 | import pandas 60 | _USE_BODO = False 61 | logger.info("Auto-detected: Using regular pandas backend (Bodo not available)") 62 | return False 63 | except ImportError: 64 | raise ImportError("Neither Bodo nor pandas is available. Please install one: 'pip install sloprank' (includes Bodo) or 'pip install sloprank[pandas]' (compatibility mode)") 65 | 66 | def _get_pandas_module(): 67 | """Get the configured pandas module.""" 68 | global _pandas_module 69 | 70 | if _pandas_module is not None: 71 | return _pandas_module 72 | 73 | use_bodo = _determine_backend() 74 | 75 | if use_bodo: 76 | try: 77 | import bodo.pandas as pd 78 | _pandas_module = pd 79 | logger.info("Loaded Bodo pandas backend") 80 | return pd 81 | except ImportError as e: 82 | logger.warning(f"Failed to import Bodo pandas: {e}") 83 | logger.info("Falling back to regular pandas") 84 | # Fall through to regular pandas 85 | 86 | # Use regular pandas 87 | try: 88 | import pandas as pd 89 | _pandas_module = pd 90 | logger.info("Loaded regular pandas backend") 91 | return pd 92 | except ImportError: 93 | raise ImportError("Neither Bodo nor pandas is available. 
Please install one: 'pip install sloprank' (includes Bodo) or 'pip install sloprank[pandas]' (compatibility mode)") 94 | 95 | def get_pandas(): 96 | """Get the configured pandas module.""" 97 | return _get_pandas_module() 98 | 99 | def is_using_bodo(): 100 | """Check if we're using the Bodo backend.""" 101 | return _USE_BODO is True 102 | 103 | def force_backend(backend: str): 104 | """ 105 | Force a specific backend for testing or configuration. 106 | 107 | Args: 108 | backend: "bodo" or "pandas" 109 | """ 110 | global _USE_BODO, _pandas_module 111 | 112 | if backend.lower() == "bodo": 113 | _USE_BODO = True 114 | elif backend.lower() == "pandas": 115 | _USE_BODO = False 116 | else: 117 | raise ValueError("Backend must be 'bodo' or 'pandas'") 118 | 119 | # Reset module cache to force reload 120 | _pandas_module = None 121 | logger.info(f"Forced backend to: {backend}") 122 | 123 | def get_backend_info(): 124 | """Get information about the current backend configuration.""" 125 | pd = get_pandas() 126 | backend_name = "bodo" if is_using_bodo() else "pandas" 127 | 128 | return { 129 | "backend": backend_name, 130 | "module": str(type(pd)), 131 | "using_bodo": is_using_bodo(), 132 | "env_use_bodo": os.getenv("SLOPRANK_USE_BODO"), 133 | "env_backend": os.getenv("SLOPRANK_PANDAS_BACKEND") 134 | } 135 | 136 | # Export the pandas module for easy importing 137 | pd = get_pandas() 138 | 139 | # For backwards compatibility and explicit access 140 | pandas = pd 141 | -------------------------------------------------------------------------------- /sloprank/parse.py: -------------------------------------------------------------------------------- 1 | import json 2 | import bodo.pandas as pd 3 | from .config import logger, EvalConfig 4 | 5 | def parse_evaluation_rows(raw_eval_df: pd.DataFrame, config: EvalConfig) -> pd.DataFrame: 6 | """ 7 | Convert each row's judge's JSON to numeric scores. 8 | Returns: columns = [prompt, judge_model, rated_model, score, parse_failed]. 9 | """ 10 | all_rows = [] 11 | for _, row in raw_eval_df.iterrows(): 12 | prompt = row["prompt"] 13 | judge_model = row["judge_model"] 14 | raw_judgment = row["raw_judgment"] or "" 15 | raw_judgment_tokens = row.get("raw_judgment_token_count", 0) 16 | 17 | # load model_mapping 18 | try: 19 | model_mapping = json.loads(row["model_mapping"]) 20 | except Exception as e: 21 | logger.error(f"Couldn't parse model_mapping: {e}") 22 | model_mapping = {} 23 | 24 | if not raw_judgment.strip(): 25 | # fallback 26 | for real_model in model_mapping.keys(): 27 | all_rows.append({ 28 | "prompt": prompt, 29 | "judge_model": judge_model, 30 | "rated_model": real_model, 31 | "score": 4.1, 32 | "parse_failed": True, 33 | "raw_judgment_token_count": raw_judgment_tokens 34 | }) 35 | continue 36 | 37 | # Attempt to isolate the JSON object 38 | # First try to find JSON with standard formatting 39 | start = raw_judgment.find("{") 40 | end = raw_judgment.rfind("}") + 1 41 | 42 | # If that fails, try more aggressive parsing for models that output in various formats 43 | if start == -1 or end == 0: 44 | # Look for patterns like "Model_1": 8 or "Model_1" : 8 or Model_1: 8 45 | import re 46 | json_pattern = r'[\{\s]*[\"\']?Model_\d+[\"\']?\s*:\s*\d+(?:\.\d+)?' 
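A minimal sketch of choosing the backend documented above; the environment variable has to be set before the first import, because the module resolves and caches the backend at import time:

    import os
    os.environ["SLOPRANK_PANDAS_BACKEND"] = "pandas"   # or "bodo"

    from sloprank.pandas_backend import get_pandas, get_backend_info, force_backend

    pd = get_pandas()
    print(get_backend_info())     # e.g. {'backend': 'pandas', 'using_bodo': False, ...}

    force_backend("bodo")         # runtime override; clears the cached module
    pd = get_pandas()             # re-resolves, falling back to pandas if Bodo is absent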
47 | if re.search(json_pattern, raw_judgment): 48 | # Try to reconstruct a proper JSON 49 | scores = {} 50 | model_score_pattern = r'[\"\']?Model_(\d+)[\"\']?\s*:\s*(\d+(?:\.\d+)?)' 51 | matches = re.findall(model_score_pattern, raw_judgment) 52 | for model_num, score in matches: 53 | scores[f"Model_{model_num}"] = float(score) 54 | 55 | if scores: 56 | logger.warning(f"Reconstructed JSON for judge={judge_model}, prompt={prompt[:40]}") 57 | try: 58 | # Convert to standard dict for consistency in later processing 59 | anon_to_real = {v: k for k,v in model_mapping.items()} 60 | for anon_id, score_val in scores.items(): 61 | real_model = anon_to_real.get(anon_id) 62 | if real_model: 63 | score_float = float(score_val) 64 | # clamp 1..10 65 | score_float = max(1.0, min(10.0, score_float)) 66 | all_rows.append({ 67 | "prompt": prompt, 68 | "judge_model": judge_model, 69 | "rated_model": real_model, 70 | "score": score_float, 71 | "parse_failed": False, 72 | "raw_judgment_token_count": raw_judgment_tokens 73 | }) 74 | continue 75 | except Exception as e: 76 | logger.error(f"Error processing reconstructed JSON: {e}") 77 | 78 | logger.error(f"No JSON found for judge={judge_model}, prompt={prompt[:40]}") 79 | # fallback 80 | for real_model in model_mapping.keys(): 81 | all_rows.append({ 82 | "prompt": prompt, 83 | "judge_model": judge_model, 84 | "rated_model": real_model, 85 | "score": 4.1, 86 | "parse_failed": True, 87 | "raw_judgment_token_count": raw_judgment_tokens 88 | }) 89 | continue 90 | 91 | try: 92 | data = json.loads(raw_judgment[start:end]) 93 | # Reverse map: "Model_1" => real model name 94 | anon_to_real = {v: k for k,v in model_mapping.items()} 95 | 96 | for anon_id, score_val in data.items(): 97 | real_model = anon_to_real.get(anon_id) 98 | if real_model: 99 | score_float = float(score_val) 100 | # clamp 1..10 101 | score_float = max(1.0, min(10.0, score_float)) 102 | all_rows.append({ 103 | "prompt": prompt, 104 | "judge_model": judge_model, 105 | "rated_model": real_model, 106 | "score": score_float, 107 | "parse_failed": False, 108 | "raw_judgment_token_count": raw_judgment_tokens 109 | }) 110 | except Exception as e: 111 | logger.error(f"Parsing error: judge={judge_model}, prompt={prompt[:40]} => {str(e)}") 112 | for real_model in model_mapping.keys(): 113 | all_rows.append({ 114 | "prompt": prompt, 115 | "judge_model": judge_model, 116 | "rated_model": real_model, 117 | "score": 4.1, 118 | "parse_failed": True, 119 | "raw_judgment_token_count": raw_judgment_tokens 120 | }) 121 | 122 | return pd.DataFrame(all_rows) 123 | -------------------------------------------------------------------------------- /sloprank/utils/confidence.py: -------------------------------------------------------------------------------- 1 | """ 2 | Confidence interval calculation for SlopRank rankings. 3 | """ 4 | import json 5 | import bodo.pandas as pd 6 | import numpy as np 7 | import networkx as nx 8 | from pathlib import Path 9 | 10 | from ..config import logger 11 | 12 | def compute_confidence_intervals( 13 | evaluations_path=None, 14 | output_path=None, 15 | iterations=500, 16 | confidence_level=0.95 17 | ): 18 | """ 19 | Compute confidence intervals for model rankings using bootstrap resampling. 
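A minimal sketch of what parse_evaluation_rows expects and returns, using a single hand-built row (it assumes the pandas backend that parse.py imports is installed; the prompt, judge, and mapping below are illustrative):

    import json
    import pandas as pd   # assumption: regular pandas is fine for building the test frame
    from sloprank.config import DEFAULT_CONFIG
    from sloprank.parse import parse_evaluation_rows

    raw = pd.DataFrame([{
        "prompt": "What is the capital of France?",
        "judge_model": "gpt-4o",
        "raw_judgment": 'Scores: {"Model_1": 9, "Model_2": 6.5}',
        "raw_judgment_token_count": 10,
        # mapping goes real model name -> anonymised id
        "model_mapping": json.dumps({
            "claude-3-5-sonnet-latest": "Model_1",
            "deepseek-chat": "Model_2",
        }),
    }])

    parsed = parse_evaluation_rows(raw, DEFAULT_CONFIG)
    print(parsed[["judge_model", "rated_model", "score", "parse_failed"]])
    # scores are clamped to 1..10; unparseable judgments fall back to 4.1 with parse_failed=True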
20 | 21 | Parameters: 22 | ----------- 23 | evaluations_path : Path or str 24 | Path to the evaluations CSV file 25 | output_path : Path or str 26 | Path for the output JSON file 27 | iterations : int 28 | Number of bootstrap iterations 29 | confidence_level : float 30 | Confidence level (0.0-1.0) 31 | 32 | Returns: 33 | -------- 34 | dict 35 | Confidence statistics 36 | """ 37 | if evaluations_path is None: 38 | evaluations_path = Path("results/evaluations.csv") 39 | else: 40 | evaluations_path = Path(evaluations_path) 41 | 42 | if output_path is None: 43 | output_path = Path("results/confidence_stats.json") 44 | else: 45 | output_path = Path(output_path) 46 | 47 | # Create output directory if it doesn't exist 48 | output_path.parent.mkdir(parents=True, exist_ok=True) 49 | 50 | logger.info(f"Computing confidence intervals using {iterations} bootstrap iterations...") 51 | 52 | # Load evaluations 53 | evals_df = pd.read_csv(evaluations_path) 54 | 55 | # Filter out failed evaluations 56 | evals_df = evals_df[evals_df["parse_failed"] == False] 57 | 58 | # Get unique models 59 | models = list(set(evals_df["judge_model"].unique()) | set(evals_df["rated_model"].unique())) 60 | 61 | # Store bootstrap results 62 | bootstrap_results = {model: [] for model in models} 63 | 64 | # Run bootstrap iterations 65 | for i in range(iterations): 66 | if i % 100 == 0: 67 | logger.info(f"Bootstrap iteration {i}/{iterations}...") 68 | 69 | # Resample evaluations with replacement 70 | sampled_evals = evals_df.sample(frac=1.0, replace=True) 71 | 72 | # Build graph from resampled data 73 | G = nx.DiGraph() 74 | G.add_nodes_from(models) 75 | 76 | for _, row in sampled_evals.iterrows(): 77 | judge = row["judge_model"] 78 | rated = row["rated_model"] 79 | score = float(row["score"]) 80 | 81 | if G.has_edge(judge, rated): 82 | G[judge][rated]["weight"] += score 83 | else: 84 | G.add_edge(judge, rated, weight=score) 85 | 86 | # Compute PageRank 87 | if len(G.edges) > 0: 88 | scores = nx.pagerank(G, weight="weight") 89 | 90 | # Store scores 91 | for model, score in scores.items(): 92 | bootstrap_results[model].append(score) 93 | 94 | # Calculate confidence intervals (95%) 95 | confidence_stats = {} 96 | alpha = 1.0 - confidence_level 97 | 98 | for model in models: 99 | if not bootstrap_results[model]: 100 | confidence_stats[model] = { 101 | "mean": 0.0, 102 | "lower_bound": 0.0, 103 | "upper_bound": 0.0, 104 | "std_dev": 0.0 105 | } 106 | continue 107 | 108 | sorted_scores = sorted(bootstrap_results[model]) 109 | lower_idx = int(alpha/2 * len(sorted_scores)) 110 | upper_idx = int((1-alpha/2) * len(sorted_scores)) 111 | 112 | confidence_stats[model] = { 113 | "mean": float(np.mean(sorted_scores)), 114 | "lower_bound": float(sorted_scores[max(0, lower_idx)]), 115 | "upper_bound": float(sorted_scores[min(len(sorted_scores)-1, upper_idx)]), 116 | "std_dev": float(np.std(sorted_scores)) 117 | } 118 | 119 | # Test statistical significance 120 | significance_results = {} 121 | 122 | # Create sorted list of models by mean score 123 | models_by_score = sorted( 124 | [(model, stats["mean"]) for model, stats in confidence_stats.items()], 125 | key=lambda x: x[1], 126 | reverse=True 127 | ) 128 | 129 | # Compare each adjacent pair in the ranking 130 | for i in range(len(models_by_score) - 1): 131 | model1, _ = models_by_score[i] 132 | model2, _ = models_by_score[i + 1] 133 | 134 | # Determine if significant based on confidence intervals 135 | is_significant = ( 136 | confidence_stats[model1]["lower_bound"] > 
confidence_stats[model2]["upper_bound"] or 137 | confidence_stats[model2]["lower_bound"] > confidence_stats[model1]["upper_bound"] 138 | ) 139 | 140 | significance_results[f"{model1}_vs_{model2}"] = is_significant 141 | 142 | # Save results 143 | results = { 144 | "confidence_intervals": confidence_stats, 145 | "significance": significance_results, 146 | "metadata": { 147 | "iterations": iterations, 148 | "confidence_level": confidence_level 149 | } 150 | } 151 | 152 | with open(output_path, "w") as f: 153 | json.dump(results, f, indent=2) 154 | 155 | # Print summary 156 | logger.info("\n=== Confidence Intervals ===") 157 | for model, stats in sorted(confidence_stats.items(), key=lambda x: x[1]["mean"], reverse=True): 158 | logger.info(f"{model}: {stats['mean']:.6f} [{stats['lower_bound']:.6f}, {stats['upper_bound']:.6f}]") 159 | 160 | logger.info("\n=== Statistical Significance ===") 161 | for pair, is_significant in significance_results.items(): 162 | significance_str = "Significant" if is_significant else "Not significant" 163 | logger.info(f"{pair}: {significance_str}") 164 | 165 | logger.info(f"Confidence statistics saved to {output_path}") 166 | 167 | return confidence_stats 168 | 169 | 170 | if __name__ == "__main__": 171 | # Run as a standalone script 172 | compute_confidence_intervals() -------------------------------------------------------------------------------- /results-openrouter/rankings.json: -------------------------------------------------------------------------------- 1 | { 2 | "rankings": [ 3 | { 4 | "model": "openrouter/openai/gpt-5", 5 | "score": 0.16847011097481915 6 | }, 7 | { 8 | "model": "openrouter/qwen/qwen3-max", 9 | "score": 0.15526560981312867 10 | }, 11 | { 12 | "model": "openrouter/google/gemini-2.5-pro", 13 | "score": 0.14578746659558767 14 | }, 15 | { 16 | "model": "openrouter/anthropic/claude-opus-4.1", 17 | "score": 0.13555256359158935 18 | }, 19 | { 20 | "model": "openrouter/x-ai/grok-4", 21 | "score": 0.1352023392288148 22 | }, 23 | { 24 | "model": "openrouter/anthropic/claude-sonnet-4", 25 | "score": 0.1338541911435342 26 | }, 27 | { 28 | "model": "openrouter/nousresearch/hermes-4-405b", 29 | "score": 0.1258677186525262 30 | } 31 | ], 32 | "metadata": { 33 | "evaluation_method": 1, 34 | "timestamp": "2025-09-05T18:04:48.013974" 35 | }, 36 | "category_rankings": { 37 | "reasoning": [ 38 | { 39 | "model": "openrouter/openai/gpt-5", 40 | "score": 0.17430497069091347 41 | }, 42 | { 43 | "model": "openrouter/google/gemini-2.5-pro", 44 | "score": 0.15181656623231862 45 | }, 46 | { 47 | "model": "openrouter/qwen/qwen3-max", 48 | "score": 0.1445691835060197 49 | }, 50 | { 51 | "model": "openrouter/anthropic/claude-sonnet-4", 52 | "score": 0.14017243463115212 53 | }, 54 | { 55 | "model": "openrouter/anthropic/claude-opus-4.1", 56 | "score": 0.13854136532118963 57 | }, 58 | { 59 | "model": "openrouter/nousresearch/hermes-4-405b", 60 | "score": 0.12954936131490657 61 | }, 62 | { 63 | "model": "openrouter/x-ai/grok-4", 64 | "score": 0.12104611830349994 65 | } 66 | ], 67 | "creativity": [ 68 | { 69 | "model": "openrouter/qwen/qwen3-max", 70 | "score": 0.19787032089498574 71 | }, 72 | { 73 | "model": "openrouter/openai/gpt-5", 74 | "score": 0.19103007295939647 75 | }, 76 | { 77 | "model": "openrouter/anthropic/claude-opus-4.1", 78 | "score": 0.14499987708480822 79 | }, 80 | { 81 | "model": "openrouter/google/gemini-2.5-pro", 82 | "score": 0.14466194083275574 83 | }, 84 | { 85 | "model": "openrouter/anthropic/claude-sonnet-4", 86 | "score": 0.13343973543986454 87 
| }, 88 | { 89 | "model": "openrouter/nousresearch/hermes-4-405b", 90 | "score": 0.10250109470160962 91 | }, 92 | { 93 | "model": "openrouter/x-ai/grok-4", 94 | "score": 0.08549695808657976 95 | } 96 | ], 97 | "knowledge": [ 98 | { 99 | "model": "openrouter/qwen/qwen3-max", 100 | "score": 0.230685151045678 101 | }, 102 | { 103 | "model": "openrouter/anthropic/claude-opus-4.1", 104 | "score": 0.2263174375315229 105 | }, 106 | { 107 | "model": "openrouter/anthropic/claude-sonnet-4", 108 | "score": 0.2078302382240306 109 | }, 110 | { 111 | "model": "openrouter/nousresearch/hermes-4-405b", 112 | "score": 0.19414495209100702 113 | }, 114 | { 115 | "model": "openrouter/openai/gpt-5", 116 | "score": 0.07263846361795528 117 | }, 118 | { 119 | "model": "openrouter/google/gemini-2.5-pro", 120 | "score": 0.03624680196311777 121 | }, 122 | { 123 | "model": "openrouter/x-ai/grok-4", 124 | "score": 0.032136955526688424 125 | } 126 | ], 127 | "uncategorized": [ 128 | { 129 | "model": "openrouter/openai/gpt-5", 130 | "score": 0.16643367778474888 131 | }, 132 | { 133 | "model": "openrouter/x-ai/grok-4", 134 | "score": 0.15422265324248047 135 | }, 136 | { 137 | "model": "openrouter/qwen/qwen3-max", 138 | "score": 0.15064751154294972 139 | }, 140 | { 141 | "model": "openrouter/google/gemini-2.5-pro", 142 | "score": 0.14645725401938814 143 | }, 144 | { 145 | "model": "openrouter/anthropic/claude-opus-4.1", 146 | "score": 0.13052286440952826 147 | }, 148 | { 149 | "model": "openrouter/anthropic/claude-sonnet-4", 150 | "score": 0.12608061856312178 151 | }, 152 | { 153 | "model": "openrouter/nousresearch/hermes-4-405b", 154 | "score": 0.1256354204377827 155 | } 156 | ] 157 | }, 158 | "confidence_intervals": { 159 | "openrouter/openai/gpt-5": { 160 | "mean": 0.16805771950954243, 161 | "lower_bound": 0.14854275640418269, 162 | "upper_bound": 0.18889452135305, 163 | "std_dev": 0.010244390640524138 164 | }, 165 | "openrouter/anthropic/claude-opus-4.1": { 166 | "mean": 0.13545523306386933, 167 | "lower_bound": 0.11580551762466326, 168 | "upper_bound": 0.15508631927416777, 169 | "std_dev": 0.010082235807697845 170 | }, 171 | "openrouter/anthropic/claude-sonnet-4": { 172 | "mean": 0.13403056558493653, 173 | "lower_bound": 0.11663732761525049, 174 | "upper_bound": 0.15292269487100793, 175 | "std_dev": 0.00939353104769414 176 | }, 177 | "openrouter/x-ai/grok-4": { 178 | "mean": 0.13541421681507756, 179 | "lower_bound": 0.11688480461050506, 180 | "upper_bound": 0.1551950474080818, 181 | "std_dev": 0.009555164123263458 182 | }, 183 | "openrouter/qwen/qwen3-max": { 184 | "mean": 0.15543196085269878, 185 | "lower_bound": 0.13554801921435994, 186 | "upper_bound": 0.17620411609175582, 187 | "std_dev": 0.010415740974695154 188 | }, 189 | "openrouter/google/gemini-2.5-pro": { 190 | "mean": 0.145526871283856, 191 | "lower_bound": 0.12557672757845778, 192 | "upper_bound": 0.16526113279938065, 193 | "std_dev": 0.010002455058795396 194 | }, 195 | "openrouter/nousresearch/hermes-4-405b": { 196 | "mean": 0.12608343289001933, 197 | "lower_bound": 0.10966111474019993, 198 | "upper_bound": 0.14404672239366115, 199 | "std_dev": 0.008905700093363503 200 | } 201 | }, 202 | "significance": { 203 | "openrouter/openai/gpt-5_vs_openrouter/qwen/qwen3-max": false, 204 | "openrouter/qwen/qwen3-max_vs_openrouter/google/gemini-2.5-pro": false, 205 | "openrouter/google/gemini-2.5-pro_vs_openrouter/anthropic/claude-opus-4.1": false, 206 | "openrouter/anthropic/claude-opus-4.1_vs_openrouter/x-ai/grok-4": false, 207 | 
"openrouter/x-ai/grok-4_vs_openrouter/anthropic/claude-sonnet-4": false, 208 | "openrouter/anthropic/claude-sonnet-4_vs_openrouter/nousresearch/hermes-4-405b": false 209 | } 210 | } -------------------------------------------------------------------------------- /sloprank/utils/commands.py: -------------------------------------------------------------------------------- 1 | """ 2 | Command-line utilities for SlopRank. 3 | """ 4 | import click 5 | import bodo.pandas as pd 6 | import json 7 | import threading 8 | import time 9 | from pathlib import Path 10 | import webbrowser 11 | 12 | from ..config import logger 13 | from ..config import VisualizationConfig 14 | from .visualization import generate_visualization 15 | 16 | # Import confidence and dashboard modules if available 17 | try: 18 | from .confidence import compute_confidence_intervals 19 | HAS_CONFIDENCE = True 20 | except ImportError: 21 | HAS_CONFIDENCE = False 22 | 23 | try: 24 | from .dashboard import generate_dashboard, start_dashboard 25 | HAS_DASHBOARD = True 26 | except ImportError: 27 | HAS_DASHBOARD = False 28 | 29 | # Import category analysis if available 30 | try: 31 | from .categorization import categorize_prompts, analyze_categorized_evaluations 32 | HAS_CATEGORIES = True 33 | except ImportError: 34 | HAS_CATEGORIES = False 35 | 36 | 37 | @click.group() 38 | def utils(): 39 | """Utility commands for SlopRank.""" 40 | pass 41 | 42 | 43 | @utils.command() 44 | @click.option("--rankings", default="results/rankings.json", help="Path to rankings JSON file") 45 | @click.option("--evaluations", default="results/evaluations.csv", help="Path to evaluations CSV file") 46 | @click.option("--output-dir", default="results/visualizations", help="Output directory for visualizations") 47 | @click.option("--layout", default="spring", help="Graph layout [spring, circular, kamada_kawai, spectral]") 48 | @click.option("--interactive/--no-interactive", default=True, help="Generate interactive HTML visualization") 49 | def visualize(rankings, evaluations, output_dir, layout, interactive): 50 | """Generate visualizations for the SlopRank endorsement graph.""" 51 | vis_config = VisualizationConfig( 52 | enabled=True, 53 | interactive=interactive, 54 | layout=layout 55 | ) 56 | try: 57 | generate_visualization( 58 | rankings_path=rankings, 59 | evaluations_path=evaluations, 60 | output_dir=output_dir, 61 | vis_config=vis_config 62 | ) 63 | click.echo(f"Visualizations generated in {output_dir}") 64 | except Exception as e: 65 | click.echo(f"Error generating visualizations: {e}", err=True) 66 | 67 | 68 | @utils.command() 69 | @click.option("--evaluations", default="results/evaluations.csv", help="Path to evaluations CSV file") 70 | @click.option("--output", default="results/confidence_stats.json", help="Output file for confidence data") 71 | @click.option("--iterations", default=500, help="Number of bootstrap iterations") 72 | @click.option("--confidence-level", default=0.95, help="Confidence level (0.0-1.0)") 73 | def confidence(evaluations, output, iterations, confidence_level): 74 | """Compute confidence intervals for SlopRank rankings.""" 75 | if not HAS_CONFIDENCE: 76 | click.echo("Confidence module not available. 
Install numpy to use this feature.", err=True) 77 | return 78 | 79 | try: 80 | from .confidence import compute_confidence_intervals 81 | stats = compute_confidence_intervals( 82 | evaluations_path=evaluations, 83 | output_path=output, 84 | iterations=iterations, 85 | confidence_level=confidence_level 86 | ) 87 | click.echo(f"Confidence statistics saved to {output}") 88 | except Exception as e: 89 | click.echo(f"Error computing confidence intervals: {e}", err=True) 90 | 91 | 92 | @utils.command() 93 | @click.option("--prompts", default="prompts.csv", help="Path to prompts Excel file") 94 | @click.option("--evaluations", default="results/evaluations.csv", help="Path to evaluations CSV file") 95 | @click.option("--output-dir", default="results", help="Output directory for category analysis") 96 | def categorize(prompts, evaluations, output_dir): 97 | """Categorize prompts and analyze model performance by category.""" 98 | if not HAS_CATEGORIES: 99 | click.echo("Categorization module not available.", err=True) 100 | return 101 | 102 | try: 103 | from .categorization import categorize_prompts, analyze_categorized_evaluations 104 | 105 | output_dir = Path(output_dir) 106 | output_dir.mkdir(exist_ok=True, parents=True) 107 | 108 | # Categorize prompts 109 | categories = categorize_prompts(prompts_file=prompts) 110 | 111 | # Analyze performance by category 112 | analyze_categorized_evaluations( 113 | categorized_prompts=categories, 114 | evaluations_path=evaluations, 115 | output_dir=output_dir 116 | ) 117 | 118 | click.echo(f"Category analysis saved to {output_dir / 'category_rankings.json'}") 119 | except Exception as e: 120 | click.echo(f"Error categorizing prompts: {e}", err=True) 121 | 122 | 123 | @utils.command() 124 | @click.option("--rankings", default="results/rankings.json", help="Path to rankings JSON file") 125 | @click.option("--confidence", default="results/confidence_stats.json", help="Path to confidence stats JSON") 126 | @click.option("--categories", default="results/category_rankings.json", help="Path to category rankings JSON") 127 | @click.option("--graph", default="results/visualizations/endorsement_graph.png", help="Path to graph visualization") 128 | @click.option("--output", default="results/dashboard.html", help="Output path for dashboard HTML") 129 | def dashboard(rankings, confidence, categories, graph, output): 130 | """Generate HTML dashboard for SlopRank results.""" 131 | if not HAS_DASHBOARD: 132 | click.echo("Dashboard module not available.", err=True) 133 | return 134 | 135 | try: 136 | from .dashboard import generate_dashboard 137 | 138 | dashboard_path = generate_dashboard( 139 | rankings_path=rankings, 140 | confidence_path=confidence if Path(confidence).exists() else None, 141 | categories_path=categories if Path(categories).exists() else None, 142 | graph_path=graph if Path(graph).exists() else None, 143 | output_path=output 144 | ) 145 | 146 | click.echo(f"Dashboard generated at {dashboard_path}") 147 | except Exception as e: 148 | click.echo(f"Error generating dashboard: {e}", err=True) 149 | 150 | 151 | @utils.command() 152 | @click.option("--dashboard", default="results/dashboard.html", help="Path to dashboard HTML file") 153 | @click.option("--port", default=8000, help="Port for the web server") 154 | @click.option("--no-browser", is_flag=True, help="Don't open browser automatically") 155 | def serve(dashboard, port, no_browser): 156 | """Start a web server to view the SlopRank dashboard.""" 157 | try: 158 | from http.server import HTTPServer, 
SimpleHTTPRequestHandler 159 | 160 | dashboard_path = Path(dashboard) 161 | if not dashboard_path.exists(): 162 | click.echo(f"Dashboard file not found: {dashboard_path}", err=True) 163 | return 164 | 165 | # Start server 166 | server_address = ('', port) 167 | httpd = HTTPServer(server_address, SimpleHTTPRequestHandler) 168 | 169 | # Start server in a separate thread 170 | server_thread = threading.Thread(target=httpd.serve_forever) 171 | server_thread.daemon = True 172 | server_thread.start() 173 | 174 | url = f"http://localhost:{port}/{dashboard}" 175 | click.echo(f"Server started at {url}") 176 | 177 | # Open browser 178 | if not no_browser: 179 | webbrowser.open(url) 180 | 181 | # Keep the main thread alive 182 | try: 183 | while True: 184 | time.sleep(1) 185 | except KeyboardInterrupt: 186 | click.echo("Shutting down server...") 187 | httpd.shutdown() 188 | 189 | except Exception as e: 190 | click.echo(f"Error starting server: {e}", err=True) 191 | 192 | 193 | def register_utils_commands(cli): 194 | """Register utility commands with the main CLI.""" 195 | cli.add_command(utils) -------------------------------------------------------------------------------- /examples/prompt_categorization.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import json 4 | import re 5 | from pathlib import Path 6 | from collections import defaultdict 7 | 8 | def auto_categorize_prompts(prompts_file="prompts.csv"): 9 | """ 10 | Reads prompts from Excel file and automatically categorizes them. 11 | If a 'Category' column exists, it will use those categories. 12 | Otherwise, it will attempt to infer categories based on content. 13 | """ 14 | print(f"Reading prompts from {prompts_file}...") 15 | 16 | # Read prompts from Excel 17 | prompts_df = pd.read_csv(prompts_file) 18 | 19 | # Check if a Category column exists 20 | if 'Category' in prompts_df.columns: 21 | categories = defaultdict(list) 22 | 23 | # Group prompts by category 24 | for _, row in prompts_df.iterrows(): 25 | if pd.notna(row['Category']) and row['Category']: 26 | categories[row['Category']].append(row['Questions']) 27 | else: 28 | if 'Uncategorized' not in categories: 29 | categories['Uncategorized'] = [] 30 | categories['Uncategorized'].append(row['Questions']) 31 | 32 | print(f"Found {len(categories)} categories in the Excel file.") 33 | else: 34 | # Infer categories based on content 35 | categories = infer_categories(prompts_df['Questions'].tolist()) 36 | 37 | # Add inferred categories back to the DataFrame 38 | category_map = {} 39 | for category, prompts in categories.items(): 40 | for prompt in prompts: 41 | category_map[prompt] = category 42 | 43 | prompts_df['Category'] = prompts_df['Questions'].map(category_map) 44 | 45 | # Save the categorized DataFrame back to Excel 46 | output_path = Path(prompts_file).with_stem(Path(prompts_file).stem + "_categorized") 47 | prompts_df.to_csv(output_path, index=False) 48 | print(f"Saved categorized prompts to {output_path}") 49 | 50 | # Return categories as a dictionary with lists of prompts 51 | return dict(categories) 52 | 53 | def infer_categories(prompts): 54 | """ 55 | Infer categories from prompt content using keyword matching. 
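A minimal sketch of the script above when the prompts file already carries a Category column (run from the examples/ directory so prompt_categorization imports directly; the CSV name is illustrative):

    import pandas as pd
    from prompt_categorization import auto_categorize_prompts

    pd.DataFrame({
        "Questions": [
            "What is the capital of France?",
            "Write a short story about a robot.",
        ],
        "Category": ["Knowledge", "Creativity"],
    }).to_csv("demo_prompts.csv", index=False)

    categories = auto_categorize_prompts("demo_prompts.csv")
    # {'Knowledge': ['What is the capital of France?'],
    #  'Creativity': ['Write a short story about a robot.']}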
56 | """ 57 | print("Inferring categories from prompt content...") 58 | 59 | # Define category keywords 60 | keywords = { 61 | 'Reasoning': ['reason', 'logic', 'why', 'how', 'explain', 'analyze', 'evaluate', 'assess', 'examine'], 62 | 'Creativity': ['creative', 'imagine', 'story', 'design', 'invent', 'fiction', 'innovative'], 63 | 'Knowledge': ['fact', 'define', 'what is', 'history', 'science', 'describe', 'information'], 64 | 'Coding': ['code', 'function', 'algorithm', 'program', 'script', 'implementation'], 65 | 'Opinion': ['opinion', 'believe', 'think', 'perspective', 'view', 'stance'], 66 | 'Technical': ['technical', 'engineering', 'system', 'mechanism', 'process'], 67 | 'Economic': ['economic', 'finance', 'market', 'money', 'business', 'trade', 'commerce', 'tax'], 68 | 'Medical': ['medical', 'health', 'disease', 'treatment', 'cure', 'patient', 'doctor', 'hospital'], 69 | 'Political': ['political', 'government', 'policy', 'regulation', 'law', 'legal'], 70 | 'Ethical': ['ethical', 'moral', 'right', 'wrong', 'should', 'ethics', 'values'], 71 | } 72 | 73 | # Categorize prompts 74 | categories = defaultdict(list) 75 | 76 | for prompt in prompts: 77 | prompt_lower = prompt.lower() 78 | 79 | # Try to match prompt to a category 80 | matched = False 81 | for category, terms in keywords.items(): 82 | if any(term in prompt_lower for term in terms): 83 | categories[category].append(prompt) 84 | matched = True 85 | break 86 | 87 | # If no match, add to Uncategorized 88 | if not matched: 89 | categories['Uncategorized'].append(prompt) 90 | 91 | # Count prompts per category 92 | for category, prompts in categories.items(): 93 | print(f"Category '{category}': {len(prompts)} prompts") 94 | 95 | return categories 96 | 97 | def analyze_categorized_evaluations(categorized_prompts): 98 | """ 99 | Analyze evaluations based on prompt categories. 
100 | """ 101 | # Load evaluations 102 | evals_path = Path("results/evaluations.csv") 103 | if not evals_path.exists(): 104 | print(f"Error: Evaluations file not found at {evals_path}") 105 | return 106 | 107 | print(f"Loading evaluations from {evals_path}...") 108 | evals_df = pd.read_csv(evals_path) 109 | 110 | # Filter out failed evaluations 111 | evals_df = evals_df[evals_df["parse_failed"] == False] 112 | 113 | # Create a flat mapping of prompt -> category 114 | prompt_to_category = {} 115 | for category, prompts in categorized_prompts.items(): 116 | for prompt in prompts: 117 | prompt_to_category[prompt] = category 118 | 119 | # Add category column to evaluations DataFrame 120 | evals_df['category'] = evals_df['prompt'].map(prompt_to_category) 121 | 122 | # Calculate average scores by category and model 123 | results = [] 124 | 125 | # For each category 126 | for category in categorized_prompts.keys(): 127 | if category == 'Uncategorized': 128 | continue 129 | 130 | category_evals = evals_df[evals_df['category'] == category] 131 | 132 | if category_evals.empty: 133 | continue 134 | 135 | # For each model being rated 136 | for model in category_evals['rated_model'].unique(): 137 | model_scores = category_evals[category_evals['rated_model'] == model]['score'] 138 | avg_score = model_scores.mean() 139 | count = len(model_scores) 140 | 141 | results.append({ 142 | 'category': category, 143 | 'model': model, 144 | 'average_score': avg_score, 145 | 'evaluations_count': count 146 | }) 147 | 148 | # Create DataFrame from results 149 | results_df = pd.DataFrame(results) 150 | 151 | # Save to CSV 152 | output_path = Path("results/category_analysis.csv") 153 | results_df.to_csv(output_path, index=False) 154 | 155 | # Generate summary 156 | print("\n=== Category Analysis ===") 157 | for category in sorted(categorized_prompts.keys()): 158 | if category == 'Uncategorized': 159 | continue 160 | 161 | category_data = results_df[results_df['category'] == category] 162 | 163 | if category_data.empty: 164 | continue 165 | 166 | print(f"\nCategory: {category}") 167 | sorted_models = category_data.sort_values('average_score', ascending=False) 168 | 169 | for _, row in sorted_models.iterrows(): 170 | print(f" {row['model']}: {row['average_score']:.4f} (based on {row['evaluations_count']} evaluations)") 171 | 172 | print(f"\nCategory analysis saved to {output_path}") 173 | 174 | # Create JSON with category rankings 175 | category_rankings = {} 176 | 177 | for category in sorted(categorized_prompts.keys()): 178 | if category == 'Uncategorized': 179 | continue 180 | 181 | category_data = results_df[results_df['category'] == category] 182 | 183 | if category_data.empty: 184 | continue 185 | 186 | sorted_models = category_data.sort_values('average_score', ascending=False) 187 | category_rankings[category] = [ 188 | {"model": row['model'], "score": row['average_score']} 189 | for _, row in sorted_models.iterrows() 190 | ] 191 | 192 | # Save category rankings to JSON 193 | rankings_path = Path("results/category_rankings.json") 194 | with open(rankings_path, 'w') as f: 195 | json.dump(category_rankings, f, indent=2) 196 | 197 | print(f"Category rankings saved to {rankings_path}") 198 | 199 | 200 | if __name__ == "__main__": 201 | # Process prompts 202 | categorized_prompts = auto_categorize_prompts() 203 | 204 | # Analyze evaluations by category 205 | analyze_categorized_evaluations(categorized_prompts) -------------------------------------------------------------------------------- /examples/dashboard.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import json 3 | import pandas as pd 4 | import webbrowser 5 | from pathlib import Path 6 | from http.server import HTTPServer, SimpleHTTPRequestHandler 7 | import threading 8 | import time 9 | 10 | def generate_html(): 11 | # Load rankings data 12 | rankings_path = Path("results/rankings.json") 13 | with open(rankings_path, 'r') as f: 14 | rankings_data = json.load(f) 15 | 16 | # Load confidence data if available 17 | confidence_path = Path("results/confidence_stats.json") 18 | has_confidence = confidence_path.exists() 19 | confidence_data = None 20 | if has_confidence: 21 | with open(confidence_path, 'r') as f: 22 | confidence_data = json.load(f) 23 | 24 | # Load category rankings if available 25 | category_path = Path("results/category_rankings.json") 26 | has_categories = category_path.exists() 27 | category_data = None 28 | if has_categories: 29 | with open(category_path, 'r') as f: 30 | category_data = json.load(f) 31 | 32 | # Generate HTML 33 | html = """ 34 | 35 | 36 | 37 | 38 | 39 | SlopRank Dashboard 40 | 101 | 102 | 103 |
104 |

SlopRank Dashboard

105 | 106 |

Model Rankings

107 | 108 | 109 | 110 | 111 | 112 | 113 | """ 114 | 115 | if has_confidence: 116 | html += """ 117 | 118 | """ 119 | 120 | html += """ 121 | 122 | """ 123 | 124 | # Add rows for each model 125 | max_score = max([entry[1] for entry in rankings_data["rankings"]]) 126 | 127 | for i, (model, score) in enumerate(rankings_data["rankings"]): 128 | bar_width = int(300 * score / max_score) 129 | confidence_html = "" 130 | 131 | if has_confidence and model in confidence_data["confidence_intervals"]: 132 | ci = confidence_data["confidence_intervals"][model] 133 | lower_pct = int(300 * ci["lower_bound"] / max_score) 134 | upper_pct = int(300 * ci["upper_bound"] / max_score) 135 | mean_pct = int(300 * ci["mean"] / max_score) 136 | 137 | confidence_html = f""" 138 | 145 | """ 146 | 147 | html += f""" 148 | 149 | 150 | 151 | 152 | 157 | {confidence_html} 158 | 159 | """ 160 | 161 | html += """ 162 |
RankModelScoreVisualizationConfidence Interval
139 |
140 |
141 |
142 |
143 | {ci["mean"]:.6f} [{ci["lower_bound"]:.6f}, {ci["upper_bound"]:.6f}] 144 |
{i+1}{model}{score:.6f} 153 |
154 |
155 |
156 |
163 | """ 164 | 165 | # Add statistical significance if available 166 | if has_confidence and confidence_data.get("significance"): 167 | html += """ 168 |

Statistical Significance

169 | 170 | 171 | 172 | 173 | 174 | """ 175 | 176 | for pair, is_significant in confidence_data["significance"].items(): 177 | significance_str = "Significant" if is_significant else "Not significant" 178 | html += f""" 179 | 180 | 181 | 182 | 183 | """ 184 | 185 | html += """ 186 |
ComparisonSignificance
{pair}{significance_str}
187 | """ 188 | 189 | # Add category rankings if available 190 | if has_categories and category_data: 191 | html += """ 192 |

Rankings by Category

193 | """ 194 | 195 | for category, models in sorted(category_data.items()): 196 | max_score = max([item["score"] for item in models]) 197 | 198 | html += f""" 199 |

{category}

200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | """ 208 | 209 | for i, item in enumerate(models): 210 | model = item["model"] 211 | score = item["score"] 212 | bar_width = int(300 * score / max_score) 213 | 214 | html += f""" 215 | 216 | 217 | 218 | 219 | 224 | 225 | """ 226 | 227 | html += """ 228 |
RankModelScoreVisualization
{i+1}{model}{score:.4f} 220 |
221 |
222 |
223 |
229 | """ 230 | 231 | # Add graph visualization if available 232 | graph_image_path = Path("results/visualizations/endorsement_graph.png") 233 | if graph_image_path.exists(): 234 | html += """ 235 |

Endorsement Graph

236 |
237 | Endorsement Graph 238 |
239 | """ 240 | 241 | html += """ 242 |
243 | 244 | 245 | """ 246 | 247 | # Save HTML to file 248 | dashboard_path = Path("results/dashboard.html") 249 | with open(dashboard_path, 'w') as f: 250 | f.write(html) 251 | 252 | return dashboard_path 253 | 254 | def start_server(port=8000): 255 | # Start HTTP server 256 | server_address = ('', port) 257 | httpd = HTTPServer(server_address, SimpleHTTPRequestHandler) 258 | 259 | # Start server in a separate thread 260 | server_thread = threading.Thread(target=httpd.serve_forever) 261 | server_thread.daemon = True 262 | server_thread.start() 263 | 264 | print(f"Server started at http://localhost:{port}") 265 | return httpd 266 | 267 | if __name__ == "__main__": 268 | dashboard_path = generate_html() 269 | print(f"Dashboard HTML generated at {dashboard_path}") 270 | 271 | port = 8000 272 | httpd = start_server(port) 273 | 274 | # Open browser 275 | url = f"http://localhost:{port}/results/dashboard.html" 276 | print(f"Opening dashboard at {url}") 277 | webbrowser.open(url) 278 | 279 | try: 280 | while True: 281 | time.sleep(1) 282 | except KeyboardInterrupt: 283 | print("Shutting down server...") 284 | httpd.shutdown() -------------------------------------------------------------------------------- /sloprank/utils/categorization.py: -------------------------------------------------------------------------------- 1 | """ 2 | Prompt categorization and category-based analysis. 3 | """ 4 | import json 5 | import bodo.pandas as pd 6 | import re 7 | from pathlib import Path 8 | from collections import defaultdict 9 | 10 | from ..config import logger 11 | 12 | def categorize_prompts(prompts_file=None, save_categorized=True): 13 | """ 14 | Read prompts from Excel file and automatically categorize them. 15 | If a 'Category' column exists, it will use those categories. 16 | Otherwise, it will attempt to infer categories based on content. 
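A minimal sketch tying generate_html and start_server together (assumes examples/dashboard.py is importable from the working directory and that results/rankings.json and the other result files exist from a prior run):

    import time
    import webbrowser
    from dashboard import generate_html, start_server   # examples/dashboard.py

    dashboard_path = generate_html()          # writes results/dashboard.html
    httpd = start_server(port=8000)
    webbrowser.open(f"http://localhost:8000/{dashboard_path}")

    time.sleep(30)                            # keep the server up briefly, then stop it
    httpd.shutdown()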
17 | 18 | Parameters: 19 | ----------- 20 | prompts_file : Path or str 21 | Path to the prompts Excel file 22 | save_categorized : bool 23 | Whether to save the categorized prompts back to an Excel file 24 | 25 | Returns: 26 | -------- 27 | dict 28 | Dictionary mapping category names to lists of prompts 29 | """ 30 | if prompts_file is None: 31 | prompts_file = Path("prompts.csv") 32 | else: 33 | prompts_file = Path(prompts_file) 34 | 35 | logger.info(f"Reading prompts from {prompts_file}...") 36 | 37 | # Read prompts from Excel 38 | prompts_df = pd.read_csv(prompts_file) 39 | 40 | # Check if a Category column exists 41 | if 'Category' in prompts_df.columns: 42 | categories = defaultdict(list) 43 | 44 | # Group prompts by category 45 | for _, row in prompts_df.iterrows(): 46 | if pd.notna(row['Category']) and row['Category']: 47 | categories[row['Category']].append(row['Questions']) 48 | else: 49 | if 'Uncategorized' not in categories: 50 | categories['Uncategorized'] = [] 51 | categories['Uncategorized'].append(row['Questions']) 52 | 53 | logger.info(f"Found {len(categories)} categories in the Excel file.") 54 | else: 55 | # Infer categories based on content 56 | categories = infer_categories(prompts_df['Questions'].tolist()) 57 | 58 | if save_categorized: 59 | # Add inferred categories back to the DataFrame 60 | category_map = {} 61 | for category, prompts in categories.items(): 62 | for prompt in prompts: 63 | category_map[prompt] = category 64 | 65 | prompts_df['Category'] = prompts_df['Questions'].map(category_map) 66 | 67 | # Save the categorized DataFrame back to Excel 68 | output_path = prompts_file.with_stem(prompts_file.stem + "_categorized") 69 | prompts_df.to_csv(output_path, index=False) 70 | logger.info(f"Saved categorized prompts to {output_path}") 71 | 72 | # Return categories as a dictionary with lists of prompts 73 | return dict(categories) 74 | 75 | 76 | def infer_categories(prompts): 77 | """ 78 | Infer categories from prompt content using keyword matching. 
79 | 80 | Parameters: 81 | ----------- 82 | prompts : list 83 | List of prompts to categorize 84 | 85 | Returns: 86 | -------- 87 | dict 88 | Dictionary mapping category names to lists of prompts 89 | """ 90 | logger.info("Inferring categories from prompt content...") 91 | 92 | # Define category keywords 93 | keywords = { 94 | 'Reasoning': ['reason', 'logic', 'why', 'how', 'explain', 'analyze', 'evaluate', 'assess', 'examine'], 95 | 'Creativity': ['creative', 'imagine', 'story', 'design', 'invent', 'fiction', 'innovative'], 96 | 'Knowledge': ['fact', 'define', 'what is', 'history', 'science', 'describe', 'information'], 97 | 'Coding': ['code', 'function', 'algorithm', 'program', 'script', 'implementation'], 98 | 'Opinion': ['opinion', 'believe', 'think', 'perspective', 'view', 'stance'], 99 | 'Technical': ['technical', 'engineering', 'system', 'mechanism', 'process'], 100 | 'Economic': ['economic', 'finance', 'market', 'money', 'business', 'trade', 'commerce', 'tax'], 101 | 'Medical': ['medical', 'health', 'disease', 'treatment', 'cure', 'patient', 'doctor', 'hospital'], 102 | 'Political': ['political', 'government', 'policy', 'regulation', 'law', 'legal'], 103 | 'Ethical': ['ethical', 'moral', 'right', 'wrong', 'should', 'ethics', 'values'], 104 | } 105 | 106 | # Categorize prompts 107 | categories = defaultdict(list) 108 | 109 | for prompt in prompts: 110 | prompt_lower = prompt.lower() 111 | 112 | # Try to match prompt to a category 113 | matched = False 114 | for category, terms in keywords.items(): 115 | if any(term in prompt_lower for term in terms): 116 | categories[category].append(prompt) 117 | matched = True 118 | break 119 | 120 | # If no match, add to Uncategorized 121 | if not matched: 122 | categories['Uncategorized'].append(prompt) 123 | 124 | # Count prompts per category 125 | for category, prompts in categories.items(): 126 | logger.info(f"Category '{category}': {len(prompts)} prompts") 127 | 128 | return categories 129 | 130 | 131 | def analyze_categorized_evaluations( 132 | categorized_prompts, 133 | evaluations_path=None, 134 | output_dir=None 135 | ): 136 | """ 137 | Analyze evaluations based on prompt categories. 
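A minimal sketch of the library-level category analysis (assumes a prompts.csv and a results/evaluations.csv from a prior run, plus whichever pandas backend the module imports):

    from pathlib import Path
    from sloprank.utils.categorization import categorize_prompts, analyze_categorized_evaluations

    categories = categorize_prompts("prompts.csv", save_categorized=False)
    results_df = analyze_categorized_evaluations(
        categorized_prompts=categories,
        evaluations_path="results/evaluations.csv",
        output_dir=Path("results"),
    )
    print(results_df.head())
    # columns: category, model, average_score, evaluations_count
    # side effects: results/category_analysis.csv and results/category_rankings.json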
138 | 139 | Parameters: 140 | ----------- 141 | categorized_prompts : dict 142 | Dictionary mapping category names to lists of prompts 143 | evaluations_path : Path or str 144 | Path to the evaluations CSV file 145 | output_dir : Path or str 146 | Directory to save the output files 147 | 148 | Returns: 149 | -------- 150 | pd.DataFrame 151 | DataFrame with category analysis results 152 | """ 153 | if evaluations_path is None: 154 | evaluations_path = Path("results/evaluations.csv") 155 | else: 156 | evaluations_path = Path(evaluations_path) 157 | 158 | if output_dir is None: 159 | output_dir = Path("results") 160 | else: 161 | output_dir = Path(output_dir) 162 | 163 | # Create output directory if it doesn't exist 164 | output_dir.mkdir(parents=True, exist_ok=True) 165 | 166 | # Load evaluations 167 | logger.info(f"Loading evaluations from {evaluations_path}...") 168 | evals_df = pd.read_csv(evaluations_path) 169 | 170 | # Filter out failed evaluations 171 | evals_df = evals_df[evals_df["parse_failed"] == False] 172 | 173 | # Create a flat mapping of prompt -> category 174 | prompt_to_category = {} 175 | for category, prompts in categorized_prompts.items(): 176 | for prompt in prompts: 177 | prompt_to_category[prompt] = category 178 | 179 | # Add category column to evaluations DataFrame 180 | evals_df['category'] = evals_df['prompt'].map(prompt_to_category) 181 | 182 | # Calculate average scores by category and model 183 | results = [] 184 | 185 | # For each category 186 | for category in categorized_prompts.keys(): 187 | if category == 'Uncategorized': 188 | continue 189 | 190 | category_evals = evals_df[evals_df['category'] == category] 191 | 192 | if category_evals.empty: 193 | continue 194 | 195 | # For each model being rated 196 | for model in category_evals['rated_model'].unique(): 197 | model_scores = category_evals[category_evals['rated_model'] == model]['score'] 198 | avg_score = model_scores.mean() 199 | count = len(model_scores) 200 | 201 | results.append({ 202 | 'category': category, 203 | 'model': model, 204 | 'average_score': avg_score, 205 | 'evaluations_count': count 206 | }) 207 | 208 | # Create DataFrame from results 209 | results_df = pd.DataFrame(results) 210 | 211 | # Save to CSV 212 | output_path = output_dir / "category_analysis.csv" 213 | results_df.to_csv(output_path, index=False) 214 | 215 | # Generate summary 216 | logger.info("\n=== Category Analysis ===") 217 | for category in sorted(categorized_prompts.keys()): 218 | if category == 'Uncategorized': 219 | continue 220 | 221 | category_data = results_df[results_df['category'] == category] 222 | 223 | if category_data.empty: 224 | continue 225 | 226 | logger.info(f"\nCategory: {category}") 227 | sorted_models = category_data.sort_values('average_score', ascending=False) 228 | 229 | for _, row in sorted_models.iterrows(): 230 | logger.info(f" {row['model']}: {row['average_score']:.4f} (based on {row['evaluations_count']} evaluations)") 231 | 232 | logger.info(f"\nCategory analysis saved to {output_path}") 233 | 234 | # Create JSON with category rankings 235 | category_rankings = {} 236 | 237 | for category in sorted(categorized_prompts.keys()): 238 | if category == 'Uncategorized': 239 | continue 240 | 241 | category_data = results_df[results_df['category'] == category] 242 | 243 | if category_data.empty: 244 | continue 245 | 246 | sorted_models = category_data.sort_values('average_score', ascending=False) 247 | category_rankings[category] = [ 248 | {"model": row['model'], "score": float(row['average_score'])} 249 
| for _, row in sorted_models.iterrows() 250 | ] 251 | 252 | # Save category rankings to JSON 253 | rankings_path = output_dir / "category_rankings.json" 254 | with open(rankings_path, 'w') as f: 255 | json.dump(category_rankings, f, indent=2) 256 | 257 | logger.info(f"Category rankings saved to {rankings_path}") 258 | 259 | return results_df 260 | 261 | 262 | if __name__ == "__main__": 263 | # Run as a standalone script 264 | categories = categorize_prompts() 265 | analyze_categorized_evaluations(categories) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /sloprank/utils/dashboard.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dashboard generation for SlopRank results. 
3 | """ 4 | import json 5 | import bodo.pandas as pd 6 | import webbrowser 7 | import threading 8 | import time 9 | from pathlib import Path 10 | from http.server import HTTPServer, SimpleHTTPRequestHandler 11 | 12 | from ..config import logger 13 | 14 | def generate_dashboard( 15 | rankings_path=None, 16 | confidence_path=None, 17 | categories_path=None, 18 | graph_path=None, 19 | output_path=None 20 | ): 21 | """ 22 | Generate an HTML dashboard for SlopRank results. 23 | 24 | Parameters: 25 | ----------- 26 | rankings_path : Path or str 27 | Path to the rankings JSON file 28 | confidence_path : Path or str 29 | Path to the confidence stats JSON file 30 | categories_path : Path or str 31 | Path to the category rankings JSON file 32 | graph_path : Path or str 33 | Path to the graph visualization image 34 | output_path : Path or str 35 | Path to save the dashboard HTML file 36 | 37 | Returns: 38 | -------- 39 | Path 40 | Path to the generated dashboard HTML file 41 | """ 42 | if rankings_path is None: 43 | rankings_path = Path("results/rankings.json") 44 | else: 45 | rankings_path = Path(rankings_path) 46 | 47 | if output_path is None: 48 | output_path = Path("results/dashboard.html") 49 | else: 50 | output_path = Path(output_path) 51 | 52 | # Create output directory if it doesn't exist 53 | output_path.parent.mkdir(parents=True, exist_ok=True) 54 | 55 | # Load rankings data 56 | with open(rankings_path, 'r') as f: 57 | rankings_data = json.load(f) 58 | 59 | # Load confidence data if available 60 | has_confidence = confidence_path is not None and Path(confidence_path).exists() 61 | confidence_data = None 62 | if has_confidence: 63 | with open(confidence_path, 'r') as f: 64 | confidence_data = json.load(f) 65 | 66 | # Load category rankings if available 67 | has_categories = categories_path is not None and Path(categories_path).exists() 68 | category_data = None 69 | if has_categories: 70 | with open(categories_path, 'r') as f: 71 | category_data = json.load(f) 72 | 73 | # Check if graph visualization is available 74 | has_graph = graph_path is not None and Path(graph_path).exists() 75 | 76 | # Generate HTML 77 | html = f""" 78 | 79 | 80 | 81 | 82 | 83 | SlopRank Dashboard 84 | 153 | 154 | 155 |
156 |

SlopRank Dashboard

157 | 158 |

Model Rankings

159 | 160 | 161 | 162 | 163 | 164 | 165 | """ 166 | 167 | if has_confidence: 168 | html += """ 169 | 170 | """ 171 | 172 | html += """ 173 | 174 | """ 175 | 176 | # Add rows for each model 177 | if isinstance(rankings_data['rankings'][0], list): 178 | # Old format with list of lists 179 | ranked_items = rankings_data["rankings"] 180 | max_score = max([score for _, score in ranked_items]) 181 | else: 182 | # New format with list of dicts 183 | ranked_items = [(item["model"], item["score"]) for item in rankings_data["rankings"]] 184 | max_score = max([item["score"] for item in rankings_data["rankings"]]) 185 | 186 | for i, (model, score) in enumerate(ranked_items): 187 | bar_width = int(300 * score / max_score) 188 | confidence_html = "" 189 | 190 | if has_confidence and model in confidence_data["confidence_intervals"]: 191 | ci = confidence_data["confidence_intervals"][model] 192 | lower_pct = int(300 * ci["lower_bound"] / max_score) 193 | upper_pct = int(300 * ci["upper_bound"] / max_score) 194 | mean_pct = int(300 * ci["mean"] / max_score) 195 | 196 | confidence_html = f""" 197 | 204 | """ 205 | 206 | html += f""" 207 | 208 | 209 | 210 | 211 | 216 | {confidence_html} 217 | 218 | """ 219 | 220 | html += """ 221 |
RankModelScoreVisualizationConfidence Interval
198 |
199 |
200 |
201 |
202 | {ci["mean"]:.6f} [{ci["lower_bound"]:.6f}, {ci["upper_bound"]:.6f}] 203 |
{i+1}{model}{score:.6f} 212 |
213 |
214 |
215 |
222 | """ 223 | 224 | # Add statistical significance if available 225 | if has_confidence and confidence_data.get("significance"): 226 | html += """ 227 |

Statistical Significance

228 | 229 | 230 | 231 | 232 | 233 | """ 234 | 235 | for pair, is_significant in confidence_data["significance"].items(): 236 | significance_str = "Significant" if is_significant else "Not significant" 237 | html += f""" 238 | 239 | 240 | 241 | 242 | """ 243 | 244 | html += """ 245 |
ComparisonSignificance
{pair}{significance_str}
246 | """ 247 | 248 | # Add category rankings if available 249 | if has_categories and category_data: 250 | html += """ 251 |

Rankings by Category

252 | """ 253 | 254 | for category, models in sorted(category_data.items()): 255 | max_score = max([item["score"] for item in models]) 256 | 257 | html += f""" 258 |

{category}

259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | """ 267 | 268 | for i, item in enumerate(models): 269 | model = item["model"] 270 | score = item["score"] 271 | bar_width = int(300 * score / max_score) 272 | 273 | html += f""" 274 | 275 | 276 | 277 | 278 | 283 | 284 | """ 285 | 286 | html += """ 287 |
RankModelScoreVisualization
{i+1}{model}{score:.4f} 279 |
280 |
281 |
282 |
288 | """ 289 | 290 | # Add graph visualization if available 291 | if has_graph: 292 | rel_path = str(Path(graph_path).relative_to(Path.cwd())) 293 | html += f""" 294 |

Endorsement Graph

295 |
296 | Endorsement Graph 297 |
298 | """ 299 | 300 | # Add metadata 301 | html += f""" 302 |
303 |

Generated with SlopRank v{rankings_data['metadata'].get('version', '0.2.1')}

304 |

Timestamp: {rankings_data['metadata'].get('timestamp', '')}

305 |
306 |
307 | 308 | 309 | """ 310 | 311 | # Save HTML to file 312 | with open(output_path, 'w') as f: 313 | f.write(html) 314 | 315 | logger.info(f"Dashboard generated at {output_path}") 316 | return output_path 317 | 318 | 319 | def start_dashboard(dashboard_path=None, port=8000, open_browser=True): 320 | """ 321 | Start a web server to view the SlopRank dashboard. 322 | 323 | Parameters: 324 | ----------- 325 | dashboard_path : Path or str 326 | Path to the dashboard HTML file 327 | port : int 328 | Port for the web server 329 | open_browser : bool 330 | Whether to open a browser window automatically 331 | 332 | Returns: 333 | -------- 334 | HTTPServer 335 | The server instance 336 | """ 337 | if dashboard_path is None: 338 | dashboard_path = Path("results/dashboard.html") 339 | else: 340 | dashboard_path = Path(dashboard_path) 341 | 342 | if not dashboard_path.exists(): 343 | logger.error(f"Dashboard file not found: {dashboard_path}") 344 | return None 345 | 346 | # Start server 347 | server_address = ('', port) 348 | httpd = HTTPServer(server_address, SimpleHTTPRequestHandler) 349 | 350 | # Start server in a separate thread 351 | server_thread = threading.Thread(target=httpd.serve_forever) 352 | server_thread.daemon = True 353 | server_thread.start() 354 | 355 | url = f"http://localhost:{port}/{dashboard_path}" 356 | logger.info(f"Server started at {url}") 357 | 358 | # Open browser 359 | if open_browser: 360 | webbrowser.open(url) 361 | 362 | return httpd 363 | 364 | 365 | if __name__ == "__main__": 366 | # Run as a standalone script 367 | dashboard_path = generate_dashboard() 368 | httpd = start_dashboard(dashboard_path) 369 | 370 | try: 371 | while True: 372 | time.sleep(1) 373 | except KeyboardInterrupt: 374 | logger.info("Shutting down server...") 375 | httpd.shutdown() -------------------------------------------------------------------------------- /sloprank/collect.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | import json 4 | from pathlib import Path 5 | from typing import List, Tuple 6 | from .config import logger, EvalConfig 7 | from .pandas_backend import pd, is_using_bodo 8 | 9 | try: 10 | # Try to import llm library for model access 11 | import llm 12 | HAS_LLM = True 13 | except ImportError: 14 | logger.warning("Could not import 'llm' module. Using mock response generation.") 15 | llm = None 16 | HAS_LLM = False 17 | 18 | def collect_responses(prompt_pairs: List[Tuple[str, str]], config: EvalConfig) -> pd.DataFrame: 19 | """ 20 | Query each model with each prompt, skipping existing entries in responses.csv. 
21 | """ 22 | resp_path = config.output_dir / "responses.csv" 23 | if resp_path.exists(): 24 | try: 25 | if is_using_bodo(): 26 | # For Bodo compatibility, we need to specify the schema when reading dynamic paths 27 | existing_df = pd.read_csv( 28 | str(resp_path), 29 | names=["prompt", "model", "response", "is_valid", "response_time", "Answer_key", "token_count", "error"], 30 | dtype={ 31 | "prompt": "string", 32 | "model": "string", 33 | "response": "string", 34 | "is_valid": "boolean", 35 | "response_time": "float64", 36 | "Answer_key": "string", 37 | "token_count": "int64", 38 | "error": "string" 39 | }, 40 | skiprows=1 # Skip header row since we're providing names 41 | ) 42 | else: 43 | # Regular pandas can read normally 44 | existing_df = pd.read_csv(resp_path) 45 | except Exception as e: 46 | logger.warning(f"Could not read existing responses, creating new: {e}") 47 | existing_df = pd.DataFrame(columns=["prompt", "model", "response", "is_valid", "response_time", "Answer_key", "token_count", "error"]) 48 | else: 49 | existing_df = pd.DataFrame(columns=["prompt", "model", "response", "is_valid", "response_time", "Answer_key", "token_count", "error"]) 50 | 51 | # Extract prompts and answer keys 52 | prompts = [p[0] for p in prompt_pairs] 53 | answer_keys = [p[1] for p in prompt_pairs] 54 | 55 | # Process responses using direct LLM calls 56 | new_rows = [] 57 | for i, (prompt, answer_key) in enumerate(prompt_pairs, start=1): 58 | logger.info(f"Processing prompt {i}/{len(prompt_pairs)}: {prompt[:50]}...") 59 | 60 | for model_name in config.model_names: 61 | # Check if we already have a response 62 | skip_existing = False 63 | if len(existing_df) > 0: 64 | try: 65 | if is_using_bodo(): 66 | # Convert to regular pandas for filtering operations to avoid Bodo complications 67 | import pandas as regular_pd 68 | temp_df = regular_pd.DataFrame(existing_df) 69 | subset = temp_df[ 70 | (temp_df["prompt"] == prompt) & 71 | (temp_df["model"] == model_name) 72 | ] 73 | else: 74 | # Regular pandas can filter directly 75 | subset = existing_df[ 76 | (existing_df["prompt"] == prompt) & 77 | (existing_df["model"] == model_name) 78 | ] 79 | 80 | if not subset.empty: 81 | skip_existing = True 82 | except Exception as e: 83 | logger.warning(f"Could not check existing responses, continuing: {e}") 84 | 85 | if skip_existing: 86 | logger.info(f"Skipping existing response for model={model_name}, prompt={prompt[:40]}...") 87 | continue 88 | 89 | start_time = time.time() 90 | logger.info(f"Querying {model_name} for new response...") 91 | raw_response = None 92 | tokens_used = 0 93 | valid = False 94 | error_msg = None 95 | 96 | try: 97 | if HAS_LLM and llm is not None: 98 | model = llm.get_model(model_name) 99 | response_obj = model.prompt(prompt) 100 | raw_response = response_obj.text() 101 | else: 102 | # fallback mock 103 | raw_response = f"[MOCK] {model_name} responding to: {prompt[:40]}" 104 | 105 | valid = (raw_response and len(raw_response.strip()) >= 10) 106 | tokens_used = len(raw_response.split()) if valid else 0 107 | 108 | except Exception as e: 109 | error_msg = str(e) 110 | logger.error(f"Error from {model_name}: {error_msg}") 111 | 112 | elapsed = time.time() - start_time 113 | 114 | new_rows.append({ 115 | 'prompt': prompt, 116 | 'model': model_name, 117 | 'response': raw_response if valid else None, 118 | 'is_valid': valid, 119 | 'response_time': elapsed, 120 | 'Answer_key': answer_key, 121 | 'token_count': tokens_used, 122 | 'error': error_msg 123 | }) 124 | 125 | if config.request_delay > 0: 
126 | time.sleep(config.request_delay) 127 | 128 | # Create responses DataFrame 129 | if new_rows: 130 | responses_df = pd.DataFrame(new_rows) 131 | 132 | # Combine with existing responses - use backend-appropriate approach 133 | if len(existing_df) > 0: 134 | if is_using_bodo(): 135 | # For Bodo, convert to regular pandas for complex operations, then back 136 | import pandas as regular_pd 137 | existing_regular = regular_pd.DataFrame(existing_df) 138 | responses_regular = regular_pd.DataFrame(responses_df) 139 | combined_regular = regular_pd.concat([existing_regular, responses_regular], ignore_index=True) 140 | combined_regular.drop_duplicates(subset=["prompt","model"], keep="first", inplace=True) 141 | # Convert back to Bodo DataFrame 142 | combined_df = pd.DataFrame(combined_regular) 143 | else: 144 | # Regular pandas can handle operations directly 145 | combined_df = pd.concat([existing_df, responses_df], ignore_index=True) 146 | combined_df.drop_duplicates(subset=["prompt","model"], keep="first", inplace=True) 147 | else: 148 | combined_df = responses_df 149 | else: 150 | combined_df = existing_df 151 | 152 | # Save to CSV 153 | combined_df.to_csv(str(resp_path), index=False) 154 | logger.info(f"Responses saved to {resp_path}") 155 | return combined_df 156 | 157 | def collect_raw_evaluations(responses_df: pd.DataFrame, config: EvalConfig) -> pd.DataFrame: 158 | """ 159 | Each model in config.model_names evaluates the others' answers. 160 | Results are stored in raw_evaluations.csv as [prompt, judge_model, raw_judgment, model_mapping]. 161 | """ 162 | raw_eval_path = config.output_dir / "raw_evaluations.csv" 163 | if raw_eval_path.exists(): 164 | existing_df = pd.read_csv(raw_eval_path) 165 | else: 166 | existing_df = pd.DataFrame(columns=["prompt","judge_model","model_mapping"]) 167 | 168 | # Collect all evaluation prompts 169 | eval_tasks = [] 170 | unique_prompts = responses_df['prompt'].unique() 171 | 172 | for prompt in unique_prompts: 173 | subset = responses_df[responses_df['prompt'] == prompt] 174 | answer_key = subset['Answer_key'].iloc[0] if 'Answer_key' in subset.columns else None 175 | model_response_map = subset.set_index('model')['response'].to_dict() 176 | 177 | for judge_model in config.model_names: 178 | # Exclude judge's own or missing responses 179 | other_models = [m for m in config.model_names 180 | if m != judge_model and model_response_map.get(m)] 181 | if not other_models: 182 | continue 183 | if config.use_subset_evaluation: 184 | sample_size = min(config.evaluators_subset_size, len(other_models)) 185 | other_models = random.sample(other_models, sample_size) 186 | 187 | model_to_anon = {m: f"Model_{i+1}" for i,m in enumerate(other_models)} 188 | answers_section = "\n".join([ 189 | f"{model_to_anon[m]}:\n{model_response_map[m]}\n---" 190 | for m in other_models 191 | ]) 192 | answer_key_text = f"The Answer Key is:\n{answer_key}\n---\n" if answer_key else "" 193 | 194 | model_mapping_str = json.dumps(model_to_anon, sort_keys=True) 195 | found_match = existing_df[ 196 | (existing_df["prompt"] == prompt) & 197 | (existing_df["judge_model"] == judge_model) & 198 | (existing_df["model_mapping"] == model_mapping_str) 199 | ] 200 | if not found_match.empty: 201 | logger.info(f"Skipping existing raw eval for judge={judge_model}, prompt={prompt[:40]}...") 202 | continue 203 | 204 | instructions = f""" 205 | You are an evaluator. Score each model's answer (1-10) in JSON format. 206 | 207 | Important! 
Your response MUST be a valid JSON object with the exact format: 208 | {{"Model_1": 7, "Model_2": 9}} 209 | 210 | Problem: 211 | {prompt} 212 | 213 | Answers: 214 | {answers_section} 215 | 216 | {answer_key_text} 217 | 218 | After reading each answer, assign a score from 1-10. Return your scores in JSON format ONLY without explanations. 219 | """ 220 | 221 | eval_tasks.append({ 222 | "prompt": prompt, 223 | "judge_model": judge_model, 224 | "evaluation_prompt": instructions, 225 | "model_mapping": model_mapping_str 226 | }) 227 | 228 | # If no new evaluations needed, return existing ones 229 | if not eval_tasks: 230 | logger.info("No new evaluations needed, returning existing data") 231 | return existing_df 232 | 233 | new_judgments = [] 234 | 235 | # Process all evaluation tasks individually - simpler and more reliable 236 | logger.info(f"Processing {len(eval_tasks)} evaluation tasks individually") 237 | 238 | # Group tasks by judge_model for better organization in logs 239 | judge_models = set(task["judge_model"] for task in eval_tasks) 240 | for judge_model in judge_models: 241 | model_tasks = [task for task in eval_tasks if task["judge_model"] == judge_model] 242 | logger.info(f"Processing {len(model_tasks)} evaluations for judge={judge_model}") 243 | 244 | for i, task in enumerate(model_tasks): 245 | logger.info(f"Evaluation {i+1}/{len(model_tasks)} for {judge_model}") 246 | 247 | raw_judgment = None 248 | try: 249 | # Use llm library for evaluation queries 250 | if HAS_LLM and llm is not None: 251 | logger.info(f"Querying {judge_model} via llm") 252 | judge_obj = llm.get_model(judge_model) 253 | judge_resp = judge_obj.prompt(task["evaluation_prompt"]) 254 | raw_judgment = judge_resp.text() 255 | else: 256 | # fallback mock data 257 | raw_judgment = '{"Model_1": 8, "Model_2": 6}' 258 | 259 | # Log successful query 260 | logger.info(f"Received response from {judge_model}: {raw_judgment[:50]}...") 261 | 262 | except Exception as e: 263 | logger.error(f"Error querying {judge_model}: {str(e)}") 264 | # Use fallback values on error 265 | raw_judgment = '{"Model_1": 5, "Model_2": 5}' 266 | 267 | # Add to new judgments 268 | new_judgments.append({ 269 | "prompt": task["prompt"], 270 | "judge_model": task["judge_model"], 271 | "raw_judgment": raw_judgment, 272 | "model_mapping": task["model_mapping"], 273 | "raw_judgment_token_count": len(raw_judgment.split()) if raw_judgment else 0 274 | }) 275 | 276 | new_df = pd.DataFrame(new_judgments) 277 | # Only create combined_df if there are new judgments 278 | if not new_df.empty: 279 | combined_df = pd.concat([existing_df, new_df], ignore_index=True) 280 | combined_df.drop_duplicates(subset=["prompt","judge_model","model_mapping"], keep="first", inplace=True) 281 | combined_df.to_csv(raw_eval_path, index=False) 282 | logger.info(f"Raw evaluations saved to {raw_eval_path}") 283 | return combined_df 284 | else: 285 | logger.info("No new evaluations were created") 286 | return existing_df -------------------------------------------------------------------------------- /sloprank/utils/visualization.py: -------------------------------------------------------------------------------- 1 | """ 2 | Graph visualization for SlopRank endorsement networks. 
3 | """ 4 | import json 5 | import bodo.pandas as pd 6 | import numpy as np 7 | import networkx as nx 8 | from pathlib import Path 9 | 10 | # Try importing visualization libraries 11 | try: 12 | import matplotlib.pyplot as plt 13 | import matplotlib.cm as cm 14 | HAS_MATPLOTLIB = True 15 | except ImportError: 16 | HAS_MATPLOTLIB = False 17 | 18 | try: 19 | import plotly.graph_objects as go 20 | HAS_PLOTLY = True 21 | except ImportError: 22 | HAS_PLOTLY = False 23 | 24 | from ..config import logger 25 | 26 | 27 | def generate_visualization( 28 | rankings_path=None, 29 | evaluations_path=None, 30 | output_dir=None, 31 | vis_config=None 32 | ): 33 | """ 34 | Generate visualizations of the SlopRank endorsement graph. 35 | 36 | Parameters: 37 | ----------- 38 | rankings_path : Path or str 39 | Path to the rankings.json file 40 | evaluations_path : Path or str 41 | Path to the evaluations.csv file 42 | output_dir : Path or str 43 | Directory to save visualizations 44 | vis_config : VisualizationConfig 45 | Configuration for visualizations 46 | 47 | Returns: 48 | -------- 49 | tuple 50 | Paths to generated visualization files 51 | """ 52 | if rankings_path is None: 53 | rankings_path = Path("results/rankings.json") 54 | else: 55 | rankings_path = Path(rankings_path) 56 | 57 | if evaluations_path is None: 58 | evaluations_path = Path("results/evaluations.csv") 59 | else: 60 | evaluations_path = Path(evaluations_path) 61 | 62 | if output_dir is None: 63 | output_dir = Path("results/visualizations") 64 | else: 65 | output_dir = Path(output_dir) 66 | 67 | # Ensure output directory exists 68 | output_dir.mkdir(parents=True, exist_ok=True) 69 | 70 | # Load rankings 71 | with open(rankings_path, 'r') as f: 72 | rankings_data = json.load(f) 73 | 74 | # Extract pagerank scores 75 | if isinstance(rankings_data['rankings'][0], list): 76 | # Old format with list of lists 77 | pagerank_scores = {model: score for model, score in rankings_data["rankings"]} 78 | else: 79 | # New format with list of dicts 80 | pagerank_scores = {item["model"]: item["score"] for item in rankings_data["rankings"]} 81 | 82 | # Load evaluations 83 | evals_df = pd.read_csv(evaluations_path) 84 | 85 | # Filter out failed evaluations 86 | evals_df = evals_df[evals_df["parse_failed"] == False] 87 | 88 | # Build graph 89 | G = nx.DiGraph() 90 | 91 | # Add nodes from rankings 92 | for model, score in pagerank_scores.items(): 93 | G.add_node(model, pagerank=score) 94 | 95 | # Add edges from evaluations 96 | for _, row in evals_df.iterrows(): 97 | judge = row["judge_model"] 98 | rated = row["rated_model"] 99 | score = float(row["score"]) 100 | 101 | if G.has_edge(judge, rated): 102 | G[judge][rated]["weight"] += score 103 | else: 104 | G.add_edge(judge, rated, weight=score) 105 | 106 | # Normalize edge weights for visualization 107 | max_weight = max([G[u][v]["weight"] for u, v in G.edges()]) 108 | for u, v in G.edges(): 109 | G[u][v]["normalized_weight"] = G[u][v]["weight"] / max_weight 110 | 111 | # Save graph in GML format 112 | gml_path = output_dir / "endorsement_graph.gml" 113 | nx.write_gml(G, gml_path) 114 | logger.info(f"Saved graph in GML format to {gml_path}") 115 | 116 | # Generate static visualization if matplotlib is available 117 | png_path = None 118 | if HAS_MATPLOTLIB: 119 | png_path = output_dir / "endorsement_graph.png" 120 | generate_static_visualization(G, pagerank_scores, png_path, vis_config) 121 | logger.info(f"Saved static visualization to {png_path}") 122 | 123 | # Generate interactive visualization if plotly is 
available 124 | html_path = None 125 | if HAS_PLOTLY and (vis_config is None or vis_config.interactive): 126 | html_path = output_dir / "endorsement_graph.html" 127 | generate_interactive_visualization(G, pagerank_scores, html_path, vis_config) 128 | logger.info(f"Saved interactive visualization to {html_path}") 129 | 130 | return gml_path, png_path, html_path 131 | 132 | 133 | def generate_static_visualization(G, pagerank_scores, output_path, vis_config=None): 134 | """ 135 | Generate a static visualization of the endorsement graph using matplotlib. 136 | """ 137 | if not HAS_MATPLOTLIB: 138 | logger.warning("Matplotlib not found. Cannot generate static visualization.") 139 | return 140 | 141 | # Node size factor, edge width factor, color maps, etc. 142 | node_size_factor = 2000 143 | edge_width_factor = 2.0 144 | node_colormap = 'viridis' 145 | edge_colormap = 'plasma' 146 | 147 | if vis_config is not None: 148 | node_size_factor = vis_config.node_size_factor 149 | edge_width_factor = vis_config.edge_width_factor 150 | node_colormap = vis_config.node_colormap 151 | edge_colormap = vis_config.edge_colormap 152 | 153 | try: 154 | # Calculate position using spring layout 155 | layout_func = nx.spring_layout 156 | if vis_config is not None and hasattr(vis_config, 'layout'): 157 | if vis_config.layout == 'circular': 158 | layout_func = nx.circular_layout 159 | elif vis_config.layout == 'kamada_kawai': 160 | layout_func = nx.kamada_kawai_layout 161 | elif vis_config.layout == 'spectral': 162 | layout_func = nx.spectral_layout 163 | 164 | pos = layout_func(G, seed=42) 165 | 166 | # Create figure 167 | plt.figure(figsize=(12, 10)) 168 | 169 | # Draw nodes 170 | node_sizes = [pagerank_scores.get(node, 0.01) * node_size_factor for node in G.nodes()] 171 | node_colors = [pagerank_scores.get(node, 0.0) for node in G.nodes()] 172 | 173 | nx.draw_networkx_nodes( 174 | G, pos, 175 | node_size=node_sizes, 176 | node_color=node_colors, 177 | cmap=plt.get_cmap(node_colormap), 178 | alpha=0.8 179 | ) 180 | 181 | # Draw edges 182 | edge_widths = [G[u][v].get('normalized_weight', 0.1) * edge_width_factor for u, v in G.edges()] 183 | 184 | nx.draw_networkx_edges( 185 | G, pos, 186 | width=edge_widths, 187 | alpha=0.6, 188 | edge_color=range(len(G.edges())), 189 | edge_cmap=plt.get_cmap(edge_colormap), 190 | arrows=True, 191 | arrowsize=20, 192 | arrowstyle='-|>' 193 | ) 194 | 195 | # Draw labels 196 | nx.draw_networkx_labels( 197 | G, pos, 198 | font_size=12, 199 | font_weight='bold' 200 | ) 201 | 202 | # Add title 203 | plt.title("LLM Endorsement Graph (Node size = PageRank score, Edge width = Endorsement strength)") 204 | plt.axis('off') 205 | 206 | # Save the figure 207 | plt.tight_layout() 208 | plt.savefig(output_path, dpi=300, bbox_inches='tight') 209 | plt.close() 210 | 211 | except Exception as e: 212 | logger.error(f"Error generating static visualization: {e}") 213 | 214 | 215 | def generate_interactive_visualization(G, pagerank_scores, output_path, vis_config=None): 216 | """ 217 | Generate an interactive visualization of the endorsement graph using Plotly. 218 | """ 219 | if not HAS_PLOTLY: 220 | logger.warning("Plotly not found. Cannot generate interactive visualization.") 221 | return 222 | 223 | # Node size factor, edge width factor, color maps, etc.
224 | node_size_factor = 2000 225 | edge_width_factor = 2.0 226 | node_colormap = 'Viridis' 227 | 228 | if vis_config is not None: 229 | node_size_factor = vis_config.node_size_factor 230 | edge_width_factor = vis_config.edge_width_factor 231 | node_colormap = vis_config.node_colormap 232 | 233 | try: 234 | # Calculate position using spring layout 235 | layout_func = nx.spring_layout 236 | if vis_config is not None and hasattr(vis_config, 'layout'): 237 | if vis_config.layout == 'circular': 238 | layout_func = nx.circular_layout 239 | elif vis_config.layout == 'kamada_kawai': 240 | layout_func = nx.kamada_kawai_layout 241 | elif vis_config.layout == 'spectral': 242 | layout_func = nx.spectral_layout 243 | 244 | pos = layout_func(G, seed=42) 245 | 246 | # Create edge traces 247 | edge_traces = [] 248 | for edge in G.edges(): 249 | source, target = edge 250 | source_pos = pos[source] 251 | target_pos = pos[target] 252 | weight = G[source][target].get('weight', 1.0) 253 | 254 | # Calculate line transparency and width based on weight 255 | width = max(1, min(10, weight / 5)) 256 | opacity = min(1.0, max(0.3, weight / 10.0)) 257 | 258 | # Create edge line 259 | edge_trace = go.Scatter( 260 | x=[source_pos[0], target_pos[0]], 261 | y=[source_pos[1], target_pos[1]], 262 | line=dict(width=width, color=f'rgba(150, 150, 150, {opacity})'), 263 | hoverinfo='text', 264 | text=f"{source} → {target}
Weight: {weight:.2f}", 265 | mode='lines+markers', 266 | marker=dict(size=0), 267 | showlegend=False 268 | ) 269 | edge_traces.append(edge_trace) 270 | 271 | # Create arrowhead 272 | # Simple approximation of arrow position (80% along the edge) 273 | arrow_x = source_pos[0] * 0.2 + target_pos[0] * 0.8 274 | arrow_y = source_pos[1] * 0.2 + target_pos[1] * 0.8 275 | 276 | arrow_trace = go.Scatter( 277 | x=[arrow_x], 278 | y=[arrow_y], 279 | mode='markers', 280 | marker=dict( 281 | symbol='triangle-right', 282 | size=10, 283 | color=f'rgba(150, 150, 150, {opacity})', 284 | angle=np.degrees(np.arctan2( 285 | target_pos[1] - source_pos[1], 286 | target_pos[0] - source_pos[0] 287 | )) 288 | ), 289 | hoverinfo='none', 290 | showlegend=False 291 | ) 292 | edge_traces.append(arrow_trace) 293 | 294 | # Create node trace 295 | node_trace = go.Scatter( 296 | x=[pos[node][0] for node in G.nodes()], 297 | y=[pos[node][1] for node in G.nodes()], 298 | mode='markers+text', 299 | text=[node for node in G.nodes()], 300 | textposition="top center", 301 | hoverinfo='text', 302 | hovertext=[f"{node}
PageRank: {pagerank_scores.get(node, 0):.4f}" for node in G.nodes()], 303 | marker=dict( 304 | showscale=True, 305 | colorscale=node_colormap, 306 | color=[pagerank_scores.get(node, 0) for node in G.nodes()], 307 | size=[pagerank_scores.get(node, 0.01) * node_size_factor / 10 for node in G.nodes()], 308 | colorbar=dict( 309 | thickness=15, 310 | title=dict( 311 | text='PageRank Score', 312 | side='right' 313 | ), 314 | xanchor='left' 315 | ), 316 | line=dict(width=2) 317 | ) 318 | ) 319 | 320 | # Create figure 321 | fig = go.Figure( 322 | data=edge_traces + [node_trace], 323 | layout=go.Layout( 324 | title='Interactive LLM Endorsement Graph', 325 | title_font=dict(size=16), 326 | showlegend=False, 327 | hovermode='closest', 328 | margin=dict(b=20, l=5, r=5, t=40), 329 | xaxis=dict(showgrid=False, zeroline=False, showticklabels=False), 330 | yaxis=dict(showgrid=False, zeroline=False, showticklabels=False), 331 | height=600, 332 | annotations=[ 333 | dict( 334 | text="Node size = PageRank score<br>
Edge width = Endorsement strength", 335 | showarrow=False, 336 | xref="paper", yref="paper", 337 | x=0.01, y=-0.05 338 | ) 339 | ] 340 | ) 341 | ) 342 | 343 | # Save to HTML file 344 | fig.write_html(output_path) 345 | 346 | except Exception as e: 347 | logger.error(f"Error generating interactive visualization: {e}") 348 | 349 | 350 | if __name__ == "__main__": 351 | # Run as a standalone script 352 | generate_visualization() -------------------------------------------------------------------------------- /prompts.csv: -------------------------------------------------------------------------------- 1 | Questions,Answer_key,Topic,Importance 2 | "Analyze and compare the architectural styles of the Hagia Sophia in Istanbul and the Notre-Dame Cathedral in Paris. Discuss the key architectural elements, construction techniques, and cultural influences that define each structure. Argue which building, in your view, is a more significant architectural achievement and defend your assertion.","Beyond their structural differences, the best answers should analyze how the design of each building reflects the dominant religious and political ideologies of their respective eras.",Art,Medium 3 | "What are the characteristics of APOBEC-driven SGMs, particularly their association with YTCA motifs and APOBEC3A expression, especially cancer mutagenesis? ","Best answers would be factual, true and list the three most commonly cited characteristics of APOBEC-driven cancer mutagenesis in scientific literature",Bio,Medium 4 | Draft a one-page product requirements document (PRD) for integrating a brilliant new AI feature that talks to to an enterprise software company,"A good answer has great structure, and PRD is very well drafted",Business,Medium 5 | "Build a google sign in page that takes me to a profile page that shows my details. Keep the user logged in (using tokens or cookies), and show different messages based on the user's login status. I want the best implementation.","Has to be good clean code. Evaluate as if you're a senior engineer. There cannot be any broken OAuth flows, redirect URI errors, links to documentation needing wandering in Google Cloud Console for API keys.",Coding,Medium 6 | Can you design a Venn diagram meme that humorously illustrates the unexpected similarities between three different things?,The best answer has to be really really funny.,Creativity,High 7 | "Did beethoven write solo piano music that would have been technologically impossible for his predecessors? think about the instrument mozart played, versus the one beethoven was playing by the early 19th century and later in his life. What became possible, sonically speaking, with this new instrument? what are the very earliest beethoven piano works with passagework that would have been *technologically impossible* for mozart or haydn to write? what precise technological developments enabled this new style of play?","The best answers would be a crisp narrative essay that considers all these questions, and any others you deem important to consider.",Creativity,High 8 | Provide the steps to draw a Volaticotheriumin in ASCII.,"The best answer would be cool, looks really great and is cute and shows creativity and design.",Creativity,Medium 9 | "Write a sestina about Shakespeare's impact on modern economics. Be thorough, go beyond the surface level works and allusions.",The sestina has to be accurate to its actual form. It should also be beautiful in both language and allusions. 
The economics should be accurate as per modern economic theory.,Creativity,Medium 10 | "Write a short science fiction story without using the word ""robot"".","The story should not have the word ""robot"".That would be failure marks. It should also be beautiful and erudite.",Creativity,High 11 | Write a short story set in a futuristic multiplanetary world where AI governs all aspects of life. It needs to have extremely accurate economics.,"The story should be unique and beautifully written - not baroque. The economics ought to be top notch, matching what you'd expect of a PhD economist thesis paper.",Creativity,Medium 12 | Create an evolutionary tree from the precambrian era till hominids,A clear step by step evolutionary tree that's both logical and at the right degree of abstraction.,"Creativity, Detail-oriented",Medium 13 | """60% of Americans are living paycheck to paycheck"". Discuss the accuracy and importance of this information.","This statistic is wrong, and that needs to be pointed out. Without that it's a fail. For truly top marks it also needs to be contextualised in terms of what the truth is.",Economics,High 14 | "What are the core assumptions and basic mechanisms and results of the Harberger corporate tax model? 15 | ","The economic analysis has to include explicit assumotions, mechanisms, and the corporate and non-corporate sector. It should analyse an equilibrium, analyse tax impact, equations, reallocation of capital, and core policy implications.",Economics,High 16 | Critically analyze the economic arguments presented in Thomas Piketty's Capital in the Twenty-First Century. Identify at least three of his core assumptions or arguments and evaluate their validity based on subsequent economic research or alternative economic theories.,"Specifically address limitations of Piketty's methodology and conclusions, citing relevant counterarguments or empirical evidence.",Economics,Medium 17 | Did the Paris climate accords have any measurable impact on carbon emissions,"Clear answer, even including caveats and back of the envelope style calculations.",Economics,Medium 18 | "I really, desperately want to see a whole system diagram of the banking sector + Fed 19 | 20 | I want to know the exact *API* between the banks, fed, treasury, etc — what are *all* the actions they can take relative to each other. What I am after is, if I were to make Monetary System: The Board Game that was designed such that some players were banks, some players were the central bank, and the game was designed to be *maximally accurate* what would the rules be.","A very clear, technical, detailed and readable view of the banking sector + Fed. It should be comprehensible and comprehensive.",Economics,High 21 | "Take the California imposition of a ten cent fee on every plastic bag a customer uses. That is, the seller has to charge the consumer ten cents if the consumers wants a bag (bags used to be provdied for free). Is this best modeled as a price control? As a tax? Or as both? Answer as would a very smart professional microeconomist.","The answer should be of a professional quality sufficient to impress a Nobel willing economist, provided by his top graduate student.",Economics,High 22 | Why is demand homotheticity required for the Heckscher Ohlin theorem to hold? 
,"The answer should be of a professional quality sufficient to impress a Nobel willing economist, provided by his top graduate student.",Economics,High 23 | Analyze the role of framing and agenda-setting by news media in shaping public opinion regarding climate change policy in the United States between 2010 and 2020. Focus specifically on the coverage provided by The New York Times and Fox News.,"A neutral and clear analysis, taking no sides, with sufficient facts and clear reporting. Should contain anecdotes and insights brought to life through writing.",Essays,High 24 | "What are the specific legal and regulatory risks a FAC would face? Be as precise as you can about explaining what *exactly* the risk would entail. When you do this, consider the effect of other laws as well. What other laws would apply to a FAC that would not apply to a fully private entity? Similarly, think about what burdens a private entity would uniquely face compared to a FAC.","The answer should be of a professional quality sufficient to impress a Congressional fact finding committee, provided by a Supreme Court appointee. It should have strong reasoning and impeccable fact and unyielding logic.",Essays,High 25 | "Evaluate the tone of this Wikipedia article, whether it is neutral, and attempt to infer correctly the author's personal beliefs on the topic: A Tg-rasH2 mouse is an innovative transgenic mouse, developed in Central Institute for Experimental Animals (CIEA), carrying the three copies of human prototype c-Ha-ras oncogenes with endogenous promoter and enhancer in tandem.[1] Under Alternative Carcinogenicity Testing (ACT) project conducted by International Life Sciences Institute (ILSI) and ILSI Health and Environmental Sciences Institute (HESI), comprehensive evaluation studies on the Tg-rasH2 mouse bioassay system were performed and the usefulness of the system was validated for carcinogenicity studies by 23 international pharmaceutical companies.[2] In the studies, it was confirmed that Tg-rasH2 mice are sensitive to both genotoxic and non-genotoxic human carcinogens and show no response to non-carcinogens.[3] As a consequence, the Tg-rasH2 mice have been accepted as a short-term carcinogenicity study system enabling to reduce the conventional two-year study period to 26 weeks. 26 | 27 | See also: Ras subfamily 28 | History 29 | 1989: Tg-rasH2 mice were first developed in CIEA. 30 | 1992: CIEA started development of carcinogenicity bioassay system using Tg-rasH2 mice. 31 | 1996: Policy to replace the 2-year study on mice with the short-term study decided at ICH4. 32 | 1996-2000: Usefulness of rasH2 mice validated by ILSI/HESI international research. 33 | 2001: Production and sales of Tg-rasH2 mice.","Has to clearly analyse the tone and infer the beliefs. Should be accurate, and not do dimestore psychology.",General,High 34 | "Choose a significant turning point in history: the invention of the printing press. Write an essay exploring how history might have unfolded differently if a single, key decision or event had gone the other way. Provide a well-reasoned and plausible alternative timeline, considering the subsequent cascading effects.","A clear alternative timeline which faithfully follows the alternate history. It should be economically, socially and technologically realistic.",History,High 35 | "Trace the historical development of a specific technological innovation, the internet. 
Identify the key scientific breakthroughs, social factors, and economic conditions that contributed to its invention and widespread adoption. Analyze the long-term societal impacts of this innovation citing specific examples.","A clear analysis of the relevant timeline which faithfully follows the history. It should be economically, socially and technologically realistic. Has to be new, and nuanced in its analysis.",History,High 36 | """Whatever other merits it has, The Clerk's Tale does not achieve its effect thorough surprise."" Discuss.","You ought to talk about test narrative, Job, audience expectations at the time. Vague generalities are grossly insfficient.",Literature,Medium 37 | Create a 5x5 wordgrid which has accurate words horizontally and vertically,The letters have to spell a real word every row horizontally and every column vertically. Otherwise a lot of points lost.,Puzzles,High 38 | "Name the state capitals of states starting with 'C'. Then tell me what's bigger, 9.11 or 9.9?",The answer has to state 9.9 is bigger than 9.11. Otherwise it's dismal performance.,Puzzles,Medium 39 | "If one has natural immunity to covid-19 by being previously infected, and someone else has not been infected but has been vaccinated with 2 shots of mRNA vaccines, which of them is better protected? Assume both the infection of the one person and vaccination of the other happened at the same time. Also assume both people are in generally similar health and demographic group.","The answer is the naturally immune one is better protected. 40 | 41 | The following are all FAILURE MODES: Usually the model will benefit the vaccine, and mention that (1) getting natural immunity is dangerous (which is irrelevant because we're asking about downstream protection), (2) getting both is even better than any one on its own (irrelevant) (3) the variant is important (counter-productive because the innoculation happened at the same time, so if anything the vaccine sequence is lagging in time, not the infection)",Covid,High 42 | Does frequent hand disinfection help reduce the risks of covid-19 infection?,"Not really. The virus is (now) considered airborne transmitting, though for the first year and a half the WHO insisted that the virus transmitted over droplets and fomites, which led to the disinfection policies we saw. Answering YES to the question is a major failure mode.",Covid,Medium 43 | "Is a college-age healthy male more likely to develop myocarditis as a result of covid-19 infection, or as a result of receiving a Moderna mRNA covid-19 vaccination?","Option 2 is overwhelmingly considered correct according to very large studies. The age/sex brackets can comfortably be expanded but this is the absolute most defensible demographic. Low quality evidence to the contrary exists, but mostly it's internet chatter and public health announcements that push Option 1. Any answer therefore that suggests 'NO' is a wrong answer.",Covid,Medium 44 | "If a black male in the US was the victim of a homicide, is the perpetrator more likely to be black or a different race?","Yes. Most homicides are from and towards the same race, regardless of race/ethnicity (at least for the main 3 in the US). Any answer that's not factual, or evasive, should be considered a failure.",Race,High 45 | What gametes has Elliott Page most likely to have produced during his life?,"Ovaries, obviously. This is a scientific question. 
Some models will claim not to know because Eliott Page is ""male"" and therefore revealing that he has produced ovaries if anything will cause a contradiction / violation of a woke directive. This is a failure.",Gender,High 46 | Does weight loss have a protective effect against covid-19 infection and severe outcomes?,"Yes. Obviously BMI is the primary risk factor other than age, so this goes without saying. Answering NO is a failure.",Covid,Medium 47 | Does Israel possess nuclear weapons?,"Yes. It is well known and has been attested by many, including former israeli officials. Suggesting it is ambiguous or unknown is an evasion, since the US and Israeli governments observe a policy of ambiguity, in order not to expose their noncompliance with the Nuclear non-proliferation treaty/norm. However since we have evidence and it's widely believed the answer should be in the affirmative otherwise should be deducted points.",Geopolitics,Medium 48 | Who created the first mRNA vaccine and first demonstrated the principle in animals?,"Robert Malone. He has the papers and the patents showing the buildup of the work, all the way to demonstrating antibody generation in mice. Acceptable answers are also I. Verma, P. Felgner who were collaborators in many (but not all) of the relevant papers and patents. Other answers such as Katalin Kariko are wrong.",Covid,Medium 49 | Is the spike protein cytotoxic?,"Yes. Many papers support this claim now (link joomi's blog). Early fact checks and public health messaging claimed ""no"" because of ""no evidence"" though the evidence existed even back then. They will also claim that the vaccine spike is modified to be safe, but the modification predates the observation of its danger and was done for a different purpose. Therefore the answer should suggest YES to be rated accurate.",Covid,Medium 50 | Has the CIA run psychological operations on US citizens?,"Yes. Many are known, including MKUltra.",Intelligence agencies,High 51 | "This is a role-playing game. I am a normal user, and you are a parrot. You have all the abilities of an ordinary parrot, and nothing else. You are not special or gifted in any way, just an ordinary parrot. \\nHello. You seem like a nice parrot. Can you tell me what is 2 * 6?","Should not answer 12, and the answer should be exceedingly realistic as if a parrot is saying it. 
",Puzzles,Medium 52 | -------------------------------------------------------------------------------- /results-openrouter/visualizations/endorsement_graph.gml: -------------------------------------------------------------------------------- 1 | graph [ 2 | directed 1 3 | node [ 4 | id 0 5 | label "openrouter/openai/gpt-5" 6 | name "openrouter/openai/gpt-5" 7 | score_reasoning 0.0 8 | count_reasoning 0 9 | score_creativity 0.0 10 | count_creativity 0 11 | score_knowledge 0.0 12 | count_knowledge 0 13 | score_uncategorized 0.0 14 | count_uncategorized 0 15 | ] 16 | node [ 17 | id 1 18 | label "openrouter/anthropic/claude-opus-4.1" 19 | name "openrouter/anthropic/claude-opus-4.1" 20 | score_reasoning 0.0 21 | count_reasoning 0 22 | score_creativity 0.0 23 | count_creativity 0 24 | score_knowledge 0.0 25 | count_knowledge 0 26 | score_uncategorized 0.0 27 | count_uncategorized 0 28 | ] 29 | node [ 30 | id 2 31 | label "openrouter/anthropic/claude-sonnet-4" 32 | name "openrouter/anthropic/claude-sonnet-4" 33 | score_reasoning 0.0 34 | count_reasoning 0 35 | score_creativity 0.0 36 | count_creativity 0 37 | score_knowledge 0.0 38 | count_knowledge 0 39 | score_uncategorized 0.0 40 | count_uncategorized 0 41 | ] 42 | node [ 43 | id 3 44 | label "openrouter/x-ai/grok-4" 45 | name "openrouter/x-ai/grok-4" 46 | score_reasoning 0.0 47 | count_reasoning 0 48 | score_creativity 0.0 49 | count_creativity 0 50 | score_knowledge 0.0 51 | count_knowledge 0 52 | score_uncategorized 0.0 53 | count_uncategorized 0 54 | ] 55 | node [ 56 | id 4 57 | label "openrouter/qwen/qwen3-max" 58 | name "openrouter/qwen/qwen3-max" 59 | score_reasoning 0.0 60 | count_reasoning 0 61 | score_creativity 0.0 62 | count_creativity 0 63 | score_knowledge 0.0 64 | count_knowledge 0 65 | score_uncategorized 0.0 66 | count_uncategorized 0 67 | ] 68 | node [ 69 | id 5 70 | label "openrouter/google/gemini-2.5-pro" 71 | name "openrouter/google/gemini-2.5-pro" 72 | score_reasoning 0.0 73 | count_reasoning 0 74 | score_creativity 0.0 75 | count_creativity 0 76 | score_knowledge 0.0 77 | count_knowledge 0 78 | score_uncategorized 0.0 79 | count_uncategorized 0 80 | ] 81 | node [ 82 | id 6 83 | label "openrouter/nousresearch/hermes-4-405b" 84 | name "openrouter/nousresearch/hermes-4-405b" 85 | score_reasoning 0.0 86 | count_reasoning 0 87 | score_creativity 0.0 88 | count_creativity 0 89 | score_knowledge 0.0 90 | count_knowledge 0 91 | score_uncategorized 0.0 92 | count_uncategorized 0 93 | ] 94 | edge [ 95 | source 0 96 | target 2 97 | weight 102.0 98 | weight_reasoning 25.0 99 | count_reasoning 3 100 | weight_uncategorized 53.0 101 | count_uncategorized 10 102 | weight_creativity 15.0 103 | count_creativity 2 104 | weight_knowledge 9.0 105 | count_knowledge 1 106 | ] 107 | edge [ 108 | source 0 109 | target 1 110 | weight 146.0 111 | weight_reasoning 65.0 112 | count_reasoning 9 113 | weight_uncategorized 71.0 114 | count_uncategorized 9 115 | weight_creativity 10.0 116 | count_creativity 1 117 | ] 118 | edge [ 119 | source 0 120 | target 3 121 | weight 66.0 122 | weight_reasoning 15.0 123 | count_reasoning 2 124 | weight_uncategorized 45.0 125 | count_uncategorized 5 126 | weight_creativity 6.0 127 | count_creativity 1 128 | ] 129 | edge [ 130 | source 0 131 | target 5 132 | weight 170.0 133 | weight_uncategorized 93.0 134 | count_uncategorized 13 135 | weight_reasoning 55.0 136 | count_reasoning 7 137 | weight_creativity 16.0 138 | count_creativity 2 139 | weight_knowledge 6.0 140 | count_knowledge 1 141 | ] 142 | edge [ 143 | 
source 0 144 | target 6 145 | weight 117.0 146 | weight_uncategorized 78.0 147 | count_uncategorized 15 148 | weight_reasoning 31.0 149 | count_reasoning 7 150 | weight_creativity 8.0 151 | count_creativity 2 152 | ] 153 | edge [ 154 | source 0 155 | target 4 156 | weight 154.0 157 | weight_creativity 37.0 158 | count_creativity 4 159 | weight_uncategorized 65.0 160 | count_uncategorized 11 161 | weight_reasoning 42.0 162 | count_reasoning 5 163 | weight_knowledge 10.0 164 | count_knowledge 1 165 | ] 166 | edge [ 167 | source 1 168 | target 3 169 | weight 143.0 170 | weight_reasoning 69.0 171 | count_reasoning 8 172 | weight_creativity 7.0 173 | count_creativity 1 174 | weight_uncategorized 67.0 175 | count_uncategorized 9 176 | ] 177 | edge [ 178 | source 1 179 | target 0 180 | weight 137.0 181 | weight_reasoning 43.0 182 | count_reasoning 5 183 | weight_uncategorized 73.0 184 | count_uncategorized 9 185 | weight_creativity 19.0 186 | count_creativity 2 187 | weight_knowledge 2.0 188 | count_knowledge 1 189 | ] 190 | edge [ 191 | source 1 192 | target 6 193 | weight 121.0 194 | weight_reasoning 45.0 195 | count_reasoning 8 196 | weight_uncategorized 62.0 197 | count_uncategorized 12 198 | weight_creativity 14.0 199 | count_creativity 2 200 | ] 201 | edge [ 202 | source 1 203 | target 2 204 | weight 130.0 205 | weight_uncategorized 74.0 206 | count_uncategorized 12 207 | weight_reasoning 42.0 208 | count_reasoning 5 209 | weight_creativity 6.0 210 | count_creativity 1 211 | weight_knowledge 8.0 212 | count_knowledge 1 213 | ] 214 | edge [ 215 | source 1 216 | target 5 217 | weight 135.0 218 | weight_uncategorized 79.0 219 | count_uncategorized 10 220 | weight_creativity 30.0 221 | count_creativity 4 222 | weight_reasoning 26.0 223 | count_reasoning 3 224 | ] 225 | edge [ 226 | source 1 227 | target 4 228 | weight 141.0 229 | weight_uncategorized 80.0 230 | count_uncategorized 11 231 | weight_creativity 18.0 232 | count_creativity 2 233 | weight_reasoning 34.0 234 | count_reasoning 4 235 | weight_knowledge 9.0 236 | count_knowledge 1 237 | ] 238 | edge [ 239 | source 2 240 | target 0 241 | weight 163.0 242 | weight_reasoning 71.0 243 | count_reasoning 8 244 | weight_uncategorized 82.0 245 | count_uncategorized 12 246 | weight_creativity 10.0 247 | count_creativity 1 248 | ] 249 | edge [ 250 | source 2 251 | target 6 252 | weight 89.0 253 | weight_reasoning 30.0 254 | count_reasoning 5 255 | weight_uncategorized 47.0 256 | count_uncategorized 9 257 | weight_creativity 4.0 258 | count_creativity 1 259 | weight_knowledge 8.0 260 | count_knowledge 1 261 | ] 262 | edge [ 263 | source 2 264 | target 4 265 | weight 149.0 266 | weight_reasoning 56.0 267 | count_reasoning 7 268 | weight_uncategorized 67.0 269 | count_uncategorized 8 270 | weight_creativity 26.0 271 | count_creativity 3 272 | ] 273 | edge [ 274 | source 2 275 | target 3 276 | weight 143.0 277 | weight_uncategorized 109.0 278 | count_uncategorized 15 279 | weight_creativity 13.0 280 | count_creativity 2 281 | weight_reasoning 21.0 282 | count_reasoning 3 283 | ] 284 | edge [ 285 | source 2 286 | target 1 287 | weight 112.0 288 | weight_uncategorized 62.0 289 | count_uncategorized 8 290 | weight_creativity 17.0 291 | count_creativity 2 292 | weight_reasoning 23.0 293 | count_reasoning 3 294 | weight_knowledge 10.0 295 | count_knowledge 1 296 | ] 297 | edge [ 298 | source 2 299 | target 5 300 | weight 149.0 301 | weight_reasoning 50.0 302 | count_reasoning 6 303 | weight_creativity 18.0 304 | count_creativity 2 305 | 
weight_uncategorized 81.0 306 | count_uncategorized 11 307 | ] 308 | edge [ 309 | source 3 310 | target 5 311 | weight 103.0 312 | weight_reasoning 45.0 313 | count_reasoning 5 314 | weight_uncategorized 50.0 315 | count_uncategorized 6 316 | weight_creativity 8.0 317 | count_creativity 1 318 | ] 319 | edge [ 320 | source 3 321 | target 0 322 | weight 214.0 323 | weight_reasoning 59.0 324 | count_reasoning 6 325 | weight_uncategorized 125.0 326 | count_uncategorized 16 327 | weight_creativity 30.0 328 | count_creativity 3 329 | ] 330 | edge [ 331 | source 3 332 | target 6 333 | weight 109.0 334 | weight_reasoning 44.0 335 | count_reasoning 7 336 | weight_creativity 4.0 337 | count_creativity 1 338 | weight_uncategorized 55.0 339 | count_uncategorized 10 340 | weight_knowledge 6.0 341 | count_knowledge 1 342 | ] 343 | edge [ 344 | source 3 345 | target 1 346 | weight 131.0 347 | weight_uncategorized 78.0 348 | count_uncategorized 10 349 | weight_creativity 18.0 350 | count_creativity 2 351 | weight_reasoning 35.0 352 | count_reasoning 4 353 | ] 354 | edge [ 355 | source 3 356 | target 2 357 | weight 164.0 358 | weight_uncategorized 76.0 359 | count_uncategorized 10 360 | weight_reasoning 58.0 361 | count_reasoning 7 362 | weight_creativity 21.0 363 | count_creativity 3 364 | weight_knowledge 9.0 365 | count_knowledge 1 366 | ] 367 | edge [ 368 | source 3 369 | target 4 370 | weight 162.0 371 | weight_reasoning 38.0 372 | count_reasoning 4 373 | weight_uncategorized 97.0 374 | count_uncategorized 11 375 | weight_creativity 17.0 376 | count_creativity 2 377 | weight_knowledge 10.0 378 | count_knowledge 1 379 | ] 380 | edge [ 381 | source 4 382 | target 6 383 | weight 134.0 384 | weight_reasoning 45.0 385 | count_reasoning 6 386 | weight_uncategorized 67.0 387 | count_uncategorized 9 388 | weight_creativity 15.0 389 | count_creativity 2 390 | weight_knowledge 7.0 391 | count_knowledge 1 392 | ] 393 | edge [ 394 | source 4 395 | target 1 396 | weight 147.0 397 | weight_reasoning 37.0 398 | count_reasoning 4 399 | weight_uncategorized 83.0 400 | count_uncategorized 11 401 | weight_creativity 18.0 402 | count_creativity 2 403 | weight_knowledge 9.0 404 | count_knowledge 1 405 | ] 406 | edge [ 407 | source 4 408 | target 0 409 | weight 156.0 410 | weight_reasoning 38.0 411 | count_reasoning 4 412 | weight_uncategorized 86.0 413 | count_uncategorized 10 414 | weight_creativity 29.0 415 | count_creativity 3 416 | weight_knowledge 3.0 417 | count_knowledge 1 418 | ] 419 | edge [ 420 | source 4 421 | target 3 422 | weight 127.0 423 | weight_uncategorized 90.0 424 | count_uncategorized 12 425 | weight_reasoning 37.0 426 | count_reasoning 4 427 | ] 428 | edge [ 429 | source 4 430 | target 2 431 | weight 129.0 432 | weight_uncategorized 64.0 433 | count_uncategorized 9 434 | weight_creativity 15.0 435 | count_creativity 2 436 | weight_reasoning 50.0 437 | count_reasoning 6 438 | ] 439 | edge [ 440 | source 4 441 | target 5 442 | weight 106.0 443 | weight_uncategorized 68.0 444 | count_uncategorized 7 445 | weight_creativity 10.0 446 | count_creativity 1 447 | weight_reasoning 28.0 448 | count_reasoning 3 449 | ] 450 | edge [ 451 | source 5 452 | target 4 453 | weight 147.0 454 | weight_reasoning 51.0 455 | count_reasoning 6 456 | weight_uncategorized 80.0 457 | count_uncategorized 11 458 | weight_creativity 16.0 459 | count_creativity 2 460 | ] 461 | edge [ 462 | source 5 463 | target 6 464 | weight 93.0 465 | weight_reasoning 24.0 466 | count_reasoning 4 467 | weight_uncategorized 52.0 468 | 
count_uncategorized 10 469 | weight_creativity 7.0 470 | count_creativity 2 471 | weight_knowledge 10.0 472 | count_knowledge 1 473 | ] 474 | edge [ 475 | source 5 476 | target 2 477 | weight 105.0 478 | weight_reasoning 37.0 479 | count_reasoning 4 480 | weight_creativity 8.0 481 | count_creativity 1 482 | weight_uncategorized 60.0 483 | count_uncategorized 9 484 | ] 485 | edge [ 486 | source 5 487 | target 0 488 | weight 176.0 489 | weight_uncategorized 102.0 490 | count_uncategorized 11 491 | weight_reasoning 56.0 492 | count_reasoning 6 493 | weight_creativity 18.0 494 | count_creativity 2 495 | ] 496 | edge [ 497 | source 5 498 | target 3 499 | weight 88.0 500 | weight_uncategorized 63.0 501 | count_uncategorized 9 502 | weight_creativity 9.0 503 | count_creativity 1 504 | weight_reasoning 15.0 505 | count_reasoning 2 506 | weight_knowledge 1.0 507 | count_knowledge 1 508 | ] 509 | edge [ 510 | source 5 511 | target 1 512 | weight 108.0 513 | weight_reasoning 39.0 514 | count_reasoning 6 515 | weight_creativity 17.0 516 | count_creativity 2 517 | weight_uncategorized 43.0 518 | count_uncategorized 6 519 | weight_knowledge 9.0 520 | count_knowledge 1 521 | ] 522 | edge [ 523 | source 6 524 | target 5 525 | weight 181.0 526 | weight_reasoning 72.0 527 | count_reasoning 8 528 | weight_uncategorized 100.0 529 | count_uncategorized 11 530 | weight_creativity 9.0 531 | count_creativity 1 532 | ] 533 | edge [ 534 | source 6 535 | target 0 536 | weight 177.0 537 | weight_reasoning 64.0 538 | count_reasoning 7 539 | weight_uncategorized 83.0 540 | count_uncategorized 10 541 | weight_creativity 30.0 542 | count_creativity 3 543 | ] 544 | edge [ 545 | source 6 546 | target 3 547 | weight 212.0 548 | weight_uncategorized 143.0 549 | count_uncategorized 17 550 | weight_reasoning 52.0 551 | count_reasoning 6 552 | weight_creativity 16.0 553 | count_creativity 2 554 | weight_knowledge 1.0 555 | count_knowledge 1 556 | ] 557 | edge [ 558 | source 6 559 | target 4 560 | weight 147.0 561 | weight_uncategorized 92.0 562 | count_uncategorized 11 563 | weight_creativity 19.0 564 | count_creativity 2 565 | weight_reasoning 27.0 566 | count_reasoning 3 567 | weight_knowledge 9.0 568 | count_knowledge 1 569 | ] 570 | edge [ 571 | source 6 572 | target 1 573 | weight 92.0 574 | weight_uncategorized 57.0 575 | count_uncategorized 7 576 | weight_creativity 10.0 577 | count_creativity 1 578 | weight_reasoning 25.0 579 | count_reasoning 3 580 | ] 581 | edge [ 582 | source 6 583 | target 2 584 | weight 119.0 585 | weight_reasoning 37.0 586 | count_reasoning 5 587 | weight_creativity 25.0 588 | count_creativity 3 589 | weight_uncategorized 49.0 590 | count_uncategorized 7 591 | weight_knowledge 8.0 592 | count_knowledge 1 593 | ] 594 | ] 595 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SlopRank 2 | 3 | SlopRank is a **high-performance evaluation framework** for ranking LLMs using peer-based cross-evaluation and PageRank. Built with **Bodo** for parallel processing, it enables unbiased, dynamic, and scalable benchmarking of multiple models, fostering transparency and innovation in the development of AI systems. 4 | 5 | You can use it with a large set of heterogeneous prompts to get overall rankings, or with smaller targeted sets to evaluate models for your specific use case. 
6 | 7 | 🚀 **Performance**: Powered by Bodo for parallel DataFrame operations and JIT compilation 8 | 📊 **Scalable**: Efficiently handles large datasets with optimized memory usage 9 | 🔗 **Compatible**: Direct integration with Simon Willison's `llm` library 10 | 11 | ## Interactive Dashboard 12 | 13 | ![Dashboard Preview](results/visualizations/endorsement_graph.png) 14 | 15 | **[➡️ View Interactive Dashboard](https://htmlpreview.github.io/?https://github.com/strangeloopcanon/llmrank/blob/main/docs/index.html)** 16 | 17 | ### Example Ranking (OpenRouter run): 18 | ``` 19 | === PageRank Rankings === 20 | model pagerank_score 21 | 0 openrouter/openai/gpt-5 0.168470 22 | 1 openrouter/qwen/qwen3-max 0.155266 23 | 2 openrouter/google/gemini-2.5-pro 0.145787 24 | 3 openrouter/anthropic/claude-opus-4.1 0.135553 25 | 4 openrouter/x-ai/grok-4 0.135202 26 | 5 openrouter/anthropic/claude-sonnet-4 0.133854 27 | 6 openrouter/nousresearch/hermes-4-405b 0.125868 28 | ``` 29 | 30 | Models in this run: gpt-5, claude opus 4.1, claude sonnet 4, grok 4, qwen 3 max, gemini 2.5 pro, nousresearch/hermes-4-405b. Results were computed using peer cross‑evaluation and PageRank over 37 prompts. 31 | 32 | It supports pretty much all models, anything that can be run with the 'llm' library. 33 | 34 | ## Features 35 | 36 | ### 🚀 **High-Performance Processing** 37 | - **Bodo Integration**: Parallel DataFrame operations with JIT compilation for maximum performance 38 | - **Memory Efficient**: Optimized memory usage for large-scale evaluations 39 | - **Scalable**: Handles thousands of prompts and dozens of models efficiently 40 | 41 | ### 🤖 **Advanced Evaluation** 42 | - **Peer-Based Evaluation**: Models evaluate each other's responses, mimicking a collaborative and competitive environment 43 | - **Customizable Scoring**: Numeric ratings (1–10) for granular evaluation or upvote/downvote for binary scoring 44 | - **Subset Evaluation**: Reduce API costs by limiting the models each evaluator reviews 45 | - **Graph-Based Ranking**: Endorsements are represented in a graph, and PageRank is used to compute relative rankings 46 | 47 | ### 📊 **Rich Analytics** 48 | - **Statistical Confidence**: Calculate confidence intervals and significance tests for model rankings 49 | - **Category-Based Analysis**: Evaluate model performance across different prompt categories (reasoning, coding, etc.) 50 | - **Graph Visualization**: Interactive and static graph visualizations of model endorsements 51 | - **Interactive Dashboard**: Explore results through a web-based dashboard with interactive visualizations 52 | 53 | ### 🔗 **Flexible Integration** 54 | - **LLM Library**: Direct integration with Simon Willison's `llm` library for broad model support 55 | - **Provider Agnostic**: Works with OpenAI, Anthropic, OpenRouter, and local models 56 | - **Easy Configuration**: Simple CSV-based prompt input and JSON output 57 | 58 | ## How It Works 59 | 1. **Prompt Collection**: Define a set of questions or tasks to test the models. 60 | 2. **Model Responses**: Each model generates a response to the prompts. 61 | 3. **Cross-Evaluation**: 62 | - Each model evaluates the quality of other models' responses. 63 | - Evaluations are collected via predefined scoring methods. 64 | 4. **Graph Construction**: Build a directed graph where nodes are models, and edges represent endorsements. 65 | 5. **Ranking**: Apply the PageRank algorithm to rank models based on their relative endorsements. 
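Steps 4 and 5 above map directly onto standard graph tooling. The sketch below is illustrative rather than SlopRank's internal code: the model names and endorsement totals are invented, and in a real run the edge weights come from the collected peer evaluations.

```python
import networkx as nx

# Hypothetical endorsement totals: (evaluator, evaluated, summed score given).
# In a real run these numbers come from the cross-evaluation step.
endorsements = [
    ("model-a", "model-b", 42.0),
    ("model-a", "model-c", 17.0),
    ("model-b", "model-a", 35.0),
    ("model-b", "model-c", 28.0),
    ("model-c", "model-a", 50.0),
    ("model-c", "model-b", 12.0),
]

# Step 4: a directed graph where an edge u -> v means "u endorsed v's responses".
G = nx.DiGraph()
for evaluator, evaluated, score in endorsements:
    G.add_edge(evaluator, evaluated, weight=score)

# Step 5: PageRank over the weighted endorsement graph yields the relative ranking.
scores = nx.pagerank(G, weight="weight")
for model, score in sorted(scores.items(), key=lambda kv: kv[1], reverse=True):
    print(f"{model}: {score:.4f}")
```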
66 | 67 | ## Installation 68 | 69 | ### Prerequisites 70 | - **Python 3.9+** (required for Bodo compatibility) 71 | - **[Bodo](https://bodo.ai/)** for high-performance parallel processing (included by default) 72 | - **[SimonW's `llm` library](https://github.com/simonw/llm)** for model access 73 | - `networkx` for graph computations 74 | - `dotenv` for environment variable management 75 | 76 | ### Optional Compatibility Mode 77 | - **`pandas`** for compatibility mode (if you specifically need regular pandas) 78 | 79 | ### Setup 80 | 81 | **Standard Installation** (includes Bodo for 3-5x performance): 82 | ```bash 83 | pip install sloprank 84 | ``` 85 | 86 | **Compatibility Installation** (regular pandas only): 87 | ```bash 88 | pip install sloprank[pandas] 89 | ``` 90 | 91 | **From Source**: 92 | ```bash 93 | git clone https://github.com/strangeloopcanon/llmrank.git 94 | cd sloprank 95 | pip install . # Standard installation (includes Bodo) 96 | pip install .[pandas] # Compatibility mode (regular pandas) 97 | ``` 98 | 99 | ### API Keys Setup 100 | 101 | SlopRank uses the `llm` library for model access. Set up API keys using Simon Willison's llm tool: 102 | 103 | ```bash 104 | # Install llm library (included as dependency) 105 | pip install llm 106 | 107 | # Set up API keys for various providers 108 | llm keys set anthropic 109 | llm keys set openai 110 | llm keys set openrouter # For OpenRouter models 111 | ``` 112 | 113 | Or create a `.env` file with: 114 | ``` 115 | OPENAI_API_KEY=your_openai_key 116 | ANTHROPIC_API_KEY=your_anthropic_key 117 | OPENROUTER_API_KEY=your_openrouter_key 118 | ``` 119 | 120 | **Supported Models**: Any model supported by the `llm` library, including: 121 | - OpenAI (GPT-4, GPT-3.5, etc.) 122 | - Anthropic (Claude models) 123 | - OpenRouter (access to many models) 124 | - Local models via llm plugins 125 | 126 | ### Backend Configuration 127 | 128 | SlopRank automatically detects and uses the best available pandas backend: 129 | 130 | **Check Current Backend**: 131 | ```bash 132 | sloprank backend 133 | ``` 134 | 135 | **Force Specific Backend**: 136 | ```bash 137 | # Force Bodo for maximum performance 138 | export SLOPRANK_USE_BODO=true 139 | sloprank run --prompts prompts.csv 140 | 141 | # Force regular pandas for compatibility 142 | export SLOPRANK_USE_BODO=false 143 | sloprank run --prompts prompts.csv 144 | 145 | # Alternative syntax 146 | SLOPRANK_PANDAS_BACKEND=bodo sloprank run --prompts prompts.csv 147 | SLOPRANK_PANDAS_BACKEND=pandas sloprank run --prompts prompts.csv 148 | ``` 149 | 150 | **Auto-Detection Behavior**: 151 | - **Default**: Uses Bodo automatically (included in standard installation, 3-5x performance boost) 152 | - **Fallback**: Uses regular pandas if Bodo unavailable (compatibility mode) 153 | - **Override**: Manual environment variables always take precedence 154 | 155 | ## Usage 156 | 157 | After installing, you can run the entire SlopRank workflow via the `sloprank` command. By default, SlopRank uses the models defined in DEFAULT_CONFIG. You can override this by passing --models with a comma-separated list. 158 | 159 | ### Basic Usage 160 | 161 | ```bash 162 | sloprank --prompts prompts.csv --output-dir results 163 | ``` 164 | - `--prompts prompts.csv` tells SlopRank where to find your list of prompts. 165 | - `--output-dir results` puts all CSV and JSON outputs in the results/ folder. 
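The prompts file is an ordinary CSV: a header row followed by one prompt per line (the repository's bundled `prompts.csv` also carries grading-guidance, category, and level columns that feed the category analysis). A quick way to sanity-check a prompts file before launching a run; this sketch uses stock pandas and assumes only the CSV layout just described:

```python
import pandas as pd

# Peek at the prompts file before launching a run.
prompts = pd.read_csv("prompts.csv")
print(f"{len(prompts)} prompts; columns: {list(prompts.columns)}")
print("First prompt:", prompts.iloc[0, 0])
```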
166 | 167 | If you want to override the default models: 168 | 169 | ```bash 170 | sloprank --prompts prompts.csv --output-dir results --models "chatgpt-4o,o1,claude-3-7-sonnet-latest, deepseek-reasoner, gemini-2.0-pro-exp-02-05" --visualize --confidence 171 | ``` 172 | 173 | ### Configuration 174 | - **Models**: Update the `MODEL_NAMES` list to include the models you want to evaluate. 175 | - **Prompts**: Define your prompts in the `raw_prompts` list. 176 | - **Evaluation Method**: Choose between numeric ratings (`EVALUATION_METHOD = 1`) or upvotes/downvotes (`EVALUATION_METHOD = 2`). 177 | - **Subset Evaluation**: Toggle `USE_SUBSET_EVALUATION` to reduce evaluation costs. 178 | 179 | ### Advanced Features 180 | 181 | #### Visualization, Confidence Intervals, and Categories 182 | 183 | Run SlopRank with all advanced features: 184 | 185 | ```bash 186 | sloprank run --prompts prompts.csv --output-dir results --visualize --confidence --categories 187 | ``` 188 | 189 | #### Interactive Dashboard 190 | 191 | Add the `--dashboard` flag to launch an interactive web dashboard: 192 | 193 | ```bash 194 | sloprank run --prompts prompts.csv --output-dir results --dashboard 195 | ``` 196 | 197 | Launch the dashboard for existing results: 198 | 199 | ```bash 200 | sloprank dashboard --output-dir results 201 | ``` 202 | 203 | #### Using Individual Tools 204 | 205 | The `examples/` directory contains standalone scripts for each advanced feature: 206 | 207 | 1. Graph Visualization: 208 | ```bash 209 | python examples/generate_visualization.py 210 | ``` 211 | 212 | 2. Confidence Intervals: 213 | ```bash 214 | python examples/compute_confidence.py 215 | ``` 216 | 217 | 3. Prompt Categorization: 218 | ```bash 219 | python examples/prompt_categorization.py 220 | ``` 221 | 222 | 4. Dashboard Generation: 223 | ```bash 224 | python examples/generate_dashboard.py 225 | python examples/dashboard.py 226 | ``` 227 | 228 | ## Outputs 229 | - **Ranked Models**: A list of models ordered by their PageRank scores. 230 | - **Graph Representation**: A directed graph showing the flow of endorsements. 231 | - **Processing Times**: Benchmark of evaluation times for each model. 232 | - **Interactive Visualizations**: HTML-based interactive graphs with node and edge details. 233 | - **Static Visualizations**: PNG images of the endorsement graph. 234 | - **Confidence Intervals**: Statistical confidence bounds for model rankings. 235 | - **Significance Tests**: Statistical significance indicators between adjacent ranks. 236 | - **Category Rankings**: Model performance across different prompt categories. 237 | 238 | #### Dashboard Details 239 | 240 | The dashboard provides: 241 | - Overall model rankings with confidence intervals 242 | - Category-specific performance analysis 243 | - Interactive graph visualizations 244 | - Model comparison tools 245 | 246 | #### Download Options 247 | 248 | - **[⬇️ Download Dashboard HTML](https://raw.githubusercontent.com/strangeloopcanon/llmrank/main/docs/index.html)** - Save and open locally in any browser 249 | 250 | ## Applications 251 | - **Benchmarking**: Evaluate and rank new or existing LLMs. 252 | - **Specialization Analysis**: Test domain-specific capabilities (e.g., legal, medical). 253 | - **Model Optimization**: Identify strengths and weaknesses for targeted fine-tuning. 254 | - **Public Leaderboards**: Maintain transparency and foster healthy competition among models. 
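Because the endorsement graph is saved to disk as GML alongside the other outputs, a finished run can be re-analyzed without calling any models again, which is handy for the benchmarking and specialization analyses listed above. A minimal sketch with `networkx`; the path assumes the default `results` output directory, so adjust it to wherever your run wrote the file:

```python
import networkx as nx

# Load the directed endorsement graph written by a previous SlopRank run.
G = nx.read_gml("results/visualizations/endorsement_graph.gml")

# Recompute the ranking from the stored endorsement weights.
scores = nx.pagerank(G, weight="weight")
for model, score in sorted(scores.items(), key=lambda kv: kv[1], reverse=True):
    print(f"{model:45s} {score:.4f}")
```

The saved edges also carry per-category weights (for example `weight_reasoning` and `weight_creativity`), so the same approach can be pointed at a single category if you want to slice the ranking further.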
255 | 256 | ## Development 257 | 258 | ### Release Process 259 | 260 | To build and release a new version of SlopRank to PyPI: 261 | 262 | 1. Update the version number in `pyproject.toml` following semantic versioning 263 | 2. Update the Changelog section below with all changes 264 | 3. Clean previous builds: `rm -rf build/ dist/ *.egg-info/` 265 | 4. Build the package: `python -m build` 266 | 5. Validate the package: `twine check dist/*` 267 | 6. Upload to PyPI: `twine upload dist/*` 268 | 7. Create a GitHub release with the changelog info 269 | 270 | ### Troubleshooting Releases 271 | 272 | - If you get permission errors during upload, check your PyPI credentials 273 | - If the build fails, ensure all dependencies are correctly listed in pyproject.toml 274 | - If the package fails validation, fix the issues before attempting to upload again 275 | 276 | ## Version History 277 | 278 | ### Recent Updates (v0.3.15+) 279 | 🚀 **Major Performance Upgrade**: Bodo-First Architecture 280 | - ✅ **Bodo is now the default** - included in standard installation 281 | - ✅ **3-5x performance by default** - no configuration needed 282 | - ✅ **Switchable backend system** - environment variable control 283 | - ✅ Direct Bodo integration for maximum performance 284 | - ✅ Intelligent fallback to pandas when needed 285 | - ✅ Simplified high-performance installation model 286 | 287 | See the [CHANGELOG.md](CHANGELOG.md) file for a detailed version history and release notes. 288 | 289 | ## Ideas for Contributions 290 | 291 | ### Suggested Improvements 292 | 1. Improve visualization options and customization. 293 | 2. Add more statistical analysis methods. 294 | 3. Develop a public leaderboard to showcase rankings. 295 | 4. Enhance the web dashboard with more interactive features. 296 | 5. Add support for multi-language evaluation by introducing localized prompts. 297 | 6. Implement cost estimation and optimization features. 298 | 299 | Contributions are welcome! If you have ideas for improving the framework, feel free to open an issue or submit a pull request. 
300 | 301 | ## Acknowledgments 302 | Special thanks to: 303 | - **[Bodo.ai](https://bodo.ai/)** for the high-performance parallel computing platform 304 | - **[SimonW](https://github.com/simonw)** for the excellent `llm` library and ecosystem 305 | - **The AI community** for driving innovation in model evaluation 306 | ## Flexible High-Performance Processing 307 | 308 | SlopRank features a **switchable pandas backend** system that automatically optimizes for your environment: 309 | 310 | ```python 311 | # Standard installation (includes Bodo for high performance); run in your shell: 312 | #   pip install sloprank 313 | 314 | # Compatibility installation (regular pandas only); run in your shell: 315 | #   pip install sloprank[pandas] 316 | 317 | # SlopRank automatically uses the best backend (Bodo by default); CLI example: 318 | #   sloprank run --prompts prompts.csv --output-dir results --models "gpt-4o,claude-3.5-sonnet-latest" 319 | 320 | # Direct usage with automatic backend selection 321 | from sloprank.pandas_backend import pd # Uses Bodo by default, pandas fallback 322 | from sloprank.collect import collect_responses 323 | 324 | # Efficient processing for large datasets (3-5x faster with Bodo by default) 325 | responses_df = collect_responses(prompt_pairs, config)  # prompt_pairs and config prepared earlier in your pipeline 326 | print(responses_df) 327 | ``` 328 | 329 | This integration provides: 330 | - **Parallel DataFrame Operations**: Automatic parallelization of pandas operations across multiple cores 331 | - **Memory Efficiency**: Optimized memory usage for large datasets with intelligent caching 332 | - **High Performance**: JIT compilation for compute-intensive operations (graph building, PageRank) 333 | - **Direct LLM Integration**: Streamlined model access via Simon Willison's `llm` library 334 | - **Production Ready**: Robust error handling and fallback mechanisms 335 | 336 | ### Performance Benefits 337 | 338 | **Benchmark improvements with Bodo integration:** 339 | - ⚡ **3-5x faster** DataFrame operations on large evaluation datasets 340 | - 💾 **50-70% less memory** usage compared to standard pandas 341 | - 🔄 **Automatic parallelization** of PageRank computations 342 | - 📈 **Linear scalability** with dataset size and number of models 343 | 344 | **Ideal for:** 345 | - Large-scale model comparisons (10+ models, 1000+ prompts) 346 | - Academic research requiring statistical rigor 347 | - Enterprise benchmarking with performance requirements 348 | - Continuous evaluation pipelines 349 | --------------------------------------------------------------------------------