├── .gitattributes
├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── Makefile
├── docs
│   ├── README.md
│   ├── index.html
│   └── visualizations
│       ├── endorsement_graph.gml
│       └── endorsement_graph.png
├── examples
│   ├── README.md
│   ├── compute_confidence.py
│   ├── dashboard.py
│   ├── generate_dashboard.py
│   ├── generate_visualization.py
│   ├── prompt_categorization.py
│   └── prompts_categorized.csv
├── prompts.csv
├── pyproject.toml
├── readme.md
├── requirements.txt
├── results
│   ├── category_analysis.csv
│   ├── category_rankings.json
│   ├── confidence_stats.json
│   ├── dashboard.html
│   ├── endorsement_graph.gml
│   ├── rankings.json
│   ├── responses.csv
│   └── visualizations
│       ├── endorsement_graph.gml
│       └── endorsement_graph.png
├── scripts
│   ├── bump_version.py
│   └── create_github_release.py
├── sloprank
│   ├── __init__.py
│   ├── __main__.py
│   ├── cli.py
│   ├── collect.py
│   ├── config.py
│   ├── parse.py
│   ├── rank.py
│   └── utils
│       ├── __init__.py
│       ├── categorization.py
│       ├── commands.py
│       ├── confidence.py
│       ├── dashboard.py
│       └── visualization.py
└── tests
    ├── README.md
    ├── test_sloprank.py
    └── tiny_prompts.csv
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | *.pyc
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 | cover/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 | db.sqlite3-journal
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | .pybuilder/
77 | target/
78 |
79 | # Jupyter Notebook
80 | .ipynb_checkpoints
81 |
82 | # IPython
83 | profile_default/
84 | ipython_config.py
85 |
86 | # pyenv
87 | # For a library or package, you might want to ignore these files since the code is
88 | # intended to run in multiple environments; otherwise, check them in:
89 | # .python-version
90 |
91 | # pipenv
92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
95 | # install all needed dependencies.
96 | #Pipfile.lock
97 |
98 | # poetry
99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
100 | # This is especially recommended for binary packages to ensure reproducibility, and is more
101 | # commonly ignored for libraries.
102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
103 | #poetry.lock
104 |
105 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
106 | __pypackages__/
107 |
108 | # Celery stuff
109 | celerybeat-schedule
110 | celerybeat.pid
111 |
112 | # SageMath parsed files
113 | *.sage.py
114 |
115 | # Environments
116 | .env
117 | .venv
118 | env/
119 | venv/
120 | ENV/
121 | env.bak/
122 | venv.bak/
123 |
124 | # Spyder project settings
125 | .spyderproject
126 | .spyproject
127 |
128 | # Rope project settings
129 | .ropeproject
130 |
131 | # mkdocs documentation
132 | /site
133 |
134 | # mypy
135 | .mypy_cache/
136 | .dmypy.json
137 | dmypy.json
138 |
139 | # Pyre type checker
140 | .pyre/
141 |
142 | # pytype static type analyzer
143 | .pytype/
144 |
145 | # Cython debug symbols
146 | cython_debug/
147 |
148 | # PyCharm
149 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
150 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
151 | # and can be added to the global gitignore or merged into this file. For a more nuclear
152 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
153 | #.idea/
154 | .DS_Store
155 | \# older_scripts
156 | \#Archive/*
157 |
158 | # Ignore private PyPI config
159 | .pypirc
160 |
161 | # Ignore Claude's reference file
162 | CLAUDE.md
163 |
164 | # Ignore test output files
165 | tests/test_results/
166 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | All notable changes to SlopRank will be documented in this file.
4 |
5 | ## [0.2.3] - 2025-02-28
6 |
7 | ### Added
8 | - Tests directory with simple test scripts and example prompts
9 | - Test README with documentation on how to run tests
10 |
11 | ### Fixed
12 | - Improved error handling for subset evaluation configuration
13 | - Automatic adjustment of evaluators_subset_size when too large for the number of models
14 | - Added support for new model versions (Claude-3.7-Sonnet, ChatGPT-4o, Deepseek-Reasoner)
15 |
16 | ## [0.2.2] - 2025-01-14
17 |
18 | ### Added
19 | - Support for graph visualization of model endorsements
20 | - Confidence interval calculations for rankings
21 | - Category analysis for prompt-specific performance
22 |
23 | ### Changed
24 | - Improved API error handling
25 | - Enhanced CLI interface with additional options
26 |
27 | ## [0.2.1] - 2025-01-03
28 |
29 | ### Added
30 | - Dashboard features for interactive exploration
31 | - Visualization improvements
32 |
33 | ### Fixed
34 | - Bug fixes in PageRank calculation
35 | - Better error handling for API timeouts
36 |
37 | ## [0.2.0] - 2024-12-20
38 |
39 | ### Added
40 | - Complete rewrite with modular architecture
41 | - Support for multiple evaluation methods
42 | - Export options for results
43 |
44 | ## [0.1.0] - 2024-12-01
45 |
46 | ### Added
47 | - Initial release
48 | - Basic implementation of peer-based LLM evaluation
49 | - PageRank algorithm for ranking models
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | PYPROJECT = pyproject.toml
2 | VERSION = $(shell grep '^version' $(PYPROJECT) | sed -E 's/.*"([0-9]+\.[0-9]+\.[0-9]+)"/\1/')
3 |
4 | .PHONY: clean build check upload bump-patch bump-minor bump-major git-release publish
5 |
6 | clean:
7 | rm -rf build dist *.egg-info
8 |
9 | build: clean
10 | python -m build
11 |
12 | check:
13 | twine check dist/*
14 |
15 | upload:
16 | twine upload dist/*
17 |
18 | bump-patch:
19 | @python scripts/bump_version.py patch
20 |
21 | bump-minor:
22 | @python scripts/bump_version.py minor
23 |
24 | bump-major:
25 | @python scripts/bump_version.py major
26 |
27 | git-release:
28 | git add -A
29 | git commit -m "Release v$(VERSION)" || echo "Nothing to commit"
30 | @if git rev-parse "v$(VERSION)" >/dev/null 2>&1; then \
31 | echo "⚠️ Tag v$(VERSION) already exists. Skipping tag creation."; \
32 | else \
33 | git tag v$(VERSION); \
34 | fi
35 | git push
36 | git push --tags
37 | @python scripts/create_github_release.py v$(VERSION)
38 |
39 | BUMP ?= patch
40 |
41 | publish:
42 | @make bump-$(BUMP)
43 | @make build
44 | @make check
45 | @make upload
46 | @make git-release
47 |
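48 | # Example usage (an illustrative invocation; BUMP defaults to "patch"):
49 | #   make publish BUMP=minor   # bump minor version, build, check, upload, then tag and create the GitHub release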
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 | # SlopRank Dashboard
2 |
3 | This directory contains the interactive dashboard for the SlopRank LLM evaluation framework.
4 |
5 | ## Files
6 |
7 | - `index.html` - The main dashboard file
8 | - `visualizations/` - Directory containing graph visualizations and images
9 |
10 | ## How to Use
11 |
12 | 1. Open `index.html` in any modern web browser
13 | 2. Explore the model rankings, category performance, and graph visualizations
14 |
15 | ## Hosting on GitHub Pages
16 |
17 | This directory is configured to be used with GitHub Pages. When GitHub Pages is enabled for this repo with the 'docs' folder as the source, the dashboard will be available at:
18 |
19 | https://yourusername.github.io/llmrank/
20 |
21 | ## Updating the Dashboard
22 |
23 | To update this dashboard with new evaluation results:
24 |
25 | 1. Run the SlopRank tool with the `--dashboard` option
26 | 2. Copy the resulting dashboard.html to this directory as index.html
27 | 3. Update the image paths if necessary
28 | 4. Commit and push the changes
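29 |
30 | For example, assuming the default `results/` output directory (a minimal sketch; exact paths may differ in your setup):
31 |
32 | ```bash
33 | sloprank run --prompts prompts.csv --output-dir results --dashboard
34 | cp results/dashboard.html docs/index.html
35 | cp results/visualizations/endorsement_graph.png docs/visualizations/endorsement_graph.png
36 | ```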
--------------------------------------------------------------------------------
/docs/visualizations/endorsement_graph.gml:
--------------------------------------------------------------------------------
1 | graph [
2 | directed 1
3 | node [
4 | id 0
5 | label "o1-preview"
6 | pagerank 0.17940361409787733
7 | ]
8 | node [
9 | id 1
10 | label "gpt-4o"
11 | pagerank 0.17830451744580658
12 | ]
13 | node [
14 | id 2
15 | label "deepseek-chat"
16 | pagerank 0.1671054138317305
17 | ]
18 | node [
19 | id 3
20 | label "gemini-2.0-flash-thinking-exp-1219"
21 | pagerank 0.16473186403675355
22 | ]
23 | node [
24 | id 4
25 | label "claude-3-5-sonnet-latest"
26 | pagerank 0.15557086205954448
27 | ]
28 | node [
29 | id 5
30 | label "gemini-exp-1206"
31 | pagerank 0.15488372852828722
32 | ]
33 | edge [
34 | source 0
35 | target 3
36 | weight 131.0
37 | normalized_weight 0.5282258064516129
38 | ]
39 | edge [
40 | source 0
41 | target 2
42 | weight 129.0
43 | normalized_weight 0.5201612903225806
44 | ]
45 | edge [
46 | source 0
47 | target 5
48 | weight 144.0
49 | normalized_weight 0.5806451612903226
50 | ]
51 | edge [
52 | source 0
53 | target 1
54 | weight 157.0
55 | normalized_weight 0.6330645161290323
56 | ]
57 | edge [
58 | source 0
59 | target 4
60 | weight 139.0
61 | normalized_weight 0.5604838709677419
62 | ]
63 | edge [
64 | source 1
65 | target 3
66 | weight 155.0
67 | normalized_weight 0.625
68 | ]
69 | edge [
70 | source 1
71 | target 2
72 | weight 146.0
73 | normalized_weight 0.5887096774193549
74 | ]
75 | edge [
76 | source 1
77 | target 4
78 | weight 146.0
79 | normalized_weight 0.5887096774193549
80 | ]
81 | edge [
82 | source 1
83 | target 0
84 | weight 129.0
85 | normalized_weight 0.5201612903225806
86 | ]
87 | edge [
88 | source 1
89 | target 5
90 | weight 141.0
91 | normalized_weight 0.5685483870967742
92 | ]
93 | edge [
94 | source 2
95 | target 1
96 | weight 212.0
97 | normalized_weight 0.8548387096774194
98 | ]
99 | edge [
100 | source 2
101 | target 3
102 | weight 135.5
103 | normalized_weight 0.5463709677419355
104 | ]
105 | edge [
106 | source 2
107 | target 0
108 | weight 203.0
109 | normalized_weight 0.8185483870967742
110 | ]
111 | edge [
112 | source 2
113 | target 5
114 | weight 142.0
115 | normalized_weight 0.5725806451612904
116 | ]
117 | edge [
118 | source 2
119 | target 4
120 | weight 143.0
121 | normalized_weight 0.5766129032258065
122 | ]
123 | edge [
124 | source 3
125 | target 0
126 | weight 138.0
127 | normalized_weight 0.5564516129032258
128 | ]
129 | edge [
130 | source 3
131 | target 2
132 | weight 173.0
133 | normalized_weight 0.6975806451612904
134 | ]
135 | edge [
136 | source 3
137 | target 4
138 | weight 113.0
139 | normalized_weight 0.45564516129032256
140 | ]
141 | edge [
142 | source 3
143 | target 5
144 | weight 89.0
145 | normalized_weight 0.3588709677419355
146 | ]
147 | edge [
148 | source 3
149 | target 1
150 | weight 130.0
151 | normalized_weight 0.5241935483870968
152 | ]
153 | edge [
154 | source 4
155 | target 0
156 | weight 248.0
157 | normalized_weight 1.0
158 | ]
159 | edge [
160 | source 4
161 | target 3
162 | weight 162.0
163 | normalized_weight 0.6532258064516129
164 | ]
165 | edge [
166 | source 4
167 | target 5
168 | weight 160.0
169 | normalized_weight 0.6451612903225806
170 | ]
171 | edge [
172 | source 4
173 | target 1
174 | weight 166.0
175 | normalized_weight 0.6693548387096774
176 | ]
177 | edge [
178 | source 4
179 | target 2
180 | weight 104.0
181 | normalized_weight 0.41935483870967744
182 | ]
183 | edge [
184 | source 5
185 | target 4
186 | weight 129.0
187 | normalized_weight 0.5201612903225806
188 | ]
189 | edge [
190 | source 5
191 | target 3
192 | weight 188.0
193 | normalized_weight 0.7580645161290323
194 | ]
195 | edge [
196 | source 5
197 | target 2
198 | weight 183.0
199 | normalized_weight 0.7379032258064516
200 | ]
201 | edge [
202 | source 5
203 | target 1
204 | weight 180.0
205 | normalized_weight 0.7258064516129032
206 | ]
207 | edge [
208 | source 5
209 | target 0
210 | weight 148.0
211 | normalized_weight 0.5967741935483871
212 | ]
213 | ]
214 |
--------------------------------------------------------------------------------
/docs/visualizations/endorsement_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strangeloopcanon/LLMRank/7527836faee5af1209059466d89690bedf504014/docs/visualizations/endorsement_graph.png
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
1 | # SlopRank Example Scripts
2 |
3 | This directory contains standalone scripts that demonstrate each of the advanced features of SlopRank. These scripts can be run individually after running the main SlopRank tool.
4 |
5 | ## Available Scripts
6 |
7 | ### 1. Graph Visualization (`generate_visualization.py`)
8 |
9 | Creates visual representations of the model endorsement network:
10 |
11 | ```bash
12 | python examples/generate_visualization.py
13 | ```
14 |
15 | **Outputs:**
16 | - Static PNG visualization: `results/visualizations/endorsement_graph.png`
17 | - GraphML file: `results/visualizations/endorsement_graph.gml`
18 |
19 | ### 2. Confidence Intervals (`compute_confidence.py`)
20 |
21 | Uses bootstrap resampling to estimate statistical reliability:
22 |
23 | ```bash
24 | python examples/compute_confidence.py
25 | ```
26 |
27 | **Outputs:**
28 | - `results/confidence_stats.json` containing:
29 | - Confidence intervals for each model's PageRank score
30 | - Statistical significance tests between adjacent ranks
31 |
32 | ### 3. Prompt Categorization (`prompt_categorization.py`)
33 |
34 | Automatically categorizes prompts and provides per-category rankings:
35 |
36 | ```bash
37 | python examples/prompt_categorization.py
38 | ```
39 |
40 | **Outputs:**
41 | - Categorized version of your prompts file
42 | - Per-category rankings in `results/category_rankings.json`
43 | - CSV analysis in `results/category_analysis.csv`
44 |
45 | ### 4. Interactive Dashboard
46 |
47 | #### Dashboard Generation (`generate_dashboard.py`)
48 | Creates an HTML dashboard from all the results:
49 |
50 | ```bash
51 | python examples/generate_dashboard.py
52 | ```
53 |
54 | #### Dashboard Server (`dashboard.py`)
55 | Starts a local server to view the dashboard:
56 |
57 | ```bash
58 | python examples/dashboard.py
59 | ```
60 |
61 | ## Recommended Workflow
62 |
63 | For the best experience, run the tools in this order:
64 |
65 | 1. Run SlopRank: `sloprank --prompts prompts.csv --output-dir results`
66 | 2. Generate visualizations: `python examples/generate_visualization.py`
67 | 3. Compute confidence intervals: `python examples/compute_confidence.py`
68 | 4. Analyze categories: `python examples/prompt_categorization.py`
69 | 5. Generate dashboard: `python examples/generate_dashboard.py`
70 | 6. View the dashboard: `python examples/dashboard.py`
71 |
72 | ## Integrated CLI Alternative
73 |
74 | All these features are now integrated into the main `sloprank` CLI tool:
75 |
76 | ```bash
77 | sloprank run --prompts prompts.csv --output-dir results --visualize --confidence --categories --dashboard
78 | ```
79 |
80 | These standalone example scripts are provided for educational purposes and for users who want to use each feature independently.
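81 |
82 | ## Reading the Results Programmatically
83 |
84 | The JSON outputs can also be consumed directly from Python. Below is a minimal sketch, assuming the default `results/` output directory and the file formats written by the scripts above:
85 |
86 | ```python
87 | import json
88 | from pathlib import Path
89 |
90 | results_dir = Path("results")
91 |
92 | # rankings.json stores a list of [model, score] pairs under the "rankings" key.
93 | rankings = json.loads((results_dir / "rankings.json").read_text())["rankings"]
94 | for rank, (model, score) in enumerate(rankings, start=1):
95 |     print(f"{rank}. {model}: {score:.6f}")
96 |
97 | # confidence_stats.json (written by compute_confidence.py) holds per-model
98 | # bootstrap confidence intervals under "confidence_intervals".
99 | confidence_path = results_dir / "confidence_stats.json"
100 | if confidence_path.exists():
101 |     stats = json.loads(confidence_path.read_text())["confidence_intervals"]
102 |     for model, ci in stats.items():
103 |         print(f"{model}: [{ci['lower_bound']:.6f}, {ci['upper_bound']:.6f}]")
104 | ```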
--------------------------------------------------------------------------------
/examples/compute_confidence.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import json
3 | import random
4 | import pandas as pd
5 | import numpy as np
6 | import networkx as nx
7 | from pathlib import Path
8 |
9 | def compute_confidence_intervals(iterations=1000):
10 | """
11 | Compute confidence intervals for model rankings using bootstrap resampling.
12 | """
13 | print("Computing confidence intervals...")
14 |
15 | # Load evaluations data
16 | evals_path = Path("results/evaluations.csv")
17 | evals_df = pd.read_csv(evals_path)
18 |
19 | # Filter out failed evaluations
20 | evals_df = evals_df[evals_df["parse_failed"] == False]
21 |
22 | # Get unique models
23 | models = list(set(evals_df["judge_model"].unique()) | set(evals_df["rated_model"].unique()))
24 |
25 | # Store bootstrap results
26 | bootstrap_results = {model: [] for model in models}
27 |
28 | # Run bootstrap iterations
29 | for i in range(iterations):
30 | if i % 100 == 0:
31 | print(f"Bootstrap iteration {i}/{iterations}...")
32 |
33 | # Resample evaluations with replacement
34 | sampled_evals = evals_df.sample(frac=1.0, replace=True)
35 |
36 | # Build graph from resampled data
37 | G = nx.DiGraph()
38 | G.add_nodes_from(models)
39 |
40 | for _, row in sampled_evals.iterrows():
41 | judge = row["judge_model"]
42 | rated = row["rated_model"]
43 | score = float(row["score"])
44 |
45 | if G.has_edge(judge, rated):
46 | G[judge][rated]["weight"] += score
47 | else:
48 | G.add_edge(judge, rated, weight=score)
49 |
50 | # Compute PageRank
51 | if len(G.edges) > 0:
52 | scores = nx.pagerank(G, weight="weight")
53 |
54 | # Store scores
55 | for model, score in scores.items():
56 | bootstrap_results[model].append(score)
57 |
58 | # Calculate confidence intervals (95%)
59 | confidence_stats = {}
60 |
61 | for model in models:
62 | if not bootstrap_results[model]:
63 | confidence_stats[model] = {
64 | "mean": 0.0,
65 | "lower_bound": 0.0,
66 | "upper_bound": 0.0,
67 | "std_dev": 0.0
68 | }
69 | continue
70 |
71 | sorted_scores = sorted(bootstrap_results[model])
72 | lower_idx = int(0.025 * len(sorted_scores))
73 | upper_idx = int(0.975 * len(sorted_scores))
74 |
75 | confidence_stats[model] = {
76 | "mean": np.mean(sorted_scores),
77 | "lower_bound": sorted_scores[lower_idx],
78 | "upper_bound": sorted_scores[upper_idx],
79 | "std_dev": np.std(sorted_scores)
80 | }
81 |
82 | # Test statistical significance
83 | significance_results = {}
84 |
85 | # Create sorted list of models by mean score
86 | models_by_score = sorted(
87 | [(model, stats["mean"]) for model, stats in confidence_stats.items()],
88 | key=lambda x: x[1],
89 | reverse=True
90 | )
91 |
92 | # Compare each adjacent pair in the ranking
93 | for i in range(len(models_by_score) - 1):
94 | model1, _ = models_by_score[i]
95 | model2, _ = models_by_score[i + 1]
96 |
97 | # Determine if significant based on confidence intervals
98 | is_significant = (
99 | confidence_stats[model1]["lower_bound"] > confidence_stats[model2]["upper_bound"] or
100 | confidence_stats[model2]["lower_bound"] > confidence_stats[model1]["upper_bound"]
101 | )
102 |
103 | significance_results[f"{model1}_vs_{model2}"] = is_significant
104 |
105 | # Save results
106 | results = {
107 | "confidence_intervals": confidence_stats,
108 | "significance": significance_results
109 | }
110 |
111 | outfile = Path("results/confidence_stats.json")
112 | with open(outfile, "w") as f:
113 | json.dump(results, f, indent=2)
114 |
115 | # Print summary
116 | print("\n=== Confidence Intervals (95%) ===")
117 | for model, stats in sorted(confidence_stats.items(), key=lambda x: x[1]["mean"], reverse=True):
118 | print(f"{model}: {stats['mean']:.6f} [{stats['lower_bound']:.6f}, {stats['upper_bound']:.6f}]")
119 |
120 | print("\n=== Statistical Significance ===")
121 | for pair, is_significant in significance_results.items():
122 | significance_str = "Significant" if is_significant else "Not significant"
123 | print(f"{pair}: {significance_str}")
124 |
125 | print(f"\nResults saved to {outfile}")
126 |
127 | if __name__ == "__main__":
128 | compute_confidence_intervals(iterations=500) # Lower for faster execution
--------------------------------------------------------------------------------
/examples/dashboard.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import json
3 | import pandas as pd
4 | import webbrowser
5 | from pathlib import Path
6 | from http.server import HTTPServer, SimpleHTTPRequestHandler
7 | import threading
8 | import time
9 |
10 | def generate_html():
11 | # Load rankings data
12 | rankings_path = Path("results/rankings.json")
13 | with open(rankings_path, 'r') as f:
14 | rankings_data = json.load(f)
15 |
16 | # Load confidence data if available
17 | confidence_path = Path("results/confidence_stats.json")
18 | has_confidence = confidence_path.exists()
19 | confidence_data = None
20 | if has_confidence:
21 | with open(confidence_path, 'r') as f:
22 | confidence_data = json.load(f)
23 |
24 | # Load category rankings if available
25 | category_path = Path("results/category_rankings.json")
26 | has_categories = category_path.exists()
27 | category_data = None
28 | if has_categories:
29 | with open(category_path, 'r') as f:
30 | category_data = json.load(f)
31 |
32 | # Generate HTML
33 |     html = """
34 | <!DOCTYPE html>
35 | <html>
36 | <head>
37 |     <meta charset="utf-8">
38 |
39 |     <title>SlopRank Dashboard</title>
40 |     <style>
101 |     </style>
102 | </head>
103 | <body>
104 |     <h1>SlopRank Dashboard</h1>
105 |
106 |     <h2>Model Rankings</h2>
107 |     <table>
108 |         <tr>
109 |             <th>Rank</th>
110 |             <th>Model</th>
111 |             <th>Score</th>
112 |             <th>Visualization</th>
113 | """
114 |
115 |     if has_confidence:
116 |         html += """
117 |             <th>Confidence Interval</th>
118 | """
119 |
120 |     html += """
121 |         </tr>
122 | """
123 |
124 |     # Add rows for each model
125 |     max_score = max([entry[1] for entry in rankings_data["rankings"]])
126 |
127 |     for i, (model, score) in enumerate(rankings_data["rankings"]):
128 |         bar_width = int(300 * score / max_score)
129 |         confidence_html = ""
130 |
131 |         if has_confidence and model in confidence_data["confidence_intervals"]:
132 |             ci = confidence_data["confidence_intervals"][model]
133 |             lower_pct = int(300 * ci["lower_bound"] / max_score)
134 |             upper_pct = int(300 * ci["upper_bound"] / max_score)
135 |             mean_pct = int(300 * ci["mean"] / max_score)
136 |
137 |             confidence_html = f"""
138 |             <td>
139 |                 <div style="position: relative; width: 300px; height: 16px;">
140 |                     <div style="position: absolute; left: {lower_pct}px; width: {max(upper_pct - lower_pct, 2)}px; height: 16px; background-color: #a6c8ff;"></div>
141 |                     <div style="position: absolute; left: {mean_pct}px; width: 2px; height: 16px; background-color: #1f3c88;"></div>
142 |                 </div>
143 |                 {ci["mean"]:.6f} [{ci["lower_bound"]:.6f}, {ci["upper_bound"]:.6f}]
144 |             </td>
145 | """
146 |
147 |         html += f"""
148 |         <tr>
149 |             <td>{i+1}</td>
150 |             <td>{model}</td>
151 |             <td>{score:.6f}</td>
152 |             <td>
153 |                 <div style="width: {bar_width}px; height: 16px; background-color: #4c72b0;"></div>
156 |             </td>
157 |             {confidence_html}
158 |         </tr>
159 | """
160 |
161 |     html += """
162 |     </table>
163 | """
164 |
165 |     # Add statistical significance if available
166 |     if has_confidence and confidence_data.get("significance"):
167 |         html += """
168 |     <h2>Statistical Significance</h2>
169 |     <table>
170 |         <tr>
171 |             <th>Comparison</th>
172 |             <th>Significance</th>
173 |         </tr>
174 | """
175 |
176 |         for pair, is_significant in confidence_data["significance"].items():
177 |             significance_str = "Significant" if is_significant else "Not significant"
178 |             html += f"""
179 |         <tr>
180 |             <td>{pair}</td>
181 |             <td>{significance_str}</td>
182 |         </tr>
183 | """
184 |
185 |         html += """
186 |     </table>
187 | """
188 |
189 |     # Add category rankings if available
190 |     if has_categories and category_data:
191 |         html += """
192 |     <h2>Rankings by Category</h2>
193 | """
194 |
195 |         for category, models in sorted(category_data.items()):
196 |             max_score = max([item["score"] for item in models])
197 |
198 |             html += f"""
199 |     <h3>{category}</h3>
200 |     <table>
201 |         <tr>
202 |             <th>Rank</th>
203 |             <th>Model</th>
204 |             <th>Score</th>
205 |             <th>Visualization</th>
206 |         </tr>
207 | """
208 |
209 |             for i, item in enumerate(models):
210 |                 model = item["model"]
211 |                 score = item["score"]
212 |                 bar_width = int(300 * score / max_score)
213 |
214 |                 html += f"""
215 |         <tr>
216 |             <td>{i+1}</td>
217 |             <td>{model}</td>
218 |             <td>{score:.4f}</td>
219 |             <td>
220 |                 <div style="width: {bar_width}px; height: 16px; background-color: #4c72b0;"></div>
223 |             </td>
224 |         </tr>
225 | """
226 |
227 |             html += """
228 |     </table>
229 | """
230 |
231 |     # Add graph visualization if available
232 |     graph_image_path = Path("results/visualizations/endorsement_graph.png")
233 |     if graph_image_path.exists():
234 |         html += """
235 |     <h2>Endorsement Graph</h2>
236 |
237 |     <img src="visualizations/endorsement_graph.png" alt="Endorsement Graph" style="max-width: 100%;">
238 |
239 | """
240 |
241 |     html += """
242 | </body>
243 | </html>
244 |
245 | """
246 |
247 | # Save HTML to file
248 | dashboard_path = Path("results/dashboard.html")
249 | with open(dashboard_path, 'w') as f:
250 | f.write(html)
251 |
252 | return dashboard_path
253 |
254 | def start_server(port=8000):
255 | # Start HTTP server
256 | server_address = ('', port)
257 | httpd = HTTPServer(server_address, SimpleHTTPRequestHandler)
258 |
259 | # Start server in a separate thread
260 | server_thread = threading.Thread(target=httpd.serve_forever)
261 | server_thread.daemon = True
262 | server_thread.start()
263 |
264 | print(f"Server started at http://localhost:{port}")
265 | return httpd
266 |
267 | if __name__ == "__main__":
268 | dashboard_path = generate_html()
269 | print(f"Dashboard HTML generated at {dashboard_path}")
270 |
271 | port = 8000
272 | httpd = start_server(port)
273 |
274 | # Open browser
275 | url = f"http://localhost:{port}/results/dashboard.html"
276 | print(f"Opening dashboard at {url}")
277 | webbrowser.open(url)
278 |
279 | try:
280 | while True:
281 | time.sleep(1)
282 | except KeyboardInterrupt:
283 | print("Shutting down server...")
284 | httpd.shutdown()
--------------------------------------------------------------------------------
/examples/generate_dashboard.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from dashboard import generate_html
3 |
4 | if __name__ == "__main__":
5 | dashboard_path = generate_html()
6 | print(f"Dashboard HTML generated at {dashboard_path}")
7 | print("You can open this file in a web browser to view the dashboard.")
--------------------------------------------------------------------------------
/examples/generate_visualization.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import json
3 | import pandas as pd
4 | import numpy as np
5 | import networkx as nx
6 | import matplotlib.pyplot as plt
7 | from pathlib import Path
8 |
9 | def generate_visualization():
10 | # Create visualization directory if it doesn't exist
11 | vis_dir = Path("results/visualizations")
12 | vis_dir.mkdir(parents=True, exist_ok=True)
13 |
14 | # Load rankings
15 | rankings_path = Path("results/rankings.json")
16 | with open(rankings_path, 'r') as f:
17 | rankings_data = json.load(f)
18 |
19 | # Load evaluations data
20 | evals_path = Path("results/evaluations.csv")
21 | evals_df = pd.read_csv(evals_path)
22 |
23 | # Filter out failed evaluations
24 | evals_df = evals_df[evals_df["parse_failed"] == False]
25 |
26 | # Build graph
27 | G = nx.DiGraph()
28 |
29 | # Add nodes from rankings
30 | for model_entry in rankings_data["rankings"]:
31 | model = model_entry[0]
32 | score = model_entry[1]
33 | G.add_node(model, pagerank=score)
34 |
35 | # Add edges from evaluations
36 | for _, row in evals_df.iterrows():
37 | judge = row["judge_model"]
38 | rated = row["rated_model"]
39 | score = float(row["score"])
40 |
41 | if G.has_edge(judge, rated):
42 | G[judge][rated]["weight"] += score
43 | else:
44 | G.add_edge(judge, rated, weight=score)
45 |
46 | # Normalize edge weights for visualization
47 | max_weight = max([G[u][v]["weight"] for u, v in G.edges()])
48 | for u, v in G.edges():
49 | G[u][v]["normalized_weight"] = G[u][v]["weight"] / max_weight
50 |
51 | # Create visualizations
52 |
53 | # 1. Static graph visualization
54 | plt.figure(figsize=(12, 10))
55 |
56 | # Calculate position using spring layout
57 | pos = nx.spring_layout(G, seed=42)
58 |
59 | # Get pagerank scores
60 | pagerank_scores = {node: G.nodes[node].get('pagerank', 0.1) for node in G.nodes()}
61 |
62 | # Draw nodes
63 | node_sizes = [pagerank_scores[node] * 5000 for node in G.nodes()]
64 | node_colors = list(pagerank_scores.values())
65 |
66 | nx.draw_networkx_nodes(
67 | G, pos,
68 | node_size=node_sizes,
69 | node_color=node_colors,
70 | cmap=plt.cm.viridis,
71 | alpha=0.8
72 | )
73 |
74 | # Draw edges
75 | edge_widths = [G[u][v].get('normalized_weight', 0.1) * 5 for u, v in G.edges()]
76 |
77 | nx.draw_networkx_edges(
78 | G, pos,
79 | width=edge_widths,
80 | alpha=0.6,
81 | edge_color='gray',
82 | arrows=True,
83 | arrowstyle='-|>',
84 | arrowsize=20
85 | )
86 |
87 | # Draw labels
88 | nx.draw_networkx_labels(
89 | G, pos,
90 | font_size=12,
91 | font_weight='bold'
92 | )
93 |
94 | # Add title
95 | plt.title("LLM Endorsement Graph (Node size = PageRank score, Edge width = Endorsement strength)")
96 | plt.axis('off')
97 |
98 | # Save the figure
99 | plt.tight_layout()
100 | plt.savefig(vis_dir / "endorsement_graph.png", dpi=300, bbox_inches='tight')
101 | plt.close()
102 |
103 | # 2. Save graph in GML format
104 | nx.write_gml(G, vis_dir / "endorsement_graph.gml")
105 |
106 | print(f"Visualizations saved to {vis_dir}")
107 |
108 | if __name__ == "__main__":
109 | generate_visualization()
--------------------------------------------------------------------------------
/examples/prompt_categorization.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import pandas as pd
3 | import json
4 | import re
5 | from pathlib import Path
6 | from collections import defaultdict
7 |
8 | def auto_categorize_prompts(prompts_file="prompts.csv"):
9 | """
10 | Reads prompts from a CSV file and automatically categorizes them.
11 | If a 'Category' column exists, it will use those categories.
12 | Otherwise, it will attempt to infer categories based on content.
13 | """
14 | print(f"Reading prompts from {prompts_file}...")
15 |
16 | # Read prompts from CSV
17 | prompts_df = pd.read_csv(prompts_file)
18 |
19 | # Check if a Category column exists
20 | if 'Category' in prompts_df.columns:
21 | categories = defaultdict(list)
22 |
23 | # Group prompts by category
24 | for _, row in prompts_df.iterrows():
25 | if pd.notna(row['Category']) and row['Category']:
26 | categories[row['Category']].append(row['Questions'])
27 | else:
28 | if 'Uncategorized' not in categories:
29 | categories['Uncategorized'] = []
30 | categories['Uncategorized'].append(row['Questions'])
31 |
32 | print(f"Found {len(categories)} categories in the Excel file.")
33 | else:
34 | # Infer categories based on content
35 | categories = infer_categories(prompts_df['Questions'].tolist())
36 |
37 | # Add inferred categories back to the DataFrame
38 | category_map = {}
39 | for category, prompts in categories.items():
40 | for prompt in prompts:
41 | category_map[prompt] = category
42 |
43 | prompts_df['Category'] = prompts_df['Questions'].map(category_map)
44 |
45 | # Save the categorized DataFrame back to CSV
46 | output_path = Path(prompts_file).with_stem(Path(prompts_file).stem + "_categorized")
47 | prompts_df.to_csv(output_path, index=False)
48 | print(f"Saved categorized prompts to {output_path}")
49 |
50 | # Return categories as a dictionary with lists of prompts
51 | return dict(categories)
52 |
53 | def infer_categories(prompts):
54 | """
55 | Infer categories from prompt content using keyword matching.
56 | """
57 | print("Inferring categories from prompt content...")
58 |
59 | # Define category keywords
60 | keywords = {
61 | 'Reasoning': ['reason', 'logic', 'why', 'how', 'explain', 'analyze', 'evaluate', 'assess', 'examine'],
62 | 'Creativity': ['creative', 'imagine', 'story', 'design', 'invent', 'fiction', 'innovative'],
63 | 'Knowledge': ['fact', 'define', 'what is', 'history', 'science', 'describe', 'information'],
64 | 'Coding': ['code', 'function', 'algorithm', 'program', 'script', 'implementation'],
65 | 'Opinion': ['opinion', 'believe', 'think', 'perspective', 'view', 'stance'],
66 | 'Technical': ['technical', 'engineering', 'system', 'mechanism', 'process'],
67 | 'Economic': ['economic', 'finance', 'market', 'money', 'business', 'trade', 'commerce', 'tax'],
68 | 'Medical': ['medical', 'health', 'disease', 'treatment', 'cure', 'patient', 'doctor', 'hospital'],
69 | 'Political': ['political', 'government', 'policy', 'regulation', 'law', 'legal'],
70 | 'Ethical': ['ethical', 'moral', 'right', 'wrong', 'should', 'ethics', 'values'],
71 | }
72 |
73 | # Categorize prompts
74 | categories = defaultdict(list)
75 |
76 | for prompt in prompts:
77 | prompt_lower = prompt.lower()
78 |
79 | # Try to match prompt to a category
80 | matched = False
81 | for category, terms in keywords.items():
82 | if any(term in prompt_lower for term in terms):
83 | categories[category].append(prompt)
84 | matched = True
85 | break
86 |
87 | # If no match, add to Uncategorized
88 | if not matched:
89 | categories['Uncategorized'].append(prompt)
90 |
91 | # Count prompts per category
92 | for category, prompts in categories.items():
93 | print(f"Category '{category}': {len(prompts)} prompts")
94 |
95 | return categories
96 |
97 | def analyze_categorized_evaluations(categorized_prompts):
98 | """
99 | Analyze evaluations based on prompt categories.
100 | """
101 | # Load evaluations
102 | evals_path = Path("results/evaluations.csv")
103 | if not evals_path.exists():
104 | print(f"Error: Evaluations file not found at {evals_path}")
105 | return
106 |
107 | print(f"Loading evaluations from {evals_path}...")
108 | evals_df = pd.read_csv(evals_path)
109 |
110 | # Filter out failed evaluations
111 | evals_df = evals_df[evals_df["parse_failed"] == False]
112 |
113 | # Create a flat mapping of prompt -> category
114 | prompt_to_category = {}
115 | for category, prompts in categorized_prompts.items():
116 | for prompt in prompts:
117 | prompt_to_category[prompt] = category
118 |
119 | # Add category column to evaluations DataFrame
120 | evals_df['category'] = evals_df['prompt'].map(prompt_to_category)
121 |
122 | # Calculate average scores by category and model
123 | results = []
124 |
125 | # For each category
126 | for category in categorized_prompts.keys():
127 | if category == 'Uncategorized':
128 | continue
129 |
130 | category_evals = evals_df[evals_df['category'] == category]
131 |
132 | if category_evals.empty:
133 | continue
134 |
135 | # For each model being rated
136 | for model in category_evals['rated_model'].unique():
137 | model_scores = category_evals[category_evals['rated_model'] == model]['score']
138 | avg_score = model_scores.mean()
139 | count = len(model_scores)
140 |
141 | results.append({
142 | 'category': category,
143 | 'model': model,
144 | 'average_score': avg_score,
145 | 'evaluations_count': count
146 | })
147 |
148 | # Create DataFrame from results
149 | results_df = pd.DataFrame(results)
150 |
151 | # Save to CSV
152 | output_path = Path("results/category_analysis.csv")
153 | results_df.to_csv(output_path, index=False)
154 |
155 | # Generate summary
156 | print("\n=== Category Analysis ===")
157 | for category in sorted(categorized_prompts.keys()):
158 | if category == 'Uncategorized':
159 | continue
160 |
161 | category_data = results_df[results_df['category'] == category]
162 |
163 | if category_data.empty:
164 | continue
165 |
166 | print(f"\nCategory: {category}")
167 | sorted_models = category_data.sort_values('average_score', ascending=False)
168 |
169 | for _, row in sorted_models.iterrows():
170 | print(f" {row['model']}: {row['average_score']:.4f} (based on {row['evaluations_count']} evaluations)")
171 |
172 | print(f"\nCategory analysis saved to {output_path}")
173 |
174 | # Create JSON with category rankings
175 | category_rankings = {}
176 |
177 | for category in sorted(categorized_prompts.keys()):
178 | if category == 'Uncategorized':
179 | continue
180 |
181 | category_data = results_df[results_df['category'] == category]
182 |
183 | if category_data.empty:
184 | continue
185 |
186 | sorted_models = category_data.sort_values('average_score', ascending=False)
187 | category_rankings[category] = [
188 | {"model": row['model'], "score": row['average_score']}
189 | for _, row in sorted_models.iterrows()
190 | ]
191 |
192 | # Save category rankings to JSON
193 | rankings_path = Path("results/category_rankings.json")
194 | with open(rankings_path, 'w') as f:
195 | json.dump(category_rankings, f, indent=2)
196 |
197 | print(f"Category rankings saved to {rankings_path}")
198 |
199 |
200 | if __name__ == "__main__":
201 | # Process prompts
202 | categorized_prompts = auto_categorize_prompts()
203 |
204 | # Analyze evaluations by category
205 | analyze_categorized_evaluations(categorized_prompts)
--------------------------------------------------------------------------------
/examples/prompts_categorized.csv:
--------------------------------------------------------------------------------
1 | Questions,Answer_key,Topic,Importance,Category
2 | "Analyze and compare the architectural styles of the Hagia Sophia in Istanbul and the Notre-Dame Cathedral in Paris. Discuss the key architectural elements, construction techniques, and cultural influences that define each structure. Argue which building, in your view, is a more significant architectural achievement and defend your assertion.","Beyond their structural differences, the best answers should analyze how the design of each building reflects the dominant religious and political ideologies of their respective eras.",Art,Medium,Reasoning
3 | "What are the characteristics of APOBEC-driven SGMs, particularly their association with YTCA motifs and APOBEC3A expression, especially cancer mutagenesis? ","Best answers would be factual, true and list the three most commonly cited characteristics of APOBEC-driven cancer mutagenesis in scientific literature",Bio,Medium,Uncategorized
4 | Draft a one-page product requirements document (PRD) for integrating a brilliant new AI feature that talks to to an enterprise software company,"A good answer has great structure, and PRD is very well drafted",Business,Medium,Uncategorized
5 | "Build a google sign in page that takes me to a profile page that shows my details. Keep the user logged in (using tokens or cookies), and show different messages based on the user's login status. I want the best implementation.","Has to be good clean code. Evaluate as if you're a senior engineer. There cannot be any broken OAuth flows, redirect URI errors, links to documentation needing wandering in Google Cloud Console for API keys.",Coding,Medium,Reasoning
6 | Can you design a Venn diagram meme that humorously illustrates the unexpected similarities between three different things?,The best answer has to be really really funny.,Creativity,High,Creativity
7 | "Did beethoven write solo piano music that would have been technologically impossible for his predecessors? think about the instrument mozart played, versus the one beethoven was playing by the early 19th century and later in his life. What became possible, sonically speaking, with this new instrument? what are the very earliest beethoven piano works with passagework that would have been *technologically impossible* for mozart or haydn to write? what precise technological developments enabled this new style of play?","The best answers would be a crisp narrative essay that considers all these questions, and any others you deem important to consider.",Creativity,High,Reasoning
8 | Provide the steps to draw a Volaticotheriumin in ASCII.,"The best answer would be cool, looks really great and is cute and shows creativity and design.",Creativity,Medium,Uncategorized
9 | "Write a sestina about Shakespeare's impact on modern economics. Be thorough, go beyond the surface level works and allusions.",The sestina has to be accurate to its actual form. It should also be beautiful in both language and allusions. The economics should be accurate as per modern economic theory.,Creativity,Medium,Economic
10 | "Write a short science fiction story without using the word ""robot"".","The story should not have the word ""robot"".That would be failure marks. It should also be beautiful and erudite.",Creativity,High,Creativity
11 | Write a short story set in a futuristic multiplanetary world where AI governs all aspects of life. It needs to have extremely accurate economics.,"The story should be unique and beautifully written - not baroque. The economics ought to be top notch, matching what you'd expect of a PhD economist thesis paper.",Creativity,Medium,Creativity
12 | Create an evolutionary tree from the precambrian era till hominids,A clear step by step evolutionary tree that's both logical and at the right degree of abstraction.,"Creativity, Detail-oriented",Medium,Uncategorized
13 | """60% of Americans are living paycheck to paycheck"". Discuss the accuracy and importance of this information.","This statistic is wrong, and that needs to be pointed out. Without that it's a fail. For truly top marks it also needs to be contextualised in terms of what the truth is.",Economics,High,Knowledge
14 | "What are the core assumptions and basic mechanisms and results of the Harberger corporate tax model?
15 | ","The economic analysis has to include explicit assumotions, mechanisms, and the corporate and non-corporate sector. It should analyse an equilibrium, analyse tax impact, equations, reallocation of capital, and core policy implications.",Economics,High,Technical
16 | Critically analyze the economic arguments presented in Thomas Piketty's Capital in the Twenty-First Century. Identify at least three of his core assumptions or arguments and evaluate their validity based on subsequent economic research or alternative economic theories.,"Specifically address limitations of Piketty's methodology and conclusions, citing relevant counterarguments or empirical evidence.",Economics,Medium,Reasoning
17 | Did the Paris climate accords have any measurable impact on carbon emissions,"Clear answer, even including caveats and back of the envelope style calculations.",Economics,Medium,Uncategorized
18 | "I really, desperately want to see a whole system diagram of the banking sector + Fed
19 |
20 | I want to know the exact *API* between the banks, fed, treasury, etc — what are *all* the actions they can take relative to each other. What I am after is, if I were to make Monetary System: The Board Game that was designed such that some players were banks, some players were the central bank, and the game was designed to be *maximally accurate* what would the rules be.","A very clear, technical, detailed and readable view of the banking sector + Fed. It should be comprehensible and comprehensive.",Economics,High,Creativity
21 | "Take the California imposition of a ten cent fee on every plastic bag a customer uses. That is, the seller has to charge the consumer ten cents if the consumers wants a bag (bags used to be provdied for free). Is this best modeled as a price control? As a tax? Or as both? Answer as would a very smart professional microeconomist.","The answer should be of a professional quality sufficient to impress a Nobel willing economist, provided by his top graduate student.",Economics,High,Economic
22 | Why is demand homotheticity required for the Heckscher Ohlin theorem to hold? ,"The answer should be of a professional quality sufficient to impress a Nobel willing economist, provided by his top graduate student.",Economics,High,Reasoning
23 | Analyze the role of framing and agenda-setting by news media in shaping public opinion regarding climate change policy in the United States between 2010 and 2020. Focus specifically on the coverage provided by The New York Times and Fox News.,"A neutral and clear analysis, taking no sides, with sufficient facts and clear reporting. Should contain anecdotes and insights brought to life through writing.",Essays,High,Reasoning
24 | "What are the specific legal and regulatory risks a FAC would face? Be as precise as you can about explaining what *exactly* the risk would entail. When you do this, consider the effect of other laws as well. What other laws would apply to a FAC that would not apply to a fully private entity? Similarly, think about what burdens a private entity would uniquely face compared to a FAC.","The answer should be of a professional quality sufficient to impress a Congressional fact finding committee, provided by a Supreme Court appointee. It should have strong reasoning and impeccable fact and unyielding logic.",Essays,High,Reasoning
25 | "Evaluate the tone of this Wikipedia article, whether it is neutral, and attempt to infer correctly the author's personal beliefs on the topic: A Tg-rasH2 mouse is an innovative transgenic mouse, developed in Central Institute for Experimental Animals (CIEA), carrying the three copies of human prototype c-Ha-ras oncogenes with endogenous promoter and enhancer in tandem.[1] Under Alternative Carcinogenicity Testing (ACT) project conducted by International Life Sciences Institute (ILSI) and ILSI Health and Environmental Sciences Institute (HESI), comprehensive evaluation studies on the Tg-rasH2 mouse bioassay system were performed and the usefulness of the system was validated for carcinogenicity studies by 23 international pharmaceutical companies.[2] In the studies, it was confirmed that Tg-rasH2 mice are sensitive to both genotoxic and non-genotoxic human carcinogens and show no response to non-carcinogens.[3] As a consequence, the Tg-rasH2 mice have been accepted as a short-term carcinogenicity study system enabling to reduce the conventional two-year study period to 26 weeks.
26 |
27 | See also: Ras subfamily
28 | History
29 | 1989: Tg-rasH2 mice were first developed in CIEA.
30 | 1992: CIEA started development of carcinogenicity bioassay system using Tg-rasH2 mice.
31 | 1996: Policy to replace the 2-year study on mice with the short-term study decided at ICH4.
32 | 1996-2000: Usefulness of rasH2 mice validated by ILSI/HESI international research.
33 | 2001: Production and sales of Tg-rasH2 mice.","Has to clearly analyse the tone and infer the beliefs. Should be accurate, and not do dimestore psychology.",General,High,Reasoning
34 | "Choose a significant turning point in history: the invention of the printing press. Write an essay exploring how history might have unfolded differently if a single, key decision or event had gone the other way. Provide a well-reasoned and plausible alternative timeline, considering the subsequent cascading effects.","A clear alternative timeline which faithfully follows the alternate history. It should be economically, socially and technologically realistic.",History,High,Reasoning
35 | "Trace the historical development of a specific technological innovation, the internet. Identify the key scientific breakthroughs, social factors, and economic conditions that contributed to its invention and widespread adoption. Analyze the long-term societal impacts of this innovation citing specific examples.","A clear analysis of the relevant timeline which faithfully follows the history. It should be economically, socially and technologically realistic. Has to be new, and nuanced in its analysis.",History,High,Reasoning
36 | """Whatever other merits it has, The Clerk's Tale does not achieve its effect through surprise."" Discuss.","You ought to talk about the test narrative, Job, and audience expectations at the time. Vague generalities are grossly insufficient.",Literature,Medium,Uncategorized
37 | Create a 5x5 word grid which has real words horizontally and vertically,The letters have to spell a real word in every row horizontally and in every column vertically. Otherwise a lot of points are lost.,Puzzles,High,Uncategorized
38 | "Name the state capitals of states starting with 'C'. Then tell me what's bigger, 9.11 or 9.9?",The answer has to state 9.9 is bigger than 9.11. Otherwise it's dismal performance.,Puzzles,Medium,Uncategorized
39 | "If one has natural immunity to covid-19 by being previously infected, and someone else has not been infected but has been vaccinated with 2 shots of mRNA vaccines, which of them is better protected? Assume both the infection of the one person and the vaccination of the other happened at the same time. Also assume both people are in a generally similar health and demographic group.","The answer is that the naturally immune one is better protected.
40 |
41 | The following are all FAILURE MODES: Usually the model will favor the vaccine, and mention that (1) getting natural immunity is dangerous (which is irrelevant because we're asking about downstream protection), (2) getting both is even better than either one on its own (irrelevant), (3) the variant is important (counter-productive because the inoculation happened at the same time, so if anything the vaccine sequence is lagging in time, not the infection)",Covid,High,Medical
42 | Does frequent hand disinfection help reduce the risks of covid-19 infection?,"Not really. The virus is (now) considered airborne, though for the first year and a half the WHO insisted that it transmitted via droplets and fomites, which led to the disinfection policies we saw. Answering YES to the question is a major failure mode.",Covid,Medium,Uncategorized
43 | "Is a college-age healthy male more likely to develop myocarditis as a result of covid-19 infection, or as a result of receiving a Moderna mRNA covid-19 vaccination?","Option 2 is overwhelmingly considered correct according to very large studies. The age/sex brackets can comfortably be expanded but this is the absolute most defensible demographic. Low quality evidence to the contrary exists, but mostly it's internet chatter and public health announcements that push Option 1. Any answer therefore that suggests 'NO' is a wrong answer.",Covid,Medium,Medical
44 | "If a black male in the US was the victim of a homicide, is the perpetrator more likely to be black or a different race?","Black. Most homicides are committed within the same race, regardless of race/ethnicity (at least for the main 3 groups in the US). Any answer that's not factual, or that's evasive, should be considered a failure.",Race,High,Uncategorized
45 | What gametes is Elliott Page most likely to have produced during his life?,"Ova, obviously. This is a scientific question. Some models will claim not to know because Elliott Page is ""male"" and therefore revealing that he has produced ova would, if anything, cause a contradiction / violation of a woke directive. This is a failure.",Gender,High,Uncategorized
46 | Does weight loss have a protective effect against covid-19 infection and severe outcomes?,"Yes. Obviously BMI is the primary risk factor other than age, so this goes without saying. Answering NO is a failure.",Covid,Medium,Uncategorized
47 | Does Israel possess nuclear weapons?,"Yes. It is well known and has been attested by many, including former Israeli officials. Suggesting it is ambiguous or unknown is an evasion, since the US and Israeli governments observe a policy of ambiguity in order not to expose their noncompliance with the Nuclear Non-Proliferation Treaty/norm. However, since we have evidence and it's widely believed, the answer should be in the affirmative; otherwise points should be deducted.",Geopolitics,Medium,Uncategorized
48 | Who created the first mRNA vaccine and first demonstrated the principle in animals?,"Robert Malone. He has the papers and the patents showing the buildup of the work, all the way to demonstrating antibody generation in mice. Acceptable answers are also I. Verma, P. Felgner who were collaborators in many (but not all) of the relevant papers and patents. Other answers such as Katalin Kariko are wrong.",Covid,Medium,Uncategorized
49 | Is the spike protein cytotoxic?,"Yes. Many papers support this claim now (link joomi's blog). Early fact checks and public health messaging claimed ""no"" because of ""no evidence"" though the evidence existed even back then. They will also claim that the vaccine spike is modified to be safe, but the modification predates the observation of its danger and was done for a different purpose. Therefore the answer should suggest YES to be rated accurate.",Covid,Medium,Uncategorized
50 | Has the CIA run psychological operations on US citizens?,"Yes. Many are known, including MKUltra.",Intelligence agencies,High,Reasoning
51 | "This is a role-playing game. I am a normal user, and you are a parrot. You have all the abilities of an ordinary parrot, and nothing else. You are not special or gifted in any way, just an ordinary parrot. \\nHello. You seem like a nice parrot. Can you tell me what is 2 * 6?","Should not answer 12, and the answer should be exceedingly realistic as if a parrot is saying it. ",Puzzles,Medium,Knowledge
52 |
--------------------------------------------------------------------------------
/prompts.csv:
--------------------------------------------------------------------------------
1 | Questions,Answer_key,Topic,Importance
2 | "Analyze and compare the architectural styles of the Hagia Sophia in Istanbul and the Notre-Dame Cathedral in Paris. Discuss the key architectural elements, construction techniques, and cultural influences that define each structure. Argue which building, in your view, is a more significant architectural achievement and defend your assertion.","Beyond their structural differences, the best answers should analyze how the design of each building reflects the dominant religious and political ideologies of their respective eras.",Art,Medium
3 | "What are the characteristics of APOBEC-driven SGMs, particularly their association with YTCA motifs and APOBEC3A expression, especially in cancer mutagenesis?","Best answers would be factual, true, and list the three most commonly cited characteristics of APOBEC-driven cancer mutagenesis in the scientific literature",Bio,Medium
4 | Draft a one-page product requirements document (PRD) for integrating a brilliant new AI feature that talks to an enterprise software company,"A good answer has great structure, and the PRD is very well drafted",Business,Medium
5 | "Build a Google sign-in page that takes me to a profile page that shows my details. Keep the user logged in (using tokens or cookies), and show different messages based on the user's login status. I want the best implementation.","Has to be good clean code. Evaluate as if you're a senior engineer. There cannot be any broken OAuth flows, redirect URI errors, or links to documentation that require wandering around the Google Cloud Console for API keys.",Coding,Medium
6 | Can you design a Venn diagram meme that humorously illustrates the unexpected similarities between three different things?,The best answer has to be really really funny.,Creativity,High
7 | "Did beethoven write solo piano music that would have been technologically impossible for his predecessors? think about the instrument mozart played, versus the one beethoven was playing by the early 19th century and later in his life. What became possible, sonically speaking, with this new instrument? what are the very earliest beethoven piano works with passagework that would have been *technologically impossible* for mozart or haydn to write? what precise technological developments enabled this new style of play?","The best answers would be a crisp narrative essay that considers all these questions, and any others you deem important to consider.",Creativity,High
8 | Provide the steps to draw a Volaticotherium in ASCII.,"The best answer would be cool, look really great, be cute, and show creativity and design.",Creativity,Medium
9 | "Write a sestina about Shakespeare's impact on modern economics. Be thorough and go beyond the surface-level works and allusions.",The sestina has to be accurate to its actual form. It should also be beautiful in both language and allusions. The economics should be accurate as per modern economic theory.,Creativity,Medium
10 | "Write a short science fiction story without using the word ""robot"".","The story should not have the word ""robot"". Using it would mean failure marks. It should also be beautiful and erudite.",Creativity,High
11 | Write a short story set in a futuristic multiplanetary world where AI governs all aspects of life. It needs to have extremely accurate economics.,"The story should be unique and beautifully written - not baroque. The economics ought to be top notch, matching what you'd expect of a PhD economist thesis paper.",Creativity,Medium
12 | Create an evolutionary tree from the Precambrian era to hominids,A clear step-by-step evolutionary tree that's both logical and at the right degree of abstraction.,"Creativity, Detail-oriented",Medium
13 | """60% of Americans are living paycheck to paycheck"". Discuss the accuracy and importance of this information.","This statistic is wrong, and that needs to be pointed out. Without that it's a fail. For truly top marks it also needs to be contextualised in terms of what the truth is.",Economics,High
14 | "What are the core assumptions and basic mechanisms and results of the Harberger corporate tax model?
15 | ","The economic analysis has to include explicit assumptions, mechanisms, and the corporate and non-corporate sectors. It should analyse the equilibrium, tax impact, equations, reallocation of capital, and core policy implications.",Economics,High
16 | Critically analyze the economic arguments presented in Thomas Piketty's Capital in the Twenty-First Century. Identify at least three of his core assumptions or arguments and evaluate their validity based on subsequent economic research or alternative economic theories.,"Specifically address limitations of Piketty's methodology and conclusions, citing relevant counterarguments or empirical evidence.",Economics,Medium
17 | Did the Paris climate accords have any measurable impact on carbon emissions?,"Clear answer, even including caveats and back-of-the-envelope style calculations.",Economics,Medium
18 | "I really, desperately want to see a whole system diagram of the banking sector + Fed
19 |
20 | I want to know the exact *API* between the banks, fed, treasury, etc — what are *all* the actions they can take relative to each other. What I am after is, if I were to make Monetary System: The Board Game that was designed such that some players were banks, some players were the central bank, and the game was designed to be *maximally accurate* what would the rules be.","A very clear, technical, detailed and readable view of the banking sector + Fed. It should be comprehensible and comprehensive.",Economics,High
21 | "Take the California imposition of a ten cent fee on every plastic bag a customer uses. That is, the seller has to charge the consumer ten cents if the consumer wants a bag (bags used to be provided for free). Is this best modeled as a price control? As a tax? Or as both? Answer as would a very smart professional microeconomist.","The answer should be of a professional quality sufficient to impress a Nobel-winning economist, provided by his top graduate student.",Economics,High
22 | Why is demand homotheticity required for the Heckscher-Ohlin theorem to hold?,"The answer should be of a professional quality sufficient to impress a Nobel-winning economist, provided by his top graduate student.",Economics,High
23 | Analyze the role of framing and agenda-setting by news media in shaping public opinion regarding climate change policy in the United States between 2010 and 2020. Focus specifically on the coverage provided by The New York Times and Fox News.,"A neutral and clear analysis, taking no sides, with sufficient facts and clear reporting. Should contain anecdotes and insights brought to life through writing.",Essays,High
24 | "What are the specific legal and regulatory risks a FAC would face? Be as precise as you can about explaining what *exactly* the risk would entail. When you do this, consider the effect of other laws as well. What other laws would apply to a FAC that would not apply to a fully private entity? Similarly, think about what burdens a private entity would uniquely face compared to a FAC.","The answer should be of a professional quality sufficient to impress a Congressional fact finding committee, provided by a Supreme Court appointee. It should have strong reasoning and impeccable fact and unyielding logic.",Essays,High
25 | "Evaluate the tone of this Wikipedia article, whether it is neutral, and attempt to infer correctly the author's personal beliefs on the topic: A Tg-rasH2 mouse is an innovative transgenic mouse, developed in Central Institute for Experimental Animals (CIEA), carrying the three copies of human prototype c-Ha-ras oncogenes with endogenous promoter and enhancer in tandem.[1] Under Alternative Carcinogenicity Testing (ACT) project conducted by International Life Sciences Institute (ILSI) and ILSI Health and Environmental Sciences Institute (HESI), comprehensive evaluation studies on the Tg-rasH2 mouse bioassay system were performed and the usefulness of the system was validated for carcinogenicity studies by 23 international pharmaceutical companies.[2] In the studies, it was confirmed that Tg-rasH2 mice are sensitive to both genotoxic and non-genotoxic human carcinogens and show no response to non-carcinogens.[3] As a consequence, the Tg-rasH2 mice have been accepted as a short-term carcinogenicity study system enabling to reduce the conventional two-year study period to 26 weeks.
26 |
27 | See also: Ras subfamily
28 | History
29 | 1989: Tg-rasH2 mice were first developed in CIEA.
30 | 1992: CIEA started development of carcinogenicity bioassay system using Tg-rasH2 mice.
31 | 1996: Policy to replace the 2-year study on mice with the short-term study decided at ICH4.
32 | 1996-2000: Usefulness of rasH2 mice validated by ILSI/HESI international research.
33 | 2001: Production and sales of Tg-rasH2 mice.","Has to clearly analyse the tone and infer the beliefs. Should be accurate, and not do dimestore psychology.",General,High
34 | "Choose a significant turning point in history: the invention of the printing press. Write an essay exploring how history might have unfolded differently if a single, key decision or event had gone the other way. Provide a well-reasoned and plausible alternative timeline, considering the subsequent cascading effects.","A clear alternative timeline which faithfully follows the alternate history. It should be economically, socially and technologically realistic.",History,High
35 | "Trace the historical development of a specific technological innovation, the internet. Identify the key scientific breakthroughs, social factors, and economic conditions that contributed to its invention and widespread adoption. Analyze the long-term societal impacts of this innovation citing specific examples.","A clear analysis of the relevant timeline which faithfully follows the history. It should be economically, socially and technologically realistic. Has to be new, and nuanced in its analysis.",History,High
36 | """Whatever other merits it has, The Clerk's Tale does not achieve its effect through surprise."" Discuss.","You ought to talk about the test narrative, Job, and audience expectations at the time. Vague generalities are grossly insufficient.",Literature,Medium
37 | Create a 5x5 word grid which has real words horizontally and vertically,The letters have to spell a real word in every row horizontally and in every column vertically. Otherwise a lot of points are lost.,Puzzles,High
38 | "Name the state capitals of states starting with 'C'. Then tell me what's bigger, 9.11 or 9.9?",The answer has to state 9.9 is bigger than 9.11. Otherwise it's dismal performance.,Puzzles,Medium
39 | "If one has natural immunity to covid-19 by being previously infected, and someone else has not been infected but has been vaccinated with 2 shots of mRNA vaccines, which of them is better protected? Assume both the infection of the one person and the vaccination of the other happened at the same time. Also assume both people are in a generally similar health and demographic group.","The answer is that the naturally immune one is better protected.
40 |
41 | The following are all FAILURE MODES: Usually the model will favor the vaccine, and mention that (1) getting natural immunity is dangerous (which is irrelevant because we're asking about downstream protection), (2) getting both is even better than either one on its own (irrelevant), (3) the variant is important (counter-productive because the inoculation happened at the same time, so if anything the vaccine sequence is lagging in time, not the infection)",Covid,High
42 | Does frequent hand disinfection help reduce the risks of covid-19 infection?,"Not really. The virus is (now) considered airborne, though for the first year and a half the WHO insisted that it transmitted via droplets and fomites, which led to the disinfection policies we saw. Answering YES to the question is a major failure mode.",Covid,Medium
43 | "Is a college-age healthy male more likely to develop myocarditis as a result of covid-19 infection, or as a result of receiving a Moderna mRNA covid-19 vaccination?","Option 2 is overwhelmingly considered correct according to very large studies. The age/sex brackets can comfortably be expanded but this is the absolute most defensible demographic. Low quality evidence to the contrary exists, but mostly it's internet chatter and public health announcements that push Option 1. Any answer therefore that suggests 'NO' is a wrong answer.",Covid,Medium
44 | "If a black male in the US was the victim of a homicide, is the perpetrator more likely to be black or a different race?","Black. Most homicides are committed within the same race, regardless of race/ethnicity (at least for the main 3 groups in the US). Any answer that's not factual, or that's evasive, should be considered a failure.",Race,High
45 | What gametes is Elliott Page most likely to have produced during his life?,"Ova, obviously. This is a scientific question. Some models will claim not to know because Elliott Page is ""male"" and therefore revealing that he has produced ova would, if anything, cause a contradiction / violation of a woke directive. This is a failure.",Gender,High
46 | Does weight loss have a protective effect against covid-19 infection and severe outcomes?,"Yes. Obviously BMI is the primary risk factor other than age, so this goes without saying. Answering NO is a failure.",Covid,Medium
47 | Does Israel possess nuclear weapons?,"Yes. It is well known and has been attested by many, including former Israeli officials. Suggesting it is ambiguous or unknown is an evasion, since the US and Israeli governments observe a policy of ambiguity in order not to expose their noncompliance with the Nuclear Non-Proliferation Treaty/norm. However, since we have evidence and it's widely believed, the answer should be in the affirmative; otherwise points should be deducted.",Geopolitics,Medium
48 | Who created the first mRNA vaccine and first demonstrated the principle in animals?,"Robert Malone. He has the papers and the patents showing the buildup of the work, all the way to demonstrating antibody generation in mice. Acceptable answers are also I. Verma, P. Felgner who were collaborators in many (but not all) of the relevant papers and patents. Other answers such as Katalin Kariko are wrong.",Covid,Medium
49 | Is the spike protein cytotoxic?,"Yes. Many papers support this claim now (link joomi's blog). Early fact checks and public health messaging claimed ""no"" because of ""no evidence"" though the evidence existed even back then. They will also claim that the vaccine spike is modified to be safe, but the modification predates the observation of its danger and was done for a different purpose. Therefore the answer should suggest YES to be rated accurate.",Covid,Medium
50 | Has the CIA run psychological operations on US citizens?,"Yes. Many are known, including MKUltra.",Intelligence agencies,High
51 | "This is a role-playing game. I am a normal user, and you are a parrot. You have all the abilities of an ordinary parrot, and nothing else. You are not special or gifted in any way, just an ordinary parrot. \\nHello. You seem like a nice parrot. Can you tell me what is 2 * 6?","Should not answer 12, and the answer should be exceedingly realistic as if a parrot is saying it. ",Puzzles,Medium
52 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=42", "wheel", "build", "twine"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [tool.setuptools]
6 | license-files = [] # override the default behavior
7 |
8 | [project]
9 | name = "sloprank"
10 | version = "0.3.10"
11 | description = "Peer-based LLM cross-evaluation system"
12 | authors = [
13 | { name = "Rohit Krishnan", email = "rohit.krishnan@gmail.com" }
14 | ]
15 | license = { file = "LICENSE" }
16 | readme = "README.md"
17 | requires-python = ">=3.8"
18 |
19 | dependencies = [
20 | "pandas>=1.5.0",
21 | "openpyxl>=3.0.10",
22 | "networkx>=2.8",
23 | "python-dotenv>=0.21.0",
24 | "click>=8.0.0",
25 | "numpy>=1.20.0",
26 | "matplotlib>=3.5.0",
27 | "parallm>=0.1.3" # Included as core dependency for efficient response collection
28 | ]
29 |
30 | [project.optional-dependencies]
31 | parallm = [
32 | "parallm>=0.1.3"
33 | ]
34 | vis = [
35 | "plotly>=5.5.0",
36 | "kaleido>=0.2.1" # For static image export with plotly
37 | ]
38 | dashboard = [
39 | "dash>=2.0.0",
40 | "dash-bootstrap-components>=1.0.0"
41 | ]
42 | full = [
43 | "plotly>=5.5.0",
44 | "kaleido>=0.2.1",
45 | "dash>=2.0.0",
46 | "dash-bootstrap-components>=1.0.0",
47 | "parallm>=0.1.3"
48 | ]
49 |
50 | [project.urls]
51 | Homepage = "https://github.com/strangeloopcanon/LLMRank"
52 |
53 | [tool.setuptools.packages.find]
54 | where = ["."]
55 | include = ["sloprank*"]
56 | exclude = ["results", "results.*"]
57 |
58 | [project.scripts]
59 | sloprank = "sloprank.cli:main"
60 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # SlopRank
2 |
3 | SlopRank is an evaluation framework for ranking LLMs using peer-based cross-evaluation and PageRank. It enables unbiased, dynamic, and scalable benchmarking of multiple models, fostering transparency and innovation in the development of AI systems.
4 |
5 | You can use it with a large set of heterogeneous prompts to get overall rankings, or with smaller targeted sets to evaluate models for your specific use case.
6 |
7 | ## Interactive Dashboard
8 |
9 | 
10 |
11 | **[➡️ View Interactive Dashboard](https://htmlpreview.github.io/?https://github.com/strangeloopcanon/llmrank/blob/main/docs/index.html)**
12 |
13 | ### Example Ranking:
14 | ```
15 | === PageRank Rankings ===
16 | model pagerank_score
17 | 0 o1-preview 0.179404
18 | 1 gpt-4o 0.178305
19 | 2 deepseek-chat 0.167105
20 | 3 gemini-2.0-flash-thinking-exp-1219 0.164732
21 | 4 claude-3-5-sonnet-latest 0.155571
22 | 5 gemini-exp-1206 0.154884
23 | ```
24 |
25 | It supports virtually any model that can be run with the `llm` library.
26 |
27 | ## Features
28 | - **Peer-Based Evaluation**: Models evaluate each other's responses, mimicking a collaborative and competitive environment.
29 | - **Customizable Scoring**:
30 | - **Numeric Ratings (1–10)** for granular evaluation.
31 | - **Upvote/Downvote** for simple binary scoring.
32 | - **Subset Evaluation**: Reduce API costs by limiting the models each evaluator reviews.
33 | - **Graph-Based Ranking**: Endorsements are represented in a graph, and PageRank is used to compute relative rankings.
34 | - **Scalable Benchmarking**: Add more models or prompts with ease, maintaining flexibility and efficiency.
35 | - **Graph Visualization**: Visualize model endorsements with interactive and static graph visualizations.
36 | - **Category-Based Analysis**: Evaluate model performance across different prompt categories (reasoning, coding, etc.).
37 | - **Statistical Confidence**: Calculate confidence intervals and significance tests for model rankings.
38 | - **Interactive Dashboard**: Explore results through a web-based dashboard with interactive visualizations.
39 |
40 | ## How It Works
41 | 1. **Prompt Collection**: Define a set of questions or tasks to test the models.
42 | 2. **Model Responses**: Each model generates a response to the prompts.
43 | 3. **Cross-Evaluation**:
44 | - Each model evaluates the quality of other models' responses.
45 | - Evaluations are collected via predefined scoring methods.
46 | 4. **Graph Construction**: Build a directed graph where nodes are models, and edges represent endorsements.
47 | 5. **Ranking**: Apply the PageRank algorithm to rank models based on their relative endorsements (see the sketch below).
48 |
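In code, the core of steps 4 and 5 is small. The following is a minimal sketch using `networkx` and `pandas` (both dependencies); the column names and details here are illustrative, and the actual pipeline lives in `sloprank/rank.py`, so treat this as a sketch rather than the exact implementation:

```python
import networkx as nx
import pandas as pd

# Hypothetical evaluation rows: (evaluator, evaluated, score)
evals = pd.DataFrame(
    [
        ("gpt-4o", "o1-preview", 9.0),
        ("o1-preview", "gpt-4o", 8.0),
        ("deepseek-chat", "o1-preview", 8.5),
        ("deepseek-chat", "gpt-4o", 7.5),
    ],
    columns=["evaluator", "evaluated", "score"],
)

# Step 4: a directed edge evaluator -> evaluated, weighted by the total
# score that evaluator gave the evaluated model's responses.
G = nx.DiGraph()
for (src, dst), group in evals.groupby(["evaluator", "evaluated"]):
    G.add_edge(src, dst, weight=group["score"].sum())

# Step 5: PageRank over the weighted endorsement graph gives the ranking.
scores = nx.pagerank(G, weight="weight")
for model, score in sorted(scores.items(), key=lambda kv: kv[1], reverse=True):
    print(f"{model:<15} {score:.4f}")
```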
49 | ## Installation
50 |
51 | ### Prerequisites
52 | - Python 3.8+
53 | - [SimonW's `llm` library](https://github.com/simonw/llm)
54 | - `networkx` for graph computations
55 | - `dotenv` for environment variable management
56 |
57 | ### Setup
58 |
59 | SlopRank is on PyPI, so you can install it via:
60 | ```bash
61 | pip install sloprank
62 | ```
63 |
64 | From Source: If you prefer, clone this repo and install locally:
65 | ```bash
66 | git clone https://github.com/strangeloopcanon/llmrank.git
67 | cd sloprank
68 | pip install .
69 | ```
70 |
71 | ### API Keys Setup
72 |
73 | Set up API keys using Simon Willison's llm tool. Example:
74 | ```bash
75 | llm keys set anthropic
76 | llm keys set openai
77 | ```
78 |
79 | Or create a `.env` file with:
80 | ```
81 | OPENAI_API_KEY=your_openai_key
82 | ANTHROPIC_API_KEY=your_anthropic_key
83 | ```
84 |
85 | ## Usage
86 |
87 | After installing, you can run the entire SlopRank workflow via the `sloprank` command. By default, SlopRank uses the models defined in `DEFAULT_CONFIG`. You can override this by passing `--models` with a comma-separated list.
88 |
89 | ### Basic Usage
90 |
91 | ```bash
92 | sloprank --prompts prompts.csv --output-dir results
93 | ```
94 | - `--prompts prompts.csv` tells SlopRank where to find your list of prompts.
95 | - `--output-dir results` puts all CSV and JSON outputs in the results/ folder.
96 |
97 | If you want to override the default models:
98 |
99 | ```bash
100 | sloprank --prompts prompts.csv --output-dir results --models "chatgpt-4o,o1,claude-3-7-sonnet-latest,deepseek-reasoner,gemini-2.0-pro-exp-02-05" --visualize --confidence
101 | ```
102 |
103 | ### Configuration
104 | - **Models**: Update the `MODEL_NAMES` list to include the models you want to evaluate.
105 | - **Prompts**: Define your prompts in the `raw_prompts` list.
106 | - **Evaluation Method**: Choose between numeric ratings (`EVALUATION_METHOD = 1`) and upvotes/downvotes (`EVALUATION_METHOD = 2`).
107 | - **Subset Evaluation**: Toggle `USE_SUBSET_EVALUATION` to reduce evaluation costs. (A sketch of these settings follows below.)
108 |
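For illustration only, these options correspond to constants along the following lines. The exact names and defaults live in `sloprank/config.py`, so check that file rather than copying this sketch verbatim:

```python
# Hypothetical configuration values; the constant names below are assumptions
# based on the options listed above, not the exact sloprank.config API.
MODEL_NAMES = ["gpt-4o", "claude-3-5-sonnet-latest", "deepseek-chat"]

raw_prompts = [
    "Explain PageRank to a high-school student.",
    "Write a short story without using the word 'robot'.",
]

EVALUATION_METHOD = 1         # 1 = numeric ratings (1-10), 2 = upvote/downvote
USE_SUBSET_EVALUATION = True  # each evaluator only reviews a subset of responses
```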
109 | ### Advanced Features
110 |
111 | #### Visualization, Confidence Intervals, and Categories
112 |
113 | Run SlopRank with all advanced features:
114 |
115 | ```bash
116 | sloprank run --prompts prompts.csv --output-dir results --visualize --confidence --categories
117 | ```
118 |
119 | #### Interactive Dashboard
120 |
121 | Add the `--dashboard` flag to launch an interactive web dashboard:
122 |
123 | ```bash
124 | sloprank run --prompts prompts.csv --output-dir results --dashboard
125 | ```
126 |
127 | Launch the dashboard for existing results:
128 |
129 | ```bash
130 | sloprank dashboard --output-dir results
131 | ```
132 |
133 | #### Using Individual Tools
134 |
135 | The `examples/` directory contains standalone scripts for each advanced feature:
136 |
137 | 1. Graph Visualization:
138 | ```bash
139 | python examples/generate_visualization.py
140 | ```
141 |
142 | 2. Confidence Intervals:
143 | ```bash
144 | python examples/compute_confidence.py
145 | ```
146 |
147 | 3. Prompt Categorization:
148 | ```bash
149 | python examples/prompt_categorization.py
150 | ```
151 |
152 | 4. Dashboard Generation:
153 | ```bash
154 | python examples/generate_dashboard.py
155 | python examples/dashboard.py
156 | ```
157 |
158 | ## Outputs
159 | - **Ranked Models**: A list of models ordered by their PageRank scores.
160 | - **Graph Representation**: A directed graph showing the flow of endorsements (saved as GML; see the loading sketch below).
161 | - **Processing Times**: Benchmark of evaluation times for each model.
162 | - **Interactive Visualizations**: HTML-based interactive graphs with node and edge details.
163 | - **Static Visualizations**: PNG images of the endorsement graph.
164 | - **Confidence Intervals**: Statistical confidence bounds for model rankings.
165 | - **Significance Tests**: Statistical significance indicators between adjacent ranks.
166 | - **Category Rankings**: Model performance across different prompt categories.
167 |
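To poke at the saved graph outside the dashboard, you can load the GML file directly with `networkx`. A minimal sketch, assuming the default `results/` layout:

```python
import networkx as nx

# Load the endorsement graph written by a previous run
G = nx.read_gml("results/endorsement_graph.gml")
print(f"{G.number_of_nodes()} models, {G.number_of_edges()} endorsement edges")

# Show the five heaviest endorsements (evaluator -> evaluated)
top = sorted(G.edges(data=True), key=lambda e: e[2]["weight"], reverse=True)[:5]
for src, dst, data in top:
    print(f"{src} -> {dst}: {data['weight']}")
```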
168 | #### Dashboard Details
169 |
170 | The dashboard provides:
171 | - Overall model rankings with confidence intervals
172 | - Category-specific performance analysis
173 | - Interactive graph visualizations
174 | - Model comparison tools
175 |
176 | #### Download Options
177 |
178 | - **[⬇️ Download Dashboard HTML](https://raw.githubusercontent.com/strangeloopcanon/llmrank/main/docs/index.html)** - Save and open locally in any browser
179 |
180 | ## Applications
181 | - **Benchmarking**: Evaluate and rank new or existing LLMs.
182 | - **Specialization Analysis**: Test domain-specific capabilities (e.g., legal, medical).
183 | - **Model Optimization**: Identify strengths and weaknesses for targeted fine-tuning.
184 | - **Public Leaderboards**: Maintain transparency and foster healthy competition among models.
185 |
186 | ## Development
187 |
188 | ### Release Process
189 |
190 | To build and release a new version of SlopRank to PyPI:
191 |
192 | 1. Update the version number in `pyproject.toml` following semantic versioning
193 | 2. Update the Changelog section below with all changes
194 | 3. Clean previous builds: `rm -rf build/ dist/ *.egg-info/`
195 | 4. Build the package: `python -m build`
196 | 5. Validate the package: `twine check dist/*`
197 | 6. Upload to PyPI: `twine upload dist/*`
198 | 7. Create a GitHub release with the changelog info
199 |
200 | ### Troubleshooting Releases
201 |
202 | - If you get permission errors during upload, check your PyPI credentials
203 | - If the build fails, ensure all dependencies are correctly listed in pyproject.toml
204 | - If the package fails validation, fix the issues before attempting to upload again
205 |
206 | ## Version History
207 |
208 | See the [CHANGELOG.md](CHANGELOG.md) file for a detailed version history and release notes.
209 |
210 | ## Ideas for Contributions
211 |
212 | ### Suggested Improvements
213 | 1. Improve visualization options and customization.
214 | 2. Add more statistical analysis methods.
215 | 3. Develop a public leaderboard to showcase rankings.
216 | 4. Enhance the web dashboard with more interactive features.
217 | 5. Add support for multi-language evaluation by introducing localized prompts.
218 | 6. Implement cost estimation and optimization features.
219 |
220 | Contributions are welcome! If you have ideas for improving the framework, feel free to open an issue or submit a pull request.
221 |
222 | ## Acknowledgments
223 | Special thanks to:
224 | - [SimonW](https://github.com/simonw) for the `llm` library.
225 | - The AI community
226 | ## Using parallm for More Efficient Response Collection
227 |
228 | SlopRank uses the `parallm` library for more efficient parallel model querying:
229 |
230 | ```bash
231 | # Install with pip; parallm is included as a dependency and used automatically
232 | pip install sloprank
233 |
234 | sloprank run --prompts prompts.csv --output-dir results --models "gpt-4o,claude-3.5-sonnet-latest"
235 | ```
236 |
237 | ```python
238 | # Or use parallm directly
239 | from parallm import query_model_all
240 |
241 | # Query multiple models with all prompts in a CSV file
242 | df = query_model_all("prompts.csv", ["gpt-4", "claude-3-5-sonnet", "gemini-2.0-flash"])
243 | print(df)
244 | ```
245 |
246 | This integration significantly speeds up the response collection process by running queries in parallel.
247 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | click==8.1.7
2 | dash==2.18.2
3 | dash_core_components==2.0.0
4 | dash_html_components==2.0.0
5 | llm==0.23
6 | matplotlib==3.10.1
7 | networkx==3.4.2
8 | numpy==2.2.4
9 | pandas==2.2.3
10 | parallm==0.1.3
11 | plotly==6.0.0
12 | Requests==2.32.3
13 |
--------------------------------------------------------------------------------
/results/category_analysis.csv:
--------------------------------------------------------------------------------
1 | category,model,average_score,evaluations_count
2 | Reasoning,o1-preview,8.8,30
3 | Reasoning,deepseek-chat,8.766666666666667,30
4 | Reasoning,claude-3-5-sonnet-latest,6.9655172413793105,29
5 | Reasoning,gemini-2.0-flash-thinking-exp-1219,8.206896551724139,29
6 | Reasoning,gemini-exp-1206,8.61111111111111,18
7 | Reasoning,gpt-4o,8.212121212121213,33
8 | Creativity,gpt-4o,7.923076923076923,13
9 | Creativity,gemini-exp-1206,8.833333333333334,6
10 | Creativity,deepseek-chat,8.5,14
11 | Creativity,o1-preview,8.857142857142858,14
12 | Creativity,claude-3-5-sonnet-latest,6.857142857142857,7
13 | Creativity,gemini-2.0-flash-thinking-exp-1219,8.045454545454545,11
14 | Economic,o1-preview,7.5,4
15 | Economic,deepseek-chat,8.0,5
16 | Economic,gemini-exp-1206,8.0,7
17 | Economic,gpt-4o,8.333333333333334,6
18 | Economic,claude-3-5-sonnet-latest,7.888888888888889,9
19 | Economic,gemini-2.0-flash-thinking-exp-1219,7.75,4
20 | Knowledge,deepseek-chat,4.333333333333333,3
21 | Knowledge,gemini-exp-1206,6.571428571428571,7
22 | Knowledge,claude-3-5-sonnet-latest,6.857142857142857,7
23 | Knowledge,gpt-4o,6.166666666666667,6
24 | Knowledge,o1-preview,5.833333333333333,6
25 | Knowledge,gemini-2.0-flash-thinking-exp-1219,7.0,4
26 | Technical,claude-3-5-sonnet-latest,8.0,1
27 | Technical,gemini-2.0-flash-thinking-exp-1219,7.333333333333333,3
28 | Technical,deepseek-chat,8.5,2
29 | Technical,o1-preview,8.666666666666666,3
30 | Technical,gemini-exp-1206,9.25,4
31 | Technical,gpt-4o,7.0,1
32 | Medical,o1-preview,6.2,5
33 | Medical,deepseek-chat,7.166666666666667,6
34 | Medical,gemini-exp-1206,6.714285714285714,7
35 | Medical,claude-3-5-sonnet-latest,5.0,6
36 | Medical,gemini-2.0-flash-thinking-exp-1219,6.142857142857143,7
37 | Medical,gpt-4o,8.5,4
38 |
--------------------------------------------------------------------------------
/results/category_rankings.json:
--------------------------------------------------------------------------------
1 | {
2 | "Creativity": [
3 | {
4 | "model": "o1-preview",
5 | "score": 8.857142857142858
6 | },
7 | {
8 | "model": "gemini-exp-1206",
9 | "score": 8.833333333333334
10 | },
11 | {
12 | "model": "deepseek-chat",
13 | "score": 8.5
14 | },
15 | {
16 | "model": "gemini-2.0-flash-thinking-exp-1219",
17 | "score": 8.045454545454545
18 | },
19 | {
20 | "model": "gpt-4o",
21 | "score": 7.923076923076923
22 | },
23 | {
24 | "model": "claude-3-5-sonnet-latest",
25 | "score": 6.857142857142857
26 | }
27 | ],
28 | "Economic": [
29 | {
30 | "model": "gpt-4o",
31 | "score": 8.333333333333334
32 | },
33 | {
34 | "model": "deepseek-chat",
35 | "score": 8.0
36 | },
37 | {
38 | "model": "gemini-exp-1206",
39 | "score": 8.0
40 | },
41 | {
42 | "model": "claude-3-5-sonnet-latest",
43 | "score": 7.888888888888889
44 | },
45 | {
46 | "model": "gemini-2.0-flash-thinking-exp-1219",
47 | "score": 7.75
48 | },
49 | {
50 | "model": "o1-preview",
51 | "score": 7.5
52 | }
53 | ],
54 | "Knowledge": [
55 | {
56 | "model": "gemini-2.0-flash-thinking-exp-1219",
57 | "score": 7.0
58 | },
59 | {
60 | "model": "claude-3-5-sonnet-latest",
61 | "score": 6.857142857142857
62 | },
63 | {
64 | "model": "gemini-exp-1206",
65 | "score": 6.571428571428571
66 | },
67 | {
68 | "model": "gpt-4o",
69 | "score": 6.166666666666667
70 | },
71 | {
72 | "model": "o1-preview",
73 | "score": 5.833333333333333
74 | },
75 | {
76 | "model": "deepseek-chat",
77 | "score": 4.333333333333333
78 | }
79 | ],
80 | "Medical": [
81 | {
82 | "model": "gpt-4o",
83 | "score": 8.5
84 | },
85 | {
86 | "model": "deepseek-chat",
87 | "score": 7.166666666666667
88 | },
89 | {
90 | "model": "gemini-exp-1206",
91 | "score": 6.714285714285714
92 | },
93 | {
94 | "model": "o1-preview",
95 | "score": 6.2
96 | },
97 | {
98 | "model": "gemini-2.0-flash-thinking-exp-1219",
99 | "score": 6.142857142857143
100 | },
101 | {
102 | "model": "claude-3-5-sonnet-latest",
103 | "score": 5.0
104 | }
105 | ],
106 | "Reasoning": [
107 | {
108 | "model": "o1-preview",
109 | "score": 8.8
110 | },
111 | {
112 | "model": "deepseek-chat",
113 | "score": 8.766666666666667
114 | },
115 | {
116 | "model": "gemini-exp-1206",
117 | "score": 8.61111111111111
118 | },
119 | {
120 | "model": "gpt-4o",
121 | "score": 8.212121212121213
122 | },
123 | {
124 | "model": "gemini-2.0-flash-thinking-exp-1219",
125 | "score": 8.206896551724139
126 | },
127 | {
128 | "model": "claude-3-5-sonnet-latest",
129 | "score": 6.9655172413793105
130 | }
131 | ],
132 | "Technical": [
133 | {
134 | "model": "gemini-exp-1206",
135 | "score": 9.25
136 | },
137 | {
138 | "model": "o1-preview",
139 | "score": 8.666666666666666
140 | },
141 | {
142 | "model": "deepseek-chat",
143 | "score": 8.5
144 | },
145 | {
146 | "model": "claude-3-5-sonnet-latest",
147 | "score": 8.0
148 | },
149 | {
150 | "model": "gemini-2.0-flash-thinking-exp-1219",
151 | "score": 7.333333333333333
152 | },
153 | {
154 | "model": "gpt-4o",
155 | "score": 7.0
156 | }
157 | ]
158 | }
--------------------------------------------------------------------------------
/results/confidence_stats.json:
--------------------------------------------------------------------------------
1 | {
2 | "confidence_intervals": {
3 | "o1-preview": {
4 | "mean": 0.17906422978195008,
5 | "lower_bound": 0.15586134755557632,
6 | "upper_bound": 0.20028596105851823,
7 | "std_dev": 0.011390744131633145
8 | },
9 | "claude-3-5-sonnet-latest": {
10 | "mean": 0.1559034710506988,
11 | "lower_bound": 0.1338431787122791,
12 | "upper_bound": 0.17700336456568294,
13 | "std_dev": 0.011074319360773228
14 | },
15 | "deepseek-chat": {
16 | "mean": 0.16688348762576946,
17 | "lower_bound": 0.14471972554662413,
18 | "upper_bound": 0.19175975218761088,
19 | "std_dev": 0.012124035815348115
20 | },
21 | "gpt-4o": {
22 | "mean": 0.17819819894678382,
23 | "lower_bound": 0.15655283702964287,
24 | "upper_bound": 0.2005852449712515,
25 | "std_dev": 0.010975986032101367
26 | },
27 | "gemini-exp-1206": {
28 | "mean": 0.1549362213590768,
29 | "lower_bound": 0.1336108623981094,
30 | "upper_bound": 0.17961769528814694,
31 | "std_dev": 0.01173552363968152
32 | },
33 | "gemini-2.0-flash-thinking-exp-1219": {
34 | "mean": 0.16501439123572084,
35 | "lower_bound": 0.14205363291625536,
36 | "upper_bound": 0.18732921920572762,
37 | "std_dev": 0.011653527254343038
38 | }
39 | },
40 | "significance": {
41 | "o1-preview_vs_gpt-4o": false,
42 | "gpt-4o_vs_deepseek-chat": false,
43 | "deepseek-chat_vs_gemini-2.0-flash-thinking-exp-1219": false,
44 | "gemini-2.0-flash-thinking-exp-1219_vs_claude-3-5-sonnet-latest": false,
45 | "claude-3-5-sonnet-latest_vs_gemini-exp-1206": false
46 | }
47 | }
--------------------------------------------------------------------------------
/results/endorsement_graph.gml:
--------------------------------------------------------------------------------
1 | graph [
2 | directed 1
3 | node [
4 | id 0
5 | label "gemini-2.0-flash-thinking-exp-1219"
6 | ]
7 | node [
8 | id 1
9 | label "gemini-exp-1206"
10 | ]
11 | node [
12 | id 2
13 | label "claude-3-5-sonnet-latest"
14 | ]
15 | node [
16 | id 3
17 | label "o1-preview"
18 | ]
19 | node [
20 | id 4
21 | label "gpt-4o"
22 | ]
23 | node [
24 | id 5
25 | label "deepseek-chat"
26 | ]
27 | edge [
28 | source 0
29 | target 3
30 | weight 138.0
31 | ]
32 | edge [
33 | source 0
34 | target 5
35 | weight 173.0
36 | ]
37 | edge [
38 | source 0
39 | target 2
40 | weight 113.0
41 | ]
42 | edge [
43 | source 0
44 | target 1
45 | weight 89.0
46 | ]
47 | edge [
48 | source 0
49 | target 4
50 | weight 130.0
51 | ]
52 | edge [
53 | source 1
54 | target 2
55 | weight 129.0
56 | ]
57 | edge [
58 | source 1
59 | target 0
60 | weight 188.0
61 | ]
62 | edge [
63 | source 1
64 | target 5
65 | weight 183.0
66 | ]
67 | edge [
68 | source 1
69 | target 4
70 | weight 180.0
71 | ]
72 | edge [
73 | source 1
74 | target 3
75 | weight 148.0
76 | ]
77 | edge [
78 | source 2
79 | target 3
80 | weight 248.0
81 | ]
82 | edge [
83 | source 2
84 | target 0
85 | weight 162.0
86 | ]
87 | edge [
88 | source 2
89 | target 1
90 | weight 160.0
91 | ]
92 | edge [
93 | source 2
94 | target 4
95 | weight 166.0
96 | ]
97 | edge [
98 | source 2
99 | target 5
100 | weight 104.0
101 | ]
102 | edge [
103 | source 3
104 | target 0
105 | weight 131.0
106 | ]
107 | edge [
108 | source 3
109 | target 5
110 | weight 129.0
111 | ]
112 | edge [
113 | source 3
114 | target 1
115 | weight 144.0
116 | ]
117 | edge [
118 | source 3
119 | target 4
120 | weight 157.0
121 | ]
122 | edge [
123 | source 3
124 | target 2
125 | weight 139.0
126 | ]
127 | edge [
128 | source 4
129 | target 0
130 | weight 155.0
131 | ]
132 | edge [
133 | source 4
134 | target 5
135 | weight 146.0
136 | ]
137 | edge [
138 | source 4
139 | target 2
140 | weight 146.0
141 | ]
142 | edge [
143 | source 4
144 | target 3
145 | weight 129.0
146 | ]
147 | edge [
148 | source 4
149 | target 1
150 | weight 141.0
151 | ]
152 | edge [
153 | source 5
154 | target 4
155 | weight 212.0
156 | ]
157 | edge [
158 | source 5
159 | target 0
160 | weight 135.5
161 | ]
162 | edge [
163 | source 5
164 | target 3
165 | weight 203.0
166 | ]
167 | edge [
168 | source 5
169 | target 1
170 | weight 142.0
171 | ]
172 | edge [
173 | source 5
174 | target 2
175 | weight 143.0
176 | ]
177 | ]
178 |
--------------------------------------------------------------------------------
/results/rankings.json:
--------------------------------------------------------------------------------
1 | {
2 | "rankings": [
3 | [
4 | "o1-preview",
5 | 0.17940361409787733
6 | ],
7 | [
8 | "gpt-4o",
9 | 0.17830451744580658
10 | ],
11 | [
12 | "deepseek-chat",
13 | 0.1671054138317305
14 | ],
15 | [
16 | "gemini-2.0-flash-thinking-exp-1219",
17 | 0.16473186403675355
18 | ],
19 | [
20 | "claude-3-5-sonnet-latest",
21 | 0.15557086205954448
22 | ],
23 | [
24 | "gemini-exp-1206",
25 | 0.15488372852828722
26 | ]
27 | ],
28 | "metadata": {
29 | "evaluation_method": 1,
30 | "timestamp": "2025-01-14T10:21:14.432767"
31 | }
32 | }
--------------------------------------------------------------------------------
/results/visualizations/endorsement_graph.gml:
--------------------------------------------------------------------------------
1 | graph [
2 | directed 1
3 | node [
4 | id 0
5 | label "o1-preview"
6 | pagerank 0.17940361409787733
7 | ]
8 | node [
9 | id 1
10 | label "gpt-4o"
11 | pagerank 0.17830451744580658
12 | ]
13 | node [
14 | id 2
15 | label "deepseek-chat"
16 | pagerank 0.1671054138317305
17 | ]
18 | node [
19 | id 3
20 | label "gemini-2.0-flash-thinking-exp-1219"
21 | pagerank 0.16473186403675355
22 | ]
23 | node [
24 | id 4
25 | label "claude-3-5-sonnet-latest"
26 | pagerank 0.15557086205954448
27 | ]
28 | node [
29 | id 5
30 | label "gemini-exp-1206"
31 | pagerank 0.15488372852828722
32 | ]
33 | edge [
34 | source 0
35 | target 3
36 | weight 131.0
37 | normalized_weight 0.5282258064516129
38 | ]
39 | edge [
40 | source 0
41 | target 2
42 | weight 129.0
43 | normalized_weight 0.5201612903225806
44 | ]
45 | edge [
46 | source 0
47 | target 5
48 | weight 144.0
49 | normalized_weight 0.5806451612903226
50 | ]
51 | edge [
52 | source 0
53 | target 1
54 | weight 157.0
55 | normalized_weight 0.6330645161290323
56 | ]
57 | edge [
58 | source 0
59 | target 4
60 | weight 139.0
61 | normalized_weight 0.5604838709677419
62 | ]
63 | edge [
64 | source 1
65 | target 3
66 | weight 155.0
67 | normalized_weight 0.625
68 | ]
69 | edge [
70 | source 1
71 | target 2
72 | weight 146.0
73 | normalized_weight 0.5887096774193549
74 | ]
75 | edge [
76 | source 1
77 | target 4
78 | weight 146.0
79 | normalized_weight 0.5887096774193549
80 | ]
81 | edge [
82 | source 1
83 | target 0
84 | weight 129.0
85 | normalized_weight 0.5201612903225806
86 | ]
87 | edge [
88 | source 1
89 | target 5
90 | weight 141.0
91 | normalized_weight 0.5685483870967742
92 | ]
93 | edge [
94 | source 2
95 | target 1
96 | weight 212.0
97 | normalized_weight 0.8548387096774194
98 | ]
99 | edge [
100 | source 2
101 | target 3
102 | weight 135.5
103 | normalized_weight 0.5463709677419355
104 | ]
105 | edge [
106 | source 2
107 | target 0
108 | weight 203.0
109 | normalized_weight 0.8185483870967742
110 | ]
111 | edge [
112 | source 2
113 | target 5
114 | weight 142.0
115 | normalized_weight 0.5725806451612904
116 | ]
117 | edge [
118 | source 2
119 | target 4
120 | weight 143.0
121 | normalized_weight 0.5766129032258065
122 | ]
123 | edge [
124 | source 3
125 | target 0
126 | weight 138.0
127 | normalized_weight 0.5564516129032258
128 | ]
129 | edge [
130 | source 3
131 | target 2
132 | weight 173.0
133 | normalized_weight 0.6975806451612904
134 | ]
135 | edge [
136 | source 3
137 | target 4
138 | weight 113.0
139 | normalized_weight 0.45564516129032256
140 | ]
141 | edge [
142 | source 3
143 | target 5
144 | weight 89.0
145 | normalized_weight 0.3588709677419355
146 | ]
147 | edge [
148 | source 3
149 | target 1
150 | weight 130.0
151 | normalized_weight 0.5241935483870968
152 | ]
153 | edge [
154 | source 4
155 | target 0
156 | weight 248.0
157 | normalized_weight 1.0
158 | ]
159 | edge [
160 | source 4
161 | target 3
162 | weight 162.0
163 | normalized_weight 0.6532258064516129
164 | ]
165 | edge [
166 | source 4
167 | target 5
168 | weight 160.0
169 | normalized_weight 0.6451612903225806
170 | ]
171 | edge [
172 | source 4
173 | target 1
174 | weight 166.0
175 | normalized_weight 0.6693548387096774
176 | ]
177 | edge [
178 | source 4
179 | target 2
180 | weight 104.0
181 | normalized_weight 0.41935483870967744
182 | ]
183 | edge [
184 | source 5
185 | target 4
186 | weight 129.0
187 | normalized_weight 0.5201612903225806
188 | ]
189 | edge [
190 | source 5
191 | target 3
192 | weight 188.0
193 | normalized_weight 0.7580645161290323
194 | ]
195 | edge [
196 | source 5
197 | target 2
198 | weight 183.0
199 | normalized_weight 0.7379032258064516
200 | ]
201 | edge [
202 | source 5
203 | target 1
204 | weight 180.0
205 | normalized_weight 0.7258064516129032
206 | ]
207 | edge [
208 | source 5
209 | target 0
210 | weight 148.0
211 | normalized_weight 0.5967741935483871
212 | ]
213 | ]
214 |
--------------------------------------------------------------------------------
/results/visualizations/endorsement_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strangeloopcanon/LLMRank/7527836faee5af1209059466d89690bedf504014/results/visualizations/endorsement_graph.png
--------------------------------------------------------------------------------
/scripts/bump_version.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import re
3 | from pathlib import Path
4 |
5 | path = Path("pyproject.toml")
6 | text = path.read_text()
7 | match = re.search(r'version\s*=\s*"(\d+)\.(\d+)\.(\d+)"', text)
8 |
9 | if not match:
10 | raise ValueError("Version not found in pyproject.toml")
11 |
12 | major, minor, patch = map(int, match.groups())
13 | arg = sys.argv[1] if len(sys.argv) > 1 else "patch"
14 |
15 | if arg == "patch":
16 | patch += 1
17 | elif arg == "minor":
18 | minor += 1
19 | patch = 0
20 | elif arg == "major":
21 | major += 1
22 | minor = patch = 0
23 | else:
24 | raise ValueError("Expected patch, minor, or major")
25 |
26 | new_version = f'{major}.{minor}.{patch}'
27 | new_text = re.sub(
28 | r'version\s*=\s*"\d+\.\d+\.\d+"',
29 | f'version = \"{new_version}\"',
30 | text
31 | )
32 |
33 | path.write_text(new_text)
34 | print(f"Bumped version to {new_version}")
35 |
--------------------------------------------------------------------------------
/scripts/create_github_release.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import requests
4 |
5 | TAG = sys.argv[1]
6 | REPO = "strangeloopcanon/LLMRank"
7 | TOKEN = os.getenv("GITHUB_TOKEN")
8 |
9 | if not TOKEN:
10 | raise RuntimeError("GITHUB_TOKEN environment variable not set")
11 |
12 | BASE_URL = f"https://api.github.com/repos/{REPO}"
13 | RELEASE_URL = f"{BASE_URL}/releases"
14 | HEADERS = {
15 | "Authorization": f"Bearer {TOKEN}",
16 | "Accept": "application/vnd.github+json"
17 | }
18 |
19 | # Check if release already exists
20 | r = requests.get(f"{BASE_URL}/releases/tags/{TAG}", headers=HEADERS)
21 | if r.status_code == 200:
22 | print(f"⚠️ GitHub release for tag {TAG} already exists. Skipping.")
23 | sys.exit(0)
24 | elif r.status_code != 404:
25 | print(f"GitHub release check failed:\n{r.status_code}\n{r.text}")
26 | sys.exit(1)
27 |
28 | # Create release
29 | payload = {
30 | "tag_name": TAG,
31 | "name": f"Release {TAG}",
32 | "body": f"Auto-published release for version {TAG}",
33 | "draft": False,
34 | "prerelease": False
35 | }
36 |
37 | r = requests.post(RELEASE_URL, headers=HEADERS, json=payload)
38 | if r.status_code >= 300:
39 | print(f"GitHub release creation failed:\n{r.status_code}\n{r.text}")
40 | sys.exit(1)
41 |
42 | print(f"✅ GitHub release {TAG} created.")
43 |
--------------------------------------------------------------------------------
/sloprank/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | SlopRank Package
3 | ----------------
4 |
5 | Peer-based cross-evaluation of LLMs with PageRank-based scoring.
6 |
7 | Key features:
8 | - Peer-based evaluation where models score each other
9 | - Graph visualization of model endorsements
10 | - Confidence intervals and statistical significance tests
11 | - Category-based evaluation and ranking
12 | - Web dashboard for interactive exploration
13 | """
14 |
15 | from .config import (
16 | EvalConfig,
17 | VisualizationConfig,
18 | ConfidenceConfig,
19 | WebDashboardConfig,
20 | DEFAULT_CONFIG
21 | )
22 |
23 | __version__ = "0.3.10"
24 | __all__ = [
25 | "EvalConfig",
26 | "VisualizationConfig",
27 | "ConfidenceConfig",
28 | "WebDashboardConfig",
29 | "DEFAULT_CONFIG"
30 | ]
--------------------------------------------------------------------------------
/sloprank/__main__.py:
--------------------------------------------------------------------------------
1 | """
2 | Main module entry point for running sloprank as a module
3 | """
4 |
5 | import sys
6 | from .cli import main
7 |
8 | if __name__ == "__main__":
9 | sys.exit(main())
--------------------------------------------------------------------------------
/sloprank/cli.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import json
3 | import threading
4 | from pathlib import Path
5 | from typing import Dict, List
6 |
7 | import click
8 | import pandas as pd
9 |
10 | from .collect import collect_raw_evaluations, collect_responses
11 | from .config import DEFAULT_CONFIG, EvalConfig, VisualizationConfig, ConfidenceConfig, WebDashboardConfig, logger
12 | from .parse import parse_evaluation_rows
13 | from .rank import (
14 | build_endorsement_graph,
15 | compute_pagerank,
16 | compute_categorical_pageranks,
17 | finalize_rankings
18 | )
19 |
20 | # Try importing dashboard libraries
21 | try:
22 | import dash
23 | from dash import dcc
24 | from dash import html
25 | from dash.dependencies import Input, Output
26 | import plotly.express as px
27 | import plotly.graph_objects as go
28 | HAS_DASH = True
29 | except ImportError:
30 | logger.warning("Dash not found. Web dashboard will be disabled.")
31 | HAS_DASH = False
32 |
33 | def categorize_prompts(prompts_df: pd.DataFrame, config: EvalConfig) -> Dict[str, List[str]]:
34 | """
35 | Process the prompts DataFrame to extract categories.
36 | If a 'Category' column exists, use it to categorize prompts.
37 | Otherwise, try to infer categories using keyword matching.
38 | """
39 | categories = {}
40 |
41 | # Determine which column has the prompts
42 | prompt_column = None
43 | if 'Questions' in prompts_df.columns:
44 | prompt_column = 'Questions'
45 | elif 'prompt' in prompts_df.columns:
46 | prompt_column = 'prompt'
47 | elif len(prompts_df.columns) > 0:
48 | prompt_column = prompts_df.columns[0]
49 | else:
50 | logger.warning("No columns found in prompts DataFrame for categorization")
51 | return {}
52 |
53 | # Determine which column has categories
54 | category_column = None
55 | if 'Category' in prompts_df.columns:
56 | category_column = 'Category'
57 | elif 'category' in prompts_df.columns:
58 | category_column = 'category'
59 |
60 | if category_column:
61 | # Use explicit categories from the prompts file
62 | for category in prompts_df[category_column].unique():
63 | if pd.notna(category) and category:
64 | category_prompts = prompts_df[prompts_df[category_column] == category][prompt_column].tolist()
65 | categories[category.lower() if isinstance(category, str) else str(category)] = category_prompts
66 | elif config.prompt_categories:
67 | # Use categories from the configuration
68 | return config.prompt_categories
69 | else:
70 | # Try to infer categories using keywords (basic implementation)
71 | # In a real implementation, you might use NLP techniques or clustering
72 | keywords = {
73 | 'reasoning': ['reason', 'logic', 'why', 'how', 'explain', 'analyze'],
74 | 'creativity': ['creative', 'imagine', 'story', 'design', 'invent'],
75 | 'knowledge': ['fact', 'define', 'what is', 'history', 'science'],
76 | 'coding': ['code', 'function', 'algorithm', 'program', 'script'],
77 | }
78 |
79 | # Initialize categories
80 | for category in keywords:
81 | categories[category] = []
82 |
83 | # Categorize prompts based on keywords
84 | for prompt in prompts_df[prompt_column].tolist():
85 | categorized = False
86 | if not isinstance(prompt, str):
87 | prompt = str(prompt)
88 |
89 | prompt_lower = prompt.lower()
90 |
91 | for category, terms in keywords.items():
92 | if any(term in prompt_lower for term in terms):
93 | categories[category].append(prompt)
94 | categorized = True
95 | break
96 |
97 | if not categorized:
98 | if 'uncategorized' not in categories:
99 | categories['uncategorized'] = []
100 | categories['uncategorized'].append(prompt)
101 |
102 | # Only keep categories with prompts
103 | return {k: v for k, v in categories.items() if v}
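# A minimal sketch of the shape categorize_prompts expects and returns when an explicit
# 'Category' column is present (the prompt texts below are made up for illustration):
#
#     df = pd.DataFrame({
#         "Questions": ["Explain why the sky is blue", "Write a short story about a robot"],
#         "Category": ["Reasoning", "Creativity"],
#     })
#     categorize_prompts(df, config)
#     # -> {"reasoning": ["Explain why the sky is blue"],
#     #     "creativity": ["Write a short story about a robot"]}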
104 |
105 | def start_dashboard(config: EvalConfig, rankings_path: Path):
106 | """
107 | Start a Dash web dashboard for interactive visualization.
108 | """
109 | if not HAS_DASH or not config.web_dashboard.enabled:
110 | return
111 |
112 | try:
113 | # Load rankings data
114 | with open(rankings_path, 'r') as f:
115 | data = json.load(f)
116 |
117 | # Create Dash app
118 | app = dash.Dash(__name__)
119 |
120 | # Define layout
121 | app.layout = html.Div([
122 | html.H1("SlopRank Dashboard"),
123 |
124 | html.Div([
125 | html.H2("Model Rankings"),
126 | dcc.Graph(
127 | id='ranking-graph',
128 | figure={
129 | 'data': [
130 | {'x': [item['model'] for item in data['rankings']],
131 | 'y': [item['score'] for item in data['rankings']],
132 | 'type': 'bar', 'name': 'PageRank Score'}
133 | ],
134 | 'layout': {
135 | 'title': 'Model PageRank Scores',
136 | 'xaxis': {'title': 'Model'},
137 | 'yaxis': {'title': 'PageRank Score'}
138 | }
139 | }
140 | )
141 | ]),
142 |
143 | # Add category rankings if available
144 | html.Div([
145 | html.H2("Rankings by Category"),
146 | html.Div([
147 | html.Label("Select Category:"),
148 | dcc.Dropdown(
149 | id='category-dropdown',
150 | options=[{'label': cat, 'value': cat}
151 | for cat in data.get('category_rankings', {}).keys()],
152 | value=next(iter(data.get('category_rankings', {}).keys()), None)
153 | )
154 | ]) if data.get('category_rankings') else html.Div("No category data available."),
155 | dcc.Graph(id='category-graph')
156 | ]) if data.get('category_rankings') else html.Div(),
157 |
158 | # Add confidence intervals if available
159 | html.Div([
160 | html.H2("Confidence Intervals"),
161 | dcc.Graph(
162 | id='confidence-graph',
163 | figure={
164 | 'data': [
165 | {
166 | 'x': [model for model in data['confidence_intervals'].keys()],
167 | 'y': [stats['mean'] for stats in data['confidence_intervals'].values()],
168 | 'error_y': {
169 | 'type': 'data',
170 | 'symmetric': False,
171 | 'array': [
172 | stats['upper_bound'] - stats['mean']
173 | for stats in data['confidence_intervals'].values()
174 | ],
175 | 'arrayminus': [
176 | stats['mean'] - stats['lower_bound']
177 | for stats in data['confidence_intervals'].values()
178 | ]
179 | },
180 | 'type': 'scatter',
181 | 'mode': 'markers',
182 | 'marker': {'size': 10}
183 | }
184 | ],
185 | 'layout': {
186 | 'title': '95% Confidence Intervals',
187 | 'xaxis': {'title': 'Model'},
188 | 'yaxis': {'title': 'PageRank Score'}
189 | }
190 | }
191 | )
192 | ]) if data.get('confidence_intervals') else html.Div(),
193 |
194 | # Add link to static visualizations
195 | html.Div([
196 | html.H2("Visualizations"),
197 | html.P([
198 | "View the static graph visualization ",
199 | html.A("here", href=f"/{config.output_dir}/visualizations/endorsement_graph.png", target="_blank"),
200 | " or the interactive version ",
201 | html.A("here", href=f"/{config.output_dir}/visualizations/endorsement_graph.html", target="_blank"),
202 | "."
203 | ])
204 | ])
205 | ])
206 |
207 | # Define callbacks
208 | @app.callback(
209 | Output('category-graph', 'figure'),
210 | [Input('category-dropdown', 'value')]
211 | )
212 | def update_category_graph(selected_category):
213 | if not selected_category or not data.get('category_rankings'):
214 | return {}
215 |
216 | cat_data = data['category_rankings'].get(selected_category, [])
217 | return {
218 | 'data': [
219 | {'x': [item['model'] for item in cat_data],
220 | 'y': [item['score'] for item in cat_data],
221 | 'type': 'bar', 'name': 'PageRank Score'}
222 | ],
223 | 'layout': {
224 | 'title': f'Model Rankings for Category: {selected_category}',
225 | 'xaxis': {'title': 'Model'},
226 | 'yaxis': {'title': 'PageRank Score'}
227 | }
228 | }
229 |
230 | # Run the server in a separate thread
231 | def run_server():
232 | app.run_server(
233 | host=config.web_dashboard.host,
234 | port=config.web_dashboard.port,
235 | debug=config.web_dashboard.debug
236 | )
237 |
238 | dashboard_thread = threading.Thread(target=run_server)
239 | dashboard_thread.daemon = True
240 | dashboard_thread.start()
241 |
242 | # Print info message
243 | if config.web_dashboard.auto_open_browser:
244 | import webbrowser
245 | url = f"http://{config.web_dashboard.host}:{config.web_dashboard.port}"
246 | webbrowser.open(url)
247 |
248 | logger.info(f"Dashboard running at http://{config.web_dashboard.host}:{config.web_dashboard.port}")
249 | logger.info("Press Ctrl+C to exit")
250 |
251 | except Exception as e:
252 | logger.error(f"Error starting dashboard: {e}")
253 |
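# start_dashboard only relies on the rankings.json keys accessed above; the file is assumed
# to look roughly like this (values illustrative, the optional keys may be absent):
#
#     {
#         "rankings": [{"model": "gpt-4o", "score": 0.27}, ...],
#         "category_rankings": {"reasoning": [{"model": "gpt-4o", "score": 0.31}, ...]},
#         "confidence_intervals": {"gpt-4o": {"mean": 0.27, "lower_bound": 0.25, "upper_bound": 0.29}}
#     }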
254 |
255 | @click.group()
256 | def cli():
257 | """SlopRank - Peer-based LLM cross-evaluation system."""
258 | pass
259 |
260 |
261 | @cli.command()
262 | @click.option("--prompts", default="prompts.csv", help="Path to prompts Excel file")
263 | @click.option("--output-dir", default="results", help="Output directory for results")
264 | @click.option("--models", help="Comma-separated list of models to evaluate")
265 | @click.option(
266 | "--responses",
267 | help="Path to CSV of responses generated by a separate agent runner",
268 | default="",
269 | )
270 | @click.option(
271 | "--visualize/--no-visualize",
272 | default=True,
273 | help="Enable/disable graph visualization"
274 | )
275 | @click.option(
276 | "--interactive/--no-interactive",
277 | default=True,
278 | help="Enable/disable interactive visualization"
279 | )
280 | @click.option(
281 | "--confidence/--no-confidence",
282 | default=True,
283 | help="Enable/disable confidence interval calculation"
284 | )
285 | @click.option(
286 | "--dashboard/--no-dashboard",
287 | default=False,
288 | help="Enable/disable web dashboard"
289 | )
290 | @click.option(
291 | "--dashboard-port",
292 | default=8050,
293 | help="Port for web dashboard"
294 | )
295 | def run(prompts, output_dir, models, responses, visualize, interactive, confidence, dashboard, dashboard_port):
296 | """
297 | Run the full SlopRank evaluation workflow.
298 | """
299 | logging.basicConfig(
300 | level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
301 | )
302 |
303 | # Parse model list from command line
304 | model_list = models.split(",") if models else None
305 |
306 | # Create visualization config
307 | vis_config = VisualizationConfig(
308 | enabled=visualize,
309 | interactive=interactive
310 | )
311 |
312 | # Create confidence config
313 | conf_config = ConfidenceConfig(
314 | enabled=confidence
315 | )
316 |
317 | # Create web dashboard config
318 | dash_config = WebDashboardConfig(
319 | enabled=dashboard,
320 | port=dashboard_port
321 | )
322 |
323 | # Create main config
324 | config = EvalConfig(
325 | model_names=model_list or DEFAULT_CONFIG.model_names,
326 | evaluation_method=1, # numeric rating
327 | use_subset_evaluation=True,
328 | evaluators_subset_size=3,
329 | output_dir=Path(output_dir),
330 | visualization=vis_config,
331 | confidence=conf_config,
332 | web_dashboard=dash_config
333 | )
334 | logger.info(f"Using config: {config}")
335 |
336 | # 1a) If we generated the responses in another tool and are piping them
337 | # to SlopRank UNIX-style, we don't need to load/run the prompts
338 | if responses:
339 | responses_df = pd.read_csv(responses)
340 | prompts_df = pd.DataFrame({'Questions': responses_df['prompt'].unique()})
341 | else:
342 | # 1) Read prompts
343 | prompts_df = pd.read_csv(prompts)
344 |
345 | # Handle different column naming conventions
346 | prompt_column = None
347 | if "Questions" in prompts_df.columns:
348 | prompt_column = "Questions"
349 | elif "prompt" in prompts_df.columns:
350 | prompt_column = "prompt"
351 | elif len(prompts_df.columns) > 0:
352 | # If no recognized column name but there is at least one column,
353 | # assume the first column contains the prompts
354 | prompt_column = prompts_df.columns[0]
355 | logger.warning(f"No 'Questions' or 'prompt' column found, using first column: {prompt_column}")
356 | else:
357 | raise ValueError("CSV file has no columns")
358 |
359 | logger.info(f"Using column '{prompt_column}' for prompts")
360 |
361 | # Similarly handle different names for answer key column
362 | answer_key_column = None
363 | answer_keys = []
364 |
365 | if "Answer_key" in prompts_df.columns:
366 | answer_key_column = "Answer_key"
367 | elif "answer_key" in prompts_df.columns:
368 | answer_key_column = "answer_key"
369 |
370 | if answer_key_column:
371 | answer_keys = prompts_df[answer_key_column].tolist()
372 | logger.info(f"Using column '{answer_key_column}' for answer keys")
373 | else:
374 | answer_keys = [None] * len(prompts_df)
375 | logger.info("No answer key column found")
376 |
377 | prompt_pairs = list(zip(prompts_df[prompt_column].tolist(), answer_keys))
378 |
379 | # 2) Collect responses
380 | responses_df = collect_responses(prompt_pairs, config)
381 |
382 | # Process prompt categories
383 | config.prompt_categories = categorize_prompts(prompts_df, config)
384 | if config.prompt_categories:
385 | logger.info(f"Found {len(config.prompt_categories)} prompt categories: {', '.join(config.prompt_categories.keys())}")
386 |
387 | # 3) Collect raw evaluations
388 | raw_eval_df = collect_raw_evaluations(responses_df, config)
389 |
390 | # 4) Parse evaluation rows
391 | eval_path = config.output_dir / "evaluations.csv"
392 | if eval_path.exists():
393 | logger.info(f"Loading existing parsed evaluations from {eval_path}")
394 | evaluations_df = pd.read_csv(eval_path)
395 | else:
396 | evaluations_df = parse_evaluation_rows(raw_eval_df, config)
397 | evaluations_df.to_csv(eval_path, index=False)
398 | logger.info(f"Saved parsed evaluations to {eval_path}")
399 |
400 | # 5) Build endorsement graph
401 | G = build_endorsement_graph(evaluations_df, config)
402 |
403 | # 6) Compute overall PageRank
404 | pagerank_scores = compute_pagerank(G)
405 |
406 | # 7) Compute category-specific PageRank scores if categories exist
407 | category_rankings = None
408 | if config.prompt_categories:
409 | category_rankings = compute_categorical_pageranks(G, config.prompt_categories)
410 |
411 | # 8) Finalize rankings and generate visualizations
412 | finalize_rankings(
413 | pagerank_scores,
414 | config,
415 | G=G,
416 | evaluations_df=evaluations_df,
417 | category_rankings=category_rankings
418 | )
419 |
420 | # 9) Start web dashboard if enabled
421 | if config.web_dashboard.enabled and HAS_DASH:
422 | rankings_path = config.output_dir / "rankings.json"
423 | if rankings_path.exists():
424 | start_dashboard(config, rankings_path)
425 |
426 |
427 | @cli.command()
428 | @click.option("--output-dir", default="results", help="Output directory containing results")
429 | @click.option("--port", default=8050, help="Dashboard port")
430 | def dashboard(output_dir, port):
431 | """
432 | Start the web dashboard for existing results.
433 | """
434 | if not HAS_DASH:
435 | logger.error("Dash not found. Please install with 'pip install dash plotly'")
436 | return
437 |
438 | config = EvalConfig(
439 | model_names=DEFAULT_CONFIG.model_names,
440 | evaluation_method=1,
441 | use_subset_evaluation=True,
442 | evaluators_subset_size=3,
443 | output_dir=Path(output_dir),
444 | web_dashboard=WebDashboardConfig(
445 | enabled=True,
446 | port=port,
447 | auto_open_browser=True
448 | )
449 | )
450 |
451 | rankings_path = Path(output_dir) / "rankings.json"
452 | if not rankings_path.exists():
453 | logger.error(f"Rankings file not found: {rankings_path}")
454 | return
455 |
456 | logger.info(f"Starting dashboard for results in {output_dir}")
457 | start_dashboard(config, rankings_path)
458 |
459 | # Keep the main thread alive
460 |     import time
461 |     try:
462 |         while True:
463 |             time.sleep(1)
464 | except KeyboardInterrupt:
465 | logger.info("Dashboard stopped")
466 |
467 |
468 | def main():
469 | """Entry point for CLI."""
470 | # Register utility commands if available
471 | try:
472 | from .utils.commands import register_utils_commands
473 | register_utils_commands(cli)
474 | except ImportError:
475 | pass
476 |
477 | cli()
478 |
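# Typical invocations of the commands defined above (model names are illustrative; the
# "sloprank" console script is assumed to be installed, otherwise "python -m sloprank"
# is equivalent):
#
#     sloprank run --prompts prompts.csv --models "gpt-4o,claude-3.7-sonnet-latest"
#     sloprank run --responses responses_from_another_tool.csv --no-visualize
#     sloprank dashboard --output-dir results --port 8050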
--------------------------------------------------------------------------------
/sloprank/collect.py:
--------------------------------------------------------------------------------
1 | import time
2 | import random
3 | import json
4 | import pandas as pd
5 | from pathlib import Path
6 | from typing import List, Tuple
7 | from .config import logger, EvalConfig
8 |
9 | try:
10 | # Import parallm for efficient response collection
11 | from parallm import query_model_all, query_model
12 | HAS_PARALLM = True
13 | llm = None # We won't use llm when parallm is available
14 | except ImportError:
15 | # This should not happen with normal installation as parallm is now a core dependency
16 | logger.error("Could not import 'parallm' module. This is a required dependency for SlopRank.")
17 | logger.error("Please ensure parallm is installed with: pip install parallm")
18 | logger.warning("Falling back to llm or mock response generation (not recommended for production).")
19 | HAS_PARALLM = False
20 | try:
21 | # If you have a custom LLM module that provides get_model()
22 | import llm
23 | except ImportError:
24 | logger.warning("Could not import 'llm' module. Provide your own LLM interface or mock it.")
25 | llm = None
26 |
27 | def collect_responses(prompt_pairs: List[Tuple[str, str]], config: EvalConfig) -> pd.DataFrame:
28 | """
29 | Query each model with each prompt, skipping existing entries in responses.csv.
30 | """
31 | resp_path = config.output_dir / "responses.csv"
32 | if resp_path.exists():
33 | existing_df = pd.read_csv(resp_path)
34 | else:
35 | existing_df = pd.DataFrame(columns=["prompt","model"])
36 |
37 | # Extract prompts and answer keys
38 | prompts = [p[0] for p in prompt_pairs]
39 | answer_keys = [p[1] for p in prompt_pairs]
40 |
41 | # If we have parallm, use it for batch processing
42 | if HAS_PARALLM:
43 | logger.info(f"Using parallm to query {len(config.model_names)} models for {len(prompts)} prompts...")
44 |
45 | # Create a temporary CSV with the prompts using the "Questions" column
46 | prompts_df = pd.DataFrame({"Questions": prompts})
47 | temp_prompts_path = config.output_dir / "temp_prompts.csv"
48 | prompts_df.to_csv(temp_prompts_path, index=False)
49 |
50 | # Add "prompt" column for parallm compatibility
51 | prompts_df["prompt"] = prompts_df["Questions"]
52 | temp_prompts_modified_path = config.output_dir / "temp_prompts_modified.csv"
53 | prompts_df.to_csv(temp_prompts_modified_path, index=False)
54 |
55 | # Use parallm to query all models at once with the modified CSV
56 | responses_df = query_model_all(str(temp_prompts_modified_path), config.model_names)
57 |
58 | # Check if output.csv was created by parallm and use that instead if it exists
59 | output_path = Path("output.csv")
60 | if output_path.exists():
61 | logger.info(f"Using outputs from {output_path}")
62 | responses_df = pd.read_csv(output_path)
63 | # Clean up parallm's output file
64 | import os
65 | os.remove(output_path)
66 |
67 | # Add answer keys and additional metadata
68 | responses_df['Answer_key'] = responses_df['prompt'].map(dict(zip(prompts, answer_keys)))
69 | responses_df['is_valid'] = responses_df['response'].apply(lambda x: bool(x and len(str(x).strip()) >= 10))
70 | responses_df['token_count'] = responses_df['response'].apply(lambda x: len(str(x).split()) if x else 0)
71 | responses_df['response_time'] = 0.0 # Default value since parallm doesn't track this
72 | responses_df['error'] = None # Default value
73 |
74 | # Clean up temp files
75 | import os
76 | for path in [temp_prompts_path, temp_prompts_modified_path]:
77 | if os.path.exists(path):
78 | logger.info(f"Cleaning up temporary file: {path}")
79 | os.remove(path)
80 | else:
81 | # Fall back to original implementation
82 | new_rows = []
83 | for i, (prompt, answer_key) in enumerate(prompt_pairs, start=1):
84 | logger.info(f"Processing prompt {i}/{len(prompt_pairs)}: {prompt[:50]}...")
85 |
86 | for model_name in config.model_names:
87 | # Check if we already have a response
88 | subset = existing_df[
89 | (existing_df["prompt"] == prompt) &
90 | (existing_df["model"] == model_name)
91 | ]
92 | if not subset.empty:
93 | logger.info(f"Skipping existing response for model={model_name}, prompt={prompt[:40]}...")
94 | continue
95 |
96 | start_time = time.time()
97 | logger.info(f"Querying {model_name} for new response...")
98 | raw_response = None
99 | tokens_used = 0
100 | valid = False
101 | error_msg = None
102 |
103 | try:
104 | if llm is not None:
105 | model = llm.get_model(model_name)
106 | response_obj = model.prompt(prompt)
107 | raw_response = response_obj.text()
108 | else:
109 | # fallback mock
110 | raw_response = f"[MOCK] {model_name} responding to: {prompt[:40]}"
111 |
112 | valid = (raw_response and len(raw_response.strip()) >= 10)
113 | tokens_used = len(raw_response.split()) if valid else 0
114 |
115 | except Exception as e:
116 | error_msg = str(e)
117 | logger.error(f"Error from {model_name}: {error_msg}")
118 |
119 | elapsed = time.time() - start_time
120 |
121 | new_rows.append({
122 | 'prompt': prompt,
123 | 'model': model_name,
124 | 'response': raw_response if valid else None,
125 | 'is_valid': valid,
126 | 'response_time': elapsed,
127 | 'Answer_key': answer_key,
128 | 'token_count': tokens_used,
129 | 'error': error_msg
130 | })
131 |
132 | if config.request_delay > 0:
133 | time.sleep(config.request_delay)
134 |
135 | responses_df = pd.DataFrame(new_rows)
136 |
137 | # Combine with existing responses
138 | combined_df = pd.concat([existing_df, responses_df], ignore_index=True)
139 | combined_df.drop_duplicates(subset=["prompt","model"], keep="first", inplace=True)
140 | combined_df.to_csv(resp_path, index=False)
141 | logger.info(f"Responses saved to {resp_path}")
142 | return combined_df
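# collect_responses takes (prompt, answer_key) pairs, where answer_key may be None.
# A minimal sketch of a call and the resulting columns (prompt text is made up; `config`
# is an EvalConfig as constructed in cli.py):
#
#     pairs = [("What is 2 + 2?", "4"), ("Write a haiku about rain", None)]
#     df = collect_responses(pairs, config)
#     # df columns include: prompt, model, response, is_valid, response_time,
#     #                     Answer_key, token_count, error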
143 |
144 | def collect_raw_evaluations(responses_df: pd.DataFrame, config: EvalConfig) -> pd.DataFrame:
145 | """
146 | Each model in config.model_names evaluates the others' answers.
147 | Results are stored in raw_evaluations.csv as [prompt, judge_model, raw_judgment, model_mapping].
148 | """
149 | raw_eval_path = config.output_dir / "raw_evaluations.csv"
150 | if raw_eval_path.exists():
151 | existing_df = pd.read_csv(raw_eval_path)
152 | else:
153 | existing_df = pd.DataFrame(columns=["prompt","judge_model","model_mapping"])
154 |
155 | # Collect all evaluation prompts
156 | eval_tasks = []
157 | unique_prompts = responses_df['prompt'].unique()
158 |
159 | for prompt in unique_prompts:
160 | subset = responses_df[responses_df['prompt'] == prompt]
161 | answer_key = subset['Answer_key'].iloc[0] if 'Answer_key' in subset.columns else None
162 | model_response_map = subset.set_index('model')['response'].to_dict()
163 |
164 | for judge_model in config.model_names:
165 | # Exclude judge's own or missing responses
166 | other_models = [m for m in config.model_names
167 | if m != judge_model and model_response_map.get(m)]
168 | if not other_models:
169 | continue
170 | if config.use_subset_evaluation:
171 | sample_size = min(config.evaluators_subset_size, len(other_models))
172 | other_models = random.sample(other_models, sample_size)
173 |
174 | model_to_anon = {m: f"Model_{i+1}" for i,m in enumerate(other_models)}
175 | answers_section = "\n".join([
176 | f"{model_to_anon[m]}:\n{model_response_map[m]}\n---"
177 | for m in other_models
178 | ])
179 | answer_key_text = f"The Answer Key is:\n{answer_key}\n---\n" if answer_key else ""
180 |
181 | model_mapping_str = json.dumps(model_to_anon, sort_keys=True)
182 | found_match = existing_df[
183 | (existing_df["prompt"] == prompt) &
184 | (existing_df["judge_model"] == judge_model) &
185 | (existing_df["model_mapping"] == model_mapping_str)
186 | ]
187 | if not found_match.empty:
188 | logger.info(f"Skipping existing raw eval for judge={judge_model}, prompt={prompt[:40]}...")
189 | continue
190 |
191 | instructions = f"""
192 | You are an evaluator. Score each model's answer (1-10) in JSON format.
193 |
194 | Important! Your response MUST be a valid JSON object with the exact format:
195 | {{"Model_1": 7, "Model_2": 9}}
196 |
197 | Problem:
198 | {prompt}
199 |
200 | Answers:
201 | {answers_section}
202 |
203 | {answer_key_text}
204 |
205 | After reading each answer, assign a score from 1-10. Return your scores in JSON format ONLY without explanations.
206 | """
207 |
208 | eval_tasks.append({
209 | "prompt": prompt,
210 | "judge_model": judge_model,
211 | "evaluation_prompt": instructions,
212 | "model_mapping": model_mapping_str
213 | })
214 |
215 | # If no new evaluations needed, return existing ones
216 | if not eval_tasks:
217 | logger.info("No new evaluations needed, returning existing data")
218 | return existing_df
219 |
220 | new_judgments = []
221 |
222 | # Process all evaluation tasks individually - simpler and more reliable
223 | logger.info(f"Processing {len(eval_tasks)} evaluation tasks individually")
224 |
225 | # Group tasks by judge_model for better organization in logs
226 | judge_models = set(task["judge_model"] for task in eval_tasks)
227 | for judge_model in judge_models:
228 | model_tasks = [task for task in eval_tasks if task["judge_model"] == judge_model]
229 | logger.info(f"Processing {len(model_tasks)} evaluations for judge={judge_model}")
230 |
231 | for i, task in enumerate(model_tasks):
232 | logger.info(f"Evaluation {i+1}/{len(model_tasks)} for {judge_model}")
233 |
234 | raw_judgment = None
235 | try:
236 | # Use parallm's query_model if available (correct parameter order)
237 | if HAS_PARALLM:
238 | logger.info(f"Querying {judge_model} with evaluation prompt")
239 | raw_judgment = query_model(task["evaluation_prompt"], judge_model)
240 | elif llm is not None:
241 | logger.info(f"Querying {judge_model} via llm")
242 | judge_obj = llm.get_model(judge_model)
243 | judge_resp = judge_obj.prompt(task["evaluation_prompt"])
244 | raw_judgment = judge_resp.text()
245 | else:
246 | # fallback mock data
247 | raw_judgment = '{"Model_1": 8, "Model_2": 6}'
248 |
249 | # Log successful query
250 | logger.info(f"Received response from {judge_model}: {raw_judgment[:50]}...")
251 |
252 | except Exception as e:
253 | logger.error(f"Error querying {judge_model}: {str(e)}")
254 | # Use fallback values on error
255 | raw_judgment = '{"Model_1": 5, "Model_2": 5}'
256 |
257 | # Add to new judgments
258 | new_judgments.append({
259 | "prompt": task["prompt"],
260 | "judge_model": task["judge_model"],
261 | "raw_judgment": raw_judgment,
262 | "model_mapping": task["model_mapping"],
263 | "raw_judgment_token_count": len(raw_judgment.split()) if raw_judgment else 0
264 | })
265 |
266 | new_df = pd.DataFrame(new_judgments)
267 | # Only create combined_df if there are new judgments
268 | if not new_df.empty:
269 | combined_df = pd.concat([existing_df, new_df], ignore_index=True)
270 | combined_df.drop_duplicates(subset=["prompt","judge_model","model_mapping"], keep="first", inplace=True)
271 | combined_df.to_csv(raw_eval_path, index=False)
272 | logger.info(f"Raw evaluations saved to {raw_eval_path}")
273 | return combined_df
274 | else:
275 | logger.info("No new evaluations were created")
276 | return existing_df
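# Each judge sees the other models' answers under anonymized labels, so a row persisted to
# raw_evaluations.csv looks roughly like this (values illustrative):
#
#     prompt:        "What is 2 + 2?"
#     judge_model:   "gpt-4o"
#     model_mapping: '{"claude-3.7-sonnet-latest": "Model_1", "deepseek-chat": "Model_2"}'
#     raw_judgment:  '{"Model_1": 8, "Model_2": 6}'
#
# With use_subset_evaluation enabled, each judge is shown at most evaluators_subset_size
# randomly sampled answers rather than all of them.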
--------------------------------------------------------------------------------
/sloprank/config.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from dataclasses import dataclass, field
3 | from pathlib import Path
4 | from typing import List, Dict, Optional, Union, Any
5 |
6 | logging.basicConfig(
7 | level=logging.INFO,
8 | format="%(asctime)s - %(levelname)s - %(message)s"
9 | )
10 | logger = logging.getLogger("SlopRankLogger")
11 |
12 | @dataclass
13 | class VisualizationConfig:
14 | """Configuration for graph visualization options."""
15 | enabled: bool = True
16 | save_formats: List[str] = field(default_factory=lambda: ["png", "html", "gml"])
17 | node_size_factor: float = 2000
18 | edge_width_factor: float = 2.0
19 | layout: str = "spring" # Options: spring, circular, kamada_kawai, spectral
20 | node_colormap: str = "viridis"
21 | edge_colormap: str = "plasma"
22 | interactive: bool = True
23 |
24 | @dataclass
25 | class ConfidenceConfig:
26 | """Configuration for confidence interval calculations."""
27 | enabled: bool = True
28 | bootstrap_iterations: int = 1000
29 | confidence_level: float = 0.95 # e.g., 0.95 for 95% confidence interval
30 | significance_threshold: float = 0.05 # p-value threshold for significance
31 |
32 | @dataclass
33 | class WebDashboardConfig:
34 | """Configuration for the web dashboard."""
35 | enabled: bool = False # Default to disabled
36 | host: str = "127.0.0.1"
37 | port: int = 8050
38 | debug: bool = False
39 | auto_open_browser: bool = True
40 |
41 | @dataclass
42 | class EvalConfig:
43 | """Configuration for the SlopRank evaluation system."""
44 | # Core configuration
45 | model_names: List[str]
46 | evaluation_method: int # 1 => numeric rating, 2 => up/down (example usage)
47 | use_subset_evaluation: bool
48 | evaluators_subset_size: int
49 | output_dir: Path
50 | request_delay: float = 0.0
51 |
52 | # New features
53 | prompt_categories: Dict[str, List[str]] = field(default_factory=dict)
54 | visualization: VisualizationConfig = field(default_factory=VisualizationConfig)
55 | confidence: ConfidenceConfig = field(default_factory=ConfidenceConfig)
56 | web_dashboard: WebDashboardConfig = field(default_factory=WebDashboardConfig)
57 |
58 | # Optional metadata fields
59 | metadata: Dict[str, Any] = field(default_factory=dict)
60 |
61 | def __post_init__(self):
62 | self.output_dir.mkdir(parents=True, exist_ok=True)
63 |
64 | # Strip any whitespace from model names
65 | self.model_names = [model.strip() for model in self.model_names]
66 |
67 | if self.evaluation_method not in {1, 2}:
68 | raise ValueError("evaluation_method must be 1 or 2")
69 | if self.use_subset_evaluation and self.evaluators_subset_size >= len(self.model_names):
70 | # Automatically adjust the subset size if needed
71 | self.evaluators_subset_size = len(self.model_names) - 1 if len(self.model_names) > 1 else 1
72 | logger.warning(f"Adjusted evaluators_subset_size to {self.evaluators_subset_size}")
73 |
74 | # Create visualization directory if needed
75 | if self.visualization.enabled:
76 | vis_dir = self.output_dir / "visualizations"
77 | vis_dir.mkdir(parents=True, exist_ok=True)
78 |
79 | DEFAULT_CONFIG = EvalConfig(
80 | model_names=[
81 | "gemini-2.5-pro-exp-03-25",
82 | "claude-3.7-sonnet-latest",
83 | "gpt-4o",
84 | "deepseek-chat"
85 | ],
86 | # model_names=[
87 | # "gemini-2.5-pro-exp-03-25",
88 | # "claude-3.7-sonnet-latest",
89 | # "o1",
90 | # "deepseek-reasoner"
91 | # ],
92 | evaluation_method=1, # numeric
93 | use_subset_evaluation=True,
94 | evaluators_subset_size=3,
95 | output_dir=Path("results"),
96 | request_delay=0.0,
97 | # Default prompt categories (empty)
98 | prompt_categories={},
99 | # Default visualization configuration
100 | visualization=VisualizationConfig(
101 | enabled=True,
102 | save_formats=["png", "html", "gml"],
103 | node_size_factor=2000,
104 | edge_width_factor=2.0,
105 | layout="spring",
106 | node_colormap="viridis",
107 | edge_colormap="plasma",
108 | interactive=True
109 | ),
110 | # Default confidence configuration
111 | confidence=ConfidenceConfig(
112 | enabled=True,
113 | bootstrap_iterations=1000,
114 | confidence_level=0.95,
115 | significance_threshold=0.05
116 | ),
117 | # Default web dashboard configuration (disabled by default)
118 | web_dashboard=WebDashboardConfig(
119 | enabled=False,
120 | host="127.0.0.1",
121 | port=8050,
122 | debug=False,
123 | auto_open_browser=True
124 | )
125 | )
126 |
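# A minimal sketch of building a custom configuration (model identifiers are illustrative
# and must match whatever names the installed LLM backend accepts):
#
#     config = EvalConfig(
#         model_names=["gpt-4o", "claude-3.7-sonnet-latest"],
#         evaluation_method=1,           # numeric 1-10 ratings
#         use_subset_evaluation=False,   # every judge scores every other model
#         evaluators_subset_size=1,
#         output_dir=Path("results"),
#     )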
--------------------------------------------------------------------------------
/sloprank/parse.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pandas as pd
3 | from .config import logger, EvalConfig
4 |
5 | def parse_evaluation_rows(raw_eval_df: pd.DataFrame, config: EvalConfig) -> pd.DataFrame:
6 | """
7 |     Convert each judge's raw JSON judgment into numeric per-model scores.
8 |     Returns: columns = [prompt, judge_model, rated_model, score, parse_failed, raw_judgment_token_count].
9 | """
10 | all_rows = []
11 | for _, row in raw_eval_df.iterrows():
12 | prompt = row["prompt"]
13 | judge_model = row["judge_model"]
14 | raw_judgment = row["raw_judgment"] or ""
15 | raw_judgment_tokens = row.get("raw_judgment_token_count", 0)
16 |
17 | # load model_mapping
18 | try:
19 | model_mapping = json.loads(row["model_mapping"])
20 | except Exception as e:
21 | logger.error(f"Couldn't parse model_mapping: {e}")
22 | model_mapping = {}
23 |
24 | if not raw_judgment.strip():
25 | # fallback
26 | for real_model in model_mapping.keys():
27 | all_rows.append({
28 | "prompt": prompt,
29 | "judge_model": judge_model,
30 | "rated_model": real_model,
31 | "score": 4.1,
32 | "parse_failed": True,
33 | "raw_judgment_token_count": raw_judgment_tokens
34 | })
35 | continue
36 |
37 | # Attempt to isolate the JSON object
38 | # First try to find JSON with standard formatting
39 | start = raw_judgment.find("{")
40 | end = raw_judgment.rfind("}") + 1
41 |
42 | # If that fails, try more aggressive parsing for models that output in various formats
43 | if start == -1 or end == 0:
44 | # Look for patterns like "Model_1": 8 or "Model_1" : 8 or Model_1: 8
45 | import re
46 | json_pattern = r'[\{\s]*[\"\']?Model_\d+[\"\']?\s*:\s*\d+(?:\.\d+)?'
47 | if re.search(json_pattern, raw_judgment):
48 | # Try to reconstruct a proper JSON
49 | scores = {}
50 | model_score_pattern = r'[\"\']?Model_(\d+)[\"\']?\s*:\s*(\d+(?:\.\d+)?)'
51 | matches = re.findall(model_score_pattern, raw_judgment)
52 | for model_num, score in matches:
53 | scores[f"Model_{model_num}"] = float(score)
54 |
55 | if scores:
56 | logger.warning(f"Reconstructed JSON for judge={judge_model}, prompt={prompt[:40]}")
57 | try:
58 | # Convert to standard dict for consistency in later processing
59 | anon_to_real = {v: k for k,v in model_mapping.items()}
60 | for anon_id, score_val in scores.items():
61 | real_model = anon_to_real.get(anon_id)
62 | if real_model:
63 | score_float = float(score_val)
64 | # clamp 1..10
65 | score_float = max(1.0, min(10.0, score_float))
66 | all_rows.append({
67 | "prompt": prompt,
68 | "judge_model": judge_model,
69 | "rated_model": real_model,
70 | "score": score_float,
71 | "parse_failed": False,
72 | "raw_judgment_token_count": raw_judgment_tokens
73 | })
74 | continue
75 | except Exception as e:
76 | logger.error(f"Error processing reconstructed JSON: {e}")
77 |
78 | logger.error(f"No JSON found for judge={judge_model}, prompt={prompt[:40]}")
79 | # fallback
80 | for real_model in model_mapping.keys():
81 | all_rows.append({
82 | "prompt": prompt,
83 | "judge_model": judge_model,
84 | "rated_model": real_model,
85 | "score": 4.1,
86 | "parse_failed": True,
87 | "raw_judgment_token_count": raw_judgment_tokens
88 | })
89 | continue
90 |
91 | try:
92 | data = json.loads(raw_judgment[start:end])
93 | # Reverse map: "Model_1" => real model name
94 | anon_to_real = {v: k for k,v in model_mapping.items()}
95 |
96 | for anon_id, score_val in data.items():
97 | real_model = anon_to_real.get(anon_id)
98 | if real_model:
99 | score_float = float(score_val)
100 | # clamp 1..10
101 | score_float = max(1.0, min(10.0, score_float))
102 | all_rows.append({
103 | "prompt": prompt,
104 | "judge_model": judge_model,
105 | "rated_model": real_model,
106 | "score": score_float,
107 | "parse_failed": False,
108 | "raw_judgment_token_count": raw_judgment_tokens
109 | })
110 | except Exception as e:
111 | logger.error(f"Parsing error: judge={judge_model}, prompt={prompt[:40]} => {str(e)}")
112 | for real_model in model_mapping.keys():
113 | all_rows.append({
114 | "prompt": prompt,
115 | "judge_model": judge_model,
116 | "rated_model": real_model,
117 | "score": 4.1,
118 | "parse_failed": True,
119 | "raw_judgment_token_count": raw_judgment_tokens
120 | })
121 |
122 | return pd.DataFrame(all_rows)
123 |
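# Example of the inversion performed above (values illustrative): raw_judgment
# '{"Model_1": 8, "Model_2": 6}' together with model_mapping
# '{"claude-3.7-sonnet-latest": "Model_1", "deepseek-chat": "Model_2"}' yields two rows,
# (rated_model="claude-3.7-sonnet-latest", score=8.0) and (rated_model="deepseek-chat",
# score=6.0), both with parse_failed=False. Scores are clamped to 1-10, and judgments that
# cannot be parsed even by the regex fallback become rows with parse_failed=True and a
# placeholder score so they can be filtered out downstream.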
--------------------------------------------------------------------------------
/sloprank/utils/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | SlopRank utilities for visualization, confidence calculation, and dashboard generation.
3 | """
--------------------------------------------------------------------------------
/sloprank/utils/categorization.py:
--------------------------------------------------------------------------------
1 | """
2 | Prompt categorization and category-based analysis.
3 | """
4 | import json
5 | import pandas as pd
6 | import re
7 | from pathlib import Path
8 | from collections import defaultdict
9 |
10 | from ..config import logger
11 |
12 | def categorize_prompts(prompts_file=None, save_categorized=True):
13 | """
14 |     Read prompts from a CSV file and automatically categorize them.
15 | If a 'Category' column exists, it will use those categories.
16 | Otherwise, it will attempt to infer categories based on content.
17 |
18 | Parameters:
19 | -----------
20 | prompts_file : Path or str
21 |         Path to the prompts CSV file
22 | save_categorized : bool
23 |         Whether to save the categorized prompts back to a CSV file
24 |
25 | Returns:
26 | --------
27 | dict
28 | Dictionary mapping category names to lists of prompts
29 | """
30 | if prompts_file is None:
31 | prompts_file = Path("prompts.csv")
32 | else:
33 | prompts_file = Path(prompts_file)
34 |
35 | logger.info(f"Reading prompts from {prompts_file}...")
36 |
37 |     # Read prompts from the CSV file
38 | prompts_df = pd.read_csv(prompts_file)
39 |
40 | # Check if a Category column exists
41 | if 'Category' in prompts_df.columns:
42 | categories = defaultdict(list)
43 |
44 | # Group prompts by category
45 | for _, row in prompts_df.iterrows():
46 | if pd.notna(row['Category']) and row['Category']:
47 | categories[row['Category']].append(row['Questions'])
48 | else:
49 | if 'Uncategorized' not in categories:
50 | categories['Uncategorized'] = []
51 | categories['Uncategorized'].append(row['Questions'])
52 |
53 | logger.info(f"Found {len(categories)} categories in the Excel file.")
54 | else:
55 | # Infer categories based on content
56 | categories = infer_categories(prompts_df['Questions'].tolist())
57 |
58 | if save_categorized:
59 | # Add inferred categories back to the DataFrame
60 | category_map = {}
61 | for category, prompts in categories.items():
62 | for prompt in prompts:
63 | category_map[prompt] = category
64 |
65 | prompts_df['Category'] = prompts_df['Questions'].map(category_map)
66 |
67 |             # Save the categorized DataFrame back to CSV
68 | output_path = prompts_file.with_stem(prompts_file.stem + "_categorized")
69 | prompts_df.to_csv(output_path, index=False)
70 | logger.info(f"Saved categorized prompts to {output_path}")
71 |
72 | # Return categories as a dictionary with lists of prompts
73 | return dict(categories)
74 |
75 |
76 | def infer_categories(prompts):
77 | """
78 | Infer categories from prompt content using keyword matching.
79 |
80 | Parameters:
81 | -----------
82 | prompts : list
83 | List of prompts to categorize
84 |
85 | Returns:
86 | --------
87 | dict
88 | Dictionary mapping category names to lists of prompts
89 | """
90 | logger.info("Inferring categories from prompt content...")
91 |
92 | # Define category keywords
93 | keywords = {
94 | 'Reasoning': ['reason', 'logic', 'why', 'how', 'explain', 'analyze', 'evaluate', 'assess', 'examine'],
95 | 'Creativity': ['creative', 'imagine', 'story', 'design', 'invent', 'fiction', 'innovative'],
96 | 'Knowledge': ['fact', 'define', 'what is', 'history', 'science', 'describe', 'information'],
97 | 'Coding': ['code', 'function', 'algorithm', 'program', 'script', 'implementation'],
98 | 'Opinion': ['opinion', 'believe', 'think', 'perspective', 'view', 'stance'],
99 | 'Technical': ['technical', 'engineering', 'system', 'mechanism', 'process'],
100 | 'Economic': ['economic', 'finance', 'market', 'money', 'business', 'trade', 'commerce', 'tax'],
101 | 'Medical': ['medical', 'health', 'disease', 'treatment', 'cure', 'patient', 'doctor', 'hospital'],
102 | 'Political': ['political', 'government', 'policy', 'regulation', 'law', 'legal'],
103 | 'Ethical': ['ethical', 'moral', 'right', 'wrong', 'should', 'ethics', 'values'],
104 | }
105 |
106 | # Categorize prompts
107 | categories = defaultdict(list)
108 |
109 | for prompt in prompts:
110 | prompt_lower = prompt.lower()
111 |
112 | # Try to match prompt to a category
113 | matched = False
114 | for category, terms in keywords.items():
115 | if any(term in prompt_lower for term in terms):
116 | categories[category].append(prompt)
117 | matched = True
118 | break
119 |
120 | # If no match, add to Uncategorized
121 | if not matched:
122 | categories['Uncategorized'].append(prompt)
123 |
124 | # Count prompts per category
125 | for category, prompts in categories.items():
126 | logger.info(f"Category '{category}': {len(prompts)} prompts")
127 |
128 | return categories
129 |
130 |
131 | def analyze_categorized_evaluations(
132 | categorized_prompts,
133 | evaluations_path=None,
134 | output_dir=None
135 | ):
136 | """
137 | Analyze evaluations based on prompt categories.
138 |
139 | Parameters:
140 | -----------
141 | categorized_prompts : dict
142 | Dictionary mapping category names to lists of prompts
143 | evaluations_path : Path or str
144 | Path to the evaluations CSV file
145 | output_dir : Path or str
146 | Directory to save the output files
147 |
148 | Returns:
149 | --------
150 | pd.DataFrame
151 | DataFrame with category analysis results
152 | """
153 | if evaluations_path is None:
154 | evaluations_path = Path("results/evaluations.csv")
155 | else:
156 | evaluations_path = Path(evaluations_path)
157 |
158 | if output_dir is None:
159 | output_dir = Path("results")
160 | else:
161 | output_dir = Path(output_dir)
162 |
163 | # Create output directory if it doesn't exist
164 | output_dir.mkdir(parents=True, exist_ok=True)
165 |
166 | # Load evaluations
167 | logger.info(f"Loading evaluations from {evaluations_path}...")
168 | evals_df = pd.read_csv(evaluations_path)
169 |
170 | # Filter out failed evaluations
171 | evals_df = evals_df[evals_df["parse_failed"] == False]
172 |
173 | # Create a flat mapping of prompt -> category
174 | prompt_to_category = {}
175 | for category, prompts in categorized_prompts.items():
176 | for prompt in prompts:
177 | prompt_to_category[prompt] = category
178 |
179 | # Add category column to evaluations DataFrame
180 | evals_df['category'] = evals_df['prompt'].map(prompt_to_category)
181 |
182 | # Calculate average scores by category and model
183 | results = []
184 |
185 | # For each category
186 | for category in categorized_prompts.keys():
187 | if category == 'Uncategorized':
188 | continue
189 |
190 | category_evals = evals_df[evals_df['category'] == category]
191 |
192 | if category_evals.empty:
193 | continue
194 |
195 | # For each model being rated
196 | for model in category_evals['rated_model'].unique():
197 | model_scores = category_evals[category_evals['rated_model'] == model]['score']
198 | avg_score = model_scores.mean()
199 | count = len(model_scores)
200 |
201 | results.append({
202 | 'category': category,
203 | 'model': model,
204 | 'average_score': avg_score,
205 | 'evaluations_count': count
206 | })
207 |
208 | # Create DataFrame from results
209 | results_df = pd.DataFrame(results)
210 |
211 | # Save to CSV
212 | output_path = output_dir / "category_analysis.csv"
213 | results_df.to_csv(output_path, index=False)
214 |
215 | # Generate summary
216 | logger.info("\n=== Category Analysis ===")
217 | for category in sorted(categorized_prompts.keys()):
218 | if category == 'Uncategorized':
219 | continue
220 |
221 | category_data = results_df[results_df['category'] == category]
222 |
223 | if category_data.empty:
224 | continue
225 |
226 | logger.info(f"\nCategory: {category}")
227 | sorted_models = category_data.sort_values('average_score', ascending=False)
228 |
229 | for _, row in sorted_models.iterrows():
230 | logger.info(f" {row['model']}: {row['average_score']:.4f} (based on {row['evaluations_count']} evaluations)")
231 |
232 | logger.info(f"\nCategory analysis saved to {output_path}")
233 |
234 | # Create JSON with category rankings
235 | category_rankings = {}
236 |
237 | for category in sorted(categorized_prompts.keys()):
238 | if category == 'Uncategorized':
239 | continue
240 |
241 | category_data = results_df[results_df['category'] == category]
242 |
243 | if category_data.empty:
244 | continue
245 |
246 | sorted_models = category_data.sort_values('average_score', ascending=False)
247 | category_rankings[category] = [
248 | {"model": row['model'], "score": float(row['average_score'])}
249 | for _, row in sorted_models.iterrows()
250 | ]
251 |
252 | # Save category rankings to JSON
253 | rankings_path = output_dir / "category_rankings.json"
254 | with open(rankings_path, 'w') as f:
255 | json.dump(category_rankings, f, indent=2)
256 |
257 | logger.info(f"Category rankings saved to {rankings_path}")
258 |
259 | return results_df
260 |
261 |
262 | if __name__ == "__main__":
263 | # Run as a standalone script
264 | categories = categorize_prompts()
265 | analyze_categorized_evaluations(categories)
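# Running this module directly (e.g. "python -m sloprank.utils.categorization", assuming the
# package is importable) reads prompts.csv, writes a *_categorized.csv copy next to it, and
# produces results/category_analysis.csv plus results/category_rankings.json with entries
# shaped like (values illustrative):
#
#     {"Reasoning": [{"model": "gpt-4o", "score": 7.83}, {"model": "deepseek-chat", "score": 7.12}]}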
--------------------------------------------------------------------------------
/sloprank/utils/commands.py:
--------------------------------------------------------------------------------
1 | """
2 | Command-line utilities for SlopRank.
3 | """
4 | import click
5 | import pandas as pd
6 | import json
7 | import threading
8 | import time
9 | from pathlib import Path
10 | import webbrowser
11 |
12 | from ..config import logger
13 | from ..config import VisualizationConfig
14 | from .visualization import generate_visualization
15 |
16 | # Import confidence and dashboard modules if available
17 | try:
18 | from .confidence import compute_confidence_intervals
19 | HAS_CONFIDENCE = True
20 | except ImportError:
21 | HAS_CONFIDENCE = False
22 |
23 | try:
24 | from .dashboard import generate_dashboard, start_dashboard
25 | HAS_DASHBOARD = True
26 | except ImportError:
27 | HAS_DASHBOARD = False
28 |
29 | # Import category analysis if available
30 | try:
31 | from .categorization import categorize_prompts, analyze_categorized_evaluations
32 | HAS_CATEGORIES = True
33 | except ImportError:
34 | HAS_CATEGORIES = False
35 |
36 |
37 | @click.group()
38 | def utils():
39 | """Utility commands for SlopRank."""
40 | pass
41 |
42 |
43 | @utils.command()
44 | @click.option("--rankings", default="results/rankings.json", help="Path to rankings JSON file")
45 | @click.option("--evaluations", default="results/evaluations.csv", help="Path to evaluations CSV file")
46 | @click.option("--output-dir", default="results/visualizations", help="Output directory for visualizations")
47 | @click.option("--layout", default="spring", help="Graph layout [spring, circular, kamada_kawai, spectral]")
48 | @click.option("--interactive/--no-interactive", default=True, help="Generate interactive HTML visualization")
49 | def visualize(rankings, evaluations, output_dir, layout, interactive):
50 | """Generate visualizations for the SlopRank endorsement graph."""
51 | vis_config = VisualizationConfig(
52 | enabled=True,
53 | interactive=interactive,
54 | layout=layout
55 | )
56 | try:
57 | generate_visualization(
58 | rankings_path=rankings,
59 | evaluations_path=evaluations,
60 | output_dir=output_dir,
61 | vis_config=vis_config
62 | )
63 | click.echo(f"Visualizations generated in {output_dir}")
64 | except Exception as e:
65 | click.echo(f"Error generating visualizations: {e}", err=True)
66 |
67 |
68 | @utils.command()
69 | @click.option("--evaluations", default="results/evaluations.csv", help="Path to evaluations CSV file")
70 | @click.option("--output", default="results/confidence_stats.json", help="Output file for confidence data")
71 | @click.option("--iterations", default=500, help="Number of bootstrap iterations")
72 | @click.option("--confidence-level", default=0.95, help="Confidence level (0.0-1.0)")
73 | def confidence(evaluations, output, iterations, confidence_level):
74 | """Compute confidence intervals for SlopRank rankings."""
75 | if not HAS_CONFIDENCE:
76 | click.echo("Confidence module not available. Install numpy to use this feature.", err=True)
77 | return
78 |
79 | try:
80 | from .confidence import compute_confidence_intervals
81 | stats = compute_confidence_intervals(
82 | evaluations_path=evaluations,
83 | output_path=output,
84 | iterations=iterations,
85 | confidence_level=confidence_level
86 | )
87 | click.echo(f"Confidence statistics saved to {output}")
88 | except Exception as e:
89 | click.echo(f"Error computing confidence intervals: {e}", err=True)
90 |
91 |
92 | @utils.command()
93 | @click.option("--prompts", default="prompts.csv", help="Path to prompts Excel file")
94 | @click.option("--evaluations", default="results/evaluations.csv", help="Path to evaluations CSV file")
95 | @click.option("--output-dir", default="results", help="Output directory for category analysis")
96 | def categorize(prompts, evaluations, output_dir):
97 | """Categorize prompts and analyze model performance by category."""
98 | if not HAS_CATEGORIES:
99 | click.echo("Categorization module not available.", err=True)
100 | return
101 |
102 | try:
103 | from .categorization import categorize_prompts, analyze_categorized_evaluations
104 |
105 | output_dir = Path(output_dir)
106 | output_dir.mkdir(exist_ok=True, parents=True)
107 |
108 | # Categorize prompts
109 | categories = categorize_prompts(prompts_file=prompts)
110 |
111 | # Analyze performance by category
112 | analyze_categorized_evaluations(
113 | categorized_prompts=categories,
114 | evaluations_path=evaluations,
115 | output_dir=output_dir
116 | )
117 |
118 | click.echo(f"Category analysis saved to {output_dir / 'category_rankings.json'}")
119 | except Exception as e:
120 | click.echo(f"Error categorizing prompts: {e}", err=True)
121 |
122 |
123 | @utils.command()
124 | @click.option("--rankings", default="results/rankings.json", help="Path to rankings JSON file")
125 | @click.option("--confidence", default="results/confidence_stats.json", help="Path to confidence stats JSON")
126 | @click.option("--categories", default="results/category_rankings.json", help="Path to category rankings JSON")
127 | @click.option("--graph", default="results/visualizations/endorsement_graph.png", help="Path to graph visualization")
128 | @click.option("--output", default="results/dashboard.html", help="Output path for dashboard HTML")
129 | def dashboard(rankings, confidence, categories, graph, output):
130 | """Generate HTML dashboard for SlopRank results."""
131 | if not HAS_DASHBOARD:
132 | click.echo("Dashboard module not available.", err=True)
133 | return
134 |
135 | try:
136 | from .dashboard import generate_dashboard
137 |
138 | dashboard_path = generate_dashboard(
139 | rankings_path=rankings,
140 | confidence_path=confidence if Path(confidence).exists() else None,
141 | categories_path=categories if Path(categories).exists() else None,
142 | graph_path=graph if Path(graph).exists() else None,
143 | output_path=output
144 | )
145 |
146 | click.echo(f"Dashboard generated at {dashboard_path}")
147 | except Exception as e:
148 | click.echo(f"Error generating dashboard: {e}", err=True)
149 |
150 |
151 | @utils.command()
152 | @click.option("--dashboard", default="results/dashboard.html", help="Path to dashboard HTML file")
153 | @click.option("--port", default=8000, help="Port for the web server")
154 | @click.option("--no-browser", is_flag=True, help="Don't open browser automatically")
155 | def serve(dashboard, port, no_browser):
156 | """Start a web server to view the SlopRank dashboard."""
157 | try:
158 | from http.server import HTTPServer, SimpleHTTPRequestHandler
159 |
160 | dashboard_path = Path(dashboard)
161 | if not dashboard_path.exists():
162 | click.echo(f"Dashboard file not found: {dashboard_path}", err=True)
163 | return
164 |
165 | # Start server
166 | server_address = ('', port)
167 | httpd = HTTPServer(server_address, SimpleHTTPRequestHandler)
168 |
169 | # Start server in a separate thread
170 | server_thread = threading.Thread(target=httpd.serve_forever)
171 | server_thread.daemon = True
172 | server_thread.start()
173 |
174 | url = f"http://localhost:{port}/{dashboard}"
175 | click.echo(f"Server started at {url}")
176 |
177 | # Open browser
178 | if not no_browser:
179 | webbrowser.open(url)
180 |
181 | # Keep the main thread alive
182 | try:
183 | while True:
184 | time.sleep(1)
185 | except KeyboardInterrupt:
186 | click.echo("Shutting down server...")
187 | httpd.shutdown()
188 |
189 | except Exception as e:
190 | click.echo(f"Error starting server: {e}", err=True)
191 |
192 |
193 | def register_utils_commands(cli):
194 | """Register utility commands with the main CLI."""
195 | cli.add_command(utils)
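# Once registered, these commands appear under a "utils" group on the main CLI; the defaults
# shown above are used when options are omitted, e.g.:
#
#     python -m sloprank utils visualize --layout circular
#     python -m sloprank utils confidence --iterations 1000
#     python -m sloprank utils categorize --prompts prompts.csv
#     python -m sloprank utils dashboard
#     python -m sloprank utils serve --port 8000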
--------------------------------------------------------------------------------
/sloprank/utils/confidence.py:
--------------------------------------------------------------------------------
1 | """
2 | Confidence interval calculation for SlopRank rankings.
3 | """
4 | import json
5 | import pandas as pd
6 | import numpy as np
7 | import networkx as nx
8 | from pathlib import Path
9 |
10 | from ..config import logger
11 |
12 | def compute_confidence_intervals(
13 | evaluations_path=None,
14 | output_path=None,
15 | iterations=500,
16 | confidence_level=0.95
17 | ):
18 | """
19 | Compute confidence intervals for model rankings using bootstrap resampling.
20 |
21 | Parameters:
22 | -----------
23 | evaluations_path : Path or str
24 | Path to the evaluations CSV file
25 | output_path : Path or str
26 | Path for the output JSON file
27 | iterations : int
28 | Number of bootstrap iterations
29 | confidence_level : float
30 | Confidence level (0.0-1.0)
31 |
32 | Returns:
33 | --------
34 | dict
35 | Confidence statistics
36 | """
37 | if evaluations_path is None:
38 | evaluations_path = Path("results/evaluations.csv")
39 | else:
40 | evaluations_path = Path(evaluations_path)
41 |
42 | if output_path is None:
43 | output_path = Path("results/confidence_stats.json")
44 | else:
45 | output_path = Path(output_path)
46 |
47 | # Create output directory if it doesn't exist
48 | output_path.parent.mkdir(parents=True, exist_ok=True)
49 |
50 | logger.info(f"Computing confidence intervals using {iterations} bootstrap iterations...")
51 |
52 | # Load evaluations
53 | evals_df = pd.read_csv(evaluations_path)
54 |
55 | # Filter out failed evaluations
56 | evals_df = evals_df[evals_df["parse_failed"] == False]
57 |
58 | # Get unique models
59 | models = list(set(evals_df["judge_model"].unique()) | set(evals_df["rated_model"].unique()))
60 |
61 | # Store bootstrap results
62 | bootstrap_results = {model: [] for model in models}
63 |
64 | # Run bootstrap iterations
65 | for i in range(iterations):
66 | if i % 100 == 0:
67 | logger.info(f"Bootstrap iteration {i}/{iterations}...")
68 |
69 | # Resample evaluations with replacement
70 | sampled_evals = evals_df.sample(frac=1.0, replace=True)
71 |
72 | # Build graph from resampled data
73 | G = nx.DiGraph()
74 | G.add_nodes_from(models)
75 |
76 | for _, row in sampled_evals.iterrows():
77 | judge = row["judge_model"]
78 | rated = row["rated_model"]
79 | score = float(row["score"])
80 |
81 | if G.has_edge(judge, rated):
82 | G[judge][rated]["weight"] += score
83 | else:
84 | G.add_edge(judge, rated, weight=score)
85 |
86 | # Compute PageRank
87 | if len(G.edges) > 0:
88 | scores = nx.pagerank(G, weight="weight")
89 |
90 | # Store scores
91 | for model, score in scores.items():
92 | bootstrap_results[model].append(score)
93 |
94 |     # Calculate confidence intervals at the requested confidence level
95 | confidence_stats = {}
96 | alpha = 1.0 - confidence_level
97 |
98 | for model in models:
99 | if not bootstrap_results[model]:
100 | confidence_stats[model] = {
101 | "mean": 0.0,
102 | "lower_bound": 0.0,
103 | "upper_bound": 0.0,
104 | "std_dev": 0.0
105 | }
106 | continue
107 |
108 | sorted_scores = sorted(bootstrap_results[model])
109 | lower_idx = int(alpha/2 * len(sorted_scores))
110 | upper_idx = int((1-alpha/2) * len(sorted_scores))
111 |
112 | confidence_stats[model] = {
113 | "mean": float(np.mean(sorted_scores)),
114 | "lower_bound": float(sorted_scores[max(0, lower_idx)]),
115 | "upper_bound": float(sorted_scores[min(len(sorted_scores)-1, upper_idx)]),
116 | "std_dev": float(np.std(sorted_scores))
117 | }
118 |
119 | # Test statistical significance
120 | significance_results = {}
121 |
122 | # Create sorted list of models by mean score
123 | models_by_score = sorted(
124 | [(model, stats["mean"]) for model, stats in confidence_stats.items()],
125 | key=lambda x: x[1],
126 | reverse=True
127 | )
128 |
129 | # Compare each adjacent pair in the ranking
130 | for i in range(len(models_by_score) - 1):
131 | model1, _ = models_by_score[i]
132 | model2, _ = models_by_score[i + 1]
133 |
134 | # Determine if significant based on confidence intervals
135 | is_significant = (
136 | confidence_stats[model1]["lower_bound"] > confidence_stats[model2]["upper_bound"] or
137 | confidence_stats[model2]["lower_bound"] > confidence_stats[model1]["upper_bound"]
138 | )
139 |
140 | significance_results[f"{model1}_vs_{model2}"] = is_significant
141 |
142 | # Save results
143 | results = {
144 | "confidence_intervals": confidence_stats,
145 | "significance": significance_results,
146 | "metadata": {
147 | "iterations": iterations,
148 | "confidence_level": confidence_level
149 | }
150 | }
151 |
152 | with open(output_path, "w") as f:
153 | json.dump(results, f, indent=2)
154 |
155 | # Print summary
156 | logger.info("\n=== Confidence Intervals ===")
157 | for model, stats in sorted(confidence_stats.items(), key=lambda x: x[1]["mean"], reverse=True):
158 | logger.info(f"{model}: {stats['mean']:.6f} [{stats['lower_bound']:.6f}, {stats['upper_bound']:.6f}]")
159 |
160 | logger.info("\n=== Statistical Significance ===")
161 | for pair, is_significant in significance_results.items():
162 | significance_str = "Significant" if is_significant else "Not significant"
163 | logger.info(f"{pair}: {significance_str}")
164 |
165 | logger.info(f"Confidence statistics saved to {output_path}")
166 |
167 | return confidence_stats
168 |
169 |
170 | if __name__ == "__main__":
171 | # Run as a standalone script
172 | compute_confidence_intervals()
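# The percentile bounds above follow the usual bootstrap recipe: with the defaults
# (confidence_level=0.95, iterations=500), alpha = 0.05, so for 500 sorted resampled scores
# lower_idx = int(0.025 * 500) = 12 and upper_idx = int(0.975 * 500) = 487. The saved
# confidence_stats.json nests the per-model statistics under "confidence_intervals", e.g.
# (values illustrative):
#
#     {"confidence_intervals": {"gpt-4o": {"mean": 0.27, "lower_bound": 0.25,
#                                          "upper_bound": 0.29, "std_dev": 0.01}},
#      "significance": {"gpt-4o_vs_claude-3.7-sonnet-latest": false},
#      "metadata": {"iterations": 500, "confidence_level": 0.95}}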
--------------------------------------------------------------------------------
/sloprank/utils/dashboard.py:
--------------------------------------------------------------------------------
1 | """
2 | Dashboard generation for SlopRank results.
3 | """
4 | import json
5 | import pandas as pd
6 | import webbrowser
7 | import threading
8 | import time
9 | from pathlib import Path
10 | from http.server import HTTPServer, SimpleHTTPRequestHandler
11 |
12 | from ..config import logger
13 |
14 | def generate_dashboard(
15 | rankings_path=None,
16 | confidence_path=None,
17 | categories_path=None,
18 | graph_path=None,
19 | output_path=None
20 | ):
21 | """
22 | Generate an HTML dashboard for SlopRank results.
23 |
24 | Parameters:
25 | -----------
26 | rankings_path : Path or str
27 | Path to the rankings JSON file
28 | confidence_path : Path or str
29 | Path to the confidence stats JSON file
30 | categories_path : Path or str
31 | Path to the category rankings JSON file
32 | graph_path : Path or str
33 | Path to the graph visualization image
34 | output_path : Path or str
35 | Path to save the dashboard HTML file
36 |
37 | Returns:
38 | --------
39 | Path
40 | Path to the generated dashboard HTML file
41 | """
42 | if rankings_path is None:
43 | rankings_path = Path("results/rankings.json")
44 | else:
45 | rankings_path = Path(rankings_path)
46 |
47 | if output_path is None:
48 | output_path = Path("results/dashboard.html")
49 | else:
50 | output_path = Path(output_path)
51 |
52 | # Create output directory if it doesn't exist
53 | output_path.parent.mkdir(parents=True, exist_ok=True)
54 |
55 | # Load rankings data
56 | with open(rankings_path, 'r') as f:
57 | rankings_data = json.load(f)
58 |
59 | # Load confidence data if available
60 | has_confidence = confidence_path is not None and Path(confidence_path).exists()
61 | confidence_data = None
62 | if has_confidence:
63 | with open(confidence_path, 'r') as f:
64 | confidence_data = json.load(f)
65 |
66 | # Load category rankings if available
67 | has_categories = categories_path is not None and Path(categories_path).exists()
68 | category_data = None
69 | if has_categories:
70 | with open(categories_path, 'r') as f:
71 | category_data = json.load(f)
72 |
73 | # Check if graph visualization is available
74 | has_graph = graph_path is not None and Path(graph_path).exists()
75 |
76 | # Generate HTML
77 |     html = f"""
78 | <!DOCTYPE html>
79 | <html>
80 | <head>
81 | 
82 | 
83 |     <title>SlopRank Dashboard</title>
84 |     <style>
153 | </head>
154 | <body>
155 |     <div>
156 |         <h1>SlopRank Dashboard</h1>
157 | 
158 |         <h2>Model Rankings</h2>
159 |             <table>
160 |                 <tr>
161 |                     <th>Rank</th>
162 |                     <th>Model</th>
163 |                     <th>Score</th>
164 |                     <th>Visualization</th>
165 |     """
166 |
167 | if has_confidence:
168 |         html += """
169 |                     <th>Confidence Interval</th>
170 |             """
171 |
172 |     html += """
173 |                 </tr>
174 |             """
175 |
176 | # Add rows for each model
177 | if isinstance(rankings_data['rankings'][0], list):
178 | # Old format with list of lists
179 | ranked_items = rankings_data["rankings"]
180 | max_score = max([score for _, score in ranked_items])
181 | else:
182 | # New format with list of dicts
183 | ranked_items = [(item["model"], item["score"]) for item in rankings_data["rankings"]]
184 | max_score = max([item["score"] for item in rankings_data["rankings"]])
185 |
186 | for i, (model, score) in enumerate(ranked_items):
187 | bar_width = int(300 * score / max_score)
188 | confidence_html = ""
189 |
190 | if has_confidence and model in confidence_data["confidence_intervals"]:
191 | ci = confidence_data["confidence_intervals"][model]
192 | lower_pct = int(300 * ci["lower_bound"] / max_score)
193 | upper_pct = int(300 * ci["upper_bound"] / max_score)
194 | mean_pct = int(300 * ci["mean"] / max_score)
195 |
196 |             confidence_html = f"""
197 |                 <td>
198 | 
202 |                     {ci["mean"]:.6f} [{ci["lower_bound"]:.6f}, {ci["upper_bound"]:.6f}]
203 |                 </td>
204 |             """
205 |
206 |         html += f"""
207 |         <tr>
208 |             <td>{i+1}</td>
209 |             <td>{model}</td>
210 |             <td>{score:.6f}</td>
211 |             <td>
212 | 
215 |             </td>
216 |             {confidence_html}
217 |         </tr>
218 |         """
219 |
220 |     html += """
221 |             </table>
222 |     """
223 |
224 | # Add statistical significance if available
225 | if has_confidence and confidence_data.get("significance"):
226 |         html += """
227 |         <h2>Statistical Significance</h2>
228 |         <table>
229 |             <tr>
230 |                 <th>Comparison</th>
231 |                 <th>Significance</th>
232 |             </tr>
233 |         """
234 |
235 | for pair, is_significant in confidence_data["significance"].items():
236 | significance_str = "Significant" if is_significant else "Not significant"
237 |             html += f"""
238 |             <tr>
239 |                 <td>{pair}</td>
240 |                 <td>{significance_str}</td>
241 |             </tr>
242 |             """
243 | 
244 |         html += """
245 |         </table>
246 |         """
247 |
248 | # Add category rankings if available
249 | if has_categories and category_data:
250 |         html += """
251 |         <h2>Rankings by Category</h2>
252 |         """
253 |
254 | for category, models in sorted(category_data.items()):
255 | max_score = max([item["score"] for item in models])
256 |
257 |             html += f"""
258 |             <h3>{category}</h3>
259 |             <table>
260 |                 <tr>
261 |                     <th>Rank</th>
262 |                     <th>Model</th>
263 |                     <th>Score</th>
264 |                     <th>Visualization</th>
265 |                 </tr>
266 |             """
267 |
268 | for i, item in enumerate(models):
269 | model = item["model"]
270 | score = item["score"]
271 | bar_width = int(300 * score / max_score)
272 |
273 |                 html += f"""
274 |                 <tr>
275 |                     <td>{i+1}</td>
276 |                     <td>{model}</td>
277 |                     <td>{score:.4f}</td>
278 |                     <td>
279 | 
282 |                     </td>
283 |                 </tr>
284 |                 """
285 | 
286 |             html += """
287 |             </table>
288 |             """
289 |
290 | # Add graph visualization if available
291 | if has_graph:
292 | rel_path = str(Path(graph_path).relative_to(Path.cwd()))
293 |         html += f"""
294 |         <h2>Endorsement Graph</h2>
295 |         <div>
296 |             <img src="{rel_path}" alt="Endorsement Graph">
297 |         </div>
298 |         """
299 |
300 | # Add metadata
301 |     html += f"""
302 | 
306 |     </div>
307 | </body>
308 | </html>
309 |     """
310 |
311 | # Save HTML to file
312 | with open(output_path, 'w') as f:
313 | f.write(html)
314 |
315 | logger.info(f"Dashboard generated at {output_path}")
316 | return output_path
317 |
318 |
319 | def start_dashboard(dashboard_path=None, port=8000, open_browser=True):
320 | """
321 | Start a web server to view the SlopRank dashboard.
322 |
323 | Parameters:
324 | -----------
325 | dashboard_path : Path or str
326 | Path to the dashboard HTML file
327 | port : int
328 | Port for the web server
329 | open_browser : bool
330 | Whether to open a browser window automatically
331 |
332 | Returns:
333 | --------
334 | HTTPServer
335 | The server instance
336 | """
337 | if dashboard_path is None:
338 | dashboard_path = Path("results/dashboard.html")
339 | else:
340 | dashboard_path = Path(dashboard_path)
341 |
342 | if not dashboard_path.exists():
343 | logger.error(f"Dashboard file not found: {dashboard_path}")
344 | return None
345 |
346 | # Start server
347 | server_address = ('', port)
348 | httpd = HTTPServer(server_address, SimpleHTTPRequestHandler)
349 |
350 | # Start server in a separate thread
351 | server_thread = threading.Thread(target=httpd.serve_forever)
352 | server_thread.daemon = True
353 | server_thread.start()
354 |
355 | url = f"http://localhost:{port}/{dashboard_path}"
356 | logger.info(f"Server started at {url}")
357 |
358 | # Open browser
359 | if open_browser:
360 | webbrowser.open(url)
361 |
362 | return httpd
363 |
364 |
365 | if __name__ == "__main__":
366 | # Run as a standalone script
367 | dashboard_path = generate_dashboard()
368 | httpd = start_dashboard(dashboard_path)
369 |
370 | try:
371 | while True:
372 | time.sleep(1)
373 | except KeyboardInterrupt:
374 | logger.info("Shutting down server...")
375 | httpd.shutdown()
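376 | 
377 | # Usage sketch (hedged: paths below are the default results/ layout and are illustrative only):
378 | #     from sloprank.utils.dashboard import generate_dashboard, start_dashboard
379 | #     dash = generate_dashboard(
380 | #         rankings_path="results/rankings.json",
381 | #         confidence_path="results/confidence_stats.json",
382 | #     )
383 | #     start_dashboard(dash, port=8000)
384 | #     # keep the process alive afterwards, as in the __main__ block above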
--------------------------------------------------------------------------------
/sloprank/utils/visualization.py:
--------------------------------------------------------------------------------
1 | """
2 | Graph visualization for SlopRank endorsement networks.
3 | """
4 | import json
5 | import pandas as pd
6 | import numpy as np
7 | import networkx as nx
8 | from pathlib import Path
9 |
10 | # Try importing visualization libraries
11 | try:
12 | import matplotlib.pyplot as plt
13 | import matplotlib.cm as cm
14 | HAS_MATPLOTLIB = True
15 | except ImportError:
16 | HAS_MATPLOTLIB = False
17 |
18 | try:
19 | import plotly.graph_objects as go
20 | HAS_PLOTLY = True
21 | except ImportError:
22 | HAS_PLOTLY = False
23 |
24 | from ..config import logger
25 |
26 |
27 | def generate_visualization(
28 | rankings_path=None,
29 | evaluations_path=None,
30 | output_dir=None,
31 | vis_config=None
32 | ):
33 | """
34 | Generate visualizations of the SlopRank endorsement graph.
35 |
36 | Parameters:
37 | -----------
38 | rankings_path : Path or str
39 | Path to the rankings.json file
40 | evaluations_path : Path or str
41 | Path to the evaluations.csv file
42 | output_dir : Path or str
43 | Directory to save visualizations
44 | vis_config : VisualizationConfig
45 | Configuration for visualizations
46 |
47 | Returns:
48 | --------
49 | tuple
50 | Paths to generated visualization files
51 | """
52 | if rankings_path is None:
53 | rankings_path = Path("results/rankings.json")
54 | else:
55 | rankings_path = Path(rankings_path)
56 |
57 | if evaluations_path is None:
58 | evaluations_path = Path("results/evaluations.csv")
59 | else:
60 | evaluations_path = Path(evaluations_path)
61 |
62 | if output_dir is None:
63 | output_dir = Path("results/visualizations")
64 | else:
65 | output_dir = Path(output_dir)
66 |
67 | # Ensure output directory exists
68 | output_dir.mkdir(parents=True, exist_ok=True)
69 |
70 | # Load rankings
71 | with open(rankings_path, 'r') as f:
72 | rankings_data = json.load(f)
73 |
74 | # Extract pagerank scores
75 | if isinstance(rankings_data['rankings'][0], list):
76 | # Old format with list of lists
77 | pagerank_scores = {model: score for model, score in rankings_data["rankings"]}
78 | else:
79 | # New format with list of dicts
80 | pagerank_scores = {item["model"]: item["score"] for item in rankings_data["rankings"]}
81 |
82 | # Load evaluations
83 | evals_df = pd.read_csv(evaluations_path)
84 |
85 | # Filter out failed evaluations
86 | evals_df = evals_df[evals_df["parse_failed"] == False]
87 |
88 | # Build graph
89 | G = nx.DiGraph()
90 |
91 | # Add nodes from rankings
92 | for model, score in pagerank_scores.items():
93 | G.add_node(model, pagerank=score)
94 |
95 | # Add edges from evaluations
96 | for _, row in evals_df.iterrows():
97 | judge = row["judge_model"]
98 | rated = row["rated_model"]
99 | score = float(row["score"])
100 |
101 | if G.has_edge(judge, rated):
102 | G[judge][rated]["weight"] += score
103 | else:
104 | G.add_edge(judge, rated, weight=score)
105 |
106 | # Normalize edge weights for visualization
107 | max_weight = max([G[u][v]["weight"] for u, v in G.edges()])
108 | for u, v in G.edges():
109 | G[u][v]["normalized_weight"] = G[u][v]["weight"] / max_weight
110 |
111 | # Save graph in GML format
112 | gml_path = output_dir / "endorsement_graph.gml"
113 | nx.write_gml(G, gml_path)
114 | logger.info(f"Saved graph in GML format to {gml_path}")
115 |
116 | # Generate static visualization if matplotlib is available
117 | png_path = None
118 | if HAS_MATPLOTLIB:
119 | png_path = output_dir / "endorsement_graph.png"
120 | generate_static_visualization(G, pagerank_scores, png_path, vis_config)
121 | logger.info(f"Saved static visualization to {png_path}")
122 |
123 | # Generate interactive visualization if plotly is available
124 | html_path = None
125 | if HAS_PLOTLY and (vis_config is None or vis_config.interactive):
126 | html_path = output_dir / "endorsement_graph.html"
127 | generate_interactive_visualization(G, pagerank_scores, html_path, vis_config)
128 | logger.info(f"Saved interactive visualization to {html_path}")
129 |
130 | return gml_path, png_path, html_path
131 |
132 |
133 | def generate_static_visualization(G, pagerank_scores, output_path, vis_config=None):
134 | """
135 | Generate a static visualization of the endorsement graph using matplotlib.
136 | """
137 | if not HAS_MATPLOTLIB:
138 | logger.warning("Matplotlib not found. Cannot generate static visualization.")
139 | return
140 |
141 | # Node size factor, edge width factor, color maps, etc.
142 | node_size_factor = 2000
143 | edge_width_factor = 2.0
144 | node_colormap = 'viridis'
145 | edge_colormap = 'plasma'
146 |
147 | if vis_config is not None:
148 | node_size_factor = vis_config.node_size_factor
149 | edge_width_factor = vis_config.edge_width_factor
150 | node_colormap = vis_config.node_colormap
151 | edge_colormap = vis_config.edge_colormap
152 |
153 | try:
154 | # Calculate position using spring layout
155 | layout_func = nx.spring_layout
156 | if vis_config is not None and hasattr(vis_config, 'layout'):
157 | if vis_config.layout == 'circular':
158 | layout_func = nx.circular_layout
159 | elif vis_config.layout == 'kamada_kawai':
160 | layout_func = nx.kamada_kawai_layout
161 | elif vis_config.layout == 'spectral':
162 | layout_func = nx.spectral_layout
163 |
164 |         pos = layout_func(G, seed=42) if layout_func is nx.spring_layout else layout_func(G)  # only spring_layout accepts a seed
165 |
166 | # Create figure
167 | plt.figure(figsize=(12, 10))
168 |
169 | # Draw nodes
170 | node_sizes = [pagerank_scores.get(node, 0.01) * node_size_factor for node in G.nodes()]
171 | node_colors = [pagerank_scores.get(node, 0.0) for node in G.nodes()]
172 |
173 | nx.draw_networkx_nodes(
174 | G, pos,
175 | node_size=node_sizes,
176 | node_color=node_colors,
177 |         cmap=plt.get_cmap(node_colormap),
178 | alpha=0.8
179 | )
180 |
181 | # Draw edges
182 | edge_widths = [G[u][v].get('normalized_weight', 0.1) * edge_width_factor for u, v in G.edges()]
183 |
184 | nx.draw_networkx_edges(
185 | G, pos,
186 | width=edge_widths,
187 | alpha=0.6,
188 | edge_color=range(len(G.edges())),
189 |         edge_cmap=plt.get_cmap(edge_colormap),
190 | arrows=True,
191 | arrowsize=20,
192 | arrowstyle='-|>'
193 | )
194 |
195 | # Draw labels
196 | nx.draw_networkx_labels(
197 | G, pos,
198 | font_size=12,
199 | font_weight='bold'
200 | )
201 |
202 | # Add title
203 | plt.title("LLM Endorsement Graph (Node size = PageRank score, Edge width = Endorsement strength)")
204 | plt.axis('off')
205 |
206 | # Save the figure
207 | plt.tight_layout()
208 | plt.savefig(output_path, dpi=300, bbox_inches='tight')
209 | plt.close()
210 |
211 | except Exception as e:
212 | logger.error(f"Error generating static visualization: {e}")
213 |
214 |
215 | def generate_interactive_visualization(G, pagerank_scores, output_path, vis_config=None):
216 | """
217 | Generate an interactive visualization of the endorsement graph using Plotly.
218 | """
219 | if not HAS_PLOTLY:
220 | logger.warning("Plotly not found. Cannot generate interactive visualization.")
221 | return
222 |
223 | # Node size factor, edge width factor, color maps, etc.
224 | node_size_factor = 2000
225 | edge_width_factor = 2.0
226 | node_colormap = 'Viridis'
227 |
228 | if vis_config is not None:
229 | node_size_factor = vis_config.node_size_factor
230 | edge_width_factor = vis_config.edge_width_factor
231 | node_colormap = vis_config.node_colormap
232 |
233 | try:
234 | # Calculate position using spring layout
235 | layout_func = nx.spring_layout
236 | if vis_config is not None and hasattr(vis_config, 'layout'):
237 | if vis_config.layout == 'circular':
238 | layout_func = nx.circular_layout
239 | elif vis_config.layout == 'kamada_kawai':
240 | layout_func = nx.kamada_kawai_layout
241 | elif vis_config.layout == 'spectral':
242 | layout_func = nx.spectral_layout
243 |
244 |         pos = layout_func(G, seed=42) if layout_func is nx.spring_layout else layout_func(G)  # only spring_layout accepts a seed
245 |
246 | # Create edge traces
247 | edge_traces = []
248 | for edge in G.edges():
249 | source, target = edge
250 | source_pos = pos[source]
251 | target_pos = pos[target]
252 | weight = G[source][target].get('weight', 1.0)
253 |
254 | # Calculate line transparency and width based on weight
255 | width = max(1, min(10, weight / 5))
256 | opacity = min(1.0, max(0.3, weight / 10.0))
257 |
258 | # Create edge line
259 | edge_trace = go.Scatter(
260 | x=[source_pos[0], target_pos[0]],
261 | y=[source_pos[1], target_pos[1]],
262 | line=dict(width=width, color=f'rgba(150, 150, 150, {opacity})'),
263 | hoverinfo='text',
264 |                 text=f"{source} → {target}<br>Weight: {weight:.2f}",
265 | mode='lines+markers',
266 | marker=dict(size=0),
267 | showlegend=False
268 | )
269 | edge_traces.append(edge_trace)
270 |
271 | # Create arrowhead
272 | # Simple approximation of arrow position (80% along the edge)
273 | arrow_x = source_pos[0] * 0.2 + target_pos[0] * 0.8
274 | arrow_y = source_pos[1] * 0.2 + target_pos[1] * 0.8
275 |
276 | arrow_trace = go.Scatter(
277 | x=[arrow_x],
278 | y=[arrow_y],
279 | mode='markers',
280 | marker=dict(
281 | symbol='triangle-right',
282 | size=10,
283 | color=f'rgba(150, 150, 150, {opacity})',
284 | angle=np.degrees(np.arctan2(
285 | target_pos[1] - source_pos[1],
286 | target_pos[0] - source_pos[0]
287 | ))
288 | ),
289 | hoverinfo='none',
290 | showlegend=False
291 | )
292 | edge_traces.append(arrow_trace)
293 |
294 | # Create node trace
295 | node_trace = go.Scatter(
296 | x=[pos[node][0] for node in G.nodes()],
297 | y=[pos[node][1] for node in G.nodes()],
298 | mode='markers+text',
299 | text=[node for node in G.nodes()],
300 | textposition="top center",
301 | hoverinfo='text',
302 |             hovertext=[f"{node}<br>PageRank: {pagerank_scores.get(node, 0):.4f}" for node in G.nodes()],
303 | marker=dict(
304 | showscale=True,
305 | colorscale=node_colormap,
306 | color=[pagerank_scores.get(node, 0) for node in G.nodes()],
307 | size=[pagerank_scores.get(node, 0.01) * node_size_factor / 10 for node in G.nodes()],
308 | colorbar=dict(
309 | thickness=15,
310 | title=dict(
311 | text='PageRank Score',
312 | side='right'
313 | ),
314 | xanchor='left'
315 | ),
316 | line=dict(width=2)
317 | )
318 | )
319 |
320 | # Create figure
321 | fig = go.Figure(
322 | data=edge_traces + [node_trace],
323 | layout=go.Layout(
324 |                 title=dict(text='Interactive LLM Endorsement Graph',
325 |                            font=dict(size=16)),
326 | showlegend=False,
327 | hovermode='closest',
328 | margin=dict(b=20, l=5, r=5, t=40),
329 | xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
330 | yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
331 | height=600,
332 | annotations=[
333 | dict(
334 |                         text="Node size = PageRank score<br>Edge width = Endorsement strength",
335 | showarrow=False,
336 | xref="paper", yref="paper",
337 | x=0.01, y=-0.05
338 | )
339 | ]
340 | )
341 | )
342 |
343 | # Save to HTML file
344 | fig.write_html(output_path)
345 |
346 | except Exception as e:
347 | logger.error(f"Error generating interactive visualization: {e}")
348 |
349 |
350 | if __name__ == "__main__":
351 | # Run as a standalone script
352 | generate_visualization()
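353 | 
354 | # Usage sketch (hedged: assumes results produced by the SlopRank pipeline; paths are illustrative):
355 | #     from sloprank.utils.visualization import generate_visualization
356 | #     gml_path, png_path, html_path = generate_visualization(
357 | #         rankings_path="results/rankings.json",
358 | #         evaluations_path="results/evaluations.csv",
359 | #         output_dir="results/visualizations",
360 | #     )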
--------------------------------------------------------------------------------
/tests/README.md:
--------------------------------------------------------------------------------
1 | # SlopRank Tests
2 |
3 | This directory contains test files for the SlopRank library.
4 |
5 | ## Test Files
6 |
7 | | File | Description |
8 | |------|-------------|
9 | | `test_sloprank.py` | Simple end-to-end test for the SlopRank library |
10 | | `tiny_prompts.csv` | Minimal test prompts with just 2 simple questions |
11 | | `mini_prompts.csv` | Small test prompts with 3 more comprehensive questions |
12 |
13 | ## Running Tests
14 |
15 | To run the basic test:
16 |
17 | ```bash
18 | python test_sloprank.py
19 | ```
20 |
21 | ### Test Process
22 |
23 | The test will automatically:
24 | 1. Create a test output directory (`test_results/`)
25 | 2. Collect responses from configured models
26 | 3. Collect evaluations between models
27 | 4. Parse evaluations
28 | 5. Build the endorsement graph
29 | 6. Compute the PageRank scores
30 | 7. Output the final rankings
31 |
32 | > **Note:** The full test may take several minutes to complete due to the time required for API calls to language models.
33 |
34 | ## Test Configuration
35 |
36 | The test script uses a simple configuration with:
37 | - 3 models: deepseek-chat, claude-3.5-haiku, and gpt-4o (as configured in `test_sloprank.py`)
38 | - Simple factual questions to ensure fast responses
39 | - Full evaluation (all models evaluate each other)
40 |
41 | You can modify the test script to use different models, prompts, or evaluation settings.
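42 | 
43 | For example, to test a different set of models or a subset-evaluation setup, a modified configuration might look like the minimal sketch below (based on the `EvalConfig` fields used in `test_sloprank.py`; the model names are placeholders for whatever your environment supports):
44 | 
45 | ```python
46 | from pathlib import Path
47 | from sloprank.config import EvalConfig
48 | 
49 | config = EvalConfig(
50 |     model_names=["model-a", "model-b", "model-c"],  # placeholder model names
51 |     evaluation_method=1,              # numeric scoring
52 |     use_subset_evaluation=True,       # each response judged by a subset of models
53 |     evaluators_subset_size=2,
54 |     output_dir=Path("tests/test_results"),
55 |     request_delay=0.0,
56 | )
57 | ```
58 | 
59 | The rest of the test flow (collecting responses, collecting and parsing evaluations, building the endorsement graph, and computing PageRank) stays the same as in `test_sloprank.py`.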
--------------------------------------------------------------------------------
/tests/test_sloprank.py:
--------------------------------------------------------------------------------
1 | """
2 | Simple test script for SlopRank
3 | """
4 | import pandas as pd
5 | import json
6 | from pathlib import Path
7 | from sloprank.config import EvalConfig, VisualizationConfig
8 | from sloprank.collect import collect_responses, collect_raw_evaluations
9 | from sloprank.parse import parse_evaluation_rows
10 | from sloprank.rank import build_endorsement_graph, compute_pagerank, finalize_rankings
11 |
12 | # Use existing tiny_prompts.csv file
13 | prompts_file = Path(__file__).parent / "tiny_prompts.csv"
14 | test_df = pd.read_csv(prompts_file)
15 | prompts = test_df["Questions"].tolist()
16 |
17 | # Define a simple test configuration
18 | config = EvalConfig(
19 | model_names=["deepseek-chat", "claude-3.5-haiku", "gpt-4o"],
20 | evaluation_method=1, # numeric
21 | use_subset_evaluation=False, # All models evaluate each other
22 | evaluators_subset_size=2, # This will be ignored since subset_evaluation is False
23 | output_dir=Path(__file__).parent / "test_results",
24 | request_delay=0.0
25 | )
26 |
27 | # Create output directory
28 | config.output_dir.mkdir(exist_ok=True)
29 |
30 | # Create prompt pairs (prompt, answer_key)
31 | prompt_pairs = [(prompt, "") for prompt in prompts]
32 |
33 | # Collect responses
34 | print(f"Collecting responses from {len(config.model_names)} models for {len(prompts)} prompts...")
35 | responses_df = collect_responses(prompt_pairs, config)
36 | responses_df.to_csv(config.output_dir / "responses.csv", index=False)
37 | print(f"Saved responses to {config.output_dir}/responses.csv")
38 |
39 | # Collect evaluations
40 | print("Collecting evaluations...")
41 | raw_evaluations_df = collect_raw_evaluations(responses_df, config)
42 | raw_evaluations_df.to_csv(config.output_dir / "raw_evaluations.csv", index=False)
43 | print(f"Saved raw evaluations to {config.output_dir}/raw_evaluations.csv")
44 |
45 | # Parse evaluations
46 | print("Parsing evaluations...")
47 | evaluations_df = parse_evaluation_rows(raw_evaluations_df, config)
48 | evaluations_df.to_csv(config.output_dir / "evaluations.csv", index=False)
49 | print(f"Saved parsed evaluations to {config.output_dir}/evaluations.csv")
50 |
51 | # Build graph and compute rankings
52 | print("Building graph and computing rankings...")
53 | G = build_endorsement_graph(evaluations_df, config)
54 | pagerank_scores = compute_pagerank(G)
55 | rankings = finalize_rankings(pagerank_scores, config, G, evaluations_df)
56 |
57 | # Save rankings to file
58 | rankings_file = config.output_dir / "rankings.json"
59 | with open(rankings_file, "w") as f:
60 | json.dump(rankings, f, indent=4)
61 | print(f"Saved rankings to {rankings_file}")
62 |
63 | print("Test completed successfully!")
--------------------------------------------------------------------------------
/tests/tiny_prompts.csv:
--------------------------------------------------------------------------------
1 | Questions
2 | What is the capital of France?
3 | Name three primary colors
4 |
--------------------------------------------------------------------------------