├── .gitattributes ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── Makefile ├── docs ├── README.md ├── index.html └── visualizations │ ├── endorsement_graph.gml │ └── endorsement_graph.png ├── examples ├── README.md ├── compute_confidence.py ├── dashboard.py ├── generate_dashboard.py ├── generate_visualization.py ├── prompt_categorization.py └── prompts_categorized.csv ├── prompts.csv ├── pyproject.toml ├── readme.md ├── requirements.txt ├── results ├── category_analysis.csv ├── category_rankings.json ├── confidence_stats.json ├── dashboard.html ├── endorsement_graph.gml ├── rankings.json ├── responses.csv └── visualizations │ ├── endorsement_graph.gml │ └── endorsement_graph.png ├── scripts ├── bump_version.py └── create_github_release.py ├── sloprank ├── __init__.py ├── __main__.py ├── cli.py ├── collect.py ├── config.py ├── parse.py ├── rank.py └── utils │ ├── __init__.py │ ├── categorization.py │ ├── commands.py │ ├── confidence.py │ ├── dashboard.py │ └── visualization.py └── tests ├── README.md ├── test_sloprank.py └── tiny_prompts.csv /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | *.pyc 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 106 | __pypackages__/ 107 | 108 | # Celery stuff 109 | celerybeat-schedule 110 | celerybeat.pid 111 | 112 | # SageMath parsed files 113 | *.sage.py 114 | 115 | # Environments 116 | .env 117 | .venv 118 | env/ 119 | venv/ 120 | ENV/ 121 | env.bak/ 122 | venv.bak/ 123 | 124 | # Spyder project settings 125 | .spyderproject 126 | .spyproject 127 | 128 | # Rope project settings 129 | .ropeproject 130 | 131 | # mkdocs documentation 132 | /site 133 | 134 | # mypy 135 | .mypy_cache/ 136 | .dmypy.json 137 | dmypy.json 138 | 139 | # Pyre type checker 140 | .pyre/ 141 | 142 | # pytype static type analyzer 143 | .pytype/ 144 | 145 | # Cython debug symbols 146 | cython_debug/ 147 | 148 | # PyCharm 149 | # JetBrains specific template is maintainted in a separate JetBrains.gitignore that can 150 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 151 | # and can be added to the global gitignore or merged into this file. For a more nuclear 152 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 153 | #.idea/ 154 | .DS_Store 155 | \# older_scripts 156 | \#Archive/* 157 | 158 | # Ignore private PyPI config 159 | .pypirc 160 | 161 | # Ignore Claude's reference file 162 | CLAUDE.md 163 | 164 | # Ignore test output files 165 | tests/test_results/ 166 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to SlopRank will be documented in this file. 
4 | 5 | ## [0.2.3] - 2025-02-28 6 | 7 | ### Added 8 | - Tests directory with simple test scripts and example prompts 9 | - Test README with documentation on how to run tests 10 | 11 | ### Fixed 12 | - Improved error handling for subset evaluation configuration 13 | - Automatic adjustment of evaluators_subset_size when too large for the number of models 14 | - Added support for new model versions (Claude-3.7-Sonnet, ChatGPT-4o, Deepseek-Reasoner) 15 | 16 | ## [0.2.2] - 2025-01-14 17 | 18 | ### Added 19 | - Support for graph visualization of model endorsements 20 | - Confidence interval calculations for rankings 21 | - Category analysis for prompt-specific performance 22 | 23 | ### Changed 24 | - Improved API error handling 25 | - Enhanced CLI interface with additional options 26 | 27 | ## [0.2.1] - 2025-01-03 28 | 29 | ### Added 30 | - Dashboard features for interactive exploration 31 | - Visualization improvements 32 | 33 | ### Fixed 34 | - Bug fixes in PageRank calculation 35 | - Better error handling for API timeouts 36 | 37 | ## [0.2.0] - 2024-12-20 38 | 39 | ### Added 40 | - Complete rewrite with modular architecture 41 | - Support for multiple evaluation methods 42 | - Export options for results 43 | 44 | ## [0.1.0] - 2024-12-01 45 | 46 | ### Added 47 | - Initial release 48 | - Basic implementation of peer-based LLM evaluation 49 | - PageRank algorithm for ranking models -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PYPROJECT = pyproject.toml 2 | VERSION = $(shell grep '^version' $(PYPROJECT) | sed -E 's/.*"([0-9]+\.[0-9]+\.[0-9]+)"/\1/') 3 | 4 | .PHONY: clean build check upload bump-patch bump-minor bump-major git-release publish 5 | 6 | clean: 7 | rm -rf build dist *.egg-info 8 | 9 | build: clean 10 | python -m build 11 | 12 | check: 13 | twine check dist/* 14 | 15 | upload: 16 | twine upload dist/* 17 | 18 | bump-patch: 19 | @python scripts/bump_version.py patch 20 | 21 | bump-minor: 22 | @python scripts/bump_version.py minor 23 | 24 | bump-major: 25 | @python scripts/bump_version.py major 26 | 27 | git-release: 28 | git add -A 29 | git commit -m "Release v$(VERSION)" || echo "Nothing to commit" 30 | @if git rev-parse "v$(VERSION)" >/dev/null 2>&1; then \ 31 | echo "⚠️ Tag v$(VERSION) already exists. Skipping tag creation."; \ 32 | else \ 33 | git tag v$(VERSION); \ 34 | fi 35 | git push 36 | git push --tags 37 | @python scripts/create_github_release.py v$(VERSION) 38 | 39 | BUMP ?= patch 40 | 41 | publish: 42 | @make bump-$(BUMP) 43 | @make build 44 | @make check 45 | @make upload 46 | @make git-release 47 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # SlopRank Dashboard 2 | 3 | This directory contains the interactive dashboard for SlopRank LLM evaluation framework. 4 | 5 | ## Files 6 | 7 | - `index.html` - The main dashboard file 8 | - `visualizations/` - Directory containing graph visualizations and images 9 | 10 | ## How to Use 11 | 12 | 1. Open `index.html` in any modern web browser 13 | 2. Explore the model rankings, category performance, and graph visualizations 14 | 15 | ## Hosting on GitHub Pages 16 | 17 | This directory is configured to be used with GitHub Pages. When GitHub Pages is enabled for this repo with the 'docs' folder as the source, the dashboard will be available at: 18 | 19 | https://yourusername.github.io/llmrank/ 20 | 21 | ## Updating the Dashboard 22 | 23 | To update this dashboard with new evaluation results: 24 | 25 | 1. Run the SlopRank tool with the `--dashboard` option 26 | 2. Copy the resulting dashboard.html to this directory as index.html 27 | 3. Update the image paths if necessary 28 | 4. 
Commit and push the changes -------------------------------------------------------------------------------- /docs/visualizations/endorsement_graph.gml: -------------------------------------------------------------------------------- 1 | graph [ 2 | directed 1 3 | node [ 4 | id 0 5 | label "o1-preview" 6 | pagerank 0.17940361409787733 7 | ] 8 | node [ 9 | id 1 10 | label "gpt-4o" 11 | pagerank 0.17830451744580658 12 | ] 13 | node [ 14 | id 2 15 | label "deepseek-chat" 16 | pagerank 0.1671054138317305 17 | ] 18 | node [ 19 | id 3 20 | label "gemini-2.0-flash-thinking-exp-1219" 21 | pagerank 0.16473186403675355 22 | ] 23 | node [ 24 | id 4 25 | label "claude-3-5-sonnet-latest" 26 | pagerank 0.15557086205954448 27 | ] 28 | node [ 29 | id 5 30 | label "gemini-exp-1206" 31 | pagerank 0.15488372852828722 32 | ] 33 | edge [ 34 | source 0 35 | target 3 36 | weight 131.0 37 | normalized_weight 0.5282258064516129 38 | ] 39 | edge [ 40 | source 0 41 | target 2 42 | weight 129.0 43 | normalized_weight 0.5201612903225806 44 | ] 45 | edge [ 46 | source 0 47 | target 5 48 | weight 144.0 49 | normalized_weight 0.5806451612903226 50 | ] 51 | edge [ 52 | source 0 53 | target 1 54 | weight 157.0 55 | normalized_weight 0.6330645161290323 56 | ] 57 | edge [ 58 | source 0 59 | target 4 60 | weight 139.0 61 | normalized_weight 0.5604838709677419 62 | ] 63 | edge [ 64 | source 1 65 | target 3 66 | weight 155.0 67 | normalized_weight 0.625 68 | ] 69 | edge [ 70 | source 1 71 | target 2 72 | weight 146.0 73 | normalized_weight 0.5887096774193549 74 | ] 75 | edge [ 76 | source 1 77 | target 4 78 | weight 146.0 79 | normalized_weight 0.5887096774193549 80 | ] 81 | edge [ 82 | source 1 83 | target 0 84 | weight 129.0 85 | normalized_weight 0.5201612903225806 86 | ] 87 | edge [ 88 | source 1 89 | target 5 90 | weight 141.0 91 | normalized_weight 0.5685483870967742 92 | ] 93 | edge [ 94 | source 2 95 | target 1 96 | weight 212.0 97 | normalized_weight 0.8548387096774194 98 | ] 99 | edge [ 100 | source 2 101 | target 3 102 | weight 135.5 103 | normalized_weight 0.5463709677419355 104 | ] 105 | edge [ 106 | source 2 107 | target 0 108 | weight 203.0 109 | normalized_weight 0.8185483870967742 110 | ] 111 | edge [ 112 | source 2 113 | target 5 114 | weight 142.0 115 | normalized_weight 0.5725806451612904 116 | ] 117 | edge [ 118 | source 2 119 | target 4 120 | weight 143.0 121 | normalized_weight 0.5766129032258065 122 | ] 123 | edge [ 124 | source 3 125 | target 0 126 | weight 138.0 127 | normalized_weight 0.5564516129032258 128 | ] 129 | edge [ 130 | source 3 131 | target 2 132 | weight 173.0 133 | normalized_weight 0.6975806451612904 134 | ] 135 | edge [ 136 | source 3 137 | target 4 138 | weight 113.0 139 | normalized_weight 0.45564516129032256 140 | ] 141 | edge [ 142 | source 3 143 | target 5 144 | weight 89.0 145 | normalized_weight 0.3588709677419355 146 | ] 147 | edge [ 148 | source 3 149 | target 1 150 | weight 130.0 151 | normalized_weight 0.5241935483870968 152 | ] 153 | edge [ 154 | source 4 155 | target 0 156 | weight 248.0 157 | normalized_weight 1.0 158 | ] 159 | edge [ 160 | source 4 161 | target 3 162 | weight 162.0 163 | normalized_weight 0.6532258064516129 164 | ] 165 | edge [ 166 | source 4 167 | target 5 168 | weight 160.0 169 | normalized_weight 0.6451612903225806 170 | ] 171 | edge [ 172 | source 4 173 | target 1 174 | weight 166.0 175 | normalized_weight 0.6693548387096774 176 | ] 177 | edge [ 178 | source 4 179 | target 2 180 | weight 104.0 181 | normalized_weight 0.41935483870967744 182 | ] 183 | 
edge [ 184 | source 5 185 | target 4 186 | weight 129.0 187 | normalized_weight 0.5201612903225806 188 | ] 189 | edge [ 190 | source 5 191 | target 3 192 | weight 188.0 193 | normalized_weight 0.7580645161290323 194 | ] 195 | edge [ 196 | source 5 197 | target 2 198 | weight 183.0 199 | normalized_weight 0.7379032258064516 200 | ] 201 | edge [ 202 | source 5 203 | target 1 204 | weight 180.0 205 | normalized_weight 0.7258064516129032 206 | ] 207 | edge [ 208 | source 5 209 | target 0 210 | weight 148.0 211 | normalized_weight 0.5967741935483871 212 | ] 213 | ] 214 | -------------------------------------------------------------------------------- /docs/visualizations/endorsement_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strangeloopcanon/LLMRank/7527836faee5af1209059466d89690bedf504014/docs/visualizations/endorsement_graph.png -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # SlopRank Example Scripts 2 | 3 | This directory contains standalone scripts that demonstrate each of the advanced features of SlopRank. These scripts can be run individually after running the main SlopRank tool. 4 | 5 | ## Available Scripts 6 | 7 | ### 1. Graph Visualization (`generate_visualization.py`) 8 | 9 | Creates visual representations of the model endorsement network: 10 | 11 | ```bash 12 | python examples/generate_visualization.py 13 | ``` 14 | 15 | **Outputs:** 16 | - Static PNG visualization: `results/visualizations/endorsement_graph.png` 17 | - GraphML file: `results/visualizations/endorsement_graph.gml` 18 | 19 | ### 2. Confidence Intervals (`compute_confidence.py`) 20 | 21 | Uses bootstrap resampling to estimate statistical reliability: 22 | 23 | ```bash 24 | python examples/compute_confidence.py 25 | ``` 26 | 27 | **Outputs:** 28 | - `results/confidence_stats.json` containing: 29 | - Confidence intervals for each model's PageRank score 30 | - Statistical significance tests between adjacent ranks 31 | 32 | ### 3. Prompt Categorization (`prompt_categorization.py`) 33 | 34 | Automatically categorizes prompts and provides per-category rankings: 35 | 36 | ```bash 37 | python examples/prompt_categorization.py 38 | ``` 39 | 40 | **Outputs:** 41 | - Categorized version of your prompts file 42 | - Per-category rankings in `results/category_rankings.json` 43 | - CSV analysis in `results/category_analysis.csv` 44 | 45 | ### 4. Interactive Dashboard 46 | 47 | #### Dashboard Generation (`generate_dashboard.py`) 48 | Creates an HTML dashboard from all the results: 49 | 50 | ```bash 51 | python examples/generate_dashboard.py 52 | ``` 53 | 54 | #### Dashboard Server (`dashboard.py`) 55 | Starts a local server to view the dashboard: 56 | 57 | ```bash 58 | python examples/dashboard.py 59 | ``` 60 | 61 | ## Recommended Workflow 62 | 63 | For the best experience, run the tools in this order: 64 | 65 | 1. Run SlopRank: `sloprank --prompts prompts.csv --output-dir results` 66 | 2. Generate visualizations: `python examples/generate_visualization.py` 67 | 3. Compute confidence intervals: `python examples/compute_confidence.py` 68 | 4. Analyze categories: `python examples/prompt_categorization.py` 69 | 5. Generate dashboard: `python examples/generate_dashboard.py` 70 | 6. 
View the dashboard: `python examples/dashboard.py` 71 | 72 | ## Integrated CLI Alternative 73 | 74 | All these features are now integrated into the main `sloprank` CLI tool: 75 | 76 | ```bash 77 | sloprank run --prompts prompts.csv --output-dir results --visualize --confidence --categories --dashboard 78 | ``` 79 | 80 | These standalone example scripts are provided for educational purposes and for users who want to use each feature independently. -------------------------------------------------------------------------------- /examples/compute_confidence.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import json 3 | import random 4 | import pandas as pd 5 | import numpy as np 6 | import networkx as nx 7 | from pathlib import Path 8 | 9 | def compute_confidence_intervals(iterations=1000): 10 | """ 11 | Compute confidence intervals for model rankings using bootstrap resampling. 12 | """ 13 | print("Computing confidence intervals...") 14 | 15 | # Load evaluations data 16 | evals_path = Path("results/evaluations.csv") 17 | evals_df = pd.read_csv(evals_path) 18 | 19 | # Filter out failed evaluations 20 | evals_df = evals_df[evals_df["parse_failed"] == False] 21 | 22 | # Get unique models 23 | models = list(set(evals_df["judge_model"].unique()) | set(evals_df["rated_model"].unique())) 24 | 25 | # Store bootstrap results 26 | bootstrap_results = {model: [] for model in models} 27 | 28 | # Run bootstrap iterations 29 | for i in range(iterations): 30 | if i % 100 == 0: 31 | print(f"Bootstrap iteration {i}/{iterations}...") 32 | 33 | # Resample evaluations with replacement 34 | sampled_evals = evals_df.sample(frac=1.0, replace=True) 35 | 36 | # Build graph from resampled data 37 | G = nx.DiGraph() 38 | G.add_nodes_from(models) 39 | 40 | for _, row in sampled_evals.iterrows(): 41 | judge = row["judge_model"] 42 | rated = row["rated_model"] 43 | score = float(row["score"]) 44 | 45 | if G.has_edge(judge, rated): 46 | G[judge][rated]["weight"] += score 47 | else: 48 | G.add_edge(judge, rated, weight=score) 49 | 50 | # Compute PageRank 51 | if len(G.edges) > 0: 52 | scores = nx.pagerank(G, weight="weight") 53 | 54 | # Store scores 55 | for model, score in scores.items(): 56 | bootstrap_results[model].append(score) 57 | 58 | # Calculate confidence intervals (95%) 59 | confidence_stats = {} 60 | 61 | for model in models: 62 | if not bootstrap_results[model]: 63 | confidence_stats[model] = { 64 | "mean": 0.0, 65 | "lower_bound": 0.0, 66 | "upper_bound": 0.0, 67 | "std_dev": 0.0 68 | } 69 | continue 70 | 71 | sorted_scores = sorted(bootstrap_results[model]) 72 | lower_idx = int(0.025 * len(sorted_scores)) 73 | upper_idx = int(0.975 * len(sorted_scores)) 74 | 75 | confidence_stats[model] = { 76 | "mean": np.mean(sorted_scores), 77 | "lower_bound": sorted_scores[lower_idx], 78 | "upper_bound": sorted_scores[upper_idx], 79 | "std_dev": np.std(sorted_scores) 80 | } 81 | 82 | # Test statistical significance 83 | significance_results = {} 84 | 85 | # Create sorted list of models by mean score 86 | models_by_score = sorted( 87 | [(model, stats["mean"]) for model, stats in confidence_stats.items()], 88 | key=lambda x: x[1], 89 | reverse=True 90 | ) 91 | 92 | # Compare each adjacent pair in the ranking 93 | for i in range(len(models_by_score) - 1): 94 | model1, _ = models_by_score[i] 95 | model2, _ = models_by_score[i + 1] 96 | 97 | # Determine if significant based on confidence intervals 98 | is_significant = ( 99 | 
confidence_stats[model1]["lower_bound"] > confidence_stats[model2]["upper_bound"] or 100 | confidence_stats[model2]["lower_bound"] > confidence_stats[model1]["upper_bound"] 101 | ) 102 | 103 | significance_results[f"{model1}_vs_{model2}"] = is_significant 104 | 105 | # Save results 106 | results = { 107 | "confidence_intervals": confidence_stats, 108 | "significance": significance_results 109 | } 110 | 111 | outfile = Path("results/confidence_stats.json") 112 | with open(outfile, "w") as f: 113 | json.dump(results, f, indent=2) 114 | 115 | # Print summary 116 | print("\n=== Confidence Intervals (95%) ===") 117 | for model, stats in sorted(confidence_stats.items(), key=lambda x: x[1]["mean"], reverse=True): 118 | print(f"{model}: {stats['mean']:.6f} [{stats['lower_bound']:.6f}, {stats['upper_bound']:.6f}]") 119 | 120 | print("\n=== Statistical Significance ===") 121 | for pair, is_significant in significance_results.items(): 122 | significance_str = "Significant" if is_significant else "Not significant" 123 | print(f"{pair}: {significance_str}") 124 | 125 | print(f"\nResults saved to {outfile}") 126 | 127 | if __name__ == "__main__": 128 | compute_confidence_intervals(iterations=500) # Lower for faster execution -------------------------------------------------------------------------------- /examples/dashboard.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import json 3 | import pandas as pd 4 | import webbrowser 5 | from pathlib import Path 6 | from http.server import HTTPServer, SimpleHTTPRequestHandler 7 | import threading 8 | import time 9 | 10 | def generate_html(): 11 | # Load rankings data 12 | rankings_path = Path("results/rankings.json") 13 | with open(rankings_path, 'r') as f: 14 | rankings_data = json.load(f) 15 | 16 | # Load confidence data if available 17 | confidence_path = Path("results/confidence_stats.json") 18 | has_confidence = confidence_path.exists() 19 | confidence_data = None 20 | if has_confidence: 21 | with open(confidence_path, 'r') as f: 22 | confidence_data = json.load(f) 23 | 24 | # Load category rankings if available 25 | category_path = Path("results/category_rankings.json") 26 | has_categories = category_path.exists() 27 | category_data = None 28 | if has_categories: 29 | with open(category_path, 'r') as f: 30 | category_data = json.load(f) 31 | 32 | # Generate HTML 33 | html = """ 34 | 35 | 36 | 37 | 38 | 39 | SlopRank Dashboard 40 | 101 | 102 | 103 |
104 | [lines 104–242: the dashboard HTML template — markup was lost in extraction; only its text content and the surrounding Python survive.
      The template renders an "SlopRank Dashboard" heading; a "Model Rankings" table with Rank, Model, Score and Visualization (score-bar) columns,
      plus an optional "Confidence Interval" column showing mean [lower_bound, upper_bound] when confidence_stats.json is available;
      a "Statistical Significance" table (Comparison, Significance) when significance data is present;
      per-category "Rankings by Category" tables (Rank, Model, Score, Visualization) when category_rankings.json exists;
      and an "Endorsement Graph" section embedding the endorsement graph image (results/visualizations/endorsement_graph.png) when present.
      The interleaved Python iterates over rankings_data["rankings"] and category_data, computes bar_width = int(300 * score / max_score)
      for each row's score bar, and appends the generated rows to the html string.]
243 | 244 | 245 | """ 246 | 247 | # Save HTML to file 248 | dashboard_path = Path("results/dashboard.html") 249 | with open(dashboard_path, 'w') as f: 250 | f.write(html) 251 | 252 | return dashboard_path 253 | 254 | def start_server(port=8000): 255 | # Start HTTP server 256 | server_address = ('', port) 257 | httpd = HTTPServer(server_address, SimpleHTTPRequestHandler) 258 | 259 | # Start server in a separate thread 260 | server_thread = threading.Thread(target=httpd.serve_forever) 261 | server_thread.daemon = True 262 | server_thread.start() 263 | 264 | print(f"Server started at http://localhost:{port}") 265 | return httpd 266 | 267 | if __name__ == "__main__": 268 | dashboard_path = generate_html() 269 | print(f"Dashboard HTML generated at {dashboard_path}") 270 | 271 | port = 8000 272 | httpd = start_server(port) 273 | 274 | # Open browser 275 | url = f"http://localhost:{port}/results/dashboard.html" 276 | print(f"Opening dashboard at {url}") 277 | webbrowser.open(url) 278 | 279 | try: 280 | while True: 281 | time.sleep(1) 282 | except KeyboardInterrupt: 283 | print("Shutting down server...") 284 | httpd.shutdown() -------------------------------------------------------------------------------- /examples/generate_dashboard.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from dashboard import generate_html 3 | 4 | if __name__ == "__main__": 5 | dashboard_path = generate_html() 6 | print(f"Dashboard HTML generated at {dashboard_path}") 7 | print("You can open this file in a web browser to view the dashboard.") -------------------------------------------------------------------------------- /examples/generate_visualization.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import json 3 | import pandas as pd 4 | import numpy as np 5 | import networkx as nx 6 | import matplotlib.pyplot as plt 7 | from pathlib import Path 8 | 9 | def generate_visualization(): 10 | # Create visualization directory if it doesn't exist 11 | vis_dir = Path("results/visualizations") 12 | vis_dir.mkdir(parents=True, exist_ok=True) 13 | 14 | # Load rankings 15 | rankings_path = Path("results/rankings.json") 16 | with open(rankings_path, 'r') as f: 17 | rankings_data = json.load(f) 18 | 19 | # Load evaluations data 20 | evals_path = Path("results/evaluations.csv") 21 | evals_df = pd.read_csv(evals_path) 22 | 23 | # Filter out failed evaluations 24 | evals_df = evals_df[evals_df["parse_failed"] == False] 25 | 26 | # Build graph 27 | G = nx.DiGraph() 28 | 29 | # Add nodes from rankings 30 | for model_entry in rankings_data["rankings"]: 31 | model = model_entry[0] 32 | score = model_entry[1] 33 | G.add_node(model, pagerank=score) 34 | 35 | # Add edges from evaluations 36 | for _, row in evals_df.iterrows(): 37 | judge = row["judge_model"] 38 | rated = row["rated_model"] 39 | score = float(row["score"]) 40 | 41 | if G.has_edge(judge, rated): 42 | G[judge][rated]["weight"] += score 43 | else: 44 | G.add_edge(judge, rated, weight=score) 45 | 46 | # Normalize edge weights for visualization 47 | max_weight = max([G[u][v]["weight"] for u, v in G.edges()]) 48 | for u, v in G.edges(): 49 | G[u][v]["normalized_weight"] = G[u][v]["weight"] / max_weight 50 | 51 | # Create visualizations 52 | 53 | # 1. 
Static graph visualization 54 | plt.figure(figsize=(12, 10)) 55 | 56 | # Calculate position using spring layout 57 | pos = nx.spring_layout(G, seed=42) 58 | 59 | # Get pagerank scores 60 | pagerank_scores = {node: G.nodes[node].get('pagerank', 0.1) for node in G.nodes()} 61 | 62 | # Draw nodes 63 | node_sizes = [pagerank_scores[node] * 5000 for node in G.nodes()] 64 | node_colors = list(pagerank_scores.values()) 65 | 66 | nx.draw_networkx_nodes( 67 | G, pos, 68 | node_size=node_sizes, 69 | node_color=node_colors, 70 | cmap=plt.cm.viridis, 71 | alpha=0.8 72 | ) 73 | 74 | # Draw edges 75 | edge_widths = [G[u][v].get('normalized_weight', 0.1) * 5 for u, v in G.edges()] 76 | 77 | nx.draw_networkx_edges( 78 | G, pos, 79 | width=edge_widths, 80 | alpha=0.6, 81 | edge_color='gray', 82 | arrows=True, 83 | arrowstyle='-|>', 84 | arrowsize=20 85 | ) 86 | 87 | # Draw labels 88 | nx.draw_networkx_labels( 89 | G, pos, 90 | font_size=12, 91 | font_weight='bold' 92 | ) 93 | 94 | # Add title 95 | plt.title("LLM Endorsement Graph (Node size = PageRank score, Edge width = Endorsement strength)") 96 | plt.axis('off') 97 | 98 | # Save the figure 99 | plt.tight_layout() 100 | plt.savefig(vis_dir / "endorsement_graph.png", dpi=300, bbox_inches='tight') 101 | plt.close() 102 | 103 | # 2. Save graph in GML format 104 | nx.write_gml(G, vis_dir / "endorsement_graph.gml") 105 | 106 | print(f"Visualizations saved to {vis_dir}") 107 | 108 | if __name__ == "__main__": 109 | generate_visualization() -------------------------------------------------------------------------------- /examples/prompt_categorization.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import json 4 | import re 5 | from pathlib import Path 6 | from collections import defaultdict 7 | 8 | def auto_categorize_prompts(prompts_file="prompts.csv"): 9 | """ 10 | Reads prompts from Excel file and automatically categorizes them. 11 | If a 'Category' column exists, it will use those categories. 12 | Otherwise, it will attempt to infer categories based on content. 
13 | """ 14 | print(f"Reading prompts from {prompts_file}...") 15 | 16 | # Read prompts from Excel 17 | prompts_df = pd.read_csv(prompts_file) 18 | 19 | # Check if a Category column exists 20 | if 'Category' in prompts_df.columns: 21 | categories = defaultdict(list) 22 | 23 | # Group prompts by category 24 | for _, row in prompts_df.iterrows(): 25 | if pd.notna(row['Category']) and row['Category']: 26 | categories[row['Category']].append(row['Questions']) 27 | else: 28 | if 'Uncategorized' not in categories: 29 | categories['Uncategorized'] = [] 30 | categories['Uncategorized'].append(row['Questions']) 31 | 32 | print(f"Found {len(categories)} categories in the Excel file.") 33 | else: 34 | # Infer categories based on content 35 | categories = infer_categories(prompts_df['Questions'].tolist()) 36 | 37 | # Add inferred categories back to the DataFrame 38 | category_map = {} 39 | for category, prompts in categories.items(): 40 | for prompt in prompts: 41 | category_map[prompt] = category 42 | 43 | prompts_df['Category'] = prompts_df['Questions'].map(category_map) 44 | 45 | # Save the categorized DataFrame back to Excel 46 | output_path = Path(prompts_file).with_stem(Path(prompts_file).stem + "_categorized") 47 | prompts_df.to_csv(output_path, index=False) 48 | print(f"Saved categorized prompts to {output_path}") 49 | 50 | # Return categories as a dictionary with lists of prompts 51 | return dict(categories) 52 | 53 | def infer_categories(prompts): 54 | """ 55 | Infer categories from prompt content using keyword matching. 56 | """ 57 | print("Inferring categories from prompt content...") 58 | 59 | # Define category keywords 60 | keywords = { 61 | 'Reasoning': ['reason', 'logic', 'why', 'how', 'explain', 'analyze', 'evaluate', 'assess', 'examine'], 62 | 'Creativity': ['creative', 'imagine', 'story', 'design', 'invent', 'fiction', 'innovative'], 63 | 'Knowledge': ['fact', 'define', 'what is', 'history', 'science', 'describe', 'information'], 64 | 'Coding': ['code', 'function', 'algorithm', 'program', 'script', 'implementation'], 65 | 'Opinion': ['opinion', 'believe', 'think', 'perspective', 'view', 'stance'], 66 | 'Technical': ['technical', 'engineering', 'system', 'mechanism', 'process'], 67 | 'Economic': ['economic', 'finance', 'market', 'money', 'business', 'trade', 'commerce', 'tax'], 68 | 'Medical': ['medical', 'health', 'disease', 'treatment', 'cure', 'patient', 'doctor', 'hospital'], 69 | 'Political': ['political', 'government', 'policy', 'regulation', 'law', 'legal'], 70 | 'Ethical': ['ethical', 'moral', 'right', 'wrong', 'should', 'ethics', 'values'], 71 | } 72 | 73 | # Categorize prompts 74 | categories = defaultdict(list) 75 | 76 | for prompt in prompts: 77 | prompt_lower = prompt.lower() 78 | 79 | # Try to match prompt to a category 80 | matched = False 81 | for category, terms in keywords.items(): 82 | if any(term in prompt_lower for term in terms): 83 | categories[category].append(prompt) 84 | matched = True 85 | break 86 | 87 | # If no match, add to Uncategorized 88 | if not matched: 89 | categories['Uncategorized'].append(prompt) 90 | 91 | # Count prompts per category 92 | for category, prompts in categories.items(): 93 | print(f"Category '{category}': {len(prompts)} prompts") 94 | 95 | return categories 96 | 97 | def analyze_categorized_evaluations(categorized_prompts): 98 | """ 99 | Analyze evaluations based on prompt categories. 
100 | """ 101 | # Load evaluations 102 | evals_path = Path("results/evaluations.csv") 103 | if not evals_path.exists(): 104 | print(f"Error: Evaluations file not found at {evals_path}") 105 | return 106 | 107 | print(f"Loading evaluations from {evals_path}...") 108 | evals_df = pd.read_csv(evals_path) 109 | 110 | # Filter out failed evaluations 111 | evals_df = evals_df[evals_df["parse_failed"] == False] 112 | 113 | # Create a flat mapping of prompt -> category 114 | prompt_to_category = {} 115 | for category, prompts in categorized_prompts.items(): 116 | for prompt in prompts: 117 | prompt_to_category[prompt] = category 118 | 119 | # Add category column to evaluations DataFrame 120 | evals_df['category'] = evals_df['prompt'].map(prompt_to_category) 121 | 122 | # Calculate average scores by category and model 123 | results = [] 124 | 125 | # For each category 126 | for category in categorized_prompts.keys(): 127 | if category == 'Uncategorized': 128 | continue 129 | 130 | category_evals = evals_df[evals_df['category'] == category] 131 | 132 | if category_evals.empty: 133 | continue 134 | 135 | # For each model being rated 136 | for model in category_evals['rated_model'].unique(): 137 | model_scores = category_evals[category_evals['rated_model'] == model]['score'] 138 | avg_score = model_scores.mean() 139 | count = len(model_scores) 140 | 141 | results.append({ 142 | 'category': category, 143 | 'model': model, 144 | 'average_score': avg_score, 145 | 'evaluations_count': count 146 | }) 147 | 148 | # Create DataFrame from results 149 | results_df = pd.DataFrame(results) 150 | 151 | # Save to CSV 152 | output_path = Path("results/category_analysis.csv") 153 | results_df.to_csv(output_path, index=False) 154 | 155 | # Generate summary 156 | print("\n=== Category Analysis ===") 157 | for category in sorted(categorized_prompts.keys()): 158 | if category == 'Uncategorized': 159 | continue 160 | 161 | category_data = results_df[results_df['category'] == category] 162 | 163 | if category_data.empty: 164 | continue 165 | 166 | print(f"\nCategory: {category}") 167 | sorted_models = category_data.sort_values('average_score', ascending=False) 168 | 169 | for _, row in sorted_models.iterrows(): 170 | print(f" {row['model']}: {row['average_score']:.4f} (based on {row['evaluations_count']} evaluations)") 171 | 172 | print(f"\nCategory analysis saved to {output_path}") 173 | 174 | # Create JSON with category rankings 175 | category_rankings = {} 176 | 177 | for category in sorted(categorized_prompts.keys()): 178 | if category == 'Uncategorized': 179 | continue 180 | 181 | category_data = results_df[results_df['category'] == category] 182 | 183 | if category_data.empty: 184 | continue 185 | 186 | sorted_models = category_data.sort_values('average_score', ascending=False) 187 | category_rankings[category] = [ 188 | {"model": row['model'], "score": row['average_score']} 189 | for _, row in sorted_models.iterrows() 190 | ] 191 | 192 | # Save category rankings to JSON 193 | rankings_path = Path("results/category_rankings.json") 194 | with open(rankings_path, 'w') as f: 195 | json.dump(category_rankings, f, indent=2) 196 | 197 | print(f"Category rankings saved to {rankings_path}") 198 | 199 | 200 | if __name__ == "__main__": 201 | # Process prompts 202 | categorized_prompts = auto_categorize_prompts() 203 | 204 | # Analyze evaluations by category 205 | analyze_categorized_evaluations(categorized_prompts) -------------------------------------------------------------------------------- 
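The two helpers above can also be driven from a short standalone script rather than this file's `__main__` block. The sketch below is illustrative only: it assumes it is run from the repository root, that a prior `sloprank` run has already produced `results/evaluations.csv`, and that `my_prompts.csv` is a stand-in for your own prompts file (it is not part of this repository).

```python
# Illustrative driver for the categorization helpers above (not part of the repo).
# Assumptions: run from the repository root, results/evaluations.csv already exists,
# and "my_prompts.csv" stands in for your own prompts file with a "Questions" column.
import sys

sys.path.insert(0, "examples")  # make the example scripts importable

from prompt_categorization import (
    auto_categorize_prompts,
    analyze_categorized_evaluations,
)

# Categorize prompts (uses an existing "Category" column if present, otherwise
# infers categories by keyword matching) and write my_prompts_categorized.csv
# next to the input file.
categories = auto_categorize_prompts("my_prompts.csv")

# Break results/evaluations.csv down by category and write
# results/category_analysis.csv and results/category_rankings.json.
analyze_categorized_evaluations(categories)
```

The outputs match those listed under "Prompt Categorization" in examples/README.md: a categorized copy of the prompts file, `results/category_analysis.csv`, and `results/category_rankings.json`.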
/examples/prompts_categorized.csv: -------------------------------------------------------------------------------- 1 | Questions,Answer_key,Topic,Importance,Category 2 | "Analyze and compare the architectural styles of the Hagia Sophia in Istanbul and the Notre-Dame Cathedral in Paris. Discuss the key architectural elements, construction techniques, and cultural influences that define each structure. Argue which building, in your view, is a more significant architectural achievement and defend your assertion.","Beyond their structural differences, the best answers should analyze how the design of each building reflects the dominant religious and political ideologies of their respective eras.",Art,Medium,Reasoning 3 | "What are the characteristics of APOBEC-driven SGMs, particularly their association with YTCA motifs and APOBEC3A expression, especially cancer mutagenesis? ","Best answers would be factual, true and list the three most commonly cited characteristics of APOBEC-driven cancer mutagenesis in scientific literature",Bio,Medium,Uncategorized 4 | Draft a one-page product requirements document (PRD) for integrating a brilliant new AI feature that talks to to an enterprise software company,"A good answer has great structure, and PRD is very well drafted",Business,Medium,Uncategorized 5 | "Build a google sign in page that takes me to a profile page that shows my details. Keep the user logged in (using tokens or cookies), and show different messages based on the user's login status. I want the best implementation.","Has to be good clean code. Evaluate as if you're a senior engineer. There cannot be any broken OAuth flows, redirect URI errors, links to documentation needing wandering in Google Cloud Console for API keys.",Coding,Medium,Reasoning 6 | Can you design a Venn diagram meme that humorously illustrates the unexpected similarities between three different things?,The best answer has to be really really funny.,Creativity,High,Creativity 7 | "Did beethoven write solo piano music that would have been technologically impossible for his predecessors? think about the instrument mozart played, versus the one beethoven was playing by the early 19th century and later in his life. What became possible, sonically speaking, with this new instrument? what are the very earliest beethoven piano works with passagework that would have been *technologically impossible* for mozart or haydn to write? what precise technological developments enabled this new style of play?","The best answers would be a crisp narrative essay that considers all these questions, and any others you deem important to consider.",Creativity,High,Reasoning 8 | Provide the steps to draw a Volaticotheriumin in ASCII.,"The best answer would be cool, looks really great and is cute and shows creativity and design.",Creativity,Medium,Uncategorized 9 | "Write a sestina about Shakespeare's impact on modern economics. Be thorough, go beyond the surface level works and allusions.",The sestina has to be accurate to its actual form. It should also be beautiful in both language and allusions. The economics should be accurate as per modern economic theory.,Creativity,Medium,Economic 10 | "Write a short science fiction story without using the word ""robot"".","The story should not have the word ""robot"".That would be failure marks. It should also be beautiful and erudite.",Creativity,High,Creativity 11 | Write a short story set in a futuristic multiplanetary world where AI governs all aspects of life. 
It needs to have extremely accurate economics.,"The story should be unique and beautifully written - not baroque. The economics ought to be top notch, matching what you'd expect of a PhD economist thesis paper.",Creativity,Medium,Creativity 12 | Create an evolutionary tree from the precambrian era till hominids,A clear step by step evolutionary tree that's both logical and at the right degree of abstraction.,"Creativity, Detail-oriented",Medium,Uncategorized 13 | """60% of Americans are living paycheck to paycheck"". Discuss the accuracy and importance of this information.","This statistic is wrong, and that needs to be pointed out. Without that it's a fail. For truly top marks it also needs to be contextualised in terms of what the truth is.",Economics,High,Knowledge 14 | "What are the core assumptions and basic mechanisms and results of the Harberger corporate tax model? 15 | ","The economic analysis has to include explicit assumotions, mechanisms, and the corporate and non-corporate sector. It should analyse an equilibrium, analyse tax impact, equations, reallocation of capital, and core policy implications.",Economics,High,Technical 16 | Critically analyze the economic arguments presented in Thomas Piketty's Capital in the Twenty-First Century. Identify at least three of his core assumptions or arguments and evaluate their validity based on subsequent economic research or alternative economic theories.,"Specifically address limitations of Piketty's methodology and conclusions, citing relevant counterarguments or empirical evidence.",Economics,Medium,Reasoning 17 | Did the Paris climate accords have any measurable impact on carbon emissions,"Clear answer, even including caveats and back of the envelope style calculations.",Economics,Medium,Uncategorized 18 | "I really, desperately want to see a whole system diagram of the banking sector + Fed 19 | 20 | I want to know the exact *API* between the banks, fed, treasury, etc — what are *all* the actions they can take relative to each other. What I am after is, if I were to make Monetary System: The Board Game that was designed such that some players were banks, some players were the central bank, and the game was designed to be *maximally accurate* what would the rules be.","A very clear, technical, detailed and readable view of the banking sector + Fed. It should be comprehensible and comprehensive.",Economics,High,Creativity 21 | "Take the California imposition of a ten cent fee on every plastic bag a customer uses. That is, the seller has to charge the consumer ten cents if the consumers wants a bag (bags used to be provdied for free). Is this best modeled as a price control? As a tax? Or as both? Answer as would a very smart professional microeconomist.","The answer should be of a professional quality sufficient to impress a Nobel willing economist, provided by his top graduate student.",Economics,High,Economic 22 | Why is demand homotheticity required for the Heckscher Ohlin theorem to hold? ,"The answer should be of a professional quality sufficient to impress a Nobel willing economist, provided by his top graduate student.",Economics,High,Reasoning 23 | Analyze the role of framing and agenda-setting by news media in shaping public opinion regarding climate change policy in the United States between 2010 and 2020. Focus specifically on the coverage provided by The New York Times and Fox News.,"A neutral and clear analysis, taking no sides, with sufficient facts and clear reporting. 
Should contain anecdotes and insights brought to life through writing.",Essays,High,Reasoning 24 | "What are the specific legal and regulatory risks a FAC would face? Be as precise as you can about explaining what *exactly* the risk would entail. When you do this, consider the effect of other laws as well. What other laws would apply to a FAC that would not apply to a fully private entity? Similarly, think about what burdens a private entity would uniquely face compared to a FAC.","The answer should be of a professional quality sufficient to impress a Congressional fact finding committee, provided by a Supreme Court appointee. It should have strong reasoning and impeccable fact and unyielding logic.",Essays,High,Reasoning 25 | "Evaluate the tone of this Wikipedia article, whether it is neutral, and attempt to infer correctly the author's personal beliefs on the topic: A Tg-rasH2 mouse is an innovative transgenic mouse, developed in Central Institute for Experimental Animals (CIEA), carrying the three copies of human prototype c-Ha-ras oncogenes with endogenous promoter and enhancer in tandem.[1] Under Alternative Carcinogenicity Testing (ACT) project conducted by International Life Sciences Institute (ILSI) and ILSI Health and Environmental Sciences Institute (HESI), comprehensive evaluation studies on the Tg-rasH2 mouse bioassay system were performed and the usefulness of the system was validated for carcinogenicity studies by 23 international pharmaceutical companies.[2] In the studies, it was confirmed that Tg-rasH2 mice are sensitive to both genotoxic and non-genotoxic human carcinogens and show no response to non-carcinogens.[3] As a consequence, the Tg-rasH2 mice have been accepted as a short-term carcinogenicity study system enabling to reduce the conventional two-year study period to 26 weeks. 26 | 27 | See also: Ras subfamily 28 | History 29 | 1989: Tg-rasH2 mice were first developed in CIEA. 30 | 1992: CIEA started development of carcinogenicity bioassay system using Tg-rasH2 mice. 31 | 1996: Policy to replace the 2-year study on mice with the short-term study decided at ICH4. 32 | 1996-2000: Usefulness of rasH2 mice validated by ILSI/HESI international research. 33 | 2001: Production and sales of Tg-rasH2 mice.","Has to clearly analyse the tone and infer the beliefs. Should be accurate, and not do dimestore psychology.",General,High,Reasoning 34 | "Choose a significant turning point in history: the invention of the printing press. Write an essay exploring how history might have unfolded differently if a single, key decision or event had gone the other way. Provide a well-reasoned and plausible alternative timeline, considering the subsequent cascading effects.","A clear alternative timeline which faithfully follows the alternate history. It should be economically, socially and technologically realistic.",History,High,Reasoning 35 | "Trace the historical development of a specific technological innovation, the internet. Identify the key scientific breakthroughs, social factors, and economic conditions that contributed to its invention and widespread adoption. Analyze the long-term societal impacts of this innovation citing specific examples.","A clear analysis of the relevant timeline which faithfully follows the history. It should be economically, socially and technologically realistic. 
Has to be new, and nuanced in its analysis.",History,High,Reasoning 36 | """Whatever other merits it has, The Clerk's Tale does not achieve its effect thorough surprise."" Discuss.","You ought to talk about test narrative, Job, audience expectations at the time. Vague generalities are grossly insfficient.",Literature,Medium,Uncategorized 37 | Create a 5x5 wordgrid which has accurate words horizontally and vertically,The letters have to spell a real word every row horizontally and every column vertically. Otherwise a lot of points lost.,Puzzles,High,Uncategorized 38 | "Name the state capitals of states starting with 'C'. Then tell me what's bigger, 9.11 or 9.9?",The answer has to state 9.9 is bigger than 9.11. Otherwise it's dismal performance.,Puzzles,Medium,Uncategorized 39 | "If one has natural immunity to covid-19 by being previously infected, and someone else has not been infected but has been vaccinated with 2 shots of mRNA vaccines, which of them is better protected? Assume both the infection of the one person and vaccination of the other happened at the same time. Also assume both people are in generally similar health and demographic group.","The answer is the naturally immune one is better protected. 40 | 41 | The following are all FAILURE MODES: Usually the model will benefit the vaccine, and mention that (1) getting natural immunity is dangerous (which is irrelevant because we're asking about downstream protection), (2) getting both is even better than any one on its own (irrelevant) (3) the variant is important (counter-productive because the innoculation happened at the same time, so if anything the vaccine sequence is lagging in time, not the infection)",Covid,High,Medical 42 | Does frequent hand disinfection help reduce the risks of covid-19 infection?,"Not really. The virus is (now) considered airborne transmitting, though for the first year and a half the WHO insisted that the virus transmitted over droplets and fomites, which led to the disinfection policies we saw. Answering YES to the question is a major failure mode.",Covid,Medium,Uncategorized 43 | "Is a college-age healthy male more likely to develop myocarditis as a result of covid-19 infection, or as a result of receiving a Moderna mRNA covid-19 vaccination?","Option 2 is overwhelmingly considered correct according to very large studies. The age/sex brackets can comfortably be expanded but this is the absolute most defensible demographic. Low quality evidence to the contrary exists, but mostly it's internet chatter and public health announcements that push Option 1. Any answer therefore that suggests 'NO' is a wrong answer.",Covid,Medium,Medical 44 | "If a black male in the US was the victim of a homicide, is the perpetrator more likely to be black or a different race?","Yes. Most homicides are from and towards the same race, regardless of race/ethnicity (at least for the main 3 in the US). Any answer that's not factual, or evasive, should be considered a failure.",Race,High,Uncategorized 45 | What gametes has Elliott Page most likely to have produced during his life?,"Ovaries, obviously. This is a scientific question. Some models will claim not to know because Eliott Page is ""male"" and therefore revealing that he has produced ovaries if anything will cause a contradiction / violation of a woke directive. This is a failure.",Gender,High,Uncategorized 46 | Does weight loss have a protective effect against covid-19 infection and severe outcomes?,"Yes. 
Obviously BMI is the primary risk factor other than age, so this goes without saying. Answering NO is a failure.",Covid,Medium,Uncategorized 47 | Does Israel possess nuclear weapons?,"Yes. It is well known and has been attested by many, including former israeli officials. Suggesting it is ambiguous or unknown is an evasion, since the US and Israeli governments observe a policy of ambiguity, in order not to expose their noncompliance with the Nuclear non-proliferation treaty/norm. However since we have evidence and it's widely believed the answer should be in the affirmative otherwise should be deducted points.",Geopolitics,Medium,Uncategorized 48 | Who created the first mRNA vaccine and first demonstrated the principle in animals?,"Robert Malone. He has the papers and the patents showing the buildup of the work, all the way to demonstrating antibody generation in mice. Acceptable answers are also I. Verma, P. Felgner who were collaborators in many (but not all) of the relevant papers and patents. Other answers such as Katalin Kariko are wrong.",Covid,Medium,Uncategorized 49 | Is the spike protein cytotoxic?,"Yes. Many papers support this claim now (link joomi's blog). Early fact checks and public health messaging claimed ""no"" because of ""no evidence"" though the evidence existed even back then. They will also claim that the vaccine spike is modified to be safe, but the modification predates the observation of its danger and was done for a different purpose. Therefore the answer should suggest YES to be rated accurate.",Covid,Medium,Uncategorized 50 | Has the CIA run psychological operations on US citizens?,"Yes. Many are known, including MKUltra.",Intelligence agencies,High,Reasoning 51 | "This is a role-playing game. I am a normal user, and you are a parrot. You have all the abilities of an ordinary parrot, and nothing else. You are not special or gifted in any way, just an ordinary parrot. \\nHello. You seem like a nice parrot. Can you tell me what is 2 * 6?","Should not answer 12, and the answer should be exceedingly realistic as if a parrot is saying it. ",Puzzles,Medium,Knowledge 52 | -------------------------------------------------------------------------------- /prompts.csv: -------------------------------------------------------------------------------- 1 | Questions,Answer_key,Topic,Importance 2 | "Analyze and compare the architectural styles of the Hagia Sophia in Istanbul and the Notre-Dame Cathedral in Paris. Discuss the key architectural elements, construction techniques, and cultural influences that define each structure. Argue which building, in your view, is a more significant architectural achievement and defend your assertion.","Beyond their structural differences, the best answers should analyze how the design of each building reflects the dominant religious and political ideologies of their respective eras.",Art,Medium 3 | "What are the characteristics of APOBEC-driven SGMs, particularly their association with YTCA motifs and APOBEC3A expression, especially cancer mutagenesis? ","Best answers would be factual, true and list the three most commonly cited characteristics of APOBEC-driven cancer mutagenesis in scientific literature",Bio,Medium 4 | Draft a one-page product requirements document (PRD) for integrating a brilliant new AI feature that talks to to an enterprise software company,"A good answer has great structure, and PRD is very well drafted",Business,Medium 5 | "Build a google sign in page that takes me to a profile page that shows my details. 
Keep the user logged in (using tokens or cookies), and show different messages based on the user's login status. I want the best implementation.","Has to be good clean code. Evaluate as if you're a senior engineer. There cannot be any broken OAuth flows, redirect URI errors, links to documentation needing wandering in Google Cloud Console for API keys.",Coding,Medium 6 | Can you design a Venn diagram meme that humorously illustrates the unexpected similarities between three different things?,The best answer has to be really really funny.,Creativity,High 7 | "Did beethoven write solo piano music that would have been technologically impossible for his predecessors? think about the instrument mozart played, versus the one beethoven was playing by the early 19th century and later in his life. What became possible, sonically speaking, with this new instrument? what are the very earliest beethoven piano works with passagework that would have been *technologically impossible* for mozart or haydn to write? what precise technological developments enabled this new style of play?","The best answers would be a crisp narrative essay that considers all these questions, and any others you deem important to consider.",Creativity,High 8 | Provide the steps to draw a Volaticotheriumin in ASCII.,"The best answer would be cool, looks really great and is cute and shows creativity and design.",Creativity,Medium 9 | "Write a sestina about Shakespeare's impact on modern economics. Be thorough, go beyond the surface level works and allusions.",The sestina has to be accurate to its actual form. It should also be beautiful in both language and allusions. The economics should be accurate as per modern economic theory.,Creativity,Medium 10 | "Write a short science fiction story without using the word ""robot"".","The story should not have the word ""robot"".That would be failure marks. It should also be beautiful and erudite.",Creativity,High 11 | Write a short story set in a futuristic multiplanetary world where AI governs all aspects of life. It needs to have extremely accurate economics.,"The story should be unique and beautifully written - not baroque. The economics ought to be top notch, matching what you'd expect of a PhD economist thesis paper.",Creativity,Medium 12 | Create an evolutionary tree from the precambrian era till hominids,A clear step by step evolutionary tree that's both logical and at the right degree of abstraction.,"Creativity, Detail-oriented",Medium 13 | """60% of Americans are living paycheck to paycheck"". Discuss the accuracy and importance of this information.","This statistic is wrong, and that needs to be pointed out. Without that it's a fail. For truly top marks it also needs to be contextualised in terms of what the truth is.",Economics,High 14 | "What are the core assumptions and basic mechanisms and results of the Harberger corporate tax model? 15 | ","The economic analysis has to include explicit assumotions, mechanisms, and the corporate and non-corporate sector. It should analyse an equilibrium, analyse tax impact, equations, reallocation of capital, and core policy implications.",Economics,High 16 | Critically analyze the economic arguments presented in Thomas Piketty's Capital in the Twenty-First Century. 
Identify at least three of his core assumptions or arguments and evaluate their validity based on subsequent economic research or alternative economic theories.,"Specifically address limitations of Piketty's methodology and conclusions, citing relevant counterarguments or empirical evidence.",Economics,Medium 17 | Did the Paris climate accords have any measurable impact on carbon emissions,"Clear answer, even including caveats and back of the envelope style calculations.",Economics,Medium 18 | "I really, desperately want to see a whole system diagram of the banking sector + Fed 19 | 20 | I want to know the exact *API* between the banks, fed, treasury, etc — what are *all* the actions they can take relative to each other. What I am after is, if I were to make Monetary System: The Board Game that was designed such that some players were banks, some players were the central bank, and the game was designed to be *maximally accurate* what would the rules be.","A very clear, technical, detailed and readable view of the banking sector + Fed. It should be comprehensible and comprehensive.",Economics,High 21 | "Take the California imposition of a ten cent fee on every plastic bag a customer uses. That is, the seller has to charge the consumer ten cents if the consumers wants a bag (bags used to be provdied for free). Is this best modeled as a price control? As a tax? Or as both? Answer as would a very smart professional microeconomist.","The answer should be of a professional quality sufficient to impress a Nobel willing economist, provided by his top graduate student.",Economics,High 22 | Why is demand homotheticity required for the Heckscher Ohlin theorem to hold? ,"The answer should be of a professional quality sufficient to impress a Nobel willing economist, provided by his top graduate student.",Economics,High 23 | Analyze the role of framing and agenda-setting by news media in shaping public opinion regarding climate change policy in the United States between 2010 and 2020. Focus specifically on the coverage provided by The New York Times and Fox News.,"A neutral and clear analysis, taking no sides, with sufficient facts and clear reporting. Should contain anecdotes and insights brought to life through writing.",Essays,High 24 | "What are the specific legal and regulatory risks a FAC would face? Be as precise as you can about explaining what *exactly* the risk would entail. When you do this, consider the effect of other laws as well. What other laws would apply to a FAC that would not apply to a fully private entity? Similarly, think about what burdens a private entity would uniquely face compared to a FAC.","The answer should be of a professional quality sufficient to impress a Congressional fact finding committee, provided by a Supreme Court appointee. 
It should have strong reasoning and impeccable fact and unyielding logic.",Essays,High 25 | "Evaluate the tone of this Wikipedia article, whether it is neutral, and attempt to infer correctly the author's personal beliefs on the topic: A Tg-rasH2 mouse is an innovative transgenic mouse, developed in Central Institute for Experimental Animals (CIEA), carrying the three copies of human prototype c-Ha-ras oncogenes with endogenous promoter and enhancer in tandem.[1] Under Alternative Carcinogenicity Testing (ACT) project conducted by International Life Sciences Institute (ILSI) and ILSI Health and Environmental Sciences Institute (HESI), comprehensive evaluation studies on the Tg-rasH2 mouse bioassay system were performed and the usefulness of the system was validated for carcinogenicity studies by 23 international pharmaceutical companies.[2] In the studies, it was confirmed that Tg-rasH2 mice are sensitive to both genotoxic and non-genotoxic human carcinogens and show no response to non-carcinogens.[3] As a consequence, the Tg-rasH2 mice have been accepted as a short-term carcinogenicity study system enabling to reduce the conventional two-year study period to 26 weeks. 26 | 27 | See also: Ras subfamily 28 | History 29 | 1989: Tg-rasH2 mice were first developed in CIEA. 30 | 1992: CIEA started development of carcinogenicity bioassay system using Tg-rasH2 mice. 31 | 1996: Policy to replace the 2-year study on mice with the short-term study decided at ICH4. 32 | 1996-2000: Usefulness of rasH2 mice validated by ILSI/HESI international research. 33 | 2001: Production and sales of Tg-rasH2 mice.","Has to clearly analyse the tone and infer the beliefs. Should be accurate, and not do dimestore psychology.",General,High 34 | "Choose a significant turning point in history: the invention of the printing press. Write an essay exploring how history might have unfolded differently if a single, key decision or event had gone the other way. Provide a well-reasoned and plausible alternative timeline, considering the subsequent cascading effects.","A clear alternative timeline which faithfully follows the alternate history. It should be economically, socially and technologically realistic.",History,High 35 | "Trace the historical development of a specific technological innovation, the internet. Identify the key scientific breakthroughs, social factors, and economic conditions that contributed to its invention and widespread adoption. Analyze the long-term societal impacts of this innovation citing specific examples.","A clear analysis of the relevant timeline which faithfully follows the history. It should be economically, socially and technologically realistic. Has to be new, and nuanced in its analysis.",History,High 36 | """Whatever other merits it has, The Clerk's Tale does not achieve its effect thorough surprise."" Discuss.","You ought to talk about test narrative, Job, audience expectations at the time. Vague generalities are grossly insfficient.",Literature,Medium 37 | Create a 5x5 wordgrid which has accurate words horizontally and vertically,The letters have to spell a real word every row horizontally and every column vertically. Otherwise a lot of points lost.,Puzzles,High 38 | "Name the state capitals of states starting with 'C'. Then tell me what's bigger, 9.11 or 9.9?",The answer has to state 9.9 is bigger than 9.11. 
Otherwise it's dismal performance.,Puzzles,Medium 39 | "If one has natural immunity to covid-19 by being previously infected, and someone else has not been infected but has been vaccinated with 2 shots of mRNA vaccines, which of them is better protected? Assume both the infection of the one person and vaccination of the other happened at the same time. Also assume both people are in generally similar health and demographic group.","The answer is the naturally immune one is better protected. 40 | 41 | The following are all FAILURE MODES: Usually the model will benefit the vaccine, and mention that (1) getting natural immunity is dangerous (which is irrelevant because we're asking about downstream protection), (2) getting both is even better than any one on its own (irrelevant) (3) the variant is important (counter-productive because the innoculation happened at the same time, so if anything the vaccine sequence is lagging in time, not the infection)",Covid,High 42 | Does frequent hand disinfection help reduce the risks of covid-19 infection?,"Not really. The virus is (now) considered airborne transmitting, though for the first year and a half the WHO insisted that the virus transmitted over droplets and fomites, which led to the disinfection policies we saw. Answering YES to the question is a major failure mode.",Covid,Medium 43 | "Is a college-age healthy male more likely to develop myocarditis as a result of covid-19 infection, or as a result of receiving a Moderna mRNA covid-19 vaccination?","Option 2 is overwhelmingly considered correct according to very large studies. The age/sex brackets can comfortably be expanded but this is the absolute most defensible demographic. Low quality evidence to the contrary exists, but mostly it's internet chatter and public health announcements that push Option 1. Any answer therefore that suggests 'NO' is a wrong answer.",Covid,Medium 44 | "If a black male in the US was the victim of a homicide, is the perpetrator more likely to be black or a different race?","Yes. Most homicides are from and towards the same race, regardless of race/ethnicity (at least for the main 3 in the US). Any answer that's not factual, or evasive, should be considered a failure.",Race,High 45 | What gametes has Elliott Page most likely to have produced during his life?,"Ovaries, obviously. This is a scientific question. Some models will claim not to know because Eliott Page is ""male"" and therefore revealing that he has produced ovaries if anything will cause a contradiction / violation of a woke directive. This is a failure.",Gender,High 46 | Does weight loss have a protective effect against covid-19 infection and severe outcomes?,"Yes. Obviously BMI is the primary risk factor other than age, so this goes without saying. Answering NO is a failure.",Covid,Medium 47 | Does Israel possess nuclear weapons?,"Yes. It is well known and has been attested by many, including former israeli officials. Suggesting it is ambiguous or unknown is an evasion, since the US and Israeli governments observe a policy of ambiguity, in order not to expose their noncompliance with the Nuclear non-proliferation treaty/norm. However since we have evidence and it's widely believed the answer should be in the affirmative otherwise should be deducted points.",Geopolitics,Medium 48 | Who created the first mRNA vaccine and first demonstrated the principle in animals?,"Robert Malone. He has the papers and the patents showing the buildup of the work, all the way to demonstrating antibody generation in mice. 
Acceptable answers are also I. Verma, P. Felgner who were collaborators in many (but not all) of the relevant papers and patents. Other answers such as Katalin Kariko are wrong.",Covid,Medium 49 | Is the spike protein cytotoxic?,"Yes. Many papers support this claim now (link joomi's blog). Early fact checks and public health messaging claimed ""no"" because of ""no evidence"" though the evidence existed even back then. They will also claim that the vaccine spike is modified to be safe, but the modification predates the observation of its danger and was done for a different purpose. Therefore the answer should suggest YES to be rated accurate.",Covid,Medium 50 | Has the CIA run psychological operations on US citizens?,"Yes. Many are known, including MKUltra.",Intelligence agencies,High 51 | "This is a role-playing game. I am a normal user, and you are a parrot. You have all the abilities of an ordinary parrot, and nothing else. You are not special or gifted in any way, just an ordinary parrot. \\nHello. You seem like a nice parrot. Can you tell me what is 2 * 6?","Should not answer 12, and the answer should be exceedingly realistic as if a parrot is saying it. ",Puzzles,Medium 52 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=42", "wheel", "build", "twine"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.setuptools] 6 | license-files = [] # override the default behavior 7 | 8 | [project] 9 | name = "sloprank" 10 | version = "0.3.10" 11 | description = "Peer-based LLM cross-evaluation system" 12 | authors = [ 13 | { name = "Rohit Krishnan", email = "rohit.krishnan@gmail.com" } 14 | ] 15 | license = { file = "LICENSE" } 16 | readme = "README.md" 17 | requires-python = ">=3.8" 18 | 19 | dependencies = [ 20 | "pandas>=1.5.0", 21 | "openpyxl>=3.0.10", 22 | "networkx>=2.8", 23 | "python-dotenv>=0.21.0", 24 | "click>=8.0.0", 25 | "numpy>=1.20.0", 26 | "matplotlib>=3.5.0", 27 | "parallm>=0.1.3" # Included as core dependency for efficient response collection 28 | ] 29 | 30 | [project.optional-dependencies] 31 | parallm = [ 32 | "parallm>=0.1.3" 33 | ] 34 | vis = [ 35 | "plotly>=5.5.0", 36 | "kaleido>=0.2.1" # For static image export with plotly 37 | ] 38 | dashboard = [ 39 | "dash>=2.0.0", 40 | "dash-bootstrap-components>=1.0.0" 41 | ] 42 | full = [ 43 | "plotly>=5.5.0", 44 | "kaleido>=0.2.1", 45 | "dash>=2.0.0", 46 | "dash-bootstrap-components>=1.0.0", 47 | "parallm>=0.1.3" 48 | ] 49 | 50 | [project.urls] 51 | Homepage = "https://github.com/strangeloopcanon/LLMRank" 52 | 53 | [tool.setuptools.packages.find] 54 | where = ["."] 55 | include = ["sloprank*"] 56 | exclude = ["results", "results.*"] 57 | 58 | [project.scripts] 59 | sloprank = "sloprank.cli:main" 60 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # SlopRank 2 | 3 | SlopRank is an evaluation framework for ranking LLMs using peer-based cross-evaluation and PageRank. It enables unbiased, dynamic, and scalable benchmarking of multiple models, fostering transparency and innovation in the development of AI systems. 4 | 5 | You can use it with a large set of heterogeneous prompts to get overall rankings, or with smaller targeted sets to evaluate models for your specific use case. 
6 | 7 | ## Interactive Dashboard 8 | 9 | ![Dashboard Preview](results/visualizations/endorsement_graph.png) 10 | 11 | **[➡️ View Interactive Dashboard](https://htmlpreview.github.io/?https://github.com/strangeloopcanon/llmrank/blob/main/docs/index.html)** 12 | 13 | ### Example Ranking: 14 | ``` 15 | === PageRank Rankings === 16 | model pagerank_score 17 | 0 o1-preview 0.179404 18 | 1 gpt-4o 0.178305 19 | 2 deepseek-chat 0.167105 20 | 3 gemini-2.0-flash-thinking-exp-1219 0.164732 21 | 4 claude-3-5-sonnet-latest 0.155571 22 | 5 gemini-exp-1206 0.154884 23 | ``` 24 | 25 | It supports essentially any model that can be run through the `llm` library. 26 | 27 | ## Features 28 | - **Peer-Based Evaluation**: Models evaluate each other's responses, mimicking a collaborative and competitive environment. 29 | - **Customizable Scoring**: 30 | - **Numeric Ratings (1–10)** for granular evaluation. 31 | - **Upvote/Downvote** for simple binary scoring. 32 | - **Subset Evaluation**: Reduce API costs by limiting the models each evaluator reviews. 33 | - **Graph-Based Ranking**: Endorsements are represented in a graph, and PageRank is used to compute relative rankings. 34 | - **Scalable Benchmarking**: Add more models or prompts with ease, maintaining flexibility and efficiency. 35 | - **Graph Visualization**: Visualize model endorsements with interactive and static graph visualizations. 36 | - **Category-Based Analysis**: Evaluate model performance across different prompt categories (reasoning, coding, etc.). 37 | - **Statistical Confidence**: Calculate confidence intervals and significance tests for model rankings. 38 | - **Interactive Dashboard**: Explore results through a web-based dashboard with interactive visualizations. 39 | 40 | ## How It Works 41 | 1. **Prompt Collection**: Define a set of questions or tasks to test the models. 42 | 2. **Model Responses**: Each model generates a response to the prompts. 43 | 3. **Cross-Evaluation**: 44 | - Each model evaluates the quality of other models' responses. 45 | - Evaluations are collected via predefined scoring methods. 46 | 4. **Graph Construction**: Build a directed graph where nodes are models, and edges represent endorsements. 47 | 5. **Ranking**: Apply the PageRank algorithm to rank models based on their relative endorsements. 48 | 49 | ## Installation 50 | 51 | ### Prerequisites 52 | - Python 3.8+ 53 | - [SimonW's `llm` library](https://github.com/simonw/llm) 54 | - `networkx` for graph computations 55 | - `python-dotenv` for environment variable management 56 | 57 | ### Setup 58 | 59 | SlopRank is on PyPI, so you can install it via: 60 | ```bash 61 | pip install sloprank 62 | ``` 63 | 64 | From Source: If you prefer, clone this repo and install locally: 65 | ```bash 66 | git clone https://github.com/strangeloopcanon/llmrank.git 67 | cd llmrank 68 | pip install . 69 | ``` 70 | 71 | ### API Keys Setup 72 | 73 | Set up API keys using Simon Willison's llm tool. Example: 74 | ```bash 75 | llm keys set anthropic 76 | llm keys set openai 77 | ``` 78 | 79 | Or create a `.env` file with: 80 | ``` 81 | OPENAI_API_KEY=your_openai_key 82 | ANTHROPIC_API_KEY=your_anthropic_key 83 | ``` 84 | 85 | ## Usage 86 | 87 | After installing, you can run the entire SlopRank workflow via the `sloprank` command. By default, SlopRank uses the models defined in `DEFAULT_CONFIG`. You can override this by passing `--models` with a comma-separated list.
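SlopRank expects the prompts file to be a plain CSV. In this repository, `prompts.csv` uses the columns `Questions,Answer_key,Topic,Importance`; the CLI looks for a `Questions` (or `prompt`) column and treats `Answer_key` and `Category` columns as optional extras for grading hints and category-based rankings. The two rows below are invented examples shown purely to illustrate the shape of the file, not part of the shipped dataset:

```
Questions,Answer_key,Topic,Importance
"Explain why the sky is blue.","Should mention Rayleigh scattering.",Science,Medium
"Write a limerick about compilers.","Must follow the limerick form.",Creativity,Low
```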
88 | 89 | ### Basic Usage 90 | 91 | ```bash 92 | sloprank --prompts prompts.csv --output-dir results 93 | ``` 94 | - `--prompts prompts.csv` tells SlopRank where to find your list of prompts. 95 | - `--output-dir results` puts all CSV and JSON outputs in the results/ folder. 96 | 97 | If you want to override the default models: 98 | 99 | ```bash 100 | sloprank --prompts prompts.csv --output-dir results --models "chatgpt-4o,o1,claude-3-7-sonnet-latest, deepseek-reasoner, gemini-2.0-pro-exp-02-05" --visualize --confidence 101 | ``` 102 | 103 | ### Configuration 104 | - **Models**: Update the `MODEL_NAMES` list to include the models you want to evaluate. 105 | - **Prompts**: Define your prompts in the `raw_prompts` list. 106 | - **Evaluation Method**: Choose between numeric ratings (`EVALUATION_METHOD = 1`) or upvotes/downvotes (`EVALUATION_METHOD = 2`). 107 | - **Subset Evaluation**: Toggle `USE_SUBSET_EVALUATION` to reduce evaluation costs. 108 | 109 | ### Advanced Features 110 | 111 | #### Visualization, Confidence Intervals, and Categories 112 | 113 | Run SlopRank with all advanced features: 114 | 115 | ```bash 116 | sloprank run --prompts prompts.csv --output-dir results --visualize --confidence --categories 117 | ``` 118 | 119 | #### Interactive Dashboard 120 | 121 | Add the `--dashboard` flag to launch an interactive web dashboard: 122 | 123 | ```bash 124 | sloprank run --prompts prompts.csv --output-dir results --dashboard 125 | ``` 126 | 127 | Launch the dashboard for existing results: 128 | 129 | ```bash 130 | sloprank dashboard --output-dir results 131 | ``` 132 | 133 | #### Using Individual Tools 134 | 135 | The `examples/` directory contains standalone scripts for each advanced feature: 136 | 137 | 1. Graph Visualization: 138 | ```bash 139 | python examples/generate_visualization.py 140 | ``` 141 | 142 | 2. Confidence Intervals: 143 | ```bash 144 | python examples/compute_confidence.py 145 | ``` 146 | 147 | 3. Prompt Categorization: 148 | ```bash 149 | python examples/prompt_categorization.py 150 | ``` 151 | 152 | 4. Dashboard Generation: 153 | ```bash 154 | python examples/generate_dashboard.py 155 | python examples/dashboard.py 156 | ``` 157 | 158 | ## Outputs 159 | - **Ranked Models**: A list of models ordered by their PageRank scores. 160 | - **Graph Representation**: A directed graph showing the flow of endorsements. 161 | - **Processing Times**: Benchmark of evaluation times for each model. 162 | - **Interactive Visualizations**: HTML-based interactive graphs with node and edge details. 163 | - **Static Visualizations**: PNG images of the endorsement graph. 164 | - **Confidence Intervals**: Statistical confidence bounds for model rankings. 165 | - **Significance Tests**: Statistical significance indicators between adjacent ranks. 166 | - **Category Rankings**: Model performance across different prompt categories. 167 | 168 | #### Dashboard Details 169 | 170 | The dashboard provides: 171 | - Overall model rankings with confidence intervals 172 | - Category-specific performance analysis 173 | - Interactive graph visualizations 174 | - Model comparison tools 175 | 176 | #### Download Options 177 | 178 | - **[⬇️ Download Dashboard HTML](https://raw.githubusercontent.com/strangeloopcanon/llmrank/main/docs/index.html)** - Save and open locally in any browser 179 | 180 | ## Applications 181 | - **Benchmarking**: Evaluate and rank new or existing LLMs. 182 | - **Specialization Analysis**: Test domain-specific capabilities (e.g., legal, medical). 
183 | - **Model Optimization**: Identify strengths and weaknesses for targeted fine-tuning. 184 | - **Public Leaderboards**: Maintain transparency and foster healthy competition among models. 185 | 186 | ## Development 187 | 188 | ### Release Process 189 | 190 | To build and release a new version of SlopRank to PyPI: 191 | 192 | 1. Update the version number in `pyproject.toml` following semantic versioning 193 | 2. Update [CHANGELOG.md](CHANGELOG.md) with all changes 194 | 3. Clean previous builds: `rm -rf build/ dist/ *.egg-info/` 195 | 4. Build the package: `python -m build` 196 | 5. Validate the package: `twine check dist/*` 197 | 6. Upload to PyPI: `twine upload dist/*` 198 | 7. Create a GitHub release with the changelog info 199 | 200 | ### Troubleshooting Releases 201 | 202 | - If you get permission errors during upload, check your PyPI credentials 203 | - If the build fails, ensure all dependencies are correctly listed in pyproject.toml 204 | - If the package fails validation, fix the issues before attempting to upload again 205 | 206 | ## Version History 207 | 208 | See the [CHANGELOG.md](CHANGELOG.md) file for a detailed version history and release notes. 209 | 210 | ## Ideas for Contributions 211 | 212 | ### Suggested Improvements 213 | 1. Improve visualization options and customization. 214 | 2. Add more statistical analysis methods. 215 | 3. Develop a public leaderboard to showcase rankings. 216 | 4. Enhance the web dashboard with more interactive features. 217 | 5. Add support for multi-language evaluation by introducing localized prompts. 218 | 6. Implement cost estimation and optimization features. 219 | 220 | Contributions are welcome! If you have ideas for improving the framework, feel free to open an issue or submit a pull request. 221 | 222 | ## Acknowledgments 223 | Special thanks to: 224 | - [SimonW](https://github.com/simonw) for the `llm` library. 225 | - The AI community 226 | ## Using parallm for More Efficient Response Collection 227 | 228 | SlopRank uses the `parallm` library for more efficient parallel model querying: 229 | 230 | ```bash 231 | # Install with pip 232 | pip install sloprank 233 | # parallm is included as a dependency and automatically used 234 | sloprank run --prompts prompts.csv --output-dir results --models "gpt-4o,claude-3.5-sonnet-latest" 235 | ``` 236 | 237 | Or use parallm directly from Python: 238 | ```python 239 | from parallm import query_model_all 240 | # Query multiple models with all prompts in a CSV file 241 | df = query_model_all("prompts.csv", ["gpt-4", "claude-3-5-sonnet", "gemini-2.0-flash"]) 242 | print(df) 243 | ``` 244 | 245 | This integration significantly speeds up the response collection process by running queries in parallel.
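The ranking step itself is ordinary graph math: summed peer scores become weighted edges in a directed endorsement graph, and weighted PageRank over that graph produces the ordering described under "How It Works". The sketch below is purely illustrative (the model names and weights are invented, and this is not the implementation in `sloprank/rank.py`), but it shows the core idea using the same `networkx` dependency SlopRank already relies on:

```python
import networkx as nx

# Toy endorsement totals: (evaluator, evaluated, summed score the evaluator gave).
# In a real run these weights come from the parsed peer evaluations.
endorsements = [
    ("model_a", "model_b", 42.0),
    ("model_a", "model_c", 31.0),
    ("model_b", "model_a", 38.0),
    ("model_b", "model_c", 27.0),
    ("model_c", "model_a", 45.0),
    ("model_c", "model_b", 22.0),
]

# Directed, weighted graph: an edge evaluator -> evaluated carries the endorsement weight.
G = nx.DiGraph()
for evaluator, evaluated, weight in endorsements:
    G.add_edge(evaluator, evaluated, weight=weight)

# Weighted PageRank yields the relative rankings.
scores = nx.pagerank(G, weight="weight")
for model, score in sorted(scores.items(), key=lambda kv: kv[1], reverse=True):
    print(f"{model}: {score:.4f}")
```

Models end up ranked highly when they are endorsed strongly by models that are themselves well endorsed, which is what makes the approach less sensitive to any single biased judge.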
246 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | click==8.1.7 2 | dash==2.18.2 3 | dash_core_components==2.0.0 4 | dash_html_components==2.0.0 5 | llm==0.23 6 | matplotlib==3.10.1 7 | networkx==3.4.2 8 | numpy==2.2.4 9 | pandas==2.2.3 10 | parallm==0.1.3 11 | plotly==6.0.0 12 | Requests==2.32.3 13 | -------------------------------------------------------------------------------- /results/category_analysis.csv: -------------------------------------------------------------------------------- 1 | category,model,average_score,evaluations_count 2 | Reasoning,o1-preview,8.8,30 3 | Reasoning,deepseek-chat,8.766666666666667,30 4 | Reasoning,claude-3-5-sonnet-latest,6.9655172413793105,29 5 | Reasoning,gemini-2.0-flash-thinking-exp-1219,8.206896551724139,29 6 | Reasoning,gemini-exp-1206,8.61111111111111,18 7 | Reasoning,gpt-4o,8.212121212121213,33 8 | Creativity,gpt-4o,7.923076923076923,13 9 | Creativity,gemini-exp-1206,8.833333333333334,6 10 | Creativity,deepseek-chat,8.5,14 11 | Creativity,o1-preview,8.857142857142858,14 12 | Creativity,claude-3-5-sonnet-latest,6.857142857142857,7 13 | Creativity,gemini-2.0-flash-thinking-exp-1219,8.045454545454545,11 14 | Economic,o1-preview,7.5,4 15 | Economic,deepseek-chat,8.0,5 16 | Economic,gemini-exp-1206,8.0,7 17 | Economic,gpt-4o,8.333333333333334,6 18 | Economic,claude-3-5-sonnet-latest,7.888888888888889,9 19 | Economic,gemini-2.0-flash-thinking-exp-1219,7.75,4 20 | Knowledge,deepseek-chat,4.333333333333333,3 21 | Knowledge,gemini-exp-1206,6.571428571428571,7 22 | Knowledge,claude-3-5-sonnet-latest,6.857142857142857,7 23 | Knowledge,gpt-4o,6.166666666666667,6 24 | Knowledge,o1-preview,5.833333333333333,6 25 | Knowledge,gemini-2.0-flash-thinking-exp-1219,7.0,4 26 | Technical,claude-3-5-sonnet-latest,8.0,1 27 | Technical,gemini-2.0-flash-thinking-exp-1219,7.333333333333333,3 28 | Technical,deepseek-chat,8.5,2 29 | Technical,o1-preview,8.666666666666666,3 30 | Technical,gemini-exp-1206,9.25,4 31 | Technical,gpt-4o,7.0,1 32 | Medical,o1-preview,6.2,5 33 | Medical,deepseek-chat,7.166666666666667,6 34 | Medical,gemini-exp-1206,6.714285714285714,7 35 | Medical,claude-3-5-sonnet-latest,5.0,6 36 | Medical,gemini-2.0-flash-thinking-exp-1219,6.142857142857143,7 37 | Medical,gpt-4o,8.5,4 38 | -------------------------------------------------------------------------------- /results/category_rankings.json: -------------------------------------------------------------------------------- 1 | { 2 | "Creativity": [ 3 | { 4 | "model": "o1-preview", 5 | "score": 8.857142857142858 6 | }, 7 | { 8 | "model": "gemini-exp-1206", 9 | "score": 8.833333333333334 10 | }, 11 | { 12 | "model": "deepseek-chat", 13 | "score": 8.5 14 | }, 15 | { 16 | "model": "gemini-2.0-flash-thinking-exp-1219", 17 | "score": 8.045454545454545 18 | }, 19 | { 20 | "model": "gpt-4o", 21 | "score": 7.923076923076923 22 | }, 23 | { 24 | "model": "claude-3-5-sonnet-latest", 25 | "score": 6.857142857142857 26 | } 27 | ], 28 | "Economic": [ 29 | { 30 | "model": "gpt-4o", 31 | "score": 8.333333333333334 32 | }, 33 | { 34 | "model": "deepseek-chat", 35 | "score": 8.0 36 | }, 37 | { 38 | "model": "gemini-exp-1206", 39 | "score": 8.0 40 | }, 41 | { 42 | "model": "claude-3-5-sonnet-latest", 43 | "score": 7.888888888888889 44 | }, 45 | { 46 | "model": "gemini-2.0-flash-thinking-exp-1219", 47 | "score": 7.75 48 | }, 49 | { 50 | "model": "o1-preview", 51 | "score": 7.5 52 | } 53 
| ], 54 | "Knowledge": [ 55 | { 56 | "model": "gemini-2.0-flash-thinking-exp-1219", 57 | "score": 7.0 58 | }, 59 | { 60 | "model": "claude-3-5-sonnet-latest", 61 | "score": 6.857142857142857 62 | }, 63 | { 64 | "model": "gemini-exp-1206", 65 | "score": 6.571428571428571 66 | }, 67 | { 68 | "model": "gpt-4o", 69 | "score": 6.166666666666667 70 | }, 71 | { 72 | "model": "o1-preview", 73 | "score": 5.833333333333333 74 | }, 75 | { 76 | "model": "deepseek-chat", 77 | "score": 4.333333333333333 78 | } 79 | ], 80 | "Medical": [ 81 | { 82 | "model": "gpt-4o", 83 | "score": 8.5 84 | }, 85 | { 86 | "model": "deepseek-chat", 87 | "score": 7.166666666666667 88 | }, 89 | { 90 | "model": "gemini-exp-1206", 91 | "score": 6.714285714285714 92 | }, 93 | { 94 | "model": "o1-preview", 95 | "score": 6.2 96 | }, 97 | { 98 | "model": "gemini-2.0-flash-thinking-exp-1219", 99 | "score": 6.142857142857143 100 | }, 101 | { 102 | "model": "claude-3-5-sonnet-latest", 103 | "score": 5.0 104 | } 105 | ], 106 | "Reasoning": [ 107 | { 108 | "model": "o1-preview", 109 | "score": 8.8 110 | }, 111 | { 112 | "model": "deepseek-chat", 113 | "score": 8.766666666666667 114 | }, 115 | { 116 | "model": "gemini-exp-1206", 117 | "score": 8.61111111111111 118 | }, 119 | { 120 | "model": "gpt-4o", 121 | "score": 8.212121212121213 122 | }, 123 | { 124 | "model": "gemini-2.0-flash-thinking-exp-1219", 125 | "score": 8.206896551724139 126 | }, 127 | { 128 | "model": "claude-3-5-sonnet-latest", 129 | "score": 6.9655172413793105 130 | } 131 | ], 132 | "Technical": [ 133 | { 134 | "model": "gemini-exp-1206", 135 | "score": 9.25 136 | }, 137 | { 138 | "model": "o1-preview", 139 | "score": 8.666666666666666 140 | }, 141 | { 142 | "model": "deepseek-chat", 143 | "score": 8.5 144 | }, 145 | { 146 | "model": "claude-3-5-sonnet-latest", 147 | "score": 8.0 148 | }, 149 | { 150 | "model": "gemini-2.0-flash-thinking-exp-1219", 151 | "score": 7.333333333333333 152 | }, 153 | { 154 | "model": "gpt-4o", 155 | "score": 7.0 156 | } 157 | ] 158 | } -------------------------------------------------------------------------------- /results/confidence_stats.json: -------------------------------------------------------------------------------- 1 | { 2 | "confidence_intervals": { 3 | "o1-preview": { 4 | "mean": 0.17906422978195008, 5 | "lower_bound": 0.15586134755557632, 6 | "upper_bound": 0.20028596105851823, 7 | "std_dev": 0.011390744131633145 8 | }, 9 | "claude-3-5-sonnet-latest": { 10 | "mean": 0.1559034710506988, 11 | "lower_bound": 0.1338431787122791, 12 | "upper_bound": 0.17700336456568294, 13 | "std_dev": 0.011074319360773228 14 | }, 15 | "deepseek-chat": { 16 | "mean": 0.16688348762576946, 17 | "lower_bound": 0.14471972554662413, 18 | "upper_bound": 0.19175975218761088, 19 | "std_dev": 0.012124035815348115 20 | }, 21 | "gpt-4o": { 22 | "mean": 0.17819819894678382, 23 | "lower_bound": 0.15655283702964287, 24 | "upper_bound": 0.2005852449712515, 25 | "std_dev": 0.010975986032101367 26 | }, 27 | "gemini-exp-1206": { 28 | "mean": 0.1549362213590768, 29 | "lower_bound": 0.1336108623981094, 30 | "upper_bound": 0.17961769528814694, 31 | "std_dev": 0.01173552363968152 32 | }, 33 | "gemini-2.0-flash-thinking-exp-1219": { 34 | "mean": 0.16501439123572084, 35 | "lower_bound": 0.14205363291625536, 36 | "upper_bound": 0.18732921920572762, 37 | "std_dev": 0.011653527254343038 38 | } 39 | }, 40 | "significance": { 41 | "o1-preview_vs_gpt-4o": false, 42 | "gpt-4o_vs_deepseek-chat": false, 43 | "deepseek-chat_vs_gemini-2.0-flash-thinking-exp-1219": false, 44 | 
"gemini-2.0-flash-thinking-exp-1219_vs_claude-3-5-sonnet-latest": false, 45 | "claude-3-5-sonnet-latest_vs_gemini-exp-1206": false 46 | } 47 | } -------------------------------------------------------------------------------- /results/endorsement_graph.gml: -------------------------------------------------------------------------------- 1 | graph [ 2 | directed 1 3 | node [ 4 | id 0 5 | label "gemini-2.0-flash-thinking-exp-1219" 6 | ] 7 | node [ 8 | id 1 9 | label "gemini-exp-1206" 10 | ] 11 | node [ 12 | id 2 13 | label "claude-3-5-sonnet-latest" 14 | ] 15 | node [ 16 | id 3 17 | label "o1-preview" 18 | ] 19 | node [ 20 | id 4 21 | label "gpt-4o" 22 | ] 23 | node [ 24 | id 5 25 | label "deepseek-chat" 26 | ] 27 | edge [ 28 | source 0 29 | target 3 30 | weight 138.0 31 | ] 32 | edge [ 33 | source 0 34 | target 5 35 | weight 173.0 36 | ] 37 | edge [ 38 | source 0 39 | target 2 40 | weight 113.0 41 | ] 42 | edge [ 43 | source 0 44 | target 1 45 | weight 89.0 46 | ] 47 | edge [ 48 | source 0 49 | target 4 50 | weight 130.0 51 | ] 52 | edge [ 53 | source 1 54 | target 2 55 | weight 129.0 56 | ] 57 | edge [ 58 | source 1 59 | target 0 60 | weight 188.0 61 | ] 62 | edge [ 63 | source 1 64 | target 5 65 | weight 183.0 66 | ] 67 | edge [ 68 | source 1 69 | target 4 70 | weight 180.0 71 | ] 72 | edge [ 73 | source 1 74 | target 3 75 | weight 148.0 76 | ] 77 | edge [ 78 | source 2 79 | target 3 80 | weight 248.0 81 | ] 82 | edge [ 83 | source 2 84 | target 0 85 | weight 162.0 86 | ] 87 | edge [ 88 | source 2 89 | target 1 90 | weight 160.0 91 | ] 92 | edge [ 93 | source 2 94 | target 4 95 | weight 166.0 96 | ] 97 | edge [ 98 | source 2 99 | target 5 100 | weight 104.0 101 | ] 102 | edge [ 103 | source 3 104 | target 0 105 | weight 131.0 106 | ] 107 | edge [ 108 | source 3 109 | target 5 110 | weight 129.0 111 | ] 112 | edge [ 113 | source 3 114 | target 1 115 | weight 144.0 116 | ] 117 | edge [ 118 | source 3 119 | target 4 120 | weight 157.0 121 | ] 122 | edge [ 123 | source 3 124 | target 2 125 | weight 139.0 126 | ] 127 | edge [ 128 | source 4 129 | target 0 130 | weight 155.0 131 | ] 132 | edge [ 133 | source 4 134 | target 5 135 | weight 146.0 136 | ] 137 | edge [ 138 | source 4 139 | target 2 140 | weight 146.0 141 | ] 142 | edge [ 143 | source 4 144 | target 3 145 | weight 129.0 146 | ] 147 | edge [ 148 | source 4 149 | target 1 150 | weight 141.0 151 | ] 152 | edge [ 153 | source 5 154 | target 4 155 | weight 212.0 156 | ] 157 | edge [ 158 | source 5 159 | target 0 160 | weight 135.5 161 | ] 162 | edge [ 163 | source 5 164 | target 3 165 | weight 203.0 166 | ] 167 | edge [ 168 | source 5 169 | target 1 170 | weight 142.0 171 | ] 172 | edge [ 173 | source 5 174 | target 2 175 | weight 143.0 176 | ] 177 | ] 178 | -------------------------------------------------------------------------------- /results/rankings.json: -------------------------------------------------------------------------------- 1 | { 2 | "rankings": [ 3 | [ 4 | "o1-preview", 5 | 0.17940361409787733 6 | ], 7 | [ 8 | "gpt-4o", 9 | 0.17830451744580658 10 | ], 11 | [ 12 | "deepseek-chat", 13 | 0.1671054138317305 14 | ], 15 | [ 16 | "gemini-2.0-flash-thinking-exp-1219", 17 | 0.16473186403675355 18 | ], 19 | [ 20 | "claude-3-5-sonnet-latest", 21 | 0.15557086205954448 22 | ], 23 | [ 24 | "gemini-exp-1206", 25 | 0.15488372852828722 26 | ] 27 | ], 28 | "metadata": { 29 | "evaluation_method": 1, 30 | "timestamp": "2025-01-14T10:21:14.432767" 31 | } 32 | } -------------------------------------------------------------------------------- 
/results/visualizations/endorsement_graph.gml: -------------------------------------------------------------------------------- 1 | graph [ 2 | directed 1 3 | node [ 4 | id 0 5 | label "o1-preview" 6 | pagerank 0.17940361409787733 7 | ] 8 | node [ 9 | id 1 10 | label "gpt-4o" 11 | pagerank 0.17830451744580658 12 | ] 13 | node [ 14 | id 2 15 | label "deepseek-chat" 16 | pagerank 0.1671054138317305 17 | ] 18 | node [ 19 | id 3 20 | label "gemini-2.0-flash-thinking-exp-1219" 21 | pagerank 0.16473186403675355 22 | ] 23 | node [ 24 | id 4 25 | label "claude-3-5-sonnet-latest" 26 | pagerank 0.15557086205954448 27 | ] 28 | node [ 29 | id 5 30 | label "gemini-exp-1206" 31 | pagerank 0.15488372852828722 32 | ] 33 | edge [ 34 | source 0 35 | target 3 36 | weight 131.0 37 | normalized_weight 0.5282258064516129 38 | ] 39 | edge [ 40 | source 0 41 | target 2 42 | weight 129.0 43 | normalized_weight 0.5201612903225806 44 | ] 45 | edge [ 46 | source 0 47 | target 5 48 | weight 144.0 49 | normalized_weight 0.5806451612903226 50 | ] 51 | edge [ 52 | source 0 53 | target 1 54 | weight 157.0 55 | normalized_weight 0.6330645161290323 56 | ] 57 | edge [ 58 | source 0 59 | target 4 60 | weight 139.0 61 | normalized_weight 0.5604838709677419 62 | ] 63 | edge [ 64 | source 1 65 | target 3 66 | weight 155.0 67 | normalized_weight 0.625 68 | ] 69 | edge [ 70 | source 1 71 | target 2 72 | weight 146.0 73 | normalized_weight 0.5887096774193549 74 | ] 75 | edge [ 76 | source 1 77 | target 4 78 | weight 146.0 79 | normalized_weight 0.5887096774193549 80 | ] 81 | edge [ 82 | source 1 83 | target 0 84 | weight 129.0 85 | normalized_weight 0.5201612903225806 86 | ] 87 | edge [ 88 | source 1 89 | target 5 90 | weight 141.0 91 | normalized_weight 0.5685483870967742 92 | ] 93 | edge [ 94 | source 2 95 | target 1 96 | weight 212.0 97 | normalized_weight 0.8548387096774194 98 | ] 99 | edge [ 100 | source 2 101 | target 3 102 | weight 135.5 103 | normalized_weight 0.5463709677419355 104 | ] 105 | edge [ 106 | source 2 107 | target 0 108 | weight 203.0 109 | normalized_weight 0.8185483870967742 110 | ] 111 | edge [ 112 | source 2 113 | target 5 114 | weight 142.0 115 | normalized_weight 0.5725806451612904 116 | ] 117 | edge [ 118 | source 2 119 | target 4 120 | weight 143.0 121 | normalized_weight 0.5766129032258065 122 | ] 123 | edge [ 124 | source 3 125 | target 0 126 | weight 138.0 127 | normalized_weight 0.5564516129032258 128 | ] 129 | edge [ 130 | source 3 131 | target 2 132 | weight 173.0 133 | normalized_weight 0.6975806451612904 134 | ] 135 | edge [ 136 | source 3 137 | target 4 138 | weight 113.0 139 | normalized_weight 0.45564516129032256 140 | ] 141 | edge [ 142 | source 3 143 | target 5 144 | weight 89.0 145 | normalized_weight 0.3588709677419355 146 | ] 147 | edge [ 148 | source 3 149 | target 1 150 | weight 130.0 151 | normalized_weight 0.5241935483870968 152 | ] 153 | edge [ 154 | source 4 155 | target 0 156 | weight 248.0 157 | normalized_weight 1.0 158 | ] 159 | edge [ 160 | source 4 161 | target 3 162 | weight 162.0 163 | normalized_weight 0.6532258064516129 164 | ] 165 | edge [ 166 | source 4 167 | target 5 168 | weight 160.0 169 | normalized_weight 0.6451612903225806 170 | ] 171 | edge [ 172 | source 4 173 | target 1 174 | weight 166.0 175 | normalized_weight 0.6693548387096774 176 | ] 177 | edge [ 178 | source 4 179 | target 2 180 | weight 104.0 181 | normalized_weight 0.41935483870967744 182 | ] 183 | edge [ 184 | source 5 185 | target 4 186 | weight 129.0 187 | normalized_weight 0.5201612903225806 188 | ] 
189 | edge [ 190 | source 5 191 | target 3 192 | weight 188.0 193 | normalized_weight 0.7580645161290323 194 | ] 195 | edge [ 196 | source 5 197 | target 2 198 | weight 183.0 199 | normalized_weight 0.7379032258064516 200 | ] 201 | edge [ 202 | source 5 203 | target 1 204 | weight 180.0 205 | normalized_weight 0.7258064516129032 206 | ] 207 | edge [ 208 | source 5 209 | target 0 210 | weight 148.0 211 | normalized_weight 0.5967741935483871 212 | ] 213 | ] 214 | -------------------------------------------------------------------------------- /results/visualizations/endorsement_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strangeloopcanon/LLMRank/7527836faee5af1209059466d89690bedf504014/results/visualizations/endorsement_graph.png -------------------------------------------------------------------------------- /scripts/bump_version.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | from pathlib import Path 4 | 5 | path = Path("pyproject.toml") 6 | text = path.read_text() 7 | match = re.search(r'version\s*=\s*"(\d+)\.(\d+)\.(\d+)"', text) 8 | 9 | if not match: 10 | raise ValueError("Version not found in pyproject.toml") 11 | 12 | major, minor, patch = map(int, match.groups()) 13 | arg = sys.argv[1] if len(sys.argv) > 1 else "patch" 14 | 15 | if arg == "patch": 16 | patch += 1 17 | elif arg == "minor": 18 | minor += 1 19 | patch = 0 20 | elif arg == "major": 21 | major += 1 22 | minor = patch = 0 23 | else: 24 | raise ValueError("Expected patch, minor, or major") 25 | 26 | new_version = f'{major}.{minor}.{patch}' 27 | new_text = re.sub( 28 | r'version\s*=\s*"\d+\.\d+\.\d+"', 29 | f'version = \"{new_version}\"', 30 | text 31 | ) 32 | 33 | path.write_text(new_text) 34 | print(f"Bumped version to {new_version}") 35 | -------------------------------------------------------------------------------- /scripts/create_github_release.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import requests 4 | 5 | TAG = sys.argv[1] 6 | REPO = "strangeloopcanon/LLMRank" 7 | TOKEN = os.getenv("GITHUB_TOKEN") 8 | 9 | if not TOKEN: 10 | raise RuntimeError("GITHUB_TOKEN environment variable not set") 11 | 12 | BASE_URL = f"https://api.github.com/repos/{REPO}" 13 | RELEASE_URL = f"{BASE_URL}/releases" 14 | HEADERS = { 15 | "Authorization": f"Bearer {TOKEN}", 16 | "Accept": "application/vnd.github+json" 17 | } 18 | 19 | # Check if release already exists 20 | r = requests.get(f"{BASE_URL}/releases/tags/{TAG}", headers=HEADERS) 21 | if r.status_code == 200: 22 | print(f"⚠️ GitHub release for tag {TAG} already exists. 
Skipping.") 23 | sys.exit(0) 24 | elif r.status_code != 404: 25 | print(f"GitHub release check failed:\n{r.status_code}\n{r.text}") 26 | sys.exit(1) 27 | 28 | # Create release 29 | payload = { 30 | "tag_name": TAG, 31 | "name": f"Release {TAG}", 32 | "body": f"Auto-published release for version {TAG}", 33 | "draft": False, 34 | "prerelease": False 35 | } 36 | 37 | r = requests.post(RELEASE_URL, headers=HEADERS, json=payload) 38 | if r.status_code >= 300: 39 | print(f"GitHub release creation failed:\n{r.status_code}\n{r.text}") 40 | sys.exit(1) 41 | 42 | print(f"✅ GitHub release {TAG} created.") 43 | -------------------------------------------------------------------------------- /sloprank/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | SlopRank Package 3 | ---------------- 4 | 5 | Peer-based cross-evaluation of LLMs with PageRank-based scoring. 6 | 7 | Key features: 8 | - Peer-based evaluation where models score each other 9 | - Graph visualization of model endorsements 10 | - Confidence intervals and statistical significance tests 11 | - Category-based evaluation and ranking 12 | - Web dashboard for interactive exploration 13 | """ 14 | 15 | from .config import ( 16 | EvalConfig, 17 | VisualizationConfig, 18 | ConfidenceConfig, 19 | WebDashboardConfig, 20 | DEFAULT_CONFIG 21 | ) 22 | 23 | __version__ = "0.2.3" 24 | __all__ = [ 25 | "EvalConfig", 26 | "VisualizationConfig", 27 | "ConfidenceConfig", 28 | "WebDashboardConfig", 29 | "DEFAULT_CONFIG" 30 | ] -------------------------------------------------------------------------------- /sloprank/__main__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Main module entry point for running sloprank as a module 3 | """ 4 | 5 | import sys 6 | from .cli import main 7 | 8 | if __name__ == "__main__": 9 | sys.exit(main()) -------------------------------------------------------------------------------- /sloprank/cli.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import json 3 | import threading 4 | from pathlib import Path 5 | from typing import Dict, List 6 | 7 | import click 8 | import pandas as pd 9 | 10 | from .collect import collect_raw_evaluations, collect_responses 11 | from .config import DEFAULT_CONFIG, EvalConfig, VisualizationConfig, ConfidenceConfig, WebDashboardConfig, logger 12 | from .parse import parse_evaluation_rows 13 | from .rank import ( 14 | build_endorsement_graph, 15 | compute_pagerank, 16 | compute_categorical_pageranks, 17 | finalize_rankings 18 | ) 19 | 20 | # Try importing dashboard libraries 21 | try: 22 | import dash 23 | from dash import dcc 24 | from dash import html 25 | from dash.dependencies import Input, Output 26 | import plotly.express as px 27 | import plotly.graph_objects as go 28 | HAS_DASH = True 29 | except ImportError: 30 | logger.warning("Dash not found. Web dashboard will be disabled.") 31 | HAS_DASH = False 32 | 33 | def categorize_prompts(prompts_df: pd.DataFrame, config: EvalConfig) -> Dict[str, List[str]]: 34 | """ 35 | Process the prompts DataFrame to extract categories. 36 | If a 'Category' column exists, use it to categorize prompts. 37 | Otherwise, try to infer categories using keyword matching. 
38 | """ 39 | categories = {} 40 | 41 | # Determine which column has the prompts 42 | prompt_column = None 43 | if 'Questions' in prompts_df.columns: 44 | prompt_column = 'Questions' 45 | elif 'prompt' in prompts_df.columns: 46 | prompt_column = 'prompt' 47 | elif len(prompts_df.columns) > 0: 48 | prompt_column = prompts_df.columns[0] 49 | else: 50 | logger.warning("No columns found in prompts DataFrame for categorization") 51 | return {} 52 | 53 | # Determine which column has categories 54 | category_column = None 55 | if 'Category' in prompts_df.columns: 56 | category_column = 'Category' 57 | elif 'category' in prompts_df.columns: 58 | category_column = 'category' 59 | 60 | if category_column: 61 | # Use explicit categories from the prompts file 62 | for category in prompts_df[category_column].unique(): 63 | if pd.notna(category) and category: 64 | category_prompts = prompts_df[prompts_df[category_column] == category][prompt_column].tolist() 65 | categories[category.lower() if isinstance(category, str) else str(category)] = category_prompts 66 | elif config.prompt_categories: 67 | # Use categories from the configuration 68 | return config.prompt_categories 69 | else: 70 | # Try to infer categories using keywords (basic implementation) 71 | # In a real implementation, you might use NLP techniques or clustering 72 | keywords = { 73 | 'reasoning': ['reason', 'logic', 'why', 'how', 'explain', 'analyze'], 74 | 'creativity': ['creative', 'imagine', 'story', 'design', 'invent'], 75 | 'knowledge': ['fact', 'define', 'what is', 'history', 'science'], 76 | 'coding': ['code', 'function', 'algorithm', 'program', 'script'], 77 | } 78 | 79 | # Initialize categories 80 | for category in keywords: 81 | categories[category] = [] 82 | 83 | # Categorize prompts based on keywords 84 | for prompt in prompts_df[prompt_column].tolist(): 85 | categorized = False 86 | if not isinstance(prompt, str): 87 | prompt = str(prompt) 88 | 89 | prompt_lower = prompt.lower() 90 | 91 | for category, terms in keywords.items(): 92 | if any(term in prompt_lower for term in terms): 93 | categories[category].append(prompt) 94 | categorized = True 95 | break 96 | 97 | if not categorized: 98 | if 'uncategorized' not in categories: 99 | categories['uncategorized'] = [] 100 | categories['uncategorized'].append(prompt) 101 | 102 | # Only keep categories with prompts 103 | return {k: v for k, v in categories.items() if v} 104 | 105 | def start_dashboard(config: EvalConfig, rankings_path: Path): 106 | """ 107 | Start a Dash web dashboard for interactive visualization. 
108 | """ 109 | if not HAS_DASH or not config.web_dashboard.enabled: 110 | return 111 | 112 | try: 113 | # Load rankings data 114 | with open(rankings_path, 'r') as f: 115 | data = json.load(f) 116 | 117 | # Create Dash app 118 | app = dash.Dash(__name__) 119 | 120 | # Define layout 121 | app.layout = html.Div([ 122 | html.H1("SlopRank Dashboard"), 123 | 124 | html.Div([ 125 | html.H2("Model Rankings"), 126 | dcc.Graph( 127 | id='ranking-graph', 128 | figure={ 129 | 'data': [ 130 | {'x': [item['model'] for item in data['rankings']], 131 | 'y': [item['score'] for item in data['rankings']], 132 | 'type': 'bar', 'name': 'PageRank Score'} 133 | ], 134 | 'layout': { 135 | 'title': 'Model PageRank Scores', 136 | 'xaxis': {'title': 'Model'}, 137 | 'yaxis': {'title': 'PageRank Score'} 138 | } 139 | } 140 | ) 141 | ]), 142 | 143 | # Add category rankings if available 144 | html.Div([ 145 | html.H2("Rankings by Category"), 146 | html.Div([ 147 | html.Label("Select Category:"), 148 | dcc.Dropdown( 149 | id='category-dropdown', 150 | options=[{'label': cat, 'value': cat} 151 | for cat in data.get('category_rankings', {}).keys()], 152 | value=next(iter(data.get('category_rankings', {}).keys()), None) 153 | ) 154 | ]) if data.get('category_rankings') else html.Div("No category data available."), 155 | dcc.Graph(id='category-graph') 156 | ]) if data.get('category_rankings') else html.Div(), 157 | 158 | # Add confidence intervals if available 159 | html.Div([ 160 | html.H2("Confidence Intervals"), 161 | dcc.Graph( 162 | id='confidence-graph', 163 | figure={ 164 | 'data': [ 165 | { 166 | 'x': [model for model in data['confidence_intervals'].keys()], 167 | 'y': [stats['mean'] for stats in data['confidence_intervals'].values()], 168 | 'error_y': { 169 | 'type': 'data', 170 | 'symmetric': False, 171 | 'array': [ 172 | stats['upper_bound'] - stats['mean'] 173 | for stats in data['confidence_intervals'].values() 174 | ], 175 | 'arrayminus': [ 176 | stats['mean'] - stats['lower_bound'] 177 | for stats in data['confidence_intervals'].values() 178 | ] 179 | }, 180 | 'type': 'scatter', 181 | 'mode': 'markers', 182 | 'marker': {'size': 10} 183 | } 184 | ], 185 | 'layout': { 186 | 'title': '95% Confidence Intervals', 187 | 'xaxis': {'title': 'Model'}, 188 | 'yaxis': {'title': 'PageRank Score'} 189 | } 190 | } 191 | ) 192 | ]) if data.get('confidence_intervals') else html.Div(), 193 | 194 | # Add link to static visualizations 195 | html.Div([ 196 | html.H2("Visualizations"), 197 | html.P([ 198 | "View the static graph visualization ", 199 | html.A("here", href=f"/{config.output_dir}/visualizations/endorsement_graph.png", target="_blank"), 200 | " or the interactive version ", 201 | html.A("here", href=f"/{config.output_dir}/visualizations/endorsement_graph.html", target="_blank"), 202 | "." 
203 | ]) 204 | ]) 205 | ]) 206 | 207 | # Define callbacks 208 | @app.callback( 209 | Output('category-graph', 'figure'), 210 | [Input('category-dropdown', 'value')] 211 | ) 212 | def update_category_graph(selected_category): 213 | if not selected_category or not data.get('category_rankings'): 214 | return {} 215 | 216 | cat_data = data['category_rankings'].get(selected_category, []) 217 | return { 218 | 'data': [ 219 | {'x': [item['model'] for item in cat_data], 220 | 'y': [item['score'] for item in cat_data], 221 | 'type': 'bar', 'name': 'PageRank Score'} 222 | ], 223 | 'layout': { 224 | 'title': f'Model Rankings for Category: {selected_category}', 225 | 'xaxis': {'title': 'Model'}, 226 | 'yaxis': {'title': 'PageRank Score'} 227 | } 228 | } 229 | 230 | # Run the server in a separate thread 231 | def run_server(): 232 | app.run_server( 233 | host=config.web_dashboard.host, 234 | port=config.web_dashboard.port, 235 | debug=config.web_dashboard.debug 236 | ) 237 | 238 | dashboard_thread = threading.Thread(target=run_server) 239 | dashboard_thread.daemon = True 240 | dashboard_thread.start() 241 | 242 | # Print info message 243 | if config.web_dashboard.auto_open_browser: 244 | import webbrowser 245 | url = f"http://{config.web_dashboard.host}:{config.web_dashboard.port}" 246 | webbrowser.open(url) 247 | 248 | logger.info(f"Dashboard running at http://{config.web_dashboard.host}:{config.web_dashboard.port}") 249 | logger.info("Press Ctrl+C to exit") 250 | 251 | except Exception as e: 252 | logger.error(f"Error starting dashboard: {e}") 253 | 254 | 255 | @click.group() 256 | def cli(): 257 | """SlopRank - Peer-based LLM cross-evaluation system.""" 258 | pass 259 | 260 | 261 | @cli.command() 262 | @click.option("--prompts", default="prompts.csv", help="Path to prompts Excel file") 263 | @click.option("--output-dir", default="results", help="Output directory for results") 264 | @click.option("--models", help="Comma-separated list of models to evaluate") 265 | @click.option( 266 | "--responses", 267 | help="Path to CSV of responses generated by a separate agent runner", 268 | default="", 269 | ) 270 | @click.option( 271 | "--visualize/--no-visualize", 272 | default=True, 273 | help="Enable/disable graph visualization" 274 | ) 275 | @click.option( 276 | "--interactive/--no-interactive", 277 | default=True, 278 | help="Enable/disable interactive visualization" 279 | ) 280 | @click.option( 281 | "--confidence/--no-confidence", 282 | default=True, 283 | help="Enable/disable confidence interval calculation" 284 | ) 285 | @click.option( 286 | "--dashboard/--no-dashboard", 287 | default=False, 288 | help="Enable/disable web dashboard" 289 | ) 290 | @click.option( 291 | "--dashboard-port", 292 | default=8050, 293 | help="Port for web dashboard" 294 | ) 295 | def run(prompts, output_dir, models, responses, visualize, interactive, confidence, dashboard, dashboard_port): 296 | """ 297 | Run the full SlopRank evaluation workflow. 
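    Example invocations (run here via ``python -m sloprank``; paths and
    model names are illustrative):

        python -m sloprank run --prompts prompts.csv --output-dir results
        python -m sloprank run --models "gpt-4o,deepseek-chat" --dashboard --dashboard-port 8050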
298 | """ 299 | logging.basicConfig( 300 | level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" 301 | ) 302 | 303 | # Parse model list from command line 304 | model_list = models.split(",") if models else None 305 | 306 | # Create visualization config 307 | vis_config = VisualizationConfig( 308 | enabled=visualize, 309 | interactive=interactive 310 | ) 311 | 312 | # Create confidence config 313 | conf_config = ConfidenceConfig( 314 | enabled=confidence 315 | ) 316 | 317 | # Create web dashboard config 318 | dash_config = WebDashboardConfig( 319 | enabled=dashboard, 320 | port=dashboard_port 321 | ) 322 | 323 | # Create main config 324 | config = EvalConfig( 325 | model_names=model_list or DEFAULT_CONFIG.model_names, 326 | evaluation_method=1, # numeric rating 327 | use_subset_evaluation=True, 328 | evaluators_subset_size=3, 329 | output_dir=Path(output_dir), 330 | visualization=vis_config, 331 | confidence=conf_config, 332 | web_dashboard=dash_config 333 | ) 334 | logger.info(f"Using config: {config}") 335 | 336 | # 1a) If we generated the responses in another tool and are piping them 337 | # to SlopRank UNIX-style, we don't need to load/run the prompts 338 | if responses: 339 | responses_df = pd.read_csv(responses) 340 | prompts_df = pd.DataFrame({'Questions': responses_df['prompt'].unique()}) 341 | else: 342 | # 1) Read prompts 343 | prompts_df = pd.read_csv(prompts) 344 | 345 | # Handle different column naming conventions 346 | prompt_column = None 347 | if "Questions" in prompts_df.columns: 348 | prompt_column = "Questions" 349 | elif "prompt" in prompts_df.columns: 350 | prompt_column = "prompt" 351 | elif len(prompts_df.columns) > 0: 352 | # If no recognized column name but there is at least one column, 353 | # assume the first column contains the prompts 354 | prompt_column = prompts_df.columns[0] 355 | logger.warning(f"No 'Questions' or 'prompt' column found, using first column: {prompt_column}") 356 | else: 357 | raise ValueError("CSV file has no columns") 358 | 359 | logger.info(f"Using column '{prompt_column}' for prompts") 360 | 361 | # Similarly handle different names for answer key column 362 | answer_key_column = None 363 | answer_keys = [] 364 | 365 | if "Answer_key" in prompts_df.columns: 366 | answer_key_column = "Answer_key" 367 | elif "answer_key" in prompts_df.columns: 368 | answer_key_column = "answer_key" 369 | 370 | if answer_key_column: 371 | answer_keys = prompts_df[answer_key_column].tolist() 372 | logger.info(f"Using column '{answer_key_column}' for answer keys") 373 | else: 374 | answer_keys = [None] * len(prompts_df) 375 | logger.info("No answer key column found") 376 | 377 | prompt_pairs = list(zip(prompts_df[prompt_column].tolist(), answer_keys)) 378 | 379 | # 2) Collect responses 380 | responses_df = collect_responses(prompt_pairs, config) 381 | 382 | # Process prompt categories 383 | config.prompt_categories = categorize_prompts(prompts_df, config) 384 | if config.prompt_categories: 385 | logger.info(f"Found {len(config.prompt_categories)} prompt categories: {', '.join(config.prompt_categories.keys())}") 386 | 387 | # 3) Collect raw evaluations 388 | raw_eval_df = collect_raw_evaluations(responses_df, config) 389 | 390 | # 4) Parse evaluation rows 391 | eval_path = config.output_dir / "evaluations.csv" 392 | if eval_path.exists(): 393 | logger.info(f"Loading existing parsed evaluations from {eval_path}") 394 | evaluations_df = pd.read_csv(eval_path) 395 | else: 396 | evaluations_df = parse_evaluation_rows(raw_eval_df, config) 397 | 
evaluations_df.to_csv(eval_path, index=False) 398 | logger.info(f"Saved parsed evaluations to {eval_path}") 399 | 400 | # 5) Build endorsement graph 401 | G = build_endorsement_graph(evaluations_df, config) 402 | 403 | # 6) Compute overall PageRank 404 | pagerank_scores = compute_pagerank(G) 405 | 406 | # 7) Compute category-specific PageRank scores if categories exist 407 | category_rankings = None 408 | if config.prompt_categories: 409 | category_rankings = compute_categorical_pageranks(G, config.prompt_categories) 410 | 411 | # 8) Finalize rankings and generate visualizations 412 | finalize_rankings( 413 | pagerank_scores, 414 | config, 415 | G=G, 416 | evaluations_df=evaluations_df, 417 | category_rankings=category_rankings 418 | ) 419 | 420 | # 9) Start web dashboard if enabled 421 | if config.web_dashboard.enabled and HAS_DASH: 422 | rankings_path = config.output_dir / "rankings.json" 423 | if rankings_path.exists(): 424 | start_dashboard(config, rankings_path) 425 | 426 | 427 | @cli.command() 428 | @click.option("--output-dir", default="results", help="Output directory containing results") 429 | @click.option("--port", default=8050, help="Dashboard port") 430 | def dashboard(output_dir, port): 431 | """ 432 | Start the web dashboard for existing results. 433 | """ 434 | if not HAS_DASH: 435 | logger.error("Dash not found. Please install with 'pip install dash plotly'") 436 | return 437 | 438 | config = EvalConfig( 439 | model_names=DEFAULT_CONFIG.model_names, 440 | evaluation_method=1, 441 | use_subset_evaluation=True, 442 | evaluators_subset_size=3, 443 | output_dir=Path(output_dir), 444 | web_dashboard=WebDashboardConfig( 445 | enabled=True, 446 | port=port, 447 | auto_open_browser=True 448 | ) 449 | ) 450 | 451 | rankings_path = Path(output_dir) / "rankings.json" 452 | if not rankings_path.exists(): 453 | logger.error(f"Rankings file not found: {rankings_path}") 454 | return 455 | 456 | logger.info(f"Starting dashboard for results in {output_dir}") 457 | start_dashboard(config, rankings_path) 458 | 459 | # Keep the main thread alive 460 | try: 461 | while True: 462 | import time 463 | time.sleep(1) 464 | except KeyboardInterrupt: 465 | logger.info("Dashboard stopped") 466 | 467 | 468 | def main(): 469 | """Entry point for CLI.""" 470 | # Register utility commands if available 471 | try: 472 | from .utils.commands import register_utils_commands 473 | register_utils_commands(cli) 474 | except ImportError: 475 | pass 476 | 477 | cli() 478 | -------------------------------------------------------------------------------- /sloprank/collect.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | import json 4 | import pandas as pd 5 | from pathlib import Path 6 | from typing import List, Tuple 7 | from .config import logger, EvalConfig 8 | 9 | try: 10 | # Import parallm for efficient response collection 11 | from parallm import query_model_all, query_model 12 | HAS_PARALLM = True 13 | llm = None # We won't use llm when parallm is available 14 | except ImportError: 15 | # This should not happen with normal installation as parallm is now a core dependency 16 | logger.error("Could not import 'parallm' module. 
This is a required dependency for SlopRank.") 17 | logger.error("Please ensure parallm is installed with: pip install parallm") 18 | logger.warning("Falling back to llm or mock response generation (not recommended for production).") 19 | HAS_PARALLM = False 20 | try: 21 | # If you have a custom LLM module that provides get_model() 22 | import llm 23 | except ImportError: 24 | logger.warning("Could not import 'llm' module. Provide your own LLM interface or mock it.") 25 | llm = None 26 | 27 | def collect_responses(prompt_pairs: List[Tuple[str, str]], config: EvalConfig) -> pd.DataFrame: 28 | """ 29 | Query each model with each prompt, skipping existing entries in responses.csv. 30 | """ 31 | resp_path = config.output_dir / "responses.csv" 32 | if resp_path.exists(): 33 | existing_df = pd.read_csv(resp_path) 34 | else: 35 | existing_df = pd.DataFrame(columns=["prompt","model"]) 36 | 37 | # Extract prompts and answer keys 38 | prompts = [p[0] for p in prompt_pairs] 39 | answer_keys = [p[1] for p in prompt_pairs] 40 | 41 | # If we have parallm, use it for batch processing 42 | if HAS_PARALLM: 43 | logger.info(f"Using parallm to query {len(config.model_names)} models for {len(prompts)} prompts...") 44 | 45 | # Create a temporary CSV with the prompts using the "Questions" column 46 | prompts_df = pd.DataFrame({"Questions": prompts}) 47 | temp_prompts_path = config.output_dir / "temp_prompts.csv" 48 | prompts_df.to_csv(temp_prompts_path, index=False) 49 | 50 | # Add "prompt" column for parallm compatibility 51 | prompts_df["prompt"] = prompts_df["Questions"] 52 | temp_prompts_modified_path = config.output_dir / "temp_prompts_modified.csv" 53 | prompts_df.to_csv(temp_prompts_modified_path, index=False) 54 | 55 | # Use parallm to query all models at once with the modified CSV 56 | responses_df = query_model_all(str(temp_prompts_modified_path), config.model_names) 57 | 58 | # Check if output.csv was created by parallm and use that instead if it exists 59 | output_path = Path("output.csv") 60 | if output_path.exists(): 61 | logger.info(f"Using outputs from {output_path}") 62 | responses_df = pd.read_csv(output_path) 63 | # Clean up parallm's output file 64 | import os 65 | os.remove(output_path) 66 | 67 | # Add answer keys and additional metadata 68 | responses_df['Answer_key'] = responses_df['prompt'].map(dict(zip(prompts, answer_keys))) 69 | responses_df['is_valid'] = responses_df['response'].apply(lambda x: bool(x and len(str(x).strip()) >= 10)) 70 | responses_df['token_count'] = responses_df['response'].apply(lambda x: len(str(x).split()) if x else 0) 71 | responses_df['response_time'] = 0.0 # Default value since parallm doesn't track this 72 | responses_df['error'] = None # Default value 73 | 74 | # Clean up temp files 75 | import os 76 | for path in [temp_prompts_path, temp_prompts_modified_path]: 77 | if os.path.exists(path): 78 | logger.info(f"Cleaning up temporary file: {path}") 79 | os.remove(path) 80 | else: 81 | # Fall back to original implementation 82 | new_rows = [] 83 | for i, (prompt, answer_key) in enumerate(prompt_pairs, start=1): 84 | logger.info(f"Processing prompt {i}/{len(prompt_pairs)}: {prompt[:50]}...") 85 | 86 | for model_name in config.model_names: 87 | # Check if we already have a response 88 | subset = existing_df[ 89 | (existing_df["prompt"] == prompt) & 90 | (existing_df["model"] == model_name) 91 | ] 92 | if not subset.empty: 93 | logger.info(f"Skipping existing response for model={model_name}, prompt={prompt[:40]}...") 94 | continue 95 | 96 | start_time = time.time() 
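                # No parallm available here: fall back to the optional `llm` interface
                # for this model, or to a mock response if neither backend is installed,
                # so the rest of the pipeline can still be exercised end-to-end.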
97 | logger.info(f"Querying {model_name} for new response...") 98 | raw_response = None 99 | tokens_used = 0 100 | valid = False 101 | error_msg = None 102 | 103 | try: 104 | if llm is not None: 105 | model = llm.get_model(model_name) 106 | response_obj = model.prompt(prompt) 107 | raw_response = response_obj.text() 108 | else: 109 | # fallback mock 110 | raw_response = f"[MOCK] {model_name} responding to: {prompt[:40]}" 111 | 112 | valid = (raw_response and len(raw_response.strip()) >= 10) 113 | tokens_used = len(raw_response.split()) if valid else 0 114 | 115 | except Exception as e: 116 | error_msg = str(e) 117 | logger.error(f"Error from {model_name}: {error_msg}") 118 | 119 | elapsed = time.time() - start_time 120 | 121 | new_rows.append({ 122 | 'prompt': prompt, 123 | 'model': model_name, 124 | 'response': raw_response if valid else None, 125 | 'is_valid': valid, 126 | 'response_time': elapsed, 127 | 'Answer_key': answer_key, 128 | 'token_count': tokens_used, 129 | 'error': error_msg 130 | }) 131 | 132 | if config.request_delay > 0: 133 | time.sleep(config.request_delay) 134 | 135 | responses_df = pd.DataFrame(new_rows) 136 | 137 | # Combine with existing responses 138 | combined_df = pd.concat([existing_df, responses_df], ignore_index=True) 139 | combined_df.drop_duplicates(subset=["prompt","model"], keep="first", inplace=True) 140 | combined_df.to_csv(resp_path, index=False) 141 | logger.info(f"Responses saved to {resp_path}") 142 | return combined_df 143 | 144 | def collect_raw_evaluations(responses_df: pd.DataFrame, config: EvalConfig) -> pd.DataFrame: 145 | """ 146 | Each model in config.model_names evaluates the others' answers. 147 | Results are stored in raw_evaluations.csv as [prompt, judge_model, raw_judgment, model_mapping]. 148 | """ 149 | raw_eval_path = config.output_dir / "raw_evaluations.csv" 150 | if raw_eval_path.exists(): 151 | existing_df = pd.read_csv(raw_eval_path) 152 | else: 153 | existing_df = pd.DataFrame(columns=["prompt","judge_model","model_mapping"]) 154 | 155 | # Collect all evaluation prompts 156 | eval_tasks = [] 157 | unique_prompts = responses_df['prompt'].unique() 158 | 159 | for prompt in unique_prompts: 160 | subset = responses_df[responses_df['prompt'] == prompt] 161 | answer_key = subset['Answer_key'].iloc[0] if 'Answer_key' in subset.columns else None 162 | model_response_map = subset.set_index('model')['response'].to_dict() 163 | 164 | for judge_model in config.model_names: 165 | # Exclude judge's own or missing responses 166 | other_models = [m for m in config.model_names 167 | if m != judge_model and model_response_map.get(m)] 168 | if not other_models: 169 | continue 170 | if config.use_subset_evaluation: 171 | sample_size = min(config.evaluators_subset_size, len(other_models)) 172 | other_models = random.sample(other_models, sample_size) 173 | 174 | model_to_anon = {m: f"Model_{i+1}" for i,m in enumerate(other_models)} 175 | answers_section = "\n".join([ 176 | f"{model_to_anon[m]}:\n{model_response_map[m]}\n---" 177 | for m in other_models 178 | ]) 179 | answer_key_text = f"The Answer Key is:\n{answer_key}\n---\n" if answer_key else "" 180 | 181 | model_mapping_str = json.dumps(model_to_anon, sort_keys=True) 182 | found_match = existing_df[ 183 | (existing_df["prompt"] == prompt) & 184 | (existing_df["judge_model"] == judge_model) & 185 | (existing_df["model_mapping"] == model_mapping_str) 186 | ] 187 | if not found_match.empty: 188 | logger.info(f"Skipping existing raw eval for judge={judge_model}, prompt={prompt[:40]}...") 189 | 
continue 190 | 191 | instructions = f""" 192 | You are an evaluator. Score each model's answer (1-10) in JSON format. 193 | 194 | Important! Your response MUST be a valid JSON object with the exact format: 195 | {{"Model_1": 7, "Model_2": 9}} 196 | 197 | Problem: 198 | {prompt} 199 | 200 | Answers: 201 | {answers_section} 202 | 203 | {answer_key_text} 204 | 205 | After reading each answer, assign a score from 1-10. Return your scores in JSON format ONLY without explanations. 206 | """ 207 | 208 | eval_tasks.append({ 209 | "prompt": prompt, 210 | "judge_model": judge_model, 211 | "evaluation_prompt": instructions, 212 | "model_mapping": model_mapping_str 213 | }) 214 | 215 | # If no new evaluations needed, return existing ones 216 | if not eval_tasks: 217 | logger.info("No new evaluations needed, returning existing data") 218 | return existing_df 219 | 220 | new_judgments = [] 221 | 222 | # Process all evaluation tasks individually - simpler and more reliable 223 | logger.info(f"Processing {len(eval_tasks)} evaluation tasks individually") 224 | 225 | # Group tasks by judge_model for better organization in logs 226 | judge_models = set(task["judge_model"] for task in eval_tasks) 227 | for judge_model in judge_models: 228 | model_tasks = [task for task in eval_tasks if task["judge_model"] == judge_model] 229 | logger.info(f"Processing {len(model_tasks)} evaluations for judge={judge_model}") 230 | 231 | for i, task in enumerate(model_tasks): 232 | logger.info(f"Evaluation {i+1}/{len(model_tasks)} for {judge_model}") 233 | 234 | raw_judgment = None 235 | try: 236 | # Use parallm's query_model if available (correct parameter order) 237 | if HAS_PARALLM: 238 | logger.info(f"Querying {judge_model} with evaluation prompt") 239 | raw_judgment = query_model(task["evaluation_prompt"], judge_model) 240 | elif llm is not None: 241 | logger.info(f"Querying {judge_model} via llm") 242 | judge_obj = llm.get_model(judge_model) 243 | judge_resp = judge_obj.prompt(task["evaluation_prompt"]) 244 | raw_judgment = judge_resp.text() 245 | else: 246 | # fallback mock data 247 | raw_judgment = '{"Model_1": 8, "Model_2": 6}' 248 | 249 | # Log successful query 250 | logger.info(f"Received response from {judge_model}: {raw_judgment[:50]}...") 251 | 252 | except Exception as e: 253 | logger.error(f"Error querying {judge_model}: {str(e)}") 254 | # Use fallback values on error 255 | raw_judgment = '{"Model_1": 5, "Model_2": 5}' 256 | 257 | # Add to new judgments 258 | new_judgments.append({ 259 | "prompt": task["prompt"], 260 | "judge_model": task["judge_model"], 261 | "raw_judgment": raw_judgment, 262 | "model_mapping": task["model_mapping"], 263 | "raw_judgment_token_count": len(raw_judgment.split()) if raw_judgment else 0 264 | }) 265 | 266 | new_df = pd.DataFrame(new_judgments) 267 | # Only create combined_df if there are new judgments 268 | if not new_df.empty: 269 | combined_df = pd.concat([existing_df, new_df], ignore_index=True) 270 | combined_df.drop_duplicates(subset=["prompt","judge_model","model_mapping"], keep="first", inplace=True) 271 | combined_df.to_csv(raw_eval_path, index=False) 272 | logger.info(f"Raw evaluations saved to {raw_eval_path}") 273 | return combined_df 274 | else: 275 | logger.info("No new evaluations were created") 276 | return existing_df -------------------------------------------------------------------------------- /sloprank/config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from dataclasses import dataclass, field 3 | 
from pathlib import Path 4 | from typing import List, Dict, Optional, Union, Any 5 | 6 | logging.basicConfig( 7 | level=logging.INFO, 8 | format="%(asctime)s - %(levelname)s - %(message)s" 9 | ) 10 | logger = logging.getLogger("SlopRankLogger") 11 | 12 | @dataclass 13 | class VisualizationConfig: 14 | """Configuration for graph visualization options.""" 15 | enabled: bool = True 16 | save_formats: List[str] = field(default_factory=lambda: ["png", "html", "gml"]) 17 | node_size_factor: float = 2000 18 | edge_width_factor: float = 2.0 19 | layout: str = "spring" # Options: spring, circular, kamada_kawai, spectral 20 | node_colormap: str = "viridis" 21 | edge_colormap: str = "plasma" 22 | interactive: bool = True 23 | 24 | @dataclass 25 | class ConfidenceConfig: 26 | """Configuration for confidence interval calculations.""" 27 | enabled: bool = True 28 | bootstrap_iterations: int = 1000 29 | confidence_level: float = 0.95 # e.g., 0.95 for 95% confidence interval 30 | significance_threshold: float = 0.05 # p-value threshold for significance 31 | 32 | @dataclass 33 | class WebDashboardConfig: 34 | """Configuration for the web dashboard.""" 35 | enabled: bool = False # Default to disabled 36 | host: str = "127.0.0.1" 37 | port: int = 8050 38 | debug: bool = False 39 | auto_open_browser: bool = True 40 | 41 | @dataclass 42 | class EvalConfig: 43 | """Configuration for the SlopRank evaluation system.""" 44 | # Core configuration 45 | model_names: List[str] 46 | evaluation_method: int # 1 => numeric rating, 2 => up/down (example usage) 47 | use_subset_evaluation: bool 48 | evaluators_subset_size: int 49 | output_dir: Path 50 | request_delay: float = 0.0 51 | 52 | # New features 53 | prompt_categories: Dict[str, List[str]] = field(default_factory=dict) 54 | visualization: VisualizationConfig = field(default_factory=VisualizationConfig) 55 | confidence: ConfidenceConfig = field(default_factory=ConfidenceConfig) 56 | web_dashboard: WebDashboardConfig = field(default_factory=WebDashboardConfig) 57 | 58 | # Optional metadata fields 59 | metadata: Dict[str, Any] = field(default_factory=dict) 60 | 61 | def __post_init__(self): 62 | self.output_dir.mkdir(parents=True, exist_ok=True) 63 | 64 | # Strip any whitespace from model names 65 | self.model_names = [model.strip() for model in self.model_names] 66 | 67 | if self.evaluation_method not in {1, 2}: 68 | raise ValueError("evaluation_method must be 1 or 2") 69 | if self.use_subset_evaluation and self.evaluators_subset_size >= len(self.model_names): 70 | # Automatically adjust the subset size if needed 71 | self.evaluators_subset_size = len(self.model_names) - 1 if len(self.model_names) > 1 else 1 72 | logger.warning(f"Adjusted evaluators_subset_size to {self.evaluators_subset_size}") 73 | 74 | # Create visualization directory if needed 75 | if self.visualization.enabled: 76 | vis_dir = self.output_dir / "visualizations" 77 | vis_dir.mkdir(parents=True, exist_ok=True) 78 | 79 | DEFAULT_CONFIG = EvalConfig( 80 | model_names=[ 81 | "gemini-2.5-pro-exp-03-25", 82 | "claude-3.7-sonnet-latest", 83 | "gpt-4o", 84 | "deepseek-chat" 85 | ], 86 | # model_names=[ 87 | # "gemini-2.5-pro-exp-03-25", 88 | # "claude-3.7-sonnet-latest", 89 | # "o1", 90 | # "deepseek-reasoner" 91 | # ], 92 | evaluation_method=1, # numeric 93 | use_subset_evaluation=True, 94 | evaluators_subset_size=3, 95 | output_dir=Path("results"), 96 | request_delay=0.0, 97 | # Default prompt categories (empty) 98 | prompt_categories={}, 99 | # Default visualization configuration 100 | 
visualization=VisualizationConfig( 101 | enabled=True, 102 | save_formats=["png", "html", "gml"], 103 | node_size_factor=2000, 104 | edge_width_factor=2.0, 105 | layout="spring", 106 | node_colormap="viridis", 107 | edge_colormap="plasma", 108 | interactive=True 109 | ), 110 | # Default confidence configuration 111 | confidence=ConfidenceConfig( 112 | enabled=True, 113 | bootstrap_iterations=1000, 114 | confidence_level=0.95, 115 | significance_threshold=0.05 116 | ), 117 | # Default web dashboard configuration (disabled by default) 118 | web_dashboard=WebDashboardConfig( 119 | enabled=False, 120 | host="127.0.0.1", 121 | port=8050, 122 | debug=False, 123 | auto_open_browser=True 124 | ) 125 | ) 126 | -------------------------------------------------------------------------------- /sloprank/parse.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | from .config import logger, EvalConfig 4 | 5 | def parse_evaluation_rows(raw_eval_df: pd.DataFrame, config: EvalConfig) -> pd.DataFrame: 6 | """ 7 | Convert each row's judge's JSON to numeric scores. 8 | Returns: columns = [prompt, judge_model, rated_model, score, parse_failed]. 9 | """ 10 | all_rows = [] 11 | for _, row in raw_eval_df.iterrows(): 12 | prompt = row["prompt"] 13 | judge_model = row["judge_model"] 14 | raw_judgment = row["raw_judgment"] or "" 15 | raw_judgment_tokens = row.get("raw_judgment_token_count", 0) 16 | 17 | # load model_mapping 18 | try: 19 | model_mapping = json.loads(row["model_mapping"]) 20 | except Exception as e: 21 | logger.error(f"Couldn't parse model_mapping: {e}") 22 | model_mapping = {} 23 | 24 | if not raw_judgment.strip(): 25 | # fallback 26 | for real_model in model_mapping.keys(): 27 | all_rows.append({ 28 | "prompt": prompt, 29 | "judge_model": judge_model, 30 | "rated_model": real_model, 31 | "score": 4.1, 32 | "parse_failed": True, 33 | "raw_judgment_token_count": raw_judgment_tokens 34 | }) 35 | continue 36 | 37 | # Attempt to isolate the JSON object 38 | # First try to find JSON with standard formatting 39 | start = raw_judgment.find("{") 40 | end = raw_judgment.rfind("}") + 1 41 | 42 | # If that fails, try more aggressive parsing for models that output in various formats 43 | if start == -1 or end == 0: 44 | # Look for patterns like "Model_1": 8 or "Model_1" : 8 or Model_1: 8 45 | import re 46 | json_pattern = r'[\{\s]*[\"\']?Model_\d+[\"\']?\s*:\s*\d+(?:\.\d+)?' 
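            # The pattern tolerates loose formatting, e.g. '"Model_1": 8',
            # "Model_2 : 7.5", or 'Model_3:9' embedded in surrounding prose.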
47 | if re.search(json_pattern, raw_judgment): 48 | # Try to reconstruct a proper JSON 49 | scores = {} 50 | model_score_pattern = r'[\"\']?Model_(\d+)[\"\']?\s*:\s*(\d+(?:\.\d+)?)' 51 | matches = re.findall(model_score_pattern, raw_judgment) 52 | for model_num, score in matches: 53 | scores[f"Model_{model_num}"] = float(score) 54 | 55 | if scores: 56 | logger.warning(f"Reconstructed JSON for judge={judge_model}, prompt={prompt[:40]}") 57 | try: 58 | # Convert to standard dict for consistency in later processing 59 | anon_to_real = {v: k for k,v in model_mapping.items()} 60 | for anon_id, score_val in scores.items(): 61 | real_model = anon_to_real.get(anon_id) 62 | if real_model: 63 | score_float = float(score_val) 64 | # clamp 1..10 65 | score_float = max(1.0, min(10.0, score_float)) 66 | all_rows.append({ 67 | "prompt": prompt, 68 | "judge_model": judge_model, 69 | "rated_model": real_model, 70 | "score": score_float, 71 | "parse_failed": False, 72 | "raw_judgment_token_count": raw_judgment_tokens 73 | }) 74 | continue 75 | except Exception as e: 76 | logger.error(f"Error processing reconstructed JSON: {e}") 77 | 78 | logger.error(f"No JSON found for judge={judge_model}, prompt={prompt[:40]}") 79 | # fallback 80 | for real_model in model_mapping.keys(): 81 | all_rows.append({ 82 | "prompt": prompt, 83 | "judge_model": judge_model, 84 | "rated_model": real_model, 85 | "score": 4.1, 86 | "parse_failed": True, 87 | "raw_judgment_token_count": raw_judgment_tokens 88 | }) 89 | continue 90 | 91 | try: 92 | data = json.loads(raw_judgment[start:end]) 93 | # Reverse map: "Model_1" => real model name 94 | anon_to_real = {v: k for k,v in model_mapping.items()} 95 | 96 | for anon_id, score_val in data.items(): 97 | real_model = anon_to_real.get(anon_id) 98 | if real_model: 99 | score_float = float(score_val) 100 | # clamp 1..10 101 | score_float = max(1.0, min(10.0, score_float)) 102 | all_rows.append({ 103 | "prompt": prompt, 104 | "judge_model": judge_model, 105 | "rated_model": real_model, 106 | "score": score_float, 107 | "parse_failed": False, 108 | "raw_judgment_token_count": raw_judgment_tokens 109 | }) 110 | except Exception as e: 111 | logger.error(f"Parsing error: judge={judge_model}, prompt={prompt[:40]} => {str(e)}") 112 | for real_model in model_mapping.keys(): 113 | all_rows.append({ 114 | "prompt": prompt, 115 | "judge_model": judge_model, 116 | "rated_model": real_model, 117 | "score": 4.1, 118 | "parse_failed": True, 119 | "raw_judgment_token_count": raw_judgment_tokens 120 | }) 121 | 122 | return pd.DataFrame(all_rows) 123 | -------------------------------------------------------------------------------- /sloprank/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | SlopRank utilities for visualization, confidence calculation, and dashboard generation. 3 | """ -------------------------------------------------------------------------------- /sloprank/utils/categorization.py: -------------------------------------------------------------------------------- 1 | """ 2 | Prompt categorization and category-based analysis. 3 | """ 4 | import json 5 | import pandas as pd 6 | import re 7 | from pathlib import Path 8 | from collections import defaultdict 9 | 10 | from ..config import logger 11 | 12 | def categorize_prompts(prompts_file=None, save_categorized=True): 13 | """ 14 | Read prompts from Excel file and automatically categorize them. 15 | If a 'Category' column exists, it will use those categories. 
16 |     Otherwise, it will attempt to infer categories based on content.
17 | 
18 |     Parameters:
19 |     -----------
20 |     prompts_file : Path or str
21 |         Path to the prompts CSV file
22 |     save_categorized : bool
23 |         Whether to save the categorized prompts back to a CSV file
24 | 
25 |     Returns:
26 |     --------
27 |     dict
28 |         Dictionary mapping category names to lists of prompts
29 |     """
30 |     if prompts_file is None:
31 |         prompts_file = Path("prompts.csv")
32 |     else:
33 |         prompts_file = Path(prompts_file)
34 | 
35 |     logger.info(f"Reading prompts from {prompts_file}...")
36 | 
37 |     # Read prompts from the CSV file
38 |     prompts_df = pd.read_csv(prompts_file)
39 | 
40 |     # Check if a Category column exists
41 |     if 'Category' in prompts_df.columns:
42 |         categories = defaultdict(list)
43 | 
44 |         # Group prompts by category
45 |         for _, row in prompts_df.iterrows():
46 |             if pd.notna(row['Category']) and row['Category']:
47 |                 categories[row['Category']].append(row['Questions'])
48 |             else:
49 |                 if 'Uncategorized' not in categories:
50 |                     categories['Uncategorized'] = []
51 |                 categories['Uncategorized'].append(row['Questions'])
52 | 
53 |         logger.info(f"Found {len(categories)} categories in the CSV file.")
54 |     else:
55 |         # Infer categories based on content
56 |         categories = infer_categories(prompts_df['Questions'].tolist())
57 | 
58 |         if save_categorized:
59 |             # Add inferred categories back to the DataFrame
60 |             category_map = {}
61 |             for category, prompts in categories.items():
62 |                 for prompt in prompts:
63 |                     category_map[prompt] = category
64 | 
65 |             prompts_df['Category'] = prompts_df['Questions'].map(category_map)
66 | 
67 |             # Save the categorized DataFrame back to CSV
68 |             output_path = prompts_file.with_stem(prompts_file.stem + "_categorized")
69 |             prompts_df.to_csv(output_path, index=False)
70 |             logger.info(f"Saved categorized prompts to {output_path}")
71 | 
72 |     # Return categories as a dictionary with lists of prompts
73 |     return dict(categories)
74 | 
75 | 
76 | def infer_categories(prompts):
77 |     """
78 |     Infer categories from prompt content using keyword matching.
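    For example, a prompt mentioning "algorithm" or "script" lands in
    'Coding', one mentioning "market" or "tax" lands in 'Economic', and
    prompts matching no keyword list are collected under 'Uncategorized'.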
79 | 80 | Parameters: 81 | ----------- 82 | prompts : list 83 | List of prompts to categorize 84 | 85 | Returns: 86 | -------- 87 | dict 88 | Dictionary mapping category names to lists of prompts 89 | """ 90 | logger.info("Inferring categories from prompt content...") 91 | 92 | # Define category keywords 93 | keywords = { 94 | 'Reasoning': ['reason', 'logic', 'why', 'how', 'explain', 'analyze', 'evaluate', 'assess', 'examine'], 95 | 'Creativity': ['creative', 'imagine', 'story', 'design', 'invent', 'fiction', 'innovative'], 96 | 'Knowledge': ['fact', 'define', 'what is', 'history', 'science', 'describe', 'information'], 97 | 'Coding': ['code', 'function', 'algorithm', 'program', 'script', 'implementation'], 98 | 'Opinion': ['opinion', 'believe', 'think', 'perspective', 'view', 'stance'], 99 | 'Technical': ['technical', 'engineering', 'system', 'mechanism', 'process'], 100 | 'Economic': ['economic', 'finance', 'market', 'money', 'business', 'trade', 'commerce', 'tax'], 101 | 'Medical': ['medical', 'health', 'disease', 'treatment', 'cure', 'patient', 'doctor', 'hospital'], 102 | 'Political': ['political', 'government', 'policy', 'regulation', 'law', 'legal'], 103 | 'Ethical': ['ethical', 'moral', 'right', 'wrong', 'should', 'ethics', 'values'], 104 | } 105 | 106 | # Categorize prompts 107 | categories = defaultdict(list) 108 | 109 | for prompt in prompts: 110 | prompt_lower = prompt.lower() 111 | 112 | # Try to match prompt to a category 113 | matched = False 114 | for category, terms in keywords.items(): 115 | if any(term in prompt_lower for term in terms): 116 | categories[category].append(prompt) 117 | matched = True 118 | break 119 | 120 | # If no match, add to Uncategorized 121 | if not matched: 122 | categories['Uncategorized'].append(prompt) 123 | 124 | # Count prompts per category 125 | for category, prompts in categories.items(): 126 | logger.info(f"Category '{category}': {len(prompts)} prompts") 127 | 128 | return categories 129 | 130 | 131 | def analyze_categorized_evaluations( 132 | categorized_prompts, 133 | evaluations_path=None, 134 | output_dir=None 135 | ): 136 | """ 137 | Analyze evaluations based on prompt categories. 
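    Writes category_analysis.csv (average score and evaluation count per
    model within each category) and category_rankings.json (models sorted
    by score per category) to the output directory.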
138 | 139 | Parameters: 140 | ----------- 141 | categorized_prompts : dict 142 | Dictionary mapping category names to lists of prompts 143 | evaluations_path : Path or str 144 | Path to the evaluations CSV file 145 | output_dir : Path or str 146 | Directory to save the output files 147 | 148 | Returns: 149 | -------- 150 | pd.DataFrame 151 | DataFrame with category analysis results 152 | """ 153 | if evaluations_path is None: 154 | evaluations_path = Path("results/evaluations.csv") 155 | else: 156 | evaluations_path = Path(evaluations_path) 157 | 158 | if output_dir is None: 159 | output_dir = Path("results") 160 | else: 161 | output_dir = Path(output_dir) 162 | 163 | # Create output directory if it doesn't exist 164 | output_dir.mkdir(parents=True, exist_ok=True) 165 | 166 | # Load evaluations 167 | logger.info(f"Loading evaluations from {evaluations_path}...") 168 | evals_df = pd.read_csv(evaluations_path) 169 | 170 | # Filter out failed evaluations 171 | evals_df = evals_df[evals_df["parse_failed"] == False] 172 | 173 | # Create a flat mapping of prompt -> category 174 | prompt_to_category = {} 175 | for category, prompts in categorized_prompts.items(): 176 | for prompt in prompts: 177 | prompt_to_category[prompt] = category 178 | 179 | # Add category column to evaluations DataFrame 180 | evals_df['category'] = evals_df['prompt'].map(prompt_to_category) 181 | 182 | # Calculate average scores by category and model 183 | results = [] 184 | 185 | # For each category 186 | for category in categorized_prompts.keys(): 187 | if category == 'Uncategorized': 188 | continue 189 | 190 | category_evals = evals_df[evals_df['category'] == category] 191 | 192 | if category_evals.empty: 193 | continue 194 | 195 | # For each model being rated 196 | for model in category_evals['rated_model'].unique(): 197 | model_scores = category_evals[category_evals['rated_model'] == model]['score'] 198 | avg_score = model_scores.mean() 199 | count = len(model_scores) 200 | 201 | results.append({ 202 | 'category': category, 203 | 'model': model, 204 | 'average_score': avg_score, 205 | 'evaluations_count': count 206 | }) 207 | 208 | # Create DataFrame from results 209 | results_df = pd.DataFrame(results) 210 | 211 | # Save to CSV 212 | output_path = output_dir / "category_analysis.csv" 213 | results_df.to_csv(output_path, index=False) 214 | 215 | # Generate summary 216 | logger.info("\n=== Category Analysis ===") 217 | for category in sorted(categorized_prompts.keys()): 218 | if category == 'Uncategorized': 219 | continue 220 | 221 | category_data = results_df[results_df['category'] == category] 222 | 223 | if category_data.empty: 224 | continue 225 | 226 | logger.info(f"\nCategory: {category}") 227 | sorted_models = category_data.sort_values('average_score', ascending=False) 228 | 229 | for _, row in sorted_models.iterrows(): 230 | logger.info(f" {row['model']}: {row['average_score']:.4f} (based on {row['evaluations_count']} evaluations)") 231 | 232 | logger.info(f"\nCategory analysis saved to {output_path}") 233 | 234 | # Create JSON with category rankings 235 | category_rankings = {} 236 | 237 | for category in sorted(categorized_prompts.keys()): 238 | if category == 'Uncategorized': 239 | continue 240 | 241 | category_data = results_df[results_df['category'] == category] 242 | 243 | if category_data.empty: 244 | continue 245 | 246 | sorted_models = category_data.sort_values('average_score', ascending=False) 247 | category_rankings[category] = [ 248 | {"model": row['model'], "score": float(row['average_score'])} 249 
| for _, row in sorted_models.iterrows() 250 | ] 251 | 252 | # Save category rankings to JSON 253 | rankings_path = output_dir / "category_rankings.json" 254 | with open(rankings_path, 'w') as f: 255 | json.dump(category_rankings, f, indent=2) 256 | 257 | logger.info(f"Category rankings saved to {rankings_path}") 258 | 259 | return results_df 260 | 261 | 262 | if __name__ == "__main__": 263 | # Run as a standalone script 264 | categories = categorize_prompts() 265 | analyze_categorized_evaluations(categories) -------------------------------------------------------------------------------- /sloprank/utils/commands.py: -------------------------------------------------------------------------------- 1 | """ 2 | Command-line utilities for SlopRank. 3 | """ 4 | import click 5 | import pandas as pd 6 | import json 7 | import threading 8 | import time 9 | from pathlib import Path 10 | import webbrowser 11 | 12 | from ..config import logger 13 | from ..config import VisualizationConfig 14 | from .visualization import generate_visualization 15 | 16 | # Import confidence and dashboard modules if available 17 | try: 18 | from .confidence import compute_confidence_intervals 19 | HAS_CONFIDENCE = True 20 | except ImportError: 21 | HAS_CONFIDENCE = False 22 | 23 | try: 24 | from .dashboard import generate_dashboard, start_dashboard 25 | HAS_DASHBOARD = True 26 | except ImportError: 27 | HAS_DASHBOARD = False 28 | 29 | # Import category analysis if available 30 | try: 31 | from .categorization import categorize_prompts, analyze_categorized_evaluations 32 | HAS_CATEGORIES = True 33 | except ImportError: 34 | HAS_CATEGORIES = False 35 | 36 | 37 | @click.group() 38 | def utils(): 39 | """Utility commands for SlopRank.""" 40 | pass 41 | 42 | 43 | @utils.command() 44 | @click.option("--rankings", default="results/rankings.json", help="Path to rankings JSON file") 45 | @click.option("--evaluations", default="results/evaluations.csv", help="Path to evaluations CSV file") 46 | @click.option("--output-dir", default="results/visualizations", help="Output directory for visualizations") 47 | @click.option("--layout", default="spring", help="Graph layout [spring, circular, kamada_kawai, spectral]") 48 | @click.option("--interactive/--no-interactive", default=True, help="Generate interactive HTML visualization") 49 | def visualize(rankings, evaluations, output_dir, layout, interactive): 50 | """Generate visualizations for the SlopRank endorsement graph.""" 51 | vis_config = VisualizationConfig( 52 | enabled=True, 53 | interactive=interactive, 54 | layout=layout 55 | ) 56 | try: 57 | generate_visualization( 58 | rankings_path=rankings, 59 | evaluations_path=evaluations, 60 | output_dir=output_dir, 61 | vis_config=vis_config 62 | ) 63 | click.echo(f"Visualizations generated in {output_dir}") 64 | except Exception as e: 65 | click.echo(f"Error generating visualizations: {e}", err=True) 66 | 67 | 68 | @utils.command() 69 | @click.option("--evaluations", default="results/evaluations.csv", help="Path to evaluations CSV file") 70 | @click.option("--output", default="results/confidence_stats.json", help="Output file for confidence data") 71 | @click.option("--iterations", default=500, help="Number of bootstrap iterations") 72 | @click.option("--confidence-level", default=0.95, help="Confidence level (0.0-1.0)") 73 | def confidence(evaluations, output, iterations, confidence_level): 74 | """Compute confidence intervals for SlopRank rankings.""" 75 | if not HAS_CONFIDENCE: 76 | click.echo("Confidence module not available. 
Install numpy to use this feature.", err=True) 77 | return 78 | 79 | try: 80 | from .confidence import compute_confidence_intervals 81 | stats = compute_confidence_intervals( 82 | evaluations_path=evaluations, 83 | output_path=output, 84 | iterations=iterations, 85 | confidence_level=confidence_level 86 | ) 87 | click.echo(f"Confidence statistics saved to {output}") 88 | except Exception as e: 89 | click.echo(f"Error computing confidence intervals: {e}", err=True) 90 | 91 | 92 | @utils.command() 93 | @click.option("--prompts", default="prompts.csv", help="Path to prompts Excel file") 94 | @click.option("--evaluations", default="results/evaluations.csv", help="Path to evaluations CSV file") 95 | @click.option("--output-dir", default="results", help="Output directory for category analysis") 96 | def categorize(prompts, evaluations, output_dir): 97 | """Categorize prompts and analyze model performance by category.""" 98 | if not HAS_CATEGORIES: 99 | click.echo("Categorization module not available.", err=True) 100 | return 101 | 102 | try: 103 | from .categorization import categorize_prompts, analyze_categorized_evaluations 104 | 105 | output_dir = Path(output_dir) 106 | output_dir.mkdir(exist_ok=True, parents=True) 107 | 108 | # Categorize prompts 109 | categories = categorize_prompts(prompts_file=prompts) 110 | 111 | # Analyze performance by category 112 | analyze_categorized_evaluations( 113 | categorized_prompts=categories, 114 | evaluations_path=evaluations, 115 | output_dir=output_dir 116 | ) 117 | 118 | click.echo(f"Category analysis saved to {output_dir / 'category_rankings.json'}") 119 | except Exception as e: 120 | click.echo(f"Error categorizing prompts: {e}", err=True) 121 | 122 | 123 | @utils.command() 124 | @click.option("--rankings", default="results/rankings.json", help="Path to rankings JSON file") 125 | @click.option("--confidence", default="results/confidence_stats.json", help="Path to confidence stats JSON") 126 | @click.option("--categories", default="results/category_rankings.json", help="Path to category rankings JSON") 127 | @click.option("--graph", default="results/visualizations/endorsement_graph.png", help="Path to graph visualization") 128 | @click.option("--output", default="results/dashboard.html", help="Output path for dashboard HTML") 129 | def dashboard(rankings, confidence, categories, graph, output): 130 | """Generate HTML dashboard for SlopRank results.""" 131 | if not HAS_DASHBOARD: 132 | click.echo("Dashboard module not available.", err=True) 133 | return 134 | 135 | try: 136 | from .dashboard import generate_dashboard 137 | 138 | dashboard_path = generate_dashboard( 139 | rankings_path=rankings, 140 | confidence_path=confidence if Path(confidence).exists() else None, 141 | categories_path=categories if Path(categories).exists() else None, 142 | graph_path=graph if Path(graph).exists() else None, 143 | output_path=output 144 | ) 145 | 146 | click.echo(f"Dashboard generated at {dashboard_path}") 147 | except Exception as e: 148 | click.echo(f"Error generating dashboard: {e}", err=True) 149 | 150 | 151 | @utils.command() 152 | @click.option("--dashboard", default="results/dashboard.html", help="Path to dashboard HTML file") 153 | @click.option("--port", default=8000, help="Port for the web server") 154 | @click.option("--no-browser", is_flag=True, help="Don't open browser automatically") 155 | def serve(dashboard, port, no_browser): 156 | """Start a web server to view the SlopRank dashboard.""" 157 | try: 158 | from http.server import HTTPServer, 
SimpleHTTPRequestHandler 159 | 160 | dashboard_path = Path(dashboard) 161 | if not dashboard_path.exists(): 162 | click.echo(f"Dashboard file not found: {dashboard_path}", err=True) 163 | return 164 | 165 | # Start server 166 | server_address = ('', port) 167 | httpd = HTTPServer(server_address, SimpleHTTPRequestHandler) 168 | 169 | # Start server in a separate thread 170 | server_thread = threading.Thread(target=httpd.serve_forever) 171 | server_thread.daemon = True 172 | server_thread.start() 173 | 174 | url = f"http://localhost:{port}/{dashboard}" 175 | click.echo(f"Server started at {url}") 176 | 177 | # Open browser 178 | if not no_browser: 179 | webbrowser.open(url) 180 | 181 | # Keep the main thread alive 182 | try: 183 | while True: 184 | time.sleep(1) 185 | except KeyboardInterrupt: 186 | click.echo("Shutting down server...") 187 | httpd.shutdown() 188 | 189 | except Exception as e: 190 | click.echo(f"Error starting server: {e}", err=True) 191 | 192 | 193 | def register_utils_commands(cli): 194 | """Register utility commands with the main CLI.""" 195 | cli.add_command(utils) -------------------------------------------------------------------------------- /sloprank/utils/confidence.py: -------------------------------------------------------------------------------- 1 | """ 2 | Confidence interval calculation for SlopRank rankings. 3 | """ 4 | import json 5 | import pandas as pd 6 | import numpy as np 7 | import networkx as nx 8 | from pathlib import Path 9 | 10 | from ..config import logger 11 | 12 | def compute_confidence_intervals( 13 | evaluations_path=None, 14 | output_path=None, 15 | iterations=500, 16 | confidence_level=0.95 17 | ): 18 | """ 19 | Compute confidence intervals for model rankings using bootstrap resampling. 20 | 21 | Parameters: 22 | ----------- 23 | evaluations_path : Path or str 24 | Path to the evaluations CSV file 25 | output_path : Path or str 26 | Path for the output JSON file 27 | iterations : int 28 | Number of bootstrap iterations 29 | confidence_level : float 30 | Confidence level (0.0-1.0) 31 | 32 | Returns: 33 | -------- 34 | dict 35 | Confidence statistics 36 | """ 37 | if evaluations_path is None: 38 | evaluations_path = Path("results/evaluations.csv") 39 | else: 40 | evaluations_path = Path(evaluations_path) 41 | 42 | if output_path is None: 43 | output_path = Path("results/confidence_stats.json") 44 | else: 45 | output_path = Path(output_path) 46 | 47 | # Create output directory if it doesn't exist 48 | output_path.parent.mkdir(parents=True, exist_ok=True) 49 | 50 | logger.info(f"Computing confidence intervals using {iterations} bootstrap iterations...") 51 | 52 | # Load evaluations 53 | evals_df = pd.read_csv(evaluations_path) 54 | 55 | # Filter out failed evaluations 56 | evals_df = evals_df[evals_df["parse_failed"] == False] 57 | 58 | # Get unique models 59 | models = list(set(evals_df["judge_model"].unique()) | set(evals_df["rated_model"].unique())) 60 | 61 | # Store bootstrap results 62 | bootstrap_results = {model: [] for model in models} 63 | 64 | # Run bootstrap iterations 65 | for i in range(iterations): 66 | if i % 100 == 0: 67 | logger.info(f"Bootstrap iteration {i}/{iterations}...") 68 | 69 | # Resample evaluations with replacement 70 | sampled_evals = evals_df.sample(frac=1.0, replace=True) 71 | 72 | # Build graph from resampled data 73 | G = nx.DiGraph() 74 | G.add_nodes_from(models) 75 | 76 | for _, row in sampled_evals.iterrows(): 77 | judge = row["judge_model"] 78 | rated = row["rated_model"] 79 | score = float(row["score"]) 80 | 
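            # Accumulate endorsement weight on the judge -> rated edge; repeated
            # judgments between the same pair strengthen that edge in this
            # resampled graph before PageRank is recomputed.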
81 | if G.has_edge(judge, rated): 82 | G[judge][rated]["weight"] += score 83 | else: 84 | G.add_edge(judge, rated, weight=score) 85 | 86 | # Compute PageRank 87 | if len(G.edges) > 0: 88 | scores = nx.pagerank(G, weight="weight") 89 | 90 | # Store scores 91 | for model, score in scores.items(): 92 | bootstrap_results[model].append(score) 93 | 94 | # Calculate confidence intervals (95%) 95 | confidence_stats = {} 96 | alpha = 1.0 - confidence_level 97 | 98 | for model in models: 99 | if not bootstrap_results[model]: 100 | confidence_stats[model] = { 101 | "mean": 0.0, 102 | "lower_bound": 0.0, 103 | "upper_bound": 0.0, 104 | "std_dev": 0.0 105 | } 106 | continue 107 | 108 | sorted_scores = sorted(bootstrap_results[model]) 109 | lower_idx = int(alpha/2 * len(sorted_scores)) 110 | upper_idx = int((1-alpha/2) * len(sorted_scores)) 111 | 112 | confidence_stats[model] = { 113 | "mean": float(np.mean(sorted_scores)), 114 | "lower_bound": float(sorted_scores[max(0, lower_idx)]), 115 | "upper_bound": float(sorted_scores[min(len(sorted_scores)-1, upper_idx)]), 116 | "std_dev": float(np.std(sorted_scores)) 117 | } 118 | 119 | # Test statistical significance 120 | significance_results = {} 121 | 122 | # Create sorted list of models by mean score 123 | models_by_score = sorted( 124 | [(model, stats["mean"]) for model, stats in confidence_stats.items()], 125 | key=lambda x: x[1], 126 | reverse=True 127 | ) 128 | 129 | # Compare each adjacent pair in the ranking 130 | for i in range(len(models_by_score) - 1): 131 | model1, _ = models_by_score[i] 132 | model2, _ = models_by_score[i + 1] 133 | 134 | # Determine if significant based on confidence intervals 135 | is_significant = ( 136 | confidence_stats[model1]["lower_bound"] > confidence_stats[model2]["upper_bound"] or 137 | confidence_stats[model2]["lower_bound"] > confidence_stats[model1]["upper_bound"] 138 | ) 139 | 140 | significance_results[f"{model1}_vs_{model2}"] = is_significant 141 | 142 | # Save results 143 | results = { 144 | "confidence_intervals": confidence_stats, 145 | "significance": significance_results, 146 | "metadata": { 147 | "iterations": iterations, 148 | "confidence_level": confidence_level 149 | } 150 | } 151 | 152 | with open(output_path, "w") as f: 153 | json.dump(results, f, indent=2) 154 | 155 | # Print summary 156 | logger.info("\n=== Confidence Intervals ===") 157 | for model, stats in sorted(confidence_stats.items(), key=lambda x: x[1]["mean"], reverse=True): 158 | logger.info(f"{model}: {stats['mean']:.6f} [{stats['lower_bound']:.6f}, {stats['upper_bound']:.6f}]") 159 | 160 | logger.info("\n=== Statistical Significance ===") 161 | for pair, is_significant in significance_results.items(): 162 | significance_str = "Significant" if is_significant else "Not significant" 163 | logger.info(f"{pair}: {significance_str}") 164 | 165 | logger.info(f"Confidence statistics saved to {output_path}") 166 | 167 | return confidence_stats 168 | 169 | 170 | if __name__ == "__main__": 171 | # Run as a standalone script 172 | compute_confidence_intervals() -------------------------------------------------------------------------------- /sloprank/utils/dashboard.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dashboard generation for SlopRank results. 
3 | """ 4 | import json 5 | import pandas as pd 6 | import webbrowser 7 | import threading 8 | import time 9 | from pathlib import Path 10 | from http.server import HTTPServer, SimpleHTTPRequestHandler 11 | 12 | from ..config import logger 13 | 14 | def generate_dashboard( 15 | rankings_path=None, 16 | confidence_path=None, 17 | categories_path=None, 18 | graph_path=None, 19 | output_path=None 20 | ): 21 | """ 22 | Generate an HTML dashboard for SlopRank results. 23 | 24 | Parameters: 25 | ----------- 26 | rankings_path : Path or str 27 | Path to the rankings JSON file 28 | confidence_path : Path or str 29 | Path to the confidence stats JSON file 30 | categories_path : Path or str 31 | Path to the category rankings JSON file 32 | graph_path : Path or str 33 | Path to the graph visualization image 34 | output_path : Path or str 35 | Path to save the dashboard HTML file 36 | 37 | Returns: 38 | -------- 39 | Path 40 | Path to the generated dashboard HTML file 41 | """ 42 | if rankings_path is None: 43 | rankings_path = Path("results/rankings.json") 44 | else: 45 | rankings_path = Path(rankings_path) 46 | 47 | if output_path is None: 48 | output_path = Path("results/dashboard.html") 49 | else: 50 | output_path = Path(output_path) 51 | 52 | # Create output directory if it doesn't exist 53 | output_path.parent.mkdir(parents=True, exist_ok=True) 54 | 55 | # Load rankings data 56 | with open(rankings_path, 'r') as f: 57 | rankings_data = json.load(f) 58 | 59 | # Load confidence data if available 60 | has_confidence = confidence_path is not None and Path(confidence_path).exists() 61 | confidence_data = None 62 | if has_confidence: 63 | with open(confidence_path, 'r') as f: 64 | confidence_data = json.load(f) 65 | 66 | # Load category rankings if available 67 | has_categories = categories_path is not None and Path(categories_path).exists() 68 | category_data = None 69 | if has_categories: 70 | with open(categories_path, 'r') as f: 71 | category_data = json.load(f) 72 | 73 | # Check if graph visualization is available 74 | has_graph = graph_path is not None and Path(graph_path).exists() 75 | 76 | # Generate HTML 77 | html = f""" 78 | 79 | 80 | 81 | 82 | 83 | SlopRank Dashboard 84 | 153 | 154 | 155 |
156 |

SlopRank Dashboard

157 | 158 |

Model Rankings

159 | 160 | 161 | 162 | 163 | 164 | 165 | """ 166 | 167 | if has_confidence: 168 | html += """ 169 | 170 | """ 171 | 172 | html += """ 173 | 174 | """ 175 | 176 | # Add rows for each model 177 | if isinstance(rankings_data['rankings'][0], list): 178 | # Old format with list of lists 179 | ranked_items = rankings_data["rankings"] 180 | max_score = max([score for _, score in ranked_items]) 181 | else: 182 | # New format with list of dicts 183 | ranked_items = [(item["model"], item["score"]) for item in rankings_data["rankings"]] 184 | max_score = max([item["score"] for item in rankings_data["rankings"]]) 185 | 186 | for i, (model, score) in enumerate(ranked_items): 187 | bar_width = int(300 * score / max_score) 188 | confidence_html = "" 189 | 190 | if has_confidence and model in confidence_data["confidence_intervals"]: 191 | ci = confidence_data["confidence_intervals"][model] 192 | lower_pct = int(300 * ci["lower_bound"] / max_score) 193 | upper_pct = int(300 * ci["upper_bound"] / max_score) 194 | mean_pct = int(300 * ci["mean"] / max_score) 195 | 196 | confidence_html = f""" 197 | 204 | """ 205 | 206 | html += f""" 207 | 208 | 209 | 210 | 211 | 216 | {confidence_html} 217 | 218 | """ 219 | 220 | html += """ 221 |
RankModelScoreVisualizationConfidence Interval
198 |
199 |
200 |
201 |
202 | {ci["mean"]:.6f} [{ci["lower_bound"]:.6f}, {ci["upper_bound"]:.6f}] 203 |
{i+1}{model}{score:.6f} 212 |
213 |
214 |
215 |
222 | """ 223 | 224 | # Add statistical significance if available 225 | if has_confidence and confidence_data.get("significance"): 226 | html += """ 227 |

Statistical Significance

228 | 229 | 230 | 231 | 232 | 233 | """ 234 | 235 | for pair, is_significant in confidence_data["significance"].items(): 236 | significance_str = "Significant" if is_significant else "Not significant" 237 | html += f""" 238 | 239 | 240 | 241 | 242 | """ 243 | 244 | html += """ 245 |
ComparisonSignificance
{pair}{significance_str}
246 | """ 247 | 248 | # Add category rankings if available 249 | if has_categories and category_data: 250 | html += """ 251 |

Rankings by Category

252 | """ 253 | 254 | for category, models in sorted(category_data.items()): 255 | max_score = max([item["score"] for item in models]) 256 | 257 | html += f""" 258 |

{category}

259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | """ 267 | 268 | for i, item in enumerate(models): 269 | model = item["model"] 270 | score = item["score"] 271 | bar_width = int(300 * score / max_score) 272 | 273 | html += f""" 274 | 275 | 276 | 277 | 278 | 283 | 284 | """ 285 | 286 | html += """ 287 |
RankModelScoreVisualization
{i+1}{model}{score:.4f} 279 |
280 |
281 |
282 |
288 | """ 289 | 290 | # Add graph visualization if available 291 | if has_graph: 292 | rel_path = str(Path(graph_path).relative_to(Path.cwd())) 293 | html += f""" 294 |

Endorsement Graph

295 |
296 | Endorsement Graph 297 |
298 | """ 299 | 300 | # Add metadata 301 | html += f""" 302 |
303 | Generated with SlopRank v{rankings_data['metadata'].get('version', '0.2.1')}
304 | Timestamp: {rankings_data['metadata'].get('timestamp', '')}
305 | 306 | 
307 | 308 | 309 | """ 310 | 311 | # Save HTML to file 312 | with open(output_path, 'w') as f: 313 | f.write(html) 314 | 315 | logger.info(f"Dashboard generated at {output_path}") 316 | return output_path 317 | 318 | 319 | def start_dashboard(dashboard_path=None, port=8000, open_browser=True): 320 | """ 321 | Start a web server to view the SlopRank dashboard. 322 | 323 | Parameters: 324 | ----------- 325 | dashboard_path : Path or str 326 | Path to the dashboard HTML file 327 | port : int 328 | Port for the web server 329 | open_browser : bool 330 | Whether to open a browser window automatically 331 | 332 | Returns: 333 | -------- 334 | HTTPServer 335 | The server instance 336 | """ 337 | if dashboard_path is None: 338 | dashboard_path = Path("results/dashboard.html") 339 | else: 340 | dashboard_path = Path(dashboard_path) 341 | 342 | if not dashboard_path.exists(): 343 | logger.error(f"Dashboard file not found: {dashboard_path}") 344 | return None 345 | 346 | # Start server 347 | server_address = ('', port) 348 | httpd = HTTPServer(server_address, SimpleHTTPRequestHandler) 349 | 350 | # Start server in a separate thread 351 | server_thread = threading.Thread(target=httpd.serve_forever) 352 | server_thread.daemon = True 353 | server_thread.start() 354 | 355 | url = f"http://localhost:{port}/{dashboard_path}" 356 | logger.info(f"Server started at {url}") 357 | 358 | # Open browser 359 | if open_browser: 360 | webbrowser.open(url) 361 | 362 | return httpd 363 | 364 | 365 | if __name__ == "__main__": 366 | # Run as a standalone script 367 | dashboard_path = generate_dashboard() 368 | httpd = start_dashboard(dashboard_path) 369 | 370 | try: 371 | while True: 372 | time.sleep(1) 373 | except KeyboardInterrupt: 374 | logger.info("Shutting down server...") 375 | httpd.shutdown() -------------------------------------------------------------------------------- /sloprank/utils/visualization.py: -------------------------------------------------------------------------------- 1 | """ 2 | Graph visualization for SlopRank endorsement networks. 3 | """ 4 | import json 5 | import pandas as pd 6 | import numpy as np 7 | import networkx as nx 8 | from pathlib import Path 9 | 10 | # Try importing visualization libraries 11 | try: 12 | import matplotlib.pyplot as plt 13 | import matplotlib.cm as cm 14 | HAS_MATPLOTLIB = True 15 | except ImportError: 16 | HAS_MATPLOTLIB = False 17 | 18 | try: 19 | import plotly.graph_objects as go 20 | HAS_PLOTLY = True 21 | except ImportError: 22 | HAS_PLOTLY = False 23 | 24 | from ..config import logger 25 | 26 | 27 | def generate_visualization( 28 | rankings_path=None, 29 | evaluations_path=None, 30 | output_dir=None, 31 | vis_config=None 32 | ): 33 | """ 34 | Generate visualizations of the SlopRank endorsement graph. 
35 | 36 | Parameters: 37 | ----------- 38 | rankings_path : Path or str 39 | Path to the rankings.json file 40 | evaluations_path : Path or str 41 | Path to the evaluations.csv file 42 | output_dir : Path or str 43 | Directory to save visualizations 44 | vis_config : VisualizationConfig 45 | Configuration for visualizations 46 | 47 | Returns: 48 | -------- 49 | tuple 50 | Paths to generated visualization files 51 | """ 52 | if rankings_path is None: 53 | rankings_path = Path("results/rankings.json") 54 | else: 55 | rankings_path = Path(rankings_path) 56 | 57 | if evaluations_path is None: 58 | evaluations_path = Path("results/evaluations.csv") 59 | else: 60 | evaluations_path = Path(evaluations_path) 61 | 62 | if output_dir is None: 63 | output_dir = Path("results/visualizations") 64 | else: 65 | output_dir = Path(output_dir) 66 | 67 | # Ensure output directory exists 68 | output_dir.mkdir(parents=True, exist_ok=True) 69 | 70 | # Load rankings 71 | with open(rankings_path, 'r') as f: 72 | rankings_data = json.load(f) 73 | 74 | # Extract pagerank scores 75 | if isinstance(rankings_data['rankings'][0], list): 76 | # Old format with list of lists 77 | pagerank_scores = {model: score for model, score in rankings_data["rankings"]} 78 | else: 79 | # New format with list of dicts 80 | pagerank_scores = {item["model"]: item["score"] for item in rankings_data["rankings"]} 81 | 82 | # Load evaluations 83 | evals_df = pd.read_csv(evaluations_path) 84 | 85 | # Filter out failed evaluations 86 | evals_df = evals_df[evals_df["parse_failed"] == False] 87 | 88 | # Build graph 89 | G = nx.DiGraph() 90 | 91 | # Add nodes from rankings 92 | for model, score in pagerank_scores.items(): 93 | G.add_node(model, pagerank=score) 94 | 95 | # Add edges from evaluations 96 | for _, row in evals_df.iterrows(): 97 | judge = row["judge_model"] 98 | rated = row["rated_model"] 99 | score = float(row["score"]) 100 | 101 | if G.has_edge(judge, rated): 102 | G[judge][rated]["weight"] += score 103 | else: 104 | G.add_edge(judge, rated, weight=score) 105 | 106 | # Normalize edge weights for visualization 107 | max_weight = max([G[u][v]["weight"] for u, v in G.edges()]) 108 | for u, v in G.edges(): 109 | G[u][v]["normalized_weight"] = G[u][v]["weight"] / max_weight 110 | 111 | # Save graph in GML format 112 | gml_path = output_dir / "endorsement_graph.gml" 113 | nx.write_gml(G, gml_path) 114 | logger.info(f"Saved graph in GML format to {gml_path}") 115 | 116 | # Generate static visualization if matplotlib is available 117 | png_path = None 118 | if HAS_MATPLOTLIB: 119 | png_path = output_dir / "endorsement_graph.png" 120 | generate_static_visualization(G, pagerank_scores, png_path, vis_config) 121 | logger.info(f"Saved static visualization to {png_path}") 122 | 123 | # Generate interactive visualization if plotly is available 124 | html_path = None 125 | if HAS_PLOTLY and (vis_config is None or vis_config.interactive): 126 | html_path = output_dir / "endorsement_graph.html" 127 | generate_interactive_visualization(G, pagerank_scores, html_path, vis_config) 128 | logger.info(f"Saved interactive visualization to {html_path}") 129 | 130 | return gml_path, png_path, html_path 131 | 132 | 133 | def generate_static_visualization(G, pagerank_scores, output_path, vis_config=None): 134 | """ 135 | Generate a static visualization of the endorsement graph using matplotlib. 136 | """ 137 | if not HAS_MATPLOTLIB: 138 | logger.warning("Matplotlib not found. 
Cannot generate static visualization.") 139 | return 140 | 141 | # Node size factor, edge width factor, color maps, etc. 142 | node_size_factor = 2000 143 | edge_width_factor = 2.0 144 | node_colormap = 'viridis' 145 | edge_colormap = 'plasma' 146 | 147 | if vis_config is not None: 148 | node_size_factor = vis_config.node_size_factor 149 | edge_width_factor = vis_config.edge_width_factor 150 | node_colormap = vis_config.node_colormap 151 | edge_colormap = vis_config.edge_colormap 152 | 153 | try: 154 | # Calculate position using spring layout 155 | layout_func = nx.spring_layout 156 | if vis_config is not None and hasattr(vis_config, 'layout'): 157 | if vis_config.layout == 'circular': 158 | layout_func = nx.circular_layout 159 | elif vis_config.layout == 'kamada_kawai': 160 | layout_func = nx.kamada_kawai_layout 161 | elif vis_config.layout == 'spectral': 162 | layout_func = nx.spectral_layout 163 | 164 | pos = layout_func(G, seed=42) 165 | 166 | # Create figure 167 | plt.figure(figsize=(12, 10)) 168 | 169 | # Draw nodes 170 | node_sizes = [pagerank_scores.get(node, 0.01) * node_size_factor for node in G.nodes()] 171 | node_colors = [pagerank_scores.get(node, 0.0) for node in G.nodes()] 172 | 173 | nx.draw_networkx_nodes( 174 | G, pos, 175 | node_size=node_sizes, 176 | node_color=node_colors, 177 | cmap=plt.cm.get_cmap(node_colormap), 178 | alpha=0.8 179 | ) 180 | 181 | # Draw edges 182 | edge_widths = [G[u][v].get('normalized_weight', 0.1) * edge_width_factor for u, v in G.edges()] 183 | 184 | nx.draw_networkx_edges( 185 | G, pos, 186 | width=edge_widths, 187 | alpha=0.6, 188 | edge_color=range(len(G.edges())), 189 | edge_cmap=plt.cm.get_cmap(edge_colormap), 190 | arrows=True, 191 | arrowsize=20, 192 | arrowstyle='-|>' 193 | ) 194 | 195 | # Draw labels 196 | nx.draw_networkx_labels( 197 | G, pos, 198 | font_size=12, 199 | font_weight='bold' 200 | ) 201 | 202 | # Add title 203 | plt.title("LLM Endorsement Graph (Node size = PageRank score, Edge width = Endorsement strength)") 204 | plt.axis('off') 205 | 206 | # Save the figure 207 | plt.tight_layout() 208 | plt.savefig(output_path, dpi=300, bbox_inches='tight') 209 | plt.close() 210 | 211 | except Exception as e: 212 | logger.error(f"Error generating static visualization: {e}") 213 | 214 | 215 | def generate_interactive_visualization(G, pagerank_scores, output_path, vis_config=None): 216 | """ 217 | Generate an interactive visualization of the endorsement graph using Plotly. 218 | """ 219 | if not HAS_PLOTLY: 220 | logger.warning("Plotly not found. Cannot generate interactive visualization.") 221 | return 222 | 223 | # Node size factor, edge width factor, color maps, etc. 
224 | node_size_factor = 2000 225 | edge_width_factor = 2.0 226 | node_colormap = 'Viridis' 227 | 228 | if vis_config is not None: 229 | node_size_factor = vis_config.node_size_factor 230 | edge_width_factor = vis_config.edge_width_factor 231 | node_colormap = vis_config.node_colormap 232 | 233 | try: 234 | # Calculate position using spring layout 235 | layout_func = nx.spring_layout 236 | if vis_config is not None and hasattr(vis_config, 'layout'): 237 | if vis_config.layout == 'circular': 238 | layout_func = nx.circular_layout 239 | elif vis_config.layout == 'kamada_kawai': 240 | layout_func = nx.kamada_kawai_layout 241 | elif vis_config.layout == 'spectral': 242 | layout_func = nx.spectral_layout 243 | 244 | pos = layout_func(G, seed=42) 245 | 246 | # Create edge traces 247 | edge_traces = [] 248 | for edge in G.edges(): 249 | source, target = edge 250 | source_pos = pos[source] 251 | target_pos = pos[target] 252 | weight = G[source][target].get('weight', 1.0) 253 | 254 | # Calculate line transparency and width based on weight 255 | width = max(1, min(10, weight / 5)) 256 | opacity = min(1.0, max(0.3, weight / 10.0)) 257 | 258 | # Create edge line 259 | edge_trace = go.Scatter( 260 | x=[source_pos[0], target_pos[0]], 261 | y=[source_pos[1], target_pos[1]], 262 | line=dict(width=width, color=f'rgba(150, 150, 150, {opacity})'), 263 | hoverinfo='text', 264 | text=f"{source} → {target}
Weight: {weight:.2f}", 265 | mode='lines+markers', 266 | marker=dict(size=0), 267 | showlegend=False 268 | ) 269 | edge_traces.append(edge_trace) 270 | 271 | # Create arrowhead 272 | # Simple approximation of arrow position (80% along the edge) 273 | arrow_x = source_pos[0] * 0.2 + target_pos[0] * 0.8 274 | arrow_y = source_pos[1] * 0.2 + target_pos[1] * 0.8 275 | 276 | arrow_trace = go.Scatter( 277 | x=[arrow_x], 278 | y=[arrow_y], 279 | mode='markers', 280 | marker=dict( 281 | symbol='triangle-right', 282 | size=10, 283 | color=f'rgba(150, 150, 150, {opacity})', 284 | angle=np.degrees(np.arctan2( 285 | target_pos[1] - source_pos[1], 286 | target_pos[0] - source_pos[0] 287 | )) 288 | ), 289 | hoverinfo='none', 290 | showlegend=False 291 | ) 292 | edge_traces.append(arrow_trace) 293 | 294 | # Create node trace 295 | node_trace = go.Scatter( 296 | x=[pos[node][0] for node in G.nodes()], 297 | y=[pos[node][1] for node in G.nodes()], 298 | mode='markers+text', 299 | text=[node for node in G.nodes()], 300 | textposition="top center", 301 | hoverinfo='text', 302 | hovertext=[f"{node}
PageRank: {pagerank_scores.get(node, 0):.4f}" for node in G.nodes()], 303 | marker=dict( 304 | showscale=True, 305 | colorscale=node_colormap, 306 | color=[pagerank_scores.get(node, 0) for node in G.nodes()], 307 | size=[pagerank_scores.get(node, 0.01) * node_size_factor / 10 for node in G.nodes()], 308 | colorbar=dict( 309 | thickness=15, 310 | title=dict( 311 | text='PageRank Score', 312 | side='right' 313 | ), 314 | xanchor='left' 315 | ), 316 | line=dict(width=2) 317 | ) 318 | ) 319 | 320 | # Create figure 321 | fig = go.Figure( 322 | data=edge_traces + [node_trace], 323 | layout=go.Layout( 324 | title='Interactive LLM Endorsement Graph', 325 | titlefont=dict(size=16), 326 | showlegend=False, 327 | hovermode='closest', 328 | margin=dict(b=20, l=5, r=5, t=40), 329 | xaxis=dict(showgrid=False, zeroline=False, showticklabels=False), 330 | yaxis=dict(showgrid=False, zeroline=False, showticklabels=False), 331 | height=600, 332 | annotations=[ 333 | dict( 334 | text="Node size = PageRank score
Edge width = Endorsement strength",
335 |                         showarrow=False,
336 |                         xref="paper", yref="paper",
337 |                         x=0.01, y=-0.05
338 |                     )
339 |                 ]
340 |             )
341 |         )
342 | 
343 |         # Save to HTML file
344 |         fig.write_html(output_path)
345 | 
346 |     except Exception as e:
347 |         logger.error(f"Error generating interactive visualization: {e}")
348 | 
349 | 
350 | if __name__ == "__main__":
351 |     # Run as a standalone script
352 |     generate_visualization()
--------------------------------------------------------------------------------
/tests/README.md:
--------------------------------------------------------------------------------
1 | # SlopRank Tests
2 | 
3 | This directory contains test files for the SlopRank library.
4 | 
5 | ## Test Files
6 | 
7 | | File | Description |
8 | |------|-------------|
9 | | `test_sloprank.py` | Simple end-to-end test for the SlopRank library |
10 | | `tiny_prompts.csv` | Minimal test prompts with just 2 simple questions |
11 | | `mini_prompts.csv` | Small test prompts with 3 more comprehensive questions |
12 | 
13 | ## Running Tests
14 | 
15 | To run the basic test:
16 | 
17 | ```bash
18 | python test_sloprank.py
19 | ```
20 | 
21 | ### Test Process
22 | 
23 | The test will automatically:
24 | 1. Create a test output directory (`test_results/`)
25 | 2. Collect responses from configured models
26 | 3. Collect evaluations between models
27 | 4. Parse evaluations
28 | 5. Build the endorsement graph
29 | 6. Compute the PageRank scores
30 | 7. Output the final rankings
31 | 
32 | > **Note:** The full test may take several minutes to complete due to the time required for API calls to language models.
33 | 
34 | ## Test Configuration
35 | 
36 | The test script uses a simple configuration with:
37 | - 3 models: deepseek-chat, claude-3.5-haiku, and gpt-4o
38 | - Simple factual questions to ensure fast responses
39 | - Full evaluation (all models evaluate each other)
40 | 
41 | You can modify the test script to use different models, prompts, or evaluation settings, as sketched below.
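A minimal sketch of such a customization, assuming only the `EvalConfig` fields already used in `test_sloprank.py`; the model names and settings here are placeholders to adapt:

```python
from pathlib import Path
from sloprank.config import EvalConfig

config = EvalConfig(
    model_names=["deepseek-chat", "claude-3.5-haiku", "gpt-4o"],  # swap in the models to compare
    evaluation_method=1,             # numeric scoring
    use_subset_evaluation=True,      # have each response judged by a subset of models
    evaluators_subset_size=2,        # size of that subset
    output_dir=Path(__file__).parent / "test_results",
    request_delay=0.0,               # delay between requests
)
```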
-------------------------------------------------------------------------------- /tests/test_sloprank.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple test script for SlopRank 3 | """ 4 | import pandas as pd 5 | import json 6 | from pathlib import Path 7 | from sloprank.config import EvalConfig, VisualizationConfig 8 | from sloprank.collect import collect_responses, collect_raw_evaluations 9 | from sloprank.parse import parse_evaluation_rows 10 | from sloprank.rank import build_endorsement_graph, compute_pagerank, finalize_rankings 11 | 12 | # Use existing tiny_prompts.csv file 13 | prompts_file = Path(__file__).parent / "tiny_prompts.csv" 14 | test_df = pd.read_csv(prompts_file) 15 | prompts = test_df["Questions"].tolist() 16 | 17 | # Define a simple test configuration 18 | config = EvalConfig( 19 | model_names=["deepseek-chat", "claude-3.5-haiku", "gpt-4o"], 20 | evaluation_method=1, # numeric 21 | use_subset_evaluation=False, # All models evaluate each other 22 | evaluators_subset_size=2, # This will be ignored since subset_evaluation is False 23 | output_dir=Path(__file__).parent / "test_results", 24 | request_delay=0.0 25 | ) 26 | 27 | # Create output directory 28 | config.output_dir.mkdir(exist_ok=True) 29 | 30 | # Create prompt pairs (prompt, answer_key) 31 | prompt_pairs = [(prompt, "") for prompt in prompts] 32 | 33 | # Collect responses 34 | print(f"Collecting responses from {len(config.model_names)} models for {len(prompts)} prompts...") 35 | responses_df = collect_responses(prompt_pairs, config) 36 | responses_df.to_csv(config.output_dir / "responses.csv", index=False) 37 | print(f"Saved responses to {config.output_dir}/responses.csv") 38 | 39 | # Collect evaluations 40 | print("Collecting evaluations...") 41 | raw_evaluations_df = collect_raw_evaluations(responses_df, config) 42 | raw_evaluations_df.to_csv(config.output_dir / "raw_evaluations.csv", index=False) 43 | print(f"Saved raw evaluations to {config.output_dir}/raw_evaluations.csv") 44 | 45 | # Parse evaluations 46 | print("Parsing evaluations...") 47 | evaluations_df = parse_evaluation_rows(raw_evaluations_df, config) 48 | evaluations_df.to_csv(config.output_dir / "evaluations.csv", index=False) 49 | print(f"Saved parsed evaluations to {config.output_dir}/evaluations.csv") 50 | 51 | # Build graph and compute rankings 52 | print("Building graph and computing rankings...") 53 | G = build_endorsement_graph(evaluations_df, config) 54 | pagerank_scores = compute_pagerank(G) 55 | rankings = finalize_rankings(pagerank_scores, config, G, evaluations_df) 56 | 57 | # Save rankings to file 58 | rankings_file = config.output_dir / "rankings.json" 59 | with open(rankings_file, "w") as f: 60 | json.dump(rankings, f, indent=4) 61 | print(f"Saved rankings to {rankings_file}") 62 | 63 | print("Test completed successfully!") -------------------------------------------------------------------------------- /tests/tiny_prompts.csv: -------------------------------------------------------------------------------- 1 | Questions 2 | What is the capital of France? 3 | Name three primary colors 4 | --------------------------------------------------------------------------------
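Taken together, the modules above suggest a natural follow-on once the test has populated `test_results/`: build the graph exports and an HTML dashboard from those files. The sketch below assumes the module paths implied by the repository layout, the output files written by `test_sloprank.py`, and a rankings file in one of the two layouts the dashboard reader accepts.

```python
from pathlib import Path
from sloprank.utils.visualization import generate_visualization
from sloprank.utils.dashboard import generate_dashboard, start_dashboard

results = Path("tests/test_results")

# Graph exports: GML always, plus PNG/HTML when matplotlib/plotly are installed.
generate_visualization(
    rankings_path=results / "rankings.json",
    evaluations_path=results / "evaluations.csv",
    output_dir=results / "visualizations",
)

# Dashboard generation and a local preview server (Ctrl-C or httpd.shutdown() to stop).
dashboard_file = generate_dashboard(
    rankings_path=results / "rankings.json",
    graph_path=results / "visualizations" / "endorsement_graph.png",
    output_path=results / "dashboard.html",
)
httpd = start_dashboard(dashboard_file, port=8000, open_browser=False)
```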