├── .gitignore ├── .gitmodules ├── README.md ├── brunotest └── peerBench │ ├── bruno.json │ ├── collection.bru │ └── submit-score.bru ├── dEval_sim ├── README.md ├── dEval_sim ├── dEval_sim_writeup.html └── dEval_sim_writeup.ipynb ├── meeting-notes └── 20250218 │ ├── .DS_Store │ ├── RobustAiBench_initial_meeting_notes.docx │ ├── RobustAiBench_initial_meeting_notes.md │ ├── RobustAiBench_initial_meeting_notes.pdf │ ├── RobustAiBench_initial_meeting_transcript.json │ └── RobustAiBench_initial_meeting_transcript.pdf ├── peerBenchJS ├── .DS_Store ├── LICENSE ├── README.md ├── config.example.json ├── config.mmlu.json ├── config.mmlu.onlyEngineering.json ├── config.mmlu.onlyHistory.json ├── config.mmlu.onlyLaw.generated.llama-4-scout-wrong.json ├── config.mmlu.onlyLaw.json ├── data │ └── tasks │ │ ├── bigbench │ │ └── social_iqa.json │ │ ├── chatgpt4o_generated_newMMLUhistory_based_on_those_others_get_wrong_alot.json │ │ ├── mmlu-pro.onlyHistory.gemma-3-4b-it-wrong.generated.json │ │ ├── mmlu-pro.onlyLaw.generated.llama-4-scout-wrong.json │ │ └── mmlu-pro │ │ ├── mmlu-pro.onlyEngineering.json │ │ ├── mmlu-pro.onlyHistory.jsonl │ │ ├── mmlu-pro.onlyLaw.json │ │ └── mmlu-pro.parquet ├── eslint.config.js ├── package-lock.json ├── package.json ├── peerbench ├── src │ ├── base │ │ ├── provider.ts │ │ ├── task-format.ts │ │ └── uploader.ts │ ├── commands │ │ ├── aggregate.ts │ │ ├── prompt.ts │ │ ├── rephrase.ts │ │ ├── score.ts │ │ ├── std.ts │ │ └── upload.ts │ ├── config.ts │ ├── core │ │ ├── aggregate.ts │ │ ├── format.ts │ │ ├── logger.ts │ │ ├── parser.ts │ │ ├── program.ts │ │ ├── prompt.ts │ │ ├── providers.ts │ │ ├── score.ts │ │ ├── std.ts │ │ ├── uploaders.ts │ │ └── utils.ts │ ├── errors │ │ └── task.ts │ ├── formats │ │ ├── bigbench.ts │ │ ├── medQA.ts │ │ └── mmlu-pro.ts │ ├── index.ts │ ├── providers │ │ ├── nearai.ts │ │ └── openrouter.ts │ ├── types.ts │ └── uploaders │ │ └── coserver.ts ├── tsconfig.json └── tsup.config.ts ├── peerBenchPY ├── CONTRIBUTING.md ├── Dockerfile ├── LICENCE ├── Makefile ├── README.md ├── pyproject.toml ├── requirements.txt ├── run │ ├── build.sh │ ├── enter.sh │ ├── start.sh │ ├── stop.sh │ └── test.sh ├── tmp_consolidate_responses.py ├── tmprun.py ├── tmprun2.py ├── tmptestcid.py └── val │ ├── __init__.py │ ├── auth.py │ ├── key.py │ ├── providers │ ├── litellm │ │ └── provider.py │ └── openrouter │ │ └── provider.py │ ├── storage.py │ ├── task │ ├── add │ │ └── task.py │ ├── divide │ │ └── task.py │ └── task.py │ ├── utils.py │ └── val.py ├── py ├── .DS_Store ├── Iris.csv ├── Iris.parquet ├── compute_ipfs_cid.py ├── create_secp256k1_jws.py ├── create_secp256k1_jws_rfc.py ├── create_secp256k1_jwt.py ├── csv_to_parquet.py ├── display_parquet.py ├── hello_world.txt ├── outline.md ├── requirements.txt └── run_evaluation_pseudo.py ├── requirements.txt └── sample_data ├── .DS_Store ├── demoscene-js_cgi_benchmark ├── .DS_Store ├── prompt00 │ ├── .DS_Store │ ├── anthropic-claude-3.7-sonnet_06042025_a │ │ └── .DS_Store │ ├── google-Gemini-2.5-Pro_06042025_c │ │ └── .DS_Store │ ├── metadata.yaml │ └── openAI-o1_06042025_a │ │ └── .DS_Store ├── prompt01 │ └── metadata.yaml ├── prompt0_wave │ ├── .DS_Store │ ├── anthropic-claude-3.7-sonnet_06042025_a │ │ ├── .DS_Store │ │ └── great-wave.html │ ├── google-Gemini-2.5-Pro_06042025_a │ │ ├── .DS_Store │ │ └── index.html │ ├── metadata.yaml │ └── openAI-o1_06042025_a │ │ ├── .DS_Store │ │ └── index.html ├── prompt1_tree │ ├── .DS_Store │ ├── anthropic-claude-3.7-sonnet_06042025_a │ │ └── index.html │ ├── 
google-Gemini-2.5-Pro_06042025_a │ │ └── index.html │ ├── metadata.yaml │ └── openAI-o1_06042025_a │ │ └── index.html ├── prompt2_torus │ ├── .DS_Store │ ├── anthropic-claude-3.7-sonnet_06042025_a │ │ ├── .DS_Store │ │ └── index.html │ ├── google-Gemini-2.5-Pro_06042025_c │ │ ├── index.html │ │ ├── script.js │ │ └── style.css │ ├── metadata.yaml │ └── openAI-o1_06042025_a │ │ ├── .DS_Store │ │ └── index.html ├── prompt3_Sierpinski triangle │ ├── .DS_Store │ ├── anthropic-claude-3.7-sonnet_06042025_a │ │ ├── .DS_Store │ │ └── index.html │ ├── google-Gemini-2.5-Pro_06042025_c │ │ ├── .DS_Store │ │ └── index.html │ ├── metadata.yaml │ └── openAI-o1_06042025_a │ │ ├── .DS_Store │ │ └── index.html ├── prompt4_Jerusalem_cube │ ├── .DS_Store │ ├── anthropic-claude-3.7-sonnet_06042025_a │ │ ├── .DS_Store │ │ └── index.html │ ├── google-Gemini-2.5-Pro_06042025_c │ │ ├── .DS_Store │ │ └── index.html │ ├── metadata.yaml │ └── openAI-o1_06042025_a │ │ └── .DS_Store ├── prompt5_Menger_sponge │ ├── .DS_Store │ ├── anthropic-claude-3.7-sonnet_06042025_a │ │ ├── .DS_Store │ │ └── index.html │ ├── google-Gemini-2.5-Pro_06042025_s │ │ └── index.html │ ├── metadata.yaml │ └── openAI-o1_06042025_a │ │ ├── .DS_Store │ │ └── index.html ├── scores_metadata.yaml └── scores_metadata_eth_signature.txt ├── example_evaluation_result_publication.yaml ├── example_evaluation_results_video.yaml ├── example_evaluation_test_stream_publication.yaml ├── example_provider_registration_publication.yaml └── turbine_aerodynamics_example_response_audit_.yaml /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 168 | #.idea/ 169 | 170 | # PyPI configuration file 171 | .pypirc 172 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "dEval-flask"] 2 | path = dEval-flask 3 | url = git@github.com:jworender/dEval-flask.git 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Getting started 2 | CLI (primary): https://github.com/peerbench/peerBench/tree/main/peerBenchJS 3 | UI (beta) : https://dev-peer-bench-js.vercel.app/dashboard 4 | 5 | # The problem: 6 | Public benchmark test data sets make AI model performance comparable. But this creates an incentivization for closed source models to cheat the benchmarks by training on test data or creating heuristics that overfit the benchmark test dataset . 
7 | 8 | # Our solution: 9 | A network of independent experts continuously validating models with newly generated private test data and with scoring that can be publicly audited. 10 | 11 | # Addressing Industry-Wide Overfitting of AI Models 12 | 13 | An important piece of context for accelerating AI innovation is the **model overfitting plaguing the AI industry**. 14 | 15 | Public benchmark test data sets make AI model performance comparable. But this creates an incentive, for closed-source models in particular, to game the benchmarks by building heuristics for them or by including solutions to the known test sets in their training data. 16 | 17 | For open-source models, **Dynabench**[^45] tries to solve the problem of overfitting on test datasets with a community of humans **intentionally creating new test data designed to be hard for models**. However, Dynabench only works with open-source models, and it has not seen significant adoption even after being taken over by MLCommons. We believe this lack of traction is due to a lack of incentives for evaluators or AI model owners to participate. 18 | 19 | Requiring recency of test data is another approach, taken by LiveBench[^48], but it is primarily focused on the problem of LLMs accidentally including test data in their training. A malicious actor can still overfit LiveBench within days of the test data being released. And because LiveBench is a single entity, it can be covertly bribed to hand the test data to one AI provider before release. Still, the idea of releasing new test data continuously is something we agree with and will also push for in our standard. 20 | 21 | **Centralized private test data evaluation** is another approach that has been attempted to keep AI companies from gaming benchmark results. One currently active private evaluator is the SEAL LLM Leaderboards[^46] by Scale.ai[^47]. However, this approach lacks the transparency and auditability needed to be fully trustworthy and widely adopted as a standard. 22 | 23 | **Decentralized systems are strongly positioned to solve** all of these issues by providing proper incentives to all parties involved while leveraging blockchains' decentralization and transparency. **Private datasets** can still be a fundamental **part of the validation strategy**.
24 | 25 | # Test Data Quality Control Process 26 | 27 | # Become a Validator 28 | 29 | # Get Your Model Benchmarked 30 | 31 | 32 | [^45]: Link: https://arxiv.org/abs/2104.14337 33 | 34 | [^46]: Link: https://scale.com/leaderboard 35 | 36 | [^47]: Link: https://scale.com/ 37 | [^48]: Link: https://livebench.ai/#/ 38 | 39 | -------------------------------------------------------------------------------- /brunotest/peerBench/bruno.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "1", 3 | "name": "peerBench", 4 | "type": "collection", 5 | "ignore": [ 6 | "node_modules", 7 | ".git" 8 | ] 9 | } -------------------------------------------------------------------------------- /brunotest/peerBench/collection.bru: -------------------------------------------------------------------------------- 1 | auth { 2 | mode: bearer 3 | } 4 | 5 | auth:bearer { 6 | token: eyJhbGciOiJFUzI1NksiLCJ0eXAiOiJKV1QifQ.eyJ1c2VyIjoidmFsaWRhdG9yIiwicm9sZSI6InZhbGlkYXRvciJ9.MEQCIE_8VzeUPhGrp_EgtrUEWLLUZO_zDq-zAxVPiG3FHfEsAiBi73HHTyQDJq-dBJoc9UkZ48WBiZ704r4STqEo-fm9Cw 7 | } 8 | -------------------------------------------------------------------------------- /brunotest/peerBench/submit-score.bru: -------------------------------------------------------------------------------- 1 | meta { 2 | name: submit-score 3 | type: http 4 | seq: 2 5 | } 6 | 7 | post { 8 | url: https://deval-flask.onrender.com/submit-score 9 | body: json 10 | auth: bearer 11 | } 12 | 13 | headers { 14 | Content-Type: application/json 15 | } 16 | 17 | auth:bearer { 18 | token: eyJhbGciOiJFUzI1NksiLCJ0eXAiOiJKV1QifQ.eyJ1c2VyIjoidmFsaWRhdG9yIiwicm9sZSI6InZhbGlkYXRvciJ9.MEQCIE_8VzeUPhGrp_EgtrUEWLLUZO_zDq-zAxVPiG3FHfEsAiBi73HHTyQDJq-dBJoc9UkZ48WBiZ704r4STqEo-fm9Cw 19 | } 20 | 21 | body:json { 22 | { 23 | "test_id": "1c97c740-bba0-4d2e-8459-bf7c761531a2", 24 | "model_id": "7a18d370-41c1-4d9d-8135-93d8fc937317", 25 | "validator_id": "0d845dc3-8f91-4eb8-978f-474d5dc86643", 26 | "score": 93.5, 27 | "metrics": { 28 | "accuracy": 0.94, 29 | "latency_ms": 120 30 | }, 31 | "evaluation_type": "primary", 32 | "hash": "0xabc123...", 33 | "evaluation_timestamp": "2024-03-30T15:00:00Z", 34 | "metadata": { 35 | "Target": 100, 36 | "goals": "return a json object with a sum of 7 and 93" 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /dEval_sim/README.md: -------------------------------------------------------------------------------- 1 | # dEval-validator-sim 2 | 3 | We propose a decentralized evaluation framework for generative AI models leveraging a validator-based approach. Each validator independently formulates evaluation prompts (original submissions) and executes them across a set of generative models, submitting their recorded performance metrics to a shared database. Subsequently, other validators cross-validate these prompts by independently running identical evaluations, with the outcomes used to corroborate or dispute the original results. Evaluations are aggregated through credibility-weighted averages, dynamically adjusting validators' credibility scores based on consistency between original submissions and corresponding cross-validation outcomes. A change in any validator’s credibility score retroactively influences all previous evaluations involving that validator, ensuring continuous refinement of model assessment accuracy. 
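As a quick illustration of how a single round works, the Python sketch below mirrors the aggregation and update rules spelled out in the Mathematical Framework that follows (Steps 2.4-2.7): a credibility-weighted mean of the cross-validation scores, a discrepancy-based credibility update for the originator, and a retroactive recomputation of model scores using the current credibilities. It is a minimal sketch, not the simulation code itself; the constants `L` and `P` and the data containers are illustrative assumptions.

```python
# Illustrative sketch of one evaluation round (see Steps 2.4-2.7 below).
# Constants and container shapes are example assumptions, not the actual simulation code.
from dataclasses import dataclass

C_MIN, C_MAX = 0.01, 1.0   # credibility bounds
L, P = 0.05, 4.0           # learning rate and penalty factor (example values)
EPS = 1e-9

def clamp(x, lo, hi):
    return max(lo, min(x, hi))

@dataclass
class Submission:
    validator: int   # submitter id
    test: str        # test/prompt id
    model: str       # model id
    score: float     # submitted score in [0, 1]
    kind: str        # "original" or "cross"

def weighted_mean(scores, weights):
    # Step 2.4: credibility-weighted mean of cross-validation scores
    num = sum(s * w for s, w in zip(scores, weights))
    return num / max(sum(weights), EPS)

def update_originator_credibility(cred, originator, originals, cross_subs):
    """Steps 2.5-2.6: compare the originator's scores against the weighted mean
    of the cross-validations and nudge the originator's credibility accordingly."""
    models = {s.model for s in originals}
    diffs = []
    for m in models:
        cross = [s for s in cross_subs if s.model == m]
        wmean = weighted_mean([s.score for s in cross],
                              [cred[s.validator] for s in cross])
        orig = next(s.score for s in originals if s.model == m)
        diffs.append(abs(orig - wmean))
    delta = sum(diffs) / len(diffs)      # average absolute discrepancy
    change = L * (1 - P * delta)         # positive if consistent, negative otherwise
    cred[originator] = clamp(cred[originator] + change, C_MIN, C_MAX)
    return cred

def final_model_scores(all_submissions, cred):
    """Step 2.7: retroactively recompute every model's score over the full
    submission history, weighted by the *current* credibilities."""
    totals = {}
    for s in all_submissions:
        num, den = totals.get(s.model, (0.0, 0.0))
        totals[s.model] = (num + s.score * cred[s.validator],
                           den + cred[s.validator])
    return {m: num / max(den, EPS) for m, (num, den) in totals.items()}
```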
4 | 5 | ## Mathematical Framework 6 | 7 | # Mathematical Framework for Decentralized Evaluation Simulation 8 | 9 | This document outlines the mathematical framework describing the operation of the validator simulation. 10 | 11 | **1. Definitions and Notation** 12 | 13 | * **Validators:** Let $V$ be the set of validators, indexed by $i$ or $j$. $V = \{0, 1, ..., N-1\}$, where $N = |V|$ is the total number of validators ($N=8$ in the simulation). 14 | * **Models:** Let $\mathcal{M}$ be the set of generative AI models being evaluated, indexed by $m$. $\mathcal{M} = \{\text{Model}_A, \text{Model}_B, ...\}$. Let $M = |\mathcal{M}|$ be the number of models. 15 | * **Time Steps:** Let $t$ denote the discrete time step or simulation round, $t = 0, 1, 2, ...$. 16 | * **Tests/Prompts:** Let $k$ denote a unique test instance (prompt and evaluation criteria). We assume a new test $k(t)$ is generated at each step $t \ge 1$. 17 | * **Credibility Score:** Let $C_i(t)$ be the credibility score of validator $i$ at the end of step $t$. $C_i(t) \in [C_{min}, C_{max}]$ (e.g., $[0.01, 1.0]$). 18 | * **"True" Score:** Let $T_{k,m}$ be the theoretical "true" performance score of model $m$ on test $k$. $T_{k,m} \in [0, 1]$. This exists only in the simulation. 19 | * **Submitted Score:** Let $S_{i,k,m}$ be the score submitted by validator $i$ for model $m$ on test $k$. $S_{i,k,m} \in [0, 1]$. 20 | * **Submission Type:** Let $\tau_s$ denote the type of a submission $s$, where $\tau_s \in \{\text{original}, \text{cross}\}$. 21 | * **Bad Actor Status:** Let $B_i$ be a boolean indicator, $B_i = 1$ if validator $i$ is a bad actor, $B_i = 0$ otherwise. 22 | * **Bad Actor Bias Function:** Let $\beta(i, m, \tau_s)$ be the bias introduced by validator $i$ for model $m$ during submission type $\tau_s$. 23 | * $\beta(i, m, \text{original}) > 0$ if $B_i=1$ and $m$ is the target model. 24 | * $\beta(i, m, \text{original}) < 0$ if $B_i=1$ and $m$ is the victim model. 25 | * $\beta(i, m, \text{original}) = 0$ if $B_i=0$. 26 | * $\beta(i, m, \text{cross}) = 0$ for all $i, m$ (as per simulation rules). 27 | * **Noise Function:** Let $\eta_{i,k,m}$ be a random noise term (e.g., drawn from $\mathcal{N}(0, \sigma^2)$), representing honest scoring variations. 28 | * **Results Store:** Let $R(t)$ be the set of all submissions recorded up to the end of step $t$. Each submission $s \in R(t)$ contains $(i_s, k_s, m_s, S_s, \tau_s, O_s, t_s)$ representing (submitter ID, test ID, model ID, score, submission type, original validator ID for test $k_s$, submission step). 29 | * **Originating Validator:** Let $O(t) \in V$ be the index of the validator who originates the test $k(t)$ at step $t$. (e.g., $O(t) = (t-1) \pmod N$). 30 | * **Cross-Validating Set:** Let $XVal(t)$ be the set of validators performing cross-validation at step $t$. $XVal(t) = V \setminus \{O(t)\}$. 31 | 32 | **2. Simulation Dynamics** 33 | 34 | * **Initialization (t=0):** 35 | * Set initial credibility $C_i(0) = C_{init}$ for all $i \in V$. 36 | * Initialize the results store $R(0) = \emptyset$. 37 | * Initialize final model scores $F_m(0)$ (e.g., $0.5$) for all $m \in \mathcal{M}$. 38 | 39 | * **Simulation Step $t$ (for $t \ge 1$):** 40 | 41 | * **Step 2.1: Test Generation & Origination** 42 | * Select originator $O(t) \in V$. 43 | * Generate new test $k(t)$. 44 | * Determine "true" scores $\{T_{k(t),m}\}_{m \in \mathcal{M}}$. 
45 | 46 | * **Step 2.2: Score Submission (Original)** 47 | * Validator $O(t)$ calculates scores for all models $m \in \mathcal{M}$: 48 | * $S_{O(t), k(t), m} = \text{clamp}( T_{k(t),m} + \beta(O(t), m, \text{original}) + \eta_{O(t),k(t),m}, 0, 1 )$ 49 | 50 | where $\text{clamp}(x, a, b) = \max(a, \min(x, b))$. 51 | * Add original submissions $s_{orig} = (O(t), k(t), m, S_{O(t),k(t),m}, \text{original}, O(t), t)$ for each $m$ to $R(t-1)$ to form intermediate store $R'(t)$. 52 | 53 | * **Step 2.3: Score Submission (Cross-Validation)** 54 | * For each cross-validator $j \in XVal(t)$: 55 | * Validator $j$ calculates scores for all models $m \in \mathcal{M}$: 56 | * $S_{j, k(t), m} = \text{clamp}( T_{k(t),m} + \beta(j, m, \text{cross}) + \eta_{j,k(t),m}, 0, 1 )$ 57 | (Note: $\beta(j, m, \text{cross}) = 0$). 58 | * Add cross-validation submissions $s_{cross,j} = (j, k(t), m, S_{j,k(t),m}, \text{cross}, O(t), t)$ for each $m$ to $R'(t)$ to form the full results store $R(t)$. 59 | 60 | * **Step 2.4: Calculate Weighted Mean of Cross-Validations** 61 | * For the current test $k(t)$ and each model $m \in \mathcal{M}$: 62 | * Retrieve cross-validation scores $\{S_{j,k(t),m} | j \in XVal(t)\}$ and previous credibilities $\{C_j(t-1) | j \in XVal(t)\}$. 63 | * Calculate the weighted mean $WMean_{k(t),m}$: 64 | * $\text{Numerator} = \sum_{j \in XVal(t)} S_{j,k(t),m} \cdot C_j(t-1)$ 65 | * $\text{Denominator} = \sum_{j \in XVal(t)} C_j(t-1)$ 66 | * $WMean_{k(t),m} = \frac{\text{Numerator}}{\max(\text{Denominator}, \epsilon)}$ 67 | (where $\epsilon$ is a small positive constant, e.g., $10^{-9}$). 68 | 69 | * **Step 2.5: Calculate Discrepancy for Originator** 70 | * Retrieve the original scores $S_{O(t),k(t),m}$ for test $k(t)$. 71 | * Calculate a discrepancy measure, $\Delta_{O(t), k(t)}$. Example: Average Absolute Difference: 72 | * $\Delta_{O(t), k(t)} = \frac{1}{M} \sum_{m \in \mathcal{M}} | S_{O(t),k(t),m} - WMean_{k(t),m} |$ 73 | 74 | * **Step 2.6: Update Credibility Scores** 75 | * Define a credibility update function $f(\Delta, C_{old})$. Example using learning rate $L$ and penalty factor $P$: 76 | * $\text{Change} = L \cdot (1 - P \cdot \Delta)$ 77 | * $C^1_{O(t)}(t) = C_{O(t)}(t-1) + \text{Change}$ 78 | * $C_{O(t)}(t) = \text{clamp}( C^1_{O(t)}(t), C_{min}, C_{max} )$ 79 | 80 | * For all other validators $j \neq O(t)$: 81 | * $C_j(t) = C_j(t-1)$ 82 | 83 | * **Step 2.7: Recalculate Final Model Scores (Retroactive)** 84 | * For each model $m \in \mathcal{M}$: 85 | * Initialize $Num_m = 0$, $Denom_m = 0$. 86 | * Iterate through *all* submissions $s = (i_s, k_s, m_s, S_s, \tau_s, O_s, t_s)$ in the *entire history* $R(t)$. 87 | * If $m_s = m$: 88 | * Retrieve the *current* credibility $C_{i_s}(t)$ for the submitter $i_s$. 89 | * $Num_m = Num_m + S_s \cdot C_{i_s}(t)$ 90 | * $Denom_m = Denom_m + C_{i_s}(t)$ 91 | * Calculate the final score for model $m$ at step $t$: 92 | * $F_m(t) = \frac{Num_m}{\max(Denom_m, \epsilon)}$ 93 | 94 | **3. Goal Demonstration** 95 | 96 | The framework shows that if a bad actor validator $i$ (where $B_i=1$) consistently submits biased scores $S_{i,k,m}$ during its origination steps (Step 2.2), the discrepancy $\Delta_{i, k(t)}$ (Step 2.5) will likely be larger compared to honest validators. This leads to repeated negative credibility updates (Step 2.6), causing $C_i(t)$ to decrease. 
Consequently, in the final score calculation (Step 2.7), the contribution of validator $i$'s submissions ($S_s$ where $i_s = i$) is down-weighted by the low $C_i(t)$, reducing the bad actor's influence on the final model scores $F_m(t)$. 97 | -------------------------------------------------------------------------------- /meeting-notes/20250218/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/meeting-notes/20250218/.DS_Store -------------------------------------------------------------------------------- /meeting-notes/20250218/RobustAiBench_initial_meeting_notes.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/meeting-notes/20250218/RobustAiBench_initial_meeting_notes.docx -------------------------------------------------------------------------------- /meeting-notes/20250218/RobustAiBench_initial_meeting_notes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/meeting-notes/20250218/RobustAiBench_initial_meeting_notes.pdf -------------------------------------------------------------------------------- /meeting-notes/20250218/RobustAiBench_initial_meeting_transcript.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/meeting-notes/20250218/RobustAiBench_initial_meeting_transcript.pdf -------------------------------------------------------------------------------- /peerBenchJS/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/peerBenchJS/.DS_Store -------------------------------------------------------------------------------- /peerBenchJS/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Forest-Protocols 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /peerBenchJS/config.example.json: -------------------------------------------------------------------------------- 1 | { 2 | "tasks": [ 3 | "data/tasks/mmlu-pro/mmlu-pro.parquet", 4 | "data/tasks/bigbench/social_iqa.json" 5 | ], 6 | "models": [ 7 | "near.ai:fireworks::accounts/fireworks/models/deepseek-v3", 8 | "near.ai:fireworks::accounts/fireworks/models/llama-v3p1-8b-instruct", 9 | "openrouter.ai:openai/chatgpt-4o-latest", 10 | "openrouter.ai:google/gemini-2.0-flash-001", 11 | "openrouter.ai:google/gemini-2.0-flash-lite-001", 12 | "openrouter.ai:meta-llama/llama-4-scout", 13 | "openrouter.ai:meta-llama/llama-4-maverick", 14 | "openrouter.ai:mistralai/mistral-small-3.1-24b-instruct" 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /peerBenchJS/config.mmlu.json: -------------------------------------------------------------------------------- 1 | { 2 | "tasks": ["data/tasks/mmlu-pro.parquet"], 3 | "models": ["openrouter.ai:meta-llama/llama-4-scout"] 4 | } 5 | -------------------------------------------------------------------------------- /peerBenchJS/config.mmlu.onlyEngineering.json: -------------------------------------------------------------------------------- 1 | { 2 | "tasks": ["./data/tasks/mmlu-pro.onlyEngineering.json"], 3 | "models": ["openrouter.ai:meta-llama/llama-4-scout"] 4 | } 5 | -------------------------------------------------------------------------------- /peerBenchJS/config.mmlu.onlyHistory.json: -------------------------------------------------------------------------------- 1 | { 2 | "tasks": ["./data/tasks/mmlu-pro.onlyHistory.jsonl"], 3 | "models": [ 4 | "openrouter.ai:google/gemma-3-4b-it", 5 | "openrouter.ai:meta-llama/llama-4-scout", 6 | "openrouter.ai:mistralai/mistral-small-3.1-24b-instruct" 7 | ] 8 | } 9 | -------------------------------------------------------------------------------- /peerBenchJS/config.mmlu.onlyLaw.generated.llama-4-scout-wrong.json: -------------------------------------------------------------------------------- 1 | { 2 | "tasks": ["./data/tasks/mmlu-pro.onlyLaw.generated.llama-4-scout-wrong.json"], 3 | "models": ["openrouter.ai:meta-llama/llama-4-scout"] 4 | } 5 | -------------------------------------------------------------------------------- /peerBenchJS/config.mmlu.onlyLaw.json: -------------------------------------------------------------------------------- 1 | { 2 | "tasks": ["./data/tasks/mmlu-pro.onlyLaw.json"], 3 | "models": ["openrouter.ai:google/gemma-3-4b-it"] 4 | } 5 | -------------------------------------------------------------------------------- /peerBenchJS/data/tasks/mmlu-pro/mmlu-pro.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/peerBenchJS/data/tasks/mmlu-pro/mmlu-pro.parquet -------------------------------------------------------------------------------- /peerBenchJS/eslint.config.js: -------------------------------------------------------------------------------- 1 | import eslint from "@eslint/js"; 2 | import tseslint from "typescript-eslint"; 3 | 4 | export default [ 5 | { 6 | ignores: ["dist/**/*.js"], 7 | }, 8 | ...tseslint.config({ 9 | extends: [eslint.configs.recommended, tseslint.configs.recommended], 10 | files: ["src/**/*.ts"], 11 | 12 | rules: { 13 | "@typescript-eslint/no-explicit-any": "off", 14 | 
"@typescript-eslint/no-empty-object-type": "off", 15 | }, 16 | }), 17 | ]; 18 | -------------------------------------------------------------------------------- /peerBenchJS/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "peerbench", 3 | "version": "1.0.0", 4 | "description": "Peer Bench is a decentralized AI evaluation framework", 5 | "keywords": [ 6 | "ai", 7 | "evaluation", 8 | "validation", 9 | "benchmark" 10 | ], 11 | "homepage": "https://github.com/Forest-Protocols/peer-bench#readme", 12 | "bugs": { 13 | "url": "https://github.com/Forest-Protocols/peer-bench/issues" 14 | }, 15 | "repository": { 16 | "type": "git", 17 | "url": "git+https://github.com/Forest-Protocols/peer-bench.git" 18 | }, 19 | "license": "MIT", 20 | "author": "Forest AI Development Team", 21 | "type": "module", 22 | "main": "dist/index.js", 23 | "scripts": { 24 | "lint": "tsc && eslint", 25 | "dev": "tsx src/index.ts", 26 | "watch": "tsup --silent --watch --onSuccess 'tsc-alias -p tsconfig.json -f && node dist/index.js'", 27 | "start": "node dist/index.js", 28 | "build": "tsup --onSuccess 'tsc-alias -p tsconfig.json -f'", 29 | "clean": "rm -rf dist", 30 | "clean:outputs": "rm -rf data/output/*", 31 | "clean:logs": "rm -rf data/logs/*" 32 | }, 33 | "dependencies": { 34 | "@dotenvx/dotenvx": "^1.39.1", 35 | "@inquirer/prompts": "^7.4.1", 36 | "ansis": "^3.17.0", 37 | "axios": "^1.8.4", 38 | "commander": "^13.1.0", 39 | "csv": "^6.3.11", 40 | "glob": "^11.0.2", 41 | "hyparquet": "^1.11.0", 42 | "multiformats": "^13.3.2", 43 | "openai": "^4.92.1", 44 | "table": "^6.9.0", 45 | "uuid": "^11.1.0", 46 | "viem": "^2.26.2", 47 | "winston": "^3.17.0", 48 | "yaml": "^2.7.1", 49 | "zod": "^3.24.2" 50 | }, 51 | "devDependencies": { 52 | "@eslint/js": "^9.24.0", 53 | "@types/node": "^22.14.0", 54 | "eslint": "^9.24.0", 55 | "tsc-alias": "^1.8.13", 56 | "tsup": "^8.4.0", 57 | "tsx": "^4.19.3", 58 | "typescript": "^5.8.3", 59 | "typescript-eslint": "^8.29.0" 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /peerBenchJS/peerbench: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | npm run dev --silent -- $* -------------------------------------------------------------------------------- /peerBenchJS/src/base/provider.ts: -------------------------------------------------------------------------------- 1 | import { parseEnvVariables } from "@/config"; 2 | import { logger } from "@/core/logger"; 3 | import { MaybePromise, ModelResponse } from "@/types"; 4 | import winston from "winston"; 5 | import { z } from "zod"; 6 | 7 | /** 8 | * Base class for Providers 9 | */ 10 | export abstract class AbstractProvider< 11 | T extends Record = Record, 12 | > { 13 | readonly name: string; 14 | 15 | logger: winston.Logger; 16 | env: { [K in keyof T]: z.infer }; 17 | 18 | /** 19 | * Initialize a new Provider 20 | * @param options 21 | */ 22 | constructor(options: { 23 | /** 24 | * Name of the provider 25 | */ 26 | name: string; 27 | 28 | env?: T; 29 | }) { 30 | this.name = options.name; 31 | this.env = {} as T; 32 | 33 | try { 34 | const capitalizedName = options.name.replace(".", "_").toUpperCase(); 35 | 36 | if (options.env !== undefined) { 37 | const env: Record = {}; 38 | const originalKeys: Record = {}; 39 | 40 | for (const [key, schema] of Object.entries(options.env)) { 41 | const envKey = `PB_${capitalizedName}_${key}`; 42 | 43 | originalKeys[envKey] = key; 44 | 
env[envKey] = schema; 45 | } 46 | 47 | const parsedEnv = parseEnvVariables(env); 48 | 49 | for (const [key, value] of Object.entries(parsedEnv)) { 50 | this.env[originalKeys[key] as keyof T] = value; 51 | } 52 | } 53 | 54 | this.logger = logger.child({ 55 | context: `Provider(${this.name})`, 56 | }); 57 | } catch (err: any) { 58 | throw new Error(`${this.name}: ${err?.message || err}`, { 59 | cause: err, 60 | }); 61 | } 62 | } 63 | 64 | /** 65 | * Decentralized identifier of the Provider 66 | */ 67 | get did() { 68 | return `did:prov:${this.name.toLowerCase()}`; 69 | } 70 | 71 | /** 72 | * Executes the given prompt and returns the response 73 | * @param prompt 74 | */ 75 | abstract forward( 76 | prompt: string, 77 | model: string, 78 | system: string 79 | ): Promise; 80 | 81 | abstract parseModelIdentifier(identifier: string): MaybePromise<{ 82 | modelName: string; 83 | modelOwner: string; 84 | subProvider?: string; 85 | }>; 86 | } 87 | -------------------------------------------------------------------------------- /peerBenchJS/src/base/task-format.ts: -------------------------------------------------------------------------------- 1 | import { MaybePromise, Task } from "@/types"; 2 | 3 | /** 4 | * Abstract class that formats needs to implement 5 | */ 6 | export abstract class AbstractTaskFormat { 7 | /** 8 | * Name of the format 9 | */ 10 | abstract name: string; 11 | 12 | /** 13 | * Read and parses the given file as a Task object 14 | */ 15 | abstract parseFromFile(content: string): MaybePromise; 16 | 17 | /** 18 | * Returns `true` if the given content recognized by this class 19 | */ 20 | abstract recognize(content: any): MaybePromise; 21 | 22 | /** 23 | * Creates a raw object that can be parsed by the `targetFOrmat`. 24 | * 25 | * @param task The task to convert 26 | * @param targetFormat The format type to convert to 27 | * @returns The converted data in a format that the target format can parse 28 | */ 29 | abstract convertTo(task: Task, targetFormat: string): MaybePromise; 30 | } 31 | -------------------------------------------------------------------------------- /peerBenchJS/src/base/uploader.ts: -------------------------------------------------------------------------------- 1 | import { logger } from "@/core/logger"; 2 | 3 | /** 4 | * Base class for uploader classes 5 | */ 6 | export abstract class AbstractUploader { 7 | name = "Abstract"; 8 | logger = logger.child({ context: `Uploader(${this.constructor.name})` }); 9 | 10 | abstract init(): Promise; 11 | 12 | abstract uploadPromptResponses( 13 | filePath: string, 14 | options?: UploadOptions 15 | ): Promise; 16 | abstract uploadScores( 17 | filePath: string, 18 | options?: UploadOptions 19 | ): Promise; 20 | 21 | // TODO: upload aggregation method 22 | } 23 | 24 | export type UploadOptions = { 25 | batchSize?: number; 26 | }; 27 | -------------------------------------------------------------------------------- /peerBenchJS/src/commands/std.ts: -------------------------------------------------------------------------------- 1 | import { config } from "@/config"; 2 | import { logger } from "@/core/logger"; 3 | import { checkValidationError } from "@/core/utils"; 4 | import { program } from "@/core/program"; 5 | import { yellow } from "ansis"; 6 | import { z } from "zod"; 7 | import { convertTaskFormat } from "@/core/std"; 8 | import { taskFormats } from "@/core/format"; 9 | 10 | const taskFormatNames = taskFormats.map((t) => t.name); 11 | 12 | program 13 | .command("std") 14 | .description("Standardize data format between 
different schemas") 15 | .requiredOption( 16 | "-s, --source ", 17 | "Source file with data to be standardized" 18 | ) 19 | .option( 20 | "-t, --to ", 21 | `Target schema type: ${taskFormatNames.map((t) => `"${t}"`).join(", ")}`, 22 | "medqa" 23 | ) 24 | .option( 25 | "-o, --output ", 26 | "Output file path (default: derived from source filename)" 27 | ) 28 | .action( 29 | async (rawOptions: { 30 | source: string; 31 | from: string; 32 | to: string; 33 | output?: string; 34 | }) => { 35 | logger.debug(`Validator DID ${yellow.bold(config.VALIDATOR_DID)}`); 36 | const options = checkValidationError( 37 | z 38 | .object({ 39 | source: z.string(), 40 | to: z 41 | .string() 42 | .default("medqa") 43 | .transform((value, ctx) => { 44 | if (!taskFormatNames.includes(value)) { 45 | ctx.addIssue({ 46 | code: z.ZodIssueCode.custom, 47 | message: "Invalid target schema type", 48 | }); 49 | return z.NEVER; 50 | } 51 | 52 | return value; 53 | }), 54 | output: z.string().optional(), 55 | }) 56 | .safeParse(rawOptions) 57 | ); 58 | 59 | await convertTaskFormat({ 60 | sourceTaskFile: options.source, 61 | output: options.output, 62 | targetFormat: options.to, 63 | }); 64 | } 65 | ) 66 | .allowUnknownOption(true); 67 | -------------------------------------------------------------------------------- /peerBenchJS/src/commands/upload.ts: -------------------------------------------------------------------------------- 1 | import { logger } from "@/core/logger"; 2 | import { getUploader, uploaders } from "@/core/uploaders"; 3 | import { checkValidationError, saveJobLog } from "@/core/utils"; 4 | import { program } from "@/core/program"; 5 | import { z } from "zod"; 6 | import { v7 as uuidv7 } from "uuid"; 7 | 8 | program 9 | .command("upload") 10 | .description("Uploads the given files to a remote server") 11 | .requiredOption( 12 | "-t, --target ", 13 | `Points the target remote server name. 
Available targets servers ${uploaders 14 | .map((u) => u.name) 15 | .join(", ")}` 16 | ) 17 | .option( 18 | "-s, --scores ", 19 | "The score files that is going to be uploaded" 20 | ) 21 | .option( 22 | "-r, --responses ", 23 | "The response files that is going to be uploaded" 24 | ) 25 | .action( 26 | async (rawOptions: { 27 | target: string; 28 | scores?: string[]; 29 | responses?: string[]; 30 | }) => { 31 | const options = checkValidationError( 32 | z 33 | .object({ 34 | target: z.string(), 35 | task: z.string().default("default"), 36 | scores: z.array(z.string()).default([]), 37 | responses: z.array(z.string()).default([]), 38 | }) 39 | .safeParse(rawOptions) 40 | ); 41 | 42 | const startedAt = Date.now(); 43 | const uploader = getUploader(options.target); 44 | 45 | if (uploader === undefined) { 46 | throw new Error(`Target "${options.target}" is not available`); 47 | } 48 | 49 | if (options.responses.length > 0) { 50 | await uploader.init(); 51 | 52 | logger.info(`Uploading given responses to ${options.target}`); 53 | await Promise.all( 54 | options.responses.map((path) => 55 | uploader.uploadPromptResponses(path, { batchSize: 20 }) 56 | ) 57 | ); 58 | } 59 | 60 | if (options.scores.length > 0) { 61 | await uploader.init(); 62 | 63 | logger.info(`Uploading given scores to ${options.target}`); 64 | await Promise.all( 65 | options.scores.map((path) => 66 | uploader.uploadScores(path, { batchSize: 20 }) 67 | ) 68 | ); 69 | } 70 | 71 | await saveJobLog( 72 | { 73 | uuid: uuidv7(), 74 | jobType: "upload", 75 | startedAt: +startedAt, 76 | completedAt: Date.now(), 77 | scoreFiles: options.scores, 78 | responseFiles: options.responses, 79 | }, 80 | "upload", 81 | startedAt 82 | ); 83 | } 84 | ) 85 | .allowUnknownOption(true); 86 | -------------------------------------------------------------------------------- /peerBenchJS/src/config.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | import { red } from "ansis"; 3 | import dotenv from "@dotenvx/dotenvx"; 4 | import { join } from "path"; 5 | import { mkdirSync } from "fs"; 6 | import { LogLevels, NodeEnvs } from "./types"; 7 | import { Address, privateKeyToAccount } from "viem/accounts"; 8 | 9 | dotenv.config({ ignore: ["MISSING_ENV_FILE"], logLevel: "blank", quiet: true }); 10 | 11 | /** 12 | * Parses the environment variables based on the given fields. 13 | * Exits with code 1 if the parse is failed 14 | * @returns Parsed environment variables 15 | */ 16 | export function parseEnvVariables>( 17 | fields: T 18 | ): z.infer> { 19 | const environmentSchema = z.object(fields); 20 | const validation = environmentSchema.safeParse(process.env, {}); 21 | 22 | if (validation.error) { 23 | const error = validation.error.errors[0]; 24 | const path = error.path.length > 0 ? 
error.path.join(".") + ": " : ""; 25 | console.error( 26 | red(`Error while parsing environment variables: ${path}${error.message}`) 27 | ); 28 | process.exit(1); 29 | } 30 | 31 | return validation.data; 32 | } 33 | 34 | // Parse variables 35 | const env = parseEnvVariables({ 36 | NODE_ENV: z.enum(NodeEnvs).default("dev"), 37 | LOG_LEVEL: z.enum(LogLevels).default("debug"), 38 | COOL_DOWN_INTERVAL: z.coerce.number().default(2000), 39 | PRIVATE_KEY: z.string().nonempty(), 40 | }); 41 | 42 | const VALIDATOR_ACCOUNT = privateKeyToAccount(env.PRIVATE_KEY as Address); 43 | export const config = { 44 | ...env, 45 | OUTPUT_DIR: join(process.cwd(), "data", "output"), 46 | DATA_DIR: join(process.cwd(), "data"), 47 | VALIDATOR_ADDRESS: VALIDATOR_ACCOUNT.address, 48 | VALIDATOR_DID: `did:val:${VALIDATOR_ACCOUNT.address}`, 49 | VALIDATOR_ACCOUNT, 50 | }; 51 | 52 | // This also makes `data` directory 53 | mkdirSync(config.OUTPUT_DIR, { recursive: true }); 54 | -------------------------------------------------------------------------------- /peerBenchJS/src/core/aggregate.ts: -------------------------------------------------------------------------------- 1 | import { 2 | AggregationResult as AggregationResults, 3 | PromptScoreSchema, 4 | } from "@/types"; 5 | import { checkValidationError, readFile } from "./utils"; 6 | import { logger } from "./logger"; 7 | import { z } from "zod"; 8 | 9 | export async function aggregate( 10 | scoreFilePaths: string[] 11 | ): Promise { 12 | const scores = scoreFilePaths 13 | .map((path) => { 14 | try { 15 | // TODO: Make it possible to read CSV files 16 | const content = readFile(path); 17 | const arraySchema = z.array(PromptScoreSchema); 18 | const json = JSON.parse(content); 19 | 20 | return checkValidationError(arraySchema.safeParse(json)); 21 | } catch (err) { 22 | logger.warning(`Score file ${path} couldn't read: ${err}`); 23 | } 24 | }) 25 | .filter((s) => s !== undefined) 26 | .flat(); 27 | 28 | if (scores.length === 0) { 29 | throw new Error(`No scores read to aggregate`); 30 | } 31 | 32 | // We assume that all of the scores come from the same task 33 | // otherwise they wouldn't be comparable so we can simply find 34 | // the taskDID by just looking at any of the score from the array. 
35 | const taskDID = scores[0].taskDID; 36 | const results: Record< 37 | string, 38 | { 39 | score: number; 40 | latency: number; 41 | responseCount: number; 42 | wrongAnswers: number; 43 | modelDID: string; 44 | providerDID: string; 45 | runIds: Set; 46 | sourcePromptDatasetCIDs: Set; 47 | sourceFileNames: Set; 48 | } 49 | > = {}; 50 | 51 | const result: AggregationResults = []; 52 | 53 | for (const score of scores) { 54 | const key = `${score.providerDID}:${score.modelDID}`; 55 | 56 | if (!results[key]) { 57 | results[key] = { 58 | providerDID: score.providerDID, 59 | modelDID: score.modelDID, 60 | score: 0, 61 | latency: 0, 62 | responseCount: 0, 63 | wrongAnswers: 0, 64 | runIds: new Set(), 65 | sourcePromptDatasetCIDs: new Set(), 66 | sourceFileNames: new Set(), 67 | }; 68 | } 69 | 70 | results[key].score += score.score; 71 | results[key].latency += score.repliedAt - score.promptedAt; 72 | results[key].responseCount++; 73 | 74 | // Add runId to the set (this will handle duplicates automatically) 75 | if (score.runId) { 76 | results[key].runIds.add(score.runId); 77 | } 78 | 79 | // Add sourcePromptDatasetCID to the set 80 | if (score.sourcePromptDatasetCID) { 81 | results[key].sourcePromptDatasetCIDs.add(score.sourcePromptDatasetCID); 82 | } 83 | 84 | // Add sourceFileName to the set 85 | if (score.sourceFileName) { 86 | results[key].sourceFileNames.add(score.sourceFileName); 87 | } 88 | 89 | if (score.score === 0) { 90 | results[key].wrongAnswers++; 91 | } 92 | } 93 | 94 | for (const [, values] of Object.entries(results)) { 95 | result.push({ 96 | providerDID: values.providerDID, 97 | modelDID: values.modelDID, 98 | taskDID, 99 | avgLatency: values.latency / values.responseCount / 1000, 100 | avgScore: parseFloat((values.score / values.responseCount).toFixed(2)), 101 | missingAnswers: Math.abs( 102 | values.responseCount - values.score - values.wrongAnswers 103 | ), 104 | score: values.score, 105 | totalResponse: values.responseCount, 106 | wrongAnswers: values.wrongAnswers, 107 | score_runIds: Array.from(values.runIds), 108 | sourcePromptDatasetCIDs: Array.from(values.sourcePromptDatasetCIDs), 109 | sourceFileNames: Array.from(values.sourceFileNames), 110 | }); 111 | } 112 | 113 | result.sort((a, b) => { 114 | const order = [ 115 | [b.score, a.score], 116 | [a.avgScore, b.avgScore], 117 | [a.avgLatency, b.avgLatency], 118 | [b.totalResponse, a.totalResponse], 119 | ]; 120 | 121 | for (const values of order) { 122 | if (values[0] !== values[1]) { 123 | return values[0] - values[1]; 124 | } 125 | } 126 | 127 | const lastOrderColumn = order[order.length - 1]; 128 | return lastOrderColumn[0] - lastOrderColumn[1]; 129 | }); 130 | 131 | return result; 132 | } 133 | -------------------------------------------------------------------------------- /peerBenchJS/src/core/format.ts: -------------------------------------------------------------------------------- 1 | import { BigBenchTaskFormat } from "@/formats/bigbench"; 2 | import { logger } from "./logger"; 3 | import { parse as parsePath } from "path"; 4 | import { MMLUProTaskFormat } from "@/formats/mmlu-pro"; 5 | import { MedQATaskFormat } from "@/formats/medQA"; 6 | import { parseTaskDID } from "./parser"; 7 | import { statSync } from "fs"; 8 | 9 | /** 10 | * Add all the possible task file formats 11 | */ 12 | export const taskFormats = [ 13 | new BigBenchTaskFormat(), 14 | new MMLUProTaskFormat(), 15 | new MedQATaskFormat(), 16 | ]; 17 | 18 | export function getTaskFormat(name: string) { 19 | for (const tp of taskFormats) { 20 | if 
(tp.name === name) { 21 | return tp; 22 | } 23 | } 24 | throw new Error(`No task processor found called "${name}"`); 25 | } 26 | 27 | /** 28 | * Loads and parses the given task file with the first possible reader from readers list. 29 | */ 30 | export async function readTask(path: string) { 31 | if (!statSync(path, { throwIfNoEntry: false })?.isFile()) { 32 | throw new Error(`The task file doesn't exist: ${path}`); 33 | } 34 | 35 | for (const taskFormat of taskFormats) { 36 | try { 37 | const task = await taskFormat.parseFromFile(path); 38 | const taskName = parseTaskDID(task.did); 39 | 40 | // If the task name doesn't exist, just use the file name (without extension) as its name 41 | if (taskName === "") { 42 | task.did = `did:task:${parsePath(path).name}`; 43 | } 44 | 45 | return { 46 | formatName: taskFormat.name, 47 | task, 48 | }; 49 | } catch (err) { 50 | logger.debug( 51 | `Task format "${taskFormat.name}" didn't work for the task file ${path}: ${err}` 52 | ); 53 | } 54 | } 55 | 56 | throw new Error(`No valid task format found for the given file: ${path}`); 57 | } 58 | -------------------------------------------------------------------------------- /peerBenchJS/src/core/logger.ts: -------------------------------------------------------------------------------- 1 | import { createLogger, format, transports } from "winston"; 2 | import { config } from "../config"; 3 | import * as ansis from "ansis"; 4 | 5 | type ColorName = "red" | "yellow" | "green" | "magenta" | "cyan"; 6 | type LogLevel = "error" | "warning" | "info" | "debug"; 7 | 8 | const colors = { 9 | error: "red", 10 | warning: "yellow", 11 | info: "green", 12 | debug: "cyan", 13 | }; 14 | 15 | export const logger = createLogger({ 16 | level: config.LOG_LEVEL, 17 | levels: { 18 | error: 0, 19 | warning: 1, 20 | info: 2, 21 | debug: 3, 22 | }, 23 | transports: [ 24 | new transports.Console({ 25 | format: format.combine( 26 | format.prettyPrint(), 27 | format.splat(), 28 | format.timestamp({ 29 | format: "YYYY/MM/DD - HH:mm:ss", 30 | }), 31 | format.printf((info) => { 32 | const color = colors[info.level as LogLevel]; 33 | const levelColor = ansis[color as ColorName]; 34 | const context = (info.context as string) || ""; 35 | const formatted = [ 36 | `${info.timestamp}`, 37 | levelColor.bold(info.level.toUpperCase()), 38 | levelColor.bold(`[${context || "Main"}]`).padEnd(25), 39 | config.NODE_ENV == "dev" && info.stack 40 | ? levelColor( 41 | `${info.message}${(info.stack as string[]).join("\n")}` 42 | ) 43 | : levelColor(info.message), 44 | ]; 45 | 46 | return formatted.filter((f) => f).join(" "); 47 | }) 48 | ), 49 | }), 50 | ], 51 | }); 52 | -------------------------------------------------------------------------------- /peerBenchJS/src/core/parser.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | import { logger } from "./logger"; 3 | 4 | /** 5 | * Tries to parse the given string as JSON. 6 | * Returns undefined if it is not a valid JSON entity. 
7 | */ 8 | export function tryParseJson(content: string): T | undefined { 9 | try { 10 | return JSON.parse(content); 11 | } catch { 12 | // Invalid JSON 13 | } 14 | } 15 | 16 | /** 17 | * Parses provider and model config which follow the pattern below: 18 | * : 19 | * @returns Provider name and model identifier 20 | */ 21 | export function parseProviderConfig(identifier: string) { 22 | const regex = /^([^:]+):(.+)$/; 23 | const match = identifier.match(regex); 24 | if (!match) { 25 | logger.warning(`Invalid provider config: ${identifier}`); 26 | return; 27 | } 28 | 29 | const [, name, modelIdentifier] = match; 30 | return { 31 | providerName: name, 32 | modelIdentifier, 33 | }; 34 | } 35 | 36 | /** 37 | * Parses JSONL formatted string into an array 38 | * @returns An array of parsed JSON lines 39 | */ 40 | export function parseJSONL(str: string): T[] { 41 | return str 42 | .split("\n") // Split per line 43 | .filter((line) => line.trim() !== "") // Filter empty lines 44 | .map((line) => tryParseJson(line)) // Parse line (parse as undefined if it is invalid) 45 | .filter((obj) => obj !== undefined); // Filter invalid lines 46 | } 47 | 48 | /** 49 | * Extracts the first error message (if there is any) 50 | * from a Zod safe parse result and format it. 51 | * @param safeParseReturn 52 | * @param path Path of the parsing object. It will be used to indicate the invalid field if the info is not available in the validation error. 53 | */ 54 | export function parseValidationError( 55 | safeParseReturn: z.SafeParseReturnType, 56 | path?: string 57 | ) { 58 | path ??= ""; 59 | 60 | if (safeParseReturn?.error) { 61 | const firstError = safeParseReturn.error.errors[0]; 62 | 63 | if (path) { 64 | path = `${path}: `; 65 | } 66 | 67 | // Include path if there is 68 | path = 69 | firstError.path.length > 0 ? 
`"${firstError.path.join(".")}": ` : path; 70 | return `${path}${firstError.message}`; 71 | } 72 | } 73 | 74 | /** 75 | * Parses provider name from a DID provider definition 76 | */ 77 | export function parseProviderDID(did: string) { 78 | return did.split(":")[2]; 79 | } 80 | 81 | /** 82 | * Parses task name from a DID definition (yes it is the same thing as `parseProviderDID`) 83 | */ 84 | export function parseTaskDID(did: string) { 85 | return did.split(":")[2]; 86 | } 87 | 88 | /** 89 | * Parses the model name (including owner name) from a DID model definition 90 | */ 91 | export function parseModelDID(did: string) { 92 | const parts = did.split(":").slice(2); 93 | 94 | // Model name includes ':' character 95 | if (parts.length > 2) { 96 | return parts.join(":"); 97 | } 98 | return parts.join(""); 99 | } 100 | -------------------------------------------------------------------------------- /peerBenchJS/src/core/program.ts: -------------------------------------------------------------------------------- 1 | import { Command } from "commander"; 2 | 3 | export const program = new Command("peerbench") 4 | .allowUnknownOption(true) 5 | .configureHelp({ 6 | showGlobalOptions: true, 7 | }); 8 | 9 | program.configureHelp({ 10 | optionTerm(option) { 11 | return option.flags; 12 | }, 13 | subcommandTerm(cmd) { 14 | return cmd.name(); 15 | }, 16 | commandUsage(cmd) { 17 | const usage: string[] = []; 18 | for (let parent = cmd.parent; parent; parent = parent.parent) { 19 | usage.push(parent.name()); 20 | } 21 | usage.reverse(); 22 | return `${usage.join(" ")} ${cmd.name()} ${cmd.usage()}`; 23 | }, 24 | }); 25 | -------------------------------------------------------------------------------- /peerBenchJS/src/core/prompt.ts: -------------------------------------------------------------------------------- 1 | import { readTask } from "./format"; 2 | import { generateCID, readableTime } from "./utils"; 3 | import { config } from "@/config"; 4 | import { AbstractProvider } from "@/base/provider"; 5 | import { 6 | MaybePromise, 7 | Prompt, 8 | PromptOptions, 9 | PromptResponse, 10 | Task, 11 | } from "@/types"; 12 | import { v7 as uuidv7 } from "uuid"; 13 | import { parseProviderConfig } from "./parser"; 14 | import { logger } from "./logger"; 15 | import { blue, yellow } from "ansis"; 16 | import { getProvider } from "./providers"; 17 | import { basename } from "path"; 18 | import { calculateSHA256 } from "./std"; 19 | 20 | /** 21 | * Sends the prompts from the given task files to the given Providers and 22 | * collects the responses. 23 | * @param identifiers Provider and model identifiers. Should be in `providerName:modelOwner/modelName` format 24 | * @param taskPaths Path of the task files in the local file system 25 | */ 26 | export async function prompt( 27 | identifiers: string[], 28 | taskPaths: string[], 29 | options?: { 30 | /** 31 | * Maximum amount of prompt that will be used from the beginning of the given tasks. 32 | * Uses all tasks if undefined 33 | * @default undefined 34 | */ 35 | maxPrompt?: number; 36 | /** 37 | * Called whenever a response is received. 
38 | */ 39 | onResponseReceived?: (response: PromptResponse) => MaybePromise; 40 | } 41 | ) { 42 | const promises: Promise[] = []; 43 | 44 | // Read all the tasks and make them usable 45 | const tasks = await Promise.all( 46 | taskPaths.map(async (taskPath) => (await readTask(taskPath)).task) 47 | ); 48 | 49 | // Total amount of prompt request to be sent 50 | let totalPromptCount = 51 | tasks.reduce((acc, t) => acc + t.prompts.length, 0) * identifiers.length; 52 | let responseCount = 0; 53 | 54 | if (options?.maxPrompt) { 55 | totalPromptCount = options.maxPrompt * tasks.length * identifiers.length; 56 | logger.warning( 57 | `Only ${options.maxPrompt} prompt will be used from each given task file` 58 | ); 59 | } 60 | 61 | for (let taskIndex = 0; taskIndex < tasks.length; taskIndex++) { 62 | const task = tasks[taskIndex]; 63 | const taskPath = taskPaths[taskIndex]; 64 | const taskFileName = basename(taskPath); // Ensure we always have a filename 65 | const runId = uuidv7(); // New evaluation ID per given task 66 | 67 | logger.debug( 68 | `Found ${task.prompts.length} prompt in ${yellow.bold( 69 | task.did 70 | )} (${yellow.bold(taskFileName)})` 71 | ); 72 | 73 | for (const identifier of identifiers) { 74 | const info = parseProviderConfig(identifier); 75 | if (info === undefined) { 76 | continue; 77 | } 78 | 79 | const provider = getProvider(info.providerName); 80 | if (provider === undefined) { 81 | continue; 82 | } 83 | 84 | if (options?.maxPrompt) { 85 | task.prompts = task.prompts.slice(0, options.maxPrompt); 86 | } 87 | 88 | promises.push( 89 | execPrompts(provider, task, info.modelIdentifier, runId, (response) => { 90 | responseCount++; 91 | logger.info( 92 | `${responseCount} prompt done (from ${yellow.bold( 93 | taskFileName 94 | )}), ${totalPromptCount - responseCount} prompt left` 95 | ); 96 | options?.onResponseReceived?.(response); 97 | }) 98 | ); 99 | } 100 | } 101 | 102 | await Promise.all(promises); 103 | logger.info(`Prompt phase is done`); 104 | } 105 | 106 | async function execPrompt( 107 | task: Task, 108 | promptNumber: number, 109 | provider: AbstractProvider, 110 | prompt: Prompt, 111 | model: string, 112 | runId: string 113 | ) { 114 | const providerLogger = provider.logger.child({ 115 | context: `Provider(${provider.name}:${model})`, 116 | }); 117 | const promptIdentifier = `${promptNumber} from ${yellow.bold( 118 | task.did 119 | )} (${yellow.bold(task.fileName)})`; 120 | 121 | try { 122 | const input = prompt.other.stdFullPromptText; 123 | const result = await provider.forward( 124 | input, 125 | model, 126 | // TODO: Change prompt based on the evaluation type 127 | "You are an knowledge expert, you are supposed to answer the multi-choice question to derive your final answer as `The answer is ...` without any other additional text or explanation." 
128 | ); 129 | const elapsedSeconds = 130 | (result.completedAt.getTime() - result.startedAt.getTime()) / 1000; 131 | const response = result.response.trim(); 132 | 133 | providerLogger.debug(`Result of prompt ${promptIdentifier}: ${response}`); 134 | providerLogger.info( 135 | `Prompt ${promptIdentifier} is completed in ${blue.bold( 136 | readableTime(elapsedSeconds) 137 | )}` 138 | ); 139 | 140 | const promptCID = (await generateCID(input)).toString(); 141 | const responseCID = (await generateCID(response)).toString(); 142 | const promptResponse: PromptResponse = { 143 | modelDID: `did:model:${model}`, 144 | validatorDID: config.VALIDATOR_DID, 145 | providerDID: provider.did, 146 | taskDID: task.did, 147 | 148 | runId, 149 | sourcePromptDatasetCID: task.cid, 150 | sourceFileName: task.fileName, 151 | 152 | correctResponse: prompt.answer_idx, 153 | promptCID, 154 | responseCID, 155 | 156 | promptData: input, 157 | responseData: response, 158 | 159 | promptedAt: result.startedAt.getTime(), 160 | repliedAt: result.completedAt.getTime(), 161 | 162 | questionUUID: prompt.other?.stdQuestionUUID || uuidv7(), 163 | questionHash: calculateSHA256(prompt.question), 164 | 165 | fullPromptData: input, 166 | fullPromptHash: calculateSHA256(input), 167 | }; 168 | return promptResponse; 169 | } catch (err) { 170 | providerLogger.error(`Error on prompt ${promptIdentifier}: ${err}`); 171 | } 172 | } 173 | 174 | async function execPrompts( 175 | provider: AbstractProvider, 176 | task: Task, 177 | model: string, 178 | evaluationRunId: string, 179 | onResponseReceived?: (response: PromptResponse) => MaybePromise 180 | ) { 181 | const promises: Promise[] = []; 182 | for (let i = 0; i < task.prompts.length; i++) { 183 | const prompt = task.prompts[i]; 184 | 185 | promises.push( 186 | execPrompt(task, i + 1, provider, prompt, model, evaluationRunId).then( 187 | (response) => { 188 | if (response) { 189 | onResponseReceived?.(response); 190 | } 191 | } 192 | ) 193 | ); 194 | } 195 | await Promise.all(promises); 196 | } 197 | 198 | /** 199 | * Prepares the whole prompt that is going to be asked to the model 200 | */ 201 | export function preparePrompt(question: string, options: PromptOptions) { 202 | // Append answers to the result 203 | let result = `${question}\n\n`; 204 | for (const [letter, answer] of Object.entries(options)) { 205 | result += `${letter}: ${answer}\n`; 206 | } 207 | 208 | return result; 209 | } 210 | -------------------------------------------------------------------------------- /peerBenchJS/src/core/providers.ts: -------------------------------------------------------------------------------- 1 | import { AbstractProvider } from "@/base/provider"; 2 | import { logger } from "./logger"; 3 | import { OpenRouterProvider } from "@/providers/openrouter"; 4 | import { NearAIProvider } from "@/providers/nearai"; 5 | 6 | function addProvider(instantiate: () => AbstractProvider) { 7 | try { 8 | providers.push(instantiate()); 9 | } catch (err: any) { 10 | logger.warning(`Couldn't add provider: ${err?.message}`); 11 | } 12 | } 13 | 14 | export const providers: AbstractProvider[] = []; 15 | 16 | addProvider(() => new OpenRouterProvider()); 17 | addProvider(() => new NearAIProvider()); 18 | 19 | export function getProvider(name: string) { 20 | const provider = providers.find((p) => p.name === name); 21 | if (provider === undefined) { 22 | logger.warning(`Provider not found: ${name}`); 23 | return; 24 | } 25 | 26 | return provider; 27 | } 28 | 
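For orientation, here is a minimal usage sketch (not part of the repository) showing how the pieces above fit together: `getProvider` resolves a registered provider by name, `preparePrompt` flattens a question and its lettered options into a single prompt string, and the provider's `forward` method sends it. The question, options, and model identifier are hypothetical examples, and the call assumes the OpenRouter provider's key is configured so its registration succeeds.

```ts
// Sketch only: ties together getProvider, preparePrompt and provider.forward as used above.
// The question, options, and model identifier are hypothetical examples.
import { getProvider } from "@/core/providers";
import { preparePrompt } from "@/core/prompt";

async function askOnce() {
  const provider = getProvider("openrouter.ai"); // undefined (plus a warning) if not registered
  if (!provider) return;

  const input = preparePrompt("What is 2 + 2?", { A: "3", B: "4", C: "5" });
  const result = await provider.forward(
    input,
    "openai/gpt-4o", // hypothetical modelOwner/modelName identifier
    "Answer with only the letter of the correct option."
  );

  const elapsedMs = result.completedAt.getTime() - result.startedAt.getTime();
  console.log(`${result.response} (${elapsedMs} ms)`);
}
```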
-------------------------------------------------------------------------------- /peerBenchJS/src/core/score.ts: -------------------------------------------------------------------------------- 1 | import { MaybePromise, PromptResponse, PromptScore } from "@/types"; 2 | import { readFile } from "./utils"; 3 | 4 | export async function score( 5 | responseFilePaths: string[], 6 | scorer?: (response: PromptResponse) => MaybePromise 7 | ) { 8 | const contents = responseFilePaths.map((path) => readFile(path)); 9 | const promptResponses: PromptResponse[] = contents 10 | .map((content) => 11 | // TODO: Ability to read CSV files 12 | JSON.parse(content) 13 | ) 14 | .flat(); 15 | // TODO: Validate the `promptResponses` schema via Zod 16 | 17 | const scores: PromptScore[] = []; 18 | 19 | for (const promptResponse of promptResponses) { 20 | let score = 0; 21 | // TODO: Maybe also check the CIDs to be sure everything is correct? 22 | 23 | // If the `scorer` function is presented, use it. 24 | if (scorer) { 25 | score = await scorer(promptResponse); 26 | } else { 27 | // TODO: Use different answer check approaches by using `promptResponse.evalType` 28 | if (promptResponse.responseData === promptResponse.correctResponse) { 29 | score = 1; 30 | } else { 31 | // Look for some patterns for the answer 32 | const answer = lookForAnswer(promptResponse.responseData, [ 33 | { 34 | regex: /answer is\s+([A-Z])/gi, 35 | answerGroupIndex: 1, 36 | }, 37 | { 38 | regex: /answer is\s+\**([A-Z])\**/gi, 39 | answerGroupIndex: 1, 40 | }, 41 | { 42 | regex: /([A-Z]):.+/g, 43 | answerGroupIndex: 1, 44 | }, 45 | ]); 46 | 47 | if (answer !== undefined && answer === promptResponse.correctResponse) { 48 | score = 1; 49 | } 50 | } 51 | } 52 | 53 | scores.push({ 54 | ...promptResponse, 55 | score, 56 | }); 57 | } 58 | 59 | return scores; 60 | } 61 | 62 | function lookForAnswer( 63 | response: string, 64 | patterns: { 65 | regex: RegExp; 66 | answerGroupIndex: number; 67 | }[] 68 | ) { 69 | for (const pattern of patterns) { 70 | const matches = Array.from(response.matchAll(pattern.regex)); 71 | const match = matches.at(-1); 72 | 73 | if (match) { 74 | return match[pattern.answerGroupIndex]; 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /peerBenchJS/src/core/uploaders.ts: -------------------------------------------------------------------------------- 1 | import { CoServerUploader } from "@/uploaders/coserver"; 2 | import { logger } from "./logger"; 3 | 4 | export const uploaders = [new CoServerUploader()]; 5 | 6 | export function getUploader(name: string) { 7 | const uploader = uploaders.find( 8 | (p) => p.name.toLowerCase() === name.toLowerCase() 9 | ); 10 | if (uploader === undefined) { 11 | logger.warning(`Uploader not found: ${name}`); 12 | return; 13 | } 14 | 15 | return uploader; 16 | } 17 | -------------------------------------------------------------------------------- /peerBenchJS/src/errors/task.ts: -------------------------------------------------------------------------------- 1 | export class InvalidTaskError extends Error { 2 | constructor(message?: string) { 3 | super(message || "Task is in an invalid format"); 4 | this.name = "InvalidTaskError"; 5 | } 6 | } 7 | 8 | export class TaskNotRecognizedError extends InvalidTaskError { 9 | constructor() { 10 | super("Task is not recognized"); 11 | this.name = "TaskNotRecognizedError"; 12 | } 13 | } 14 | -------------------------------------------------------------------------------- 
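As a quick illustration of the `scorer` hook above, the sketch below (not part of the repository) scores a saved responses file with a custom exact-match rule instead of the default regex fallbacks. The file name is an assumed example and would need to contain a JSON array of `PromptResponse` objects.

```ts
// Sketch only: custom scorer passed to score(); "responses.json" is an assumed example file.
import { score } from "@/core/score";

async function scoreStrictly() {
  const scores = await score(["responses.json"], (response) =>
    response.responseData.trim().toUpperCase() === response.correctResponse ? 1 : 0
  );
  console.log(scores.map((s) => ({ model: s.modelDID, score: s.score })));
}
```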
/peerBenchJS/src/formats/bigbench.ts: -------------------------------------------------------------------------------- 1 | import { AbstractTaskFormat } from "@/base/task-format"; 2 | import { parseValidationError, tryParseJson } from "@/core/parser"; 3 | import { checkValidationError, readFile, generateCID } from "@/core/utils"; 4 | import { InvalidTaskError, TaskNotRecognizedError } from "@/errors/task"; 5 | import { MaybePromise, Prompt, Task, EvalType, EvalTypes } from "@/types"; 6 | import { z } from "zod"; 7 | import { basename } from "path"; 8 | import { generatePromptMetaFields } from "@/core/std"; 9 | import { v7 as uuidv7 } from "uuid"; 10 | 11 | export const BigBenchTaskSchema = z.object({ 12 | canary: z.string().startsWith("BENCHMARK DATA SHOULD NEVER"), 13 | name: z.string(), 14 | description: z.string(), 15 | keywords: z.array(z.string()), 16 | metrics: z.array(z.string()), 17 | preferred_score: z.string(), 18 | output_regex: z.string().optional(), 19 | example_input_suffix: z.string().optional(), 20 | example_input_prefix: z.string().optional(), 21 | example_output_suffix: z.string().optional(), 22 | example_output_prefix: z.string().optional(), 23 | choice_prefix: z.string().optional(), 24 | examples: z.array( 25 | z.object({ 26 | id: z.coerce.number().optional(), 27 | input: z.string(), 28 | target: z.string().optional(), 29 | target_scores: z.record(z.string(), z.coerce.number()), 30 | }) 31 | ), 32 | }); 33 | 34 | export type BigBenchTask = z.infer; 35 | 36 | export class BigBenchTaskFormat extends AbstractTaskFormat { 37 | name = "bigbench"; 38 | 39 | async parseFromFile(path: string): Promise { 40 | const content = readFile(path); 41 | const sourceFileName = basename(path); 42 | const sourceFileCID = (await generateCID(content)).toString(); 43 | const json = tryParseJson(content); 44 | 45 | if (json === undefined) { 46 | throw new InvalidTaskError(); 47 | } 48 | 49 | if (!this.recognize(json)) { 50 | throw new TaskNotRecognizedError(); 51 | } 52 | 53 | const task = checkValidationError(BigBenchTaskSchema.safeParse(json)); 54 | const prompts: Prompt[] = []; 55 | const evalTypes: EvalType[] = []; 56 | 57 | for (const metric of task.metrics) { 58 | switch (metric) { 59 | case "multiple_choice_grade": 60 | evalTypes.push(EvalTypes.MultipleChoice); 61 | break; 62 | case "exact_str_match": 63 | evalTypes.push(EvalTypes.ExactEquality); 64 | break; 65 | default: 66 | // TODO: Check other metric types 67 | break; 68 | } 69 | } 70 | 71 | // Parse examples (aka prompts, tests) 72 | let exampleRowNumber = 0; 73 | for (const example of task.examples || []) { 74 | const options: Record = {}; 75 | let data = example.input; 76 | let answer_idx = ""; 77 | 78 | if (task.example_input_prefix) { 79 | data = `${task.example_input_prefix}${data}`; 80 | } 81 | if (task.example_input_suffix) { 82 | data = `${data}${task.example_input_suffix}`; 83 | } 84 | 85 | let answerLetterIndex = 0; 86 | for (const [answer, score] of Object.entries(example.target_scores)) { 87 | const letter = String.fromCharCode(65 + answerLetterIndex); 88 | 89 | if (score === 1) { 90 | answer_idx = letter; 91 | } 92 | 93 | options[letter] = answer; 94 | 95 | answerLetterIndex++; 96 | } 97 | 98 | prompts.push({ 99 | question: data, 100 | options, 101 | answer_idx, 102 | answer: options[answer_idx], 103 | meta_info: task.name, 104 | other: { 105 | ...generatePromptMetaFields({ 106 | options, 107 | question: data, 108 | rowNumber: exampleRowNumber, 109 | sourceFileCID, 110 | sourceFileName, 111 | uuid: uuidv7(), 112 | }), 
113 | }, 114 | }); 115 | exampleRowNumber++; 116 | } 117 | 118 | return { 119 | did: `did:task:bigbench/${task.name.replaceAll(" ", "-")}`, 120 | prompts, 121 | path, 122 | cid: sourceFileCID, 123 | fileName: basename(path), 124 | }; 125 | } 126 | 127 | recognize(content: any): MaybePromise { 128 | const err = parseValidationError(BigBenchTaskSchema.safeParse(content)); 129 | 130 | if (err) { 131 | return false; 132 | } 133 | 134 | return true; 135 | } 136 | 137 | async convertTo(task: Task, targetFormat: string): Promise { 138 | switch (targetFormat) { 139 | default: 140 | throw new Error(`Unsupported target format: ${targetFormat}`); 141 | } 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /peerBenchJS/src/formats/mmlu-pro.ts: -------------------------------------------------------------------------------- 1 | import { AbstractTaskFormat } from "@/base/task-format"; 2 | import { parseJSONL, tryParseJson } from "@/core/parser"; 3 | import { checkValidationError, readFile, generateCID } from "@/core/utils"; 4 | import { InvalidTaskError, TaskNotRecognizedError } from "@/errors/task"; 5 | import { MaybePromise, Prompt, Task } from "@/types"; 6 | import { asyncBufferFromFile, parquetReadObjects } from "hyparquet"; 7 | import { z } from "zod"; 8 | import { basename } from "path"; 9 | import { generatePromptMetaFields } from "@/core/std"; 10 | import { v7 as uuidv7 } from "uuid"; 11 | 12 | export const MMLUProTaskSchema = z.object({ 13 | question_id: z.coerce.number(), 14 | question: z.string(), 15 | options: z.array(z.string()), 16 | answer: z.string(), 17 | answer_index: z.coerce.number(), 18 | cot_content: z.string(), 19 | category: z.string(), 20 | src: z.string(), 21 | }); 22 | 23 | export type MMLUProTask = z.infer; 24 | 25 | export class MMLUProTaskFormat extends AbstractTaskFormat { 26 | name = "mmlu-pro"; 27 | 28 | async parseFromFile(path: string): Promise { 29 | const sourceFileName = basename(path); 30 | let sourceFileCID: string = ""; 31 | 32 | // TODO: calculate CID for parquet file 33 | // Try to parse the file as Parquet 34 | let data = await this.tryParseParquet(path); 35 | 36 | // Not a Parquet file 37 | if (!data) { 38 | const content = readFile(path); 39 | 40 | // Calculate CID for the source file 41 | sourceFileCID = (await generateCID(content)).toString(); 42 | 43 | // Try to parse it as JSON or JSONL 44 | data = this.tryParseJSON(content); 45 | 46 | // Not a JSON file 47 | if (!data) { 48 | data = this.tryParseJSONL(content); 49 | } 50 | } 51 | 52 | if (!data || data.length == 0) { 53 | throw new InvalidTaskError(); 54 | } 55 | 56 | if (!this.recognize(data)) { 57 | throw new TaskNotRecognizedError(); 58 | } 59 | 60 | const categories = new Set(); 61 | const prompts: Prompt[] = []; 62 | const validatedData = checkValidationError( 63 | z.array(MMLUProTaskSchema).safeParse(data) 64 | ); 65 | 66 | // Parse prompts 67 | for (const prompt of validatedData) { 68 | const options: Record = {}; 69 | let answer_idx = ""; 70 | 71 | for (let i = 0; i < prompt.options.length; i++) { 72 | const option = prompt.options[i]; 73 | const letter = String.fromCharCode(65 + i); 74 | options[letter] = option; 75 | 76 | // Get the answer letter 77 | if (i === prompt.answer_index) { 78 | answer_idx = letter; 79 | } 80 | } 81 | 82 | const standardPrompt: Prompt = { 83 | question: prompt.question, 84 | answer: prompt.options[prompt.answer_index], 85 | answer_idx, 86 | options, 87 | meta_info: prompt.category, 88 | other: { 89 | 
...generatePromptMetaFields({ 90 | options, 91 | question: prompt.question, 92 | rowNumber: prompt.question_id, 93 | sourceFileCID, 94 | sourceFileName, 95 | uuid: uuidv7(), 96 | }), 97 | 98 | "mmlu-pro__question_id": prompt.question_id, 99 | "mmlu-pro__answer_index": prompt.answer_index, 100 | "mmlu-pro__cot_content": prompt.cot_content, 101 | "mmlu-pro__category": prompt.category, 102 | "mmlu-pro__src": prompt.src, 103 | }, 104 | }; 105 | 106 | for (const [name, value] of Object.entries(prompt)) { 107 | // If original object has fields other than those, include them in the standardized object as well 108 | if ( 109 | ![ 110 | "question", 111 | "options", 112 | "answer", 113 | "answer_index", 114 | "question_id", 115 | "cot_content", 116 | "category", 117 | "src", 118 | ].includes(name) 119 | ) { 120 | standardPrompt.other![`mmlu-pro__${name}`] = value; 121 | } 122 | } 123 | 124 | categories.add(prompt.category); 125 | prompts.push(standardPrompt); 126 | } 127 | 128 | let did = "did:task:mmlu-pro"; 129 | 130 | // Not all of the categories are presented so update 131 | // the DID according to the found categories 132 | if (categories.size != 14) { 133 | const categoryNames = [...categories].map((category) => 134 | category.replaceAll(" ", "-").toLowerCase() 135 | ); 136 | did += `/${categoryNames.join("-")}`; 137 | } 138 | 139 | return { 140 | did, 141 | prompts, 142 | cid: sourceFileCID, 143 | fileName: sourceFileName, 144 | path, 145 | }; 146 | } 147 | 148 | recognize(content: any): MaybePromise { 149 | if (Array.isArray(content)) { 150 | const json: MMLUProTask[] = content; 151 | for (const task of json) { 152 | // If any of the items is not valid 153 | // we cannot recognize this object as a whole 154 | if (!this.recognize(task)) { 155 | return false; 156 | } 157 | } 158 | return true; 159 | } 160 | 161 | if (typeof content !== "object") { 162 | return false; 163 | } 164 | 165 | const validation = MMLUProTaskSchema.safeParse(content); 166 | if (validation.error) { 167 | return false; 168 | } 169 | 170 | return true; 171 | } 172 | 173 | tryParseJSON(content: string) { 174 | return tryParseJson(content); 175 | } 176 | 177 | tryParseJSONL(content: string) { 178 | return parseJSONL(content); 179 | } 180 | 181 | async tryParseParquet(path: string): Promise { 182 | try { 183 | const file = await asyncBufferFromFile(path); 184 | return await parquetReadObjects({ file }); 185 | } catch { 186 | // Not a valid Parquet file 187 | } 188 | } 189 | 190 | async convertTo(task: Task, targetFormat: string): Promise { 191 | switch (targetFormat) { 192 | case "mmlu-pro": 193 | return task.prompts.map((prompt) => { 194 | // Convert options from object to array 195 | const options = Object.entries(prompt.options) 196 | .sort(([a], [b]) => a.localeCompare(b)) 197 | .map(([, value]) => value); 198 | const answer_index = options.findIndex( 199 | (option) => option === prompt.answer 200 | ); 201 | 202 | return { 203 | question_id: 204 | prompt.other?.src_row_number || 205 | prompt.other?.question_id || 206 | prompt.other?.["mmlu-pro__question_id"], 207 | question: prompt.question, 208 | options, 209 | answer: prompt.answer_idx, // Answer letter 210 | answer_index, // Index of the correct answer in the options array 211 | cot_content: "", 212 | category: 213 | prompt.meta_info || 214 | prompt.other?.category || 215 | prompt.other?.["mmlu-pro__category"], 216 | src: prompt.other?.["src"] || prompt.other?.["mmlu-pro__src"], 217 | }; 218 | }); 219 | case "medqa": 220 | // Since we are using MedQA as the unified 
schema, we can just return the prompts 221 | return [...task.prompts]; 222 | 223 | default: 224 | throw new Error(`Unsupported target format: ${targetFormat}`); 225 | } 226 | } 227 | } 228 | -------------------------------------------------------------------------------- /peerBenchJS/src/index.ts: -------------------------------------------------------------------------------- 1 | import { red } from "ansis"; 2 | import { logger } from "./core/logger"; 3 | import { program } from "./core/program"; 4 | 5 | import "@/commands/prompt"; 6 | import "@/commands/score"; 7 | import "@/commands/aggregate"; 8 | import "@/commands/upload"; 9 | import "@/commands/rephrase"; 10 | import "@/commands/std"; 11 | import { ensureError } from "./core/utils"; 12 | 13 | program.parseAsync().catch((err) => { 14 | const error = ensureError(err); 15 | logger.error(red(error.stack)); 16 | }); 17 | -------------------------------------------------------------------------------- /peerBenchJS/src/providers/nearai.ts: -------------------------------------------------------------------------------- 1 | import { AbstractProvider } from "@/base/provider"; 2 | import { readFile, sleep } from "@/core/utils"; 3 | import { ModelResponse } from "@/types"; 4 | import OpenAI from "openai"; 5 | import { homedir } from "os"; 6 | import { join } from "path"; 7 | import { z } from "zod"; 8 | 9 | const NearAIEnv = { 10 | CONFIG_PATH: z.string().default(join(homedir(), ".nearai", "config.json")), 11 | RATE_LIMIT: z.coerce.number().default(1), 12 | RATE_LIMIT_TIME_WINDOW: z.coerce.number().default(3000), 13 | TIMEOUT: z.coerce.number().default(60_0000), // 1 minute 14 | MAX_RETRIES: z.coerce.number().default(3), 15 | }; 16 | 17 | export class NearAIProvider extends AbstractProvider { 18 | client: OpenAI; 19 | private timestamps: number[] = []; 20 | 21 | constructor() { 22 | super({ 23 | name: "near.ai", 24 | env: NearAIEnv, 25 | }); 26 | 27 | // Parse config file 28 | const config = JSON.parse(readFile(this.env.CONFIG_PATH)); 29 | 30 | if (!config?.auth?.signature) { 31 | throw new Error( 32 | 'Signature is not found. 
Please try to login via "nearai" CLI' 33 | ); 34 | } 35 | 36 | // Initialize the client 37 | this.client = new OpenAI({ 38 | baseURL: "https://api.near.ai/v1", 39 | apiKey: JSON.stringify(config?.auth), 40 | maxRetries: this.env.MAX_RETRIES, 41 | timeout: this.env.TIMEOUT, 42 | }); 43 | } 44 | 45 | async enforceRateLimit(): Promise { 46 | const now = Date.now(); 47 | 48 | this.timestamps = this.timestamps.filter( 49 | (ts) => now - ts < this.env.RATE_LIMIT_TIME_WINDOW 50 | ); 51 | 52 | if (this.timestamps.length < this.env.RATE_LIMIT) { 53 | this.timestamps.push(now); 54 | return; 55 | } 56 | 57 | const earliest = this.timestamps[0]; 58 | const waitTime = this.env.RATE_LIMIT_TIME_WINDOW - (now - earliest); 59 | 60 | await sleep(waitTime); 61 | 62 | return this.enforceRateLimit(); 63 | } 64 | 65 | async forward( 66 | prompt: string, 67 | model: string, 68 | system: string 69 | ): Promise { 70 | await this.enforceRateLimit(); 71 | 72 | const startedAt = new Date(); 73 | const response = await this.client.chat.completions.create({ 74 | model, 75 | messages: [ 76 | { 77 | role: "system", 78 | content: system, 79 | }, 80 | { 81 | role: "user", 82 | content: prompt, 83 | }, 84 | ], 85 | }); 86 | 87 | if ("error" in response) { 88 | const err = response.error as any; 89 | throw new Error( 90 | `${err.message} - Code ${err.code} - ${JSON.stringify(err)}` 91 | ); 92 | } 93 | 94 | return { 95 | response: response?.choices?.[0]?.message?.content || "", 96 | startedAt, 97 | completedAt: new Date(), 98 | }; 99 | } 100 | 101 | parseModelIdentifier(identifier: string): { 102 | modelName: string; 103 | modelOwner: string; 104 | subProvider?: string; 105 | } { 106 | const regex = 107 | /^(?[^:]+)::(?[^/]+)\/(?.+)$/; 108 | const match = identifier.match(regex); 109 | if (match?.groups) { 110 | const providerName = match.groups.providerName; 111 | let modelOwner = match.groups.modelOwner; 112 | let modelName = match.groups.modelName; 113 | 114 | if (modelOwner === "accounts") { 115 | // Model owner field is not available, 116 | // so just use the provider name for it. 
117 | modelOwner = providerName; 118 | } 119 | 120 | // Delete redundant part 121 | if (modelName.startsWith(`${providerName}/models`)) { 122 | modelName = modelName.replaceAll(`${providerName}/models`, ""); 123 | } 124 | 125 | return { 126 | modelOwner, 127 | modelName, 128 | subProvider: providerName, 129 | }; 130 | } 131 | 132 | throw new Error(`Invalid identifier format: ${identifier}`); 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /peerBenchJS/src/providers/openrouter.ts: -------------------------------------------------------------------------------- 1 | import { AbstractProvider } from "@/base/provider"; 2 | import { sleep } from "@/core/utils"; 3 | import { ModelResponse } from "@/types"; 4 | import OpenAI from "openai"; 5 | import { z } from "zod"; 6 | 7 | const OpenRouterEnv = { 8 | KEY: z.string().nonempty(), 9 | RATE_LIMIT: z.coerce.number().default(20), 10 | RATE_LIMIT_TIME_WINDOW: z.coerce.number().default(3000), 11 | TIMEOUT: z.coerce.number().default(60_0000), // 1 minute 12 | MAX_RETRIES: z.coerce.number().default(3), 13 | }; 14 | 15 | export class OpenRouterProvider extends AbstractProvider { 16 | client: OpenAI; 17 | private timestamps: number[] = []; 18 | 19 | constructor() { 20 | super({ 21 | name: "openrouter.ai", 22 | env: OpenRouterEnv, 23 | }); 24 | 25 | // Initialize the client 26 | this.client = new OpenAI({ 27 | baseURL: "https://openrouter.ai/api/v1", 28 | apiKey: this.env.KEY, 29 | maxRetries: this.env.MAX_RETRIES, 30 | timeout: this.env.TIMEOUT, 31 | }); 32 | } 33 | 34 | async enforceRateLimit(): Promise { 35 | const now = Date.now(); 36 | 37 | this.timestamps = this.timestamps.filter( 38 | (ts) => now - ts < this.env.RATE_LIMIT_TIME_WINDOW 39 | ); 40 | 41 | if (this.timestamps.length < this.env.RATE_LIMIT) { 42 | this.timestamps.push(now); 43 | return; 44 | } 45 | 46 | const earliest = this.timestamps[0]; 47 | const waitTime = this.env.RATE_LIMIT_TIME_WINDOW - (now - earliest); 48 | 49 | await sleep(waitTime); 50 | 51 | return this.enforceRateLimit(); 52 | } 53 | 54 | async forward( 55 | prompt: string, 56 | model: string, 57 | system: string 58 | ): Promise { 59 | await this.enforceRateLimit(); 60 | 61 | const startedAt = new Date(); 62 | const response = await this.client.chat.completions.create({ 63 | model, 64 | messages: [ 65 | { 66 | role: "system", 67 | content: system, 68 | }, 69 | { 70 | role: "user", 71 | content: prompt, 72 | }, 73 | ], 74 | }); 75 | 76 | if ("error" in response) { 77 | const err = response.error as any; 78 | throw new Error( 79 | `${err.message} - Code ${err.code} - ${JSON.stringify(err)}` 80 | ); 81 | } 82 | 83 | return { 84 | response: response?.choices?.[0]?.message?.content || "", 85 | startedAt, 86 | completedAt: new Date(), 87 | }; 88 | } 89 | 90 | parseModelIdentifier(identifier: string): { 91 | modelName: string; 92 | modelOwner: string; 93 | subProvider?: string; 94 | } { 95 | const regex = /^(?[^/]+)\/(?[^:]+)(?::(?.+))?$/; 96 | const match = identifier.match(regex); 97 | 98 | if (match?.groups) { 99 | const modelOwner = match.groups.owner; 100 | const tier = match.groups.tier; 101 | let modelName = match.groups.model; 102 | 103 | if (tier) { 104 | modelName += "-" + tier; 105 | } 106 | 107 | return { 108 | modelOwner, 109 | modelName, 110 | subProvider: undefined, // TODO: parse sub provider if exists 111 | }; 112 | } 113 | 114 | throw new Error(`Invalid identifier format: ${identifier}`); 115 | } 116 | } 117 | 
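For reference, the regex above splits OpenRouter identifiers of the form `owner/model[:tier]` and folds any tier suffix into the model name. A hedged illustration follows; the identifiers are examples, and constructing the provider assumes its `KEY` environment setting is present.

```ts
// Illustration only; the expected results follow from the regex in parseModelIdentifier.
const provider = new OpenRouterProvider(); // throws if the OpenRouter KEY env setting is missing

provider.parseModelIdentifier("meta-llama/llama-4-scout:free");
// -> { modelOwner: "meta-llama", modelName: "llama-4-scout-free", subProvider: undefined }

provider.parseModelIdentifier("openai/gpt-4o");
// -> { modelOwner: "openai", modelName: "gpt-4o", subProvider: undefined }
```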
-------------------------------------------------------------------------------- /peerBenchJS/src/types.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | 3 | export const NodeEnvs = ["dev", "production"] as const; 4 | export type NodeEnv = (typeof NodeEnvs)[number]; 5 | 6 | export const LogLevels = ["error", "warning", "info", "debug"] as const; 7 | export type LogLevel = (typeof LogLevels)[number]; 8 | 9 | export type MaybePromise = T | Promise; 10 | 11 | export type PromptOptions = Record; 12 | 13 | export type ModelResponse = { 14 | startedAt: Date; 15 | completedAt: Date; 16 | response: string; 17 | }; 18 | 19 | export const EvalTypes = { 20 | ExactEquality: "exact-equality", 21 | MultipleChoice: "multiple-choice", 22 | } as const; 23 | export type EvalType = (typeof EvalTypes)[keyof typeof EvalTypes]; 24 | 25 | export const PromptSchema = z.object({ 26 | question: z.string(), 27 | options: z.record(z.string(), z.string()), 28 | answer_idx: z.string(), 29 | answer: z.string(), 30 | meta_info: z.string().optional(), 31 | // TODO: Rename `other` to `metadata` 32 | other: z 33 | .object({ 34 | hash_full_question: z.string(), 35 | hash_first_sentence: z.string(), 36 | hash_first_question_sentence: z.string(), 37 | hash_last_sentence: z.string(), 38 | stdQuestionUUID: z.string(), 39 | stdFullPromptText: z.string(), 40 | stdFullPromptHash: z.string(), 41 | src_row_number: z.number(), 42 | preSTDsrcFileName: z.string(), 43 | preSTDsrcCID: z.string(), 44 | }) 45 | .catchall(z.any()), 46 | }); 47 | 48 | export type Prompt = z.infer; 49 | 50 | export const TaskSchema = z.object({ 51 | did: z.string().startsWith("did:task:"), 52 | prompts: z.array(PromptSchema), 53 | cid: z.string(), 54 | fileName: z.string(), 55 | path: z.string(), 56 | }); 57 | export type Task = z.infer; 58 | 59 | export const PromptResponseSchema = z.object({ 60 | validatorDID: z.string().startsWith("did:val:"), 61 | providerDID: z.string().startsWith("did:prov:"), 62 | modelDID: z.string().startsWith("did:model:"), 63 | taskDID: z.string().startsWith("did:task:"), 64 | 65 | promptCID: z.string(), 66 | responseCID: z.string(), 67 | 68 | promptData: z.string(), 69 | responseData: z.string(), 70 | correctResponse: z.string(), 71 | 72 | promptedAt: z.number(), 73 | repliedAt: z.number(), 74 | 75 | runId: z.string(), 76 | 77 | questionUUID: z.string(), 78 | questionHash: z.string(), 79 | 80 | fullPromptData: z.string(), 81 | fullPromptHash: z.string(), 82 | 83 | // aka taskFileCID 84 | sourcePromptDatasetCID: z.string(), 85 | 86 | // aka taskFileName 87 | sourceFileName: z.string().optional(), 88 | }); 89 | 90 | export type PromptResponse = z.infer; 91 | 92 | export const PromptScoreSchema = PromptResponseSchema.extend({ 93 | promptData: z.string().optional(), 94 | responseData: z.string().optional(), 95 | correctResponse: z.string().optional(), 96 | sourcePromptDatasetCID: z.string().optional(), 97 | fullPromptData: z.string().optional(), 98 | 99 | score: z.number(), 100 | }); 101 | export type PromptScore = z.infer; 102 | 103 | export const AggregationResultSchema = z.array( 104 | z.object({ 105 | taskDID: z.string(), 106 | providerDID: z.string(), 107 | modelDID: z.string(), 108 | totalResponse: z.number(), 109 | score: z.number(), 110 | wrongAnswers: z.number(), 111 | missingAnswers: z.number(), 112 | avgLatency: z.number(), 113 | avgScore: z.number(), 114 | score_runIds: z.array(z.string()), 115 | sourcePromptDatasetCIDs: z.array(z.string()), 116 | 
sourceFileNames: z.array(z.string()), 117 | }) 118 | ); 119 | 120 | export type AggregationResult = z.infer; 121 | -------------------------------------------------------------------------------- /peerBenchJS/src/uploaders/coserver.ts: -------------------------------------------------------------------------------- 1 | import { AbstractUploader, UploadOptions } from "@/base/uploader"; 2 | import { checkValidationError, readFile } from "@/core/utils"; 3 | import { PromptScoreSchema } from "@/types"; 4 | import axios, { AxiosError } from "axios"; 5 | import { z } from "zod"; 6 | 7 | /** 8 | * Uploader class to upload files to the coordination server 9 | */ 10 | export class CoServerUploader extends AbstractUploader { 11 | override name = "CoServer"; 12 | token: string = ""; 13 | client = axios.create({ 14 | baseURL: "https://deval-flask.onrender.com", 15 | }); 16 | 17 | async init() { 18 | // We already have the token, no need to fetch it again. 19 | if (this.token !== "") return; 20 | 21 | this.logger.info(`Generating a new token`); 22 | const response = await this.client.post(`/token`, { 23 | user: "validator", 24 | role: "validator", 25 | }); 26 | 27 | this.token = response.data?.token; 28 | this.client = axios.create({ 29 | baseURL: "https://deval-flask.onrender.com", 30 | headers: { 31 | Authorization: `Bearer ${this.token}`, 32 | }, 33 | }); 34 | } 35 | 36 | // eslint-disable-next-line @typescript-eslint/no-unused-vars 37 | async uploadPromptResponses(filePath: string, options?: UploadOptions) { 38 | throw new Error("CoServer doesn't support uploading responses"); 39 | // TODO: Paused since the CoServer doesn't support submitting responses 40 | 41 | // const fileContent = readFile(filePath); 42 | // const arraySchema = z.array(PromptResponseSchema); 43 | // const parseResult = arraySchema.safeParse(JSON.parse(fileContent)); 44 | // const responses = checkValidationError(parseResult); 45 | 46 | // TODO: paused since the CoServer doesn't support batch requests 47 | // const upload = async (skip: number, take: number) => { 48 | // const partition: PromptResponse[] = []; 49 | // for (let i = skip; i < responses.length; i++) { 50 | // // We've taken all the items that we want, no need to continue; 51 | // if (i > take) break; 52 | // partition.push(responses[i]); 53 | // } 54 | // const res = await this.client.post("/submit-score", partition); 55 | // }; 56 | // let num = 1; 57 | // for (const response of responses) { 58 | // try { 59 | // const res = await this.client.post("/submit-score", response); 60 | // const data = res.data; 61 | // if (data?.status === "success") { 62 | // this.logger.info( 63 | // `Response ${num} uploaded to the CoServer successfully` 64 | // ); 65 | // } else { 66 | // throw new Error(JSON.stringify(data || {})); 67 | // } 68 | // } catch (err: any) { 69 | // this.logger.debug( 70 | // `Error while uploading the responses: ${ 71 | // err?.stack || err?.message || err 72 | // }` 73 | // ); 74 | // this.logger.error(`Error while uploading response ${num}: ${err}`); 75 | // } 76 | // num++; 77 | // } 78 | } 79 | 80 | // eslint-disable-next-line @typescript-eslint/no-unused-vars 81 | async uploadScores(filePath: string, options?: UploadOptions) { 82 | const fileContent = readFile(filePath); 83 | const arraySchema = z.array(PromptScoreSchema); 84 | const parseResult = arraySchema.safeParse(JSON.parse(fileContent)); 85 | const scores = checkValidationError(parseResult); 86 | 87 | let num = 1; 88 | for (const score of scores) { 89 | try { 90 | const res = await 
this.client.post("/submit-score", { 91 | ...score, 92 | // TODO: On the CoServer side, change `evaluationRunId` to `runId` 93 | evaluationRunId: score.runId, 94 | }); 95 | const data = res.data; 96 | 97 | if (data?.status === "success") { 98 | this.logger.info( 99 | `Score ${num} uploaded to the CoServer successfully` 100 | ); 101 | } else { 102 | throw new Error(JSON.stringify(data || {})); 103 | } 104 | } catch (err: any) { 105 | let message: any = err; 106 | if (err instanceof AxiosError) { 107 | message = err.response?.data || err; 108 | } 109 | this.logger.error( 110 | `Error while uploading score ${num}: ${JSON.stringify(message)}` 111 | ); 112 | } 113 | num++; 114 | } 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /peerBenchJS/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ESNext", 4 | "module": "ESNext", 5 | "moduleResolution": "Node", 6 | "esModuleInterop": true, 7 | 8 | "forceConsistentCasingInFileNames": true, 9 | "strict": true, 10 | "strictNullChecks": true, 11 | "skipLibCheck": true, 12 | 13 | "noEmit": true, 14 | 15 | "outDir": "dist", 16 | 17 | "paths": { 18 | "@/*": ["./src/*"] 19 | } 20 | }, 21 | "exclude": ["node_modules", "dist"], 22 | "include": ["src/**/*"] 23 | } 24 | -------------------------------------------------------------------------------- /peerBenchJS/tsup.config.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig } from "tsup"; 2 | 3 | export default defineConfig({ 4 | entry: ["src/**/*.ts"], 5 | target: "node20", 6 | format: ["esm"], 7 | clean: true, 8 | bundle: false, 9 | skipNodeModulesBundle: true, 10 | outDir: "dist", 11 | splitting: false, 12 | platform: "node", 13 | }); 14 | -------------------------------------------------------------------------------- /peerBenchPY/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to val 2 | 3 | We're excited that you're interested in contributing to val! This guide will help you set up your development environment and understand our contribution workflow. 4 | 5 | ## Table of Contents 6 | 7 | - [Development Environment Setup](#development-environment-setup) 8 | - [Development Guidelines](#development-guidelines) 9 | - [Code Quality Standards](#code-quality-standards) 10 | - [Contribution Workflow](#contribution-workflow) 11 | - [Testing Guidelines](#testing-guidelines) 12 | - [Documentation](#documentation) 13 | - [Community Guidelines](#community-guidelines) 14 | 15 | ## Development Environment Setup 16 | 17 | ### Prerequisites 18 | 19 | - Python (3.9 <= version <= 3.12) 20 | - [Poetry](https://python-poetry.org/docs/#installation) (recommended) or pip 21 | - Git 22 | 23 | ### Installation Steps 24 | 25 | 1. Clone the repository: 26 | ```bash 27 | git clone https://github.com/your-org/val.git 28 | cd val 29 | ``` 30 | 31 | 2. Set up your development environment: 32 | 33 | Using Poetry (recommended): 34 | ```bash 35 | poetry install --with dev 36 | ``` 37 | 38 | Using pip: 39 | ```bash 40 | pip install -e ".[dev]" 41 | ``` 42 | 43 | ## Development Guidelines 44 | 45 | ### Code Style 46 | 47 | We maintain strict code quality standards to ensure maintainability and consistency: 48 | 49 | 1. 
Code Formatting: 50 | ```bash 51 | # Format code using Black 52 | poetry run black val 53 | 54 | # Sort imports 55 | poetry run isort val 56 | ``` 57 | 58 | 2. Type Checking: 59 | ```bash 60 | # Run type checker 61 | poetry run mypy val 62 | ``` 63 | 64 | 3. Linting: 65 | ```bash 66 | # Run linter 67 | poetry run flake8 val 68 | ``` 69 | 70 | ## Code Quality Standards 71 | 72 | ### Type Hints 73 | 74 | All new code should include type hints: 75 | 76 | ```python 77 | from typing import List, Optional 78 | 79 | def process_data(data: List[str], config: Optional[dict] = None) -> bool: 80 | return True 81 | ``` 82 | 83 | ### Documentation 84 | 85 | - All modules, classes, and functions should have docstrings 86 | - Follow Google style docstring format 87 | - Include usage examples for complex functionality 88 | 89 | ## Contribution Workflow 90 | 91 | 1. Create a new branch: 92 | ```bash 93 | git checkout -b feature/your-feature-name 94 | ``` 95 | 96 | 2. Make your changes and commit them following our commit message convention: 97 | ``` 98 | (): 99 | 100 | [optional body] 101 | 102 | [optional footer] 103 | ``` 104 | 105 | Types: 106 | - `feat`: New feature 107 | - `fix`: Bug fix 108 | - `docs`: Documentation changes 109 | - `style`: Formatting changes 110 | - `refactor`: Code refactoring 111 | - `test`: Adding or modifying tests 112 | - `chore`: Maintenance tasks 113 | 114 | Example: 115 | ``` 116 | feat(auth): implement JWT authentication 117 | 118 | - Add JWT token generation 119 | - Implement token validation 120 | - Add user authentication middleware 121 | 122 | Closes #123 123 | ``` 124 | 125 | 3. Push your changes and create a pull request: 126 | ```bash 127 | git push origin feature/your-feature-name 128 | ``` 129 | 130 | ### Pull Request Guidelines 131 | 132 | 1. Ensure all tests pass 133 | 2. Update documentation if needed 134 | 3. Add tests for new features 135 | 4. Follow the pull request template 136 | 5. Link related issues 137 | 138 | ## Testing Guidelines 139 | 140 | 1. Write tests for new features: 141 | ```python 142 | def test_feature(): 143 | # Arrange 144 | data = prepare_test_data() 145 | 146 | # Act 147 | result = process_data(data) 148 | 149 | # Assert 150 | assert result is True 151 | ``` 152 | 153 | 2. Run tests: 154 | ```bash 155 | poetry run pytest 156 | ``` 157 | 158 | 3. Check test coverage: 159 | ```bash 160 | poetry run pytest --cov=val 161 | ``` 162 | 163 | ## Documentation 164 | 165 | - Update documentation for new features 166 | - Include code examples 167 | - Add docstrings to all public APIs 168 | - Update README.md if needed 169 | 170 | ## Community Guidelines 171 | 172 | ### Issue Reporting 173 | 174 | When reporting issues, please include: 175 | 176 | 1. Steps to reproduce 177 | 2. Expected behavior 178 | 3. Actual behavior 179 | 4. Environment details 180 | 5. Relevant logs or screenshots 181 | 182 | ### Communication 183 | 184 | - Be respectful and inclusive 185 | - Provide constructive feedback 186 | - Help others when possible 187 | - Follow our code of conduct 188 | 189 | ## License 190 | 191 | By contributing to val, you agree that your contributions will be licensed under our project's MIT License. 192 | 193 | ## Questions or Need Help? 194 | 195 | - Open a [GitHub Discussion](https://github.com/val-ai/val/discussions) 196 | - Join our [Discord Community](https://discord.gg/val-ai-941362322000203776) 197 | 198 | Thank you for contributing to val! 
🚀 -------------------------------------------------------------------------------- /peerBenchPY/Dockerfile: -------------------------------------------------------------------------------- 1 | # THIS CONTAINER COMBINES NPM, RUST, PYTHON, AND DOCKER INTO ONE CONTAINER 2 | # THIS GENERAL CONTAINER IS THE CORE OF val, USE IT AS YOU WISH AT YOUR OWN RISK 3 | 4 | FROM ubuntu:22.04 5 | 6 | # SYSTEM ENVIRONMENT 7 | ARG DEBIAN_FRONTEND=noninteractive 8 | RUN usermod -s /bin/bash root 9 | RUN apt-get update && apt-get install -y \ 10 | python3 \ 11 | python3-pip \ 12 | python3-venv \ 13 | apt-transport-https \ 14 | ca-certificates \ 15 | curl \ 16 | gnupg \ 17 | lsb-release \ 18 | software-properties-common 19 | 20 | # INSTALL DOCKER 21 | RUN mkdir -p /etc/apt/keyrings 22 | RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg 23 | RUN echo \ 24 | "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ 25 | $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null 26 | RUN apt-get update && apt-get install -y docker-ce docker-ce-cli containerd.io docker-compose-plugin 27 | RUN groupadd docker || true 28 | RUN usermod -aG docker root 29 | EXPOSE 2375 30 | 31 | # MODULE ENVIRONMENT 32 | WORKDIR /app 33 | COPY . . 34 | RUN pip install -e ./ 35 | 36 | # ENTRYPOINT (default to container running) 37 | ENTRYPOINT ["tail", "-f", "/dev/null"] -------------------------------------------------------------------------------- /peerBenchPY/LICENCE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 valAI Organization 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /peerBenchPY/Makefile: -------------------------------------------------------------------------------- 1 | SCRIPTS_PATH=./run 2 | build: 3 | ${SCRIPTS_PATH}/build.sh 4 | start: 5 | ${SCRIPTS_PATH}/start.sh 6 | stop: 7 | ${SCRIPTS_PATH}/stop.sh 8 | enter: 9 | ${SCRIPTS_PATH}/enter.sh 10 | test: 11 | ${SCRIPTS_PATH}/test.sh 12 | freshtest: 13 | make build && make test 14 | install: 15 | ${SCRIPTS_PATH}/install.sh 16 | restart: 17 | make stop 18 | make start 19 | chmod: 20 | chmod +x ${SCRIPTS_PATH}/* 21 | up: 22 | make start 23 | down: 24 | make stop 25 | 26 | -------------------------------------------------------------------------------- /peerBenchPY/README.md: -------------------------------------------------------------------------------- 1 | # val: Decentralized Evaluation Framework 2 | 3 | val is a powerful framework for evaluating and benchmarking AI models across different providers in a decentralized manner. It provides a simple, flexible interface for testing model performance on various tasks with cryptographic verification. 4 | 5 | ## Quick Start 6 | 7 | ```python 8 | import val as v 9 | 10 | # Initialize the evaluator 11 | val = v.Val( 12 | task='add', # Task to evaluate (e.g., 'add', 'divide') 13 | provider='openrouter', # Model provider 14 | n=4, # Number of models to test 15 | samples_per_epoch=2 # Samples per evaluation epoch 16 | ) 17 | 18 | # Run an evaluation epoch 19 | results = val.epoch() 20 | print(results) 21 | ``` 22 | 23 | ## Installation 24 | 25 | ### Using pip 26 | 27 | ```bash 28 | pip install val 29 | ``` 30 | 31 | ### Using Docker 32 | 33 | We provide a Docker environment for easy setup and isolation: 34 | 35 | ```bash 36 | # Clone the repository 37 | git clone https://github.com/val-ai/val.git 38 | cd val 39 | 40 | # Build the Docker image 41 | make build 42 | 43 | # Start the container 44 | make start 45 | 46 | # Enter the container 47 | make enter 48 | 49 | # Run tests 50 | make test 51 | ``` 52 | 53 | ## Core Components 54 | 55 | ### Tasks 56 | 57 | Tasks define what you want to evaluate. val comes with several built-in tasks: 58 | 59 | ```python 60 | # List available tasks 61 | tasks = val.tasks() 62 | print(tasks) # ['add', 'divide', ...] 
63 | 64 | # Set a specific task 65 | val.set_task('add') 66 | ``` 67 | 68 | ### Providers 69 | 70 | Providers connect to different AI model APIs: 71 | 72 | ```python 73 | # Set a provider 74 | val.set_provider('openrouter') 75 | 76 | # List available models from the provider 77 | models = val.models() 78 | print(models) 79 | ``` 80 | 81 | ### Authentication 82 | 83 | Secure your evaluations with cryptographic authentication: 84 | 85 | ```python 86 | # Generate a new key 87 | key = v.get_key('my_key', crypto_type='ecdsa') 88 | 89 | # Create an authentication token 90 | auth = v.module('auth')() 91 | token = auth.get_token({'data': 'test'}, key=key) 92 | 93 | # Verify a token 94 | verified_data = auth.verify_token(token) 95 | ``` 96 | 97 | ## Advanced Usage 98 | 99 | ### Custom Tasks 100 | 101 | Create custom evaluation tasks by extending the base Task class: 102 | 103 | ```python 104 | # Define a custom task in task/custom/task.py 105 | class CustomTask: 106 | features = ['params', 'result', 'target', 'score', 'model', 'provider', 'token'] 107 | sort_by = ['score'] 108 | sort_by_asc = [False] 109 | description = 'My custom evaluation task' 110 | 111 | def sample(self, idx=None, sample=None): 112 | # Generate or return a sample 113 | return {'message': {'prompt': 'Your test prompt'}} 114 | 115 | def forward(self, model, sample=None, idx=None): 116 | # Run the model on the sample 117 | sample = self.sample(idx=idx, sample=sample) 118 | result = model(**sample) 119 | return self.score({'sample': sample, 'result': result}) 120 | 121 | def score(self, data): 122 | # Score the model's response 123 | data['score'] = 1.0 # Your scoring logic here 124 | return data 125 | ``` 126 | 127 | ### Background Evaluation 128 | 129 | Run evaluations in the background: 130 | 131 | ```python 132 | evaluator = v.Val( 133 | task='add', 134 | background=True, # Run in background 135 | tempo=60 # Run every 60 seconds 136 | ) 137 | ``` 138 | 139 | ### Aggregating Results 140 | 141 | View and analyze evaluation results: 142 | 143 | ```python 144 | # Get aggregated results 145 | print(val.results()) 146 | ``` 147 | 148 | ## Command Line Interface 149 | 150 | val includes a CLI for common operations: 151 | 152 | ```bash 153 | # Run an evaluation epoch 154 | d epoch --task=add --n=4 155 | 156 | # List available tasks 157 | d tasks 158 | 159 | # Test components 160 | d test 161 | ``` 162 | 163 | ## Docker Environment 164 | 165 | The included Docker environment provides a complete setup for val: 166 | 167 | ```dockerfile 168 | # FROM ubuntu:22.04 base with Python, Docker, and other dependencies 169 | # See Dockerfile for details 170 | 171 | # Build the image 172 | docker build -t val . 
173 | 174 | # Run the container 175 | docker run -d \ 176 | --name val \ 177 | --network=host \ 178 | --restart unless-stopped \ 179 | --privileged --shm-size 4g \ 180 | -v $(pwd):/app \ 181 | -v /var/run/docker.sock:/var/run/docker.sock \ 182 | val -------------------------------------------------------------------------------- /peerBenchPY/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "val" 3 | version = "0.0.1" 4 | description = "Decentralized Evaluation Framework" 5 | authors = ["val ai organization "] 6 | license = "MIT" 7 | readme = "README.md" 8 | packages = [ 9 | { include = "val" } 10 | ] 11 | repository = "https://github.com/val-ai/val" 12 | homepage = "https://valai.org/" 13 | 14 | [tool.poetry.dependencies] 15 | python = "^3.9" 16 | fastapi = "*" 17 | sse-starlette = "*" 18 | paramiko = "*" 19 | nest_asyncio = "*" 20 | uvicorn = "*" 21 | scalecodec = ">=1.2.10,<1.3" 22 | asyncio = "*" 23 | aiofiles = "*" 24 | aiohttp = "*" 25 | pytest = "*" 26 | openai = "*" 27 | torch = "*" 28 | safetensors = "*" 29 | msgpack_numpy = "*" 30 | munch = "*" 31 | retry = "*" 32 | netaddr = "*" 33 | typer = "*" 34 | loguru = "*" 35 | pyyaml = "*" 36 | pandas = "*" 37 | websocket-client = ">=0.57.0" 38 | base58 = ">=1.0.3" 39 | certifi = ">=2019.3.9" 40 | idna = ">=2.1.0" 41 | requests = ">=2.21.0" 42 | xxhash = ">=1.3.0" 43 | ecdsa = ">=0.17.0" 44 | eth-keys = ">=0.2.1" 45 | eth_utils = ">=1.3.0" 46 | pycryptodome = ">=3.11.0" 47 | PyNaCl = ">=1.0.1" 48 | py-sr25519-bindings = ">=0.2.0" 49 | py-ed25519-zebra-bindings = ">=1.0" 50 | py-bip39-bindings = ">=0.1.9" 51 | twine = "*" 52 | psutil = "*" 53 | 54 | [tool.poetry.group.dev] 55 | optional = true 56 | 57 | [tool.poetry.group.dev.dependencies] 58 | black = "^23.12.1" 59 | mypy = "^1.8.0" 60 | flake8 = "^7.0.0" 61 | isort = "^5.13.2" 62 | pytest-cov = "^4.1.0" 63 | 64 | [tool.poetry.scripts] 65 | v = "val.val:main" 66 | 67 | [tool.mypy] 68 | files = ["val"] 69 | 70 | [tool.isort] 71 | profile = "black" 72 | multi_line_output = 3 73 | include_trailing_comma = true 74 | force_grid_wrap = 0 75 | use_parentheses = true 76 | line_length = 88 77 | 78 | [tool.black] 79 | line-length = 88 80 | target-version = ['py39', 'py310', 'py311', 'py312'] 81 | include = '\.pyi?$' 82 | 83 | [tool.flake8] 84 | max-line-length = 88 85 | extend-ignore = "E203" 86 | exclude = [".git", "__pycache__", "build", "dist"] 87 | 88 | [build-system] 89 | requires = ["poetry-core"] 90 | build-backend = "poetry.core.masonry.api" -------------------------------------------------------------------------------- /peerBenchPY/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi 2 | sse-starlette 3 | paramiko 4 | nest_asyncio 5 | uvicorn 6 | scalecodec>=1.2.10,<1.3 7 | asyncio 8 | aiofiles 9 | aiohttp 10 | pytest 11 | openai 12 | torch 13 | safetensors 14 | msgpack_numpy 15 | munch 16 | retry 17 | netaddr 18 | typer 19 | loguru 20 | pyyaml 21 | pandas 22 | websocket-client>=0.57.0 23 | base58>=1.0.3 24 | certifi>=2019.3.9 25 | idna>=2.1.0 26 | requests>=2.21.0 27 | xxhash>=1.3.0 28 | ecdsa>=0.17.0 29 | eth-keys>=0.2.1 30 | eth_utils>=1.3.0 31 | pycryptodome>=3.11.0 32 | PyNaCl>=1.0.1 33 | scalecodec>=1.2.10 34 | py-sr25519-bindings>=0.2.0 35 | py-ed25519-zebra-bindings>=1.0 36 | py-bip39-bindings>=0.1.9 37 | twine 38 | psutil 39 | dotenv 40 | multiformats 41 | -------------------------------------------------------------------------------- 
/peerBenchPY/run/build.sh: -------------------------------------------------------------------------------- 1 | REPO_NAME=$(basename $(pwd)) 2 | if [ -z $1 ]; then 3 | NAME=$REPO_NAME 4 | else 5 | NAME=$1 6 | fi 7 | docker build -t $NAME $(pwd) -------------------------------------------------------------------------------- /peerBenchPY/run/enter.sh: -------------------------------------------------------------------------------- 1 | 2 | REPO_PATH=$(pwd) ; 3 | REPO_NAME=$(basename $REPO_PATH) 4 | if [ -z $1 ]; then 5 | NAME=$REPO_NAME 6 | else 7 | NAME=$1 8 | fi 9 | docker exec -it $NAME /bin/bash -------------------------------------------------------------------------------- /peerBenchPY/run/start.sh: -------------------------------------------------------------------------------- 1 | # starts the container with the name of the current directory 2 | 3 | 4 | PWD=$(pwd) 5 | REPO=$(basename $(pwd)) # get the name of the current directory 6 | # if no argument is passed, start the container with the name of the current directory 7 | 8 | if [ $# -eq 0 ]; then 9 | NAME=$(basename $PWD) 10 | else 11 | NAME=$1 12 | fi 13 | 14 | docker run -d \ 15 | --name $NAME \ 16 | --network=host \ 17 | --restart unless-stopped \ 18 | --privileged --shm-size 4g \ 19 | -v $PWD:/app \ 20 | -v /root/.${REPO}:/root/.${REPO} \ 21 | -v /var/run/docker.sock:/var/run/docker.sock \ 22 | $REPO 23 | 24 | CONTAINER_ID=$(docker ps -q -f name=$NAME) 25 | echo "STARTING(name=$NAME repo=$REPO container=$CONTAINER_ID)" 26 | 27 | 28 | 29 | # Path: run/stop.sh -------------------------------------------------------------------------------- /peerBenchPY/run/stop.sh: -------------------------------------------------------------------------------- 1 | 2 | # if no argument is passed, stop the container with the name of the current directory 3 | 4 | # if an argument is passed, stop the container with the name of the argument 5 | REPO=$(basename $(pwd)) 6 | 7 | if [ $# -eq 0 ]; then 8 | NAME=$(basename $(pwd)) 9 | else 10 | NAME=$1 11 | fi 12 | CONTAINER_EXISTS=$(docker ps -q -f name=$NAME) 13 | CONTAINER_ID=$(docker ps -aq -f name=$NAME) 14 | 15 | echo "STOPPING(name=$NAME repo=$REPO container=$CONTAINER_ID)" 16 | if [ $CONTAINER_EXISTS ]; then 17 | docker kill $NAME 18 | docker rm $NAME 19 | fi 20 | -------------------------------------------------------------------------------- /peerBenchPY/run/test.sh: -------------------------------------------------------------------------------- 1 | 2 | PWD=$(pwd) ; 3 | REPO=$(basename $PWD) 4 | NAME=$REPO-test 5 | TEST_CMD="d test" 6 | ./run/stop.sh $NAME 7 | docker run -d --name $NAME -v $PWD:/app $REPO 8 | docker exec -it $NAME bash -c "$TEST_CMD" 9 | ./run/stop.sh $NAME 10 | -------------------------------------------------------------------------------- /peerBenchPY/tmp_consolidate_responses.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import duckdb 4 | import pandas as pd 5 | from pathlib import Path 6 | from val.utils import storage_path 7 | 8 | def flatten_dict(d, parent_key='', sep='_'): 9 | """Flatten nested dictionary by concatenating keys with separator.""" 10 | items = [] 11 | for k, v in d.items(): 12 | new_key = f"{parent_key}{sep}{k}" if parent_key else k 13 | if isinstance(v, dict): 14 | items.extend(flatten_dict(v, new_key, sep=sep).items()) 15 | else: 16 | items.append((new_key, v)) 17 | return dict(items) 18 | 19 | def is_sensitive_file(file_path): 20 | """Check if the file contains sensitive
information.""" 21 | sensitive_paths = ['key', 'private', 'secret', 'mnemonic', 'wallet'] 22 | return any(sensitive in file_path.lower() for sensitive in sensitive_paths) 23 | 24 | def consolidate_responses(): 25 | # Get the storage path 26 | storage_dir = Path(storage_path) 27 | results_dir = storage_dir / 'results' 28 | 29 | # List to store all responses 30 | all_responses = [] 31 | 32 | # Only process files in the results directory 33 | if not results_dir.exists(): 34 | print(f"Results directory not found at {results_dir}") 35 | return 36 | 37 | for root, dirs, files in os.walk(results_dir): 38 | for file in files: 39 | if file.endswith('.json'): 40 | file_path = os.path.join(root, file) 41 | 42 | # Skip sensitive files 43 | if is_sensitive_file(file_path): 44 | print(f"Skipping sensitive file: {file_path}") 45 | continue 46 | 47 | try: 48 | with open(file_path, 'r') as f: 49 | response_data = json.load(f) 50 | 51 | # Flatten the params field if it exists 52 | if 'params' in response_data: 53 | flattened_params = flatten_dict(response_data['params']) 54 | # Remove the original params and add flattened version 55 | del response_data['params'] 56 | response_data.update(flattened_params) 57 | 58 | # Add source file information 59 | response_data['source_file'] = file_path 60 | all_responses.append(response_data) 61 | except Exception as e: 62 | print(f"Error reading {file_path}: {str(e)}") 63 | 64 | # Create output directory if it doesn't exist 65 | output_dir = storage_dir / 'consolidated' 66 | output_dir.mkdir(exist_ok=True) 67 | 68 | # Save consolidated responses as JSON 69 | output_file_json = output_dir / 'all_responses.json' 70 | with open(output_file_json, 'w') as f: 71 | json.dump(all_responses, f, indent=2) 72 | 73 | # Convert to DataFrame and save as parquet using duckdb 74 | df = pd.DataFrame(all_responses) 75 | output_file_parquet = output_dir / 'all_responses.parquet' 76 | 77 | # Create a DuckDB connection 78 | con = duckdb.connect() 79 | 80 | # Register the DataFrame as a table 81 | con.register('responses', df) 82 | 83 | # Save to parquet 84 | con.execute(f"COPY responses TO '{output_file_parquet}' (FORMAT PARQUET)") 85 | 86 | # Close the connection 87 | con.close() 88 | 89 | print(f"Consolidated {len(all_responses)} responses into:") 90 | print(f"- JSON: {output_file_json}") 91 | print(f"- Parquet: {output_file_parquet}") 92 | 93 | if __name__ == "__main__": 94 | consolidate_responses() -------------------------------------------------------------------------------- /peerBenchPY/tmprun.py: -------------------------------------------------------------------------------- 1 | import val as v 2 | 3 | # Initialize val with default settings 4 | val = v.val( 5 | task='add', # Task to evaluate 6 | provider='providers.openrouter', # Model provider 7 | batch_size=16, # Number of parallel evaluations 8 | n=10 # Number of models to evaluate 9 | ) 10 | 11 | # Run an evaluation epoch 12 | results = val.epoch() 13 | 14 | 15 | all_results = val.results() 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | # View the results 24 | print(results) -------------------------------------------------------------------------------- /peerBenchPY/tmprun2.py: -------------------------------------------------------------------------------- 1 | import val as v 2 | 3 | # Initialize the evaluator 4 | val = v.val( 5 | task='add', # Task to evaluate (e.g., 'add', 'divide') 6 | provider='openrouter', # Model provider 7 | n=4, # Number of models to test 8 | samples_per_epoch=2 # Samples per evaluation epoch 9 | ) 10 | 11 
| # Run an evaluation epoch 12 | results = val.epoch() 13 | print(results) 14 | a = 1+1 -------------------------------------------------------------------------------- /peerBenchPY/tmptestcid.py: -------------------------------------------------------------------------------- 1 | from val.utils import cid_sha256_from_file, cid_sha256_from_str 2 | 3 | testfname = "requirements.txt" 4 | with open(testfname, 'r') as file: 5 | testfilecontent = file.read() 6 | cid_from_file = cid_sha256_from_file(testfname) 7 | cid_from_text_self = cid_sha256_from_str(testfilecontent) 8 | 9 | print("Test file content:") 10 | print(testfilecontent) 11 | print(f"CID from file: {cid_from_file} https://cid.ipfs.tech/#{cid_from_file}") 12 | print(f"CID from text self: {cid_from_text_self} https://cid.ipfs.tech/#{cid_from_text_self}") 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /peerBenchPY/val/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .val import val 3 | val.init(globals()) -------------------------------------------------------------------------------- /peerBenchPY/val/auth.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import hmac 3 | import json 4 | import time 5 | from typing import Dict, Optional, Any 6 | import val as v 7 | 8 | class Auth: 9 | 10 | def get_token(self, data: Dict='hey', key:Optional[str]=None, crypto_type: str = 'ecdsa', expiration: int = 3600, mode='bytes') -> str: 11 | """ 12 | Generate a JWT token with the given data 13 | Args: 14 | data: Dictionary containing the data to encode in the token 15 | expiration: Optional custom expiration time in seconds 16 | Returns: 17 | JWT token string 18 | """ 19 | if isinstance(key, str) or key == None: 20 | key = v.get_key(key, crypto_type=crypto_type) 21 | else: 22 | key = key 23 | if not isinstance(data, dict): 24 | data = {'data': data } 25 | token_data = data.copy() 26 | # Add standard JWT claims 27 | token_data.update({ 28 | 'iat': str(float(time.time())), # Issued at time 29 | 'exp': str(float(time.time() + expiration)), # Expiration time 30 | 'iss': key.key_address, # Issuer (key address) 31 | }) 32 | 33 | # Create JWT header 34 | if crypto_type != key.crypto_type: 35 | crypto_type = key.crypto_type 36 | header = { 37 | 'alg': crypto_type, 38 | 'typ': 'JWT', 39 | } 40 | 41 | # Create message to sign 42 | message = f"{self._base64url_encode(header)}.{self._base64url_encode(token_data)}" 43 | # For asymmetric algorithms, use the key's sign method 44 | signature = key.sign(message, mode='bytes') 45 | signature_encoded = self._base64url_encode(signature) 46 | # Combine to create the token 47 | 48 | assert mode in ['bytes', 'dict'], f'Invalid mode {mode}' 49 | 50 | 51 | token = f"{message}.{signature_encoded}" 52 | 53 | if mode == 'dict': 54 | return self.verify_token(token) 55 | elif mode == 'bytes': 56 | return f"{message}.{signature_encoded}" 57 | else: 58 | raise 59 | 60 | return f"{message}.{signature_encoded}" 61 | 62 | def verify_token(self, token: str) -> Dict: 63 | """ 64 | Verify and decode a JWT token 65 | """ 66 | if isinstance(token, dict) and 'token' in token: 67 | token = token['token'] 68 | # Split the token into parts 69 | header_encoded, data_encoded, signature_encoded = token.split('.') 70 | # Decode the data 71 | data = json.loads(self._base64url_decode(data_encoded)) 72 | headers = json.loads(self._base64url_decode(header_encoded)) 
73 | # Check if token is expired 74 | if 'exp' in data and float(data['exp']) < time.time(): 75 | raise Exception("Token has expired") 76 | # Verify signature 77 | message = f"{header_encoded}.{data_encoded}" 78 | signature = self._base64url_decode(signature_encoded) 79 | assert v.verify(data=message, signature=signature, address=data['iss'], crypto_type=headers['alg']), "Invalid token signature" 80 | # data['data'] = message 81 | data['time'] = data['iat'] # set time field for semanitcally easy people 82 | data['signature'] = '0x'+signature.hex() 83 | data['alg'] = headers['alg'] 84 | data['typ'] = headers['typ'] 85 | data['token'] = token 86 | data['key'] = data['iss'] 87 | 88 | return data 89 | 90 | def _base64url_encode(self, data): 91 | """Encode data in base64url format""" 92 | if isinstance(data, str): 93 | data = data.encode('utf-8') 94 | elif isinstance(data, dict): 95 | data = json.dumps(data, separators=(',', ':')).encode('utf-8') 96 | encoded = base64.urlsafe_b64encode(data).rstrip(b'=') 97 | return encoded.decode('utf-8') 98 | 99 | def _base64url_decode(self, data): 100 | """Decode base64url data""" 101 | padding = b'=' * (4 - (len(data) % 4)) 102 | return base64.urlsafe_b64decode(data.encode('utf-8') + padding) 103 | 104 | def test(self, test_data = {'fam': 'fam', 'admin': 1} , crypto_type='ecdsa'): 105 | """ 106 | Test the JWT token functionality 107 | 108 | Returns: 109 | Dictionary with test results 110 | """ 111 | # Generate a token 112 | token = self.get_token(test_data, crypto_type=crypto_type) 113 | # Verify the token 114 | decoded = self.verify_token(token) 115 | # Check if original data is in the decoded data 116 | validation_passed = all(test_data[key] == decoded[key] for key in test_data) 117 | assert validation_passed, "Decoded data does not match original data" 118 | # Test token expiration 119 | quick_token = self.get_token(test_data, expiration=0.1, crypto_type=crypto_type) 120 | time.sleep(0.2) # Wait for token to expire 121 | 122 | expired_token_caught = False 123 | try: 124 | decoded = self.verify_token(quick_token) 125 | except Exception as e: 126 | expired_token_caught = True 127 | assert expired_token_caught, "Expired token not caught" 128 | 129 | 130 | return { 131 | "token": token, 132 | "decoded_data": decoded, 133 | "crypto_type": crypto_type, 134 | "quick_token": quick_token, 135 | "expired_token_caught": expired_token_caught 136 | } -------------------------------------------------------------------------------- /peerBenchPY/val/providers/litellm/provider.py: -------------------------------------------------------------------------------- 1 | import os 2 | from litellm import completion, acompletion 3 | import asyncio 4 | 5 | class LiteLLM: 6 | def __init__(self, provider_keys: dict = {}): 7 | """ 8 | Initialize LiteLLMClient with provider API keys. 9 | 10 | Args: 11 | provider_keys (dict): API keys for providers, e.g., {'openai': 'key', 'anthropic': 'key'} 12 | """ 13 | for provider, key in provider_keys.items(): 14 | os.environ[f"{provider.upper()}_API_KEY"] = key 15 | 16 | def forward(self, model: str, messages: list, stream: bool = False): 17 | """ 18 | Generate completion synchronously. 19 | 20 | Args: 21 | model (str): Provider and model name (e.g., "openai/gpt-4o"). 22 | messages (list): List of message dicts. 23 | stream (bool): Stream response or not. 24 | 25 | Returns: 26 | Completion response. 
27 | """ 28 | response = completion(model=model, messages=messages, stream=stream) 29 | if stream: 30 | return (part.choices[0].delta.content or "" for part in response) 31 | return response.choices[0].message.content 32 | 33 | async def async_forward(self, model: str, messages: list): 34 | """ 35 | Generate completion asynchronously. 36 | 37 | Args: 38 | model (str): Provider and model name. 39 | messages (list): List of message dicts. 40 | 41 | Returns: 42 | Completion response. 43 | """ 44 | response = await acompletion(model=model, messages=messages) 45 | return response.choices[0].message.content 46 | 47 | def set_callbacks(self, callbacks: list): 48 | """ 49 | Set logging and observability callbacks. 50 | 51 | Args: 52 | callbacks (list): List of callbacks (e.g., ['lunary', 'mlflow']). 53 | """ 54 | import litellm 55 | litellm.success_callback = callbacks 56 | 57 | 58 | def test(self): 59 | provider_keys = { 60 | "openai": "your-openai-key", 61 | "anthropic": "your-anthropic-key" 62 | } 63 | 64 | client = LiteLLMClient(provider_keys) 65 | 66 | messages = [{"content": "Hello, how are you?", "role": "user"}] 67 | 68 | # synchronous 69 | response = client.forward("openai/gpt-4o", messages) 70 | print(response) 71 | 72 | # asynchronous 73 | async def main(): 74 | response = await client.async_forward("anthropic/claude-3-sonnet-20240229", messages) 75 | print(response) 76 | 77 | asyncio.run(main()) 78 | -------------------------------------------------------------------------------- /peerBenchPY/val/storage.py: -------------------------------------------------------------------------------- 1 | 2 | import json 3 | import os 4 | import time 5 | from .utils import get_text, sha256 6 | 7 | class Storage: 8 | 9 | def __init__(self, storage_dirpath='~/.val', mode='json'): 10 | self.storage_dirpath = self.abspath(storage_dirpath) 11 | self.mode = mode 12 | 13 | def put(self, path, data): 14 | path = self.get_item_path(path) 15 | dirpath = '/'.join(path.split('/')[:-1]) 16 | if not os.path.exists(dirpath): 17 | os.makedirs(dirpath, exist_ok=True) 18 | with open(path, 'w') as f: 19 | json.dump(data, f) 20 | return path 21 | 22 | def get(self, path, default=None, max_age=None, update=False): 23 | path = self.get_item_path(path) 24 | with open(path, 'r') as f: 25 | data = json.load(f) 26 | if update: 27 | max_age = 0 28 | if max_age != None: 29 | if time.time() - os.path.getmtime(path) > max_age: 30 | data = default 31 | return data 32 | 33 | def get_item_path(self, path): 34 | if not path.startswith('/'): 35 | path = f'{self.storage_dirpath}/{path}' 36 | if self.mode != None: 37 | if not path.endswith(f'.{self.mode}'): 38 | path = f'{path}.{self.mode}' 39 | return path 40 | 41 | def rm(self, path): 42 | path = self.get_item_path(path) 43 | assert os.path.exists(path), f'Failed to find path {path}' 44 | os.remove(path) 45 | return path 46 | 47 | def items(self, df=False, features=None): 48 | paths = self.paths() 49 | data = [] 50 | for p in paths: 51 | try: 52 | data.append(self.get(p)) 53 | except Exception as e: 54 | print(f'Failed to get {p} error={e}') 55 | if df: 56 | import pandas as pd 57 | data = pd.DataFrame(data) 58 | return data 59 | 60 | def paths(self): 61 | import glob 62 | paths = glob.glob(f'{self.storage_dirpath}/**/*', recursive=True) 63 | return [self.abspath(p) for p in paths if os.path.isfile(p)] 64 | 65 | def exists(self, path): 66 | path = self.get_item_path(path) 67 | return os.path.exists(path) 68 | 69 | 70 | def item2age(self): 71 | """ 72 | returns the age of the item in 
seconds 73 | """ 74 | paths = self.paths() 75 | ages = {} 76 | for p in paths: 77 | ages[p] = time.time() - os.path.getmtime(p) 78 | return ages 79 | 80 | def n(self): 81 | paths = self.items() 82 | return len(paths) 83 | 84 | def _rm_all(self): 85 | """ 86 | removes all items in the storage 87 | """ 88 | paths = self.paths() 89 | for p in paths: 90 | os.remove(p) 91 | return paths 92 | 93 | def test(self, path='test.json', data={'test': 'test', 'fam': {'test': 'test'}}): 94 | t0 = time.time() 95 | n0 = self.n() 96 | if self.exists(path): 97 | self.rm(path) 98 | assert not self.exists(path), f'Failed to delete' 99 | self.put('test.json', {'test': 'test'}) 100 | n1 = self.n() 101 | assert n1 == n0 + 1, f'Failed to add item n0={n0} n1={n1}' 102 | assert self.exists(path), f'Failed to find {path}' 103 | data = self.get(path) 104 | self.rm(path) 105 | n2 = self.n() 106 | assert n2 == n0, f'Failed to delete item n0={n0} n2={n2}' 107 | assert not self.exists(path), f'Failed to delete {path}' 108 | assert data == {'test': 'test'}, f'Failed test data={data}' 109 | t1 = time.time() 110 | print(f'Passed all tests in {t1 - t0} seconds') 111 | return {'success': True, 'msg': 'Passed all tests'} 112 | 113 | def abspath(self, path): 114 | return os.path.abspath(os.path.expanduser(path)) 115 | 116 | def path2age(self): 117 | """ 118 | returns the age of the item in seconds 119 | """ 120 | paths = self.paths() 121 | ages = {} 122 | for p in paths: 123 | ages[p] = time.time() - os.path.getmtime(p) 124 | return ages 125 | 126 | def cid(self, path, ignore_names=['__pycache__', '.DS_Store','.git', '.gitignore']): 127 | """ 128 | Get the CID of the val module 129 | """ 130 | path = self.abspath(path) 131 | if os.path.isdir(path): 132 | files = os.listdir(path) 133 | content = [] 134 | for f in files: 135 | if any([ignore in f for ignore in ignore_names]): 136 | continue 137 | f = path + '/' + f 138 | content.append(self.cid(f)) 139 | content = ''.join(content) 140 | elif os.path.isfile(path): 141 | content = get_text(path) 142 | else: 143 | raise Exception(f'Failed to find path {path}') 144 | cid = sha256(content) 145 | print(f'cid={cid} path={path}') 146 | return cid 147 | -------------------------------------------------------------------------------- /peerBenchPY/val/task/add/task.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import Optional 3 | import json 4 | class AddTask: 5 | features = ['params', 'result', 'target', 'score', 'model', 'provider', 'token'] 6 | sort_by = ['score'] 7 | sort_by_asc = [False, True] 8 | description = 'tests a model to add two numberts' 9 | output_bounds = ['', ''] 10 | temperature = 0 11 | max_tokens = 10000 12 | 13 | def sample(self , idx:int = None, sample=None,) -> dict: 14 | """ 15 | generate the sample 16 | """ 17 | # if params is not None, use it 18 | if sample is not None: 19 | return sample 20 | # generate two random numbers from the id seed 21 | idx = idx or random.randint(1, 1000) 22 | random.seed(idx) 23 | a = random.randint(1, 100) 24 | b = random.randint(1, 100) 25 | # return a sample with the two numbers 26 | return {'message': { 27 | 'a': a, 28 | 'b': b, 29 | 'goal': 'return a json object with the sum ', 30 | 'output_format': f'strictly as {self.output_bounds[0]}json(y:int){self.output_bounds[1]}' 31 | }, 32 | 'temperature': self.temperature, 33 | 'max_tokens': self.max_tokens 34 | } 35 | 36 | def forward(self, model: callable, sample:Optional[dict]=None, idx=None) -> dict: 37 | """ 38 | 
run the model on the sample 39 | Args: 40 | model: the model to run 41 | sample: the sample to run on 42 | idx: the index of the sample 43 | Returns: 44 | dict: the result of the model 45 | """ 46 | 47 | sample = self.sample(idx=idx, sample=sample) 48 | 49 | result = model(**sample) 50 | 51 | data = { 52 | 'sample': sample, 53 | 'result': result, 54 | } 55 | # step 3 : score the data 56 | data = self.score(data) 57 | return data 58 | 59 | def score(self, data:dict) -> float: 60 | sample_data = data['sample']['message'] 61 | target = str(sample_data['a']+ sample_data['b']) 62 | data['score'] = float(str(target) in data['result']) 63 | return data 64 | -------------------------------------------------------------------------------- /peerBenchPY/val/task/divide/task.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import Optional 3 | import json 4 | class AddTask: 5 | features = ['params', 'result', 'target', 'score', 'model', 'provider', 'token'] 6 | sort_by = ['score'] 7 | sort_by_asc = [False, True] 8 | description = 'tests a model to add two numbers' 9 | output_bounds = ['', ''] 10 | temperature = 0 11 | max_tokens = 10000 12 | 13 | def sample(self , idx:int = None, sample=None,) -> dict: 14 | """ 15 | generate the sample 16 | """ 17 | # if params is not None, use it 18 | if sample is not None: 19 | return sample 20 | # generate two random numbers from the id seed 21 | idx = idx or random.randint(1, 1000) 22 | random.seed(idx) 23 | a = random.randint(1, 100) 24 | b = random.randint(1, 100) 25 | # return a sample with the two numbers 26 | return {'message': { 27 | 'a': a, 28 | 'b': b, 29 | 'goal': 'return a json object with the sum', 30 | 'output_format': f'strictly as {self.output_bounds[0]}json(y:int){self.output_bounds[1]}' 31 | }, 32 | 'temperature': self.temperature, 33 | 'max_tokens': self.max_tokens 34 | } 35 | 36 | def forward(self, model: callable, sample:Optional[dict]=None, idx=None) -> dict: 37 | """ 38 | run the model on the sample 39 | Args: 40 | model: the model to run 41 | sample: the sample to run on 42 | idx: the index of the sample 43 | Returns: 44 | dict: the result of the model 45 | """ 46 | 47 | sample = self.sample(idx=idx, sample=sample) 48 | 49 | result = model(**sample) 50 | 51 | data = { 52 | 'sample': sample, 53 | 'result': result, 54 | } 55 | 56 | # step 3 : score the data 57 | return self.score(data) 58 | 59 | def score(self, data:dict) -> float: 60 | sample_data = data['sample']['message'] 61 | target = str(sample_data['a']+ sample_data['b']) 62 | data['score'] = float(str(target) in data['result']) 63 | return data 64 | 65 | def verify_sample(self, data): 66 | assert all([f in data for f in self.features]), f'Missing features {self.features}' -------------------------------------------------------------------------------- /peerBenchPY/val/task/task.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import Optional 3 | import json 4 | class Task: 5 | features = ['params', 'result', 'target', 'score', 'model', 'provider', 'token'] 6 | sort_by = ['score'] 7 | sort_by_asc = [False] 8 | description = 'tests a model to add two numberts' 9 | output_bounds = ['', ''] 10 | temperature = 0 11 | max_tokens = 10000 12 | 13 | def sample(self , idx:int = None, sample=None) -> dict: 14 | """ 15 | generate the sample 16 | """ 17 | # if params is not None, use it 18 | if sample is not None: 19 | return sample 20 | # generate two random 
numbers from the id seed 21 | if idx != None: 22 | random.seed(idx) 23 | idx = idx or random.randint(1, 1000) 24 | random.seed(idx) 25 | a = random.randint(1, 100) 26 | b = random.randint(1, 100) 27 | # return a sample with the two numbers 28 | return {'message': { 29 | 'a': a, 30 | 'b': b, 31 | 'goal': 'return a json object with the sum ', 32 | 'output_format': f'strictly as {self.output_bounds[0]}json(y:int){self.output_bounds[1]}' 33 | }, 34 | 'temperature': self.temperature, 35 | 'max_tokens': self.max_tokens 36 | } 37 | 38 | def forward(self, model: callable, sample:Optional[dict]=None, idx=None) -> dict: 39 | """ 40 | run the model on the sample 41 | Args: 42 | model: the model to run 43 | sample: the sample to run on 44 | idx: the index of the sample 45 | Returns: 46 | dict: the result of the model 47 | """ 48 | 49 | sample = self.sample(idx=idx, sample=sample) 50 | 51 | result = model(**sample) 52 | 53 | data = { 54 | 'sample': sample, 55 | 'result': result, 56 | } 57 | 58 | # step 3 : score the data 59 | return self.score(data) 60 | 61 | def score(self, data:dict) -> float: 62 | sample_data = data['sample']['message'] 63 | target = str(sample_data['a']+ sample_data['b']) 64 | data['score'] = float(str(target) in data['result']) 65 | return data 66 | 67 | def verify_sample(self, data): 68 | assert all([f in data for f in self.features]), f'Missing features {self.features}' -------------------------------------------------------------------------------- /py/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/py/.DS_Store -------------------------------------------------------------------------------- /py/Iris.csv: -------------------------------------------------------------------------------- 1 | Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species 2 | 1,5.1,3.5,1.4,0.2,Iris-setosa 3 | 2,4.9,3.0,1.4,0.2,Iris-setosa 4 | 3,4.7,3.2,1.3,0.2,Iris-setosa 5 | 4,4.6,3.1,1.5,0.2,Iris-setosa 6 | 5,5.0,3.6,1.4,0.2,Iris-setosa 7 | 6,5.4,3.9,1.7,0.4,Iris-setosa 8 | 7,4.6,3.4,1.4,0.3,Iris-setosa 9 | 8,5.0,3.4,1.5,0.2,Iris-setosa 10 | 9,4.4,2.9,1.4,0.2,Iris-setosa 11 | 10,4.9,3.1,1.5,0.1,Iris-setosa 12 | 11,5.4,3.7,1.5,0.2,Iris-setosa 13 | 12,4.8,3.4,1.6,0.2,Iris-setosa 14 | 13,4.8,3.0,1.4,0.1,Iris-setosa 15 | 14,4.3,3.0,1.1,0.1,Iris-setosa 16 | 15,5.8,4.0,1.2,0.2,Iris-setosa 17 | 16,5.7,4.4,1.5,0.4,Iris-setosa 18 | 17,5.4,3.9,1.3,0.4,Iris-setosa 19 | 18,5.1,3.5,1.4,0.3,Iris-setosa 20 | 19,5.7,3.8,1.7,0.3,Iris-setosa 21 | 20,5.1,3.8,1.5,0.3,Iris-setosa 22 | 21,5.4,3.4,1.7,0.2,Iris-setosa 23 | 22,5.1,3.7,1.5,0.4,Iris-setosa 24 | 23,4.6,3.6,1.0,0.2,Iris-setosa 25 | 24,5.1,3.3,1.7,0.5,Iris-setosa 26 | 25,4.8,3.4,1.9,0.2,Iris-setosa 27 | 26,5.0,3.0,1.6,0.2,Iris-setosa 28 | 27,5.0,3.4,1.6,0.4,Iris-setosa 29 | 28,5.2,3.5,1.5,0.2,Iris-setosa 30 | 29,5.2,3.4,1.4,0.2,Iris-setosa 31 | 30,4.7,3.2,1.6,0.2,Iris-setosa 32 | 31,4.8,3.1,1.6,0.2,Iris-setosa 33 | 32,5.4,3.4,1.5,0.4,Iris-setosa 34 | 33,5.2,4.1,1.5,0.1,Iris-setosa 35 | 34,5.5,4.2,1.4,0.2,Iris-setosa 36 | 35,4.9,3.1,1.5,0.1,Iris-setosa 37 | 36,5.0,3.2,1.2,0.2,Iris-setosa 38 | 37,5.5,3.5,1.3,0.2,Iris-setosa 39 | 38,4.9,3.1,1.5,0.1,Iris-setosa 40 | 39,4.4,3.0,1.3,0.2,Iris-setosa 41 | 40,5.1,3.4,1.5,0.2,Iris-setosa 42 | 41,5.0,3.5,1.3,0.3,Iris-setosa 43 | 42,4.5,2.3,1.3,0.3,Iris-setosa 44 | 43,4.4,3.2,1.3,0.2,Iris-setosa 45 | 44,5.0,3.5,1.6,0.6,Iris-setosa 46 | 45,5.1,3.8,1.9,0.4,Iris-setosa 47 | 
46,4.8,3.0,1.4,0.3,Iris-setosa 48 | 47,5.1,3.8,1.6,0.2,Iris-setosa 49 | 48,4.6,3.2,1.4,0.2,Iris-setosa 50 | 49,5.3,3.7,1.5,0.2,Iris-setosa 51 | 50,5.0,3.3,1.4,0.2,Iris-setosa 52 | 51,7.0,3.2,4.7,1.4,Iris-versicolor 53 | 52,6.4,3.2,4.5,1.5,Iris-versicolor 54 | 53,6.9,3.1,4.9,1.5,Iris-versicolor 55 | 54,5.5,2.3,4.0,1.3,Iris-versicolor 56 | 55,6.5,2.8,4.6,1.5,Iris-versicolor 57 | 56,5.7,2.8,4.5,1.3,Iris-versicolor 58 | 57,6.3,3.3,4.7,1.6,Iris-versicolor 59 | 58,4.9,2.4,3.3,1.0,Iris-versicolor 60 | 59,6.6,2.9,4.6,1.3,Iris-versicolor 61 | 60,5.2,2.7,3.9,1.4,Iris-versicolor 62 | 61,5.0,2.0,3.5,1.0,Iris-versicolor 63 | 62,5.9,3.0,4.2,1.5,Iris-versicolor 64 | 63,6.0,2.2,4.0,1.0,Iris-versicolor 65 | 64,6.1,2.9,4.7,1.4,Iris-versicolor 66 | 65,5.6,2.9,3.6,1.3,Iris-versicolor 67 | 66,6.7,3.1,4.4,1.4,Iris-versicolor 68 | 67,5.6,3.0,4.5,1.5,Iris-versicolor 69 | 68,5.8,2.7,4.1,1.0,Iris-versicolor 70 | 69,6.2,2.2,4.5,1.5,Iris-versicolor 71 | 70,5.6,2.5,3.9,1.1,Iris-versicolor 72 | 71,5.9,3.2,4.8,1.8,Iris-versicolor 73 | 72,6.1,2.8,4.0,1.3,Iris-versicolor 74 | 73,6.3,2.5,4.9,1.5,Iris-versicolor 75 | 74,6.1,2.8,4.7,1.2,Iris-versicolor 76 | 75,6.4,2.9,4.3,1.3,Iris-versicolor 77 | 76,6.6,3.0,4.4,1.4,Iris-versicolor 78 | 77,6.8,2.8,4.8,1.4,Iris-versicolor 79 | 78,6.7,3.0,5.0,1.7,Iris-versicolor 80 | 79,6.0,2.9,4.5,1.5,Iris-versicolor 81 | 80,5.7,2.6,3.5,1.0,Iris-versicolor 82 | 81,5.5,2.4,3.8,1.1,Iris-versicolor 83 | 82,5.5,2.4,3.7,1.0,Iris-versicolor 84 | 83,5.8,2.7,3.9,1.2,Iris-versicolor 85 | 84,6.0,2.7,5.1,1.6,Iris-versicolor 86 | 85,5.4,3.0,4.5,1.5,Iris-versicolor 87 | 86,6.0,3.4,4.5,1.6,Iris-versicolor 88 | 87,6.7,3.1,4.7,1.5,Iris-versicolor 89 | 88,6.3,2.3,4.4,1.3,Iris-versicolor 90 | 89,5.6,3.0,4.1,1.3,Iris-versicolor 91 | 90,5.5,2.5,4.0,1.3,Iris-versicolor 92 | 91,5.5,2.6,4.4,1.2,Iris-versicolor 93 | 92,6.1,3.0,4.6,1.4,Iris-versicolor 94 | 93,5.8,2.6,4.0,1.2,Iris-versicolor 95 | 94,5.0,2.3,3.3,1.0,Iris-versicolor 96 | 95,5.6,2.7,4.2,1.3,Iris-versicolor 97 | 96,5.7,3.0,4.2,1.2,Iris-versicolor 98 | 97,5.7,2.9,4.2,1.3,Iris-versicolor 99 | 98,6.2,2.9,4.3,1.3,Iris-versicolor 100 | 99,5.1,2.5,3.0,1.1,Iris-versicolor 101 | 100,5.7,2.8,4.1,1.3,Iris-versicolor 102 | 101,6.3,3.3,6.0,2.5,Iris-virginica 103 | 102,5.8,2.7,5.1,1.9,Iris-virginica 104 | 103,7.1,3.0,5.9,2.1,Iris-virginica 105 | 104,6.3,2.9,5.6,1.8,Iris-virginica 106 | 105,6.5,3.0,5.8,2.2,Iris-virginica 107 | 106,7.6,3.0,6.6,2.1,Iris-virginica 108 | 107,4.9,2.5,4.5,1.7,Iris-virginica 109 | 108,7.3,2.9,6.3,1.8,Iris-virginica 110 | 109,6.7,2.5,5.8,1.8,Iris-virginica 111 | 110,7.2,3.6,6.1,2.5,Iris-virginica 112 | 111,6.5,3.2,5.1,2.0,Iris-virginica 113 | 112,6.4,2.7,5.3,1.9,Iris-virginica 114 | 113,6.8,3.0,5.5,2.1,Iris-virginica 115 | 114,5.7,2.5,5.0,2.0,Iris-virginica 116 | 115,5.8,2.8,5.1,2.4,Iris-virginica 117 | 116,6.4,3.2,5.3,2.3,Iris-virginica 118 | 117,6.5,3.0,5.5,1.8,Iris-virginica 119 | 118,7.7,3.8,6.7,2.2,Iris-virginica 120 | 119,7.7,2.6,6.9,2.3,Iris-virginica 121 | 120,6.0,2.2,5.0,1.5,Iris-virginica 122 | 121,6.9,3.2,5.7,2.3,Iris-virginica 123 | 122,5.6,2.8,4.9,2.0,Iris-virginica 124 | 123,7.7,2.8,6.7,2.0,Iris-virginica 125 | 124,6.3,2.7,4.9,1.8,Iris-virginica 126 | 125,6.7,3.3,5.7,2.1,Iris-virginica 127 | 126,7.2,3.2,6.0,1.8,Iris-virginica 128 | 127,6.2,2.8,4.8,1.8,Iris-virginica 129 | 128,6.1,3.0,4.9,1.8,Iris-virginica 130 | 129,6.4,2.8,5.6,2.1,Iris-virginica 131 | 130,7.2,3.0,5.8,1.6,Iris-virginica 132 | 131,7.4,2.8,6.1,1.9,Iris-virginica 133 | 132,7.9,3.8,6.4,2.0,Iris-virginica 134 | 133,6.4,2.8,5.6,2.2,Iris-virginica 135 | 
134,6.3,2.8,5.1,1.5,Iris-virginica 136 | 135,6.1,2.6,5.6,1.4,Iris-virginica 137 | 136,7.7,3.0,6.1,2.3,Iris-virginica 138 | 137,6.3,3.4,5.6,2.4,Iris-virginica 139 | 138,6.4,3.1,5.5,1.8,Iris-virginica 140 | 139,6.0,3.0,4.8,1.8,Iris-virginica 141 | 140,6.9,3.1,5.4,2.1,Iris-virginica 142 | 141,6.7,3.1,5.6,2.4,Iris-virginica 143 | 142,6.9,3.1,5.1,2.3,Iris-virginica 144 | 143,5.8,2.7,5.1,1.9,Iris-virginica 145 | 144,6.8,3.2,5.9,2.3,Iris-virginica 146 | 145,6.7,3.3,5.7,2.5,Iris-virginica 147 | 146,6.7,3.0,5.2,2.3,Iris-virginica 148 | 147,6.3,2.5,5.0,1.9,Iris-virginica 149 | 148,6.5,3.0,5.2,2.0,Iris-virginica 150 | 149,6.2,3.4,5.4,2.3,Iris-virginica 151 | 150,5.9,3.0,5.1,1.8,Iris-virginica 152 | -------------------------------------------------------------------------------- /py/Iris.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/py/Iris.parquet -------------------------------------------------------------------------------- /py/compute_ipfs_cid.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import hashlib 3 | import base64 4 | import base58 5 | import struct 6 | #TODO redo this with some library I just don't know which one we want to use https://github.com/hashberg-io/multiformats or https://github.com/PancakesArchitect/py-multiformats-cid have not been touched for a while but i guess its a simple algo that doesn't need to be touched 7 | 8 | def compute_ipfs_cids(file_path: str) -> tuple[str, str, str]: 9 | print("hello3") 10 | try: 11 | # Read file in binary mode 12 | with open(file_path, 'rb') as f: 13 | file_data = f.read() 14 | 15 | # Compute SHA2-256 hash of raw data 16 | h = hashlib.sha256(file_data).digest() 17 | print(f"SHA256 raw digest: {h.hex()}") 18 | 19 | # Construct the multihash 20 | # 0x12 = sha2-256 identifier 21 | # 0x20 = length (32 bytes) of sha2-256 hash 22 | multihash = bytes([0x12, 0x20]) + h 23 | 24 | # CIDv0 (base58btc) 25 | cidv0 = base58.b58encode(multihash).decode('utf-8') 26 | 27 | # CIDv1 with raw codec (base32) 28 | # 0x01 = CID version 1 29 | # 0x55 = raw binary codec 30 | cidv1_bytes = bytes([0x01, 0x55]) + multihash 31 | cidv1_base32 = 'b' + base64.b32encode(cidv1_bytes).decode('utf-8').lower() 32 | 33 | 34 | return cidv1_base32 35 | 36 | except Exception as e: 37 | raise Exception(f"Failed to compute CID: {str(e)}") 38 | 39 | def main(): 40 | parser = argparse.ArgumentParser(description='Compute IPFS CID for a file using SHA2-256') 41 | parser.add_argument('file_path', help='Path to the file') 42 | 43 | args = parser.parse_args() 44 | 45 | try: 46 | cidv1_raw = compute_ipfs_cids(args.file_path) 47 | print(f"IPFS CIDv1 (base32, raw): {cidv1_raw}") 48 | 49 | except Exception as e: 50 | print(f"Error: {e}") 51 | exit(1) 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /py/create_secp256k1_jws.py: -------------------------------------------------------------------------------- 1 | import json 2 | from jose import jws 3 | from jose.constants import ALGORITHMS 4 | from ecdsa import SigningKey 5 | import binascii 6 | 7 | def create_secp256k1_jws(): 8 | # The private key (in hex) 9 | private_key_hex = "67499ec186126dcdb98f2af7f396d1b410cc979959b56fb3bc6ca5d294e76bb2" 10 | 11 | # Convert hex private key to bytes 12 | private_key_bytes = binascii.unhexlify(private_key_hex) 13 | 14 | # Create 
signing key from private key bytes 15 | signing_key = SigningKey.from_string(private_key_bytes, curve=ALGORITHMS.ES256K) 16 | 17 | # The message to sign 18 | payload = { 19 | "hello": "world" 20 | } 21 | 22 | try: 23 | # Create JWS signature 24 | signature = jws.sign(payload, signing_key, algorithm=ALGORITHMS.ES256K) 25 | 26 | print("\nPayload:") 27 | print(json.dumps(payload, indent=2)) 28 | print("\nJWS Signature:") 29 | print(signature) 30 | 31 | # Verify the signature (optional verification step) 32 | verify_key = signing_key.get_verifying_key() 33 | try: 34 | verified_payload = jws.verify(signature, verify_key, algorithms=ALGORITHMS.ES256K) 35 | print("\nSignature verified successfully!") 36 | print("Verified payload:", json.loads(verified_payload)) 37 | except Exception as e: 38 | print("\nSignature verification failed:", str(e)) 39 | 40 | except Exception as e: 41 | print(f"Error creating JWS: {str(e)}") 42 | exit(1) 43 | 44 | if __name__ == "__main__": 45 | print("hello5.\n") 46 | create_secp256k1_jws() 47 | -------------------------------------------------------------------------------- /py/create_secp256k1_jws_rfc.py: -------------------------------------------------------------------------------- 1 | import json 2 | import binascii 3 | from joserfc import jws 4 | from joserfc.jwk import ECKey 5 | from ecdsa import SigningKey, SECP256k1 6 | import base64 7 | 8 | def create_secp256k1_jws(): 9 | # The private key (in hex) 10 | private_key_hex = "67499ec186126dcdb98f2af7f396d1b410cc979959b56fb3bc6ca5d294e76bb2" 11 | private_key_bytes = binascii.unhexlify(private_key_hex) 12 | 13 | # Create ECDSA signing key and get public key points 14 | signing_key = SigningKey.from_string(private_key_bytes, curve=SECP256k1) 15 | verifying_key = signing_key.get_verifying_key() 16 | 17 | # Get public key coordinates 18 | x_coord = verifying_key.pubkey.point.x() 19 | y_coord = verifying_key.pubkey.point.y() 20 | 21 | # Convert coordinates to base64url 22 | x_b64 = base64.urlsafe_b64encode(x_coord.to_bytes(32, 'big')).rstrip(b'=').decode('ascii') 23 | y_b64 = base64.urlsafe_b64encode(y_coord.to_bytes(32, 'big')).rstrip(b'=').decode('ascii') 24 | d_b64 = base64.urlsafe_b64encode(private_key_bytes).rstrip(b'=').decode('ascii') 25 | 26 | # Create EC Key with all required parameters 27 | key = ECKey.import_key({ 28 | "kty": "EC", 29 | "crv": "secp256k1", 30 | "x": x_b64, 31 | "y": y_b64, 32 | "d": d_b64 33 | }) 34 | 35 | # The message to sign 36 | payload = json.dumps({"hello": "world"}).encode('utf-8') 37 | 38 | try: 39 | # Create protected header 40 | protected = {"alg": "ES256K"} 41 | 42 | # Create JWS signature 43 | token = jws.serialize_compact(protected, payload, key) 44 | 45 | print("\nPayload:") 46 | print(json.dumps({"hello": "world"}, indent=2)) 47 | print("\nJWS Token:") 48 | print(token) 49 | 50 | # Verify the signature 51 | try: 52 | verified = jws.verify_compact(token, key) 53 | print("\nSignature verified successfully!") 54 | print("Verified payload:", json.loads(verified.payload)) 55 | except Exception as e: 56 | print("\nSignature verification failed:", str(e)) 57 | 58 | except Exception as e: 59 | print(f"Error creating JWS: {str(e)}") 60 | exit(1) 61 | 62 | if __name__ == "__main__": 63 | print("hello511.\n") 64 | create_secp256k1_jws() 65 | -------------------------------------------------------------------------------- /py/create_secp256k1_jwt.py: -------------------------------------------------------------------------------- 1 | import jwt 2 | import json 3 | import binascii 4 | 
from ecdsa import SigningKey, SECP256k1 5 | from ecdsa.util import sigencode_der 6 | import base64 7 | 8 | def create_key_pair(private_key_hex: str): 9 | private_key_bytes = binascii.unhexlify(private_key_hex) 10 | signing_key = SigningKey.from_string(private_key_bytes, curve=SECP256k1) 11 | verifying_key = signing_key.get_verifying_key() 12 | return signing_key, verifying_key 13 | 14 | def create_secp256k1_jwt(): 15 | signing_key, verifying_key = create_key_pair( 16 | "67499ec186126dcdb98f2af7f396d1b410cc979959b56fb3bc6ca5d294e76bb2" 17 | ) 18 | 19 | payload = { 20 | "hello": "world" 21 | } 22 | 23 | try: 24 | # Create JWT with ES256K algorithm 25 | token = jwt.encode( 26 | payload, 27 | signing_key.to_pem(), 28 | algorithm="ES256K" 29 | ) 30 | 31 | print("\nJWT Creation:") 32 | print("Payload:", json.dumps(payload, indent=2)) 33 | print("JWT Token:", token) 34 | 35 | # Verify JWT 36 | verified_payload = jwt.decode( 37 | token, 38 | verifying_key.to_pem(), 39 | algorithms=["ES256K"] 40 | ) 41 | print("JWT Verification successful!") 42 | 43 | except Exception as e: 44 | print(f"Error with JWT: {str(e)}") 45 | exit(1) 46 | 47 | def create_secp256k1_jws(): 48 | signing_key, verifying_key = create_key_pair( 49 | "67499ec186126dcdb98f2af7f396d1b410cc979959b56fb3bc6ca5d294e76bb2" 50 | ) 51 | 52 | payload = { 53 | "hello": "world" 54 | } 55 | 56 | try: 57 | # Create JWS with ES256K algorithm 58 | header = { 59 | "alg": "ES256K", 60 | "typ": "JWS" 61 | } 62 | 63 | token = jwt.encode( 64 | payload, 65 | signing_key.to_pem(), 66 | algorithm="ES256K", 67 | headers=header 68 | ) 69 | 70 | print("\nJWS Creation:") 71 | print("Payload:", json.dumps(payload, indent=2)) 72 | print("JWS Token:", token) 73 | 74 | # Verify JWS 75 | verified_payload = jwt.decode( 76 | token, 77 | verifying_key.to_pem(), 78 | algorithms=["ES256K"] 79 | ) 80 | print("JWS Verification successful!") 81 | 82 | except Exception as e: 83 | print(f"Error with JWS: {str(e)}") 84 | exit(1) 85 | 86 | if __name__ == "__main__": 87 | print("Creating JWT and JWS examples... 
2") 88 | create_secp256k1_jwt() 89 | create_secp256k1_jws() 90 | -------------------------------------------------------------------------------- /py/csv_to_parquet.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | import os 4 | 5 | def convert_csv_to_parquet(input_csv, output_parquet=None): 6 | try: 7 | # Read CSV 8 | df = pd.read_csv(input_csv) 9 | 10 | # If no output path specified, use input path with .parquet extension 11 | if output_parquet is None: 12 | output_parquet = os.path.splitext(input_csv)[0] + '.parquet' 13 | 14 | # Save as Parquet 15 | df.to_parquet(output_parquet, engine='pyarrow') 16 | return output_parquet 17 | except Exception as e: 18 | raise Exception(f"Conversion failed: {str(e)}") 19 | 20 | def main(): 21 | parser = argparse.ArgumentParser(description='Convert CSV file to Parquet format') 22 | parser.add_argument('input_csv', help='Path to input CSV file') 23 | parser.add_argument('--output', '-o', help='Path to output Parquet file (optional)') 24 | 25 | args = parser.parse_args() 26 | 27 | try: 28 | output_path = convert_csv_to_parquet(args.input_csv, args.output) 29 | print(f"Successfully converted to: {output_path}") 30 | except Exception as e: 31 | print(f"Error: {e}") 32 | exit(1) 33 | 34 | if __name__ == "__main__": 35 | main() 36 | -------------------------------------------------------------------------------- /py/display_parquet.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import polars as pl 3 | from typing import Optional 4 | 5 | def display_parquet(file_path: str, max_rows: Optional[int] = None, max_cols: Optional[int] = None): 6 | try: 7 | # Read the parquet file using polars 8 | df = pl.read_parquet(file_path) 9 | 10 | # Print basic information about the dataset 11 | print("\n=== Dataset Info ===") 12 | print(f"Number of rows: {df.height}") 13 | print(f"Number of columns: {df.width}") 14 | 15 | print("\n=== Column Names ===") 16 | print(df.columns) 17 | 18 | print("\n=== Data Types ===") 19 | print(df.schema) 20 | 21 | print("\n=== Data Preview ===") 22 | if max_rows: 23 | print(df.head(max_rows)) 24 | else: 25 | print(df) 26 | 27 | # Print some basic statistics for numeric columns 28 | numeric_cols = [name for name, dtype in df.schema.items() if dtype in [pl.Float64, pl.Int64]] 29 | if numeric_cols: 30 | print("\n=== Numeric Columns Statistics ===") 31 | print(df.select(numeric_cols).describe()) 32 | 33 | except Exception as e: 34 | print(f"Error reading/displaying Parquet file: {str(e)}") 35 | exit(1) 36 | 37 | def main(): 38 | parser = argparse.ArgumentParser(description='Display Parquet file contents with nice formatting') 39 | parser.add_argument('parquet_file', help='Path to input Parquet file') 40 | parser.add_argument('--max-rows', type=int, help='Maximum number of rows to display') 41 | parser.add_argument('--max-cols', type=int, help='Maximum number of columns to display') 42 | 43 | args = parser.parse_args() 44 | display_parquet(args.parquet_file, args.max_rows, args.max_cols) 45 | 46 | if __name__ == "__main__": 47 | print("hi2 .\n") 48 | main() 49 | -------------------------------------------------------------------------------- /py/hello_world.txt: -------------------------------------------------------------------------------- 1 | hello world -------------------------------------------------------------------------------- /py/outline.md: 
-------------------------------------------------------------------------------- 1 | # Assumptions 2 | - We want to be compatible with closed-source systems. If someone wants their open-source model validated, they can just put it on OpenRouter or any other host and list it like that 3 | 4 | 5 | # Validator functional components 6 | - SDK framework that makes it easy for different AI model providers to add a plugin for their specific API to a generic model type function. For instance, a machine translation SDK function will have many adapters for different AI model providers with their different API endpoints and authentication types. [ Similar to litellm https://github.com/BerriAI/litellm 7 | ] {for laziness we can use OpenRouter through litellm so we don't need to go get so many API keys} 8 | - Validators must send each prompt to all providers being tested simultaneously; they cannot do it sequentially. They can wait for the first prompt to be finished before sending the next for rate limiting, but the providers must be prompted simultaneously. 9 | - Each of the validators needs to have a place to store all of their API keys for each provider. [can start as env variables but later needs to be some KMS] 10 | - Validator registration simply connects their public key to an organization's email or domain. 11 | - Validators decide which providers they want to test; this is a simple config. It's a human decision whom to test 12 | - Validators must have the ability to upload scored responses. (The validator will have their employees score each response in Excel or Google Sheets.) How to aggregate the scores per prompt (a simple average, accuracy, or a more complex BLEU score) is determined by the Evaluation spec. 13 | - SDK helps produce signed evaluation publications in the schema example_evaluation_test_stream_publication.yaml (these should be submitted within 7 days of the prompt, or however long the Evaluation Stream stipulates); a hypothetical sketch of this signing step is shown after py/run_evaluation_pseudo.py below 14 | 15 | 16 | # Centralized Information aggregation server 17 | - API accepting evaluation JSON [auth: pk signature validation checked. JWS and secp256k1] 18 | - 19 | - API accepting new Evaluation Test Stream description doc [No AUTH required] 20 | - This could have a very simple web form since no key is needed 21 | - AKA functions that help AI researchers and validators publish new test data creation standards and evaluation methods. 22 | - API accepting a provider saying they want to be tested for a certain Evaluation Test Stream [auth: pk signature validation checked] 23 | - AKA function to help AI model providers announce that they want to get benchmarked with fresh data on new test data evaluation streams 24 | - [later] A background process that checks if one of the API-submitted JSON messages was not seen on WAKU gossip; if so, the server should push the message again via WAKU 25 | - This server should be designed in a very lean manner such that the JS version can just be a client-side wrapper that also calls a local version of this server. 26 | 27 | # Identity 28 | - We will use decentralized identifiers (DIDs).
To start we'll use a simple did:key or did:pkh where we resolve the endpoint document on our own centralized servers, but while coding we should keep in mind that we will want to support other DID methods 29 | - In general we just agree to use the concept of a did:string that is the immutable ID of a Validator, and there is a document describing more about them (name, etc.) which must be resolved from some server 30 | - We decided that we will run a centralized server ourselves for resolving the extra metadata 31 | - [Later] did:plc or similar that allows rotation and where the pubkey is not inside the identity string 32 | 33 | Everything above is POC/MVP, April 7 2025 deadline 34 | 35 | 36 | # Things that can be done in a second step [Later] (Release target May 22, 2025) 37 | - Open data publication of all the test data that was used and what answers different models gave (aka the audit log) 38 | - Web UI displays results in aggregate. 39 | - Distributing registrations and evaluations via WAKU or another open decentralized message bus 40 | - Users/validators can spot-check the responses that validators recorded by re-prompting the provider and checking whether they get the same or similar-quality response, to make sure that the validator is not lying about a provider giving a bad response to a prompt 41 | 42 | 43 | # Provider functionality 44 | - No real provider code needs to be written. Providers just run their own APIs as they do for regular business. 45 | -------------------------------------------------------------------------------- /py/requirements.txt: -------------------------------------------------------------------------------- 1 | litellm 2 | pandas 3 | pyarrow -------------------------------------------------------------------------------- /py/run_evaluation_pseudo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import asyncio 3 | import csv 4 | import pandas as pd 5 | import polars as pl 6 | from litellm import acompletion 7 | from datetime import datetime 8 | from compute_ipfs_cid import compute_ipfs_cids # Add this import at the top 9 | 10 | os.environ["validator_privatekey_hot"] = "67499ec186126dcdb98f2af7f396d1b410cc979959b56fb3bc6ca5d294e76bb2" #cafe 11 | os.environ["OPENAI_API_KEY"] = "your-api-key" #later add pulling these configs from a secrets file 12 | os.environ["OPENROUTER_API_KEY"] = "openrouter_api_key" 13 | os.environ["XAI_API_KEY"] = "your-api-key" 14 | os.environ["ANTHROPIC_API_KEY"] = "your-api-key" 15 | 16 | to_be_evaluated = [ "openai/gpt-4o","anthropic/claude-3-sonnet-20240229","xai/grok-2-latest","openrouter/google/palm-2-chat-bison"] #later add pulling the list of to-be-evaluated models from a config file 17 | 18 | test_data = [ "what are the differences between ed25519 and secp256k1 when should I use one over the other", "can I derive a secp256k1 private key from a ed25519 private key and in the reverse" , "what is a self hosted fully opensource laternative to openrouter"] #later add pulling this from a csv and add pulling this from a server 19 | 20 | async def get_model_completion(model_name, test_question): 21 | try: 22 | start_time = datetime.now().isoformat() 23 | response = await acompletion( # litellm's async API, so all providers can be prompted concurrently 24 | model=model_name, 25 | messages=[{"content": test_question, "role": "user"}] 26 | ) 27 | end_time = datetime.now().isoformat() 28 | return { 29 | "model": model_name, 30 | "response": response, 31 | "error": None, 32 | "request_timestamp": start_time, 33 | "response_timestamp": end_time 34 | } 35 | except Exception as e: 36 |
end_time = datetime.now().isoformat() 37 | return { 38 | "model": model_name, 39 | "error": str(e), 40 | "response": None, 41 | "request_timestamp": start_time, 42 | "response_timestamp": end_time 43 | } 44 | 45 | async def evaluate_models(): 46 | all_results = [] 47 | for test_question in test_data: 48 | tasks = [get_model_completion(model, test_question) for model in to_be_evaluated] 49 | question_results = await asyncio.gather(*tasks) 50 | 51 | # Flatten results for this question 52 | for model_result in question_results: 53 | all_results.append({ 54 | "question": test_question, 55 | "model": model_result['model'], 56 | "response": model_result['response'], 57 | "error": model_result['error'], 58 | "request_timestamp": model_result['request_timestamp'], 59 | "response_timestamp": model_result['response_timestamp'] 60 | }) 61 | return all_results 62 | 63 | def save_results_to_csv(results, filename='evaluation_results.csv'): 64 | filepath = os.path.join(os.path.dirname(__file__), filename) 65 | fieldnames = ['question', 'model', 'response', 'error', 'request_timestamp', 'response_timestamp'] 66 | 67 | with open(filepath, 'w', newline='', encoding='utf-8') as csvfile: 68 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 69 | writer.writeheader() 70 | writer.writerows(results) 71 | return filepath 72 | 73 | def save_results_to_parquet(results, filename='evaluation_results.parquet'): 74 | filepath = os.path.join(os.path.dirname(__file__), filename) 75 | # Convert results list to polars DataFrame and save 76 | df = pl.DataFrame(results) 77 | df.write_parquet(filepath) 78 | return filepath 79 | 80 | if __name__ == "__main__": 81 | print("hello4.\n") 82 | results = asyncio.run(evaluate_models()) 83 | current_question = None 84 | for result in results: 85 | if current_question != result['question']: 86 | current_question = result['question'] 87 | print(f"\nQuestion: {current_question}") 88 | print("-" * 50) 89 | print(f"\nModel: {result['model']}") 90 | print(result['response']) 91 | if result['model'] == to_be_evaluated[-1]: # If it's the last model 92 | print("\n" + "=" * 80) 93 | 94 | # Save results to CSV and Parquet 95 | csv_path = save_results_to_csv(results) 96 | parquet_path = save_results_to_parquet(results) 97 | print(f"\nResults saved to CSV: {csv_path}") 98 | print(f"Results saved to Parquet: {parquet_path}") 99 | 100 | # Compute and display IPFS CID for the parquet file 101 | try: 102 | cidv1 = compute_ipfs_cids(parquet_path) 103 | 104 | print(f"Parquet file IPFS CID: {cidv1}") 105 | except Exception as e: 106 | print(f"Error computing IPFS CID: {str(e)}") -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | polars 2 | base58 3 | python-jose[cryptography] 4 | ecdsa 5 | joserfc 6 | PyJWT[crypto] -------------------------------------------------------------------------------- /sample_data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/.DS_Store 
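The outline's item on signed evaluation publications and py/run_evaluation_pseudo.py together imply a signing step that the scripts above stop short of. The sketch below is a hypothetical illustration and not a file from this repository: it reuses compute_ipfs_cids from py/compute_ipfs_cid.py, the ecdsa package already listed in requirements.txt, and the same throwaway secp256k1 test key used by the other py/ scripts. The function name make_signed_publication and the output field names are assumptions for illustration, not the publication schema defined in sample_data/.

```python
# Hypothetical sketch: content-address a results file, then sign the CID with the
# validator's secp256k1 hot key. Names and fields are illustrative assumptions.
import json
import binascii
from hashlib import sha256

from ecdsa import SigningKey, SECP256k1

from compute_ipfs_cid import compute_ipfs_cids  # helper defined in py/compute_ipfs_cid.py


def make_signed_publication(results_path: str, private_key_hex: str) -> dict:
    # 1. Content-address the results file so anyone can verify the exact bytes being scored.
    cid = compute_ipfs_cids(results_path)

    # 2. Sign the CID string deterministically (RFC 6979) with the validator's secp256k1 key.
    signing_key = SigningKey.from_string(binascii.unhexlify(private_key_hex), curve=SECP256k1)
    signature = signing_key.sign_deterministic(cid.encode("utf-8"), hashfunc=sha256)

    # 3. Sanity-check the signature round-trips before publishing.
    verifying_key = signing_key.get_verifying_key()
    verifying_key.verify(signature, cid.encode("utf-8"), hashfunc=sha256)

    # 4. Bundle into a publication-style dict; the real schema would follow the YAML
    #    examples under sample_data/.
    return {
        "results_cid": cid,
        "signature": "0x" + signature.hex(),
        "public_key": "0x" + verifying_key.to_string().hex(),
    }


if __name__ == "__main__":
    pub = make_signed_publication(
        "evaluation_results.parquet",  # output of run_evaluation_pseudo.py
        "67499ec186126dcdb98f2af7f396d1b410cc979959b56fb3bc6ca5d294e76bb2",  # same test key as the other py/ scripts
    )
    print(json.dumps(pub, indent=2))
```

A consumer (for example the aggregation server described in py/outline.md) could recompute the CID from the fetched parquet file and verify the signature against the validator's registered public key; the exact envelope (plain signature vs. JWS) would be decided by the publication schema.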
-------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt00/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt00/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt00/anthropic-claude-3.7-sonnet_06042025_a/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt00/anthropic-claude-3.7-sonnet_06042025_a/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt00/google-Gemini-2.5-Pro_06042025_c/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt00/google-Gemini-2.5-Pro_06042025_c/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt00/metadata.yaml: -------------------------------------------------------------------------------- 1 | prompt: 2 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt00/openAI-o1_06042025_a/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt00/openAI-o1_06042025_a/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt0_wave/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt0_wave/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt0_wave/anthropic-claude-3.7-sonnet_06042025_a/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt0_wave/anthropic-claude-3.7-sonnet_06042025_a/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt0_wave/anthropic-claude-3.7-sonnet_06042025_a/great-wave.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | The Great Wave off Kanagawa - SVG 7 | 23 | 24 | 25 |
26 | 27 | 28 | 38 | 39 | 40 | 45 | 46 | 47 | 52 | 53 | 54 | 58 | 59 | 60 | 64 | 65 | 66 | 70 | 71 | 72 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 |
92 | 93 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt0_wave/google-Gemini-2.5-Pro_06042025_a/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt0_wave/google-Gemini-2.5-Pro_06042025_a/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt0_wave/google-Gemini-2.5-Pro_06042025_a/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Great Wave SVG 7 | 25 | 26 | 27 | 28 | 29 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt0_wave/metadata.yaml: -------------------------------------------------------------------------------- 1 | prompt: using html , svg and vanila js create a scene that looks like The Great Wave off Kanagawa. Its a large wave asian style of art. But your only allowed to use SVG and 1 color. Keep the code small. Put it all into one file. Its a static image no animations. -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt0_wave/openAI-o1_06042025_a/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt0_wave/openAI-o1_06042025_a/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt0_wave/openAI-o1_06042025_a/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | The Great Wave (Single Color) 6 | 7 | 8 | 9 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt1_tree/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt1_tree/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt1_tree/anthropic-claude-3.7-sonnet_06042025_a/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 18 | 19 | 20 | 21 | 61 | 62 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt1_tree/google-Gemini-2.5-Pro_06042025_a/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Fractal Tree 5 | 9 | 10 | 11 | 12 | 60 | 61 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt1_tree/metadata.yaml: -------------------------------------------------------------------------------- 1 | prompt: create an svg tree made from simple fractal formulas with randomness build it. Colors are grey lines on black background. 
Only generate a new tree on load. Use the full window for drawing. Do not clip the tree on the edge of the screen. Fade out the lines as you get towards the leaves. Try to write this using as little code as possible. Do not use any libraries only vanila js is allowed. I will look at the result only in chrome v8. 2 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt1_tree/openAI-o1_06042025_a/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 31 | 32 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt2_torus/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt2_torus/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt2_torus/anthropic-claude-3.7-sonnet_06042025_a/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt2_torus/anthropic-claude-3.7-sonnet_06042025_a/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt2_torus/anthropic-claude-3.7-sonnet_06042025_a/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 9 | 10 | 11 | 12 | 118 | 119 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt2_torus/google-Gemini-2.5-Pro_06042025_c/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Torus Wireframe Animation 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt2_torus/google-Gemini-2.5-Pro_06042025_c/script.js: -------------------------------------------------------------------------------- 1 | const svg = document.getElementById('torus-svg'); 2 | const svgNS = "http://www.w3.org/2000/svg"; 3 | 4 | let width = window.innerWidth; 5 | let height = window.innerHeight; 6 | 7 | // --- Randomize Parameters --- 8 | const R = Math.random() * 100 + 100; // Major Radius (100-200) 9 | const r = Math.random() * 50 + 25; // Minor Radius (25-75) ensures hole changes 10 | const numMajor = Math.floor(Math.random() * 20) + 20; // Density Major segments (20-40) 11 | const numMinor = Math.floor(Math.random() * 10) + 10; // Density Minor segments (10-20) 12 | const tiltX = (Math.random() - 0.5) * Math.PI; // Initial X tilt (-PI/2 to PI/2) 13 | const tiltY = (Math.random() - 0.5) * Math.PI; // Initial Y tilt (-PI/2 to PI/2) 14 | const rotationSpeed = (Math.random() * 0.01) + 0.005; // Rotation speed (0.005-0.015 rad/frame) 15 | 16 | // --- 3D Point Generation --- 17 | function generateTorusPoints(R, r, numMajor, numMinor) { 18 | const points = []; 19 | for (let i = 0; i < numMajor; i++) { 20 | const majorAngle = 2 * Math.PI * i / numMajor; 21 | const cosMajor = Math.cos(majorAngle); 22 | const sinMajor = Math.sin(majorAngle); 23 | 24 | for (let j = 0; j < numMinor; j++) { 25 | const minorAngle = 
2 * Math.PI * j / numMinor; 26 | const cosMinor = Math.cos(minorAngle); 27 | const sinMinor = Math.sin(minorAngle); 28 | 29 | const x = (R + r * cosMinor) * cosMajor; 30 | const y = (R + r * cosMinor) * sinMajor; 31 | const z = r * sinMinor; 32 | points.push({ x, y, z, i, j }); // Store segment indices 33 | } 34 | } 35 | return points; 36 | } 37 | 38 | // --- 3D Rotation --- 39 | function rotateX(point, angle) { 40 | const y = point.y; 41 | const z = point.z; 42 | point.y = y * Math.cos(angle) - z * Math.sin(angle); 43 | point.z = y * Math.sin(angle) + z * Math.cos(angle); 44 | } 45 | 46 | function rotateY(point, angle) { 47 | const x = point.x; 48 | const z = point.z; 49 | point.x = x * Math.cos(angle) + z * Math.sin(angle); 50 | point.z = -x * Math.sin(angle) + z * Math.cos(angle); 51 | } 52 | 53 | function rotateZ(point, angle) { 54 | const x = point.x; 55 | const y = point.y; 56 | point.x = x * Math.cos(angle) - y * Math.sin(angle); 57 | point.y = x * Math.sin(angle) + y * Math.cos(angle); 58 | } 59 | 60 | // --- Projection --- 61 | function project(point) { 62 | // Simple orthographic projection 63 | const scale = 1.5; // Adjust scale as needed 64 | return { 65 | x: point.x * scale + width / 2, 66 | y: point.y * scale + height / 2 67 | }; 68 | } 69 | 70 | // --- Drawing --- 71 | const points3D = generateTorusPoints(R, r, numMajor, numMinor); 72 | const paths = []; 73 | 74 | // Create paths for major and minor segments 75 | for (let i = 0; i < numMajor; i++) { 76 | for (let j = 0; j < numMinor; j++) { 77 | const pathMajor = document.createElementNS(svgNS, 'path'); 78 | pathMajor.setAttribute('stroke', 'grey'); 79 | pathMajor.setAttribute('stroke-width', '1'); 80 | pathMajor.setAttribute('fill', 'none'); 81 | svg.appendChild(pathMajor); 82 | paths.push({ type: 'major', i, j, element: pathMajor }); 83 | 84 | const pathMinor = document.createElementNS(svgNS, 'path'); 85 | pathMinor.setAttribute('stroke', 'grey'); 86 | pathMinor.setAttribute('stroke-width', '1'); 87 | pathMinor.setAttribute('fill', 'none'); 88 | svg.appendChild(pathMinor); 89 | paths.push({ type: 'minor', i, j, element: pathMinor }); 90 | } 91 | } 92 | 93 | 94 | let angle = 0; 95 | 96 | function animate() { 97 | // Clone points for rotation to avoid modifying originals permanently per frame 98 | const rotatedPoints = points3D.map(p => ({ ...p })); 99 | 100 | // Apply initial tilt 101 | rotatedPoints.forEach(p => { 102 | rotateX(p, tiltX); 103 | rotateY(p, tiltY); 104 | }); 105 | 106 | // Apply animation rotation (around Y axis for this example) 107 | rotatedPoints.forEach(p => rotateY(p, angle)); 108 | 109 | // Project points 110 | const projectedPoints = rotatedPoints.map(project); 111 | 112 | // Update SVG paths 113 | paths.forEach(pathInfo => { 114 | const { type, i, j, element } = pathInfo; 115 | const currentPointIndex = i * numMinor + j; 116 | 117 | let nextPointIndex; 118 | if (type === 'major') { 119 | // Connect point to the next point in the same major ring 120 | nextPointIndex = i * numMinor + (j + 1) % numMinor; 121 | } else { // type === 'minor' 122 | // Connect point to the corresponding point in the next major ring 123 | nextPointIndex = ((i + 1) % numMajor) * numMinor + j; 124 | } 125 | 126 | const p1 = projectedPoints[currentPointIndex]; 127 | const p2 = projectedPoints[nextPointIndex]; 128 | 129 | element.setAttribute('d', `M ${p1.x} ${p1.y} L ${p2.x} ${p2.y}`); 130 | }); 131 | 132 | 133 | angle += rotationSpeed; 134 | requestAnimationFrame(animate); 135 | } 136 | 137 | // Handle window resize 138 | 
window.addEventListener('resize', () => { 139 | width = window.innerWidth; 140 | height = window.innerHeight; 141 | svg.setAttribute('viewBox', `0 0 ${width} ${height}`); 142 | }); 143 | 144 | // Initial setup for viewbox 145 | svg.setAttribute('viewBox', `0 0 ${width} ${height}`); 146 | 147 | // Apply initial tilt to base points ONCE 148 | points3D.forEach(p => { 149 | rotateX(p, tiltX); 150 | rotateY(p, tiltY); 151 | }); 152 | 153 | 154 | animate(); -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt2_torus/google-Gemini-2.5-Pro_06042025_c/style.css: -------------------------------------------------------------------------------- 1 | body { 2 | margin: 0; 3 | overflow: hidden; 4 | background-color: black; 5 | } 6 | 7 | svg { 8 | display: block; 9 | width: 100vw; 10 | height: 100vh; 11 | } -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt2_torus/metadata.yaml: -------------------------------------------------------------------------------- 1 | prompt: create an svg animation of a wire frame torus with the wire mesh . Colors are grey lines on black background. Try to write this using as little code as possible. Randomize the density of the wire mesh. Randomize the angle of the torus with respect to the horizontal and vertical. Also randomize the Major Radius and Minor Radius such that the hole in the middle changes. Choose an angle and then animate a rotation of the torus. Do not use any libraries only vanila js is allowed. I will look at the result only in chrome v8. 2 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt2_torus/openAI-o1_06042025_a/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt2_torus/openAI-o1_06042025_a/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt2_torus/openAI-o1_06042025_a/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Wireframe Torus 6 | 10 | 11 | 12 | 13 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt3_Sierpinski triangle/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt3_Sierpinski triangle/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt3_Sierpinski triangle/anthropic-claude-3.7-sonnet_06042025_a/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt3_Sierpinski triangle/anthropic-claude-3.7-sonnet_06042025_a/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt3_Sierpinski triangle/anthropic-claude-3.7-sonnet_06042025_a/index.html: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Sierpiński Triangle Fractal Zoom 7 | 18 | 19 | 20 | 21 | 22 | 106 | 107 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt3_Sierpinski triangle/google-Gemini-2.5-Pro_06042025_c/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt3_Sierpinski triangle/google-Gemini-2.5-Pro_06042025_c/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt3_Sierpinski triangle/google-Gemini-2.5-Pro_06042025_c/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Sierpinski Zoom 5 | 9 | 10 | 11 | 12 | 13 | 127 | 128 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt3_Sierpinski triangle/metadata.yaml: -------------------------------------------------------------------------------- 1 | prompt: Procedurally generate Sierpiński triangle / Sierpiński gasket / Sierpiński sieve fractal as the camera zooms into it. Zoom into the edge of the fractal. Zoom in kinda fast. Use SVG only. Do not use any libraries only vanila js is allowed. I will look at the result only in chrome v8. Put everything into one index.html file. 2 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt3_Sierpinski triangle/openAI-o1_06042025_a/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt3_Sierpinski triangle/openAI-o1_06042025_a/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt3_Sierpinski triangle/openAI-o1_06042025_a/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Sierpinski Triangle Zoom 6 | 24 | 25 | 26 |
27 | 28 |
29 | 30 | 76 | 77 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt4_Jerusalem_cube/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt4_Jerusalem_cube/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt4_Jerusalem_cube/anthropic-claude-3.7-sonnet_06042025_a/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt4_Jerusalem_cube/anthropic-claude-3.7-sonnet_06042025_a/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt4_Jerusalem_cube/google-Gemini-2.5-Pro_06042025_c/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt4_Jerusalem_cube/google-Gemini-2.5-Pro_06042025_c/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt4_Jerusalem_cube/metadata.yaml: -------------------------------------------------------------------------------- 1 | prompt: Procedurally generate Jerusalem cube fractal in 3d. Use SVG mesh only. Rotate the cub around a random angle. Do not use any libraries only vanila js is allowed. I will look at the result only in chrome v8. Put everything into one index.html file. 
2 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt4_Jerusalem_cube/openAI-o1_06042025_a/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt4_Jerusalem_cube/openAI-o1_06042025_a/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt5_Menger_sponge/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt5_Menger_sponge/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt5_Menger_sponge/anthropic-claude-3.7-sonnet_06042025_a/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt5_Menger_sponge/anthropic-claude-3.7-sonnet_06042025_a/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt5_Menger_sponge/metadata.yaml: -------------------------------------------------------------------------------- 1 | prompt: Procedurally generate a 3d fractal call ed the Menger sponge (also known as the Menger cube, Menger universal curve, Sierpinski cube, or Sierpinski sponge . . Use only vanila js with webgl. Change the colors of the faces as you go deeper into the fractal. This fractal mathematically has a Hausdorff dimension of {\displaystyle \log _{3}8} . Think about how to do this, list yourself several options its not that easy. the Menger sponge has an infinite surface area and zero volume so you need to generate it procedurally and potentially set an arbitrary cutoff limit on how deep the fractal will go . I will look at the result only in chrome v8. Put everything into one index.html file. 2 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt5_Menger_sponge/openAI-o1_06042025_a/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt5_Menger_sponge/openAI-o1_06042025_a/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/scores_metadata_eth_signature.txt: -------------------------------------------------------------------------------- 1 | 0x10a6b15012feee6ce7b181ce609712c02700585b19899acf8bf1954b75ba03ff566dd36f93ff272a5062b6b3aad2ea490eb8a14e5ae7601c99f7713a96a1b68c1b -------------------------------------------------------------------------------- /sample_data/example_evaluation_result_publication.yaml: -------------------------------------------------------------------------------- 1 | validator: Did:ridValidator:0x6FCB287c943C1119c550F350B07e7B4c9f02677C:mit.edu # did:ridValidator:: Each domain can have multiple valida keys. The did method has additional logic to check if a certain public key is valid for a given domain. 
Domains are verified by DKIM email signature or DNS TXT records. 2 | evaluation: did:rideval:MTRobustBLUE:dasdasdasdasdasdasdasdasdasda:github.com/robust/MTbench/spec 3 | results_per_provider: 4 | - provider: Did:ridProvider:Grok3:0x6FCB287c943C1119c550F350B07e7B4c9f026001:api.x.ai # did:ridProvider:: Each domain can have multiple valid keys. The did method has additional logic to check if a certain public key is valid for a given domain. Domains are verified by DKIM email signature or DNS TXT records. 5 | score: 0.8 # One overall normalized response quality score that must be present in all evaluations. Larger score is better. In float form it ranges from 0.0 to 1.0. It may be a simple average of all the metrics or a more complex combination of them. 6 | metrics: 7 | - metric_name: average_BLUE 8 | metric_type: float 9 | metric_value: 81.1 10 | - metric_name: ROC 11 | metric_type: float 12 | metric_value: 81.1 13 | - metric_name: GRE 14 | metric_type: float 15 | metric_value: 81.1 16 | - metric_name: averageRoundResponseTime 17 | metric_type: int_ms 18 | metric_value: 302 19 | - provider: Did:ridProvider:openAi/o3-mini:0x6FCB287c943C1119c550F350B07e7B4c9f026456:openai.org/api 20 | score: 0.95 21 | metrics: 22 | - metric_name: average_BLUE 23 | metric_type: float 24 | metric_value: 95.1 25 | - metric_name: ROC 26 | metric_type: float 27 | metric_value: 95.1 28 | - metric_name: GRE 29 | metric_type: float 30 | metric_value: 92.1 31 | - metric_name: averageRoundResponseTime 32 | metric_type: int_ms 33 | metric_value: 202 34 | - provider: Did:ridProvider:0x6FCB287c943C1119c550F350B07e7B4c9f026412:anthropic.com/api 35 | model_used: anthropic/claude-3-7-sonnet-20250219 36 | score: 0.91 37 | metrics: 38 | - metric_name: average_BLUE 39 | metric_type: float 40 | metric_value: 91.1 41 | - metric_name: ROC 42 | metric_type: float 43 | metric_value: 91.1 44 | - metric_name: GRE 45 | metric_type: float 46 | metric_value: 91.1 47 | - metric_name: averageRoundResponseTime 48 | metric_type: int_ms 49 | metric_value: 402 50 | - provider: Did:ridProvider:openrouter/anthropic/claude-3-7-sonnet-20250219::openrouter.com/api 51 | score: 0.91 52 | metrics: 53 | - metric_name: average_BLUE 54 | metric_type: float 55 | metric_value: 91.1 56 | - metric_name: ROC 57 | metric_type: float 58 | metric_value: 91.1 59 | - metric_name: GRE 60 | metric_type: float 61 | metric_value: 91.1 62 | - metric_name: averageRoundResponseTime 63 | metric_type: int_ms 64 | metric_value: 402 65 | timestamp_start_eval: 1742145905 # When the first prompt was sent to the first provider 66 | timestamp_end_eval: 1742149905 # When the last prompt was sent to the last provider 67 | number_of_prompts_behind_score: 100 68 | private_test_data: false 69 | public_test_data: true 70 | test_data_publicized_within: 6months 71 | signature_of_above: 0xyu898uhgyhbjnkoyghjikoi9u8yghbjnkoiuyghjnkmougyhbjnkiyghbjnkoiytghji9u8yghb0xyu898uhgyhbjnkoyghjikoi9u8yghbjnkoiuyghjnkmougyhbjnkiyghbjnkoiytghji9u8yghb 72 | -------------------------------------------------------------------------------- /sample_data/example_evaluation_results_video.yaml: -------------------------------------------------------------------------------- 1 | dataPoint: 2 | Validator: "Did:ridValidator:0x7A2B287c943C1119b650F350B07e7A4c9f02677C:university.edu" 3 | Provider: "Did:ridprovider:VideoAI:0x8F7C432b943C1119c550F350B07e7B4c9f02677C:api.videoai.com" 4 | Evaluation: "did:rideval:ImageCatDetectionV1:abcd1234efgh5678ijkl9012mnopqrst:github.com/engineer/AIbench" 5 | evaluations: 6 | -
prompt_timestamp: "1678953600" # epoch timestamp when the video was uploaded for analysis 7 | response_timestamp: "1678954200" # epoch timestamp when the result was returned 8 | prompt: 9 | input_video_cid: "xyzvideo12345cid" # CID for the input video file (can be video of a cat or not) 10 | input_video_url: "https://example.com/path_to_video.mp4" # URL to the video for remote input 11 | response: 12 | contains_cat: "true" # Response indicating whether a cat was detected in the video (true/false) 13 | detection_confidence: 0.98 # Confidence score for the cat detection 14 | processing_time: 2.5 # Time taken for processing the video (in seconds) 15 | frame_count: 300 # Total number of frames in the video 16 | detected_objects_cid: "abc123detected_objects_cid" # CID pointing to the file containing detected objects data 17 | evals: 18 | contains_cat_correctness: 1 # 0 or 1 indicating whether the evaluation result matches the truth 19 | detection_accuracy: 0.95 # Model's accuracy in detecting objects in the video 20 | processing_accuracy: 0.97 # Accuracy in processing the video correctly 21 | validator_signature: "0x2F1e83bd09ad72bc9a81f1a09d2c9d88e39a871a" 22 | -------------------------------------------------------------------------------- /sample_data/example_evaluation_test_stream_publication.yaml: -------------------------------------------------------------------------------- 1 | shortname: genLLMchatArena # Author should put effort into ensuring this shortname is unique. It should be short, as it will be typed and filtered on. 2 | title: LLM question answering with blind A vs B comparison to get an ELO score 3 | specification_mutable_url: https://github.com/lm-sys/FastChat/blob/main/docs/arena.md # contents behind this url can evolve over time 4 | specification_current_cid: 0x6ACB287c943C229c550F350B07e7B4c9f0212333 5 | specification_all_cid_versions: 6 | - 0x6FCB287c943C1119c550F350B07e7B4c9f026001 7 | - 0x6FCB287c943C1119c550F350B07e7B4c9f026000 8 | - 0x6ACB287c943C229c550F350B07e7B4c9f0212333 9 | paper_pdf: https://arxiv.org/pdf/2403.04132.pdf # optional 10 | paper_doi: 10.48550/arXiv.2403.04132 # Optionally the full url https://doi.org/10.48550/arXiv.2403.04132 can be given. DOIs are preferred because they are permanent and guaranteed to be unique. 11 | paper_BibTex: | # optional 12 | @misc{chiang2024chatbotarenaopenplatform, 13 | title={Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference}, 14 | author={Wei-Lin Chiang and Lianmin Zheng and Ying Sheng and Anastasios Nikolas Angelopoulos and Tianle Li and Dacheng Li and Hao Zhang and Banghua Zhu and Michael Jordan and Joseph E.
Gonzalez and Ion Stoica}, 15 | year={2024}, 16 | eprint={2403.04132}, 17 | archivePrefix={arXiv}, 18 | primaryClass={cs.AI}, 19 | url={https://arxiv.org/abs/2403.04132}, 20 | } 21 | input_dimensions: 22 | - input_name: prompt_text 23 | input_type: string 24 | evaluation_dimensions: 25 | - eval_name: BLUE 26 | eval_type: float 27 | - eval_name: ResponseTime 28 | eval_type: int 29 | aggregate_metrics: 30 | - metric_name: average_BLUE 31 | metric_type: float 32 | metric_higher_is_better: true 33 | - metric_name: averageRoundResponseTime 34 | metric_type: int_ms 35 | metric_higher_is_better: false 36 | max_response_time_ms: 10000 # optional 37 | minimum_sample_size: 100 # optional 38 | test_data_publication_required_within: 6months # optional 39 | -------------------------------------------------------------------------------- /sample_data/example_provider_registration_publication.yaml: -------------------------------------------------------------------------------- 1 | alsoKnownAs: [ ed://mt.lesan.ai, ed://mt.lesan.org , ed://mt.lesan ] # priority-sorted shortnames 2 | domain_of_api: lesan.ai/api 3 | domain_of_admin_email: admin@lesan.ai 4 | billing_type: api_call_usage_billing_monthly_minimum 5 | purchasingmethods: credit_card, polygon_usdc, polygon_matic 6 | target_evals: 7 | - did:rideval:MTRobustBLUE:dasdasdasdasdasdasdasdasdasda:github.com/robust/MTbench/spec 8 | - did:rideval:mtBlueHard:ddasdasdasd:github.com/abcde 9 | cid_of_dot_eml_formatted_dkim_signed_email_listing_the_evaluations_they_want_to_be_tested_for 10 | 11 | 12 | # or 13 | 14 | 15 | shortname: [ ed://mt.lesan.ai, ed://mt.lesan.org , ed://mt.lesan ] 16 | domain_of_api: lesan.ai/api 17 | domain_of_admin_email: admin@lesan.ai 18 | billing_type: api_call_usage_billing_monthly_minimum 19 | purchasingmethods: credit_card, polygon_usdc, polygon_matic 20 | target_evals: 21 | - cid: 0x6ACB287c943C229c550F350B07e7B4c9f0212333 22 | url: https://github.com/lm-sys/FastChat/blob/main/docs/arena.md 23 | paper_doi: 10.48550/arXiv.2403.04132 24 | - did:rideval:mtBlueHard:ddasdasdasd:github.com/abcde 25 | cid_of_dot_eml_formatted_dkim_signed_email_listing_the_evaluations_they_want_to_be_tested_for 26 | 27 | 28 | -------------------------------------------------------------------------------- /sample_data/turbine_aerodynamics_example_response_audit_.yaml: -------------------------------------------------------------------------------- 1 | dataPoint: 2 | Validator: "Did:ridValidator:0x9A4B287c943C1119c550F350B07e7B4c9f02677C:mit.edu" 3 | Provider: "Did:ridprovider:WindTurbineCorp:0x5F7C432b943C1119c550F350B07e7B4c9f02677C:api.windturbine.com" 4 | Evaluation: "did:rideval:DesignEvalV1:abcd1234efgh5678ijkl9012mnopqrst:github.com/engineer/DesignBench" 5 | evaluations: 6 | - prompt_timestamp: "1678953600" # epoch timestamp for the time the simulation was run 7 | response_timestamp: "1678954200" # epoch timestamp for the time the result was returned 8 | prompt: 9 | turbine_mesh_cid: ghjklkjhghjklkjhghjklcid 10 | load_param: hjkojhbnjkoihbnjkcid # this is also a CID; CIDs can be detected from their first bytes, so the eval designer can choose whether or not to include "cid" in the field name 11 | Response: 12 | failure_prob_over50p: "true" # responses & prompts can be a CID to a file, a string of fewer than 1000 characters, a number, or an array of numbers [123,1,2,3] 13 | stresspoints: hjkojhbnjkoihbnjkcid 14 | evals: 15 | failure_prob_over50p_correctness: 1 # 0 or 1 16 | stresspoints_physically_plausible: 0.1 # 0.0-1.0: how physically plausible the predicted stress points are 17 | stresspoints_physically_correctness:
0.9 # 0.0-1.0: how physically correct the predicted stress points are 18 | - prompt_timestamp: "1678953610" # epoch timestamp for the time the simulation was run 19 | response_timestamp: "1678954200" # epoch timestamp for the time the result was returned 20 | prompt: 21 | turbine_mesh_cid: ghjklkjhghjklkjhghjklcid 22 | load_param: aaaajhbnjkoihbnjkcid # this is also a CID; CIDs can be detected from their first bytes, so the eval designer can choose whether or not to include "cid" in the field name 23 | Response: 24 | failure_prob_over50p: "true" # responses & prompts can be a CID to a file, a string of fewer than 1000 characters, a number, or an array of numbers [123,1,2,3] 25 | stresspoints: hjkojhbnjkoihbnjkcid 26 | evals: 27 | failure_prob_over50p_correctness: 1 # 0 or 1 28 | stresspoints_physically_plausible: 0.1 # 0.0-1.0: how physically plausible the predicted stress points are 29 | stresspoints_physically_correctness: 0.9 # 0.0-1.0: how physically correct the predicted stress points are 30 | validator_signature: "0x2F1e83bd09ad72bc9a81f1a09d2c9d88e39a871a" 31 | --------------------------------------------------------------------------------
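The example publication files above (example_evaluation_result_publication.yaml and the per-datapoint audits) spell out the fields a validator signs: one normalized 0.0-1.0 score per provider, the raw metrics behind it, and the evaluation time window. As a rough illustration of how a downstream consumer might sanity-check such a file before trusting it, the following is a minimal Python sketch; it assumes PyYAML is available, and the function and constant names are illustrative only, not part of the peerBench tooling.

# Minimal sanity-check sketch for an evaluation-result publication such as
# sample_data/example_evaluation_result_publication.yaml.
# Assumes PyYAML; field names follow the example above, and the helper
# names here are hypothetical, not part of peerBench.
import sys
import yaml

REQUIRED_PROVIDER_FIELDS = {"provider", "score", "metrics"}

def check_publication(path: str) -> list[str]:
    """Return a list of human-readable problems found in the publication."""
    with open(path, "r", encoding="utf-8") as f:
        doc = yaml.safe_load(f)

    problems = []
    for entry in doc.get("results_per_provider", []):
        missing = REQUIRED_PROVIDER_FIELDS - entry.keys()
        if missing:
            problems.append(f"{entry.get('provider', '<unknown>')}: missing {sorted(missing)}")
            continue
        score = float(entry["score"])
        if not 0.0 <= score <= 1.0:
            problems.append(f"{entry['provider']}: score {score} outside [0.0, 1.0]")
        for metric in entry["metrics"]:
            if "metric_name" not in metric or "metric_value" not in metric:
                problems.append(f"{entry['provider']}: malformed metric {metric}")

    start = doc.get("timestamp_start_eval")
    end = doc.get("timestamp_end_eval")
    if start is not None and end is not None and end < start:
        problems.append("timestamp_end_eval is earlier than timestamp_start_eval")
    return problems

if __name__ == "__main__":
    for problem in check_publication(sys.argv[1]):
        print("WARN:", problem)

The same idea extends to the per-datapoint audit files (example_evaluation_results_video.yaml, turbine_aerodynamics_example_response_audit_.yaml): each value under evals is expected to be a 0/1 flag or a 0.0-1.0 quality score, so a consumer can apply the identical range check per datapoint before aggregating.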