├── .gitignore ├── .gitmodules ├── README.md ├── brunotest └── peerBench │ ├── bruno.json │ ├── collection.bru │ └── submit-score.bru ├── dEval_sim ├── README.md ├── dEval_sim ├── dEval_sim_writeup.html └── dEval_sim_writeup.ipynb ├── meeting-notes └── 20250218 │ ├── .DS_Store │ ├── RobustAiBench_initial_meeting_notes.docx │ ├── RobustAiBench_initial_meeting_notes.md │ ├── RobustAiBench_initial_meeting_notes.pdf │ ├── RobustAiBench_initial_meeting_transcript.json │ └── RobustAiBench_initial_meeting_transcript.pdf ├── peerBenchJS ├── .DS_Store ├── LICENSE ├── README.md ├── config.example.json ├── config.mmlu.json ├── config.mmlu.onlyEngineering.json ├── config.mmlu.onlyHistory.json ├── config.mmlu.onlyLaw.generated.llama-4-scout-wrong.json ├── config.mmlu.onlyLaw.json ├── data │ └── tasks │ │ ├── bigbench │ │ └── social_iqa.json │ │ ├── chatgpt4o_generated_newMMLUhistory_based_on_those_others_get_wrong_alot.json │ │ ├── mmlu-pro.onlyHistory.gemma-3-4b-it-wrong.generated.json │ │ ├── mmlu-pro.onlyLaw.generated.llama-4-scout-wrong.json │ │ └── mmlu-pro │ │ ├── mmlu-pro.onlyEngineering.json │ │ ├── mmlu-pro.onlyHistory.jsonl │ │ ├── mmlu-pro.onlyLaw.json │ │ └── mmlu-pro.parquet ├── eslint.config.js ├── package-lock.json ├── package.json ├── peerbench ├── src │ ├── base │ │ ├── provider.ts │ │ ├── task-format.ts │ │ └── uploader.ts │ ├── commands │ │ ├── aggregate.ts │ │ ├── prompt.ts │ │ ├── rephrase.ts │ │ ├── score.ts │ │ ├── std.ts │ │ └── upload.ts │ ├── config.ts │ ├── core │ │ ├── aggregate.ts │ │ ├── format.ts │ │ ├── logger.ts │ │ ├── parser.ts │ │ ├── program.ts │ │ ├── prompt.ts │ │ ├── providers.ts │ │ ├── score.ts │ │ ├── std.ts │ │ ├── uploaders.ts │ │ └── utils.ts │ ├── errors │ │ └── task.ts │ ├── formats │ │ ├── bigbench.ts │ │ ├── medQA.ts │ │ └── mmlu-pro.ts │ ├── index.ts │ ├── providers │ │ ├── nearai.ts │ │ └── openrouter.ts │ ├── types.ts │ └── uploaders │ │ └── coserver.ts ├── tsconfig.json └── tsup.config.ts ├── peerBenchPY ├── CONTRIBUTING.md ├── Dockerfile ├── LICENCE ├── Makefile ├── README.md ├── pyproject.toml ├── requirements.txt ├── run │ ├── build.sh │ ├── enter.sh │ ├── start.sh │ ├── stop.sh │ └── test.sh ├── tmp_consolidate_responses.py ├── tmprun.py ├── tmprun2.py ├── tmptestcid.py └── val │ ├── __init__.py │ ├── auth.py │ ├── key.py │ ├── providers │ ├── litellm │ │ └── provider.py │ └── openrouter │ │ └── provider.py │ ├── storage.py │ ├── task │ ├── add │ │ └── task.py │ ├── divide │ │ └── task.py │ └── task.py │ ├── utils.py │ └── val.py ├── py ├── .DS_Store ├── Iris.csv ├── Iris.parquet ├── compute_ipfs_cid.py ├── create_secp256k1_jws.py ├── create_secp256k1_jws_rfc.py ├── create_secp256k1_jwt.py ├── csv_to_parquet.py ├── display_parquet.py ├── hello_world.txt ├── outline.md ├── requirements.txt └── run_evaluation_pseudo.py ├── requirements.txt └── sample_data ├── .DS_Store ├── demoscene-js_cgi_benchmark ├── .DS_Store ├── prompt00 │ ├── .DS_Store │ ├── anthropic-claude-3.7-sonnet_06042025_a │ │ └── .DS_Store │ ├── google-Gemini-2.5-Pro_06042025_c │ │ └── .DS_Store │ ├── metadata.yaml │ └── openAI-o1_06042025_a │ │ └── .DS_Store ├── prompt01 │ └── metadata.yaml ├── prompt0_wave │ ├── .DS_Store │ ├── anthropic-claude-3.7-sonnet_06042025_a │ │ ├── .DS_Store │ │ └── great-wave.html │ ├── google-Gemini-2.5-Pro_06042025_a │ │ ├── .DS_Store │ │ └── index.html │ ├── metadata.yaml │ └── openAI-o1_06042025_a │ │ ├── .DS_Store │ │ └── index.html ├── prompt1_tree │ ├── .DS_Store │ ├── anthropic-claude-3.7-sonnet_06042025_a │ │ └── index.html │ ├── 
google-Gemini-2.5-Pro_06042025_a │ │ └── index.html │ ├── metadata.yaml │ └── openAI-o1_06042025_a │ │ └── index.html ├── prompt2_torus │ ├── .DS_Store │ ├── anthropic-claude-3.7-sonnet_06042025_a │ │ ├── .DS_Store │ │ └── index.html │ ├── google-Gemini-2.5-Pro_06042025_c │ │ ├── index.html │ │ ├── script.js │ │ └── style.css │ ├── metadata.yaml │ └── openAI-o1_06042025_a │ │ ├── .DS_Store │ │ └── index.html ├── prompt3_Sierpinski triangle │ ├── .DS_Store │ ├── anthropic-claude-3.7-sonnet_06042025_a │ │ ├── .DS_Store │ │ └── index.html │ ├── google-Gemini-2.5-Pro_06042025_c │ │ ├── .DS_Store │ │ └── index.html │ ├── metadata.yaml │ └── openAI-o1_06042025_a │ │ ├── .DS_Store │ │ └── index.html ├── prompt4_Jerusalem_cube │ ├── .DS_Store │ ├── anthropic-claude-3.7-sonnet_06042025_a │ │ ├── .DS_Store │ │ └── index.html │ ├── google-Gemini-2.5-Pro_06042025_c │ │ ├── .DS_Store │ │ └── index.html │ ├── metadata.yaml │ └── openAI-o1_06042025_a │ │ └── .DS_Store ├── prompt5_Menger_sponge │ ├── .DS_Store │ ├── anthropic-claude-3.7-sonnet_06042025_a │ │ ├── .DS_Store │ │ └── index.html │ ├── google-Gemini-2.5-Pro_06042025_s │ │ └── index.html │ ├── metadata.yaml │ └── openAI-o1_06042025_a │ │ ├── .DS_Store │ │ └── index.html ├── scores_metadata.yaml └── scores_metadata_eth_signature.txt ├── example_evaluation_result_publication.yaml ├── example_evaluation_results_video.yaml ├── example_evaluation_test_stream_publication.yaml ├── example_provider_registration_publication.yaml └── turbine_aerodynamics_example_response_audit_.yaml /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 168 | #.idea/ 169 | 170 | # PyPI configuration file 171 | .pypirc 172 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "dEval-flask"] 2 | path = dEval-flask 3 | url = git@github.com:jworender/dEval-flask.git 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Getting started 2 | CLI (primary): https://github.com/peerbench/peerBench/tree/main/peerBenchJS 3 | UI (beta) : https://dev-peer-bench-js.vercel.app/dashboard 4 | 5 | # The problem: 6 | Public benchmark test data sets make AI model performance comparable. But this creates an incentivization for closed source models to cheat the benchmarks by training on test data or creating heuristics that overfit the benchmark test dataset . 
7 | 8 | # Our solution: 9 | A network of independent experts continuously validating models with newly generated private test data and with scoring that can be publicly audited. 10 | 11 | # Addressing Industry-Wide Overfitting of AI Models 12 | 13 | An important piece of context for accelerating AI innovation is the **model overfitting plaguing the AI industry**. 14 | 15 | Public benchmark test data sets make AI model performance comparable. But this creates an incentive, for closed-source models in particular, to game the benchmarks by building heuristics for them or by including solutions to the known test sets in their training data. 16 | 17 | For open-source models, **Dynabench**[^45] tries to solve the problem of overfitting on test datasets with a community of humans **intentionally creating new test data designed to be hard for models**. However, Dynabench only works with open-source models, and it has not seen significant adoption even after being taken over by MLCommons. We believe this lack of traction is due to a lack of incentives for evaluators or AI model owners to participate. 18 | 19 | Requiring recency of test data is another approach, taken by LiveBench[^48], but it is primarily focused on the problem of LLMs accidentally including test data in their training. A malicious actor can still overfit LiveBench within days of the test data being released. And because LiveBench is a single entity, it can be covertly bribed to hand the test data to one AI provider before release. Still, the idea of releasing new test data continuously is something we agree with and will also push for in our standard. 20 | 21 | **Centralized private test data evaluation** is another approach that has been attempted to keep AI companies from gaming benchmark results. One currently active private evaluator is the SEAL LLM Leaderboards[^46] by Scale.ai[^47]. However, this approach lacks the transparency and auditability needed to be fully trustworthy and widely adopted as a standard. 22 | 23 | **Decentralized systems are strongly positioned to solve** all of these issues by providing proper incentives to all parties involved while leveraging blockchains' decentralization and transparency. **Private datasets** can still be a fundamental **part of the validation strategy**.
24 | 25 | # Test Data Quality Control Process 26 | 27 | # Become a Validator 28 | 29 | # Get Your Model Benchmarked 30 | 31 | 32 | [^45]: Link: https://arxiv.org/abs/2104.14337 33 | 34 | [^46]: Link: https://scale.com/leaderboard 35 | 36 | [^47]: Link: https://scale.com/ 37 | [^48]: Link: https://livebench.ai/#/ 38 | 39 | -------------------------------------------------------------------------------- /brunotest/peerBench/bruno.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "1", 3 | "name": "peerBench", 4 | "type": "collection", 5 | "ignore": [ 6 | "node_modules", 7 | ".git" 8 | ] 9 | } -------------------------------------------------------------------------------- /brunotest/peerBench/collection.bru: -------------------------------------------------------------------------------- 1 | auth { 2 | mode: bearer 3 | } 4 | 5 | auth:bearer { 6 | token: eyJhbGciOiJFUzI1NksiLCJ0eXAiOiJKV1QifQ.eyJ1c2VyIjoidmFsaWRhdG9yIiwicm9sZSI6InZhbGlkYXRvciJ9.MEQCIE_8VzeUPhGrp_EgtrUEWLLUZO_zDq-zAxVPiG3FHfEsAiBi73HHTyQDJq-dBJoc9UkZ48WBiZ704r4STqEo-fm9Cw 7 | } 8 | -------------------------------------------------------------------------------- /brunotest/peerBench/submit-score.bru: -------------------------------------------------------------------------------- 1 | meta { 2 | name: submit-score 3 | type: http 4 | seq: 2 5 | } 6 | 7 | post { 8 | url: https://deval-flask.onrender.com/submit-score 9 | body: json 10 | auth: bearer 11 | } 12 | 13 | headers { 14 | Content-Type: application/json 15 | } 16 | 17 | auth:bearer { 18 | token: eyJhbGciOiJFUzI1NksiLCJ0eXAiOiJKV1QifQ.eyJ1c2VyIjoidmFsaWRhdG9yIiwicm9sZSI6InZhbGlkYXRvciJ9.MEQCIE_8VzeUPhGrp_EgtrUEWLLUZO_zDq-zAxVPiG3FHfEsAiBi73HHTyQDJq-dBJoc9UkZ48WBiZ704r4STqEo-fm9Cw 19 | } 20 | 21 | body:json { 22 | { 23 | "test_id": "1c97c740-bba0-4d2e-8459-bf7c761531a2", 24 | "model_id": "7a18d370-41c1-4d9d-8135-93d8fc937317", 25 | "validator_id": "0d845dc3-8f91-4eb8-978f-474d5dc86643", 26 | "score": 93.5, 27 | "metrics": { 28 | "accuracy": 0.94, 29 | "latency_ms": 120 30 | }, 31 | "evaluation_type": "primary", 32 | "hash": "0xabc123...", 33 | "evaluation_timestamp": "2024-03-30T15:00:00Z", 34 | "metadata": { 35 | "Target": 100, 36 | "goals": "return a json object with a sum of 7 and 93" 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /dEval_sim/README.md: -------------------------------------------------------------------------------- 1 | # dEval-validator-sim 2 | 3 | We propose a decentralized evaluation framework for generative AI models leveraging a validator-based approach. Each validator independently formulates evaluation prompts (original submissions) and executes them across a set of generative models, submitting their recorded performance metrics to a shared database. Subsequently, other validators cross-validate these prompts by independently running identical evaluations, with the outcomes used to corroborate or dispute the original results. Evaluations are aggregated through credibility-weighted averages, dynamically adjusting validators' credibility scores based on consistency between original submissions and corresponding cross-validation outcomes. A change in any validator’s credibility score retroactively influences all previous evaluations involving that validator, ensuring continuous refinement of model assessment accuracy. 
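As a quick illustration of how a single round works, the Python sketch below mirrors the aggregation and update rules spelled out in the Mathematical Framework that follows (Steps 2.4-2.7): a credibility-weighted mean of the cross-validation scores, a discrepancy-based credibility update for the originator, and a retroactive recomputation of model scores using the current credibilities. It is a minimal sketch, not the simulation code itself; the constants `L` and `P` and the data containers are illustrative assumptions.

```python
# Illustrative sketch of one evaluation round (see Steps 2.4-2.7 below).
# Constants and container shapes are example assumptions, not the actual simulation code.
from dataclasses import dataclass

C_MIN, C_MAX = 0.01, 1.0   # credibility bounds
L, P = 0.05, 4.0           # learning rate and penalty factor (example values)
EPS = 1e-9

def clamp(x, lo, hi):
    return max(lo, min(x, hi))

@dataclass
class Submission:
    validator: int   # submitter id
    test: str        # test/prompt id
    model: str       # model id
    score: float     # submitted score in [0, 1]
    kind: str        # "original" or "cross"

def weighted_mean(scores, weights):
    # Step 2.4: credibility-weighted mean of cross-validation scores
    num = sum(s * w for s, w in zip(scores, weights))
    return num / max(sum(weights), EPS)

def update_originator_credibility(cred, originator, originals, cross_subs):
    """Steps 2.5-2.6: compare the originator's scores against the weighted mean
    of the cross-validations and nudge the originator's credibility accordingly."""
    models = {s.model for s in originals}
    diffs = []
    for m in models:
        cross = [s for s in cross_subs if s.model == m]
        wmean = weighted_mean([s.score for s in cross],
                              [cred[s.validator] for s in cross])
        orig = next(s.score for s in originals if s.model == m)
        diffs.append(abs(orig - wmean))
    delta = sum(diffs) / len(diffs)      # average absolute discrepancy
    change = L * (1 - P * delta)         # positive if consistent, negative otherwise
    cred[originator] = clamp(cred[originator] + change, C_MIN, C_MAX)
    return cred

def final_model_scores(all_submissions, cred):
    """Step 2.7: retroactively recompute every model's score over the full
    submission history, weighted by the *current* credibilities."""
    totals = {}
    for s in all_submissions:
        num, den = totals.get(s.model, (0.0, 0.0))
        totals[s.model] = (num + s.score * cred[s.validator],
                           den + cred[s.validator])
    return {m: num / max(den, EPS) for m, (num, den) in totals.items()}
```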
4 | 5 | ## Mathematical Framework 6 | 7 | # Mathematical Framework for Decentralized Evaluation Simulation 8 | 9 | This document outlines the mathematical framework describing the operation of the validator simulation. 10 | 11 | **1. Definitions and Notation** 12 | 13 | * **Validators:** Let $V$ be the set of validators, indexed by $i$ or $j$. $V = \{0, 1, ..., N-1\}$, where $N = |V|$ is the total number of validators ($N=8$ in the simulation). 14 | * **Models:** Let $\mathcal{M}$ be the set of generative AI models being evaluated, indexed by $m$. $\mathcal{M} = \{\text{Model}_A, \text{Model}_B, ...\}$. Let $M = |\mathcal{M}|$ be the number of models. 15 | * **Time Steps:** Let $t$ denote the discrete time step or simulation round, $t = 0, 1, 2, ...$. 16 | * **Tests/Prompts:** Let $k$ denote a unique test instance (prompt and evaluation criteria). We assume a new test $k(t)$ is generated at each step $t \ge 1$. 17 | * **Credibility Score:** Let $C_i(t)$ be the credibility score of validator $i$ at the end of step $t$. $C_i(t) \in [C_{min}, C_{max}]$ (e.g., $[0.01, 1.0]$). 18 | * **"True" Score:** Let $T_{k,m}$ be the theoretical "true" performance score of model $m$ on test $k$. $T_{k,m} \in [0, 1]$. This exists only in the simulation. 19 | * **Submitted Score:** Let $S_{i,k,m}$ be the score submitted by validator $i$ for model $m$ on test $k$. $S_{i,k,m} \in [0, 1]$. 20 | * **Submission Type:** Let $\tau_s$ denote the type of a submission $s$, where $\tau_s \in \{\text{original}, \text{cross}\}$. 21 | * **Bad Actor Status:** Let $B_i$ be a boolean indicator, $B_i = 1$ if validator $i$ is a bad actor, $B_i = 0$ otherwise. 22 | * **Bad Actor Bias Function:** Let $\beta(i, m, \tau_s)$ be the bias introduced by validator $i$ for model $m$ during submission type $\tau_s$. 23 | * $\beta(i, m, \text{original}) > 0$ if $B_i=1$ and $m$ is the target model. 24 | * $\beta(i, m, \text{original}) < 0$ if $B_i=1$ and $m$ is the victim model. 25 | * $\beta(i, m, \text{original}) = 0$ if $B_i=0$. 26 | * $\beta(i, m, \text{cross}) = 0$ for all $i, m$ (as per simulation rules). 27 | * **Noise Function:** Let $\eta_{i,k,m}$ be a random noise term (e.g., drawn from $\mathcal{N}(0, \sigma^2)$), representing honest scoring variations. 28 | * **Results Store:** Let $R(t)$ be the set of all submissions recorded up to the end of step $t$. Each submission $s \in R(t)$ contains $(i_s, k_s, m_s, S_s, \tau_s, O_s, t_s)$ representing (submitter ID, test ID, model ID, score, submission type, original validator ID for test $k_s$, submission step). 29 | * **Originating Validator:** Let $O(t) \in V$ be the index of the validator who originates the test $k(t)$ at step $t$. (e.g., $O(t) = (t-1) \pmod N$). 30 | * **Cross-Validating Set:** Let $XVal(t)$ be the set of validators performing cross-validation at step $t$. $XVal(t) = V \setminus \{O(t)\}$. 31 | 32 | **2. Simulation Dynamics** 33 | 34 | * **Initialization (t=0):** 35 | * Set initial credibility $C_i(0) = C_{init}$ for all $i \in V$. 36 | * Initialize the results store $R(0) = \emptyset$. 37 | * Initialize final model scores $F_m(0)$ (e.g., $0.5$) for all $m \in \mathcal{M}$. 38 | 39 | * **Simulation Step $t$ (for $t \ge 1$):** 40 | 41 | * **Step 2.1: Test Generation & Origination** 42 | * Select originator $O(t) \in V$. 43 | * Generate new test $k(t)$. 44 | * Determine "true" scores $\{T_{k(t),m}\}_{m \in \mathcal{M}}$. 
45 | 46 | * **Step 2.2: Score Submission (Original)** 47 | * Validator $O(t)$ calculates scores for all models $m \in \mathcal{M}$: 48 | * $S_{O(t), k(t), m} = \text{clamp}( T_{k(t),m} + \beta(O(t), m, \text{original}) + \eta_{O(t),k(t),m}, 0, 1 )$ 49 | 50 | where $\text{clamp}(x, a, b) = \max(a, \min(x, b))$. 51 | * Add original submissions $s_{orig} = (O(t), k(t), m, S_{O(t),k(t),m}, \text{original}, O(t), t)$ for each $m$ to $R(t-1)$ to form intermediate store $R'(t)$. 52 | 53 | * **Step 2.3: Score Submission (Cross-Validation)** 54 | * For each cross-validator $j \in XVal(t)$: 55 | * Validator $j$ calculates scores for all models $m \in \mathcal{M}$: 56 | * $S_{j, k(t), m} = \text{clamp}( T_{k(t),m} + \beta(j, m, \text{cross}) + \eta_{j,k(t),m}, 0, 1 )$ 57 | (Note: $\beta(j, m, \text{cross}) = 0$). 58 | * Add cross-validation submissions $s_{cross,j} = (j, k(t), m, S_{j,k(t),m}, \text{cross}, O(t), t)$ for each $m$ to $R'(t)$ to form the full results store $R(t)$. 59 | 60 | * **Step 2.4: Calculate Weighted Mean of Cross-Validations** 61 | * For the current test $k(t)$ and each model $m \in \mathcal{M}$: 62 | * Retrieve cross-validation scores $\{S_{j,k(t),m} | j \in XVal(t)\}$ and previous credibilities $\{C_j(t-1) | j \in XVal(t)\}$. 63 | * Calculate the weighted mean $WMean_{k(t),m}$: 64 | * $\text{Numerator} = \sum_{j \in XVal(t)} S_{j,k(t),m} \cdot C_j(t-1)$ 65 | * $\text{Denominator} = \sum_{j \in XVal(t)} C_j(t-1)$ 66 | * $WMean_{k(t),m} = \frac{\text{Numerator}}{\max(\text{Denominator}, \epsilon)}$ 67 | (where $\epsilon$ is a small positive constant, e.g., $10^{-9}$). 68 | 69 | * **Step 2.5: Calculate Discrepancy for Originator** 70 | * Retrieve the original scores $S_{O(t),k(t),m}$ for test $k(t)$. 71 | * Calculate a discrepancy measure, $\Delta_{O(t), k(t)}$. Example: Average Absolute Difference: 72 | * $\Delta_{O(t), k(t)} = \frac{1}{M} \sum_{m \in \mathcal{M}} | S_{O(t),k(t),m} - WMean_{k(t),m} |$ 73 | 74 | * **Step 2.6: Update Credibility Scores** 75 | * Define a credibility update function $f(\Delta, C_{old})$. Example using learning rate $L$ and penalty factor $P$: 76 | * $\text{Change} = L \cdot (1 - P \cdot \Delta)$ 77 | * $C^1_{O(t)}(t) = C_{O(t)}(t-1) + \text{Change}$ 78 | * $C_{O(t)}(t) = \text{clamp}( C^1_{O(t)}(t), C_{min}, C_{max} )$ 79 | 80 | * For all other validators $j \neq O(t)$: 81 | * $C_j(t) = C_j(t-1)$ 82 | 83 | * **Step 2.7: Recalculate Final Model Scores (Retroactive)** 84 | * For each model $m \in \mathcal{M}$: 85 | * Initialize $Num_m = 0$, $Denom_m = 0$. 86 | * Iterate through *all* submissions $s = (i_s, k_s, m_s, S_s, \tau_s, O_s, t_s)$ in the *entire history* $R(t)$. 87 | * If $m_s = m$: 88 | * Retrieve the *current* credibility $C_{i_s}(t)$ for the submitter $i_s$. 89 | * $Num_m = Num_m + S_s \cdot C_{i_s}(t)$ 90 | * $Denom_m = Denom_m + C_{i_s}(t)$ 91 | * Calculate the final score for model $m$ at step $t$: 92 | * $F_m(t) = \frac{Num_m}{\max(Denom_m, \epsilon)}$ 93 | 94 | **3. Goal Demonstration** 95 | 96 | The framework shows that if a bad actor validator $i$ (where $B_i=1$) consistently submits biased scores $S_{i,k,m}$ during its origination steps (Step 2.2), the discrepancy $\Delta_{i, k(t)}$ (Step 2.5) will likely be larger compared to honest validators. This leads to repeated negative credibility updates (Step 2.6), causing $C_i(t)$ to decrease. 
Consequently, in the final score calculation (Step 2.7), the contribution of validator $i$'s submissions ($S_s$ where $i_s = i$) is down-weighted by the low $C_i(t)$, reducing the bad actor's influence on the final model scores $F_m(t)$. 97 | -------------------------------------------------------------------------------- /meeting-notes/20250218/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/meeting-notes/20250218/.DS_Store -------------------------------------------------------------------------------- /meeting-notes/20250218/RobustAiBench_initial_meeting_notes.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/meeting-notes/20250218/RobustAiBench_initial_meeting_notes.docx -------------------------------------------------------------------------------- /meeting-notes/20250218/RobustAiBench_initial_meeting_notes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/meeting-notes/20250218/RobustAiBench_initial_meeting_notes.pdf -------------------------------------------------------------------------------- /meeting-notes/20250218/RobustAiBench_initial_meeting_transcript.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/meeting-notes/20250218/RobustAiBench_initial_meeting_transcript.pdf -------------------------------------------------------------------------------- /peerBenchJS/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/peerBenchJS/.DS_Store -------------------------------------------------------------------------------- /peerBenchJS/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Forest-Protocols 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /peerBenchJS/config.example.json: -------------------------------------------------------------------------------- 1 | { 2 | "tasks": [ 3 | "data/tasks/mmlu-pro/mmlu-pro.parquet", 4 | "data/tasks/bigbench/social_iqa.json" 5 | ], 6 | "models": [ 7 | "near.ai:fireworks::accounts/fireworks/models/deepseek-v3", 8 | "near.ai:fireworks::accounts/fireworks/models/llama-v3p1-8b-instruct", 9 | "openrouter.ai:openai/chatgpt-4o-latest", 10 | "openrouter.ai:google/gemini-2.0-flash-001", 11 | "openrouter.ai:google/gemini-2.0-flash-lite-001", 12 | "openrouter.ai:meta-llama/llama-4-scout", 13 | "openrouter.ai:meta-llama/llama-4-maverick", 14 | "openrouter.ai:mistralai/mistral-small-3.1-24b-instruct" 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /peerBenchJS/config.mmlu.json: -------------------------------------------------------------------------------- 1 | { 2 | "tasks": ["data/tasks/mmlu-pro.parquet"], 3 | "models": ["openrouter.ai:meta-llama/llama-4-scout"] 4 | } 5 | -------------------------------------------------------------------------------- /peerBenchJS/config.mmlu.onlyEngineering.json: -------------------------------------------------------------------------------- 1 | { 2 | "tasks": ["./data/tasks/mmlu-pro.onlyEngineering.json"], 3 | "models": ["openrouter.ai:meta-llama/llama-4-scout"] 4 | } 5 | -------------------------------------------------------------------------------- /peerBenchJS/config.mmlu.onlyHistory.json: -------------------------------------------------------------------------------- 1 | { 2 | "tasks": ["./data/tasks/mmlu-pro.onlyHistory.jsonl"], 3 | "models": [ 4 | "openrouter.ai:google/gemma-3-4b-it", 5 | "openrouter.ai:meta-llama/llama-4-scout", 6 | "openrouter.ai:mistralai/mistral-small-3.1-24b-instruct" 7 | ] 8 | } 9 | -------------------------------------------------------------------------------- /peerBenchJS/config.mmlu.onlyLaw.generated.llama-4-scout-wrong.json: -------------------------------------------------------------------------------- 1 | { 2 | "tasks": ["./data/tasks/mmlu-pro.onlyLaw.generated.llama-4-scout-wrong.json"], 3 | "models": ["openrouter.ai:meta-llama/llama-4-scout"] 4 | } 5 | -------------------------------------------------------------------------------- /peerBenchJS/config.mmlu.onlyLaw.json: -------------------------------------------------------------------------------- 1 | { 2 | "tasks": ["./data/tasks/mmlu-pro.onlyLaw.json"], 3 | "models": ["openrouter.ai:google/gemma-3-4b-it"] 4 | } 5 | -------------------------------------------------------------------------------- /peerBenchJS/data/tasks/mmlu-pro/mmlu-pro.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/peerBenchJS/data/tasks/mmlu-pro/mmlu-pro.parquet -------------------------------------------------------------------------------- /peerBenchJS/eslint.config.js: -------------------------------------------------------------------------------- 1 | import eslint from "@eslint/js"; 2 | import tseslint from "typescript-eslint"; 3 | 4 | export default [ 5 | { 6 | ignores: ["dist/**/*.js"], 7 | }, 8 | ...tseslint.config({ 9 | extends: [eslint.configs.recommended, tseslint.configs.recommended], 10 | files: ["src/**/*.ts"], 11 | 12 | rules: { 13 | "@typescript-eslint/no-explicit-any": "off", 14 | 
"@typescript-eslint/no-empty-object-type": "off", 15 | }, 16 | }), 17 | ]; 18 | -------------------------------------------------------------------------------- /peerBenchJS/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "peerbench", 3 | "version": "1.0.0", 4 | "description": "Peer Bench is a decentralized AI evaluation framework", 5 | "keywords": [ 6 | "ai", 7 | "evaluation", 8 | "validation", 9 | "benchmark" 10 | ], 11 | "homepage": "https://github.com/Forest-Protocols/peer-bench#readme", 12 | "bugs": { 13 | "url": "https://github.com/Forest-Protocols/peer-bench/issues" 14 | }, 15 | "repository": { 16 | "type": "git", 17 | "url": "git+https://github.com/Forest-Protocols/peer-bench.git" 18 | }, 19 | "license": "MIT", 20 | "author": "Forest AI Development Team", 21 | "type": "module", 22 | "main": "dist/index.js", 23 | "scripts": { 24 | "lint": "tsc && eslint", 25 | "dev": "tsx src/index.ts", 26 | "watch": "tsup --silent --watch --onSuccess 'tsc-alias -p tsconfig.json -f && node dist/index.js'", 27 | "start": "node dist/index.js", 28 | "build": "tsup --onSuccess 'tsc-alias -p tsconfig.json -f'", 29 | "clean": "rm -rf dist", 30 | "clean:outputs": "rm -rf data/output/*", 31 | "clean:logs": "rm -rf data/logs/*" 32 | }, 33 | "dependencies": { 34 | "@dotenvx/dotenvx": "^1.39.1", 35 | "@inquirer/prompts": "^7.4.1", 36 | "ansis": "^3.17.0", 37 | "axios": "^1.8.4", 38 | "commander": "^13.1.0", 39 | "csv": "^6.3.11", 40 | "glob": "^11.0.2", 41 | "hyparquet": "^1.11.0", 42 | "multiformats": "^13.3.2", 43 | "openai": "^4.92.1", 44 | "table": "^6.9.0", 45 | "uuid": "^11.1.0", 46 | "viem": "^2.26.2", 47 | "winston": "^3.17.0", 48 | "yaml": "^2.7.1", 49 | "zod": "^3.24.2" 50 | }, 51 | "devDependencies": { 52 | "@eslint/js": "^9.24.0", 53 | "@types/node": "^22.14.0", 54 | "eslint": "^9.24.0", 55 | "tsc-alias": "^1.8.13", 56 | "tsup": "^8.4.0", 57 | "tsx": "^4.19.3", 58 | "typescript": "^5.8.3", 59 | "typescript-eslint": "^8.29.0" 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /peerBenchJS/peerbench: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | npm run dev --silent -- $* -------------------------------------------------------------------------------- /peerBenchJS/src/base/provider.ts: -------------------------------------------------------------------------------- 1 | import { parseEnvVariables } from "@/config"; 2 | import { logger } from "@/core/logger"; 3 | import { MaybePromise, ModelResponse } from "@/types"; 4 | import winston from "winston"; 5 | import { z } from "zod"; 6 | 7 | /** 8 | * Base class for Providers 9 | */ 10 | export abstract class AbstractProvider< 11 | T extends Record = Record, 12 | > { 13 | readonly name: string; 14 | 15 | logger: winston.Logger; 16 | env: { [K in keyof T]: z.infer }; 17 | 18 | /** 19 | * Initialize a new Provider 20 | * @param options 21 | */ 22 | constructor(options: { 23 | /** 24 | * Name of the provider 25 | */ 26 | name: string; 27 | 28 | env?: T; 29 | }) { 30 | this.name = options.name; 31 | this.env = {} as T; 32 | 33 | try { 34 | const capitalizedName = options.name.replace(".", "_").toUpperCase(); 35 | 36 | if (options.env !== undefined) { 37 | const env: Record = {}; 38 | const originalKeys: Record = {}; 39 | 40 | for (const [key, schema] of Object.entries(options.env)) { 41 | const envKey = `PB_${capitalizedName}_${key}`; 42 | 43 | originalKeys[envKey] = key; 44 | 
env[envKey] = schema; 45 | } 46 | 47 | const parsedEnv = parseEnvVariables(env); 48 | 49 | for (const [key, value] of Object.entries(parsedEnv)) { 50 | this.env[originalKeys[key] as keyof T] = value; 51 | } 52 | } 53 | 54 | this.logger = logger.child({ 55 | context: `Provider(${this.name})`, 56 | }); 57 | } catch (err: any) { 58 | throw new Error(`${this.name}: ${err?.message || err}`, { 59 | cause: err, 60 | }); 61 | } 62 | } 63 | 64 | /** 65 | * Decentralized identifier of the Provider 66 | */ 67 | get did() { 68 | return `did:prov:${this.name.toLowerCase()}`; 69 | } 70 | 71 | /** 72 | * Executes the given prompt and returns the response 73 | * @param prompt 74 | */ 75 | abstract forward( 76 | prompt: string, 77 | model: string, 78 | system: string 79 | ): Promise; 80 | 81 | abstract parseModelIdentifier(identifier: string): MaybePromise<{ 82 | modelName: string; 83 | modelOwner: string; 84 | subProvider?: string; 85 | }>; 86 | } 87 | -------------------------------------------------------------------------------- /peerBenchJS/src/base/task-format.ts: -------------------------------------------------------------------------------- 1 | import { MaybePromise, Task } from "@/types"; 2 | 3 | /** 4 | * Abstract class that formats needs to implement 5 | */ 6 | export abstract class AbstractTaskFormat { 7 | /** 8 | * Name of the format 9 | */ 10 | abstract name: string; 11 | 12 | /** 13 | * Read and parses the given file as a Task object 14 | */ 15 | abstract parseFromFile(content: string): MaybePromise; 16 | 17 | /** 18 | * Returns `true` if the given content recognized by this class 19 | */ 20 | abstract recognize(content: any): MaybePromise; 21 | 22 | /** 23 | * Creates a raw object that can be parsed by the `targetFOrmat`. 24 | * 25 | * @param task The task to convert 26 | * @param targetFormat The format type to convert to 27 | * @returns The converted data in a format that the target format can parse 28 | */ 29 | abstract convertTo(task: Task, targetFormat: string): MaybePromise; 30 | } 31 | -------------------------------------------------------------------------------- /peerBenchJS/src/base/uploader.ts: -------------------------------------------------------------------------------- 1 | import { logger } from "@/core/logger"; 2 | 3 | /** 4 | * Base class for uploader classes 5 | */ 6 | export abstract class AbstractUploader { 7 | name = "Abstract"; 8 | logger = logger.child({ context: `Uploader(${this.constructor.name})` }); 9 | 10 | abstract init(): Promise; 11 | 12 | abstract uploadPromptResponses( 13 | filePath: string, 14 | options?: UploadOptions 15 | ): Promise; 16 | abstract uploadScores( 17 | filePath: string, 18 | options?: UploadOptions 19 | ): Promise; 20 | 21 | // TODO: upload aggregation method 22 | } 23 | 24 | export type UploadOptions = { 25 | batchSize?: number; 26 | }; 27 | -------------------------------------------------------------------------------- /peerBenchJS/src/commands/std.ts: -------------------------------------------------------------------------------- 1 | import { config } from "@/config"; 2 | import { logger } from "@/core/logger"; 3 | import { checkValidationError } from "@/core/utils"; 4 | import { program } from "@/core/program"; 5 | import { yellow } from "ansis"; 6 | import { z } from "zod"; 7 | import { convertTaskFormat } from "@/core/std"; 8 | import { taskFormats } from "@/core/format"; 9 | 10 | const taskFormatNames = taskFormats.map((t) => t.name); 11 | 12 | program 13 | .command("std") 14 | .description("Standardize data format between 
different schemas") 15 | .requiredOption( 16 | "-s, --source ", 17 | "Source file with data to be standardized" 18 | ) 19 | .option( 20 | "-t, --to ", 21 | `Target schema type: ${taskFormatNames.map((t) => `"${t}"`).join(", ")}`, 22 | "medqa" 23 | ) 24 | .option( 25 | "-o, --output ", 26 | "Output file path (default: derived from source filename)" 27 | ) 28 | .action( 29 | async (rawOptions: { 30 | source: string; 31 | from: string; 32 | to: string; 33 | output?: string; 34 | }) => { 35 | logger.debug(`Validator DID ${yellow.bold(config.VALIDATOR_DID)}`); 36 | const options = checkValidationError( 37 | z 38 | .object({ 39 | source: z.string(), 40 | to: z 41 | .string() 42 | .default("medqa") 43 | .transform((value, ctx) => { 44 | if (!taskFormatNames.includes(value)) { 45 | ctx.addIssue({ 46 | code: z.ZodIssueCode.custom, 47 | message: "Invalid target schema type", 48 | }); 49 | return z.NEVER; 50 | } 51 | 52 | return value; 53 | }), 54 | output: z.string().optional(), 55 | }) 56 | .safeParse(rawOptions) 57 | ); 58 | 59 | await convertTaskFormat({ 60 | sourceTaskFile: options.source, 61 | output: options.output, 62 | targetFormat: options.to, 63 | }); 64 | } 65 | ) 66 | .allowUnknownOption(true); 67 | -------------------------------------------------------------------------------- /peerBenchJS/src/commands/upload.ts: -------------------------------------------------------------------------------- 1 | import { logger } from "@/core/logger"; 2 | import { getUploader, uploaders } from "@/core/uploaders"; 3 | import { checkValidationError, saveJobLog } from "@/core/utils"; 4 | import { program } from "@/core/program"; 5 | import { z } from "zod"; 6 | import { v7 as uuidv7 } from "uuid"; 7 | 8 | program 9 | .command("upload") 10 | .description("Uploads the given files to a remote server") 11 | .requiredOption( 12 | "-t, --target ", 13 | `Points the target remote server name. 
Available targets servers ${uploaders 14 | .map((u) => u.name) 15 | .join(", ")}` 16 | ) 17 | .option( 18 | "-s, --scores ", 19 | "The score files that is going to be uploaded" 20 | ) 21 | .option( 22 | "-r, --responses ", 23 | "The response files that is going to be uploaded" 24 | ) 25 | .action( 26 | async (rawOptions: { 27 | target: string; 28 | scores?: string[]; 29 | responses?: string[]; 30 | }) => { 31 | const options = checkValidationError( 32 | z 33 | .object({ 34 | target: z.string(), 35 | task: z.string().default("default"), 36 | scores: z.array(z.string()).default([]), 37 | responses: z.array(z.string()).default([]), 38 | }) 39 | .safeParse(rawOptions) 40 | ); 41 | 42 | const startedAt = Date.now(); 43 | const uploader = getUploader(options.target); 44 | 45 | if (uploader === undefined) { 46 | throw new Error(`Target "${options.target}" is not available`); 47 | } 48 | 49 | if (options.responses.length > 0) { 50 | await uploader.init(); 51 | 52 | logger.info(`Uploading given responses to ${options.target}`); 53 | await Promise.all( 54 | options.responses.map((path) => 55 | uploader.uploadPromptResponses(path, { batchSize: 20 }) 56 | ) 57 | ); 58 | } 59 | 60 | if (options.scores.length > 0) { 61 | await uploader.init(); 62 | 63 | logger.info(`Uploading given scores to ${options.target}`); 64 | await Promise.all( 65 | options.scores.map((path) => 66 | uploader.uploadScores(path, { batchSize: 20 }) 67 | ) 68 | ); 69 | } 70 | 71 | await saveJobLog( 72 | { 73 | uuid: uuidv7(), 74 | jobType: "upload", 75 | startedAt: +startedAt, 76 | completedAt: Date.now(), 77 | scoreFiles: options.scores, 78 | responseFiles: options.responses, 79 | }, 80 | "upload", 81 | startedAt 82 | ); 83 | } 84 | ) 85 | .allowUnknownOption(true); 86 | -------------------------------------------------------------------------------- /peerBenchJS/src/config.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | import { red } from "ansis"; 3 | import dotenv from "@dotenvx/dotenvx"; 4 | import { join } from "path"; 5 | import { mkdirSync } from "fs"; 6 | import { LogLevels, NodeEnvs } from "./types"; 7 | import { Address, privateKeyToAccount } from "viem/accounts"; 8 | 9 | dotenv.config({ ignore: ["MISSING_ENV_FILE"], logLevel: "blank", quiet: true }); 10 | 11 | /** 12 | * Parses the environment variables based on the given fields. 13 | * Exits with code 1 if the parse is failed 14 | * @returns Parsed environment variables 15 | */ 16 | export function parseEnvVariables>( 17 | fields: T 18 | ): z.infer> { 19 | const environmentSchema = z.object(fields); 20 | const validation = environmentSchema.safeParse(process.env, {}); 21 | 22 | if (validation.error) { 23 | const error = validation.error.errors[0]; 24 | const path = error.path.length > 0 ? 
error.path.join(".") + ": " : ""; 25 | console.error( 26 | red(`Error while parsing environment variables: ${path}${error.message}`) 27 | ); 28 | process.exit(1); 29 | } 30 | 31 | return validation.data; 32 | } 33 | 34 | // Parse variables 35 | const env = parseEnvVariables({ 36 | NODE_ENV: z.enum(NodeEnvs).default("dev"), 37 | LOG_LEVEL: z.enum(LogLevels).default("debug"), 38 | COOL_DOWN_INTERVAL: z.coerce.number().default(2000), 39 | PRIVATE_KEY: z.string().nonempty(), 40 | }); 41 | 42 | const VALIDATOR_ACCOUNT = privateKeyToAccount(env.PRIVATE_KEY as Address); 43 | export const config = { 44 | ...env, 45 | OUTPUT_DIR: join(process.cwd(), "data", "output"), 46 | DATA_DIR: join(process.cwd(), "data"), 47 | VALIDATOR_ADDRESS: VALIDATOR_ACCOUNT.address, 48 | VALIDATOR_DID: `did:val:${VALIDATOR_ACCOUNT.address}`, 49 | VALIDATOR_ACCOUNT, 50 | }; 51 | 52 | // This also makes `data` directory 53 | mkdirSync(config.OUTPUT_DIR, { recursive: true }); 54 | -------------------------------------------------------------------------------- /peerBenchJS/src/core/aggregate.ts: -------------------------------------------------------------------------------- 1 | import { 2 | AggregationResult as AggregationResults, 3 | PromptScoreSchema, 4 | } from "@/types"; 5 | import { checkValidationError, readFile } from "./utils"; 6 | import { logger } from "./logger"; 7 | import { z } from "zod"; 8 | 9 | export async function aggregate( 10 | scoreFilePaths: string[] 11 | ): Promise { 12 | const scores = scoreFilePaths 13 | .map((path) => { 14 | try { 15 | // TODO: Make it possible to read CSV files 16 | const content = readFile(path); 17 | const arraySchema = z.array(PromptScoreSchema); 18 | const json = JSON.parse(content); 19 | 20 | return checkValidationError(arraySchema.safeParse(json)); 21 | } catch (err) { 22 | logger.warning(`Score file ${path} couldn't read: ${err}`); 23 | } 24 | }) 25 | .filter((s) => s !== undefined) 26 | .flat(); 27 | 28 | if (scores.length === 0) { 29 | throw new Error(`No scores read to aggregate`); 30 | } 31 | 32 | // We assume that all of the scores come from the same task 33 | // otherwise they wouldn't be comparable so we can simply find 34 | // the taskDID by just looking at any of the score from the array. 
35 | const taskDID = scores[0].taskDID; 36 | const results: Record< 37 | string, 38 | { 39 | score: number; 40 | latency: number; 41 | responseCount: number; 42 | wrongAnswers: number; 43 | modelDID: string; 44 | providerDID: string; 45 | runIds: Set; 46 | sourcePromptDatasetCIDs: Set; 47 | sourceFileNames: Set; 48 | } 49 | > = {}; 50 | 51 | const result: AggregationResults = []; 52 | 53 | for (const score of scores) { 54 | const key = `${score.providerDID}:${score.modelDID}`; 55 | 56 | if (!results[key]) { 57 | results[key] = { 58 | providerDID: score.providerDID, 59 | modelDID: score.modelDID, 60 | score: 0, 61 | latency: 0, 62 | responseCount: 0, 63 | wrongAnswers: 0, 64 | runIds: new Set(), 65 | sourcePromptDatasetCIDs: new Set(), 66 | sourceFileNames: new Set(), 67 | }; 68 | } 69 | 70 | results[key].score += score.score; 71 | results[key].latency += score.repliedAt - score.promptedAt; 72 | results[key].responseCount++; 73 | 74 | // Add runId to the set (this will handle duplicates automatically) 75 | if (score.runId) { 76 | results[key].runIds.add(score.runId); 77 | } 78 | 79 | // Add sourcePromptDatasetCID to the set 80 | if (score.sourcePromptDatasetCID) { 81 | results[key].sourcePromptDatasetCIDs.add(score.sourcePromptDatasetCID); 82 | } 83 | 84 | // Add sourceFileName to the set 85 | if (score.sourceFileName) { 86 | results[key].sourceFileNames.add(score.sourceFileName); 87 | } 88 | 89 | if (score.score === 0) { 90 | results[key].wrongAnswers++; 91 | } 92 | } 93 | 94 | for (const [, values] of Object.entries(results)) { 95 | result.push({ 96 | providerDID: values.providerDID, 97 | modelDID: values.modelDID, 98 | taskDID, 99 | avgLatency: values.latency / values.responseCount / 1000, 100 | avgScore: parseFloat((values.score / values.responseCount).toFixed(2)), 101 | missingAnswers: Math.abs( 102 | values.responseCount - values.score - values.wrongAnswers 103 | ), 104 | score: values.score, 105 | totalResponse: values.responseCount, 106 | wrongAnswers: values.wrongAnswers, 107 | score_runIds: Array.from(values.runIds), 108 | sourcePromptDatasetCIDs: Array.from(values.sourcePromptDatasetCIDs), 109 | sourceFileNames: Array.from(values.sourceFileNames), 110 | }); 111 | } 112 | 113 | result.sort((a, b) => { 114 | const order = [ 115 | [b.score, a.score], 116 | [a.avgScore, b.avgScore], 117 | [a.avgLatency, b.avgLatency], 118 | [b.totalResponse, a.totalResponse], 119 | ]; 120 | 121 | for (const values of order) { 122 | if (values[0] !== values[1]) { 123 | return values[0] - values[1]; 124 | } 125 | } 126 | 127 | const lastOrderColumn = order[order.length - 1]; 128 | return lastOrderColumn[0] - lastOrderColumn[1]; 129 | }); 130 | 131 | return result; 132 | } 133 | -------------------------------------------------------------------------------- /peerBenchJS/src/core/format.ts: -------------------------------------------------------------------------------- 1 | import { BigBenchTaskFormat } from "@/formats/bigbench"; 2 | import { logger } from "./logger"; 3 | import { parse as parsePath } from "path"; 4 | import { MMLUProTaskFormat } from "@/formats/mmlu-pro"; 5 | import { MedQATaskFormat } from "@/formats/medQA"; 6 | import { parseTaskDID } from "./parser"; 7 | import { statSync } from "fs"; 8 | 9 | /** 10 | * Add all the possible task file formats 11 | */ 12 | export const taskFormats = [ 13 | new BigBenchTaskFormat(), 14 | new MMLUProTaskFormat(), 15 | new MedQATaskFormat(), 16 | ]; 17 | 18 | export function getTaskFormat(name: string) { 19 | for (const tp of taskFormats) { 20 | if 
(tp.name === name) { 21 | return tp; 22 | } 23 | } 24 | throw new Error(`No task processor found called "${name}"`); 25 | } 26 | 27 | /** 28 | * Loads and parses the given task file with the first possible reader from readers list. 29 | */ 30 | export async function readTask(path: string) { 31 | if (!statSync(path, { throwIfNoEntry: false })?.isFile()) { 32 | throw new Error(`The task file doesn't exist: ${path}`); 33 | } 34 | 35 | for (const taskFormat of taskFormats) { 36 | try { 37 | const task = await taskFormat.parseFromFile(path); 38 | const taskName = parseTaskDID(task.did); 39 | 40 | // If the task name doesn't exist, just use the file name (without extension) as its name 41 | if (taskName === "") { 42 | task.did = `did:task:${parsePath(path).name}`; 43 | } 44 | 45 | return { 46 | formatName: taskFormat.name, 47 | task, 48 | }; 49 | } catch (err) { 50 | logger.debug( 51 | `Task format "${taskFormat.name}" didn't work for the task file ${path}: ${err}` 52 | ); 53 | } 54 | } 55 | 56 | throw new Error(`No valid task format found for the given file: ${path}`); 57 | } 58 | -------------------------------------------------------------------------------- /peerBenchJS/src/core/logger.ts: -------------------------------------------------------------------------------- 1 | import { createLogger, format, transports } from "winston"; 2 | import { config } from "../config"; 3 | import * as ansis from "ansis"; 4 | 5 | type ColorName = "red" | "yellow" | "green" | "magenta" | "cyan"; 6 | type LogLevel = "error" | "warning" | "info" | "debug"; 7 | 8 | const colors = { 9 | error: "red", 10 | warning: "yellow", 11 | info: "green", 12 | debug: "cyan", 13 | }; 14 | 15 | export const logger = createLogger({ 16 | level: config.LOG_LEVEL, 17 | levels: { 18 | error: 0, 19 | warning: 1, 20 | info: 2, 21 | debug: 3, 22 | }, 23 | transports: [ 24 | new transports.Console({ 25 | format: format.combine( 26 | format.prettyPrint(), 27 | format.splat(), 28 | format.timestamp({ 29 | format: "YYYY/MM/DD - HH:mm:ss", 30 | }), 31 | format.printf((info) => { 32 | const color = colors[info.level as LogLevel]; 33 | const levelColor = ansis[color as ColorName]; 34 | const context = (info.context as string) || ""; 35 | const formatted = [ 36 | `${info.timestamp}`, 37 | levelColor.bold(info.level.toUpperCase()), 38 | levelColor.bold(`[${context || "Main"}]`).padEnd(25), 39 | config.NODE_ENV == "dev" && info.stack 40 | ? levelColor( 41 | `${info.message}${(info.stack as string[]).join("\n")}` 42 | ) 43 | : levelColor(info.message), 44 | ]; 45 | 46 | return formatted.filter((f) => f).join(" "); 47 | }) 48 | ), 49 | }), 50 | ], 51 | }); 52 | -------------------------------------------------------------------------------- /peerBenchJS/src/core/parser.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | import { logger } from "./logger"; 3 | 4 | /** 5 | * Tries to parse the given string as JSON. 6 | * Returns undefined if it is not a valid JSON entity. 
7 | */ 8 | export function tryParseJson(content: string): T | undefined { 9 | try { 10 | return JSON.parse(content); 11 | } catch { 12 | // Invalid JSON 13 | } 14 | } 15 | 16 | /** 17 | * Parses provider and model config which follow the pattern below: 18 | * : 19 | * @returns Provider name and model identifier 20 | */ 21 | export function parseProviderConfig(identifier: string) { 22 | const regex = /^([^:]+):(.+)$/; 23 | const match = identifier.match(regex); 24 | if (!match) { 25 | logger.warning(`Invalid provider config: ${identifier}`); 26 | return; 27 | } 28 | 29 | const [, name, modelIdentifier] = match; 30 | return { 31 | providerName: name, 32 | modelIdentifier, 33 | }; 34 | } 35 | 36 | /** 37 | * Parses JSONL formatted string into an array 38 | * @returns An array of parsed JSON lines 39 | */ 40 | export function parseJSONL(str: string): T[] { 41 | return str 42 | .split("\n") // Split per line 43 | .filter((line) => line.trim() !== "") // Filter empty lines 44 | .map((line) => tryParseJson(line)) // Parse line (parse as undefined if it is invalid) 45 | .filter((obj) => obj !== undefined); // Filter invalid lines 46 | } 47 | 48 | /** 49 | * Extracts the first error message (if there is any) 50 | * from a Zod safe parse result and format it. 51 | * @param safeParseReturn 52 | * @param path Path of the parsing object. It will be used to indicate the invalid field if the info is not available in the validation error. 53 | */ 54 | export function parseValidationError( 55 | safeParseReturn: z.SafeParseReturnType, 56 | path?: string 57 | ) { 58 | path ??= ""; 59 | 60 | if (safeParseReturn?.error) { 61 | const firstError = safeParseReturn.error.errors[0]; 62 | 63 | if (path) { 64 | path = `${path}: `; 65 | } 66 | 67 | // Include path if there is 68 | path = 69 | firstError.path.length > 0 ? 
`"${firstError.path.join(".")}": ` : path; 70 | return `${path}${firstError.message}`; 71 | } 72 | } 73 | 74 | /** 75 | * Parses provider name from a DID provider definition 76 | */ 77 | export function parseProviderDID(did: string) { 78 | return did.split(":")[2]; 79 | } 80 | 81 | /** 82 | * Parses task name from a DID definition (yes it is the same thing as `parseProviderDID`) 83 | */ 84 | export function parseTaskDID(did: string) { 85 | return did.split(":")[2]; 86 | } 87 | 88 | /** 89 | * Parses the model name (including owner name) from a DID model definition 90 | */ 91 | export function parseModelDID(did: string) { 92 | const parts = did.split(":").slice(2); 93 | 94 | // Model name includes ':' character 95 | if (parts.length > 2) { 96 | return parts.join(":"); 97 | } 98 | return parts.join(""); 99 | } 100 | -------------------------------------------------------------------------------- /peerBenchJS/src/core/program.ts: -------------------------------------------------------------------------------- 1 | import { Command } from "commander"; 2 | 3 | export const program = new Command("peerbench") 4 | .allowUnknownOption(true) 5 | .configureHelp({ 6 | showGlobalOptions: true, 7 | }); 8 | 9 | program.configureHelp({ 10 | optionTerm(option) { 11 | return option.flags; 12 | }, 13 | subcommandTerm(cmd) { 14 | return cmd.name(); 15 | }, 16 | commandUsage(cmd) { 17 | const usage: string[] = []; 18 | for (let parent = cmd.parent; parent; parent = parent.parent) { 19 | usage.push(parent.name()); 20 | } 21 | usage.reverse(); 22 | return `${usage.join(" ")} ${cmd.name()} ${cmd.usage()}`; 23 | }, 24 | }); 25 | -------------------------------------------------------------------------------- /peerBenchJS/src/core/prompt.ts: -------------------------------------------------------------------------------- 1 | import { readTask } from "./format"; 2 | import { generateCID, readableTime } from "./utils"; 3 | import { config } from "@/config"; 4 | import { AbstractProvider } from "@/base/provider"; 5 | import { 6 | MaybePromise, 7 | Prompt, 8 | PromptOptions, 9 | PromptResponse, 10 | Task, 11 | } from "@/types"; 12 | import { v7 as uuidv7 } from "uuid"; 13 | import { parseProviderConfig } from "./parser"; 14 | import { logger } from "./logger"; 15 | import { blue, yellow } from "ansis"; 16 | import { getProvider } from "./providers"; 17 | import { basename } from "path"; 18 | import { calculateSHA256 } from "./std"; 19 | 20 | /** 21 | * Sends the prompts from the given task files to the given Providers and 22 | * collects the responses. 23 | * @param identifiers Provider and model identifiers. Should be in `providerName:modelOwner/modelName` format 24 | * @param taskPaths Path of the task files in the local file system 25 | */ 26 | export async function prompt( 27 | identifiers: string[], 28 | taskPaths: string[], 29 | options?: { 30 | /** 31 | * Maximum amount of prompt that will be used from the beginning of the given tasks. 32 | * Uses all tasks if undefined 33 | * @default undefined 34 | */ 35 | maxPrompt?: number; 36 | /** 37 | * Called whenever a response is received. 
38 | */ 39 | onResponseReceived?: (response: PromptResponse) => MaybePromise; 40 | } 41 | ) { 42 | const promises: Promise[] = []; 43 | 44 | // Read all the tasks and make them usable 45 | const tasks = await Promise.all( 46 | taskPaths.map(async (taskPath) => (await readTask(taskPath)).task) 47 | ); 48 | 49 | // Total amount of prompt request to be sent 50 | let totalPromptCount = 51 | tasks.reduce((acc, t) => acc + t.prompts.length, 0) * identifiers.length; 52 | let responseCount = 0; 53 | 54 | if (options?.maxPrompt) { 55 | totalPromptCount = options.maxPrompt * tasks.length * identifiers.length; 56 | logger.warning( 57 | `Only ${options.maxPrompt} prompt will be used from each given task file` 58 | ); 59 | } 60 | 61 | for (let taskIndex = 0; taskIndex < tasks.length; taskIndex++) { 62 | const task = tasks[taskIndex]; 63 | const taskPath = taskPaths[taskIndex]; 64 | const taskFileName = basename(taskPath); // Ensure we always have a filename 65 | const runId = uuidv7(); // New evaluation ID per given task 66 | 67 | logger.debug( 68 | `Found ${task.prompts.length} prompt in ${yellow.bold( 69 | task.did 70 | )} (${yellow.bold(taskFileName)})` 71 | ); 72 | 73 | for (const identifier of identifiers) { 74 | const info = parseProviderConfig(identifier); 75 | if (info === undefined) { 76 | continue; 77 | } 78 | 79 | const provider = getProvider(info.providerName); 80 | if (provider === undefined) { 81 | continue; 82 | } 83 | 84 | if (options?.maxPrompt) { 85 | task.prompts = task.prompts.slice(0, options.maxPrompt); 86 | } 87 | 88 | promises.push( 89 | execPrompts(provider, task, info.modelIdentifier, runId, (response) => { 90 | responseCount++; 91 | logger.info( 92 | `${responseCount} prompt done (from ${yellow.bold( 93 | taskFileName 94 | )}), ${totalPromptCount - responseCount} prompt left` 95 | ); 96 | options?.onResponseReceived?.(response); 97 | }) 98 | ); 99 | } 100 | } 101 | 102 | await Promise.all(promises); 103 | logger.info(`Prompt phase is done`); 104 | } 105 | 106 | async function execPrompt( 107 | task: Task, 108 | promptNumber: number, 109 | provider: AbstractProvider, 110 | prompt: Prompt, 111 | model: string, 112 | runId: string 113 | ) { 114 | const providerLogger = provider.logger.child({ 115 | context: `Provider(${provider.name}:${model})`, 116 | }); 117 | const promptIdentifier = `${promptNumber} from ${yellow.bold( 118 | task.did 119 | )} (${yellow.bold(task.fileName)})`; 120 | 121 | try { 122 | const input = prompt.other.stdFullPromptText; 123 | const result = await provider.forward( 124 | input, 125 | model, 126 | // TODO: Change prompt based on the evaluation type 127 | "You are an knowledge expert, you are supposed to answer the multi-choice question to derive your final answer as `The answer is ...` without any other additional text or explanation." 
128 | ); 129 | const elapsedSeconds = 130 | (result.completedAt.getTime() - result.startedAt.getTime()) / 1000; 131 | const response = result.response.trim(); 132 | 133 | providerLogger.debug(`Result of prompt ${promptIdentifier}: ${response}`); 134 | providerLogger.info( 135 | `Prompt ${promptIdentifier} is completed in ${blue.bold( 136 | readableTime(elapsedSeconds) 137 | )}` 138 | ); 139 | 140 | const promptCID = (await generateCID(input)).toString(); 141 | const responseCID = (await generateCID(response)).toString(); 142 | const promptResponse: PromptResponse = { 143 | modelDID: `did:model:${model}`, 144 | validatorDID: config.VALIDATOR_DID, 145 | providerDID: provider.did, 146 | taskDID: task.did, 147 | 148 | runId, 149 | sourcePromptDatasetCID: task.cid, 150 | sourceFileName: task.fileName, 151 | 152 | correctResponse: prompt.answer_idx, 153 | promptCID, 154 | responseCID, 155 | 156 | promptData: input, 157 | responseData: response, 158 | 159 | promptedAt: result.startedAt.getTime(), 160 | repliedAt: result.completedAt.getTime(), 161 | 162 | questionUUID: prompt.other?.stdQuestionUUID || uuidv7(), 163 | questionHash: calculateSHA256(prompt.question), 164 | 165 | fullPromptData: input, 166 | fullPromptHash: calculateSHA256(input), 167 | }; 168 | return promptResponse; 169 | } catch (err) { 170 | providerLogger.error(`Error on prompt ${promptIdentifier}: ${err}`); 171 | } 172 | } 173 | 174 | async function execPrompts( 175 | provider: AbstractProvider, 176 | task: Task, 177 | model: string, 178 | evaluationRunId: string, 179 | onResponseReceived?: (response: PromptResponse) => MaybePromise 180 | ) { 181 | const promises: Promise[] = []; 182 | for (let i = 0; i < task.prompts.length; i++) { 183 | const prompt = task.prompts[i]; 184 | 185 | promises.push( 186 | execPrompt(task, i + 1, provider, prompt, model, evaluationRunId).then( 187 | (response) => { 188 | if (response) { 189 | onResponseReceived?.(response); 190 | } 191 | } 192 | ) 193 | ); 194 | } 195 | await Promise.all(promises); 196 | } 197 | 198 | /** 199 | * Prepares the whole prompt that is going to be asked to the model 200 | */ 201 | export function preparePrompt(question: string, options: PromptOptions) { 202 | // Append answers to the result 203 | let result = `${question}\n\n`; 204 | for (const [letter, answer] of Object.entries(options)) { 205 | result += `${letter}: ${answer}\n`; 206 | } 207 | 208 | return result; 209 | } 210 | -------------------------------------------------------------------------------- /peerBenchJS/src/core/providers.ts: -------------------------------------------------------------------------------- 1 | import { AbstractProvider } from "@/base/provider"; 2 | import { logger } from "./logger"; 3 | import { OpenRouterProvider } from "@/providers/openrouter"; 4 | import { NearAIProvider } from "@/providers/nearai"; 5 | 6 | function addProvider(instantiate: () => AbstractProvider) { 7 | try { 8 | providers.push(instantiate()); 9 | } catch (err: any) { 10 | logger.warning(`Couldn't add provider: ${err?.message}`); 11 | } 12 | } 13 | 14 | export const providers: AbstractProvider[] = []; 15 | 16 | addProvider(() => new OpenRouterProvider()); 17 | addProvider(() => new NearAIProvider()); 18 | 19 | export function getProvider(name: string) { 20 | const provider = providers.find((p) => p.name === name); 21 | if (provider === undefined) { 22 | logger.warning(`Provider not found: ${name}`); 23 | return; 24 | } 25 | 26 | return provider; 27 | } 28 | 
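For orientation, here is a minimal usage sketch (not part of the repository) showing how the pieces above fit together: `getProvider` resolves a registered provider by name, `preparePrompt` flattens a question and its lettered options into a single prompt string, and the provider's `forward` method sends it. The question, options, and model identifier are hypothetical examples, and the call assumes the OpenRouter provider's key is configured so its registration succeeds.

```ts
// Sketch only: ties together getProvider, preparePrompt and provider.forward as used above.
// The question, options, and model identifier are hypothetical examples.
import { getProvider } from "@/core/providers";
import { preparePrompt } from "@/core/prompt";

async function askOnce() {
  const provider = getProvider("openrouter.ai"); // undefined (plus a warning) if not registered
  if (!provider) return;

  const input = preparePrompt("What is 2 + 2?", { A: "3", B: "4", C: "5" });
  const result = await provider.forward(
    input,
    "openai/gpt-4o", // hypothetical modelOwner/modelName identifier
    "Answer with only the letter of the correct option."
  );

  const elapsedMs = result.completedAt.getTime() - result.startedAt.getTime();
  console.log(`${result.response} (${elapsedMs} ms)`);
}
```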
-------------------------------------------------------------------------------- /peerBenchJS/src/core/score.ts: -------------------------------------------------------------------------------- 1 | import { MaybePromise, PromptResponse, PromptScore } from "@/types"; 2 | import { readFile } from "./utils"; 3 | 4 | export async function score( 5 | responseFilePaths: string[], 6 | scorer?: (response: PromptResponse) => MaybePromise 7 | ) { 8 | const contents = responseFilePaths.map((path) => readFile(path)); 9 | const promptResponses: PromptResponse[] = contents 10 | .map((content) => 11 | // TODO: Ability to read CSV files 12 | JSON.parse(content) 13 | ) 14 | .flat(); 15 | // TODO: Validate the `promptResponses` schema via Zod 16 | 17 | const scores: PromptScore[] = []; 18 | 19 | for (const promptResponse of promptResponses) { 20 | let score = 0; 21 | // TODO: Maybe also check the CIDs to be sure everything is correct? 22 | 23 | // If the `scorer` function is presented, use it. 24 | if (scorer) { 25 | score = await scorer(promptResponse); 26 | } else { 27 | // TODO: Use different answer check approaches by using `promptResponse.evalType` 28 | if (promptResponse.responseData === promptResponse.correctResponse) { 29 | score = 1; 30 | } else { 31 | // Look for some patterns for the answer 32 | const answer = lookForAnswer(promptResponse.responseData, [ 33 | { 34 | regex: /answer is\s+([A-Z])/gi, 35 | answerGroupIndex: 1, 36 | }, 37 | { 38 | regex: /answer is\s+\**([A-Z])\**/gi, 39 | answerGroupIndex: 1, 40 | }, 41 | { 42 | regex: /([A-Z]):.+/g, 43 | answerGroupIndex: 1, 44 | }, 45 | ]); 46 | 47 | if (answer !== undefined && answer === promptResponse.correctResponse) { 48 | score = 1; 49 | } 50 | } 51 | } 52 | 53 | scores.push({ 54 | ...promptResponse, 55 | score, 56 | }); 57 | } 58 | 59 | return scores; 60 | } 61 | 62 | function lookForAnswer( 63 | response: string, 64 | patterns: { 65 | regex: RegExp; 66 | answerGroupIndex: number; 67 | }[] 68 | ) { 69 | for (const pattern of patterns) { 70 | const matches = Array.from(response.matchAll(pattern.regex)); 71 | const match = matches.at(-1); 72 | 73 | if (match) { 74 | return match[pattern.answerGroupIndex]; 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /peerBenchJS/src/core/uploaders.ts: -------------------------------------------------------------------------------- 1 | import { CoServerUploader } from "@/uploaders/coserver"; 2 | import { logger } from "./logger"; 3 | 4 | export const uploaders = [new CoServerUploader()]; 5 | 6 | export function getUploader(name: string) { 7 | const uploader = uploaders.find( 8 | (p) => p.name.toLowerCase() === name.toLowerCase() 9 | ); 10 | if (uploader === undefined) { 11 | logger.warning(`Uploader not found: ${name}`); 12 | return; 13 | } 14 | 15 | return uploader; 16 | } 17 | -------------------------------------------------------------------------------- /peerBenchJS/src/errors/task.ts: -------------------------------------------------------------------------------- 1 | export class InvalidTaskError extends Error { 2 | constructor(message?: string) { 3 | super(message || "Task is in an invalid format"); 4 | this.name = "InvalidTaskError"; 5 | } 6 | } 7 | 8 | export class TaskNotRecognizedError extends InvalidTaskError { 9 | constructor() { 10 | super("Task is not recognized"); 11 | this.name = "TaskNotRecognizedError"; 12 | } 13 | } 14 | -------------------------------------------------------------------------------- 
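As a quick illustration of the `scorer` hook above, the sketch below (not part of the repository) scores a saved responses file with a custom exact-match rule instead of the default regex fallbacks. The file name is an assumed example and would need to contain a JSON array of `PromptResponse` objects.

```ts
// Sketch only: custom scorer passed to score(); "responses.json" is an assumed example file.
import { score } from "@/core/score";

async function scoreStrictly() {
  const scores = await score(["responses.json"], (response) =>
    response.responseData.trim().toUpperCase() === response.correctResponse ? 1 : 0
  );
  console.log(scores.map((s) => ({ model: s.modelDID, score: s.score })));
}
```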
/peerBenchJS/src/formats/bigbench.ts: -------------------------------------------------------------------------------- 1 | import { AbstractTaskFormat } from "@/base/task-format"; 2 | import { parseValidationError, tryParseJson } from "@/core/parser"; 3 | import { checkValidationError, readFile, generateCID } from "@/core/utils"; 4 | import { InvalidTaskError, TaskNotRecognizedError } from "@/errors/task"; 5 | import { MaybePromise, Prompt, Task, EvalType, EvalTypes } from "@/types"; 6 | import { z } from "zod"; 7 | import { basename } from "path"; 8 | import { generatePromptMetaFields } from "@/core/std"; 9 | import { v7 as uuidv7 } from "uuid"; 10 | 11 | export const BigBenchTaskSchema = z.object({ 12 | canary: z.string().startsWith("BENCHMARK DATA SHOULD NEVER"), 13 | name: z.string(), 14 | description: z.string(), 15 | keywords: z.array(z.string()), 16 | metrics: z.array(z.string()), 17 | preferred_score: z.string(), 18 | output_regex: z.string().optional(), 19 | example_input_suffix: z.string().optional(), 20 | example_input_prefix: z.string().optional(), 21 | example_output_suffix: z.string().optional(), 22 | example_output_prefix: z.string().optional(), 23 | choice_prefix: z.string().optional(), 24 | examples: z.array( 25 | z.object({ 26 | id: z.coerce.number().optional(), 27 | input: z.string(), 28 | target: z.string().optional(), 29 | target_scores: z.record(z.string(), z.coerce.number()), 30 | }) 31 | ), 32 | }); 33 | 34 | export type BigBenchTask = z.infer; 35 | 36 | export class BigBenchTaskFormat extends AbstractTaskFormat { 37 | name = "bigbench"; 38 | 39 | async parseFromFile(path: string): Promise { 40 | const content = readFile(path); 41 | const sourceFileName = basename(path); 42 | const sourceFileCID = (await generateCID(content)).toString(); 43 | const json = tryParseJson(content); 44 | 45 | if (json === undefined) { 46 | throw new InvalidTaskError(); 47 | } 48 | 49 | if (!this.recognize(json)) { 50 | throw new TaskNotRecognizedError(); 51 | } 52 | 53 | const task = checkValidationError(BigBenchTaskSchema.safeParse(json)); 54 | const prompts: Prompt[] = []; 55 | const evalTypes: EvalType[] = []; 56 | 57 | for (const metric of task.metrics) { 58 | switch (metric) { 59 | case "multiple_choice_grade": 60 | evalTypes.push(EvalTypes.MultipleChoice); 61 | break; 62 | case "exact_str_match": 63 | evalTypes.push(EvalTypes.ExactEquality); 64 | break; 65 | default: 66 | // TODO: Check other metric types 67 | break; 68 | } 69 | } 70 | 71 | // Parse examples (aka prompts, tests) 72 | let exampleRowNumber = 0; 73 | for (const example of task.examples || []) { 74 | const options: Record = {}; 75 | let data = example.input; 76 | let answer_idx = ""; 77 | 78 | if (task.example_input_prefix) { 79 | data = `${task.example_input_prefix}${data}`; 80 | } 81 | if (task.example_input_suffix) { 82 | data = `${data}${task.example_input_suffix}`; 83 | } 84 | 85 | let answerLetterIndex = 0; 86 | for (const [answer, score] of Object.entries(example.target_scores)) { 87 | const letter = String.fromCharCode(65 + answerLetterIndex); 88 | 89 | if (score === 1) { 90 | answer_idx = letter; 91 | } 92 | 93 | options[letter] = answer; 94 | 95 | answerLetterIndex++; 96 | } 97 | 98 | prompts.push({ 99 | question: data, 100 | options, 101 | answer_idx, 102 | answer: options[answer_idx], 103 | meta_info: task.name, 104 | other: { 105 | ...generatePromptMetaFields({ 106 | options, 107 | question: data, 108 | rowNumber: exampleRowNumber, 109 | sourceFileCID, 110 | sourceFileName, 111 | uuid: uuidv7(), 112 | }), 
113 | }, 114 | }); 115 | exampleRowNumber++; 116 | } 117 | 118 | return { 119 | did: `did:task:bigbench/${task.name.replaceAll(" ", "-")}`, 120 | prompts, 121 | path, 122 | cid: sourceFileCID, 123 | fileName: basename(path), 124 | }; 125 | } 126 | 127 | recognize(content: any): MaybePromise { 128 | const err = parseValidationError(BigBenchTaskSchema.safeParse(content)); 129 | 130 | if (err) { 131 | return false; 132 | } 133 | 134 | return true; 135 | } 136 | 137 | async convertTo(task: Task, targetFormat: string): Promise { 138 | switch (targetFormat) { 139 | default: 140 | throw new Error(`Unsupported target format: ${targetFormat}`); 141 | } 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /peerBenchJS/src/formats/mmlu-pro.ts: -------------------------------------------------------------------------------- 1 | import { AbstractTaskFormat } from "@/base/task-format"; 2 | import { parseJSONL, tryParseJson } from "@/core/parser"; 3 | import { checkValidationError, readFile, generateCID } from "@/core/utils"; 4 | import { InvalidTaskError, TaskNotRecognizedError } from "@/errors/task"; 5 | import { MaybePromise, Prompt, Task } from "@/types"; 6 | import { asyncBufferFromFile, parquetReadObjects } from "hyparquet"; 7 | import { z } from "zod"; 8 | import { basename } from "path"; 9 | import { generatePromptMetaFields } from "@/core/std"; 10 | import { v7 as uuidv7 } from "uuid"; 11 | 12 | export const MMLUProTaskSchema = z.object({ 13 | question_id: z.coerce.number(), 14 | question: z.string(), 15 | options: z.array(z.string()), 16 | answer: z.string(), 17 | answer_index: z.coerce.number(), 18 | cot_content: z.string(), 19 | category: z.string(), 20 | src: z.string(), 21 | }); 22 | 23 | export type MMLUProTask = z.infer; 24 | 25 | export class MMLUProTaskFormat extends AbstractTaskFormat { 26 | name = "mmlu-pro"; 27 | 28 | async parseFromFile(path: string): Promise { 29 | const sourceFileName = basename(path); 30 | let sourceFileCID: string = ""; 31 | 32 | // TODO: calculate CID for parquet file 33 | // Try to parse the file as Parquet 34 | let data = await this.tryParseParquet(path); 35 | 36 | // Not a Parquet file 37 | if (!data) { 38 | const content = readFile(path); 39 | 40 | // Calculate CID for the source file 41 | sourceFileCID = (await generateCID(content)).toString(); 42 | 43 | // Try to parse it as JSON or JSONL 44 | data = this.tryParseJSON(content); 45 | 46 | // Not a JSON file 47 | if (!data) { 48 | data = this.tryParseJSONL(content); 49 | } 50 | } 51 | 52 | if (!data || data.length == 0) { 53 | throw new InvalidTaskError(); 54 | } 55 | 56 | if (!this.recognize(data)) { 57 | throw new TaskNotRecognizedError(); 58 | } 59 | 60 | const categories = new Set(); 61 | const prompts: Prompt[] = []; 62 | const validatedData = checkValidationError( 63 | z.array(MMLUProTaskSchema).safeParse(data) 64 | ); 65 | 66 | // Parse prompts 67 | for (const prompt of validatedData) { 68 | const options: Record = {}; 69 | let answer_idx = ""; 70 | 71 | for (let i = 0; i < prompt.options.length; i++) { 72 | const option = prompt.options[i]; 73 | const letter = String.fromCharCode(65 + i); 74 | options[letter] = option; 75 | 76 | // Get the answer letter 77 | if (i === prompt.answer_index) { 78 | answer_idx = letter; 79 | } 80 | } 81 | 82 | const standardPrompt: Prompt = { 83 | question: prompt.question, 84 | answer: prompt.options[prompt.answer_index], 85 | answer_idx, 86 | options, 87 | meta_info: prompt.category, 88 | other: { 89 | 
...generatePromptMetaFields({ 90 | options, 91 | question: prompt.question, 92 | rowNumber: prompt.question_id, 93 | sourceFileCID, 94 | sourceFileName, 95 | uuid: uuidv7(), 96 | }), 97 | 98 | "mmlu-pro__question_id": prompt.question_id, 99 | "mmlu-pro__answer_index": prompt.answer_index, 100 | "mmlu-pro__cot_content": prompt.cot_content, 101 | "mmlu-pro__category": prompt.category, 102 | "mmlu-pro__src": prompt.src, 103 | }, 104 | }; 105 | 106 | for (const [name, value] of Object.entries(prompt)) { 107 | // If original object has fields other than those, include them in the standardized object as well 108 | if ( 109 | ![ 110 | "question", 111 | "options", 112 | "answer", 113 | "answer_index", 114 | "question_id", 115 | "cot_content", 116 | "category", 117 | "src", 118 | ].includes(name) 119 | ) { 120 | standardPrompt.other![`mmlu-pro__${name}`] = value; 121 | } 122 | } 123 | 124 | categories.add(prompt.category); 125 | prompts.push(standardPrompt); 126 | } 127 | 128 | let did = "did:task:mmlu-pro"; 129 | 130 | // Not all of the categories are presented so update 131 | // the DID according to the found categories 132 | if (categories.size != 14) { 133 | const categoryNames = [...categories].map((category) => 134 | category.replaceAll(" ", "-").toLowerCase() 135 | ); 136 | did += `/${categoryNames.join("-")}`; 137 | } 138 | 139 | return { 140 | did, 141 | prompts, 142 | cid: sourceFileCID, 143 | fileName: sourceFileName, 144 | path, 145 | }; 146 | } 147 | 148 | recognize(content: any): MaybePromise { 149 | if (Array.isArray(content)) { 150 | const json: MMLUProTask[] = content; 151 | for (const task of json) { 152 | // If any of the items is not valid 153 | // we cannot recognize this object as a whole 154 | if (!this.recognize(task)) { 155 | return false; 156 | } 157 | } 158 | return true; 159 | } 160 | 161 | if (typeof content !== "object") { 162 | return false; 163 | } 164 | 165 | const validation = MMLUProTaskSchema.safeParse(content); 166 | if (validation.error) { 167 | return false; 168 | } 169 | 170 | return true; 171 | } 172 | 173 | tryParseJSON(content: string) { 174 | return tryParseJson(content); 175 | } 176 | 177 | tryParseJSONL(content: string) { 178 | return parseJSONL(content); 179 | } 180 | 181 | async tryParseParquet(path: string): Promise { 182 | try { 183 | const file = await asyncBufferFromFile(path); 184 | return await parquetReadObjects({ file }); 185 | } catch { 186 | // Not a valid Parquet file 187 | } 188 | } 189 | 190 | async convertTo(task: Task, targetFormat: string): Promise { 191 | switch (targetFormat) { 192 | case "mmlu-pro": 193 | return task.prompts.map((prompt) => { 194 | // Convert options from object to array 195 | const options = Object.entries(prompt.options) 196 | .sort(([a], [b]) => a.localeCompare(b)) 197 | .map(([, value]) => value); 198 | const answer_index = options.findIndex( 199 | (option) => option === prompt.answer 200 | ); 201 | 202 | return { 203 | question_id: 204 | prompt.other?.src_row_number || 205 | prompt.other?.question_id || 206 | prompt.other?.["mmlu-pro__question_id"], 207 | question: prompt.question, 208 | options, 209 | answer: prompt.answer_idx, // Answer letter 210 | answer_index, // Index of the correct answer in the options array 211 | cot_content: "", 212 | category: 213 | prompt.meta_info || 214 | prompt.other?.category || 215 | prompt.other?.["mmlu-pro__category"], 216 | src: prompt.other?.["src"] || prompt.other?.["mmlu-pro__src"], 217 | }; 218 | }); 219 | case "medqa": 220 | // Since we are using MedQA as the unified 
schema, we can just return the prompts 221 | return [...task.prompts]; 222 | 223 | default: 224 | throw new Error(`Unsupported target format: ${targetFormat}`); 225 | } 226 | } 227 | } 228 | -------------------------------------------------------------------------------- /peerBenchJS/src/index.ts: -------------------------------------------------------------------------------- 1 | import { red } from "ansis"; 2 | import { logger } from "./core/logger"; 3 | import { program } from "./core/program"; 4 | 5 | import "@/commands/prompt"; 6 | import "@/commands/score"; 7 | import "@/commands/aggregate"; 8 | import "@/commands/upload"; 9 | import "@/commands/rephrase"; 10 | import "@/commands/std"; 11 | import { ensureError } from "./core/utils"; 12 | 13 | program.parseAsync().catch((err) => { 14 | const error = ensureError(err); 15 | logger.error(red(error.stack)); 16 | }); 17 | -------------------------------------------------------------------------------- /peerBenchJS/src/providers/nearai.ts: -------------------------------------------------------------------------------- 1 | import { AbstractProvider } from "@/base/provider"; 2 | import { readFile, sleep } from "@/core/utils"; 3 | import { ModelResponse } from "@/types"; 4 | import OpenAI from "openai"; 5 | import { homedir } from "os"; 6 | import { join } from "path"; 7 | import { z } from "zod"; 8 | 9 | const NearAIEnv = { 10 | CONFIG_PATH: z.string().default(join(homedir(), ".nearai", "config.json")), 11 | RATE_LIMIT: z.coerce.number().default(1), 12 | RATE_LIMIT_TIME_WINDOW: z.coerce.number().default(3000), 13 | TIMEOUT: z.coerce.number().default(60_0000), // 1 minute 14 | MAX_RETRIES: z.coerce.number().default(3), 15 | }; 16 | 17 | export class NearAIProvider extends AbstractProvider { 18 | client: OpenAI; 19 | private timestamps: number[] = []; 20 | 21 | constructor() { 22 | super({ 23 | name: "near.ai", 24 | env: NearAIEnv, 25 | }); 26 | 27 | // Parse config file 28 | const config = JSON.parse(readFile(this.env.CONFIG_PATH)); 29 | 30 | if (!config?.auth?.signature) { 31 | throw new Error( 32 | 'Signature is not found. 
Please try to login via "nearai" CLI' 33 | ); 34 | } 35 | 36 | // Initialize the client 37 | this.client = new OpenAI({ 38 | baseURL: "https://api.near.ai/v1", 39 | apiKey: JSON.stringify(config?.auth), 40 | maxRetries: this.env.MAX_RETRIES, 41 | timeout: this.env.TIMEOUT, 42 | }); 43 | } 44 | 45 | async enforceRateLimit(): Promise { 46 | const now = Date.now(); 47 | 48 | this.timestamps = this.timestamps.filter( 49 | (ts) => now - ts < this.env.RATE_LIMIT_TIME_WINDOW 50 | ); 51 | 52 | if (this.timestamps.length < this.env.RATE_LIMIT) { 53 | this.timestamps.push(now); 54 | return; 55 | } 56 | 57 | const earliest = this.timestamps[0]; 58 | const waitTime = this.env.RATE_LIMIT_TIME_WINDOW - (now - earliest); 59 | 60 | await sleep(waitTime); 61 | 62 | return this.enforceRateLimit(); 63 | } 64 | 65 | async forward( 66 | prompt: string, 67 | model: string, 68 | system: string 69 | ): Promise { 70 | await this.enforceRateLimit(); 71 | 72 | const startedAt = new Date(); 73 | const response = await this.client.chat.completions.create({ 74 | model, 75 | messages: [ 76 | { 77 | role: "system", 78 | content: system, 79 | }, 80 | { 81 | role: "user", 82 | content: prompt, 83 | }, 84 | ], 85 | }); 86 | 87 | if ("error" in response) { 88 | const err = response.error as any; 89 | throw new Error( 90 | `${err.message} - Code ${err.code} - ${JSON.stringify(err)}` 91 | ); 92 | } 93 | 94 | return { 95 | response: response?.choices?.[0]?.message?.content || "", 96 | startedAt, 97 | completedAt: new Date(), 98 | }; 99 | } 100 | 101 | parseModelIdentifier(identifier: string): { 102 | modelName: string; 103 | modelOwner: string; 104 | subProvider?: string; 105 | } { 106 | const regex = 107 | /^(?[^:]+)::(?[^/]+)\/(?.+)$/; 108 | const match = identifier.match(regex); 109 | if (match?.groups) { 110 | const providerName = match.groups.providerName; 111 | let modelOwner = match.groups.modelOwner; 112 | let modelName = match.groups.modelName; 113 | 114 | if (modelOwner === "accounts") { 115 | // Model owner field is not available, 116 | // so just use the provider name for it. 
117 | modelOwner = providerName; 118 | } 119 | 120 | // Delete redundant part 121 | if (modelName.startsWith(`${providerName}/models`)) { 122 | modelName = modelName.replaceAll(`${providerName}/models`, ""); 123 | } 124 | 125 | return { 126 | modelOwner, 127 | modelName, 128 | subProvider: providerName, 129 | }; 130 | } 131 | 132 | throw new Error(`Invalid identifier format: ${identifier}`); 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /peerBenchJS/src/providers/openrouter.ts: -------------------------------------------------------------------------------- 1 | import { AbstractProvider } from "@/base/provider"; 2 | import { sleep } from "@/core/utils"; 3 | import { ModelResponse } from "@/types"; 4 | import OpenAI from "openai"; 5 | import { z } from "zod"; 6 | 7 | const OpenRouterEnv = { 8 | KEY: z.string().nonempty(), 9 | RATE_LIMIT: z.coerce.number().default(20), 10 | RATE_LIMIT_TIME_WINDOW: z.coerce.number().default(3000), 11 | TIMEOUT: z.coerce.number().default(60_0000), // 1 minute 12 | MAX_RETRIES: z.coerce.number().default(3), 13 | }; 14 | 15 | export class OpenRouterProvider extends AbstractProvider { 16 | client: OpenAI; 17 | private timestamps: number[] = []; 18 | 19 | constructor() { 20 | super({ 21 | name: "openrouter.ai", 22 | env: OpenRouterEnv, 23 | }); 24 | 25 | // Initialize the client 26 | this.client = new OpenAI({ 27 | baseURL: "https://openrouter.ai/api/v1", 28 | apiKey: this.env.KEY, 29 | maxRetries: this.env.MAX_RETRIES, 30 | timeout: this.env.TIMEOUT, 31 | }); 32 | } 33 | 34 | async enforceRateLimit(): Promise { 35 | const now = Date.now(); 36 | 37 | this.timestamps = this.timestamps.filter( 38 | (ts) => now - ts < this.env.RATE_LIMIT_TIME_WINDOW 39 | ); 40 | 41 | if (this.timestamps.length < this.env.RATE_LIMIT) { 42 | this.timestamps.push(now); 43 | return; 44 | } 45 | 46 | const earliest = this.timestamps[0]; 47 | const waitTime = this.env.RATE_LIMIT_TIME_WINDOW - (now - earliest); 48 | 49 | await sleep(waitTime); 50 | 51 | return this.enforceRateLimit(); 52 | } 53 | 54 | async forward( 55 | prompt: string, 56 | model: string, 57 | system: string 58 | ): Promise { 59 | await this.enforceRateLimit(); 60 | 61 | const startedAt = new Date(); 62 | const response = await this.client.chat.completions.create({ 63 | model, 64 | messages: [ 65 | { 66 | role: "system", 67 | content: system, 68 | }, 69 | { 70 | role: "user", 71 | content: prompt, 72 | }, 73 | ], 74 | }); 75 | 76 | if ("error" in response) { 77 | const err = response.error as any; 78 | throw new Error( 79 | `${err.message} - Code ${err.code} - ${JSON.stringify(err)}` 80 | ); 81 | } 82 | 83 | return { 84 | response: response?.choices?.[0]?.message?.content || "", 85 | startedAt, 86 | completedAt: new Date(), 87 | }; 88 | } 89 | 90 | parseModelIdentifier(identifier: string): { 91 | modelName: string; 92 | modelOwner: string; 93 | subProvider?: string; 94 | } { 95 | const regex = /^(?[^/]+)\/(?[^:]+)(?::(?.+))?$/; 96 | const match = identifier.match(regex); 97 | 98 | if (match?.groups) { 99 | const modelOwner = match.groups.owner; 100 | const tier = match.groups.tier; 101 | let modelName = match.groups.model; 102 | 103 | if (tier) { 104 | modelName += "-" + tier; 105 | } 106 | 107 | return { 108 | modelOwner, 109 | modelName, 110 | subProvider: undefined, // TODO: parse sub provider if exists 111 | }; 112 | } 113 | 114 | throw new Error(`Invalid identifier format: ${identifier}`); 115 | } 116 | } 117 | 
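For reference, the regex above splits OpenRouter identifiers of the form `owner/model[:tier]` and folds any tier suffix into the model name. A hedged illustration follows; the identifiers are examples, and constructing the provider assumes its `KEY` environment setting is present.

```ts
// Illustration only; the expected results follow from the regex in parseModelIdentifier.
const provider = new OpenRouterProvider(); // throws if the OpenRouter KEY env setting is missing

provider.parseModelIdentifier("meta-llama/llama-4-scout:free");
// -> { modelOwner: "meta-llama", modelName: "llama-4-scout-free", subProvider: undefined }

provider.parseModelIdentifier("openai/gpt-4o");
// -> { modelOwner: "openai", modelName: "gpt-4o", subProvider: undefined }
```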
-------------------------------------------------------------------------------- /peerBenchJS/src/types.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | 3 | export const NodeEnvs = ["dev", "production"] as const; 4 | export type NodeEnv = (typeof NodeEnvs)[number]; 5 | 6 | export const LogLevels = ["error", "warning", "info", "debug"] as const; 7 | export type LogLevel = (typeof LogLevels)[number]; 8 | 9 | export type MaybePromise = T | Promise; 10 | 11 | export type PromptOptions = Record; 12 | 13 | export type ModelResponse = { 14 | startedAt: Date; 15 | completedAt: Date; 16 | response: string; 17 | }; 18 | 19 | export const EvalTypes = { 20 | ExactEquality: "exact-equality", 21 | MultipleChoice: "multiple-choice", 22 | } as const; 23 | export type EvalType = (typeof EvalTypes)[keyof typeof EvalTypes]; 24 | 25 | export const PromptSchema = z.object({ 26 | question: z.string(), 27 | options: z.record(z.string(), z.string()), 28 | answer_idx: z.string(), 29 | answer: z.string(), 30 | meta_info: z.string().optional(), 31 | // TODO: Rename `other` to `metadata` 32 | other: z 33 | .object({ 34 | hash_full_question: z.string(), 35 | hash_first_sentence: z.string(), 36 | hash_first_question_sentence: z.string(), 37 | hash_last_sentence: z.string(), 38 | stdQuestionUUID: z.string(), 39 | stdFullPromptText: z.string(), 40 | stdFullPromptHash: z.string(), 41 | src_row_number: z.number(), 42 | preSTDsrcFileName: z.string(), 43 | preSTDsrcCID: z.string(), 44 | }) 45 | .catchall(z.any()), 46 | }); 47 | 48 | export type Prompt = z.infer; 49 | 50 | export const TaskSchema = z.object({ 51 | did: z.string().startsWith("did:task:"), 52 | prompts: z.array(PromptSchema), 53 | cid: z.string(), 54 | fileName: z.string(), 55 | path: z.string(), 56 | }); 57 | export type Task = z.infer; 58 | 59 | export const PromptResponseSchema = z.object({ 60 | validatorDID: z.string().startsWith("did:val:"), 61 | providerDID: z.string().startsWith("did:prov:"), 62 | modelDID: z.string().startsWith("did:model:"), 63 | taskDID: z.string().startsWith("did:task:"), 64 | 65 | promptCID: z.string(), 66 | responseCID: z.string(), 67 | 68 | promptData: z.string(), 69 | responseData: z.string(), 70 | correctResponse: z.string(), 71 | 72 | promptedAt: z.number(), 73 | repliedAt: z.number(), 74 | 75 | runId: z.string(), 76 | 77 | questionUUID: z.string(), 78 | questionHash: z.string(), 79 | 80 | fullPromptData: z.string(), 81 | fullPromptHash: z.string(), 82 | 83 | // aka taskFileCID 84 | sourcePromptDatasetCID: z.string(), 85 | 86 | // aka taskFileName 87 | sourceFileName: z.string().optional(), 88 | }); 89 | 90 | export type PromptResponse = z.infer; 91 | 92 | export const PromptScoreSchema = PromptResponseSchema.extend({ 93 | promptData: z.string().optional(), 94 | responseData: z.string().optional(), 95 | correctResponse: z.string().optional(), 96 | sourcePromptDatasetCID: z.string().optional(), 97 | fullPromptData: z.string().optional(), 98 | 99 | score: z.number(), 100 | }); 101 | export type PromptScore = z.infer; 102 | 103 | export const AggregationResultSchema = z.array( 104 | z.object({ 105 | taskDID: z.string(), 106 | providerDID: z.string(), 107 | modelDID: z.string(), 108 | totalResponse: z.number(), 109 | score: z.number(), 110 | wrongAnswers: z.number(), 111 | missingAnswers: z.number(), 112 | avgLatency: z.number(), 113 | avgScore: z.number(), 114 | score_runIds: z.array(z.string()), 115 | sourcePromptDatasetCIDs: z.array(z.string()), 116 | 
sourceFileNames: z.array(z.string()), 117 | }) 118 | ); 119 | 120 | export type AggregationResult = z.infer; 121 | -------------------------------------------------------------------------------- /peerBenchJS/src/uploaders/coserver.ts: -------------------------------------------------------------------------------- 1 | import { AbstractUploader, UploadOptions } from "@/base/uploader"; 2 | import { checkValidationError, readFile } from "@/core/utils"; 3 | import { PromptScoreSchema } from "@/types"; 4 | import axios, { AxiosError } from "axios"; 5 | import { z } from "zod"; 6 | 7 | /** 8 | * Uploader class to upload files to the coordination server 9 | */ 10 | export class CoServerUploader extends AbstractUploader { 11 | override name = "CoServer"; 12 | token: string = ""; 13 | client = axios.create({ 14 | baseURL: "https://deval-flask.onrender.com", 15 | }); 16 | 17 | async init() { 18 | // We already have the token, no need to fetch it again. 19 | if (this.token !== "") return; 20 | 21 | this.logger.info(`Generating a new token`); 22 | const response = await this.client.post(`/token`, { 23 | user: "validator", 24 | role: "validator", 25 | }); 26 | 27 | this.token = response.data?.token; 28 | this.client = axios.create({ 29 | baseURL: "https://deval-flask.onrender.com", 30 | headers: { 31 | Authorization: `Bearer ${this.token}`, 32 | }, 33 | }); 34 | } 35 | 36 | // eslint-disable-next-line @typescript-eslint/no-unused-vars 37 | async uploadPromptResponses(filePath: string, options?: UploadOptions) { 38 | throw new Error("CoServer doesn't support uploading responses"); 39 | // TODO: Paused since the CoServer doesn't support submitting responses 40 | 41 | // const fileContent = readFile(filePath); 42 | // const arraySchema = z.array(PromptResponseSchema); 43 | // const parseResult = arraySchema.safeParse(JSON.parse(fileContent)); 44 | // const responses = checkValidationError(parseResult); 45 | 46 | // TODO: paused since the CoServer doesn't support batch requests 47 | // const upload = async (skip: number, take: number) => { 48 | // const partition: PromptResponse[] = []; 49 | // for (let i = skip; i < responses.length; i++) { 50 | // // We've taken all the items that we want, no need to continue; 51 | // if (i > take) break; 52 | // partition.push(responses[i]); 53 | // } 54 | // const res = await this.client.post("/submit-score", partition); 55 | // }; 56 | // let num = 1; 57 | // for (const response of responses) { 58 | // try { 59 | // const res = await this.client.post("/submit-score", response); 60 | // const data = res.data; 61 | // if (data?.status === "success") { 62 | // this.logger.info( 63 | // `Response ${num} uploaded to the CoServer successfully` 64 | // ); 65 | // } else { 66 | // throw new Error(JSON.stringify(data || {})); 67 | // } 68 | // } catch (err: any) { 69 | // this.logger.debug( 70 | // `Error while uploading the responses: ${ 71 | // err?.stack || err?.message || err 72 | // }` 73 | // ); 74 | // this.logger.error(`Error while uploading response ${num}: ${err}`); 75 | // } 76 | // num++; 77 | // } 78 | } 79 | 80 | // eslint-disable-next-line @typescript-eslint/no-unused-vars 81 | async uploadScores(filePath: string, options?: UploadOptions) { 82 | const fileContent = readFile(filePath); 83 | const arraySchema = z.array(PromptScoreSchema); 84 | const parseResult = arraySchema.safeParse(JSON.parse(fileContent)); 85 | const scores = checkValidationError(parseResult); 86 | 87 | let num = 1; 88 | for (const score of scores) { 89 | try { 90 | const res = await 
this.client.post("/submit-score", { 91 | ...score, 92 | // TODO: On the CoServer side, change `evaluationRunId` to `runId` 93 | evaluationRunId: score.runId, 94 | }); 95 | const data = res.data; 96 | 97 | if (data?.status === "success") { 98 | this.logger.info( 99 | `Score ${num} uploaded to the CoServer successfully` 100 | ); 101 | } else { 102 | throw new Error(JSON.stringify(data || {})); 103 | } 104 | } catch (err: any) { 105 | let message: any = err; 106 | if (err instanceof AxiosError) { 107 | message = err.response?.data || err; 108 | } 109 | this.logger.error( 110 | `Error while uploading score ${num}: ${JSON.stringify(message)}` 111 | ); 112 | } 113 | num++; 114 | } 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /peerBenchJS/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ESNext", 4 | "module": "ESNext", 5 | "moduleResolution": "Node", 6 | "esModuleInterop": true, 7 | 8 | "forceConsistentCasingInFileNames": true, 9 | "strict": true, 10 | "strictNullChecks": true, 11 | "skipLibCheck": true, 12 | 13 | "noEmit": true, 14 | 15 | "outDir": "dist", 16 | 17 | "paths": { 18 | "@/*": ["./src/*"] 19 | } 20 | }, 21 | "exclude": ["node_modules", "dist"], 22 | "include": ["src/**/*"] 23 | } 24 | -------------------------------------------------------------------------------- /peerBenchJS/tsup.config.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig } from "tsup"; 2 | 3 | export default defineConfig({ 4 | entry: ["src/**/*.ts"], 5 | target: "node20", 6 | format: ["esm"], 7 | clean: true, 8 | bundle: false, 9 | skipNodeModulesBundle: true, 10 | outDir: "dist", 11 | splitting: false, 12 | platform: "node", 13 | }); 14 | -------------------------------------------------------------------------------- /peerBenchPY/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to val 2 | 3 | We're excited that you're interested in contributing to val! This guide will help you set up your development environment and understand our contribution workflow. 4 | 5 | ## Table of Contents 6 | 7 | - [Development Environment Setup](#development-environment-setup) 8 | - [Development Guidelines](#development-guidelines) 9 | - [Code Quality Standards](#code-quality-standards) 10 | - [Contribution Workflow](#contribution-workflow) 11 | - [Testing Guidelines](#testing-guidelines) 12 | - [Documentation](#documentation) 13 | - [Community Guidelines](#community-guidelines) 14 | 15 | ## Development Environment Setup 16 | 17 | ### Prerequisites 18 | 19 | - Python (3.9 <= version <= 3.12) 20 | - [Poetry](https://python-poetry.org/docs/#installation) (recommended) or pip 21 | - Git 22 | 23 | ### Installation Steps 24 | 25 | 1. Clone the repository: 26 | ```bash 27 | git clone https://github.com/your-org/val.git 28 | cd val 29 | ``` 30 | 31 | 2. Set up your development environment: 32 | 33 | Using Poetry (recommended): 34 | ```bash 35 | poetry install --with dev 36 | ``` 37 | 38 | Using pip: 39 | ```bash 40 | pip install -e ".[dev]" 41 | ``` 42 | 43 | ## Development Guidelines 44 | 45 | ### Code Style 46 | 47 | We maintain strict code quality standards to ensure maintainability and consistency: 48 | 49 | 1. 
Code Formatting: 50 | ```bash 51 | # Format code using Black 52 | poetry run black val 53 | 54 | # Sort imports 55 | poetry run isort val 56 | ``` 57 | 58 | 2. Type Checking: 59 | ```bash 60 | # Run type checker 61 | poetry run mypy val 62 | ``` 63 | 64 | 3. Linting: 65 | ```bash 66 | # Run linter 67 | poetry run flake8 val 68 | ``` 69 | 70 | ## Code Quality Standards 71 | 72 | ### Type Hints 73 | 74 | All new code should include type hints: 75 | 76 | ```python 77 | from typing import List, Optional 78 | 79 | def process_data(data: List[str], config: Optional[dict] = None) -> bool: 80 | return True 81 | ``` 82 | 83 | ### Documentation 84 | 85 | - All modules, classes, and functions should have docstrings 86 | - Follow Google style docstring format 87 | - Include usage examples for complex functionality 88 | 89 | ## Contribution Workflow 90 | 91 | 1. Create a new branch: 92 | ```bash 93 | git checkout -b feature/your-feature-name 94 | ``` 95 | 96 | 2. Make your changes and commit them following our commit message convention: 97 | ``` 98 | (): 99 | 100 | [optional body] 101 | 102 | [optional footer] 103 | ``` 104 | 105 | Types: 106 | - `feat`: New feature 107 | - `fix`: Bug fix 108 | - `docs`: Documentation changes 109 | - `style`: Formatting changes 110 | - `refactor`: Code refactoring 111 | - `test`: Adding or modifying tests 112 | - `chore`: Maintenance tasks 113 | 114 | Example: 115 | ``` 116 | feat(auth): implement JWT authentication 117 | 118 | - Add JWT token generation 119 | - Implement token validation 120 | - Add user authentication middleware 121 | 122 | Closes #123 123 | ``` 124 | 125 | 3. Push your changes and create a pull request: 126 | ```bash 127 | git push origin feature/your-feature-name 128 | ``` 129 | 130 | ### Pull Request Guidelines 131 | 132 | 1. Ensure all tests pass 133 | 2. Update documentation if needed 134 | 3. Add tests for new features 135 | 4. Follow the pull request template 136 | 5. Link related issues 137 | 138 | ## Testing Guidelines 139 | 140 | 1. Write tests for new features: 141 | ```python 142 | def test_feature(): 143 | # Arrange 144 | data = prepare_test_data() 145 | 146 | # Act 147 | result = process_data(data) 148 | 149 | # Assert 150 | assert result is True 151 | ``` 152 | 153 | 2. Run tests: 154 | ```bash 155 | poetry run pytest 156 | ``` 157 | 158 | 3. Check test coverage: 159 | ```bash 160 | poetry run pytest --cov=val 161 | ``` 162 | 163 | ## Documentation 164 | 165 | - Update documentation for new features 166 | - Include code examples 167 | - Add docstrings to all public APIs 168 | - Update README.md if needed 169 | 170 | ## Community Guidelines 171 | 172 | ### Issue Reporting 173 | 174 | When reporting issues, please include: 175 | 176 | 1. Steps to reproduce 177 | 2. Expected behavior 178 | 3. Actual behavior 179 | 4. Environment details 180 | 5. Relevant logs or screenshots 181 | 182 | ### Communication 183 | 184 | - Be respectful and inclusive 185 | - Provide constructive feedback 186 | - Help others when possible 187 | - Follow our code of conduct 188 | 189 | ## License 190 | 191 | By contributing to val, you agree that your contributions will be licensed under our project's MIT License. 192 | 193 | ## Questions or Need Help? 194 | 195 | - Open a [GitHub Discussion](https://github.com/val-ai/val/discussions) 196 | - Join our [Discord Community](https://discord.gg/val-ai-941362322000203776) 197 | 198 | Thank you for contributing to val! 
🚀 -------------------------------------------------------------------------------- /peerBenchPY/Dockerfile: -------------------------------------------------------------------------------- 1 | # THIS CONTAINER COMBINES NPM, RUST, PYTHON, AND DOCKER INTO ONE CONTAINER 2 | # THIS GENERAL CONTAINER IS THE CORE OF val, USE IT AS YOU WISH AT YOUR OWN RISK 3 | 4 | FROM ubuntu:22.04 5 | 6 | # SYSTEM ENVIRONMENT 7 | ARG DEBIAN_FRONTEND=noninteractive 8 | RUN usermod -s /bin/bash root 9 | RUN apt-get update && apt-get install -y \ 10 | python3 \ 11 | python3-pip \ 12 | python3-venv \ 13 | apt-transport-https \ 14 | ca-certificates \ 15 | curl \ 16 | gnupg \ 17 | lsb-release \ 18 | software-properties-common 19 | 20 | # INSTALL DOCKER 21 | RUN mkdir -p /etc/apt/keyrings 22 | RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg 23 | RUN echo \ 24 | "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ 25 | $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null 26 | RUN apt-get update && apt-get install -y docker-ce docker-ce-cli containerd.io docker-compose-plugin 27 | RUN groupadd docker || true 28 | RUN usermod -aG docker root 29 | EXPOSE 2375 30 | 31 | # MODULE ENVIRONMENT 32 | WORKDIR /app 33 | COPY . . 34 | RUN pip install -e ./ 35 | 36 | # ENTRYPOINT (default to container running) 37 | ENTRYPOINT ["tail", "-f", "/dev/null"] -------------------------------------------------------------------------------- /peerBenchPY/LICENCE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 valAI Organization 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /peerBenchPY/Makefile: -------------------------------------------------------------------------------- 1 | SCRIPTS_PATH=./run 2 | build: 3 | ${SCRIPTS_PATH}/build.sh 4 | start: 5 | ${SCRIPTS_PATH}/start.sh 6 | stop: 7 | ${SCRIPTS_PATH}/stop.sh 8 | enter: 9 | ${SCRIPTS_PATH}/enter.sh 10 | test: 11 | ${SCRIPTS_PATH}/test.sh 12 | freshtest: 13 | make build && make test 14 | install: 15 | ${SCRIPTS_PATH}/install.sh 16 | restart: 17 | make stop 18 | make start 19 | chmod: 20 | chmod +x ${SCRIPTS_PATH}/* 21 | up: 22 | make start 23 | down: 24 | make stop 25 | 26 | -------------------------------------------------------------------------------- /peerBenchPY/README.md: -------------------------------------------------------------------------------- 1 | # val: Decentralized Evaluation Framework 2 | 3 | val is a powerful framework for evaluating and benchmarking AI models across different providers in a decentralized manner. It provides a simple, flexible interface for testing model performance on various tasks with cryptographic verification. 4 | 5 | ## Quick Start 6 | 7 | ```python 8 | import val as v 9 | 10 | # Initialize the evaluator 11 | val = v.Val( 12 | task='add', # Task to evaluate (e.g., 'add', 'divide') 13 | provider='openrouter', # Model provider 14 | n=4, # Number of models to test 15 | samples_per_epoch=2 # Samples per evaluation epoch 16 | ) 17 | 18 | # Run an evaluation epoch 19 | results = val.epoch() 20 | print(results) 21 | ``` 22 | 23 | ## Installation 24 | 25 | ### Using pip 26 | 27 | ```bash 28 | pip install val 29 | ``` 30 | 31 | ### Using Docker 32 | 33 | We provide a Docker environment for easy setup and isolation: 34 | 35 | ```bash 36 | # Clone the repository 37 | git clone https://github.com/val-ai/val.git 38 | cd val 39 | 40 | # Build the Docker image 41 | make build 42 | 43 | # Start the container 44 | make start 45 | 46 | # Enter the container 47 | make enter 48 | 49 | # Run tests 50 | make test 51 | ``` 52 | 53 | ## Core Components 54 | 55 | ### Tasks 56 | 57 | Tasks define what you want to evaluate. val comes with several built-in tasks: 58 | 59 | ```python 60 | # List available tasks 61 | tasks = val.tasks() 62 | print(tasks) # ['add', 'divide', ...] 
63 | 64 | # Set a specific task 65 | val.set_task('add') 66 | ``` 67 | 68 | ### Providers 69 | 70 | Providers connect to different AI model APIs: 71 | 72 | ```python 73 | # Set a provider 74 | val.set_provider('openrouter') 75 | 76 | # List available models from the provider 77 | models = val.models() 78 | print(models) 79 | ``` 80 | 81 | ### Authentication 82 | 83 | Secure your evaluations with cryptographic authentication: 84 | 85 | ```python 86 | # Generate a new key 87 | key = v.get_key('my_key', crypto_type='ecdsa') 88 | 89 | # Create an authentication token 90 | auth = v.module('auth')() 91 | token = auth.get_token({'data': 'test'}, key=key) 92 | 93 | # Verify a token 94 | verified_data = auth.verify_token(token) 95 | ``` 96 | 97 | ## Advanced Usage 98 | 99 | ### Custom Tasks 100 | 101 | Create custom evaluation tasks by extending the base Task class: 102 | 103 | ```python 104 | # Define a custom task in task/custom/task.py 105 | class CustomTask: 106 | features = ['params', 'result', 'target', 'score', 'model', 'provider', 'token'] 107 | sort_by = ['score'] 108 | sort_by_asc = [False] 109 | description = 'My custom evaluation task' 110 | 111 | def sample(self, idx=None, sample=None): 112 | # Generate or return a sample 113 | return {'message': {'prompt': 'Your test prompt'}} 114 | 115 | def forward(self, model, sample=None, idx=None): 116 | # Run the model on the sample 117 | sample = self.sample(idx=idx, sample=sample) 118 | result = model(**sample) 119 | return self.score({'sample': sample, 'result': result}) 120 | 121 | def score(self, data): 122 | # Score the model's response 123 | data['score'] = 1.0 # Your scoring logic here 124 | return data 125 | ``` 126 | 127 | ### Background Evaluation 128 | 129 | Run evaluations in the background: 130 | 131 | ```python 132 | evaluator = v.Val( 133 | task='add', 134 | background=True, # Run in background 135 | tempo=60 # Run every 60 seconds 136 | ) 137 | ``` 138 | 139 | ### Aggregating Results 140 | 141 | View and analyze evaluation results: 142 | 143 | ```python 144 | # Get aggregated results 145 | print(val.results()) 146 | ``` 147 | 148 | ## Command Line Interface 149 | 150 | val includes a CLI for common operations: 151 | 152 | ```bash 153 | # Run an evaluation epoch 154 | d epoch --task=add --n=4 155 | 156 | # List available tasks 157 | d tasks 158 | 159 | # Test components 160 | d test 161 | ``` 162 | 163 | ## Docker Environment 164 | 165 | The included Docker environment provides a complete setup for val: 166 | 167 | ```dockerfile 168 | # FROM ubuntu:22.04 base with Python, Docker, and other dependencies 169 | # See Dockerfile for details 170 | 171 | # Build the image 172 | docker build -t val . 
173 | 174 | # Run the container 175 | docker run -d \ 176 | --name val \ 177 | --network=host \ 178 | --restart unless-stopped \ 179 | --privileged --shm-size 4g \ 180 | -v $(pwd):/app \ 181 | -v /var/run/docker.sock:/var/run/docker.sock \ 182 | val -------------------------------------------------------------------------------- /peerBenchPY/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "val" 3 | version = "0.0.1" 4 | description = "Decentralized Evaluation Framework" 5 | authors = ["val ai organization "] 6 | license = "MIT" 7 | readme = "README.md" 8 | packages = [ 9 | { include = "val" } 10 | ] 11 | repository = "https://github.com/val-ai/val" 12 | homepage = "https://valai.org/" 13 | 14 | [tool.poetry.dependencies] 15 | python = "^3.9" 16 | fastapi = "*" 17 | sse-starlette = "*" 18 | paramiko = "*" 19 | nest_asyncio = "*" 20 | uvicorn = "*" 21 | scalecodec = ">=1.2.10,<1.3" 22 | asyncio = "*" 23 | aiofiles = "*" 24 | aiohttp = "*" 25 | pytest = "*" 26 | openai = "*" 27 | torch = "*" 28 | safetensors = "*" 29 | msgpack_numpy = "*" 30 | munch = "*" 31 | retry = "*" 32 | netaddr = "*" 33 | typer = "*" 34 | loguru = "*" 35 | pyyaml = "*" 36 | pandas = "*" 37 | websocket-client = ">=0.57.0" 38 | base58 = ">=1.0.3" 39 | certifi = ">=2019.3.9" 40 | idna = ">=2.1.0" 41 | requests = ">=2.21.0" 42 | xxhash = ">=1.3.0" 43 | ecdsa = ">=0.17.0" 44 | eth-keys = ">=0.2.1" 45 | eth_utils = ">=1.3.0" 46 | pycryptodome = ">=3.11.0" 47 | PyNaCl = ">=1.0.1" 48 | py-sr25519-bindings = ">=0.2.0" 49 | py-ed25519-zebra-bindings = ">=1.0" 50 | py-bip39-bindings = ">=0.1.9" 51 | twine = "*" 52 | psutil = "*" 53 | 54 | [tool.poetry.group.dev] 55 | optional = true 56 | 57 | [tool.poetry.group.dev.dependencies] 58 | black = "^23.12.1" 59 | mypy = "^1.8.0" 60 | flake8 = "^7.0.0" 61 | isort = "^5.13.2" 62 | pytest-cov = "^4.1.0" 63 | 64 | [tool.poetry.scripts] 65 | v = "val.val:main" 66 | 67 | [tool.mypy] 68 | files = ["val"] 69 | 70 | [tool.isort] 71 | profile = "black" 72 | multi_line_output = 3 73 | include_trailing_comma = true 74 | force_grid_wrap = 0 75 | use_parentheses = true 76 | line_length = 88 77 | 78 | [tool.black] 79 | line-length = 88 80 | target-version = ['py39', 'py310', 'py311', 'py312'] 81 | include = '\.pyi?$' 82 | 83 | [tool.flake8] 84 | max-line-length = 88 85 | extend-ignore = "E203" 86 | exclude = [".git", "__pycache__", "build", "dist"] 87 | 88 | [build-system] 89 | requires = ["poetry-core"] 90 | build-backend = "poetry.core.masonry.api" -------------------------------------------------------------------------------- /peerBenchPY/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi 2 | sse-starlette 3 | paramiko 4 | nest_asyncio 5 | uvicorn 6 | scalecodec>=1.2.10,<1.3 7 | asyncio 8 | aiofiles 9 | aiohttp 10 | pytest 11 | openai 12 | torch 13 | safetensors 14 | msgpack_numpy 15 | munch 16 | retry 17 | netaddr 18 | typer 19 | loguru 20 | pyyaml 21 | pandas 22 | websocket-client>=0.57.0 23 | base58>=1.0.3 24 | certifi>=2019.3.9 25 | idna>=2.1.0 26 | requests>=2.21.0 27 | xxhash>=1.3.0 28 | ecdsa>=0.17.0 29 | eth-keys>=0.2.1 30 | eth_utils>=1.3.0 31 | pycryptodome>=3.11.0 32 | PyNaCl>=1.0.1 33 | scalecodec>=1.2.10 34 | py-sr25519-bindings>=0.2.0 35 | py-ed25519-zebra-bindings>=1.0 36 | py-bip39-bindings>=0.1.9 37 | twine 38 | psutil 39 | dotenv 40 | multiformats 41 | -------------------------------------------------------------------------------- 
/peerBenchPY/run/build.sh: -------------------------------------------------------------------------------- 1 | REPO_NAME=$(basename $(pwd)) 2 | if [ -z $1 ]; then 3 | NAME=$REPO_NAME 4 | else 5 | NAME=$1 6 | fi 7 | docker build -t $NAME $(pwd) -------------------------------------------------------------------------------- /peerBenchPY/run/enter.sh: -------------------------------------------------------------------------------- 1 | 2 | REPO_PATH=$(pwd) ; 3 | REPO_NAME=$(basename $REPO_PATH) 4 | if [ -z $1 ]; then 5 | NAME=$REPO_NAME 6 | else 7 | NAME=$1 8 | fi 9 | docker exec -it $NAME /bin/bash -------------------------------------------------------------------------------- /peerBenchPY/run/start.sh: -------------------------------------------------------------------------------- 1 | # starts the container with the name of the current directory 2 | 3 | 4 | PWD=$(pwd) 5 | REPO=$(basename $(pwd)) # get the name of the current directory 6 | # if no argument is passed, start the container with the name of the current directory 7 | 8 | if [ $# -eq 0 ]; then 9 | NAME=$(basename $PWD) 10 | else 11 | NAME=$1 12 | fi 13 | 14 | docker run -d \ 15 | --name $NAME \ 16 | --network=host \ 17 | --restart unless-stopped \ 18 | --privileged --shm-size 4g \ 19 | -v $PWD:/app \ 20 | -v /root/.${REPO}:/root/.${REPO} \ 21 | -v /var/run/docker.sock:/var/run/docker.sock \ 22 | $REPO 23 | 24 | CONTAINER_ID=$(docker ps -q -f name=$NAME) 25 | echo "STARTING(name=$NAME repo=$REPO container=$CONTAINER_ID)" 26 | 27 | 28 | 29 | # Path: run/stop.sh -------------------------------------------------------------------------------- /peerBenchPY/run/stop.sh: -------------------------------------------------------------------------------- 1 | 2 | # if no argument is passed, stop the container with the name of the current directory 3 | 4 | # if an argument is passed, stop the container with the name of the argument 5 | REPO=$(basename $(pwd)) 6 | 7 | if [ $# -eq 0 ]; then 8 | NAME=$(basename $(pwd)) 9 | else 10 | NAME=$1 11 | fi 12 | CONTAINER_EXISTS=$(docker ps -q -f name=$NAME) 13 | CONTAINER_ID=$(docker ps -aq -f name=$NAME) 14 | 15 | echo "STOPPING(name=$NAME repo=$REPO container=$CONTAINER_ID)" 16 | if [ $CONTAINER_EXISTS ]; then 17 | docker kill $NAME 18 | docker rm $NAME 19 | fi 20 | -------------------------------------------------------------------------------- /peerBenchPY/run/test.sh: -------------------------------------------------------------------------------- 1 | 2 | PWD=$(pwd) ; 3 | REPO=$(basename $PWD) 4 | NAME=$REPO-test 5 | TEST_CMD="d test" 6 | ./run/stop.sh $NAME 7 | docker run -d --name $NAME -v $PWD:/app $REPO 8 | docker exec -it $NAME bash -c "$TEST_CMD" 9 | ./run/stop.sh $NAME 10 | -------------------------------------------------------------------------------- /peerBenchPY/tmp_consolidate_responses.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import duckdb 4 | import pandas as pd 5 | from pathlib import Path 6 | from val.utils import storage_path 7 | 8 | def flatten_dict(d, parent_key='', sep='_'): 9 | """Flatten nested dictionary by concatenating keys with separator.""" 10 | items = [] 11 | for k, v in d.items(): 12 | new_key = f"{parent_key}{sep}{k}" if parent_key else k 13 | if isinstance(v, dict): 14 | items.extend(flatten_dict(v, new_key, sep=sep).items()) 15 | else: 16 | items.append((new_key, v)) 17 | return dict(items) 18 | 19 | def is_sensitive_file(file_path): 20 | """Check if the file contains sensitive
information.""" 21 | sensitive_paths = ['key', 'private', 'secret', 'mnemonic', 'wallet'] 22 | return any(sensitive in file_path.lower() for sensitive in sensitive_paths) 23 | 24 | def consolidate_responses(): 25 | # Get the storage path 26 | storage_dir = Path(storage_path) 27 | results_dir = storage_dir / 'results' 28 | 29 | # List to store all responses 30 | all_responses = [] 31 | 32 | # Only process files in the results directory 33 | if not results_dir.exists(): 34 | print(f"Results directory not found at {results_dir}") 35 | return 36 | 37 | for root, dirs, files in os.walk(results_dir): 38 | for file in files: 39 | if file.endswith('.json'): 40 | file_path = os.path.join(root, file) 41 | 42 | # Skip sensitive files 43 | if is_sensitive_file(file_path): 44 | print(f"Skipping sensitive file: {file_path}") 45 | continue 46 | 47 | try: 48 | with open(file_path, 'r') as f: 49 | response_data = json.load(f) 50 | 51 | # Flatten the params field if it exists 52 | if 'params' in response_data: 53 | flattened_params = flatten_dict(response_data['params']) 54 | # Remove the original params and add flattened version 55 | del response_data['params'] 56 | response_data.update(flattened_params) 57 | 58 | # Add source file information 59 | response_data['source_file'] = file_path 60 | all_responses.append(response_data) 61 | except Exception as e: 62 | print(f"Error reading {file_path}: {str(e)}") 63 | 64 | # Create output directory if it doesn't exist 65 | output_dir = storage_dir / 'consolidated' 66 | output_dir.mkdir(exist_ok=True) 67 | 68 | # Save consolidated responses as JSON 69 | output_file_json = output_dir / 'all_responses.json' 70 | with open(output_file_json, 'w') as f: 71 | json.dump(all_responses, f, indent=2) 72 | 73 | # Convert to DataFrame and save as parquet using duckdb 74 | df = pd.DataFrame(all_responses) 75 | output_file_parquet = output_dir / 'all_responses.parquet' 76 | 77 | # Create a DuckDB connection 78 | con = duckdb.connect() 79 | 80 | # Register the DataFrame as a table 81 | con.register('responses', df) 82 | 83 | # Save to parquet 84 | con.execute(f"COPY responses TO '{output_file_parquet}' (FORMAT PARQUET)") 85 | 86 | # Close the connection 87 | con.close() 88 | 89 | print(f"Consolidated {len(all_responses)} responses into:") 90 | print(f"- JSON: {output_file_json}") 91 | print(f"- Parquet: {output_file_parquet}") 92 | 93 | if __name__ == "__main__": 94 | consolidate_responses() -------------------------------------------------------------------------------- /peerBenchPY/tmprun.py: -------------------------------------------------------------------------------- 1 | import val as v 2 | 3 | # Initialize val with default settings 4 | val = v.val( 5 | task='add', # Task to evaluate 6 | provider='providers.openrouter', # Model provider 7 | batch_size=16, # Number of parallel evaluations 8 | n=10 # Number of models to evaluate 9 | ) 10 | 11 | # Run an evaluation epoch 12 | results = val.epoch() 13 | 14 | 15 | all_results = val.results() 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | # View the results 24 | print(results) -------------------------------------------------------------------------------- /peerBenchPY/tmprun2.py: -------------------------------------------------------------------------------- 1 | import val as v 2 | 3 | # Initialize the evaluator 4 | val = v.val( 5 | task='add', # Task to evaluate (e.g., 'add', 'divide') 6 | provider='openrouter', # Model provider 7 | n=4, # Number of models to test 8 | samples_per_epoch=2 # Samples per evaluation epoch 9 | ) 10 | 11 
| # Run an evaluation epoch 12 | results = val.epoch() 13 | print(results) 14 | a = 1+1 -------------------------------------------------------------------------------- /peerBenchPY/tmptestcid.py: -------------------------------------------------------------------------------- 1 | from val.utils import cid_sha256_from_file, cid_sha256_from_str 2 | 3 | testfname = "requirements.txt" 4 | with open(testfname, 'r') as file: 5 | testfilecontent = file.read() 6 | cid_from_file = cid_sha256_from_file(testfname) 7 | cid_from_text_self = cid_sha256_from_str(testfilecontent) 8 | 9 | print("Test file content:") 10 | print(testfilecontent) 11 | print(f"CID from file: {cid_from_file} https://cid.ipfs.tech/#{cid_from_file}") 12 | print(f"CID from text self: {cid_from_text_self} https://cid.ipfs.tech/#{cid_from_text_self}") 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /peerBenchPY/val/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .val import val 3 | val.init(globals()) -------------------------------------------------------------------------------- /peerBenchPY/val/auth.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import hmac 3 | import json 4 | import time 5 | from typing import Dict, Optional, Any 6 | import val as v 7 | 8 | class Auth: 9 | 10 | def get_token(self, data: Dict='hey', key:Optional[str]=None, crypto_type: str = 'ecdsa', expiration: int = 3600, mode='bytes') -> str: 11 | """ 12 | Generate a JWT token with the given data 13 | Args: 14 | data: Dictionary containing the data to encode in the token 15 | expiration: Optional custom expiration time in seconds 16 | Returns: 17 | JWT token string 18 | """ 19 | if isinstance(key, str) or key == None: 20 | key = v.get_key(key, crypto_type=crypto_type) 21 | else: 22 | key = key 23 | if not isinstance(data, dict): 24 | data = {'data': data } 25 | token_data = data.copy() 26 | # Add standard JWT claims 27 | token_data.update({ 28 | 'iat': str(float(time.time())), # Issued at time 29 | 'exp': str(float(time.time() + expiration)), # Expiration time 30 | 'iss': key.key_address, # Issuer (key address) 31 | }) 32 | 33 | # Create JWT header 34 | if crypto_type != key.crypto_type: 35 | crypto_type = key.crypto_type 36 | header = { 37 | 'alg': crypto_type, 38 | 'typ': 'JWT', 39 | } 40 | 41 | # Create message to sign 42 | message = f"{self._base64url_encode(header)}.{self._base64url_encode(token_data)}" 43 | # For asymmetric algorithms, use the key's sign method 44 | signature = key.sign(message, mode='bytes') 45 | signature_encoded = self._base64url_encode(signature) 46 | # Combine to create the token 47 | 48 | assert mode in ['bytes', 'dict'], f'Invalid mode {mode}' 49 | 50 | 51 | token = f"{message}.{signature_encoded}" 52 | 53 | if mode == 'dict': 54 | return self.verify_token(token) 55 | elif mode == 'bytes': 56 | return f"{message}.{signature_encoded}" 57 | else: 58 | raise 59 | 60 | return f"{message}.{signature_encoded}" 61 | 62 | def verify_token(self, token: str) -> Dict: 63 | """ 64 | Verify and decode a JWT token 65 | """ 66 | if isinstance(token, dict) and 'token' in token: 67 | token = token['token'] 68 | # Split the token into parts 69 | header_encoded, data_encoded, signature_encoded = token.split('.') 70 | # Decode the data 71 | data = json.loads(self._base64url_decode(data_encoded)) 72 | headers = json.loads(self._base64url_decode(header_encoded)) 
73 | # Check if token is expired 74 | if 'exp' in data and float(data['exp']) < time.time(): 75 | raise Exception("Token has expired") 76 | # Verify signature 77 | message = f"{header_encoded}.{data_encoded}" 78 | signature = self._base64url_decode(signature_encoded) 79 | assert v.verify(data=message, signature=signature, address=data['iss'], crypto_type=headers['alg']), "Invalid token signature" 80 | # data['data'] = message 81 | data['time'] = data['iat'] # set time field for semanitcally easy people 82 | data['signature'] = '0x'+signature.hex() 83 | data['alg'] = headers['alg'] 84 | data['typ'] = headers['typ'] 85 | data['token'] = token 86 | data['key'] = data['iss'] 87 | 88 | return data 89 | 90 | def _base64url_encode(self, data): 91 | """Encode data in base64url format""" 92 | if isinstance(data, str): 93 | data = data.encode('utf-8') 94 | elif isinstance(data, dict): 95 | data = json.dumps(data, separators=(',', ':')).encode('utf-8') 96 | encoded = base64.urlsafe_b64encode(data).rstrip(b'=') 97 | return encoded.decode('utf-8') 98 | 99 | def _base64url_decode(self, data): 100 | """Decode base64url data""" 101 | padding = b'=' * (4 - (len(data) % 4)) 102 | return base64.urlsafe_b64decode(data.encode('utf-8') + padding) 103 | 104 | def test(self, test_data = {'fam': 'fam', 'admin': 1} , crypto_type='ecdsa'): 105 | """ 106 | Test the JWT token functionality 107 | 108 | Returns: 109 | Dictionary with test results 110 | """ 111 | # Generate a token 112 | token = self.get_token(test_data, crypto_type=crypto_type) 113 | # Verify the token 114 | decoded = self.verify_token(token) 115 | # Check if original data is in the decoded data 116 | validation_passed = all(test_data[key] == decoded[key] for key in test_data) 117 | assert validation_passed, "Decoded data does not match original data" 118 | # Test token expiration 119 | quick_token = self.get_token(test_data, expiration=0.1, crypto_type=crypto_type) 120 | time.sleep(0.2) # Wait for token to expire 121 | 122 | expired_token_caught = False 123 | try: 124 | decoded = self.verify_token(quick_token) 125 | except Exception as e: 126 | expired_token_caught = True 127 | assert expired_token_caught, "Expired token not caught" 128 | 129 | 130 | return { 131 | "token": token, 132 | "decoded_data": decoded, 133 | "crypto_type": crypto_type, 134 | "quick_token": quick_token, 135 | "expired_token_caught": expired_token_caught 136 | } -------------------------------------------------------------------------------- /peerBenchPY/val/providers/litellm/provider.py: -------------------------------------------------------------------------------- 1 | import os 2 | from litellm import completion, acompletion 3 | import asyncio 4 | 5 | class LiteLLM: 6 | def __init__(self, provider_keys: dict = {}): 7 | """ 8 | Initialize LiteLLMClient with provider API keys. 9 | 10 | Args: 11 | provider_keys (dict): API keys for providers, e.g., {'openai': 'key', 'anthropic': 'key'} 12 | """ 13 | for provider, key in provider_keys.items(): 14 | os.environ[f"{provider.upper()}_API_KEY"] = key 15 | 16 | def forward(self, model: str, messages: list, stream: bool = False): 17 | """ 18 | Generate completion synchronously. 19 | 20 | Args: 21 | model (str): Provider and model name (e.g., "openai/gpt-4o"). 22 | messages (list): List of message dicts. 23 | stream (bool): Stream response or not. 24 | 25 | Returns: 26 | Completion response. 
27 | """ 28 | response = completion(model=model, messages=messages, stream=stream) 29 | if stream: 30 | return (part.choices[0].delta.content or "" for part in response) 31 | return response.choices[0].message.content 32 | 33 | async def async_forward(self, model: str, messages: list): 34 | """ 35 | Generate completion asynchronously. 36 | 37 | Args: 38 | model (str): Provider and model name. 39 | messages (list): List of message dicts. 40 | 41 | Returns: 42 | Completion response. 43 | """ 44 | response = await acompletion(model=model, messages=messages) 45 | return response.choices[0].message.content 46 | 47 | def set_callbacks(self, callbacks: list): 48 | """ 49 | Set logging and observability callbacks. 50 | 51 | Args: 52 | callbacks (list): List of callbacks (e.g., ['lunary', 'mlflow']). 53 | """ 54 | import litellm 55 | litellm.success_callback = callbacks 56 | 57 | 58 | def test(self): 59 | provider_keys = { 60 | "openai": "your-openai-key", 61 | "anthropic": "your-anthropic-key" 62 | } 63 | 64 | client = LiteLLMClient(provider_keys) 65 | 66 | messages = [{"content": "Hello, how are you?", "role": "user"}] 67 | 68 | # synchronous 69 | response = client.forward("openai/gpt-4o", messages) 70 | print(response) 71 | 72 | # asynchronous 73 | async def main(): 74 | response = await client.async_forward("anthropic/claude-3-sonnet-20240229", messages) 75 | print(response) 76 | 77 | asyncio.run(main()) 78 | -------------------------------------------------------------------------------- /peerBenchPY/val/storage.py: -------------------------------------------------------------------------------- 1 | 2 | import json 3 | import os 4 | import time 5 | from .utils import get_text, sha256 6 | 7 | class Storage: 8 | 9 | def __init__(self, storage_dirpath='~/.val', mode='json'): 10 | self.storage_dirpath = self.abspath(storage_dirpath) 11 | self.mode = mode 12 | 13 | def put(self, path, data): 14 | path = self.get_item_path(path) 15 | dirpath = '/'.join(path.split('/')[:-1]) 16 | if not os.path.exists(dirpath): 17 | os.makedirs(dirpath, exist_ok=True) 18 | with open(path, 'w') as f: 19 | json.dump(data, f) 20 | return path 21 | 22 | def get(self, path, default=None, max_age=None, update=False): 23 | path = self.get_item_path(path) 24 | with open(path, 'r') as f: 25 | data = json.load(f) 26 | if update: 27 | max_age = 0 28 | if max_age != None: 29 | if time.time() - os.path.getmtime(path) > max_age: 30 | data = default 31 | return data 32 | 33 | def get_item_path(self, path): 34 | if not path.startswith('/'): 35 | path = f'{self.storage_dirpath}/{path}' 36 | if self.mode != None: 37 | if not path.endswith(f'.{self.mode}'): 38 | path = f'{path}.{self.mode}' 39 | return path 40 | 41 | def rm(self, path): 42 | path = self.get_item_path(path) 43 | assert os.path.exists(path), f'Failed to find path {path}' 44 | os.remove(path) 45 | return path 46 | 47 | def items(self, df=False, features=None): 48 | paths = self.paths() 49 | data = [] 50 | for p in paths: 51 | try: 52 | data.append(self.get(p)) 53 | except Exception as e: 54 | print(f'Failed to get {p} error={e}') 55 | if df: 56 | import pandas as pd 57 | data = pd.DataFrame(data) 58 | return data 59 | 60 | def paths(self): 61 | import glob 62 | paths = glob.glob(f'{self.storage_dirpath}/**/*', recursive=True) 63 | return [self.abspath(p) for p in paths if os.path.isfile(p)] 64 | 65 | def exists(self, path): 66 | path = self.get_item_path(path) 67 | return os.path.exists(path) 68 | 69 | 70 | def item2age(self): 71 | """ 72 | returns the age of the item in 
seconds 73 | """ 74 | paths = self.paths() 75 | ages = {} 76 | for p in paths: 77 | ages[p] = time.time() - os.path.getmtime(p) 78 | return ages 79 | 80 | def n(self): 81 | paths = self.items() 82 | return len(paths) 83 | 84 | def _rm_all(self): 85 | """ 86 | removes all items in the storage 87 | """ 88 | paths = self.paths() 89 | for p in paths: 90 | os.remove(p) 91 | return paths 92 | 93 | def test(self, path='test.json', data={'test': 'test', 'fam': {'test': 'test'}}): 94 | t0 = time.time() 95 | n0 = self.n() 96 | if self.exists(path): 97 | self.rm(path) 98 | assert not self.exists(path), f'Failed to delete' 99 | self.put('test.json', {'test': 'test'}) 100 | n1 = self.n() 101 | assert n1 == n0 + 1, f'Failed to add item n0={n0} n1={n1}' 102 | assert self.exists(path), f'Failed to find {path}' 103 | data = self.get(path) 104 | self.rm(path) 105 | n2 = self.n() 106 | assert n2 == n0, f'Failed to delete item n0={n0} n2={n2}' 107 | assert not self.exists(path), f'Failed to delete {path}' 108 | assert data == {'test': 'test'}, f'Failed test data={data}' 109 | t1 = time.time() 110 | print(f'Passed all tests in {t1 - t0} seconds') 111 | return {'success': True, 'msg': 'Passed all tests'} 112 | 113 | def abspath(self, path): 114 | return os.path.abspath(os.path.expanduser(path)) 115 | 116 | def path2age(self): 117 | """ 118 | returns the age of the item in seconds 119 | """ 120 | paths = self.paths() 121 | ages = {} 122 | for p in paths: 123 | ages[p] = time.time() - os.path.getmtime(p) 124 | return ages 125 | 126 | def cid(self, path, ignore_names=['__pycache__', '.DS_Store','.git', '.gitignore']): 127 | """ 128 | Get the CID of the val module 129 | """ 130 | path = self.abspath(path) 131 | if os.path.isdir(path): 132 | files = os.listdir(path) 133 | content = [] 134 | for f in files: 135 | if any([ignore in f for ignore in ignore_names]): 136 | continue 137 | f = path + '/' + f 138 | content.append(self.cid(f)) 139 | content = ''.join(content) 140 | elif os.path.isfile(path): 141 | content = get_text(path) 142 | else: 143 | raise Exception(f'Failed to find path {path}') 144 | cid = sha256(content) 145 | print(f'cid={cid} path={path}') 146 | return cid 147 | -------------------------------------------------------------------------------- /peerBenchPY/val/task/add/task.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import Optional 3 | import json 4 | class AddTask: 5 | features = ['params', 'result', 'target', 'score', 'model', 'provider', 'token'] 6 | sort_by = ['score'] 7 | sort_by_asc = [False, True] 8 | description = 'tests a model to add two numberts' 9 | output_bounds = ['', ''] 10 | temperature = 0 11 | max_tokens = 10000 12 | 13 | def sample(self , idx:int = None, sample=None,) -> dict: 14 | """ 15 | generate the sample 16 | """ 17 | # if params is not None, use it 18 | if sample is not None: 19 | return sample 20 | # generate two random numbers from the id seed 21 | idx = idx or random.randint(1, 1000) 22 | random.seed(idx) 23 | a = random.randint(1, 100) 24 | b = random.randint(1, 100) 25 | # return a sample with the two numbers 26 | return {'message': { 27 | 'a': a, 28 | 'b': b, 29 | 'goal': 'return a json object with the sum ', 30 | 'output_format': f'strictly as {self.output_bounds[0]}json(y:int){self.output_bounds[1]}' 31 | }, 32 | 'temperature': self.temperature, 33 | 'max_tokens': self.max_tokens 34 | } 35 | 36 | def forward(self, model: callable, sample:Optional[dict]=None, idx=None) -> dict: 37 | """ 38 | 
run the model on the sample 39 | Args: 40 | model: the model to run 41 | sample: the sample to run on 42 | idx: the index of the sample 43 | Returns: 44 | dict: the result of the model 45 | """ 46 | 47 | sample = self.sample(idx=idx, sample=sample) 48 | 49 | result = model(**sample) 50 | 51 | data = { 52 | 'sample': sample, 53 | 'result': result, 54 | } 55 | # step 3 : score the data 56 | data = self.score(data) 57 | return data 58 | 59 | def score(self, data:dict) -> float: 60 | sample_data = data['sample']['message'] 61 | target = str(sample_data['a']+ sample_data['b']) 62 | data['score'] = float(str(target) in data['result']) 63 | return data 64 | -------------------------------------------------------------------------------- /peerBenchPY/val/task/divide/task.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import Optional 3 | import json 4 | class AddTask: 5 | features = ['params', 'result', 'target', 'score', 'model', 'provider', 'token'] 6 | sort_by = ['score'] 7 | sort_by_asc = [False, True] 8 | description = 'tests a model to add two numbers' 9 | output_bounds = ['', ''] 10 | temperature = 0 11 | max_tokens = 10000 12 | 13 | def sample(self , idx:int = None, sample=None,) -> dict: 14 | """ 15 | generate the sample 16 | """ 17 | # if params is not None, use it 18 | if sample is not None: 19 | return sample 20 | # generate two random numbers from the id seed 21 | idx = idx or random.randint(1, 1000) 22 | random.seed(idx) 23 | a = random.randint(1, 100) 24 | b = random.randint(1, 100) 25 | # return a sample with the two numbers 26 | return {'message': { 27 | 'a': a, 28 | 'b': b, 29 | 'goal': 'return a json object with the sum', 30 | 'output_format': f'strictly as {self.output_bounds[0]}json(y:int){self.output_bounds[1]}' 31 | }, 32 | 'temperature': self.temperature, 33 | 'max_tokens': self.max_tokens 34 | } 35 | 36 | def forward(self, model: callable, sample:Optional[dict]=None, idx=None) -> dict: 37 | """ 38 | run the model on the sample 39 | Args: 40 | model: the model to run 41 | sample: the sample to run on 42 | idx: the index of the sample 43 | Returns: 44 | dict: the result of the model 45 | """ 46 | 47 | sample = self.sample(idx=idx, sample=sample) 48 | 49 | result = model(**sample) 50 | 51 | data = { 52 | 'sample': sample, 53 | 'result': result, 54 | } 55 | 56 | # step 3 : score the data 57 | return self.score(data) 58 | 59 | def score(self, data:dict) -> float: 60 | sample_data = data['sample']['message'] 61 | target = str(sample_data['a']+ sample_data['b']) 62 | data['score'] = float(str(target) in data['result']) 63 | return data 64 | 65 | def verify_sample(self, data): 66 | assert all([f in data for f in self.features]), f'Missing features {self.features}' -------------------------------------------------------------------------------- /peerBenchPY/val/task/task.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import Optional 3 | import json 4 | class Task: 5 | features = ['params', 'result', 'target', 'score', 'model', 'provider', 'token'] 6 | sort_by = ['score'] 7 | sort_by_asc = [False] 8 | description = 'tests a model to add two numberts' 9 | output_bounds = ['', ''] 10 | temperature = 0 11 | max_tokens = 10000 12 | 13 | def sample(self , idx:int = None, sample=None) -> dict: 14 | """ 15 | generate the sample 16 | """ 17 | # if params is not None, use it 18 | if sample is not None: 19 | return sample 20 | # generate two random 
numbers from the id seed 21 | if idx != None: 22 | random.seed(idx) 23 | idx = idx or random.randint(1, 1000) 24 | random.seed(idx) 25 | a = random.randint(1, 100) 26 | b = random.randint(1, 100) 27 | # return a sample with the two numbers 28 | return {'message': { 29 | 'a': a, 30 | 'b': b, 31 | 'goal': 'return a json object with the sum ', 32 | 'output_format': f'strictly as {self.output_bounds[0]}json(y:int){self.output_bounds[1]}' 33 | }, 34 | 'temperature': self.temperature, 35 | 'max_tokens': self.max_tokens 36 | } 37 | 38 | def forward(self, model: callable, sample:Optional[dict]=None, idx=None) -> dict: 39 | """ 40 | run the model on the sample 41 | Args: 42 | model: the model to run 43 | sample: the sample to run on 44 | idx: the index of the sample 45 | Returns: 46 | dict: the result of the model 47 | """ 48 | 49 | sample = self.sample(idx=idx, sample=sample) 50 | 51 | result = model(**sample) 52 | 53 | data = { 54 | 'sample': sample, 55 | 'result': result, 56 | } 57 | 58 | # step 3 : score the data 59 | return self.score(data) 60 | 61 | def score(self, data:dict) -> float: 62 | sample_data = data['sample']['message'] 63 | target = str(sample_data['a']+ sample_data['b']) 64 | data['score'] = float(str(target) in data['result']) 65 | return data 66 | 67 | def verify_sample(self, data): 68 | assert all([f in data for f in self.features]), f'Missing features {self.features}' -------------------------------------------------------------------------------- /py/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/py/.DS_Store -------------------------------------------------------------------------------- /py/Iris.csv: -------------------------------------------------------------------------------- 1 | Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species 2 | 1,5.1,3.5,1.4,0.2,Iris-setosa 3 | 2,4.9,3.0,1.4,0.2,Iris-setosa 4 | 3,4.7,3.2,1.3,0.2,Iris-setosa 5 | 4,4.6,3.1,1.5,0.2,Iris-setosa 6 | 5,5.0,3.6,1.4,0.2,Iris-setosa 7 | 6,5.4,3.9,1.7,0.4,Iris-setosa 8 | 7,4.6,3.4,1.4,0.3,Iris-setosa 9 | 8,5.0,3.4,1.5,0.2,Iris-setosa 10 | 9,4.4,2.9,1.4,0.2,Iris-setosa 11 | 10,4.9,3.1,1.5,0.1,Iris-setosa 12 | 11,5.4,3.7,1.5,0.2,Iris-setosa 13 | 12,4.8,3.4,1.6,0.2,Iris-setosa 14 | 13,4.8,3.0,1.4,0.1,Iris-setosa 15 | 14,4.3,3.0,1.1,0.1,Iris-setosa 16 | 15,5.8,4.0,1.2,0.2,Iris-setosa 17 | 16,5.7,4.4,1.5,0.4,Iris-setosa 18 | 17,5.4,3.9,1.3,0.4,Iris-setosa 19 | 18,5.1,3.5,1.4,0.3,Iris-setosa 20 | 19,5.7,3.8,1.7,0.3,Iris-setosa 21 | 20,5.1,3.8,1.5,0.3,Iris-setosa 22 | 21,5.4,3.4,1.7,0.2,Iris-setosa 23 | 22,5.1,3.7,1.5,0.4,Iris-setosa 24 | 23,4.6,3.6,1.0,0.2,Iris-setosa 25 | 24,5.1,3.3,1.7,0.5,Iris-setosa 26 | 25,4.8,3.4,1.9,0.2,Iris-setosa 27 | 26,5.0,3.0,1.6,0.2,Iris-setosa 28 | 27,5.0,3.4,1.6,0.4,Iris-setosa 29 | 28,5.2,3.5,1.5,0.2,Iris-setosa 30 | 29,5.2,3.4,1.4,0.2,Iris-setosa 31 | 30,4.7,3.2,1.6,0.2,Iris-setosa 32 | 31,4.8,3.1,1.6,0.2,Iris-setosa 33 | 32,5.4,3.4,1.5,0.4,Iris-setosa 34 | 33,5.2,4.1,1.5,0.1,Iris-setosa 35 | 34,5.5,4.2,1.4,0.2,Iris-setosa 36 | 35,4.9,3.1,1.5,0.1,Iris-setosa 37 | 36,5.0,3.2,1.2,0.2,Iris-setosa 38 | 37,5.5,3.5,1.3,0.2,Iris-setosa 39 | 38,4.9,3.1,1.5,0.1,Iris-setosa 40 | 39,4.4,3.0,1.3,0.2,Iris-setosa 41 | 40,5.1,3.4,1.5,0.2,Iris-setosa 42 | 41,5.0,3.5,1.3,0.3,Iris-setosa 43 | 42,4.5,2.3,1.3,0.3,Iris-setosa 44 | 43,4.4,3.2,1.3,0.2,Iris-setosa 45 | 44,5.0,3.5,1.6,0.6,Iris-setosa 46 | 45,5.1,3.8,1.9,0.4,Iris-setosa 47 | 
46,4.8,3.0,1.4,0.3,Iris-setosa 48 | 47,5.1,3.8,1.6,0.2,Iris-setosa 49 | 48,4.6,3.2,1.4,0.2,Iris-setosa 50 | 49,5.3,3.7,1.5,0.2,Iris-setosa 51 | 50,5.0,3.3,1.4,0.2,Iris-setosa 52 | 51,7.0,3.2,4.7,1.4,Iris-versicolor 53 | 52,6.4,3.2,4.5,1.5,Iris-versicolor 54 | 53,6.9,3.1,4.9,1.5,Iris-versicolor 55 | 54,5.5,2.3,4.0,1.3,Iris-versicolor 56 | 55,6.5,2.8,4.6,1.5,Iris-versicolor 57 | 56,5.7,2.8,4.5,1.3,Iris-versicolor 58 | 57,6.3,3.3,4.7,1.6,Iris-versicolor 59 | 58,4.9,2.4,3.3,1.0,Iris-versicolor 60 | 59,6.6,2.9,4.6,1.3,Iris-versicolor 61 | 60,5.2,2.7,3.9,1.4,Iris-versicolor 62 | 61,5.0,2.0,3.5,1.0,Iris-versicolor 63 | 62,5.9,3.0,4.2,1.5,Iris-versicolor 64 | 63,6.0,2.2,4.0,1.0,Iris-versicolor 65 | 64,6.1,2.9,4.7,1.4,Iris-versicolor 66 | 65,5.6,2.9,3.6,1.3,Iris-versicolor 67 | 66,6.7,3.1,4.4,1.4,Iris-versicolor 68 | 67,5.6,3.0,4.5,1.5,Iris-versicolor 69 | 68,5.8,2.7,4.1,1.0,Iris-versicolor 70 | 69,6.2,2.2,4.5,1.5,Iris-versicolor 71 | 70,5.6,2.5,3.9,1.1,Iris-versicolor 72 | 71,5.9,3.2,4.8,1.8,Iris-versicolor 73 | 72,6.1,2.8,4.0,1.3,Iris-versicolor 74 | 73,6.3,2.5,4.9,1.5,Iris-versicolor 75 | 74,6.1,2.8,4.7,1.2,Iris-versicolor 76 | 75,6.4,2.9,4.3,1.3,Iris-versicolor 77 | 76,6.6,3.0,4.4,1.4,Iris-versicolor 78 | 77,6.8,2.8,4.8,1.4,Iris-versicolor 79 | 78,6.7,3.0,5.0,1.7,Iris-versicolor 80 | 79,6.0,2.9,4.5,1.5,Iris-versicolor 81 | 80,5.7,2.6,3.5,1.0,Iris-versicolor 82 | 81,5.5,2.4,3.8,1.1,Iris-versicolor 83 | 82,5.5,2.4,3.7,1.0,Iris-versicolor 84 | 83,5.8,2.7,3.9,1.2,Iris-versicolor 85 | 84,6.0,2.7,5.1,1.6,Iris-versicolor 86 | 85,5.4,3.0,4.5,1.5,Iris-versicolor 87 | 86,6.0,3.4,4.5,1.6,Iris-versicolor 88 | 87,6.7,3.1,4.7,1.5,Iris-versicolor 89 | 88,6.3,2.3,4.4,1.3,Iris-versicolor 90 | 89,5.6,3.0,4.1,1.3,Iris-versicolor 91 | 90,5.5,2.5,4.0,1.3,Iris-versicolor 92 | 91,5.5,2.6,4.4,1.2,Iris-versicolor 93 | 92,6.1,3.0,4.6,1.4,Iris-versicolor 94 | 93,5.8,2.6,4.0,1.2,Iris-versicolor 95 | 94,5.0,2.3,3.3,1.0,Iris-versicolor 96 | 95,5.6,2.7,4.2,1.3,Iris-versicolor 97 | 96,5.7,3.0,4.2,1.2,Iris-versicolor 98 | 97,5.7,2.9,4.2,1.3,Iris-versicolor 99 | 98,6.2,2.9,4.3,1.3,Iris-versicolor 100 | 99,5.1,2.5,3.0,1.1,Iris-versicolor 101 | 100,5.7,2.8,4.1,1.3,Iris-versicolor 102 | 101,6.3,3.3,6.0,2.5,Iris-virginica 103 | 102,5.8,2.7,5.1,1.9,Iris-virginica 104 | 103,7.1,3.0,5.9,2.1,Iris-virginica 105 | 104,6.3,2.9,5.6,1.8,Iris-virginica 106 | 105,6.5,3.0,5.8,2.2,Iris-virginica 107 | 106,7.6,3.0,6.6,2.1,Iris-virginica 108 | 107,4.9,2.5,4.5,1.7,Iris-virginica 109 | 108,7.3,2.9,6.3,1.8,Iris-virginica 110 | 109,6.7,2.5,5.8,1.8,Iris-virginica 111 | 110,7.2,3.6,6.1,2.5,Iris-virginica 112 | 111,6.5,3.2,5.1,2.0,Iris-virginica 113 | 112,6.4,2.7,5.3,1.9,Iris-virginica 114 | 113,6.8,3.0,5.5,2.1,Iris-virginica 115 | 114,5.7,2.5,5.0,2.0,Iris-virginica 116 | 115,5.8,2.8,5.1,2.4,Iris-virginica 117 | 116,6.4,3.2,5.3,2.3,Iris-virginica 118 | 117,6.5,3.0,5.5,1.8,Iris-virginica 119 | 118,7.7,3.8,6.7,2.2,Iris-virginica 120 | 119,7.7,2.6,6.9,2.3,Iris-virginica 121 | 120,6.0,2.2,5.0,1.5,Iris-virginica 122 | 121,6.9,3.2,5.7,2.3,Iris-virginica 123 | 122,5.6,2.8,4.9,2.0,Iris-virginica 124 | 123,7.7,2.8,6.7,2.0,Iris-virginica 125 | 124,6.3,2.7,4.9,1.8,Iris-virginica 126 | 125,6.7,3.3,5.7,2.1,Iris-virginica 127 | 126,7.2,3.2,6.0,1.8,Iris-virginica 128 | 127,6.2,2.8,4.8,1.8,Iris-virginica 129 | 128,6.1,3.0,4.9,1.8,Iris-virginica 130 | 129,6.4,2.8,5.6,2.1,Iris-virginica 131 | 130,7.2,3.0,5.8,1.6,Iris-virginica 132 | 131,7.4,2.8,6.1,1.9,Iris-virginica 133 | 132,7.9,3.8,6.4,2.0,Iris-virginica 134 | 133,6.4,2.8,5.6,2.2,Iris-virginica 135 | 
134,6.3,2.8,5.1,1.5,Iris-virginica 136 | 135,6.1,2.6,5.6,1.4,Iris-virginica 137 | 136,7.7,3.0,6.1,2.3,Iris-virginica 138 | 137,6.3,3.4,5.6,2.4,Iris-virginica 139 | 138,6.4,3.1,5.5,1.8,Iris-virginica 140 | 139,6.0,3.0,4.8,1.8,Iris-virginica 141 | 140,6.9,3.1,5.4,2.1,Iris-virginica 142 | 141,6.7,3.1,5.6,2.4,Iris-virginica 143 | 142,6.9,3.1,5.1,2.3,Iris-virginica 144 | 143,5.8,2.7,5.1,1.9,Iris-virginica 145 | 144,6.8,3.2,5.9,2.3,Iris-virginica 146 | 145,6.7,3.3,5.7,2.5,Iris-virginica 147 | 146,6.7,3.0,5.2,2.3,Iris-virginica 148 | 147,6.3,2.5,5.0,1.9,Iris-virginica 149 | 148,6.5,3.0,5.2,2.0,Iris-virginica 150 | 149,6.2,3.4,5.4,2.3,Iris-virginica 151 | 150,5.9,3.0,5.1,1.8,Iris-virginica 152 | -------------------------------------------------------------------------------- /py/Iris.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/py/Iris.parquet -------------------------------------------------------------------------------- /py/compute_ipfs_cid.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import hashlib 3 | import base64 4 | import base58 5 | import struct 6 | #TODO redo this with some library I just don't know which one we want to use https://github.com/hashberg-io/multiformats or https://github.com/PancakesArchitect/py-multiformats-cid have not been touched for a while but i guess its a simple algo that doesn't need to be touched 7 | 8 | def compute_ipfs_cids(file_path: str) -> tuple[str, str, str]: 9 | print("hello3") 10 | try: 11 | # Read file in binary mode 12 | with open(file_path, 'rb') as f: 13 | file_data = f.read() 14 | 15 | # Compute SHA2-256 hash of raw data 16 | h = hashlib.sha256(file_data).digest() 17 | print(f"SHA256 raw digest: {h.hex()}") 18 | 19 | # Construct the multihash 20 | # 0x12 = sha2-256 identifier 21 | # 0x20 = length (32 bytes) of sha2-256 hash 22 | multihash = bytes([0x12, 0x20]) + h 23 | 24 | # CIDv0 (base58btc) 25 | cidv0 = base58.b58encode(multihash).decode('utf-8') 26 | 27 | # CIDv1 with raw codec (base32) 28 | # 0x01 = CID version 1 29 | # 0x55 = raw binary codec 30 | cidv1_bytes = bytes([0x01, 0x55]) + multihash 31 | cidv1_base32 = 'b' + base64.b32encode(cidv1_bytes).decode('utf-8').lower() 32 | 33 | 34 | return cidv1_base32 35 | 36 | except Exception as e: 37 | raise Exception(f"Failed to compute CID: {str(e)}") 38 | 39 | def main(): 40 | parser = argparse.ArgumentParser(description='Compute IPFS CID for a file using SHA2-256') 41 | parser.add_argument('file_path', help='Path to the file') 42 | 43 | args = parser.parse_args() 44 | 45 | try: 46 | cidv1_raw = compute_ipfs_cids(args.file_path) 47 | print(f"IPFS CIDv1 (base32, raw): {cidv1_raw}") 48 | 49 | except Exception as e: 50 | print(f"Error: {e}") 51 | exit(1) 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /py/create_secp256k1_jws.py: -------------------------------------------------------------------------------- 1 | import json 2 | from jose import jws 3 | from jose.constants import ALGORITHMS 4 | from ecdsa import SigningKey 5 | import binascii 6 | 7 | def create_secp256k1_jws(): 8 | # The private key (in hex) 9 | private_key_hex = "67499ec186126dcdb98f2af7f396d1b410cc979959b56fb3bc6ca5d294e76bb2" 10 | 11 | # Convert hex private key to bytes 12 | private_key_bytes = binascii.unhexlify(private_key_hex) 13 | 14 | # Create 
signing key from private key bytes 15 | signing_key = SigningKey.from_string(private_key_bytes, curve=ALGORITHMS.ES256K) 16 | 17 | # The message to sign 18 | payload = { 19 | "hello": "world" 20 | } 21 | 22 | try: 23 | # Create JWS signature 24 | signature = jws.sign(payload, signing_key, algorithm=ALGORITHMS.ES256K) 25 | 26 | print("\nPayload:") 27 | print(json.dumps(payload, indent=2)) 28 | print("\nJWS Signature:") 29 | print(signature) 30 | 31 | # Verify the signature (optional verification step) 32 | verify_key = signing_key.get_verifying_key() 33 | try: 34 | verified_payload = jws.verify(signature, verify_key, algorithms=ALGORITHMS.ES256K) 35 | print("\nSignature verified successfully!") 36 | print("Verified payload:", json.loads(verified_payload)) 37 | except Exception as e: 38 | print("\nSignature verification failed:", str(e)) 39 | 40 | except Exception as e: 41 | print(f"Error creating JWS: {str(e)}") 42 | exit(1) 43 | 44 | if __name__ == "__main__": 45 | print("hello5.\n") 46 | create_secp256k1_jws() 47 | -------------------------------------------------------------------------------- /py/create_secp256k1_jws_rfc.py: -------------------------------------------------------------------------------- 1 | import json 2 | import binascii 3 | from joserfc import jws 4 | from joserfc.jwk import ECKey 5 | from ecdsa import SigningKey, SECP256k1 6 | import base64 7 | 8 | def create_secp256k1_jws(): 9 | # The private key (in hex) 10 | private_key_hex = "67499ec186126dcdb98f2af7f396d1b410cc979959b56fb3bc6ca5d294e76bb2" 11 | private_key_bytes = binascii.unhexlify(private_key_hex) 12 | 13 | # Create ECDSA signing key and get public key points 14 | signing_key = SigningKey.from_string(private_key_bytes, curve=SECP256k1) 15 | verifying_key = signing_key.get_verifying_key() 16 | 17 | # Get public key coordinates 18 | x_coord = verifying_key.pubkey.point.x() 19 | y_coord = verifying_key.pubkey.point.y() 20 | 21 | # Convert coordinates to base64url 22 | x_b64 = base64.urlsafe_b64encode(x_coord.to_bytes(32, 'big')).rstrip(b'=').decode('ascii') 23 | y_b64 = base64.urlsafe_b64encode(y_coord.to_bytes(32, 'big')).rstrip(b'=').decode('ascii') 24 | d_b64 = base64.urlsafe_b64encode(private_key_bytes).rstrip(b'=').decode('ascii') 25 | 26 | # Create EC Key with all required parameters 27 | key = ECKey.import_key({ 28 | "kty": "EC", 29 | "crv": "secp256k1", 30 | "x": x_b64, 31 | "y": y_b64, 32 | "d": d_b64 33 | }) 34 | 35 | # The message to sign 36 | payload = json.dumps({"hello": "world"}).encode('utf-8') 37 | 38 | try: 39 | # Create protected header 40 | protected = {"alg": "ES256K"} 41 | 42 | # Create JWS signature 43 | token = jws.serialize_compact(protected, payload, key) 44 | 45 | print("\nPayload:") 46 | print(json.dumps({"hello": "world"}, indent=2)) 47 | print("\nJWS Token:") 48 | print(token) 49 | 50 | # Verify the signature 51 | try: 52 | verified = jws.verify_compact(token, key) 53 | print("\nSignature verified successfully!") 54 | print("Verified payload:", json.loads(verified.payload)) 55 | except Exception as e: 56 | print("\nSignature verification failed:", str(e)) 57 | 58 | except Exception as e: 59 | print(f"Error creating JWS: {str(e)}") 60 | exit(1) 61 | 62 | if __name__ == "__main__": 63 | print("hello511.\n") 64 | create_secp256k1_jws() 65 | -------------------------------------------------------------------------------- /py/create_secp256k1_jwt.py: -------------------------------------------------------------------------------- 1 | import jwt 2 | import json 3 | import binascii 4 | 
from ecdsa import SigningKey, SECP256k1 5 | from ecdsa.util import sigencode_der 6 | import base64 7 | 8 | def create_key_pair(private_key_hex: str): 9 | private_key_bytes = binascii.unhexlify(private_key_hex) 10 | signing_key = SigningKey.from_string(private_key_bytes, curve=SECP256k1) 11 | verifying_key = signing_key.get_verifying_key() 12 | return signing_key, verifying_key 13 | 14 | def create_secp256k1_jwt(): 15 | signing_key, verifying_key = create_key_pair( 16 | "67499ec186126dcdb98f2af7f396d1b410cc979959b56fb3bc6ca5d294e76bb2" 17 | ) 18 | 19 | payload = { 20 | "hello": "world" 21 | } 22 | 23 | try: 24 | # Create JWT with ES256K algorithm 25 | token = jwt.encode( 26 | payload, 27 | signing_key.to_pem(), 28 | algorithm="ES256K" 29 | ) 30 | 31 | print("\nJWT Creation:") 32 | print("Payload:", json.dumps(payload, indent=2)) 33 | print("JWT Token:", token) 34 | 35 | # Verify JWT 36 | verified_payload = jwt.decode( 37 | token, 38 | verifying_key.to_pem(), 39 | algorithms=["ES256K"] 40 | ) 41 | print("JWT Verification successful!") 42 | 43 | except Exception as e: 44 | print(f"Error with JWT: {str(e)}") 45 | exit(1) 46 | 47 | def create_secp256k1_jws(): 48 | signing_key, verifying_key = create_key_pair( 49 | "67499ec186126dcdb98f2af7f396d1b410cc979959b56fb3bc6ca5d294e76bb2" 50 | ) 51 | 52 | payload = { 53 | "hello": "world" 54 | } 55 | 56 | try: 57 | # Create JWS with ES256K algorithm 58 | header = { 59 | "alg": "ES256K", 60 | "typ": "JWS" 61 | } 62 | 63 | token = jwt.encode( 64 | payload, 65 | signing_key.to_pem(), 66 | algorithm="ES256K", 67 | headers=header 68 | ) 69 | 70 | print("\nJWS Creation:") 71 | print("Payload:", json.dumps(payload, indent=2)) 72 | print("JWS Token:", token) 73 | 74 | # Verify JWS 75 | verified_payload = jwt.decode( 76 | token, 77 | verifying_key.to_pem(), 78 | algorithms=["ES256K"] 79 | ) 80 | print("JWS Verification successful!") 81 | 82 | except Exception as e: 83 | print(f"Error with JWS: {str(e)}") 84 | exit(1) 85 | 86 | if __name__ == "__main__": 87 | print("Creating JWT and JWS examples... 
2") 88 | create_secp256k1_jwt() 89 | create_secp256k1_jws() 90 | -------------------------------------------------------------------------------- /py/csv_to_parquet.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | import os 4 | 5 | def convert_csv_to_parquet(input_csv, output_parquet=None): 6 | try: 7 | # Read CSV 8 | df = pd.read_csv(input_csv) 9 | 10 | # If no output path specified, use input path with .parquet extension 11 | if output_parquet is None: 12 | output_parquet = os.path.splitext(input_csv)[0] + '.parquet' 13 | 14 | # Save as Parquet 15 | df.to_parquet(output_parquet, engine='pyarrow') 16 | return output_parquet 17 | except Exception as e: 18 | raise Exception(f"Conversion failed: {str(e)}") 19 | 20 | def main(): 21 | parser = argparse.ArgumentParser(description='Convert CSV file to Parquet format') 22 | parser.add_argument('input_csv', help='Path to input CSV file') 23 | parser.add_argument('--output', '-o', help='Path to output Parquet file (optional)') 24 | 25 | args = parser.parse_args() 26 | 27 | try: 28 | output_path = convert_csv_to_parquet(args.input_csv, args.output) 29 | print(f"Successfully converted to: {output_path}") 30 | except Exception as e: 31 | print(f"Error: {e}") 32 | exit(1) 33 | 34 | if __name__ == "__main__": 35 | main() 36 | -------------------------------------------------------------------------------- /py/display_parquet.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import polars as pl 3 | from typing import Optional 4 | 5 | def display_parquet(file_path: str, max_rows: Optional[int] = None, max_cols: Optional[int] = None): 6 | try: 7 | # Read the parquet file using polars 8 | df = pl.read_parquet(file_path) 9 | 10 | # Print basic information about the dataset 11 | print("\n=== Dataset Info ===") 12 | print(f"Number of rows: {df.height}") 13 | print(f"Number of columns: {df.width}") 14 | 15 | print("\n=== Column Names ===") 16 | print(df.columns) 17 | 18 | print("\n=== Data Types ===") 19 | print(df.schema) 20 | 21 | print("\n=== Data Preview ===") 22 | if max_rows: 23 | print(df.head(max_rows)) 24 | else: 25 | print(df) 26 | 27 | # Print some basic statistics for numeric columns 28 | numeric_cols = [name for name, dtype in df.schema.items() if dtype in [pl.Float64, pl.Int64]] 29 | if numeric_cols: 30 | print("\n=== Numeric Columns Statistics ===") 31 | print(df.select(numeric_cols).describe()) 32 | 33 | except Exception as e: 34 | print(f"Error reading/displaying Parquet file: {str(e)}") 35 | exit(1) 36 | 37 | def main(): 38 | parser = argparse.ArgumentParser(description='Display Parquet file contents with nice formatting') 39 | parser.add_argument('parquet_file', help='Path to input Parquet file') 40 | parser.add_argument('--max-rows', type=int, help='Maximum number of rows to display') 41 | parser.add_argument('--max-cols', type=int, help='Maximum number of columns to display') 42 | 43 | args = parser.parse_args() 44 | display_parquet(args.parquet_file, args.max_rows, args.max_cols) 45 | 46 | if __name__ == "__main__": 47 | print("hi2 .\n") 48 | main() 49 | -------------------------------------------------------------------------------- /py/hello_world.txt: -------------------------------------------------------------------------------- 1 | hello world -------------------------------------------------------------------------------- /py/outline.md: 
-------------------------------------------------------------------------------- 1 | # Assumptions 2 | - We want to be compatible with closed-source systems. If someone wants their open-source model validated, they can just put it on OpenRouter or any other host and list it like that 3 | 4 | 5 | # Validator functional components 6 | - SDK framework that makes it easy for different AI model providers to add a plugin for their specific API to a generic model type function. For instance, a machine translation SDK function will have many adapters for different AI model providers with their different API endpoints and authentication types. [ Similar to litellm https://github.com/BerriAI/litellm 7 | ] {for laziness we can use OpenRouter through litellm so we don't need to go get so many API keys} 8 | - Validators must send each prompt to all providers being tested simultaneously; they cannot do it sequentially. They can wait for the first prompt to be finished before sending the next for rate limiting, but the providers must be prompted simultaneously. 9 | - Each of the validators needs to have a place to store all of their API keys for each provider. [can start as env variables but later needs to be some KMS] 10 | - Validator registration simply connects their public key to an organization's email or domain. 11 | - Validators decide which providers they want to test; this is a simple config. It's a human decision whom to test 12 | - Validators must have the ability to upload scored responses. (The validator will have their employees score each response in Excel or Google Sheets.) How to aggregate the scores per prompt (a simple average, accuracy, or a more complex BLEU score) is determined by the Evaluation spec. 13 | - SDK helps produce signed evaluation publications in the schema example_evaluation_test_stream_publication.yaml (these should be submitted within 7 days of the prompt, or however long the Evaluation Stream stipulates); a hypothetical sketch of this signing step is shown after py/run_evaluation_pseudo.py below 14 | 15 | 16 | # Centralized Information aggregation server 17 | - API accepting evaluation JSON [auth: pk signature validation checked. JWS and secp256k1] 18 | - 19 | - API accepting new Evaluation Test Stream description doc [No AUTH required] 20 | - This could have a very simple web form since no key is needed 21 | - AKA functions that help AI researchers and validators publish new test data creation standards and evaluation methods. 22 | - API accepting a provider saying they want to be tested for a certain Evaluation Test Stream [auth: pk signature validation checked] 23 | - AKA function to help AI model providers announce that they want to get benchmarked with fresh data on new test data evaluation streams 24 | - [later] A background process that checks if one of the API-submitted JSON messages was not seen on WAKU gossip; if so, the server should push the message again via WAKU 25 | - This server should be designed in a very lean manner such that the JS version can just be a client-side wrapper that also calls a local version of this server. 26 | 27 | # Identity 28 | - We will use decentralized identifiers (DIDs).
To start we'll use a simple did:key or did:pkh where we resolve the endpoint document on our own centralized servers, but while coding we should keep in mind that we will want to support other DID methods 29 | - In general we just agree to use the concept of a did:string that is the immutable ID of a Validator, and there is a document describing more about them (name, etc.) which must be resolved from some server 30 | - We decided that we will run a centralized server ourselves for resolving the extra metadata 31 | - [Later] did:plc or similar that allows rotation and where the pubkey is not inside the identity string 32 | 33 | Everything above is POC/MVP, April 7 2025 deadline 34 | 35 | 36 | # Things that can be done in a second step [Later] (Release target May 22, 2025) 37 | - Open data publication of all the test data that was used and what answers different models gave (aka the audit log) 38 | - Web UI displays results in aggregate. 39 | - Distributing registrations and evaluations via WAKU or another open decentralized message bus 40 | - Users/validators can spot-check the responses that validators recorded by re-prompting the provider and checking whether they get the same or similar-quality response, to make sure that the validator is not lying about a provider giving a bad response to a prompt 41 | 42 | 43 | # Provider functionality 44 | - No real provider code needs to be written. Providers just run their own APIs as they do for regular business. 45 | -------------------------------------------------------------------------------- /py/requirements.txt: -------------------------------------------------------------------------------- 1 | litellm 2 | pandas 3 | pyarrow -------------------------------------------------------------------------------- /py/run_evaluation_pseudo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import asyncio 3 | import csv 4 | import pandas as pd 5 | import polars as pl 6 | from litellm import acompletion 7 | from datetime import datetime 8 | from compute_ipfs_cid import compute_ipfs_cids # Add this import at the top 9 | 10 | os.environ["validator_privatekey_hot"] = "67499ec186126dcdb98f2af7f396d1b410cc979959b56fb3bc6ca5d294e76bb2" #cafe 11 | os.environ["OPENAI_API_KEY"] = "your-api-key" #later add pulling these configs from a secrets file 12 | os.environ["OPENROUTER_API_KEY"] = "openrouter_api_key" 13 | os.environ["XAI_API_KEY"] = "your-api-key" 14 | os.environ["ANTHROPIC_API_KEY"] = "your-api-key" 15 | 16 | to_be_evaluated = [ "openai/gpt-4o","anthropic/claude-3-sonnet-20240229","xai/grok-2-latest","openrouter/google/palm-2-chat-bison"] #later add pulling the list of to-be-evaluated models from a config file 17 | 18 | test_data = [ "what are the differences between ed25519 and secp256k1 when should I use one over the other", "can I derive a secp256k1 private key from a ed25519 private key and in the reverse" , "what is a self hosted fully opensource laternative to openrouter"] #later add pulling this from a csv and add pulling this from a server 19 | 20 | async def get_model_completion(model_name, test_question): 21 | try: 22 | start_time = datetime.now().isoformat() 23 | response = await acompletion( # litellm's async API, so all providers can be prompted concurrently 24 | model=model_name, 25 | messages=[{"content": test_question, "role": "user"}] 26 | ) 27 | end_time = datetime.now().isoformat() 28 | return { 29 | "model": model_name, 30 | "response": response, 31 | "error": None, 32 | "request_timestamp": start_time, 33 | "response_timestamp": end_time 34 | } 35 | except Exception as e: 36 |
end_time = datetime.now().isoformat() 37 | return { 38 | "model": model_name, 39 | "error": str(e), 40 | "response": None, 41 | "request_timestamp": start_time, 42 | "response_timestamp": end_time 43 | } 44 | 45 | async def evaluate_models(): 46 | all_results = [] 47 | for test_question in test_data: 48 | tasks = [get_model_completion(model, test_question) for model in to_be_evaluated] 49 | question_results = await asyncio.gather(*tasks) 50 | 51 | # Flatten results for this question 52 | for model_result in question_results: 53 | all_results.append({ 54 | "question": test_question, 55 | "model": model_result['model'], 56 | "response": model_result['response'], 57 | "error": model_result['error'], 58 | "request_timestamp": model_result['request_timestamp'], 59 | "response_timestamp": model_result['response_timestamp'] 60 | }) 61 | return all_results 62 | 63 | def save_results_to_csv(results, filename='evaluation_results.csv'): 64 | filepath = os.path.join(os.path.dirname(__file__), filename) 65 | fieldnames = ['question', 'model', 'response', 'error', 'request_timestamp', 'response_timestamp'] 66 | 67 | with open(filepath, 'w', newline='', encoding='utf-8') as csvfile: 68 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 69 | writer.writeheader() 70 | writer.writerows(results) 71 | return filepath 72 | 73 | def save_results_to_parquet(results, filename='evaluation_results.parquet'): 74 | filepath = os.path.join(os.path.dirname(__file__), filename) 75 | # Convert results list to polars DataFrame and save 76 | df = pl.DataFrame(results) 77 | df.write_parquet(filepath) 78 | return filepath 79 | 80 | if __name__ == "__main__": 81 | print("hello4.\n") 82 | results = asyncio.run(evaluate_models()) 83 | current_question = None 84 | for result in results: 85 | if current_question != result['question']: 86 | current_question = result['question'] 87 | print(f"\nQuestion: {current_question}") 88 | print("-" * 50) 89 | print(f"\nModel: {result['model']}") 90 | print(result['response']) 91 | if result['model'] == to_be_evaluated[-1]: # If it's the last model 92 | print("\n" + "=" * 80) 93 | 94 | # Save results to CSV and Parquet 95 | csv_path = save_results_to_csv(results) 96 | parquet_path = save_results_to_parquet(results) 97 | print(f"\nResults saved to CSV: {csv_path}") 98 | print(f"Results saved to Parquet: {parquet_path}") 99 | 100 | # Compute and display IPFS CID for the parquet file 101 | try: 102 | cidv1 = compute_ipfs_cids(parquet_path) 103 | 104 | print(f"Parquet file IPFS CID: {cidv1}") 105 | except Exception as e: 106 | print(f"Error computing IPFS CID: {str(e)}") -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | polars 2 | base58 3 | python-jose[cryptography] 4 | ecdsa 5 | joserfc 6 | PyJWT[crypto] -------------------------------------------------------------------------------- /sample_data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/.DS_Store 
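The outline's item on signed evaluation publications and py/run_evaluation_pseudo.py together imply a signing step that the scripts above stop short of. The sketch below is a hypothetical illustration and not a file from this repository: it reuses compute_ipfs_cids from py/compute_ipfs_cid.py, the ecdsa package already listed in requirements.txt, and the same throwaway secp256k1 test key used by the other py/ scripts. The function name make_signed_publication and the output field names are assumptions for illustration, not the publication schema defined in sample_data/.

```python
# Hypothetical sketch: content-address a results file, then sign the CID with the
# validator's secp256k1 hot key. Names and fields are illustrative assumptions.
import json
import binascii
from hashlib import sha256

from ecdsa import SigningKey, SECP256k1

from compute_ipfs_cid import compute_ipfs_cids  # helper defined in py/compute_ipfs_cid.py


def make_signed_publication(results_path: str, private_key_hex: str) -> dict:
    # 1. Content-address the results file so anyone can verify the exact bytes being scored.
    cid = compute_ipfs_cids(results_path)

    # 2. Sign the CID string deterministically (RFC 6979) with the validator's secp256k1 key.
    signing_key = SigningKey.from_string(binascii.unhexlify(private_key_hex), curve=SECP256k1)
    signature = signing_key.sign_deterministic(cid.encode("utf-8"), hashfunc=sha256)

    # 3. Sanity-check the signature round-trips before publishing.
    verifying_key = signing_key.get_verifying_key()
    verifying_key.verify(signature, cid.encode("utf-8"), hashfunc=sha256)

    # 4. Bundle into a publication-style dict; the real schema would follow the YAML
    #    examples under sample_data/.
    return {
        "results_cid": cid,
        "signature": "0x" + signature.hex(),
        "public_key": "0x" + verifying_key.to_string().hex(),
    }


if __name__ == "__main__":
    pub = make_signed_publication(
        "evaluation_results.parquet",  # output of run_evaluation_pseudo.py
        "67499ec186126dcdb98f2af7f396d1b410cc979959b56fb3bc6ca5d294e76bb2",  # same test key as the other py/ scripts
    )
    print(json.dumps(pub, indent=2))
```

A consumer (for example the aggregation server described in py/outline.md) could recompute the CID from the fetched parquet file and verify the signature against the validator's registered public key; the exact envelope (plain signature vs. JWS) would be decided by the publication schema.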
-------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt00/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt00/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt00/anthropic-claude-3.7-sonnet_06042025_a/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt00/anthropic-claude-3.7-sonnet_06042025_a/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt00/google-Gemini-2.5-Pro_06042025_c/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt00/google-Gemini-2.5-Pro_06042025_c/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt00/metadata.yaml: -------------------------------------------------------------------------------- 1 | prompt: 2 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt00/openAI-o1_06042025_a/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt00/openAI-o1_06042025_a/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt0_wave/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt0_wave/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt0_wave/anthropic-claude-3.7-sonnet_06042025_a/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt0_wave/anthropic-claude-3.7-sonnet_06042025_a/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt0_wave/anthropic-claude-3.7-sonnet_06042025_a/great-wave.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | The Great Wave off Kanagawa - SVG 7 | 23 | 24 | 25 |
26 | 27 | 28 | 38 | 39 | 40 | 45 | 46 | 47 | 52 | 53 | 54 | 58 | 59 | 60 | 64 | 65 | 66 | 70 | 71 | 72 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 |
92 | 93 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt0_wave/google-Gemini-2.5-Pro_06042025_a/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt0_wave/google-Gemini-2.5-Pro_06042025_a/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt0_wave/google-Gemini-2.5-Pro_06042025_a/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Great Wave SVG 7 | 25 | 26 | 27 | 28 | 29 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt0_wave/metadata.yaml: -------------------------------------------------------------------------------- 1 | prompt: using html , svg and vanila js create a scene that looks like The Great Wave off Kanagawa. Its a large wave asian style of art. But your only allowed to use SVG and 1 color. Keep the code small. Put it all into one file. Its a static image no animations. -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt0_wave/openAI-o1_06042025_a/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt0_wave/openAI-o1_06042025_a/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt0_wave/openAI-o1_06042025_a/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | The Great Wave (Single Color) 6 | 7 | 8 | 9 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt1_tree/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt1_tree/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt1_tree/anthropic-claude-3.7-sonnet_06042025_a/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 18 | 19 | 20 | 21 | 61 | 62 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt1_tree/google-Gemini-2.5-Pro_06042025_a/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Fractal Tree 5 | 9 | 10 | 11 | 12 | 60 | 61 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt1_tree/metadata.yaml: -------------------------------------------------------------------------------- 1 | prompt: create an svg tree made from simple fractal formulas with randomness build it. Colors are grey lines on black background. 
Only generate a new tree on load. Use the full window for drawing. Do not clip the tree on the edge of the screen. Fade out the lines as you get towards the leaves. Try to write this using as little code as possible. Do not use any libraries only vanila js is allowed. I will look at the result only in chrome v8. 2 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt1_tree/openAI-o1_06042025_a/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 31 | 32 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt2_torus/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt2_torus/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt2_torus/anthropic-claude-3.7-sonnet_06042025_a/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt2_torus/anthropic-claude-3.7-sonnet_06042025_a/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt2_torus/anthropic-claude-3.7-sonnet_06042025_a/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 9 | 10 | 11 | 12 | 118 | 119 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt2_torus/google-Gemini-2.5-Pro_06042025_c/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Torus Wireframe Animation 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt2_torus/google-Gemini-2.5-Pro_06042025_c/script.js: -------------------------------------------------------------------------------- 1 | const svg = document.getElementById('torus-svg'); 2 | const svgNS = "http://www.w3.org/2000/svg"; 3 | 4 | let width = window.innerWidth; 5 | let height = window.innerHeight; 6 | 7 | // --- Randomize Parameters --- 8 | const R = Math.random() * 100 + 100; // Major Radius (100-200) 9 | const r = Math.random() * 50 + 25; // Minor Radius (25-75) ensures hole changes 10 | const numMajor = Math.floor(Math.random() * 20) + 20; // Density Major segments (20-40) 11 | const numMinor = Math.floor(Math.random() * 10) + 10; // Density Minor segments (10-20) 12 | const tiltX = (Math.random() - 0.5) * Math.PI; // Initial X tilt (-PI/2 to PI/2) 13 | const tiltY = (Math.random() - 0.5) * Math.PI; // Initial Y tilt (-PI/2 to PI/2) 14 | const rotationSpeed = (Math.random() * 0.01) + 0.005; // Rotation speed (0.005-0.015 rad/frame) 15 | 16 | // --- 3D Point Generation --- 17 | function generateTorusPoints(R, r, numMajor, numMinor) { 18 | const points = []; 19 | for (let i = 0; i < numMajor; i++) { 20 | const majorAngle = 2 * Math.PI * i / numMajor; 21 | const cosMajor = Math.cos(majorAngle); 22 | const sinMajor = Math.sin(majorAngle); 23 | 24 | for (let j = 0; j < numMinor; j++) { 25 | const minorAngle = 
2 * Math.PI * j / numMinor; 26 | const cosMinor = Math.cos(minorAngle); 27 | const sinMinor = Math.sin(minorAngle); 28 | 29 | const x = (R + r * cosMinor) * cosMajor; 30 | const y = (R + r * cosMinor) * sinMajor; 31 | const z = r * sinMinor; 32 | points.push({ x, y, z, i, j }); // Store segment indices 33 | } 34 | } 35 | return points; 36 | } 37 | 38 | // --- 3D Rotation --- 39 | function rotateX(point, angle) { 40 | const y = point.y; 41 | const z = point.z; 42 | point.y = y * Math.cos(angle) - z * Math.sin(angle); 43 | point.z = y * Math.sin(angle) + z * Math.cos(angle); 44 | } 45 | 46 | function rotateY(point, angle) { 47 | const x = point.x; 48 | const z = point.z; 49 | point.x = x * Math.cos(angle) + z * Math.sin(angle); 50 | point.z = -x * Math.sin(angle) + z * Math.cos(angle); 51 | } 52 | 53 | function rotateZ(point, angle) { 54 | const x = point.x; 55 | const y = point.y; 56 | point.x = x * Math.cos(angle) - y * Math.sin(angle); 57 | point.y = x * Math.sin(angle) + y * Math.cos(angle); 58 | } 59 | 60 | // --- Projection --- 61 | function project(point) { 62 | // Simple orthographic projection 63 | const scale = 1.5; // Adjust scale as needed 64 | return { 65 | x: point.x * scale + width / 2, 66 | y: point.y * scale + height / 2 67 | }; 68 | } 69 | 70 | // --- Drawing --- 71 | const points3D = generateTorusPoints(R, r, numMajor, numMinor); 72 | const paths = []; 73 | 74 | // Create paths for major and minor segments 75 | for (let i = 0; i < numMajor; i++) { 76 | for (let j = 0; j < numMinor; j++) { 77 | const pathMajor = document.createElementNS(svgNS, 'path'); 78 | pathMajor.setAttribute('stroke', 'grey'); 79 | pathMajor.setAttribute('stroke-width', '1'); 80 | pathMajor.setAttribute('fill', 'none'); 81 | svg.appendChild(pathMajor); 82 | paths.push({ type: 'major', i, j, element: pathMajor }); 83 | 84 | const pathMinor = document.createElementNS(svgNS, 'path'); 85 | pathMinor.setAttribute('stroke', 'grey'); 86 | pathMinor.setAttribute('stroke-width', '1'); 87 | pathMinor.setAttribute('fill', 'none'); 88 | svg.appendChild(pathMinor); 89 | paths.push({ type: 'minor', i, j, element: pathMinor }); 90 | } 91 | } 92 | 93 | 94 | let angle = 0; 95 | 96 | function animate() { 97 | // Clone points for rotation to avoid modifying originals permanently per frame 98 | const rotatedPoints = points3D.map(p => ({ ...p })); 99 | 100 | // Apply initial tilt 101 | rotatedPoints.forEach(p => { 102 | rotateX(p, tiltX); 103 | rotateY(p, tiltY); 104 | }); 105 | 106 | // Apply animation rotation (around Y axis for this example) 107 | rotatedPoints.forEach(p => rotateY(p, angle)); 108 | 109 | // Project points 110 | const projectedPoints = rotatedPoints.map(project); 111 | 112 | // Update SVG paths 113 | paths.forEach(pathInfo => { 114 | const { type, i, j, element } = pathInfo; 115 | const currentPointIndex = i * numMinor + j; 116 | 117 | let nextPointIndex; 118 | if (type === 'major') { 119 | // Connect point to the next point in the same major ring 120 | nextPointIndex = i * numMinor + (j + 1) % numMinor; 121 | } else { // type === 'minor' 122 | // Connect point to the corresponding point in the next major ring 123 | nextPointIndex = ((i + 1) % numMajor) * numMinor + j; 124 | } 125 | 126 | const p1 = projectedPoints[currentPointIndex]; 127 | const p2 = projectedPoints[nextPointIndex]; 128 | 129 | element.setAttribute('d', `M ${p1.x} ${p1.y} L ${p2.x} ${p2.y}`); 130 | }); 131 | 132 | 133 | angle += rotationSpeed; 134 | requestAnimationFrame(animate); 135 | } 136 | 137 | // Handle window resize 138 | 
window.addEventListener('resize', () => { 139 | width = window.innerWidth; 140 | height = window.innerHeight; 141 | svg.setAttribute('viewBox', `0 0 ${width} ${height}`); 142 | }); 143 | 144 | // Initial setup for viewbox 145 | svg.setAttribute('viewBox', `0 0 ${width} ${height}`); 146 | 147 | // Apply initial tilt to base points ONCE 148 | points3D.forEach(p => { 149 | rotateX(p, tiltX); 150 | rotateY(p, tiltY); 151 | }); 152 | 153 | 154 | animate(); -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt2_torus/google-Gemini-2.5-Pro_06042025_c/style.css: -------------------------------------------------------------------------------- 1 | body { 2 | margin: 0; 3 | overflow: hidden; 4 | background-color: black; 5 | } 6 | 7 | svg { 8 | display: block; 9 | width: 100vw; 10 | height: 100vh; 11 | } -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt2_torus/metadata.yaml: -------------------------------------------------------------------------------- 1 | prompt: create an svg animation of a wire frame torus with the wire mesh . Colors are grey lines on black background. Try to write this using as little code as possible. Randomize the density of the wire mesh. Randomize the angle of the torus with respect to the horizontal and vertical. Also randomize the Major Radius and Minor Radius such that the hole in the middle changes. Choose an angle and then animate a rotation of the torus. Do not use any libraries only vanila js is allowed. I will look at the result only in chrome v8. 2 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt2_torus/openAI-o1_06042025_a/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt2_torus/openAI-o1_06042025_a/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt2_torus/openAI-o1_06042025_a/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Wireframe Torus 6 | 10 | 11 | 12 | 13 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt3_Sierpinski triangle/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt3_Sierpinski triangle/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt3_Sierpinski triangle/anthropic-claude-3.7-sonnet_06042025_a/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt3_Sierpinski triangle/anthropic-claude-3.7-sonnet_06042025_a/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt3_Sierpinski triangle/anthropic-claude-3.7-sonnet_06042025_a/index.html: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Sierpiński Triangle Fractal Zoom 7 | 18 | 19 | 20 | 21 | 22 | 106 | 107 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt3_Sierpinski triangle/google-Gemini-2.5-Pro_06042025_c/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt3_Sierpinski triangle/google-Gemini-2.5-Pro_06042025_c/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt3_Sierpinski triangle/google-Gemini-2.5-Pro_06042025_c/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Sierpinski Zoom 5 | 9 | 10 | 11 | 12 | 13 | 127 | 128 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt3_Sierpinski triangle/metadata.yaml: -------------------------------------------------------------------------------- 1 | prompt: Procedurally generate Sierpiński triangle / Sierpiński gasket / Sierpiński sieve fractal as the camera zooms into it. Zoom into the edge of the fractal. Zoom in kinda fast. Use SVG only. Do not use any libraries only vanila js is allowed. I will look at the result only in chrome v8. Put everything into one index.html file. 2 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt3_Sierpinski triangle/openAI-o1_06042025_a/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt3_Sierpinski triangle/openAI-o1_06042025_a/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt3_Sierpinski triangle/openAI-o1_06042025_a/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Sierpinski Triangle Zoom 6 | 24 | 25 | 26 |
27 | 28 |
29 | 30 | 76 | 77 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt4_Jerusalem_cube/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt4_Jerusalem_cube/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt4_Jerusalem_cube/anthropic-claude-3.7-sonnet_06042025_a/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt4_Jerusalem_cube/anthropic-claude-3.7-sonnet_06042025_a/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt4_Jerusalem_cube/google-Gemini-2.5-Pro_06042025_c/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt4_Jerusalem_cube/google-Gemini-2.5-Pro_06042025_c/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt4_Jerusalem_cube/metadata.yaml: -------------------------------------------------------------------------------- 1 | prompt: Procedurally generate Jerusalem cube fractal in 3d. Use SVG mesh only. Rotate the cub around a random angle. Do not use any libraries only vanila js is allowed. I will look at the result only in chrome v8. Put everything into one index.html file. 
2 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt4_Jerusalem_cube/openAI-o1_06042025_a/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt4_Jerusalem_cube/openAI-o1_06042025_a/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt5_Menger_sponge/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt5_Menger_sponge/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt5_Menger_sponge/anthropic-claude-3.7-sonnet_06042025_a/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt5_Menger_sponge/anthropic-claude-3.7-sonnet_06042025_a/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt5_Menger_sponge/metadata.yaml: -------------------------------------------------------------------------------- 1 | prompt: Procedurally generate a 3d fractal call ed the Menger sponge (also known as the Menger cube, Menger universal curve, Sierpinski cube, or Sierpinski sponge . . Use only vanila js with webgl. Change the colors of the faces as you go deeper into the fractal. This fractal mathematically has a Hausdorff dimension of {\displaystyle \log _{3}8} . Think about how to do this, list yourself several options its not that easy. the Menger sponge has an infinite surface area and zero volume so you need to generate it procedurally and potentially set an arbitrary cutoff limit on how deep the fractal will go . I will look at the result only in chrome v8. Put everything into one index.html file. 2 | -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/prompt5_Menger_sponge/openAI-o1_06042025_a/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peerbench/peerBench/e25beafd73d7b0fba2fb6c0ec2a5c7438fd5bb15/sample_data/demoscene-js_cgi_benchmark/prompt5_Menger_sponge/openAI-o1_06042025_a/.DS_Store -------------------------------------------------------------------------------- /sample_data/demoscene-js_cgi_benchmark/scores_metadata_eth_signature.txt: -------------------------------------------------------------------------------- 1 | 0x10a6b15012feee6ce7b181ce609712c02700585b19899acf8bf1954b75ba03ff566dd36f93ff272a5062b6b3aad2ea490eb8a14e5ae7601c99f7713a96a1b68c1b -------------------------------------------------------------------------------- /sample_data/example_evaluation_result_publication.yaml: -------------------------------------------------------------------------------- 1 | validator: Did:ridValidator:0x6FCB287c943C1119c550F350B07e7B4c9f02677C:mit.edu # did:ridValidator:: Each domain can have multiple valida keys. The did method has additional logic to check if a certain public key is valid for a given domain. 
Domains are verified by DKIM email signature or DNS TXT records. 2 | evaluation: did:rideval:MTRobustBLUE:dasdasdasdasdasdasdasdasdasda:github.com/robust/MTbench/spec 3 | results_per_provider: 4 | - provider: Did:ridProvider:Grok3:0x6FCB287c943C1119c550F350B07e7B4c9f026001:api.x.ai # did:ridProvider:: Each domain can have multiple valid keys. The did method has additional logic to check if a certain public key is valid for a given domain. Domains are verified by DKIM email signature or DNS TXT records. 5 | score: 0.8 # One overall normalized response quality score that must be present in all evaluations. Larger score is better. In float form it ranges from 0.0 to 1.0. It may be a simple average of all the metrics or a more complex combination of them. 6 | metrics: 7 | - metric_name: average_BLUE 8 | metric_type: float 9 | metric_value: 81.1 10 | - metric_name: ROC 11 | metric_type: float 12 | metric_value: 81.1 13 | - metric_name: GRE 14 | metric_type: float 15 | metric_value: 81.1 16 | - metric_name: averageRoundResponseTime 17 | metric_type: int_ms 18 | metric_value: 302 19 | - provider: Did:ridProvider:openAi/o3-mini:0x6FCB287c943C1119c550F350B07e7B4c9f026456:openai.org/api 20 | score: 0.95 21 | metrics: 22 | - metric_name: average_BLUE 23 | metric_type: float 24 | metric_value: 95.1 25 | - metric_name: ROC 26 | metric_type: float 27 | metric_value: 95.1 28 | - metric_name: GRE 29 | metric_type: float 30 | metric_value: 92.1 31 | - metric_name: averageRoundResponseTime 32 | metric_type: int_ms 33 | metric_value: 202 34 | - provider: Did:ridProvider:0x6FCB287c943C1119c550F350B07e7B4c9f026412:anthropic.com/api 35 | model_used: anthropic/claude-3-7-sonnet-20250219 36 | score: 0.91 37 | metrics: 38 | - metric_name: average_BLUE 39 | metric_type: float 40 | metric_value: 91.1 41 | - metric_name: ROC 42 | metric_type: float 43 | metric_value: 91.1 44 | - metric_name: GRE 45 | metric_type: float 46 | metric_value: 91.1 47 | - metric_name: averageRoundResponseTime 48 | metric_type: int_ms 49 | metric_value: 402 50 | - provider: Did:ridProvider:openrouter/anthropic/claude-3-7-sonnet-20250219::openrouter.com/api 51 | score: 0.91 52 | metrics: 53 | - metric_name: average_BLUE 54 | metric_type: float 55 | metric_value: 91.1 56 | - metric_name: ROC 57 | metric_type: float 58 | metric_value: 91.1 59 | - metric_name: GRE 60 | metric_type: float 61 | metric_value: 91.1 62 | - metric_name: averageRoundResponseTime 63 | metric_type: int_ms 64 | metric_value: 402 65 | timestamp_start_eval: 1742145905 # When the first prompt was sent to the first provider 66 | timestamp_end_eval: 1742149905 # When the last prompt was sent to the last provider 67 | number_of_prompts_behind_score: 100 68 | private_test_data: false 69 | public_test_data: true 70 | test_data_publicized_within: 6months 71 | signature_of_above: 0xyu898uhgyhbjnkoyghjikoi9u8yghbjnkoiuyghjnkmougyhbjnkiyghbjnkoiytghji9u8yghb0xyu898uhgyhbjnkoyghjikoi9u8yghbjnkoiuyghjnkmougyhbjnkiyghbjnkoiytghji9u8yghb 72 | -------------------------------------------------------------------------------- /sample_data/example_evaluation_results_video.yaml: -------------------------------------------------------------------------------- 1 | dataPoint: 2 | Validator: "Did:ridValidator:0x7A2B287c943C1119b650F350B07e7A4c9f02677C:university.edu" 3 | Provider: "Did:ridprovider:VideoAI:0x8F7C432b943C1119c550F350B07e7B4c9f02677C:api.videoai.com" 4 | Evaluation: "did:rideval:ImageCatDetectionV1:abcd1234efgh5678ijkl9012mnopqrst:github.com/engineer/AIbench" 5 | evaluations: 6 | -
prompt_timestamp: "1678953600" # epoch timestamp when the video was uploaded for analysis 7 | response_timestamp: "1678954200" # epoch timestamp when the result was returned 8 | prompt: 9 | input_video_cid: "xyzvideo12345cid" # CID for the input video file (can be video of a cat or not) 10 | input_video_url: "https://example.com/path_to_video.mp4" # URL to the video for remote input 11 | response: 12 | contains_cat: "true" # Response indicating whether a cat was detected in the video (true/false) 13 | detection_confidence: 0.98 # Confidence score for the cat detection 14 | processing_time: 2.5 # Time taken for processing the video (in seconds) 15 | frame_count: 300 # Total number of frames in the video 16 | detected_objects_cid: "abc123detected_objects_cid" # CID pointing to the file containing detected objects data 17 | evals: 18 | contains_cat_correctness: 1 # 0 or 1 indicating whether the evaluation result matches the truth 19 | detection_accuracy: 0.95 # Model's accuracy in detecting objects in the video 20 | processing_accuracy: 0.97 # Accuracy in processing the video correctly 21 | validator_signature: "0x2F1e83bd09ad72bc9a81f1a09d2c9d88e39a871a" 22 | -------------------------------------------------------------------------------- /sample_data/example_evaluation_test_stream_publication.yaml: -------------------------------------------------------------------------------- 1 | shortname: genLLMchatArena # Author should put effort into ensuring this shortname is unique. It should be short, as it will be typed and filtered on. 2 | title: LLM question answering with blind A vs B comparison to get an ELO score 3 | specification_mutable_url: https://github.com/lm-sys/FastChat/blob/main/docs/arena.md # contents behind this url can evolve over time 4 | specification_current_cid: 0x6ACB287c943C229c550F350B07e7B4c9f0212333 5 | specification_all_cid_versions: 6 | - 0x6FCB287c943C1119c550F350B07e7B4c9f026001 7 | - 0x6FCB287c943C1119c550F350B07e7B4c9f026000 8 | - 0x6ACB287c943C229c550F350B07e7B4c9f0212333 9 | paper_pdf: https://arxiv.org/pdf/2403.04132.pdf # optional 10 | paper_doi: 10.48550/arXiv.2403.04132 # Optionally the full url https://doi.org/10.48550/arXiv.2403.04132 can be given. DOIs are preferred because they are permanent and guaranteed to be unique. 11 | paper_BibTex: | # optional 12 | @misc{chiang2024chatbotarenaopenplatform, 13 | title={Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference}, 14 | author={Wei-Lin Chiang and Lianmin Zheng and Ying Sheng and Anastasios Nikolas Angelopoulos and Tianle Li and Dacheng Li and Hao Zhang and Banghua Zhu and Michael Jordan and Joseph E.
Gonzalez and Ion Stoica}, 15 | year={2024}, 16 | eprint={2403.04132}, 17 | archivePrefix={arXiv}, 18 | primaryClass={cs.AI}, 19 | url={https://arxiv.org/abs/2403.04132}, 20 | } 21 | input_dimensions: 22 | - input_name: prompt_text 23 | input_type: string 24 | evaluation_dimensions: 25 | - eval_name: BLUE 26 | eval_type: float 27 | - eval_name: ResponseTime 28 | eval_type: int 29 | aggregate_metrics: 30 | - metric_name: average_BLUE 31 | metric_type: float 32 | metric_higher_is_better: true 33 | - metric_name: averageRoundResponseTime 34 | metric_type: int_ms 35 | metric_higher_is_better: false 36 | max_response_time_ms: 10000 # optional 37 | minimum_sample_size: 100 # optional 38 | test_data_publication_required_within: 6months # optional 39 | -------------------------------------------------------------------------------- /sample_data/example_provider_registration_publication.yaml: -------------------------------------------------------------------------------- 1 | alsoKnownAs: [ ed://mt.lesan.ai, ed://mt.lesan.org , ed://mt.lesan ] # priority-sorted shortnames 2 | domain_of_api: lesan.ai/api 3 | domain_of_admin_email: admin@lesan.ai 4 | billing_type: api_call_usage_billing_monthly_minimum 5 | purchasingmethods: credit_card, polygon_usdc, polygon_matic 6 | target_evals: 7 | - did:rideval:MTRobustBLUE:dasdasdasdasdasdasdasdasdasda:github.com/robust/MTbench/spec 8 | - did:rideval:mtBlueHard:ddasdasdasd:github.com/abcde 9 | cid_of_dot_eml_formatted_dkim_signed_email_listing_the_evaluations_they_want_to_be_tested_for 10 | 11 | 12 | # or 13 | 14 | 15 | shortname: [ ed://mt.lesan.ai, ed://mt.lesan.org , ed://mt.lesan ] 16 | domain_of_api: lesan.ai/api 17 | domain_of_admin_email: admin@lesan.ai 18 | billing_type: api_call_usage_billing_monthly_minimum 19 | purchasingmethods: credit_card, polygon_usdc, polygon_matic 20 | target_evals: 21 | - cid: 0x6ACB287c943C229c550F350B07e7B4c9f0212333 22 | url: https://github.com/lm-sys/FastChat/blob/main/docs/arena.md 23 | paper_doi: 10.48550/arXiv.2403.04132 24 | - did:rideval:mtBlueHard:ddasdasdasd:github.com/abcde 25 | cid_of_dot_eml_formatted_dkim_signed_email_listing_the_evaluations_they_want_to_be_tested_for 26 | 27 | 28 | -------------------------------------------------------------------------------- /sample_data/turbine_aerodynamics_example_response_audit_.yaml: -------------------------------------------------------------------------------- 1 | dataPoint: 2 | Validator: "Did:ridValidator:0x9A4B287c943C1119c550F350B07e7B4c9f02677C:mit.edu" 3 | Provider: "Did:ridprovider:WindTurbineCorp:0x5F7C432b943C1119c550F350B07e7B4c9f02677C:api.windturbine.com" 4 | Evaluation: "did:rideval:DesignEvalV1:abcd1234efgh5678ijkl9012mnopqrst:github.com/engineer/DesignBench" 5 | evaluations: 6 | - prompt_timestamp: "1678953600" # epoch timestamp for the time the simulation was run 7 | response_timestamp: "1678954200" # epoch timestamp for the time the result was returned 8 | prompt: 9 | turbine_mesh_cid: ghjklkjhghjklkjhghjklcid 10 | load_param: hjkojhbnjkoihbnjkcid # this is also a CID; CIDs can be detected from their first bytes, so the eval designer can choose whether or not to include "cid" in the field name 11 | Response: 12 | failure_prob_over50p: "true" # responses & prompts can be a CID to a file, a string of fewer than 1000 characters, a number, or an array of numbers [123,1,2,3] 13 | stresspoints: hjkojhbnjkoihbnjkcid 14 | evals: 15 | failure_prob_over50p_correctness: 1 # 0 or 1 16 | stresspoints_physically_plausible: 0.1 # 0.0-1.0: how physically plausible the predicted stress points are 17 | stresspoints_physically_correctness:
0.9 # 0.0-1.0: how physically correct the predicted stress points are 18 | - prompt_timestamp: "1678953610" # epoch timestamp for the time the simulation was run 19 | response_timestamp: "1678954200" # epoch timestamp for the time the result was returned 20 | prompt: 21 | turbine_mesh_cid: ghjklkjhghjklkjhghjklcid 22 | load_param: aaaajhbnjkoihbnjkcid # this is also a CID; CIDs can be detected from their first bytes, so the eval designer can choose whether or not to include "cid" in the field name 23 | Response: 24 | failure_prob_over50p: "true" # responses & prompts can be a CID to a file, a string of fewer than 1000 characters, a number, or an array of numbers [123,1,2,3] 25 | stresspoints: hjkojhbnjkoihbnjkcid 26 | evals: 27 | failure_prob_over50p_correctness: 1 # 0 or 1 28 | stresspoints_physically_plausible: 0.1 # 0.0-1.0: how physically plausible the predicted stress points are 29 | stresspoints_physically_correctness: 0.9 # 0.0-1.0: how physically correct the predicted stress points are 30 | validator_signature: "0x2F1e83bd09ad72bc9a81f1a09d2c9d88e39a871a" 31 | --------------------------------------------------------------------------------
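The example publication files above (example_evaluation_result_publication.yaml and the per-datapoint audits) spell out the fields a validator signs: one normalized 0.0-1.0 score per provider, the raw metrics behind it, and the evaluation time window. As a rough illustration of how a downstream consumer might sanity-check such a file before trusting it, the following is a minimal Python sketch; it assumes PyYAML is available, and the function and constant names are illustrative only, not part of the peerBench tooling.

# Minimal sanity-check sketch for an evaluation-result publication such as
# sample_data/example_evaluation_result_publication.yaml.
# Assumes PyYAML; field names follow the example above, and the helper
# names here are hypothetical, not part of peerBench.
import sys
import yaml

REQUIRED_PROVIDER_FIELDS = {"provider", "score", "metrics"}

def check_publication(path: str) -> list[str]:
    """Return a list of human-readable problems found in the publication."""
    with open(path, "r", encoding="utf-8") as f:
        doc = yaml.safe_load(f)

    problems = []
    for entry in doc.get("results_per_provider", []):
        missing = REQUIRED_PROVIDER_FIELDS - entry.keys()
        if missing:
            problems.append(f"{entry.get('provider', '<unknown>')}: missing {sorted(missing)}")
            continue
        score = float(entry["score"])
        if not 0.0 <= score <= 1.0:
            problems.append(f"{entry['provider']}: score {score} outside [0.0, 1.0]")
        for metric in entry["metrics"]:
            if "metric_name" not in metric or "metric_value" not in metric:
                problems.append(f"{entry['provider']}: malformed metric {metric}")

    start = doc.get("timestamp_start_eval")
    end = doc.get("timestamp_end_eval")
    if start is not None and end is not None and end < start:
        problems.append("timestamp_end_eval is earlier than timestamp_start_eval")
    return problems

if __name__ == "__main__":
    for problem in check_publication(sys.argv[1]):
        print("WARN:", problem)

The same idea extends to the per-datapoint audit files (example_evaluation_results_video.yaml, turbine_aerodynamics_example_response_audit_.yaml): each value under evals is expected to be a 0/1 flag or a 0.0-1.0 quality score, so a consumer can apply the identical range check per datapoint before aggregating.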