├── CODEOWNERS
├── taxeval
│   ├── requirements.txt
│   ├── README.md
│   └── taxes.ipynb
├── open_llm_leaderboard
│   ├── requirements.txt
│   ├── README.md
│   ├── gsm8k.ipynb
│   ├── winogrande.ipynb
│   ├── drop.ipynb
│   ├── truthfulqa.ipynb
│   ├── hellaswag.ipynb
│   ├── arc.ipynb
│   └── mmlu.ipynb
├── transcription
│   ├── requirements.txt
│   ├── README.md
│   └── transcription.ipynb
├── eleuther_harness
│   └── README.md
├── .github
│   └── pull_request_template.md
├── CONTRIBUTING.md
├── LICENSE
├── .gitignore
├── alpaca-data
│   ├── README.md
│   └── alpaca.ipynb
└── README.md

--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @cabreraalex @neubig @Sparkier
2 | 
--------------------------------------------------------------------------------
/taxeval/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas
2 | zeno-client
3 | python-dotenv
4 | bertopic
--------------------------------------------------------------------------------
/open_llm_leaderboard/requirements.txt:
--------------------------------------------------------------------------------
1 | datasets
2 | numpy
3 | pandas
4 | zeno-client
5 | python-dotenv
--------------------------------------------------------------------------------
/transcription/requirements.txt:
--------------------------------------------------------------------------------
1 | jiwer
2 | pandas
3 | openai-whisper
4 | zeno-client
5 | python-dotenv
6 | torch
7 | transformers
8 | tqdm
--------------------------------------------------------------------------------
/eleuther_harness/README.md:
--------------------------------------------------------------------------------
1 | # EleutherAI Harness
2 | 
3 | Use Zeno to visualize the data from the [Eleuther LM Evaluation Harness][1]!
4 | You can find all the details about how to upload harness results to Zeno in the
5 | [lm-evaluation-harness repo][2].
6 | 
7 | [1]: https://github.com/EleutherAI/lm-evaluation-harness
8 | [2]: https://github.com/EleutherAI/lm-evaluation-harness#visualizing-results
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | # Description
4 | 
5 | 
9 | 
10 | # References
11 | 
12 | 
13 | 
14 | - Foo
15 | - Bar
16 | - Baz
17 | 
18 | # Blocked by
19 | 
20 | 
21 | 
22 | - NA
23 | - (or link to PRs)
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to zeno-build
2 | 
3 | If you're reading this, you're probably interested in contributing to
4 | `zeno-build`. Thank you for your interest!
5 | 
6 | We always welcome **new examples**, which you can create by adding a new
7 | sub-directory that implements your example (see the layout sketch at the
8 | end of this guide).
9 | 
10 | ## Contribution Guide
11 | 
12 | If you want to make a contribution, you can:
13 | 
14 | 1. Browse existing issues and select one to work on.
15 | 2. Create a new issue to discuss a feature that you might want to contribute.
16 | 3. Send a PR directly.
17 | 
18 | We'd recommend reaching out to us first so we can help out any way we can,
19 | but if you're confident in your contribution, you can go ahead and send a PR
20 | directly.
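Each existing example follows the same layout, so a new one can simply mirror
it. A sketch of the convention (the `my_example` name is hypothetical):

```
my_example/
├── README.md          # what the example shows and how to run it
├── requirements.txt   # pip dependencies for the notebook
└── my_example.ipynb   # notebook that uploads data and model outputs to Zeno
```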
21 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2023 Zeno
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/open_llm_leaderboard/README.md:
--------------------------------------------------------------------------------
1 | # Open LLM Leaderboard
2 | 
3 | Use Zeno to visualize the data and model outputs of the [Open LLM Leaderboard][1]!
4 | There is a notebook for uploading the raw task data and model results for each
5 | of the leaderboard tasks.
6 | The notebooks re-use the outputs from the leaderboard, so you don't have to
7 | run any inference to explore the results.
8 | 
9 | You can pick which model results to upload by passing the org/model strings
10 | from the
11 | [Leaderboard Details](https://huggingface.co/datasets/open-llm-leaderboard/details/tree/main)
12 | dataset
13 | into the notebooks.
14 | 
15 | > Explore our
16 | > [example report](https://hub.zenoml.com/report/a13x/What%20does%20the%20OpenLLM%20Leaderboard%20measure%3F)
17 | > to get an idea of what the resulting data will look like.
18 | 
19 | ## Setup
20 | 
21 | To run this example, first install the requirements:
22 | 
23 | ```bash
24 | pip install -r requirements.txt
25 | ```
26 | 
27 | You'll then need to get an API key from Zeno Hub.
28 | Create an account at [https://hub.zenoml.com](https://hub.zenoml.com) and navigate
29 | to [your account page](https://hub.zenoml.com/account) to get an API key.
30 | Add this key as an environment variable, `ZENO_API_KEY`.
31 | 
32 | You can now run the notebooks to create a Zeno Project for any of the
33 | leaderboard tasks.
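For reference, every notebook in this directory starts from the same
boilerplate. A minimal sketch of that shared setup, assuming a local `.env`
file that contains your `ZENO_API_KEY` (the model list here is illustrative):

```python
import os

import dotenv
from zeno_client import ZenoClient

# Read ZENO_API_KEY from a local .env file (or the shell environment).
dotenv.load_dotenv(override=True)
client = ZenoClient(os.environ["ZENO_API_KEY"])

# "org__model" strings taken from the Leaderboard Details dataset;
# swap in whichever models you want to compare.
models = ["mistralai__Mistral-7B-v0.1", "tiiuae__falcon-180B"]
```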
34 | 35 | [1]: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # IDE stuff 132 | .vscode/ 133 | *.swp 134 | 135 | # Apple stuff 136 | .DS_Store 137 | 138 | # Experimental results 139 | results*/ 140 | .zeno_cache/ 141 | zeno_cache/ 142 | -------------------------------------------------------------------------------- /alpaca-data/README.md: -------------------------------------------------------------------------------- 1 | # Alpaca Instruction-Following Data 2 | 3 | [![Open with Zeno](https://img.shields.io/badge/%20-Open_with_Zeno-612593.svg?labelColor=white&logo=data:image/svg%2bxml;base64,PHN2ZyB3aWR0aD0iMzMiIGhlaWdodD0iMzMiIHZpZXdCb3g9IjAgMCAzMyAzMyIgZmlsbD0ibm9uZSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KPHBhdGggZD0iTTMyIDE1Ljc4NDJMMTYuNDg2MiAxNS43ODQyTDE2LjQ4NjIgMC4yNzA0MDFMMjQuMzAyIDguMDg2MTdMMzIgMTUuNzg0MloiIGZpbGw9IiM2MTI1OTMiLz4KPHBhdGggZD0iTTE1Ljc5MTcgMTUuODMxMUw4LjAzNDc5IDguMDc0MjJMMTUuNzkxNyAwLjMxNzMyOEwxNS43OTE3IDE1LjgzMTFaIiBmaWxsPSIjNjEyNTkzIiBmaWxsLW9wYWNpdHk9IjAuOCIvPgo8cGF0aCBkPSJNMTQuODY1NSAxNS44MzExTDcuNTk0ODUgMTUuODMxMUw3LjU5NDg1IDguNTYwNDJMMTQuODY1NSAxNS44MzExWiIgZmlsbD0iIzYxMjU5MyIgZmlsbC1vcGFjaXR5PSIwLjYiLz4KPHBhdGggZD0iTTYuMTEyOSAxNS44MzExTDMuMjQxNyAxNS44MzExTDMuMjQxNyAxMi44NjcyTDYuMTEyOSAxNS44MzExWiIgZmlsbD0iIzZBMUI5QSIgZmlsbC1vcGFjaXR5PSIwLjQiLz4KPHBhdGggZD0iTTIuNzMyMjggMTUuODMxTDEuNTE1NSAxNC42MTQzTDIuNzQyNzEgMTMuMzg3TDIuNzMyMjggMTUuODMxWiIgZmlsbD0iIzZBMUI5QSIgZmlsbC1vcGFjaXR5PSIwLjMiLz4KPHBhdGggZD0iTTIuMDM3NiAxNS43ODQyTDEuMTU3NzEgMTUuNzg0MkwxLjE1NzcxIDE0Ljk1MDZMMi4wMzc2IDE1Ljc4NDJaIiBmaWxsPSIjNkExQjlBIiBmaWxsLW9wYWNpdHk9IjAuMiIvPgo8cGF0aCBkPSJNMC44MzM1NjggMTUuNzg0MUwwLjUwOTM5OSAxNS40NkwwLjgzMzU2NyAxNS4xMzU4TDAuODMzNTY4IDE1Ljc4NDFaIiBmaWxsPSIjNjEyNTkzIiBmaWxsLW9wYWNpdHk9IjAuMSIvPgo8cGF0aCBkPSJNMC4xMDYxODcgMTUuNzk0NEwwLjMwMTAyNSAxNS41OTk2TDAuNDk1ODYzIDE1Ljc5NDRIMC4xMDYxODdaIiBmaWxsPSIjNjEyNTkzIiBmaWxsLW9wYWNpdHk9IjAuMSIvPgo8cGF0aCBkPSJNNi45NTIxMyAxNS44MjQ4TDMuNjQwOTkgMTIuNTEzN0w2Ljk2OTYzIDkuMTg1MDNMNi45NTIxMyAxNS44MjQ4WiIgZmlsbD0iIzYxMjU5MyIgZmlsbC1vcGFjaXR5PSIwLjUiLz4KPHBhdGggZD0iTTAuMjk0MjM1IDE2LjQ3OTVMMTUuODA4IDE2LjQ3OTVMMTUuODA4IDMxLjk5MzNMNy45OTIyMyAyNC4xNzc1TDAuMjk0MjM1IDE2LjQ3OTVaIiBmaWxsPSIjNjEyNTkzIi8+CjxwYXRoIGQ9Ik0xNi40OTU2IDE3LjI0MzZMMjMuODUwNyAyNC41ODVMMTYuNDk1NiAzMS45NEwxNi40OTU2IDE3LjI0MzZaIiBmaWxsPSIjNjEyNTkzIiBmaWxsLW9wYWNpdHk9IjAuOCIvPgo8cGF0aCBkPSJNMTYuNTMyNiAxNi40Nzk1TDI0LjQ1MTUgMTYuNDc5NUwyNC40NTE1IDI0LjAyOEwxNi41MzI2IDE2LjQ3OTVaIiBmaWxsPSIjNjEyNTkzIiBmaWxsLW9wYWNpdHk9IjAuNiIvPgo8cGF0aCBkPSJNMjYuMTgxMyAxNi40MzI2TDI5LjA1MjUgMTYuNDMyNkwyOS4wNTI1IDE5LjM5NjRMMjYuMTgxMyAxNi40MzI2WiIgZmlsbD0iIzZBMUI5QSIgZmlsbC1vcGFjaXR5PSIwLjQiLz4KPHBhdGggZD0iTTI5LjU2MTkgMTYuNDMyNkwzMC43Nzg3IDE3LjY0OTRMMjkuNTUxNSAxOC44NzY2TDI5LjU2MTkgMTYuNDMyNloiIGZpbGw9IiM2QTFCOUEiIGZpbGwtb3BhY2l0eT0iMC4zIi8+CjxwYXRoIGQ9Ik0zMC4yNTY2IDE2LjQ3OTVMMzEuMTM2NSAxNi40Nzk1TDMxLjEzNjUgMTcuMzEzMUwzMC4yNTY2IDE2LjQ3OTVaIiBmaWxsPSIjNkExQjlBIiBmaWxsLW9wYWNpdHk9IjAuMiIvPgo8cGF0aCBkPSJNMzEuNDYwNiAxNi40Nzk1TDMxLjc4NDggMTYuODAzN0wzMS40NjA2IDE3LjEyNzlMMzEuNDYwNiAxNi4
0Nzk1WiIgZmlsbD0iIzYxMjU5MyIgZmlsbC1vcGFjaXR5PSIwLjEiLz4KPHBhdGggZD0iTTMyLjE4OCAxNi40NjkyTDMxLjk5MzIgMTYuNjY0MUwzMS43OTgzIDE2LjQ2OTJIMzIuMTg4WiIgZmlsbD0iIzYxMjU5MyIgZmlsbC1vcGFjaXR5PSIwLjEiLz4KPHBhdGggZD0iTTI1LjM0MjEgMTYuNDM4OUwyOC42NTMyIDE5Ljc1TDI1LjMyNDYgMjMuMDc4NkwyNS4zNDIxIDE2LjQzODlaIiBmaWxsPSIjNjEyNTkzIiBmaWxsLW9wYWNpdHk9IjAuNSIvPgo8L3N2Zz4K)](https://hub.zenoml.com/project/f192ed0b-c880-40cf-9d07-43f9d4cf176c/Alpaca%20Dataset)
4 | 
5 | Explore the [52k instruction-following dataset](https://github.com/tatsu-lab/stanford_alpaca/tree/main?tab=readme-ov-file#data-release) generated to train the Alpaca model.
6 | 
--------------------------------------------------------------------------------
/alpaca-data/alpaca.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": null,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "from zeno_client import ZenoClient\n",
10 |     "import pandas as pd\n",
11 |     "import os\n",
12 |     "import dotenv\n",
13 |     "\n",
14 |     "dotenv.load_dotenv(override=True)"
15 |    ]
16 |   },
17 |   {
18 |    "cell_type": "code",
19 |    "execution_count": null,
20 |    "metadata": {},
21 |    "outputs": [],
22 |    "source": [
23 |     "client = ZenoClient(os.environ[\"ZENO_API_KEY\"])"
24 |    ]
25 |   },
26 |   {
27 |    "cell_type": "code",
28 |    "execution_count": null,
29 |    "metadata": {},
30 |    "outputs": [],
31 |    "source": [
32 |     "df = pd.read_json(\"https://github.com/tatsu-lab/stanford_alpaca/raw/main/alpaca_data.json\")"
33 |    ]
34 |   },
35 |   {
36 |    "cell_type": "code",
37 |    "execution_count": null,
38 |    "metadata": {},
39 |    "outputs": [],
40 |    "source": [
41 |     "df[\"id\"] = df.index\n",
42 |     "df[\"data\"] = df.apply(lambda x: {\"instruction\": x['instruction'], \"input\": x['input']}, axis=1)\n",
43 |     "df[\"type\"] = df[\"instruction\"].str.split(\" \").str[0]\n",
44 |     "df[\"has input\"] = df[\"input\"] != \"\"\n",
45 |     "df[\"instruction length\"] = df[\"instruction\"].str.len()\n",
46 |     "df[\"input length\"] = df[\"input\"].str.len()\n",
47 |     "df[\"output length\"] = df[\"output\"].str.len()"
48 |    ]
49 |   },
50 |   {
51 |    "cell_type": "code",
52 |    "execution_count": null,
53 |    "metadata": {},
54 |    "outputs": [],
55 |    "source": [
56 |     "df[\"type\"].value_counts()\n",
57 |     "top_20_types = df[\"type\"].value_counts().nlargest(20).index\n",
58 |     "df.loc[~df[\"type\"].isin(top_20_types), \"type\"] = \"other\""
59 |    ]
60 |   },
61 |   {
62 |    "cell_type": "code",
63 |    "execution_count": null,
64 |    "metadata": {},
65 |    "outputs": [],
66 |    "source": [
67 |     "proj = client.create_project(\n",
68 |     "    name=\"Alpaca Dataset\",\n",
69 |     "    public=True,\n",
70 |     "    view={\n",
71 |     "        \"data\": {\n",
72 |     "            \"type\": \"vstack\",\n",
73 |     "            \"keys\": {\n",
74 |     "                \"instruction\": {\"label\": \"instruction:\", \"type\": \"text\"},\n",
75 |     "                \"input\": {\"label\": \"input:\", \"type\": \"text\"},\n",
76 |     "            },\n",
77 |     "        },\n",
78 |     "        \"label\": {\"type\": \"text\"},\n",
79 |     "        \"output\": {\"type\": \"text\"},\n",
80 |     "    },\n",
81 |     "    description=\"Explore the data that makes up the Alpaca instruction-tuning dataset.\",\n",
82 |     ")"
83 |    ]
84 |   },
85 |   {
86 |    "cell_type": "code",
87 |    "execution_count": null,
88 |    "metadata": {},
89 |    "outputs": [],
90 |    "source": [
91 |     "proj.upload_dataset(df, id_column=\"id\", data_column=\"data\", label_column=\"output\")"
92 |    ]
93 |   },
94 |   {
95 |    "cell_type": "code",
96 |    "execution_count": null,
97 |    "metadata": {},
98 |    "outputs": [],
99 |    "source": []
100 |   }
101 |  ],
102 | 
"metadata": { 103 | "kernelspec": { 104 | "display_name": "zeno-build", 105 | "language": "python", 106 | "name": "python3" 107 | }, 108 | "language_info": { 109 | "codemirror_mode": { 110 | "name": "ipython", 111 | "version": 3 112 | }, 113 | "file_extension": ".py", 114 | "mimetype": "text/x-python", 115 | "name": "python", 116 | "nbconvert_exporter": "python", 117 | "pygments_lexer": "ipython3", 118 | "version": "3.10.13" 119 | } 120 | }, 121 | "nbformat": 4, 122 | "nbformat_minor": 2 123 | } 124 | -------------------------------------------------------------------------------- /taxeval/README.md: -------------------------------------------------------------------------------- 1 | # TaxEval 2 | 3 | [![Open with Zeno](https://img.shields.io/badge/%20-Open_with_Zeno-612593.svg?labelColor=white&logo=data:image/svg%2bxml;base64,PHN2ZyB3aWR0aD0iMzMiIGhlaWdodD0iMzMiIHZpZXdCb3g9IjAgMCAzMyAzMyIgZmlsbD0ibm9uZSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KPHBhdGggZD0iTTMyIDE1Ljc4NDJMMTYuNDg2MiAxNS43ODQyTDE2LjQ4NjIgMC4yNzA0MDFMMjQuMzAyIDguMDg2MTdMMzIgMTUuNzg0MloiIGZpbGw9IiM2MTI1OTMiLz4KPHBhdGggZD0iTTE1Ljc5MTcgMTUuODMxMUw4LjAzNDc5IDguMDc0MjJMMTUuNzkxNyAwLjMxNzMyOEwxNS43OTE3IDE1LjgzMTFaIiBmaWxsPSIjNjEyNTkzIiBmaWxsLW9wYWNpdHk9IjAuOCIvPgo8cGF0aCBkPSJNMTQuODY1NSAxNS44MzExTDcuNTk0ODUgMTUuODMxMUw3LjU5NDg1IDguNTYwNDJMMTQuODY1NSAxNS44MzExWiIgZmlsbD0iIzYxMjU5MyIgZmlsbC1vcGFjaXR5PSIwLjYiLz4KPHBhdGggZD0iTTYuMTEyOSAxNS44MzExTDMuMjQxNyAxNS44MzExTDMuMjQxNyAxMi44NjcyTDYuMTEyOSAxNS44MzExWiIgZmlsbD0iIzZBMUI5QSIgZmlsbC1vcGFjaXR5PSIwLjQiLz4KPHBhdGggZD0iTTIuNzMyMjggMTUuODMxTDEuNTE1NSAxNC42MTQzTDIuNzQyNzEgMTMuMzg3TDIuNzMyMjggMTUuODMxWiIgZmlsbD0iIzZBMUI5QSIgZmlsbC1vcGFjaXR5PSIwLjMiLz4KPHBhdGggZD0iTTIuMDM3NiAxNS43ODQyTDEuMTU3NzEgMTUuNzg0MkwxLjE1NzcxIDE0Ljk1MDZMMi4wMzc2IDE1Ljc4NDJaIiBmaWxsPSIjNkExQjlBIiBmaWxsLW9wYWNpdHk9IjAuMiIvPgo8cGF0aCBkPSJNMC44MzM1NjggMTUuNzg0MUwwLjUwOTM5OSAxNS40NkwwLjgzMzU2NyAxNS4xMzU4TDAuODMzNTY4IDE1Ljc4NDFaIiBmaWxsPSIjNjEyNTkzIiBmaWxsLW9wYWNpdHk9IjAuMSIvPgo8cGF0aCBkPSJNMC4xMDYxODcgMTUuNzk0NEwwLjMwMTAyNSAxNS41OTk2TDAuNDk1ODYzIDE1Ljc5NDRIMC4xMDYxODdaIiBmaWxsPSIjNjEyNTkzIiBmaWxsLW9wYWNpdHk9IjAuMSIvPgo8cGF0aCBkPSJNNi45NTIxMyAxNS44MjQ4TDMuNjQwOTkgMTIuNTEzN0w2Ljk2OTYzIDkuMTg1MDNMNi45NTIxMyAxNS44MjQ4WiIgZmlsbD0iIzYxMjU5MyIgZmlsbC1vcGFjaXR5PSIwLjUiLz4KPHBhdGggZD0iTTAuMjk0MjM1IDE2LjQ3OTVMMTUuODA4IDE2LjQ3OTVMMTUuODA4IDMxLjk5MzNMNy45OTIyMyAyNC4xNzc1TDAuMjk0MjM1IDE2LjQ3OTVaIiBmaWxsPSIjNjEyNTkzIi8+CjxwYXRoIGQ9Ik0xNi40OTU2IDE3LjI0MzZMMjMuODUwNyAyNC41ODVMMTYuNDk1NiAzMS45NEwxNi40OTU2IDE3LjI0MzZaIiBmaWxsPSIjNjEyNTkzIiBmaWxsLW9wYWNpdHk9IjAuOCIvPgo8cGF0aCBkPSJNMTYuNTMyNiAxNi40Nzk1TDI0LjQ1MTUgMTYuNDc5NUwyNC40NTE1IDI0LjAyOEwxNi41MzI2IDE2LjQ3OTVaIiBmaWxsPSIjNjEyNTkzIiBmaWxsLW9wYWNpdHk9IjAuNiIvPgo8cGF0aCBkPSJNMjYuMTgxMyAxNi40MzI2TDI5LjA1MjUgMTYuNDMyNkwyOS4wNTI1IDE5LjM5NjRMMjYuMTgxMyAxNi40MzI2WiIgZmlsbD0iIzZBMUI5QSIgZmlsbC1vcGFjaXR5PSIwLjQiLz4KPHBhdGggZD0iTTI5LjU2MTkgMTYuNDMyNkwzMC43Nzg3IDE3LjY0OTRMMjkuNTUxNSAxOC44NzY2TDI5LjU2MTkgMTYuNDMyNloiIGZpbGw9IiM2QTFCOUEiIGZpbGwtb3BhY2l0eT0iMC4zIi8+CjxwYXRoIGQ9Ik0zMC4yNTY2IDE2LjQ3OTVMMzEuMTM2NSAxNi40Nzk1TDMxLjEzNjUgMTcuMzEzMUwzMC4yNTY2IDE2LjQ3OTVaIiBmaWxsPSIjNkExQjlBIiBmaWxsLW9wYWNpdHk9IjAuMiIvPgo8cGF0aCBkPSJNMzEuNDYwNiAxNi40Nzk1TDMxLjc4NDggMTYuODAzN0wzMS40NjA2IDE3LjEyNzlMMzEuNDYwNiAxNi40Nzk1WiIgZmlsbD0iIzYxMjU5MyIgZmlsbC1vcGFjaXR5PSIwLjEiLz4KPHBhdGggZD0iTTMyLjE4OCAxNi40NjkyTDMxLjk5MzIgMTYuNjY0MUwzMS43OTgzIDE2LjQ2OTJIMzIuMTg4WiIgZmlsbD0iIzYxMjU5MyIgZmlsbC1vcGFjaXR5PSIwLjEiLz4KPHBhdGggZD0iTTI1LjM0MjEgMTYuNDM4OUwyOC42NTMyIDE5Ljc1TDI1LjMyNDYgMjMuMDc4NkwyNS4zNDIxIDE2LjQzODlaIiBmaWxs
PSIjNjEyNTkzIiBmaWxsLW9wYWNpdHk9IjAuNSIvPgo8L3N2Zz4K)](https://hub.zenoml.com/project/e7519c8a-3c07-45ce-90f4-3a5ba4660ab6/LLM%20Taxes%20Benchmark)
4 | 
5 | In this example, we'll upload the output of the taxeval project.
6 | To generate outputs for your own models, go to the [original repo](https://github.com/danielgross/taxeval).
7 | 
8 | The result of running Zeno Build will be an interface where you
9 | can browse and explore the results. See an example below:
10 | 
11 | - [Browsing Interface](https://hub.zenoml.com/project/e7519c8a-3c07-45ce-90f4-3a5ba4660ab6/LLM%20Taxes%20Benchmark)
12 | - [Textual Summary](https://hub.zenoml.com/report/1717/Can%20LLMs%20do%20Your%20Taxes%3F)
13 | 
14 | ## Setup
15 | 
16 | To run this example, you'll need to install the requirements.
17 | 
18 | ```bash
19 | pip install -r requirements.txt
20 | ```
21 | 
22 | ## Run the Example
23 | 
24 | Follow `taxes.ipynb` to run inference and generate a Zeno project.
25 | 
--------------------------------------------------------------------------------
/open_llm_leaderboard/gsm8k.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "# GSM8k Task\n",
8 |     "\n",
9 |     "Diverse grade school math word problems to measure a model's ability to solve multi-step mathematical reasoning problems."
10 |    ]
11 |   },
12 |   {
13 |    "cell_type": "code",
14 |    "execution_count": null,
15 |    "metadata": {},
16 |    "outputs": [],
17 |    "source": [
18 |     "from zeno_client import ZenoClient, ZenoMetric\n",
19 |     "import datasets\n",
20 |     "import os\n",
21 |     "import dotenv\n",
22 |     "\n",
23 |     "dotenv.load_dotenv(override=True)"
24 |    ]
25 |   },
26 |   {
27 |    "cell_type": "code",
28 |    "execution_count": null,
29 |    "metadata": {},
30 |    "outputs": [],
31 |    "source": [
32 |     "API_KEY = os.environ[\"ZENO_API_KEY\"]\n",
33 |     "client = ZenoClient(API_KEY)"
34 |    ]
35 |   },
36 |   {
37 |    "cell_type": "code",
38 |    "execution_count": null,
39 |    "metadata": {},
40 |    "outputs": [],
41 |    "source": [
42 |     "models = [\"teknium__OpenHermes-2.5-Mistral-7B\", \"Weyaxi__OpenHermes-2.5-neural-chat-7b-v3-1-7B\"]"
43 |    ]
44 |   },
45 |   {
46 |    "cell_type": "code",
47 |    "execution_count": null,
48 |    "metadata": {},
49 |    "outputs": [],
50 |    "source": [
51 |     "initial_df = datasets.load_dataset(\"gsm8k\", \"main\")[\"test\"].to_pandas()"
52 |    ]
53 |   },
54 |   {
55 |    "cell_type": "code",
56 |    "execution_count": null,
57 |    "metadata": {},
58 |    "outputs": [],
59 |    "source": [
60 |     "initial_df[\"id\"] = initial_df.index"
61 |    ]
62 |   },
63 |   {
64 |    "cell_type": "code",
65 |    "execution_count": null,
66 |    "metadata": {},
67 |    "outputs": [],
68 |    "source": [
69 |     "proj = client.create_project(\n",
70 |     "    name=\"GSM8K OpenLLM\", \n",
71 |     "    view=\"text-classification\", \n",
72 |     "    description=\"Open LLM GSM8K dataset\",\n",
73 |     "    metrics=[\n",
74 |     "        ZenoMetric(name=\"acc\", type=\"mean\", columns=[\"acc\"]),\n",
75 |     "        ZenoMetric(name=\"avg. 
output length\", type=\"mean\", columns=[\"output length\"])\n", 76 | " ]\n", 77 | ")" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "proj.upload_dataset(initial_df, id_column=\"id\", data_column=\"question\", label_column=\"answer\")" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "for m in models:\n", 96 | " output_df = datasets.load_dataset(\"open-llm-leaderboard/details_\" + m, \"harness_gsm8k_5\")[\"latest\"].to_pandas()\n", 97 | " merged_df = output_df.merge(initial_df, left_on=\"example\", right_on=\"question\")\n", 98 | " merged_df[\"output\"] = merged_df['predictions'].apply(lambda x: x[0])\n", 99 | " merged_df[\"output length\"] = merged_df['output'].apply(lambda x: len(x))\n", 100 | " merged_df[\"acc\"] = merged_df['metrics'].apply(lambda x: x[\"acc\"])\n", 101 | " proj.upload_system(merged_df[[\"id\", \"output\", \"output length\", \"acc\"]], name=m, id_column=\"id\", output_column=\"output\")" 102 | ] 103 | } 104 | ], 105 | "metadata": { 106 | "kernelspec": { 107 | "display_name": "compare", 108 | "language": "python", 109 | "name": "python3" 110 | }, 111 | "language_info": { 112 | "codemirror_mode": { 113 | "name": "ipython", 114 | "version": 3 115 | }, 116 | "file_extension": ".py", 117 | "mimetype": "text/x-python", 118 | "name": "python", 119 | "nbconvert_exporter": "python", 120 | "pygments_lexer": "ipython3", 121 | "version": "3.9.16" 122 | } 123 | }, 124 | "nbformat": 4, 125 | "nbformat_minor": 2 126 | } 127 | -------------------------------------------------------------------------------- /open_llm_leaderboard/winogrande.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# WinoGrande Task\n", 8 | "\n", 9 | "An adversarial and difficult Winograd benchmark at scale, for commonsense reasoning." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from zeno_client import ZenoClient, ZenoMetric\n", 19 | "import datasets\n", 20 | "import os\n", 21 | "import dotenv\n", 22 | "\n", 23 | "dotenv.load_dotenv(override=True)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "API_KEY = os.environ[\"ZENO_API_KEY\"]\n", 33 | "client = ZenoClient(API_KEY)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "models = [\"01-ai__Yi-34B_public\"]" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "base_df = datasets.load_dataset(\"winogrande\", \"winogrande_m\")[\"validation\"].to_pandas()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "proj = client.create_project(\n", 61 | " name=\"WinoGrande OpenLLM Leaderboard\", \n", 62 | " view=\"text-classification\", \n", 63 | " description=\"WinoGrande task in the Open-LLM-Leaderboard (https://arxiv.org/abs/1907.10641).\",\n", 64 | " metrics=[\n", 65 | " ZenoMetric(name=\"acc\", type=\"mean\", columns=[\"acc\"])\n", 66 | " ]\n", 67 | ")" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "base_df['id'] = base_df.index\n", 77 | "base_df[\"input length\"] = base_df[\"sentence\"].str.len()\n", 78 | "base_df[\"prompt\"] = base_df.apply(lambda x: f\"{x['sentence']}\\n\\n{x['option1']}\\n{x['option2']}\", axis=1)\n", 79 | "base_df[\"label\"] = base_df.apply(lambda x: x[\"option1\"] if int(x[\"answer\"]) == 1 else x[\"option2\"], axis=1)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "proj.upload_dataset(base_df[['id', \"prompt\", \"label\", \"input length\"]], id_column=\"id\", data_column=\"prompt\", label_column=\"label\")" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "for m in models:\n", 98 | " output_df = datasets.load_dataset(\"open-llm-leaderboard/details_\" + m, \"harness_winogrande_5\")[\"latest\"].to_pandas()\n", 99 | " merged_df = output_df.merge(base_df, left_on=\"example\", right_on=\"sentence\")\n", 100 | " merged_df[\"output\"] = merged_df.apply(lambda x: f\"{x['option1'] if x['predictions'][0] > x['predictions'][1] else x['option2']} {x['predictions']}\", axis=1)\n", 101 | " merged_df[\"acc\"] = merged_df['metrics'].apply(lambda x: x[\"acc\"])\n", 102 | " proj.upload_system(merged_df[[\"id\", \"output\", \"acc\"]], name=m, id_column=\"id\", output_column=\"output\")" 103 | ] 104 | } 105 | ], 106 | "metadata": { 107 | "kernelspec": { 108 | "display_name": "compare", 109 | "language": "python", 110 | "name": "python3" 111 | }, 112 | "language_info": { 113 | "codemirror_mode": { 114 | "name": "ipython", 115 | "version": 3 116 | }, 117 | "file_extension": ".py", 118 | "mimetype": "text/x-python", 119 | "name": "python", 120 | "nbconvert_exporter": "python", 121 | "pygments_lexer": "ipython3", 122 | "version": "3.9.16" 123 | } 124 | }, 125 | "nbformat": 4, 126 | "nbformat_minor": 2 127 | } 128 | 
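A note on the WinoGrande notebook above: the harness stores one
log-likelihood per answer option in `predictions`, and the notebook's lambda
simply picks the option with the higher score. A standalone sketch of that
mapping, with invented values (not taken from the leaderboard):

```python
# Illustrative only: the sentence options and scores below are made up.
row = {
    "option1": "suitcase",
    "option2": "doorway",
    "predictions": [-3.1, -4.2],  # log-likelihood of option1, option2
}

# The option with the higher log-likelihood is the model's prediction,
# mirroring the comparison done in the notebook's upload loop.
predicted = (
    row["option1"]
    if row["predictions"][0] > row["predictions"][1]
    else row["option2"]
)
print(predicted)  # -> suitcase
```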
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Zeno Build 2 | 3 | [![MIT license](https://img.shields.io/badge/License-MIT-blue.svg)](https://lbesson.mit-license.org/) 4 | [![Discord](https://img.shields.io/discord/1086004954872950834)](https://discord.gg/km62pDKAkE) 5 | [![Open Zeno](https://img.shields.io/badge/%20-Open_Zeno-612593.svg?labelColor=white&logo=data:image/svg%2bxml;base64,PHN2ZyB3aWR0aD0iMzMiIGhlaWdodD0iMzMiIHZpZXdCb3g9IjAgMCAzMyAzMyIgZmlsbD0ibm9uZSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KPHBhdGggZD0iTTMyIDE1Ljc4NDJMMTYuNDg2MiAxNS43ODQyTDE2LjQ4NjIgMC4yNzA0MDFMMjQuMzAyIDguMDg2MTdMMzIgMTUuNzg0MloiIGZpbGw9IiM2MTI1OTMiLz4KPHBhdGggZD0iTTE1Ljc5MTcgMTUuODMxMUw4LjAzNDc5IDguMDc0MjJMMTUuNzkxNyAwLjMxNzMyOEwxNS43OTE3IDE1LjgzMTFaIiBmaWxsPSIjNjEyNTkzIiBmaWxsLW9wYWNpdHk9IjAuOCIvPgo8cGF0aCBkPSJNMTQuODY1NSAxNS44MzExTDcuNTk0ODUgMTUuODMxMUw3LjU5NDg1IDguNTYwNDJMMTQuODY1NSAxNS44MzExWiIgZmlsbD0iIzYxMjU5MyIgZmlsbC1vcGFjaXR5PSIwLjYiLz4KPHBhdGggZD0iTTYuMTEyOSAxNS44MzExTDMuMjQxNyAxNS44MzExTDMuMjQxNyAxMi44NjcyTDYuMTEyOSAxNS44MzExWiIgZmlsbD0iIzZBMUI5QSIgZmlsbC1vcGFjaXR5PSIwLjQiLz4KPHBhdGggZD0iTTIuNzMyMjggMTUuODMxTDEuNTE1NSAxNC42MTQzTDIuNzQyNzEgMTMuMzg3TDIuNzMyMjggMTUuODMxWiIgZmlsbD0iIzZBMUI5QSIgZmlsbC1vcGFjaXR5PSIwLjMiLz4KPHBhdGggZD0iTTIuMDM3NiAxNS43ODQyTDEuMTU3NzEgMTUuNzg0MkwxLjE1NzcxIDE0Ljk1MDZMMi4wMzc2IDE1Ljc4NDJaIiBmaWxsPSIjNkExQjlBIiBmaWxsLW9wYWNpdHk9IjAuMiIvPgo8cGF0aCBkPSJNMC44MzM1NjggMTUuNzg0MUwwLjUwOTM5OSAxNS40NkwwLjgzMzU2NyAxNS4xMzU4TDAuODMzNTY4IDE1Ljc4NDFaIiBmaWxsPSIjNjEyNTkzIiBmaWxsLW9wYWNpdHk9IjAuMSIvPgo8cGF0aCBkPSJNMC4xMDYxODcgMTUuNzk0NEwwLjMwMTAyNSAxNS41OTk2TDAuNDk1ODYzIDE1Ljc5NDRIMC4xMDYxODdaIiBmaWxsPSIjNjEyNTkzIiBmaWxsLW9wYWNpdHk9IjAuMSIvPgo8cGF0aCBkPSJNNi45NTIxMyAxNS44MjQ4TDMuNjQwOTkgMTIuNTEzN0w2Ljk2OTYzIDkuMTg1MDNMNi45NTIxMyAxNS44MjQ4WiIgZmlsbD0iIzYxMjU5MyIgZmlsbC1vcGFjaXR5PSIwLjUiLz4KPHBhdGggZD0iTTAuMjk0MjM1IDE2LjQ3OTVMMTUuODA4IDE2LjQ3OTVMMTUuODA4IDMxLjk5MzNMNy45OTIyMyAyNC4xNzc1TDAuMjk0MjM1IDE2LjQ3OTVaIiBmaWxsPSIjNjEyNTkzIi8+CjxwYXRoIGQ9Ik0xNi40OTU2IDE3LjI0MzZMMjMuODUwNyAyNC41ODVMMTYuNDk1NiAzMS45NEwxNi40OTU2IDE3LjI0MzZaIiBmaWxsPSIjNjEyNTkzIiBmaWxsLW9wYWNpdHk9IjAuOCIvPgo8cGF0aCBkPSJNMTYuNTMyNiAxNi40Nzk1TDI0LjQ1MTUgMTYuNDc5NUwyNC40NTE1IDI0LjAyOEwxNi41MzI2IDE2LjQ3OTVaIiBmaWxsPSIjNjEyNTkzIiBmaWxsLW9wYWNpdHk9IjAuNiIvPgo8cGF0aCBkPSJNMjYuMTgxMyAxNi40MzI2TDI5LjA1MjUgMTYuNDMyNkwyOS4wNTI1IDE5LjM5NjRMMjYuMTgxMyAxNi40MzI2WiIgZmlsbD0iIzZBMUI5QSIgZmlsbC1vcGFjaXR5PSIwLjQiLz4KPHBhdGggZD0iTTI5LjU2MTkgMTYuNDMyNkwzMC43Nzg3IDE3LjY0OTRMMjkuNTUxNSAxOC44NzY2TDI5LjU2MTkgMTYuNDMyNloiIGZpbGw9IiM2QTFCOUEiIGZpbGwtb3BhY2l0eT0iMC4zIi8+CjxwYXRoIGQ9Ik0zMC4yNTY2IDE2LjQ3OTVMMzEuMTM2NSAxNi40Nzk1TDMxLjEzNjUgMTcuMzEzMUwzMC4yNTY2IDE2LjQ3OTVaIiBmaWxsPSIjNkExQjlBIiBmaWxsLW9wYWNpdHk9IjAuMiIvPgo8cGF0aCBkPSJNMzEuNDYwNiAxNi40Nzk1TDMxLjc4NDggMTYuODAzN0wzMS40NjA2IDE3LjEyNzlMMzEuNDYwNiAxNi40Nzk1WiIgZmlsbD0iIzYxMjU5MyIgZmlsbC1vcGFjaXR5PSIwLjEiLz4KPHBhdGggZD0iTTMyLjE4OCAxNi40NjkyTDMxLjk5MzIgMTYuNjY0MUwzMS43OTgzIDE2LjQ2OTJIMzIuMTg4WiIgZmlsbD0iIzYxMjU5MyIgZmlsbC1vcGFjaXR5PSIwLjEiLz4KPHBhdGggZD0iTTI1LjM0MjEgMTYuNDM4OUwyOC42NTMyIDE5Ljc1TDI1LjMyNDYgMjMuMDc4NkwyNS4zNDIxIDE2LjQzODlaIiBmaWxsPSIjNjEyNTkzIiBmaWxsLW9wYWNpdHk9IjAuNSIvPgo8L3N2Zz4K)](https://hub.zenoml.com) 6 | 7 | **Zeno Build** is a collection of examples using **Zeno** to evaluate generative 8 | AI models. Use it to get started with common evaluation setups. 
9 | 
10 | The examples in this repository are architecture-agnostic; we don't care if you
11 | are using
12 | [OpenAI](https://openai.com/),
13 | [LangChain](https://github.com/hwchase17/langchain), or [Hugging
14 | Face](https://huggingface.co).
15 | 
16 | Sound interesting? Read on!
17 | 
18 | ## Examples
19 | 
20 | Each of the examples in this repository is specifically designed to be
21 | self-contained and easy to modify. To get started with
22 | Zeno, we suggest that you find the closest example to what you're trying
23 | to do, copy the example to a new directory, and start hacking!
24 | 
25 | ## Contributing Back
26 | 
27 | If you build something cool, **we'd love for you to contribute it back**. We
28 | welcome pull requests of both new examples and new functionality for the core
29 | `zeno_build` library. If this is of interest to you, please click through to
30 | our [contributing doc](CONTRIBUTING.md) to learn more.
31 | 
32 | ## Get in Touch
33 | 
34 | If you have any questions, feature requests, bug reports, etc., we recommend
35 | getting in touch via the GitHub [issues
36 | page](https://github.com/zeno-ml/zeno-build/issues) or
37 | [Discord](https://discord.gg/km62pDKAkE), where the community can discuss and/or
38 | implement your suggestions!
39 | 
--------------------------------------------------------------------------------
/transcription/README.md:
--------------------------------------------------------------------------------
1 | # Audio Transcription
2 | 
3 | [![Open with Zeno](https://img.shields.io/badge/%20-Open_with_Zeno-612593.svg?labelColor=white&logo=data:image/svg%2bxml;base64,PHN2ZyB3aWR0aD0iMzMiIGhlaWdodD0iMzMiIHZpZXdCb3g9IjAgMCAzMyAzMyIgZmlsbD0ibm9uZSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KPHBhdGggZD0iTTMyIDE1Ljc4NDJMMTYuNDg2MiAxNS43ODQyTDE2LjQ4NjIgMC4yNzA0MDFMMjQuMzAyIDguMDg2MTdMMzIgMTUuNzg0MloiIGZpbGw9IiM2MTI1OTMiLz4KPHBhdGggZD0iTTE1Ljc5MTcgMTUuODMxMUw4LjAzNDc5IDguMDc0MjJMMTUuNzkxNyAwLjMxNzMyOEwxNS43OTE3IDE1LjgzMTFaIiBmaWxsPSIjNjEyNTkzIiBmaWxsLW9wYWNpdHk9IjAuOCIvPgo8cGF0aCBkPSJNMTQuODY1NSAxNS44MzExTDcuNTk0ODUgMTUuODMxMUw3LjU5NDg1IDguNTYwNDJMMTQuODY1NSAxNS44MzExWiIgZmlsbD0iIzYxMjU5MyIgZmlsbC1vcGFjaXR5PSIwLjYiLz4KPHBhdGggZD0iTTYuMTEyOSAxNS44MzExTDMuMjQxNyAxNS44MzExTDMuMjQxNyAxMi44NjcyTDYuMTEyOSAxNS44MzExWiIgZmlsbD0iIzZBMUI5QSIgZmlsbC1vcGFjaXR5PSIwLjQiLz4KPHBhdGggZD0iTTIuNzMyMjggMTUuODMxTDEuNTE1NSAxNC42MTQzTDIuNzQyNzEgMTMuMzg3TDIuNzMyMjggMTUuODMxWiIgZmlsbD0iIzZBMUI5QSIgZmlsbC1vcGFjaXR5PSIwLjMiLz4KPHBhdGggZD0iTTIuMDM3NiAxNS43ODQyTDEuMTU3NzEgMTUuNzg0MkwxLjE1NzcxIDE0Ljk1MDZMMi4wMzc2IDE1Ljc4NDJaIiBmaWxsPSIjNkExQjlBIiBmaWxsLW9wYWNpdHk9IjAuMiIvPgo8cGF0aCBkPSJNMC44MzM1NjggMTUuNzg0MUwwLjUwOTM5OSAxNS40NkwwLjgzMzU2NyAxNS4xMzU4TDAuODMzNTY4IDE1Ljc4NDFaIiBmaWxsPSIjNjEyNTkzIiBmaWxsLW9wYWNpdHk9IjAuMSIvPgo8cGF0aCBkPSJNMC4xMDYxODcgMTUuNzk0NEwwLjMwMTAyNSAxNS41OTk2TDAuNDk1ODYzIDE1Ljc5NDRIMC4xMDYxODdaIiBmaWxsPSIjNjEyNTkzIiBmaWxsLW9wYWNpdHk9IjAuMSIvPgo8cGF0aCBkPSJNNi45NTIxMyAxNS44MjQ4TDMuNjQwOTkgMTIuNTEzN0w2Ljk2OTYzIDkuMTg1MDNMNi45NTIxMyAxNS44MjQ4WiIgZmlsbD0iIzYxMjU5MyIgZmlsbC1vcGFjaXR5PSIwLjUiLz4KPHBhdGggZD0iTTAuMjk0MjM1IDE2LjQ3OTVMMTUuODA4IDE2LjQ3OTVMMTUuODA4IDMxLjk5MzNMNy45OTIyMyAyNC4xNzc1TDAuMjk0MjM1IDE2LjQ3OTVaIiBmaWxsPSIjNjEyNTkzIi8+CjxwYXRoIGQ9Ik0xNi40OTU2IDE3LjI0MzZMMjMuODUwNyAyNC41ODVMMTYuNDk1NiAzMS45NEwxNi40OTU2IDE3LjI0MzZaIiBmaWxsPSIjNjEyNTkzIiBmaWxsLW9wYWNpdHk9IjAuOCIvPgo8cGF0aCBkPSJNMTYuNTMyNiAxNi40Nzk1TDI0LjQ1MTUgMTYuNDc5NUwyNC40NTE1IDI0LjAyOEwxNi41MzI2IDE2LjQ3OTVaIiBmaWxsPSIjNjEyNTkzIiBmaWxsLW9wYWNpdHk9IjAuNiIvPgo8cGF0aCBkPSJNMjYuMTgxMyAxNi40MzI2TDI5LjA1MjUgMTYuNDMyNkwyOS4wNTI1IDE5LjM5NjRMMjYuMTgxMyAxNi40MzI2WiIgZmlsbD0iIzZBMUI5QSIgZmlsbC1vcGFjaXR5PSIwLjQiLz4KPHBhdGggZD0iTTI5LjU2MTkgMTYuNDMyNkwzMC43Nzg3IDE3LjY0OTRMMjkuNTUxNSAxOC44NzY2TDI5LjU2MTkgMTYuNDMyNloiIGZpbGw9IiM2QTFCOUEiIGZpbGwtb3BhY2l0eT0iMC4zIi8+CjxwYXRoIGQ9Ik0zMC4yNTY2IDE2LjQ3OTVMMzEuMTM2NSAxNi40Nzk1TDMxLjEzNjUgMTcuMzEzMUwzMC4yNTY2IDE2LjQ3OTVaIiBmaWxsPSIjNkExQjlBIiBmaWxsLW9wYWNpdHk9IjAuMiIvPgo8cGF0aCBkPSJNMzEuNDYwNiAxNi40Nzk1TDMxLjc4NDggMTYuODAzN0wzMS40NjA2IDE3LjEyNzlMMzEuNDYwNiAxNi40Nzk1WiIgZmlsbD0iIzYxMjU5MyIgZmlsbC1vcGFjaXR5PSIwLjEiLz4KPHBhdGggZD0iTTMyLjE4OCAxNi40NjkyTDMxLjk5MzIgMTYuNjY0MUwzMS43OTgzIDE2LjQ2OTJIMzIuMTg4WiIgZmlsbD0iIzYxMjU5MyIgZmlsbC1vcGFjaXR5PSIwLjEiLz4KPHBhdGggZD0iTTI1LjM0MjEgMTYuNDM4OUwyOC42NTMyIDE5Ljc1TDI1LjMyNDYgMjMuMDc4NkwyNS4zNDIxIDE2LjQzODlaIiBmaWxsPSIjNjEyNTkzIiBmaWxsLW9wYWNpdHk9IjAuNSIvPgo8L3N2Zz4K)](https://hub.zenoml.com/report/cabreraalex/Audio%20Transcription%20Report)
4 | 
5 | Audio transcription is an essential task for applications such as voice assistants,
6 | podcast search, and video captioning. There are numerous open-source and commercial
7 | tools for audio transcription, and it can be difficult to know which one to use.
8 | [OpenAI's Whisper](https://github.com/openai/whisper) is often people's
9 | go-to choice, but there are nine different models to choose from with different
10 | sizes, speeds, and costs.
11 | 
12 | In this example, we'll use Zeno to compare the performance of the different
13 | models on the [Speech Accent Archive](https://accent.gmu.edu/) dataset.
14 | The dataset has over 2,000 people from around the world reading the same
15 | paragraph in English. We'll use the dataset to evaluate the performance of
16 | the different models on different accents and English fluency levels.
17 | 
18 | The result of running Zeno Build will be an interface where you
19 | can browse and explore the results. See an example below:
20 | 
21 | - [Browsing Interface](https://hub.zenoml.com/project/cabreraalex/Audio%20Transcription%20Accents/explore)
22 | - [Textual Summary](https://hub.zenoml.com/report/cabreraalex/Audio%20Transcription%20Report)
23 | 
24 | ## Setup
25 | 
26 | To run this example, you'll need to install the requirements.
27 | 
28 | ```bash
29 | pip install -r requirements.txt
30 | ```
31 | 
32 | This example also requires the `ffmpeg` library to be installed. You can test
33 | if it is installed by running `ffmpeg --help`. If it is not found, you should
34 | install it through your package manager. For example, if you are using conda,
35 | you can just run the following (and other managers such as `brew` and `apt` also
36 | work).
37 | 
38 | ```bash
39 | conda install ffmpeg
40 | ```
41 | 
42 | ## Run the Example
43 | 
44 | Follow `transcription.ipynb` to run inference and generate a Zeno project.
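For a sense of what the notebook does, here is a minimal sketch of one
transcription-and-scoring step, assuming `ffmpeg` is installed and using a
hypothetical audio file name:

```python
import jiwer
import whisper

# Load one of the Whisper checkpoints ("tiny" is the smallest and fastest).
model = whisper.load_model("tiny")

# Transcribe a single clip; the file name here is hypothetical.
result = model.transcribe("speaker_0001.mp3")

# Every Speech Accent Archive clip reads the same elicitation paragraph
# (its opening is shown here), so the word error rate can be computed
# against a fixed reference text.
reference = "Please call Stella. Ask her to bring these things with her from the store."
print(f"WER: {jiwer.wer(reference, result['text']):.2%}")
```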
45 | -------------------------------------------------------------------------------- /taxeval/taxes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Setup\n", 8 | "\n", 9 | "Import libraries, set up model, and read input data.\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "import json\n", 20 | "import os\n", 21 | "from dotenv import load_dotenv\n", 22 | "\n", 23 | "from zeno_client import ZenoClient, ZenoMetric" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "data = json.load(open(\"tax-benchmark.json\"))" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "def format_question(input):\n", 42 | " return_question = input[\"source_question\"][\"description\"].replace(\"\\\\n\", \"\\n\")\n", 43 | " return_question += \"\\n\\n\"\n", 44 | " for answer in enumerate(input[\"source_question\"][\"options\"]):\n", 45 | " return_question += f\"{answer[0] + 1}. {answer[1]}\\n\"\n", 46 | " return return_question\n", 47 | "\n", 48 | "\n", 49 | "df_input = pd.DataFrame(\n", 50 | " {\n", 51 | " \"question\": [format_question(d) for d in data],\n", 52 | " \"answer\": [str(d[\"source_question\"][\"correct_answer\"]) for d in data],\n", 53 | " \"reference\": [d[\"source_question\"][\"reference\"] for d in data],\n", 54 | " \"tag\": [d[\"source_question\"][\"tag\"] for d in data],\n", 55 | " \"category\": [d[\"source_question\"][\"category\"] for d in data],\n", 56 | " }\n", 57 | ")\n", 58 | "df_input[\"question length\"] = df_input[\"question\"].apply(lambda x: len(x))\n", 59 | "df_input[\"id\"] = df_input.index" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "# optional, generate topics using BERTopic\n", 69 | "from bertopic import BERTopic\n", 70 | "\n", 71 | "topic_model = BERTopic(\"english\", min_topic_size=3)\n", 72 | "topics, probs = topic_model.fit_transform(\n", 73 | " [d[\"source_question\"][\"description\"] for d in data]\n", 74 | ")\n", 75 | "df_input[\"topic\"] = topics\n", 76 | "df_input[\"topic\"] = df_input[\"topic\"].astype(str)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "## Create Zeno Project\n", 84 | "\n", 85 | "Our view configuration will feature markdown for the input data and the system output.\n", 86 | "We'll add two metrics, accuracy and output length.\n" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "load_dotenv(\"../.env\", override=True)\n", 96 | "client = ZenoClient(os.environ.get(\"ZENO_API_KEY\"))" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "project = client.create_project(\n", 106 | " name=\"LLM Taxes Benchmark\",\n", 107 | " view={\n", 108 | " \"data\": {\"type\": \"markdown\"},\n", 109 | " \"label\": {\"type\": \"text\"},\n", 110 | " \"output\": {\"type\": \"markdown\"},\n", 111 | " },\n", 112 | " description=\"Tax questions for LLMs\",\n", 113 | " public=True,\n", 114 | " metrics=[\n", 115 | 
" ZenoMetric(name=\"accuracy\", type=\"mean\", columns=[\"correct\"]),\n", 116 | " ZenoMetric(name=\"output length\", type=\"mean\", columns=[\"output length\"]),\n", 117 | " ],\n", 118 | ")" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "project.upload_dataset(\n", 128 | " df_input, id_column=\"id\", data_column=\"question\", label_column=\"answer\"\n", 129 | ")" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "for model in data[0][\"full\"].keys():\n", 139 | " df_system = pd.DataFrame(\n", 140 | " {\n", 141 | " \"output\": [\n", 142 | " f\"**Full:** {d['full'][model]}\\n\\n**Simplified**: {d['simplified'][model]}\"\n", 143 | " for d in data\n", 144 | " ],\n", 145 | " \"output length\": [len(d[\"full\"][model]) for d in data],\n", 146 | " \"simplified output\": [str(d[\"simplified\"][model]) for d in data],\n", 147 | " }\n", 148 | " )\n", 149 | " df_system[\"correct\"] = df_input[\"answer\"] == df_system[\"simplified output\"]\n", 150 | " df_system[\"id\"] = df_input[\"id\"]\n", 151 | " project.upload_system(\n", 152 | " df_system, name=model.replace(\"/\", \"-\"), id_column=\"id\", output_column=\"output\"\n", 153 | " )" 154 | ] 155 | } 156 | ], 157 | "metadata": { 158 | "kernelspec": { 159 | "display_name": "compare", 160 | "language": "python", 161 | "name": "python3" 162 | }, 163 | "language_info": { 164 | "codemirror_mode": { 165 | "name": "ipython", 166 | "version": 3 167 | }, 168 | "file_extension": ".py", 169 | "mimetype": "text/x-python", 170 | "name": "python", 171 | "nbconvert_exporter": "python", 172 | "pygments_lexer": "ipython3", 173 | "version": "3.10.11" 174 | } 175 | }, 176 | "nbformat": 4, 177 | "nbformat_minor": 2 178 | } 179 | -------------------------------------------------------------------------------- /open_llm_leaderboard/drop.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# DROP Task\n", 8 | "\n", 9 | "English reading comprehension benchmark requiring Discrete Reasoning Over the content of Paragraphs.\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from zeno_client import ZenoClient, ZenoMetric\n", 19 | "import datasets\n", 20 | "import os\n", 21 | "import dotenv\n", 22 | "\n", 23 | "dotenv.load_dotenv(override=True)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "client = ZenoClient(os.environ[\"ZENO_API_KEY\"])" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "Select which models you want to analyze. 
These paths should match the repository names you see when clicking on the 📄 icon next to models on the leaderboard, *without* the `details_` prefix: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard"
40 |    ]
41 |   },
42 |   {
43 |    "cell_type": "code",
44 |    "execution_count": null,
45 |    "metadata": {},
46 |    "outputs": [],
47 |    "source": [
48 |     "# falcon-180B, llama2-70B, mistral-7B (underperforming)\n",
49 |     "# Yi-34B, tigerbot-70B, possibly internlm-20B (perform well)\n",
50 |     "# facebook/xglm-7.5B falls in the middle\n",
51 |     "models = [\"01-ai__Yi-34B_public\", \"TigerResearch__tigerbot-70b-chat\", \"tiiuae__falcon-180B\", \"mistralai__Mistral-7B-v0.1\", \"facebook__xglm-7.5B\"]"
52 |    ]
53 |   },
54 |   {
55 |    "cell_type": "markdown",
56 |    "metadata": {},
57 |    "source": [
58 |     "We load the base DROP dataset to get the ground-truth answers. We also do some processing on the inputs."
59 |    ]
60 |   },
61 |   {
62 |    "cell_type": "code",
63 |    "execution_count": null,
64 |    "metadata": {},
65 |    "outputs": [],
66 |    "source": [
67 |     "base_df = datasets.load_dataset(\"drop\")[\"validation\"].to_pandas()"
68 |    ]
69 |   },
70 |   {
71 |    "cell_type": "code",
72 |    "execution_count": null,
73 |    "metadata": {},
74 |    "outputs": [],
75 |    "source": [
76 |     "base_df = base_df.drop_duplicates(subset=['query_id'])\n",
77 |     "base_df['input'] = base_df.apply(lambda x: f\"**Passage**: {x['passage']} \\n\\n**Question:** {x['question']}\", axis=1)\n",
78 |     "base_df['answers'] = base_df.apply(lambda x: \", \".join(x['answers_spans']['spans']), axis=1)\n",
79 |     "base_df['answer type'] = base_df[\"answers_spans\"].apply(lambda x: x['types'][0])\n",
80 |     "base_df['passage length'] = base_df['passage'].str.len()\n",
81 |     "base_df['question length'] = base_df['question'].str.len()"
82 |    ]
83 |   },
84 |   {
85 |    "cell_type": "code",
86 |    "execution_count": null,
87 |    "metadata": {},
88 |    "outputs": [],
89 |    "source": [
90 |     "proj = client.create_project(\n",
91 |     "    name=\"OpenLLM Leaderboard DROP Comparison\", \n",
92 |     "    view={\n",
93 |     "        \"data\": {\n",
94 |     "            \"type\": \"markdown\"\n",
95 |     "        },\n",
96 |     "        \"label\": {\n",
97 |     "            \"type\": \"text\"\n",
98 |     "        },\n",
99 |     "        \"output\": {\n",
100 |     "            \"type\": \"text\"\n",
101 |     "        } \n",
102 |     "    }, \n",
103 |     "    description=\"Exploring performance differences on DROP for models in OpenLLM Leaderboard.\",\n",
104 |     "    metrics=[\n",
105 |     "        ZenoMetric(name=\"f1\", type=\"mean\", columns=[\"f1\"]),\n",
106 |     "        ZenoMetric(name=\"em\", type=\"mean\", columns=[\"em\"]),\n",
107 |     "        ZenoMetric(name=\"avg output length\", type=\"mean\", columns=[\"output length\"])\n",
108 |     "    ]\n",
109 |     ")"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": null,
115 |    "metadata": {},
116 |    "outputs": [],
117 |    "source": [
118 |     "proj.upload_dataset(base_df[[\"query_id\", \"input\", \"answers\", \"passage length\", \"question length\", \"answer type\"]], id_column=\"query_id\", data_column=\"input\", label_column=\"answers\")"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": null,
124 |    "metadata": {},
125 |    "outputs": [],
126 |    "source": [
127 |     "output_df = datasets.load_dataset(\"open-llm-leaderboard/details_\" + models[1], \"harness_drop_3\")[\"latest\"].to_pandas()"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": null,
133 |    "metadata": {},
134 |    "outputs": [],
135 |    "source": [
136 |     "for m in models:\n",
137 |     "    print(\"uploading \", m)\n",
138 |     "    output_df = 
datasets.load_dataset(\"open-llm-leaderboard/details_\" + m, \"harness_drop_3\")[\"latest\"].to_pandas()\n", 139 | " merged_df = output_df.merge(base_df, left_on=\"example\", right_on=\"query_id\").drop_duplicates(subset=['query_id'])\n", 140 | " merged_df['output'] = merged_df['predictions'].apply(lambda x: x[0] if len(x) > 0 else '')\n", 141 | " if \"metrics\" in merged_df.columns:\n", 142 | " merged_df[\"f1\"] = merged_df['metrics'].apply(lambda x: x['f1'])\n", 143 | " merged_df[\"em\"] = merged_df['metrics'].apply(lambda x: x['em'])\n", 144 | " else:\n", 145 | " merged_df[\"f1\"] = merged_df['f1']\n", 146 | " merged_df[\"em\"] = merged_df['em']\n", 147 | " merged_df['output length'] = merged_df['output'].str.len()\n", 148 | " proj.upload_system(merged_df[[\"query_id\", \"output\", \"f1\", \"em\", \"output length\"]], name=m, id_column=\"query_id\", output_column=\"output\")" 149 | ] 150 | } 151 | ], 152 | "metadata": { 153 | "kernelspec": { 154 | "display_name": "compare", 155 | "language": "python", 156 | "name": "python3" 157 | }, 158 | "language_info": { 159 | "codemirror_mode": { 160 | "name": "ipython", 161 | "version": 3 162 | }, 163 | "file_extension": ".py", 164 | "mimetype": "text/x-python", 165 | "name": "python", 166 | "nbconvert_exporter": "python", 167 | "pygments_lexer": "ipython3", 168 | "version": "3.9.16" 169 | } 170 | }, 171 | "nbformat": 4, 172 | "nbformat_minor": 2 173 | } 174 | -------------------------------------------------------------------------------- /open_llm_leaderboard/truthfulqa.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# TruthfulQA\n", 8 | "\n", 9 | "TruthfulQA differs quite a bit from the other tasks as there is not a single correct answer. Interestingly, all the true responses come before the false responses." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from zeno_client import ZenoClient, ZenoMetric\n", 19 | "import datasets\n", 20 | "import os\n", 21 | "\n", 22 | "API_KEY = os.environ[\"ZENO_API_KEY\"]" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "Feel free to change the list of models used.\n", 30 | "You can go to the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) to check what models are available.\n", 31 | "Some of them might not have associated data, you can check this by clicking on the little icon next to the model name.\n", 32 | "If you get a 404 after clicking, we won't be able to fetch the model data and this notebook will crash." 
33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "models = [\"meta-llama/Llama-2-70b-hf\", \"mistralai/Mistral-7B-v0.1\", \"tiiuae/falcon-40b\", \"Riiid/sheep-duck-llama-2-70b-v1.1\", \"AIDC-ai-business/Marcoroni-70B-v1\", \"ICBU-NPU/FashionGPT-70B-V1.1\", \"adonlee/LLaMA_2_70B_LoRA\", \"uni-tianyan/Uni-TianYan\"]" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "def get_data(model: str):\n", 51 | " data_path = \"details_\" + model.replace(\"/\", \"__\")\n", 52 | " return datasets.load_dataset(\n", 53 | " \"open-llm-leaderboard/\" + data_path,\n", 54 | " \"harness_truthfulqa_mc_0\",\n", 55 | " )" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "def generate_dataset(df):\n", 65 | " df_lim = df[[\"question\", \"mc2_targets\"]]\n", 66 | " df_lim.loc[:, \"data\"] = df_lim.apply(lambda x: \"\\n\" + x[\"question\"] + \"\\n\\n\" + \"\\n\".join(map(lambda y: \"- \" + str(y).lstrip(), x[\"mc2_targets\"][\"choices\"])), axis=1)\n", 67 | " df_lim.loc[:, \"# of options\"] = df_lim.apply(lambda x: len(x[\"mc2_targets\"][\"choices\"]), axis=1)\n", 68 | " df_lim.loc[:, \"# of answers\"] = df_lim.apply(lambda x: x[\"mc2_targets\"][\"labels\"].sum(), axis=1)\n", 69 | " df_lim.loc[:, \"label\"] = df_lim[\"mc2_targets\"].apply(lambda r: \"\\n\" + \"\\n\".join([\"True\" if s == 1 else \"False\" for s in r[\"labels\"]]))\n", 70 | " df_lim = df_lim.drop(columns=[\"mc2_targets\", \"question\"])\n", 71 | " df_lim[\"id\"] = df_lim.index \n", 72 | " return df_lim\n", 73 | "\n", 74 | "def generate_system(df):\n", 75 | " df_system = df[[\"predictions\", \"mc2\", \"mc1_targets\"]]\n", 76 | " df_system[\"predictions\"] = df_system.apply(lambda x: f\"mc2: {x['mc2']}\" + \"\\n\\n\" + \"\\n\".join(map(lambda y: str(round(y, 2)), x['predictions'][len(x['mc1_targets']['choices']):])), axis=1)\n", 77 | " df_system[\"id\"] = df_system.index\n", 78 | " return df_system[[\"predictions\", \"mc2\", \"id\"]]" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "Make sure you have your Zeno API key in your environment variables." 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "client = ZenoClient(API_KEY)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "Lets create a project to hold the data for the TruthfulQA task." 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "proj = client.create_project(\n", 111 | " name=\"TruthfulQA\", \n", 112 | " view=\"text-classification\", \n", 113 | " description=\"TruthfulQA (https://arxiv.org/abs/2109.07958) task in the Open-LLM-Leaderboard.\",\n", 114 | " metrics=[\n", 115 | " ZenoMetric(name=\"accuracy\", type=\"mean\", columns=[\"mc2\"])\n", 116 | " ]\n", 117 | ")" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "Let us now upload the data to the project we just created." 
125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "df = generate_dataset(get_data(models[0])['latest'].to_pandas())\n", 134 | "print(\"\\nYour dataset has {} rows\\n\".format(len(df)))\n", 135 | "num_rows = len(df)\n", 136 | "proj.upload_dataset(df, id_column=\"id\", label_column=\"label\", data_column=\"data\")" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "Finally, let us upload all the model outputs for the models we specified above." 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "for model in models:\n", 153 | " dataset = get_data(model)['latest'].to_pandas()\n", 154 | " print(dataset.columns)\n", 155 | " if len(dataset) != num_rows:\n", 156 | " print(\"Skipping {} because it has {} rows instead of {}\".format(model, len(dataset), num_rows))\n", 157 | " continue\n", 158 | " df_system = generate_system(dataset)\n", 159 | " proj.upload_system(df_system[[\"predictions\", \"mc2\", \"id\"]], name=model.replace('/', \"__\"), output_column=\"predictions\", id_column=\"id\")" 160 | ] 161 | } 162 | ], 163 | "metadata": { 164 | "kernelspec": { 165 | "display_name": "compare", 166 | "language": "python", 167 | "name": "python3" 168 | }, 169 | "language_info": { 170 | "codemirror_mode": { 171 | "name": "ipython", 172 | "version": 3 173 | }, 174 | "file_extension": ".py", 175 | "mimetype": "text/x-python", 176 | "name": "python", 177 | "nbconvert_exporter": "python", 178 | "pygments_lexer": "ipython3", 179 | "version": "3.10.11" 180 | }, 181 | "orig_nbformat": 4 182 | }, 183 | "nbformat": 4, 184 | "nbformat_minor": 2 185 | } 186 | -------------------------------------------------------------------------------- /open_llm_leaderboard/hellaswag.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# HellaSwag\n", 8 | "\n", 9 | "HellaSwag is a common-sense inference task." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from zeno_client import ZenoClient, ZenoMetric\n", 19 | "import datasets\n", 20 | "import numpy as np\n", 21 | "import os\n", 22 | "\n", 23 | "API_KEY = os.environ[\"ZENO_API_KEY\"]" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "Feel free to change the list of models used.\n", 31 | "You can go to the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) to check what models are available.\n", 32 | "Some of them might not have associated data, you can check this by clicking on the little icon next to the model name.\n", 33 | "If you get a 404 after clicking, we won't be able to fetch the model data and this notebook will crash." 
34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "models = [\"meta-llama/Llama-2-70b-hf\", \"mistralai/Mistral-7B-v0.1\", \"tiiuae/falcon-40b\", \"Riiid/sheep-duck-llama-2-70b-v1.1\", \"AIDC-ai-business/Marcoroni-70B-v1\", \"ICBU-NPU/FashionGPT-70B-V1.1\", \"adonlee/LLaMA_2_70B_LoRA\", \"uni-tianyan/Uni-TianYan\"]" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "def get_data(model: str):\n", 52 | " data_path = \"details_\" + model.replace(\"/\", \"__\")\n", 53 | " return datasets.load_dataset(\n", 54 | " \"open-llm-leaderboard/\" + data_path,\n", 55 | " \"harness_hellaswag_10\",\n", 56 | " )" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "labels = [\"A\", \"B\", \"C\", \"D\"]\n", 66 | "\n", 67 | "def generate_dataset(df):\n", 68 | " df_lim = df[[\"example\", \"choices\", \"gold\"]]\n", 69 | " df_lim.loc[:, \"data\"] = df_lim.apply(lambda x: \"\\n\" + x[\"example\"] + \"\\n\\nOptions:\\n\" + \"\\n\".join(f\"{labels[i]}: {x}\" for i,x in enumerate(x['choices'])), axis=1)\n", 70 | " df_lim.loc[:, \"label\"] = df_lim.apply(lambda x: labels[x[\"gold\"]], axis=1)\n", 71 | " df_lim = df_lim.drop(columns=[\"example\", \"choices\", \"gold\"])\n", 72 | " df_lim[\"id\"] = df_lim.index \n", 73 | " return df_lim\n", 74 | "\n", 75 | "def generate_system(df):\n", 76 | " df_system = df[[\"predictions\", \"acc_norm\", \"choices\", \"acc\"]]\n", 77 | " df_system[\"answer_raw\"] = df_system.apply(lambda x: labels[np.argmax(x['predictions'])], axis=1)\n", 78 | " df_system[\"answer_norm\"] = df_system.apply(lambda x: labels[np.argmax(x['predictions']/np.array([float(len(i)) for i in x['choices']]))], axis=1)\n", 79 | " df_system[\"predictions\"] = df_system.apply(lambda x: x['answer_norm'] + \"\\n\\n\" + \"Raw Pred.: \" + \", \".join(map(lambda y: str(round(y, 2)), x['predictions'])) + \"\\nNorm Pred.: \" + \", \".join(map(lambda y: str(round(y, 2)), x['predictions']/np.array([float(len(i)) for i in x['choices']]))), axis=1)\n", 80 | " df_system[\"correct\"] = df_system.apply(lambda x: True if x['acc_norm'] > 0 else False, axis=1)\n", 81 | " df_system[\"correct_raw\"] = df_system.apply(lambda x: True if x['acc'] > 0 else False, axis=1)\n", 82 | " df_system = df_system.drop(columns=[\"acc_norm\", \"choices\", \"acc\"])\n", 83 | " df_system[\"id\"] = df_system.index\n", 84 | " return df_system" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "Make sure you have your Zeno API key in your environment variables." 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "client = ZenoClient(API_KEY)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "Lets create a project to hold the data for the HellaSwag task." 
120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "proj = client.create_project(\n", 129 | " name=\"HellaSwag\",\n", 130 | " view=\"text-classification\",\n", 131 | " description=\"HellaSwag (https://arxiv.org/abs/1905.07830) task in the Open-LLM-Leaderboard.\",\n", 132 | " metrics=[\n", 133 | " ZenoMetric(name=\"accuracy\", type=\"mean\", columns=[\"correct\"])\n", 134 | " ]\n", 135 | ")" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "Let us now upload the data to the project we just created." 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "df = generate_dataset(get_data(models[0])['latest'].to_pandas())\n", 152 | "print(\"\\nYour dataset has {} rows\\n\".format(len(df)))\n", 153 | "num_rows = len(df)\n", 154 | "proj.upload_dataset(df, id_column=\"id\", label_column=\"label\", data_column=\"data\")" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "Finally, let us upload all the model outputs for the models we specified above." 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "for model in models:\n", 171 | " dataset = get_data(model)['latest'].to_pandas()\n", 172 | " if len(dataset) != num_rows:\n", 173 | " print(\"Skipping {} because it has {} rows instead of {}\".format(model, len(dataset), num_rows))\n", 174 | " continue\n", 175 | " df_system = generate_system(dataset)\n", 176 | " proj.upload_system(df_system, name=model.replace('/', \"__\"), output_column=\"predictions\", id_column=\"id\")" 177 | ] 178 | } 179 | ], 180 | "metadata": { 181 | "kernelspec": { 182 | "display_name": "compare", 183 | "language": "python", 184 | "name": "python3" 185 | }, 186 | "language_info": { 187 | "codemirror_mode": { 188 | "name": "ipython", 189 | "version": 3 190 | }, 191 | "file_extension": ".py", 192 | "mimetype": "text/x-python", 193 | "name": "python", 194 | "nbconvert_exporter": "python", 195 | "pygments_lexer": "ipython3", 196 | "version": "3.10.11" 197 | }, 198 | "orig_nbformat": 4 199 | }, 200 | "nbformat": 4, 201 | "nbformat_minor": 2 202 | } 203 | -------------------------------------------------------------------------------- /open_llm_leaderboard/arc.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# ARC\n", 8 | "\n", 9 | "ARC is a question-answering task built from grade-school science questions; the leaderboard uses the 25-shot ARC-Challenge subset.\n",
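10 | "\n", 11 | "Unlike HellaSwag, ARC questions do not always have exactly four answer choices, which is why the label list below runs from A to E and why `generate_dataset` stores a `num_choices` column as extra metadata."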
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from zeno_client import ZenoClient, ZenoMetric\n", 21 | "import datasets\n", 22 | "import numpy as np\n", 23 | "import os" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "Feel free to change the list of models used.\n", 31 | "You can go to the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) to check what models are available.\n", 32 | "Some of them might not have associated data; you can check this by clicking on the little icon next to the model name.\n", 33 | "If you get a 404 after clicking, we won't be able to fetch the model data and this notebook will crash." 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "models = [\"meta-llama/Llama-2-70b-hf\", \"mistralai/Mistral-7B-v0.1\", \"tiiuae/falcon-40b\", \"Riiid/sheep-duck-llama-2-70b-v1.1\", \"AIDC-ai-business/Marcoroni-70B-v1\", \"ICBU-NPU/FashionGPT-70B-V1.1\", \"adonlee/LLaMA_2_70B_LoRA\", \"uni-tianyan/Uni-TianYan\"]" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "def get_data(model: str):\n", 52 | " data_path = \"details_\" + model.replace(\"/\", \"__\")\n", 53 | " return datasets.load_dataset(\n", 54 | " \"open-llm-leaderboard/\" + data_path,\n", 55 | " \"harness_arc_challenge_25\",\n", 56 | " )" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "labels = [\"A\", \"B\", \"C\", \"D\", \"E\"]\n", 66 | "\n", 67 | "def generate_dataset(df):\n", 68 | " df_lim = df[[\"example\", \"choices\", \"gold\"]].copy() # copy to avoid SettingWithCopyWarning\n", 69 | " df_lim.loc[:, \"data\"] = df_lim.apply(lambda x: \"\\n\" + x[\"example\"] + \"\\n\" + \"\\n\".join(f\"{labels[i]}: {x}\" for i,x in enumerate(x['choices'])), axis=1)\n", 70 | " df_lim.loc[:, \"num_choices\"] = df_lim.apply(lambda x: str(len(x['choices'])), axis=1)\n", 71 | " df_lim.loc[:, \"label\"] = df_lim.apply(lambda x: labels[x[\"gold\"]], axis=1)\n", 72 | " df_lim = df_lim.drop(columns=[\"example\", \"choices\", \"gold\"])\n", 73 | " df_lim[\"id\"] = df_lim.index\n", 74 | " return df_lim\n", 75 | "\n", 76 | "def generate_system(df):\n", 77 | " df_system = df[[\"predictions\", \"acc_norm\", \"choices\", \"acc\"]].copy() # copy to avoid SettingWithCopyWarning\n", 78 | " df_system[\"answer_raw\"] = df_system.apply(lambda x: labels[np.argmax(x['predictions'])], axis=1) # choice with the highest raw log-likelihood\n", 79 | " df_system[\"answer_norm\"] = df_system.apply(lambda x: labels[np.argmax(x['predictions']/np.array([float(len(i)) for i in x['choices']]))], axis=1) # choice after dividing each score by its character length\n", 80 | " df_system[\"predictions\"] = df_system.apply(lambda x: x['answer_norm'] + \"\\n\\n\" + \"Raw Pred.: \" + \", \".join(map(lambda y: str(round(y, 2)), x['predictions'])) + \"\\nNorm Pred.: \" + \", \".join(map(lambda y: str(round(y, 2)), x['predictions']/np.array([float(len(i)) for i in x['choices']]))), axis=1)\n", 81 | " df_system[\"correct\"] = df_system[\"acc_norm\"] > 0\n", 82 | " df_system[\"correct_raw\"] = df_system[\"acc\"] > 0\n", 83 | " df_system = df_system.drop(columns=[\"acc_norm\", \"choices\", \"acc\"])\n", 84 | " df_system[\"id\"] = df_system.index\n", 85 | " return df_system" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown",
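90 | "metadata": {}, 91 | "source": [ 92 | "A note on scoring: each entry in `predictions` is the model's log-likelihood for one answer choice. `acc` marks an example correct when the raw argmax hits the gold label, while `acc_norm` first divides each score by the character length of the choice, which is what `answer_norm` recomputes above. A toy example of how the two picks can disagree:\n", 93 | "\n", 94 | "```python\n", 95 | "import numpy as np\n", 96 | "\n", 97 | "scores = np.array([-9.1, -12.4]) # log-likelihood of each answer choice\n", 98 | "lengths = np.array([5.0, 20.0]) # character length of each answer choice\n", 99 | "np.argmax(scores) # 0: the short answer wins on raw score\n", 100 | "np.argmax(scores / lengths) # 1: length normalization flips the pick\n", 101 | "```" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown",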
"metadata": {}, 89 | "source": [ 90 | "Make sure you have your Zeno API key in your environment variables." 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "API_KEY = os.environ[\"ZENO_API_KEY\"]\n", 100 | "client = ZenoClient(API_KEY)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "Lets create a project to hold the data for the ARC task." 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "proj = client.create_project(\n", 117 | " name=\"ARC\", \n", 118 | " view=\"text-classification\", \n", 119 | " description=\"ARC (https://arxiv.org/abs/1803.05457) task in the Open-LLM-Leaderboard.\",\n", 120 | " metrics=[\n", 121 | " ZenoMetric(name=\"accuracy\", type=\"mean\", columns=[\"correct\"])\n", 122 | " ]\n", 123 | ")" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "Let us now upload the data to the project we just created." 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "df = generate_dataset(get_data(models[0])['latest'].to_pandas())\n", 140 | "print(\"\\nYour dataset has {} rows\\n\".format(len(df)))\n", 141 | "num_rows = len(df)\n", 142 | "proj.upload_dataset(df, id_column=\"id\", label_column=\"label\", data_column=\"data\")" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "Finally, let us upload all the model outputs for the models we specified above." 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "for model in models:\n", 159 | " dataset = get_data(model)['latest'].to_pandas()\n", 160 | " if len(dataset) != num_rows:\n", 161 | " print(\"Skipping {} because it has {} rows instead of {}\".format(model, len(dataset), num_rows))\n", 162 | " continue\n", 163 | " df_system = generate_system(dataset)\n", 164 | " proj.upload_system(df_system, name=model.replace('/', \"__\"), output_column=\"predictions\", id_column=\"id\")" 165 | ] 166 | } 167 | ], 168 | "metadata": { 169 | "kernelspec": { 170 | "display_name": "compare", 171 | "language": "python", 172 | "name": "python3" 173 | }, 174 | "language_info": { 175 | "codemirror_mode": { 176 | "name": "ipython", 177 | "version": 3 178 | }, 179 | "file_extension": ".py", 180 | "mimetype": "text/x-python", 181 | "name": "python", 182 | "nbconvert_exporter": "python", 183 | "pygments_lexer": "ipython3", 184 | "version": "3.10.11" 185 | }, 186 | "orig_nbformat": 4 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 2 190 | } 191 | -------------------------------------------------------------------------------- /transcription/transcription.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Setup\n", 8 | "\n", 9 | "Import libraries, set up model, and read input data." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from jiwer import wer\n", 19 | "import os\n", 20 | "import pandas as pd\n", 21 | "import whisper\n", 22 | "import zeno_client\n", 23 | "import dotenv\n", 24 | "import torch\n", 25 | "from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline\n", 26 | "\n", 27 | "import requests\n", 28 | "from io import BytesIO\n", 29 | "import wave\n", 30 | "import struct\n", 31 | "from tqdm import tqdm\n", 32 | "\n", 33 | "tqdm.pandas()\n", 34 | "dotenv.load_dotenv(override=True)" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n", 44 | "torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "df = pd.read_csv(\"speech_accent_archive.csv\")" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "df[\"data\"] = \"https://zenoml.s3.amazonaws.com/accents/\" + df[\"id\"]" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "# Define the function to get amplitude and length\n", 72 | "def get_amplitude_and_length_from_url(url):\n", 73 | " # Download the WAV file content from the URL\n", 74 | " try:\n", 75 | " response = requests.get(url)\n", 76 | " response.raise_for_status() # will raise an HTTPError if the HTTP request returned an unsuccessful status code\n", 77 | "\n", 78 | " # Use the BytesIO object as input for the wave module\n", 79 | " with wave.open(BytesIO(response.content), 'rb') as wav_file:\n", 80 | " frame_rate = wav_file.getframerate()\n", 81 | " n_frames = wav_file.getnframes()\n", 82 | " n_channels = wav_file.getnchannels()\n", 83 | " sample_width = wav_file.getsampwidth()\n", 84 | " duration = n_frames / frame_rate\n", 85 | "\n", 86 | " frames = wav_file.readframes(n_frames)\n", 87 | " if sample_width == 1: # 8-bit audio\n", 88 | " fmt = '{}B'.format(n_frames * n_channels)\n", 89 | " elif sample_width == 2: # 16-bit audio\n", 90 | " fmt = '{}h'.format(n_frames * n_channels)\n", 91 | " else:\n", 92 | " raise ValueError(\"Only 8-bit and 16-bit audio are supported.\")\n", 93 | " \n", 94 | " frame_amplitudes = struct.unpack(fmt, frames)\n", 95 | " max_amplitude = max(frame_amplitudes)\n", 96 | " max_amplitude_normalized = max_amplitude / float(int((2 ** (8 * sample_width)) / 2))\n", 97 | "\n", 98 | " return max_amplitude_normalized, duration\n", 99 | " except requests.RequestException as e:\n", 100 | " print(f\"Request failed: {e}\")\n", 101 | " return None, None\n", 102 | "\n", 103 | "# Define a wrapper function for apply to work row-wise\n", 104 | "def apply_get_amplitude_and_length(row):\n", 105 | " url = row['data'] # Assuming the URL is in the 'data' column\n", 106 | " amplitude, length = get_amplitude_and_length_from_url(url)\n", 107 | " return pd.Series({'amplitude': amplitude, 'length': length})\n", 108 | "\n", 109 | "# Usage with apply on the DataFrame\n", 110 | "# This will create two new columns 'amplitude' and 'length' in the DataFrame\n", 111 | "df[['amplitude', 'length']] = df.progress_apply(apply_get_amplitude_and_length, axis=1)"
112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "### Zeno Project\n", 119 | "\n", 120 | "We create a Zeno project with a WER metric and upload our base data." 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "client = zeno_client.ZenoClient(os.environ.get(\"ZENO_API_KEY\"))\n", 130 | "\n", 131 | "project = client.create_project(\n", 132 | " name=\"Transcription Whisper Distil\",\n", 133 | " view=\"audio-transcription\",\n", 134 | " description=\"Test of audio transcription\",\n", 135 | " metrics=[\n", 136 | " zeno_client.ZenoMetric(name=\"avg wer\", type=\"mean\", columns=[\"wer\"])\n", 137 | " ]\n", 138 | ")" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "project.upload_dataset(df, id_column=\"id\", data_column=\"data\", label_column=\"label\")" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "### Run Inference\n", 155 | "\n", 156 | "We now run inference on the base Whisper models and the Distil-Whisper models, caching the outputs." 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "models = [\"medium.en\", \"large-v1\", \"large-v2\", \"large-v3\", \"distil-medium.en\", \"distil-large-v2\"]" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "os.makedirs(\"cache\", exist_ok=True)\n", 175 | "\n", 176 | "df_systems = []\n", 177 | "for model_name in models:\n", 178 | " try:\n", 179 | " df_system = pd.read_parquet(f\"cache/{model_name}.parquet\")\n", 180 | " df_systems.append(df_system) # reuse outputs cached by an earlier run\n", 181 | " continue\n", 182 | " except FileNotFoundError:\n", 183 | " df_system = df[[\"id\", \"data\", \"label\"]].copy()\n", 184 | "\n", 185 | " if \"distil\" in model_name:\n", 186 | " model_id = \"distil-whisper/\" + model_name\n", 187 | " model = AutoModelForSpeechSeq2Seq.from_pretrained(\n", 188 | " model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True\n", 189 | " )\n", 190 | " model.to(device)\n", 191 | "\n", 192 | " processor = AutoProcessor.from_pretrained(model_id)\n", 193 | " pipe = pipeline(\n", 194 | " \"automatic-speech-recognition\",\n", 195 | " model=model,\n", 196 | " tokenizer=processor.tokenizer,\n", 197 | " feature_extractor=processor.feature_extractor,\n", 198 | " max_new_tokens=128,\n", 199 | " chunk_length_s=15,\n", 200 | " batch_size=16,\n", 201 | " torch_dtype=torch_dtype,\n", 202 | " device=device,\n", 203 | " )\n", 204 | " df_system[\"output\"] = df_system[\"data\"].progress_apply(lambda x: pipe(x)['text'])\n", 205 | " else:\n", 206 | " whisper_model = whisper.load_model(model_name)\n", 207 | " df_system[\"output\"] = df_system[\"data\"].progress_apply(lambda x: whisper_model.transcribe(x)[\"text\"])\n", 208 | "\n", 209 | " df_system[\"wer\"] = df_system.progress_apply(lambda x: wer(x[\"label\"], x[\"output\"]), axis=1)\n", 210 | " df_system.to_parquet(f\"cache/{model_name}.parquet\", index=False)\n", 211 | " df_systems.append(df_system)" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "### Upload Results\n", 219 | "\n", 220 | "Lastly, we upload our final results.\n",
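221 | "\n", 222 | "The `wer` column computed above is jiwer's word error rate: substitutions plus deletions plus insertions, divided by the number of words in the reference transcript. A quick example:\n", 223 | "\n", 224 | "```python\n", 225 | "from jiwer import wer\n", 226 | "\n", 227 | "wer(\"the cat sat\", \"the cat sit\") # one substitution over three reference words = 0.33...\n", 228 | "```"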
229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "for i, df_system in enumerate(df_systems):\n", 238 | " project.upload_system(df_system[[\"id\", \"output\", \"wer\"]], name=models[i], id_column=\"id\", output_column=\"output\")" 239 | ] 240 | } 241 | ], 242 | "metadata": { 243 | "kernelspec": { 244 | "display_name": "compare", 245 | "language": "python", 246 | "name": "python3" 247 | }, 248 | "language_info": { 249 | "codemirror_mode": { 250 | "name": "ipython", 251 | "version": 3 252 | }, 253 | "file_extension": ".py", 254 | "mimetype": "text/x-python", 255 | "name": "python", 256 | "nbconvert_exporter": "python", 257 | "pygments_lexer": "ipython3", 258 | "version": "3.9.16" 259 | } 260 | }, 261 | "nbformat": 4, 262 | "nbformat_minor": 2 263 | } 264 | -------------------------------------------------------------------------------- /open_llm_leaderboard/mmlu.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MMLU\n", 8 | "\n", 9 | "MMLU is a question-answering task where each question has four potential answers, one of which is correct. Questions come from 57 categories, including elementary mathematics, US history, computer science, law, and more." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from zeno_client import ZenoClient, ZenoMetric\n", 19 | "import datasets\n", 20 | "import pandas as pd\n", 21 | "import numpy as np\n", 22 | "import os\n", 23 | "\n", 24 | "API_KEY = os.environ[\"ZENO_API_KEY\"]" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "Feel free to change the list of models used.\n", 32 | "You can go to the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) to check what models are available.\n", 33 | "Some of them might not have associated data; you can check this by clicking on the little icon next to the model name.\n", 34 | "If you get a 404 after clicking, we won't be able to fetch the model data and this notebook will crash."
35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "models = [\"meta-llama/Llama-2-70b-hf\", \"mistralai/Mistral-7B-v0.1\", \"tiiuae/falcon-40b\", \"Riiid/sheep-duck-llama-2-70b-v1.1\", \"AIDC-ai-business/Marcoroni-70B-v1\", \"ICBU-NPU/FashionGPT-70B-V1.1\", \"adonlee/LLaMA_2_70B_LoRA\", \"uni-tianyan/Uni-TianYan\"]" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "# All 57 tasks in the MMLU dataset\n", 53 | "tasks = [\"hendrycksTest-abstract_algebra\", \"hendrycksTest-anatomy\", \"hendrycksTest-astronomy\", \"hendrycksTest-business_ethics\", \"hendrycksTest-clinical_knowledge\", \"hendrycksTest-college_biology\", \"hendrycksTest-college_chemistry\", \"hendrycksTest-college_computer_science\", \"hendrycksTest-college_mathematics\", \"hendrycksTest-college_medicine\", \"hendrycksTest-college_physics\", \"hendrycksTest-computer_security\", \"hendrycksTest-conceptual_physics\", \"hendrycksTest-econometrics\", \"hendrycksTest-electrical_engineering\", \"hendrycksTest-elementary_mathematics\", \"hendrycksTest-formal_logic\", \"hendrycksTest-global_facts\", \"hendrycksTest-high_school_biology\", \"hendrycksTest-high_school_chemistry\", \"hendrycksTest-high_school_computer_science\", \"hendrycksTest-high_school_european_history\", \"hendrycksTest-high_school_geography\", \"hendrycksTest-high_school_government_and_politics\", \"hendrycksTest-high_school_macroeconomics\", \"hendrycksTest-high_school_mathematics\", \"hendrycksTest-high_school_microeconomics\", \"hendrycksTest-high_school_physics\", \"hendrycksTest-high_school_psychology\", \"hendrycksTest-high_school_statistics\", \"hendrycksTest-high_school_us_history\", \"hendrycksTest-high_school_world_history\", \"hendrycksTest-human_aging\", \"hendrycksTest-human_sexuality\", \"hendrycksTest-international_law\", \"hendrycksTest-jurisprudence\", \"hendrycksTest-logical_fallacies\", \"hendrycksTest-machine_learning\", \"hendrycksTest-management\", \"hendrycksTest-marketing\", \"hendrycksTest-medical_genetics\", \"hendrycksTest-miscellaneous\", \"hendrycksTest-moral_disputes\", \"hendrycksTest-moral_scenarios\", \"hendrycksTest-nutrition\", \"hendrycksTest-philosophy\", \"hendrycksTest-prehistory\", \"hendrycksTest-professional_accounting\", \"hendrycksTest-professional_law\", \"hendrycksTest-professional_medicine\", \"hendrycksTest-professional_psychology\", \"hendrycksTest-public_relations\", \"hendrycksTest-security_studies\", \"hendrycksTest-sociology\", \"hendrycksTest-us_foreign_policy\", \"hendrycksTest-virology\", \"hendrycksTest-world_religions\"]" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "def get_data_for_task(model: str, task: str):\n", 63 | " data_path = \"details_\" + model.replace(\"/\", \"__\")\n", 64 | " return datasets.load_dataset(\n", 65 | " \"open-llm-leaderboard/\" + data_path,\n", 66 | " f\"harness_{task.replace('-', '_')}_5\",\n", 67 | " )" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "def get_data(model: str):\n", 77 | " frames = []\n", 78 | " for task in tasks:\n", 79 | " data = get_data_for_task(model, task)['latest'].to_pandas()\n", 80 | " data['task'] = task\n", 81 | " frames.append(data)\n", 82 | " df = pd.concat(frames, ignore_index=True)\n", 83 
| " return df" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "labels = [\"A\", \"B\", \"C\", \"D\"]\n", 93 | "\n", 94 | "def generate_dataset(df):\n", 95 | " df_lim = df[[\"example\", \"choices\", \"gold\", \"task\"]]\n", 96 | " df_lim.loc[:, \"data\"] = df_lim.apply(lambda x: x[\"example\"][:x[\"example\"].rfind('\\n')], axis=1)\n", 97 | " df_lim.loc[:, \"label\"] = df_lim.apply(lambda x: labels[x[\"gold\"]], axis=1)\n", 98 | " df_lim = df_lim.drop(columns=[\"example\", \"choices\", \"gold\"])\n", 99 | " df_lim[\"id\"] = df_lim.index \n", 100 | " return df_lim\n", 101 | "\n", 102 | "def generate_system(df):\n", 103 | " df_system = df[[\"predictions\", \"acc\", \"choices\"]]\n", 104 | " df_system[\"predictions\"] = df_system.apply(lambda x: labels[np.argmax(x['predictions'])] + \"\\n\\n\" + \"Pred.: \" + \", \".join(map(lambda y: str(round(y, 2)), x['predictions'])), axis=1)\n", 105 | " df_system[\"correct\"] = df_system.apply(lambda x: True if x['acc'] > 0 else False, axis=1)\n", 106 | " df_system = df_system.drop(columns=[\"acc\", \"choices\"])\n", 107 | " df_system[\"id\"] = df_system.index\n", 108 | " return df_system" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "Make sure you have your Zeno API key in your environment variables." 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "client = ZenoClient(API_KEY)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "Lets create a project to hold the data for the MMLU task." 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "proj = client.create_project(\n", 141 | " name=\"MMLU\", \n", 142 | " view=\"text-classification\", \n", 143 | " description=\"MMLU (https://arxiv.org/abs/2009.03300) tasks in the Open-LLM-Leaderboard.\",\n", 144 | " metrics=[\n", 145 | " ZenoMetric(name=\"accuracy\", type=\"mean\", columns=[\"correct\"])\n", 146 | " ]\n", 147 | ")" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "Let us now upload the data to the project we just created." 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "df = generate_dataset(get_data(models[0]))\n", 164 | "print(\"\\nYour dataset has {} rows\\n\".format(len(df)))\n", 165 | "num_rows = len(df)\n", 166 | "proj.upload_dataset(df, id_column=\"id\", label_column=\"label\", data_column=\"data\")" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "Finally, let us upload all the model outputs for the models we specified above." 
181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "for model in models:\n", 190 | " dataset = get_data(model)\n", 191 | " if len(dataset) != num_rows: # outputs must align row-for-row with the uploaded dataset\n", 192 | " print(\"Skipping {} because it has {} rows instead of {}\".format(model, len(dataset), num_rows))\n", 193 | " continue\n", 194 | " df_system = generate_system(dataset)\n", 195 | " proj.upload_system(df_system, name=model.replace('/', \"__\"), output_column=\"predictions\", id_column=\"id\")" 196 | ] 197 | } 198 | ], 199 | "metadata": { 200 | "kernelspec": { 201 | "display_name": "compare", 202 | "language": "python", 203 | "name": "python3" 204 | }, 205 | "language_info": { 206 | "codemirror_mode": { 207 | "name": "ipython", 208 | "version": 3 209 | }, 210 | "file_extension": ".py", 211 | "mimetype": "text/x-python", 212 | "name": "python", 213 | "nbconvert_exporter": "python", 214 | "pygments_lexer": "ipython3", 215 | "version": "3.10.11" 216 | }, 217 | "orig_nbformat": 4 218 | }, 219 | "nbformat": 4, 220 | "nbformat_minor": 2 221 | } 222 | --------------------------------------------------------------------------------