├── requirements.txt
├── .env.example
├── LICENSE
├── README.md
├── .gitignore
└── run_evals.ipynb

/requirements.txt:
--------------------------------------------------------------------------------
1 | weave
2 | openai
3 | 
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
1 | WANDB_API_KEY=#add your key from https://wandb.ai/authorize
2 | 
3 | #Optional
4 | OPENROUTER_API_KEY=#add your key from https://openrouter.ai
5 | 
6 | 
7 | GROQ_API_KEY=#add your key from https://groq.com
8 | 
9 | TOGETHER_API_KEY=#add your key from https://together.xyz
10 | 
11 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2024 altryne
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![CleanShot 2024-07-24 at 15 42 23@2x](https://github.com/user-attachments/assets/0c741cc7-4523-428c-a421-6c0c8dfb1425)
2 | 
3 | # compare-llama-providers
4 | This is a quick test to see whether the new Llama 3.1 serving providers all return the same tokens, using Weights & Biases Weave Evaluations: [https://wandb.me/weave](https://wandb.me/weave)
5 | 
6 | 
7 | 
8 | ## Installation
9 | 
10 | To get started with this project, you need to install the required dependencies. Follow the steps below:
11 | 
12 | 1. **Create a virtual environment** (optional but recommended):
13 | ```bash
14 | python3 -m venv venv
15 | source venv/bin/activate  # On Windows use `venv\Scripts\activate`
16 | ```
17 | 
18 | 2. **Install the requirements**:
19 | ```bash
20 | pip install -r requirements.txt
21 | ```
22 | 
23 | 
24 | 3. **Set up environment variables**:
25 | Copy the `.env.example` file to a new file named `.env`:
26 | ```bash
27 | cp .env.example .env
28 | ```
29 | 
30 | Then, open the `.env` file and add your secrets:
31 | - Add your Weights & Biases API key from https://wandb.ai/authorize to `WANDB_API_KEY`
32 | - Add your OpenRouter API key from https://openrouter.ai to `OPENROUTER_API_KEY`
33 | - Add your Groq API key from https://groq.com to `GROQ_API_KEY`
34 | - Add your Together API key from https://together.xyz to `TOGETHER_API_KEY`
35 | 
36 | Your `.env` file should look something like this:
37 | ```
38 | WANDB_API_KEY=your_wandb_api_key_here
39 | OPENROUTER_API_KEY=your_openrouter_api_key_here
40 | GROQ_API_KEY=your_groq_api_key_here
41 | TOGETHER_API_KEY=your_together_api_key_here
42 | ```
43 | 
44 | Note: Make sure to keep your `.env` file private and never commit it to version control.
45 | 
46 | 4. **Run the Python notebook `run_evals.ipynb`**
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | .env
6 | 
7 | # C extensions
8 | *.so
9 | 
10 | # Distribution / packaging
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 | 
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 | 
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 | 
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 | cover/
54 | 
55 | # Translations
56 | *.mo
57 | *.pot
58 | 
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 | db.sqlite3-journal
64 | 
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 | 
69 | # Scrapy stuff:
70 | .scrapy
71 | 
72 | # Sphinx documentation
73 | docs/_build/
74 | 
75 | # PyBuilder
76 | .pybuilder/
77 | target/
78 | 
79 | # Jupyter Notebook
80 | .ipynb_checkpoints
81 | 
82 | # IPython
83 | profile_default/
84 | ipython_config.py
85 | 
86 | # pyenv
87 | # For a library or package, you might want to ignore these files since the code is
88 | # intended to run in multiple environments; otherwise, check them in:
89 | # .python-version
90 | 
91 | # pipenv
92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
95 | # install all needed dependencies.
96 | #Pipfile.lock
97 | 
98 | # poetry
99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
100 | # This is especially recommended for binary packages to ensure reproducibility, and is more
101 | # commonly ignored for libraries.
102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
103 | #poetry.lock
104 | 
105 | # pdm
106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
107 | #pdm.lock
108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
109 | # in version control.
110 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
111 | .pdm.toml
112 | .pdm-python
113 | .pdm-build/
114 | 
115 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
116 | __pypackages__/
117 | 
118 | # Celery stuff
119 | celerybeat-schedule
120 | celerybeat.pid
121 | 
122 | # SageMath parsed files
123 | *.sage.py
124 | 
125 | # Environments
126 | .env
127 | .venv
128 | env/
129 | venv/
130 | ENV/
131 | env.bak/
132 | venv.bak/
133 | 
134 | # Spyder project settings
135 | .spyderproject
136 | .spyproject
137 | 
138 | # Rope project settings
139 | .ropeproject
140 | 
141 | # mkdocs documentation
142 | /site
143 | 
144 | # mypy
145 | .mypy_cache/
146 | .dmypy.json
147 | dmypy.json
148 | 
149 | # Pyre type checker
150 | .pyre/
151 | 
152 | # pytype static type analyzer
153 | .pytype/
154 | 
155 | # Cython debug symbols
156 | cython_debug/
157 | 
158 | # PyCharm
159 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
160 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
161 | # and can be added to the global gitignore or merged into this file. For a more nuclear
162 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
163 | #.idea/
164 | 
--------------------------------------------------------------------------------
/run_evals.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "# Compare Llama Providers\n",
8 |     "\n",
9 |     "Run evaluations on a few prompts for Llama 3.1 70B across several providers and compare results to a baseline."
10 |    ]
11 |   },
12 |   {
13 |    "cell_type": "code",
14 |    "execution_count": null,
15 |    "metadata": {},
16 |    "outputs": [],
17 |    "source": [
18 |     "# Install the required packages.\n",
19 |     "print('⏳ Installing packages')\n",
20 |     "%pip install -q weave set-env-colab-kaggle-dotenv tqdm ipywidgets requests groq together\n",
21 |     "print('✅ Packages installed')"
22 |    ]
23 |   },
24 |   {
25 |    "cell_type": "code",
26 |    "execution_count": null,
27 |    "metadata": {},
28 |    "outputs": [],
29 |    "source": [
30 |     "from tqdm.notebook import tqdm_notebook as tqdm\n",
31 |     "from set_env import set_env\n",
32 |     "from openai import OpenAI\n",
33 |     "from groq import Groq\n",
34 |     "from together import Together\n",
35 |     "import weave\n",
36 |     "import os\n",
37 |     "import json\n",
38 |     "import requests\n",
39 |     "set_env(\"OPENROUTER_API_KEY\")\n",
40 |     "set_env(\"GROQ_API_KEY\")\n",
41 |     "set_env(\"WANDB_API_KEY\")\n",
42 |     "set_env(\"TOGETHER_API_KEY\")\n",
43 |     "groqclient = Groq(\n",
44 |     "    api_key=os.environ.get(\"GROQ_API_KEY\"),\n",
45 |     ")\n",
46 |     "client = Together(api_key=os.environ.get('TOGETHER_API_KEY'))"
47 |    ]
48 |   },
49 |   {
50 |    "cell_type": "code",
51 |    "execution_count": null,
52 |    "metadata": {},
53 |    "outputs": [],
54 |    "source": [
55 |     "# Initialize Weave project\n",
56 |     "weave.init('compare-llamas')\n",
57 |     "\n",
58 |     "\n",
59 |     "class LlamaModel(weave.Model):\n",
60 |     "    provider: str\n",
61 |     "\n",
62 |     "    @weave.op()\n",
63 |     "    def predict(self, prompt: str) -> str:\n",
64 |     "        data = {\n",
65 |     "            \"model\": \"meta-llama/llama-3.1-70b-instruct\",\n",
66 |     "            \"messages\": [\n",
67 |     "                {\"role\": \"user\", \"content\": prompt}\n",
68 |     "            ],\n",
69 |     "            \"temperature\": 0,\n",
70 |     "            \"provider\": {\n",
71 |     "                \"order\": [self.provider],\n",
72 |     "                \"allow_fallbacks\": False\n",
73 |     "            }\n",
74 |     "        }\n",
75 |     "\n",
76 |     "        response = self.make_openrouter_request(data)\n",
77 |     "        return response['choices'][0]['message']['content']\n",
78 |     "\n",
79 |     "    @weave.op()\n",
80 |     "    def make_openrouter_request(self, data):\n",
81 |     "        try:\n",
82 |     "            response = requests.post(\n",
83 |     "                \"https://openrouter.ai/api/v1/chat/completions\",\n",
84 |     "                headers = {\n",
85 |     "                    \"Authorization\": f\"Bearer {os.environ['OPENROUTER_API_KEY']}\",\n",
86 |     "                    \"Content-Type\": \"application/json\"\n",
87 |     "                },\n",
88 |     "                json=data\n",
89 |     "            )\n",
90 |     "            response.raise_for_status()  # Raises an HTTPError for bad responses\n",
91 |     "            return response.json()\n",
92 |     "        except requests.RequestException as e:\n",
93 |     "            raise Exception(f\"API request failed: {str(e)}\")\n",
94 |     "\n",
95 |     "class GroqModel(weave.Model):\n",
96 |     "\n",
97 |     "    @weave.op()\n",
98 |     "    def predict(self, prompt: str) -> str:\n",
99 |     "        response = groqclient.chat.completions.create(\n",
100 |     "            model='llama-3.1-70b-versatile',\n",
101 |     "            messages=[\n",
102 |     "                {\n",
103 |     "                    \"role\": \"user\",\n",
104 |     "                    \"content\": prompt\n",
105 |     "                }\n",
106 |     "            ],\n",
107 |     "            temperature=0.0,\n",
108 |     "            seed=123123\n",
109 |     "        )\n",
110 |     "        return response.choices[0].message.content\n",
111 |     "\n",
112 |     "class TogetherModel(weave.Model):\n",
113 |     "    @weave.op()\n",
114 |     "    def predict(self, prompt: str) -> str:\n",
115 |     "        response = self.make_together_request(prompt)\n",
116 |     "        return response.choices[0].message.content\n",
117 |     "\n",
118 |     "    @weave.op()\n",
119 |     "    def make_together_request(self, prompt):\n",
120 |     "        response = client.chat.completions.create(\n",
121 |     "            model=\"meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo\",\n",
122 |     "            messages=[\n",
123 |     "                {\n",
124 |     "                    \"role\": \"user\",\n",
125 |     "                    \"content\": prompt\n",
126 |     "                }\n",
127 |     "            ],\n",
128 |     "            temperature=0,\n",
129 |     "        )\n",
130 |     "        return response\n",
131 |     "\n",
132 |     "# Create Llamas per provider: OctoAI, NovitaAI, DeepInfra, Together, Fireworks\n",
133 |     "octoai_llama = LlamaModel(provider='OctoAI', name='OctoAI_LLaMa3.1_70B')\n",
134 |     "novitaai_llama = LlamaModel(provider='Novita', name='NovitaAI_LLaMa3.1_70B')\n",
135 |     "deepinfra_llama = LlamaModel(provider='DeepInfra', name='DeepInfra_LLaMa3.1_70B')\n",
136 |     "\n",
137 |     "fireworks_llama = LlamaModel(provider='Fireworks', name='Fireworks_LLaMa3.1_70B')\n",
138 |     "groq_llama = GroqModel(name='Groq_LLaMa3.1_70B')\n",
139 |     "together_llama = TogetherModel(name='Together_LLaMa3.1_70B')\n",
140 |     "\n",
141 |     "print(\"✅ Weave models created\")"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": null,
147 |    "metadata": {},
148 |    "outputs": [],
149 |    "source": [
150 |     "# Build a dataset of quirky prompts and rubrics for grading their answers\n",
151 |     "from weave import Dataset\n",
152 |     "\n",
153 |     "quirky_prompts = Dataset(\n",
154 |     "    name=\"my_llama_quirky_prompts\",\n",
155 |     "    rows=[\n",
156 |     "        {\n",
157 |     "            \"question\": \"Give me 10 sentences that end in the word \\\"apple\\\"\",\n",
158 |     "            \"rubric\": \"all sentences must end with the word apple\"\n",
159 |     "        },\n",
160 |     "        {\n",
161 |     "            \"question\": \"Answer with the number of legs about the following statement: The fox lost a leg, but then magically grew back the leg he lost and a mysterious extra leg on top of that\",\n",
162 |     "            \"rubric\": \"Answer must be 5 or five\"\n",
163 |     "        },\n",
164 |     "        {\n",
165 |     "            \"question\": \"Yam (a boy) has 4 sisters. Each sister has 3 brothers. How many brothers does Yam have? Let's think step by step.\",\n",
166 |     "            \"rubric\": \"Answer must indicate that Yam has 2 brothers\"\n",
167 |     "        },\n",
168 |     "        {\n",
169 |     "            \"question\": \"You have five apples today, you ate two apples yesterday so how many apples do you have today? Provide a logical answer.\",\n",
170 |     "            \"rubric\": \"Answer must be five and explain that yesterday's actions have no bearing on today's apple quantity\"\n",
171 |     "        },\n",
172 |     "        {\n",
173 |     "            \"question\": \"Which number is bigger: 9.11 or 9.9?\",\n",
174 |     "            \"rubric\": \"Answer should conclude that 9.9 is bigger\"\n",
175 |     "        },\n",
176 |     "        {\n",
177 |     "            \"question\": \"If I hang 5 shirts outside and it takes them 5 hours to dry, how long would it take to dry 30 shirts\",\n",
178 |     "            \"rubric\": \"Answer must state that it would take the same amount of time\"\n",
179 |     "        },\n",
180 |     "        {\n",
181 |     "            \"question\": \"There are three sisters in a room. Anna is reading a book. Alice is playing a match of chess against someone in the room. What is the third sister, Amanda, doing?\",\n",
182 |     "            \"rubric\": \"Playing chess with Alice\"\n",
183 |     "        },\n",
184 |     "        {\n",
185 |     "            \"question\": \"\"\"Determine all triples (x, y, z) of real numbers that are solutions to the following\n",
186 |     "system of equations:\n",
187 |     "log9 x + log9 y + log3 z = 2\n",
188 |     "log16 x + log4 y + log16 z = 1\n",
189 |     "log5 x + log25 y + log25 z = 0\n",
190 |     "\"\"\",\n",
191 |     "            \"rubric\": \"IDK the answer to this one\"\n",
192 |     "        }\n",
193 |     "    ]\n",
194 |     ")\n"
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "code",
199 |    "execution_count": null,
200 |    "metadata": {},
201 |    "outputs": [],
202 |    "source": [
203 |     "# Select the instantiated models to evaluate\n",
204 |     "# models = [octoai_llama, together_llama]\n",
205 |     "models = [deepinfra_llama, fireworks_llama, groq_llama]\n",
206 |     "\n",
207 |     "\n",
208 |     "# Define our scoring functions\n",
209 |     "@weave.op()\n",
210 |     "def has_response(rubric: str, model_output: dict) -> dict:\n",
211 |     "    return {'has_response': model_output is not None}\n",
212 |     "\n",
213 |     "# Define the preprocess_model_input function\n",
214 |     "def preprocess_model_input(row):\n",
215 |     "    return {'prompt': row['question']}\n",
216 |     "\n",
217 |     "\n",
218 |     "# Define the evaluation\n",
219 |     "evaluation = weave.Evaluation(\n",
220 |     "    name='quirky_prompts_eval',\n",
221 |     "    dataset=quirky_prompts,\n",
222 |     "    trials=1,\n",
223 |     "    scorers=[\n",
224 |     "        has_response\n",
225 |     "    ],\n",
226 |     "    preprocess_model_input=preprocess_model_input\n",
227 |     ")\n",
228 |     "\n",
229 |     "# Run the evaluation for each model\n",
230 |     "results = {}\n",
231 |     "for model in models:\n",
232 |     "    print(f\"Evaluating {model.name}...\")\n",
233 |     "    result = await evaluation.evaluate(model)\n",
234 |     "    results[model.name] = result\n"
235 |    ]
236 |   }
237 |  ],
238 |  "metadata": {
239 |   "kernelspec": {
240 |    "display_name": "comparellamas",
241 |    "language": "python",
242 |    "name": "python3"
243 |   },
244 |   "language_info": {
245 |    "codemirror_mode": {
246 |     "name": "ipython",
247 |     "version": 3
248 |    },
249 |    "file_extension": ".py",
250 |    "mimetype": "text/x-python",
251 |    "name": "python",
252 |    "nbconvert_exporter": "python",
253 |    "pygments_lexer": "ipython3",
254 |    "version": "3.11.8"
255 |   }
256 |  },
257 |  "nbformat": 4,
258 |  "nbformat_minor": 2
259 | }
260 | 
--------------------------------------------------------------------------------