├── .devcontainer ├── devcontainer.json ├── postCreateCommand.sh └── postStartCommand.sh ├── .github └── workflows │ ├── python-ci.yml │ └── python-publish.yml ├── .gitignore ├── .vscode ├── launch.json └── settings.json ├── CONTRIBUTING.md ├── README.md ├── llm_github_models.py ├── pyproject.toml ├── tests ├── files │ ├── kick.wav │ └── salmon.jpeg ├── test_llm_github_embeddings.py ├── test_llm_github_models.py └── test_tool_support.py └── tools ├── README.md ├── download_models_json.py ├── models.fragment.md ├── models.json └── parse_models_json.py /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Python 3.13", 3 | "image": "mcr.microsoft.com/devcontainers/python:3.13-bullseye", 4 | "customizations": { 5 | "vscode": { 6 | "settings": { 7 | "python.defaultInterpreterPath": "/usr/local/bin/python", 8 | "python.linting.enabled": true 9 | }, 10 | "extensions": [ 11 | "ms-python.python", 12 | "ms-python.vscode-pylance", 13 | "ms-python.vscode-python-envs", 14 | "charliermarsh.ruff" 15 | ] 16 | } 17 | }, 18 | "postCreateCommand": ".devcontainer/postCreateCommand.sh", 19 | "postStartCommand": ".devcontainer/postStartCommand.sh" 20 | } -------------------------------------------------------------------------------- /.devcontainer/postCreateCommand.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | pip install . 6 | 7 | llm install -e . -------------------------------------------------------------------------------- /.devcontainer/postStartCommand.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | llm keys set github --value $GITHUB_TOKEN -------------------------------------------------------------------------------- /.github/workflows/python-ci.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | test: 10 | runs-on: ubuntu-latest 11 | permissions: 12 | models: read 13 | strategy: 14 | matrix: 15 | python-version: [3.9, "3.10", 3.11, 3.12, 3.13] 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install ".[test]" 27 | - name: Run ruff format 28 | run: | 29 | ruff check 30 | - name: Run pyright 31 | run: | 32 | pyright llm_github_models.py 33 | - name: Run tests 34 | run: | 35 | pytest 36 | env: 37 | GITHUB_MODELS_KEY: ${{ secrets.GITHUB_TOKEN }} 38 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package to PyPI when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | release-build: 20 | runs-on: ubuntu-latest 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | 25 | - uses: actions/setup-python@v5 26 | with: 27 | python-version: "3.x" 28 | 29 | - name: Build release distributions 30 | run: | 31 | # NOTE: put your own distribution build steps here. 32 | python -m pip install build 33 | python -m build 34 | 35 | - name: Upload distributions 36 | uses: actions/upload-artifact@v4 37 | with: 38 | name: release-dists 39 | path: dist/ 40 | 41 | pypi-publish: 42 | runs-on: ubuntu-latest 43 | needs: 44 | - release-build 45 | permissions: 46 | # IMPORTANT: this permission is mandatory for trusted publishing 47 | id-token: write 48 | 49 | # Dedicated environments with protections for publishing are strongly recommended. 50 | # For more information, see: https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#deployment-protection-rules 51 | environment: 52 | name: pypi 53 | # OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status: 54 | # url: https://pypi.org/p/YOURPROJECT 55 | # 56 | # ALTERNATIVE: if your GitHub Release name is the PyPI project version string 57 | # ALTERNATIVE: exactly, uncomment the following line instead: 58 | # url: https://pypi.org/project/YOURPROJECT/${{ github.event.release.name }} 59 | 60 | steps: 61 | - name: Retrieve release distributions 62 | uses: actions/download-artifact@v4 63 | with: 64 | name: release-dists 65 | path: dist/ 66 | 67 | - name: Publish release distributions to PyPI 68 | uses: pypa/gh-action-pypi-publish@release/v1 69 | with: 70 | packages-dir: dist/ 71 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | [.]venv/ 2 | env/ 3 | *.py[co] 4 | __pycache__/ 5 | *.egg-info/ 6 | build/ -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "configurations": [ 3 | { 4 | "name": "GPT-4o with audio", 5 | "type": "debugpy", 6 | "request": "launch", 7 | "module": "llm", 8 | "args": [ 9 | "what is happening in this audio?", 10 | "-m", 11 | "github/gpt-4o", 12 | "-a", 13 | "tests/files/kick.wav" 14 | ], 15 | }, 16 | { 17 | "name": "GPT-4o with function", 18 | "type": "debugpy", 19 | "request": "launch", 20 | "module": "llm", 21 | // This needs to be string instead of an array so the the whitespace is preserved 22 | "args": "'what is 34234 * 213345?' 
-m github/gpt-4o --functions 'def multiply(x: int, y: int) -> int:\n \"\"\"Multiply two numbers.\"\"\"\n return x * y\n' --td" 23 | } 24 | ] 25 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.testing.pytestArgs": [ 3 | "tests" 4 | ], 5 | "python.testing.unittestEnabled": false, 6 | "python.testing.pytestEnabled": true, 7 | "[python]": { 8 | "editor.formatOnSave": true, 9 | "editor.codeActionsOnSave": { 10 | "source.fixAll": "explicit", 11 | "source.unusedImports": "explicit", 12 | "source.organizeImports": "explicit" 13 | }, 14 | "editor.defaultFormatter": "charliermarsh.ruff" 15 | } 16 | } -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # CONTRIBUTING 2 | 3 | ## Required tools 4 | 5 | - [Python 3.9+](https://docs.python.org/3/using/index.html) 6 | 7 | ## Setup 8 | 9 | 1. Set up a virtual environment at `//.venv` and activate it 10 | (see [the docs](https://docs.python.org/3/library/venv.html) for more information) 11 | 1. `llm install -e '.[test]'` to install all dependencies 12 | 13 | ## Running tests 14 | 15 | 1. `pytest` to run tests 16 | 17 | ## Code formatting and type checks 18 | 19 | Pull-requests will only pass in CI/CD if the following are met: 20 | 21 | 1. `ruff check` 22 | 2. `pyright llm_github_models.py` 23 | 3. `ruff format --check` 24 | 25 | Run `ruff check --fix` to resort imports before submitting PRs, or commit another change. Run `ruff format` to bring the code file up to our style guidelines. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GitHub Models Plugin for LLM 2 | [![PyPI](https://img.shields.io/pypi/v/llm-github-models.svg)](https://pypi.org/project/llm-github-models/) 3 | [![Changelog](https://img.shields.io/github/v/release/tonybaloney/llm-github-models?include_prereleases&label=changelog)](https://github.com/tonybaloney/llm-github-models/releases) 4 | 5 | This is a plugin for [llm](https://llm.datasette.io) that uses [GitHub Models](https://github.blog/news-insights/product-news/introducing-github-models/) via the Azure AI Inference SDK. GitHub Models is available to all GitHub users and offers **free** usage of many AI LLMs. 6 | 7 | ## Features 8 | 9 | - Support for all >30 models, including GPT-4o, 4.1, o3, DeepSeek-R1, Llama3.x and more 10 | - Support for [schemas](https://llm.datasette.io/en/stable/schemas.html) 11 | - Output token usage 12 | - Support for [Embedding Models](https://llm.datasette.io/en/stable/embeddings/index.html) 13 | - Async and streaming outputs (model dependent) 14 | - Support for model attachments 15 | - Support for [tools](https://llm.datasette.io/en/stable/tools.html) 16 | 17 | ## Installation 18 | 19 | ```default 20 | $ llm install llm-github-models 21 | ``` 22 | 23 | or `pip install llm-github-models` 24 | 25 | ## Usage 26 | 27 | To set the API key, use the `llm keys set github` command or use the `GITHUB_MODELS_KEY` environment variable. 28 | 29 | To get an API key, create a personal access token (PAT) inside [GitHub Settings](https://github.com/settings/tokens). 
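
Once a key is configured, the models can also be called from Python through llm's API. A minimal sketch (the key is resolved from the stored `github` key or the `GITHUB_MODELS_KEY` environment variable; the prompt is only illustrative):

```python
import llm

# Models registered by this plugin use the "github/" prefix
model = llm.get_model("github/gpt-4.1-mini")

response = model.prompt("Top facts about cheese")
print(response.text())
```
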
30 | 31 | Learn about [rate limits here](https://docs.github.com/github-models/prototyping-with-ai-models#rate-limits) 32 | 33 | All model names are affixed with `github/` to distinguish the OpenAI ones from the builtin models. 34 | 35 | ## Example 36 | 37 | ```default 38 | $ llm prompt 'top facts about cheese' -m github/gpt-4.1-mini 39 | Sure! Here are some top facts about cheese: 40 | 41 | 1. **Ancient Origins**: Cheese is one of the oldest man-made foods, with evidence of cheese-making dating back over 7,000 years. 42 | 43 | 2. **Variety**: There are over 1,800 distinct types of cheese worldwide, varying by texture, flavor, milk source, and production methods. 44 | ``` 45 | 46 | ### Image attachments 47 | 48 | Multi-modal vision models can accept image attachments using the [LLM attachments](https://llm.datasette.io/en/stable/usage.html#attachments) options: 49 | 50 | ```bash 51 | llm -m github/Llama-3.2-11B-Vision-Instruct "Describe this image" -a https://static.simonwillison.net/static/2024/pelicans.jpg 52 | ``` 53 | 54 | Produces 55 | ```bash 56 | This image depicts a dense gathering of pelicans, with the largest birds situated in the center, showcasing their light brown plumage and long, pointed beaks. The pelicans are standing on a rocky shoreline, with a serene body of water behind them, characterized by its pale blue hue and gentle ripples. In the background, a dark, rocky cliff rises, adding depth to the scene. 57 | 58 | The overall atmosphere of the image exudes tranquility, with the pelicans seemingly engaging in a social gathering or feeding activity. The photograph's clarity and focus on the pelicans' behavior evoke a sense of observation and appreciation for the natural world. 59 | ``` 60 | 61 | ## Supported Models 62 | 63 | ### Chat Models 64 | 65 | | Model Name | Streaming | Schemas | Tools | Input Modalities | Output Modalities | 66 | |------------|-----------|---------|-------|------------------|-------------------| 67 | | AI21-Jamba-1.5-Large | ✅ | ❌ | ❌ | text | text | 68 | | AI21-Jamba-1.5-Mini | ✅ | ❌ | ❌ | text | text | 69 | | Codestral-2501 | ✅ | ❌ | ✅ | text | text | 70 | | Cohere-command-r | ✅ | ❌ | ✅ | text | text | 71 | | Cohere-command-r-08-2024 | ✅ | ❌ | ✅ | text | text | 72 | | Cohere-command-r-plus | ✅ | ❌ | ✅ | text | text | 73 | | Cohere-command-r-plus-08-2024 | ✅ | ❌ | ✅ | text | text | 74 | | DeepSeek-R1 | ✅ | ❌ | ❌ | text | text | 75 | | DeepSeek-V3 | ✅ | ❌ | ❌ | text | text | 76 | | DeepSeek-V3-0324 | ✅ | ❌ | ❌ | text | text | 77 | | Llama-3.2-11B-Vision-Instruct | ✅ | ❌ | ❌ | text, image, audio | text | 78 | | Llama-3.2-90B-Vision-Instruct | ✅ | ❌ | ❌ | text, image, audio | text | 79 | | Llama-3.3-70B-Instruct | ✅ | ❌ | ❌ | text | text | 80 | | Llama-4-Maverick-17B-128E-Instruct-FP8 | ✅ | ❌ | ❌ | text, image | text | 81 | | Llama-4-Scout-17B-16E-Instruct | ✅ | ❌ | ❌ | text, image | text | 82 | | MAI-DS-R1 | ✅ | ❌ | ❌ | text | text | 83 | | Meta-Llama-3-70B-Instruct | ✅ | ❌ | ❌ | text | text | 84 | | Meta-Llama-3-8B-Instruct | ✅ | ❌ | ❌ | text | text | 85 | | Meta-Llama-3.1-405B-Instruct | ✅ | ❌ | ❌ | text | text | 86 | | Meta-Llama-3.1-70B-Instruct | ✅ | ❌ | ❌ | text | text | 87 | | Meta-Llama-3.1-8B-Instruct | ✅ | ❌ | ❌ | text | text | 88 | | Ministral-3B | ✅ | ❌ | ✅ | text | text | 89 | | Mistral-Large-2411 | ✅ | ❌ | ✅ | text | text | 90 | | Mistral-Nemo | ✅ | ❌ | ✅ | text | text | 91 | | Mistral-large | ✅ | ❌ | ✅ | text | text | 92 | | Mistral-large-2407 | ✅ | ❌ | ✅ | text | text | 93 | | Mistral-small | ✅ | ❌ | ✅ | text | text | 94 | | 
Phi-3-medium-128k-instruct | ✅ | ❌ | ❌ | text | text | 95 | | Phi-3-medium-4k-instruct | ✅ | ❌ | ❌ | text | text | 96 | | Phi-3-mini-128k-instruct | ✅ | ❌ | ❌ | text | text | 97 | | Phi-3-mini-4k-instruct | ✅ | ❌ | ❌ | text | text | 98 | | Phi-3-small-128k-instruct | ✅ | ❌ | ❌ | text | text | 99 | | Phi-3-small-8k-instruct | ✅ | ❌ | ❌ | text | text | 100 | | Phi-3.5-MoE-instruct | ✅ | ❌ | ❌ | text | text | 101 | | Phi-3.5-mini-instruct | ✅ | ❌ | ❌ | text | text | 102 | | Phi-3.5-vision-instruct | ✅ | ❌ | ❌ | text, image | text | 103 | | Phi-4 | ✅ | ❌ | ❌ | text | text | 104 | | Phi-4-mini-instruct | ✅ | ❌ | ❌ | text | text | 105 | | Phi-4-mini-reasoning | ✅ | ❌ | ❌ | text | text | 106 | | Phi-4-multimodal-instruct | ✅ | ❌ | ❌ | audio, image, text | text | 107 | | Phi-4-reasoning | ✅ | ❌ | ❌ | text | text | 108 | | cohere-command-a | ✅ | ❌ | ✅ | text | text | 109 | | gpt-4.1 | ✅ | ✅ | ✅ | text, image | text | 110 | | gpt-4.1-mini | ✅ | ✅ | ✅ | text, image | text | 111 | | gpt-4.1-nano | ✅ | ✅ | ✅ | text, image | text | 112 | | gpt-4o | ✅ | ✅ | ✅ | text, image, audio | text | 113 | | gpt-4o-mini | ✅ | ✅ | ✅ | text, image, audio | text | 114 | | grok-3 | ✅ | ❌ | ✅ | text | text | 115 | | grok-3-mini | ✅ | ❌ | ✅ | text | text | 116 | | jais-30b-chat | ✅ | ❌ | ❌ | text | text | 117 | | mistral-medium-2505 | ✅ | ❌ | ✅ | text, image | text | 118 | | mistral-small-2503 | ✅ | ❌ | ✅ | text, image | text | 119 | | o1 | ❌ | ✅ | ✅ | text, image | text | 120 | | o1-mini | ❌ | ❌ | ❌ | text | text | 121 | | o1-preview | ❌ | ❌ | ❌ | text | text | 122 | | o3 | ✅ | ❌ | ✅ | text, image | text | 123 | | o3-mini | ❌ | ✅ | ✅ | text | text | 124 | | o4-mini | ✅ | ❌ | ✅ | text, image | text | 125 | 126 | ### AI21 Jamba 1.5 Large 127 | 128 | Usage: `llm -m github/AI21-Jamba-1.5-Large` 129 | 130 | **Publisher:** AI21 Labs 131 | 132 | **Description:** A 398B parameters (94B active) multilingual model, offering a 256K long context window, function calling, structured output, and grounded generation. 133 | 134 | ### AI21 Jamba 1.5 Mini 135 | 136 | Usage: `llm -m github/AI21-Jamba-1.5-Mini` 137 | 138 | **Publisher:** AI21 Labs 139 | 140 | **Description:** A 52B parameters (12B active) multilingual model, offering a 256K long context window, function calling, structured output, and grounded generation. 141 | 142 | ### Codestral 25.01 143 | 144 | Usage: `llm -m github/Codestral-2501` 145 | 146 | **Publisher:** Mistral AI 147 | 148 | **Description:** Codestral 25.01 by Mistral AI is designed for code generation, supporting 80+ programming languages, and optimized for tasks like code completion and fill-in-the-middle 149 | 150 | ### Cohere Command R 151 | 152 | Usage: `llm -m github/Cohere-command-r` 153 | 154 | **Publisher:** Cohere 155 | 156 | **Description:** Command R is a scalable generative model targeting RAG and Tool Use to enable production-scale AI for enterprise. 157 | 158 | ### Cohere Command R 08-2024 159 | 160 | Usage: `llm -m github/Cohere-command-r-08-2024` 161 | 162 | **Publisher:** Cohere 163 | 164 | **Description:** Command R is a scalable generative model targeting RAG and Tool Use to enable production-scale AI for enterprise. 165 | 166 | ### Cohere Command R+ 167 | 168 | Usage: `llm -m github/Cohere-command-r-plus` 169 | 170 | **Publisher:** Cohere 171 | 172 | **Description:** Command R+ is a state-of-the-art RAG-optimized model designed to tackle enterprise-grade workloads. 
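
Models with a ✅ in the *Schemas* column of the table above (the GPT-4.1/GPT-4o families, o1 and o3-mini) can return structured output. A minimal sketch using llm's schema support with a Pydantic model (`CheeseFact` is just an illustrative schema):

```python
import json

import llm
from pydantic import BaseModel


class CheeseFact(BaseModel):
    name: str
    country_of_origin: str


model = llm.get_model("github/gpt-4.1-mini")
# The plugin converts the Pydantic model (or a plain dict schema)
# into a JSON schema response format for the request
response = model.prompt("Invent an interesting cheese", schema=CheeseFact)
print(json.loads(response.text()))
```
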
173 | 174 | ### Cohere Command R+ 08-2024 175 | 176 | Usage: `llm -m github/Cohere-command-r-plus-08-2024` 177 | 178 | **Publisher:** Cohere 179 | 180 | **Description:** Command R+ is a state-of-the-art RAG-optimized model designed to tackle enterprise-grade workloads. 181 | 182 | ### Cohere Embed v3 English 183 | 184 | Usage: `llm -m github/Cohere-embed-v3-english` 185 | 186 | **Publisher:** Cohere 187 | 188 | **Description:** Cohere Embed English is the market's leading text representation model used for semantic search, retrieval-augmented generation (RAG), classification, and clustering. 189 | 190 | ### Cohere Embed v3 Multilingual 191 | 192 | Usage: `llm -m github/Cohere-embed-v3-multilingual` 193 | 194 | **Publisher:** Cohere 195 | 196 | **Description:** Cohere Embed Multilingual is the market's leading text representation model used for semantic search, retrieval-augmented generation (RAG), classification, and clustering. 197 | 198 | ### DeepSeek-R1 199 | 200 | Usage: `llm -m github/DeepSeek-R1` 201 | 202 | **Publisher:** DeepSeek 203 | 204 | **Description:** DeepSeek-R1 excels at reasoning tasks using a step-by-step training process, such as language, scientific reasoning, and coding tasks. 205 | 206 | ### DeepSeek-V3 207 | 208 | Usage: `llm -m github/DeepSeek-V3` 209 | 210 | **Publisher:** DeepSeek 211 | 212 | **Description:** A strong Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token. 213 | 214 | ### DeepSeek-V3-0324 215 | 216 | Usage: `llm -m github/DeepSeek-V3-0324` 217 | 218 | **Publisher:** DeepSeek 219 | 220 | **Description:** DeepSeek-V3-0324 demonstrates notable improvements over its predecessor, DeepSeek-V3, in several key aspects, including enhanced reasoning, improved function calling, and superior code generation capabilities. 221 | 222 | ### Llama-3.2-11B-Vision-Instruct 223 | 224 | Usage: `llm -m github/Llama-3.2-11B-Vision-Instruct` 225 | 226 | **Publisher:** Meta 227 | 228 | **Description:** Excels in image reasoning capabilities on high-res images for visual understanding apps. 229 | 230 | ### Llama-3.2-90B-Vision-Instruct 231 | 232 | Usage: `llm -m github/Llama-3.2-90B-Vision-Instruct` 233 | 234 | **Publisher:** Meta 235 | 236 | **Description:** Advanced image reasoning capabilities for visual understanding agentic apps. 237 | 238 | ### Llama-3.3-70B-Instruct 239 | 240 | Usage: `llm -m github/Llama-3.3-70B-Instruct` 241 | 242 | **Publisher:** Meta 243 | 244 | **Description:** Llama 3.3 70B Instruct offers enhanced reasoning, math, and instruction following with performance comparable to Llama 3.1 405B. 245 | 246 | ### Llama 4 Maverick 17B 128E Instruct FP8 247 | 248 | Usage: `llm -m github/Llama-4-Maverick-17B-128E-Instruct-FP8` 249 | 250 | **Publisher:** Meta 251 | 252 | **Description:** Llama 4 Maverick 17B 128E Instruct FP8 is great at precise image understanding and creative writing, offering high quality at a lower price compared to Llama 3.3 70B 253 | 254 | ### Llama 4 Scout 17B 16E Instruct 255 | 256 | Usage: `llm -m github/Llama-4-Scout-17B-16E-Instruct` 257 | 258 | **Publisher:** Meta 259 | 260 | **Description:** Llama 4 Scout 17B 16E Instruct is great at multi-document summarization, parsing extensive user activity for personalized tasks, and reasoning over vast codebases. 
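
Vision-capable models such as Llama 4 Scout (see the *Input Modalities* column above) also accept image attachments from the Python API, mirroring the CLI example earlier in this README; a short sketch, assuming `llm.Attachment`:

```python
import llm

model = llm.get_model("github/Llama-4-Scout-17B-16E-Instruct")
# Attachments can reference a URL or a local file path
response = model.prompt(
    "Describe this image",
    attachments=[llm.Attachment(url="https://static.simonwillison.net/static/2024/pelicans.jpg")],
)
print(response.text())
```
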
261 | 262 | ### MAI-DS-R1 263 | 264 | Usage: `llm -m github/MAI-DS-R1` 265 | 266 | **Publisher:** Microsoft 267 | 268 | **Description:** MAI-DS-R1 is a DeepSeek-R1 reasoning model that has been post-trained by the Microsoft AI team to fill in information gaps in the previous version of the model and improve its harm protections while maintaining R1 reasoning capabilities. 269 | 270 | ### Meta-Llama-3-70B-Instruct 271 | 272 | Usage: `llm -m github/Meta-Llama-3-70B-Instruct` 273 | 274 | **Publisher:** Meta 275 | 276 | **Description:** A powerful 70-billion parameter model excelling in reasoning, coding, and broad language applications. 277 | 278 | ### Meta-Llama-3-8B-Instruct 279 | 280 | Usage: `llm -m github/Meta-Llama-3-8B-Instruct` 281 | 282 | **Publisher:** Meta 283 | 284 | **Description:** A versatile 8-billion parameter model optimized for dialogue and text generation tasks. 285 | 286 | ### Meta-Llama-3.1-405B-Instruct 287 | 288 | Usage: `llm -m github/Meta-Llama-3.1-405B-Instruct` 289 | 290 | **Publisher:** Meta 291 | 292 | **Description:** The Llama 3.1 instruction tuned text only models are optimized for multilingual dialogue use cases and outperform many of the available open source and closed chat models on common industry benchmarks. 293 | 294 | ### Meta-Llama-3.1-70B-Instruct 295 | 296 | Usage: `llm -m github/Meta-Llama-3.1-70B-Instruct` 297 | 298 | **Publisher:** Meta 299 | 300 | **Description:** The Llama 3.1 instruction tuned text only models are optimized for multilingual dialogue use cases and outperform many of the available open source and closed chat models on common industry benchmarks. 301 | 302 | ### Meta-Llama-3.1-8B-Instruct 303 | 304 | Usage: `llm -m github/Meta-Llama-3.1-8B-Instruct` 305 | 306 | **Publisher:** Meta 307 | 308 | **Description:** The Llama 3.1 instruction tuned text only models are optimized for multilingual dialogue use cases and outperform many of the available open source and closed chat models on common industry benchmarks. 309 | 310 | ### Ministral 3B 311 | 312 | Usage: `llm -m github/Ministral-3B` 313 | 314 | **Publisher:** Mistral AI 315 | 316 | **Description:** Ministral 3B is a state-of-the-art Small Language Model (SLM) optimized for edge computing and on-device applications. As it is designed for low-latency and compute-efficient inference, it it also the perfect model for standard GenAI applications that have 317 | 318 | ### Mistral Large 24.11 319 | 320 | Usage: `llm -m github/Mistral-Large-2411` 321 | 322 | **Publisher:** Mistral AI 323 | 324 | **Description:** Mistral Large 24.11 offers enhanced system prompts, advanced reasoning and function calling capabilities. 325 | 326 | ### Mistral Nemo 327 | 328 | Usage: `llm -m github/Mistral-Nemo` 329 | 330 | **Publisher:** Mistral AI 331 | 332 | **Description:** Mistral Nemo is a cutting-edge Language Model (LLM) boasting state-of-the-art reasoning, world knowledge, and coding capabilities within its size category. 333 | 334 | ### Mistral Large 335 | 336 | Usage: `llm -m github/Mistral-large` 337 | 338 | **Publisher:** Mistral AI 339 | 340 | **Description:** Mistral's flagship model that's ideal for complex tasks that require large reasoning capabilities or are highly specialized (Synthetic Text Generation, Code Generation, RAG, or Agents). 
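
Mistral Large is one of the tool-capable models (✅ in the *Tools* column). A rough sketch of tool calling from Python, assuming the tools API from llm 0.26+ and reusing the `multiply` function from this repo's `.vscode/launch.json` example:

```python
import llm


def multiply(x: int, y: int) -> int:
    """Multiply two numbers."""
    return x * y


model = llm.get_model("github/Mistral-large")
# chain() executes any tool calls the model makes and feeds the results
# back to the model until a final answer is produced
response = model.chain("What is 34234 * 213345?", tools=[multiply])
print(response.text())
```
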
341 | 342 | ### Mistral Large (2407) 343 | 344 | Usage: `llm -m github/Mistral-large-2407` 345 | 346 | **Publisher:** Mistral AI 347 | 348 | **Description:** Mistral Large (2407) is an advanced Large Language Model (LLM) with state-of-the-art reasoning, knowledge and coding capabilities. 349 | 350 | ### Mistral Small 351 | 352 | Usage: `llm -m github/Mistral-small` 353 | 354 | **Publisher:** Mistral AI 355 | 356 | **Description:** Mistral Small can be used on any language-based task that requires high efficiency and low latency. 357 | 358 | ### Phi-3-medium instruct (128k) 359 | 360 | Usage: `llm -m github/Phi-3-medium-128k-instruct` 361 | 362 | **Publisher:** Microsoft 363 | 364 | **Description:** Same Phi-3-medium model, but with a larger context size for RAG or few shot prompting. 365 | 366 | ### Phi-3-medium instruct (4k) 367 | 368 | Usage: `llm -m github/Phi-3-medium-4k-instruct` 369 | 370 | **Publisher:** Microsoft 371 | 372 | **Description:** A 14B parameters model, proves better quality than Phi-3-mini, with a focus on high-quality, reasoning-dense data. 373 | 374 | ### Phi-3-mini instruct (128k) 375 | 376 | Usage: `llm -m github/Phi-3-mini-128k-instruct` 377 | 378 | **Publisher:** Microsoft 379 | 380 | **Description:** Same Phi-3-mini model, but with a larger context size for RAG or few shot prompting. 381 | 382 | ### Phi-3-mini instruct (4k) 383 | 384 | Usage: `llm -m github/Phi-3-mini-4k-instruct` 385 | 386 | **Publisher:** Microsoft 387 | 388 | **Description:** Tiniest member of the Phi-3 family. Optimized for both quality and low latency. 389 | 390 | ### Phi-3-small instruct (128k) 391 | 392 | Usage: `llm -m github/Phi-3-small-128k-instruct` 393 | 394 | **Publisher:** Microsoft 395 | 396 | **Description:** Same Phi-3-small model, but with a larger context size for RAG or few shot prompting. 397 | 398 | ### Phi-3-small instruct (8k) 399 | 400 | Usage: `llm -m github/Phi-3-small-8k-instruct` 401 | 402 | **Publisher:** Microsoft 403 | 404 | **Description:** A 7B parameters model, proves better quality than Phi-3-mini, with a focus on high-quality, reasoning-dense data. 405 | 406 | ### Phi-3.5-MoE instruct (128k) 407 | 408 | Usage: `llm -m github/Phi-3.5-MoE-instruct` 409 | 410 | **Publisher:** Microsoft 411 | 412 | **Description:** A new mixture of experts model 413 | 414 | ### Phi-3.5-mini instruct (128k) 415 | 416 | Usage: `llm -m github/Phi-3.5-mini-instruct` 417 | 418 | **Publisher:** Microsoft 419 | 420 | **Description:** Refresh of Phi-3-mini model. 421 | 422 | ### Phi-3.5-vision instruct (128k) 423 | 424 | Usage: `llm -m github/Phi-3.5-vision-instruct` 425 | 426 | **Publisher:** Microsoft 427 | 428 | **Description:** Refresh of Phi-3-vision model. 429 | 430 | ### Phi-4 431 | 432 | Usage: `llm -m github/Phi-4` 433 | 434 | **Publisher:** Microsoft 435 | 436 | **Description:** Phi-4 14B, a highly capable model for low latency scenarios. 
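
Phi-4, like most chat models in the table, supports streaming, and the plugin registers an async variant of every model. A rough sketch of streaming tokens from the async Python API, assuming `llm.get_async_model` and that the async response can be iterated directly:

```python
import asyncio

import llm


async def main() -> None:
    model = llm.get_async_model("github/Phi-4")
    # Tokens are yielded as they arrive when the model supports streaming
    async for chunk in model.prompt("Explain beam search in two sentences"):
        print(chunk, end="", flush=True)
    print()


asyncio.run(main())
```
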
437 | 438 | ### Phi-4-mini-instruct 439 | 440 | Usage: `llm -m github/Phi-4-mini-instruct` 441 | 442 | **Publisher:** Microsoft 443 | 444 | **Description:** 3.8B parameters Small Language Model outperforming larger models in reasoning, math, coding, and function-calling 445 | 446 | ### Phi-4-mini-reasoning 447 | 448 | Usage: `llm -m github/Phi-4-mini-reasoning` 449 | 450 | **Publisher:** Microsoft 451 | 452 | **Description:** Lightweight math reasoning model optimized for multi-step problem solving 453 | 454 | ### Phi-4-multimodal-instruct 455 | 456 | Usage: `llm -m github/Phi-4-multimodal-instruct` 457 | 458 | **Publisher:** Microsoft 459 | 460 | **Description:** First small multimodal model to have 3 modality inputs (text, audio, image), excelling in quality and efficiency 461 | 462 | ### Phi-4-Reasoning 463 | 464 | Usage: `llm -m github/Phi-4-reasoning` 465 | 466 | **Publisher:** Microsoft 467 | 468 | **Description:** State-of-the-art open-weight reasoning model. 469 | 470 | ### Cohere Command A 471 | 472 | Usage: `llm -m github/cohere-command-a` 473 | 474 | **Publisher:** Cohere 475 | 476 | **Description:** Command A is a highly efficient generative model that excels at agentic and multilingual use cases. 477 | 478 | ### Cohere Embed 4 479 | 480 | Usage: `llm -m github/embed-v-4-0` 481 | 482 | **Publisher:** Cohere 483 | 484 | **Description:** Embed 4 transforms texts and images into numerical vectors 485 | 486 | ### OpenAI GPT-4.1 487 | 488 | Usage: `llm -m github/gpt-4.1` 489 | 490 | **Publisher:** OpenAI 491 | 492 | **Description:** gpt-4.1 outperforms gpt-4o across the board, with major gains in coding, instruction following, and long-context understanding 493 | 494 | ### OpenAI GPT-4.1-mini 495 | 496 | Usage: `llm -m github/gpt-4.1-mini` 497 | 498 | **Publisher:** OpenAI 499 | 500 | **Description:** gpt-4.1-mini outperform gpt-4o-mini across the board, with major gains in coding, instruction following, and long-context handling 501 | 502 | ### OpenAI GPT-4.1-nano 503 | 504 | Usage: `llm -m github/gpt-4.1-nano` 505 | 506 | **Publisher:** OpenAI 507 | 508 | **Description:** gpt-4.1-nano provides gains in coding, instruction following, and long-context handling along with lower latency and cost 509 | 510 | ### OpenAI GPT-4o 511 | 512 | Usage: `llm -m github/gpt-4o` 513 | 514 | **Publisher:** OpenAI 515 | 516 | **Description:** OpenAI's most advanced multimodal model in the gpt-4o family. Can handle both text and image inputs. 517 | 518 | ### OpenAI GPT-4o mini 519 | 520 | Usage: `llm -m github/gpt-4o-mini` 521 | 522 | **Publisher:** OpenAI 523 | 524 | **Description:** An affordable, efficient AI solution for diverse text and image tasks. 525 | 526 | ### Grok 3 527 | 528 | Usage: `llm -m github/grok-3` 529 | 530 | **Publisher:** xAI 531 | 532 | **Description:** Grok 3 is xAI's debut model, pretrained by Colossus at supermassive scale to excel in specialized domains like finance, healthcare, and the law. 533 | 534 | ### Grok 3 Mini 535 | 536 | Usage: `llm -m github/grok-3-mini` 537 | 538 | **Publisher:** xAI 539 | 540 | **Description:** Grok 3 Mini is a lightweight model that thinks before responding. Trained on mathematic and scientific problems, it is great for logic-based tasks. 541 | 542 | ### JAIS 30b Chat 543 | 544 | Usage: `llm -m github/jais-30b-chat` 545 | 546 | **Publisher:** Core42 547 | 548 | **Description:** JAIS 30b Chat is an auto-regressive bilingual LLM for Arabic & English with state-of-the-art capabilities in Arabic. 
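
Multi-turn chat works through llm's conversation API, which this plugin supports by replaying earlier prompts, attachments and tool results (see `build_messages` in `llm_github_models.py`). A short sketch using the bilingual JAIS model above (the prompts are only illustrative):

```python
import llm

model = llm.get_model("github/jais-30b-chat")
conversation = model.conversation()

# Each prompt in the conversation is sent along with the prior exchanges
print(conversation.prompt("Greet me in Arabic").text())
print(conversation.prompt("Now translate that greeting into English").text())
```
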
549 | 550 | ### Mistral Medium 3 (25.05) 551 | 552 | Usage: `llm -m github/mistral-medium-2505` 553 | 554 | **Publisher:** Mistral AI 555 | 556 | **Description:** Mistral Medium 3 is an advanced Large Language Model (LLM) with state-of-the-art reasoning, knowledge, coding and vision capabilities. 557 | 558 | ### Mistral Small 3.1 559 | 560 | Usage: `llm -m github/mistral-small-2503` 561 | 562 | **Publisher:** Mistral AI 563 | 564 | **Description:** Enhanced Mistral Small 3 with multimodal capabilities and a 128k context length. 565 | 566 | ### OpenAI o1 567 | 568 | Usage: `llm -m github/o1` 569 | 570 | **Publisher:** OpenAI 571 | 572 | **Description:** Focused on advanced reasoning and solving complex problems, including math and science tasks. Ideal for applications that require deep contextual understanding and agentic workflows. 573 | 574 | ### OpenAI o1-mini 575 | 576 | Usage: `llm -m github/o1-mini` 577 | 578 | **Publisher:** OpenAI 579 | 580 | **Description:** Smaller, faster, and 80% cheaper than o1-preview, performs well at code generation and small context operations. 581 | 582 | ### OpenAI o1-preview 583 | 584 | Usage: `llm -m github/o1-preview` 585 | 586 | **Publisher:** OpenAI 587 | 588 | **Description:** Focused on advanced reasoning and solving complex problems, including math and science tasks. Ideal for applications that require deep contextual understanding and agentic workflows. 589 | 590 | ### OpenAI o3 591 | 592 | Usage: `llm -m github/o3` 593 | 594 | **Publisher:** OpenAI 595 | 596 | **Description:** o3 includes significant improvements on quality and safety while supporting the existing features of o1 and delivering comparable or better performance. 597 | 598 | ### OpenAI o3-mini 599 | 600 | Usage: `llm -m github/o3-mini` 601 | 602 | **Publisher:** OpenAI 603 | 604 | **Description:** o3-mini includes the o1 features with significant cost-efficiencies for scenarios requiring high performance. 605 | 606 | ### OpenAI o4-mini 607 | 608 | Usage: `llm -m github/o4-mini` 609 | 610 | **Publisher:** OpenAI 611 | 612 | **Description:** o4-mini includes significant improvements on quality and safety while supporting the existing features of o3-mini and delivering comparable or better performance. 613 | 614 | ### OpenAI Text Embedding 3 (large) 615 | 616 | Usage: `llm -m github/text-embedding-3-large` 617 | 618 | **Publisher:** OpenAI 619 | 620 | **Description:** Text-embedding-3 series models are the latest and most capable embedding model from OpenAI. 621 | 622 | ### OpenAI Text Embedding 3 (small) 623 | 624 | Usage: `llm -m github/text-embedding-3-small` 625 | 626 | **Publisher:** OpenAI 627 | 628 | **Description:** Text-embedding-3 series models are the latest and most capable embedding model from OpenAI. 
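
The embedding models can also be used from Python. A minimal sketch, assuming llm's embedding API; note that the `text-embedding-3-*` models additionally register reduced-dimension variants such as `github/text-embedding-3-small-512`:

```python
import llm

model = llm.get_embedding_model("github/text-embedding-3-small")

# embed() returns a plain list of floats
vector = model.embed("A short sentence to embed")
print(len(vector), vector[:5])
```
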
629 | 630 | -------------------------------------------------------------------------------- /llm_github_models.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import AsyncGenerator, Dict, Iterable, Iterator, List, Optional, Union 3 | 4 | import llm 5 | from azure.ai.inference import ChatCompletionsClient, EmbeddingsClient 6 | from azure.ai.inference.aio import ChatCompletionsClient as AsyncChatCompletionsClient 7 | from azure.ai.inference.models import ( 8 | AssistantMessage, 9 | AudioContentFormat, 10 | AudioContentItem, 11 | ChatCompletionsToolCall, 12 | ChatCompletionsToolDefinition, 13 | ChatRequestMessage, 14 | CompletionsUsage, 15 | ContentItem, 16 | FunctionCall, 17 | FunctionDefinition, 18 | ImageContentItem, 19 | ImageDetailLevel, 20 | ImageUrl, 21 | InputAudio, 22 | JsonSchemaFormat, 23 | StreamingChatResponseMessageUpdate, 24 | StreamingChatResponseToolCallUpdate, 25 | SystemMessage, 26 | TextContentItem, 27 | ToolMessage, 28 | UserMessage, 29 | ) 30 | from azure.core.credentials import AzureKeyCredential 31 | from llm.models import ( 32 | AsyncConversation, 33 | AsyncModel, 34 | AsyncResponse, 35 | Attachment, 36 | Conversation, 37 | EmbeddingModel, 38 | Prompt, 39 | Response, 40 | ) 41 | from pydantic import BaseModel 42 | 43 | INFERENCE_ENDPOINT = "https://models.inference.ai.azure.com" 44 | 45 | CHAT_MODELS = [ 46 | ("AI21-Jamba-1.5-Large", True, False, False, False, ["text"], ["text"]), 47 | ("AI21-Jamba-1.5-Mini", True, False, False, False, ["text"], ["text"]), 48 | ("Codestral-2501", True, False, False, True, ["text"], ["text"]), 49 | ("Cohere-command-r", True, False, False, True, ["text"], ["text"]), 50 | ("Cohere-command-r-08-2024", True, False, False, True, ["text"], ["text"]), 51 | ("Cohere-command-r-plus", True, False, False, True, ["text"], ["text"]), 52 | ("Cohere-command-r-plus-08-2024", True, False, False, True, ["text"], ["text"]), 53 | ("DeepSeek-R1", True, False, False, False, ["text"], ["text"]), 54 | ("DeepSeek-V3", True, False, False, False, ["text"], ["text"]), 55 | ("DeepSeek-V3-0324", True, False, False, False, ["text"], ["text"]), 56 | ( 57 | "Llama-3.2-11B-Vision-Instruct", 58 | True, 59 | False, 60 | False, 61 | False, 62 | ["text", "image", "audio"], 63 | ["text"], 64 | ), 65 | ( 66 | "Llama-3.2-90B-Vision-Instruct", 67 | True, 68 | False, 69 | False, 70 | False, 71 | ["text", "image", "audio"], 72 | ["text"], 73 | ), 74 | ("Llama-3.3-70B-Instruct", True, False, False, False, ["text"], ["text"]), 75 | ( 76 | "Llama-4-Maverick-17B-128E-Instruct-FP8", 77 | True, 78 | False, 79 | False, 80 | False, 81 | ["text", "image"], 82 | ["text"], 83 | ), 84 | ("Llama-4-Scout-17B-16E-Instruct", True, False, False, False, ["text", "image"], ["text"]), 85 | ("MAI-DS-R1", True, False, False, False, ["text"], ["text"]), 86 | ("Meta-Llama-3-70B-Instruct", True, False, False, False, ["text"], ["text"]), 87 | ("Meta-Llama-3-8B-Instruct", True, False, False, False, ["text"], ["text"]), 88 | ("Meta-Llama-3.1-405B-Instruct", True, False, False, False, ["text"], ["text"]), 89 | ("Meta-Llama-3.1-70B-Instruct", True, False, False, False, ["text"], ["text"]), 90 | ("Meta-Llama-3.1-8B-Instruct", True, False, False, False, ["text"], ["text"]), 91 | ("Ministral-3B", True, False, False, True, ["text"], ["text"]), 92 | ("Mistral-Large-2411", True, False, False, True, ["text"], ["text"]), 93 | ("Mistral-Nemo", True, False, False, True, ["text"], ["text"]), 94 | ("Mistral-large", True, False, False, True, ["text"], 
["text"]), 95 | ("Mistral-large-2407", True, False, False, True, ["text"], ["text"]), 96 | ("Mistral-small", True, False, False, True, ["text"], ["text"]), 97 | ("Phi-3-medium-128k-instruct", True, False, False, False, ["text"], ["text"]), 98 | ("Phi-3-medium-4k-instruct", True, False, False, False, ["text"], ["text"]), 99 | ("Phi-3-mini-128k-instruct", True, False, False, False, ["text"], ["text"]), 100 | ("Phi-3-mini-4k-instruct", True, False, False, False, ["text"], ["text"]), 101 | ("Phi-3-small-128k-instruct", True, False, False, False, ["text"], ["text"]), 102 | ("Phi-3-small-8k-instruct", True, False, False, False, ["text"], ["text"]), 103 | ("Phi-3.5-MoE-instruct", True, False, False, False, ["text"], ["text"]), 104 | ("Phi-3.5-mini-instruct", True, False, False, False, ["text"], ["text"]), 105 | ("Phi-3.5-vision-instruct", True, False, False, False, ["text", "image"], None), 106 | ("Phi-4", True, False, False, False, ["text"], ["text"]), 107 | ("Phi-4-mini-instruct", True, False, False, False, ["text"], ["text"]), 108 | ("Phi-4-mini-reasoning", True, False, False, False, ["text"], ["text"]), 109 | ("Phi-4-multimodal-instruct", True, False, False, False, ["audio", "image", "text"], ["text"]), 110 | ("Phi-4-reasoning", True, False, False, False, ["text"], ["text"]), 111 | ("cohere-command-a", True, False, False, True, ["text"], ["text"]), 112 | ("gpt-4.1", True, True, True, True, ["text", "image"], ["text"]), 113 | ("gpt-4.1-mini", True, True, True, True, ["text", "image"], ["text"]), 114 | ("gpt-4.1-nano", True, True, True, True, ["text", "image"], ["text"]), 115 | ("gpt-4o", True, True, True, True, ["text", "image", "audio"], ["text"]), 116 | ("gpt-4o-mini", True, True, True, True, ["text", "image", "audio"], ["text"]), 117 | ("grok-3", True, False, False, True, ["text"], ["text"]), 118 | ("grok-3-mini", True, False, False, True, ["text"], ["text"]), 119 | ("jais-30b-chat", True, False, False, False, ["text"], ["text"]), 120 | ("mistral-medium-2505", True, False, False, True, ["text", "image"], ["text"]), 121 | ("mistral-small-2503", True, False, False, True, ["text", "image"], ["text"]), 122 | ("o1", False, True, False, True, ["text", "image"], ["text"]), 123 | ("o1-mini", False, False, False, False, ["text"], ["text"]), 124 | ("o1-preview", False, False, False, False, ["text"], ["text"]), 125 | ("o3", True, False, True, True, ["text", "image"], ["text"]), 126 | ("o3-mini", False, True, False, True, ["text"], ["text"]), 127 | ("o4-mini", True, False, True, True, ["text", "image"], ["text"]), 128 | ] 129 | 130 | EMBEDDING_MODELS = [ 131 | ("Cohere-embed-v3-english", []), 132 | ("Cohere-embed-v3-multilingual", []), 133 | ("text-embedding-3-large", [1024, 256]), 134 | ("text-embedding-3-small", [512]), 135 | ] 136 | 137 | 138 | @llm.hookimpl 139 | def register_models(register): 140 | # Register both sync and async versions of each model 141 | # TODO: Dynamically fetch this list 142 | for ( 143 | model_id, 144 | can_stream, 145 | supports_schema, 146 | requires_usage_stream_option, 147 | supports_tools, 148 | input_modalities, 149 | output_modalities, 150 | ) in CHAT_MODELS: 151 | register( 152 | GitHubModels( 153 | model_id, 154 | can_stream=can_stream, 155 | supports_schema=supports_schema, 156 | requires_usage_stream_option=requires_usage_stream_option, 157 | supports_tools=supports_tools, 158 | input_modalities=input_modalities, 159 | output_modalities=output_modalities, 160 | ), 161 | GitHubAsyncModels( 162 | model_id, 163 | can_stream=can_stream, 164 | 
supports_schema=supports_schema, 165 | requires_usage_stream_option=requires_usage_stream_option, 166 | supports_tools=supports_tools, 167 | input_modalities=input_modalities, 168 | output_modalities=output_modalities, 169 | ), 170 | ) 171 | 172 | 173 | @llm.hookimpl 174 | def register_embedding_models(register): 175 | # Register embedding models 176 | for model_id, supported_dimensions in EMBEDDING_MODELS: 177 | register(GitHubEmbeddingModel(model_id)) 178 | for dimensions in supported_dimensions: 179 | register(GitHubEmbeddingModel(model_id, dimensions=dimensions)) 180 | 181 | 182 | IMAGE_ATTACHMENTS = { 183 | "image/png", 184 | "image/jpeg", 185 | "image/webp", 186 | "image/gif", 187 | } 188 | 189 | AUDIO_ATTACHMENTS = { 190 | "audio/wav", 191 | "audio/mpeg", 192 | } 193 | 194 | 195 | def attachment_as_content_item(attachment: Attachment) -> ContentItem: 196 | if attachment is None or attachment.resolve_type() is None: 197 | raise ValueError("Attachment cannot be None or empty") 198 | 199 | attachment_type: str = attachment.resolve_type() # type: ignore 200 | 201 | if attachment_type.startswith("audio/"): 202 | audio_format = ( 203 | AudioContentFormat.WAV if attachment_type == "audio/wav" else AudioContentFormat.MP3 204 | ) 205 | if attachment.path is None: 206 | raise ValueError("Audio attachment must have a path for audio content") 207 | 208 | return AudioContentItem( 209 | input_audio=InputAudio.load(audio_file=attachment.path, audio_format=audio_format) 210 | ) 211 | if attachment_type.startswith("image/"): 212 | if attachment.url: 213 | return ImageContentItem( 214 | image_url=ImageUrl( 215 | url=attachment.url, 216 | detail=ImageDetailLevel.AUTO, 217 | ), 218 | ) 219 | if attachment.path: 220 | return ImageContentItem( 221 | image_url=ImageUrl.load( 222 | image_file=attachment.path, 223 | image_format=attachment_type.split("/")[1], 224 | detail=ImageDetailLevel.AUTO, 225 | ), 226 | ) 227 | 228 | raise ValueError(f"Unsupported attachment type: {attachment_type}") 229 | 230 | 231 | def build_messages( 232 | prompt: Prompt, conversation: Optional[Union[Conversation, AsyncConversation]] = None 233 | ) -> List[ChatRequestMessage]: 234 | messages: List[ChatRequestMessage] = [] 235 | current_system = None 236 | if conversation is not None: 237 | for prev_response in conversation.responses: 238 | if prev_response.prompt.system and prev_response.prompt.system != current_system: 239 | messages.append(SystemMessage(prev_response.prompt.system)) 240 | current_system = prev_response.prompt.system 241 | if prev_response.attachments: 242 | attachment_message: list[ContentItem] = [] 243 | if prev_response.prompt.prompt: 244 | attachment_message.append(TextContentItem(text=prev_response.prompt.prompt)) 245 | for attachment in prev_response.attachments: 246 | attachment_message.append(attachment_as_content_item(attachment)) 247 | messages.append(UserMessage(attachment_message)) 248 | else: 249 | messages.append(UserMessage(prev_response.prompt.prompt)) 250 | 251 | # Add any tool results from the previous prompt 252 | for tool_result in prev_response.prompt.tool_results: 253 | messages.append( 254 | ToolMessage( 255 | tool_call_id=tool_result.tool_call_id or "", content=tool_result.output 256 | ) 257 | ) 258 | 259 | # Add the assistant's response 260 | assistant_msg = AssistantMessage(prev_response.text_or_raise()) # type: ignore 261 | 262 | tool_calls = prev_response.tool_calls_or_raise() # type: ignore 263 | if tool_calls: 264 | assistant_tool_calls = [] 265 | for tool_call in tool_calls: 266 | 
assistant_tool_calls.append( 267 | ChatCompletionsToolCall( 268 | id=tool_call.tool_call_id, 269 | function=FunctionCall( 270 | name=tool_call.name, arguments=json.dumps(tool_call.arguments) 271 | ), 272 | ) 273 | ) 274 | 275 | # Set tool_calls on the assistant message 276 | assistant_msg.tool_calls = assistant_tool_calls 277 | 278 | messages.append(assistant_msg) 279 | 280 | if prompt.system and prompt.system != current_system: 281 | messages.append(SystemMessage(prompt.system)) 282 | if prompt.attachments: 283 | attachment_message = [] 284 | if prompt.prompt: 285 | attachment_message.append(TextContentItem(text=prompt.prompt)) 286 | for attachment in prompt.attachments: 287 | attachment_message.append(attachment_as_content_item(attachment)) 288 | messages.append(UserMessage(attachment_message)) 289 | elif prompt.prompt: 290 | messages.append(UserMessage(content=prompt.prompt)) 291 | 292 | # Add any tool results for the current prompt 293 | for tool_result in prompt.tool_results: 294 | messages.append( 295 | ToolMessage(tool_call_id=tool_result.tool_call_id or "", content=tool_result.output) 296 | ) 297 | 298 | return messages 299 | 300 | 301 | def set_usage(usage: CompletionsUsage, response: Union[Response, AsyncResponse]) -> None: 302 | # Recursively remove keys with value 0 and empty dictionaries 303 | def remove_empty_and_zero(obj): 304 | if isinstance(obj, dict): 305 | cleaned = {k: remove_empty_and_zero(v) for k, v in obj.items() if v != 0 and v != {}} 306 | return {k: v for k, v in cleaned.items() if v is not None and v != {}} 307 | return obj 308 | 309 | details = usage.as_dict() 310 | details.pop("prompt_tokens", None) 311 | details.pop("completion_tokens", None) 312 | details.pop("total_tokens", None) 313 | 314 | response.set_usage( 315 | input=usage.prompt_tokens, 316 | output=usage.completion_tokens, 317 | details=remove_empty_and_zero(details), 318 | ) 319 | 320 | 321 | def append_streaming_tool_calls( 322 | tool_calls: Dict[str, StreamingChatResponseToolCallUpdate], 323 | delta: StreamingChatResponseMessageUpdate, 324 | ): 325 | if not delta.tool_calls: 326 | return 327 | 328 | for tool_call in delta.tool_calls: 329 | index = tool_call.get("index") 330 | if index not in tool_calls: 331 | tool_calls[index] = tool_call 332 | else: 333 | tool_calls[index].function.arguments += tool_call.function.arguments 334 | 335 | 336 | def add_tool_calls( 337 | tool_calls: Iterable[Union[ChatCompletionsToolCall, StreamingChatResponseToolCallUpdate]], 338 | response: Union[Response, AsyncResponse], 339 | ): 340 | for tool_call in tool_calls: 341 | try: 342 | arguments = json.loads(tool_call.function.arguments) 343 | except json.JSONDecodeError: 344 | arguments = {"error": "Invalid JSON in arguments"} 345 | 346 | response.add_tool_call( 347 | llm.ToolCall( 348 | tool_call_id=tool_call.id, 349 | name=tool_call.function.name, 350 | arguments=arguments, 351 | ) 352 | ) 353 | 354 | 355 | class _Shared: 356 | needs_key = "github" 357 | key_env_var = "GITHUB_MODELS_KEY" 358 | 359 | def __init__( 360 | self, 361 | model_id: str, 362 | can_stream: bool = True, 363 | supports_schema: bool = False, 364 | requires_usage_stream_option: bool = True, 365 | supports_tools: bool = False, 366 | input_modalities: Optional[List[str]] = None, 367 | output_modalities: Optional[List[str]] = None, 368 | ): 369 | self.model_id = f"github/{model_id}" 370 | self.model_name = model_id 371 | self.can_stream = can_stream 372 | self.supports_schema = supports_schema 373 | self.supports_tools = supports_tools 374 | 
self.attachment_types = set() 375 | if input_modalities and "image" in input_modalities: 376 | self.attachment_types.update(IMAGE_ATTACHMENTS) 377 | if input_modalities and "audio" in input_modalities: 378 | self.attachment_types.update(AUDIO_ATTACHMENTS) 379 | 380 | self.input_modalities = input_modalities 381 | self.output_modalities = output_modalities 382 | 383 | self.client_kwargs = {} 384 | # Use latest version 385 | self.client_kwargs["api_version"] = "2025-03-01-preview" 386 | 387 | self.streaming_model_extras = {} 388 | if requires_usage_stream_option: 389 | self.streaming_model_extras["stream_options"] = { 390 | "include_usage": True, 391 | } 392 | 393 | # Using the same display string for both the sync and async models 394 | # makes them not show up twice in `llm models` 395 | def __str__(self) -> str: 396 | return f"GitHub Models: {self.model_id}" 397 | 398 | def get_tools(self, prompt: Prompt) -> Optional[List[ChatCompletionsToolDefinition]]: 399 | if not self.supports_tools or not prompt.tools: 400 | return None 401 | 402 | return [ 403 | ChatCompletionsToolDefinition( 404 | function=FunctionDefinition( 405 | name=t.name, 406 | description=t.description or None, 407 | parameters=t.input_schema, 408 | ), 409 | ) 410 | for t in prompt.tools 411 | ] 412 | 413 | 414 | class GitHubModels(_Shared, llm.Model): 415 | def execute( 416 | self, 417 | prompt: Prompt, 418 | stream: bool, 419 | response: Response, 420 | conversation: Optional[Conversation], 421 | ) -> Iterator[str]: 422 | # unset keys are handled by llm.Model.get_key() 423 | key: str = self.get_key() # type: ignore 424 | 425 | with ChatCompletionsClient( 426 | endpoint=INFERENCE_ENDPOINT, 427 | credential=AzureKeyCredential(key), 428 | model=self.model_name, 429 | **self.client_kwargs, 430 | ) as client: 431 | response_format = "text" 432 | if prompt.schema: 433 | if not isinstance(prompt.schema, dict) and issubclass(prompt.schema, BaseModel): 434 | response_format = JsonSchemaFormat( 435 | name="output", schema=prompt.schema.model_json_schema() 436 | ) 437 | else: 438 | response_format = JsonSchemaFormat( 439 | name="output", 440 | schema=prompt.schema, # type: ignore[variable] 441 | ) 442 | 443 | usage: Optional[CompletionsUsage] = None 444 | messages = build_messages(prompt, conversation) 445 | 446 | tools = self.get_tools(prompt) 447 | 448 | if stream: 449 | completion = client.complete( 450 | messages=messages, 451 | stream=True, 452 | response_format=response_format, 453 | model_extras=self.streaming_model_extras, 454 | tools=tools, 455 | ) 456 | tool_calls = {} 457 | 458 | for chunk in completion: 459 | usage = usage or chunk.usage 460 | 461 | if len(chunk.choices) == 0: 462 | continue 463 | 464 | delta = chunk.choices[0].delta 465 | content = delta.content 466 | append_streaming_tool_calls(tool_calls, delta) 467 | 468 | if content is not None: 469 | yield content 470 | 471 | add_tool_calls( 472 | tool_calls.values(), 473 | response, 474 | ) 475 | 476 | response.response_json = None # TODO 477 | else: 478 | completion = client.complete( 479 | messages=messages, 480 | stream=False, 481 | response_format=response_format, 482 | tools=tools, 483 | ) 484 | usage = completion.usage 485 | 486 | tool_calls = completion.choices[0].message.tool_calls or [] 487 | add_tool_calls(tool_calls, response) 488 | 489 | response.response_json = None # TODO 490 | if completion.choices[0].message.content: 491 | yield completion.choices[0].message.content 492 | 493 | if usage is not None: 494 | set_usage(usage, response) 495 | 496 | 497 | 
class GitHubAsyncModels(_Shared, AsyncModel): 498 | async def execute( 499 | self, 500 | prompt: Prompt, 501 | stream: bool, 502 | response: AsyncResponse, 503 | conversation: Optional[AsyncConversation], 504 | ) -> AsyncGenerator[str, None]: 505 | key = self.get_key() 506 | 507 | async with AsyncChatCompletionsClient( 508 | endpoint=INFERENCE_ENDPOINT, 509 | credential=AzureKeyCredential(key), # type: ignore[variable] 510 | model=self.model_name, 511 | **self.client_kwargs, 512 | ) as client: 513 | response_format = "text" 514 | if prompt.schema: 515 | if not isinstance(prompt.schema, dict) and issubclass(prompt.schema, BaseModel): 516 | response_format = JsonSchemaFormat( 517 | name="output", schema=prompt.schema.model_json_schema() 518 | ) 519 | else: 520 | response_format = JsonSchemaFormat( 521 | name="output", 522 | schema=prompt.schema, # type: ignore[variable] 523 | ) 524 | 525 | usage: Optional[CompletionsUsage] = None 526 | messages = build_messages(prompt, conversation) 527 | 528 | tools = self.get_tools(prompt) 529 | 530 | if stream: 531 | completion = await client.complete( 532 | messages=messages, 533 | stream=True, 534 | response_format=response_format, 535 | model_extras=self.streaming_model_extras, 536 | tools=tools, 537 | ) 538 | 539 | tool_calls = {} 540 | async for chunk in completion: 541 | usage = usage or chunk.usage 542 | 543 | if len(chunk.choices) == 0: 544 | continue 545 | 546 | delta = chunk.choices[0].delta 547 | content = delta.content 548 | append_streaming_tool_calls(tool_calls, delta) 549 | 550 | if content is not None: 551 | yield content 552 | 553 | add_tool_calls( 554 | tool_calls.values(), 555 | response, 556 | ) 557 | 558 | response.response_json = None # TODO 559 | else: 560 | completion = await client.complete( 561 | messages=messages, 562 | stream=False, 563 | response_format=response_format, 564 | tools=tools, 565 | ) 566 | usage = usage or completion.usage 567 | 568 | tool_calls = completion.choices[0].message.tool_calls or [] 569 | add_tool_calls(tool_calls, response) 570 | 571 | response.response_json = None # TODO 572 | if completion.choices[0].message.content: 573 | yield completion.choices[0].message.content 574 | 575 | if usage is not None: 576 | set_usage(usage, response) 577 | 578 | 579 | class GitHubEmbeddingModel(EmbeddingModel): 580 | needs_key = "github" 581 | key_env_var = "GITHUB_MODELS_KEY" 582 | batch_size = 100 583 | 584 | def __init__(self, model_id: str, dimensions: Optional[int] = None): 585 | self.model_id = f"github/{model_id}" 586 | if dimensions is not None: 587 | self.model_id += f"-{dimensions}" 588 | 589 | self.model_name = model_id 590 | self.dimensions = dimensions 591 | 592 | def embed_batch(self, items: Iterable[Union[str, bytes]]) -> Iterator[List[float]]: 593 | if not items: 594 | return iter([]) 595 | 596 | key = self.get_key() 597 | client = EmbeddingsClient( 598 | endpoint=INFERENCE_ENDPOINT, 599 | credential=AzureKeyCredential(key), # type: ignore 600 | ) 601 | 602 | # TODO: Handle iterable of bytes 603 | 604 | kwargs = { 605 | "input": items, 606 | "model": self.model_name, 607 | } 608 | if self.dimensions: 609 | kwargs["dimensions"] = self.dimensions 610 | 611 | response = client.embed(**kwargs) 612 | return ([float(x) for x in item.embedding] for item in response.data) 613 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "llm-github-models" 3 | version = 
"0.15" 4 | description = "LLM plugin to access GitHub Models API" 5 | readme = "README.md" 6 | authors = [{name = "Anthony Shaw"}] 7 | license = {text = "Apache-2.0"} 8 | classifiers = [ 9 | "License :: OSI Approved :: Apache Software License" 10 | ] 11 | dependencies = [ 12 | "aiohttp>=3.11.18", 13 | "llm>=0.26", 14 | "azure-ai-inference>=1.0.0b8", 15 | ] 16 | 17 | [project.urls] 18 | Homepage = "https://github.com/tonybaloney/llm-github-models" 19 | Changelog = "https://github.com/tonybaloney/llm-github-models/releases" 20 | Issues = "https://github.com/tonybaloney/llm-github-models/issues" 21 | CI = "https://github.com/tonybaloney/llm-github-models/actions" 22 | 23 | [project.entry-points.llm] 24 | github = "llm_github_models" 25 | 26 | [project.optional-dependencies] 27 | test = ["pytest", "pytest-recording", "pytest-asyncio", "ruff", "pyright"] 28 | 29 | [tool.ruff] 30 | line-length = 100 31 | 32 | [tool.ruff.lint] 33 | select = ["E", "F", "I"] 34 | -------------------------------------------------------------------------------- /tests/files/kick.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tonybaloney/llm-github-models/84db4b65a8c1cdb2b619a4ef3c972b4380923a9f/tests/files/kick.wav -------------------------------------------------------------------------------- /tests/files/salmon.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tonybaloney/llm-github-models/84db4b65a8c1cdb2b619a4ef3c972b4380923a9f/tests/files/salmon.jpeg -------------------------------------------------------------------------------- /tests/test_llm_github_embeddings.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import pytest 4 | from azure.ai.inference.models import EmbeddingItem, EmbeddingsResult 5 | 6 | from llm_github_models import EMBEDDING_MODELS, GitHubEmbeddingModel 7 | 8 | EMBEDDING_MODEL_IDS = [f"github/{model}" for model in EMBEDDING_MODELS] 9 | 10 | 11 | @pytest.mark.parametrize("model_id", EMBEDDING_MODELS) 12 | def test_embedding_model_initialization(model_id: str): 13 | """Test that embedding models are initialized correctly.""" 14 | embedding_model = GitHubEmbeddingModel(model_id) 15 | assert embedding_model.model_id == f"github/{model_id}" 16 | assert embedding_model.model_name == model_id 17 | 18 | 19 | @patch("llm_github_models.EmbeddingsClient", autospec=True) 20 | def test_embed_single_text(MockEmbeddingsClient): 21 | """Test embedding a single text.""" 22 | # Setup mock 23 | mock_instance = MockEmbeddingsClient.return_value 24 | 25 | # Mock the response 26 | mock_embedding = [0.1, 0.2, 0.3, 0.4, 0.5] 27 | mock_embedding_item = EmbeddingItem(embedding=mock_embedding, index=0) 28 | mock_result = EmbeddingsResult(data=[mock_embedding_item]) 29 | mock_instance.embed.return_value = mock_result 30 | 31 | # Create model and call embed 32 | model = GitHubEmbeddingModel("test-model") 33 | # Patch the get_key method to avoid actual key retrieval 34 | with patch.object(model, "get_key", return_value="test-key"): 35 | result = model.embed_batch(["This is a test text"]) 36 | 37 | # Assertions 38 | MockEmbeddingsClient.assert_called_once() 39 | mock_instance.embed.assert_called_once_with( 40 | model="test-model", 41 | input=["This is a test text"], 42 | ) 43 | 44 | result = list(result) 45 | assert len(result) == 1 46 | assert result[0] == [0.1, 0.2, 0.3, 0.4, 0.5] 47 | 48 | 49 | 
@patch("llm_github_models.EmbeddingsClient", autospec=True) 50 | def test_embed_with_dimensions(MockEmbeddingsClient): 51 | """Test embedding with a custom dimensions.""" 52 | # Setup mock 53 | mock_instance = MockEmbeddingsClient.return_value 54 | 55 | # Mock the response 56 | mock_embedding = [0.1, 0.2, 0.3, 0.4, 0.5] 57 | mock_embedding_item = EmbeddingItem(embedding=mock_embedding, index=0) 58 | mock_result = EmbeddingsResult(data=[mock_embedding_item]) 59 | mock_instance.embed.return_value = mock_result 60 | 61 | # Create model and call embed 62 | model = GitHubEmbeddingModel("test-model", 1234) 63 | # Patch the get_key method to avoid actual key retrieval 64 | with patch.object(model, "get_key", return_value="test-key"): 65 | result = model.embed_batch(["This is a test text"]) 66 | 67 | # Assertions 68 | MockEmbeddingsClient.assert_called_once() 69 | mock_instance.embed.assert_called_once_with( 70 | model="test-model", 71 | input=["This is a test text"], 72 | dimensions=1234, 73 | ) 74 | 75 | result = list(result) 76 | assert len(result) == 1 77 | assert result[0] == [0.1, 0.2, 0.3, 0.4, 0.5] 78 | 79 | 80 | @patch("llm_github_models.EmbeddingsClient", autospec=True) 81 | def test_embed_multiple_texts(MockEmbeddingsClient): 82 | """Test embedding multiple texts.""" 83 | # Setup mock 84 | mock_instance = MockEmbeddingsClient.return_value 85 | 86 | # Mock the response for multiple embeddings 87 | mock_embedding1 = [0.1, 0.2, 0.3] 88 | mock_embedding2 = [0.4, 0.5, 0.6] 89 | 90 | mock_embedding_item1 = EmbeddingItem(embedding=mock_embedding1, index=0) 91 | mock_embedding_item2 = EmbeddingItem(embedding=mock_embedding2, index=1) 92 | 93 | mock_result = EmbeddingsResult(data=[mock_embedding_item1, mock_embedding_item2]) 94 | 95 | mock_instance.embed.return_value = mock_result 96 | 97 | # Create model and call embed 98 | model = GitHubEmbeddingModel("test-model") 99 | # Patch the get_key method to avoid actual key retrieval 100 | with patch.object(model, "get_key", return_value="test-key"): 101 | texts = ["First text", "Second text"] 102 | result = model.embed_batch(texts) 103 | 104 | # Assertions 105 | MockEmbeddingsClient.assert_called_once() 106 | mock_instance.embed.assert_called_once_with( 107 | model="test-model", 108 | input=texts, 109 | ) 110 | result = list(result) 111 | assert len(result) == 2 112 | assert result[0] == [0.1, 0.2, 0.3] 113 | assert result[1] == [0.4, 0.5, 0.6] 114 | 115 | 116 | @patch("llm_github_models.EmbeddingsClient", autospec=True) 117 | def test_embed_empty_list(MockEmbeddingsClient): 118 | model = GitHubEmbeddingModel("text-embedding-3-small") 119 | with patch.object(model, "get_key", return_value="key"): 120 | result = model.embed_batch([]) 121 | assert list(result) == [] 122 | 123 | MockEmbeddingsClient.assert_not_called() 124 | 125 | 126 | def test_register_embedding_models(): 127 | registered = [] 128 | 129 | def fake_register(instance): 130 | registered.append(instance) 131 | 132 | from llm_github_models import register_embedding_models 133 | 134 | register_embedding_models(fake_register) 135 | 136 | def check_model(model_id, dimensions=None): 137 | suffix = f"-{dimensions}" if dimensions else "" 138 | m = next(m for m in registered if m.model_id == f"github/{model_id}{suffix}") 139 | 140 | assert isinstance(m, GitHubEmbeddingModel) 141 | assert m.model_name == model_id 142 | assert m.dimensions == dimensions 143 | 144 | registered.remove(m) 145 | 146 | for model_id, supported_dimensions in EMBEDDING_MODELS: 147 | check_model(model_id) 148 | 149 | for 
dims in supported_dimensions: 150 | check_model(model_id, dims) 151 | 152 | assert not registered, "More models registered than expected" 153 | -------------------------------------------------------------------------------- /tests/test_llm_github_models.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pathlib 3 | from unittest.mock import Mock, patch 4 | 5 | import pytest 6 | from azure.ai.inference.models import ( 7 | AudioContentItem, 8 | CompletionsUsage, 9 | ImageContentItem, 10 | ImageUrl, 11 | InputAudio, 12 | StreamingChatChoiceUpdate, 13 | StreamingChatCompletionsUpdate, 14 | SystemMessage, 15 | UserMessage, 16 | ) 17 | from llm import get_async_model, get_model 18 | from llm.models import Attachment, Conversation, Prompt, Response 19 | from pydantic import BaseModel 20 | 21 | from llm_github_models import GitHubModels, build_messages, set_usage 22 | 23 | MODELS = ["github/gpt-4.1-mini", "github/gpt-4o-mini", "github/Llama-3.2-11B-Vision-Instruct"] 24 | 25 | 26 | @pytest.mark.parametrize("model", MODELS) 27 | def test_build_messages_no_conversation(model: str): 28 | # Test build_messages with conversation=None and a basic prompt without system. 29 | dummy_prompt = Prompt(prompt="Hello from prompt", system=None, attachments=[], model=model) 30 | messages = build_messages(dummy_prompt, None) 31 | # Should add one UserMessage from prompt since conversation is None. 32 | assert isinstance(messages, list) 33 | # Expecting only one message: UserMessage with content "Hello from prompt" 34 | assert len(messages) == 1 35 | msg = messages[0] 36 | assert isinstance(msg, UserMessage) 37 | # For a simple user message, content is stored in 'content' 38 | # Compare against expected message content. 39 | assert msg.content == "Hello from prompt" 40 | 41 | 42 | @pytest.mark.parametrize("model", MODELS) 43 | def test_build_messages_with_conversation_no_prompt_system(model: str): 44 | # Create a dummy conversation with one response. 45 | dummy_prompt = Prompt(prompt="Hello from prompt", system=None, attachments=[], model=model) 46 | _model = get_model(model) 47 | # The response has a system message and a user message. 48 | dummy_response = Response( 49 | prompt=Prompt(prompt="Hello from last time", system=None, attachments=[], model=model), 50 | model=_model, 51 | stream=False, 52 | ) 53 | dummy_convo = Conversation(responses=[dummy_response], model=_model) 54 | # Create a prompt with no system and without attachments. 55 | messages = build_messages(dummy_prompt, dummy_convo) 56 | assert len(messages) == 3 57 | 58 | 59 | @pytest.mark.parametrize("model", MODELS) 60 | def test_build_messages_with_conversation_prompt_system(model: str): 61 | # Create a dummy conversation with one response. 62 | dummy_prompt = Prompt( 63 | prompt="Hello from prompt", system="You are a hawk", attachments=[], model=model 64 | ) 65 | _model = get_model(model) 66 | # The response has a system message and a user message. 67 | dummy_response = Response( 68 | prompt=Prompt( 69 | prompt="Hello from last time", 70 | system="You are a hawk", 71 | attachments=[], 72 | model=model, 73 | ), 74 | model=_model, 75 | stream=False, 76 | ) 77 | dummy_convo = Conversation(responses=[dummy_response], model=_model) 78 | # Create a prompt with no system and without attachments. 79 | messages = build_messages(dummy_prompt, dummy_convo) 80 | assert len(messages) == 4 81 | # First message should be a system message. 
82 | assert isinstance(messages[0], SystemMessage) 83 | assert messages[0].content == "You are a hawk" 84 | 85 | 86 | def test_build_messages_with_image_path_attachment(): 87 | # Create a dummy attachment object for an image. 88 | model: str = "gpt-4o" 89 | attachment = Attachment( 90 | path=pathlib.Path("tests/files/salmon.jpeg"), url=None, type="image/jpeg" 91 | ) 92 | dummy_attachment = attachment 93 | # Create a prompt with an attachment and prompt text. 94 | dummy_prompt = Prompt( 95 | prompt="Here is an image:", 96 | system=None, 97 | model=model, 98 | attachments=[dummy_attachment], 99 | ) 100 | # No conversation provided. 101 | messages = build_messages(dummy_prompt, None) 102 | # For a prompt with attachments, build_messages creates one UserMessage whose content is a list. 103 | assert len(messages) == 1 104 | msg = messages[0] 105 | assert isinstance(msg, UserMessage) 106 | # The content should be a list with two items: TextContentItem and ImageContentItem. 107 | # Validate type and content. 108 | content_list = msg.content 109 | assert isinstance(content_list, list) 110 | assert len(content_list) == 2 111 | image_item = content_list[1] 112 | assert isinstance(image_item, ImageContentItem) 113 | # Check that image_item.image_url is an ImageUrl with the correct url. 114 | assert isinstance(image_item.image_url, ImageUrl) 115 | assert image_item.image_url.url.startswith("data:image/jpeg;base64,") 116 | 117 | 118 | def test_build_messages_with_image_url_attachments(): 119 | # Create a dummy attachment object for an image. 120 | model: str = "gpt-4o" 121 | attachment = Attachment(path=None, url="http://dummy.image/url.png", type="image/png") 122 | dummy_attachment = attachment 123 | # Create a prompt with an attachment and prompt text. 124 | dummy_prompt = Prompt( 125 | prompt="Here is an image:", 126 | system=None, 127 | model=model, 128 | attachments=[dummy_attachment], 129 | ) 130 | # No conversation provided. 131 | messages = build_messages(dummy_prompt, None) 132 | # For a prompt with attachments, build_messages creates one UserMessage whose content is a list. 133 | assert len(messages) == 1 134 | msg = messages[0] 135 | assert isinstance(msg, UserMessage) 136 | # The content should be a list with two items: TextContentItem and ImageContentItem. 137 | # Validate type and content. 138 | content_list = msg.content 139 | assert isinstance(content_list, list) 140 | assert len(content_list) == 2 141 | image_item = content_list[1] 142 | assert isinstance(image_item, ImageContentItem) 143 | # Check that image_item.image_url is an ImageUrl with the correct url. 144 | assert isinstance(image_item.image_url, ImageUrl) 145 | assert image_item.image_url.url == "http://dummy.image/url.png" 146 | 147 | 148 | def test_build_messages_with_audio_path_attachment(): 149 | # Create a dummy attachment object for an image. 150 | model: str = "gpt-4o" 151 | attachment = Attachment(path=pathlib.Path("tests/files/kick.wav"), url=None, type="audio/wav") 152 | dummy_attachment = attachment 153 | # Create a prompt with an attachment and prompt text. 154 | dummy_prompt = Prompt( 155 | prompt="Here is an audio clip:", 156 | system=None, 157 | model=model, 158 | attachments=[dummy_attachment], 159 | ) 160 | # No conversation provided. 161 | messages = build_messages(dummy_prompt, None) 162 | # For a prompt with attachments, build_messages creates one UserMessage whose content is a list. 
163 | assert len(messages) == 1 164 | msg = messages[0] 165 | assert isinstance(msg, UserMessage) 166 | # The content should be a list with two items: TextContentItem and ImageContentItem. 167 | # Validate type and content. 168 | content_list = msg.content 169 | assert isinstance(content_list, list) 170 | assert len(content_list) == 2 171 | audio_item = content_list[1] 172 | assert isinstance(audio_item, AudioContentItem) 173 | # Check that image_item.image_url is an ImageUrl with the correct url. 174 | assert isinstance(audio_item.input_audio, InputAudio) 175 | assert audio_item.input_audio.data.startswith("UklGRuwiAAB") 176 | assert audio_item.input_audio.format == "wav" 177 | assert audio_item.input_audio.data.endswith("AAAAA=") 178 | 179 | 180 | class DogSchema(BaseModel): 181 | """ 182 | A schema for a dog with a name and age. 183 | """ 184 | 185 | name: str 186 | age: int 187 | one_sentence_bio: str 188 | 189 | 190 | def test_schema_with_unsupported_model(): 191 | """ 192 | Test that requesting a schema for an unsupported model raises an error. 193 | """ 194 | model = get_model("github/Mistral-Nemo") 195 | 196 | with pytest.raises(ValueError): 197 | model.prompt("Invent a good dog", schema=DogSchema) 198 | 199 | 200 | def test_schema_with_supported_model(): 201 | """ 202 | Test that requesting a schema for a supported model works. 203 | """ 204 | model = get_model("github/gpt-4.1-mini") 205 | 206 | response = model.prompt("Invent a good dog named Buddy", schema=DogSchema) 207 | dog = json.loads(response.text()) 208 | assert dog["name"] == "Buddy" 209 | 210 | 211 | @pytest.mark.asyncio 212 | async def test_async_model_prompt(): 213 | """ 214 | Test that the async model prompt works correctly. 215 | """ 216 | model = get_async_model("github/gpt-4.1-mini") 217 | response = await model.prompt("What is the capital of France?") 218 | assert "Paris" in await response.text() 219 | 220 | 221 | @patch("llm_github_models.ChatCompletionsClient", autospec=True) 222 | def test_doesnt_request_streaming_usage_when_not_required(MockChatCompletionsClient): 223 | # Setup mock 224 | mock_update = StreamingChatCompletionsUpdate( 225 | { 226 | "choices": [StreamingChatChoiceUpdate({"delta": {"content": "Paris"}})], 227 | } 228 | ) 229 | 230 | # `with ChatCompletionsClient(...) 
as client:` 231 | mock_instance = MockChatCompletionsClient.return_value.__enter__.return_value 232 | 233 | # `for chunk in client.complete(...)` 234 | mock_instance.complete.return_value.__iter__.return_value = [mock_update] 235 | 236 | model = GitHubModels("test-model", requires_usage_stream_option=False) 237 | 238 | # Patch the get_key method to avoid actual key retrieval 239 | with patch.object(model, "get_key", return_value="test-key"): 240 | result = model.prompt("What is the capital of France", stream=True) 241 | 242 | assert result.text() == "Paris" 243 | 244 | # Assertions 245 | call_kwargs = mock_instance.complete.call_args.kwargs 246 | assert call_kwargs["model_extras"] == {}, ( 247 | "model_extras should be empty when requires_usage_stream_option is False" 248 | ) 249 | 250 | 251 | def test_set_usage(): 252 | usage = CompletionsUsage( 253 | { 254 | "completion_tokens": 10, 255 | "prompt_tokens": 5, 256 | "extra": { 257 | "value": 123, 258 | "inner_empty": {}, 259 | "inner_zero": 0, 260 | }, 261 | "other": "data", 262 | "zero": 0, 263 | "empty": {}, 264 | } 265 | ) 266 | 267 | captured_usage = {} 268 | 269 | def usage_callback(input=None, output=None, details=None): 270 | captured_usage["input"] = input 271 | captured_usage["output"] = output 272 | captured_usage["details"] = details 273 | 274 | mock_response = Mock(spec=Response) 275 | mock_response.set_usage.side_effect = usage_callback 276 | 277 | set_usage(usage, mock_response) 278 | 279 | assert captured_usage["input"] == 5 280 | assert captured_usage["output"] == 10 281 | 282 | # Everything that is 0 or empty should be filtered out. 283 | assert captured_usage["details"] == { 284 | "extra": { 285 | "value": 123, 286 | }, 287 | "other": "data", 288 | } 289 | 290 | 291 | def test_sync_returns_usage(): 292 | """ 293 | Test that the sync model returns usage information for streaming and non-streaming. 294 | """ 295 | model = get_model("github/gpt-4.1-mini") 296 | 297 | response = model.prompt("What is the capital of France?") 298 | usage = response.usage() 299 | assert_has_usage(usage) 300 | 301 | response = model.prompt("What is the capital of France?", stream=True) 302 | usage = response.usage() 303 | assert_has_usage(usage) 304 | 305 | 306 | @pytest.mark.asyncio 307 | async def test_async_returns_usage(): 308 | """ 309 | Test that the async model returns usage information for streaming and non-streaming. 310 | """ 311 | model = get_async_model("github/gpt-4.1-mini") 312 | 313 | response = await model.prompt("What is the capital of France?") 314 | usage = await response.usage() 315 | assert_has_usage(usage) 316 | 317 | response = await model.prompt("What is the capital of France?", stream=True) 318 | usage = await response.usage() 319 | assert_has_usage(usage) 320 | 321 | 322 | def assert_has_usage(usage): 323 | """ 324 | Helper function to assert that usage has input and output tokens. 
325 | """ 326 | assert usage is not None 327 | assert usage.input is not None, "Usage input should not be None" 328 | assert usage.input > 0, "Usage input should be greater than 0" 329 | assert usage.output is not None, "Usage output should not be None" 330 | assert usage.output > 0, "Usage output should be greater than 0" 331 | -------------------------------------------------------------------------------- /tests/test_tool_support.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | from azure.ai.inference.models import StreamingChatResponseMessageUpdate 5 | from llm import get_async_model, get_model 6 | 7 | from llm_github_models import append_streaming_tool_calls 8 | 9 | 10 | def test_model_supports_tools(): 11 | model = get_model("github/gpt-4o-mini") 12 | assert model.supports_tools is True 13 | 14 | 15 | def test_append_streaming_tool_calls(): 16 | tool_calls = {} 17 | 18 | # call_1 => multiply 19 | append_streaming_tool_calls( 20 | tool_calls, 21 | StreamingChatResponseMessageUpdate( 22 | { 23 | "tool_calls": [ 24 | { 25 | "id": "call_1", 26 | "index": 0, 27 | "function": {"name": "multiply", "arguments": ""}, 28 | } 29 | ] 30 | } 31 | ), 32 | ) 33 | 34 | # call_1 => multiply(x: 35 | # call_2 => add(x: 1, 36 | append_streaming_tool_calls( 37 | tool_calls, 38 | StreamingChatResponseMessageUpdate( 39 | { 40 | "tool_calls": [ 41 | { 42 | "index": 0, 43 | "function": {"arguments": '{ "x": '}, 44 | }, 45 | { 46 | "id": "call_2", 47 | "index": 1, 48 | "function": {"name": "add", "arguments": '{ "x": 1,'}, 49 | }, 50 | ] 51 | } 52 | ), 53 | ) 54 | 55 | # call_1 => multiply(x: 2, y: 3) 56 | # call_2 => add(x: 1, y: 57 | append_streaming_tool_calls( 58 | tool_calls, 59 | StreamingChatResponseMessageUpdate( 60 | { 61 | "tool_calls": [ 62 | { 63 | "index": 0, 64 | "function": {"arguments": '2, "y": 3}'}, 65 | }, 66 | { 67 | "index": 1, 68 | "function": {"name": "add", "arguments": ' "y":'}, 69 | }, 70 | ] 71 | } 72 | ), 73 | ) 74 | 75 | # call_1 => multiply(x: 2, y: 3) 76 | # call_2 => add(x: 1, y: 3) 77 | append_streaming_tool_calls( 78 | tool_calls, 79 | StreamingChatResponseMessageUpdate( 80 | { 81 | "tool_calls": [ 82 | { 83 | "index": 1, 84 | "function": {"name": "add", "arguments": " 3 }"}, 85 | }, 86 | ] 87 | } 88 | ), 89 | ) 90 | 91 | assert len(tool_calls) == 2 92 | 93 | assert tool_calls[0].id == "call_1" 94 | assert tool_calls[0].function.name == "multiply" 95 | assert json.loads(tool_calls[0].function.arguments) == {"x": 2, "y": 3} 96 | 97 | assert tool_calls[1].id == "call_2" 98 | assert tool_calls[1].function.name == "add" 99 | assert json.loads(tool_calls[1].function.arguments) == {"x": 1, "y": 3} 100 | 101 | 102 | @pytest.mark.parametrize("stream", [True, False]) 103 | def test_sync_uses_tools(stream): 104 | model = get_model("github/gpt-4o-mini") 105 | 106 | # Create a prompt with a tool 107 | def multiply(x: int, y: int) -> int: 108 | """Multiply two numbers.""" 109 | return x * y 110 | 111 | chain = model.chain("What is 34234 * 213345?", tools=[multiply], stream=stream).responses() # type: ignore 112 | 113 | tool_call_resp = next(chain) 114 | 115 | tool_calls = tool_call_resp.tool_calls() 116 | assert tool_calls is not None 117 | assert len(tool_calls) == 1 118 | assert tool_calls[0].name == "multiply" 119 | assert tool_calls[0].arguments == {"x": 34234, "y": 213345} 120 | 121 | # Sometimes it likes to add commas to the output number 122 | response_text = next(chain).text().replace(",", "") 123 | 
assert "7303652730" in response_text 124 | 125 | 126 | @pytest.mark.parametrize("stream", [True, False]) 127 | @pytest.mark.asyncio 128 | async def test_async_uses_tools(stream): 129 | model = get_async_model("github/gpt-4o-mini") 130 | 131 | # Create a prompt with a tool 132 | def multiply(x: int, y: int) -> int: 133 | """Multiply two numbers.""" 134 | return x * y 135 | 136 | chain = model.chain("What is 34234 * 213345?", tools=[multiply], stream=stream).responses() # type: ignore 137 | 138 | responses = [] 139 | async for resp in chain: 140 | responses.append(resp) 141 | 142 | tool_call_resp = responses[0] 143 | 144 | tool_calls = await tool_call_resp.tool_calls() 145 | assert tool_calls is not None 146 | assert len(tool_calls) == 1 147 | assert tool_calls[0].name == "multiply" 148 | assert tool_calls[0].arguments == {"x": 34234, "y": 213345} 149 | 150 | # Sometimes it likes to add commas to the output number 151 | response_text = (await responses[1].text()).replace(",", "") 152 | assert "7303652730" in response_text 153 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | # Updating models 2 | 3 | 1. `python ./download_models_json.py` 4 | 1. `python ./parse_models_json.py` 5 | 1. Copy CHAT_MODELS and EMBEDDING_MODELS to `../llm_github_models.py` 6 | 1. Run `ruff format llm_github_models.py` 7 | 1. Move `models.fragment.md` to `../README.md` 8 | -------------------------------------------------------------------------------- /tools/download_models_json.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import requests 4 | 5 | url = "https://api.catalog.azureml.ms/asset-gallery/v1.0/models" 6 | headers = {"Content-Type": "application/json"} 7 | filters = { 8 | "filters": [ 9 | {"field": "freePlayground", "operator": "eq", "values": ["true"]}, 10 | {"field": "labels", "operator": "eq", "values": ["latest"]}, 11 | ], 12 | "order": [{"field": "name", "direction": "asc"}], 13 | } 14 | 15 | all_models = [] 16 | continuation_token = None 17 | 18 | while True: 19 | payload = filters.copy() 20 | if continuation_token: 21 | payload["continuationToken"] = continuation_token 22 | 23 | print("Fetching models...") 24 | response = requests.post(url, headers=headers, json=payload) 25 | response.raise_for_status() 26 | 27 | data = response.json() 28 | all_models.extend(data.get("summaries", [])) 29 | 30 | continuation_token = data.get("continuationToken") 31 | if continuation_token: 32 | print(f"Continuation token: {continuation_token}") 33 | if not continuation_token: 34 | break 35 | 36 | print("Saving models to models.json...") 37 | with open("models.json", "w") as f: 38 | json.dump(all_models, f, indent=4) 39 | 40 | print(f"Saved {len(all_models)} models to models.json") 41 | -------------------------------------------------------------------------------- /tools/models.fragment.md: -------------------------------------------------------------------------------- 1 | ## Supported Models 2 | 3 | ### Chat Models 4 | 5 | | Model Name | Streaming | Schemas | Tools | Input Modalities | Output Modalities | 6 | |------------|-----------|---------|-------|------------------|-------------------| 7 | | AI21-Jamba-1.5-Large | ✅ | ❌ | ❌ | text | text | 8 | | AI21-Jamba-1.5-Mini | ✅ | ❌ | ❌ | text | text | 9 | | Codestral-2501 | ✅ | ❌ | ✅ | text | text | 10 | | Cohere-command-r | ✅ | ❌ | ✅ | text | text | 11 | | 
Cohere-command-r-08-2024 | ✅ | ❌ | ✅ | text | text | 12 | | Cohere-command-r-plus | ✅ | ❌ | ✅ | text | text | 13 | | Cohere-command-r-plus-08-2024 | ✅ | ❌ | ✅ | text | text | 14 | | DeepSeek-R1 | ✅ | ❌ | ❌ | text | text | 15 | | DeepSeek-V3 | ✅ | ❌ | ❌ | text | text | 16 | | DeepSeek-V3-0324 | ✅ | ❌ | ❌ | text | text | 17 | | Llama-3.2-11B-Vision-Instruct | ✅ | ❌ | ❌ | text, image, audio | text | 18 | | Llama-3.2-90B-Vision-Instruct | ✅ | ❌ | ❌ | text, image, audio | text | 19 | | Llama-3.3-70B-Instruct | ✅ | ❌ | ❌ | text | text | 20 | | Llama-4-Maverick-17B-128E-Instruct-FP8 | ✅ | ❌ | ❌ | text, image | text | 21 | | Llama-4-Scout-17B-16E-Instruct | ✅ | ❌ | ❌ | text, image | text | 22 | | MAI-DS-R1 | ✅ | ❌ | ❌ | text | text | 23 | | Meta-Llama-3-70B-Instruct | ✅ | ❌ | ❌ | text | text | 24 | | Meta-Llama-3-8B-Instruct | ✅ | ❌ | ❌ | text | text | 25 | | Meta-Llama-3.1-405B-Instruct | ✅ | ❌ | ❌ | text | text | 26 | | Meta-Llama-3.1-70B-Instruct | ✅ | ❌ | ❌ | text | text | 27 | | Meta-Llama-3.1-8B-Instruct | ✅ | ❌ | ❌ | text | text | 28 | | Ministral-3B | ✅ | ❌ | ✅ | text | text | 29 | | Mistral-Large-2411 | ✅ | ❌ | ✅ | text | text | 30 | | Mistral-Nemo | ✅ | ❌ | ✅ | text | text | 31 | | Mistral-large | ✅ | ❌ | ✅ | text | text | 32 | | Mistral-large-2407 | ✅ | ❌ | ✅ | text | text | 33 | | Mistral-small | ✅ | ❌ | ✅ | text | text | 34 | | Phi-3-medium-128k-instruct | ✅ | ❌ | ❌ | text | text | 35 | | Phi-3-medium-4k-instruct | ✅ | ❌ | ❌ | text | text | 36 | | Phi-3-mini-128k-instruct | ✅ | ❌ | ❌ | text | text | 37 | | Phi-3-mini-4k-instruct | ✅ | ❌ | ❌ | text | text | 38 | | Phi-3-small-128k-instruct | ✅ | ❌ | ❌ | text | text | 39 | | Phi-3-small-8k-instruct | ✅ | ❌ | ❌ | text | text | 40 | | Phi-3.5-MoE-instruct | ✅ | ❌ | ❌ | text | text | 41 | | Phi-3.5-mini-instruct | ✅ | ❌ | ❌ | text | text | 42 | | Phi-3.5-vision-instruct | ✅ | ❌ | ❌ | text, image | text | 43 | | Phi-4 | ✅ | ❌ | ❌ | text | text | 44 | | Phi-4-mini-instruct | ✅ | ❌ | ❌ | text | text | 45 | | Phi-4-mini-reasoning | ✅ | ❌ | ❌ | text | text | 46 | | Phi-4-multimodal-instruct | ✅ | ❌ | ❌ | audio, image, text | text | 47 | | Phi-4-reasoning | ✅ | ❌ | ❌ | text | text | 48 | | cohere-command-a | ✅ | ❌ | ✅ | text | text | 49 | | gpt-4.1 | ✅ | ✅ | ✅ | text, image | text | 50 | | gpt-4.1-mini | ✅ | ✅ | ✅ | text, image | text | 51 | | gpt-4.1-nano | ✅ | ✅ | ✅ | text, image | text | 52 | | gpt-4o | ✅ | ✅ | ✅ | text, image, audio | text | 53 | | gpt-4o-mini | ✅ | ✅ | ✅ | text, image, audio | text | 54 | | grok-3 | ✅ | ❌ | ✅ | text | text | 55 | | grok-3-mini | ✅ | ❌ | ✅ | text | text | 56 | | jais-30b-chat | ✅ | ❌ | ❌ | text | text | 57 | | mistral-medium-2505 | ✅ | ❌ | ✅ | text, image | text | 58 | | mistral-small-2503 | ✅ | ❌ | ✅ | text, image | text | 59 | | o1 | ❌ | ✅ | ✅ | text, image | text | 60 | | o1-mini | ❌ | ❌ | ❌ | text | text | 61 | | o1-preview | ❌ | ❌ | ❌ | text | text | 62 | | o3 | ✅ | ❌ | ✅ | text, image | text | 63 | | o3-mini | ❌ | ✅ | ✅ | text | text | 64 | | o4-mini | ✅ | ❌ | ✅ | text, image | text | 65 | 66 | ### AI21 Jamba 1.5 Large 67 | 68 | Usage: `llm -m github/AI21-Jamba-1.5-Large` 69 | 70 | **Publisher:** AI21 Labs 71 | 72 | **Description:** A 398B parameters (94B active) multilingual model, offering a 256K long context window, function calling, structured output, and grounded generation. 
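
As with every chat model in the table above, these models can be called directly from Python through the `llm` API. A minimal sketch, using `github/gpt-4o-mini` purely as an example and assuming a key is already configured (the plugin reads the `github` llm key, or the `GITHUB_MODELS_KEY` environment variable):

```python
import llm

# Any chat model id from the table above works here; gpt-4o-mini is just an example.
model = llm.get_model("github/gpt-4o-mini")
response = model.prompt("Suggest three one-word names for a pet salmon")
print(response.text())
```
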
73 | 74 | ### AI21 Jamba 1.5 Mini 75 | 76 | Usage: `llm -m github/AI21-Jamba-1.5-Mini` 77 | 78 | **Publisher:** AI21 Labs 79 | 80 | **Description:** A 52B parameters (12B active) multilingual model, offering a 256K long context window, function calling, structured output, and grounded generation. 81 | 82 | ### Codestral 25.01 83 | 84 | Usage: `llm -m github/Codestral-2501` 85 | 86 | **Publisher:** Mistral AI 87 | 88 | **Description:** Codestral 25.01 by Mistral AI is designed for code generation, supporting 80+ programming languages, and optimized for tasks like code completion and fill-in-the-middle 89 | 90 | ### Cohere Command R 91 | 92 | Usage: `llm -m github/Cohere-command-r` 93 | 94 | **Publisher:** Cohere 95 | 96 | **Description:** Command R is a scalable generative model targeting RAG and Tool Use to enable production-scale AI for enterprise. 97 | 98 | ### Cohere Command R 08-2024 99 | 100 | Usage: `llm -m github/Cohere-command-r-08-2024` 101 | 102 | **Publisher:** Cohere 103 | 104 | **Description:** Command R is a scalable generative model targeting RAG and Tool Use to enable production-scale AI for enterprise. 105 | 106 | ### Cohere Command R+ 107 | 108 | Usage: `llm -m github/Cohere-command-r-plus` 109 | 110 | **Publisher:** Cohere 111 | 112 | **Description:** Command R+ is a state-of-the-art RAG-optimized model designed to tackle enterprise-grade workloads. 113 | 114 | ### Cohere Command R+ 08-2024 115 | 116 | Usage: `llm -m github/Cohere-command-r-plus-08-2024` 117 | 118 | **Publisher:** Cohere 119 | 120 | **Description:** Command R+ is a state-of-the-art RAG-optimized model designed to tackle enterprise-grade workloads. 121 | 122 | ### Cohere Embed v3 English 123 | 124 | Usage: `llm -m github/Cohere-embed-v3-english` 125 | 126 | **Publisher:** Cohere 127 | 128 | **Description:** Cohere Embed English is the market's leading text representation model used for semantic search, retrieval-augmented generation (RAG), classification, and clustering. 129 | 130 | ### Cohere Embed v3 Multilingual 131 | 132 | Usage: `llm -m github/Cohere-embed-v3-multilingual` 133 | 134 | **Publisher:** Cohere 135 | 136 | **Description:** Cohere Embed Multilingual is the market's leading text representation model used for semantic search, retrieval-augmented generation (RAG), classification, and clustering. 137 | 138 | ### DeepSeek-R1 139 | 140 | Usage: `llm -m github/DeepSeek-R1` 141 | 142 | **Publisher:** DeepSeek 143 | 144 | **Description:** DeepSeek-R1 excels at reasoning tasks using a step-by-step training process, such as language, scientific reasoning, and coding tasks. 145 | 146 | ### DeepSeek-V3 147 | 148 | Usage: `llm -m github/DeepSeek-V3` 149 | 150 | **Publisher:** DeepSeek 151 | 152 | **Description:** A strong Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token. 153 | 154 | ### DeepSeek-V3-0324 155 | 156 | Usage: `llm -m github/DeepSeek-V3-0324` 157 | 158 | **Publisher:** DeepSeek 159 | 160 | **Description:** DeepSeek-V3-0324 demonstrates notable improvements over its predecessor, DeepSeek-V3, in several key aspects, including enhanced reasoning, improved function calling, and superior code generation capabilities. 161 | 162 | ### Llama-3.2-11B-Vision-Instruct 163 | 164 | Usage: `llm -m github/Llama-3.2-11B-Vision-Instruct` 165 | 166 | **Publisher:** Meta 167 | 168 | **Description:** Excels in image reasoning capabilities on high-res images for visual understanding apps. 
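
Vision-capable entries in the table (those with `image` among their input modalities, such as the model above) accept image attachments. A minimal sketch, where `photo.jpg` is a placeholder for any local image file and a `github` key is assumed to be configured:

```python
import llm

model = llm.get_model("github/Llama-3.2-11B-Vision-Instruct")
response = model.prompt(
    "Describe what is in this photo",
    # photo.jpg is a hypothetical local file; a remote image can be
    # passed instead with llm.Attachment(url="https://...").
    attachments=[llm.Attachment(path="photo.jpg")],
)
print(response.text())
```

On the command line, the same attachment can be supplied with the `-a` option.
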
169 | 170 | ### Llama-3.2-90B-Vision-Instruct 171 | 172 | Usage: `llm -m github/Llama-3.2-90B-Vision-Instruct` 173 | 174 | **Publisher:** Meta 175 | 176 | **Description:** Advanced image reasoning capabilities for visual understanding agentic apps. 177 | 178 | ### Llama-3.3-70B-Instruct 179 | 180 | Usage: `llm -m github/Llama-3.3-70B-Instruct` 181 | 182 | **Publisher:** Meta 183 | 184 | **Description:** Llama 3.3 70B Instruct offers enhanced reasoning, math, and instruction following with performance comparable to Llama 3.1 405B. 185 | 186 | ### Llama 4 Maverick 17B 128E Instruct FP8 187 | 188 | Usage: `llm -m github/Llama-4-Maverick-17B-128E-Instruct-FP8` 189 | 190 | **Publisher:** Meta 191 | 192 | **Description:** Llama 4 Maverick 17B 128E Instruct FP8 is great at precise image understanding and creative writing, offering high quality at a lower price compared to Llama 3.3 70B 193 | 194 | ### Llama 4 Scout 17B 16E Instruct 195 | 196 | Usage: `llm -m github/Llama-4-Scout-17B-16E-Instruct` 197 | 198 | **Publisher:** Meta 199 | 200 | **Description:** Llama 4 Scout 17B 16E Instruct is great at multi-document summarization, parsing extensive user activity for personalized tasks, and reasoning over vast codebases. 201 | 202 | ### MAI-DS-R1 203 | 204 | Usage: `llm -m github/MAI-DS-R1` 205 | 206 | **Publisher:** Microsoft 207 | 208 | **Description:** MAI-DS-R1 is a DeepSeek-R1 reasoning model that has been post-trained by the Microsoft AI team to fill in information gaps in the previous version of the model and improve its harm protections while maintaining R1 reasoning capabilities. 209 | 210 | ### Meta-Llama-3-70B-Instruct 211 | 212 | Usage: `llm -m github/Meta-Llama-3-70B-Instruct` 213 | 214 | **Publisher:** Meta 215 | 216 | **Description:** A powerful 70-billion parameter model excelling in reasoning, coding, and broad language applications. 217 | 218 | ### Meta-Llama-3-8B-Instruct 219 | 220 | Usage: `llm -m github/Meta-Llama-3-8B-Instruct` 221 | 222 | **Publisher:** Meta 223 | 224 | **Description:** A versatile 8-billion parameter model optimized for dialogue and text generation tasks. 225 | 226 | ### Meta-Llama-3.1-405B-Instruct 227 | 228 | Usage: `llm -m github/Meta-Llama-3.1-405B-Instruct` 229 | 230 | **Publisher:** Meta 231 | 232 | **Description:** The Llama 3.1 instruction tuned text only models are optimized for multilingual dialogue use cases and outperform many of the available open source and closed chat models on common industry benchmarks. 233 | 234 | ### Meta-Llama-3.1-70B-Instruct 235 | 236 | Usage: `llm -m github/Meta-Llama-3.1-70B-Instruct` 237 | 238 | **Publisher:** Meta 239 | 240 | **Description:** The Llama 3.1 instruction tuned text only models are optimized for multilingual dialogue use cases and outperform many of the available open source and closed chat models on common industry benchmarks. 241 | 242 | ### Meta-Llama-3.1-8B-Instruct 243 | 244 | Usage: `llm -m github/Meta-Llama-3.1-8B-Instruct` 245 | 246 | **Publisher:** Meta 247 | 248 | **Description:** The Llama 3.1 instruction tuned text only models are optimized for multilingual dialogue use cases and outperform many of the available open source and closed chat models on common industry benchmarks. 249 | 250 | ### Ministral 3B 251 | 252 | Usage: `llm -m github/Ministral-3B` 253 | 254 | **Publisher:** Mistral AI 255 | 256 | **Description:** Ministral 3B is a state-of-the-art Small Language Model (SLM) optimized for edge computing and on-device applications. 
As it is designed for low-latency and compute-efficient inference, it it also the perfect model for standard GenAI applications that have 257 | 258 | ### Mistral Large 24.11 259 | 260 | Usage: `llm -m github/Mistral-Large-2411` 261 | 262 | **Publisher:** Mistral AI 263 | 264 | **Description:** Mistral Large 24.11 offers enhanced system prompts, advanced reasoning and function calling capabilities. 265 | 266 | ### Mistral Nemo 267 | 268 | Usage: `llm -m github/Mistral-Nemo` 269 | 270 | **Publisher:** Mistral AI 271 | 272 | **Description:** Mistral Nemo is a cutting-edge Language Model (LLM) boasting state-of-the-art reasoning, world knowledge, and coding capabilities within its size category. 273 | 274 | ### Mistral Large 275 | 276 | Usage: `llm -m github/Mistral-large` 277 | 278 | **Publisher:** Mistral AI 279 | 280 | **Description:** Mistral's flagship model that's ideal for complex tasks that require large reasoning capabilities or are highly specialized (Synthetic Text Generation, Code Generation, RAG, or Agents). 281 | 282 | ### Mistral Large (2407) 283 | 284 | Usage: `llm -m github/Mistral-large-2407` 285 | 286 | **Publisher:** Mistral AI 287 | 288 | **Description:** Mistral Large (2407) is an advanced Large Language Model (LLM) with state-of-the-art reasoning, knowledge and coding capabilities. 289 | 290 | ### Mistral Small 291 | 292 | Usage: `llm -m github/Mistral-small` 293 | 294 | **Publisher:** Mistral AI 295 | 296 | **Description:** Mistral Small can be used on any language-based task that requires high efficiency and low latency. 297 | 298 | ### Phi-3-medium instruct (128k) 299 | 300 | Usage: `llm -m github/Phi-3-medium-128k-instruct` 301 | 302 | **Publisher:** Microsoft 303 | 304 | **Description:** Same Phi-3-medium model, but with a larger context size for RAG or few shot prompting. 305 | 306 | ### Phi-3-medium instruct (4k) 307 | 308 | Usage: `llm -m github/Phi-3-medium-4k-instruct` 309 | 310 | **Publisher:** Microsoft 311 | 312 | **Description:** A 14B parameters model, proves better quality than Phi-3-mini, with a focus on high-quality, reasoning-dense data. 313 | 314 | ### Phi-3-mini instruct (128k) 315 | 316 | Usage: `llm -m github/Phi-3-mini-128k-instruct` 317 | 318 | **Publisher:** Microsoft 319 | 320 | **Description:** Same Phi-3-mini model, but with a larger context size for RAG or few shot prompting. 321 | 322 | ### Phi-3-mini instruct (4k) 323 | 324 | Usage: `llm -m github/Phi-3-mini-4k-instruct` 325 | 326 | **Publisher:** Microsoft 327 | 328 | **Description:** Tiniest member of the Phi-3 family. Optimized for both quality and low latency. 329 | 330 | ### Phi-3-small instruct (128k) 331 | 332 | Usage: `llm -m github/Phi-3-small-128k-instruct` 333 | 334 | **Publisher:** Microsoft 335 | 336 | **Description:** Same Phi-3-small model, but with a larger context size for RAG or few shot prompting. 337 | 338 | ### Phi-3-small instruct (8k) 339 | 340 | Usage: `llm -m github/Phi-3-small-8k-instruct` 341 | 342 | **Publisher:** Microsoft 343 | 344 | **Description:** A 7B parameters model, proves better quality than Phi-3-mini, with a focus on high-quality, reasoning-dense data. 345 | 346 | ### Phi-3.5-MoE instruct (128k) 347 | 348 | Usage: `llm -m github/Phi-3.5-MoE-instruct` 349 | 350 | **Publisher:** Microsoft 351 | 352 | **Description:** A new mixture of experts model 353 | 354 | ### Phi-3.5-mini instruct (128k) 355 | 356 | Usage: `llm -m github/Phi-3.5-mini-instruct` 357 | 358 | **Publisher:** Microsoft 359 | 360 | **Description:** Refresh of Phi-3-mini model. 
361 | 362 | ### Phi-3.5-vision instruct (128k) 363 | 364 | Usage: `llm -m github/Phi-3.5-vision-instruct` 365 | 366 | **Publisher:** Microsoft 367 | 368 | **Description:** Refresh of Phi-3-vision model. 369 | 370 | ### Phi-4 371 | 372 | Usage: `llm -m github/Phi-4` 373 | 374 | **Publisher:** Microsoft 375 | 376 | **Description:** Phi-4 14B, a highly capable model for low latency scenarios. 377 | 378 | ### Phi-4-mini-instruct 379 | 380 | Usage: `llm -m github/Phi-4-mini-instruct` 381 | 382 | **Publisher:** Microsoft 383 | 384 | **Description:** 3.8B parameters Small Language Model outperforming larger models in reasoning, math, coding, and function-calling 385 | 386 | ### Phi-4-mini-reasoning 387 | 388 | Usage: `llm -m github/Phi-4-mini-reasoning` 389 | 390 | **Publisher:** Microsoft 391 | 392 | **Description:** Lightweight math reasoning model optimized for multi-step problem solving 393 | 394 | ### Phi-4-multimodal-instruct 395 | 396 | Usage: `llm -m github/Phi-4-multimodal-instruct` 397 | 398 | **Publisher:** Microsoft 399 | 400 | **Description:** First small multimodal model to have 3 modality inputs (text, audio, image), excelling in quality and efficiency 401 | 402 | ### Phi-4-Reasoning 403 | 404 | Usage: `llm -m github/Phi-4-reasoning` 405 | 406 | **Publisher:** Microsoft 407 | 408 | **Description:** State-of-the-art open-weight reasoning model. 409 | 410 | ### Cohere Command A 411 | 412 | Usage: `llm -m github/cohere-command-a` 413 | 414 | **Publisher:** Cohere 415 | 416 | **Description:** Command A is a highly efficient generative model that excels at agentic and multilingual use cases. 417 | 418 | ### Cohere Embed 4 419 | 420 | Usage: `llm -m github/embed-v-4-0` 421 | 422 | **Publisher:** Cohere 423 | 424 | **Description:** Embed 4 transforms texts and images into numerical vectors 425 | 426 | ### OpenAI GPT-4.1 427 | 428 | Usage: `llm -m github/gpt-4.1` 429 | 430 | **Publisher:** OpenAI 431 | 432 | **Description:** gpt-4.1 outperforms gpt-4o across the board, with major gains in coding, instruction following, and long-context understanding 433 | 434 | ### OpenAI GPT-4.1-mini 435 | 436 | Usage: `llm -m github/gpt-4.1-mini` 437 | 438 | **Publisher:** OpenAI 439 | 440 | **Description:** gpt-4.1-mini outperform gpt-4o-mini across the board, with major gains in coding, instruction following, and long-context handling 441 | 442 | ### OpenAI GPT-4.1-nano 443 | 444 | Usage: `llm -m github/gpt-4.1-nano` 445 | 446 | **Publisher:** OpenAI 447 | 448 | **Description:** gpt-4.1-nano provides gains in coding, instruction following, and long-context handling along with lower latency and cost 449 | 450 | ### OpenAI GPT-4o 451 | 452 | Usage: `llm -m github/gpt-4o` 453 | 454 | **Publisher:** OpenAI 455 | 456 | **Description:** OpenAI's most advanced multimodal model in the gpt-4o family. Can handle both text and image inputs. 457 | 458 | ### OpenAI GPT-4o mini 459 | 460 | Usage: `llm -m github/gpt-4o-mini` 461 | 462 | **Publisher:** OpenAI 463 | 464 | **Description:** An affordable, efficient AI solution for diverse text and image tasks. 465 | 466 | ### Grok 3 467 | 468 | Usage: `llm -m github/grok-3` 469 | 470 | **Publisher:** xAI 471 | 472 | **Description:** Grok 3 is xAI's debut model, pretrained by Colossus at supermassive scale to excel in specialized domains like finance, healthcare, and the law. 
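
Models with a ✅ in the Tools column above, including Grok 3 and the GPT-4o and GPT-4.1 families, can call Python functions passed as tools. A minimal sketch of a single tool call, assuming a configured `github` key:

```python
import llm


def multiply(x: int, y: int) -> int:
    """Multiply two numbers."""
    return x * y


model = llm.get_model("github/gpt-4o-mini")
# chain() lets the model call the tool and then continue with the tool's result.
chain = model.chain("What is 34234 * 213345?", tools=[multiply])
print(chain.text())
```
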
473 | 474 | ### Grok 3 Mini 475 | 476 | Usage: `llm -m github/grok-3-mini` 477 | 478 | **Publisher:** xAI 479 | 480 | **Description:** Grok 3 Mini is a lightweight model that thinks before responding. Trained on mathematic and scientific problems, it is great for logic-based tasks. 481 | 482 | ### JAIS 30b Chat 483 | 484 | Usage: `llm -m github/jais-30b-chat` 485 | 486 | **Publisher:** Core42 487 | 488 | **Description:** JAIS 30b Chat is an auto-regressive bilingual LLM for Arabic & English with state-of-the-art capabilities in Arabic. 489 | 490 | ### Mistral Medium 3 (25.05) 491 | 492 | Usage: `llm -m github/mistral-medium-2505` 493 | 494 | **Publisher:** Mistral AI 495 | 496 | **Description:** Mistral Medium 3 is an advanced Large Language Model (LLM) with state-of-the-art reasoning, knowledge, coding and vision capabilities. 497 | 498 | ### Mistral Small 3.1 499 | 500 | Usage: `llm -m github/mistral-small-2503` 501 | 502 | **Publisher:** Mistral AI 503 | 504 | **Description:** Enhanced Mistral Small 3 with multimodal capabilities and a 128k context length. 505 | 506 | ### OpenAI o1 507 | 508 | Usage: `llm -m github/o1` 509 | 510 | **Publisher:** OpenAI 511 | 512 | **Description:** Focused on advanced reasoning and solving complex problems, including math and science tasks. Ideal for applications that require deep contextual understanding and agentic workflows. 513 | 514 | ### OpenAI o1-mini 515 | 516 | Usage: `llm -m github/o1-mini` 517 | 518 | **Publisher:** OpenAI 519 | 520 | **Description:** Smaller, faster, and 80% cheaper than o1-preview, performs well at code generation and small context operations. 521 | 522 | ### OpenAI o1-preview 523 | 524 | Usage: `llm -m github/o1-preview` 525 | 526 | **Publisher:** OpenAI 527 | 528 | **Description:** Focused on advanced reasoning and solving complex problems, including math and science tasks. Ideal for applications that require deep contextual understanding and agentic workflows. 529 | 530 | ### OpenAI o3 531 | 532 | Usage: `llm -m github/o3` 533 | 534 | **Publisher:** OpenAI 535 | 536 | **Description:** o3 includes significant improvements on quality and safety while supporting the existing features of o1 and delivering comparable or better performance. 537 | 538 | ### OpenAI o3-mini 539 | 540 | Usage: `llm -m github/o3-mini` 541 | 542 | **Publisher:** OpenAI 543 | 544 | **Description:** o3-mini includes the o1 features with significant cost-efficiencies for scenarios requiring high performance. 545 | 546 | ### OpenAI o4-mini 547 | 548 | Usage: `llm -m github/o4-mini` 549 | 550 | **Publisher:** OpenAI 551 | 552 | **Description:** o4-mini includes significant improvements on quality and safety while supporting the existing features of o3-mini and delivering comparable or better performance. 553 | 554 | ### OpenAI Text Embedding 3 (large) 555 | 556 | Usage: `llm -m github/text-embedding-3-large` 557 | 558 | **Publisher:** OpenAI 559 | 560 | **Description:** Text-embedding-3 series models are the latest and most capable embedding model from OpenAI. 561 | 562 | ### OpenAI Text Embedding 3 (small) 563 | 564 | Usage: `llm -m github/text-embedding-3-small` 565 | 566 | **Publisher:** OpenAI 567 | 568 | **Description:** Text-embedding-3 series models are the latest and most capable embedding model from OpenAI. 
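
The embedding models above are registered as `github/<model name>`, and, where a model supports a reduced `dimensions` option, also under a `github/<model name>-<dimensions>` variant. A minimal sketch of embedding a string from Python, assuming a configured `github` key:

```python
import llm

embedding_model = llm.get_embedding_model("github/text-embedding-3-small")
vector = embedding_model.embed("A short sentence to embed")
# The result is a plain list of floats.
print(len(vector), vector[:5])
```

The command line equivalent is roughly `llm embed -m github/text-embedding-3-small -c "A short sentence to embed"`.
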
569 | 570 | -------------------------------------------------------------------------------- /tools/parse_models_json.py: -------------------------------------------------------------------------------- 1 | """ 2 | A script to parse the models.json from the github API until there is a live API to call. 3 | """ 4 | 5 | import json 6 | from pprint import pprint 7 | 8 | chat_models = [] 9 | embedding_models = [] 10 | 11 | 12 | def supports_streaming(name): 13 | if name in ["o1", "o1-mini", "o1-preview", "o3-mini"]: 14 | return False 15 | return True 16 | 17 | 18 | def supports_schemas(name): 19 | if name in [ 20 | "gpt-4o", 21 | "gpt-4o-mini", 22 | "gpt-4.1", 23 | "gpt-4.1-mini", 24 | "gpt-4.1-nano", 25 | "o1", 26 | "o3-mini", 27 | ]: 28 | return True 29 | return False 30 | 31 | 32 | def requires_usage_stream_option(name): 33 | return name in [ 34 | "gpt-4o", 35 | "gpt-4o-mini", 36 | "gpt-4.1", 37 | "gpt-4.1-mini", 38 | "gpt-4.1-nano", 39 | "o3", 40 | "o4-mini", 41 | ] 42 | 43 | 44 | def supports_tools(name): 45 | # Note: this list does not line up with the official docs at 46 | # https://learn.microsoft.com/en-us/azure/machine-learning/concept-models-featured?view=azureml-api-2 47 | # But in practice these are the models that work. 48 | tool_supporting_models = [ 49 | "o3", 50 | "o3-mini", 51 | "o4-mini", 52 | "o1", 53 | "gpt-4o", 54 | "gpt-4o-mini", 55 | "gpt-4.1", 56 | "gpt-4.1-mini", 57 | "gpt-4.1-nano", 58 | "grok-3", 59 | "grok-3-mini", 60 | "cohere-command-a", 61 | "Cohere-command-r-plus-08-2024", 62 | "Cohere-command-r-08-2024", 63 | "Cohere-command-r-plus", 64 | "Cohere-command-r", 65 | "Codestral-2501", 66 | "Ministral-3B", 67 | "Mistral-Nemo", 68 | "Mistral-Large-2411", 69 | "Mistral-large-2407", 70 | "Mistral-large", 71 | "mistral-medium-2505", 72 | "mistral-small-2503", 73 | "Mistral-small", 74 | ] 75 | return name in tool_supporting_models 76 | 77 | 78 | with open("models.json", "r", encoding="utf-8") as f: 79 | models = json.load(f) 80 | for model in models: 81 | if "chat-completion" in model["inferenceTasks"]: 82 | chat_models.append( 83 | ( 84 | model["name"], 85 | supports_streaming(model["name"]), 86 | supports_schemas(model["name"]), 87 | requires_usage_stream_option(model["name"]), 88 | supports_tools(model["name"]), 89 | model["modelLimits"]["supportedInputModalities"], 90 | model["modelLimits"]["supportedOutputModalities"], 91 | ) 92 | ) 93 | elif "embeddings" in model["inferenceTasks"]: 94 | embedding_models.append(model["name"]) 95 | else: 96 | print("Not sure what to do with this model: ", model["name"]) 97 | 98 | print("Chat models:") 99 | # sort by name 100 | chat_models = sorted(chat_models, key=lambda x: x[0]) 101 | pprint(chat_models, indent=4, width=999) 102 | print("Embedding models:") 103 | # sort by name 104 | embedding_models = sorted(embedding_models) 105 | pprint(embedding_models, indent=4) 106 | 107 | # Make a Markdown series for the models 108 | 109 | with open("models.fragment.md", "w", encoding="utf-8") as f: 110 | f.write("## Supported Models\n\n") 111 | 112 | # Add chat models table 113 | f.write("### Chat Models\n\n") 114 | f.write("| Model Name | Streaming | Schemas | Tools | Input Modalities | Output Modalities |\n") 115 | f.write("|------------|-----------|---------|-------|------------------|-------------------|\n") 116 | 117 | for ( 118 | model_name, 119 | streaming, 120 | schemas, 121 | usage_stream, 122 | tools, 123 | input_modalities, 124 | output_modalities, 125 | ) in chat_models: 126 | streaming_str = "✅" if streaming else "❌" 127 | 
schemas_str = "✅" if schemas else "❌"
128 |         tools_str = "✅" if tools else "❌"
129 |         input_str = ", ".join(input_modalities) if input_modalities else "text"
130 |         output_str = ", ".join(output_modalities) if output_modalities else "text"
131 | 
132 |         f.write(
133 |             f"| {model_name} | {streaming_str} | {schemas_str} |"
134 |             f" {tools_str} | {input_str} | {output_str} |\n"
135 |         )
136 | 
137 |     f.write("\n")
138 | 
139 |     for model in models:
140 |         f.write(f"### {model['displayName']}\n\n")
141 |         f.write(f"Usage: `llm -m github/{model['name']}`\n\n")
142 |         f.write(f"**Publisher:** {model['publisher']} \n\n")
143 |         # Build the summary outside the f-string so the backslash escapes in
144 |         # the replacement also work on Python versions older than 3.12.
145 |         summary = model["summary"].replace("\n## ", "\n#### ")
146 |         f.write(f"**Description:** {summary} \n\n")
147 | 
--------------------------------------------------------------------------------