├── .github ├── actions │ └── server-testing │ │ └── action.yml └── workflows │ ├── publish-to-test-pypi.yml │ ├── server_installer_windows_latest.yml │ ├── test_lemonade.yml │ ├── test_lemonade_oga_cpu.yml │ ├── test_quark.yml │ └── test_server.yml ├── .lfsconfig ├── .pylintrc ├── LICENSE ├── NOTICE.md ├── README.md ├── docs ├── CNAME ├── README.md ├── assets │ ├── carousel.js │ ├── extra.css │ ├── favicon.ico │ ├── install-selector.js │ ├── logo.png │ ├── mkdocs_requirements.txt │ └── website-styles.css ├── code.md ├── contribute.md ├── dev_cli │ ├── README.md │ ├── humaneval_accuracy.md │ ├── llamacpp.md │ ├── lm-eval.md │ ├── mmlu_accuracy.md │ ├── ort_genai_igpu.md │ ├── perplexity.md │ └── quark.md ├── favicon.ico ├── index.html ├── install_options.html ├── lemonade_api.md ├── publish_website_docs.py ├── server │ ├── README.md │ ├── apps │ │ ├── README.md │ │ ├── ai-dev-gallery.md │ │ ├── ai-toolkit.md │ │ ├── anythingLLM.md │ │ ├── codeGPT.md │ │ ├── continue.md │ │ ├── lm-eval.md │ │ ├── mindcraft.md │ │ ├── open-webui.md │ │ └── wut.md │ ├── concepts.md │ ├── lemonade-server-cli.md │ ├── server_integration.md │ ├── server_models.md │ └── server_spec.md └── versioning.md ├── examples ├── README.md ├── api_basic.py ├── api_oga_cpu.py ├── api_oga_cpu_streaming.py ├── api_oga_hybrid.py ├── api_oga_hybrid_streaming.py ├── api_oga_igpu.py ├── api_oga_igpu_streaming.py ├── api_oga_npu.py ├── api_oga_npu_streaming.py ├── api_streaming.py ├── demos │ ├── README.md │ ├── chat │ │ ├── chat_hybrid.py │ │ └── chat_start.py │ └── search │ │ ├── search_hybrid.py │ │ └── search_start.py └── notebooks │ └── lemonade_model_validation.ipynb ├── img ├── basic_demo.gif └── llm_demo.png ├── installer ├── AMD_LICENSE ├── Installer.nsi ├── add_to_path.py ├── installer_banner.bmp ├── lemonade-server.bat ├── lemonade_notification.vbs └── lemonade_server.vbs ├── mkdocs.yml ├── setup.py ├── src ├── lemonade │ ├── __init__.py │ ├── api.py │ ├── cache.py │ ├── cli.py │ ├── common │ │ ├── __init__.py │ │ ├── build.py │ │ ├── cli_helpers.py │ │ ├── exceptions.py │ │ ├── filesystem.py │ │ ├── inference_engines.py │ │ ├── network.py │ │ ├── printing.py │ │ ├── status.py │ │ ├── system_info.py │ │ └── test_helpers.py │ ├── profilers │ │ ├── __init__.py │ │ ├── memory_tracker.py │ │ └── profiler.py │ ├── sequence.py │ ├── state.py │ ├── tools │ │ ├── __init__.py │ │ ├── accuracy.py │ │ ├── adapter.py │ │ ├── bench.py │ │ ├── huggingface │ │ │ ├── bench.py │ │ │ ├── load.py │ │ │ └── utils.py │ │ ├── humaneval.py │ │ ├── llamacpp │ │ │ ├── bench.py │ │ │ └── load.py │ │ ├── management_tools.py │ │ ├── mmlu.py │ │ ├── oga │ │ │ ├── __init__.py │ │ │ ├── bench.py │ │ │ ├── load.py │ │ │ └── utils.py │ │ ├── perplexity.py │ │ ├── prompt.py │ │ ├── quark │ │ │ ├── __init__.py │ │ │ ├── quark_load.py │ │ │ └── quark_quantize.py │ │ ├── report │ │ │ ├── __init__.py │ │ │ ├── llm_report.py │ │ │ └── table.py │ │ ├── server │ │ │ ├── __init__.py │ │ │ ├── llamacpp.py │ │ │ ├── serve.py │ │ │ ├── static │ │ │ │ ├── favicon.ico │ │ │ │ ├── styles.css │ │ │ │ └── webapp.html │ │ │ ├── tool_calls.py │ │ │ ├── tray.py │ │ │ ├── utils │ │ │ │ ├── port.py │ │ │ │ ├── system_tray.py │ │ │ │ └── thread.py │ │ │ └── webapp.py │ │ └── tool.py │ └── version.py ├── lemonade_install │ ├── __init__.py │ └── install.py └── lemonade_server │ ├── cli.py │ ├── model_manager.py │ ├── pydantic_models.py │ └── server_models.json └── test ├── llm_api.py ├── oga_cpu_api.py ├── quark_api.py ├── server.py ├── server_cli.py └── server_unit.py 
/.github/workflows/publish-to-test-pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python distributions to PyPI 2 | 3 | on: 4 | push: 5 | branches: ["main", "canary", "refresh"] 6 | tags: 7 | - v* 8 | - RC* 9 | pull_request: 10 | merge_group: 11 | 12 | jobs: 13 | build-n-publish: 14 | name: Build and publish Python distributions to PyPI 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@main 18 | - uses: conda-incubator/setup-miniconda@v3 19 | with: 20 | miniconda-version: "latest" 21 | activate-environment: lemon 22 | python-version: "3.10" 23 | - name: Install pypa/build 24 | run: >- 25 | python -m pip install build --user 26 | - name: Build a binary wheel and a source tarball 27 | run: | 28 | python -m build --sdist --wheel --outdir dist/ . 29 | version=$(python setup.py --version) 30 | echo "VERSION=$version" >> $GITHUB_ENV 31 | - name: Test wheel 32 | shell: bash -el {0} 33 | run: | 34 | python -m pip install --upgrade pip 35 | pip install "dist/lemonade_sdk-${{ env.VERSION }}-py3-none-any.whl[dev]" 36 | lemonade -i facebook/opt-125m huggingface-load llm-prompt -p "Hello, my thoughts are" 37 | - name: Publish distribution package to PyPI 38 | if: startsWith(github.ref, 'refs/tags/v') 39 | uses: pypa/gh-action-pypi-publish@release/v1 40 | with: 41 | password: ${{ secrets.PYPI_API_TOKEN }} 42 | - name: Publish distribution package to Test PyPI 43 | if: startsWith(github.ref, 'refs/tags/RC') 44 | uses: pypa/gh-action-pypi-publish@release/v1 45 | with: 46 | password: ${{ secrets.TEST_PYPI_API_TOKEN }} 47 | repository_url: https://test.pypi.org/legacy/ 48 | 49 | # This file was originally licensed under Apache 2.0. It has been modified. 50 | # Modifications Copyright (c) 2025 AMD -------------------------------------------------------------------------------- /.github/workflows/test_lemonade.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Lint and Test Lemonade 5 | 6 | on: 7 | push: 8 | branches: ["main"] 9 | pull_request: 10 | merge_group: 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | make-lemonade: 17 | env: 18 | LEMONADE_CI_MODE: "True" 19 | strategy: 20 | matrix: 21 | os: [ubuntu-latest, windows-latest] 22 | runs-on: ${{ matrix.os }} 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Miniconda with 64-bit Python 26 | uses: conda-incubator/setup-miniconda@v2 27 | with: 28 | miniconda-version: "latest" 29 | activate-environment: lemon 30 | python-version: "3.10" 31 | run-post: "false" 32 | - name: Install dependencies 33 | shell: bash -el {0} 34 | run: | 35 | python -m pip install --upgrade pip 36 | pip install pylint 37 | python -m pip check 38 | pip install -e .[dev] 39 | - name: Lint with Black 40 | uses: psf/black@stable 41 | with: 42 | options: "--check --verbose" 43 | src: "./src" 44 | - name: Lint with PyLint 45 | shell: bash -el {0} 46 | run: | 47 | pylint src/lemonade --rcfile .pylintrc --disable E0401 48 | pylint examples --rcfile .pylintrc --disable E0401,E0611,F0010 --jobs=1 -v 49 | - name: Run lemonade tests 50 | shell: bash -el {0} 51 | run: | 52 | # Test CLI 53 | lemonade -m -i facebook/opt-125m huggingface-load llm-prompt -p "hi" --max-new-tokens 10 54 | 55 | # Test low-level APIs 56 | 
python test/llm_api.py 57 | 58 | # Test high-level APIs 59 | python examples/api_basic.py 60 | python examples/api_streaming.py 61 | 62 | # This file was originally licensed under Apache 2.0. It has been modified. 63 | # Modifications Copyright (c) 2025 AMD -------------------------------------------------------------------------------- /.github/workflows/test_lemonade_oga_cpu.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Lint and Test Lemonade for OGA on CPU 5 | 6 | on: 7 | push: 8 | branches: ["main"] 9 | pull_request: 10 | merge_group: 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | make-oga-cpu-lemonade: 17 | env: 18 | LEMONADE_CI_MODE: "True" 19 | runs-on: windows-latest 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Set up Miniconda with 64-bit Python 23 | uses: conda-incubator/setup-miniconda@v2 24 | with: 25 | miniconda-version: "latest" 26 | activate-environment: lemon 27 | python-version: "3.10" 28 | run-post: "false" 29 | - name: Install dependencies 30 | shell: bash -el {0} 31 | run: | 32 | python -m pip install --upgrade pip 33 | conda install pylint 34 | python -m pip check 35 | pip install -e .[dev,oga-cpu] 36 | - name: Lint with Black 37 | uses: psf/black@stable 38 | with: 39 | options: "--check --verbose" 40 | src: "./src" 41 | - name: Lint with PyLint 42 | shell: bash -el {0} 43 | run: | 44 | pylint src/lemonade --rcfile .pylintrc --disable E0401 45 | - name: Run lemonade tests 46 | shell: bash -el {0} 47 | env: 48 | HF_TOKEN: "${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}" # Required by OGA model_builder in OGA 0.4.0 but not future versions 49 | run: | 50 | # Test CLI 51 | lemonade -i amd/Qwen2.5-0.5B-Instruct-quantized_int4-float16-cpu-onnx oga-load --device cpu --dtype int4 llm-prompt -p "tell me a story" --max-new-tokens 5 52 | 53 | # Test low-level APIs 54 | python test/oga_cpu_api.py 55 | 56 | # Test high-level APIs 57 | python examples/api_oga_cpu.py 58 | python examples/api_oga_cpu_streaming.py 59 | 60 | # This file was originally licensed under Apache 2.0. It has been modified. 
61 | # Modifications Copyright (c) 2025 AMD -------------------------------------------------------------------------------- /.github/workflows/test_quark.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Test Lemonade with Quark Quantization 5 | 6 | on: 7 | push: 8 | branches: ["main"] 9 | pull_request: 10 | merge_group: 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | make-quark-lemonade: 17 | env: 18 | LEMONADE_CI_MODE: "True" 19 | runs-on: windows-latest 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Set up Miniconda with 64-bit Python 23 | uses: conda-incubator/setup-miniconda@v2 24 | with: 25 | miniconda-version: "latest" 26 | activate-environment: lemon 27 | python-version: "3.10" 28 | run-post: "false" 29 | - name: Install dependencies 30 | shell: bash -el {0} 31 | run: | 32 | python -m pip install --upgrade pip 33 | conda install pylint 34 | python -m pip check 35 | pip install -e .[dev,oga-cpu] 36 | lemonade-install --quark 0.6.0 37 | - name: Lint with Black 38 | uses: psf/black@stable 39 | with: 40 | options: "--check --verbose" 41 | src: "./src" 42 | - name: Lint with PyLint 43 | shell: bash -el {0} 44 | run: | 45 | pylint src/lemonade/tools/quark --rcfile .pylintrc --disable E0401 46 | - name: Run lemonade tests 47 | shell: bash -el {0} 48 | env: 49 | HF_TOKEN: "${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}" # Required by OGA model_builder in OGA 0.4.0 but not future versions 50 | run: | 51 | python test/quark_api.py 52 | 53 | # This file was originally licensed under Apache 2.0. It has been modified. 
54 | # Modifications Copyright (c) 2025 AMD -------------------------------------------------------------------------------- /.github/workflows/test_server.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Test Lemonade Server 5 | 6 | on: 7 | push: 8 | branches: ["main"] 9 | pull_request: 10 | merge_group: 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | make-server-lemonade: 17 | env: 18 | LEMONADE_CI_MODE: "True" 19 | strategy: 20 | matrix: 21 | python-version: ["3.10", "3.12"] 22 | os: [ubuntu-latest, windows-latest] 23 | runs-on: ${{ matrix.os }} 24 | steps: 25 | - uses: actions/checkout@v3 26 | - name: Set up Miniconda with 64-bit Python 27 | uses: conda-incubator/setup-miniconda@v2 28 | with: 29 | miniconda-version: "latest" 30 | activate-environment: lemon 31 | python-version: ${{ matrix.python-version }} 32 | run-post: "false" 33 | - name: Install dependencies 34 | shell: bash -el {0} 35 | run: | 36 | python -m pip install --upgrade pip 37 | python -m pip check 38 | pip install -e .[dev,oga-cpu] 39 | lemonade-server-dev pull Qwen2.5-0.5B-Instruct-CPU 40 | - name: Run server tests (unit tests) 41 | shell: bash -el {0} 42 | run: | 43 | python test/server_unit.py 44 | - name: Run server tests (network online mode) 45 | shell: bash -el {0} 46 | run: | 47 | python test/server.py 48 | - name: Run server tests (offline mode) 49 | shell: bash -el {0} 50 | run: | 51 | python test/server.py --offline 52 | 53 | # This file was originally licensed under Apache 2.0. It has been modified. 54 | # Modifications Copyright (c) 2025 AMD 55 | -------------------------------------------------------------------------------- /.lfsconfig: -------------------------------------------------------------------------------- 1 | [lfs] 2 | fetchexclude = *.onnx,*_model.zip 3 | -------------------------------------------------------------------------------- /NOTICE.md: -------------------------------------------------------------------------------- 1 | PORTIONS LICENSED AS FOLLOWS 2 | 3 | Lemonade SDK used the [ONNX TurnkeyML](https://github.com/onnx/turnkeyml) project as a starting point under the [Apache 2.0 license](./LICENSE). 4 | 5 | ## TurnkeyML Attribution 6 | 7 | TurnkeyML used code from other open source projects as a starting point (see [NOTICE.md](NOTICE.md)). Thank you Philip Colangelo, Derek Elkins, Jeremy Fowers, Dan Gard, Victoria Godsoe, Mark Heaps, Daniel Holanda, Brian Kurtz, Mariah Larwood, Philip Lassen, Andrew Ling, Adrian Macias, Gary Malik, Sarah Massengill, Ashwin Murthy, Hatice Ozen, Tim Sears, Sean Settle, Krishna Sivakumar, Aviv Weinstein, Xueli Xao, Bill Xing, and Lev Zlotnik for your contributions to that work. 8 | 9 | \> TurnkeyML used code from the [MLAgility](https://github.com/groq/mlagility) and [GroqFlow](https://github.com/groq/groqflow) projects as a starting point. Much of that code was refactored, improved, or replaced by the time TurnkeyML was published. 10 | 11 | \> TurnkeyML uses the [Microsoft lemon emoji](https://github.com/microsoft/fluentui-emoji) as an icon for the lemoande tool. 12 | 13 | >The MIT License 14 | > 15 | >Copyright 2023 Groq Inc. 
16 | > 17 | >Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 18 | > 19 | >The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 20 | > 21 | >THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /docs/CNAME: -------------------------------------------------------------------------------- 1 | lemonade-server.ai -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # 🍋 Lemonade SDK 2 | 3 | Welcome to the documentation for the Lemonade SDK project! Use this resource to learn more about the server, CLI, API, and how to contribute to the project. 4 | 5 | <div class="hide-in-mkdocs"> 6 | 7 | - [Installation](#installation) 8 | - [Server](#server) 9 | - [Developer CLI](#developer-cli) 10 | - [Lemonade API](#lemonade-api) 11 | - [Software and Hardware Overview](#software-and-hardware-overview) 12 | - [Supported Hardware Accelerators](#supported-hardware-accelerators) 13 | - [Supported Inference Engines](#supported-inference-engines) 14 | - [Contributing](#contributing) 15 | </div> 16 | 17 | ## Installation 18 | 19 | 20 | [Click here for Lemonade SDK installation options](https://lemonade-server.ai/install_options.html). 21 | 22 | For a quick start with Hugging Face (PyTorch) LLMs on CPU, run the following installation commands in an active Python 3 environment, and then try the Server, CLI, or API links below. 23 | 24 | ```bash 25 | pip install lemonade-sdk[dev] 26 | ``` 27 | 28 | ## Server 29 | 30 | The Lemonade Server is an OpenAI API-compatible HTTP server that supports streamlined integration with a wide variety of LLM applications. Learn more in [server documentation](https://lemonade-server.ai/docs/). 31 | 32 | ## Developer CLI 33 | 34 | The Lemonade developer CLI, `lemonade`, offers tools for performance benchmarking, accuracy evaluation, and device-specific model preparation. Learn more in the dev CLI [README.md](./dev_cli/README.md). 35 | 36 | ## Lemonade API 37 | 38 | The high-level Lemonade API abstracts loading models from any supported framework (e.g., Hugging Face, OGA) and backend (e.g., CPU, Hybrid) using the popular `from_pretrained()` function. This makes it easy to integrate Lemonade LLMs into Python applications. For more information on recipes and compatibility, see the [Lemonade API ReadMe](./lemonade_api.md). 
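Hugging Face CPU (a minimal sketch; the `hf-cpu` recipe name and the `facebook/opt-125m` checkpoint are illustrative, so check the Lemonade API ReadMe for the recipes supported by your install):
```python
from lemonade.api import from_pretrained

# Load a small Hugging Face (PyTorch) model on CPU. The "hf-cpu" recipe name is an
# assumption here; see the Lemonade API ReadMe for the authoritative recipe list.
model, tokenizer = from_pretrained("facebook/opt-125m", recipe="hf-cpu")

input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids
response = model.generate(input_ids, max_new_tokens=30)

print(tokenizer.decode(response[0]))
```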
39 | 40 | OGA Hybrid: 41 | ```python 42 | from lemonade.api import from_pretrained 43 | 44 | model, tokenizer = from_pretrained("amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid", recipe="oga-hybrid") 45 | 46 | input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids 47 | response = model.generate(input_ids, max_new_tokens=30) 48 | 49 | print(tokenizer.decode(response[0])) 50 | ``` 51 | 52 | You can find examples for the high-level APIs [here](https://github.com/lemonade-sdk/lemonade/tree/main/examples). 53 | 54 | ## Software and Hardware Overview 55 | 56 | The goal of Lemonade is to help achieve maximum LLM performance on your PC. To cover a wide range of PCs, Lemonade supports a wide variety of hardware accelerators and inference engines described in the subsections below. 57 | 58 | ### Supported Hardware Accelerators 59 | 60 | | Mode | Description | 61 | | :--- | :--- | 62 | | **NPU & Hybrid** | Ryzen™ AI 300-series devices have a neural processing unit (NPU) that can run LLMs and accelerate time-to-first-token (TTFT) performance. The typical way of utilizing the NPU is called *hybrid execution*, where the prompt is processed on the NPU to produce the first token, and the remaining tokens are computed on the Ryzen AI integrated GPU (iGPU). | 63 | | **GPU** | PCs with an integrated GPU (iGPU), such as many laptop SoCs, and/or discrete GPU (dGPU), such as many desktop and workstation PCs, can run LLMs on that GPU hardware. Lemonade Server provides GPU support in every installation via the Vulkan llama.cpp binaries.<br/><br/> <sub>Note: GPU support is not currently provided for CLI tasks such as benchmarking.</sub> | 64 | 65 | ### Supported Inference Engines 66 | | Engine | Description | 67 | | :--- | :--- | 68 | | **OnnxRuntime GenAI (OGA)** | Microsoft engine that runs `.onnx` models and enables hardware vendors to provide their own execution providers (EPs) to support specialized hardware, such as neural processing units (NPUs). | 69 | | **llamacpp** | Community-driven engine with strong GPU acceleration, support for thousands of `.gguf` models, and advanced features such as vision-language models (VLMs) and mixture-of-experts (MoEs). | 70 | | **Hugging Face (HF)** | Hugging Face's `transformers` library can run the original `.safetensors` trained weights for models on Meta's PyTorch engine, which provides a source of truth for accuracy measurement. | 71 | 72 | ## Contributing 73 | 74 | Contributions are welcome! If you decide to contribute, please: 75 | 76 | - Do so via a pull request. 77 | - Write your code in keeping with the same style as the rest of this repo's code. 78 | - Add a test under `test/` that provides coverage of your new feature. 79 | 80 | The best way to contribute is to add new tools to cover more devices and usage scenarios. 81 | 82 | To add a new tool: 83 | 84 | 1. (Optional) Create a new `.py` file under `src/lemonade/tools` (or use an existing file if your tool fits into a pre-existing family of tools). 85 | 1. Define a new class that inherits the `Tool` class. 86 | 1. Register the class by adding it to the list of `tools` near the top of `src/lemonade/cli.py`. 87 | 88 | You can learn more about contributing on the repository's [contribution guide](https://github.com/lemonade-sdk/lemonade/blob/main/docs/contribute.md). 89 | 90 | <!--This file was originally licensed under Apache 2.0. It has been modified. 
91 | Modifications Copyright (c) 2025 AMD--> -------------------------------------------------------------------------------- /docs/assets/carousel.js: -------------------------------------------------------------------------------- 1 | // Simple YouTube video carousel for MkDocs Material 2 | 3 | document.addEventListener('DOMContentLoaded', function () { 4 | var carousel = document.getElementById('yt-carousel'); 5 | if (!carousel) return; 6 | // Support both data-ids (comma-separated) and data-videos (JSON array of {id, title}) 7 | var videos = []; 8 | if (carousel.dataset.videos) { 9 | try { 10 | videos = JSON.parse(carousel.dataset.videos); 11 | } catch (e) { 12 | console.error('Invalid JSON in data-videos:', e); 13 | } 14 | } else if (carousel.dataset.ids) { 15 | videos = carousel.dataset.ids.split(',').map(function(id) { 16 | return { id: id.trim(), title: '' }; 17 | }); 18 | } 19 | if (!videos.length) return; 20 | var idx = 0; 21 | 22 | function render() { 23 | var video = videos[idx]; 24 | var titleHtml = video.title ? `<div style=\"margin-bottom:8px;font-weight:bold;font-size:1.1rem;\">${video.title}</div>` : ''; 25 | carousel.innerHTML = ` 26 | <div style="display:flex;flex-direction:column;align-items:center;max-width:100%;"> 27 | ${titleHtml} 28 | <div style="position:relative;width:100%;max-width:560px;aspect-ratio:16/9;"> 29 | <iframe style="width:100%;height:100%;border-radius:12px;box-shadow:0 2px 16px #0003;" src="https://www.youtube.com/embed/${video.id}" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> 30 | </div> 31 | <div style="margin-top:16px;display:flex;align-items:center;gap:16px;"> 32 | <button id="yt-prev" style="padding:2px 8px;font-size:0.6rem;border:none;border-radius:6px;background:#E59800;color:#222;font-weight:bold;cursor:pointer;">Prev</button> 33 | <span style="font-size:1rem;color:#666;">${idx+1} / ${videos.length}</span> 34 | <button id="yt-next" style="padding:2px 8px;font-size:0.6rem;border:none;border-radius:6px;background:#E59800;color:#222;font-weight:bold;cursor:pointer;">Next</button> 35 | </div> 36 | </div> 37 | `; 38 | document.getElementById('yt-prev').onclick = function() { 39 | idx = (idx - 1 + videos.length) % videos.length; 40 | render(); 41 | }; 42 | document.getElementById('yt-next').onclick = function() { 43 | idx = (idx + 1) % videos.length; 44 | render(); 45 | }; 46 | } 47 | render(); 48 | }); 49 | -------------------------------------------------------------------------------- /docs/assets/extra.css: -------------------------------------------------------------------------------- 1 | /* Note: I have not figured out all the color variables yet */ 2 | 3 | [data-md-color-scheme="lightmode"] { 4 | --md-primary-fg-color: #FFFBE9; /* Header, Selected Page Font */ 5 | --md-primary-bg-color: #000000; /* Header Font, Icon Color*/ 6 | --md-primary-bg-color--light: #000000; /* Search bar font color */ 7 | --md-accent-fg-color: #FFD744; /* Hover color of links */ 8 | --md-footer-fg-color: #E59800; /* Nav Footer Font Color */ 9 | --md-footer-fg-color--light: #3b3b3b; /* Footer Font Color */ 10 | --md-footer-fg-color--lighter: #3b3b3b; /* Made With... 
color */ 11 | --md-footer-bg-color: #FFFBE9; /* Nav Footer Background Color */ 12 | --md-footer-bg-color--dark: #FFFBE9; /* Footer Background Color */ 13 | --md-default-bg-color: #FFFBE9; /* Main background color */ 14 | --md-code-bg-color: #ffefb5; /* Code block background color */ 15 | --md-code-fg-color: #000000; /* Code block font color */ 16 | --md-default-fg-color--light: #E59800; /* Blockquote color */ 17 | } 18 | 19 | [data-md-color-scheme="slate"] { 20 | --md-primary-fg-color: #FFD500; /* Header, Selected Page Font */ 21 | --md-primary-bg-color: #000000; /* Header Font, Icon Color*/ 22 | --md-primary-bg-color--light: #000000; /* Search bar font color */ 23 | --md-accent-fg-color: #FFD500; /* Hover color of links */ 24 | --md-accent-fg-color--transparent: #E59800; 25 | --md-footer-fg-color: #E59800; /* Nav Footer Font Color */ 26 | --md-footer-fg-color--light: #929292; /* Footer Font Color */ 27 | --md-footer-fg-color--lighter: #929292; /* Made With... color */ 28 | --md-footer-bg-color: #000000; /* Nav Footer Background Color */ 29 | --md-footer-bg-color--dark: #000000; /* Footer Background Color */ 30 | --md-primary-bg-color--light: #000000; /* Search Font */ 31 | } 32 | 33 | [data-md-color-scheme="slate"] { 34 | --md-hue: 320; /* between 0 and 360 */ 35 | /* --md-saturation: 50; /* between 0 and 100 */ 36 | /* --md-lightness: 100; between 0 and 100 */ 37 | --md-footer-bg-color: #141413; /* Nav Footer Background Color */ 38 | --md-default-bg-color: #141413 !important; /* Dark background */ 39 | --md-primary-fg-color: #E59800 !important; /* Header, Selected Page Font */ 40 | --md-footer-bg-color--dark: #1f1503 !important; /* Footer Background Color */ 41 | } 42 | 43 | .hide-in-mkdocs { display: none; } 44 | 45 | /* docs/assets/extra.css */ 46 | .mkdocs-only { display: block; } 47 | 48 | /* Hide the page title in the navigation sidebar */ 49 | .md-nav__title { 50 | display: none !important; 51 | } 52 | 53 | /* Make page titles (h1) a darker grey in light mode, lighter in dark mode */ 54 | [data-md-color-scheme="lightmode"] h1, 55 | [data-md-color-scheme="lightmode"] .md-typeset h1 { 56 | color: #222 !important; 57 | } 58 | [data-md-color-scheme="slate"] h1, 59 | [data-md-color-scheme="slate"] .md-typeset h1 { 60 | color: #cfcfcf !important; 61 | } 62 | 63 | -------------------------------------------------------------------------------- /docs/assets/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemonade-sdk/lemonade/b86292bf84121d003d40804d68032904901ee1bb/docs/assets/favicon.ico -------------------------------------------------------------------------------- /docs/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemonade-sdk/lemonade/b86292bf84121d003d40804d68032904901ee1bb/docs/assets/logo.png -------------------------------------------------------------------------------- /docs/assets/mkdocs_requirements.txt: -------------------------------------------------------------------------------- 1 | mkdocs 2 | mkdocs-material 3 | mkdocs-monorepo-plugin 4 | pymdown-extensions -------------------------------------------------------------------------------- /docs/code.md: -------------------------------------------------------------------------------- 1 | # Lemonade SDK Code Structure 2 | 3 | # Repo Organization 4 | 5 | The Lemonade SDK source code has a few major top-level directories: 6 | - `docs`: documentation for the 
entire project. 7 | - `examples`: example scripts for use with the Lemonade tools. 8 | - `src/lemonade`: source code for the lemonade-sdk package. 9 | - `src/lemonade/tools`: implements `Tool` and defines the tools built in to `lemonade`. 10 | - `src/lemonade/sequence.py`: implements `Sequence` and defines the plugin API for `Tool`s. 11 | - `src/lemonade/cli`: implements the `lemonade` CLI. 12 | - `src/lemonade/common`: functions common to the other modules. 13 | - `src/lemonade/version.py`: defines the package version number. 14 | - `src/lemonade/state.py`: implements the `State` class. 15 | - `test`: tests for the Lemonade SDK tools. 16 | 17 | ## Tool Classes 18 | 19 | All of the logic for actually building models is contained in `Tool` classes. Generally, a `FirstTool` class obtains a model, and each subsequent `Tool` is a model-to-model transformation. For example: 20 | - the `Discover(FirstTool)` (aka `discover` in the CLI) obtains a PyTorch model instance from a python script. 21 | - the `ExportPytorchModel(Tool)` (aka `export-pytorch` in the CLI) transforms a PyTorch model instance into an ONNX model file. 22 | 23 | ### Composability 24 | 25 | `Tools` are designed to be composable. This composability is facilitated by the `State` class, which is how `Tools` communicate with each other. Every `Tool` takes an instance of `State` as input and then returns an instance of `State`. 26 | 27 | ### Implementation 28 | 29 | See [tools.py](https://github.com/lemonade-sdk/lemonade/blob/main/src/lemonade/tools/tool.py) for a definition of each method of `Tool` that must be implemented to create a new `Tool` subclass. 30 | 31 | <!--This file was originally licensed under Apache 2.0. It has been modified. 32 | Modifications Copyright (c) 2025 AMD--> -------------------------------------------------------------------------------- /docs/dev_cli/humaneval_accuracy.md: -------------------------------------------------------------------------------- 1 | # Using the HumanEval accuracy test tools 2 | 3 | The HumanEval benchmark is a code generation and functional correctness evaluation framework designed to assess language models' ability to generate Python code. It consists of 164 handwritten programming problems, each containing a function signature, docstring, body, and several unit tests. This benchmark focuses on evaluating a model's capability to generate functionally correct code that passes the test cases, making it particularly useful for assessing code generation capabilities. 4 | 5 | This tool provides an automated way to evaluate language models on the HumanEval benchmark. It handles the process of downloading the dataset, generating code completions, executing them in a secure environment, and calculating pass@k metrics. 6 | 7 | ## Dataset 8 | 9 | The HumanEval dataset is automatically downloaded from [OpenAI's human-eval repository](https://github.com/openai/human-eval) when you first run the benchmark. 
The dataset contains programming problems that test various aspects of Python programming, including: 10 | 11 | - Basic programming operations 12 | - String manipulation 13 | - Mathematical computations 14 | - List operations 15 | - Algorithm implementation 16 | - Data structure manipulation 17 | 18 | ## Running the Benchmark 19 | 20 | ```bash 21 | lemonade -i meta-llama/Llama-3.2-1B oga-load --device igpu --dtype int4 accuracy-humaneval --k-samples 1 --first-n-samples 5 --timeout 30.0 22 | ``` 23 | 24 | ### Optional arguments: 25 | 26 | `--k-samples`: Number of completions to generate per prompt (default: 1). This parameter determines the k in pass@k metrics. For example: 27 | - `--k-samples 1`: Calculates pass@1 (single attempt per problem) 28 | - `--k-samples 10`: Calculates pass@10 (ten attempts per problem) 29 | - `--k-samples 100`: Calculates pass@100 (hundred attempts per problem) 30 | 31 | Higher k values provide more robust evaluation but take longer to run. 32 | 33 | `--first-n-samples`: Evaluate only the first N problems from the dataset (default: entire dataset). Useful for quick testing or when you want to evaluate a subset of problems. 34 | 35 | `--timeout`: Maximum time in seconds allowed for each test case execution (default: 30.0). This prevents infinite loops or long-running code from blocking the evaluation. 36 | 37 | `--data-dir`: Custom directory for storing the HumanEval dataset (default: "<lemonade_cache_dir>/data/humaneval"). 38 | 39 | ## How It Works 40 | 41 | 1. **Dataset Preparation:** 42 | - On first run, the tool downloads the HumanEval dataset (HumanEval.jsonl.gz) 43 | - The dataset contains function signatures, docstrings, and test cases 44 | - Each problem is structured to test specific programming capabilities 45 | - You can evaluate only the first N problems using `--first-n-samples` 46 | 47 | 2. **Code Generation:** 48 | - For each programming problem, the model is provided with a prompt containing: 49 | - Function signature (e.g., `def sort_numbers(numbers):`) 50 | - Docstring describing the function's purpose and requirements 51 | - The model generates k code completions for the function body (controlled by `--k-samples`) 52 | - These k samples are used to calculate the pass@k metric 53 | 54 | 3. **Secure Execution:** 55 | - Generated code is executed in a secure sandbox environment maintained by OpenAI's human-eval library. For your awareness, OpenAI's policy is to disable code execution by default, however lemonade enables code execution by default by automatically setting the environment variable `HF_ALLOW_CODE_EVAL=1`. OpenAI provides the following code execution protections: 56 | - **Process Isolation**: Each code sample runs in a separate process to prevent interference 57 | - **Resource Limits**: 58 | - CPU time limit (controlled by `--timeout`) 59 | - Memory usage restrictions 60 | - Maximum output size restrictions 61 | - **Restricted Access**: 62 | - No network access 63 | - No file system access outside test directory 64 | - No subprocess creation 65 | - No system calls 66 | - **Module Restrictions**: 67 | - Only allows importing standard Python libraries needed for testing 68 | - Blocks potentially dangerous modules (os, sys, subprocess, etc.) 69 | These security measures are implemented through: 70 | - Python's built-in `resource` module for resource limits 71 | - AST (Abstract Syntax Tree) analysis for code validation 72 | - Process-level isolation using `multiprocessing` 73 | - Custom import hooks to restrict module access 74 | 75 | 4. 
**Evaluation Metrics:** 76 | - **pass@k**: Percentage of problems solved with k attempts 77 | - pass@1: Success rate with single attempt 78 | - pass@10: Success rate within 10 attempts 79 | - pass@100: Success rate within 100 attempts 80 | - A problem is considered solved if all test cases pass 81 | - Results are normalized to percentages 82 | 83 | 5. **Output Files:** 84 | The tool generates several output files in the results directory: 85 | - `evaluation_results.csv`: Contains prompts, completions, and expected answers 86 | - `humaneval_predictions.jsonl`: Raw model predictions in JSONL format 87 | - `humaneval_predictions.jsonl_results.jsonl`: Detailed evaluation results 88 | 89 | ## Example Results Format 90 | 91 | The evaluation produces metrics in the following format: 92 | ```json 93 | { 94 | "pass@1": 0.25, // 25% success rate with 1 attempt 95 | "pass@10": 0.45, // 45% success rate within 10 attempts 96 | "pass@100": 0.65 // 65% success rate within 100 attempts 97 | } 98 | ``` 99 | 100 | ## Limitations 101 | 102 | 1. **Resource Requirements**: Generating multiple samples per problem (high k values) can be computationally intensive and time-consuming. 103 | 2. **Memory Usage**: Large language models may require significant memory, especially when generating multiple samples. 104 | 105 | ## References 106 | 107 | 1. [Evaluating Large Language Models Trained on Code](https://arxiv.org/abs/2107.03374) 108 | 2. [OpenAI HumanEval Repository](https://github.com/openai/human-eval) 109 | 110 | <!--This file was originally licensed under Apache 2.0. It has been modified. 111 | Modifications Copyright (c) 2025 AMD--> -------------------------------------------------------------------------------- /docs/dev_cli/llamacpp.md: -------------------------------------------------------------------------------- 1 | # LLAMA.CPP 2 | 3 | Run transformer models using llama.cpp. This integration allows you to: 4 | 1. Load and run llama.cpp models 5 | 2. Benchmark model performance 6 | 3. Use the models with other tools like chat or MMLU accuracy testing 7 | 8 | ## Prerequisites 9 | 10 | You need: 11 | 1. A compiled llama.cpp executable (llama-cli or llama-cli.exe) 12 | 2. A GGUF model file 13 | 14 | ### Building llama.cpp (if needed) 15 | 16 | #### Linux 17 | ```bash 18 | git clone https://github.com/ggerganov/llama.cpp 19 | cd llama.cpp 20 | make 21 | ``` 22 | 23 | #### Windows 24 | ```bash 25 | git clone https://github.com/ggerganov/llama.cpp 26 | cd llama.cpp 27 | cmake -B build 28 | cmake --build build --config Release 29 | ``` 30 | 31 | The executable will be in `build/bin/Release/llama-cli.exe` on Windows or `llama-cli` in the root directory on Linux. 
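To sanity-check the build before using it with Lemonade, you can invoke the binary directly. The paths below are illustrative, and if your llama.cpp build predates the `--version` flag, `--help` works as well.

```bash
# Linux: print build/version info to confirm the executable runs
./llama-cli --version

# Windows
.\build\bin\Release\llama-cli.exe --version
```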
32 | 33 | ## Usage 34 | 35 | ### Loading a Model 36 | 37 | Use the `load-llama-cpp` tool to load a model: 38 | 39 | ```bash 40 | lemonade -i MODEL_NAME load-llama-cpp \ 41 | --executable PATH_TO_EXECUTABLE \ 42 | --model-binary PATH_TO_GGUF_FILE 43 | ``` 44 | 45 | Parameters: 46 | | Parameter | Required | Default | Description | 47 | |--------------|----------|---------|-------------------------------------------------------| 48 | | executable | Yes | - | Path to llama-cli/llama-cli.exe | 49 | | model-binary | Yes | - | Path to .gguf model file | 50 | | threads | No | 1 | Number of threads for generation | 51 | | context-size | No | 512 | Context window size | 52 | | output-tokens| No | 512 | Maximum number of tokens to generate | 53 | 54 | ### Benchmarking 55 | 56 | After loading a model, you can benchmark it using `llama-cpp-bench`: 57 | 58 | ```bash 59 | lemonade -i MODEL_NAME \ 60 | load-llama-cpp \ 61 | --executable PATH_TO_EXECUTABLE \ 62 | --model-binary PATH_TO_GGUF_FILE \ 63 | llama-cpp-bench 64 | ``` 65 | 66 | Benchmark parameters: 67 | | Parameter | Default | Description | 68 | |------------------|----------------------------|-------------------------------------------| 69 | | prompt | "Hello, I am conscious and"| Input prompt for benchmarking | 70 | | context-size | 512 | Context window size | 71 | | output-tokens | 512 | Number of tokens to generate | 72 | | iterations | 1 | Number of benchmark iterations | 73 | | warmup-iterations| 0 | Number of warmup iterations (not counted) | 74 | 75 | The benchmark will measure and report: 76 | - Time to first token (prompt evaluation time) 77 | - Token generation speed (tokens per second) 78 | 79 | ### Example Commands 80 | 81 | #### Windows Example 82 | ```bash 83 | # Load and benchmark a model 84 | lemonade -i Qwen/Qwen2.5-0.5B-Instruct-GGUF \ 85 | load-llama-cpp \ 86 | --executable "C:\work\llama.cpp\build\bin\Release\llama-cli.exe" \ 87 | --model-binary "C:\work\llama.cpp\models\qwen2.5-0.5b-instruct-fp16.gguf" \ 88 | llama-cpp-bench \ 89 | --iterations 3 \ 90 | --warmup-iterations 1 91 | 92 | # Run MMLU accuracy test 93 | lemonade -i Qwen/Qwen2.5-0.5B-Instruct-GGUF \ 94 | load-llama-cpp \ 95 | --executable "C:\work\llama.cpp\build\bin\Release\llama-cli.exe" \ 96 | --model-binary "C:\work\llama.cpp\models\qwen2.5-0.5b-instruct-fp16.gguf" \ 97 | accuracy-mmlu \ 98 | --tests management \ 99 | --max-evals 2 100 | ``` 101 | 102 | #### Linux Example 103 | ```bash 104 | # Load and benchmark a model 105 | lemonade -i Qwen/Qwen2.5-0.5B-Instruct-GGUF \ 106 | load-llama-cpp \ 107 | --executable "./llama-cli" \ 108 | --model-binary "./models/qwen2.5-0.5b-instruct-fp16.gguf" \ 109 | llama-cpp-bench \ 110 | --iterations 3 \ 111 | --warmup-iterations 1 112 | ``` 113 | 114 | ## Integration with Other Tools 115 | 116 | After loading with `load-llama-cpp`, the model can be used with any tool that supports the ModelAdapter interface, including: 117 | - accuracy-mmlu 118 | - llm-prompt 119 | - accuracy-humaneval 120 | - and more 121 | 122 | The integration provides: 123 | - Platform-independent path handling (works on both Windows and Linux) 124 | - Proper error handling with detailed messages 125 | - Performance metrics collection 126 | - Configurable generation parameters (temperature, top_p, top_k) 127 | - 10-minute timeout for model generation to prevent indefinite hangs 128 | 129 | <!--This file was originally licensed under Apache 2.0. It has been modified. 
130 | Modifications Copyright (c) 2025 AMD--> -------------------------------------------------------------------------------- /docs/dev_cli/lm-eval.md: -------------------------------------------------------------------------------- 1 | # Evaluating Models with lm-eval-harness 2 | 3 | The `lm-eval-harness` tool in Lemonade provides an easy way to evaluate language models on a variety of standardized benchmarks using the [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) framework from EleutherAI. This tool allows you to generate standardized accuracy metrics across a wide range of tasks and datasets. 4 | 5 | ## How It Works 6 | 7 | Lemonade makes model evaluation simple by handling the entire workflow for you: 8 | 9 | 1. **Load Your Model**: First, you load your model using either Hugging Face (`huggingface-load`) or ONNX Runtime GenAI (`oga-load`) with your preferred settings like device and dtype. 10 | 11 | 2. **Start Evaluation Server**: Lemonade automatically starts a local server with your loaded model, making it accessible to the evaluation framework. 12 | 13 | 3. **Run lm-evaluation-harness**: Lemonade then runs the lm-evaluation-harness against the server, executing the specific tasks and benchmarks requested. 14 | 15 | 4. **Organize Results**: Finally, Lemonade processes the raw evaluation data and organizes it into clear, readable reports with key metrics like accuracy percentages, saving everything to the model's build directory for easy access. 16 | 17 | ## Usage 18 | 19 | The basic syntax follows this pattern: 20 | 21 | ```bash 22 | lemonade -i <checkpoint> <loading_method> [loading_options] lm-eval-harness --task <task_name> [options] 23 | ``` 24 | 25 | ### Common Options 26 | 27 | - `--task`: Specifies which task to evaluate on (e.g., gsm8k, mmlu, mmlu_*). 28 | - `--limit`: Optional number of examples to evaluate (useful for quick tests). 29 | - `--num-fewshot`: Number of examples to use in few-shot prompts (default: 0). 30 | - `--log_samples`: Log individual samples and predictions. 31 | 32 | ### Examples 33 | 34 | #### ONNX Runtime GenAI: 35 | 36 | ```bash 37 | lemonade -i meta-llama/Llama-3.2-1B-Instruct oga-load --device cpu --dtype int4 lm-eval-harness --task mmlu_abstract_algebra --limit 10 38 | ``` 39 | 40 | This example: 41 | - Loads the Llama 3.2 1B model with OGA. 42 | - Quantizes to INT4 precision. 43 | - Evaluates on the abstract algebra subset of MMLU. 44 | - Limits evaluation to 10 questions. 45 | 46 | #### Hugging Face: 47 | 48 | ```bash 49 | lemonade -i meta-llama/Llama-3.2-1B-Instruct huggingface-load --device cpu lm-eval-harness --task mmlu_abstract_algebra 50 | ``` 51 | 52 | This example: 53 | - Loads the Llama 3.2 1B model using Hugging Face. 54 | - Evaluates on the abstract algebra subset of MMLU. 55 | - Uses the full test set. 56 | 57 | ## Supported Tasks 58 | 59 | The tool supports all tasks available in lm-evaluation-harness, including: 60 | 61 | - **MMLU**: Massive Multitask Language Understanding (use `mmlu` for all subjects or `mmlu_<subject>` for specific subjects). 62 | - **GSM8K**: Grade School Math word problems. 63 | - **HumanEval**: Code generation and completion. 64 | - **TruthfulQA**: Testing model truthfulness. 65 | - **MATH**: Complex mathematical problem solving. 66 | - And many more (see the [full list in the lm-evaluation-harness repository](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/README.md)). 
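For instance, a quick few-shot math evaluation can combine the options documented above (the checkpoint, task, and limit here are illustrative):

```bash
lemonade -i meta-llama/Llama-3.2-1B-Instruct oga-load --device cpu --dtype int4 lm-eval-harness --task gsm8k --num-fewshot 5 --limit 20
```

This loads the model with OGA on CPU, evaluates the first 20 GSM8K problems, and uses 5-shot prompting.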
67 | 68 | ## Understanding Results 69 | 70 | Results are displayed in the terminal and saved to the model's build directory. 71 | 72 | ### Metrics 73 | 74 | The key metrics vary by task, but commonly include: 75 | 76 | - **exact_match**: Percentage of exact matches between model predictions and expected answers. 77 | - **acc** or **accuracy**: Accuracy score (varies by task). 78 | - **f1**: F1 score for tasks that require partial matching. 79 | 80 | For multiple-choice tasks like MMLU, scores represent the percentage of correct answers. For generative tasks like GSM8K, results often include metrics for both strict and flexible matching: 81 | 82 | - **exact_match,strict-match**: Requires the model to produce the exact correct answer. 83 | - **exact_match,flexible-extract**: Allows for variations in formatting but requires the correct numerical answer. 84 | 85 | ### Result Files 86 | 87 | Detailed result files are saved in: 88 | ``` 89 | <cache_dir>/builds/<model_name>_<timestamp>/lm_eval_results/<task_name>_results/ 90 | ``` 91 | 92 | These include the full evaluation data in JSON format. 93 | 94 | ## Interpreting Results 95 | 96 | When evaluating models, consider: 97 | 98 | 1. **Task Relevance**: Different tasks measure different capabilities. Choose tasks relevant to your use case. 99 | 100 | 2. **Comparison Context**: Compare results against other models of similar size/architecture for meaningful insights. 101 | 102 | 3. **Few-shot Performance**: Many models perform significantly better with examples (try `--num-fewshot 5`). 103 | 104 | 4. **Limitations**: Low scores on specific tasks may highlight limitations in the model's training data or capabilities. 105 | 106 | Summary `llm-eval-harness` tool results are also included in the tables generated by 107 | the report tool (`lemonade report --perf`). 108 | 109 | ## Further Information 110 | 111 | For more details on lm-evaluation-harness and its capabilities, see the [official documentation](https://github.com/EleutherAI/lm-evaluation-harness). -------------------------------------------------------------------------------- /docs/dev_cli/ort_genai_igpu.md: -------------------------------------------------------------------------------- 1 | # OnnxRuntime GenAI (OGA) for iGPU and CPU 2 | 3 | [onnxruntime-genai (aka OGA)](https://github.com/microsoft/onnxruntime-genai/tree/main?tab=readme-ov-file) is a new framework created by Microsoft for running ONNX LLMs. 4 | 5 | ## Installation 6 | 7 | See [Lemonade Installation](./README.md#installation) for the OGA iGPU backend. 8 | 9 | ## Get models 10 | 11 | - The oga-load tool can download models from Hugging Face and build ONNX files using OGA's `model_builder`, which can quantize and optimize models for both iGPU and CPU. 
12 | - Download and build ONNX model files: 13 | - `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4` 14 | - `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device cpu --dtype int4` 15 | - The ONNX model files will be stored in the respective subfolder of the lemonade cache folder and will be reused in future oga-load calls: 16 | - `oga_models\microsoft_phi-3-mini-4k-instruct\dml-int4` 17 | - `oga_models\microsoft_phi-3-mini-4k-instruct\cpu-int4` 18 | - The ONNX model build process can be forced to run again, overwriting the above cache, by using the `--force` flag: 19 | - `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 --force` 20 | - Transformer model architectures supported by the model_builder tool include many popular state-of-the-art models, such as: 21 | - Gemma 22 | - LLaMa 23 | - Mistral 24 | - Phi 25 | - Qwen 26 | - Nemotron 27 | - For the full list of supported models, please see the [model_builder documentation](https://github.com/microsoft/onnxruntime-genai/blob/main/src/python/py/models/README.md). 28 | - The following quantizations are supported for automatically building ONNXRuntime GenAI model files from the Hugging Face repository: 29 | - `cpu`: `fp32`, `int4` 30 | - `igpu`: `fp16`, `int4` 31 | 32 | ## Directory structure: 33 | - The model_builder tool caches Hugging Face files and temporary ONNX external data files in `<LEMONADE CACHE>\model_builder` 34 | - The output from model_builder is stored in `<LEMONADE_CACHE>\oga_models\<MODELNAME>\<SUBFOLDER>` 35 | - `MODELNAME` is the Hugging Face checkpoint name where any '/' is mapped to an '_' and everything is lower case. 36 | - `SUBFOLDER` is `<EP>-<DTYPE>`, where `EP` is the execution provider (`dml` for `igpu`, `cpu` for `cpu`, and `npu` for `npu`) and `DTYPE` is the datatype. 37 | - If the `--int4-block-size` flag is used then `SUBFOLDER` is` <EP>-<DTYPE>-block-<SIZE>` where `SIZE` is the specified block size. 38 | - Other ONNX models in the format required by onnxruntime-genai can be loaded by Lemonade if placed in the `<LEMONADE_CACHE>\oga_models` folder. 39 | - Use the `-i` and `--subfolder` flags to specify the folder and subfolder, for example: 40 | - `lemonade -i my_model_name --subfolder my_subfolder --device igpu --dtype int4 oga-load` 41 | - Lemonade will expect the ONNX model files to be located in `<LEMONADE_CACHE>\oga_models\my_model_name\my_subfolder` 42 | 43 | <!--This file was originally licensed under Apache 2.0. It has been modified. 44 | Modifications Copyright (c) 2025 AMD--> -------------------------------------------------------------------------------- /docs/dev_cli/perplexity.md: -------------------------------------------------------------------------------- 1 | 2 | # Perplexity Evaluation 3 | 4 | 5 | ## Overview 6 | 7 | Perplexity is a measurement of how well a probability model predicts a sample. A lower perplexity indicates the model is more confident in its predictions. In the context of language models, perplexity measures the likelihood of the sequence according to the model, given as: 8 | 9 | `Perplexity (P) = exp(Average Negative Log-Likelihood)` 10 | 11 | `Where Average Negative Log-Likelihood = (1/N) * Sum[-log p(x_i) from i=1 to N]` 12 | 13 | 14 | ## Script Functionality 15 | 16 | ### Key Components 17 | 18 | - **`max_length`**: The maximum input length the model can handle at once (set by the model's configuration). 
19 | - **`stride`**: The step size for the window, set to half of `max_length` to ensure some overlap and preserve context. 20 | - **`seq_len`**: The total length of the tokenized input. 21 | 22 | ### Detailed Steps 23 | 24 | 1. **Load Model and Tokenizer**: Receive the model and tokenizer with specified configurations. 25 | 2. **Load and Prepare Data**: Loads the "wikitext-2-raw-v1" dataset and concatenates texts with double newlines. The data is then tokenized. 26 | 3. **Sliding Window Perplexity Calculation**: The script uses a sliding window approach (with a stride of half the window size) to calculate the perplexity for subsets of the data, adjusting for the maximum input length of the model: 27 | - For each window, input data is processed, and the corresponding labels are adjusted to mask out irrelevant parts (using `-100`). 28 | - The model computes the logits and loss for each window. 29 | - Predicted and actual words at the end of each window are logged for analysis. 30 | 4. **Logging to CSV**: Summarizes the context window, predicted and actual next words, and loss for each window into a CSV file for further analysis. 31 | 5. **Perplexity Calculation**: Calculates the total negative log-likelihood adjusted by the effective token count for each window, then computes the average across all tokens to determine the perplexity. 32 | 33 | ### Example Outputs 34 | 35 | The script outputs a CSV file named `summary_results.csv` with the following columns: 36 | 37 | - **Context (Partial context displayed for Brevity)** 38 | - **Predicted next word** 39 | - **Actual next word** 40 | - **Loss for this window** 41 | 42 | These entries help in understanding how the model is performing at each step of the text. 43 | 44 | ## How to Interpret Perplexity Results 45 | 46 | Understanding Perplexity 47 | Definition: Perplexity is defined as the exponential of the average negative log-likelihood of a model on a given test set. 48 | 49 | Lower Values are Better: A lower perplexity score indicates that the model has a higher probability of correctly predicting the sample, suggesting better performance. A lower perplexity means the model is more certain about its predictions. 50 | 51 | ### Interpretation: 52 | 53 | **High Perplexity:** Indicates confusion or a high level of uncertainty in the model’s predictions. A high perplexity can suggest that the model's language understanding is poor or that the model is not well-tuned for the given data. 54 | 55 | **Low Perplexity:** Suggests that the model predictions are more accurate and that it assigns higher probabilities to the actual observed outcomes. This is indicative of a model that has a good grasp of the language patterns seen in the test set. 56 | Practical Implications 57 | 58 | **Model Comparison:** Perplexity is particularly useful for comparing different versions of the same model (e.g., before and after quantization, fine-tuning or training on additional data). The model with the lower perplexity is generally considered better at modeling the language of the test corpus. 59 | 60 | **Model Selection for Applications:** For applications involving language generation (like machine translation, text summarization, or chatbots), selecting a model with lower perplexity might result in more fluent, coherent, and contextually appropriate text output. 61 | 62 | **Diagnosing Model Fit:** High perplexity could indicate underfitting, where the model is too simple to capture the complexity of the language data. 
It can also help in diagnosing whether the model is well-suited for the specific domain of the text being modeled. 63 | 64 | 65 | ### Caveats in Interpretation 66 | 67 | **Dependency on Test Set:** Perplexity is highly dependent on the test set used. A model can show very different perplexity scores on different datasets. Therefore, it's important to consider the nature and domain of the test set when evaluating perplexity. 68 | 69 | **Not a Complete Measure:** While perplexity provides a measure of how uncertain a model is about its predictions, it does not directly measure how coherent or contextually appropriate generated texts are. Other qualitative assessments and metrics might be necessary to fully evaluate a language model's output. 70 | 71 | **Comparison Across Different Data:** Comparing perplexity scores across models trained or tested on different datasets can be misleading because the intrinsic difficulty of the datasets can affect the perplexity. 72 | 73 | <!--This file was originally licensed under Apache 2.0. It has been modified. 74 | Modifications Copyright (c) 2025 AMD--> -------------------------------------------------------------------------------- /docs/dev_cli/quark.md: -------------------------------------------------------------------------------- 1 | # Quick Guide to Quark Quantization Tools 2 | 3 | ## Introduction 4 | Quark is AMD's recommended quantization framework for targeting Ryzen AI platforms, supporting both PyTorch and ONNX formats. For Quark-specific information, please visit [quark-doc](https://quark.docs.amd.com/latest/). Here's a guide on using Quark tools to quantize a model and then reload the quantized model using lemonade: 5 | 6 | ## Installation 7 | 8 | 1. Create and activate a conda environment: 9 | - `conda create -n quark python=3.10` 10 | - `conda activate quark` 11 | 2. Install requirements to set up this environment. 12 | Depending on your use case, you can install for CPU, NPU, or hybrid. 13 | ```bash 14 | pip install -e .[dev,oga-cpu] # Can also work with llm-oga-npu or llm-oga-hybrid 15 | ``` 16 | 3. Install `quark` using `lemonade-install` for an easy install 17 | ```bash 18 | # Install the latest external version of quark 19 | lemonade-install --quark 0.6.0 20 | ``` 21 | This downloads the .whl files and zip folder from the Quark page, installs them, and sets up the environment for Quark. 22 | 23 | ## Usage 24 | ```bash 25 | lemonade -i <model-ckpt> huggingface-load quark-quantize 26 | --model-export <export_format> # Export formats [quark_safetensors, onnx, gguf] 27 | --quant-algo <quantization_algorithm> # Supported algorithms [gptq, awq, autosmoothquant] 28 | --quant-scheme <quantization_scheme> # Quant schemes [w_int4, w_uint4, w_int8...] 29 | --device <device> # Target device [cpu, cuda] 30 | llm-prompt -p "<prompt>" 31 | ``` 32 | ## Example Workflows 33 | ### Quantize and Export 34 | 35 | This command quantizes an opt-125m model loaded from Hugging Face, using the AWQ quantization algorithm to generate an A8W8 quantized model. Running quantization on CPU can be time-consuming: this test can take up to 1 hour using 36 | 100% of your CPU. 37 | 38 | ```bash 39 | lemonade -i facebook/opt-125m huggingface-load quark-quantize --quant-algo awq --quant-scheme w_int8_a_int8_per_tensor_sym --model-export quark_safetensors --device cpu 40 | ``` 41 | 42 | #### Load Quantized Model: 43 | This command loads the exported model from a cache folder that corresponds to the quantization recipe used during its export.
44 | ```bash
45 | lemonade -i facebook/opt-125m huggingface-load quark-load --safetensors-model-reload --quant-algo awq --quant-scheme w_int8_a_int8_per_tensor_sym --device cpu llm-prompt -p "Hello world"
46 | ```
47 | 
48 | ### Supported Quantization Schemes
49 | 
50 | The following are the different quantization schemes supported for various models.
51 | For a comprehensive list of datatype support for specific models, refer to the [support matrix](https://quark.docs.amd.com/latest/pytorch/example_quark_torch_llm_ptq.html#id11).
52 | 
53 | - w_uint4_per_group_asym
54 | - w_int4_per_channel_sym
55 | - w_int8_a_int8_per_tensor_sym
56 | - w_int8_per_tensor_sym, and more
57 | 
58 | For more information on the supported quantization schemes, see [Language Model Post Training Quantization (PTQ) Using Quark](https://quark.docs.amd.com/latest/pytorch/example_quark_torch_llm_ptq.html).
59 | 
60 | ### Supported Export Formats
61 | 
62 | Lemonade supports exporting Quark-quantized models in various formats. The following export formats are available:
63 | 
64 | - quark_safetensors
65 | - onnx
66 | - vllm_adopted_safetensors
67 | - gguf
68 | 
69 | ## Known Issues
70 | 
71 | - There is currently no PyPI installer for Quark. Use `lemonade-install` as described in the [Installation](#installation) section of this guide.
72 | - Only a limited set of Quark APIs is currently exposed, so users need to rely on the zip folder released by Quark.
73 | - The latest Quark version is hardcoded in `quark_quantize` for download checks.
74 | - Logging output from Quark cannot be fully suppressed. With `log_severity_level` you can suppress the quantization logs, but info and warning messages emitted when reloading the model cannot be suppressed.
75 | 
76 | <!--This file was originally licensed under Apache 2.0. It has been modified.
80 | Modifications Copyright (c) 2025 AMD--> -------------------------------------------------------------------------------- /docs/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemonade-sdk/lemonade/b86292bf84121d003d40804d68032904901ee1bb/docs/favicon.ico -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | <!DOCTYPE html> 2 | <html lang="en"> 3 | <head> 4 | <meta charset="UTF-8"> 5 | <meta name="viewport" content="width=device-width, initial-scale=1.0"> <title>Lemonade Server</title> 6 | <link rel="icon" href="./favicon.ico"> 7 | <link rel="stylesheet" href="assets/website-styles.css"> 8 | </head> 9 | <body> <nav class="navbar" id="navbar"> 10 | <div class="navbar-context" id="navbar-context"> 11 | <a class="navbar-back" onclick="showMain(); return false;">← Back</a> 12 | <span class="text-muted">Installation Setup</span> 13 | </div> 14 | <div class="navbar-links"> 15 | <a href="https://github.com/lemonade-sdk/lemonade">GitHub</a> 16 | <a href="docs/">Docs</a> 17 | <a href="docs/server/server_models/">Models</a> 18 | <a href="docs/server/apps/">Featured Apps</a> 19 | </div> 20 | </nav> 21 | 22 | <main class="main"> 23 | <div class="title">🍋 Lemonade Server</div> 24 | <div class="subtitle"> 25 | Refreshingly fast local LLMs on GPUs and NPUs.<br> 26 | Integrate with <a href="https://www.youtube.com/watch?v=PXNTDZREJ_A">Open WebUI</a>, <a href="https://youtu.be/JecpotOZ6qo?si=WxWVQhUBCJQgE6vX">AI Toolkit</a>, or <a href="docs/server/server_integration/">your own app</a> in minutes. 27 | </div> 28 | <div class="button-row"> 29 | <a class="download-btn" href="https://github.com/lemonade-sdk/lemonade/releases/latest/download/Lemonade_Server_Installer.exe"> 30 | Download<br> 31 | <span class="download-sub">for Windows 11</span> 32 | </a> <a class="dev-btn" href="#" onclick="showInstaller(); return false;"> 33 | Developer Setup<br> 34 | <span class="dev-sub">For Ubuntu and Windows</span> 35 | </a> </div> 36 | </main> 37 | <!-- Install Selector View --> 38 | <div id="lmn-install-view" class="lmn-install-view"> 39 | <span class="lmn-install-title">🍋 Installation Setup</span> 40 | <div id="lmn-installer" class="lmn-installer-container"> 41 | <table class="lmn-installer-table"> 42 | <tr> 43 | <td class="lmn-label">Operating System</td> 44 | <td id="os-win" class="lmn-active" onclick="lmnSet('os','win')">Windows</td> 45 | <td id="os-linux" onclick="lmnSet('os','linux')">Linux</td> 46 | </tr> 47 | <tr> 48 | <td class="lmn-label">Installation Type</td> 49 | <td id="type-server" class="lmn-active" onclick="lmnSet('type','server')">Server Only</td> 50 | <td id="type-full" onclick="lmnSet('type','full')">Full SDK</td> 51 | </tr> 52 | <tr> 53 | <td class="lmn-label">Installation Method</td> 54 | <td id="method-gui" class="lmn-active" onclick="lmnSet('method','gui')">GUI .exe</td> 55 | <td id="method-pypi" onclick="lmnSet('method','pypi')">PyPI</td> 56 | <td id="method-src" onclick="lmnSet('method','src')">From Source</td> 57 | </tr> 58 | <tr> 59 | <td class="lmn-label">Inference Engine</td> 60 | <td id="fw-oga" class="lmn-active" onclick="lmnSet('fw','oga')">OGA</td> 61 | <td id="fw-llama" onclick="lmnSet('fw','llama')">llama.cpp</td> 62 | <td id="fw-torch" onclick="lmnSet('fw','torch')">PyTorch</td> 63 | </tr> 64 | <tr> 65 | <td class="lmn-label">Device Support</td> 66 | <td id="dev-hybrid" 
class="lmn-active" onclick="lmnSet('dev','hybrid')">Hybrid</td> 67 | <td id="dev-cpu" onclick="lmnSet('dev','cpu')">CPU</td> 68 | <td id="dev-gpu" onclick="lmnSet('dev','gpu')">GPU</td> 69 | </tr> 70 | </table> 71 | <div class="lmn-content-section"> 72 | <div class="lmn-section-header"> 73 | Installation Instructions 74 | </div> 75 | <div id="lmn-badges" class="lmn-badges"></div> 76 | <div id="lmn-install-content"> 77 | <div id="lmn-download-area" class="lmn-download-section" style="display: none;"> 78 | <a id="lmn-link" href="https://github.com/lemonade-sdk/lemonade/releases/latest/download/lemonade_server_installer.exe">Download Lemonade Server Installer (.exe)</a> 79 | </div> 80 | <div id="lmn-command" class="lmn-command"></div> 81 | </div> 82 | </div> 83 | <div id="lmn-explore-section" class="lmn-content-section" style="margin-top: 1.5em;"> 84 | <div class="lmn-section-header lmn-explore-header"> 85 | Quick Start 86 | </div> 87 | <div id="lmn-explore-command" class="lmn-command"></div> 88 | </div> 89 | </div> 90 | </div> 91 | 92 | <footer class="site-footer"> 93 | <div class="dad-joke">When life gives you LLMs, make an LLM aide.</div> <div class="copyright">Copyright 2025 AMD</div> 94 | </footer> 95 | 96 | <script src="assets/install-selector.js"></script> <script> 97 | function showInstaller() { 98 | document.querySelector('.main').classList.add('hidden'); 99 | document.getElementById('lmn-install-view').style.display = 'flex'; 100 | document.getElementById('navbar-context').classList.add('show'); 101 | // Initialize the installer if not already done 102 | if (typeof lmnInit === 'function') { 103 | lmnInit(); 104 | } 105 | } 106 | 107 | function showMain() { 108 | document.querySelector('.main').classList.remove('hidden'); 109 | document.getElementById('lmn-install-view').style.display = 'none'; 110 | document.getElementById('navbar-context').classList.remove('show'); 111 | } 112 | </script> 113 | </body> 114 | </html> 115 | -------------------------------------------------------------------------------- /docs/install_options.html: -------------------------------------------------------------------------------- 1 | <!-- Lemonade SDK Install Selector --> 2 | <!DOCTYPE html> 3 | <html lang="en"> 4 | <head> <meta charset="UTF-8"> 5 | <title>Lemonade Install Selector</title> 6 | <link rel="icon" href="./favicon.ico"> 7 | <link rel="stylesheet" href="assets/website-styles.css"> 8 | </head> 9 | <body class="install-options"> <!-- Breadcrumb Navigation --> 10 | <div class="breadcrumb"> 11 | <a class="breadcrumb-back" onclick="window.history.back(); return false;">← Documentation</a> 12 | <span class="breadcrumb-separator">/</span> 13 | <span>Installation Setup</span> 14 | </div> 15 | 16 | <div class="lmn-center"> 17 | <span class="lmn-install-title">🍋 Installation Setup</span> 18 | <div id="lmn-installer" class="lmn-installer-container"> 19 | <table class="lmn-installer-table"> 20 | <tr> 21 | <td class="lmn-label">Operating System</td> 22 | <td id="os-win" class="lmn-active" onclick="lmnSet('os','win')">Windows</td> 23 | <td id="os-linux" onclick="lmnSet('os','linux')">Linux</td> 24 | </tr> 25 | <tr> 26 | <td class="lmn-label">Installation Type</td> 27 | <td id="type-server" class="lmn-active" onclick="lmnSet('type','server')">Server Only</td> 28 | <td id="type-full" onclick="lmnSet('type','full')">Full SDK</td> 29 | </tr> 30 | <tr> 31 | <td class="lmn-label">Installation Method</td> 32 | <td id="method-gui" class="lmn-active" onclick="lmnSet('method','gui')">GUI .exe</td> 33 | <td 
id="method-pypi" onclick="lmnSet('method','pypi')">PyPI</td> 34 | <td id="method-src" onclick="lmnSet('method','src')">From Source</td> 35 | </tr> 36 | <tr> 37 | <td class="lmn-label">Inference Engine</td> 38 | <td id="fw-oga" class="lmn-active" onclick="lmnSet('fw','oga')">OGA</td> 39 | <td id="fw-llama" onclick="lmnSet('fw','llama')">llama.cpp</td> 40 | <td id="fw-torch" onclick="lmnSet('fw','torch')">PyTorch</td> 41 | </tr> 42 | <tr> 43 | <td class="lmn-label">Device Support</td> 44 | <td id="dev-hybrid" class="lmn-active" onclick="lmnSet('dev','hybrid')">Hybrid</td> 45 | <td id="dev-cpu" onclick="lmnSet('dev','cpu')">CPU</td> 46 | <td id="dev-gpu" onclick="lmnSet('dev','gpu')">GPU</td> 47 | </tr> 48 | </table> 49 | <div class="lmn-content-section"> 50 | <div class="lmn-section-header"> 51 | Installation Instructions 52 | </div> 53 | <div id="lmn-badges" class="lmn-badges"></div> 54 | <div id="lmn-install-content"> 55 | <div id="lmn-download-area" class="lmn-download-section" style="display: none;"> 56 | <a id="lmn-link" href="https://github.com/lemonade-sdk/lemonade/releases/latest/download/lemonade_server_installer.exe">Download Lemonade Server Installer (.exe)</a> 57 | </div> 58 | <div id="lmn-command" class="lmn-command"></div> 59 | </div> 60 | </div> 61 | <div id="lmn-explore-section" class="lmn-content-section" style="margin-top: 1.5em;"> 62 | <div class="lmn-section-header lmn-explore-header"> 63 | Quick Start 64 | </div> 65 | <div id="lmn-explore-command" class="lmn-command"></div> 66 | </div> 67 | </div> 68 | </div> 69 | <script src="assets/install-selector.js"></script> 70 | <script> 71 | // Initialize the installer when the page loads 72 | document.addEventListener('DOMContentLoaded', function() { 73 | if (typeof lmnInit === 'function') { 74 | lmnInit(); 75 | } 76 | }); 77 | </script> 78 | </body> 79 | </html> 80 | -------------------------------------------------------------------------------- /docs/publish_website_docs.py: -------------------------------------------------------------------------------- 1 | # In conda environment of choice, run the following from genai/ folder: 2 | # pip install -r docs/assets/mkdocs_requirements.txt 3 | 4 | # Then run this script to publish the documentation to docs/docs/ 5 | # python docs/publish_website_docs.py 6 | 7 | # Standard library imports for file, directory, regex, system, and subprocess operations 8 | import os 9 | import shutil 10 | import re 11 | import sys 12 | import subprocess 13 | 14 | 15 | def main(): 16 | 17 | # Print the current working directory for debugging 18 | print("[INFO] Current working directory:", os.getcwd()) 19 | 20 | # Define source and destination file paths 21 | src = "docs/server/README.md" 22 | dst = "docs/index.md" 23 | 24 | # Check if the source README exists; exit with error if not 25 | if not os.path.exists(src): 26 | print("[ERROR] docs/server/README.md not found!") 27 | sys.exit(1) 28 | 29 | # Read the source README, making necessary replacements 30 | with open(src, "r", encoding="utf-8") as f: 31 | readme_content = f.read() 32 | 33 | # Write the content to the destination index.md 34 | with open(dst, "w", encoding="utf-8") as f: 35 | f.write(readme_content) 36 | print("[INFO] Copied docs/server/README.md to docs/index.md.") 37 | 38 | # Read the just-written index.md and perform additional link fixes for website publishing 39 | print("[INFO] Fixing links in docs/index.md...") 40 | with open(dst, "r", encoding="utf-8") as f: 41 | content = f.read() 42 | 43 | # List of (pattern, replacement) tuples 
for fixing internal documentation links 44 | replacements = [ 45 | (r"\(\./apps/README\.md\)", r"(./server/apps/README.md)"), 46 | (r"\(\./concepts\.md\)", r"(./server/concepts.md)"), 47 | (r"\(\./lemonade-server-cli\.md\)", r"(./server/lemonade-server-cli.md)"), 48 | (r"\(\./server_models\.md\)", r"(./server/server_models.md)"), 49 | (r"\(\./server_spec\.md\)", r"(./server/server_spec.md)"), 50 | (r"\(\./server_integration\.md\)", r"(./server/server_integration.md)"), 51 | ] 52 | for pattern, repl in replacements: 53 | content = re.sub(pattern, repl, content) 54 | 55 | # Write the fully processed content back to index.md 56 | with open(dst, "w", encoding="utf-8") as f: 57 | f.write(content) 58 | 59 | # Remove existing docs/docs if it exists 60 | if os.path.exists("docs/docs"): 61 | print("Removing ", os.path.abspath("docs/docs")) 62 | shutil.rmtree("docs/docs") 63 | 64 | # Build the documentation using mkdocs 65 | print("[INFO] Building documentation with mkdocs...") 66 | subprocess.run(["mkdocs", "build", "--clean"], check=True) 67 | 68 | # Move the generated site/ directory to docs/docs/, replacing it if it already exists 69 | print("[INFO] Moving site/ to docs/docs/...") 70 | 71 | # Check what mkdocs actually generated 72 | if os.path.exists(os.path.abspath("site/docs")): 73 | # If mkdocs generated site/docs/, move that content 74 | source_dir = os.path.abspath("site/docs") 75 | elif os.path.exists(os.path.abspath("site")): 76 | # If mkdocs generated site/, move that content 77 | source_dir = os.path.abspath("site") 78 | else: 79 | print("[ERROR] No site directory found after mkdocs build!") 80 | sys.exit(1) 81 | 82 | # Move the correct source directory 83 | shutil.move(source_dir, "docs/docs") 84 | print(f"[INFO] Moved {os.path.abspath(source_dir)} to docs/docs/") 85 | 86 | 87 | if __name__ == "__main__": 88 | main() 89 | -------------------------------------------------------------------------------- /docs/server/README.md: -------------------------------------------------------------------------------- 1 | # Getting Started with Lemonade Server 2 | 3 | 🍋 Lemonade Server is a server interface that uses the standard Open AI API, allowing applications to integrate with local LLMs. This means that you can easily replace cloud-based LLMs with private and free LLMs that run locally on your own PC's NPU and GPU. 4 | 5 | Lemonade Server is available as a standalone tool with a [one-click Windows GUI installer](https://github.com/lemonade-sdk/lemonade/releases/latest/download/Lemonade_Server_Installer.exe). 6 | 7 | Once you've installed, we recommend checking out these resources: 8 | 9 | | Documentation | Description | 10 | |---------------|-------------| 11 | | [Supported Applications](./apps/README.md) | Explore applications that work out-of-the-box with Lemonade Server. | 12 | | [Lemonade Server Concepts](./concepts.md) | Background knowledge about local LLM servers and the OpenAI standard. | 13 | | [`lemonade-server` CLI Guide](./lemonade-server-cli.md) | Learn how to manage the server process and install new models using the command-line interface. | 14 | | [Models List](./server_models.md) | Browse a curated set of LLMs available for serving. | 15 | | [Server Spec](./server_spec.md) | Review all supported OpenAI-compatible and Lemonade-specific API endpoints. | 16 | | [Integration Guide](./server_integration.md) | Step-by-step instructions for integrating Lemonade Server into your own applications. 
| 17 | 18 | > Note: if you want to develop Lemonade Server itself, you can [install from source](https://lemonade-server.ai/install_options.html). 19 | 20 | ## Integrate Lemonade Server with Your Application 21 | 22 | Since Lemonade Server implements the standard OpenAI API specification, you can use any OpenAI-compatible client library by configuring it to use `http://localhost:8000/api/v1` as the base URL. A table containing official and popular OpenAI clients on different languages is shown below. 23 | 24 | Feel free to pick and choose your preferred language. 25 | 26 | 27 | | Python | C++ | Java | C# | Node.js | Go | Ruby | Rust | PHP | 28 | |--------|-----|------|----|---------|----|-------|------|-----| 29 | | [openai-python](https://github.com/openai/openai-python) | [openai-cpp](https://github.com/olrea/openai-cpp) | [openai-java](https://github.com/openai/openai-java) | [openai-dotnet](https://github.com/openai/openai-dotnet) | [openai-node](https://github.com/openai/openai-node) | [go-openai](https://github.com/sashabaranov/go-openai) | [ruby-openai](https://github.com/alexrudall/ruby-openai) | [async-openai](https://github.com/64bit/async-openai) | [openai-php](https://github.com/openai-php/client) | 30 | 31 | 32 | ### Python Client Example 33 | ```python 34 | from openai import OpenAI 35 | 36 | # Initialize the client to use Lemonade Server 37 | client = OpenAI( 38 | base_url="http://localhost:8000/api/v1", 39 | api_key="lemonade" # required but unused 40 | ) 41 | 42 | # Create a chat completion 43 | completion = client.chat.completions.create( 44 | model="Llama-3.2-1B-Instruct-Hybrid", # or any other available model 45 | messages=[ 46 | {"role": "user", "content": "What is the capital of France?"} 47 | ] 48 | ) 49 | 50 | # Print the response 51 | print(completion.choices[0].message.content) 52 | ``` 53 | 54 | For more detailed integration instructions, see the [Integration Guide](./server_integration.md). 55 | 56 | 57 | <!--Copyright (c) 2025 AMD--> -------------------------------------------------------------------------------- /docs/server/apps/README.md: -------------------------------------------------------------------------------- 1 | # Lemonade Server Examples 2 | 3 | Many applications today utilize OpenAI models like ChatGPT through APIs such as: 4 | 5 | `POST https://api.openai.com/v1/chat/completions` 6 | 7 | This API call triggers the ChatGPT model to generate responses for a chat. With Lemonade Server, we are replacing the OpenAI endpoint with a local LLM. The new API call becomes: 8 | 9 | `POST http://localhost:8000/api/v1/chat/completions` 10 | 11 | This allows the same application to leverage local LLMs instead of relying on OpenAI's cloud-based models. The guides in this folder show how to connect Lemonade Server to popular applications to enable local LLM execution. To run these examples, you'll need a Windows PC. 12 | 13 | ## 🎥 Video Tutorials 14 | 15 | <div id="yt-carousel" data-videos='[ 16 | {"id": "PXNTDZREJ_A", "title": "Open WebUI Demo"}, 17 | {"id": "JecpotOZ6qo", "title": "Microsoft AI Toolkit Demo"}, 18 | {"id": "bP_MZnDpbUc", "title": "Continue Coding Assistant"}, 19 | {"id": "_PORHv_-atI", "title": "GAIA"} 20 | ]'></div> 21 | 22 | <div class="hide-in-mkdocs"> 23 | 24 | Links to the video tutorials available are provided in the third column of the following table. 
25 | 26 | </div> 27 | 28 | | App | Guide | Video | 29 | |---------------------|-----------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------| 30 | | [Open WebUI](https://github.com/open-webui/open-webui) | [How to chat with Lemonade LLMs in Open WebUI](./open-webui.md) | [Watch Demo](https://www.youtube.com/watch?v=PXNTDZREJ_A) | 31 | | [Continue.dev](https://www.continue.dev/) | [How to use Lemonade LLMs as a coding assistant in Continue](./continue.md) | [Watch Demo](https://youtu.be/bP_MZnDpbUc?si=hRhLbLEV6V_OGlUt) | 32 | | [Microsoft AI Toolkit](https://learn.microsoft.com/en-us/windows/ai/toolkit/) | [Experimenting with Lemonade LLMs in VS Code using Microsoft's AI Toolkit](./ai-toolkit.md) | [Watch Demo](https://youtu.be/JecpotOZ6qo?si=WxWVQhUBCJQgE6vX) | 33 | | [GAIA](https://github.com/amd/gaia) | [An application for running LLMs locally, includes a ChatBot, YouTube Agent, and more](https://github.com/amd/gaia?tab=readme-ov-file#getting-started-guide) | [Watch Demo](https://youtu.be/_PORHv_-atI?si=EYQjmrRQ6Zy2H0ek) | 34 | | [Microsoft AI Dev Gallery](https://aka.ms/ai-dev-gallery) | [Microsoft's showcase application for exploring AI capabilities](./ai-dev-gallery.md) | _coming soon_ | 35 | | [CodeGPT](https://codegpt.co/) | [How to use Lemonade LLMs as a coding assistant in CodeGPT](./codeGPT.md) | _coming soon_ | 36 | | [MindCraft](https://github.com/kolbytn/mindcraft) | [How to use Lemonade LLMs as a Minecraft agent](./mindcraft.md) | _coming soon_ | 37 | | [wut](https://github.com/shobrook/wut) | [Terminal assistant that uses Lemonade LLMs to explain errors](./wut.md) | _coming soon_ | 38 | | [AnythingLLM](https://anythingllm.com/) | [Running agents locally with Lemonade and AnythingLLM](./anythingLLM.md) | _coming soon_ | 39 | | [lm-eval](https://github.com/EleutherAI/lm-evaluation-harness) | [A unified framework to test generative language models on a large number of different evaluation tasks.](./lm-eval.md) | _coming soon_ | 40 | | [PEEL](https://github.com/lemonade-apps/peel) | [Using Local LLMs in Windows PowerShell](https://github.com/lemonade-apps/peel?tab=readme-ov-file#installation) | _coming soon_ | 41 | 42 | ## 📦 Looking for Installation Help? 43 | 44 | To set up Lemonade Server, check out the [Lemonade Server guide](../README.md) for installation instructions and the [server spec](../server_spec.md) to learn more about the functionality. For more information about 🍋 Lemonade SDK, see the [Lemonade SDK README](../README.md). 45 | 46 | ## 🛠️ Support 47 | 48 | If you encounter any issues or have questions, feel free to: 49 | 50 | - File an issue on our [GitHub Issues page](https://github.com/lemonade-sdk/lemonade/issues). 51 | - Email us at [lemonade@amd.com](mailto:lemonade@amd.com). 52 | 53 | ## 💡 Want to Add an Example? 54 | 55 | If you've connected Lemonade to a new application, feel free to contribute a guide by following our contribution guide found [here](../../contribute.md) or let us know at [lemonade@amd.com](mailto:lemonade@amd.com). 56 | 57 | <!--This file was originally licensed under Apache 2.0. It has been modified. 
58 | Modifications Copyright (c) 2025 AMD--> 59 | -------------------------------------------------------------------------------- /docs/server/apps/ai-dev-gallery.md: -------------------------------------------------------------------------------- 1 | # AI Dev Gallery with Lemonade Server 2 | 3 | ## Overview 4 | 5 | [AI Dev Gallery](https://aka.ms/ai-dev-gallery) is Microsoft's showcase application that demonstrates various AI capabilities through built-in samples and applications. It provides an easy way to explore and experiment with different AI models and scenarios, including text generation, chat applications, and more. 6 | 7 | AI Dev Gallery has native integration with Lemonade Server, which means it can automatically detect and connect to your local Lemonade instance without manual URL configuration. 8 | 9 | ## Expectations 10 | 11 | AI Dev Gallery works well with most models available in Lemonade. The built-in samples are designed to work with various model types and sizes, making it a great tool for testing and exploring different AI capabilities locally. 12 | 13 | The application provides a user-friendly interface for experimenting with AI models through pre-built scenarios, making it accessible for both beginners and advanced users. 14 | 15 | ## Setup 16 | 17 | ### Prerequisites 18 | 19 | 1. Install Lemonade Server by following the [Lemonade Server Instructions](../README.md) and using the installer .exe. 20 | 2. **Important**: Make sure your Lemonade Server is running before opening AI Dev Gallery. 21 | 22 | ### Install AI Dev Gallery 23 | 24 | 1. Open the Microsoft Store on Windows. 25 | 2. Search for "AI Dev Gallery" by Microsoft Corporation. 26 | 3. Click "Install" to download and install the application. 27 | 28 | Alternatively, you can access AI Dev Gallery directly through [aka.ms/ai-dev-gallery](https://aka.ms/ai-dev-gallery). 29 | 30 | ### Connect to Lemonade 31 | 32 | AI Dev Gallery has native integration with Lemonade Server, so no manual configuration is required. The application will automatically detect your running Lemonade Server instance. 33 | 34 | **Important**: Ensure your Lemonade Server is running before launching AI Dev Gallery. 35 | 36 | ## Usage 37 | 38 | AI Dev Gallery provides various built-in applications and samples to explore AI capabilities: 39 | 40 | ### Quick Start 41 | 42 | 1. Launch AI Dev Gallery. 43 | 2. Navigate to **Samples** → **Text** → **Chat** (or another text/code sample). 44 | 3. Click on the model selector above the chat window. 45 | 4. Select **Lemonade** from the available providers. 46 | 5. Choose your preferred model from the list of available models. 
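If the model selector stays empty, you can confirm from a terminal that Lemonade Server is reachable and that models are installed. This is a minimal check, assuming the server is running on its default port (8000):

```bash
# List the models the local Lemonade Server currently exposes
curl http://localhost:8000/api/v1/models
```

An empty `data` list in the response means no models have been pulled yet; use the web app's Model Manager or `lemonade-server pull` to install one.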
47 | 48 | ### Supported Scenarios 49 | 50 | AI Dev Gallery supports various AI scenarios through its sample applications with Lemonade integration: 51 | 52 | **Text Processing**: 53 | 54 | - **Conversational AI**: Chat and Semantic Kernel Chat for interactive conversations 55 | - **Content Generation**: Generate text for various purposes and creative writing 56 | - **Language Tasks**: Translation, grammar checking, and paraphrasing 57 | - **Text Analysis**: Sentiment analysis and content moderation 58 | - **Information Retrieval**: Semantic search and retrieval augmented generation 59 | - **Text Enhancement**: Summarization and custom parameter configurations 60 | 61 | **Code Assistance**: 62 | 63 | - **Code Generation**: Create code snippets and programs 64 | - **Code Analysis**: Explain existing code and understand functionality 65 | 66 | 67 | ### Tips for Best Experience 68 | 69 | - Start your Lemonade Server before opening AI Dev Gallery 70 | - Try different models to see how they perform across various scenarios 71 | - Explore different sample categories to understand various AI capabilities 72 | - Use the built-in samples as starting points for your own AI experiments 73 | 74 | ## Troubleshooting 75 | 76 | ### AI Dev Gallery doesn't detect Lemonade 77 | 78 | - Ensure Lemonade Server is running and accessible at `http://localhost:8000` 79 | - Restart AI Dev Gallery after ensuring Lemonade Server is running 80 | 81 | ### Models not appearing in the selector 82 | 83 | - Open `http://localhost:8000` in a browser and make sure to download the models you want to use through the "Model Manager" tab. 84 | 85 | ## Additional Resources 86 | 87 | - [AI Dev Gallery Website](https://aka.ms/ai-dev-gallery) 88 | - [Lemonade Server Models](../server_models.md) 89 | 90 | <!--This file was originally licensed under Apache 2.0. It has been modified. 91 | Modifications Copyright (c) 2025 AMD--> 92 | -------------------------------------------------------------------------------- /docs/server/apps/ai-toolkit.md: -------------------------------------------------------------------------------- 1 | # Microsoft AI Toolkit for VS Code 2 | 3 | ## Overview 4 | 5 | The [AI Toolkit for Visual Studio Code](https://learn.microsoft.com/en-us/windows/ai/toolkit/) is a VS Code extension that simplifies generative AI app development by bringing together cutting-edge AI development tools and models from various catalogs. It supports running AI models locally or connecting to remote models via API keys. 6 | 7 | ## Demo Video 8 | 9 | ▶️ [Watch on YouTube](https://www.youtube.com/watch?v=JecpotOZ6qo) 10 | 11 | <iframe width="560" height="315" src="https://www.youtube.com/embed/JecpotOZ6qo?si=9YcWwVEx7UX5A812" 12 | title="YouTube video player" frameborder="0" allowfullscreen></iframe> 13 | 14 | ## Expectations 15 | 16 | We have found that most LLMs work well with this application. 17 | 18 | However, the `Inference Parameters` option is not fully supported, as Lemonade Server currently does not accept those as inputs (see [server_spec.md](../server_spec.md) for details). 19 | 20 | 21 | ## Setup 22 | 23 | ### Prerequisites 24 | 25 | 1. Install Lemonade Server by following the [Lemonade Server Instructions](../README.md) and using the installer .exe. 26 | 27 | ### Install AI Toolkit for VS Code 28 | 29 | 1. Open the Extensions tab in VS Code Activity Bar. 30 | 2. Search for "AI Toolkit for Visual Studio Code" in the Extensions Marketplace search bar. 31 | 3. Select the AI Toolkit extension and click install. 
32 | 33 | This will add an AI Toolkit icon to your VS Code Activity Bar. 34 | 35 | ### Connect Lemonade to AI Toolkit 36 | 37 | The AI Toolkit now supports "Bring Your Own Model" functionality, allowing you to connect to models served via the OpenAI API standard, which Lemonade uses. 38 | 39 | 1. Open the AI Toolkit tab in your VS Code Activity Bar. 40 | 2. In the right corner of the "My Models" section, click the "+" button to "Add model for remote inference". 41 | 3. Select "Add a custom model". 42 | 4. When prompted to "Enter OpenAI chat completion endpoint URL" enter: 43 | ``` 44 | http://localhost:8000/api/v1/chat/completions 45 | ``` 46 | 5. When prompted to "Enter the exact model name as in the API" select a model (e.g., `Phi-3-Mini-Instruct-Hybrid`) 47 | - Note: You can get a list of all models available [here](../server_models.md). 48 | 6. Select the same name as the display model name. 49 | 7. Skip the HTTP authentication step by pressing "Enter". 50 | 51 | ## Usage 52 | 53 | Once you've set up the Lemonade model in AI Toolkit, you can: 54 | 55 | 1. Use the **AI Playground** tool to directly interact with your added model. 56 | 2. Use the **Prompt Builder** tool to craft effective prompts for your AI models. 57 | 3. Use the **Bulk Run** tool to compute responses for custom datasets and easily visualize those responses on a table format. 58 | 4. Use the **Evaluation** tool to quickly assess your model's coherence, fluency, relevance, and similarity, as well as to compute BLEU, F1, GLEU, and Meteor scores. 59 | 60 | ## Additional Resources 61 | 62 | - [AI Toolkit for VS Code Documentation](https://learn.microsoft.com/en-us/windows/ai/toolkit/) 63 | - [AI Toolkit GitHub Repository](https://github.com/microsoft/vscode-ai-toolkit) 64 | - [Bring Your Own Models on AI Toolkit](https://techcommunity.microsoft.com/blog/azuredevcommunityblog/bring-your-own-models-on-ai-toolkit---using-ollama-and-api-keys/4369411) 65 | 66 | <!--This file was originally licensed under Apache 2.0. It has been modified. 67 | Modifications Copyright (c) 2025 AMD--> -------------------------------------------------------------------------------- /docs/server/apps/anythingLLM.md: -------------------------------------------------------------------------------- 1 | 2 | # Running agents locally with Lemonade and AnythingLLM 3 | 4 | ## Overview 5 | 6 | [AnythingLLM](https://github.com/Mintplex-Labs/anything-llm) is a versatile local LLM platform that allows you to chat with your documents and code using a variety of models. It supports the OpenAI-compatible API interface, allowing easy integration with local servers like Lemonade. 7 | 8 | This guide will help you configure AnythingLLM to use Lemonade's OpenAI-compatible server, and utilize the powerful `@agent` capability to interact with documents, webpages, and more. 9 | 10 | ## Expectations 11 | 12 | Lemonade integrates best with AnythingLLM when using models such as `Qwen-1.5-7B-Chat-Hybrid` and `Llama-3.2-1B-Instruct-Hybrid`, both of which support a context length of up to 3,000 tokens. 13 | 14 | Keep in mind that when using the `@agent` feature, multi-turn conversations can quickly consume available context. As a result, the number of back-and-forth turns in a single conversation may be limited due to the growing context size. 15 | 16 | 17 | ## Setup 18 | 19 | ### Prerequisites 20 | 21 | 1. Install Lemonade Server by following the [Lemonade Server Instructions](../README.md) and using the installer .exe. 22 | 2. 
Install and set up AnythingLLM from their [GitHub](https://github.com/Mintplex-Labs/anything-llm#quick-start) or [website](https://anythingllm.com/desktop). 23 | 24 | 25 | ### Configure AnythingLLM to Use Lemonade 26 | 27 | <ol> 28 | <li>In the bottom of the left menu, click on the wrench icon to "Open Settings".</li> 29 | <li>Under the menu "AI Providers", click "LLM".</li> 30 | <li> 31 | Select "Generic OpenAI" and enter the following info: 32 | <table> 33 | <tr><th>Setting</th><th>Value</th></tr> 34 | <tr><td><b>Base URL</b></td><td><code>http://localhost:8000/api/v1</code></td></tr> 35 | <tr><td><b>API Key</b></td><td><code>-</code></td></tr> 36 | <tr><td><b>Chat Model Name</b></td><td><code>Qwen-1.5-7B-Chat-Hybrid</code></td></tr> 37 | <tr><td><b>Token context window</b></td><td><code>3000</code></td></tr> 38 | <tr><td><b>Max Tokens</b></td><td><code>3000</code></td></tr> 39 | </table> 40 | </li> 41 | <li>In the bottom left, click the back button to exit.</li> 42 | <li>In the left menu, click "New Workspace" and give it a name.</li> 43 | <li>Where you see your new workspace, click the gear icon to open the "Workspace Settings"</li> 44 | <li>In the top menu of the window that opens, click on "Agent Configuration"</li> 45 | <li>Under Chat Settings, select Generic OpenAI and click save.</li> 46 | <li>Under Workspace Agent LLM Provider, select "Generic OpenAI" and click save.</li> 47 | </ol> 48 | 49 | ## Usage with @agent 50 | 51 | ### Overview 52 | 53 | Agents are capable of scraping websites, listing and summarizing documents, searching the web, creating charts, and even saving files to your desktop or their own memory. 54 | 55 | To start an agent session, simply go to any workspace and type `@agent <your prompt>`. To exit the session, just type `exit`. 56 | 57 | ### Agent Skills 58 | 59 | You may turn on and off specific `Agent Skills` by going to your `Workspace Settings` → `Agent Configuration` → `Configure Agent Skills`. 60 | 61 | Available agent skills include: 62 | 63 | * RAG & long-term memory 64 | * View and summarize documents 65 | * Scrape Websites 66 | * Generate & save files to browser 67 | * Generate Charts 68 | * Web Search 69 | * SQL Connector 70 | 71 | ### Examples 72 | 73 | Here are some examples on how you can interact with Anything LLM agents: 74 | 75 | - **Rag & long-term memory** 76 | - `@agent My name is Dr Lemon. Remember this in our next conversation` 77 | - Then, on a follow up chat you can ask `@agent What is my name according to your memory?` 78 | - **Scrape Websites** 79 | - `@agent Scrape this website and tell me what are the two ways of installing lemonade https://github.com/lemonade-sdk/lemonade/blob/main/docs/server/README.md` 80 | - **Web Search** (enable skill before trying) 81 | - `@agent Search the web for the best place to buy shoes` 82 | 83 | You can find more details about agent usage [here](https://docs.anythingllm.com/agent/usage). 84 | 85 | ## Additional Resources 86 | 87 | - [AnthingLLM Website](https://anythingllm.com/) 88 | - [AnythingLLM GitHub](https://github.com/Mintplex-Labs/anything-llm) 89 | - [AnythingLLM Documentation](https://docs.anythingllm.com/) 90 | 91 | <!--This file was originally licensed under Apache 2.0. It has been modified. 
92 | Modifications Copyright (c) 2025 AMD-->
--------------------------------------------------------------------------------
/docs/server/apps/codeGPT.md:
--------------------------------------------------------------------------------
1 | # CodeGPT with VS Code
2 | 
3 | ## Overview
4 | 
5 | [CodeGPT Chat](https://codegpt.co/) is an AI-powered chatbot designed to assist developers with coding tasks directly within their preferred integrated development environments (IDEs), for example, VS Code.
6 | 
7 | ## Expectations
8 | 
9 | We have found that the `Qwen-1.5-7B-Chat-Hybrid` model is the best Hybrid model available for coding. It is good at chatting with a few files at a time in your codebase to learn more about them. It can also make simple code editing suggestions pertaining to a few lines of code at a time.
10 | 
11 | However, we do not recommend using this model for analyzing large codebases at once or making large or complex file edits.
12 | 
13 | ## Setup
14 | 
15 | ### Prerequisites
16 | 
17 | 1. Install Lemonade Server by following the [Lemonade Server Instructions](../README.md) and using the installer .exe.
18 | 
19 | ### Install CodeGPT in VS Code
20 | 
21 | > The following instructions are based on CodeGPT's own installation instructions found [here](https://docs.codegpt.co/docs/tutorial-basics/installation).
22 | 
23 | 1. Open the Extensions tab in VS Code Activity Bar.
24 | 1. Search "CodeGPT: Chat & AI Agents" in the Extensions Marketplace search bar.
25 | 1. Select the CodeGPT extension and click install.
26 | 
27 | This will add a CodeGPT tab to your VS Code Activity Bar.
28 | 
29 | ### Add Lemonade Server to CodeGPT
30 | 
31 | > Note: The following instructions are based on instructions from CodeGPT found [here](https://docs.codegpt.co/docs/tutorial-ai-providers/custom).
32 | 
33 | 
34 | <ol>
35 | <li>Open the CodeGPT tab in your VS Code Activity Bar.</li>
36 | <li>Sign up or sign in to your account.</li>
37 | <li>In the model dropdown menu, click "View More".</li>
38 | <li>Select the tab: "LLMs Cloud model"</li>
39 | <li>Under "All Models", set the following:
40 | <table>
41 | <tr><th>Field</th><th>Value</th></tr>
42 | <tr><td><b>Select Provider:</b></td><td><code>Custom</code></td></tr>
43 | <tr><td><b>Select Model: </b></td><td><code>Qwen-1.5-7B-Chat-Hybrid</code></td></tr>
44 | </table>
45 | </li>
46 | <li>Click "Change connection settings" and enter the following information:
47 | <table>
48 | <tr><th>Field</th><th>Value</th></tr>
49 | <tr><td><b>API Key</b></td><td><code>-</code></td></tr>
50 | <tr><td><b>Custom Link</b></td><td><code>http://localhost:8000/api/v1</code></td></tr>
51 | </table>
52 | </li>
53 | </ol>
54 | 
55 | ## Usage
56 | 
57 | > Note: see the CodeGPT [user guide](https://docs.codegpt.co/docs/intro) to learn about all of their features.
58 | 
59 | To try out CodeGPT:
60 | 
61 | - Open the CodeGPT tab in your VS Code Activity Bar, and in the chat box, type a question about your code. Use the `#` symbol to specify a file.
62 | - Example: "What's the fastest way to install lemonade in #getting_started.md?"
63 | - Use /Fix to find and fix a minor bug.
64 | - Use /Document to come up with docstrings and comments for a file.
65 | - Use /UnitTest to make a test file.
66 | 
67 | <!--This file was originally licensed under Apache 2.0. It has been modified.
68 | Modifications Copyright (c) 2025 AMD--> -------------------------------------------------------------------------------- /docs/server/apps/continue.md: -------------------------------------------------------------------------------- 1 | # Continue Coding Assistant 2 | 3 | ## Overview 4 | 5 | [Continue](https://www.continue.dev/) is a coding assistant that lives inside of a VS Code extension. It supports chatting with your codebase, making edits, and a lot more. 6 | 7 | ## Demo Video 8 | 9 | ▶️ [Watch on YouTube](https://www.youtube.com/watch?v=bP_MZnDpbUc&source_ve_path=MjM4NTE) 10 | 11 | <iframe width="560" height="315" src="https://www.youtube.com/embed/bP_MZnDpbUc?si=0KZLzQzFlRvW9J9f" 12 | title="YouTube video player" frameborder="0" allowfullscreen></iframe> 13 | 14 | ## Expectations 15 | 16 | We have found that the `Qwen-1.5-7B-Chat-Hybrid` model is the best Hybrid model available for coding. It is good at chatting with a few files at a time in your codebase to learn more about them. It can also make simple code editing suggestions pertaining to a few lines of code at a time. 17 | 18 | However, we do not recommend using this model for analyzing large codebases at once or making large or complex file edits. 19 | 20 | ## Setup 21 | 22 | ### Prerequisites 23 | 24 | 1. Install Lemonade Server by following the [Lemonade Server Instructions](../README.md) and using the installer .exe. 25 | 26 | ### Install Continue 27 | 28 | > Note: they provide their own instructions [here](https://marketplace.visualstudio.com/items?itemName=Continue.continue) 29 | 30 | 1. Open the Extensions tab in VS Code Activity Bar. 31 | 1. Search "Continue - Codestral, Claude, and more" in the Extensions Marketplace search bar. 32 | 1. Select the Continue extension and click install. 33 | 34 | This will add a Continue tab to your VS Code Activity Bar. 35 | 36 | ### Add Lemonade Server to Continue 37 | 38 | > Note: The following instructions are based on instructions from Continue found [here](https://docs.continue.dev/customize/model-providers/openai#openai-compatible-servers--apis) 39 | 40 | 1. Open the Continue tab in your VS Code Activity Bar. 41 | 1. Click the chat box. Some buttons will appear at the bottom of the box, including `Select model`. 42 | 1. Click `Select model`, then `+ Add Chat model` to open the new model dialog box. 43 | 1. Click the `config file` link at the very bottom of the dialog to open `config.yaml`. 44 | 1. Replace the "models" key in the `config.yaml` with the following and save: 45 | 46 | ```yaml 47 | models: 48 | - name: Lemonade 49 | provider: openai 50 | model: Qwen-1.5-7B-Chat-Hybrid 51 | apiBase: http://localhost:8000/api/v1 52 | apiKey: none 53 | ``` 54 | 55 | 6. Close the dialog box. 56 | 7. Click the chat box again. You should see `Lemonade` where you used to see `Select model`. Ready! 57 | 58 | ## Usage 59 | 60 | > Note: see the Continue [user guide](https://docs.continue.dev/) to learn about all of their features. 61 | 62 | Here are some examples for trying out Continue. These examples assume you have cloned this repo and allowed Continue to index it. 63 | 64 | ### Chat with Files 65 | 66 | Open the Continue tab in your VS Code Activity Bar, and in the "Ask anything" box, type a question about your code. Use the `@` symbol to specify a file or tool. 67 | 68 | - "What's the fastest way to install Lemonade in `@getting_started.md?`" 69 | - "According to `@README.md` what do I need to do to set up for `@api_oga_hybrid_streaming.py`?" 
70 | 71 | ### Editing Files 72 | 73 | Open a file, select some code, and push Ctrl+I to start a chat about editing that code. 74 | 75 | 1. Open `//examples//lemonade//api_basic.py`. 76 | 1. Select the `print(...` line at the bottom and press `ctrl+i`. 77 | 1. Write "Add a helpful comment" in the chat box and press enter. 78 | 1. Press "accept" if you would like to accept the change. 79 | 80 | ### Making Files 81 | 82 | Start a new chat and prompt: 83 | 84 | > write a script in the style of `@api_basic.py` that uses the microsoft/Phi-4-mini-instruct model on GPU 85 | 86 | Here's what we got: 87 | 88 | ```python 89 | # Import necessary modules 90 | from lemonade.api import from_pretrained 91 | 92 | # Load the Phi-4-mini-instruct model with the hf-cpu recipe 93 | model, tokenizer = from_pretrained("microsoft/Phi-4-mini-instruct", recipe="hf-cpu") 94 | 95 | # Define your prompt 96 | prompt = "This is a sample prompt for the Phi-4-mini-instruct model" 97 | 98 | # Tokenize the prompt 99 | input_ids = tokenizer(prompt, return_tensors="pt") 100 | 101 | # Generate the response using the model 102 | response = model.generate(input_ids, max_new_tokens=100) # Adjust the max_new_tokens as needed 103 | 104 | # Decode the generated response 105 | generated_text = tokenizer.decode(response[0]) 106 | 107 | # Print the response 108 | print(generated_text) 109 | ``` 110 | 111 | <!--This file was originally licensed under Apache 2.0. It has been modified. 112 | Modifications Copyright (c) 2025 AMD--> -------------------------------------------------------------------------------- /docs/server/apps/wut.md: -------------------------------------------------------------------------------- 1 | # `wut` Terminal Assistant 2 | 3 | ## Overview 4 | 5 | The [`wut` terminal assistant](https://github.com/shobrook/wut) uses LLMs to parse your terminal's scrollback, helping you troubleshoot your last command. 6 | 7 | ## Expectations 8 | 9 | We found that `wut` works nicely with the `Llama-3.2-3B-Instruct-Hybrid` model. 10 | 11 | It is not especially convenient to use `wut` with Windows until the developers remove the requirement for `tmux`, however we do provide instructions for getting set up on Windows in this guide. 12 | 13 | `wut` seems to send the entire terminal scrollback to the LLM, which can produce very long prompts that exceed the LLM's context length. We recommend restricting the terminal scrollback or using a fresh `tmux` session when trying this out. 14 | 15 | ## Setup 16 | 17 | ### Prerequisites 18 | 19 | #### Install Lemonade Server 20 | 21 | 1. Install Lemonade Server by following the [Lemonade Server Instructions](../README.md) and using the installer .exe. 22 | 23 | #### Installing Windows Subsystem for Linux (WSL) 24 | 25 | `wut` currently requires a `tmux` terminal in order to function. We found the simplest way to achieve this on Windows was through the Windows Subsystem for Linux (WSL). 26 | 27 | 1. Install [Windows Subsystem for Linux](https://learn.microsoft.com/en-us/windows/wsl/install). 28 | 1. Open the `WSL Settings` app, navigate to `Networking`, and make sure the `Networking mode` is `Mirrored`. This is required for WSL terminals to be able to see the Lemonade server running in Windows. 29 | 1. If needed: shut down WSL to make sure the changes apply: 30 | 31 | ```powershell 32 | wsl --shutdown 33 | ``` 34 | 35 | ### Installing Wut 36 | 37 | * Start a WSL terminal. 
38 | * Install [`pipx`](https://github.com/pypa/pipx), as recommended by the following `wut` instructions: 39 | 40 | ```bash 41 | sudo apt update 42 | sudo apt install pipx 43 | pipx ensurepath 44 | ``` 45 | 46 | * Re-launch your terminal to make sure `pipx` is available, then install `wut`: 47 | 48 | ```bash 49 | pipx install wut-cli 50 | ``` 51 | 52 | * Add `wut`'s required environment variables to your `.bashrc` file: 53 | 54 | ```bash 55 | export OPENAI_API_KEY="-" 56 | export OPENAI_MODEL="Llama-3.2-3B-Instruct-Hybrid" 57 | export OPENAI_BASE_URL="http://localhost:8000/api/v1" 58 | ``` 59 | 60 | ## Usage 61 | 62 | ### Start a terminal 63 | 64 | 1. Start a WSL terminal. 65 | 2. Start a `tmux` session: 66 | 67 | ```bash 68 | tmux 69 | ``` 70 | 71 | Then, try some of these example commands that `wut` can help explain. 72 | 73 | ### Help with Lemonade Server 74 | 75 | People often ask exactly what Lemonade Server's `models` endpoint does. Fortunately, `wut` is able to intuit the answer! 76 | 77 | ```bash 78 | curl http://localhost:8000/api/v1/models 79 | wut 80 | ``` 81 | 82 | The terminal response of the `curl` command is this (only intelligible by machines): 83 | 84 | ``` 85 | curl http://localhost:8000/api/v1/models 86 | {"object":"list","data":[{"id":"Qwen2.5-0.5B-Instruct-CPU","created":1744226681,"object":"model","owned_by":"lemonade"},{"id":"Llama-3.2-1B-Instruct-Hybrid","created":1744226681,"object":"model","owned_by":"lemonade"},{"id":"Llama-3.2-3B-Instruct-Hybrid","created":1744226681,"object":"model","owned_by":"lemonade"},{"id":"Phi-3-Mini-Instruct-Hybrid","created":1744226681,"object":"model","owned_by":"lemonade"},{"id":"Qwen-1.5-7B-Chat-Hybrid","created":1744226681,"object":"model","owned_by":"lemonade"},{"id":"DeepSeek-R1-Distill-Llama-8B-Hybrid","created":1744226681,"object":"model","owned_by":"lemonade"},{"id":"DeepSeek-R1-Distill-Qwen-7B-Hybrid","created":1744226681,"object":"model","owned_by":"lemonade"}]} 87 | ``` 88 | 89 | But `wut` does a nice job interpreting: 90 | 91 | ``` 92 | The output suggests that the API endpoint is returning a list of models, and the owned_by field indicates that all models are owned by "lemonade". Thecreated timestamp indicates when each model was created. 93 | 94 | The output is a valid JSON response, and there is no error or warning message. The command was successful, and the output can be used for further processing or analysis. 95 | ``` 96 | 97 | 98 | ### Bad Git Command 99 | 100 | Run a command that doesn't exist, and then ask `wut` for help: 101 | 102 | ```bash 103 | git pull-request 104 | wut 105 | ``` 106 | 107 | Results in: 108 | 109 | > git: 'pull-request' is not a git command. See 'git --help'. 110 | 111 | And then `wut` provides some helpful feedback: 112 | 113 | > Key takeaway: The command git pull-request is not a valid Git command. The correct command to create a pull request is git request-pull, but it's not a standard Git command. The output wut is the name of the activated Conda environment. To create a pull request, use git request-pull or git pull with the --pr option. 114 | 115 | <!--This file was originally licensed under Apache 2.0. It has been modified. 
116 | Modifications Copyright (c) 2025 AMD--> -------------------------------------------------------------------------------- /docs/server/lemonade-server-cli.md: -------------------------------------------------------------------------------- 1 | # `lemonade-server` CLI 2 | 3 | The `lemonade-server` command-line interface (CLI) provides a set of utility commands for managing the server. When you install Lemonade Server using the GUI installer, `lemonade-server` is added to your PATH so that it can be invoked from any terminal. 4 | 5 | > Note: if you installed from source or PyPI, you should call `lemonade-server-dev` in your activated Python environment, instead of using `lemonade-server`. 6 | 7 | `lemonade-server` provides these utilities: 8 | 9 | | Option/Command | Description | 10 | |---------------------|-------------------------------------| 11 | | `-v`, `--version` | Print the `lemonade-sdk` package version used to install Lemonade Server. | 12 | | `serve` | Start the server process in the current terminal. See command options [below](#command-line-options-for-serve). | 13 | | `status` | Check if server is running. If it is, print the port number. | 14 | | `stop` | Stop any running Lemonade Server process. | 15 | | `pull MODEL_NAME` | Install an LLM named `MODEL_NAME`. See the [server models guide](./server_models.md) for more information. | 16 | | `run MODEL_NAME` | Start the server (if not already running) and chat with the specified model. | 17 | | `list` | List all models. | 18 | 19 | 20 | Example: 21 | 22 | ```bash 23 | lemonade-server serve --port 8080 --log-level debug --truncate-inputs 24 | ``` 25 | 26 | ## Command Line Options for `serve` 27 | 28 | When using the `serve` command, you can configure the server with these additional options: 29 | 30 | | Option | Description | Default | 31 | |--------------------------------|-------------------------------------|---------| 32 | | `--port [port]` | Specify the port number to run the server on | 8000 | 33 | | `--log-level [level]` | Set the logging level | info | 34 | 35 | The [Lemonade Server integration guide](./server_integration.md) provides more information about how these commands can be used to integrate Lemonade Server into an application. 36 | 37 | <!--Copyright (c) 2025 AMD--> -------------------------------------------------------------------------------- /docs/versioning.md: -------------------------------------------------------------------------------- 1 | # Versioning Policy 2 | 3 | The `lemonade-sdk` package applies semantic versioning for its 3-digit version number. The version number is stored in `src/version.py`. 4 | 5 | The 3 digits correspond to MAJOR.MINOR.PATCH, which can be interpreted as follows: 6 | * MAJOR: changes indicate breaking API changes that may require the user to change their own code 7 | * MINOR: changes indicate that builds against a previous minor version may not be compatible, and the user may need to rebuild those models 8 | * PATCH: no user action required when the patch number changes 9 | 10 | <!--This file was originally licensed under Apache 2.0. It has been modified. 11 | Modifications Copyright (c) 2025 AMD--> -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Lemonade Examples 2 | 3 | This folder contains examples of how to deploy `lemonade` into applications. 
4 | 5 | ## Notebooks 6 | 7 | The `notebooks/` folder contains Jupyter notebooks that give tutorials on deeper topics. 8 | 9 | ## API Examples 10 | 11 | This folder has examples of using the Lemonade API to integrate LLMs into Python applications. These APIs make it easy to load a model, generate responses, and also show how to stream those responses. 12 | 13 | The `demos/` folder also contains some higher-level application demos of the APIs. Learn more in `demos/README.md`. 14 | 15 | This table shows which API examples are available: 16 | 17 | | Framework | CPU | NPU | Hybrid | 18 | |----------------------------|---------------------------|-----------------|--------------------| 19 | | Huggingface | api_basic.py | - | - | 20 | | OGA | api_oga_cpu.py | api_oga_npu.py | api_oga_hybrid.py | 21 | | Huggingface with streaming | api_streaming.py | - | - | 22 | | OGA with streaming | api_oga_cpu_streaming.py | api_oga_npu_streaming.py | api_oga_hybrid_streaming.py | 23 | 24 | To run an API example, first set up a conda environment with the appropriate framework and backend support. Then run the scripts with a command like `python api_basic.py`. 25 | 26 | <!--This file was originally licensed under Apache 2.0. It has been modified. 27 | Modifications Copyright (c) 2025 AMD--> 28 | -------------------------------------------------------------------------------- /examples/api_basic.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example demonstrates how to use the lemonade API to load a model for 3 | inference on CPU using the hf-cpu recipe, and then use it to generate 4 | the response to a prompt. 5 | 6 | If you have a discrete GPU, you can try that by changing the recipe 7 | to hf-dgpu. Note: make sure to have torch+cuda installed when trying 8 | hf-dgpu. 9 | """ 10 | 11 | from lemonade.api import from_pretrained 12 | 13 | model, tokenizer = from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", recipe="hf-cpu") 14 | 15 | input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids 16 | response = model.generate(input_ids, max_new_tokens=30) 17 | 18 | print(tokenizer.decode(response[0])) 19 | 20 | # This file was originally licensed under Apache 2.0. It has been modified. 21 | # Modifications Copyright (c) 2025 AMD 22 | -------------------------------------------------------------------------------- /examples/api_oga_cpu.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example demonstrates how to use the lemonade API to load a model for 3 | inference on CPU via OnnxRuntime-Genai (OGA) using the oga-cpu recipe, 4 | and then use it to generate the response to a prompt. 5 | 6 | Make sure you have set up your OGA device in your Python environment. 7 | See for details: 8 | https://github.com/lemonade-sdk/lemonade/blob/main/docs/README.md#installation 9 | """ 10 | 11 | from lemonade.api import from_pretrained 12 | 13 | model, tokenizer = from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", recipe="oga-cpu") 14 | 15 | input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids 16 | response = model.generate(input_ids, max_new_tokens=30) 17 | 18 | print(tokenizer.decode(response[0])) 19 | 20 | # This file was originally licensed under Apache 2.0. It has been modified. 
21 | # Modifications Copyright (c) 2025 AMD
22 | 
--------------------------------------------------------------------------------
/examples/api_oga_cpu_streaming.py:
--------------------------------------------------------------------------------
1 | """
2 | This example demonstrates how to use the lemonade API to load a model for
3 | inference on CPU via OnnxRuntime-GenAI using the oga-cpu recipe, and then
4 | use a thread to generate a streaming response to a prompt.
5 | 
6 | Note: this approach only works with recipes that support lemonade's OrtGenaiStreamer,
7 | i.e., OGA-based recipes such as oga-cpu, oga-igpu, oga-npu, and oga-hybrid.
8 | 
9 | Make sure you have set up your OGA device in your Python environment.
10 | See for details:
11 | https://github.com/lemonade-sdk/lemonade/blob/main/docs/README.md#installation
12 | """
13 | 
14 | from threading import Thread
15 | from lemonade.api import from_pretrained
16 | from lemonade.tools.oga.utils import OrtGenaiStreamer
17 | 
18 | model, tokenizer = from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", recipe="oga-cpu")
19 | 
20 | input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids
21 | 
22 | streamer = OrtGenaiStreamer(tokenizer)
23 | generation_kwargs = {
24 |     "input_ids": input_ids,
25 |     "streamer": streamer,
26 |     "max_new_tokens": 30,
27 | }
28 | 
29 | thread = Thread(target=model.generate, kwargs=generation_kwargs)
30 | thread.start()
31 | 
32 | # Generate the response using streaming
33 | for new_text in streamer:
34 |     print(new_text)
35 | 
36 | thread.join()
37 | 
38 | # This file was originally licensed under Apache 2.0. It has been modified.
39 | # Modifications Copyright (c) 2025 AMD
40 | 
--------------------------------------------------------------------------------
/examples/api_oga_hybrid.py:
--------------------------------------------------------------------------------
1 | """
2 | This example demonstrates how to use the lemonade API to load a model for
3 | inference on Ryzen AI hybrid mode (NPU and iGPU together) via OnnxRuntime-Genai (OGA)
4 | using the oga-hybrid recipe, and then use it to generate the response to a prompt.
5 | 
6 | Make sure you have set up your OGA device in your Python environment.
7 | See for details:
8 | https://github.com/lemonade-sdk/lemonade/blob/main/docs/README.md#installation
9 | """
10 | 
11 | from lemonade.api import from_pretrained
12 | 
13 | model, tokenizer = from_pretrained(
14 |     "amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid", recipe="oga-hybrid"
15 | )
16 | 
17 | input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids
18 | response = model.generate(input_ids, max_new_tokens=30)
19 | 
20 | print(tokenizer.decode(response[0]))
21 | 
22 | # This file was originally licensed under Apache 2.0. It has been modified.
23 | # Modifications Copyright (c) 2025 AMD
24 | 
--------------------------------------------------------------------------------
/examples/api_oga_hybrid_streaming.py:
--------------------------------------------------------------------------------
1 | """
2 | This example demonstrates how to use the lemonade API to load a model for
3 | inference on Ryzen AI hybrid mode (NPU and iGPU together) via OnnxRuntime-GenAI
4 | using the oga-hybrid recipe, and then use a thread to generate a streaming
5 | response to a prompt.
6 | 
7 | Note: this approach only works with recipes that support lemonade's OrtGenaiStreamer,
8 | i.e., OGA-based recipes such as oga-cpu, oga-igpu, oga-npu, and oga-hybrid.
9 | 10 | Make sure you have set up your OGA device in your Python environment. 11 | See for details: 12 | https://github.com/lemonade-sdk/lemonade/blob/main/docs/README.md#installation 13 | """ 14 | 15 | from threading import Thread 16 | from lemonade.api import from_pretrained 17 | from lemonade.tools.oga.utils import OrtGenaiStreamer 18 | 19 | model, tokenizer = from_pretrained( 20 | "amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid", recipe="oga-hybrid" 21 | ) 22 | 23 | input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids 24 | 25 | streamer = OrtGenaiStreamer(tokenizer) 26 | generation_kwargs = { 27 | "input_ids": input_ids, 28 | "streamer": streamer, 29 | "max_new_tokens": 30, 30 | } 31 | 32 | thread = Thread(target=model.generate, kwargs=generation_kwargs) 33 | thread.start() 34 | 35 | # Generate the response using streaming 36 | for new_text in streamer: 37 | print(new_text) 38 | 39 | thread.join() 40 | 41 | # This file was originally licensed under Apache 2.0. It has been modified. 42 | # Modifications Copyright (c) 2025 AMD 43 | -------------------------------------------------------------------------------- /examples/api_oga_igpu.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example demonstrates how to use the lemonade API to load a model for 3 | inference on integrated GPUs (iGPUs) via OnnxRuntime-Genai (OGA) 4 | using the oga-igpu recipe, and then use it to generate the response to a prompt. 5 | 6 | Make sure you have set up your OGA device in your Python environment. 7 | See for details: 8 | https://github.com/lemonade-sdk/lemonade/blob/main/docs/README.md#installation 9 | """ 10 | 11 | from lemonade.api import from_pretrained 12 | 13 | model, tokenizer = from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", recipe="oga-igpu") 14 | 15 | input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids 16 | response = model.generate(input_ids, max_new_tokens=30) 17 | 18 | print(tokenizer.decode(response[0])) 19 | 20 | # This file was originally licensed under Apache 2.0. It has been modified. 21 | # Modifications Copyright (c) 2025 AMD 22 | -------------------------------------------------------------------------------- /examples/api_oga_igpu_streaming.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example demonstrates how to use the lemonade API to load a model for 3 | inference on integrated GPUs (iGPUs) via OnnxRuntime-GenAI using the oga-igpu recipe, 4 | and then use a thread to generate a streaming the response to a prompt. 5 | 6 | Note: this approach only works with recipes that support lemonade's OrtGenaiStreamer, 7 | i.e., OGA-based recipes such as oga-cpu, oga-igpu, oga-npu, and oga-hybrid. 8 | 9 | Make sure you have set up your OGA device in your Python environment. 
10 | See for details: 11 | https://github.com/lemonade-sdk/lemonade/blob/main/docs/README.md#installation 12 | """ 13 | 14 | from threading import Thread 15 | from lemonade.api import from_pretrained 16 | from lemonade.tools.ort_genai.oga import OrtGenaiStreamer 17 | 18 | model, tokenizer = from_pretrained( 19 | "Qwen/Qwen2.5-0.5B-Instruct", 20 | recipe="oga-igpu", 21 | ) 22 | 23 | input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids 24 | 25 | streamer = OrtGenaiStreamer(tokenizer) 26 | generation_kwargs = { 27 | "input_ids": input_ids, 28 | "streamer": streamer, 29 | "max_new_tokens": 30, 30 | } 31 | 32 | thread = Thread(target=model.generate, kwargs=generation_kwargs) 33 | thread.start() 34 | 35 | # Generate the response using streaming 36 | for new_text in streamer: 37 | print(new_text) 38 | 39 | thread.join() 40 | 41 | # This file was originally licensed under Apache 2.0. It has been modified. 42 | # Modifications Copyright (c) 2025 AMD 43 | -------------------------------------------------------------------------------- /examples/api_oga_npu.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example demonstrates how to use the lemonade API to load a model for 3 | inference on Ryzen AI NPU via OnnxRuntime-Genai (OGA) using the oga-npu recipe, 4 | and then use it to generate the response to a prompt. 5 | 6 | Make sure you have set up your OGA device in your Python environment. 7 | See for details: 8 | https://github.com/lemonade-sdk/lemonade/blob/main/docs/README.md#installation 9 | """ 10 | 11 | from lemonade.api import from_pretrained 12 | 13 | model, tokenizer = from_pretrained( 14 | "amd/Phi-3.5-mini-instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix", 15 | recipe="oga-npu", 16 | ) 17 | 18 | input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids 19 | response = model.generate(input_ids, max_new_tokens=30) 20 | 21 | print(tokenizer.decode(response[0])) 22 | 23 | # This file was originally licensed under Apache 2.0. It has been modified. 24 | # Modifications Copyright (c) 2025 AMD 25 | -------------------------------------------------------------------------------- /examples/api_oga_npu_streaming.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example demonstrates how to use the lemonade API to load a model for 3 | inference on Ryzen AI NPU via OnnxRuntime-GenAI using the oga-npu recipe, 4 | and then use a thread to generate a streaming the response to a prompt. 5 | 6 | Note: this approach only works with recipes that support lemonade's OrtGenaiStreamer, 7 | i.e., OGA-based recipes such as oga-cpu, oga-igpu, oga-npu, and oga-hybrid. 8 | 9 | Make sure you have set up your OGA device in your Python environment. 
10 | See for details: 11 | https://github.com/lemonade-sdk/lemonade/blob/main/docs/README.md#installation 12 | """ 13 | 14 | from threading import Thread 15 | from lemonade.api import from_pretrained 16 | from lemonade.tools.oga.utils import OrtGenaiStreamer 17 | 18 | model, tokenizer = from_pretrained( 19 | "amd/Phi-3.5-mini-instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix", 20 | recipe="oga-npu", 21 | ) 22 | 23 | input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids 24 | 25 | streamer = OrtGenaiStreamer(tokenizer) 26 | generation_kwargs = { 27 | "input_ids": input_ids, 28 | "streamer": streamer, 29 | "max_new_tokens": 30, 30 | } 31 | 32 | thread = Thread(target=model.generate, kwargs=generation_kwargs) 33 | thread.start() 34 | 35 | # Generate the response using streaming 36 | for new_text in streamer: 37 | print(new_text) 38 | 39 | thread.join() 40 | 41 | # This file was originally licensed under Apache 2.0. It has been modified. 42 | # Modifications Copyright (c) 2025 AMD 43 | -------------------------------------------------------------------------------- /examples/api_streaming.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example demonstrates how to use the lemonade API to load a model for 3 | inference on CPU using the hf-cpu recipe, and then use a thread to 4 | generate a streaming the response to a prompt. 5 | 6 | Note: this approach only works with recipes that support TextIteratorStreamer, 7 | i.e., huggingface-based recipes such as hf-cpu and hf-dgpu. 8 | """ 9 | 10 | from threading import Thread 11 | from transformers import TextIteratorStreamer 12 | from lemonade.api import from_pretrained 13 | 14 | model, tokenizer = from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", recipe="hf-cpu") 15 | 16 | input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids 17 | 18 | streamer = TextIteratorStreamer( 19 | tokenizer, 20 | skip_prompt=True, 21 | ) 22 | generation_kwargs = { 23 | "input_ids": input_ids, 24 | "streamer": streamer, 25 | "max_new_tokens": 30, 26 | } 27 | 28 | thread = Thread(target=model.generate, kwargs=generation_kwargs) 29 | thread.start() 30 | 31 | # Generate the response using streaming 32 | for new_text in streamer: 33 | print(new_text) 34 | 35 | thread.join() 36 | 37 | # This file was originally licensed under Apache 2.0. It has been modified. 38 | # Modifications Copyright (c) 2025 AMD 39 | -------------------------------------------------------------------------------- /examples/demos/README.md: -------------------------------------------------------------------------------- 1 | # Lemonade Demos 2 | 3 | The demo scripts in this folder show how `lemonade` can be used in integrate OnnxRuntime-GenAI (OGA) into higher-level applications such as chat and search. 4 | 5 | The format of each demo is to have two files which show the before-and-after of integrating OGA: 6 | - `*_start.py`: a version of the application that uses regular software to try and handle a natural language task. 7 | - `*_hybrid.py`: an upgrade of the application that integrates an LLM with Ryzen AI Hybrid to improve the natural language task. 8 | 9 | The demos available are: 10 | - `chat/`: prompts the user for a message and then streams the LLM's response to the terminal. 11 | - `search/`: demonstrates how a user can search an employee handbook in natural language using an LLM. 12 | 13 | To run a demo: 14 | 1. Set up a conda environment with the appropriate framework and backend support. 15 | 1. 
`cd` into the demo directory (e.g., `cd search/`) 16 | 1. Run the `*_start.py` script to see what the application is like without the LLM (e.g., `python search_start.py`) 17 | 1. Run the `*_hybrid.py` script to see what the application is like with the LLM (e.g., `python search_hybrid.py`) 18 | 19 | <!--This file was originally licensed under Apache 2.0. It has been modified. 20 | Modifications Copyright (c) 2025 AMD--> -------------------------------------------------------------------------------- /examples/demos/chat/chat_hybrid.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from threading import Thread, Event 3 | from transformers import StoppingCriteriaList 4 | from lemonade.tools.server.serve import StopOnEvent 5 | from lemonade.api import from_pretrained 6 | from lemonade.tools.oga.utils import OrtGenaiStreamer 7 | 8 | 9 | def main(): 10 | 11 | model, tokenizer = from_pretrained( 12 | "amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid", 13 | recipe="oga-hybrid", 14 | ) 15 | 16 | while True: 17 | # Enable sending a signal into the generator thread to stop 18 | # the generation early 19 | stop_event = Event() 20 | stopping_criteria = StoppingCriteriaList([StopOnEvent(stop_event)]) 21 | 22 | # Prompt the user for an input message 23 | print() 24 | user_message = input("User: ") 25 | print() 26 | 27 | # Print a friendly message when we quit 28 | if user_message == "quit": 29 | print("System: Ok, bye!\n") 30 | break 31 | 32 | # Generate the response in a thread and stream the result back 33 | # to the main thread 34 | input_ids = tokenizer(user_message, return_tensors="pt").input_ids 35 | 36 | streamer = OrtGenaiStreamer(tokenizer) 37 | generation_kwargs = { 38 | "input_ids": input_ids, 39 | "streamer": streamer, 40 | "max_new_tokens": 200, 41 | "stopping_criteria": stopping_criteria, 42 | } 43 | 44 | thread = Thread(target=model.generate, kwargs=generation_kwargs) 45 | thread.start() 46 | 47 | # Print each word to the screen as it arrives from the streamer 48 | # Allow the user to terminate the response with 49 | # a keyboard interrupt (ctrl+c) 50 | try: 51 | print("LLM: ", end="") 52 | for new_text in streamer: 53 | print(new_text, end="") 54 | sys.stdout.flush() 55 | 56 | except KeyboardInterrupt: 57 | stop_event.set() 58 | 59 | print() 60 | 61 | thread.join() 62 | 63 | 64 | if __name__ == "__main__": 65 | main() 66 | 67 | # This file was originally licensed under Apache 2.0. It has been modified. 68 | # Modifications Copyright (c) 2025 AMD 69 | -------------------------------------------------------------------------------- /examples/demos/chat/chat_start.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from threading import Thread, Event 3 | from queue import Queue 4 | from time import sleep 5 | from transformers import StoppingCriteriaList 6 | from lemonade.tools.server.serve import StopOnEvent 7 | 8 | 9 | class TextStreamer: 10 | """ 11 | Imitates a queue for streaming text from one thread to another. 12 | 13 | Not needed once we integrate with the lemonade API. 
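Illustrative usage sketch (hypothetical producer function; `Thread` is imported
at the top of this file): one thread produces text while the main thread
consumes it, mirroring how the demos drive the streamer.

    streamer = TextStreamer()

    def produce():
        for word in ["Hello", "from", "the", "worker"]:
            streamer.add_text(word + " ")
        streamer.done()

    Thread(target=produce).start()
    for chunk in streamer:
        print(chunk, end="")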
14 | """ 15 | 16 | def __init__(self): 17 | self.text_queue = Queue() 18 | self.stop_signal = None 19 | 20 | def add_text(self, text: str): 21 | self.text_queue.put(text) 22 | 23 | def done(self): 24 | self.text_queue.put(self.stop_signal) 25 | 26 | def __iter__(self): 27 | return self 28 | 29 | def __next__(self): 30 | value = self.text_queue.get() 31 | if value == self.stop_signal: 32 | raise StopIteration() 33 | else: 34 | return value 35 | 36 | 37 | def generate_placeholder( 38 | streamer: TextStreamer, stopping_criteria: StoppingCriteriaList 39 | ): 40 | """ 41 | Imitates an LLM's generate function by streaming text to a queue. 42 | 43 | Not needed once we integrate with the lemonade API. 44 | """ 45 | 46 | # pylint: disable=line-too-long 47 | response = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum." 48 | 49 | for word in response.split(" "): 50 | streamer.add_text(f"{word} ") 51 | sleep(0.05) 52 | 53 | if stopping_criteria[0].stop_event.is_set(): 54 | break 55 | 56 | streamer.done() 57 | 58 | 59 | def main(): 60 | 61 | while True: 62 | # Enable sending a signal into the generator thread to stop 63 | # the generation early 64 | stop_event = Event() 65 | stopping_criteria = StoppingCriteriaList([StopOnEvent(stop_event)]) 66 | 67 | # Prompt the user for an input message 68 | print() 69 | user_message = input("User: ") 70 | print() 71 | 72 | # Print a friendly message when we quit 73 | if user_message == "quit": 74 | print("System: Ok, bye!\n") 75 | break 76 | 77 | # Generate the response in a thread and stream the result back 78 | # to the main thread 79 | streamer = TextStreamer() 80 | generation_kwargs = { 81 | "streamer": streamer, 82 | "stopping_criteria": stopping_criteria, 83 | } 84 | 85 | thread = Thread(target=generate_placeholder, kwargs=generation_kwargs) 86 | thread.start() 87 | 88 | # Print each word to the screen as it arrives 89 | # Allow the user to terminate the response with 90 | # a keyboard interrupt (ctrl+c) 91 | try: 92 | print("LLM: ", end="") 93 | for new_text in streamer: 94 | print(new_text, end="") 95 | sys.stdout.flush() 96 | 97 | except KeyboardInterrupt: 98 | stop_event.set() 99 | 100 | print() 101 | 102 | thread.join() 103 | 104 | 105 | if __name__ == "__main__": 106 | main() 107 | 108 | # This file was originally licensed under Apache 2.0. It has been modified. 109 | # Modifications Copyright (c) 2025 AMD 110 | -------------------------------------------------------------------------------- /examples/demos/search/search_hybrid.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from threading import Thread, Event 3 | from transformers import StoppingCriteriaList 4 | from lemonade.api import from_pretrained 5 | from lemonade.tools.oga.utils import OrtGenaiStreamer 6 | from lemonade.tools.server.serve import StopOnEvent 7 | 8 | employee_handbook = """ 9 | 1. You will work very hard every day.\n 10 | 2. You are allowed to listen to music, but must wear headphones.\n 11 | 3. Remember, the break room fridge is not a science experiment. 
12 | Please label and remove your leftovers regularly!\n 13 | """ 14 | 15 | 16 | def system_prompt(user_prompt): 17 | return f""" 18 | <|begin_of_text|><|start_header_id|>system<|end_header_id|> 19 | 20 | You are a helpful assistant who can only answer questions about this employee handbook: {employee_handbook}. 21 | Don't make up information that isn't in the handbook already. 22 | <|eot_id|><|start_header_id|>user<|end_header_id|> 23 | 24 | {user_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|> 25 | """ 26 | 27 | 28 | def main(): 29 | 30 | # Load LLaMA-3.2 1B model on Ryzen AI Hybrid 31 | model, tokenizer = from_pretrained( 32 | "amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid", 33 | recipe="oga-hybrid", 34 | ) 35 | 36 | while True: 37 | # Enable sending a signal into the generator thread to stop 38 | # the generation early 39 | stop_event = Event() 40 | stopping_criteria = StoppingCriteriaList([StopOnEvent(stop_event)]) 41 | 42 | # Prompt the user for an input message 43 | print() 44 | user_message = input("User: ") 45 | print() 46 | 47 | # Print a friendly message when we quit 48 | if user_message == "quit": 49 | print("System: Ok, bye!\n") 50 | break 51 | 52 | # Generate the response in a thread and stream the result back 53 | # to the main thread 54 | input_ids = tokenizer( 55 | system_prompt(user_message), return_tensors="pt" 56 | ).input_ids 57 | 58 | streamer = OrtGenaiStreamer(tokenizer) 59 | generation_kwargs = { 60 | "input_ids": input_ids, 61 | "streamer": streamer, 62 | "max_new_tokens": 200, 63 | "stopping_criteria": stopping_criteria, 64 | } 65 | 66 | thread = Thread(target=model.generate, kwargs=generation_kwargs) 67 | thread.start() 68 | 69 | # Print each word to the screen as it arrives from the streamer 70 | # Allow the user to terminate the response with 71 | # a keyboard interrupt (ctrl+c) 72 | try: 73 | print("LLM: ", end="") 74 | for new_text in streamer: 75 | print(new_text, end="") 76 | sys.stdout.flush() 77 | 78 | except KeyboardInterrupt: 79 | stop_event.set() 80 | 81 | print() 82 | 83 | thread.join() 84 | 85 | 86 | if __name__ == "__main__": 87 | main() 88 | 89 | # This file was originally licensed under Apache 2.0. It has been modified. 90 | # Modifications Copyright (c) 2025 AMD 91 | -------------------------------------------------------------------------------- /examples/demos/search/search_start.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from threading import Thread, Event 3 | from queue import Queue 4 | from time import sleep 5 | from transformers import StoppingCriteriaList 6 | from lemonade.tools.server.serve import StopOnEvent 7 | 8 | 9 | employee_handbook = """ 10 | 1. You will work very hard every day.\n 11 | 2. You are allowed to listen to music, but must wear headphones.\n 12 | 3. Remember, the break room fridge is not a science experiment. 13 | Please label and remove your leftovers regularly!\n 14 | """ 15 | 16 | 17 | class TextStreamer: 18 | """ 19 | Imitates a queue for streaming text from one thread to another. 20 | 21 | Not needed once we integrate with the lemonade API. 
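Minimal behavior sketch (illustrative values): text queued before done() is
drained by iteration, which stops once the stop signal (None) is reached.

    streamer = TextStreamer()
    streamer.add_text("relevant line ")
    streamer.done()
    assert "".join(streamer) == "relevant line "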
22 | """ 23 | 24 | def __init__(self): 25 | self.text_queue = Queue() 26 | self.stop_signal = None 27 | 28 | def add_text(self, text: str): 29 | self.text_queue.put(text) 30 | 31 | def done(self): 32 | self.text_queue.put(self.stop_signal) 33 | 34 | def __iter__(self): 35 | return self 36 | 37 | def __next__(self): 38 | value = self.text_queue.get() 39 | if value == self.stop_signal: 40 | raise StopIteration() 41 | else: 42 | return value 43 | 44 | 45 | def plain_text_search( 46 | question: str, streamer: TextStreamer, stopping_criteria: StoppingCriteriaList 47 | ): 48 | """ 49 | Searches the employee handbook, looking for an exact match and 50 | returns an answer if available. 51 | 52 | Imitates an LLM's generate function by streaming text to a queue. 53 | 54 | Not needed once we integrate with the lemonade API. 55 | """ 56 | 57 | # Turn the question into key words 58 | # Remove punctuation and convert to lower-case 59 | sanitized_question = question.replace("?", "").replace(".", "").lower() 60 | # Get a list of important words (longer than length 3) 61 | keywords = [word for word in sanitized_question.split(" ") if len(word) > 3] 62 | 63 | # Search for the key words in the employee handbook 64 | result = None 65 | for keyword in keywords: 66 | for line in employee_handbook.lower().split("\n"): 67 | if keyword in line: 68 | result = line 69 | 70 | if result: 71 | response = ( 72 | f"This line of the employee handbook might be relevant to you: {result}" 73 | ) 74 | else: 75 | response = ( 76 | "I am sorry, I didn't find anything that is useful to you. Please " 77 | "try again with another question or read the entire employee handbook " 78 | "cover-to-cover to make sure that you didn't miss any rules." 79 | ) 80 | 81 | for word in response.split(" "): 82 | streamer.add_text(f"{word} ") 83 | sleep(0.05) 84 | 85 | if stopping_criteria[0].stop_event.is_set(): 86 | break 87 | 88 | streamer.done() 89 | 90 | 91 | def main(): 92 | 93 | while True: 94 | # Enable sending a signal into the generator thread to stop 95 | # the generation early 96 | stop_event = Event() 97 | stopping_criteria = StoppingCriteriaList([StopOnEvent(stop_event)]) 98 | 99 | # Prompt the user for an input message 100 | print() 101 | user_message = input("User: ") 102 | print() 103 | 104 | # Print a friendly message when we quit 105 | if user_message == "quit": 106 | print("System: Ok, bye!\n") 107 | break 108 | 109 | # Generate the response in a thread and stream the result back 110 | # to the main thread 111 | streamer = TextStreamer() 112 | generation_kwargs = { 113 | "question": user_message, 114 | "streamer": streamer, 115 | "stopping_criteria": stopping_criteria, 116 | } 117 | 118 | thread = Thread(target=plain_text_search, kwargs=generation_kwargs) 119 | thread.start() 120 | 121 | # Print each word to the screen as it arrives from the streamer 122 | # Allow the user to terminate the response with 123 | # a keyboard interrupt (ctrl+c) 124 | try: 125 | print("LLM: ", end="") 126 | for new_text in streamer: 127 | print(new_text, end="") 128 | sys.stdout.flush() 129 | 130 | except KeyboardInterrupt: 131 | stop_event.set() 132 | 133 | print() 134 | 135 | thread.join() 136 | 137 | 138 | if __name__ == "__main__": 139 | main() 140 | 141 | # This file was originally licensed under Apache 2.0. It has been modified. 
142 | # Modifications Copyright (c) 2025 AMD 143 | -------------------------------------------------------------------------------- /img/basic_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemonade-sdk/lemonade/b86292bf84121d003d40804d68032904901ee1bb/img/basic_demo.gif -------------------------------------------------------------------------------- /img/llm_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemonade-sdk/lemonade/b86292bf84121d003d40804d68032904901ee1bb/img/llm_demo.png -------------------------------------------------------------------------------- /installer/add_to_path.py: -------------------------------------------------------------------------------- 1 | import winreg 2 | import argparse 3 | 4 | 5 | def add_to_path(directory_to_add): 6 | """ 7 | Adds a directory to the beginning of the user Path, or 8 | moves it to the beginning if it already exists in the Path. 9 | 10 | Args: 11 | directory_to_add (str): Directory path to add to the Path 12 | 13 | Returns: 14 | bool: True if successful, False otherwise 15 | """ 16 | try: 17 | # Open the Environment key in HKEY_CURRENT_USER 18 | key = winreg.OpenKey( 19 | winreg.HKEY_CURRENT_USER, 20 | "Environment", 21 | 0, 22 | winreg.KEY_READ | winreg.KEY_WRITE, 23 | ) 24 | 25 | # Get the current Path value 26 | try: 27 | # Try to get the current Path value 28 | # If the Path env var exists but it is empty, it will return an empty string 29 | current_path, _ = winreg.QueryValueEx(key, "Path") 30 | except FileNotFoundError: 31 | # If the Path env var doesn't exist yet, it will raise a FileNotFoundError 32 | # In this case ONLY, it is safe to set the current path to an empty string 33 | current_path = "" 34 | except Exception as e: 35 | # If anything else goes wrong, print the error and exit 36 | # We don't want to risk corrupting the registry 37 | print(f"Error getting current Path: {e}") 38 | exit(1) 39 | 40 | # Split the Path into individual directories 41 | path_items = [ 42 | item for item in current_path.split(";") if item 43 | ] # Remove empty entries 44 | 45 | # Check if directory is already in Path 46 | if directory_to_add in path_items: 47 | # Remove it from its current position 48 | path_items.remove(directory_to_add) 49 | print(f"- {directory_to_add} was already in Path, moving to the beginning") 50 | else: 51 | print(f"- Adding {directory_to_add} to the beginning of Path") 52 | 53 | # Add the directory to the beginning of Path 54 | path_items.insert(0, directory_to_add) 55 | 56 | # Join the items back together 57 | new_path = ";".join(path_items) 58 | 59 | # Write the new Path back to registry 60 | winreg.SetValueEx(key, "Path", 0, winreg.REG_EXPAND_SZ, new_path) 61 | winreg.CloseKey(key) 62 | 63 | print("- Successfully updated user Path") 64 | return True 65 | 66 | except Exception as e: 67 | print(f"Error updating Path: {e}") 68 | return False 69 | 70 | 71 | if __name__ == "__main__": 72 | parser = argparse.ArgumentParser( 73 | description="Add a directory to the beginning of the user Path" 74 | ) 75 | parser.add_argument("directory", help="Directory path to add to Path") 76 | args = parser.parse_args() 77 | 78 | add_to_path(args.directory) 79 | 80 | # This file was originally licensed under Apache 2.0. It has been modified. 
81 | # Modifications Copyright (c) 2025 AMD 82 | -------------------------------------------------------------------------------- /installer/installer_banner.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemonade-sdk/lemonade/b86292bf84121d003d40804d68032904901ee1bb/installer/installer_banner.bmp -------------------------------------------------------------------------------- /installer/lemonade-server.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | setlocal enabledelayedexpansion 3 | 4 | REM Get current time in milliseconds since midnight 5 | for /f "tokens=1-4 delims=:.," %%a in ("!time!") do ( 6 | set /a "CURRENT_TIME=((((%%a*60)+1%%b %% 100)*60+1%%c %% 100)*1000)+1%%d %% 1000" 7 | ) 8 | 9 | REM Use temp directory for the lock file 10 | set "LOCK_FILE=%TEMP%\lemonade_server.lock" 11 | 12 | REM Show a notification and run the server in tray mode. 13 | REM Note: command line arguments are parsed in order from left to right 14 | set TRAY=0 15 | set ARGS= 16 | for %%a in (%*) do ( 17 | set ARGS=!ARGS! %%a 18 | if /I "%%a"=="serve" ( 19 | set TRAY=1 20 | ) 21 | if /I "%%a"=="--no-tray" ( 22 | set TRAY=0 23 | ) 24 | ) 25 | 26 | REM Only check lock file if running in tray mode 27 | if %TRAY%==1 ( 28 | REM Check if another instance is starting (within last 10000 milliseconds) 29 | if exist "!LOCK_FILE!" ( 30 | set /p STORED_TIME=<"!LOCK_FILE!" 31 | set /a TIME_DIFF=!CURRENT_TIME!-!STORED_TIME! 32 | 33 | REM Only block if difference is positive and less than 10000 milliseconds (10 seconds) 34 | if !TIME_DIFF! gtr 0 if !TIME_DIFF! lss 10000 ( 35 | echo Another instance of Lemonade Server is currently starting. 36 | exit /b 3 37 | ) 38 | ) 39 | 40 | REM Set the starting timestamp in lock file 41 | echo !CURRENT_TIME!>"!LOCK_FILE!" 42 | ) 43 | 44 | REM Change to parent directory where conda env and bin folders are located 45 | pushd "%~dp0.." 46 | 47 | REM Run the Python CLI script, passing filtered arguments 48 | call "%CD%\python\Scripts\lemonade-server-dev" !ARGS! 49 | set SERVER_ERRORLEVEL=%ERRORLEVEL% 50 | popd 51 | 52 | REM Clean up lock file before any exit 53 | del "!LOCK_FILE!" 2>nul 54 | 55 | REM Provide a notification if the server is already running 56 | if %SERVER_ERRORLEVEL% equ 2 ( 57 | if %TRAY%==1 ( 58 | REM Blocking call to show notification 59 | wscript "%~dp0lemonade_notification.vbs" "Lemonade Server" "Lemonade Server is already running!\nCheck your system tray for details or run `lemonade-server stop` to stop the existing server and try again." 60 | exit /b 2 61 | ) 62 | ) 63 | 64 | REM Exit without additional notifications if error code is 0 (no errors), 15 (lemonade-server stop), or less than 0 (forced exit) 65 | if %SERVER_ERRORLEVEL% equ 15 ( 66 | exit /b 15 67 | ) else if %SERVER_ERRORLEVEL% leq 0 ( 68 | exit /b 0 69 | ) 70 | 71 | REM Error handling if any other error code 72 | if %TRAY%==0 ( 73 | echo. 74 | echo An error occurred while running Lemonade Server. 75 | echo Please check the error message above. 76 | echo. 77 | pause 78 | ) 79 | if %TRAY%==1 ( 80 | REM Blocking call to show notification 81 | wscript "%~dp0lemonade_notification.vbs" "Lemonade Server" "An error occurred while running Lemonade Server.\nPlease run the server manually. Error code: %SERVER_ERRORLEVEL%" 82 | ) 83 | 84 | REM This file was originally licensed under Apache 2.0. It has been modified. 
85 | REM Modifications Copyright (c) 2025 AMD -------------------------------------------------------------------------------- /installer/lemonade_notification.vbs: -------------------------------------------------------------------------------- 1 | ' Lemonade Server Loading Notification 2 | ' Shows a notification that can be manually controlled 3 | ' Usage: wscript lemonade_notification.vbs [title] [message] 4 | 5 | Dim objShell, objFSO, signalFile, windowTitle, messageText 6 | Set objShell = CreateObject("WScript.Shell") 7 | Set objFSO = CreateObject("Scripting.FileSystemObject") 8 | 9 | ' Get command line arguments or use defaults 10 | If WScript.Arguments.Count >= 1 Then 11 | windowTitle = WScript.Arguments(0) 12 | Else 13 | windowTitle = "Lemonade Server" 14 | End If 15 | 16 | If WScript.Arguments.Count >= 2 Then 17 | messageText = WScript.Arguments(1) 18 | ' Replace pipe characters with line breaks for multi-line notifications 19 | messageText = Replace(messageText, "\n", vbCrLf) 20 | Else 21 | messageText = "Starting Lemonade Server..." 22 | End If 23 | 24 | ' Signal file path for manual control 25 | signalFile = objFSO.GetSpecialFolder(2) & "\lemonade_notification_signal.txt" 26 | 27 | ' Create signal file to indicate the notification is active 28 | objFSO.CreateTextFile(signalFile, True).Close 29 | 30 | ' Show notification (no timeout - stays open until manually closed) 31 | result = objShell.Popup(messageText, 0, windowTitle, 0) 32 | 33 | ' Clean up signal file 34 | If objFSO.FileExists(signalFile) Then 35 | objFSO.DeleteFile signalFile 36 | End If 37 | 38 | Set objShell = Nothing 39 | Set objFSO = Nothing -------------------------------------------------------------------------------- /installer/lemonade_server.vbs: -------------------------------------------------------------------------------- 1 | ' This script detects wheter we are in headless mode and launches lemonade-server 2 | ' either in headless mode or with a system tray icon. 
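' Illustrative summary of the behavior implemented below (derived from the
' logic in this script, not an authoritative spec): desktop sessions launch the
' server hidden with a system tray icon ("serve"), while headless and CI
' sessions run "serve --no-tray" in a visible terminal window.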
3 | 4 | Set wshShell = CreateObject("WScript.Shell") 5 | Set fso = CreateObject("Scripting.FileSystemObject") 6 | 7 | scriptDir = fso.GetParentFolderName(WScript.ScriptFullName) 8 | 9 | ' Declare headless variable 10 | Dim HEADLESS 11 | 12 | ' Simple GUI detection: check if system tray is available 13 | On Error Resume Next 14 | Set shell = CreateObject("Shell.Application") 15 | If Err.Number = 0 Then 16 | ' Try to access the system tray area 17 | Set trayWnd = shell.Windows() 18 | If Err.Number = 0 Then 19 | ' GUI mode: show tray 20 | Set trayWnd = Nothing 21 | Set shell = Nothing 22 | On Error GoTo 0 23 | HEADLESS = False 24 | Else 25 | ' Headless mode: no GUI 26 | Set shell = Nothing 27 | On Error GoTo 0 28 | HEADLESS = True 29 | End If 30 | Else 31 | ' Headless mode: no GUI 32 | On Error GoTo 0 33 | HEADLESS = True 34 | End If 35 | 36 | If HEADLESS = True Then 37 | ' Headless mode: open a terminal and run the server without the tray 38 | wshShell.Run """" & scriptDir & "\lemonade-server.bat"" serve --no-tray", 1, True 39 | Else 40 | ' Check if we're in CI mode via environment variable 41 | ciMode = wshShell.ExpandEnvironmentStrings("%LEMONADE_CI_MODE%") 42 | If ciMode <> "%LEMONADE_CI_MODE%" And (LCase(ciMode) = "true" Or LCase(ciMode) = "1") Then 43 | ' CI mode: run without tray even in GUI environment 44 | wshShell.Run """" & scriptDir & "\lemonade-server.bat"" serve --no-tray", 1, True 45 | Else 46 | ' GUI mode: Run the server on a hidden window with the tray 47 | wshShell.Run """" & scriptDir & "\lemonade-server.bat"" serve", 0, False 48 | End If 49 | End If 50 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | # This is the configuration file for MkDocs, a static site generator that uses Markdown files to create documentation sites. 2 | # The configuration file is written in YAML format and contains various settings for the site. 3 | 4 | # To install the MkDocs dependencies, run the following command in the terminal: 5 | # pip install -r docs/assets/mkdocs_requirements.txt 6 | 7 | # To build the site, run the following command in the terminal: 8 | # mkdocs build 9 | 10 | # To serve the site locally, run the following command in the terminal: 11 | # mkdocs serve 12 | 13 | # To deploy the site to GitHub Pages, run the following command in the terminal: 14 | # mkdocs gh-deploy <-- this should be updated when we have CI setup with what the real instructions are. 15 | 16 | site_name: Lemonade Server Documentation 17 | site_url: https://lemonade-server.ai/ 18 | site_description: Lemonade Server is a lightweight, open-source local LLM server that allows you to run and manage multiple AI applications on your local machine. It provides a simple CLI for managing applications and supports various LLMs, making it easy to deploy and use AI models locally. 
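# Optional local preview helpers (standard MkDocs flags):
#   mkdocs serve -a localhost:8001   # serve the docs on an alternate port
#   mkdocs build --strict            # treat warnings (e.g., broken links) as errors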
19 | 20 | edit_uri: server/README.md 21 | 22 | repo_name: lemonade-sdk/lemonade 23 | repo_url: https://github.com/lemonade-sdk/lemonade 24 | 25 | plugins: 26 | - monorepo 27 | - search 28 | 29 | theme: 30 | name: material 31 | logo: assets/logo.png # If we want to use a custom logo instead of an icon 32 | icon: 33 | repo: fontawesome/brands/github # This is the icon for the repo link in the header 34 | favicon: assets/favicon.ico 35 | features: 36 | - navigation.footer 37 | - navigation.tracking 38 | - navigation.expand 39 | - navigation.top 40 | - content.code.annotate 41 | - content.code.copy 42 | palette: 43 | 44 | # Light mode settings 45 | - scheme: lightmode 46 | primary: amber 47 | toggle: 48 | icon: material/weather-night 49 | name: Switch to dark mode 50 | 51 | # Dark mode settings 52 | - scheme: slate 53 | primary: amber 54 | accent: amber 55 | toggle: 56 | icon: material/weather-sunny 57 | name: Switch to light mode 58 | nav_style: dark 59 | 60 | # Add the list of markdown files to be included in the documentation 61 | # The order of the files in the list will determine the order they appear in the documentation 62 | nav: 63 | - Downloading and Getting Started: server/README.md 64 | - Supported Applications: server/apps/README.md 65 | - Application Guides: 66 | - Open WebUI: server/apps/open-webui.md 67 | - AI Dev Gallery: server/apps/ai-dev-gallery.md 68 | - AI Toolkit: server/apps/ai-toolkit.md 69 | - AnythingLLM: server/apps/anythingLLM.md 70 | - CodeGPT: server/apps/codeGPT.md 71 | - Continue: server/apps/continue.md 72 | - LM-Eval-Harness: server/apps/lm-eval.md 73 | - Mindcraft: server/apps/mindcraft.md 74 | - Wut: server/apps/wut.md 75 | - Lemonade Server CLI Guide: server/lemonade-server-cli.md 76 | - Understanding local LLM servers: server/concepts.md 77 | - Models List: server/server_models.md 78 | - Server Spec: server/server_spec.md 79 | - Integration Guide: server/server_integration.md 80 | - Contribution Guide: contribute.md 81 | 82 | not_in_nav: | 83 | /index.md 84 | /lemonade_api.md 85 | 86 | exclude_docs: | 87 | code.md 88 | versioning.md 89 | dev_cli/README.md 90 | dev_cli/humaneval_accuracy.md 91 | dev_cli/mmlu_accuracy.md 92 | dev_cli/perplexity.md 93 | dev_cli/quark.md 94 | dev_cli/ort_genai_igpu.md 95 | dev_cli/llamacpp.md 96 | dev_cli/lm-eval.md 97 | 98 | # The following adds icons on the bottom of the page 99 | extra: 100 | homepage: https://lemonade-server.ai 101 | social: 102 | - icon: simple/youtube 103 | link: https://www.youtube.com/@AMDDevCentral 104 | - icon: simple/github 105 | link: https://github.com/lemonade-sdk/lemonade 106 | 107 | copyright: Copyright © 2025 AMD. All rights reserved. 
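# To publish an additional page (hypothetical path shown), add its markdown
# file to the `nav` list above, for example:
#   - My New Guide: server/my-new-guide.md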
108 | 109 | # The custom CSS for colors and more 110 | extra_css: 111 | - assets/extra.css 112 | 113 | # The custom JavaScript for the carousel for the videos 114 | extra_javascript: 115 | - assets/carousel.js 116 | 117 | markdown_extensions: 118 | - admonition 119 | - pymdownx.superfences # Better code blocks 120 | - pymdownx.tabbed: # Tabbed code blocks 121 | alternate_style: true 122 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | with open("src/lemonade/version.py", encoding="utf-8") as fp: 4 | version = fp.read().split('"')[1] 5 | 6 | setup( 7 | name="lemonade-sdk", 8 | version=version, 9 | description="Lemonade SDK: Your LLM Aide for Validation and Deployment", 10 | author_email="lemonade@amd.com", 11 | package_dir={"": "src"}, 12 | packages=[ 13 | "lemonade", 14 | "lemonade.profilers", 15 | "lemonade.common", 16 | "lemonade.tools", 17 | "lemonade.tools.huggingface", 18 | "lemonade.tools.oga", 19 | "lemonade.tools.llamacpp", 20 | "lemonade.tools.quark", 21 | "lemonade.tools.report", 22 | "lemonade.tools.server.utils", 23 | "lemonade.tools.server", 24 | "lemonade_install", 25 | "lemonade_server", 26 | ], 27 | install_requires=[ 28 | # Minimal dependencies required for end-users who are running 29 | # apps deployed on Lemonade SDK 30 | "invoke>=2.0.0", 31 | "onnx>=1.11.0,<1.18.0", 32 | "pyyaml>=5.4", 33 | "typeguard>=2.3.13", 34 | "packaging>=20.9", 35 | # Necessary until upstream packages account for the breaking 36 | # change to numpy 37 | "numpy<2.0.0", 38 | "fasteners", 39 | "GitPython>=3.1.40", 40 | "psutil>=6.1.1", 41 | "wmi", 42 | "py-cpuinfo", 43 | "pytz", 44 | "zstandard", 45 | "fastapi", 46 | "uvicorn[standard]", 47 | "openai>=1.81.0", 48 | "transformers<=4.51.3", 49 | "jinja2", 50 | "tabulate", 51 | "sentencepiece", 52 | "huggingface-hub==0.33.0", 53 | ], 54 | extras_require={ 55 | # The non-dev extras are meant to deploy specific backends into end-user 56 | # applications, without including developer-focused tools 57 | "oga-hybrid": [ 58 | # Note: `lemonade-install --ryzenai hybrid` is necessary 59 | # to complete installation 60 | "onnx==1.16.1", 61 | "numpy==1.26.4", 62 | "protobuf>=6.30.1", 63 | ], 64 | "oga-cpu": [ 65 | "onnxruntime-genai==0.8.2", 66 | "onnxruntime >=1.22.0", 67 | ], 68 | # Developer-focused tools for benchmarking, accuracy testing, and 69 | # model preparation (ONNX export, quantization, device-specifc optimization, etc.) 
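# For example, a local checkout can install the developer tools with
# `pip install -e .[dev]`, while end users can install a published backend
# extra such as `pip install lemonade-sdk[oga-cpu]`.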
70 | "dev": [ 71 | # Minimal dependencies for developers to use all features of 72 | # Lemonade SDK, including building and optimizing models 73 | "torch>=2.6.0", 74 | "accelerate", 75 | "datasets", 76 | "pandas>=1.5.3", 77 | "matplotlib", 78 | # Install human-eval from a forked repo with Windows support until the 79 | # PR (https://github.com/openai/human-eval/pull/53) is merged 80 | "human-eval-windows==1.0.4", 81 | "lm-eval[api]", 82 | ], 83 | # Keep backwards compatibility for old extras names 84 | "oga-hybrid-minimal": ["lemonade-sdk[oga-hybrid]"], 85 | "oga-cpu-minimal": ["lemonade-sdk[oga-cpu]"], 86 | "llm": ["lemonade-sdk[dev]"], 87 | "llm-oga-cpu": ["lemonade-sdk[dev,oga-cpu]"], 88 | # The following extras are deprecated and/or not commonly used 89 | "llm-oga-igpu": [ 90 | "onnxruntime-genai-directml==0.6.0", 91 | "onnxruntime-directml>=1.19.0,<1.22.0", 92 | "transformers<4.45.0", 93 | "lemonade-sdk[dev]", 94 | ], 95 | "llm-oga-cuda": [ 96 | "onnxruntime-genai-cuda==0.8.2", 97 | "onnxruntime-gpu >=1.22.0", 98 | "transformers<=4.51.3", 99 | "lemonade-sdk[dev]", 100 | ], 101 | "llm-oga-npu": [ 102 | "onnx==1.16.0", 103 | # NPU requires specific onnxruntime version for Ryzen AI compatibility 104 | # This may conflict with other OGA extras that require >=1.22.0 105 | "onnxruntime==1.18.0", 106 | "numpy==1.26.4", 107 | "protobuf>=6.30.1", 108 | "lemonade-sdk[dev]", 109 | ], 110 | "llm-oga-hybrid": ["lemonade-sdk[dev,oga-hybrid]"], 111 | "llm-oga-unified": ["lemonade-sdk[llm-oga-hybrid]"], 112 | }, 113 | classifiers=[], 114 | entry_points={ 115 | "console_scripts": [ 116 | "lemonade=lemonade:lemonadecli", 117 | "lemonade-install=lemonade_install:installcli", 118 | "lemonade-server-dev=lemonade_server.cli:main", 119 | ] 120 | }, 121 | python_requires=">=3.10, <3.13", 122 | long_description=open("README.md", "r", encoding="utf-8").read(), 123 | long_description_content_type="text/markdown", 124 | include_package_data=True, 125 | package_data={ 126 | "lemonade_server": ["server_models.json"], 127 | "lemonade": ["tools/server/static/*"], 128 | }, 129 | ) 130 | 131 | # This file was originally licensed under Apache 2.0. It has been modified. 132 | # Modifications Copyright (c) 2025 AMD 133 | -------------------------------------------------------------------------------- /src/lemonade/__init__.py: -------------------------------------------------------------------------------- 1 | from lemonade.version import __version__ 2 | 3 | from .state import load_state, State 4 | 5 | from .cli import main as lemonadecli 6 | -------------------------------------------------------------------------------- /src/lemonade/api.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=no-member 2 | 3 | from typing import Tuple, Dict 4 | from lemonade.state import State 5 | import lemonade.common.printing as printing 6 | import lemonade.cache as cache 7 | from lemonade.tools.adapter import ModelAdapter, TokenizerAdapter 8 | from lemonade.common.system_info import ( 9 | get_system_info_dict, 10 | get_device_info_dict, 11 | get_system_info as get_system_info_obj, 12 | ) 13 | 14 | 15 | class NotSupported(Exception): 16 | """ 17 | Indicates that a checkpoint/recipe pair are not supported 18 | together at this time. 
19 | """ 20 | 21 | def __init__(self, msg): 22 | super().__init__(msg) 23 | printing.log_error(msg) 24 | 25 | 26 | def _raise_not_supported(recipe, checkpoint): 27 | raise NotSupported( 28 | f"Recipe {recipe} does not have support for checkpoint {checkpoint}" 29 | ) 30 | 31 | 32 | def _make_state(recipe, checkpoint) -> Dict: 33 | return State(cache_dir=cache.DEFAULT_CACHE_DIR, build_name=f"{checkpoint}_{recipe}") 34 | 35 | 36 | def from_pretrained( 37 | checkpoint: str, 38 | recipe: str = "hf-cpu", 39 | ) -> Tuple[ModelAdapter, TokenizerAdapter]: 40 | """ 41 | Load an LLM and the corresponding tokenizer using a lemonade recipe. 42 | 43 | Args: 44 | - checkpoint: huggingface checkpoint that defines the LLM 45 | - recipe: defines the implementation and hardware used for the LLM 46 | 47 | Recipe choices: 48 | - hf-cpu: Huggingface Transformers implementation for CPU with max-perf settings 49 | - hf-dgpu: Huggingface Transformers implementation on dGPU (via device="cuda") 50 | - oga-cpu: CPU implementation based on onnxruntime-genai 51 | - oga-igpu: DirectML implementation for iGPU based on onnxruntime-genai-directml 52 | - oga-hybird: AMD Ryzen AI Hybrid implementation based on onnxruntime-genai 53 | 54 | Returns: 55 | - model: LLM instance with a generate() method that invokes the recipe 56 | - tokenizer: tokenizer instance compatible with the model, which supports 57 | the encode (call) and decode() methods. 58 | """ 59 | 60 | if recipe == "hf-cpu": 61 | # Huggingface Transformers recipe for CPU 62 | # Huggingface supports all checkpoints, so there is nothing to check for 63 | 64 | import torch 65 | from lemonade.tools.huggingface.load import HuggingfaceLoad 66 | 67 | state = _make_state(recipe, checkpoint) 68 | 69 | state = HuggingfaceLoad().run( 70 | state, 71 | input=checkpoint, 72 | dtype=torch.bfloat16, 73 | ) 74 | 75 | return state.model, state.tokenizer 76 | 77 | elif recipe == "hf-dgpu": 78 | # Huggingface Transformers recipe for discrete GPU (Nvidia, Instinct, Radeon) 79 | 80 | import torch 81 | from lemonade.tools.huggingface.load import HuggingfaceLoad 82 | 83 | state = _make_state(recipe, checkpoint) 84 | 85 | state = HuggingfaceLoad().run( 86 | state, 87 | input=checkpoint, 88 | dtype=torch.bfloat16, 89 | device="cuda", 90 | ) 91 | 92 | return state.model, state.tokenizer 93 | 94 | elif recipe.startswith("oga-"): 95 | import lemonade.tools.oga.load as oga 96 | 97 | # Make sure the user chose a supported runtime, e.g., oga-cpu 98 | user_backend = recipe.split("oga-")[1] 99 | supported_backends = ["cpu", "igpu", "npu", "hybrid"] 100 | supported_recipes = [f"oga-{backend}" for backend in supported_backends] 101 | if recipe not in supported_recipes: 102 | raise NotSupported( 103 | "Selected OGA recipe is not supported. " 104 | f"The supported OGA recipes are: {supported_recipes}" 105 | ) 106 | 107 | backend_to_dtype = { 108 | "cpu": "int4", 109 | "igpu": "int4", 110 | "hybrid": "int4", 111 | "npu": "int4", 112 | } 113 | 114 | state = _make_state(recipe, checkpoint) 115 | 116 | state = oga.OgaLoad().run( 117 | state, 118 | input=checkpoint, 119 | device=user_backend, 120 | dtype=backend_to_dtype[user_backend], 121 | ) 122 | 123 | return state.model, state.tokenizer 124 | 125 | else: 126 | _raise_not_supported(recipe, checkpoint) 127 | 128 | 129 | def get_system_info(verbose: bool = False) -> Dict: 130 | """ 131 | Get comprehensive system information including hardware details and device information. 
132 | 133 | Returns: 134 | dict: Complete system information including: 135 | - Basic system info (OS, processor, memory, BIOS, etc.). 136 | - Device information (CPU, AMD iGPU, AMD dGPU, NPU). 137 | - Inference engine availability per device. 138 | - Python package versions (verbose mode only). 139 | """ 140 | 141 | # Get basic system info 142 | info = get_system_info_dict() 143 | 144 | # Add device information 145 | info["Devices"] = get_device_info_dict() 146 | 147 | # Filter out verbose-only information if not in verbose mode 148 | if not verbose: 149 | essential_keys = ["OS Version", "Processor", "Physical Memory", "Devices"] 150 | info = {k: v for k, v in info.items() if k in essential_keys} 151 | else: 152 | # In verbose mode, add Python packages at the end 153 | system_info_obj = get_system_info_obj() 154 | info["Python Packages"] = system_info_obj.get_python_packages() 155 | 156 | return info 157 | 158 | 159 | def get_device_info() -> Dict: 160 | """ 161 | Get device information including CPU, AMD iGPU, AMD dGPU, and NPU details. 162 | 163 | Returns: 164 | dict: Device information including: 165 | - cpu: CPU details with inference engine availability. 166 | - amd_igpu: AMD integrated GPU information. 167 | - amd_dgpu: List of AMD discrete GPU information. 168 | - npu: NPU information. 169 | """ 170 | 171 | return get_device_info_dict() 172 | 173 | 174 | # This file was originally licensed under Apache 2.0. It has been modified. 175 | # Modifications Copyright (c) 2025 AMD 176 | -------------------------------------------------------------------------------- /src/lemonade/cache.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timezone 3 | 4 | # Allow an environment variable to override the default 5 | # location for the build cache 6 | if os.environ.get("LEMONADE_CACHE_DIR"): 7 | DEFAULT_CACHE_DIR = os.path.expanduser(os.environ.get("LEMONADE_CACHE_DIR")) 8 | else: 9 | DEFAULT_CACHE_DIR = os.path.join(os.path.expanduser("~"), ".cache", "lemonade") 10 | 11 | 12 | def checkpoint_to_model_name(checkpoint_name: str) -> str: 13 | """ 14 | Get the model's name by stripping the author's name from the checkpoint name 15 | """ 16 | 17 | return checkpoint_name.split("/")[1] 18 | 19 | 20 | def get_timestamp() -> str: 21 | """ 22 | Get a timestamp string in the format: 23 | <year>y_<month>m_<day>d_<hour>h_<minute>m_<second>s 24 | """ 25 | # Get the current time in GMT 26 | current_time = datetime.now(timezone.utc) 27 | 28 | # Format the timestamp string 29 | timestamp = current_time.strftime("%Yy_%mm_%dd_%Hh_%Mm_%Ss") 30 | return timestamp 31 | 32 | 33 | def build_name(input_name): 34 | """ 35 | Name the lemonade build by concatenating these two factors: 36 | 1. Sanitize the input name (typically a model checkpoint name) by 37 | replacing any `/` characters with `_`. 38 | 2. Timestamp to ensure that builds in the same cache will not 39 | collide in the same build directory. 
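For example (illustrative timestamp), "facebook/opt-125m" becomes something
like:

    facebook_opt-125m_2025y_01m_01d_12h_30m_00s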
40 | 41 | If the input_name is a local folder, then we don't know the 42 | model checkpoint name, so we use "local_model" 43 | """ 44 | 45 | if os.path.isdir(input_name): 46 | input_name_sanitized = "local_model" 47 | else: 48 | # Sanitize the input name 49 | input_name_sanitized = input_name.replace("/", "_") 50 | 51 | # Get the formatted timestamp string 52 | timestamp = get_timestamp() 53 | 54 | return f"{input_name_sanitized}_{timestamp}" 55 | 56 | 57 | class Keys: 58 | MODEL = "model" 59 | PER_ITERATION_LATENCY = "per_iteration_latency" 60 | MEAN_LATENCY = "mean_latency" 61 | STD_DEV_LATENCY = "std_dev_latency" 62 | TOKEN_GENERATION_TOKENS_PER_SECOND = "token_generation_tokens_per_second" 63 | STD_DEV_TOKENS_PER_SECOND = "std_dev_tokens_per_second" 64 | SECONDS_TO_FIRST_TOKEN = "seconds_to_first_token" 65 | PREFILL_TOKENS_PER_SECOND = "prefill_tokens_per_second" 66 | STD_DEV_SECONDS_TO_FIRST_TOKEN = "std_dev_seconds_to_first_token" 67 | CHECKPOINT = "checkpoint" 68 | DTYPE = "dtype" 69 | PROMPT = "prompt" 70 | PROMPT_TOKENS = "prompt_tokens" 71 | PROMPT_TEMPLATE = "prompt_template" 72 | RESPONSE = "response" 73 | RESPONSE_TOKENS = "response_tokens" 74 | RESPONSE_LENGTHS_HISTOGRAM = "response_lengths_histogram" 75 | CACHE_DIR = "cache_dir" 76 | DEVICE = "device" 77 | LOCAL_MODEL_FOLDER = "local_model_folder" 78 | MEMORY_USAGE_PLOT = "memory_usage_plot" 79 | MAX_MEMORY_USED_GB = "max_memory_used_GB" 80 | MAX_MEMORY_USED_GBYTE = "max_memory_used_gbyte" 81 | RYZEN_AI_VERSION_INFO = "ryzen_ai_version_info" 82 | 83 | 84 | # This file was originally licensed under Apache 2.0. It has been modified. 85 | # Modifications Copyright (c) 2025 AMD 86 | -------------------------------------------------------------------------------- /src/lemonade/cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # pylint: disable=C0413 4 | # Prevent HF warnings from showing on every import 5 | os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1" 6 | from lemonade.version import __version__ as version_number 7 | from lemonade.tools import FirstTool, NiceHelpFormatter 8 | from lemonade.profilers.memory_tracker import MemoryTracker 9 | import lemonade.common.filesystem as fs 10 | import lemonade.common.cli_helpers as cli 11 | from lemonade.sequence import Sequence 12 | from lemonade.tools.management_tools import Cache, Version, SystemInfo 13 | from lemonade.state import State 14 | 15 | from lemonade.tools.huggingface.load import HuggingfaceLoad 16 | from lemonade.tools.huggingface.bench import HuggingfaceBench 17 | from lemonade.tools.oga.load import OgaLoad 18 | from lemonade.tools.oga.bench import OgaBench 19 | from lemonade.tools.llamacpp.bench import LlamaCppBench 20 | from lemonade.tools.llamacpp.load import LoadLlamaCpp 21 | 22 | import lemonade.cache as cache 23 | from lemonade.tools.mmlu import AccuracyMMLU 24 | from lemonade.tools.humaneval import AccuracyHumaneval 25 | from lemonade.tools.perplexity import AccuracyPerplexity 26 | from lemonade.tools.accuracy import LMEvalHarness 27 | from lemonade.tools.prompt import LLMPrompt 28 | from lemonade.tools.quark.quark_load import QuarkLoad 29 | from lemonade.tools.quark.quark_quantize import QuarkQuantize 30 | from lemonade.tools.report.llm_report import LemonadeReport 31 | 32 | 33 | def main(): 34 | 35 | # List the available tools 36 | tools = [ 37 | HuggingfaceLoad, 38 | LoadLlamaCpp, 39 | LlamaCppBench, 40 | AccuracyMMLU, 41 | AccuracyHumaneval, 42 | AccuracyPerplexity, 43 | LMEvalHarness, 44 | 
LLMPrompt, 45 | HuggingfaceBench, 46 | OgaLoad, 47 | OgaBench, 48 | QuarkQuantize, 49 | QuarkLoad, 50 | LemonadeReport, 51 | # Inherited from lemonade 52 | Cache, 53 | Version, 54 | SystemInfo, 55 | ] 56 | 57 | # List the available profilers 58 | profilers = [MemoryTracker] 59 | 60 | # Define the argument parser 61 | parser = cli.CustomArgumentParser( 62 | description=f"""Tools for evaluating and deploying LLMs (v{version_number}). 63 | 64 | Read this to learn the command syntax: 65 | https://github.com/lemonade-sdk/lemonade/blob/main/docs/README.md""", 66 | formatter_class=NiceHelpFormatter, 67 | ) 68 | 69 | parser.add_argument( 70 | "-i", 71 | "--input", 72 | help="The input that will be evaluated by the starting tool " 73 | "(e.g., huggingface checkpoint)", 74 | ) 75 | 76 | parser.add_argument( 77 | "-d", 78 | "--cache-dir", 79 | help="Cache directory where tool results are " 80 | f"stored (default: {cache.DEFAULT_CACHE_DIR})", 81 | required=False, 82 | default=cache.DEFAULT_CACHE_DIR, 83 | ) 84 | 85 | for profiler in profilers: 86 | profiler.add_arguments_to_parser(parser) 87 | 88 | global_args, tool_instances, evaluation_tools = cli.parse_tools( 89 | parser, tools, cli_name="lemonade" 90 | ) 91 | 92 | profiler_instances = [ 93 | profiler(global_args[profiler.unique_name.replace("-", "_")]) 94 | for profiler in profilers 95 | if global_args.get(profiler.unique_name.replace("-", "_"), None) is not None 96 | ] 97 | 98 | if len(evaluation_tools) > 0: 99 | if not issubclass(evaluation_tools[0], FirstTool): 100 | parser.error( 101 | "The first tool in the sequence needs to be one " 102 | "of the 'tools that can start a sequence.' Use " 103 | "`lemonade -h` to see that list of tools." 104 | ) 105 | # Run the evaluation tools as a build 106 | sequence = Sequence(tools=tool_instances, profilers=profiler_instances) 107 | 108 | # Forward the selected input to the first tool in the sequence 109 | first_tool_args = next(iter(sequence.tools.values())) 110 | first_tool_args.append("--input") 111 | first_tool_args.append(global_args["input"]) 112 | 113 | state = State( 114 | cache_dir=os.path.abspath(global_args["cache_dir"]), 115 | build_name=cache.build_name(global_args["input"]), 116 | sequence_info=sequence.info, 117 | ) 118 | sequence.launch(state) 119 | else: 120 | # Run the management tools 121 | for management_tool, argv in tool_instances.items(): 122 | # Support "~" in the cache_dir argument 123 | parsed_cache_dir = os.path.expanduser(global_args[fs.Keys.CACHE_DIR]) 124 | management_tool.parse_and_run(parsed_cache_dir, argv) 125 | 126 | 127 | if __name__ == "__main__": 128 | main() 129 | 130 | # This file was originally licensed under Apache 2.0. It has been modified. 
131 | # Modifications Copyright (c) 2025 AMD 132 | -------------------------------------------------------------------------------- /src/lemonade/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemonade-sdk/lemonade/b86292bf84121d003d40804d68032904901ee1bb/src/lemonade/common/__init__.py -------------------------------------------------------------------------------- /src/lemonade/common/cli_helpers.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | from typing import List, Dict, Tuple, Any 4 | from lemonade.tools import Tool, FirstTool 5 | import lemonade.common.printing as printing 6 | from lemonade.tools.management_tools import ManagementTool 7 | 8 | 9 | class CustomArgumentParser(argparse.ArgumentParser): 10 | 11 | def error(self, message): 12 | self.print_usage() 13 | printing.log_error(message) 14 | self.exit(2) 15 | 16 | 17 | def _tool_list_help(tools: List[Tool], subclass, exclude=None) -> str: 18 | help = "" 19 | 20 | for tool_class in tools: 21 | if exclude and issubclass(tool_class, exclude): 22 | continue 23 | if issubclass(tool_class, subclass): 24 | help = ( 25 | help 26 | + f" * {tool_class.unique_name}: {tool_class.parser().short_description}\n" 27 | ) 28 | 29 | return help 30 | 31 | 32 | def parse_tools( 33 | parser: argparse.ArgumentParser, supported_tools: List[Tool], cli_name="lemonade" 34 | ) -> Tuple[Dict[str, Any], Dict[Tool, List[str]], List[str]]: 35 | """ 36 | Add the help for parsing tools and their args to an ArgumentParser. 37 | 38 | Then, perform the task of parsing a full CLI command including 39 | teasing apart the global arguments and separate tool invocations. 40 | """ 41 | 42 | tool_parsers = {tool.unique_name: tool.parser() for tool in supported_tools} 43 | tool_classes = {tool.unique_name: tool for tool in supported_tools} 44 | 45 | # Sort tools into categories and format for the help menu 46 | first_tool_choices = _tool_list_help(supported_tools, FirstTool) 47 | eval_tool_choices = _tool_list_help(supported_tools, Tool, exclude=FirstTool) 48 | mgmt_tool_choices = _tool_list_help(supported_tools, ManagementTool) 49 | 50 | tools_action = parser.add_argument( 51 | "tools", 52 | metavar="tool --tool-args [tool --tool-args...]", 53 | nargs="?", 54 | help=f"""\ 55 | Run `{cli_name} TOOL -h` to learn more about each tool. 56 | 57 | Tools that can start a sequence: 58 | {first_tool_choices} 59 | Tools that go into a sequence: 60 | {eval_tool_choices} 61 | Management tools: 62 | {mgmt_tool_choices}""", 63 | choices=tool_parsers.keys(), 64 | ) 65 | 66 | # run as if "-h" was passed if no parameters are passed 67 | if len(sys.argv) == 1: 68 | sys.argv.append("-h") 69 | 70 | # Break sys.argv into categories based on which tools were invoked 71 | # Arguments that are passed prior to invoking a tool are categorized as 72 | # global arguments that should be used to initialize the state. 73 | current_tool = "globals" 74 | tools_invoked = {current_tool: []} 75 | cmd = sys.argv[1:] 76 | while len(cmd): 77 | if cmd[0] in tool_parsers.keys(): 78 | # Make sure each tool was only called once 79 | if cmd[0] in tools_invoked.keys(): 80 | parser.error( 81 | "A single call to lemonade can only invoke each tool once, " 82 | f"however this call invokes tool {cmd[0]} multiple times." 
83 | ) 84 | current_tool = cmd.pop(0) 85 | tools_invoked[current_tool] = [] 86 | else: 87 | tools_invoked[current_tool].append(cmd.pop(0)) 88 | 89 | # Trick argparse into thinking tools was not a positional argument 90 | # this helps to avoid an error where an incorrect arg/value pair 91 | # can be misinterpreted as the tools positional argument 92 | tools_action.option_strings = ["--tools"] 93 | 94 | # Do one pass of parsing to figure out if -h was used 95 | global_args = vars(parser.parse_args(tools_invoked["globals"])) 96 | 97 | # Remove "tools" from global args because it was just there 98 | # as a placeholder 99 | global_args.pop("tools") 100 | 101 | # Remove globals from the list since its already been parsed 102 | tools_invoked.pop("globals") 103 | evaluation_tools = [] 104 | management_tools = [] 105 | for cmd, argv in tools_invoked.items(): 106 | tool_parsers[cmd].parse_args(argv) 107 | 108 | # Keep track of whether the tools are ManagementTool or not, 109 | # since ManagementTools are mutually exclusive with evaluation 110 | # tools 111 | if issubclass(tool_classes[cmd], ManagementTool): 112 | management_tools.append(cmd) 113 | else: 114 | evaluation_tools.append(cmd) 115 | 116 | if len(management_tools) > 0 and len(evaluation_tools) > 0: 117 | parser.error( 118 | "This call to lemonade invoked both management and " 119 | "evaluation tools, however each call to lemonade " 120 | "is only allowed to invoke one or the other. " 121 | f"Management tools: {management_tools};" 122 | f"Evaluation tools: {evaluation_tools}." 123 | ) 124 | 125 | if len(management_tools) == 0 and len(evaluation_tools) == 0: 126 | parser.error( 127 | "Calls to lemonade are required to call at least " 128 | "one tool or management tool." 129 | ) 130 | 131 | # Convert tool names into Tool instances 132 | tool_instances = {tool_classes[cmd](): argv for cmd, argv in tools_invoked.items()} 133 | evaluation_tools = [tool_classes[cmd] for cmd in evaluation_tools] 134 | 135 | return global_args, tool_instances, evaluation_tools 136 | 137 | 138 | # This file was originally licensed under Apache 2.0. It has been modified. 139 | # Modifications Copyright (c) 2025 AMD 140 | -------------------------------------------------------------------------------- /src/lemonade/common/exceptions.py: -------------------------------------------------------------------------------- 1 | import lemonade.common.printing as printing 2 | 3 | 4 | class Error(Exception): 5 | """ 6 | Indicates something has gone wrong while running the tools 7 | """ 8 | 9 | def __init__(self, msg): 10 | super().__init__(msg) 11 | printing.log_error(msg) 12 | 13 | 14 | class CacheError(Error): 15 | """ 16 | Indicates ambiguous behavior from when a build already exists in the cache, 17 | but the model, inputs, or args have changed thereby invalidating 18 | the cached copy of the model. 19 | """ 20 | 21 | 22 | class EnvError(Error): 23 | """ 24 | Indicates to the user that the required tools are not 25 | available on their PATH. 26 | """ 27 | 28 | 29 | class ArgError(Error): 30 | """ 31 | Indicates to the user that they provided invalid arguments 32 | """ 33 | 34 | 35 | class ToolError(Exception): 36 | """ 37 | Let the user know that something went wrong while 38 | running a tool. 39 | 40 | Note: not overloading __init__() so that the 41 | attempt to print to stdout isn't captured into 42 | the Tool's log file. 
43 | """ 44 | 45 | 46 | class StateError(Exception): 47 | """ 48 | Raised when something goes wrong with State 49 | """ 50 | 51 | 52 | class IntakeError(Exception): 53 | """ 54 | Let the user know that something went wrong during the 55 | initial intake process of analyzing a model. 56 | """ 57 | 58 | 59 | class IOError(Error): 60 | """ 61 | Indicates to the user that an input/output operation failed, 62 | such trying to open a file. 63 | """ 64 | 65 | 66 | class ModelArgError(Error): 67 | """ 68 | Indicates to the user that values provided to a Model instance method 69 | were not allowed. 70 | """ 71 | 72 | 73 | class ModelRuntimeError(Error): 74 | """ 75 | Indicates to the user that attempting to invoke a Model instance failed. 76 | """ 77 | 78 | 79 | class BenchmarkException(Exception): 80 | """ 81 | Indicates a failure during benchmarking 82 | """ 83 | 84 | 85 | class HardwareError(Error): 86 | """ 87 | Indicates that the hardware used is faulty or unavailable. 88 | """ 89 | 90 | 91 | class SkipBuild(Exception): 92 | """ 93 | Indicates that an exception is deliberately being raised to skip a build 94 | """ 95 | 96 | 97 | # This file was originally licensed under Apache 2.0. It has been modified. 98 | # Modifications Copyright (c) 2025 AMD 99 | -------------------------------------------------------------------------------- /src/lemonade/common/network.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Optional 3 | import socket 4 | from huggingface_hub import model_info 5 | 6 | 7 | def is_offline(): 8 | """ 9 | Check if the system is offline by attempting to connect to huggingface.co. 10 | 11 | Returns: 12 | bool: True if the system is offline (cannot connect to huggingface.co), 13 | False otherwise. 14 | """ 15 | if os.environ.get("LEMONADE_OFFLINE"): 16 | return True 17 | try: 18 | socket.gethostbyname("huggingface.co") 19 | return False 20 | except socket.gaierror: 21 | return True 22 | 23 | 24 | def get_base_model(checkpoint: str) -> Optional[str]: 25 | """ 26 | Get the base model information for a given checkpoint from the Hugging Face Hub. 27 | Will auto-detect if we're offline and skip the network call in that case. 
28 | 29 | Args: 30 | checkpoint: The model checkpoint to query 31 | 32 | Returns: 33 | The base model name if found, or None if not found or error occurs 34 | """ 35 | # Skip network call in offline mode 36 | if is_offline(): 37 | return None 38 | 39 | try: 40 | info = model_info(checkpoint) 41 | if info.cardData and "base_model" in info.cardData: 42 | if info.cardData["base_model"] is not None: 43 | # This is a derived model 44 | return info.cardData["base_model"] 45 | else: 46 | # This is itself a base model 47 | return [checkpoint] 48 | except Exception: # pylint: disable=broad-except 49 | pass 50 | return None 51 | -------------------------------------------------------------------------------- /src/lemonade/common/printing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import enum 4 | import sys 5 | import math 6 | 7 | 8 | class Colors: 9 | HEADER = "\033[95m" 10 | OKBLUE = "\033[94m" 11 | OKCYAN = "\033[96m" 12 | OKGREEN = "\033[92m" 13 | WARNING = "\033[93m" 14 | FAIL = "\033[91m" 15 | ENDC = "\033[0m" 16 | BOLD = "\033[1m" 17 | UNDERLINE = "\033[4m" 18 | 19 | 20 | def log(txt, c=Colors.ENDC, end="", is_error=False): 21 | logn(txt, c=c, end=end, is_error=is_error) 22 | 23 | 24 | def logn(txt, c=Colors.ENDC, end="\n", is_error=False): 25 | file = sys.stderr if is_error else sys.stdout 26 | print(c + txt + Colors.ENDC, end=end, flush=True, file=file) 27 | 28 | 29 | class LogType(enum.Enum): 30 | ERROR = "Error:" 31 | SUCCESS = "Woohoo!" 32 | WARNING = "Warning:" 33 | INFO = "Info:" 34 | 35 | 36 | def clean_print(type: LogType, msg): 37 | # Replace path to user’s home directory by a tilde symbol (~) 38 | home_directory = os.path.expanduser("~") 39 | home_directory_escaped = re.escape(home_directory) 40 | msg = re.sub(home_directory_escaped, "~", msg) 41 | 42 | # Split message into list, remove leading spaces and line breaks 43 | msg = msg.split("\n") 44 | msg = [line.lstrip() for line in msg] 45 | while msg[0] == "" and len(msg) > 1: 46 | msg.pop(0) 47 | 48 | # Print message 49 | indentation = len(type.value) + 1 50 | if type == LogType.ERROR: 51 | log(f"\n{type.value} ".rjust(indentation), c=Colors.FAIL, is_error=True) 52 | elif type == LogType.SUCCESS: 53 | log(f"\n{type.value} ".rjust(indentation), c=Colors.OKGREEN) 54 | elif type == LogType.WARNING: 55 | log(f"\n{type.value} ".rjust(indentation), c=Colors.WARNING) 56 | elif type == LogType.INFO: 57 | log(f"\n{type.value} ".rjust(indentation), c=Colors.OKCYAN) 58 | 59 | is_error = type == LogType.ERROR 60 | for line_idx, line in enumerate(msg): 61 | if line_idx != 0: 62 | log(" " * indentation) 63 | s_line = line.split("**") 64 | for idx, l in enumerate(s_line): 65 | c = Colors.ENDC if idx % 2 == 0 else Colors.BOLD 66 | if idx != len(s_line) - 1: 67 | log(l, c=c, is_error=is_error) 68 | else: 69 | logn(l, c=c, is_error=is_error) 70 | 71 | 72 | def log_error(msg): 73 | clean_print(LogType.ERROR, str(msg)) 74 | # ASCII art credit: 75 | # https://textart4u.blogspot.com/2014/05/the-fail-whale-ascii-art-code.html 76 | logn( 77 | """\n▄██████████████▄▐█▄▄▄▄█▌ 78 | ██████▌▄▌▄▐▐▌███▌▀▀██▀▀ 79 | ████▄█▌▄▌▄▐▐▌▀███▄▄█▌ 80 | ▄▄▄▄▄██████████████\n\n""", 81 | is_error=True, 82 | ) 83 | 84 | 85 | def log_success(msg): 86 | clean_print(LogType.SUCCESS, msg) 87 | 88 | 89 | def log_warning(msg): 90 | clean_print(LogType.WARNING, msg) 91 | 92 | 93 | def log_info(msg): 94 | clean_print(LogType.INFO, msg) 95 | 96 | 97 | def list_table(list, padding=25, num_cols=4): 98 | 
lines_per_column = int(math.ceil(len(list) / num_cols)) 99 | for i in range(lines_per_column): 100 | for col in range(num_cols): 101 | if i + col * lines_per_column < len(list): 102 | print( 103 | list[i + col * lines_per_column].ljust(padding), 104 | end="", 105 | ) 106 | print("\n\t", end="") 107 | 108 | 109 | # This file was originally licensed under Apache 2.0. It has been modified. 110 | # Modifications Copyright (c) 2025 AMD 111 | -------------------------------------------------------------------------------- /src/lemonade/common/test_helpers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | 5 | def create_test_dir( 6 | key: str, 7 | base_dir: str = os.path.dirname(os.path.abspath(__file__)), 8 | ): 9 | # Define paths to be used 10 | cache_dir = os.path.join(base_dir, "generated", f"{key}_cache_dir") 11 | corpus_dir = os.path.join(base_dir, "generated", "test_corpus") 12 | 13 | # Delete folders if they exist, then recreate the corpus folder 14 | if os.path.isdir(cache_dir): 15 | shutil.rmtree(cache_dir) 16 | if os.path.isdir(corpus_dir): 17 | shutil.rmtree(corpus_dir) 18 | os.makedirs(corpus_dir, exist_ok=True) 19 | 20 | return cache_dir, corpus_dir 21 | 22 | 23 | def strip_dot_py(test_script_file: str) -> str: 24 | return test_script_file.split(".")[0] 25 | 26 | 27 | # This file was originally licensed under Apache 2.0. It has been modified. 28 | # Modifications Copyright (c) 2025 AMD 29 | -------------------------------------------------------------------------------- /src/lemonade/profilers/__init__.py: -------------------------------------------------------------------------------- 1 | from .profiler import Profiler 2 | -------------------------------------------------------------------------------- /src/lemonade/profilers/profiler.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | 4 | class Profiler(abc.ABC): 5 | 6 | unique_name: str 7 | 8 | def __init__(self, parser_arg_value=None): 9 | self.parser_arg_value = parser_arg_value 10 | # Statistics that will be displayed to the CLI user 11 | self.status_stats = [] 12 | 13 | @staticmethod 14 | @abc.abstractmethod 15 | def add_arguments_to_parser(parser): 16 | """ 17 | Adds the argument parsing for this tool to the parser. 18 | Uses f"--{self.unique_name}" as the argument. 19 | """ 20 | 21 | @abc.abstractmethod 22 | def start(self, build_dir): 23 | """ 24 | This method is called prior to the tool sequence starting. 25 | This informs the profiler to start gathering data. 26 | The build directory can be used to store profiling data. 27 | """ 28 | 29 | def tool_starting(self, tool_name): 30 | """ 31 | This method is called to inform the profiler of the name of the tool that is about to start. 32 | """ 33 | 34 | def tool_stopping(self): 35 | """ 36 | This method is called to inform the profiler that the tool has finished. 37 | """ 38 | 39 | def stop(self): 40 | """ 41 | This method is called when the tool sequence has finished. 42 | This informs the profiler to stop gathering data. 43 | """ 44 | 45 | @abc.abstractmethod 46 | def generate_results(self, state, timestamp, start_times): 47 | """ 48 | This method is called so that the profiler can create its output files. 49 | The state is passed so that build info can be gathered and stats can be written. 50 | The timestamp can be used for filenames in the current working directory.
51 | The start times parameter is a dict with the keys being the tools names and 52 | the values being the time the tool started. There is an initial "warmup" key 53 | that has a start time before the first tool and a "cool down" key that contains the 54 | time when the last tool ended. 55 | """ 56 | 57 | 58 | # Copyright (c) 2025 AMD 59 | -------------------------------------------------------------------------------- /src/lemonade/state.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from typing import Dict, Optional, Any 4 | import yaml 5 | import lemonade.common.build as build 6 | import lemonade.common.filesystem as fs 7 | from lemonade.version import __version__ as lemonade_version 8 | 9 | 10 | def _is_nice_to_write(value): 11 | """ 12 | Checks whether a value is nice to write to YAML. 13 | Returns True if the value is a string, int, float, bool, list, dict, or tuple. 14 | Returns False otherwise. 15 | """ 16 | if isinstance(value, (str, int, float, bool)): 17 | return True 18 | elif isinstance(value, list) or isinstance(value, tuple): 19 | # Check if all elements in the list are nice to write 20 | return all(_is_nice_to_write(item) for item in value) 21 | elif isinstance(value, dict): 22 | # Check if all values in the dictionary are nice to write 23 | return all(_is_nice_to_write(item) for item in value.values()) 24 | return False 25 | 26 | 27 | def _sanitize_for_yaml(input_dict: Dict) -> Dict: 28 | """ 29 | Creates a new dictionary containing only nice-to-write values 30 | from the original dictionary. 31 | """ 32 | result = {} 33 | for key, value in input_dict.items(): 34 | if _is_nice_to_write(value): 35 | result[key] = value 36 | return result 37 | 38 | 39 | class State: 40 | """ 41 | The State class is meant to carry build state, starting with the user's 42 | initial arguments, through each build Tool in the Sequence, and finally 43 | to the disk, where it is used to assess cache hits. 44 | 45 | State is initialized with the key members that are shared by every build, 46 | and reasonable default values are assigned as appropriate. 47 | 48 | Tool developers can also add any members they wish. To get or set an 49 | attribute, reference it as an attribute: 50 | 1. get: `my_variable = state.attribute_name` 51 | 2. set: `state.attribute_name = my_variable` 52 | 53 | Build State can be saved and loaded from disk in the form of a state.yaml file 54 | via State.save() and load_state(), respectively. Note that while State can 55 | contain members of any type, only YAML-safe members (str, int, bool, float, 56 | list, dict, tuple) will be saved and loaded. 
57 | """ 58 | 59 | def __init__( 60 | self, 61 | cache_dir: str, 62 | build_name: Optional[str] = None, 63 | sequence_info: Dict[str, Dict] = None, 64 | **kwargs, 65 | ): 66 | 67 | # The default model name is the name of the python file that calls build_model() 68 | if build_name is None: 69 | build_name = os.path.basename(sys.argv[0]) 70 | 71 | # Support "~" in the cache_dir argument 72 | parsed_cache_dir = os.path.expanduser(cache_dir) 73 | 74 | # Save settings as State members 75 | self.cache_dir = parsed_cache_dir 76 | self.build_name = build_name 77 | self.sequence_info = sequence_info 78 | self.lemonade_version = lemonade_version 79 | self.build_status = build.FunctionStatus.NOT_STARTED 80 | self.downcast_applied = False 81 | self.uid = build.unique_id() 82 | self.results = None 83 | 84 | # Store any additional kwargs as members 85 | for key, value in kwargs.items(): 86 | self.__dict__[key] = value 87 | 88 | def __setattr__(self, name: str, value: Any) -> None: 89 | """ 90 | Tool developers can add a new member to State by simply 91 | assigning it as an attribute, i.e., `state.new_member = value`. 92 | """ 93 | return super().__setattr__(name, value) 94 | 95 | def save_stat(self, key: str, value): 96 | """ 97 | Save statistics to an yaml file in the build directory 98 | """ 99 | 100 | stats = fs.Stats(self.cache_dir, self.build_name) 101 | stats.save_stat(key, value) 102 | 103 | def save_sub_stat(self, parent_key: str, key: str, value): 104 | """ 105 | Save statistics to an yaml file in the build directory 106 | """ 107 | 108 | stats = fs.Stats(self.cache_dir, self.build_name) 109 | stats.save_sub_stat(parent_key, key, value) 110 | 111 | def save(self): 112 | """ 113 | Save all YAML-friendly members to disk as a state.yaml file. 114 | 115 | Note that `model` and `inputs` will typically not be saved since 116 | they are typically in non-YAML-friendly types such as `torch.nn.Module` 117 | and `torch.tensor`. 118 | """ 119 | 120 | state_to_save = _sanitize_for_yaml(vars(self)) 121 | 122 | # Create a build directory in the cache 123 | fs.make_build_dir(self.cache_dir, self.build_name) 124 | 125 | with open( 126 | build.state_file(self.cache_dir, self.build_name), 127 | "w", 128 | encoding="utf8", 129 | ) as outfile: 130 | yaml.dump(state_to_save, outfile) 131 | 132 | 133 | def load_state( 134 | cache_dir=None, 135 | build_name=None, 136 | state_path=None, 137 | ) -> State: 138 | """ 139 | Read a state.yaml file corresponding to a specific build in a specific 140 | cache, and use its contents to initialize a State instance. 141 | """ 142 | 143 | if state_path is not None: 144 | file_path = state_path 145 | elif build_name is not None and cache_dir is not None: 146 | file_path = build.state_file(cache_dir, build_name) 147 | else: 148 | raise ValueError( 149 | "This function requires either build_name and cache_dir to be set, " 150 | "or state_path to be set, not both or neither" 151 | ) 152 | 153 | state_dict = build.load_yaml(file_path) 154 | 155 | return State(**state_dict) 156 | 157 | 158 | # This file was originally licensed under Apache 2.0. It has been modified. 
159 | # Modifications Copyright (c) 2025 AMD 160 | -------------------------------------------------------------------------------- /src/lemonade/tools/__init__.py: -------------------------------------------------------------------------------- 1 | from .tool import Tool, FirstTool, NiceHelpFormatter 2 | -------------------------------------------------------------------------------- /src/lemonade/tools/adapter.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | 4 | class ModelAdapter(abc.ABC): 5 | """ 6 | Base class for adapting an LLM to work with lemonade's standardized tools 7 | """ 8 | 9 | def __init__(self): 10 | """ 11 | Self-benchmarking ModelAdapters can store their results in the 12 | tokens_per_second and time_to_first_token members. 13 | """ 14 | self.tokens_per_second = None 15 | self.time_to_first_token = None 16 | self.type = "generic" 17 | 18 | @abc.abstractmethod 19 | def generate(self, input_ids, max_new_tokens=512): 20 | """ 21 | Generate is the primary method required by lemonade's accuracy tools 22 | 23 | We try to keep the signature here minimal to allow for maximum compatibility 24 | with recipe components, which themselves may not support a lot of arguments. 25 | """ 26 | 27 | 28 | class TokenizerAdapter(abc.ABC): 29 | """ 30 | Base class for adapting an LLM's tokenizer to work with lemonade's standard tools 31 | """ 32 | 33 | def __init__(self, tokenizer=None): 34 | self.auto_tokenizer = tokenizer 35 | 36 | @abc.abstractmethod 37 | def __call__(self, prompt: str): 38 | """ 39 | Args: 40 | prompt: text that should be encoded and passed to the LLM as input_ids 41 | 42 | Returns: input_ids 43 | """ 44 | 45 | @abc.abstractmethod 46 | def decode(self, response) -> str: 47 | """ 48 | Args: 49 | response: tokens from the LLM that should be decoded into text 50 | 51 | Returns: text response of the LLM 52 | """ 53 | 54 | def apply_chat_template(self, *args, **kwargs): 55 | """ 56 | Convert messages into a single tokenizable string 57 | """ 58 | return self.auto_tokenizer.apply_chat_template(*args, **kwargs) 59 | 60 | @property 61 | def chat_template(self): 62 | return self.auto_tokenizer.chat_template 63 | 64 | @property 65 | def eos_token(self): 66 | return self.auto_tokenizer.eos_token 67 | 68 | 69 | class PassthroughTokenizerResult: 70 | """ 71 | Data structure for holding a tokenizer result where the input_ids 72 | are packaged in a non-standard way, but we still want to adhere to 73 | standard interfaces (e.g., result.input_ids). 74 | 75 | For example: CLI-based tools that have their own internal tokenizer that 76 | isn't exposed to the user. In this case we can pass the prompt through as 77 | a string. 78 | """ 79 | 80 | def __init__(self, prompt): 81 | self.input_ids = prompt 82 | 83 | 84 | class PassthroughTokenizer(TokenizerAdapter): 85 | """ 86 | Tokenizer adapter that forwards the prompt to input_ids as text, 87 | and then forwards a text LLM response through decode() as text. 88 | 89 | Useful for CLI-based tools that have their own internal tokenizer that 90 | isn't exposed to the user. 91 | """ 92 | 93 | # pylint: disable=unused-argument 94 | def __call__(self, prompt: str, **kwargs): 95 | return PassthroughTokenizerResult(prompt) 96 | 97 | # pylint: disable=unused-argument 98 | def decode(self, response: str, **kwargs): 99 | return response 100 | 101 | 102 | # This file was originally licensed under Apache 2.0. It has been modified. 
103 | # Modifications Copyright (c) 2025 AMD 104 | -------------------------------------------------------------------------------- /src/lemonade/tools/oga/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemonade-sdk/lemonade/b86292bf84121d003d40804d68032904901ee1bb/src/lemonade/tools/oga/__init__.py -------------------------------------------------------------------------------- /src/lemonade/tools/oga/bench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import statistics 3 | from statistics import StatisticsError 4 | from lemonade.state import State 5 | from lemonade.cache import Keys 6 | from lemonade.tools.adapter import ModelAdapter, TokenizerAdapter 7 | from lemonade.tools.bench import Bench 8 | 9 | 10 | class OgaBench(Bench): 11 | """ 12 | Benchmark any model that adheres to the ModelAdapter interface. 13 | 14 | Required input state: 15 | - MODEL: model instance to benchmark. 16 | - TOKENIZER: tokenizer instance used to generate inputs for the model. 17 | 18 | Output state produced: None 19 | """ 20 | 21 | unique_name = "oga-bench" 22 | 23 | def __init__(self): 24 | super().__init__() 25 | 26 | # Additional statistics generated by this bench tool 27 | self.status_stats.insert( 28 | self.status_stats.index(Keys.TOKEN_GENERATION_TOKENS_PER_SECOND) + 1, 29 | Keys.STD_DEV_TOKENS_PER_SECOND, 30 | ) 31 | self.std_dev_token_generation_tokens_per_second_list = [] 32 | 33 | @staticmethod 34 | def parser(add_help: bool = True) -> argparse.ArgumentParser: 35 | parser = __class__.helpful_parser( 36 | short_description="Benchmark an LLM in onnxruntime-genai (OGA)", 37 | add_help=add_help, 38 | ) 39 | 40 | parser = Bench.parser(parser) 41 | 42 | return parser 43 | 44 | def get_prompt_str(self, state, token_length): 45 | """ 46 | Returns a string with the prescribed token length. 
47 | """ 48 | tokenizer: TokenizerAdapter = state.tokenizer 49 | test_prompt = "word " * (token_length - 1) 50 | input_ids = tokenizer(test_prompt, return_tensors="pt").input_ids 51 | test_token_length = len(input_ids) 52 | delta = test_token_length - token_length 53 | if delta == 0: 54 | return test_prompt 55 | return "word " * max(token_length - 1 - delta, 0) 56 | 57 | def run_prompt( 58 | self, 59 | state: State, 60 | report_progress_fn, 61 | prompt: str, 62 | iterations: int, 63 | warmup_iterations: int, 64 | output_tokens: int, 65 | ) -> State: 66 | 67 | model: ModelAdapter = state.model 68 | tokenizer: TokenizerAdapter = state.tokenizer 69 | 70 | input_ids = tokenizer(prompt, return_tensors="pt").input_ids 71 | self.input_ids_len_list.append(len(input_ids)) 72 | per_iteration_time_to_first_token = [] 73 | per_iteration_tokens_per_second = [] 74 | 75 | # Don't capture time for warmup 76 | for count in range(warmup_iterations): 77 | outputs = model.generate(input_ids, max_new_tokens=output_tokens) 78 | self.tokens_out_len_list.append(len(outputs[0]) - len(input_ids)) 79 | report_progress_fn((count + 1) / (warmup_iterations + iterations)) 80 | 81 | for count in range(iterations): 82 | outputs = model.generate( 83 | input_ids, 84 | max_new_tokens=output_tokens, 85 | min_new_tokens=output_tokens, 86 | ) 87 | report_progress_fn( 88 | (warmup_iterations + count + 1) / (warmup_iterations + iterations) 89 | ) 90 | 91 | token_len = len(outputs[0]) - len(input_ids) 92 | self.tokens_out_len_list.append(token_len) 93 | 94 | # Only count an iteration if it produced enough tokens 95 | if token_len >= output_tokens: 96 | per_iteration_time_to_first_token.append(model.time_to_first_token) 97 | per_iteration_tokens_per_second.append(model.tokens_per_second) 98 | 99 | if not per_iteration_time_to_first_token or not per_iteration_tokens_per_second: 100 | raise Bench.not_enough_tokens(output_tokens) 101 | 102 | mean_time_to_first_token = statistics.mean(per_iteration_time_to_first_token) 103 | self.mean_time_to_first_token_list.append(mean_time_to_first_token) 104 | self.prefill_tokens_per_second_list.append( 105 | len(input_ids) / mean_time_to_first_token 106 | ) 107 | self.token_generation_tokens_per_second_list.append( 108 | statistics.mean(per_iteration_tokens_per_second) 109 | ) 110 | try: 111 | self.std_dev_time_to_first_token_list.append( 112 | statistics.stdev(per_iteration_time_to_first_token) 113 | ) 114 | except StatisticsError: 115 | # Less than 2 measurements 116 | self.std_dev_time_to_first_token_list.append(None) 117 | try: 118 | self.std_dev_token_generation_tokens_per_second_list.append( 119 | statistics.stdev(per_iteration_tokens_per_second) 120 | ) 121 | except StatisticsError: 122 | # Less than 2 measurements 123 | self.std_dev_token_generation_tokens_per_second_list.append(None) 124 | 125 | def save_stats(self, state): 126 | super().save_stats(state) 127 | 128 | # Save additional statistics 129 | if not all( 130 | element is None 131 | for element in self.std_dev_token_generation_tokens_per_second_list 132 | ): 133 | state.save_stat( 134 | Keys.STD_DEV_TOKENS_PER_SECOND, 135 | self.get_item_or_list( 136 | self.std_dev_token_generation_tokens_per_second_list 137 | ), 138 | ) 139 | 140 | 141 | # This file was originally licensed under Apache 2.0. It has been modified. 
142 | # Modifications Copyright (c) 2025 AMD 143 | -------------------------------------------------------------------------------- /src/lemonade/tools/perplexity.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from lemonade.state import State 4 | from lemonade.tools import Tool 5 | import lemonade.common.printing as printing 6 | import lemonade.common.build as build 7 | 8 | 9 | class AccuracyPerplexity(Tool): 10 | """ 11 | Measure perplexity of an LLM using the Wikitext-2 dataset. 12 | 13 | Required input state: 14 | - state.model: instance that provides a __call__() method that returns 15 | output.logits and supports model.config.max_position_embeddings 16 | - state.tokenizer: instance of Hugging Face PretrainedTokenizer 17 | 18 | Output state produced: None 19 | 20 | See docs/dev_cli/perplexity.md for more details. 21 | """ 22 | 23 | unique_name = "accuracy-perplexity" 24 | 25 | def __init__(self): 26 | super().__init__(monitor_message="Measuring perplexity") 27 | 28 | @staticmethod 29 | def parser(add_help: bool = True) -> argparse.ArgumentParser: 30 | parser = __class__.helpful_parser( 31 | short_description="Measure perplexity score", 32 | add_help=add_help, 33 | ) 34 | return parser 35 | 36 | def run( 37 | self, 38 | state: State, 39 | ) -> State: 40 | 41 | import pandas as pd 42 | import torch 43 | from datasets import load_dataset 44 | 45 | try: 46 | printing.log_info("Downloading dataset ...") 47 | dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") 48 | except Exception as e: # pylint: disable=broad-except 49 | printing.log_error(f"Error during dataset load: {e}") 50 | raise e 51 | 52 | tokenizer = state.tokenizer 53 | model = state.model 54 | # Tokenize the entire test dataset text, joining entries with double new lines 55 | encodings = tokenizer("\n\n".join(dataset["text"]), return_tensors="pt") 56 | 57 | # Retrieve the maximum input length that the model can handle 58 | try: 59 | max_length = model.config.max_position_embeddings 60 | except AttributeError: 61 | # Some LLMs do not have the config.max_position_embeddings attribute 62 | # However, most LLMs support at least 2048 context length, so this 63 | # try-except will allow a few more LLMs to work 64 | max_length = 2048 65 | # Set stride to half of the maximum input length for overlapping window processing 66 | # Refer to docs/dev_cli/perplexity.md for more information on sliding window 67 | stride = max_length // 2 68 | # Determine the total sequence length of the tokenized input 69 | seq_len = encodings.input_ids.size(1) 70 | 71 | negative_log_likelihoods = [] 72 | summary_data = [] 73 | prev_end_location = 0 74 | 75 | model_results_dir = os.path.join( 76 | build.output_dir(state.cache_dir, state.build_name), "perplexity" 77 | ) 78 | 79 | for begin_location in range(0, seq_len, stride): 80 | end_location = min(begin_location + max_length, seq_len) 81 | target_len = end_location - prev_end_location 82 | input_ids = encodings.input_ids[:, begin_location:end_location] 83 | target_ids = input_ids.clone() 84 | target_ids[:, :-target_len] = -100 85 | 86 | # Forward pass the model to get logits 87 | with torch.no_grad(): 88 | try: 89 | outputs = model(input_ids, labels=target_ids) 90 | logits = outputs.logits 91 | except Exception as e: # pylint: disable=broad-except 92 | printing.log_error( 93 | f"Error during model forward pass execution: {e}" 94 | ) 95 | 96 | # Compute loss manually for visualization 97 | shift_logits = logits[..., 
:-1, :].contiguous() 98 | shift_labels = target_ids[..., 1:].contiguous() 99 | effective_token_count = (target_ids != -100).sum().item() 100 | negative_log_likelihoods.append( 101 | (outputs.loss.item(), effective_token_count) 102 | ) 103 | 104 | # Decode predicted and actual next words for the last token position 105 | predictions = torch.argmax(shift_logits, dim=-1) 106 | predicted_tokens = predictions[:, -1] 107 | actual_tokens = shift_labels[:, -1] 108 | 109 | predicted_words = tokenizer.batch_decode( 110 | predicted_tokens, skip_special_tokens=True 111 | ) 112 | actual_words = tokenizer.batch_decode( 113 | actual_tokens, skip_special_tokens=True 114 | ) 115 | context = tokenizer.decode(input_ids[0, :]) 116 | 117 | summary_data.append( 118 | { 119 | "Context": context[-stride:], 120 | "Predicted next word": predicted_words, 121 | "Actual next word": actual_words, 122 | "Loss for this window": outputs.loss.item(), 123 | } 124 | ) 125 | prev_end_location = end_location 126 | 127 | # Total loss calculation considering the number of tokens for each segment 128 | total_loss = sum(loss * count for loss, count in negative_log_likelihoods) 129 | total_tokens = sum(count for _, count in negative_log_likelihoods) 130 | 131 | # Calculate average negative_log_likelihood and perplexity 132 | average_negative_log_likelihood = total_loss / total_tokens 133 | perplexity = torch.exp(torch.tensor(average_negative_log_likelihood)) 134 | 135 | # Save accuracy results to stats file 136 | state.save_stat("perplexity_score", float(perplexity.item())) 137 | 138 | # Save accuracy results to CSV file 139 | summary_df = pd.DataFrame(summary_data) 140 | summary_df.to_csv( 141 | os.path.join(model_results_dir, "summary_results.csv"), index=False 142 | ) 143 | return state 144 | 145 | 146 | # This file was originally licensed under Apache 2.0. It has been modified. 
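# Worked example of the reduction above, using made-up per-window numbers
# (pairs of (negative log-likelihood, effective token count)):
#
#   negative_log_likelihoods = [(2.0, 512), (2.5, 256)]
#   total_loss   = 2.0 * 512 + 2.5 * 256   # = 1664.0
#   total_tokens = 512 + 256               # = 768
#   perplexity   = exp(1664.0 / 768)       # = exp(~2.167) ≈ 8.73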
147 | # Modifications Copyright (c) 2025 AMD 148 | -------------------------------------------------------------------------------- /src/lemonade/tools/quark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemonade-sdk/lemonade/b86292bf84121d003d40804d68032904901ee1bb/src/lemonade/tools/quark/__init__.py -------------------------------------------------------------------------------- /src/lemonade/tools/report/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemonade-sdk/lemonade/b86292bf84121d003d40804d68032904901ee1bb/src/lemonade/tools/report/__init__.py -------------------------------------------------------------------------------- /src/lemonade/tools/server/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemonade-sdk/lemonade/b86292bf84121d003d40804d68032904901ee1bb/src/lemonade/tools/server/__init__.py -------------------------------------------------------------------------------- /src/lemonade/tools/server/static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemonade-sdk/lemonade/b86292bf84121d003d40804d68032904901ee1bb/src/lemonade/tools/server/static/favicon.ico -------------------------------------------------------------------------------- /src/lemonade/tools/server/tool_calls.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import List, Dict, Pattern, Optional 3 | import logging 4 | import json 5 | 6 | 7 | def extract_code_block(text: str) -> str: 8 | """ 9 | Extracts the content inside triple backtick code blocks from a text. 10 | 11 | Args: 12 | text (str): The text to extract the code block from. 13 | 14 | Returns: 15 | str: The content of the first code block if any are found, otherwise the raw text. 16 | """ 17 | # Regex pattern to match triple backtick code blocks (with optional language hint) 18 | pattern = re.compile(r"```(?:\w+)?\n(.*?)```", re.DOTALL) 19 | 20 | # Find all matches 21 | code_blocks = pattern.findall(text) 22 | 23 | # Return first match or raw text 24 | return code_blocks[0] if code_blocks else text 25 | 26 | 27 | def standardize_tool_call(tool_call: dict) -> dict | None: 28 | """ 29 | Standardizes the format of tool calls according to the format expected by OpenAI. 30 | 31 | Args: 32 | tool_call (dict): The tool call to validate. 33 | 34 | Returns: 35 | dict | None: Standardized tool call if valid, None otherwise. 36 | """ 37 | # Ensure the tool call has a "name" 38 | standardized_tool_call = {} 39 | if "name" in tool_call: 40 | standardized_tool_call["name"] = tool_call["name"] 41 | else: 42 | logging.warning("Tool call does not have a 'name' field.") 43 | return None 44 | 45 | # Ensure the tool call has "arguments" 46 | if "arguments" in tool_call: 47 | standardized_tool_call["arguments"] = tool_call["arguments"] 48 | elif "parameters" in tool_call: 49 | standardized_tool_call["arguments"] = tool_call["parameters"] 50 | else: 51 | logging.warning("Tool call does not have a 'arguments' or 'parameters' field.") 52 | return None 53 | 54 | return standardized_tool_call 55 | 56 | 57 | def get_tool_call_pattern(added_tokens_decoder: List[str]) -> Optional[Pattern]: 58 | """ 59 | Extracts tool call pattern from the added tokens decoder. 
60 | """ 61 | special_tokens = [v.content for v in added_tokens_decoder.values()] 62 | 63 | # Pattern 1: <tool_call>...</tool_call> block 64 | # Sample model that uses this pattern: Qwen3-8B 65 | if "<tool_call>" in special_tokens and "</tool_call>" in special_tokens: 66 | return re.compile(r"<tool_call>(.*?)</tool_call>", re.DOTALL) 67 | 68 | # Pattern 2: [TOOL_CALLS] [ {...} ] block 69 | # Sample model that uses this pattern: Mistral-7B-Instruct-v0.3 70 | elif "[TOOL_CALLS]" in special_tokens: 71 | return re.compile(r"\[TOOL_CALLS\]\s*\[(.*?)\](?=\s*<|/?eos|$)", re.DOTALL) 72 | 73 | else: 74 | logging.warning( 75 | "Tool calling identifiers were not found for the current model." 76 | ) 77 | return None 78 | 79 | 80 | def extract_tool_calls( 81 | text: str, tool_call_pattern: Optional[Pattern] = None 82 | ) -> tuple[List[Dict], str]: 83 | """ 84 | Extracts tool calls from generated text based on tool calling identifiers. 85 | 86 | Args: 87 | text (str): The text output generated by the model. 88 | tool_call_pattern (Optional[Pattern]): The pattern to use to extract tool calls. 89 | 90 | Returns: 91 | tuple[List[Dict], str]: A tuple containing: 92 | - List[Dict]: A list of extracted tool call objects (raw JSON-like dicts) 93 | - str: The original text with tool calls removed 94 | """ 95 | 96 | matches = [] 97 | if tool_call_pattern is not None: 98 | matches = list(tool_call_pattern.finditer(text)) 99 | 100 | # Some models don't use any tool calling identifiers. 101 | # Instead, tool calls are identified by only generating JSON content. 102 | # Sample model that uses this pattern: Llama-3.1-8B-Instruct 103 | else: 104 | try: 105 | # Remove the json for a code block if needed 106 | parsed_text = extract_code_block(text) 107 | json_tool_calls = json.loads(parsed_text) 108 | 109 | if isinstance(json_tool_calls, dict): 110 | json_tool_calls = [json_tool_calls] 111 | 112 | extracted_tool_calls = [] 113 | for tool_call in json_tool_calls: 114 | # Return the tool call if all calls are valid 115 | standard_tool_call = standardize_tool_call(tool_call) 116 | if standard_tool_call is not None: 117 | extracted_tool_calls.append(standard_tool_call) 118 | else: 119 | return [], text 120 | 121 | return extracted_tool_calls, "" 122 | 123 | except json.JSONDecodeError: 124 | pass 125 | 126 | # Process matches in reverse to avoid position shifting 127 | extracted_tool_calls = [] 128 | cleaned_text = text 129 | for match in reversed(matches): 130 | content = match.group(1).strip() 131 | json_tool_call = None 132 | try: 133 | json_tool_call = json.loads(content) 134 | except json.JSONDecodeError: 135 | logging.warning("Could not parse tool call as JSON.") 136 | continue 137 | 138 | # Attempt to standardize the tool call 139 | standard_tool_call = standardize_tool_call(json_tool_call) 140 | if standard_tool_call is None: 141 | continue 142 | 143 | # If the content is a valid JSON object, add it to the list 144 | extracted_tool_calls.append(standard_tool_call) 145 | 146 | # Remove the matched tool call from the text 147 | cleaned_text = cleaned_text[: match.start()] + cleaned_text[match.end() :] 148 | 149 | return extracted_tool_calls, cleaned_text.strip() 150 | 151 | 152 | # This file was originally licensed under Apache 2.0. It has been modified. 
153 | # Modifications Copyright (c) 2025 AMD 154 | -------------------------------------------------------------------------------- /src/lemonade/tools/server/utils/port.py: -------------------------------------------------------------------------------- 1 | import socketserver 2 | import sys 3 | import logging 4 | import importlib 5 | import asyncio 6 | from contextlib import asynccontextmanager 7 | from fastapi import FastAPI 8 | 9 | _lazy_imports = { 10 | "TextIteratorStreamer": ("transformers", "TextIteratorStreamer"), 11 | "StoppingCriteriaList": ("transformers", "StoppingCriteriaList"), 12 | } 13 | 14 | 15 | def find_free_port(): 16 | """ 17 | Scans for an unoccupied TCP port 18 | 19 | Returns the port number as an int on success 20 | Returns None if no port can be found 21 | """ 22 | 23 | try: 24 | with socketserver.TCPServer(("localhost", 0), None) as s: 25 | return s.server_address[1] 26 | # pylint: disable=broad-exception-caught 27 | except Exception: 28 | return None 29 | 30 | 31 | @asynccontextmanager 32 | async def lifespan(app: FastAPI): 33 | # Only do minimal setup here so endpoints are available immediately 34 | try: 35 | if sys.stdout.encoding: 36 | "🍋".encode(sys.stdout.encoding) 37 | use_emojis = True 38 | except (UnicodeEncodeError, AttributeError): 39 | use_emojis = False 40 | 41 | if use_emojis: 42 | logging.info( 43 | "\n" 44 | "\n" 45 | "🍋 Lemonade Server Ready!\n" 46 | f"🍋 Open http://localhost:{app.port} in your browser for:\n" 47 | "🍋 💬 chat\n" 48 | "🍋 💻 model management\n" 49 | "🍋 📄 docs\n" 50 | ) 51 | else: 52 | logging.info( 53 | "\n" 54 | "\n" 55 | "[Lemonade] Lemonade Server Ready!\n" 56 | f"[Lemonade] Open http://localhost:{app.port} in your browser for:\n" 57 | "[Lemonade] chat\n" 58 | "[Lemonade] model management\n" 59 | "[Lemonade] docs\n" 60 | ) 61 | 62 | # Start lazy imports in the background, and set app.initialized = True 63 | # when the imports are available 64 | async def lazy_imports_bg(): 65 | for object_name, import_info in _lazy_imports.items(): 66 | module_name = import_info[0] 67 | class_name = import_info[1] 68 | module = importlib.import_module(module_name) 69 | obj = getattr(module, class_name) 70 | globals()[object_name] = obj 71 | 72 | app.initialized = True 73 | 74 | asyncio.create_task(lazy_imports_bg()) 75 | 76 | yield 77 | -------------------------------------------------------------------------------- /src/lemonade/tools/server/utils/thread.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import logging 3 | from lemonade.tools.server.serve import Server 4 | 5 | 6 | class ServerRunner(threading.Thread): 7 | """ 8 | Thread class for running the Lemonade Server with a loaded model. 
9 | """ 10 | 11 | def __init__( 12 | self, model, tokenizer, checkpoint, recipe, host="localhost", port=8000 13 | ): 14 | threading.Thread.__init__(self) 15 | self.model = model 16 | self.tokenizer = tokenizer 17 | self.checkpoint = checkpoint 18 | self.recipe = recipe 19 | self.host = host 20 | self.port = port 21 | self.server = None 22 | self.ready_event = threading.Event() 23 | self.shutdown_event = threading.Event() 24 | self.uvicorn_server = None 25 | 26 | def run(self): 27 | try: 28 | # Create the server instance 29 | self.server = Server() 30 | 31 | # Configure the server with model/tokenizer 32 | self.server.model = self.model 33 | self.server.tokenizer = self.tokenizer 34 | self.server.llm_loaded = type( 35 | "obj", 36 | (object,), 37 | { 38 | "checkpoint": self.checkpoint, 39 | "recipe": self.recipe, 40 | "max_prompt_length": None, 41 | "reasoning": False, 42 | "model_name": "custom", 43 | }, 44 | ) 45 | 46 | # Set up the server for threaded execution 47 | self.uvicorn_server = self.server.run_in_thread( 48 | port=self.port, host=self.host, log_level="warning" 49 | ) 50 | 51 | # Set the ready event 52 | self.ready_event.set() 53 | 54 | # Run the server until shutdown is requested 55 | logging.info(f"Starting server on http://{self.host}:{self.port}") 56 | self.uvicorn_server.run() 57 | 58 | except Exception as e: 59 | logging.error(f"Error starting server: {e}") 60 | self.ready_event.set() 61 | raise 62 | 63 | def shutdown(self): 64 | """Shutdown the server""" 65 | if hasattr(self, "uvicorn_server") and self.uvicorn_server: 66 | logging.info("Shutting down server...") 67 | self.uvicorn_server.should_exit = True 68 | self.shutdown_event.set() 69 | 70 | # Clean up resources properly to avoid memory leaks 71 | if hasattr(self, "server") and self.server: 72 | logging.info("Cleaning up model and tokenizer resources...") 73 | 74 | if hasattr(self.server, "model"): 75 | self.server.model = None 76 | 77 | if hasattr(self.server, "tokenizer"): 78 | self.server.tokenizer = None 79 | 80 | if hasattr(self.server, "llm_loaded"): 81 | self.server.llm_loaded = None 82 | 83 | # Clean up local references 84 | if hasattr(self, "model"): 85 | del self.model 86 | if hasattr(self, "tokenizer"): 87 | del self.tokenizer 88 | -------------------------------------------------------------------------------- /src/lemonade/tools/server/webapp.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import json 3 | from fastapi.responses import HTMLResponse 4 | from lemonade_server.model_manager import ModelManager 5 | 6 | 7 | def get_webapp_html(port=8000): 8 | """ 9 | Show Lemonade Web App for LLM chat and model management. 
10 | """ 11 | # Load server models from JSON 12 | server_models = ModelManager().supported_models 13 | 14 | # Use shared filter function from model_manager.py 15 | filtered_models = ModelManager().filter_models_by_backend(server_models) 16 | 17 | # Pass filtered server_models to JS 18 | server_models_js = ( 19 | f"<script>window.SERVER_MODELS = {json.dumps(filtered_models)};</script>" 20 | ) 21 | 22 | # Load HTML template 23 | template_path = Path(__file__).parent / "static" / "webapp.html" 24 | with open(template_path, "r", encoding="utf-8") as f: 25 | html_template = f.read() 26 | 27 | # Replace template variables 28 | html_content = html_template.replace("{{SERVER_PORT}}", str(port)) 29 | html_content = html_content.replace("{{SERVER_MODELS_JS}}", server_models_js) 30 | 31 | return HTMLResponse(content=html_content) 32 | -------------------------------------------------------------------------------- /src/lemonade/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "8.0.5" 2 | -------------------------------------------------------------------------------- /src/lemonade_install/__init__.py: -------------------------------------------------------------------------------- 1 | from .install import main as installcli 2 | -------------------------------------------------------------------------------- /src/lemonade_server/pydantic_models.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Union, List, Any 2 | 3 | from pydantic import BaseModel 4 | 5 | # Set to a high number to allow for interesting experiences in real apps 6 | # Tests should use the max_new_tokens argument to set a lower value 7 | DEFAULT_MAX_NEW_TOKENS = 1500 8 | 9 | 10 | class LoadConfig(BaseModel): 11 | """ 12 | Configuration for loading a language model. 13 | 14 | Specifies the model checkpoint, generation parameters, 15 | and hardware/framework configuration (recipe) for model loading. 16 | """ 17 | 18 | model_name: str 19 | checkpoint: Optional[str] = None 20 | recipe: Optional[str] = None 21 | # Indicates the maximum prompt length allowed for that specific 22 | # checkpoint + recipe combination 23 | max_prompt_length: Optional[int] = None 24 | # Indicates whether the model is a reasoning model, like DeepSeek 25 | reasoning: Optional[bool] = False 26 | # Indicates which Multimodal Projector (mmproj) file to use 27 | mmproj: Optional[str] = None 28 | 29 | 30 | class CompletionRequest(BaseModel): 31 | """ 32 | Request model for text completion API endpoint. 33 | 34 | Contains a prompt, a model identifier, and a streaming 35 | flag to control response delivery. 36 | """ 37 | 38 | prompt: str 39 | model: str 40 | echo: bool = False 41 | stream: bool = False 42 | logprobs: int | None = False 43 | stop: list[str] | str | None = None 44 | temperature: float | None = None 45 | max_tokens: int | None = None 46 | 47 | 48 | class ChatCompletionRequest(BaseModel): 49 | """ 50 | Request model for chat completion API endpoint. 51 | 52 | Contains a list of chat messages, a model identifier, 53 | and a streaming flag to control response delivery. 
54 | """ 55 | 56 | messages: list[dict] 57 | model: str 58 | stream: bool = False 59 | logprobs: int | None = False 60 | stop: list[str] | str | None = None 61 | temperature: float | None = None 62 | tools: list[dict] | None = None 63 | max_tokens: int | None = None 64 | max_completion_tokens: int | None = None 65 | response_format: dict | None = None 66 | 67 | 68 | class EmbeddingsRequest(BaseModel): 69 | """ 70 | Request model for embeddings API endpoint. 71 | 72 | Generates embeddings for the provided input text or tokens. 73 | """ 74 | 75 | input: Union[str, List] 76 | model: Optional[str] = None 77 | encoding_format: Optional[str] = "float" # "float" or "base64" 78 | 79 | 80 | class RerankingRequest(BaseModel): 81 | """ 82 | Request model for reranking API endpoint. 83 | 84 | Reranks a list of documents based on their relevance to a query. 85 | """ 86 | 87 | query: str 88 | documents: List[str] 89 | model: str 90 | 91 | 92 | class ResponsesRequest(BaseModel): 93 | """ 94 | Request model for responses API endpoint. 95 | """ 96 | 97 | input: list[dict] | str 98 | model: str 99 | max_output_tokens: int | None = None 100 | temperature: float | None = None 101 | stream: bool = False 102 | 103 | 104 | class PullConfig(LoadConfig): 105 | """ 106 | Pull and load have the same fields. 107 | """ 108 | 109 | 110 | class DeleteConfig(BaseModel): 111 | """ 112 | Configuration for deleting a supported LLM. 113 | """ 114 | 115 | model_name: str 116 | -------------------------------------------------------------------------------- /test/quark_api.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import shutil 3 | import os 4 | from lemonade.state import State 5 | import lemonade.common.test_helpers as common 6 | from lemonade.tools.prompt import LLMPrompt 7 | from lemonade.tools.huggingface.load import HuggingfaceLoad 8 | from lemonade.tools.quark.quark_quantize import QuarkQuantize 9 | from lemonade.tools.quark.quark_load import QuarkLoad 10 | 11 | 12 | class Testing(unittest.TestCase): 13 | @classmethod 14 | def setUpClass(cls): 15 | # Load default args from QuarkQuantize parser 16 | parser = QuarkQuantize.parser() 17 | cls.default_args = vars(parser.parse_args([])) 18 | 19 | def setUp(self) -> None: 20 | shutil.rmtree(cache_dir, ignore_errors=True) 21 | 22 | def test_001_quantize(self): 23 | """ 24 | This test first quantizes the model, exports it to 25 | target format and then reloads the quantized model 26 | """ 27 | checkpoint = "facebook/opt-125m" 28 | device = "cpu" 29 | prompt = "What if?" 30 | 31 | state = State(cache_dir=cache_dir, build_name="test") 32 | state = HuggingfaceLoad().run(state, input=checkpoint) 33 | 34 | quantize_args = { 35 | "model_export": "quark_safetensors", 36 | "quant_algo": "awq", 37 | "quant_scheme": "w_uint4_per_group_asym", 38 | "device": "cpu", 39 | "skip_quantization": True, 40 | } 41 | # Combine specific quant args with defaults 42 | quantize_args = {**self.default_args, **quantize_args} 43 | state = QuarkQuantize().run(state, **quantize_args) 44 | state = LLMPrompt().run(state, prompt=prompt, max_new_tokens=10) 45 | 46 | assert len(state.response) > 0, state.response 47 | 48 | 49 | if __name__ == "__main__": 50 | cache_dir, _ = common.create_test_dir( 51 | "lemonade_quark_api", base_dir=os.path.abspath(".") 52 | ) 53 | unittest.main() 54 | 55 | # This file was originally licensed under Apache 2.0. It has been modified. 
56 | # Modifications Copyright (c) 2025 AMD 57 | --------------------------------------------------------------------------------