├── .github
    ├── actions
    │   └── server-testing
    │   │   └── action.yml
    └── workflows
    │   ├── publish-to-test-pypi.yml
    │   ├── server_installer_windows_latest.yml
    │   ├── test_lemonade.yml
    │   ├── test_lemonade_oga_cpu.yml
    │   ├── test_quark.yml
    │   └── test_server.yml
├── .lfsconfig
├── .pylintrc
├── LICENSE
├── NOTICE.md
├── README.md
├── docs
    ├── CNAME
    ├── README.md
    ├── assets
    │   ├── carousel.js
    │   ├── extra.css
    │   ├── favicon.ico
    │   ├── install-selector.js
    │   ├── logo.png
    │   ├── mkdocs_requirements.txt
    │   └── website-styles.css
    ├── code.md
    ├── contribute.md
    ├── dev_cli
    │   ├── README.md
    │   ├── humaneval_accuracy.md
    │   ├── llamacpp.md
    │   ├── lm-eval.md
    │   ├── mmlu_accuracy.md
    │   ├── ort_genai_igpu.md
    │   ├── perplexity.md
    │   └── quark.md
    ├── favicon.ico
    ├── index.html
    ├── install_options.html
    ├── lemonade_api.md
    ├── publish_website_docs.py
    ├── server
    │   ├── README.md
    │   ├── apps
    │   │   ├── README.md
    │   │   ├── ai-dev-gallery.md
    │   │   ├── ai-toolkit.md
    │   │   ├── anythingLLM.md
    │   │   ├── codeGPT.md
    │   │   ├── continue.md
    │   │   ├── lm-eval.md
    │   │   ├── mindcraft.md
    │   │   ├── open-webui.md
    │   │   └── wut.md
    │   ├── concepts.md
    │   ├── lemonade-server-cli.md
    │   ├── server_integration.md
    │   ├── server_models.md
    │   └── server_spec.md
    └── versioning.md
├── examples
    ├── README.md
    ├── api_basic.py
    ├── api_oga_cpu.py
    ├── api_oga_cpu_streaming.py
    ├── api_oga_hybrid.py
    ├── api_oga_hybrid_streaming.py
    ├── api_oga_igpu.py
    ├── api_oga_igpu_streaming.py
    ├── api_oga_npu.py
    ├── api_oga_npu_streaming.py
    ├── api_streaming.py
    ├── demos
    │   ├── README.md
    │   ├── chat
    │   │   ├── chat_hybrid.py
    │   │   └── chat_start.py
    │   └── search
    │   │   ├── search_hybrid.py
    │   │   └── search_start.py
    └── notebooks
    │   └── lemonade_model_validation.ipynb
├── img
    ├── basic_demo.gif
    └── llm_demo.png
├── installer
    ├── AMD_LICENSE
    ├── Installer.nsi
    ├── add_to_path.py
    ├── installer_banner.bmp
    ├── lemonade-server.bat
    ├── lemonade_notification.vbs
    └── lemonade_server.vbs
├── mkdocs.yml
├── setup.py
├── src
    ├── lemonade
    │   ├── __init__.py
    │   ├── api.py
    │   ├── cache.py
    │   ├── cli.py
    │   ├── common
    │   │   ├── __init__.py
    │   │   ├── build.py
    │   │   ├── cli_helpers.py
    │   │   ├── exceptions.py
    │   │   ├── filesystem.py
    │   │   ├── inference_engines.py
    │   │   ├── network.py
    │   │   ├── printing.py
    │   │   ├── status.py
    │   │   ├── system_info.py
    │   │   └── test_helpers.py
    │   ├── profilers
    │   │   ├── __init__.py
    │   │   ├── memory_tracker.py
    │   │   └── profiler.py
    │   ├── sequence.py
    │   ├── state.py
    │   ├── tools
    │   │   ├── __init__.py
    │   │   ├── accuracy.py
    │   │   ├── adapter.py
    │   │   ├── bench.py
    │   │   ├── huggingface
    │   │   │   ├── bench.py
    │   │   │   ├── load.py
    │   │   │   └── utils.py
    │   │   ├── humaneval.py
    │   │   ├── llamacpp
    │   │   │   ├── bench.py
    │   │   │   └── load.py
    │   │   ├── management_tools.py
    │   │   ├── mmlu.py
    │   │   ├── oga
    │   │   │   ├── __init__.py
    │   │   │   ├── bench.py
    │   │   │   ├── load.py
    │   │   │   └── utils.py
    │   │   ├── perplexity.py
    │   │   ├── prompt.py
    │   │   ├── quark
    │   │   │   ├── __init__.py
    │   │   │   ├── quark_load.py
    │   │   │   └── quark_quantize.py
    │   │   ├── report
    │   │   │   ├── __init__.py
    │   │   │   ├── llm_report.py
    │   │   │   └── table.py
    │   │   ├── server
    │   │   │   ├── __init__.py
    │   │   │   ├── llamacpp.py
    │   │   │   ├── serve.py
    │   │   │   ├── static
    │   │   │   │   ├── favicon.ico
    │   │   │   │   ├── styles.css
    │   │   │   │   └── webapp.html
    │   │   │   ├── tool_calls.py
    │   │   │   ├── tray.py
    │   │   │   ├── utils
    │   │   │   │   ├── port.py
    │   │   │   │   ├── system_tray.py
    │   │   │   │   └── thread.py
    │   │   │   └── webapp.py
    │   │   └── tool.py
    │   └── version.py
    ├── lemonade_install
    │   ├── __init__.py
    │   └── install.py
    └── lemonade_server
    │   ├── cli.py
    │   ├── model_manager.py
    │   ├── pydantic_models.py
    │   └── server_models.json
└── test
    ├── llm_api.py
    ├── oga_cpu_api.py
    ├── quark_api.py
    ├── server.py
    ├── server_cli.py
    └── server_unit.py


/.github/workflows/publish-to-test-pypi.yml:
--------------------------------------------------------------------------------
 1 | name: Publish Python distributions to PyPI
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: ["main", "canary", "refresh"]
 6 |     tags:
 7 |       - v*
 8 |       - RC*
 9 |   pull_request:
10 |   merge_group:
11 | 
12 | jobs:
13 |   build-n-publish:
14 |     name: Build and publish Python distributions to PyPI
15 |     runs-on: ubuntu-latest
16 |     steps:
17 |       - uses: actions/checkout@main
18 |       - uses: conda-incubator/setup-miniconda@v3
19 |         with:
20 |           miniconda-version: "latest"
21 |           activate-environment: lemon
22 |           python-version: "3.10"
23 |       - name: Install pypa/build
24 |         run: >-
25 |           python -m pip install build --user
26 |       - name: Build a binary wheel and a source tarball
27 |         run: |
28 |           python -m build --sdist --wheel --outdir dist/ .
29 |           version=$(python setup.py --version)
30 |           echo "VERSION=$version" >> $GITHUB_ENV
31 |       - name: Test wheel
32 |         shell: bash -el {0}
33 |         run: |
34 |           python -m pip install --upgrade pip
35 |           pip install "dist/lemonade_sdk-${{ env.VERSION }}-py3-none-any.whl[dev]"
36 |           lemonade -i facebook/opt-125m huggingface-load llm-prompt -p "Hello, my thoughts are"
37 |       - name: Publish distribution package to PyPI
38 |         if: startsWith(github.ref, 'refs/tags/v')
39 |         uses: pypa/gh-action-pypi-publish@release/v1
40 |         with:
41 |           password: ${{ secrets.PYPI_API_TOKEN }}
42 |       - name: Publish distribution package to Test PyPI
43 |         if: startsWith(github.ref, 'refs/tags/RC')
44 |         uses: pypa/gh-action-pypi-publish@release/v1
45 |         with:
46 |           password: ${{ secrets.TEST_PYPI_API_TOKEN }}
47 |           repository_url: https://test.pypi.org/legacy/
48 | 
49 | # This file was originally licensed under Apache 2.0. It has been modified.
50 | # Modifications Copyright (c) 2025 AMD


--------------------------------------------------------------------------------
/.github/workflows/test_lemonade.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python
 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
 3 | 
 4 | name: Lint and Test Lemonade
 5 | 
 6 | on:
 7 |   push:
 8 |     branches: ["main"]
 9 |   pull_request:
10 |   merge_group:
11 | 
12 | permissions:
13 |   contents: read
14 | 
15 | jobs:
16 |   make-lemonade:
17 |     env:
18 |         LEMONADE_CI_MODE: "True"
19 |     strategy:
20 |       matrix:
21 |         os: [ubuntu-latest, windows-latest]
22 |     runs-on: ${{ matrix.os }}
23 |     steps:
24 |       - uses: actions/checkout@v3
25 |       - name: Set up Miniconda with 64-bit Python
26 |         uses: conda-incubator/setup-miniconda@v2
27 |         with:
28 |           miniconda-version: "latest"
29 |           activate-environment: lemon
30 |           python-version: "3.10"
31 |           run-post: "false"
32 |       - name: Install dependencies
33 |         shell: bash -el {0}
34 |         run: |
35 |           python -m pip install --upgrade pip
36 |           pip install pylint
37 |           python -m pip check
38 |           pip install -e .[dev]
39 |       - name: Lint with Black
40 |         uses: psf/black@stable
41 |         with:
42 |           options: "--check --verbose"
43 |           src: "./src"
44 |       - name: Lint with PyLint
45 |         shell: bash -el {0}
46 |         run: |
47 |           pylint src/lemonade --rcfile .pylintrc --disable E0401
48 |           pylint examples --rcfile .pylintrc --disable E0401,E0611,F0010 --jobs=1 -v
49 |       - name: Run lemonade tests
50 |         shell: bash -el {0}
51 |         run: |
52 |           # Test CLI
53 |           lemonade -m -i facebook/opt-125m huggingface-load llm-prompt -p "hi" --max-new-tokens 10
54 |           
55 |           # Test low-level APIs
56 |           python test/llm_api.py
57 | 
58 |           # Test high-level APIs
59 |           python examples/api_basic.py
60 |           python examples/api_streaming.py
61 | 
62 | # This file was originally licensed under Apache 2.0. It has been modified.
63 | # Modifications Copyright (c) 2025 AMD


--------------------------------------------------------------------------------
/.github/workflows/test_lemonade_oga_cpu.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python
 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
 3 | 
 4 | name: Lint and Test Lemonade for OGA on CPU
 5 | 
 6 | on:
 7 |   push:
 8 |     branches: ["main"]
 9 |   pull_request:
10 |   merge_group:
11 | 
12 | permissions:
13 |   contents: read
14 | 
15 | jobs:
16 |   make-oga-cpu-lemonade:
17 |     env:
18 |         LEMONADE_CI_MODE: "True"
19 |     runs-on: windows-latest
20 |     steps:
21 |       - uses: actions/checkout@v3
22 |       - name: Set up Miniconda with 64-bit Python
23 |         uses: conda-incubator/setup-miniconda@v2
24 |         with:
25 |           miniconda-version: "latest"
26 |           activate-environment: lemon
27 |           python-version: "3.10"
28 |           run-post: "false"
29 |       - name: Install dependencies
30 |         shell: bash -el {0}
31 |         run: |
32 |           python -m pip install --upgrade pip
33 |           conda install pylint
34 |           python -m pip check
35 |           pip install -e .[dev,oga-cpu]
36 |       - name: Lint with Black
37 |         uses: psf/black@stable
38 |         with:
39 |           options: "--check --verbose"
40 |           src: "./src"
41 |       - name: Lint with PyLint
42 |         shell: bash -el {0}
43 |         run: |
44 |           pylint src/lemonade --rcfile .pylintrc --disable E0401
45 |       - name: Run lemonade tests
46 |         shell: bash -el {0}
47 |         env:
48 |           HF_TOKEN: "${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}" # Required by OGA model_builder in OGA 0.4.0 but not future versions
49 |         run: |
50 |           # Test CLI
51 |           lemonade -i amd/Qwen2.5-0.5B-Instruct-quantized_int4-float16-cpu-onnx oga-load --device cpu --dtype int4 llm-prompt -p "tell me a story" --max-new-tokens 5
52 | 
53 |           # Test low-level APIs
54 |           python test/oga_cpu_api.py
55 | 
56 |           # Test high-level APIs
57 |           python examples/api_oga_cpu.py
58 |           python examples/api_oga_cpu_streaming.py
59 | 
60 | # This file was originally licensed under Apache 2.0. It has been modified.
61 | # Modifications Copyright (c) 2025 AMD


--------------------------------------------------------------------------------
/.github/workflows/test_quark.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python
 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
 3 | 
 4 | name: Test Lemonade with Quark Quantization
 5 | 
 6 | on:
 7 |   push:
 8 |     branches: ["main"]
 9 |   pull_request:
10 |   merge_group:
11 | 
12 | permissions:
13 |   contents: read
14 | 
15 | jobs:
16 |   make-quark-lemonade:
17 |     env:
18 |         LEMONADE_CI_MODE: "True"
19 |     runs-on: windows-latest
20 |     steps:
21 |       - uses: actions/checkout@v3
22 |       - name: Set up Miniconda with 64-bit Python
23 |         uses: conda-incubator/setup-miniconda@v2
24 |         with:
25 |           miniconda-version: "latest"
26 |           activate-environment: lemon
27 |           python-version: "3.10"
28 |           run-post: "false"
29 |       - name: Install dependencies
30 |         shell: bash -el {0}
31 |         run: |
32 |           python -m pip install --upgrade pip
33 |           conda install pylint
34 |           python -m pip check
35 |           pip install -e .[dev,oga-cpu]
36 |           lemonade-install --quark 0.6.0
37 |       - name: Lint with Black
38 |         uses: psf/black@stable
39 |         with:
40 |           options: "--check --verbose"
41 |           src: "./src"
42 |       - name: Lint with PyLint
43 |         shell: bash -el {0}
44 |         run: |
45 |           pylint src/lemonade/tools/quark --rcfile .pylintrc --disable E0401 
46 |       - name: Run lemonade tests
47 |         shell: bash -el {0}
48 |         env:
49 |           HF_TOKEN: "${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}" # Required by OGA model_builder in OGA 0.4.0 but not future versions
50 |         run: |
51 |           python test/quark_api.py
52 | 
53 | # This file was originally licensed under Apache 2.0. It has been modified.
54 | # Modifications Copyright (c) 2025 AMD


--------------------------------------------------------------------------------
/.github/workflows/test_server.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python
 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
 3 | 
 4 | name: Test Lemonade Server
 5 | 
 6 | on:
 7 |   push:
 8 |     branches: ["main"]
 9 |   pull_request:
10 |   merge_group:
11 | 
12 | permissions:
13 |   contents: read
14 | 
15 | jobs:
16 |   make-server-lemonade:
17 |     env:
18 |         LEMONADE_CI_MODE: "True"
19 |     strategy:
20 |       matrix:
21 |         python-version: ["3.10", "3.12"]
22 |         os: [ubuntu-latest, windows-latest]
23 |     runs-on: ${{ matrix.os }}
24 |     steps:
25 |       - uses: actions/checkout@v3
26 |       - name: Set up Miniconda with 64-bit Python
27 |         uses: conda-incubator/setup-miniconda@v2
28 |         with:
29 |           miniconda-version: "latest"
30 |           activate-environment: lemon
31 |           python-version: ${{ matrix.python-version }}
32 |           run-post: "false"
33 |       - name: Install dependencies
34 |         shell: bash -el {0}
35 |         run: |
36 |           python -m pip install --upgrade pip
37 |           python -m pip check
38 |           pip install -e .[dev,oga-cpu]
39 |           lemonade-server-dev pull Qwen2.5-0.5B-Instruct-CPU
40 |       - name: Run server tests (unit tests)
41 |         shell: bash -el {0}
42 |         run: |
43 |           python test/server_unit.py
44 |       - name: Run server tests (network online mode)
45 |         shell: bash -el {0}
46 |         run: |
47 |           python test/server.py
48 |       - name: Run server tests (offline mode)
49 |         shell: bash -el {0}
50 |         run: |
51 |           python test/server.py --offline
52 | 
53 | # This file was originally licensed under Apache 2.0. It has been modified.
54 | # Modifications Copyright (c) 2025 AMD
55 | 


--------------------------------------------------------------------------------
/.lfsconfig:
--------------------------------------------------------------------------------
1 | [lfs]
2 |   fetchexclude = *.onnx,*_model.zip
3 | 


--------------------------------------------------------------------------------
/NOTICE.md:
--------------------------------------------------------------------------------
 1 | PORTIONS LICENSED AS FOLLOWS
 2 | 
 3 | Lemonade SDK used the [ONNX TurnkeyML](https://github.com/onnx/turnkeyml) project as a starting point under the [Apache 2.0 license](./LICENSE).
 4 | 
 5 | ## TurnkeyML Attribution
 6 | 
 7 | TurnkeyML used code from other open source projects as a starting point (see [NOTICE.md](NOTICE.md)). Thank you Philip Colangelo, Derek Elkins, Jeremy Fowers, Dan Gard, Victoria Godsoe, Mark Heaps, Daniel Holanda, Brian Kurtz, Mariah Larwood, Philip Lassen, Andrew Ling, Adrian Macias, Gary Malik, Sarah Massengill, Ashwin Murthy, Hatice Ozen, Tim Sears, Sean Settle, Krishna Sivakumar, Aviv Weinstein, Xueli Xao, Bill Xing, and Lev Zlotnik for your contributions to that work.
 8 | 
 9 | \>  TurnkeyML used code from the [MLAgility](https://github.com/groq/mlagility) and [GroqFlow](https://github.com/groq/groqflow) projects as a starting point. Much of that code was refactored, improved, or replaced by the time TurnkeyML was published. 
10 | 
11 | \> TurnkeyML uses the [Microsoft lemon emoji](https://github.com/microsoft/fluentui-emoji) as an icon for the lemonade tool.
12 | 
13 | >The MIT License
14 | >
15 | >Copyright 2023 Groq Inc.
16 | >
17 | >Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
18 | >
19 | >The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
20 | >
21 | >THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/docs/CNAME:
--------------------------------------------------------------------------------
1 | lemonade-server.ai


--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
 1 | # 🍋 Lemonade SDK
 2 | 
 3 | Welcome to the documentation for the Lemonade SDK project! Use this resource to learn more about the server, CLI, API, and how to contribute to the project.
 4 | 
 5 | <div class="hide-in-mkdocs">
 6 | 
 7 | - [Installation](#installation)
 8 | - [Server](#server)
 9 | - [Developer CLI](#developer-cli)
10 | - [Lemonade API](#lemonade-api)
11 | - [Software and Hardware Overview](#software-and-hardware-overview)
12 |   - [Supported Hardware Accelerators](#supported-hardware-accelerators)
13 |   - [Supported Inference Engines](#supported-inference-engines)
14 | - [Contributing](#contributing)
15 | </div>
16 | 
17 | ## Installation
18 | 
19 | 
20 | [Click here for Lemonade SDK installation options](https://lemonade-server.ai/install_options.html).
21 | 
22 | For a quick start with Hugging Face (PyTorch) LLMs on CPU, run the following installation command in an active Python 3 environment, and then try the Server, CLI, or API links below.
23 | 
24 | ```bash
25 | pip install lemonade-sdk[dev]
26 | ```
27 | 
28 | ## Server
29 | 
30 | The Lemonade Server is an OpenAI API-compatible HTTP server that supports streamlined integration with a wide variety of LLM applications. Learn more in the [server documentation](https://lemonade-server.ai/docs/).
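
Because the server exposes an OpenAI-compatible API, any OpenAI client library can talk to it. The snippet below is a minimal sketch using the `openai` Python package; the base URL is an assumption about a default local install (check the server documentation for the actual address and port), and the model name is simply the one pulled in this repo's CI workflow, so substitute any model you have installed.

```python
from openai import OpenAI

# Assumptions: Lemonade Server is already running locally, and the base URL below
# matches your install (adjust if your server uses a different host/port/path).
client = OpenAI(
    base_url="http://localhost:8000/api/v1",
    api_key="none",  # a local server does not need a real API key
)

completion = client.chat.completions.create(
    model="Qwen2.5-0.5B-Instruct-CPU",  # any model you have installed works here
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(completion.choices[0].message.content)
```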
31 | 
32 | ## Developer CLI
33 | 
34 | The Lemonade developer CLI, `lemonade`, offers tools for performance benchmarking, accuracy evaluation, and device-specific model preparation. Learn more in the dev CLI [README.md](./dev_cli/README.md).
35 | 
36 | ## Lemonade API
37 | 
38 | The high-level Lemonade API abstracts loading models from any supported framework (e.g., Hugging Face, OGA) and backend (e.g., CPU, Hybrid) using the popular `from_pretrained()` function. This makes it easy to integrate Lemonade LLMs into Python applications. For more information on recipes and compatibility, see the [Lemonade API ReadMe](./lemonade_api.md).
39 | 
40 | OGA Hybrid:
41 | ```python
42 | from lemonade.api import from_pretrained
43 | 
44 | model, tokenizer = from_pretrained("amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid", recipe="oga-hybrid")
45 | 
46 | input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids
47 | response = model.generate(input_ids, max_new_tokens=30)
48 | 
49 | print(tokenizer.decode(response[0]))
50 | ```
51 | 
52 | You can find examples for the high-level APIs [here](https://github.com/lemonade-sdk/lemonade/tree/main/examples).
53 | 
54 | ## Software and Hardware Overview
55 | 
56 | The goal of Lemonade is to help you achieve maximum LLM performance on your PC. To cover a wide range of PCs, Lemonade supports the hardware accelerators and inference engines described in the subsections below.
57 | 
58 | ### Supported Hardware Accelerators
59 | 
60 | | Mode | Description |
61 | | :--- | :--- |
62 | | **NPU & Hybrid** | Ryzen™ AI 300-series devices have a neural processing unit (NPU) that can run LLMs and accelerate time-to-first-token (TTFT) performance. The typical way of utilizing the NPU is called *hybrid execution*, where the prompt is processed on the NPU to produce the first token, and the remaining tokens are computed on the Ryzen AI integrated GPU (iGPU). |
63 | | **GPU** | PCs with an integrated GPU (iGPU), such as many laptop SoCs, and/or discrete GPU (dGPU), such as many desktop and workstation PCs, can run LLMs on that GPU hardware. Lemonade Server provides GPU support in every installation via the Vulkan llama.cpp binaries.<br/><br/> <sub>Note: GPU support is not currently provided for CLI tasks such as benchmarking.</sub> |
64 | 
65 | ### Supported Inference Engines
66 | | Engine | Description |
67 | | :--- | :--- |
68 | | **OnnxRuntime GenAI (OGA)** | Microsoft engine that runs `.onnx` models and enables hardware vendors to provide their own execution providers (EPs) to support specialized hardware, such as neural processing units (NPUs). |
69 | | **llamacpp** | Community-driven engine with strong GPU acceleration, support for thousands of `.gguf` models, and advanced features such as vision-language models (VLMs) and mixture-of-experts (MoEs). |
70 | | **Hugging Face (HF)** | Hugging Face's `transformers` library can run the original `.safetensors` trained weights for models on Meta's PyTorch engine, which provides a source of truth for accuracy measurement. |
71 | 
72 | ## Contributing
73 | 
74 | Contributions are welcome! If you decide to contribute, please:
75 | 
76 | - Do so via a pull request.
77 | - Write your code in keeping with the same style as the rest of this repo's code.
78 | - Add a test under `test/` that provides coverage of your new feature.
79 | 
80 | The best way to contribute is to add new tools to cover more devices and usage scenarios.
81 | 
82 | To add a new tool:
83 | 
84 | 1. (Optional) Create a new `.py` file under `src/lemonade/tools` (or use an existing file if your tool fits into a pre-existing family of tools).
85 | 1. Define a new class that inherits the `Tool` class.
86 | 1. Register the class by adding it to the list of `tools` near the top of `src/lemonade/cli.py`.
87 | 
88 | You can learn more about contributing on the repository's [contribution guide](https://github.com/lemonade-sdk/lemonade/blob/main/docs/contribute.md).
89 | 
90 | <!--This file was originally licensed under Apache 2.0. It has been modified.
91 | Modifications Copyright (c) 2025 AMD-->


--------------------------------------------------------------------------------
/docs/assets/carousel.js:
--------------------------------------------------------------------------------
 1 | // Simple YouTube video carousel for MkDocs Material
 2 | 
 3 | document.addEventListener('DOMContentLoaded', function () {
 4 |   var carousel = document.getElementById('yt-carousel');
 5 |   if (!carousel) return;
 6 |   // Support both data-ids (comma-separated) and data-videos (JSON array of {id, title})
 7 |   var videos = [];
 8 |   if (carousel.dataset.videos) {
 9 |     try {
10 |       videos = JSON.parse(carousel.dataset.videos);
11 |     } catch (e) {
12 |       console.error('Invalid JSON in data-videos:', e);
13 |     }
14 |   } else if (carousel.dataset.ids) {
15 |     videos = carousel.dataset.ids.split(',').map(function(id) {
16 |       return { id: id.trim(), title: '' };
17 |     });
18 |   }
19 |   if (!videos.length) return;
20 |   var idx = 0;
21 | 
22 |   function render() {
23 |     var video = videos[idx];
24 |     var titleHtml = video.title ? `<div style=\"margin-bottom:8px;font-weight:bold;font-size:1.1rem;\">${video.title}</div>` : '';
25 |     carousel.innerHTML = `
26 |       <div style="display:flex;flex-direction:column;align-items:center;max-width:100%;">
27 |         ${titleHtml}
28 |         <div style="position:relative;width:100%;max-width:560px;aspect-ratio:16/9;">
29 |           <iframe style="width:100%;height:100%;border-radius:12px;box-shadow:0 2px 16px #0003;" src="https://www.youtube.com/embed/${video.id}" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
30 |         </div>
31 |         <div style="margin-top:16px;display:flex;align-items:center;gap:16px;">
32 |           <button id="yt-prev" style="padding:2px 8px;font-size:0.6rem;border:none;border-radius:6px;background:#E59800;color:#222;font-weight:bold;cursor:pointer;">Prev</button>
33 |           <span style="font-size:1rem;color:#666;">${idx+1} / ${videos.length}</span>
34 |           <button id="yt-next" style="padding:2px 8px;font-size:0.6rem;border:none;border-radius:6px;background:#E59800;color:#222;font-weight:bold;cursor:pointer;">Next</button>
35 |         </div>
36 |       </div>
37 |     `;
38 |     document.getElementById('yt-prev').onclick = function() {
39 |       idx = (idx - 1 + videos.length) % videos.length;
40 |       render();
41 |     };
42 |     document.getElementById('yt-next').onclick = function() {
43 |       idx = (idx + 1) % videos.length;
44 |       render();
45 |     };
46 |   }
47 |   render();
48 | });
49 | 


--------------------------------------------------------------------------------
/docs/assets/extra.css:
--------------------------------------------------------------------------------
 1 | /* Note: I have not figured out all the color variables yet */
 2 | 
 3 | [data-md-color-scheme="lightmode"] {
 4 |   --md-primary-fg-color:        #FFFBE9;        /* Header, Selected Page Font */
 5 |   --md-primary-bg-color:        #000000;    /* Header Font, Icon Color*/
 6 |   --md-primary-bg-color--light: #000000;    /* Search bar font color */
 7 |   --md-accent-fg-color:         #FFD744;    /* Hover color of links */
 8 |   --md-footer-fg-color:                #E59800;      /* Nav Footer Font Color */
 9 |   --md-footer-fg-color--light:         #3b3b3b;      /* Footer Font Color */
10 |   --md-footer-fg-color--lighter:       #3b3b3b;      /* Made With... color */
11 |   --md-footer-bg-color:                #FFFBE9;      /* Nav Footer Background  Color */
12 |   --md-footer-bg-color--dark:          #FFFBE9;      /* Footer Background Color */
13 |   --md-default-bg-color:               #FFFBE9;       /* Main background color */
14 |   --md-code-bg-color:                #ffefb5;      /* Code block background color */
15 |   --md-code-fg-color:                #000000;      /* Code block font color */
16 |   --md-default-fg-color--light:             #E59800;      /* Blockquote color */
17 | }
18 | 
19 | [data-md-color-scheme="slate"] {
20 |   --md-primary-fg-color:        #FFD500;        /* Header, Selected Page Font */
21 |   --md-primary-bg-color:        #000000;    /* Header Font, Icon Color*/
22 |   --md-primary-bg-color--light: #000000;    /* Search bar font color */
23 |   --md-accent-fg-color:         #FFD500;    /* Hover color of links */
24 |   --md-accent-fg-color--transparent: #E59800;
25 |   --md-footer-fg-color:                #E59800;      /* Nav Footer Font Color */
26 |   --md-footer-fg-color--light:         #929292;      /* Footer Font Color */
27 |   --md-footer-fg-color--lighter:       #929292;      /* Made With... color */
28 |   --md-footer-bg-color:                #000000;      /* Nav Footer Background  Color */
29 |   --md-footer-bg-color--dark:          #000000;      /* Footer Background Color */
30 |   --md-primary-bg-color--light:          #000000;      /* Search Font */
31 | }
32 | 
33 | [data-md-color-scheme="slate"] {
34 |   --md-hue: 320; /* between 0 and 360 */
35 |   /* --md-saturation: 50; /* between 0 and 100 */
36 |   /* --md-lightness: 100; between 0 and 100 */
37 |   --md-footer-bg-color:  #141413;      /* Nav Footer Background  Color */
38 |   --md-default-bg-color: #141413 !important; /* Dark background */
39 |   --md-primary-fg-color: #E59800 !important; /* Header, Selected Page Font */
40 |   --md-footer-bg-color--dark: #1f1503 !important; /* Footer Background Color */
41 | }
42 | 
43 | .hide-in-mkdocs { display: none; }
44 | 
45 | /* docs/assets/extra.css */
46 | .mkdocs-only { display: block; }
47 | 
48 | /* Hide the page title in the navigation sidebar */
49 | .md-nav__title {
50 |   display: none !important;
51 | }
52 | 
53 | /* Make page titles (h1) a darker grey in light mode, lighter in dark mode */
54 | [data-md-color-scheme="lightmode"] h1,
55 | [data-md-color-scheme="lightmode"] .md-typeset h1 {
56 |   color: #222 !important;
57 | }
58 | [data-md-color-scheme="slate"] h1,
59 | [data-md-color-scheme="slate"] .md-typeset h1 {
60 |   color: #cfcfcf !important;
61 | }
62 | 
63 | 


--------------------------------------------------------------------------------
/docs/assets/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonade-sdk/lemonade/b86292bf84121d003d40804d68032904901ee1bb/docs/assets/favicon.ico


--------------------------------------------------------------------------------
/docs/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonade-sdk/lemonade/b86292bf84121d003d40804d68032904901ee1bb/docs/assets/logo.png


--------------------------------------------------------------------------------
/docs/assets/mkdocs_requirements.txt:
--------------------------------------------------------------------------------
1 | mkdocs
2 | mkdocs-material
3 | mkdocs-monorepo-plugin
4 | pymdown-extensions


--------------------------------------------------------------------------------
/docs/code.md:
--------------------------------------------------------------------------------
 1 | # Lemonade SDK Code Structure
 2 | 
 3 | ## Repo Organization
 4 | 
 5 | The Lemonade SDK source code has a few major top-level directories:
 6 | - `docs`: documentation for the entire project.
 7 | - `examples`: example scripts for use with the Lemonade tools.
 8 | - `src/lemonade`: source code for the lemonade-sdk package.
 9 |   - `src/lemonade/tools`: implements `Tool` and defines the tools built in to `lemonade`.
10 |   - `src/lemonade/sequence.py`: implements `Sequence` and defines the plugin API for `Tool`s.
11 | - `src/lemonade/cli.py`: implements the `lemonade` CLI.
12 |   - `src/lemonade/common`: functions common to the other modules.
13 |   - `src/lemonade/version.py`: defines the package version number.
14 |   - `src/lemonade/state.py`: implements the `State` class.
15 | - `test`: tests for the Lemonade SDK tools.
16 | 
17 | ## Tool Classes
18 | 
19 | All of the logic for actually building models is contained in `Tool` classes. Generally, a `FirstTool` class obtains a model, and each subsequent `Tool` is a model-to-model transformation. For example:
20 | - the `Discover(FirstTool)` class (aka `discover` in the CLI) obtains a PyTorch model instance from a Python script.
21 | - the `ExportPytorchModel(Tool)` class (aka `export-pytorch` in the CLI) transforms a PyTorch model instance into an ONNX model file.
22 | 
23 | ### Composability
24 | 
25 | `Tools` are designed to be composable. This composability is facilitated by the `State` class, which is how `Tools` communicate with each other. Every `Tool` takes an instance of `State` as input and then returns an instance of `State`.
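26 | 
To make the pattern concrete, here is a purely illustrative sketch of tools passing a shared state object down a pipeline. The class names and `run()` signature are simplified stand-ins rather than Lemonade's actual interfaces; see the Implementation section below for the real definitions.

```python
from dataclasses import dataclass, field


# Illustrative stand-ins only; Lemonade's real State and Tool classes live in
# src/lemonade/state.py and src/lemonade/tools/tool.py and have richer interfaces.
@dataclass
class State:
    results: dict = field(default_factory=dict)


class LoadModel:
    def run(self, state: State) -> State:
        state.results["model"] = "dummy-model"  # a real tool would load a model here
        return state


class Benchmark:
    def run(self, state: State) -> State:
        # Consumes what the previous tool put into State
        state.results["tokens_per_second"] = 42.0
        return state


def run_sequence(tools, state: State) -> State:
    # Because every tool maps State -> State, tools compose into a pipeline
    for tool in tools:
        state = tool.run(state)
    return state


print(run_sequence([LoadModel(), Benchmark()], State()).results)
```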
26 | 
27 | ### Implementation
28 | 
29 | See [tool.py](https://github.com/lemonade-sdk/lemonade/blob/main/src/lemonade/tools/tool.py) for a definition of each method of `Tool` that must be implemented to create a new `Tool` subclass.
30 | 
31 | <!--This file was originally licensed under Apache 2.0. It has been modified.
32 | Modifications Copyright (c) 2025 AMD-->


--------------------------------------------------------------------------------
/docs/dev_cli/humaneval_accuracy.md:
--------------------------------------------------------------------------------
  1 | # Using the HumanEval accuracy test tools
  2 | 
  3 | The HumanEval benchmark is a code generation and functional correctness evaluation framework designed to assess language models' ability to generate Python code. It consists of 164 handwritten programming problems, each containing a function signature, docstring, body, and several unit tests. This benchmark focuses on evaluating a model's capability to generate functionally correct code that passes the test cases, making it particularly useful for assessing code generation capabilities.
  4 | 
  5 | This tool provides an automated way to evaluate language models on the HumanEval benchmark. It handles the process of downloading the dataset, generating code completions, executing them in a secure environment, and calculating pass@k metrics.
  6 | 
  7 | ## Dataset
  8 | 
  9 | The HumanEval dataset is automatically downloaded from [OpenAI's human-eval repository](https://github.com/openai/human-eval) when you first run the benchmark. The dataset contains programming problems that test various aspects of Python programming, including:
 10 | 
 11 | - Basic programming operations
 12 | - String manipulation
 13 | - Mathematical computations
 14 | - List operations
 15 | - Algorithm implementation
 16 | - Data structure manipulation
 17 | 
 18 | ## Running the Benchmark
 19 | 
 20 | ```bash
 21 | lemonade -i meta-llama/Llama-3.2-1B oga-load --device igpu --dtype int4 accuracy-humaneval --k-samples 1 --first-n-samples 5 --timeout 30.0
 22 | ```
 23 | 
 24 | ### Optional arguments:
 25 | 
 26 | `--k-samples`: Number of completions to generate per prompt (default: 1). This parameter determines the k in pass@k metrics. For example:
 27 | - `--k-samples 1`: Calculates pass@1 (single attempt per problem)
 28 | - `--k-samples 10`: Calculates pass@10 (ten attempts per problem)
 29 | - `--k-samples 100`: Calculates pass@100 (hundred attempts per problem)
 30 | 
 31 | Higher k values provide more robust evaluation but take longer to run.
 32 | 
 33 | `--first-n-samples`: Evaluate only the first N problems from the dataset (default: entire dataset). Useful for quick testing or when you want to evaluate a subset of problems.
 34 | 
 35 | `--timeout`: Maximum time in seconds allowed for each test case execution (default: 30.0). This prevents infinite loops or long-running code from blocking the evaluation.
 36 | 
 37 | `--data-dir`: Custom directory for storing the HumanEval dataset (default: "<lemonade_cache_dir>/data/humaneval").
 38 | 
 39 | ## How It Works
 40 | 
 41 | 1. **Dataset Preparation:**
 42 |    - On first run, the tool downloads the HumanEval dataset (HumanEval.jsonl.gz)
 43 |    - The dataset contains function signatures, docstrings, and test cases
 44 |    - Each problem is structured to test specific programming capabilities
 45 |    - You can evaluate only the first N problems using `--first-n-samples`
 46 | 
 47 | 2. **Code Generation:**
 48 |    - For each programming problem, the model is provided with a prompt containing:
 49 |      - Function signature (e.g., `def sort_numbers(numbers):`)
 50 |      - Docstring describing the function's purpose and requirements
 51 |    - The model generates k code completions for the function body (controlled by `--k-samples`)
 52 |    - These k samples are used to calculate the pass@k metric
 53 | 
 54 | 3. **Secure Execution:**
 55 |    - Generated code is executed in a secure sandbox environment maintained by OpenAI's human-eval library. Be aware that OpenAI's policy is to disable code execution by default; however, lemonade enables code execution by default by automatically setting the environment variable `HF_ALLOW_CODE_EVAL=1`. OpenAI provides the following code execution protections:
 56 |      - **Process Isolation**: Each code sample runs in a separate process to prevent interference
 57 |      - **Resource Limits**:
 58 |        - CPU time limit (controlled by `--timeout`)
 59 |        - Memory usage restrictions
 60 |        - Maximum output size restrictions
 61 |      - **Restricted Access**:
 62 |        - No network access
 63 |        - No file system access outside test directory
 64 |        - No subprocess creation
 65 |        - No system calls
 66 |      - **Module Restrictions**:
 67 |        - Only allows importing standard Python libraries needed for testing
 68 |        - Blocks potentially dangerous modules (os, sys, subprocess, etc.)
 69 |    These security measures are implemented through:
 70 |    - Python's built-in `resource` module for resource limits
 71 |    - AST (Abstract Syntax Tree) analysis for code validation
 72 |    - Process-level isolation using `multiprocessing`
 73 |    - Custom import hooks to restrict module access
 74 | 
 75 | 4. **Evaluation Metrics:**
 76 |    - **pass@k**: Percentage of problems solved with k attempts
 77 |      - pass@1: Success rate with single attempt
 78 |      - pass@10: Success rate within 10 attempts
 79 |      - pass@100: Success rate within 100 attempts
 80 |    - A problem is considered solved if all test cases pass
 81 |    - Results are normalized to percentages
 82 | 
 83 | 5. **Output Files:**
 84 |    The tool generates several output files in the results directory:
 85 |    - `evaluation_results.csv`: Contains prompts, completions, and expected answers
 86 |    - `humaneval_predictions.jsonl`: Raw model predictions in JSONL format
 87 |    - `humaneval_predictions.jsonl_results.jsonl`: Detailed evaluation results
 88 | 
 89 | ## Example Results Format
 90 | 
 91 | The evaluation produces metrics in the following format:
 92 | ```json
 93 | {
 94 |     "pass@1": 0.25,    // 25% success rate with 1 attempt
 95 |     "pass@10": 0.45,   // 45% success rate within 10 attempts
 96 |     "pass@100": 0.65   // 65% success rate within 100 attempts
 97 | }
 98 | ```
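
For reference, pass@k is typically computed with the unbiased estimator from the HumanEval paper rather than by averaging raw success counts. A small, self-contained sketch (independent of this tool's internals):

```python
import numpy as np


def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimate given n generated samples, of which c passed."""
    if n - c < k:
        return 1.0
    # 1 minus the probability that a random size-k subset contains no passing sample
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))


# Example: 20 completions per problem, 5 of them passed all unit tests
print(pass_at_k(n=20, c=5, k=1))   # 0.25
print(pass_at_k(n=20, c=5, k=10))  # ~0.98, since 10 attempts are allowed
```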
 99 | 
100 | ## Limitations
101 | 
102 | 1. **Resource Requirements**: Generating multiple samples per problem (high k values) can be computationally intensive and time-consuming.
103 | 2. **Memory Usage**: Large language models may require significant memory, especially when generating multiple samples.
104 | 
105 | ## References
106 | 
107 | 1. [Evaluating Large Language Models Trained on Code](https://arxiv.org/abs/2107.03374)
108 | 2. [OpenAI HumanEval Repository](https://github.com/openai/human-eval) 
109 | 
110 | <!--This file was originally licensed under Apache 2.0. It has been modified.
111 | Modifications Copyright (c) 2025 AMD-->


--------------------------------------------------------------------------------
/docs/dev_cli/llamacpp.md:
--------------------------------------------------------------------------------
  1 | # LLAMA.CPP
  2 | 
  3 | Run transformer models using llama.cpp. This integration allows you to:
  4 | 1. Load and run llama.cpp models
  5 | 2. Benchmark model performance
  6 | 3. Use the models with other tools like chat or MMLU accuracy testing
  7 | 
  8 | ## Prerequisites
  9 | 
 10 | You need:
 11 | 1. A compiled llama.cpp executable (llama-cli or llama-cli.exe)
 12 | 2. A GGUF model file
 13 | 
 14 | ### Building llama.cpp (if needed)
 15 | 
 16 | #### Linux
 17 | ```bash
 18 | git clone https://github.com/ggerganov/llama.cpp
 19 | cd llama.cpp
 20 | make
 21 | ```
 22 | 
 23 | #### Windows
 24 | ```bash
 25 | git clone https://github.com/ggerganov/llama.cpp
 26 | cd llama.cpp
 27 | cmake -B build
 28 | cmake --build build --config Release
 29 | ```
 30 | 
 31 | The executable will be in `build/bin/Release/llama-cli.exe` on Windows or `llama-cli` in the root directory on Linux.
 32 | 
 33 | ## Usage
 34 | 
 35 | ### Loading a Model
 36 | 
 37 | Use the `load-llama-cpp` tool to load a model:
 38 | 
 39 | ```bash
 40 | lemonade -i MODEL_NAME load-llama-cpp \
 41 |     --executable PATH_TO_EXECUTABLE \
 42 |     --model-binary PATH_TO_GGUF_FILE
 43 | ```
 44 | 
 45 | Parameters:
 46 | | Parameter     | Required | Default | Description                                           |
 47 | |--------------|----------|---------|-------------------------------------------------------|
 48 | | executable   | Yes      | -       | Path to llama-cli/llama-cli.exe                      |
 49 | | model-binary | Yes      | -       | Path to .gguf model file                             |
 50 | | threads      | No       | 1       | Number of threads for generation                      |
 51 | | context-size | No       | 512     | Context window size                                  |
 52 | | output-tokens| No       | 512     | Maximum number of tokens to generate                 |
 53 | 
 54 | ### Benchmarking
 55 | 
 56 | After loading a model, you can benchmark it using `llama-cpp-bench`:
 57 | 
 58 | ```bash
 59 | lemonade -i MODEL_NAME \
 60 |     load-llama-cpp \
 61 |         --executable PATH_TO_EXECUTABLE \
 62 |         --model-binary PATH_TO_GGUF_FILE \
 63 |     llama-cpp-bench
 64 | ```
 65 | 
 66 | Benchmark parameters:
 67 | | Parameter         | Default                    | Description                               |
 68 | |------------------|----------------------------|-------------------------------------------|
 69 | | prompt           | "Hello, I am conscious and"| Input prompt for benchmarking            |
 70 | | context-size     | 512                        | Context window size                       |
 71 | | output-tokens    | 512                        | Number of tokens to generate              |
 72 | | iterations       | 1                          | Number of benchmark iterations            |
 73 | | warmup-iterations| 0                          | Number of warmup iterations (not counted) |
 74 | 
 75 | The benchmark will measure and report:
 76 | - Time to first token (prompt evaluation time)
 77 | - Token generation speed (tokens per second)
 78 | 
 79 | ### Example Commands
 80 | 
 81 | #### Windows Example
 82 | ```bash
 83 | # Load and benchmark a model
 84 | lemonade -i Qwen/Qwen2.5-0.5B-Instruct-GGUF \
 85 |     load-llama-cpp \
 86 |         --executable "C:\work\llama.cpp\build\bin\Release\llama-cli.exe" \
 87 |         --model-binary "C:\work\llama.cpp\models\qwen2.5-0.5b-instruct-fp16.gguf" \
 88 |     llama-cpp-bench \
 89 |         --iterations 3 \
 90 |         --warmup-iterations 1
 91 | 
 92 | # Run MMLU accuracy test
 93 | lemonade -i Qwen/Qwen2.5-0.5B-Instruct-GGUF \
 94 |     load-llama-cpp \
 95 |         --executable "C:\work\llama.cpp\build\bin\Release\llama-cli.exe" \
 96 |         --model-binary "C:\work\llama.cpp\models\qwen2.5-0.5b-instruct-fp16.gguf" \
 97 |     accuracy-mmlu \
 98 |         --tests management \
 99 |         --max-evals 2
100 | ```
101 | 
102 | #### Linux Example
103 | ```bash
104 | # Load and benchmark a model
105 | lemonade -i Qwen/Qwen2.5-0.5B-Instruct-GGUF \
106 |     load-llama-cpp \
107 |         --executable "./llama-cli" \
108 |         --model-binary "./models/qwen2.5-0.5b-instruct-fp16.gguf" \
109 |     llama-cpp-bench \
110 |         --iterations 3 \
111 |         --warmup-iterations 1
112 | ```
113 | 
114 | ## Integration with Other Tools
115 | 
116 | After loading with `load-llama-cpp`, the model can be used with any tool that supports the ModelAdapter interface, including:
117 | - accuracy-mmlu
118 | - llm-prompt
119 | - accuracy-humaneval
120 | - and more
121 | 
122 | The integration provides:
123 | - Platform-independent path handling (works on both Windows and Linux)
124 | - Proper error handling with detailed messages
125 | - Performance metrics collection
126 | - Configurable generation parameters (temperature, top_p, top_k)
127 | - 10-minute timeout for model generation to prevent indefinite hangs
128 | 
129 | <!--This file was originally licensed under Apache 2.0. It has been modified.
130 | Modifications Copyright (c) 2025 AMD-->


--------------------------------------------------------------------------------
/docs/dev_cli/lm-eval.md:
--------------------------------------------------------------------------------
  1 | # Evaluating Models with lm-eval-harness
  2 | 
  3 | The `lm-eval-harness` tool in Lemonade provides an easy way to evaluate language models on a variety of standardized benchmarks using the [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) framework from EleutherAI. This tool allows you to generate standardized accuracy metrics across a wide range of tasks and datasets.
  4 | 
  5 | ## How It Works
  6 | 
  7 | Lemonade makes model evaluation simple by handling the entire workflow for you:
  8 | 
  9 | 1. **Load Your Model**: First, you load your model using either Hugging Face (`huggingface-load`) or ONNX Runtime GenAI (`oga-load`) with your preferred settings like device and dtype.
 10 | 
 11 | 2. **Start Evaluation Server**: Lemonade automatically starts a local server with your loaded model, making it accessible to the evaluation framework.
 12 | 
 13 | 3. **Run lm-evaluation-harness**: Lemonade then runs the lm-evaluation-harness against the server, executing the specific tasks and benchmarks requested.
 14 | 
 15 | 4. **Organize Results**: Finally, Lemonade processes the raw evaluation data and organizes it into clear, readable reports with key metrics like accuracy percentages, saving everything to the model's build directory for easy access.
 16 | 
 17 | ## Usage
 18 | 
 19 | The basic syntax follows this pattern:
 20 | 
 21 | ```bash
 22 | lemonade -i <checkpoint> <loading_method> [loading_options] lm-eval-harness --task <task_name> [options]
 23 | ```
 24 | 
 25 | ### Common Options
 26 | 
 27 | - `--task`: Specifies which task to evaluate on (e.g., gsm8k, mmlu, mmlu_*).
 28 | - `--limit`: Optional number of examples to evaluate (useful for quick tests).
 29 | - `--num-fewshot`: Number of examples to use in few-shot prompts (default: 0).
 30 | - `--log_samples`: Log individual samples and predictions.
 31 | 
 32 | ### Examples
 33 | 
 34 | #### ONNX Runtime GenAI:
 35 | 
 36 | ```bash
 37 | lemonade -i meta-llama/Llama-3.2-1B-Instruct oga-load --device cpu --dtype int4 lm-eval-harness --task mmlu_abstract_algebra --limit 10
 38 | ```
 39 | 
 40 | This example:
 41 | - Loads the Llama 3.2 1B model with OGA.
 42 | - Quantizes to INT4 precision.
 43 | - Evaluates on the abstract algebra subset of MMLU.
 44 | - Limits evaluation to 10 questions.
 45 | 
 46 | #### Hugging Face:
 47 | 
 48 | ```bash
 49 | lemonade -i meta-llama/Llama-3.2-1B-Instruct huggingface-load --device cpu lm-eval-harness --task mmlu_abstract_algebra
 50 | ```
 51 | 
 52 | This example:
 53 | - Loads the Llama 3.2 1B model using Hugging Face.
 54 | - Evaluates on the abstract algebra subset of MMLU.
 55 | - Uses the full test set.
 56 | 
 57 | ## Supported Tasks
 58 | 
 59 | The tool supports all tasks available in lm-evaluation-harness, including:
 60 | 
 61 | - **MMLU**: Massive Multitask Language Understanding (use `mmlu` for all subjects or `mmlu_<subject>` for specific subjects).
 62 | - **GSM8K**: Grade School Math word problems.
 63 | - **HumanEval**: Code generation and completion.
 64 | - **TruthfulQA**: Testing model truthfulness.
 65 | - **MATH**: Complex mathematical problem solving.
 66 | - And many more (see the [full list in the lm-evaluation-harness repository](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/README.md)).
 67 | 
 68 | ## Understanding Results
 69 | 
 70 | Results are displayed in the terminal and saved to the model's build directory.
 71 | 
 72 | ### Metrics
 73 | 
 74 | The key metrics vary by task, but commonly include:
 75 | 
 76 | - **exact_match**: Percentage of exact matches between model predictions and expected answers.
 77 | - **acc** or **accuracy**: Accuracy score (varies by task).
 78 | - **f1**: F1 score for tasks that require partial matching.
 79 | 
 80 | For multiple-choice tasks like MMLU, scores represent the percentage of correct answers. For generative tasks like GSM8K, results often include metrics for both strict and flexible matching:
 81 | 
 82 | - **exact_match,strict-match**: Requires the model to produce the exact correct answer.
 83 | - **exact_match,flexible-extract**: Allows for variations in formatting but requires the correct numerical answer.
 84 | 
 85 | ### Result Files
 86 | 
 87 | Detailed result files are saved in:
 88 | ```
 89 | <cache_dir>/builds/<model_name>_<timestamp>/lm_eval_results/<task_name>_results/
 90 | ```
 91 | 
 92 | These include the full evaluation data in JSON format.
 93 | 
 94 | ## Interpreting Results
 95 | 
 96 | When evaluating models, consider:
 97 | 
 98 | 1. **Task Relevance**: Different tasks measure different capabilities. Choose tasks relevant to your use case.
 99 | 
100 | 2. **Comparison Context**: Compare results against other models of similar size/architecture for meaningful insights.
101 | 
102 | 3. **Few-shot Performance**: Many models perform significantly better with examples (try `--num-fewshot 5`).
103 | 
104 | 4. **Limitations**: Low scores on specific tasks may highlight limitations in the model's training data or capabilities.
105 | 
106 | Summary results from the `lm-eval-harness` tool are also included in the tables generated by
107 | the report tool (`lemonade report --perf`).
108 | 
109 | ## Further Information
110 | 
111 | For more details on lm-evaluation-harness and its capabilities, see the [official documentation](https://github.com/EleutherAI/lm-evaluation-harness). 


--------------------------------------------------------------------------------
/docs/dev_cli/ort_genai_igpu.md:
--------------------------------------------------------------------------------
 1 | # OnnxRuntime GenAI (OGA) for iGPU and CPU
 2 | 
 3 | [onnxruntime-genai (aka OGA)](https://github.com/microsoft/onnxruntime-genai/tree/main?tab=readme-ov-file) is a new framework created by Microsoft for running ONNX LLMs.
 4 | 
 5 | ## Installation
 6 | 
 7 | See [Lemonade Installation](./README.md#installation) for the OGA iGPU backend.
 8 | 
 9 | ## Get models
10 | 
11 | - The oga-load tool can download models from Hugging Face and build ONNX files using OGA's `model_builder`, which can quantize and optimize models for both iGPU and CPU.
12 | - Download and build ONNX model files:
13 |   - `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4`
14 |   - `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device cpu --dtype int4`
15 | - The ONNX model files will be stored in the respective subfolder of the lemonade cache folder and will be reused in future oga-load calls:
16 |   - `oga_models\microsoft_phi-3-mini-4k-instruct\dml-int4`
17 |   - `oga_models\microsoft_phi-3-mini-4k-instruct\cpu-int4`
18 | - The ONNX model build process can be forced to run again, overwriting the above cache, by using the `--force` flag:
19 |   - `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 --force`
20 | - Transformer model architectures supported by the model_builder tool include many popular state-of-the-art models, such as:
21 |   - Gemma
22 |   - LLaMa
23 |   - Mistral
24 |   - Phi
25 |   - Qwen
26 |   - Nemotron
27 | - For the full list of supported models, please see the [model_builder documentation](https://github.com/microsoft/onnxruntime-genai/blob/main/src/python/py/models/README.md).
28 | - The following quantizations are supported for automatically building ONNXRuntime GenAI model files from the Hugging Face repository:
29 |   - `cpu`: `fp32`, `int4`
30 |   - `igpu`: `fp16`, `int4`
31 | 
32 | ## Directory structure:
33 | - The model_builder tool caches Hugging Face files and temporary ONNX external data files in `<LEMONADE CACHE>\model_builder`
34 | - The output from model_builder is stored in `<LEMONADE_CACHE>\oga_models\<MODELNAME>\<SUBFOLDER>`
35 |   - `MODELNAME` is the Hugging Face checkpoint name where any '/' is mapped to an '_' and everything is lower case.
36 |   - `SUBFOLDER` is `<EP>-<DTYPE>`, where `EP` is the execution provider (`dml` for `igpu`, `cpu` for `cpu`, and `npu` for `npu`) and `DTYPE` is the datatype.
37 |   - If the `--int4-block-size` flag is used, then `SUBFOLDER` is `<EP>-<DTYPE>-block-<SIZE>`, where `SIZE` is the specified block size.
38 | - Other ONNX models in the format required by onnxruntime-genai can be loaded by Lemonade if placed in the `<LEMONADE_CACHE>\oga_models` folder.
39 |   - Use the `-i` and `--subfolder` flags to specify the folder and subfolder, for example:
40 |     - `lemonade -i my_model_name --subfolder my_subfolder --device igpu --dtype int4 oga-load`
41 |   - Lemonade will expect the ONNX model files to be located in `<LEMONADE_CACHE>\oga_models\my_model_name\my_subfolder`
42 | 
43 | <!--This file was originally licensed under Apache 2.0. It has been modified.
44 | Modifications Copyright (c) 2025 AMD-->


--------------------------------------------------------------------------------
/docs/dev_cli/perplexity.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # Perplexity Evaluation
 3 | 
 4 | 
 5 | ## Overview
 6 | 
 7 | Perplexity is a measurement of how well a probability model predicts a sample. A lower perplexity indicates the model is more confident in its predictions. In the context of language models, perplexity is derived from the likelihood the model assigns to a token sequence, and is given as:
 8 | 
 9 | `Perplexity (P) = exp(Average Negative Log-Likelihood)`
10 | 
11 | `Where Average Negative Log-Likelihood = (1/N) * Sum[-log p(x_i) from i=1 to N]`
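
As a concrete illustration of these formulas (not taken from the evaluation script itself), perplexity can be computed directly from the per-token probabilities a model assigns to a sample:

```python
import math

# Hypothetical per-token probabilities p(x_i) assigned by a model to a 4-token sample
token_probs = [0.25, 0.10, 0.50, 0.05]

# Average Negative Log-Likelihood = (1/N) * Sum[-log p(x_i)]
avg_nll = sum(-math.log(p) for p in token_probs) / len(token_probs)

# Perplexity (P) = exp(Average Negative Log-Likelihood)
perplexity = math.exp(avg_nll)
print(f"avg NLL = {avg_nll:.3f}, perplexity = {perplexity:.2f}")
```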
12 | 
13 | 
14 | ## Script Functionality
15 | 
16 | ### Key Components
17 | 
18 | - **`max_length`**: The maximum input length the model can handle at once (set by the model's configuration).
19 | - **`stride`**: The step size for the window, set to half of `max_length` to ensure some overlap and preserve context.
20 | - **`seq_len`**: The total length of the tokenized input.
21 | 
22 | ### Detailed Steps
23 | 
24 | 1. **Load Model and Tokenizer**: Receive the model and tokenizer with specified configurations.
25 | 2. **Load and Prepare Data**: Loads the "wikitext-2-raw-v1" dataset and concatenates texts with double newlines. The data is then tokenized.
26 | 3. **Sliding Window Perplexity Calculation**: The script uses a sliding window approach (with a stride of half the window size) to calculate the perplexity for subsets of the data, adjusting for the maximum input length of the model:
27 |     - For each window, input data is processed, and the corresponding labels are adjusted to mask out irrelevant parts (using `-100`).
28 |     - The model computes the logits and loss for each window.
29 |     - Predicted and actual words at the end of each window are logged for analysis.
30 | 4. **Logging to CSV**: Summarizes the context window, predicted and actual next words, and loss for each window into a CSV file for further analysis.
31 | 5. **Perplexity Calculation**: Calculates the total negative log-likelihood adjusted by the effective token count for each window, then computes the average across all tokens to determine the perplexity.
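
A condensed sketch of that kind of sliding-window calculation is shown below. It assumes `model` and `tokenizer` are an already-loaded Hugging Face causal LM and its tokenizer and `text` is the concatenated dataset, and it omits the CSV logging described in step 4:

```python
import math
import torch

encodings = tokenizer(text, return_tensors="pt")
max_length = model.config.max_position_embeddings  # model's maximum input length
stride = max_length // 2                           # half-window overlap
seq_len = encodings.input_ids.size(1)

nll_sum, scored_tokens, prev_end = 0.0, 0, 0
for begin in range(0, seq_len, stride):
    end = min(begin + max_length, seq_len)
    target_len = end - prev_end                    # tokens newly scored in this window
    input_ids = encodings.input_ids[:, begin:end]
    target_ids = input_ids.clone()
    target_ids[:, :-target_len] = -100             # mask the overlap so it is not scored twice

    with torch.no_grad():
        loss = model(input_ids, labels=target_ids).loss  # mean NLL over unmasked tokens

    nll_sum += loss.item() * target_len
    scored_tokens += target_len
    prev_end = end
    if end == seq_len:
        break

perplexity = math.exp(nll_sum / scored_tokens)
```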
32 | 
33 | ### Example Outputs
34 | 
35 | The script outputs a CSV file named `summary_results.csv` with the following columns:
36 | 
37 | - **Context** (partial context displayed for brevity)
38 | - **Predicted next word**
39 | - **Actual next word**
40 | - **Loss for this window**
41 | 
42 | These entries help in understanding how the model is performing at each step of the text.
43 | 
44 | ## How to Interpret Perplexity Results
45 | 
46 | ### Understanding Perplexity
47 | **Definition:** Perplexity is defined as the exponential of the average negative log-likelihood of a model on a given test set.
48 | 
49 | **Lower Values are Better:** A lower perplexity score indicates that the model assigns a higher probability to the sample, suggesting better performance and greater certainty in its predictions.
50 | 
51 | ### Interpretation
52 | 
53 | **High Perplexity:** Indicates confusion or a high level of uncertainty in the model’s predictions. A high perplexity can suggest that the model's language understanding is poor or that the model is not well-tuned for the given data.
54 | 
55 | **Low Perplexity:** Suggests that the model's predictions are more accurate and that it assigns higher probabilities to the actual observed outcomes, indicating a good grasp of the language patterns seen in the test set.
56 | ### Practical Implications
57 | 
58 | **Model Comparison:** Perplexity is particularly useful for comparing different versions of the same model (e.g., before and after quantization, fine-tuning or training on additional data). The model with the lower perplexity is generally considered better at modeling the language of the test corpus.
59 | 
60 | **Model Selection for Applications:** For applications involving language generation (like machine translation, text summarization, or chatbots), selecting a model with lower perplexity might result in more fluent, coherent, and contextually appropriate text output.
61 | 
62 | **Diagnosing Model Fit:** High perplexity could indicate underfitting, where the model is too simple to capture the complexity of the language data. It can also help in diagnosing whether the model is well-suited for the specific domain of the text being modeled.
63 | 
64 | 
65 | ### Caveats in Interpretation
66 | 
67 | **Dependency on Test Set:** Perplexity is highly dependent on the test set used. A model can show very different perplexity scores on different datasets. Therefore, it's important to consider the nature and domain of the test set when evaluating perplexity.
68 | 
69 | **Not a Complete Measure:** While perplexity provides a measure of how uncertain a model is about its predictions, it does not directly measure how coherent or contextually appropriate generated texts are. Other qualitative assessments and metrics might be necessary to fully evaluate a language model's output.
70 | 
71 | **Comparison Across Different Data:** Comparing perplexity scores across models trained or tested on different datasets can be misleading because the intrinsic difficulty of the datasets can affect the perplexity.
72 | 
73 | <!--This file was originally licensed under Apache 2.0. It has been modified.
74 | Modifications Copyright (c) 2025 AMD-->


--------------------------------------------------------------------------------
/docs/dev_cli/quark.md:
--------------------------------------------------------------------------------
 1 | # Quick Guide to Quark Quantization Tools
 2 | 
 3 | ## Introduction
 4 | Quark is AMD's recommended quantization framework for targeting Ryzen AI platforms, supporting both PyTorch and ONNX formats. For Quark-specific information, please visit the [Quark documentation](https://quark.docs.amd.com/latest/). This guide shows how to use Quark tools to quantize a model and then reload the quantized model using Lemonade:
 5 | 
 6 | ## Installation
 7 | 
 8 | 1. Create and activate a conda environment:
 9 |     - `conda create -n quark python=3.10`
10 |     - `conda activate quark`
11 | 2. Install the requirements to set up this environment.
12 | Depending on your use case, you can install for CPU, NPU, or hybrid.
13 |     ```bash
14 |     pip install -e .[dev,oga-cpu] # Can also work with llm-oga-npu or llm-oga-hybrid
15 |     ```
16 | 3. Install Quark using `lemonade-install`:
17 |     ```bash
18 |     # Install the latest external version of quark
19 |     lemonade-install --quark 0.6.0
20 |     ```
21 |     This downloads the .whl files and zip folder from the Quark release page, installs them, and sets up the environment for Quark.
22 | 
23 | ## Usage
24 | ```bash
25 | lemonade -i <model-ckpt> huggingface-load quark-quantize 
26 |     --model-export <export_format> # Export formats [quark_safetensors, onnx, gguf]
27 |     --quant-algo <quantization_algorithm> # Supported algorithms [gptq, awq, autosmoothquant] 
28 |     --quant-scheme <quantization_scheme> # Quant schemes [w_int4, w_uint4, w_int8...] 
29 |     --device <device> # Target device [cpu, cuda] 
30 |     llm-prompt -p "<prompt>"
31 | ```
32 | ## Example Workflows
33 | ### Quantize and Export
34 | 
35 | This command quantizes an `opt-125m` model loaded from Hugging Face, using the AWQ quantization algorithm to generate a W8A8 (8-bit weight, 8-bit activation) quantized model. Running quantization on CPU can be time consuming; this test can take up to 1 hour using 100% of your CPU.
37 | 
38 | ```bash
39 | lemonade -i facebook/opt-125m huggingface-load quark-quantize --quant-algo awq --quant-scheme w_int8_a_int8_per_tensor_sym --model-export quark_safetensors --device cpu
40 | ```
41 | 
42 | #### Load the Quantized Model
43 | This command loads the exported model from a cache folder that corresponds to the quantization recipe used during its export.
44 | ```bash
45 | lemonade -i facebook/opt-125m huggingface-load quark-load --safetensors-model-reload --quant-algo awq --quant-scheme w_int8_a_int8_per_tensor_sym --device cpu llm-prompt -p "Hello world"
46 | ```
47 | 
48 | ### Supported Quantization Schemes
49 | 
50 | The following are the different quantization schemes supported for various models.
51 | For a comprehensive list of datatype support for specific models, refer to the [support matrix](https://quark.docs.amd.com/latest/pytorch/example_quark_torch_llm_ptq.html#id11).
52 | 
53 | - w_uint4_per_group_asym
54 | - w_int4_per_channel_sym
55 | - w_int8_a_int8_per_tensor_sym
56 | - w_int8_per_tensor_sym, and more
57 | 
58 | For more information on the supported quantization schemes, see [Language Model Post Training Quantization (PTQ) Using Quark](https://quark.docs.amd.com/latest/pytorch/example_quark_torch_llm_ptq.html).
59 | 
60 | ### Supported Export Formats
61 | 
62 | Lemonade supports exporting Quark-quantized models in various formats. The following export formats are available:
63 | 
64 | - quark_safetensors
65 | - onnx
66 | - vllm_adopted_safetensors
67 | - gguf
68 | 
69 | ## Known Issues
70 | - There is currently no PyPI installer for Quark. Use `lemonade-install`, as described in the [Installation](#installation) section above, to install Quark.
71 | - Only a limited set of Quark APIs is exposed, so Lemonade relies heavily on the zip folder released by Quark.
72 | - The latest Quark version is hardcoded in `quark_quantize` for download checks.
73 | - Logging output from Quark cannot be fully suppressed. Using `log_severity_level`, you can suppress the quantization logs, but info and warning messages printed when reloading the model cannot be suppressed.
78 | 
79 | <!--This file was originally licensed under Apache 2.0. It has been modified.
80 | Modifications Copyright (c) 2025 AMD-->


--------------------------------------------------------------------------------
/docs/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonade-sdk/lemonade/b86292bf84121d003d40804d68032904901ee1bb/docs/favicon.ico


--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html lang="en">
  3 | <head>
  4 |   <meta charset="UTF-8">
  5 |   <meta name="viewport" content="width=device-width, initial-scale=1.0">  <title>Lemonade Server</title>  
  6 |   <link rel="icon" href="./favicon.ico">
  7 |   <link rel="stylesheet" href="assets/website-styles.css">
  8 | </head>
  9 | <body>  <nav class="navbar" id="navbar">
 10 |     <div class="navbar-context" id="navbar-context">
 11 |       <a class="navbar-back" onclick="showMain(); return false;">← Back</a>
 12 |       <span class="text-muted">Installation Setup</span>
 13 |     </div>
 14 |     <div class="navbar-links">
 15 |       <a href="https://github.com/lemonade-sdk/lemonade">GitHub</a>
 16 |       <a href="docs/">Docs</a>
 17 |       <a href="docs/server/server_models/">Models</a>
 18 |       <a href="docs/server/apps/">Featured Apps</a>
 19 |     </div>
 20 |   </nav>
 21 |   
 22 |   <main class="main">
 23 |     <div class="title">🍋 Lemonade Server</div>
 24 |     <div class="subtitle">
 25 |       Refreshingly fast local LLMs on GPUs and NPUs.<br>
 26 |       Integrate with <a href="https://www.youtube.com/watch?v=PXNTDZREJ_A">Open WebUI</a>, <a href="https://youtu.be/JecpotOZ6qo?si=WxWVQhUBCJQgE6vX">AI Toolkit</a>, or <a href="docs/server/server_integration/">your own app</a> in minutes.
 27 |     </div>
 28 |     <div class="button-row">
 29 |       <a class="download-btn" href="https://github.com/lemonade-sdk/lemonade/releases/latest/download/Lemonade_Server_Installer.exe">
 30 |         Download<br>
 31 |         <span class="download-sub">for Windows 11</span>
 32 |       </a>      <a class="dev-btn" href="#" onclick="showInstaller(); return false;">
 33 |         Developer Setup<br>
 34 |         <span class="dev-sub">For Ubuntu and Windows</span>
 35 |       </a>    </div>
 36 |   </main>
 37 |     <!-- Install Selector View -->
 38 |   <div id="lmn-install-view" class="lmn-install-view">
 39 |     <span class="lmn-install-title">🍋 Installation Setup</span>
 40 |     <div id="lmn-installer" class="lmn-installer-container">
 41 |       <table class="lmn-installer-table">
 42 |         <tr>
 43 |           <td class="lmn-label">Operating System</td>
 44 |           <td id="os-win" class="lmn-active" onclick="lmnSet('os','win')">Windows</td>
 45 |           <td id="os-linux" onclick="lmnSet('os','linux')">Linux</td>
 46 |         </tr>
 47 |         <tr>
 48 |           <td class="lmn-label">Installation Type</td>
 49 |           <td id="type-server" class="lmn-active" onclick="lmnSet('type','server')">Server Only</td>
 50 |           <td id="type-full" onclick="lmnSet('type','full')">Full SDK</td>
 51 |         </tr>
 52 |         <tr>
 53 |           <td class="lmn-label">Installation Method</td>
 54 |           <td id="method-gui" class="lmn-active" onclick="lmnSet('method','gui')">GUI .exe</td>
 55 |           <td id="method-pypi" onclick="lmnSet('method','pypi')">PyPI</td>
 56 |           <td id="method-src" onclick="lmnSet('method','src')">From Source</td>
 57 |         </tr>
 58 |         <tr>
 59 |           <td class="lmn-label">Inference Engine</td>
 60 |           <td id="fw-oga" class="lmn-active" onclick="lmnSet('fw','oga')">OGA</td>
 61 |           <td id="fw-llama" onclick="lmnSet('fw','llama')">llama.cpp</td>
 62 |           <td id="fw-torch" onclick="lmnSet('fw','torch')">PyTorch</td>
 63 |         </tr>
 64 |         <tr>
 65 |           <td class="lmn-label">Device Support</td>
 66 |           <td id="dev-hybrid" class="lmn-active" onclick="lmnSet('dev','hybrid')">Hybrid</td>
 67 |           <td id="dev-cpu" onclick="lmnSet('dev','cpu')">CPU</td>
 68 |           <td id="dev-gpu" onclick="lmnSet('dev','gpu')">GPU</td>
 69 |         </tr>
 70 |       </table>
 71 |       <div class="lmn-content-section">
 72 |         <div class="lmn-section-header">
 73 |           Installation Instructions
 74 |         </div>
 75 |         <div id="lmn-badges" class="lmn-badges"></div>
 76 |         <div id="lmn-install-content">
 77 |           <div id="lmn-download-area" class="lmn-download-section" style="display: none;">
 78 |             <a id="lmn-link" href="https://github.com/lemonade-sdk/lemonade/releases/latest/download/lemonade_server_installer.exe">Download Lemonade Server Installer (.exe)</a>
 79 |           </div>
 80 |           <div id="lmn-command" class="lmn-command"></div>
 81 |         </div>
 82 |       </div>
 83 |       <div id="lmn-explore-section" class="lmn-content-section" style="margin-top: 1.5em;">
 84 |         <div class="lmn-section-header lmn-explore-header">
 85 |           Quick Start
 86 |         </div>
 87 |         <div id="lmn-explore-command" class="lmn-command"></div>
 88 |       </div>
 89 |     </div>
 90 |   </div>
 91 |   
 92 |   <footer class="site-footer">
 93 |     <div class="dad-joke">When life gives you LLMs, make an LLM aide.</div>    <div class="copyright">Copyright 2025 AMD</div>
 94 |   </footer>
 95 |   
 96 |   <script src="assets/install-selector.js"></script>  <script>
 97 |     function showInstaller() {
 98 |       document.querySelector('.main').classList.add('hidden');
 99 |       document.getElementById('lmn-install-view').style.display = 'flex';
100 |       document.getElementById('navbar-context').classList.add('show');
101 |       // Initialize the installer if not already done
102 |       if (typeof lmnInit === 'function') {
103 |         lmnInit();
104 |       }
105 |     }
106 |     
107 |     function showMain() {
108 |       document.querySelector('.main').classList.remove('hidden');
109 |       document.getElementById('lmn-install-view').style.display = 'none';
110 |       document.getElementById('navbar-context').classList.remove('show');
111 |     }
112 |   </script>
113 | </body>
114 | </html>
115 | 


--------------------------------------------------------------------------------
/docs/install_options.html:
--------------------------------------------------------------------------------
 1 | <!-- Lemonade SDK Install Selector -->
 2 | <!DOCTYPE html>
 3 | <html lang="en">
 4 | <head>  <meta charset="UTF-8">
 5 |   <title>Lemonade Install Selector</title>
 6 |   <link rel="icon" href="./favicon.ico">
 7 |   <link rel="stylesheet" href="assets/website-styles.css">
 8 | </head>
 9 | <body class="install-options">  <!-- Breadcrumb Navigation -->
10 |   <div class="breadcrumb">
11 |     <a class="breadcrumb-back" onclick="window.history.back(); return false;">← Documentation</a>
12 |     <span class="breadcrumb-separator">/</span>
13 |     <span>Installation Setup</span>
14 |   </div>
15 |   
16 |   <div class="lmn-center">
17 |     <span class="lmn-install-title">🍋 Installation Setup</span>
18 |     <div id="lmn-installer" class="lmn-installer-container">
19 |       <table class="lmn-installer-table">
20 |         <tr>
21 |           <td class="lmn-label">Operating System</td>
22 |           <td id="os-win" class="lmn-active" onclick="lmnSet('os','win')">Windows</td>
23 |           <td id="os-linux" onclick="lmnSet('os','linux')">Linux</td>
24 |         </tr>
25 |         <tr>
26 |           <td class="lmn-label">Installation Type</td>
27 |           <td id="type-server" class="lmn-active" onclick="lmnSet('type','server')">Server Only</td>
28 |           <td id="type-full" onclick="lmnSet('type','full')">Full SDK</td>
29 |         </tr>
30 |         <tr>
31 |           <td class="lmn-label">Installation Method</td>
32 |           <td id="method-gui" class="lmn-active" onclick="lmnSet('method','gui')">GUI .exe</td>
33 |           <td id="method-pypi" onclick="lmnSet('method','pypi')">PyPI</td>
34 |           <td id="method-src" onclick="lmnSet('method','src')">From Source</td>
35 |         </tr>
36 |         <tr>
37 |           <td class="lmn-label">Inference Engine</td>
38 |           <td id="fw-oga" class="lmn-active" onclick="lmnSet('fw','oga')">OGA</td>
39 |           <td id="fw-llama" onclick="lmnSet('fw','llama')">llama.cpp</td>
40 |           <td id="fw-torch" onclick="lmnSet('fw','torch')">PyTorch</td>
41 |         </tr>
42 |         <tr>
43 |           <td class="lmn-label">Device Support</td>
44 |           <td id="dev-hybrid" class="lmn-active" onclick="lmnSet('dev','hybrid')">Hybrid</td>
45 |           <td id="dev-cpu" onclick="lmnSet('dev','cpu')">CPU</td>
46 |           <td id="dev-gpu" onclick="lmnSet('dev','gpu')">GPU</td>
47 |         </tr>
48 |       </table>
49 |       <div class="lmn-content-section">
50 |         <div class="lmn-section-header">
51 |           Installation Instructions
52 |         </div>
53 |         <div id="lmn-badges" class="lmn-badges"></div>
54 |         <div id="lmn-install-content">
55 |           <div id="lmn-download-area" class="lmn-download-section" style="display: none;">
56 |             <a id="lmn-link" href="https://github.com/lemonade-sdk/lemonade/releases/latest/download/lemonade_server_installer.exe">Download Lemonade Server Installer (.exe)</a>
57 |           </div>
58 |           <div id="lmn-command" class="lmn-command"></div>
59 |         </div>
60 |       </div>
61 |       <div id="lmn-explore-section" class="lmn-content-section" style="margin-top: 1.5em;">
62 |         <div class="lmn-section-header lmn-explore-header">
63 |           Quick Start
64 |         </div>
65 |         <div id="lmn-explore-command" class="lmn-command"></div>
66 |       </div>
67 |     </div>
68 |   </div>
69 |   <script src="assets/install-selector.js"></script>
70 |   <script>
71 |     // Initialize the installer when the page loads
72 |     document.addEventListener('DOMContentLoaded', function() {
73 |       if (typeof lmnInit === 'function') {
74 |         lmnInit();
75 |       }
76 |     });
77 |   </script>
78 | </body>
79 | </html>
80 | 


--------------------------------------------------------------------------------
/docs/publish_website_docs.py:
--------------------------------------------------------------------------------
 1 | # In conda environment of choice, run the following from genai/ folder:
 2 | # pip install -r docs/assets/mkdocs_requirements.txt
 3 | 
 4 | # Then run this script to publish the documentation to docs/docs/
 5 | # python docs/publish_website_docs.py
 6 | 
 7 | # Standard library imports for file, directory, regex, system, and subprocess operations
 8 | import os
 9 | import shutil
10 | import re
11 | import sys
12 | import subprocess
13 | 
14 | 
15 | def main():
16 | 
17 |     # Print the current working directory for debugging
18 |     print("[INFO] Current working directory:", os.getcwd())
19 | 
20 |     # Define source and destination file paths
21 |     src = "docs/server/README.md"
22 |     dst = "docs/index.md"
23 | 
24 |     # Check if the source README exists; exit with error if not
25 |     if not os.path.exists(src):
26 |         print("[ERROR] docs/server/README.md not found!")
27 |         sys.exit(1)
28 | 
29 |     # Read the source README content
30 |     with open(src, "r", encoding="utf-8") as f:
31 |         readme_content = f.read()
32 | 
33 |     # Write the content to the destination index.md
34 |     with open(dst, "w", encoding="utf-8") as f:
35 |         f.write(readme_content)
36 |     print("[INFO] Copied docs/server/README.md to docs/index.md.")
37 | 
38 |     # Read the just-written index.md and perform additional link fixes for website publishing
39 |     print("[INFO] Fixing links in docs/index.md...")
40 |     with open(dst, "r", encoding="utf-8") as f:
41 |         content = f.read()
42 | 
43 |     # List of (pattern, replacement) tuples for fixing internal documentation links
44 |     replacements = [
45 |         (r"\(\./apps/README\.md\)", r"(./server/apps/README.md)"),
46 |         (r"\(\./concepts\.md\)", r"(./server/concepts.md)"),
47 |         (r"\(\./lemonade-server-cli\.md\)", r"(./server/lemonade-server-cli.md)"),
48 |         (r"\(\./server_models\.md\)", r"(./server/server_models.md)"),
49 |         (r"\(\./server_spec\.md\)", r"(./server/server_spec.md)"),
50 |         (r"\(\./server_integration\.md\)", r"(./server/server_integration.md)"),
51 |     ]
52 |     for pattern, repl in replacements:
53 |         content = re.sub(pattern, repl, content)
54 | 
55 |     # Write the fully processed content back to index.md
56 |     with open(dst, "w", encoding="utf-8") as f:
57 |         f.write(content)
58 | 
59 |     # Remove existing docs/docs if it exists
60 |     if os.path.exists("docs/docs"):
61 |         print("Removing ", os.path.abspath("docs/docs"))
62 |         shutil.rmtree("docs/docs")
63 | 
64 |     # Build the documentation using mkdocs
65 |     print("[INFO] Building documentation with mkdocs...")
66 |     subprocess.run(["mkdocs", "build", "--clean"], check=True)
67 | 
68 |     # Move the generated site/ directory to docs/docs/, replacing it if it already exists
69 |     print("[INFO] Moving site/ to docs/docs/...")
70 | 
71 |     # Check what mkdocs actually generated
72 |     if os.path.exists(os.path.abspath("site/docs")):
73 |         # If mkdocs generated site/docs/, move that content
74 |         source_dir = os.path.abspath("site/docs")
75 |     elif os.path.exists(os.path.abspath("site")):
76 |         # If mkdocs generated site/, move that content
77 |         source_dir = os.path.abspath("site")
78 |     else:
79 |         print("[ERROR] No site directory found after mkdocs build!")
80 |         sys.exit(1)
81 | 
82 |     # Move the correct source directory
83 |     shutil.move(source_dir, "docs/docs")
84 |     print(f"[INFO] Moved {os.path.abspath(source_dir)} to docs/docs/")
85 | 
86 | 
87 | if __name__ == "__main__":
88 |     main()
89 | 


--------------------------------------------------------------------------------
/docs/server/README.md:
--------------------------------------------------------------------------------
 1 | # Getting Started with Lemonade Server
 2 | 
 3 | 🍋 Lemonade Server is a server interface that uses the standard OpenAI API, allowing applications to integrate with local LLMs. This means that you can easily replace cloud-based LLMs with private and free LLMs that run locally on your own PC's NPU and GPU.
 4 | 
 5 | Lemonade Server is available as a standalone tool with a [one-click Windows GUI installer](https://github.com/lemonade-sdk/lemonade/releases/latest/download/Lemonade_Server_Installer.exe).
 6 | 
 7 | Once you've installed, we recommend checking out these resources:
 8 | 
 9 | | Documentation | Description |
10 | |---------------|-------------|
11 | | [Supported Applications](./apps/README.md) | Explore applications that work out-of-the-box with Lemonade Server. |
12 | | [Lemonade Server Concepts](./concepts.md) | Background knowledge about local LLM servers and the OpenAI standard. |
13 | | [`lemonade-server` CLI Guide](./lemonade-server-cli.md) | Learn how to manage the server process and install new models using the command-line interface. |
14 | | [Models List](./server_models.md) | Browse a curated set of LLMs available for serving. |
15 | | [Server Spec](./server_spec.md) | Review all supported OpenAI-compatible and Lemonade-specific API endpoints. |
16 | | [Integration Guide](./server_integration.md) | Step-by-step instructions for integrating Lemonade Server into your own applications. |
17 | 
18 | > Note: if you want to develop Lemonade Server itself, you can [install from source](https://lemonade-server.ai/install_options.html).
19 | 
20 | ## Integrate Lemonade Server with Your Application
21 | 
22 | Since Lemonade Server implements the standard OpenAI API specification, you can use any OpenAI-compatible client library by configuring it to use `http://localhost:8000/api/v1` as the base URL. A table of official and popular OpenAI client libraries in different languages is shown below.
23 | 
24 | Feel free to pick and choose your preferred language.
25 | 
26 | 
27 | | Python | C++ | Java | C# | Node.js | Go | Ruby | Rust | PHP |
28 | |--------|-----|------|----|---------|----|-------|------|-----|
29 | | [openai-python](https://github.com/openai/openai-python) | [openai-cpp](https://github.com/olrea/openai-cpp) | [openai-java](https://github.com/openai/openai-java) | [openai-dotnet](https://github.com/openai/openai-dotnet) | [openai-node](https://github.com/openai/openai-node) | [go-openai](https://github.com/sashabaranov/go-openai) | [ruby-openai](https://github.com/alexrudall/ruby-openai) | [async-openai](https://github.com/64bit/async-openai) | [openai-php](https://github.com/openai-php/client) |
30 | 
31 | 
32 | ### Python Client Example
33 | ```python
34 | from openai import OpenAI
35 | 
36 | # Initialize the client to use Lemonade Server
37 | client = OpenAI(
38 |     base_url="http://localhost:8000/api/v1",
39 |     api_key="lemonade"  # required but unused
40 | )
41 | 
42 | # Create a chat completion
43 | completion = client.chat.completions.create(
44 |     model="Llama-3.2-1B-Instruct-Hybrid",  # or any other available model
45 |     messages=[
46 |         {"role": "user", "content": "What is the capital of France?"}
47 |     ]
48 | )
49 | 
50 | # Print the response
51 | print(completion.choices[0].message.content)
52 | ```
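
If your application streams tokens as they are generated, the same client configuration can be used with the standard OpenAI streaming flag. The following is a minimal sketch, assuming the server supports `stream=True` for chat completions; the model name and prompt are examples.

```python
from openai import OpenAI

# Same Lemonade Server configuration as above
client = OpenAI(
    base_url="http://localhost:8000/api/v1",
    api_key="lemonade"  # required but unused
)

# Request a streamed chat completion
stream = client.chat.completions.create(
    model="Llama-3.2-1B-Instruct-Hybrid",  # or any other available model
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    stream=True,
)

for chunk in stream:
    # Each chunk carries an incremental piece of the assistant's reply
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()
```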
53 | 
54 | For more detailed integration instructions, see the [Integration Guide](./server_integration.md).
55 | 
56 | 
57 | <!--Copyright (c) 2025 AMD-->


--------------------------------------------------------------------------------
/docs/server/apps/README.md:
--------------------------------------------------------------------------------
 1 | # Lemonade Server Examples
 2 | 
 3 | Many applications today utilize OpenAI models like ChatGPT through APIs such as:
 4 | 
 5 | `POST https://api.openai.com/v1/chat/completions`
 6 | 
 7 | This API call triggers the ChatGPT model to generate responses for a chat. With Lemonade Server, we are replacing the OpenAI endpoint with a local LLM. The new API call becomes:
 8 | 
 9 | `POST http://localhost:8000/api/v1/chat/completions`
10 | 
11 | This allows the same application to leverage local LLMs instead of relying on OpenAI's cloud-based models. The guides in this folder show how to connect Lemonade Server to popular applications to enable local LLM execution. To run these examples, you'll need a Windows PC.
12 | 
13 | ## 🎥 Video Tutorials
14 | 
15 | <div id="yt-carousel" data-videos='[
16 |   {"id": "PXNTDZREJ_A", "title": "Open WebUI Demo"},
17 |   {"id": "JecpotOZ6qo", "title": "Microsoft AI Toolkit Demo"},
18 |   {"id": "bP_MZnDpbUc", "title": "Continue Coding Assistant"},
19 |   {"id": "_PORHv_-atI", "title": "GAIA"}
20 | ]'></div>
21 | 
22 | <div class="hide-in-mkdocs">
23 | 
24 | Links to the available video tutorials are provided in the third column of the following table.
25 | 
26 | </div>
27 | 
28 | | App                 | Guide                                                                                               | Video                                                                                     |
29 | |---------------------|-----------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------|
30 | | [Open WebUI](https://github.com/open-webui/open-webui)         | [How to chat with Lemonade LLMs in Open WebUI](./open-webui.md)   | [Watch Demo](https://www.youtube.com/watch?v=PXNTDZREJ_A)                                 |
31 | | [Continue.dev](https://www.continue.dev/)   | [How to use Lemonade LLMs as a coding assistant in Continue](./continue.md)                                          | [Watch Demo](https://youtu.be/bP_MZnDpbUc?si=hRhLbLEV6V_OGlUt)                            |
32 | | [Microsoft AI Toolkit](https://learn.microsoft.com/en-us/windows/ai/toolkit/)   | [Experimenting with Lemonade LLMs in VS Code using Microsoft's AI Toolkit](./ai-toolkit.md)                                          | [Watch Demo](https://youtu.be/JecpotOZ6qo?si=WxWVQhUBCJQgE6vX)                            |
33 | | [GAIA](https://github.com/amd/gaia)   | [An application for running LLMs locally, includes a ChatBot, YouTube Agent, and more](https://github.com/amd/gaia?tab=readme-ov-file#getting-started-guide) | [Watch Demo](https://youtu.be/_PORHv_-atI?si=EYQjmrRQ6Zy2H0ek)                            |
34 | | [Microsoft AI Dev Gallery](https://aka.ms/ai-dev-gallery) | [Microsoft's showcase application for exploring AI capabilities](./ai-dev-gallery.md) | _coming soon_                                                                             |
35 | | [CodeGPT](https://codegpt.co/)   | [How to use Lemonade LLMs as a coding assistant in CodeGPT](./codeGPT.md)                                          | _coming soon_                                                                             |
36 | | [MindCraft](https://github.com/kolbytn/mindcraft) | [How to use Lemonade LLMs as a Minecraft agent](./mindcraft.md) | _coming soon_                                                                             |
37 | | [wut](https://github.com/shobrook/wut)   | [Terminal assistant that uses Lemonade LLMs to explain errors](./wut.md)                                          | _coming soon_                                                                             |
38 | | [AnythingLLM](https://anythingllm.com/) | [Running agents locally with Lemonade and AnythingLLM](./anythingLLM.md) | _coming soon_                                                                             |
39 | | [lm-eval](https://github.com/EleutherAI/lm-evaluation-harness)   | [A unified framework to test generative language models on a large number of different evaluation tasks.](./lm-eval.md)              | _coming soon_                                                                             |
40 | | [PEEL](https://github.com/lemonade-apps/peel)     | [Using Local LLMs in Windows PowerShell](https://github.com/lemonade-apps/peel?tab=readme-ov-file#installation)                   | _coming soon_                                                                             |
41 | 
42 | ## 📦 Looking for Installation Help?
43 | 
44 | To set up Lemonade Server, check out the [Lemonade Server guide](../README.md) for installation instructions and the [server spec](../server_spec.md) to learn more about the functionality. For more information about 🍋 Lemonade SDK, see the [Lemonade SDK README](../README.md).
45 | 
46 | ## 🛠️ Support
47 | 
48 | If you encounter any issues or have questions, feel free to:
49 | 
50 | - File an issue on our [GitHub Issues page](https://github.com/lemonade-sdk/lemonade/issues).
51 | - Email us at [lemonade@amd.com](mailto:lemonade@amd.com).
52 | 
53 | ## 💡 Want to Add an Example?
54 | 
55 | If you've connected Lemonade to a new application, feel free to contribute a guide by following our contribution guide found [here](../../contribute.md) or let us know at [lemonade@amd.com](mailto:lemonade@amd.com).
56 | 
57 | <!--This file was originally licensed under Apache 2.0. It has been modified.
58 | Modifications Copyright (c) 2025 AMD-->
59 | 


--------------------------------------------------------------------------------
/docs/server/apps/ai-dev-gallery.md:
--------------------------------------------------------------------------------
 1 | # AI Dev Gallery with Lemonade Server
 2 | 
 3 | ## Overview
 4 | 
 5 | [AI Dev Gallery](https://aka.ms/ai-dev-gallery) is Microsoft's showcase application that demonstrates various AI capabilities through built-in samples and applications. It provides an easy way to explore and experiment with different AI models and scenarios, including text generation, chat applications, and more.
 6 | 
 7 | AI Dev Gallery has native integration with Lemonade Server, which means it can automatically detect and connect to your local Lemonade instance without manual URL configuration.
 8 | 
 9 | ## Expectations
10 | 
11 | AI Dev Gallery works well with most models available in Lemonade. The built-in samples are designed to work with various model types and sizes, making it a great tool for testing and exploring different AI capabilities locally.
12 | 
13 | The application provides a user-friendly interface for experimenting with AI models through pre-built scenarios, making it accessible for both beginners and advanced users.
14 | 
15 | ## Setup
16 | 
17 | ### Prerequisites
18 | 
19 | 1. Install Lemonade Server by following the [Lemonade Server Instructions](../README.md) and using the installer .exe.
20 | 2. **Important**: Make sure your Lemonade Server is running before opening AI Dev Gallery.
21 | 
22 | ### Install AI Dev Gallery
23 | 
24 | 1. Open the Microsoft Store on Windows.
25 | 2. Search for "AI Dev Gallery" by Microsoft Corporation.
26 | 3. Click "Install" to download and install the application.
27 | 
28 | Alternatively, you can access AI Dev Gallery directly through [aka.ms/ai-dev-gallery](https://aka.ms/ai-dev-gallery).
29 | 
30 | ### Connect to Lemonade
31 | 
32 | AI Dev Gallery has native integration with Lemonade Server, so no manual configuration is required. The application will automatically detect your running Lemonade Server instance.
33 | 
34 | **Important**: Ensure your Lemonade Server is running before launching AI Dev Gallery.
35 | 
36 | ## Usage
37 | 
38 | AI Dev Gallery provides various built-in applications and samples to explore AI capabilities:
39 | 
40 | ### Quick Start
41 | 
42 | 1. Launch AI Dev Gallery.
43 | 2. Navigate to **Samples** → **Text** → **Chat** (or another text/code sample).
44 | 3. Click on the model selector above the chat window.
45 | 4. Select **Lemonade** from the available providers.
46 | 5. Choose your preferred model from the list of available models.
47 | 
48 | ### Supported Scenarios
49 | 
50 | AI Dev Gallery supports various AI scenarios through its sample applications with Lemonade integration:
51 | 
52 | **Text Processing**:
53 | 
54 | - **Conversational AI**: Chat and Semantic Kernel Chat for interactive conversations
55 | - **Content Generation**: Generate text for various purposes and creative writing
56 | - **Language Tasks**: Translation, grammar checking, and paraphrasing
57 | - **Text Analysis**: Sentiment analysis and content moderation
58 | - **Information Retrieval**: Semantic search and retrieval augmented generation
59 | - **Text Enhancement**: Summarization and custom parameter configurations
60 | 
61 | **Code Assistance**:
62 | 
63 | - **Code Generation**: Create code snippets and programs
64 | - **Code Analysis**: Explain existing code and understand functionality
65 | 
66 | 
67 | ### Tips for Best Experience
68 | 
69 | - Start your Lemonade Server before opening AI Dev Gallery
70 | - Try different models to see how they perform across various scenarios
71 | - Explore different sample categories to understand various AI capabilities
72 | - Use the built-in samples as starting points for your own AI experiments
73 | 
74 | ## Troubleshooting
75 | 
76 | ### AI Dev Gallery doesn't detect Lemonade
77 | 
78 | - Ensure Lemonade Server is running and accessible at `http://localhost:8000`
79 | - Restart AI Dev Gallery after ensuring Lemonade Server is running
80 | 
81 | ### Models not appearing in the selector
82 | 
83 | - Open `http://localhost:8000` in a browser and make sure to download the models you want to use through the "Model Manager" tab.
84 | 
85 | ## Additional Resources
86 | 
87 | - [AI Dev Gallery Website](https://aka.ms/ai-dev-gallery)
88 | - [Lemonade Server Models](../server_models.md)
89 | 
90 | <!--This file was originally licensed under Apache 2.0. It has been modified.
91 | Modifications Copyright (c) 2025 AMD-->
92 | 


--------------------------------------------------------------------------------
/docs/server/apps/ai-toolkit.md:
--------------------------------------------------------------------------------
 1 | # Microsoft AI Toolkit for VS Code
 2 | 
 3 | ## Overview
 4 | 
 5 | The [AI Toolkit for Visual Studio Code](https://learn.microsoft.com/en-us/windows/ai/toolkit/) is a VS Code extension that simplifies generative AI app development by bringing together cutting-edge AI development tools and models from various catalogs. It supports running AI models locally or connecting to remote models via API keys.
 6 | 
 7 | ## Demo Video
 8 | 
 9 | ▶️ [Watch on YouTube](https://www.youtube.com/watch?v=JecpotOZ6qo)
10 | 
11 | <iframe width="560" height="315" src="https://www.youtube.com/embed/JecpotOZ6qo?si=9YcWwVEx7UX5A812" 
12 | title="YouTube video player" frameborder="0" allowfullscreen></iframe>
13 | 
14 | ## Expectations
15 | 
16 | We have found that most LLMs work well with this application. 
17 | 
18 | However, the `Inference Parameters` option is not fully supported, as Lemonade Server currently does not accept those as inputs (see [server_spec.md](../server_spec.md) for details).
19 | 
20 | 
21 | ## Setup
22 | 
23 | ### Prerequisites
24 | 
25 | 1. Install Lemonade Server by following the [Lemonade Server Instructions](../README.md) and using the installer .exe.
26 | 
27 | ### Install AI Toolkit for VS Code
28 | 
29 | 1. Open the Extensions tab in VS Code Activity Bar.
30 | 2. Search for "AI Toolkit for Visual Studio Code" in the Extensions Marketplace search bar.
31 | 3. Select the AI Toolkit extension and click install.
32 | 
33 | This will add an AI Toolkit icon to your VS Code Activity Bar.
34 | 
35 | ### Connect Lemonade to AI Toolkit
36 | 
37 | The AI Toolkit now supports "Bring Your Own Model" functionality, allowing you to connect to models served via the OpenAI API standard, which Lemonade uses.
38 | 
39 | 1. Open the AI Toolkit tab in your VS Code Activity Bar.
40 | 2. In the right corner of the "My Models" section, click the "+" button to "Add model for remote inference".
41 | 3. Select "Add a custom model".
42 | 4. When prompted to "Enter OpenAI chat completion endpoint URL" enter:
43 |     ```
44 |     http://localhost:8000/api/v1/chat/completions
45 |     ```
46 | 5. When prompted to "Enter the exact model name as in the API" select a model (e.g., `Phi-3-Mini-Instruct-Hybrid`)
47 |     - Note: You can get a list of all models available [here](../server_models.md).
48 | 6. When prompted for the display model name, enter the same name.
49 | 7. Skip the HTTP authentication step by pressing "Enter".
50 | 
51 | ## Usage
52 | 
53 | Once you've set up the Lemonade model in AI Toolkit, you can:
54 | 
55 | 1. Use the **AI Playground** tool to directly interact with your added model.
56 | 2. Use the **Prompt Builder** tool to craft effective prompts for your AI models.
57 | 3. Use the **Bulk Run** tool to compute responses for custom datasets and easily visualize those responses on a table format.
58 | 4. Use the **Evaluation** tool to quickly assess your model's coherence, fluency, relevance, and similarity, as well as to compute BLEU, F1, GLEU, and Meteor scores.
59 | 
60 | ## Additional Resources
61 | 
62 | - [AI Toolkit for VS Code Documentation](https://learn.microsoft.com/en-us/windows/ai/toolkit/)
63 | - [AI Toolkit GitHub Repository](https://github.com/microsoft/vscode-ai-toolkit)
64 | - [Bring Your Own Models on AI Toolkit](https://techcommunity.microsoft.com/blog/azuredevcommunityblog/bring-your-own-models-on-ai-toolkit---using-ollama-and-api-keys/4369411)
65 | 
66 | <!--This file was originally licensed under Apache 2.0. It has been modified.
67 | Modifications Copyright (c) 2025 AMD-->


--------------------------------------------------------------------------------
/docs/server/apps/anythingLLM.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # Running agents locally with Lemonade and AnythingLLM
 3 | 
 4 | ## Overview
 5 | 
 6 | [AnythingLLM](https://github.com/Mintplex-Labs/anything-llm) is a versatile local LLM platform that allows you to chat with your documents and code using a variety of models. It supports the OpenAI-compatible API interface, allowing easy integration with local servers like Lemonade.
 7 | 
 8 | This guide will help you configure AnythingLLM to use Lemonade's OpenAI-compatible server, and utilize the powerful `@agent` capability to interact with documents, webpages, and more.
 9 | 
10 | ## Expectations
11 | 
12 | Lemonade integrates best with AnythingLLM when using models such as `Qwen-1.5-7B-Chat-Hybrid` and `Llama-3.2-1B-Instruct-Hybrid`, both of which support a context length of up to 3,000 tokens.
13 | 
14 | Keep in mind that when using the `@agent` feature, multi-turn conversations can quickly consume available context. As a result, the number of back-and-forth turns in a single conversation may be limited due to the growing context size.
15 | 
16 | 
17 | ## Setup
18 | 
19 | ### Prerequisites
20 | 
21 | 1. Install Lemonade Server by following the [Lemonade Server Instructions](../README.md) and using the installer .exe.
22 | 2. Install and set up AnythingLLM from their [GitHub](https://github.com/Mintplex-Labs/anything-llm#quick-start) or [website](https://anythingllm.com/desktop).
23 | 
24 | 
25 | ### Configure AnythingLLM to Use Lemonade
26 | 
27 | <ol>
28 |   <li>In the bottom of the left menu, click on the wrench icon to "Open Settings".</li>
29 |   <li>Under the menu "AI Providers", click "LLM".</li>
30 |   <li>
31 |     Select "Generic OpenAI" and enter the following info:
32 |     <table>
33 |       <tr><th>Setting</th><th>Value</th></tr>
34 |       <tr><td><b>Base URL</b></td><td><code>http://localhost:8000/api/v1</code></td></tr>
35 |       <tr><td><b>API Key</b></td><td><code>-</code></td></tr>
36 |       <tr><td><b>Chat Model Name</b></td><td><code>Qwen-1.5-7B-Chat-Hybrid</code></td></tr>
37 |       <tr><td><b>Token context window</b></td><td><code>3000</code></td></tr>
38 |       <tr><td><b>Max Tokens</b></td><td><code>3000</code></td></tr>
39 |     </table>
40 |   </li>
41 |   <li>In the bottom left, click the back button to exit.</li>
42 |   <li>In the left menu, click "New Workspace" and give it a name.</li>
43 |   <li>Where you see your new workspace, click the gear icon to open the "Workspace Settings"</li>
44 |   <li>In the top menu of the window that opens, click on "Agent Configuration"</li>
45 |   <li>Under Chat Settings, select Generic OpenAI and click save.</li>
46 |   <li>Under Workspace Agent LLM Provider, select "Generic OpenAI" and click save.</li>
47 | </ol>
48 | 
49 | ## Usage with @agent
50 | 
51 | ### Overview
52 | 
53 | Agents are capable of scraping websites, listing and summarizing documents, searching the web, creating charts, and even saving files to your desktop or their own memory.
54 | 
55 | To start an agent session, simply go to any workspace and type `@agent <your prompt>`. To exit the session, just type `exit`.
56 | 
57 | ### Agent Skills
58 | 
59 | You may turn on and off specific `Agent Skills` by going to your `Workspace Settings` → `Agent Configuration` → `Configure Agent Skills`.
60 | 
61 | Available agent skills include:
62 | 
63 | * RAG & long-term memory
64 | * View and summarize documents
65 | * Scrape Websites
66 | * Generate & save files to browser
67 | * Generate Charts
68 | * Web Search
69 | * SQL Connector
70 | 
71 | ### Examples
72 | 
73 | Here are some examples of how you can interact with AnythingLLM agents:
74 | 
75 | - **RAG & long-term memory**
76 |     - `@agent My name is Dr Lemon. Remember this in our next conversation`
77 |     - Then, on a follow up chat you can ask `@agent What is my name according to your memory?`
78 | - **Scrape Websites**
79 |     - `@agent Scrape this website and tell me what are the two ways of installing lemonade https://github.com/lemonade-sdk/lemonade/blob/main/docs/server/README.md`
80 | - **Web Search** (enable skill before trying)
81 |     - `@agent Search the web for the best place to buy shoes`
82 | 
83 | You can find more details about agent usage [here](https://docs.anythingllm.com/agent/usage).
84 | 
85 | ## Additional Resources
86 | 
87 | - [AnythingLLM Website](https://anythingllm.com/)
88 | - [AnythingLLM GitHub](https://github.com/Mintplex-Labs/anything-llm)
89 | - [AnythingLLM Documentation](https://docs.anythingllm.com/)
90 | 
91 | <!--This file was originally licensed under Apache 2.0. It has been modified.
92 | Modifications Copyright (c) 2025 AMD-->


--------------------------------------------------------------------------------
/docs/server/apps/codeGPT.md:
--------------------------------------------------------------------------------
 1 | # CodeGPT with VS Code
 2 | 
 3 | ## Overview
 4 | 
 5 | [CodeGPT Chat](https://codegpt.co/) is an AI-powered chatbot designed to assist developers with coding tasks directly within their preferred integrated development environments (IDEs), for example, VS Code.
 6 | 
 7 | ## Expectations
 8 | 
 9 | We have found that the `Qwen-1.5-7B-Chat-Hybrid` model is the best Hybrid model available for coding. It is good at chatting with a few files at a time in your codebase to learn more about them. It can also make simple code editing suggestions pertaining to a few lines of code at a time.
10 | 
11 | However, we do not recommend using this model for analyzing large codebases at once or making large or complex file edits.
12 | 
13 | ## Setup
14 | 
15 | ### Prerequisites
16 | 
17 | 1. Install Lemonade Server by following the [Lemonade Server Instructions](../README.md) and using the installer .exe.
18 | 
19 | ### Install CodeGPT in VS Code
20 | 
21 | > The following instructions are based off CodeGPT provided instructions found [here](https://docs.codegpt.co/docs/tutorial-basics/installation).
22 | 
23 | 1. Open the Extensions tab in VS Code Activity Bar.
24 | 1. Search "CodeGPT: Chat & AI Agents" in the Extensions Marketplace search bar.
25 | 1. Select the CodeGPT extension and click install.
26 | 
27 | This will add a CodeGPT tab to your VS Code Activity Bar.
28 | 
29 | ### Add Lemonade Server to CodeGPT
30 | 
31 | > Note: The following instructions are based on instructions from CodeGPT found [here](https://docs.codegpt.co/docs/tutorial-ai-providers/custom).
32 | 
33 | 
34 | <ol>
35 |   <li>Open the CodeGPT tab in your VS Code Activity Bar.</li>
36 |   <li>Sign Up or Sign into your account.</li>
37 |   <li>In the model dropdown menu, click "View More".</li>
38 |   <li>Select the tab: "LLMs Cloud model"</li>
39 |   <li>Under "All Models", set the following:
40 |   <table>
41 |    <tr><th>Field</th><th>Value</th></tr>
42 |       <tr><td><b>Select Provider:</b></td><td><code>Custom</code></td></tr>
43 |       <tr><td><b>Select Model: </b></td><td><code>Qwen-1.5-7B-Chat-Hybrid</code></td></tr>
44 |    </table>
45 |   </li>
46 |   <li>Click "Change connection settings" and enter the following information:
47 |     <table>
48 |       <tr><th>Field</th><th>Value</th></tr>
49 |       <tr><td><b>API Key</b></td><td><code>-</code></td></tr>
50 |       <tr><td><b>Custom Link</b></td><td><code>http://localhost:8000/api/v1</code></td></tr>
51 |     </table>
52 |   </li>
53 | </ol>
54 | 
55 | ## Usage
56 | 
57 | > Note: see the CodeGPT [user guide](https://docs.codegpt.co/docs/intro) to learn about all of their features.
58 | 
59 | To try out CodeGPT:
60 | 
61 | - Open the CodeGPT tab in your VS Code Activity Bar, and in the chat box, type a question about your code. Use the `#` symbol to specify a file.
62 |   - Example: "What's the fastest way to install lemonade in #getting_started.md?"
63 | - Use /Fix to find and fix a minor bug.
64 | - Use /Document to come up with docstrings and comments for a file.
65 | - Use /UnitTest to make a test file.
66 | 
67 | <!--This file was originally licensed under Apache 2.0. It has been modified.
68 | Modifications Copyright (c) 2025 AMD-->


--------------------------------------------------------------------------------
/docs/server/apps/continue.md:
--------------------------------------------------------------------------------
  1 | # Continue Coding Assistant
  2 | 
  3 | ## Overview
  4 | 
  5 | [Continue](https://www.continue.dev/) is a coding assistant that lives inside of a VS Code extension. It supports chatting with your codebase, making edits, and a lot more.
  6 | 
  7 | ## Demo Video
  8 | 
  9 | ▶️ [Watch on YouTube](https://www.youtube.com/watch?v=bP_MZnDpbUc&source_ve_path=MjM4NTE)
 10 | 
 11 | <iframe width="560" height="315" src="https://www.youtube.com/embed/bP_MZnDpbUc?si=0KZLzQzFlRvW9J9f" 
 12 | title="YouTube video player" frameborder="0" allowfullscreen></iframe>
 13 | 
 14 | ## Expectations
 15 | 
 16 | We have found that the `Qwen-1.5-7B-Chat-Hybrid` model is the best Hybrid model available for coding. It is good at chatting with a few files at a time in your codebase to learn more about them. It can also make simple code editing suggestions pertaining to a few lines of code at a time.
 17 | 
 18 | However, we do not recommend using this model for analyzing large codebases at once or making large or complex file edits.
 19 | 
 20 | ## Setup
 21 | 
 22 | ### Prerequisites
 23 | 
 24 | 1. Install Lemonade Server by following the [Lemonade Server Instructions](../README.md) and using the installer .exe.
 25 | 
 26 | ### Install Continue
 27 | 
 28 | > Note: they provide their own instructions [here](https://marketplace.visualstudio.com/items?itemName=Continue.continue)
 29 | 
 30 | 1. Open the Extensions tab in VS Code Activity Bar.
 31 | 1. Search "Continue - Codestral, Claude, and more" in the Extensions Marketplace search bar.
 32 | 1. Select the Continue extension and click install.
 33 | 
 34 | This will add a Continue tab to your VS Code Activity Bar.
 35 | 
 36 | ### Add Lemonade Server to Continue
 37 | 
 38 | > Note: The following instructions are based on instructions from Continue found [here](https://docs.continue.dev/customize/model-providers/openai#openai-compatible-servers--apis) 
 39 | 
 40 | 1. Open the Continue tab in your VS Code Activity Bar.
 41 | 1. Click the chat box. Some buttons will appear at the bottom of the box, including `Select model`.
 42 | 1. Click `Select model`, then `+ Add Chat model` to open the new model dialog box.
 43 | 1. Click the `config file` link at the very bottom of the dialog to open `config.yaml`.
 44 | 1. Replace the "models" key in the `config.yaml` with the following and save:
 45 | 
 46 | ```yaml
 47 | models:
 48 |   - name: Lemonade
 49 |     provider: openai
 50 |     model: Qwen-1.5-7B-Chat-Hybrid 
 51 |     apiBase: http://localhost:8000/api/v1
 52 |     apiKey: none
 53 | ```
 54 | 
 55 | 6. Close the dialog box.
 56 | 7. Click the chat box again. You should see `Lemonade` where you used to see `Select model`. Ready!
 57 | 
 58 | ## Usage
 59 | 
 60 | > Note: see the Continue [user guide](https://docs.continue.dev/) to learn about all of their features.
 61 | 
 62 | Here are some examples for trying out Continue. These examples assume you have cloned this repo and allowed Continue to index it.
 63 | 
 64 | ### Chat with Files
 65 | 
 66 | Open the Continue tab in your VS Code Activity Bar, and in the "Ask anything" box, type a question about your code. Use the `@` symbol to specify a file or tool.
 67 | 
 68 |   - "What's the fastest way to install Lemonade in `@getting_started.md?`"
 69 |   - "According to `@README.md` what do I need to do to set up for `@api_oga_hybrid_streaming.py`?"
 70 | 
 71 | ### Editing Files
 72 | 
 73 | Open a file, select some code, and push Ctrl+I to start a chat about editing that code.
 74 | 
75 |   1. Open `examples/api_basic.py`.
 76 |   1. Select the `print(...` line at the bottom and press `ctrl+i`.
 77 |   1. Write "Add a helpful comment" in the chat box and press enter.
 78 |   1. Press "accept" if you would like to accept the change.
 79 | 
 80 | ### Making Files
 81 | 
 82 | Start a new chat and prompt: 
 83 | 
 84 | > write a script in the style of `@api_basic.py` that uses the microsoft/Phi-4-mini-instruct model on GPU
 85 | 
 86 | Here's what we got:
 87 | 
 88 | ```python
 89 | # Import necessary modules
 90 | from lemonade.api import from_pretrained
 91 | 
 92 | # Load the Phi-4-mini-instruct model with the hf-cpu recipe
 93 | model, tokenizer = from_pretrained("microsoft/Phi-4-mini-instruct", recipe="hf-cpu")
 94 | 
 95 | # Define your prompt
 96 | prompt = "This is a sample prompt for the Phi-4-mini-instruct model"
 97 | 
 98 | # Tokenize the prompt
 99 | input_ids = tokenizer(prompt, return_tensors="pt")
100 | 
101 | # Generate the response using the model
102 | response = model.generate(input_ids, max_new_tokens=100)  # Adjust the max_new_tokens as needed
103 | 
104 | # Decode the generated response
105 | generated_text = tokenizer.decode(response[0])
106 | 
107 | # Print the response
108 | print(generated_text)
109 | ```
110 | 
111 | <!--This file was originally licensed under Apache 2.0. It has been modified.
112 | Modifications Copyright (c) 2025 AMD-->


--------------------------------------------------------------------------------
/docs/server/apps/wut.md:
--------------------------------------------------------------------------------
  1 | # `wut` Terminal Assistant
  2 | 
  3 | ## Overview
  4 | 
  5 | The [`wut` terminal assistant](https://github.com/shobrook/wut) uses LLMs to parse your terminal's scrollback, helping you troubleshoot your last command.
  6 | 
  7 | ## Expectations
  8 | 
  9 | We found that `wut` works nicely with the `Llama-3.2-3B-Instruct-Hybrid` model.
 10 | 
11 | It is not especially convenient to use `wut` on Windows until the developers remove the requirement for `tmux`; however, we do provide instructions for getting set up on Windows in this guide.
 12 | 
 13 | `wut` seems to send the entire terminal scrollback to the LLM, which can produce very long prompts that exceed the LLM's context length. We recommend restricting the terminal scrollback or using a fresh `tmux` session when trying this out.
 14 | 
 15 | ## Setup
 16 | 
 17 | ### Prerequisites
 18 | 
 19 | #### Install Lemonade Server
 20 | 
 21 | 1. Install Lemonade Server by following the [Lemonade Server Instructions](../README.md) and using the installer .exe.
 22 | 
 23 | #### Installing Windows Subsystem for Linux (WSL)
 24 | 
 25 | `wut` currently requires a `tmux` terminal in order to function. We found the simplest way to achieve this on Windows was through the Windows Subsystem for Linux (WSL).
 26 | 
 27 | 1. Install [Windows Subsystem for Linux](https://learn.microsoft.com/en-us/windows/wsl/install).
 28 | 1. Open the `WSL Settings` app, navigate to `Networking`, and make sure the `Networking mode` is `Mirrored`. This is required for WSL terminals to be able to see the Lemonade server running in Windows.
 29 | 1. If needed: shut down WSL to make sure the changes apply:
 30 | 
 31 | ```powershell
 32 | wsl --shutdown
 33 | ```
 34 | 
 35 | ### Installing Wut
 36 | 
 37 | * Start a WSL terminal.
 38 | * Install [`pipx`](https://github.com/pypa/pipx), as recommended by the following `wut` instructions:
 39 | 
 40 | ```bash
 41 | sudo apt update
 42 | sudo apt install pipx
 43 | pipx ensurepath
 44 | ```
 45 | 
 46 | * Re-launch your terminal to make sure `pipx` is available, then install `wut`:
 47 | 
 48 | ```bash
 49 | pipx install wut-cli
 50 | ```
 51 | 
 52 | * Add `wut`'s required environment variables to your `.bashrc` file:
 53 | 
 54 | ```bash
 55 | export OPENAI_API_KEY="-"
 56 | export OPENAI_MODEL="Llama-3.2-3B-Instruct-Hybrid"
 57 | export OPENAI_BASE_URL="http://localhost:8000/api/v1"
 58 | ```
 59 | 
 60 | ## Usage
 61 | 
 62 | ### Start a terminal
 63 | 
 64 | 1. Start a WSL terminal.
 65 | 2. Start a `tmux` session:
 66 | 
 67 | ```bash
 68 | tmux
 69 | ```
 70 | 
 71 | Then, try some of these example commands that `wut` can help explain.
 72 | 
 73 | ### Help with Lemonade Server
 74 | 
 75 | People often ask exactly what Lemonade Server's `models` endpoint does. Fortunately, `wut` is able to intuit the answer!
 76 | 
 77 | ```bash
 78 | curl http://localhost:8000/api/v1/models
 79 | wut
 80 | ```
 81 | 
 82 | The terminal response of the `curl` command is this (only intelligible by machines):
 83 | 
 84 | ```
 85 | curl http://localhost:8000/api/v1/models
 86 | {"object":"list","data":[{"id":"Qwen2.5-0.5B-Instruct-CPU","created":1744226681,"object":"model","owned_by":"lemonade"},{"id":"Llama-3.2-1B-Instruct-Hybrid","created":1744226681,"object":"model","owned_by":"lemonade"},{"id":"Llama-3.2-3B-Instruct-Hybrid","created":1744226681,"object":"model","owned_by":"lemonade"},{"id":"Phi-3-Mini-Instruct-Hybrid","created":1744226681,"object":"model","owned_by":"lemonade"},{"id":"Qwen-1.5-7B-Chat-Hybrid","created":1744226681,"object":"model","owned_by":"lemonade"},{"id":"DeepSeek-R1-Distill-Llama-8B-Hybrid","created":1744226681,"object":"model","owned_by":"lemonade"},{"id":"DeepSeek-R1-Distill-Qwen-7B-Hybrid","created":1744226681,"object":"model","owned_by":"lemonade"}]}
 87 | ```
 88 | 
 89 | But `wut` does a nice job interpreting:
 90 | 
 91 | ```
 92 | The output suggests that the API endpoint is returning a list of models, and the owned_by field indicates that all models are owned by "lemonade". Thecreated timestamp indicates when each model was created.
 93 | 
 94 | The output is a valid JSON response, and there is no error or warning message. The command was successful, and the output can be used for further processing or analysis. 
 95 | ```
 96 | 
 97 | 
 98 | ### Bad Git Command
 99 | 
100 | Run a command that doesn't exist, and then ask `wut` for help:
101 | 
102 | ```bash
103 | git pull-request
104 | wut
105 | ```
106 | 
107 | Results in:
108 | 
109 | > git: 'pull-request' is not a git command. See 'git --help'.
110 | 
111 | And then `wut` provides some helpful feedback:
112 | 
113 | > Key takeaway: The command git pull-request is not a valid Git command. The correct command to create a pull request is git request-pull, but it's not a standard Git command. The output wut is the name of the activated Conda environment. To create a pull request, use git request-pull or git pull with the --pr option. 
114 | 
115 | <!--This file was originally licensed under Apache 2.0. It has been modified.
116 | Modifications Copyright (c) 2025 AMD-->


--------------------------------------------------------------------------------
/docs/server/lemonade-server-cli.md:
--------------------------------------------------------------------------------
 1 | # `lemonade-server` CLI
 2 | 
 3 | The `lemonade-server` command-line interface (CLI) provides a set of utility commands for managing the server. When you install Lemonade Server using the GUI installer, `lemonade-server` is added to your PATH so that it can be invoked from any terminal.
 4 | 
 5 | > Note: if you installed from source or PyPI, you should call `lemonade-server-dev` in your activated Python environment, instead of using `lemonade-server`.
 6 | 
 7 | `lemonade-server` provides these utilities:
 8 | 
 9 | | Option/Command      | Description                         |
10 | |---------------------|-------------------------------------|
11 | | `-v`, `--version`   | Print the `lemonade-sdk` package version used to install Lemonade Server. |
12 | | `serve`             | Start the server process in the current terminal. See command options [below](#command-line-options-for-serve). |
13 | | `status`            | Check whether the server is running. If it is, print the port number. |
14 | | `stop`              | Stop any running Lemonade Server process. |
15 | | `pull MODEL_NAME`   | Install an LLM named `MODEL_NAME`. See the [server models guide](./server_models.md) for more information. |
16 | | `run MODEL_NAME`    | Start the server (if not already running) and chat with the specified model. |
17 | | `list`              | List all models. |
18 | 
19 | 
20 | Example:
21 | 
22 | ```bash
23 | lemonade-server serve --port 8080 --log-level debug --truncate-inputs
24 | ```
25 | 
26 | ## Command Line Options for `serve`
27 | 
28 | When using the `serve` command, you can configure the server with these additional options:
29 | 
30 | | Option                         | Description                         | Default |
31 | |--------------------------------|-------------------------------------|---------|
32 | | `--port [port]`                | Specify the port number to run the server on | 8000 |
33 | | `--log-level [level]`          | Set the logging level               | info |
34 | 
35 | The [Lemonade Server integration guide](./server_integration.md) provides more information about how these commands can be used to integrate Lemonade Server into an application.
36 | 
37 | <!--Copyright (c) 2025 AMD-->


--------------------------------------------------------------------------------
/docs/versioning.md:
--------------------------------------------------------------------------------
 1 | # Versioning Policy
 2 | 
 3 | The `lemonade-sdk` package uses semantic versioning for its 3-digit version number. The version number is stored in `src/lemonade/version.py`.
 4 | 
 5 | The 3 digits correspond to MAJOR.MINOR.PATCH, which can be interpreted as follows:
 6 | * MAJOR: changes indicate breaking API changes that may require the user to change their own code
 7 | * MINOR: changes indicate that builds against a previous minor version may not be compatible, and the user may need to rebuild those models
 8 | * PATCH: no user action required when the patch number changes
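
For reference, the version string can also be read programmatically. A minimal sketch (assumes the `lemonade-sdk` package is installed in the active Python environment):

```python
# Read the installed lemonade-sdk version and split it into its semantic parts
from lemonade.version import __version__

major, minor, patch = (int(part) for part in __version__.split("."))
print(f"lemonade-sdk {__version__}: major={major}, minor={minor}, patch={patch}")
```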
 9 | 
10 | <!--This file was originally licensed under Apache 2.0. It has been modified.
11 | Modifications Copyright (c) 2025 AMD-->


--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
 1 | # Lemonade Examples
 2 | 
 3 | This folder contains examples of how to deploy `lemonade` into applications. 
 4 | 
 5 | ## Notebooks
 6 | 
 7 | The `notebooks/` folder contains Jupyter notebooks that give tutorials on deeper topics.
 8 | 
 9 | ## API Examples
10 | 
11 | This folder has examples of using the Lemonade API to integrate LLMs into Python applications. The examples show how to load a model, generate responses, and stream those responses.
12 | 
13 | The `demos/` folder also contains some higher-level application demos of the APIs. Learn more in `demos/README.md`.
14 | 
15 | This table shows which API examples are available:
16 | 
17 | | Framework                  | CPU                       | NPU             | Hybrid             |
18 | |----------------------------|---------------------------|-----------------|--------------------|
19 | | Huggingface                | api_basic.py              | -               | -                  |
20 | | OGA                        | api_oga_cpu.py            | api_oga_npu.py | api_oga_hybrid.py |
21 | | Huggingface with streaming | api_streaming.py          | -               | -                  |
22 | | OGA with streaming         | api_oga_cpu_streaming.py  | api_oga_npu_streaming.py | api_oga_hybrid_streaming.py |
23 | 
24 | To run an API example, first set up a conda environment with the appropriate framework and backend support. Then run the scripts with a command like `python api_basic.py`.
25 | 
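If you are starting from scratch, environment setup might look roughly like the following sketch (the `dev` extra name comes from `setup.py`; see the [installation docs](https://github.com/lemonade-sdk/lemonade/blob/main/docs/README.md#installation) for the options that match your device):

```bash
# Create and activate an environment, then install lemonade with developer extras
conda create -n lemonade python=3.10 -y
conda activate lemonade
pip install "lemonade-sdk[dev]"

# Run one of the CPU examples
python api_basic.py
```
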
26 | <!--This file was originally licensed under Apache 2.0. It has been modified.
27 | Modifications Copyright (c) 2025 AMD-->
28 | 


--------------------------------------------------------------------------------
/examples/api_basic.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This example demonstrates how to use the lemonade API to load a model for
 3 | inference on CPU using the hf-cpu recipe, and then use it to generate
 4 | the response to a prompt.
 5 | 
 6 | If you have a discrete GPU, you can try that by changing the recipe
 7 | to hf-dgpu. Note: make sure to have torch+cuda installed when trying
 8 | hf-dgpu.
 9 | """
10 | 
11 | from lemonade.api import from_pretrained
12 | 
13 | model, tokenizer = from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", recipe="hf-cpu")
14 | 
15 | input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids
16 | response = model.generate(input_ids, max_new_tokens=30)
17 | 
18 | print(tokenizer.decode(response[0]))
19 | 
20 | # This file was originally licensed under Apache 2.0. It has been modified.
21 | # Modifications Copyright (c) 2025 AMD
22 | 


--------------------------------------------------------------------------------
/examples/api_oga_cpu.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This example demonstrates how to use the lemonade API to load a model for
 3 | inference on CPU via OnnxRuntime-Genai (OGA) using the oga-cpu recipe,
 4 | and then use it to generate the response to a prompt.
 5 | 
 6 | Make sure you have set up your OGA device in your Python environment.
 7 | See for details:
 8 | https://github.com/lemonade-sdk/lemonade/blob/main/docs/README.md#installation
 9 | """
10 | 
11 | from lemonade.api import from_pretrained
12 | 
13 | model, tokenizer = from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", recipe="oga-cpu")
14 | 
15 | input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids
16 | response = model.generate(input_ids, max_new_tokens=30)
17 | 
18 | print(tokenizer.decode(response[0]))
19 | 
20 | # This file was originally licensed under Apache 2.0. It has been modified.
21 | # Modifications Copyright (c) 2025 AMD
22 | 


--------------------------------------------------------------------------------
/examples/api_oga_cpu_streaming.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This example demonstrates how to use the lemonade API to load a model for
 3 | inference on CPU via OnnxRuntime-GenAI using the oga-cpu recipe, and then
 4 | use a thread to generate a streaming response to a prompt.
 5 | 
 6 | Note: this approach only works with recipes that support lemonade's OrtGenaiStreamer,
 7 | i.e., OGA-based recipes such as oga-cpu, oga-igpu, oga-npu, and oga-hybrid.
 8 | 
 9 | Make sure you have set up your OGA device in your Python environment.
10 | See for details:
11 | https://github.com/lemonade-sdk/lemonade/blob/main/docs/README.md#installation
12 | """
13 | 
14 | from threading import Thread
15 | from lemonade.api import from_pretrained
16 | from lemonade.tools.oga.utils import OrtGenaiStreamer
17 | 
18 | model, tokenizer = from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", recipe="oga-cpu")
19 | 
20 | input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids
21 | 
22 | streamer = OrtGenaiStreamer(tokenizer)
23 | generation_kwargs = {
24 |     "input_ids": input_ids,
25 |     "streamer": streamer,
26 |     "max_new_tokens": 30,
27 | }
28 | 
29 | thread = Thread(target=model.generate, kwargs=generation_kwargs)
30 | thread.start()
31 | 
32 | # Generate the response using streaming
33 | for new_text in streamer:
34 |     print(new_text)
35 | 
36 | thread.join()
37 | 
38 | # This file was originally licensed under Apache 2.0. It has been modified.
39 | # Modifications Copyright (c) 2025 AMD
40 | 


--------------------------------------------------------------------------------
/examples/api_oga_hybrid.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This example demonstrates how to use the lemonade API to load a model for
 3 | inference on Ryzen AI hybrid mode (NPU and iGPU together) via OnnxRuntime-Genai (OGA)
 4 | using the oga-hybrid recipe, and then use it to generate the response to a prompt.
 5 | 
 6 | Make sure you have set up your OGA device in your Python environment.
 7 | See for details:
 8 | https://github.com/lemonade-sdk/lemonade/blob/main/docs/README.md#installation
 9 | """
10 | 
11 | from lemonade.api import from_pretrained
12 | 
13 | model, tokenizer = from_pretrained(
14 |     "amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid", recipe="oga-hybrid"
15 | )
16 | 
17 | input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids
18 | response = model.generate(input_ids, max_new_tokens=30)
19 | 
20 | print(tokenizer.decode(response[0]))
21 | 
22 | # This file was originally licensed under Apache 2.0. It has been modified.
23 | # Modifications Copyright (c) 2025 AMD
24 | 


--------------------------------------------------------------------------------
/examples/api_oga_hybrid_streaming.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This example demonstrates how to use the lemonade API to load a model for
 3 | inference on Ryzen AI hybrid mode (NPU and iGPU together) via OnnxRuntime-GenAI
 4 | using the oga-hybrid recipe, and then use a thread to generate a streaming
 5 | response to a prompt.
 6 | 
 7 | Note: this approach only works with recipes that support lemonade's OrtGenaiStreamer,
 8 | i.e., OGA-based recipes such as oga-cpu, oga-igpu, oga-npu, and oga-hybrid.
 9 | 
10 | Make sure you have set up your OGA device in your Python environment.
11 | See for details:
12 | https://github.com/lemonade-sdk/lemonade/blob/main/docs/README.md#installation
13 | """
14 | 
15 | from threading import Thread
16 | from lemonade.api import from_pretrained
17 | from lemonade.tools.oga.utils import OrtGenaiStreamer
18 | 
19 | model, tokenizer = from_pretrained(
20 |     "amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid", recipe="oga-hybrid"
21 | )
22 | 
23 | input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids
24 | 
25 | streamer = OrtGenaiStreamer(tokenizer)
26 | generation_kwargs = {
27 |     "input_ids": input_ids,
28 |     "streamer": streamer,
29 |     "max_new_tokens": 30,
30 | }
31 | 
32 | thread = Thread(target=model.generate, kwargs=generation_kwargs)
33 | thread.start()
34 | 
35 | # Generate the response using streaming
36 | for new_text in streamer:
37 |     print(new_text)
38 | 
39 | thread.join()
40 | 
41 | # This file was originally licensed under Apache 2.0. It has been modified.
42 | # Modifications Copyright (c) 2025 AMD
43 | 


--------------------------------------------------------------------------------
/examples/api_oga_igpu.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This example demonstrates how to use the lemonade API to load a model for
 3 | inference on integrated GPUs (iGPUs) via OnnxRuntime-Genai (OGA)
 4 | using the oga-igpu recipe, and then use it to generate the response to a prompt.
 5 | 
 6 | Make sure you have set up your OGA device in your Python environment.
 7 | See for details:
 8 | https://github.com/lemonade-sdk/lemonade/blob/main/docs/README.md#installation
 9 | """
10 | 
11 | from lemonade.api import from_pretrained
12 | 
13 | model, tokenizer = from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", recipe="oga-igpu")
14 | 
15 | input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids
16 | response = model.generate(input_ids, max_new_tokens=30)
17 | 
18 | print(tokenizer.decode(response[0]))
19 | 
20 | # This file was originally licensed under Apache 2.0. It has been modified.
21 | # Modifications Copyright (c) 2025 AMD
22 | 


--------------------------------------------------------------------------------
/examples/api_oga_igpu_streaming.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This example demonstrates how to use the lemonade API to load a model for
 3 | inference on integrated GPUs (iGPUs) via OnnxRuntime-GenAI using the oga-igpu recipe,
 4 | and then use a thread to generate a streaming response to a prompt.
 5 | 
 6 | Note: this approach only works with recipes that support lemonade's OrtGenaiStreamer,
 7 | i.e., OGA-based recipes such as oga-cpu, oga-igpu, oga-npu, and oga-hybrid.
 8 | 
 9 | Make sure you have set up your OGA device in your Python environment.
10 | See for details:
11 | https://github.com/lemonade-sdk/lemonade/blob/main/docs/README.md#installation
12 | """
13 | 
14 | from threading import Thread
15 | from lemonade.api import from_pretrained
16 | from lemonade.tools.oga.utils import OrtGenaiStreamer
17 | 
18 | model, tokenizer = from_pretrained(
19 |     "Qwen/Qwen2.5-0.5B-Instruct",
20 |     recipe="oga-igpu",
21 | )
22 | 
23 | input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids
24 | 
25 | streamer = OrtGenaiStreamer(tokenizer)
26 | generation_kwargs = {
27 |     "input_ids": input_ids,
28 |     "streamer": streamer,
29 |     "max_new_tokens": 30,
30 | }
31 | 
32 | thread = Thread(target=model.generate, kwargs=generation_kwargs)
33 | thread.start()
34 | 
35 | # Generate the response using streaming
36 | for new_text in streamer:
37 |     print(new_text)
38 | 
39 | thread.join()
40 | 
41 | # This file was originally licensed under Apache 2.0. It has been modified.
42 | # Modifications Copyright (c) 2025 AMD
43 | 


--------------------------------------------------------------------------------
/examples/api_oga_npu.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This example demonstrates how to use the lemonade API to load a model for
 3 | inference on Ryzen AI NPU via OnnxRuntime-Genai (OGA) using the oga-npu recipe,
 4 | and then use it to generate the response to a prompt.
 5 | 
 6 | Make sure you have set up your OGA device in your Python environment.
 7 | See for details:
 8 | https://github.com/lemonade-sdk/lemonade/blob/main/docs/README.md#installation
 9 | """
10 | 
11 | from lemonade.api import from_pretrained
12 | 
13 | model, tokenizer = from_pretrained(
14 |     "amd/Phi-3.5-mini-instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix",
15 |     recipe="oga-npu",
16 | )
17 | 
18 | input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids
19 | response = model.generate(input_ids, max_new_tokens=30)
20 | 
21 | print(tokenizer.decode(response[0]))
22 | 
23 | # This file was originally licensed under Apache 2.0. It has been modified.
24 | # Modifications Copyright (c) 2025 AMD
25 | 


--------------------------------------------------------------------------------
/examples/api_oga_npu_streaming.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This example demonstrates how to use the lemonade API to load a model for
 3 | inference on Ryzen AI NPU via OnnxRuntime-GenAI using the oga-npu recipe,
 4 | and then use a thread to generate a streaming response to a prompt.
 5 | 
 6 | Note: this approach only works with recipes that support lemonade's OrtGenaiStreamer,
 7 | i.e., OGA-based recipes such as oga-cpu, oga-igpu, oga-npu, and oga-hybrid.
 8 | 
 9 | Make sure you have set up your OGA device in your Python environment.
10 | See for details:
11 | https://github.com/lemonade-sdk/lemonade/blob/main/docs/README.md#installation
12 | """
13 | 
14 | from threading import Thread
15 | from lemonade.api import from_pretrained
16 | from lemonade.tools.oga.utils import OrtGenaiStreamer
17 | 
18 | model, tokenizer = from_pretrained(
19 |     "amd/Phi-3.5-mini-instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix",
20 |     recipe="oga-npu",
21 | )
22 | 
23 | input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids
24 | 
25 | streamer = OrtGenaiStreamer(tokenizer)
26 | generation_kwargs = {
27 |     "input_ids": input_ids,
28 |     "streamer": streamer,
29 |     "max_new_tokens": 30,
30 | }
31 | 
32 | thread = Thread(target=model.generate, kwargs=generation_kwargs)
33 | thread.start()
34 | 
35 | # Generate the response using streaming
36 | for new_text in streamer:
37 |     print(new_text)
38 | 
39 | thread.join()
40 | 
41 | # This file was originally licensed under Apache 2.0. It has been modified.
42 | # Modifications Copyright (c) 2025 AMD
43 | 


--------------------------------------------------------------------------------
/examples/api_streaming.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This example demonstrates how to use the lemonade API to load a model for
 3 | inference on CPU using the hf-cpu recipe, and then use a thread to
 4 | generate a streaming response to a prompt.
 5 | 
 6 | Note: this approach only works with recipes that support TextIteratorStreamer,
 7 | i.e., huggingface-based recipes such as hf-cpu and hf-dgpu.
 8 | """
 9 | 
10 | from threading import Thread
11 | from transformers import TextIteratorStreamer
12 | from lemonade.api import from_pretrained
13 | 
14 | model, tokenizer = from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", recipe="hf-cpu")
15 | 
16 | input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids
17 | 
18 | streamer = TextIteratorStreamer(
19 |     tokenizer,
20 |     skip_prompt=True,
21 | )
22 | generation_kwargs = {
23 |     "input_ids": input_ids,
24 |     "streamer": streamer,
25 |     "max_new_tokens": 30,
26 | }
27 | 
28 | thread = Thread(target=model.generate, kwargs=generation_kwargs)
29 | thread.start()
30 | 
31 | # Generate the response using streaming
32 | for new_text in streamer:
33 |     print(new_text)
34 | 
35 | thread.join()
36 | 
37 | # This file was originally licensed under Apache 2.0. It has been modified.
38 | # Modifications Copyright (c) 2025 AMD
39 | 


--------------------------------------------------------------------------------
/examples/demos/README.md:
--------------------------------------------------------------------------------
 1 | # Lemonade Demos
 2 | 
 3 | The demo scripts in this folder show how `lemonade` can be used to integrate OnnxRuntime-GenAI (OGA) into higher-level applications such as chat and search.
 4 | 
 5 | Each demo consists of two files that show the before-and-after of integrating OGA:
 6 | - `*_start.py`: a version of the application that uses regular software to try to handle a natural language task.
 7 | - `*_hybrid.py`: an upgrade of the application that integrates an LLM with Ryzen AI Hybrid to improve the natural language task.
 8 | 
 9 | The demos available are:
10 | - `chat/`: prompts the user for a message and then streams the LLM's response to the terminal.
11 | - `search/`: demonstrates how a user can search an employee handbook in natural language using an LLM.
12 | 
13 | To run a demo (see the sketch after this list):
14 | 1. Set up a conda environment with the appropriate framework and backend support.
15 | 1. `cd` into the demo directory (e.g., `cd search/`)
16 | 1. Run the `*_start.py` script to see what the application is like without the LLM (e.g., `python search_start.py`)
17 | 1. Run the `*_hybrid.py` script to see what the application is like with the LLM (e.g., `python search_hybrid.py`)
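
For example, running the search demo end-to-end might look like this (a sketch; it assumes your environment is already set up as described above):

```bash
cd search/
python search_start.py   # plain-text keyword search, no LLM
python search_hybrid.py  # the same application upgraded with an LLM on Ryzen AI Hybrid
```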
18 | 
19 | <!--This file was originally licensed under Apache 2.0. It has been modified.
20 | Modifications Copyright (c) 2025 AMD-->


--------------------------------------------------------------------------------
/examples/demos/chat/chat_hybrid.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from threading import Thread, Event
 3 | from transformers import StoppingCriteriaList
 4 | from lemonade.tools.server.serve import StopOnEvent
 5 | from lemonade.api import from_pretrained
 6 | from lemonade.tools.oga.utils import OrtGenaiStreamer
 7 | 
 8 | 
 9 | def main():
10 | 
11 |     model, tokenizer = from_pretrained(
12 |         "amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
13 |         recipe="oga-hybrid",
14 |     )
15 | 
16 |     while True:
17 |         # Enable sending a signal into the generator thread to stop
18 |         # the generation early
19 |         stop_event = Event()
20 |         stopping_criteria = StoppingCriteriaList([StopOnEvent(stop_event)])
21 | 
22 |         # Prompt the user for an input message
23 |         print()
24 |         user_message = input("User: ")
25 |         print()
26 | 
27 |         # Print a friendly message when we quit
28 |         if user_message == "quit":
29 |             print("System: Ok, bye!\n")
30 |             break
31 | 
32 |         # Generate the response in a thread and stream the result back
33 |         # to the main thread
34 |         input_ids = tokenizer(user_message, return_tensors="pt").input_ids
35 | 
36 |         streamer = OrtGenaiStreamer(tokenizer)
37 |         generation_kwargs = {
38 |             "input_ids": input_ids,
39 |             "streamer": streamer,
40 |             "max_new_tokens": 200,
41 |             "stopping_criteria": stopping_criteria,
42 |         }
43 | 
44 |         thread = Thread(target=model.generate, kwargs=generation_kwargs)
45 |         thread.start()
46 | 
47 |         # Print each word to the screen as it arrives from the streamer
48 |         # Allow the user to terminate the response with
49 |         # a keyboard interrupt (ctrl+c)
50 |         try:
51 |             print("LLM: ", end="")
52 |             for new_text in streamer:
53 |                 print(new_text, end="")
54 |                 sys.stdout.flush()
55 | 
56 |         except KeyboardInterrupt:
57 |             stop_event.set()
58 | 
59 |         print()
60 | 
61 |         thread.join()
62 | 
63 | 
64 | if __name__ == "__main__":
65 |     main()
66 | 
67 | # This file was originally licensed under Apache 2.0. It has been modified.
68 | # Modifications Copyright (c) 2025 AMD
69 | 


--------------------------------------------------------------------------------
/examples/demos/chat/chat_start.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | from threading import Thread, Event
  3 | from queue import Queue
  4 | from time import sleep
  5 | from transformers import StoppingCriteriaList
  6 | from lemonade.tools.server.serve import StopOnEvent
  7 | 
  8 | 
  9 | class TextStreamer:
 10 |     """
 11 |     Imitates a queue for streaming text from one thread to another.
 12 | 
 13 |     Not needed once we integrate with the lemonade API.
 14 |     """
 15 | 
 16 |     def __init__(self):
 17 |         self.text_queue = Queue()
 18 |         self.stop_signal = None
 19 | 
 20 |     def add_text(self, text: str):
 21 |         self.text_queue.put(text)
 22 | 
 23 |     def done(self):
 24 |         self.text_queue.put(self.stop_signal)
 25 | 
 26 |     def __iter__(self):
 27 |         return self
 28 | 
 29 |     def __next__(self):
 30 |         value = self.text_queue.get()
 31 |         if value == self.stop_signal:
 32 |             raise StopIteration()
 33 |         else:
 34 |             return value
 35 | 
 36 | 
 37 | def generate_placeholder(
 38 |     streamer: TextStreamer, stopping_criteria: StoppingCriteriaList
 39 | ):
 40 |     """
 41 |     Imitates an LLM's generate function by streaming text to a queue.
 42 | 
 43 |     Not needed once we integrate with the lemonade API.
 44 |     """
 45 | 
 46 |     # pylint: disable=line-too-long
 47 |     response = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
 48 | 
 49 |     for word in response.split(" "):
 50 |         streamer.add_text(f"{word} ")
 51 |         sleep(0.05)
 52 | 
 53 |         if stopping_criteria[0].stop_event.is_set():
 54 |             break
 55 | 
 56 |     streamer.done()
 57 | 
 58 | 
 59 | def main():
 60 | 
 61 |     while True:
 62 |         # Enable sending a signal into the generator thread to stop
 63 |         # the generation early
 64 |         stop_event = Event()
 65 |         stopping_criteria = StoppingCriteriaList([StopOnEvent(stop_event)])
 66 | 
 67 |         # Prompt the user for an input message
 68 |         print()
 69 |         user_message = input("User: ")
 70 |         print()
 71 | 
 72 |         # Print a friendly message when we quit
 73 |         if user_message == "quit":
 74 |             print("System: Ok, bye!\n")
 75 |             break
 76 | 
 77 |         # Generate the response in a thread and stream the result back
 78 |         # to the main thread
 79 |         streamer = TextStreamer()
 80 |         generation_kwargs = {
 81 |             "streamer": streamer,
 82 |             "stopping_criteria": stopping_criteria,
 83 |         }
 84 | 
 85 |         thread = Thread(target=generate_placeholder, kwargs=generation_kwargs)
 86 |         thread.start()
 87 | 
 88 |         # Print each word to the screen as it arrives
 89 |         # Allow the user to terminate the response with
 90 |         # a keyboard interrupt (ctrl+c)
 91 |         try:
 92 |             print("LLM: ", end="")
 93 |             for new_text in streamer:
 94 |                 print(new_text, end="")
 95 |                 sys.stdout.flush()
 96 | 
 97 |         except KeyboardInterrupt:
 98 |             stop_event.set()
 99 | 
100 |         print()
101 | 
102 |         thread.join()
103 | 
104 | 
105 | if __name__ == "__main__":
106 |     main()
107 | 
108 | # This file was originally licensed under Apache 2.0. It has been modified.
109 | # Modifications Copyright (c) 2025 AMD
110 | 


--------------------------------------------------------------------------------
/examples/demos/search/search_hybrid.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from threading import Thread, Event
 3 | from transformers import StoppingCriteriaList
 4 | from lemonade.api import from_pretrained
 5 | from lemonade.tools.oga.utils import OrtGenaiStreamer
 6 | from lemonade.tools.server.serve import StopOnEvent
 7 | 
 8 | employee_handbook = """
 9 | 1. You will work very hard every day.\n
10 | 2. You are allowed to listen to music, but must wear headphones.\n
11 | 3. Remember, the break room fridge is not a science experiment. 
12 |     Please label and remove your leftovers regularly!\n
13 | """
14 | 
15 | 
16 | def system_prompt(user_prompt):
17 |     return f"""
18 | <|begin_of_text|><|start_header_id|>system<|end_header_id|>
19 | 
20 | You are a helpful assistant who can only answer questions about this employee handbook: {employee_handbook}. 
21 | Don't make up information that isn't in the handbook already. 
22 | <|eot_id|><|start_header_id|>user<|end_header_id|>
23 | 
24 | {user_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
25 | """
26 | 
27 | 
28 | def main():
29 | 
30 |     # Load LLaMA-3.2 1B model on Ryzen AI Hybrid
31 |     model, tokenizer = from_pretrained(
32 |         "amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
33 |         recipe="oga-hybrid",
34 |     )
35 | 
36 |     while True:
37 |         # Enable sending a signal into the generator thread to stop
38 |         # the generation early
39 |         stop_event = Event()
40 |         stopping_criteria = StoppingCriteriaList([StopOnEvent(stop_event)])
41 | 
42 |         # Prompt the user for an input message
43 |         print()
44 |         user_message = input("User: ")
45 |         print()
46 | 
47 |         # Print a friendly message when we quit
48 |         if user_message == "quit":
49 |             print("System: Ok, bye!\n")
50 |             break
51 | 
52 |         # Generate the response in a thread and stream the result back
53 |         # to the main thread
54 |         input_ids = tokenizer(
55 |             system_prompt(user_message), return_tensors="pt"
56 |         ).input_ids
57 | 
58 |         streamer = OrtGenaiStreamer(tokenizer)
59 |         generation_kwargs = {
60 |             "input_ids": input_ids,
61 |             "streamer": streamer,
62 |             "max_new_tokens": 200,
63 |             "stopping_criteria": stopping_criteria,
64 |         }
65 | 
66 |         thread = Thread(target=model.generate, kwargs=generation_kwargs)
67 |         thread.start()
68 | 
69 |         # Print each word to the screen as it arrives from the streamer
70 |         # Allow the user to terminate the response with
71 |         # a keyboard interrupt (ctrl+c)
72 |         try:
73 |             print("LLM: ", end="")
74 |             for new_text in streamer:
75 |                 print(new_text, end="")
76 |                 sys.stdout.flush()
77 | 
78 |         except KeyboardInterrupt:
79 |             stop_event.set()
80 | 
81 |         print()
82 | 
83 |         thread.join()
84 | 
85 | 
86 | if __name__ == "__main__":
87 |     main()
88 | 
89 | # This file was originally licensed under Apache 2.0. It has been modified.
90 | # Modifications Copyright (c) 2025 AMD
91 | 


--------------------------------------------------------------------------------
/examples/demos/search/search_start.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | from threading import Thread, Event
  3 | from queue import Queue
  4 | from time import sleep
  5 | from transformers import StoppingCriteriaList
  6 | from lemonade.tools.server.serve import StopOnEvent
  7 | 
  8 | 
  9 | employee_handbook = """
 10 | 1. You will work very hard every day.\n
 11 | 2. You are allowed to listen to music, but must wear headphones.\n
 12 | 3. Remember, the break room fridge is not a science experiment. 
 13 |     Please label and remove your leftovers regularly!\n
 14 | """
 15 | 
 16 | 
 17 | class TextStreamer:
 18 |     """
 19 |     Imitates a queue for streaming text from one thread to another.
 20 | 
 21 |     Not needed once we integrate with the lemonade API.
 22 |     """
 23 | 
 24 |     def __init__(self):
 25 |         self.text_queue = Queue()
 26 |         self.stop_signal = None
 27 | 
 28 |     def add_text(self, text: str):
 29 |         self.text_queue.put(text)
 30 | 
 31 |     def done(self):
 32 |         self.text_queue.put(self.stop_signal)
 33 | 
 34 |     def __iter__(self):
 35 |         return self
 36 | 
 37 |     def __next__(self):
 38 |         value = self.text_queue.get()
 39 |         if value == self.stop_signal:
 40 |             raise StopIteration()
 41 |         else:
 42 |             return value
 43 | 
 44 | 
 45 | def plain_text_search(
 46 |     question: str, streamer: TextStreamer, stopping_criteria: StoppingCriteriaList
 47 | ):
 48 |     """
 49 |     Searches the employee handbook, looking for an exact match and
 50 |     returns an answer if available.
 51 | 
 52 |     Imitates an LLM's generate function by streaming text to a queue.
 53 | 
 54 |     Not needed once we integrate with the lemonade API.
 55 |     """
 56 | 
 57 |     # Turn the question into key words
 58 |     # Remove punctuation and convert to lower-case
 59 |     sanitized_question = question.replace("?", "").replace(".", "").lower()
 60 |     # Get a list of important words (longer than length 3)
 61 |     keywords = [word for word in sanitized_question.split(" ") if len(word) > 3]
 62 | 
 63 |     # Search for the key words in the employee handbook
 64 |     result = None
 65 |     for keyword in keywords:
 66 |         for line in employee_handbook.lower().split("\n"):
 67 |             if keyword in line:
 68 |                 result = line
 69 | 
 70 |     if result:
 71 |         response = (
 72 |             f"This line of the employee handbook might be relevant to you: {result}"
 73 |         )
 74 |     else:
 75 |         response = (
 76 |             "I am sorry, I didn't find anything that is useful to you. Please "
 77 |             "try again with another question or read the entire employee handbook "
 78 |             "cover-to-cover to make sure that you didn't miss any rules."
 79 |         )
 80 | 
 81 |     for word in response.split(" "):
 82 |         streamer.add_text(f"{word} ")
 83 |         sleep(0.05)
 84 | 
 85 |         if stopping_criteria[0].stop_event.is_set():
 86 |             break
 87 | 
 88 |     streamer.done()
 89 | 
 90 | 
 91 | def main():
 92 | 
 93 |     while True:
 94 |         # Enable sending a signal into the generator thread to stop
 95 |         # the generation early
 96 |         stop_event = Event()
 97 |         stopping_criteria = StoppingCriteriaList([StopOnEvent(stop_event)])
 98 | 
 99 |         # Prompt the user for an input message
100 |         print()
101 |         user_message = input("User: ")
102 |         print()
103 | 
104 |         # Print a friendly message when we quit
105 |         if user_message == "quit":
106 |             print("System: Ok, bye!\n")
107 |             break
108 | 
109 |         # Generate the response in a thread and stream the result back
110 |         # to the main thread
111 |         streamer = TextStreamer()
112 |         generation_kwargs = {
113 |             "question": user_message,
114 |             "streamer": streamer,
115 |             "stopping_criteria": stopping_criteria,
116 |         }
117 | 
118 |         thread = Thread(target=plain_text_search, kwargs=generation_kwargs)
119 |         thread.start()
120 | 
121 |         # Print each word to the screen as it arrives from the streamer
122 |         # Allow the user to terminate the response with
123 |         # a keyboard interrupt (ctrl+c)
124 |         try:
125 |             print("LLM: ", end="")
126 |             for new_text in streamer:
127 |                 print(new_text, end="")
128 |                 sys.stdout.flush()
129 | 
130 |         except KeyboardInterrupt:
131 |             stop_event.set()
132 | 
133 |         print()
134 | 
135 |         thread.join()
136 | 
137 | 
138 | if __name__ == "__main__":
139 |     main()
140 | 
141 | # This file was originally licensed under Apache 2.0. It has been modified.
142 | # Modifications Copyright (c) 2025 AMD
143 | 


--------------------------------------------------------------------------------
/img/basic_demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonade-sdk/lemonade/b86292bf84121d003d40804d68032904901ee1bb/img/basic_demo.gif


--------------------------------------------------------------------------------
/img/llm_demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonade-sdk/lemonade/b86292bf84121d003d40804d68032904901ee1bb/img/llm_demo.png


--------------------------------------------------------------------------------
/installer/add_to_path.py:
--------------------------------------------------------------------------------
 1 | import winreg
 2 | import argparse
 3 | 
 4 | 
 5 | def add_to_path(directory_to_add):
 6 |     """
 7 |     Adds a directory to the beginning of the user Path, or
 8 |     moves it to the beginning if it already exists in the Path.
 9 | 
10 |     Args:
11 |         directory_to_add (str): Directory path to add to the Path
12 | 
13 |     Returns:
14 |         bool: True if successful, False otherwise
15 |     """
16 |     try:
17 |         # Open the Environment key in HKEY_CURRENT_USER
18 |         key = winreg.OpenKey(
19 |             winreg.HKEY_CURRENT_USER,
20 |             "Environment",
21 |             0,
22 |             winreg.KEY_READ | winreg.KEY_WRITE,
23 |         )
24 | 
25 |         # Get the current Path value
26 |         try:
27 |             # Try to get the current Path value
28 |             # If the Path env var exists but it is empty, it will return an empty string
29 |             current_path, _ = winreg.QueryValueEx(key, "Path")
30 |         except FileNotFoundError:
31 |             # If the Path env var doesn't exist yet, it will raise a FileNotFoundError
32 |             # In this case ONLY, it is safe to set the current path to an empty string
33 |             current_path = ""
34 |         except Exception as e:
35 |             # If anything else goes wrong, print the error and exit
36 |             # We don't want to risk corrupting the registry
37 |             print(f"Error getting current Path: {e}")
38 |             exit(1)
39 | 
40 |         # Split the Path into individual directories
41 |         path_items = [
42 |             item for item in current_path.split(";") if item
43 |         ]  # Remove empty entries
44 | 
45 |         # Check if directory is already in Path
46 |         if directory_to_add in path_items:
47 |             # Remove it from its current position
48 |             path_items.remove(directory_to_add)
49 |             print(f"- {directory_to_add} was already in Path, moving to the beginning")
50 |         else:
51 |             print(f"- Adding {directory_to_add} to the beginning of Path")
52 | 
53 |         # Add the directory to the beginning of Path
54 |         path_items.insert(0, directory_to_add)
55 | 
56 |         # Join the items back together
57 |         new_path = ";".join(path_items)
58 | 
59 |         # Write the new Path back to registry
60 |         winreg.SetValueEx(key, "Path", 0, winreg.REG_EXPAND_SZ, new_path)
61 |         winreg.CloseKey(key)
62 | 
63 |         print("- Successfully updated user Path")
64 |         return True
65 | 
66 |     except Exception as e:
67 |         print(f"Error updating Path: {e}")
68 |         return False
69 | 
70 | 
71 | if __name__ == "__main__":
72 |     parser = argparse.ArgumentParser(
73 |         description="Add a directory to the beginning of the user Path"
74 |     )
75 |     parser.add_argument("directory", help="Directory path to add to Path")
76 |     args = parser.parse_args()
77 | 
78 |     add_to_path(args.directory)
79 | 
80 | # This file was originally licensed under Apache 2.0. It has been modified.
81 | # Modifications Copyright (c) 2025 AMD
82 | 


--------------------------------------------------------------------------------
/installer/installer_banner.bmp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonade-sdk/lemonade/b86292bf84121d003d40804d68032904901ee1bb/installer/installer_banner.bmp


--------------------------------------------------------------------------------
/installer/lemonade-server.bat:
--------------------------------------------------------------------------------
 1 | @echo off
 2 | setlocal enabledelayedexpansion
 3 | 
 4 | REM Get current time in milliseconds since midnight
 5 | for /f "tokens=1-4 delims=:.," %%a in ("!time!") do (
 6 |     set /a "CURRENT_TIME=((((%%a*60)+1%%b %% 100)*60+1%%c %% 100)*1000)+1%%d %% 1000"
 7 | )
 8 | 
 9 | REM Use temp directory for the lock file
10 | set "LOCK_FILE=%TEMP%\lemonade_server.lock"
11 | 
12 | REM Show a notification and run the server in tray mode.
13 | REM Note: command line arguments are parsed in order from left to right
14 | set TRAY=0
15 | set ARGS=
16 | for %%a in (%*) do (
17 |     set ARGS=!ARGS! %%a
18 |     if /I "%%a"=="serve" (
19 |         set TRAY=1
20 |     )
21 |     if /I "%%a"=="--no-tray" (
22 |         set TRAY=0
23 |     )
24 | )
25 | 
26 | REM Only check lock file if running in tray mode
27 | if %TRAY%==1 (
28 |     REM Check if another instance is starting (within last 10000 milliseconds)
29 |     if exist "!LOCK_FILE!" (
30 |         set /p STORED_TIME=<"!LOCK_FILE!"
31 |         set /a TIME_DIFF=!CURRENT_TIME!-!STORED_TIME!
32 |         
33 |         REM Only block if difference is positive and less than 10000 milliseconds (10 seconds)
34 |         if !TIME_DIFF! gtr 0 if !TIME_DIFF! lss 10000 (
35 |             echo Another instance of Lemonade Server is currently starting.
36 |             exit /b 3
37 |         )
38 |     )
39 | 
40 |     REM Set the starting timestamp in lock file
41 |     echo !CURRENT_TIME!>"!LOCK_FILE!"
42 | )
43 | 
44 | REM Change to parent directory where conda env and bin folders are located
45 | pushd "%~dp0.."
46 | 
47 | REM Run the Python CLI script, passing filtered arguments
48 | call "%CD%\python\Scripts\lemonade-server-dev" !ARGS!
49 | set SERVER_ERRORLEVEL=%ERRORLEVEL%
50 | popd
51 | 
52 | REM Clean up lock file before any exit
53 | del "!LOCK_FILE!" 2>nul
54 | 
55 | REM Provide a notification if the server is already running
56 | if %SERVER_ERRORLEVEL% equ 2 (
57 |     if %TRAY%==1 (
58 |         REM Blocking call to show notification
59 |         wscript "%~dp0lemonade_notification.vbs" "Lemonade Server" "Lemonade Server is already running!\nCheck your system tray for details or run `lemonade-server stop` to stop the existing server and try again."
60 |         exit /b 2
61 |     )
62 | )
63 | 
64 | REM Exit without additional notifications if error code is 0 (no errors), 15 (lemonade-server stop), or less than 0 (forced exit)
65 | if %SERVER_ERRORLEVEL% equ 15 (
66 |     exit /b 15
67 | ) else if %SERVER_ERRORLEVEL% leq 0 (
68 |     exit /b 0
69 | )
70 | 
71 | REM Error handling if any other error code
72 | if %TRAY%==0 (
73 |     echo.
74 |     echo An error occurred while running Lemonade Server.
75 |     echo Please check the error message above.
76 |     echo.
77 |     pause
78 | )
79 | if %TRAY%==1 (
80 |     REM Blocking call to show notification
81 |     wscript "%~dp0lemonade_notification.vbs" "Lemonade Server" "An error occurred while running Lemonade Server.\nPlease run the server manually. Error code: %SERVER_ERRORLEVEL%"
82 | )
83 | 
84 | REM This file was originally licensed under Apache 2.0. It has been modified.
85 | REM Modifications Copyright (c) 2025 AMD 


--------------------------------------------------------------------------------
/installer/lemonade_notification.vbs:
--------------------------------------------------------------------------------
 1 | ' Lemonade Server Loading Notification
 2 | ' Shows a notification that can be manually controlled
 3 | ' Usage: wscript lemonade_notification.vbs [title] [message]
 4 | 
 5 | Dim objShell, objFSO, signalFile, windowTitle, messageText
 6 | Set objShell = CreateObject("WScript.Shell")
 7 | Set objFSO = CreateObject("Scripting.FileSystemObject")
 8 | 
 9 | ' Get command line arguments or use defaults
10 | If WScript.Arguments.Count >= 1 Then
11 |     windowTitle = WScript.Arguments(0)
12 | Else
13 |     windowTitle = "Lemonade Server"
14 | End If
15 | 
16 | If WScript.Arguments.Count >= 2 Then
17 |     messageText = WScript.Arguments(1)
18 |     ' Replace pipe characters with line breaks for multi-line notifications
19 |     messageText = Replace(messageText, "\n", vbCrLf)
20 | Else
21 |     messageText = "Starting Lemonade Server..."
22 | End If
23 | 
24 | ' Signal file path for manual control
25 | signalFile = objFSO.GetSpecialFolder(2) & "\lemonade_notification_signal.txt"
26 | 
27 | ' Create signal file to indicate the notification is active
28 | objFSO.CreateTextFile(signalFile, True).Close
29 | 
30 | ' Show notification (no timeout - stays open until manually closed)
31 | result = objShell.Popup(messageText, 0, windowTitle, 0)
32 | 
33 | ' Clean up signal file
34 | If objFSO.FileExists(signalFile) Then
35 |     objFSO.DeleteFile signalFile
36 | End If
37 | 
38 | Set objShell = Nothing
39 | Set objFSO = Nothing 


--------------------------------------------------------------------------------
/installer/lemonade_server.vbs:
--------------------------------------------------------------------------------
 1 | ' This script detects whether we are in headless mode and launches lemonade-server
 2 | ' either in headless mode or with a system tray icon.
 3 | 
 4 | Set wshShell = CreateObject("WScript.Shell")
 5 | Set fso = CreateObject("Scripting.FileSystemObject")
 6 | 
 7 | scriptDir = fso.GetParentFolderName(WScript.ScriptFullName)
 8 | 
 9 | ' Declare headless variable
10 | Dim HEADLESS
11 | 
12 | ' Simple GUI detection: check if system tray is available
13 | On Error Resume Next
14 | Set shell = CreateObject("Shell.Application")
15 | If Err.Number = 0 Then
16 |     ' Try to access the system tray area
17 |     Set trayWnd = shell.Windows()
18 |     If Err.Number = 0 Then
19 |         ' GUI mode: show tray
20 |         Set trayWnd = Nothing
21 |         Set shell = Nothing
22 |         On Error GoTo 0
23 |         HEADLESS = False
24 |     Else
25 |         ' Headless mode: no GUI
26 |         Set shell = Nothing
27 |         On Error GoTo 0
28 |         HEADLESS = True
29 |     End If
30 | Else
31 |     ' Headless mode: no GUI
32 |     On Error GoTo 0
33 |     HEADLESS = True
34 | End If
35 | 
36 | If HEADLESS = True Then
37 |     ' Headless mode: open a terminal and run the server without the tray
38 |     wshShell.Run """" & scriptDir & "\lemonade-server.bat"" serve --no-tray", 1, True
39 | Else
40 |     ' Check if we're in CI mode via environment variable
41 |     ciMode = wshShell.ExpandEnvironmentStrings("%LEMONADE_CI_MODE%")
42 |     If ciMode <> "%LEMONADE_CI_MODE%" And (LCase(ciMode) = "true" Or LCase(ciMode) = "1") Then
43 |         ' CI mode: run without tray even in GUI environment
44 |         wshShell.Run """" & scriptDir & "\lemonade-server.bat"" serve --no-tray", 1, True
45 |     Else
46 |         ' GUI mode: Run the server on a hidden window with the tray
47 |         wshShell.Run """" & scriptDir & "\lemonade-server.bat"" serve", 0, False
48 |     End If
49 | End If
50 | 


--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
  1 | # This is the configuration file for MkDocs, a static site generator that uses Markdown files to create documentation sites.
  2 | # The configuration file is written in YAML format and contains various settings for the site.
  3 | 
  4 | # To install the MkDocs dependencies, run the following command in the terminal:
  5 | # pip install -r docs/assets/mkdocs_requirements.txt
  6 | 
  7 | # To build the site, run the following command in the terminal:
  8 | # mkdocs build
  9 | 
 10 | # To serve the site locally, run the following command in the terminal:
 11 | # mkdocs serve
 12 | 
 13 | # To deploy the site to GitHub Pages, run the following command in the terminal:
  14 | # mkdocs gh-deploy  <-- update this once CI is set up and the real deployment instructions are known.
 15 | 
 16 | site_name: Lemonade Server Documentation
 17 | site_url: https://lemonade-server.ai/
 18 | site_description: Lemonade Server is a lightweight, open-source local LLM server that allows you to run and manage multiple AI applications on your local machine. It provides a simple CLI for managing applications and supports various LLMs, making it easy to deploy and use AI models locally.
 19 | 
 20 | edit_uri: server/README.md
 21 | 
 22 | repo_name: lemonade-sdk/lemonade
 23 | repo_url: https://github.com/lemonade-sdk/lemonade
 24 | 
 25 | plugins:
 26 |   - monorepo
 27 |   - search
 28 | 
 29 | theme:
 30 |   name: material
 31 |   logo: assets/logo.png  # If we want to use a custom logo instead of an icon
 32 |   icon:
 33 |     repo: fontawesome/brands/github # This is the icon for the repo link in the header
 34 |   favicon: assets/favicon.ico
 35 |   features:
 36 |     - navigation.footer
 37 |     - navigation.tracking
 38 |     - navigation.expand
 39 |     - navigation.top
 40 |     - content.code.annotate
 41 |     - content.code.copy
 42 |   palette:
 43 | 
 44 |     # Light mode settings
 45 |     - scheme: lightmode
 46 |       primary: amber
 47 |       toggle:
 48 |         icon: material/weather-night
 49 |         name: Switch to dark mode
 50 | 
 51 |     # Dark mode settings
 52 |     - scheme: slate
 53 |       primary: amber
 54 |       accent: amber
 55 |       toggle:
 56 |         icon: material/weather-sunny
 57 |         name: Switch to light mode
 58 |   nav_style: dark
 59 | 
 60 | # Add the list of markdown files to be included in the documentation
 61 | # The order of the files in the list will determine the order they appear in the documentation
 62 | nav:
 63 |   - Downloading and Getting Started: server/README.md
 64 |   - Supported Applications: server/apps/README.md
 65 |   - Application Guides:
 66 |       - Open WebUI: server/apps/open-webui.md
 67 |       - AI Dev Gallery: server/apps/ai-dev-gallery.md
 68 |       - AI Toolkit: server/apps/ai-toolkit.md
 69 |       - AnythingLLM: server/apps/anythingLLM.md
 70 |       - CodeGPT: server/apps/codeGPT.md
 71 |       - Continue: server/apps/continue.md
 72 |       - LM-Eval-Harness: server/apps/lm-eval.md
 73 |       - Mindcraft: server/apps/mindcraft.md
 74 |       - Wut: server/apps/wut.md
 75 |   - Lemonade Server CLI Guide: server/lemonade-server-cli.md
 76 |   - Understanding local LLM servers: server/concepts.md
 77 |   - Models List: server/server_models.md
 78 |   - Server Spec: server/server_spec.md
 79 |   - Integration Guide: server/server_integration.md
 80 |   - Contribution Guide: contribute.md
 81 | 
 82 | not_in_nav: |
 83 |   /index.md
 84 |   /lemonade_api.md
 85 |   
 86 | exclude_docs: |
 87 |   code.md
 88 |   versioning.md
 89 |   dev_cli/README.md
 90 |   dev_cli/humaneval_accuracy.md
 91 |   dev_cli/mmlu_accuracy.md
 92 |   dev_cli/perplexity.md
 93 |   dev_cli/quark.md
 94 |   dev_cli/ort_genai_igpu.md
 95 |   dev_cli/llamacpp.md
 96 |   dev_cli/lm-eval.md
 97 | 
 98 | # The following adds icons on the bottom of the page
 99 | extra:
100 |   homepage: https://lemonade-server.ai
101 |   social:
102 |     - icon: simple/youtube
103 |       link: https://www.youtube.com/@AMDDevCentral
104 |     - icon: simple/github
105 |       link: https://github.com/lemonade-sdk/lemonade
106 | 
107 | copyright: Copyright &copy; 2025 AMD. All rights reserved.
108 | 
109 | # The custom CSS for colors and more
110 | extra_css:
111 |   - assets/extra.css
112 | 
113 | # The custom JavaScript for the carousel for the videos
114 | extra_javascript:
115 |   - assets/carousel.js
116 | 
117 | markdown_extensions:
118 |   - admonition
119 |   - pymdownx.superfences  # Better code blocks
120 |   - pymdownx.tabbed:      # Tabbed code blocks
121 |       alternate_style: true
122 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
  1 | from setuptools import setup
  2 | 
  3 | with open("src/lemonade/version.py", encoding="utf-8") as fp:
  4 |     version = fp.read().split('"')[1]
  5 | 
  6 | setup(
  7 |     name="lemonade-sdk",
  8 |     version=version,
  9 |     description="Lemonade SDK: Your LLM Aide for Validation and Deployment",
 10 |     author_email="lemonade@amd.com",
 11 |     package_dir={"": "src"},
 12 |     packages=[
 13 |         "lemonade",
 14 |         "lemonade.profilers",
 15 |         "lemonade.common",
 16 |         "lemonade.tools",
 17 |         "lemonade.tools.huggingface",
 18 |         "lemonade.tools.oga",
 19 |         "lemonade.tools.llamacpp",
 20 |         "lemonade.tools.quark",
 21 |         "lemonade.tools.report",
 22 |         "lemonade.tools.server.utils",
 23 |         "lemonade.tools.server",
 24 |         "lemonade_install",
 25 |         "lemonade_server",
 26 |     ],
 27 |     install_requires=[
 28 |         # Minimal dependencies required for end-users who are running
 29 |         # apps deployed on Lemonade SDK
 30 |         "invoke>=2.0.0",
 31 |         "onnx>=1.11.0,<1.18.0",
 32 |         "pyyaml>=5.4",
 33 |         "typeguard>=2.3.13",
 34 |         "packaging>=20.9",
 35 |         # Necessary until upstream packages account for the breaking
 36 |         # change to numpy
 37 |         "numpy<2.0.0",
 38 |         "fasteners",
 39 |         "GitPython>=3.1.40",
 40 |         "psutil>=6.1.1",
 41 |         "wmi",
 42 |         "py-cpuinfo",
 43 |         "pytz",
 44 |         "zstandard",
 45 |         "fastapi",
 46 |         "uvicorn[standard]",
 47 |         "openai>=1.81.0",
 48 |         "transformers<=4.51.3",
 49 |         "jinja2",
 50 |         "tabulate",
 51 |         "sentencepiece",
 52 |         "huggingface-hub==0.33.0",
 53 |     ],
 54 |     extras_require={
 55 |         # The non-dev extras are meant to deploy specific backends into end-user
 56 |         # applications, without including developer-focused tools
 57 |         "oga-hybrid": [
 58 |             # Note: `lemonade-install --ryzenai hybrid` is necessary
 59 |             # to complete installation
 60 |             "onnx==1.16.1",
 61 |             "numpy==1.26.4",
 62 |             "protobuf>=6.30.1",
 63 |         ],
 64 |         "oga-cpu": [
 65 |             "onnxruntime-genai==0.8.2",
 66 |             "onnxruntime >=1.22.0",
 67 |         ],
 68 |         # Developer-focused tools for benchmarking, accuracy testing, and
 69 |         # model preparation (ONNX export, quantization, device-specific optimization, etc.)
 70 |         "dev": [
 71 |             # Minimal dependencies for developers to use all features of
 72 |             # Lemonade SDK, including building and optimizing models
 73 |             "torch>=2.6.0",
 74 |             "accelerate",
 75 |             "datasets",
 76 |             "pandas>=1.5.3",
 77 |             "matplotlib",
 78 |             # Install human-eval from a forked repo with Windows support until the
 79 |             # PR (https://github.com/openai/human-eval/pull/53) is merged
 80 |             "human-eval-windows==1.0.4",
 81 |             "lm-eval[api]",
 82 |         ],
 83 |         # Keep backwards compatibility for old extras names
 84 |         "oga-hybrid-minimal": ["lemonade-sdk[oga-hybrid]"],
 85 |         "oga-cpu-minimal": ["lemonade-sdk[oga-cpu]"],
 86 |         "llm": ["lemonade-sdk[dev]"],
 87 |         "llm-oga-cpu": ["lemonade-sdk[dev,oga-cpu]"],
 88 |         # The following extras are deprecated and/or not commonly used
 89 |         "llm-oga-igpu": [
 90 |             "onnxruntime-genai-directml==0.6.0",
 91 |             "onnxruntime-directml>=1.19.0,<1.22.0",
 92 |             "transformers<4.45.0",
 93 |             "lemonade-sdk[dev]",
 94 |         ],
 95 |         "llm-oga-cuda": [
 96 |             "onnxruntime-genai-cuda==0.8.2",
 97 |             "onnxruntime-gpu>=1.22.0",
 98 |             "transformers<=4.51.3",
 99 |             "lemonade-sdk[dev]",
100 |         ],
101 |         "llm-oga-npu": [
102 |             "onnx==1.16.0",
103 |             # NPU requires specific onnxruntime version for Ryzen AI compatibility
104 |             # This may conflict with other OGA extras that require >=1.22.0
105 |             "onnxruntime==1.18.0",
106 |             "numpy==1.26.4",
107 |             "protobuf>=6.30.1",
108 |             "lemonade-sdk[dev]",
109 |         ],
110 |         "llm-oga-hybrid": ["lemonade-sdk[dev,oga-hybrid]"],
111 |         "llm-oga-unified": ["lemonade-sdk[llm-oga-hybrid]"],
112 |     },
113 |     classifiers=[],
114 |     entry_points={
115 |         "console_scripts": [
116 |             "lemonade=lemonade:lemonadecli",
117 |             "lemonade-install=lemonade_install:installcli",
118 |             "lemonade-server-dev=lemonade_server.cli:main",
119 |         ]
120 |     },
121 |     python_requires=">=3.10, <3.13",
122 |     long_description=open("README.md", "r", encoding="utf-8").read(),
123 |     long_description_content_type="text/markdown",
124 |     include_package_data=True,
125 |     package_data={
126 |         "lemonade_server": ["server_models.json"],
127 |         "lemonade": ["tools/server/static/*"],
128 |     },
129 | )
130 | 
131 | # This file was originally licensed under Apache 2.0. It has been modified.
132 | # Modifications Copyright (c) 2025 AMD
133 | 
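A note on how the extras above compose (illustrative pip commands, not part of setup.py): backend extras can be combined with the dev extra in a single install, and the legacy names are plain aliases of the new ones.

    pip install lemonade-sdk                  # minimal runtime dependencies only
    pip install lemonade-sdk[oga-cpu]         # adds the onnxruntime-genai CPU backend
    pip install lemonade-sdk[dev,oga-hybrid]  # developer tools plus the Ryzen AI hybrid backend;
                                              # then run `lemonade-install --ryzenai hybrid` to finish setup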


--------------------------------------------------------------------------------
/src/lemonade/__init__.py:
--------------------------------------------------------------------------------
1 | from lemonade.version import __version__
2 | 
3 | from .state import load_state, State
4 | 
5 | from .cli import main as lemonadecli
6 | 


--------------------------------------------------------------------------------
/src/lemonade/api.py:
--------------------------------------------------------------------------------
  1 | # pylint: disable=no-member
  2 | 
  3 | from typing import Tuple, Dict
  4 | from lemonade.state import State
  5 | import lemonade.common.printing as printing
  6 | import lemonade.cache as cache
  7 | from lemonade.tools.adapter import ModelAdapter, TokenizerAdapter
  8 | from lemonade.common.system_info import (
  9 |     get_system_info_dict,
 10 |     get_device_info_dict,
 11 |     get_system_info as get_system_info_obj,
 12 | )
 13 | 
 14 | 
 15 | class NotSupported(Exception):
 16 |     """
 17 |     Indicates that a checkpoint/recipe pair are not supported
 18 |     together at this time.
 19 |     """
 20 | 
 21 |     def __init__(self, msg):
 22 |         super().__init__(msg)
 23 |         printing.log_error(msg)
 24 | 
 25 | 
 26 | def _raise_not_supported(recipe, checkpoint):
 27 |     raise NotSupported(
 28 |         f"Recipe {recipe} does not have support for checkpoint {checkpoint}"
 29 |     )
 30 | 
 31 | 
 32 | def _make_state(recipe, checkpoint) -> State:
 33 |     return State(cache_dir=cache.DEFAULT_CACHE_DIR, build_name=f"{checkpoint}_{recipe}")
 34 | 
 35 | 
 36 | def from_pretrained(
 37 |     checkpoint: str,
 38 |     recipe: str = "hf-cpu",
 39 | ) -> Tuple[ModelAdapter, TokenizerAdapter]:
 40 |     """
 41 |     Load an LLM and the corresponding tokenizer using a lemonade recipe.
 42 | 
 43 |     Args:
 44 |         - checkpoint: huggingface checkpoint that defines the LLM
 45 |         - recipe: defines the implementation and hardware used for the LLM
 46 | 
 47 |     Recipe choices:
 48 |         - hf-cpu: Huggingface Transformers implementation for CPU with max-perf settings
 49 |         - hf-dgpu: Huggingface Transformers implementation on dGPU (via device="cuda")
 50 |         - oga-cpu: CPU implementation based on onnxruntime-genai
 51 |         - oga-igpu: DirectML implementation for iGPU based on onnxruntime-genai-directml
 52 |         - oga-hybrid: AMD Ryzen AI Hybrid implementation based on onnxruntime-genai
 53 | 
 54 |     Returns:
 55 |         - model: LLM instance with a generate() method that invokes the recipe
 56 |         - tokenizer: tokenizer instance compatible with the model, which supports
 57 |             the encode (call) and decode() methods.
 58 |     """
 59 | 
 60 |     if recipe == "hf-cpu":
 61 |         # Huggingface Transformers recipe for CPU
 62 |         # Huggingface supports all checkpoints, so there is nothing to check for
 63 | 
 64 |         import torch
 65 |         from lemonade.tools.huggingface.load import HuggingfaceLoad
 66 | 
 67 |         state = _make_state(recipe, checkpoint)
 68 | 
 69 |         state = HuggingfaceLoad().run(
 70 |             state,
 71 |             input=checkpoint,
 72 |             dtype=torch.bfloat16,
 73 |         )
 74 | 
 75 |         return state.model, state.tokenizer
 76 | 
 77 |     elif recipe == "hf-dgpu":
 78 |         # Huggingface Transformers recipe for discrete GPU (Nvidia, Instinct, Radeon)
 79 | 
 80 |         import torch
 81 |         from lemonade.tools.huggingface.load import HuggingfaceLoad
 82 | 
 83 |         state = _make_state(recipe, checkpoint)
 84 | 
 85 |         state = HuggingfaceLoad().run(
 86 |             state,
 87 |             input=checkpoint,
 88 |             dtype=torch.bfloat16,
 89 |             device="cuda",
 90 |         )
 91 | 
 92 |         return state.model, state.tokenizer
 93 | 
 94 |     elif recipe.startswith("oga-"):
 95 |         import lemonade.tools.oga.load as oga
 96 | 
 97 |         # Make sure the user chose a supported runtime, e.g., oga-cpu
 98 |         user_backend = recipe.split("oga-")[1]
 99 |         supported_backends = ["cpu", "igpu", "npu", "hybrid"]
100 |         supported_recipes = [f"oga-{backend}" for backend in supported_backends]
101 |         if recipe not in supported_recipes:
102 |             raise NotSupported(
103 |                 "Selected OGA recipe is not supported. "
104 |                 f"The supported OGA recipes are: {supported_recipes}"
105 |             )
106 | 
107 |         backend_to_dtype = {
108 |             "cpu": "int4",
109 |             "igpu": "int4",
110 |             "hybrid": "int4",
111 |             "npu": "int4",
112 |         }
113 | 
114 |         state = _make_state(recipe, checkpoint)
115 | 
116 |         state = oga.OgaLoad().run(
117 |             state,
118 |             input=checkpoint,
119 |             device=user_backend,
120 |             dtype=backend_to_dtype[user_backend],
121 |         )
122 | 
123 |         return state.model, state.tokenizer
124 | 
125 |     else:
126 |         _raise_not_supported(recipe, checkpoint)
127 | 
128 | 
129 | def get_system_info(verbose: bool = False) -> Dict:
130 |     """
131 |     Get comprehensive system information including hardware details and device information.
132 | 
133 |     Returns:
134 |         dict: Complete system information including:
135 |             - Basic system info (OS, processor, memory, BIOS, etc.).
136 |             - Device information (CPU, AMD iGPU, AMD dGPU, NPU).
137 |             - Inference engine availability per device.
138 |             - Python package versions (verbose mode only).
139 |     """
140 | 
141 |     # Get basic system info
142 |     info = get_system_info_dict()
143 | 
144 |     # Add device information
145 |     info["Devices"] = get_device_info_dict()
146 | 
147 |     # Filter out verbose-only information if not in verbose mode
148 |     if not verbose:
149 |         essential_keys = ["OS Version", "Processor", "Physical Memory", "Devices"]
150 |         info = {k: v for k, v in info.items() if k in essential_keys}
151 |     else:
152 |         # In verbose mode, add Python packages at the end
153 |         system_info_obj = get_system_info_obj()
154 |         info["Python Packages"] = system_info_obj.get_python_packages()
155 | 
156 |     return info
157 | 
158 | 
159 | def get_device_info() -> Dict:
160 |     """
161 |     Get device information including CPU, AMD iGPU, AMD dGPU, and NPU details.
162 | 
163 |     Returns:
164 |         dict: Device information including:
165 |             - cpu: CPU details with inference engine availability.
166 |             - amd_igpu: AMD integrated GPU information.
167 |             - amd_dgpu: List of AMD discrete GPU information.
168 |             - npu: NPU information.
169 |     """
170 | 
171 |     return get_device_info_dict()
172 | 
173 | 
174 | # This file was originally licensed under Apache 2.0. It has been modified.
175 | # Modifications Copyright (c) 2025 AMD
176 | 


--------------------------------------------------------------------------------
/src/lemonade/cache.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from datetime import datetime, timezone
 3 | 
 4 | # Allow an environment variable to override the default
 5 | # location for the build cache
 6 | if os.environ.get("LEMONADE_CACHE_DIR"):
 7 |     DEFAULT_CACHE_DIR = os.path.expanduser(os.environ.get("LEMONADE_CACHE_DIR"))
 8 | else:
 9 |     DEFAULT_CACHE_DIR = os.path.join(os.path.expanduser("~"), ".cache", "lemonade")
10 | 
11 | 
12 | def checkpoint_to_model_name(checkpoint_name: str) -> str:
13 |     """
14 |     Get the model's name by stripping the author's name from the checkpoint name
15 |     """
16 | 
17 |     return checkpoint_name.split("/")[1]
18 | 
19 | 
20 | def get_timestamp() -> str:
21 |     """
22 |     Get a timestamp string in the format:
23 |         <year>y_<month>m_<day>d_<hour>h_<minute>m_<second>s
24 |     """
25 |     # Get the current time in UTC
26 |     current_time = datetime.now(timezone.utc)
27 | 
28 |     # Format the timestamp string
29 |     timestamp = current_time.strftime("%Yy_%mm_%dd_%Hh_%Mm_%Ss")
30 |     return timestamp
31 | 
32 | 
33 | def build_name(input_name):
34 |     """
35 |     Name the lemonade build by concatenating these two factors:
36 |         1. Sanitize the input name (typically a model checkpoint name) by
37 |             replacing any `/` characters with `_`.
38 |         2. Timestamp to ensure that builds in the same cache will not
39 |             collide in the same build directory.
40 | 
41 |         If the input_name is a local folder, then we don't know the
42 |         model checkpoint name, so we use "local_model"
43 |     """
44 | 
45 |     if os.path.isdir(input_name):
46 |         input_name_sanitized = "local_model"
47 |     else:
48 |         # Sanitize the input name
49 |         input_name_sanitized = input_name.replace("/", "_")
50 | 
51 |     # Get the formatted timestamp string
52 |     timestamp = get_timestamp()
53 | 
54 |     return f"{input_name_sanitized}_{timestamp}"
55 | 
56 | 
57 | class Keys:
58 |     MODEL = "model"
59 |     PER_ITERATION_LATENCY = "per_iteration_latency"
60 |     MEAN_LATENCY = "mean_latency"
61 |     STD_DEV_LATENCY = "std_dev_latency"
62 |     TOKEN_GENERATION_TOKENS_PER_SECOND = "token_generation_tokens_per_second"
63 |     STD_DEV_TOKENS_PER_SECOND = "std_dev_tokens_per_second"
64 |     SECONDS_TO_FIRST_TOKEN = "seconds_to_first_token"
65 |     PREFILL_TOKENS_PER_SECOND = "prefill_tokens_per_second"
66 |     STD_DEV_SECONDS_TO_FIRST_TOKEN = "std_dev_seconds_to_first_token"
67 |     CHECKPOINT = "checkpoint"
68 |     DTYPE = "dtype"
69 |     PROMPT = "prompt"
70 |     PROMPT_TOKENS = "prompt_tokens"
71 |     PROMPT_TEMPLATE = "prompt_template"
72 |     RESPONSE = "response"
73 |     RESPONSE_TOKENS = "response_tokens"
74 |     RESPONSE_LENGTHS_HISTOGRAM = "response_lengths_histogram"
75 |     CACHE_DIR = "cache_dir"
76 |     DEVICE = "device"
77 |     LOCAL_MODEL_FOLDER = "local_model_folder"
78 |     MEMORY_USAGE_PLOT = "memory_usage_plot"
79 |     MAX_MEMORY_USED_GB = "max_memory_used_GB"
80 |     MAX_MEMORY_USED_GBYTE = "max_memory_used_gbyte"
81 |     RYZEN_AI_VERSION_INFO = "ryzen_ai_version_info"
82 | 
83 | 
84 | # This file was originally licensed under Apache 2.0. It has been modified.
85 | # Modifications Copyright (c) 2025 AMD
86 | 
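For illustration, a sketch of how build_name() and DEFAULT_CACHE_DIR behave (the timestamp shown is made up):

    from lemonade.cache import build_name, DEFAULT_CACHE_DIR

    # "org/model" becomes "org_model_<timestamp>", e.g.
    #   "facebook_opt-125m_2025y_01m_01d_00h_00m_00s"
    # A local folder path becomes "local_model_<timestamp>" instead.
    print(build_name("facebook/opt-125m"))

    # ~/.cache/lemonade unless the LEMONADE_CACHE_DIR environment variable is set
    print(DEFAULT_CACHE_DIR)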


--------------------------------------------------------------------------------
/src/lemonade/cli.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | # pylint: disable=C0413
  4 | # Prevent HF warnings from showing on every import
  5 | os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"
  6 | from lemonade.version import __version__ as version_number
  7 | from lemonade.tools import FirstTool, NiceHelpFormatter
  8 | from lemonade.profilers.memory_tracker import MemoryTracker
  9 | import lemonade.common.filesystem as fs
 10 | import lemonade.common.cli_helpers as cli
 11 | from lemonade.sequence import Sequence
 12 | from lemonade.tools.management_tools import Cache, Version, SystemInfo
 13 | from lemonade.state import State
 14 | 
 15 | from lemonade.tools.huggingface.load import HuggingfaceLoad
 16 | from lemonade.tools.huggingface.bench import HuggingfaceBench
 17 | from lemonade.tools.oga.load import OgaLoad
 18 | from lemonade.tools.oga.bench import OgaBench
 19 | from lemonade.tools.llamacpp.bench import LlamaCppBench
 20 | from lemonade.tools.llamacpp.load import LoadLlamaCpp
 21 | 
 22 | import lemonade.cache as cache
 23 | from lemonade.tools.mmlu import AccuracyMMLU
 24 | from lemonade.tools.humaneval import AccuracyHumaneval
 25 | from lemonade.tools.perplexity import AccuracyPerplexity
 26 | from lemonade.tools.accuracy import LMEvalHarness
 27 | from lemonade.tools.prompt import LLMPrompt
 28 | from lemonade.tools.quark.quark_load import QuarkLoad
 29 | from lemonade.tools.quark.quark_quantize import QuarkQuantize
 30 | from lemonade.tools.report.llm_report import LemonadeReport
 31 | 
 32 | 
 33 | def main():
 34 | 
 35 |     # List the available tools
 36 |     tools = [
 37 |         HuggingfaceLoad,
 38 |         LoadLlamaCpp,
 39 |         LlamaCppBench,
 40 |         AccuracyMMLU,
 41 |         AccuracyHumaneval,
 42 |         AccuracyPerplexity,
 43 |         LMEvalHarness,
 44 |         LLMPrompt,
 45 |         HuggingfaceBench,
 46 |         OgaLoad,
 47 |         OgaBench,
 48 |         QuarkQuantize,
 49 |         QuarkLoad,
 50 |         LemonadeReport,
 51 |         # Inherited from lemonade
 52 |         Cache,
 53 |         Version,
 54 |         SystemInfo,
 55 |     ]
 56 | 
 57 |     # List the available profilers
 58 |     profilers = [MemoryTracker]
 59 | 
 60 |     # Define the argument parser
 61 |     parser = cli.CustomArgumentParser(
 62 |         description=f"""Tools for evaluating and deploying LLMs (v{version_number}).
 63 | 
 64 | Read this to learn the command syntax:
 65 | https://github.com/lemonade-sdk/lemonade/blob/main/docs/README.md""",
 66 |         formatter_class=NiceHelpFormatter,
 67 |     )
 68 | 
 69 |     parser.add_argument(
 70 |         "-i",
 71 |         "--input",
 72 |         help="The input that will be evaluated by the starting tool "
 73 |         "(e.g., huggingface checkpoint)",
 74 |     )
 75 | 
 76 |     parser.add_argument(
 77 |         "-d",
 78 |         "--cache-dir",
 79 |         help="Cache directory where tool results are "
 80 |         f"stored (default: {cache.DEFAULT_CACHE_DIR})",
 81 |         required=False,
 82 |         default=cache.DEFAULT_CACHE_DIR,
 83 |     )
 84 | 
 85 |     for profiler in profilers:
 86 |         profiler.add_arguments_to_parser(parser)
 87 | 
 88 |     global_args, tool_instances, evaluation_tools = cli.parse_tools(
 89 |         parser, tools, cli_name="lemonade"
 90 |     )
 91 | 
 92 |     profiler_instances = [
 93 |         profiler(global_args[profiler.unique_name.replace("-", "_")])
 94 |         for profiler in profilers
 95 |         if global_args.get(profiler.unique_name.replace("-", "_"), None) is not None
 96 |     ]
 97 | 
 98 |     if len(evaluation_tools) > 0:
 99 |         if not issubclass(evaluation_tools[0], FirstTool):
100 |             parser.error(
101 |                 "The first tool in the sequence needs to be one "
102 |                 "of the 'tools that can start a sequence.' Use "
103 |                 "`lemonade -h` to see that list of tools."
104 |             )
105 |         # Run the evaluation tools as a build
106 |         sequence = Sequence(tools=tool_instances, profilers=profiler_instances)
107 | 
108 |         # Forward the selected input to the first tool in the sequence
109 |         first_tool_args = next(iter(sequence.tools.values()))
110 |         first_tool_args.append("--input")
111 |         first_tool_args.append(global_args["input"])
112 | 
113 |         state = State(
114 |             cache_dir=os.path.abspath(global_args["cache_dir"]),
115 |             build_name=cache.build_name(global_args["input"]),
116 |             sequence_info=sequence.info,
117 |         )
118 |         sequence.launch(state)
119 |     else:
120 |         # Run the management tools
121 |         for management_tool, argv in tool_instances.items():
122 |             # Support "~" in the cache_dir argument
123 |             parsed_cache_dir = os.path.expanduser(global_args[fs.Keys.CACHE_DIR])
124 |             management_tool.parse_and_run(parsed_cache_dir, argv)
125 | 
126 | 
127 | if __name__ == "__main__":
128 |     main()
129 | 
130 | # This file was originally licensed under Apache 2.0. It has been modified.
131 | # Modifications Copyright (c) 2025 AMD
132 | 


--------------------------------------------------------------------------------
/src/lemonade/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonade-sdk/lemonade/b86292bf84121d003d40804d68032904901ee1bb/src/lemonade/common/__init__.py


--------------------------------------------------------------------------------
/src/lemonade/common/cli_helpers.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import sys
  3 | from typing import List, Dict, Tuple, Any
  4 | from lemonade.tools import Tool, FirstTool
  5 | import lemonade.common.printing as printing
  6 | from lemonade.tools.management_tools import ManagementTool
  7 | 
  8 | 
  9 | class CustomArgumentParser(argparse.ArgumentParser):
 10 | 
 11 |     def error(self, message):
 12 |         self.print_usage()
 13 |         printing.log_error(message)
 14 |         self.exit(2)
 15 | 
 16 | 
 17 | def _tool_list_help(tools: List[Tool], subclass, exclude=None) -> str:
 18 |     help = ""
 19 | 
 20 |     for tool_class in tools:
 21 |         if exclude and issubclass(tool_class, exclude):
 22 |             continue
 23 |         if issubclass(tool_class, subclass):
 24 |             help = (
 25 |                 help
 26 |                 + f" * {tool_class.unique_name}: {tool_class.parser().short_description}\n"
 27 |             )
 28 | 
 29 |     return help
 30 | 
 31 | 
 32 | def parse_tools(
 33 |     parser: argparse.ArgumentParser, supported_tools: List[Tool], cli_name="lemonade"
 34 | ) -> Tuple[Dict[str, Any], Dict[Tool, List[str]], List[str]]:
 35 |     """
 36 |     Add the help for parsing tools and their args to an ArgumentParser.
 37 | 
 38 |     Then, perform the task of parsing a full CLI command including
 39 |     teasing apart the global arguments and separate tool invocations.
 40 |     """
 41 | 
 42 |     tool_parsers = {tool.unique_name: tool.parser() for tool in supported_tools}
 43 |     tool_classes = {tool.unique_name: tool for tool in supported_tools}
 44 | 
 45 |     # Sort tools into categories and format for the help menu
 46 |     first_tool_choices = _tool_list_help(supported_tools, FirstTool)
 47 |     eval_tool_choices = _tool_list_help(supported_tools, Tool, exclude=FirstTool)
 48 |     mgmt_tool_choices = _tool_list_help(supported_tools, ManagementTool)
 49 | 
 50 |     tools_action = parser.add_argument(
 51 |         "tools",
 52 |         metavar="tool --tool-args [tool --tool-args...]",
 53 |         nargs="?",
 54 |         help=f"""\
 55 | Run `{cli_name} TOOL -h` to learn more about each tool.
 56 | 
 57 | Tools that can start a sequence:
 58 | {first_tool_choices}
 59 | Tools that go into a sequence:
 60 | {eval_tool_choices}
 61 | Management tools:
 62 | {mgmt_tool_choices}""",
 63 |         choices=tool_parsers.keys(),
 64 |     )
 65 | 
 66 |     # run as if "-h" was passed if no parameters are passed
 67 |     if len(sys.argv) == 1:
 68 |         sys.argv.append("-h")
 69 | 
 70 |     # Break sys.argv into categories based on which tools were invoked
 71 |     # Arguments that are passed prior to invoking a tool are categorized as
 72 |     # global arguments that should be used to initialize the state.
 73 |     current_tool = "globals"
 74 |     tools_invoked = {current_tool: []}
 75 |     cmd = sys.argv[1:]
 76 |     while len(cmd):
 77 |         if cmd[0] in tool_parsers.keys():
 78 |             # Make sure each tool was only called once
 79 |             if cmd[0] in tools_invoked.keys():
 80 |                 parser.error(
 81 |                     "A single call to lemonade can only invoke each tool once, "
 82 |                     f"however this call invokes tool {cmd[0]} multiple times."
 83 |                 )
 84 |             current_tool = cmd.pop(0)
 85 |             tools_invoked[current_tool] = []
 86 |         else:
 87 |             tools_invoked[current_tool].append(cmd.pop(0))
 88 | 
 89 |     # Trick argparse into thinking tools was not a positional argument
 90 |     # this helps to avoid an error where an incorrect arg/value pair
 91 |     # can be misinterpreted as the tools positional argument
 92 |     tools_action.option_strings = ["--tools"]
 93 | 
 94 |     # Do one pass of parsing to figure out if -h was used
 95 |     global_args = vars(parser.parse_args(tools_invoked["globals"]))
 96 | 
 97 |     # Remove "tools" from global args because it was just there
 98 |     # as a placeholder
 99 |     global_args.pop("tools")
100 | 
101 |     # Remove globals from the list since it's already been parsed
102 |     tools_invoked.pop("globals")
103 |     evaluation_tools = []
104 |     management_tools = []
105 |     for cmd, argv in tools_invoked.items():
106 |         tool_parsers[cmd].parse_args(argv)
107 | 
108 |         # Keep track of whether the tools are ManagementTool or not,
109 |         # since ManagementTools are mutually exclusive with evaluation
110 |         # tools
111 |         if issubclass(tool_classes[cmd], ManagementTool):
112 |             management_tools.append(cmd)
113 |         else:
114 |             evaluation_tools.append(cmd)
115 | 
116 |     if len(management_tools) > 0 and len(evaluation_tools) > 0:
117 |         parser.error(
118 |             "This call to lemonade invoked both management and "
119 |             "evaluation tools, however each call to lemonade "
120 |             "is only allowed to invoke one or the other. "
121 |             f"Management tools: {management_tools}; "
122 |             f"Evaluation tools: {evaluation_tools}."
123 |         )
124 | 
125 |     if len(management_tools) == 0 and len(evaluation_tools) == 0:
126 |         parser.error(
127 |             "Calls to lemonade are required to call at least "
128 |             "one tool or management tool."
129 |         )
130 | 
131 |     # Convert tool names into Tool instances
132 |     tool_instances = {tool_classes[cmd](): argv for cmd, argv in tools_invoked.items()}
133 |     evaluation_tools = [tool_classes[cmd] for cmd in evaluation_tools]
134 | 
135 |     return global_args, tool_instances, evaluation_tools
136 | 
137 | 
138 | # This file was originally licensed under Apache 2.0. It has been modified.
139 | # Modifications Copyright (c) 2025 AMD
140 | 
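The shape of an invocation as parse_tools() sees it, sketched with placeholder tool names (not real unique_names):

    lemonade -i <checkpoint> --cache-dir ~/.cache/lemonade \
        first-tool --tool-arg value \
        next-tool --other-arg value

Everything before the first recognized tool name is parsed as global arguments; each tool name starts a new argv bucket that is handed to that tool's own parser, and a tool may appear at most once per call.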


--------------------------------------------------------------------------------
/src/lemonade/common/exceptions.py:
--------------------------------------------------------------------------------
 1 | import lemonade.common.printing as printing
 2 | 
 3 | 
 4 | class Error(Exception):
 5 |     """
 6 |     Indicates something has gone wrong while running the tools
 7 |     """
 8 | 
 9 |     def __init__(self, msg):
10 |         super().__init__(msg)
11 |         printing.log_error(msg)
12 | 
13 | 
14 | class CacheError(Error):
15 |     """
16 |     Indicates ambiguous behavior when a build already exists in the cache,
17 |     but the model, inputs, or args have changed, thereby invalidating
18 |     the cached copy of the model.
19 |     """
20 | 
21 | 
22 | class EnvError(Error):
23 |     """
24 |     Indicates to the user that the required tools are not
25 |     available on their PATH.
26 |     """
27 | 
28 | 
29 | class ArgError(Error):
30 |     """
31 |     Indicates to the user that they provided invalid arguments
32 |     """
33 | 
34 | 
35 | class ToolError(Exception):
36 |     """
37 |     Let the user know that something went wrong while
38 |     running a tool.
39 | 
40 |     Note: not overloading __init__() so that the
41 |     attempt to print to stdout isn't captured into
42 |     the Tool's log file.
43 |     """
44 | 
45 | 
46 | class StateError(Exception):
47 |     """
48 |     Raised when something goes wrong with State
49 |     """
50 | 
51 | 
52 | class IntakeError(Exception):
53 |     """
54 |     Let the user know that something went wrong during the
55 |     initial intake process of analyzing a model.
56 |     """
57 | 
58 | 
59 | class IOError(Error):
60 |     """
61 |     Indicates to the user that an input/output operation failed,
62 |     such as trying to open a file.
63 |     """
64 | 
65 | 
66 | class ModelArgError(Error):
67 |     """
68 |     Indicates to the user that values provided to a Model instance method
69 |     were not allowed.
70 |     """
71 | 
72 | 
73 | class ModelRuntimeError(Error):
74 |     """
75 |     Indicates to the user that attempting to invoke a Model instance failed.
76 |     """
77 | 
78 | 
79 | class BenchmarkException(Exception):
80 |     """
81 |     Indicates a failure during benchmarking
82 |     """
83 | 
84 | 
85 | class HardwareError(Error):
86 |     """
87 |     Indicates that the hardware used is faulty or unavailable.
88 |     """
89 | 
90 | 
91 | class SkipBuild(Exception):
92 |     """
93 |     Indicates that an exception is deliberately being raised to skip a build
94 |     """
95 | 
96 | 
97 | # This file was originally licensed under Apache 2.0. It has been modified.
98 | # Modifications Copyright (c) 2025 AMD
99 | 


--------------------------------------------------------------------------------
/src/lemonade/common/network.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from typing import Optional
 3 | import socket
 4 | from huggingface_hub import model_info
 5 | 
 6 | 
 7 | def is_offline():
 8 |     """
 9 |     Check if the system is offline by attempting to connect to huggingface.co.
10 | 
11 |     Returns:
12 |         bool: True if the system is offline (cannot connect to huggingface.co),
13 |               False otherwise.
14 |     """
15 |     if os.environ.get("LEMONADE_OFFLINE"):
16 |         return True
17 |     try:
18 |         socket.gethostbyname("huggingface.co")
19 |         return False
20 |     except socket.gaierror:
21 |         return True
22 | 
23 | 
24 | def get_base_model(checkpoint: str) -> Optional[str]:
25 |     """
26 |     Get the base model information for a given checkpoint from the Hugging Face Hub.
27 |     Will auto-detect if we're offline and skip the network call in that case.
28 | 
29 |     Args:
30 |         checkpoint: The model checkpoint to query
31 | 
32 |     Returns:
33 |         The base model name if found, or None if not found or error occurs
34 |     """
35 |     # Skip network call in offline mode
36 |     if is_offline():
37 |         return None
38 | 
39 |     try:
40 |         info = model_info(checkpoint)
41 |         if info.cardData and "base_model" in info.cardData:
42 |             if info.cardData["base_model"] is not None:
43 |                 # This is a derived model
44 |                 return info.cardData["base_model"]
45 |             else:
46 |                 # This is itself a base model
47 |                 return [checkpoint]
48 |     except Exception:  # pylint: disable=broad-except
49 |         pass
50 |     return None
51 | 
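A short usage sketch of the helpers above (the checkpoint is illustrative):

    import os
    from lemonade.common.network import is_offline, get_base_model

    # os.environ["LEMONADE_OFFLINE"] = "1"  # any non-empty value forces offline behavior

    if not is_offline():
        # Returns the base model listed on the model card, [checkpoint] if the
        # checkpoint is itself a base model, or None if the lookup fails
        print(get_base_model("facebook/opt-125m"))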


--------------------------------------------------------------------------------
/src/lemonade/common/printing.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import re
  3 | import enum
  4 | import sys
  5 | import math
  6 | 
  7 | 
  8 | class Colors:
  9 |     HEADER = "\033[95m"
 10 |     OKBLUE = "\033[94m"
 11 |     OKCYAN = "\033[96m"
 12 |     OKGREEN = "\033[92m"
 13 |     WARNING = "\033[93m"
 14 |     FAIL = "\033[91m"
 15 |     ENDC = "\033[0m"
 16 |     BOLD = "\033[1m"
 17 |     UNDERLINE = "\033[4m"
 18 | 
 19 | 
 20 | def log(txt, c=Colors.ENDC, end="", is_error=False):
 21 |     logn(txt, c=c, end=end, is_error=is_error)
 22 | 
 23 | 
 24 | def logn(txt, c=Colors.ENDC, end="\n", is_error=False):
 25 |     file = sys.stderr if is_error else sys.stdout
 26 |     print(c + txt + Colors.ENDC, end=end, flush=True, file=file)
 27 | 
 28 | 
 29 | class LogType(enum.Enum):
 30 |     ERROR = "Error:"
 31 |     SUCCESS = "Woohoo!"
 32 |     WARNING = "Warning:"
 33 |     INFO = "Info:"
 34 | 
 35 | 
 36 | def clean_print(type: LogType, msg):
 37 |     # Replace the path to the user's home directory with a tilde symbol (~)
 38 |     home_directory = os.path.expanduser("~")
 39 |     home_directory_escaped = re.escape(home_directory)
 40 |     msg = re.sub(home_directory_escaped, "~", msg)
 41 | 
 42 |     # Split message into list, remove leading spaces and line breaks
 43 |     msg = msg.split("\n")
 44 |     msg = [line.lstrip() for line in msg]
 45 |     while msg[0] == "" and len(msg) > 1:
 46 |         msg.pop(0)
 47 | 
 48 |     # Print message
 49 |     indentation = len(type.value) + 1
 50 |     if type == LogType.ERROR:
 51 |         log(f"\n{type.value} ".rjust(indentation), c=Colors.FAIL, is_error=True)
 52 |     elif type == LogType.SUCCESS:
 53 |         log(f"\n{type.value} ".rjust(indentation), c=Colors.OKGREEN)
 54 |     elif type == LogType.WARNING:
 55 |         log(f"\n{type.value} ".rjust(indentation), c=Colors.WARNING)
 56 |     elif type == LogType.INFO:
 57 |         log(f"\n{type.value} ".rjust(indentation), c=Colors.OKCYAN)
 58 | 
 59 |     is_error = type == LogType.ERROR
 60 |     for line_idx, line in enumerate(msg):
 61 |         if line_idx != 0:
 62 |             log(" " * indentation)
 63 |         s_line = line.split("**")
 64 |         for idx, l in enumerate(s_line):
 65 |             c = Colors.ENDC if idx % 2 == 0 else Colors.BOLD
 66 |             if idx != len(s_line) - 1:
 67 |                 log(l, c=c, is_error=is_error)
 68 |             else:
 69 |                 logn(l, c=c, is_error=is_error)
 70 | 
 71 | 
 72 | def log_error(msg):
 73 |     clean_print(LogType.ERROR, str(msg))
 74 |     # ASCII art credit:
 75 |     # https://textart4u.blogspot.com/2014/05/the-fail-whale-ascii-art-code.html
 76 |     logn(
 77 |         """\n▄██████████████▄▐█▄▄▄▄█▌
 78 | ██████▌▄▌▄▐▐▌███▌▀▀██▀▀
 79 | ████▄█▌▄▌▄▐▐▌▀███▄▄█▌
 80 | ▄▄▄▄▄██████████████\n\n""",
 81 |         is_error=True,
 82 |     )
 83 | 
 84 | 
 85 | def log_success(msg):
 86 |     clean_print(LogType.SUCCESS, msg)
 87 | 
 88 | 
 89 | def log_warning(msg):
 90 |     clean_print(LogType.WARNING, msg)
 91 | 
 92 | 
 93 | def log_info(msg):
 94 |     clean_print(LogType.INFO, msg)
 95 | 
 96 | 
 97 | def list_table(list, padding=25, num_cols=4):
 98 |     lines_per_column = int(math.ceil(len(list) / num_cols))
 99 |     for i in range(lines_per_column):
100 |         for col in range(num_cols):
101 |             if i + col * lines_per_column < len(list):
102 |                 print(
103 |                     list[i + col * lines_per_column].ljust(padding),
104 |                     end="",
105 |                 )
106 |         print("\n\t", end="")
107 | 
108 | 
109 | # This file was originally licensed under Apache 2.0. It has been modified.
110 | # Modifications Copyright (c) 2025 AMD
111 | 
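A small usage sketch of the logging helpers above:

    import lemonade.common.printing as printing

    # Text wrapped in ** prints in bold, and the current user's home directory
    # path, if present in the message, is collapsed to "~"
    printing.log_warning("Could not find **state.yaml** in the cache directory")
    printing.log_success("Build completed")
    printing.log_info("Results saved")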


--------------------------------------------------------------------------------
/src/lemonade/common/test_helpers.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import shutil
 3 | 
 4 | 
 5 | def create_test_dir(
 6 |     key: str,
 7 |     base_dir: str = os.path.dirname(os.path.abspath(__file__)),
 8 | ):
 9 |     # Define paths to be used
10 |     cache_dir = os.path.join(base_dir, "generated", f"{key}_cache_dir")
11 |     corpus_dir = os.path.join(base_dir, "generated", "test_corpus")
12 | 
13 |     # Delete folders if they exist, then recreate the corpus directory
14 |     if os.path.isdir(cache_dir):
15 |         shutil.rmtree(cache_dir)
16 |     if os.path.isdir(corpus_dir):
17 |         shutil.rmtree(corpus_dir)
18 |     os.makedirs(corpus_dir, exist_ok=True)
19 | 
20 |     return cache_dir, corpus_dir
21 | 
22 | 
23 | def strip_dot_py(test_script_file: str) -> str:
24 |     return test_script_file.split(".")[0]
25 | 
26 | 
27 | # This file was originally licensed under Apache 2.0. It has been modified.
28 | # Modifications Copyright (c) 2025 AMD
29 | 


--------------------------------------------------------------------------------
/src/lemonade/profilers/__init__.py:
--------------------------------------------------------------------------------
1 | from .profiler import Profiler
2 | 


--------------------------------------------------------------------------------
/src/lemonade/profilers/profiler.py:
--------------------------------------------------------------------------------
 1 | import abc
 2 | 
 3 | 
 4 | class Profiler(abc.ABC):
 5 | 
 6 |     unique_name: str
 7 | 
 8 |     def __init__(self, parser_arg_value=None):
 9 |         self.parser_arg_value = parser_arg_value
10 |         # Statistics that will be displayed to the CLI user
11 |         self.status_stats = []
12 | 
13 |     @staticmethod
14 |     @abc.abstractmethod
15 |     def add_arguments_to_parser(parser):
16 |         """
17 |         Adds the argument parsing for this tool to the parser.
18 |         Uses f"--{self.unique_name}" as the argument.
19 |         """
20 | 
21 |     @abc.abstractmethod
22 |     def start(self, build_dir):
23 |         """
24 |         This method is called prior to the tool sequence starting.
25 |         This informs the profiler to start gathering data.
26 |         The build directory can be used to store profiling data.
27 |         """
28 | 
29 |     def tool_starting(self, tool_name):
30 |         """
31 |         This method is called to inform the profiler of the name of the tool that is about to start.
32 |         """
33 | 
34 |     def tool_stopping(self):
35 |         """
36 |         This method is called to inform the profiler that the tool has finished.
37 |         """
38 | 
39 |     def stop(self):
40 |         """
41 |         This method is called when the tool sequence has finished.
42 |         This informs the profiler to stop gathering data.
43 |         """
44 | 
45 |     @abc.abstractmethod
46 |     def generate_results(self, state, timestamp, start_times):
47 |         """
48 |         This method is called so that the profiler can create its output files.
49 |         The state is passed so that build info can be gathered and stats can be written.
50 |         The timestamp can be used for filename in current working directory.
51 |         The start times parameter is a dict with the keys being the tool names and
52 |         the values being the time the tool started.  There is an initial "warmup" key
53 |         that has a start time before the first tool and a "cool down" key that contains the
54 |         time when the last tool ended.
55 |         """
56 | 
57 | 
58 | # Copyright (c) 2025 AMD
59 | 
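As a sketch of what a concrete profiler might look like, here is a hypothetical WallClockProfiler that only records total wall-clock time (the class and its stat key are made up for illustration):

    import time
    from lemonade.profilers import Profiler

    class WallClockProfiler(Profiler):
        # Hypothetical profiler that records total wall-clock time only
        unique_name = "wall-clock"

        @staticmethod
        def add_arguments_to_parser(parser):
            parser.add_argument("--wall-clock", action="store_true")

        def start(self, build_dir):
            self._start_time = time.perf_counter()

        def generate_results(self, state, timestamp, start_times):
            elapsed = time.perf_counter() - self._start_time
            state.save_stat("wall_clock_seconds", elapsed)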


--------------------------------------------------------------------------------
/src/lemonade/state.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | from typing import Dict, Optional, Any
  4 | import yaml
  5 | import lemonade.common.build as build
  6 | import lemonade.common.filesystem as fs
  7 | from lemonade.version import __version__ as lemonade_version
  8 | 
  9 | 
 10 | def _is_nice_to_write(value):
 11 |     """
 12 |     Checks whether a value is nice to write to YAML.
 13 |     Returns True if the value is a string, int, float, bool, list, dict, or tuple.
 14 |     Returns False otherwise.
 15 |     """
 16 |     if isinstance(value, (str, int, float, bool)):
 17 |         return True
 18 |     elif isinstance(value, list) or isinstance(value, tuple):
 19 |         # Check if all elements in the list are nice to write
 20 |         return all(_is_nice_to_write(item) for item in value)
 21 |     elif isinstance(value, dict):
 22 |         # Check if all values in the dictionary are nice to write
 23 |         return all(_is_nice_to_write(item) for item in value.values())
 24 |     return False
 25 | 
 26 | 
 27 | def _sanitize_for_yaml(input_dict: Dict) -> Dict:
 28 |     """
 29 |     Creates a new dictionary containing only nice-to-write values
 30 |     from the original dictionary.
 31 |     """
 32 |     result = {}
 33 |     for key, value in input_dict.items():
 34 |         if _is_nice_to_write(value):
 35 |             result[key] = value
 36 |     return result
 37 | 
 38 | 
 39 | class State:
 40 |     """
 41 |     The State class is meant to carry build state, starting with the user's
 42 |     initial arguments, through each build Tool in the Sequence, and finally
 43 |     to the disk, where it is used to assess cache hits.
 44 | 
 45 |     State is initialized with the key members that are shared by every build,
 46 |     and reasonable default values are assigned as appropriate.
 47 | 
 48 |     Tool developers can also add any members they wish. To get or set an
 49 |     attribute, reference it as an attribute:
 50 |         1. get: `my_variable = state.attribute_name`
 51 |         2. set: `state.attribute_name = my_variable`
 52 | 
 53 |     Build State can be saved and loaded from disk in the form of a state.yaml file
 54 |     via State.save() and load_state(), respectively. Note that while State can
 55 |     contain members of any type, only YAML-safe members (str, int, bool, float,
 56 |     list, dict, tuple) will be saved and loaded.
 57 |     """
 58 | 
 59 |     def __init__(
 60 |         self,
 61 |         cache_dir: str,
 62 |         build_name: Optional[str] = None,
 63 |         sequence_info: Dict[str, Dict] = None,
 64 |         **kwargs,
 65 |     ):
 66 | 
 67 |         # The default build name is the name of the python file that is being run
 68 |         if build_name is None:
 69 |             build_name = os.path.basename(sys.argv[0])
 70 | 
 71 |         # Support "~" in the cache_dir argument
 72 |         parsed_cache_dir = os.path.expanduser(cache_dir)
 73 | 
 74 |         # Save settings as State members
 75 |         self.cache_dir = parsed_cache_dir
 76 |         self.build_name = build_name
 77 |         self.sequence_info = sequence_info
 78 |         self.lemonade_version = lemonade_version
 79 |         self.build_status = build.FunctionStatus.NOT_STARTED
 80 |         self.downcast_applied = False
 81 |         self.uid = build.unique_id()
 82 |         self.results = None
 83 | 
 84 |         # Store any additional kwargs as members
 85 |         for key, value in kwargs.items():
 86 |             self.__dict__[key] = value
 87 | 
 88 |     def __setattr__(self, name: str, value: Any) -> None:
 89 |         """
 90 |         Tool developers can add a new member to State by simply
 91 |         assigning it as an attribute, i.e., `state.new_member = value`.
 92 |         """
 93 |         return super().__setattr__(name, value)
 94 | 
 95 |     def save_stat(self, key: str, value):
 96 |         """
 97 |         Save statistics to a yaml file in the build directory
 98 |         """
 99 | 
100 |         stats = fs.Stats(self.cache_dir, self.build_name)
101 |         stats.save_stat(key, value)
102 | 
103 |     def save_sub_stat(self, parent_key: str, key: str, value):
104 |         """
105 |         Save statistics to a yaml file in the build directory
106 |         """
107 | 
108 |         stats = fs.Stats(self.cache_dir, self.build_name)
109 |         stats.save_sub_stat(parent_key, key, value)
110 | 
111 |     def save(self):
112 |         """
113 |         Save all YAML-friendly members to disk as a state.yaml file.
114 | 
115 |         Note that `model` and `inputs` will typically not be saved since
116 |         they are typically in non-YAML-friendly types such as `torch.nn.Module`
117 |         and `torch.tensor`.
118 |         """
119 | 
120 |         state_to_save = _sanitize_for_yaml(vars(self))
121 | 
122 |         # Create a build directory in the cache
123 |         fs.make_build_dir(self.cache_dir, self.build_name)
124 | 
125 |         with open(
126 |             build.state_file(self.cache_dir, self.build_name),
127 |             "w",
128 |             encoding="utf8",
129 |         ) as outfile:
130 |             yaml.dump(state_to_save, outfile)
131 | 
132 | 
133 | def load_state(
134 |     cache_dir=None,
135 |     build_name=None,
136 |     state_path=None,
137 | ) -> State:
138 |     """
139 |     Read a state.yaml file corresponding to a specific build in a specific
140 |     cache, and use its contents to initialize a State instance.
141 |     """
142 | 
143 |     if state_path is not None:
144 |         file_path = state_path
145 |     elif build_name is not None and cache_dir is not None:
146 |         file_path = build.state_file(cache_dir, build_name)
147 |     else:
148 |         raise ValueError(
149 |             "This function requires either state_path to be set, or both "
150 |             "build_name and cache_dir to be set."
151 |         )
152 | 
153 |     state_dict = build.load_yaml(file_path)
154 | 
155 |     return State(**state_dict)
156 | 
157 | 
158 | # This file was originally licensed under Apache 2.0. It has been modified.
159 | # Modifications Copyright (c) 2025 AMD
160 | 
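A brief round-trip sketch of the State API described above:

    from lemonade.state import State, load_state

    state = State(cache_dir="~/.cache/lemonade", build_name="example_build")

    # Tool developers can attach arbitrary members; only YAML-friendly values
    # (str, int, float, bool, list, dict, tuple) survive save()
    state.my_metric = 42.0
    state.save()

    reloaded = load_state(cache_dir=state.cache_dir, build_name="example_build")
    print(reloaded.my_metric)  # 42.0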


--------------------------------------------------------------------------------
/src/lemonade/tools/__init__.py:
--------------------------------------------------------------------------------
1 | from .tool import Tool, FirstTool, NiceHelpFormatter
2 | 


--------------------------------------------------------------------------------
/src/lemonade/tools/adapter.py:
--------------------------------------------------------------------------------
  1 | import abc
  2 | 
  3 | 
  4 | class ModelAdapter(abc.ABC):
  5 |     """
  6 |     Base class for adapting an LLM to work with lemonade's standardized tools
  7 |     """
  8 | 
  9 |     def __init__(self):
 10 |         """
 11 |         Self-benchmarking ModelAdapters can store their results in the
 12 |         tokens_per_second and time_to_first_token members.
 13 |         """
 14 |         self.tokens_per_second = None
 15 |         self.time_to_first_token = None
 16 |         self.type = "generic"
 17 | 
 18 |     @abc.abstractmethod
 19 |     def generate(self, input_ids, max_new_tokens=512):
 20 |         """
 21 |         Generate is the primary method required by lemonade's accuracy tools
 22 | 
 23 |         We try to keep the signature here minimal to allow for maximum compatibility
 24 |         with recipe components, which themselves may not support a lot of arguments.
 25 |         """
 26 | 
 27 | 
 28 | class TokenizerAdapter(abc.ABC):
 29 |     """
 30 |     Base class for adapting an LLM's tokenizer to work with lemonade's standard tools
 31 |     """
 32 | 
 33 |     def __init__(self, tokenizer=None):
 34 |         self.auto_tokenizer = tokenizer
 35 | 
 36 |     @abc.abstractmethod
 37 |     def __call__(self, prompt: str):
 38 |         """
 39 |         Args:
 40 |             prompt: text that should be encoded and passed to the LLM as input_ids
 41 | 
 42 |         Returns: input_ids
 43 |         """
 44 | 
 45 |     @abc.abstractmethod
 46 |     def decode(self, response) -> str:
 47 |         """
 48 |         Args:
 49 |             response: tokens from the LLM that should be decoded into text
 50 | 
 51 |         Returns: text response of the LLM
 52 |         """
 53 | 
 54 |     def apply_chat_template(self, *args, **kwargs):
 55 |         """
 56 |         Convert messages into a single tokenizable string
 57 |         """
 58 |         return self.auto_tokenizer.apply_chat_template(*args, **kwargs)
 59 | 
 60 |     @property
 61 |     def chat_template(self):
 62 |         return self.auto_tokenizer.chat_template
 63 | 
 64 |     @property
 65 |     def eos_token(self):
 66 |         return self.auto_tokenizer.eos_token
 67 | 
 68 | 
 69 | class PassthroughTokenizerResult:
 70 |     """
 71 |     Data structure for holding a tokenizer result where the input_ids
 72 |     are packaged in a non-standard way, but we still want to adhere to
 73 |     standard interfaces (e.g., result.input_ids).
 74 | 
 75 |     For example: CLI-based tools that have their own internal tokenizer that
 76 |     isn't exposed to the user. In this case we can pass the prompt through as
 77 |     a string.
 78 |     """
 79 | 
 80 |     def __init__(self, prompt):
 81 |         self.input_ids = prompt
 82 | 
 83 | 
 84 | class PassthroughTokenizer(TokenizerAdapter):
 85 |     """
 86 |     Tokenizer adapter that forwards the prompt to input_ids as text,
 87 |     and then forwards a text LLM response through decode() as text.
 88 | 
 89 |     Useful for CLI-based tools that have their own internal tokenizer that
 90 |     isn't exposed to the user.
 91 |     """
 92 | 
 93 |     # pylint: disable=unused-argument
 94 |     def __call__(self, prompt: str, **kwargs):
 95 |         return PassthroughTokenizerResult(prompt)
 96 | 
 97 |     # pylint: disable=unused-argument
 98 |     def decode(self, response: str, **kwargs):
 99 |         return response
100 | 
101 | 
102 | # This file was originally licensed under Apache 2.0. It has been modified.
103 | # Modifications Copyright (c) 2025 AMD
104 | 
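A minimal sketch of wiring the adapters together (EchoModel is hypothetical and exists only to illustrate the interfaces):

    from lemonade.tools.adapter import ModelAdapter, PassthroughTokenizer

    class EchoModel(ModelAdapter):
        # Hypothetical adapter: "generation" just repeats the prompt
        def generate(self, input_ids, max_new_tokens=512):
            # With PassthroughTokenizer, input_ids is simply the prompt string
            return [input_ids]

    tokenizer = PassthroughTokenizer()
    model = EchoModel()

    input_ids = tokenizer("hello").input_ids
    print(tokenizer.decode(model.generate(input_ids)[0]))  # prints "hello"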


--------------------------------------------------------------------------------
/src/lemonade/tools/oga/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonade-sdk/lemonade/b86292bf84121d003d40804d68032904901ee1bb/src/lemonade/tools/oga/__init__.py


--------------------------------------------------------------------------------
/src/lemonade/tools/oga/bench.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import statistics
  3 | from statistics import StatisticsError
  4 | from lemonade.state import State
  5 | from lemonade.cache import Keys
  6 | from lemonade.tools.adapter import ModelAdapter, TokenizerAdapter
  7 | from lemonade.tools.bench import Bench
  8 | 
  9 | 
 10 | class OgaBench(Bench):
 11 |     """
 12 |     Benchmark any model that adheres to the ModelAdapter interface.
 13 | 
 14 |     Required input state:
 15 |         - MODEL: model instance to benchmark.
 16 |         - TOKENIZER: tokenizer instance used to generate inputs for the model.
 17 | 
 18 |     Output state produced: None
 19 |     """
 20 | 
 21 |     unique_name = "oga-bench"
 22 | 
 23 |     def __init__(self):
 24 |         super().__init__()
 25 | 
 26 |         # Additional statistics generated by this bench tool
 27 |         self.status_stats.insert(
 28 |             self.status_stats.index(Keys.TOKEN_GENERATION_TOKENS_PER_SECOND) + 1,
 29 |             Keys.STD_DEV_TOKENS_PER_SECOND,
 30 |         )
 31 |         self.std_dev_token_generation_tokens_per_second_list = []
 32 | 
 33 |     @staticmethod
 34 |     def parser(add_help: bool = True) -> argparse.ArgumentParser:
 35 |         parser = __class__.helpful_parser(
 36 |             short_description="Benchmark an LLM in onnxruntime-genai (OGA)",
 37 |             add_help=add_help,
 38 |         )
 39 | 
 40 |         parser = Bench.parser(parser)
 41 | 
 42 |         return parser
 43 | 
 44 |     def get_prompt_str(self, state, token_length):
 45 |         """
 46 |         Returns a string with the prescribed token length.
 47 |         """
 48 |         tokenizer: TokenizerAdapter = state.tokenizer
 49 |         test_prompt = "word " * (token_length - 1)
 50 |         input_ids = tokenizer(test_prompt, return_tensors="pt").input_ids
 51 |         test_token_length = len(input_ids)
 52 |         delta = test_token_length - token_length
 53 |         if delta == 0:
 54 |             return test_prompt
 55 |         return "word " * max(token_length - 1 - delta, 0)
 56 | 
 57 |     def run_prompt(
 58 |         self,
 59 |         state: State,
 60 |         report_progress_fn,
 61 |         prompt: str,
 62 |         iterations: int,
 63 |         warmup_iterations: int,
 64 |         output_tokens: int,
 65 |     ) -> State:
 66 | 
 67 |         model: ModelAdapter = state.model
 68 |         tokenizer: TokenizerAdapter = state.tokenizer
 69 | 
 70 |         input_ids = tokenizer(prompt, return_tensors="pt").input_ids
 71 |         self.input_ids_len_list.append(len(input_ids))
 72 |         per_iteration_time_to_first_token = []
 73 |         per_iteration_tokens_per_second = []
 74 | 
 75 |         # Don't capture time for warmup
 76 |         for count in range(warmup_iterations):
 77 |             outputs = model.generate(input_ids, max_new_tokens=output_tokens)
 78 |             self.tokens_out_len_list.append(len(outputs[0]) - len(input_ids))
 79 |             report_progress_fn((count + 1) / (warmup_iterations + iterations))
 80 | 
 81 |         for count in range(iterations):
 82 |             outputs = model.generate(
 83 |                 input_ids,
 84 |                 max_new_tokens=output_tokens,
 85 |                 min_new_tokens=output_tokens,
 86 |             )
 87 |             report_progress_fn(
 88 |                 (warmup_iterations + count + 1) / (warmup_iterations + iterations)
 89 |             )
 90 | 
 91 |             token_len = len(outputs[0]) - len(input_ids)
 92 |             self.tokens_out_len_list.append(token_len)
 93 | 
 94 |             # Only count an iteration if it produced enough tokens
 95 |             if token_len >= output_tokens:
 96 |                 per_iteration_time_to_first_token.append(model.time_to_first_token)
 97 |                 per_iteration_tokens_per_second.append(model.tokens_per_second)
 98 | 
 99 |         if not per_iteration_time_to_first_token or not per_iteration_tokens_per_second:
100 |             raise Bench.not_enough_tokens(output_tokens)
101 | 
102 |         mean_time_to_first_token = statistics.mean(per_iteration_time_to_first_token)
103 |         self.mean_time_to_first_token_list.append(mean_time_to_first_token)
104 |         self.prefill_tokens_per_second_list.append(
105 |             len(input_ids) / mean_time_to_first_token
106 |         )
107 |         self.token_generation_tokens_per_second_list.append(
108 |             statistics.mean(per_iteration_tokens_per_second)
109 |         )
110 |         try:
111 |             self.std_dev_time_to_first_token_list.append(
112 |                 statistics.stdev(per_iteration_time_to_first_token)
113 |             )
114 |         except StatisticsError:
115 |             # Less than 2 measurements
116 |             self.std_dev_time_to_first_token_list.append(None)
117 |         try:
118 |             self.std_dev_token_generation_tokens_per_second_list.append(
119 |                 statistics.stdev(per_iteration_tokens_per_second)
120 |             )
121 |         except StatisticsError:
122 |             # Less than 2 measurements
123 |             self.std_dev_token_generation_tokens_per_second_list.append(None)
124 | 
125 |     def save_stats(self, state):
126 |         super().save_stats(state)
127 | 
128 |         # Save additional statistics
129 |         if not all(
130 |             element is None
131 |             for element in self.std_dev_token_generation_tokens_per_second_list
132 |         ):
133 |             state.save_stat(
134 |                 Keys.STD_DEV_TOKENS_PER_SECOND,
135 |                 self.get_item_or_list(
136 |                     self.std_dev_token_generation_tokens_per_second_list
137 |                 ),
138 |             )
139 | 
140 | 
141 | # This file was originally licensed under Apache 2.0. It has been modified.
142 | # Modifications Copyright (c) 2025 AMD
143 | 


--------------------------------------------------------------------------------
/src/lemonade/tools/perplexity.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import argparse
  3 | from lemonade.state import State
  4 | from lemonade.tools import Tool
  5 | import lemonade.common.printing as printing
  6 | import lemonade.common.build as build
  7 | 
  8 | 
  9 | class AccuracyPerplexity(Tool):
 10 |     """
 11 |     Measure perplexity of an LLM using the Wikitext-2 dataset.
 12 | 
 13 |     Required input state:
 14 |         - state.model: instance that provides a __call__() method that returns
 15 |         output.logits and supports model.config.max_position_embeddings
 16 |         - state.tokenizer: instance of Hugging Face PretrainedTokenizer
 17 | 
 18 |     Output state produced: None
 19 | 
 20 |     See docs/dev_cli/perplexity.md for more details.
 21 |     """
 22 | 
 23 |     unique_name = "accuracy-perplexity"
 24 | 
 25 |     def __init__(self):
 26 |         super().__init__(monitor_message="Measuring perplexity")
 27 | 
 28 |     @staticmethod
 29 |     def parser(add_help: bool = True) -> argparse.ArgumentParser:
 30 |         parser = __class__.helpful_parser(
 31 |             short_description="Measure perplexity score",
 32 |             add_help=add_help,
 33 |         )
 34 |         return parser
 35 | 
 36 |     def run(
 37 |         self,
 38 |         state: State,
 39 |     ) -> State:
 40 | 
 41 |         import pandas as pd
 42 |         import torch
 43 |         from datasets import load_dataset
 44 | 
 45 |         try:
 46 |             printing.log_info("Downloading dataset ...")
 47 |             dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
 48 |         except Exception as e:  # pylint: disable=broad-except
 49 |             printing.log_error(f"Error during dataset load: {e}")
 50 |             raise e
 51 | 
 52 |         tokenizer = state.tokenizer
 53 |         model = state.model
 54 |         # Tokenize the entire test dataset text, joining entries with double new lines
 55 |         encodings = tokenizer("\n\n".join(dataset["text"]), return_tensors="pt")
 56 | 
 57 |         # Retrieve the maximum input length that the model can handle
 58 |         try:
 59 |             max_length = model.config.max_position_embeddings
 60 |         except AttributeError:
 61 |             # Some LLMs do not have the config.max_position_embeddings attribute
 62 |             # However, most LLMs support at least 2048 context length, so this
 63 |             # try-except will allow a few more LLMs to work
 64 |             max_length = 2048
 65 |         # Set stride to half of the maximum input length for overlapping window processing
 66 |         # Refer to docs/dev_cli/perplexity.md for more information on sliding window
 67 |         stride = max_length // 2
 68 |         # Determine the total sequence length of the tokenized input
 69 |         seq_len = encodings.input_ids.size(1)
 70 | 
 71 |         negative_log_likelihoods = []
 72 |         summary_data = []
 73 |         prev_end_location = 0
 74 | 
 75 |         model_results_dir = os.path.join(
 76 |             build.output_dir(state.cache_dir, state.build_name), "perplexity"
 77 |         )
 78 | 
 79 |         for begin_location in range(0, seq_len, stride):
 80 |             end_location = min(begin_location + max_length, seq_len)
 81 |             target_len = end_location - prev_end_location
 82 |             input_ids = encodings.input_ids[:, begin_location:end_location]
 83 |             target_ids = input_ids.clone()
 84 |             target_ids[:, :-target_len] = -100
 85 | 
 86 |             # Forward pass the model to get logits
 87 |             with torch.no_grad():
 88 |                 try:
 89 |                     outputs = model(input_ids, labels=target_ids)
 90 |                     logits = outputs.logits
 91 |                 except Exception as e:  # pylint: disable=broad-except
 92 |                     printing.log_error(
 93 |                         f"Error during model forward pass execution: {e}"
 94 |                     )
 95 |                     raise  # logits/outputs would be undefined below otherwise
 96 |             # Compute loss manually for visualization
 97 |             shift_logits = logits[..., :-1, :].contiguous()
 98 |             shift_labels = target_ids[..., 1:].contiguous()
 99 |             effective_token_count = (target_ids != -100).sum().item()
100 |             negative_log_likelihoods.append(
101 |                 (outputs.loss.item(), effective_token_count)
102 |             )
103 | 
104 |             # Decode predicted and actual next words for the last token position
105 |             predictions = torch.argmax(shift_logits, dim=-1)
106 |             predicted_tokens = predictions[:, -1]
107 |             actual_tokens = shift_labels[:, -1]
108 | 
109 |             predicted_words = tokenizer.batch_decode(
110 |                 predicted_tokens, skip_special_tokens=True
111 |             )
112 |             actual_words = tokenizer.batch_decode(
113 |                 actual_tokens, skip_special_tokens=True
114 |             )
115 |             context = tokenizer.decode(input_ids[0, :])
116 | 
117 |             summary_data.append(
118 |                 {
119 |                     "Context": context[-stride:],
120 |                     "Predicted next word": predicted_words,
121 |                     "Actual next word": actual_words,
122 |                     "Loss for this window": outputs.loss.item(),
123 |                 }
124 |             )
125 |             prev_end_location = end_location
126 | 
127 |         # Total loss calculation considering the number of tokens for each segment
128 |         total_loss = sum(loss * count for loss, count in negative_log_likelihoods)
129 |         total_tokens = sum(count for _, count in negative_log_likelihoods)
130 | 
131 |         # Calculate average negative_log_likelihood and perplexity
132 |         average_negative_log_likelihood = total_loss / total_tokens
133 |         perplexity = torch.exp(torch.tensor(average_negative_log_likelihood))
134 | 
135 |         # Save accuracy results to stats file
136 |         state.save_stat("perplexity_score", float(perplexity.item()))
137 | 
138 |         # Save accuracy results to CSV file
139 |         summary_df = pd.DataFrame(summary_data)
140 |         summary_df.to_csv(
141 |             os.path.join(model_results_dir, "summary_results.csv"), index=False
142 |         )
143 |         return state
144 | 
145 | 
146 | # This file was originally licensed under Apache 2.0. It has been modified.
147 | # Modifications Copyright (c) 2025 AMD
148 | 
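
A minimal sketch (not part of the repository) of the aggregation that run() performs above: each sliding window contributes its loss weighted by its effective (non-masked) token count, and perplexity is the exponential of the weighted-average negative log-likelihood. The numbers below are illustrative only.

import torch

# (loss, effective_token_count) pairs, as collected in negative_log_likelihoods
windows = [(2.10, 1024), (1.95, 512), (2.30, 512)]

total_loss = sum(loss * count for loss, count in windows)        # 4326.4
total_tokens = sum(count for _, count in windows)                # 2048
perplexity = torch.exp(torch.tensor(total_loss / total_tokens))  # exp(2.1125)
print(round(perplexity.item(), 2))                               # ~8.27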


--------------------------------------------------------------------------------
/src/lemonade/tools/quark/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonade-sdk/lemonade/b86292bf84121d003d40804d68032904901ee1bb/src/lemonade/tools/quark/__init__.py


--------------------------------------------------------------------------------
/src/lemonade/tools/report/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonade-sdk/lemonade/b86292bf84121d003d40804d68032904901ee1bb/src/lemonade/tools/report/__init__.py


--------------------------------------------------------------------------------
/src/lemonade/tools/server/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonade-sdk/lemonade/b86292bf84121d003d40804d68032904901ee1bb/src/lemonade/tools/server/__init__.py


--------------------------------------------------------------------------------
/src/lemonade/tools/server/static/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonade-sdk/lemonade/b86292bf84121d003d40804d68032904901ee1bb/src/lemonade/tools/server/static/favicon.ico


--------------------------------------------------------------------------------
/src/lemonade/tools/server/tool_calls.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from typing import List, Dict, Pattern, Optional
  3 | import logging
  4 | import json
  5 | 
  6 | 
  7 | def extract_code_block(text: str) -> str:
  8 |     """
  9 |     Extracts the content inside triple backtick code blocks from a text.
 10 | 
 11 |     Args:
 12 |         text (str): The text to extract the code block from.
 13 | 
 14 |     Returns:
 15 |         str: The content of the first code block if any are found, otherwise the raw text.
 16 |     """
 17 |     # Regex pattern to match triple backtick code blocks (with optional language hint)
 18 |     pattern = re.compile(r"```(?:\w+)?\n(.*?)```", re.DOTALL)
 19 | 
 20 |     # Find all matches
 21 |     code_blocks = pattern.findall(text)
 22 | 
 23 |     # Return first match or raw text
 24 |     return code_blocks[0] if code_blocks else text
 25 | 
 26 | 
 27 | def standardize_tool_call(tool_call: dict) -> dict | None:
 28 |     """
 29 |     Standardizes the format of tool calls according to the format expected by OpenAI.
 30 | 
 31 |     Args:
 32 |         tool_call (dict): The tool call to validate.
 33 | 
 34 |     Returns:
 35 |         dict | None: Standardized tool call if valid, None otherwise.
 36 |     """
 37 |     # Ensure the tool call has a "name"
 38 |     standardized_tool_call = {}
 39 |     if "name" in tool_call:
 40 |         standardized_tool_call["name"] = tool_call["name"]
 41 |     else:
 42 |         logging.warning("Tool call does not have a 'name' field.")
 43 |         return None
 44 | 
 45 |     # Ensure the tool call has "arguments"
 46 |     if "arguments" in tool_call:
 47 |         standardized_tool_call["arguments"] = tool_call["arguments"]
 48 |     elif "parameters" in tool_call:
 49 |         standardized_tool_call["arguments"] = tool_call["parameters"]
 50 |     else:
 51 |         logging.warning("Tool call does not have an 'arguments' or 'parameters' field.")
 52 |         return None
 53 | 
 54 |     return standardized_tool_call
 55 | 
 56 | 
 57 | def get_tool_call_pattern(added_tokens_decoder: Dict) -> Optional[Pattern]:
 58 |     """
 59 |     Extracts tool call pattern from the added tokens decoder.
 60 |     """
 61 |     special_tokens = [v.content for v in added_tokens_decoder.values()]
 62 | 
 63 |     # Pattern 1: <tool_call>...</tool_call> block
 64 |     # Sample model that uses this pattern: Qwen3-8B
 65 |     if "<tool_call>" in special_tokens and "</tool_call>" in special_tokens:
 66 |         return re.compile(r"<tool_call>(.*?)</tool_call>", re.DOTALL)
 67 | 
 68 |     # Pattern 2: [TOOL_CALLS] [ {...} ] block
 69 |     # Sample model that uses this pattern: Mistral-7B-Instruct-v0.3
 70 |     elif "[TOOL_CALLS]" in special_tokens:
 71 |         return re.compile(r"\[TOOL_CALLS\]\s*\[(.*?)\](?=\s*<|/?eos|$)", re.DOTALL)
 72 | 
 73 |     else:
 74 |         logging.warning(
 75 |             "Tool calling identifiers were not found for the current model."
 76 |         )
 77 |         return None
 78 | 
 79 | 
 80 | def extract_tool_calls(
 81 |     text: str, tool_call_pattern: Optional[Pattern] = None
 82 | ) -> tuple[List[Dict], str]:
 83 |     """
 84 |     Extracts tool calls from generated text based on tool calling identifiers.
 85 | 
 86 |     Args:
 87 |         text (str): The text output generated by the model.
 88 |         tool_call_pattern (Optional[Pattern]): The pattern to use to extract tool calls.
 89 | 
 90 |     Returns:
 91 |         tuple[List[Dict], str]: A tuple containing:
 92 |             - List[Dict]: A list of extracted tool call objects (raw JSON-like dicts)
 93 |             - str: The original text with tool calls removed
 94 |     """
 95 | 
 96 |     matches = []
 97 |     if tool_call_pattern is not None:
 98 |         matches = list(tool_call_pattern.finditer(text))
 99 | 
100 |     # Some models don't use any tool calling identifiers.
101 |     # Instead, tool calls are identified by only generating JSON content.
102 |     # Sample model that uses this pattern: Llama-3.1-8B-Instruct
103 |     else:
104 |         try:
105 |             # Remove the json for a code block if needed
106 |             parsed_text = extract_code_block(text)
107 |             json_tool_calls = json.loads(parsed_text)
108 | 
109 |             if isinstance(json_tool_calls, dict):
110 |                 json_tool_calls = [json_tool_calls]
111 | 
112 |             extracted_tool_calls = []
113 |             for tool_call in json_tool_calls:
114 |                 # Keep valid calls; if any call is invalid, discard them all
115 |                 standard_tool_call = standardize_tool_call(tool_call)
116 |                 if standard_tool_call is not None:
117 |                     extracted_tool_calls.append(standard_tool_call)
118 |                 else:
119 |                     return [], text
120 | 
121 |             return extracted_tool_calls, ""
122 | 
123 |         except json.JSONDecodeError:
124 |             pass
125 | 
126 |     # Process matches in reverse to avoid position shifting
127 |     extracted_tool_calls = []
128 |     cleaned_text = text
129 |     for match in reversed(matches):
130 |         content = match.group(1).strip()
131 |         json_tool_call = None
132 |         try:
133 |             json_tool_call = json.loads(content)
134 |         except json.JSONDecodeError:
135 |             logging.warning("Could not parse tool call as JSON.")
136 |             continue
137 | 
138 |         # Attempt to standardize the tool call
139 |         standard_tool_call = standardize_tool_call(json_tool_call)
140 |         if standard_tool_call is None:
141 |             continue
142 | 
143 |         # If the content is a valid JSON object, add it to the list
144 |         extracted_tool_calls.append(standard_tool_call)
145 | 
146 |         # Remove the matched tool call from the text
147 |         cleaned_text = cleaned_text[: match.start()] + cleaned_text[match.end() :]
148 | 
149 |     return extracted_tool_calls, cleaned_text.strip()
150 | 
151 | 
152 | # This file was originally licensed under Apache 2.0. It has been modified.
153 | # Modifications Copyright (c) 2025 AMD
154 | 


--------------------------------------------------------------------------------
/src/lemonade/tools/server/utils/port.py:
--------------------------------------------------------------------------------
 1 | import socketserver
 2 | import sys
 3 | import logging
 4 | import importlib
 5 | import asyncio
 6 | from contextlib import asynccontextmanager
 7 | from fastapi import FastAPI
 8 | 
 9 | _lazy_imports = {
10 |     "TextIteratorStreamer": ("transformers", "TextIteratorStreamer"),
11 |     "StoppingCriteriaList": ("transformers", "StoppingCriteriaList"),
12 | }
13 | 
14 | 
15 | def find_free_port():
16 |     """
17 |     Scans for an unoccupied TCP port
18 | 
19 |     Returns the port number as an int on success
20 |     Returns None if no port can be found
21 |     """
22 | 
23 |     try:
24 |         with socketserver.TCPServer(("localhost", 0), None) as s:
25 |             return s.server_address[1]
26 |     # pylint: disable=broad-exception-caught
27 |     except Exception:
28 |         return None
29 | 
30 | 
31 | @asynccontextmanager
32 | async def lifespan(app: FastAPI):
33 |     # Only do minimal setup here so endpoints are available immediately
34 |     try:
35 |         if sys.stdout.encoding:
36 |             "🍋".encode(sys.stdout.encoding)
37 |         use_emojis = True
38 |     except (UnicodeEncodeError, AttributeError):
39 |         use_emojis = False
40 | 
41 |     if use_emojis:
42 |         logging.info(
43 |             "\n"
44 |             "\n"
45 |             "🍋  Lemonade Server Ready!\n"
46 |             f"🍋    Open http://localhost:{app.port} in your browser for:\n"
47 |             "🍋      💬 chat\n"
48 |             "🍋      💻 model management\n"
49 |             "🍋      📄 docs\n"
50 |         )
51 |     else:
52 |         logging.info(
53 |             "\n"
54 |             "\n"
55 |             "[Lemonade]  Lemonade Server Ready!\n"
56 |             f"[Lemonade]    Open http://localhost:{app.port} in your browser for:\n"
57 |             "[Lemonade]      chat\n"
58 |             "[Lemonade]      model management\n"
59 |             "[Lemonade]      docs\n"
60 |         )
61 | 
62 |     # Start lazy imports in the background, and set app.initialized = True
63 |     # when the imports are available
64 |     async def lazy_imports_bg():
65 |         for object_name, import_info in _lazy_imports.items():
66 |             module_name = import_info[0]
67 |             class_name = import_info[1]
68 |             module = importlib.import_module(module_name)
69 |             obj = getattr(module, class_name)
70 |             globals()[object_name] = obj
71 | 
72 |         app.initialized = True
73 | 
74 |     asyncio.create_task(lazy_imports_bg())
75 | 
76 |     yield
77 | 
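
A minimal sketch (not part of the repository) of how this module's pieces might be wired together: find_free_port() picks the port, and the lifespan handler above is attached to a FastAPI app that has app.port set before startup. The uvicorn entry point here is an assumption for illustration, not the server's actual launch path.

import uvicorn
from fastapi import FastAPI

from lemonade.tools.server.utils.port import find_free_port, lifespan

app = FastAPI(lifespan=lifespan)
app.port = find_free_port() or 8000  # lifespan() logs this URL at startup
app.initialized = False              # flipped to True once the lazy imports finish

if __name__ == "__main__":
    uvicorn.run(app, host="localhost", port=app.port)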


--------------------------------------------------------------------------------
/src/lemonade/tools/server/utils/thread.py:
--------------------------------------------------------------------------------
 1 | import threading
 2 | import logging
 3 | from lemonade.tools.server.serve import Server
 4 | 
 5 | 
 6 | class ServerRunner(threading.Thread):
 7 |     """
 8 |     Thread class for running the Lemonade Server with a loaded model.
 9 |     """
10 | 
11 |     def __init__(
12 |         self, model, tokenizer, checkpoint, recipe, host="localhost", port=8000
13 |     ):
14 |         threading.Thread.__init__(self)
15 |         self.model = model
16 |         self.tokenizer = tokenizer
17 |         self.checkpoint = checkpoint
18 |         self.recipe = recipe
19 |         self.host = host
20 |         self.port = port
21 |         self.server = None
22 |         self.ready_event = threading.Event()
23 |         self.shutdown_event = threading.Event()
24 |         self.uvicorn_server = None
25 | 
26 |     def run(self):
27 |         try:
28 |             # Create the server instance
29 |             self.server = Server()
30 | 
31 |             # Configure the server with model/tokenizer
32 |             self.server.model = self.model
33 |             self.server.tokenizer = self.tokenizer
34 |             self.server.llm_loaded = type(
35 |                 "obj",
36 |                 (object,),
37 |                 {
38 |                     "checkpoint": self.checkpoint,
39 |                     "recipe": self.recipe,
40 |                     "max_prompt_length": None,
41 |                     "reasoning": False,
42 |                     "model_name": "custom",
43 |                 },
44 |             )
45 | 
46 |             # Set up the server for threaded execution
47 |             self.uvicorn_server = self.server.run_in_thread(
48 |                 port=self.port, host=self.host, log_level="warning"
49 |             )
50 | 
51 |             # Set the ready event
52 |             self.ready_event.set()
53 | 
54 |             # Run the server until shutdown is requested
55 |             logging.info(f"Starting server on http://{self.host}:{self.port}")
56 |             self.uvicorn_server.run()
57 | 
58 |         except Exception as e:
59 |             logging.error(f"Error starting server: {e}")
60 |             self.ready_event.set()
61 |             raise
62 | 
63 |     def shutdown(self):
64 |         """Shutdown the server"""
65 |         if hasattr(self, "uvicorn_server") and self.uvicorn_server:
66 |             logging.info("Shutting down server...")
67 |             self.uvicorn_server.should_exit = True
68 |             self.shutdown_event.set()
69 | 
70 |         # Clean up resources properly to avoid memory leaks
71 |         if hasattr(self, "server") and self.server:
72 |             logging.info("Cleaning up model and tokenizer resources...")
73 | 
74 |             if hasattr(self.server, "model"):
75 |                 self.server.model = None
76 | 
77 |             if hasattr(self.server, "tokenizer"):
78 |                 self.server.tokenizer = None
79 | 
80 |             if hasattr(self.server, "llm_loaded"):
81 |                 self.server.llm_loaded = None
82 | 
83 |         # Clean up local references
84 |         if hasattr(self, "model"):
85 |             del self.model
86 |         if hasattr(self, "tokenizer"):
87 |             del self.tokenizer
88 | 
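
A minimal sketch (not part of the repository) of driving ServerRunner from a script or test. The checkpoint and recipe values are illustrative assumptions; any compatible model/tokenizer pair from an earlier load step would do.

from transformers import AutoModelForCausalLM, AutoTokenizer
from lemonade.tools.server.utils.thread import ServerRunner

# Illustrative checkpoint; any Hugging Face causal LM works here
checkpoint = "facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

runner = ServerRunner(model, tokenizer, checkpoint=checkpoint, recipe="hf-cpu", port=8123)
runner.start()                       # run() builds a Server and starts uvicorn
runner.ready_event.wait(timeout=60)  # set once the server thread has been configured

# ... issue OpenAI-compatible requests against http://localhost:8123 here ...

runner.shutdown()                    # asks uvicorn to exit and drops model references
runner.join()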


--------------------------------------------------------------------------------
/src/lemonade/tools/server/webapp.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | import json
 3 | from fastapi.responses import HTMLResponse
 4 | from lemonade_server.model_manager import ModelManager
 5 | 
 6 | 
 7 | def get_webapp_html(port=8000):
 8 |     """
 9 |     Show Lemonade Web App for LLM chat and model management.
10 |     """
11 |     # Load server models from JSON
12 |     server_models = ModelManager().supported_models
13 | 
14 |     # Use shared filter function from model_manager.py
15 |     filtered_models = ModelManager().filter_models_by_backend(server_models)
16 | 
17 |     # Pass filtered server_models to JS
18 |     server_models_js = (
19 |         f"<script>window.SERVER_MODELS = {json.dumps(filtered_models)};</script>"
20 |     )
21 | 
22 |     # Load HTML template
23 |     template_path = Path(__file__).parent / "static" / "webapp.html"
24 |     with open(template_path, "r", encoding="utf-8") as f:
25 |         html_template = f.read()
26 | 
27 |     # Replace template variables
28 |     html_content = html_template.replace("{{SERVER_PORT}}", str(port))
29 |     html_content = html_content.replace("{{SERVER_MODELS_JS}}", server_models_js)
30 | 
31 |     return HTMLResponse(content=html_content)
32 | 
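
A minimal sketch (not part of the repository) of how get_webapp_html() could be exposed as a FastAPI route; the actual server wires this up elsewhere, so the route path here is an assumption.

from fastapi import FastAPI

from lemonade.tools.server.webapp import get_webapp_html

app = FastAPI()

@app.get("/", include_in_schema=False)
def webapp():
    # get_webapp_html() already returns a FastAPI HTMLResponse
    return get_webapp_html(port=8000)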


--------------------------------------------------------------------------------
/src/lemonade/version.py:
--------------------------------------------------------------------------------
1 | __version__ = "8.0.5"
2 | 


--------------------------------------------------------------------------------
/src/lemonade_install/__init__.py:
--------------------------------------------------------------------------------
1 | from .install import main as installcli
2 | 


--------------------------------------------------------------------------------
/src/lemonade_server/pydantic_models.py:
--------------------------------------------------------------------------------
  1 | from typing import Optional, Union, List, Any
  2 | 
  3 | from pydantic import BaseModel
  4 | 
  5 | # Set to a high number to allow for interesting experiences in real apps
  6 | # Tests should use the max_new_tokens argument to set a lower value
  7 | DEFAULT_MAX_NEW_TOKENS = 1500
  8 | 
  9 | 
 10 | class LoadConfig(BaseModel):
 11 |     """
 12 |     Configuration for loading a language model.
 13 | 
 14 |     Specifies the model checkpoint, generation parameters,
 15 |     and hardware/framework configuration (recipe) for model loading.
 16 |     """
 17 | 
 18 |     model_name: str
 19 |     checkpoint: Optional[str] = None
 20 |     recipe: Optional[str] = None
 21 |     # Indicates the maximum prompt length allowed for that specific
 22 |     # checkpoint + recipe combination
 23 |     max_prompt_length: Optional[int] = None
 24 |     # Indicates whether the model is a reasoning model, like DeepSeek
 25 |     reasoning: Optional[bool] = False
 26 |     # Indicates which Multimodal Projector (mmproj) file to use
 27 |     mmproj: Optional[str] = None
 28 | 
 29 | 
 30 | class CompletionRequest(BaseModel):
 31 |     """
 32 |     Request model for text completion API endpoint.
 33 | 
 34 |     Contains a prompt, a model identifier, and a streaming
 35 |     flag to control response delivery.
 36 |     """
 37 | 
 38 |     prompt: str
 39 |     model: str
 40 |     echo: bool = False
 41 |     stream: bool = False
 42 |     logprobs: int | None = False
 43 |     stop: list[str] | str | None = None
 44 |     temperature: float | None = None
 45 |     max_tokens: int | None = None
 46 | 
 47 | 
 48 | class ChatCompletionRequest(BaseModel):
 49 |     """
 50 |     Request model for chat completion API endpoint.
 51 | 
 52 |     Contains a list of chat messages, a model identifier,
 53 |     and a streaming flag to control response delivery.
 54 |     """
 55 | 
 56 |     messages: list[dict]
 57 |     model: str
 58 |     stream: bool = False
 59 |     logprobs: int | None = False
 60 |     stop: list[str] | str | None = None
 61 |     temperature: float | None = None
 62 |     tools: list[dict] | None = None
 63 |     max_tokens: int | None = None
 64 |     max_completion_tokens: int | None = None
 65 |     response_format: dict | None = None
 66 | 
 67 | 
 68 | class EmbeddingsRequest(BaseModel):
 69 |     """
 70 |     Request model for embeddings API endpoint.
 71 | 
 72 |     Generates embeddings for the provided input text or tokens.
 73 |     """
 74 | 
 75 |     input: Union[str, List]
 76 |     model: Optional[str] = None
 77 |     encoding_format: Optional[str] = "float"  # "float" or "base64"
 78 | 
 79 | 
 80 | class RerankingRequest(BaseModel):
 81 |     """
 82 |     Request model for reranking API endpoint.
 83 | 
 84 |     Reranks a list of documents based on their relevance to a query.
 85 |     """
 86 | 
 87 |     query: str
 88 |     documents: List[str]
 89 |     model: str
 90 | 
 91 | 
 92 | class ResponsesRequest(BaseModel):
 93 |     """
 94 |     Request model for responses API endpoint.
 95 |     """
 96 | 
 97 |     input: list[dict] | str
 98 |     model: str
 99 |     max_output_tokens: int | None = None
100 |     temperature: float | None = None
101 |     stream: bool = False
102 | 
103 | 
104 | class PullConfig(LoadConfig):
105 |     """
106 |     Pull and load have the same fields.
107 |     """
108 | 
109 | 
110 | class DeleteConfig(BaseModel):
111 |     """
112 |     Configuration for deleting a supported LLM.
113 |     """
114 | 
115 |     model_name: str
116 | 
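
A minimal sketch (not part of the repository) of how these request models validate an incoming JSON payload; the model name and recipe string are illustrative assumptions.

from lemonade_server.pydantic_models import ChatCompletionRequest, LoadConfig

payload = {
    "model": "Llama-3.2-1B-Instruct-Hybrid",
    "messages": [{"role": "user", "content": "Hello!"}],
    "stream": True,
    "temperature": 0.7,
}
request = ChatCompletionRequest(**payload)  # raises pydantic.ValidationError on bad fields
print(request.max_tokens)                   # None: unspecified optional fields default cleanly

config = LoadConfig(model_name=request.model, recipe="oga-hybrid")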


--------------------------------------------------------------------------------
/test/quark_api.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import shutil
 3 | import os
 4 | from lemonade.state import State
 5 | import lemonade.common.test_helpers as common
 6 | from lemonade.tools.prompt import LLMPrompt
 7 | from lemonade.tools.huggingface.load import HuggingfaceLoad
 8 | from lemonade.tools.quark.quark_quantize import QuarkQuantize
 9 | from lemonade.tools.quark.quark_load import QuarkLoad
10 | 
11 | 
12 | class Testing(unittest.TestCase):
13 |     @classmethod
14 |     def setUpClass(cls):
15 |         # Load default args from QuarkQuantize parser
16 |         parser = QuarkQuantize.parser()
17 |         cls.default_args = vars(parser.parse_args([]))
18 | 
19 |     def setUp(self) -> None:
20 |         shutil.rmtree(cache_dir, ignore_errors=True)
21 | 
22 |     def test_001_quantize(self):
23 |         """
24 |         This test first quantizes the model, exports it to
25 |         target format and then reloads the quantized model
26 |         """
27 |         checkpoint = "facebook/opt-125m"
28 |         device = "cpu"
29 |         prompt = "What if?"
30 | 
31 |         state = State(cache_dir=cache_dir, build_name="test")
32 |         state = HuggingfaceLoad().run(state, input=checkpoint)
33 | 
34 |         quantize_args = {
35 |             "model_export": "quark_safetensors",
36 |             "quant_algo": "awq",
37 |             "quant_scheme": "w_uint4_per_group_asym",
38 |             "device": "cpu",
39 |             "skip_quantization": True,
40 |         }
41 |         # Combine specific quant args with defaults
42 |         quantize_args = {**self.default_args, **quantize_args}
43 |         state = QuarkQuantize().run(state, **quantize_args)
44 |         state = LLMPrompt().run(state, prompt=prompt, max_new_tokens=10)
45 | 
46 |         assert len(state.response) > 0, state.response
47 | 
48 | 
49 | if __name__ == "__main__":
50 |     cache_dir, _ = common.create_test_dir(
51 |         "lemonade_quark_api", base_dir=os.path.abspath(".")
52 |     )
53 |     unittest.main()
54 | 
55 | # This file was originally licensed under Apache 2.0. It has been modified.
56 | # Modifications Copyright (c) 2025 AMD
57 | 


--------------------------------------------------------------------------------