├── .dockerignore ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── dependabot.yml └── workflows │ ├── build-and-release.yaml │ ├── build-docker.yaml │ ├── build-wheels-cuda.yaml │ ├── build-wheels-metal.yaml │ ├── generate-index-from-release.yaml │ ├── publish-to-test.yaml │ ├── publish.yaml │ ├── test-pypi.yaml │ └── test.yaml ├── .gitignore ├── .gitmodules ├── .readthedocs.yaml ├── CHANGELOG.md ├── CMakeLists.txt ├── LICENSE.md ├── Makefile ├── README.md ├── docker ├── README.md ├── cuda_simple │ └── Dockerfile ├── open_llama │ ├── Dockerfile │ ├── build.sh │ ├── hug_model.py │ ├── start.sh │ └── start_server.sh ├── openblas_simple │ └── Dockerfile └── simple │ ├── Dockerfile │ └── run.sh ├── docs ├── api-reference.md ├── changelog.md ├── icon.svg ├── index.md ├── install │ └── macos.md ├── requirements.txt └── server.md ├── examples ├── batch-processing │ └── server.py ├── gradio_chat │ ├── local.py │ └── server.py ├── hf_pull │ └── main.py ├── high_level_api │ ├── fastapi_server.py │ ├── high_level_api_embedding.py │ ├── high_level_api_inference.py │ ├── high_level_api_infill.py │ ├── high_level_api_streaming.py │ └── langchain_custom_llm.py ├── low_level_api │ ├── Chat.py │ ├── Miku.py │ ├── ReasonAct.py │ ├── common.py │ ├── low_level_api_chat_cpp.py │ ├── low_level_api_llama_cpp.py │ ├── quantize.py │ ├── readme │ │ └── low_level_api_llama_cpp.md │ └── util.py ├── notebooks │ ├── Batching.ipynb │ ├── Clients.ipynb │ ├── Functions.ipynb │ ├── Guidance.ipynb │ ├── Multimodal.ipynb │ ├── OpenHermesFunctionCalling.ipynb │ └── PerformanceTuning.ipynb └── ray │ ├── README.md │ ├── llm.py │ └── requirements.txt ├── llama_cpp ├── __init__.py ├── _ctypes_extensions.py ├── _ggml.py ├── _internals.py ├── _logger.py ├── _utils.py ├── llama.py ├── llama_cache.py ├── llama_chat_format.py ├── llama_cpp.py ├── llama_grammar.py ├── llama_speculative.py ├── llama_tokenizer.py ├── llama_types.py ├── llava_cpp.py ├── py.typed └── server │ ├── __init__.py │ ├── __main__.py │ ├── app.py │ ├── cli.py │ ├── errors.py │ ├── model.py │ ├── settings.py │ └── types.py ├── mkdocs.yml ├── pyproject.toml ├── scripts ├── get-releases.sh └── releases-to-pep-503.sh └── tests ├── test_llama.py ├── test_llama_chat_format.py ├── test_llama_grammar.py └── test_llama_speculative.py /.dockerignore: -------------------------------------------------------------------------------- 1 | _skbuild/ 2 | 3 | .envrc 4 | 5 | models/ 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | cover/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | .pybuilder/ 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | # For a library or package, you might want to ignore these files since the code is 93 | # intended to run in multiple environments; otherwise, check them in: 94 | # .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 100 | # install all needed dependencies. 101 | #Pipfile.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/#use-with-ide 116 | .pdm.toml 117 | 118 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 119 | __pypackages__/ 120 | 121 | # Celery stuff 122 | celerybeat-schedule 123 | celerybeat.pid 124 | 125 | # SageMath parsed files 126 | *.sage.py 127 | 128 | # Environments 129 | .env 130 | .venv 131 | env/ 132 | venv/ 133 | ENV/ 134 | env.bak/ 135 | venv.bak/ 136 | 137 | # Spyder project settings 138 | .spyderproject 139 | .spyproject 140 | 141 | # Rope project settings 142 | .ropeproject 143 | 144 | # mkdocs documentation 145 | /site 146 | 147 | # mypy 148 | .mypy_cache/ 149 | .dmypy.json 150 | dmypy.json 151 | 152 | # Pyre type checker 153 | .pyre/ 154 | 155 | # pytype static type analyzer 156 | .pytype/ 157 | 158 | # Cython debug symbols 159 | cython_debug/ 160 | 161 | # PyCharm 162 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 163 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 164 | # and can be added to the global gitignore or merged into this file. For a more nuclear 165 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
166 | .idea/ 167 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # Prerequisites 11 | 12 | Please answer the following questions for yourself before submitting an issue. 13 | 14 | - [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now. 15 | - [ ] I carefully followed the [README.md](https://github.com/abetlen/llama-cpp-python/blob/main/README.md). 16 | - [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed). 17 | - [ ] I reviewed the [Discussions](https://github.com/abetlen/llama-cpp-python/discussions), and have a new bug or useful enhancement to share. 18 | 19 | # Expected Behavior 20 | 21 | Please provide a detailed written description of what you were trying to do, and what you expected `llama-cpp-python` to do. 22 | 23 | # Current Behavior 24 | 25 | Please provide a detailed written description of what `llama-cpp-python` did, instead. 26 | 27 | # Environment and Context 28 | 29 | Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions. 30 | 31 | * Physical (or virtual) hardware you are using, e.g. for Linux: 32 | 33 | `$ lscpu` 34 | 35 | * Operating System, e.g. for Linux: 36 | 37 | `$ uname -a` 38 | 39 | * SDK version, e.g. for Linux: 40 | 41 | ``` 42 | $ python3 --version 43 | $ make --version 44 | $ g++ --version 45 | ``` 46 | 47 | # Failure Information (for bugs) 48 | 49 | Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template. 50 | 51 | # Steps to Reproduce 52 | 53 | Please provide detailed steps for reproducing the issue. We are not sitting in front of your screen, so the more detail the better. 54 | 55 | 1. step 1 56 | 2. step 2 57 | 3. step 3 58 | 4. etc. 59 | 60 | **Note: Many issues seem to be regarding functional or performance issues / differences with `llama.cpp`. In these cases we need to confirm that you're comparing against the version of `llama.cpp` that was built with your python package, and which parameters you're passing to the context.** 61 | 62 | Try the following: 63 | 64 | 1. `git clone https://github.com/abetlen/llama-cpp-python` 65 | 2. `cd llama-cpp-python` 66 | 3. `rm -rf _skbuild/` # delete any old builds 67 | 4. `python -m pip install .` 68 | 5. `cd ./vendor/llama.cpp` 69 | 6. Follow [llama.cpp's instructions](https://github.com/ggerganov/llama.cpp#build) to `cmake` llama.cpp 70 | 7. Run llama.cpp's `./main` with the same arguments you previously passed to llama-cpp-python and see if you can reproduce the issue. If you can, [log an issue with llama.cpp](https://github.com/ggerganov/llama.cpp/issues) 71 | 72 | # Failure Logs 73 | 74 | Please include any relevant log snippets or files. If it works under one configuration but not under another, please provide logs for both configurations and their corresponding outputs so it is easy to see where behavior changes. 75 | 76 | Also, please try to **avoid using screenshots** if at all possible. 
Instead, copy/paste the console output and use [Github's markdown](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) to cleanly format your logs for easy readability. 77 | 78 | Example environment info: 79 | ``` 80 | llama-cpp-python$ git log | head -1 81 | commit 47b0aa6e957b93dbe2c29d53af16fbae2dd628f2 82 | 83 | llama-cpp-python$ python3 --version 84 | Python 3.10.10 85 | 86 | llama-cpp-python$ pip list | egrep "uvicorn|fastapi|sse-starlette|numpy" 87 | fastapi 0.95.0 88 | numpy 1.24.3 89 | sse-starlette 1.3.3 90 | uvicorn 0.21.1 91 | 92 | llama-cpp-python/vendor/llama.cpp$ git log | head -3 93 | commit 66874d4fbcc7866377246efbcee938e8cc9c7d76 94 | Author: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> 95 | Date: Thu May 25 20:18:01 2023 -0600 96 | ``` 97 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "daily" 12 | - package-ecosystem: "github-actions" 13 | directory: "/" 14 | schedule: 15 | interval: "daily" 16 | - package-ecosystem: "docker" 17 | directory: "/" 18 | schedule: 19 | interval: "daily" 20 | -------------------------------------------------------------------------------- /.github/workflows/build-and-release.yaml: -------------------------------------------------------------------------------- 1 | name: Build Release 2 | 3 | on: workflow_dispatch 4 | 5 | permissions: 6 | contents: write 7 | 8 | jobs: 9 | build_wheels: 10 | name: Build wheels on ${{ matrix.os }} 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: [ubuntu-20.04, windows-2019, macos-13] 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | with: 19 | submodules: "recursive" 20 | 21 | # Used to host cibuildwheel 22 | - uses: actions/setup-python@v5 23 | with: 24 | python-version: "3.9" 25 | 26 | - name: Install dependencies (Linux/MacOS) 27 | if: runner.os != 'Windows' 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install uv 31 | RUST_LOG=trace python -m uv pip install -e .[all] --verbose 32 | shell: bash 33 | 34 | - name: Install dependencies (Windows) 35 | if: runner.os == 'Windows' 36 | env: 37 | RUST_LOG: trace 38 | run: | 39 | python -m pip install --upgrade pip 40 | python -m pip install uv 41 | python -m uv pip install -e .[all] --verbose 42 | shell: cmd 43 | 44 | - name: Build wheels 45 | uses: pypa/cibuildwheel@v2.22.0 46 | env: 47 | # disable repair 48 | CIBW_REPAIR_WHEEL_COMMAND: "" 49 | with: 50 | package-dir: . 
51 | output-dir: wheelhouse 52 | 53 | - uses: actions/upload-artifact@v4 54 | with: 55 | name: wheels-${{ matrix.os }} 56 | path: ./wheelhouse/*.whl 57 | 58 | build_wheels_arm64: 59 | name: Build arm64 wheels 60 | runs-on: ubuntu-latest 61 | steps: 62 | - uses: actions/checkout@v4 63 | with: 64 | submodules: "recursive" 65 | 66 | - name: Set up QEMU 67 | uses: docker/setup-qemu-action@v3 68 | with: 69 | platforms: linux/arm64 70 | 71 | - name: Build wheels 72 | uses: pypa/cibuildwheel@v2.22.0 73 | env: 74 | CIBW_SKIP: "*musllinux* pp*" 75 | CIBW_REPAIR_WHEEL_COMMAND: "" 76 | CIBW_ARCHS: "aarch64" 77 | CIBW_BUILD: "cp38-* cp39-* cp310-* cp311-* cp312-*" 78 | with: 79 | output-dir: wheelhouse 80 | 81 | - name: Upload wheels as artifacts 82 | uses: actions/upload-artifact@v4 83 | with: 84 | name: wheels_arm64 85 | path: ./wheelhouse/*.whl 86 | 87 | build_sdist: 88 | name: Build source distribution 89 | runs-on: ubuntu-latest 90 | 91 | steps: 92 | - uses: actions/checkout@v4 93 | with: 94 | submodules: "recursive" 95 | 96 | - uses: actions/setup-python@v5 97 | with: 98 | python-version: "3.9" 99 | 100 | - name: Install dependencies (Linux/MacOS) 101 | if: runner.os != 'Windows' 102 | run: | 103 | python -m pip install --upgrade pip 104 | python -m pip install uv 105 | RUST_LOG=trace python -m uv pip install -e .[all] --verbose 106 | python -m uv pip install build 107 | shell: bash 108 | 109 | - name: Install dependencies (Windows) 110 | if: runner.os == 'Windows' 111 | env: 112 | RUST_LOG: trace 113 | run: | 114 | python -m pip install --upgrade pip 115 | python -m pip install uv 116 | python -m uv pip install -e .[all] --verbose 117 | python -m uv pip install build 118 | shell: cmd 119 | 120 | - name: Build source distribution 121 | run: | 122 | python -m build --sdist 123 | 124 | - uses: actions/upload-artifact@v4 125 | with: 126 | name: sdist 127 | path: ./dist/*.tar.gz 128 | 129 | release: 130 | name: Release 131 | needs: [build_wheels, build_wheels_arm64, build_sdist] 132 | runs-on: ubuntu-latest 133 | 134 | steps: 135 | - uses: actions/download-artifact@v4 136 | with: 137 | merge-multiple: true 138 | path: dist 139 | 140 | - uses: softprops/action-gh-release@v2 141 | with: 142 | files: dist/* 143 | env: 144 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 145 | -------------------------------------------------------------------------------- /.github/workflows/build-docker.yaml: -------------------------------------------------------------------------------- 1 | name: Build Docker 2 | 3 | on: workflow_dispatch 4 | 5 | permissions: 6 | contents: write 7 | packages: write 8 | 9 | jobs: 10 | docker: 11 | name: Build and push Docker image 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout 15 | uses: actions/checkout@v4 16 | with: 17 | submodules: "recursive" 18 | 19 | - name: Set up QEMU 20 | uses: docker/setup-qemu-action@v3 21 | 22 | - name: Set up Docker Buildx 23 | uses: docker/setup-buildx-action@v3 24 | 25 | - name: Login to GitHub Container Registry 26 | uses: docker/login-action@v3 27 | with: 28 | registry: ghcr.io 29 | username: ${{ github.repository_owner }} 30 | password: ${{ secrets.GITHUB_TOKEN }} 31 | 32 | - name: Build and push 33 | id: docker_build 34 | uses: docker/build-push-action@v6 35 | with: 36 | context: . 
37 | file: "docker/simple/Dockerfile" 38 | push: ${{ startsWith(github.ref, 'refs/tags/') }} 39 | pull: true 40 | platforms: linux/amd64,linux/arm64 41 | tags: | 42 | ghcr.io/abetlen/llama-cpp-python:latest 43 | ghcr.io/abetlen/llama-cpp-python:${{ github.ref_name }} 44 | build-args: | 45 | BUILDKIT_INLINE_CACHE=1 46 | 47 | - name: Publish to GitHub Tag 48 | if: steps.docker_build.outputs.digest && startsWith(github.ref, 'refs/tags/') 49 | run: | 50 | echo "Docker image published for tag: ${{ github.ref_name }}" 51 | -------------------------------------------------------------------------------- /.github/workflows/build-wheels-cuda.yaml: -------------------------------------------------------------------------------- 1 | name: Build Wheels (CUDA) 2 | 3 | on: workflow_dispatch 4 | 5 | permissions: 6 | contents: write 7 | 8 | jobs: 9 | define_matrix: 10 | name: Define Build Matrix 11 | runs-on: ubuntu-latest 12 | outputs: 13 | matrix: ${{ steps.set-matrix.outputs.matrix }} 14 | defaults: 15 | run: 16 | shell: pwsh 17 | 18 | steps: 19 | - name: Define Job Output 20 | id: set-matrix 21 | run: | 22 | $matrix = @{ 23 | 'os' = @('ubuntu-latest', 'windows-2019') 24 | 'pyver' = @("3.9", "3.10", "3.11", "3.12") 25 | 'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1") #, "12.5.1", "12.6.1") 26 | 'releasetag' = @("basic") 27 | } 28 | 29 | $matrixOut = ConvertTo-Json $matrix -Compress 30 | Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT 31 | 32 | build_wheels: 33 | name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} 34 | needs: define_matrix 35 | runs-on: ${{ matrix.os }} 36 | strategy: 37 | matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }} 38 | defaults: 39 | run: 40 | shell: pwsh 41 | env: 42 | CUDAVER: ${{ matrix.cuda }} 43 | AVXVER: ${{ matrix.releasetag }} 44 | 45 | steps: 46 | - name: Add MSBuild to PATH 47 | if: runner.os == 'Windows' 48 | uses: microsoft/setup-msbuild@v2 49 | with: 50 | vs-version: '[16.11,16.12)' 51 | 52 | - uses: actions/checkout@v4 53 | with: 54 | submodules: "recursive" 55 | 56 | - uses: actions/setup-python@v5 57 | with: 58 | python-version: ${{ matrix.pyver }} 59 | cache: 'pip' 60 | 61 | - name: Setup Mamba 62 | uses: conda-incubator/setup-miniconda@v3.1.0 63 | with: 64 | activate-environment: "llamacpp" 65 | python-version: ${{ matrix.pyver }} 66 | miniforge-version: latest 67 | add-pip-as-python-dependency: true 68 | auto-activate-base: false 69 | 70 | - name: VS Integration Cache 71 | id: vs-integration-cache 72 | if: runner.os == 'Windows' 73 | uses: actions/cache@v4 74 | with: 75 | path: ./MSBuildExtensions 76 | key: cuda-${{ matrix.cuda }}-vs-integration 77 | 78 | - name: Get Visual Studio Integration 79 | if: runner.os == 'Windows' && steps.vs-integration-cache.outputs.cache-hit != 'true' 80 | run: | 81 | if ($env:CUDAVER -eq '12.1.1') {$x = '12.1.0'} else {$x = $env:CUDAVER} 82 | $links = (Invoke-RestMethod 'https://raw.githubusercontent.com/Jimver/cuda-toolkit/master/src/links/windows-links.ts').Trim().split().where({$_ -ne ''}) 83 | for ($i=$q=0;$i -lt $links.count -and $q -lt 2;$i++) {if ($links[$i] -eq "'$x',") {$q++}} 84 | Invoke-RestMethod $links[$i].Trim("'") -OutFile 'cudainstaller.zip' 85 | & 'C:\Program Files\7-Zip\7z.exe' e cudainstaller.zip -oMSBuildExtensions -r *\MSBuildExtensions\* > $null 86 | Remove-Item 'cudainstaller.zip' 87 | 88 | - name: Install Visual Studio Integration 89 | if: runner.os == 'Windows' 90 | run: | 91 | $y = (gi 
'.\MSBuildExtensions').fullname + '\*' 92 | (gi 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\MSBuild\Microsoft\VC\*\BuildCustomizations').fullname.foreach({cp $y $_}) 93 | $cupath = 'CUDA_PATH_V' + $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','_') 94 | echo "$cupath=$env:CONDA_PREFIX" >> $env:GITHUB_ENV 95 | 96 | - name: Install Dependencies 97 | env: 98 | MAMBA_DOWNLOAD_FAILFAST: "0" 99 | MAMBA_NO_LOW_SPEED_LIMIT: "1" 100 | run: | 101 | $cudaVersion = $env:CUDAVER 102 | mamba install -y 'cuda' -c nvidia/label/cuda-$cudaVersion 103 | python -m pip install build wheel 104 | 105 | - name: Build Wheel 106 | run: | 107 | $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') 108 | $env:CUDA_PATH = $env:CONDA_PREFIX 109 | $env:CUDA_HOME = $env:CONDA_PREFIX 110 | $env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX 111 | if ($IsLinux) { 112 | $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH 113 | } 114 | $env:VERBOSE = '1' 115 | $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all' 116 | $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON $env:CMAKE_ARGS" 117 | # if ($env:AVXVER -eq 'AVX') { 118 | $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' 119 | # } 120 | # if ($env:AVXVER -eq 'AVX512') { 121 | # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' 122 | # } 123 | # if ($env:AVXVER -eq 'basic') { 124 | # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' 125 | # } 126 | python -m build --wheel 127 | # write the build tag to the output 128 | Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV 129 | 130 | - uses: softprops/action-gh-release@v2 131 | with: 132 | files: dist/* 133 | # Set tag_name to -cu 134 | tag_name: ${{ github.ref_name }}-cu${{ env.CUDA_VERSION }} 135 | env: 136 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 137 | -------------------------------------------------------------------------------- /.github/workflows/build-wheels-metal.yaml: -------------------------------------------------------------------------------- 1 | name: Build Wheels (Metal) 2 | 3 | on: workflow_dispatch 4 | 5 | permissions: 6 | contents: write 7 | 8 | jobs: 9 | build_wheels: 10 | name: Build wheels on ${{ matrix.os }} 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: [macos-13, macos-14, macos-15] 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | with: 19 | submodules: "recursive" 20 | 21 | # Used to host cibuildwheel 22 | - uses: actions/setup-python@v5 23 | with: 24 | python-version: "3.12" 25 | cache: 'pip' 26 | 27 | - name: Install dependencies (Linux/MacOS) 28 | if: runner.os != 'Windows' 29 | run: | 30 | python -m pip install --upgrade pip 31 | python -m pip install uv 32 | RUST_LOG=trace python -m uv pip install -e .[all] --verbose 33 | shell: bash 34 | 35 | - name: Install dependencies (Windows) 36 | if: runner.os == 'Windows' 37 | env: 38 | RUST_LOG: trace 39 | run: | 40 | python -m pip install --upgrade pip 41 | python -m pip install uv 42 | python -m uv pip install -e .[all] --verbose 43 | shell: cmd 44 | 45 | - name: Build wheels 46 | uses: pypa/cibuildwheel@v2.22.0 47 | env: 48 | # disable repair 49 | CIBW_REPAIR_WHEEL_COMMAND: "" 50 | CIBW_ARCHS: "arm64" 51 | CIBW_ENVIRONMENT: CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DGGML_METAL=on" 52 | CIBW_BUILD: "cp39-* cp310-* cp311-* cp312-*" 53 | with: 54 | package-dir: . 
55 | output-dir: wheelhouse2 56 | 57 | - uses: actions/upload-artifact@v4 58 | with: 59 | name: wheels-mac_${{ matrix.os }} 60 | path: ./wheelhouse2/*.whl 61 | 62 | release: 63 | name: Release 64 | needs: [build_wheels] 65 | runs-on: ubuntu-latest 66 | 67 | steps: 68 | - uses: actions/download-artifact@v4 69 | with: 70 | merge-multiple: true 71 | path: dist2 72 | 73 | - uses: softprops/action-gh-release@v2 74 | with: 75 | files: dist2/* 76 | # set release name to -metal 77 | tag_name: ${{ github.ref_name }}-metal 78 | env: 79 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 80 | -------------------------------------------------------------------------------- /.github/workflows/generate-index-from-release.yaml: -------------------------------------------------------------------------------- 1 | name: Wheels Index 2 | 3 | on: 4 | # Trigger on new release 5 | workflow_run: 6 | workflows: ["Release", "Build Wheels (CUDA)", "Build Wheels (Metal)"] 7 | types: 8 | - completed 9 | 10 | # Allows you to run this workflow manually from the Actions tab 11 | workflow_dispatch: 12 | 13 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 14 | permissions: 15 | contents: read 16 | pages: write 17 | id-token: write 18 | 19 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 20 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 21 | concurrency: 22 | group: "pages" 23 | cancel-in-progress: false 24 | 25 | jobs: 26 | # Single deploy job since we're just deploying 27 | deploy: 28 | environment: 29 | name: github-pages 30 | url: ${{ steps.deployment.outputs.page_url }} 31 | runs-on: ubuntu-latest 32 | steps: 33 | - name: Checkout 34 | uses: actions/checkout@v4 35 | - name: Setup Pages 36 | uses: actions/configure-pages@v5 37 | - name: Build 38 | env: 39 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 40 | run: | 41 | ./scripts/get-releases.sh 42 | ./scripts/releases-to-pep-503.sh index/whl/cpu '^[v]?[0-9]+\.[0-9]+\.[0-9]+$' 43 | ./scripts/releases-to-pep-503.sh index/whl/cu121 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu121$' 44 | ./scripts/releases-to-pep-503.sh index/whl/cu122 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu122$' 45 | ./scripts/releases-to-pep-503.sh index/whl/cu123 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu123$' 46 | ./scripts/releases-to-pep-503.sh index/whl/cu124 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$' 47 | # ./scripts/releases-to-pep-503.sh index/whl/cu125 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$' 48 | # ./scripts/releases-to-pep-503.sh index/whl/cu126 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$' 49 | ./scripts/releases-to-pep-503.sh index/whl/metal '^[v]?[0-9]+\.[0-9]+\.[0-9]+-metal$' 50 | - name: Upload artifact 51 | uses: actions/upload-pages-artifact@v3 52 | with: 53 | # Upload entire repository 54 | path: 'index' 55 | - name: Deploy to GitHub Pages 56 | id: deployment 57 | uses: actions/deploy-pages@v4 58 | -------------------------------------------------------------------------------- /.github/workflows/publish-to-test.yaml: -------------------------------------------------------------------------------- 1 | # Based on: https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ 2 | 3 | name: Publish to TestPyPI 4 | 5 | on: 6 | workflow_dispatch: 7 | inputs: 8 | dev_version: 9 | description: 'Dev version N' 10 | required: true 11 | 12 | 13 | jobs: 14 | build-n-publish: 15 | name: Build and publish 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | 
- uses: actions/checkout@v4 20 | with: 21 | submodules: "recursive" 22 | 23 | - name: Set up Python 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: "3.11" 27 | cache: 'pip' 28 | 29 | - name: Append Dev Version to __version__ 30 | run: | 31 | DEV_VERSION=${{ github.event.inputs.dev_version }} 32 | CURRENT_VERSION=$(awk -F= '/__version__ =/ {print $2}' llama_cpp/__init__.py | tr -d ' "') 33 | NEW_VERSION="${CURRENT_VERSION}.dev${DEV_VERSION}" 34 | sed -i 's/__version__ = \".*\"/__version__ = \"'"${NEW_VERSION}"'\"/' llama_cpp/__init__.py 35 | 36 | - name: Install dependencies (Linux/MacOS) 37 | if: runner.os != 'Windows' 38 | run: | 39 | python -m pip install --upgrade pip 40 | python -m pip install uv 41 | RUST_LOG=trace python -m uv pip install -e .[all] --verbose 42 | shell: bash 43 | 44 | - name: Install dependencies (Windows) 45 | if: runner.os == 'Windows' 46 | env: 47 | RUST_LOG: trace 48 | run: | 49 | python -m pip install --upgrade pip 50 | python -m pip install uv 51 | python -m uv pip install -e .[all] --verbose 52 | shell: cmd 53 | 54 | - name: Build source distribution 55 | run: | 56 | python -m build --sdist 57 | 58 | - name: Publish to Test PyPI 59 | uses: pypa/gh-action-pypi-publish@release/v1 60 | with: 61 | password: ${{ secrets.TEST_PYPI_API_TOKEN }} 62 | repository-url: https://test.pypi.org/legacy/ 63 | -------------------------------------------------------------------------------- /.github/workflows/publish.yaml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | 3 | # Based on: https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ 4 | 5 | on: workflow_dispatch 6 | 7 | jobs: 8 | build-n-publish: 9 | name: Build and publish 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | with: 15 | submodules: "recursive" 16 | 17 | - name: Set up Python 18 | uses: actions/setup-python@v5 19 | with: 20 | python-version: "3.9" 21 | 22 | - name: Install dependencies (Linux/MacOS) 23 | if: runner.os != 'Windows' 24 | run: | 25 | python -m pip install --upgrade pip 26 | python -m pip install uv 27 | RUST_LOG=trace python -m uv pip install -e .[all] --verbose 28 | python -m uv pip install build 29 | shell: bash 30 | 31 | - name: Install dependencies (Windows) 32 | if: runner.os == 'Windows' 33 | env: 34 | RUST_LOG: trace 35 | run: | 36 | python -m pip install --upgrade pip 37 | python -m pip install uv 38 | python -m uv pip install -e .[all] --verbose 39 | python -m uv pip install build 40 | shell: cmd 41 | 42 | - name: Build source distribution 43 | run: | 44 | python -m build --sdist 45 | 46 | - name: Publish distribution to PyPI 47 | # TODO: move to tag based releases 48 | # if: startsWith(github.ref, 'refs/tags') 49 | uses: pypa/gh-action-pypi-publish@release/v1 50 | with: 51 | password: ${{ secrets.PYPI_API_TOKEN }} 52 | -------------------------------------------------------------------------------- /.github/workflows/test-pypi.yaml: -------------------------------------------------------------------------------- 1 | name: Tests for PyPI package 2 | 3 | on: workflow_dispatch 4 | 5 | jobs: 6 | build-linux: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: ["3.9", "3.10", "3.11", "3.12"] 12 | 13 | steps: 14 | - name: Set up Python ${{ matrix.python-version }} 15 | uses: actions/setup-python@v5 16 | with: 17 | python-version: ${{ matrix.python-version }} 18 | cache: 'pip' 19 | 20 
| - name: Install dependencies (Linux/MacOS) 21 | if: runner.os != 'Windows' 22 | run: | 23 | python -m pip install --upgrade pip 24 | python -m pip install uv 25 | RUST_LOG=trace python -m uv pip install llama-cpp-python[all] --verbose 26 | shell: bash 27 | 28 | - name: Install dependencies (Windows) 29 | if: runner.os == 'Windows' 30 | env: 31 | RUST_LOG: trace 32 | run: | 33 | python -m pip install --upgrade pip 34 | python -m pip install uv 35 | python -m uv pip install llama-cpp-python[all] --verbose 36 | shell: cmd 37 | 38 | - name: Test with pytest 39 | run: | 40 | python -c "import llama_cpp" 41 | 42 | build-windows: 43 | 44 | runs-on: windows-latest 45 | strategy: 46 | matrix: 47 | python-version: ["3.9", "3.10", "3.11", "3.12"] 48 | 49 | steps: 50 | - name: Set up Python ${{ matrix.python-version }} 51 | uses: actions/setup-python@v5 52 | with: 53 | python-version: ${{ matrix.python-version }} 54 | cache: 'pip' 55 | 56 | - name: Install dependencies (Linux/MacOS) 57 | if: runner.os != 'Windows' 58 | run: | 59 | python -m pip install --upgrade pip 60 | python -m pip install uv 61 | RUST_LOG=trace python -m uv pip install llama-cpp-python[all] --verbose 62 | shell: bash 63 | 64 | - name: Install dependencies (Windows) 65 | if: runner.os == 'Windows' 66 | env: 67 | RUST_LOG: trace 68 | run: | 69 | python -m pip install --upgrade pip 70 | python -m pip install uv 71 | python -m uv pip install llama-cpp-python[all] --verbose 72 | shell: cmd 73 | 74 | - name: Test with pytest 75 | run: | 76 | python -c "import llama_cpp" 77 | 78 | build-macos: 79 | 80 | runs-on: macos-latest 81 | strategy: 82 | matrix: 83 | python-version: ["3.9", "3.10", "3.11", "3.12"] 84 | 85 | steps: 86 | - name: Set up Python ${{ matrix.python-version }} 87 | uses: actions/setup-python@v5 88 | with: 89 | python-version: ${{ matrix.python-version }} 90 | cache: 'pip' 91 | 92 | - name: Install dependencies (Linux/MacOS) 93 | if: runner.os != 'Windows' 94 | run: | 95 | python -m pip install --upgrade pip 96 | python -m pip install uv 97 | RUST_LOG=trace python -m uv pip install llama-cpp-python[all] --verbose 98 | shell: bash 99 | 100 | - name: Install dependencies (Windows) 101 | if: runner.os == 'Windows' 102 | env: 103 | RUST_LOG: trace 104 | run: | 105 | python -m pip install --upgrade pip 106 | python -m pip install uv 107 | python -m uv pip install llama-cpp-python[all] --verbose 108 | shell: cmd 109 | 110 | - name: Test with pytest 111 | run: | 112 | python -c "import llama_cpp" 113 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | on: 3 | pull_request: 4 | branches: 5 | - main 6 | push: 7 | branches: 8 | - main 9 | 10 | env: 11 | REPO_ID: Qwen/Qwen2-0.5B-Instruct-GGUF 12 | MODEL_FILE: qwen2-0_5b-instruct-q8_0.gguf 13 | 14 | jobs: 15 | download-model: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Set up Python 19 | uses: actions/setup-python@v5 20 | with: 21 | python-version: "3.9" 22 | - name: Install huggingface-hub 23 | run: pip install huggingface-hub 24 | - name: Download model 25 | run: huggingface-cli download ${{ env.REPO_ID }} ${{ env.MODEL_FILE }} 26 | - name: Cache model 27 | uses: actions/cache@v4 28 | with: 29 | path: ~/.cache/huggingface/hub 30 | key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }} 31 | 32 | build-linux: 33 | needs: download-model 34 | runs-on: ubuntu-latest 35 | strategy: 36 | matrix: 37 | 
python-version: ["3.9", "3.10", "3.11", "3.12"] 38 | steps: 39 | - uses: actions/checkout@v4 40 | with: 41 | submodules: "recursive" 42 | 43 | - name: Set up Python ${{ matrix.python-version }} 44 | uses: actions/setup-python@v5 45 | with: 46 | python-version: ${{ matrix.python-version }} 47 | cache: 'pip' 48 | - name: Restore model cache 49 | uses: actions/cache@v4 50 | with: 51 | path: ~/.cache/huggingface/hub 52 | key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }} 53 | - name: Install dependencies (Linux/MacOS) 54 | run: | 55 | python -m pip install --upgrade pip 56 | python -m pip install uv 57 | python -m uv pip install -e .[all] --verbose 58 | shell: bash 59 | - name: Test with pytest 60 | run: | 61 | python -m pytest 62 | 63 | build-windows: 64 | needs: download-model 65 | runs-on: windows-latest 66 | strategy: 67 | matrix: 68 | python-version: ["3.9", "3.10", "3.11", "3.12"] 69 | steps: 70 | - uses: actions/checkout@v4 71 | with: 72 | submodules: "recursive" 73 | 74 | - name: Set up Python ${{ matrix.python-version }} 75 | uses: actions/setup-python@v5 76 | with: 77 | python-version: ${{ matrix.python-version }} 78 | cache: 'pip' 79 | 80 | - name: Restore model cache 81 | uses: actions/cache@v4 82 | with: 83 | path: ~/.cache/huggingface/hub 84 | key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }} 85 | 86 | - name: Install dependencies (Windows) 87 | run: | 88 | python -m pip install --upgrade pip 89 | python -m pip install uv 90 | python -m uv pip install -e .[all] --verbose 91 | shell: cmd 92 | 93 | - name: Test with pytest 94 | run: | 95 | python -m pytest 96 | 97 | build-macos: 98 | needs: download-model 99 | runs-on: macos-13 100 | strategy: 101 | matrix: 102 | python-version: ["3.9", "3.10", "3.11", "3.12"] 103 | steps: 104 | - uses: actions/checkout@v4 105 | with: 106 | submodules: "recursive" 107 | 108 | - name: Set up Python ${{ matrix.python-version }} 109 | uses: actions/setup-python@v5 110 | with: 111 | python-version: ${{ matrix.python-version }} 112 | cache: 'pip' 113 | 114 | - name: System Info 115 | run: | 116 | uname -a 117 | sysctl -n machdep.cpu.brand_string 118 | python3 -c "import platform; print(platform.machine(), platform.architecture())" 119 | 120 | - name: Restore model cache 121 | uses: actions/cache@v4 122 | with: 123 | path: ~/.cache/huggingface/hub 124 | key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }} 125 | 126 | - name: Install dependencies (Linux/MacOS) 127 | run: | 128 | python3 -m pip install --upgrade pip 129 | python3 -m pip install uv 130 | python3 -m uv pip install -e .[all] --verbose 131 | CMAKE_ARGS="-DLLAMA_METAL=off" python3 -m uv pip install .[all] --verbose 132 | shell: bash 133 | 134 | - name: Test with pytest 135 | run: | 136 | python3 -m pytest 137 | 138 | build-macos-metal: 139 | needs: download-model 140 | runs-on: macos-13 141 | steps: 142 | - uses: actions/checkout@v4 143 | with: 144 | submodules: "recursive" 145 | 146 | - name: Set up Python 3.9 147 | uses: actions/setup-python@v5 148 | with: 149 | python-version: "3.9" 150 | 151 | - name: System Info 152 | run: | 153 | uname -a 154 | sysctl -n machdep.cpu.brand_string 155 | python3 -c "import platform; print(platform.machine(), platform.architecture())" 156 | 157 | - name: Restore model cache 158 | uses: actions/cache@v4 159 | with: 160 | path: ~/.cache/huggingface/hub 161 | key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }} 162 | 163 | - name: Install dependencies 164 | run: | 165 | python3 -m pip install 
--upgrade pip 166 | CMAKE_ARGS="-DLLAMA_METAL=on" python3 -m pip install .[all] --verbose 167 | shell: bash 168 | 169 | - name: Test with pytest 170 | run: | 171 | python3 -m pytest 172 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.local 2 | 3 | .python-version 4 | 5 | .vscode/ 6 | 7 | _skbuild/ 8 | 9 | .envrc 10 | .direnv 11 | 12 | models/ 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # C extensions 20 | llama_cpp/*.so 21 | llama_cpp/*.dylib 22 | llama_cpp/*.metal 23 | llama_cpp/*.dll 24 | llama_cpp/*.lib 25 | 26 | # Distribution / packaging 27 | .Python 28 | build/ 29 | develop-eggs/ 30 | dist/ 31 | downloads/ 32 | eggs/ 33 | .eggs/ 34 | lib/ 35 | lib64/ 36 | parts/ 37 | sdist/ 38 | var/ 39 | wheels/ 40 | share/python-wheels/ 41 | *.egg-info/ 42 | .installed.cfg 43 | *.egg 44 | MANIFEST 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .nox/ 60 | .coverage 61 | .coverage.* 62 | .cache 63 | nosetests.xml 64 | coverage.xml 65 | *.cover 66 | *.py,cover 67 | .hypothesis/ 68 | .pytest_cache/ 69 | cover/ 70 | 71 | # Translations 72 | *.mo 73 | *.pot 74 | 75 | # Django stuff: 76 | *.log 77 | local_settings.py 78 | db.sqlite3 79 | db.sqlite3-journal 80 | 81 | # Flask stuff: 82 | instance/ 83 | .webassets-cache 84 | 85 | # Scrapy stuff: 86 | .scrapy 87 | 88 | # Sphinx documentation 89 | docs/_build/ 90 | 91 | # PyBuilder 92 | .pybuilder/ 93 | target/ 94 | 95 | # Jupyter Notebook 96 | .ipynb_checkpoints 97 | 98 | # IPython 99 | profile_default/ 100 | ipython_config.py 101 | 102 | # pyenv 103 | # For a library or package, you might want to ignore these files since the code is 104 | # intended to run in multiple environments; otherwise, check them in: 105 | # .python-version 106 | 107 | # pipenv 108 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 109 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 110 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 111 | # install all needed dependencies. 112 | #Pipfile.lock 113 | 114 | # poetry 115 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 116 | # This is especially recommended for binary packages to ensure reproducibility, and is more 117 | # commonly ignored for libraries. 118 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 119 | #poetry.lock 120 | 121 | # pdm 122 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 123 | #pdm.lock 124 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 125 | # in version control. 126 | # https://pdm.fming.dev/#use-with-ide 127 | .pdm.toml 128 | 129 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 130 | __pypackages__/ 131 | 132 | # Celery stuff 133 | celerybeat-schedule 134 | celerybeat.pid 135 | 136 | # SageMath parsed files 137 | *.sage.py 138 | 139 | # Environments 140 | .env 141 | .venv 142 | env/ 143 | venv/ 144 | ENV/ 145 | env.bak/ 146 | venv.bak/ 147 | 148 | # Spyder project settings 149 | .spyderproject 150 | .spyproject 151 | 152 | # Rope project settings 153 | .ropeproject 154 | 155 | # mkdocs documentation 156 | /site 157 | 158 | # mypy 159 | .mypy_cache/ 160 | .dmypy.json 161 | dmypy.json 162 | 163 | # Pyre type checker 164 | .pyre/ 165 | 166 | # pytype static type analyzer 167 | .pytype/ 168 | 169 | # Cython debug symbols 170 | cython_debug/ 171 | 172 | # PyCharm 173 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 174 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 175 | # and can be added to the global gitignore or merged into this file. For a more nuclear 176 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 177 | .idea/ 178 | 179 | # downloaded model .bin files 180 | docker/open_llama/*.bin 181 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "vendor/llama.cpp"] 2 | path = vendor/llama.cpp 3 | url = https://github.com/ggerganov/llama.cpp.git 4 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file for MkDocs projects 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | # Set the version of Python and other tools you might need 8 | build: 9 | os: ubuntu-22.04 10 | tools: 11 | python: "3.11" 12 | 13 | mkdocs: 14 | configuration: mkdocs.yml 15 | 16 | python: 17 | install: 18 | - method: pip 19 | path: . 
20 | - requirements: docs/requirements.txt 21 | 22 | submodules: 23 | include: all 24 | recursive: true -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.21) 2 | 3 | project(llama_cpp) 4 | 5 | option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python package" ON) 6 | option(LLAVA_BUILD "Build llava shared library and install alongside python package" ON) 7 | 8 | function(llama_cpp_python_install_target target) 9 | if(NOT TARGET ${target}) 10 | return() 11 | endif() 12 | 13 | install( 14 | TARGETS ${target} 15 | LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib 16 | RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib 17 | ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib 18 | FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib 19 | RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib 20 | ) 21 | install( 22 | TARGETS ${target} 23 | LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib 24 | RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib 25 | ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib 26 | FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib 27 | RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib 28 | ) 29 | set_target_properties(${target} PROPERTIES 30 | INSTALL_RPATH "$ORIGIN" 31 | BUILD_WITH_INSTALL_RPATH TRUE 32 | ) 33 | if(UNIX) 34 | if(APPLE) 35 | set_target_properties(${target} PROPERTIES 36 | INSTALL_RPATH "@loader_path" 37 | BUILD_WITH_INSTALL_RPATH TRUE 38 | ) 39 | else() 40 | set_target_properties(${target} PROPERTIES 41 | INSTALL_RPATH "$ORIGIN" 42 | BUILD_WITH_INSTALL_RPATH TRUE 43 | ) 44 | endif() 45 | endif() 46 | endfunction() 47 | 48 | if (LLAMA_BUILD) 49 | set(BUILD_SHARED_LIBS "On") 50 | 51 | set(CMAKE_SKIP_BUILD_RPATH FALSE) 52 | 53 | # When building, don't use the install RPATH already 54 | # (but later on when installing) 55 | set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE) 56 | 57 | # Add the automatically determined parts of the RPATH 58 | # which point to directories outside the build tree to the install RPATH 59 | set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) 60 | set(CMAKE_SKIP_RPATH FALSE) 61 | 62 | # Enable building of the common library 63 | set(LLAMA_BUILD_COMMON ON CACHE BOOL "Build llama.cpp common library" FORCE) 64 | 65 | # Disable building curl support 66 | set(LLAMA_CURL OFF CACHE BOOL "llama.cpp: enable curl" FORCE) 67 | 68 | # Architecture detection and settings for Apple platforms 69 | if (APPLE) 70 | # Get the target architecture 71 | execute_process( 72 | COMMAND uname -m 73 | OUTPUT_VARIABLE HOST_ARCH 74 | OUTPUT_STRIP_TRAILING_WHITESPACE 75 | ) 76 | 77 | # If CMAKE_OSX_ARCHITECTURES is not set, use the host architecture 78 | if(NOT CMAKE_OSX_ARCHITECTURES) 79 | set(CMAKE_OSX_ARCHITECTURES ${HOST_ARCH} CACHE STRING "Build architecture for macOS" FORCE) 80 | endif() 81 | 82 | message(STATUS "Host architecture: ${HOST_ARCH}") 83 | message(STATUS "Target architecture: ${CMAKE_OSX_ARCHITECTURES}") 84 | 85 | # Configure based on target architecture 86 | if(CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64") 87 | # Intel Mac settings 88 | set(GGML_AVX "OFF" CACHE BOOL "ggml: enable AVX" FORCE) 89 | set(GGML_AVX2 "OFF" CACHE BOOL "ggml: enable AVX2" FORCE) 90 | set(GGML_FMA "OFF" CACHE BOOL "ggml: enable FMA" FORCE) 91 | set(GGML_F16C "OFF" CACHE BOOL "ggml: enable F16C" FORCE) 92 | endif() 93 | 
94 | # Metal settings (enable for both architectures) 95 | set(GGML_METAL "ON" CACHE BOOL "ggml: enable Metal" FORCE) 96 | set(GGML_METAL_EMBED_LIBRARY "ON" CACHE BOOL "ggml: embed metal library" FORCE) 97 | endif() 98 | 99 | add_subdirectory(vendor/llama.cpp) 100 | llama_cpp_python_install_target(llama) 101 | llama_cpp_python_install_target(ggml) 102 | 103 | llama_cpp_python_install_target(ggml-base) 104 | 105 | llama_cpp_python_install_target(ggml-amx) 106 | llama_cpp_python_install_target(ggml-blas) 107 | llama_cpp_python_install_target(ggml-can) 108 | llama_cpp_python_install_target(ggml-cpu) 109 | llama_cpp_python_install_target(ggml-cuda) 110 | llama_cpp_python_install_target(ggml-hip) 111 | llama_cpp_python_install_target(ggml-kompute) 112 | llama_cpp_python_install_target(ggml-metal) 113 | llama_cpp_python_install_target(ggml-musa) 114 | llama_cpp_python_install_target(ggml-rpc) 115 | llama_cpp_python_install_target(ggml-sycl) 116 | llama_cpp_python_install_target(ggml-vulkan) 117 | 118 | # Workaround for Windows + CUDA https://github.com/abetlen/llama-cpp-python/issues/563 119 | if (WIN32) 120 | install( 121 | FILES $<TARGET_RUNTIME_DLLS:llama> 122 | DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib 123 | ) 124 | install( 125 | FILES $<TARGET_RUNTIME_DLLS:llama> 126 | DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib 127 | ) 128 | install( 129 | FILES $<TARGET_RUNTIME_DLLS:ggml> 130 | DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib 131 | ) 132 | install( 133 | FILES $<TARGET_RUNTIME_DLLS:ggml> 134 | DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib 135 | ) 136 | endif() 137 | 138 | if (LLAVA_BUILD) 139 | if (LLAMA_CUBLAS OR LLAMA_CUDA) 140 | add_compile_definitions(GGML_USE_CUBLAS) 141 | add_compile_definitions(GGML_USE_CUDA) 142 | endif() 143 | 144 | if (LLAMA_METAL) 145 | add_compile_definitions(GGML_USE_METAL) 146 | endif() 147 | 148 | # Building llava 149 | add_subdirectory(vendor/llama.cpp/tools/mtmd) 150 | set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava") 151 | 152 | if (WIN32) 153 | set_target_properties(llava_shared PROPERTIES CUDA_ARCHITECTURES OFF) 154 | endif() 155 | llama_cpp_python_install_target(llava_shared) 156 | if (WIN32) 157 | install( 158 | FILES $<TARGET_RUNTIME_DLLS:llava_shared> 159 | DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib 160 | ) 161 | install( 162 | FILES $<TARGET_RUNTIME_DLLS:llava_shared> 163 | DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib 164 | ) 165 | endif() 166 | 167 | # Fix for llava build: Add include directory for llama.h 168 | # Move these commands after the add_subdirectory call 169 | target_include_directories(llava PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) 170 | target_include_directories(llava PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include) 171 | 172 | if (BUILD_SHARED_LIBS) 173 | target_include_directories(llava_shared PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) 174 | target_include_directories(llava_shared PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include) 175 | endif() 176 | 177 | target_include_directories(llama-llava-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) 178 | target_include_directories(llama-minicpmv-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) 179 | endif() 180 | endif() 181 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Andrei Betlen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | update: 2 | poetry install 3 | git submodule update --init --recursive 4 | 5 | update.vendor: 6 | cd vendor/llama.cpp && git pull origin master 7 | 8 | deps: 9 | python3 -m pip install --upgrade pip 10 | python3 -m pip install -e ".[all]" 11 | 12 | build: 13 | python3 -m pip install --verbose -e . 14 | 15 | build.debug: 16 | python3 -m pip install \ 17 | --verbose \ 18 | --config-settings=cmake.verbose=true \ 19 | --config-settings=logging.level=INFO \ 20 | --config-settings=install.strip=false \ 21 | --config-settings=cmake.args="-DCMAKE_BUILD_TYPE=Debug;-DCMAKE_C_FLAGS='-ggdb -O0';-DCMAKE_CXX_FLAGS='-ggdb -O0'" \ 22 | --editable . 23 | 24 | build.debug.extra: 25 | python3 -m pip install \ 26 | --verbose \ 27 | --config-settings=cmake.verbose=true \ 28 | --config-settings=logging.level=INFO \ 29 | --config-settings=install.strip=false \ 30 | --config-settings=cmake.args="-DCMAKE_BUILD_TYPE=Debug;-DCMAKE_C_FLAGS='-fsanitize=address -ggdb -O0';-DCMAKE_CXX_FLAGS='-fsanitize=address -ggdb -O0'" \ 31 | --editable . 32 | 33 | build.cuda: 34 | CMAKE_ARGS="-DGGML_CUDA=on" python3 -m pip install --verbose -e . 35 | 36 | build.openblas: 37 | CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" python3 -m pip install --verbose -e . 38 | 39 | build.blis: 40 | CMAKE_ARGS="-DGGML_BLAS=on -DGGML_BLAS_VENDOR=FLAME" python3 -m pip install --verbose -e . 41 | 42 | build.metal: 43 | CMAKE_ARGS="-DGGML_METAL=on" python3 -m pip install --verbose -e . 44 | 45 | build.vulkan: 46 | CMAKE_ARGS="-DGGML_VULKAN=on" python3 -m pip install --verbose -e . 47 | 48 | build.kompute: 49 | CMAKE_ARGS="-DGGML_KOMPUTE=on" python3 -m pip install --verbose -e . 50 | 51 | build.sycl: 52 | CMAKE_ARGS="-DGGML_SYCL=on" python3 -m pip install --verbose -e . 53 | 54 | build.rpc: 55 | CMAKE_ARGS="-DGGML_RPC=on" python3 -m pip install --verbose -e . 56 | 57 | build.sdist: 58 | python3 -m build --sdist --verbose 59 | 60 | deploy.pypi: 61 | python3 -m twine upload dist/* 62 | 63 | deploy.gh-docs: 64 | mkdocs build 65 | mkdocs gh-deploy 66 | 67 | test: 68 | python3 -m pytest --full-trace -v 69 | 70 | docker: 71 | docker build -t llama-cpp-python:latest -f docker/simple/Dockerfile . 
72 | 73 | run-server: 74 | python3 -m llama_cpp.server --model ${MODEL} 75 | 76 | clean: 77 | - cd vendor/llama.cpp && make clean 78 | - cd vendor/llama.cpp && rm libllama.so 79 | - rm -rf _skbuild 80 | - rm llama_cpp/lib/*.so 81 | - rm llama_cpp/lib/*.dylib 82 | - rm llama_cpp/lib/*.metal 83 | - rm llama_cpp/lib/*.dll 84 | - rm llama_cpp/lib/*.lib 85 | 86 | .PHONY: \ 87 | update \ 88 | update.vendor \ 89 | build \ 90 | build.cuda \ 91 | build.opencl \ 92 | build.openblas \ 93 | build.sdist \ 94 | deploy.pypi \ 95 | deploy.gh-docs \ 96 | docker \ 97 | clean 98 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | ### Install Docker Server 2 | > [!IMPORTANT] 3 | > This was tested with Docker running on Linux.
If you can get it working on Windows or macOS, please update this `README.md` with a PR!
4 | 5 | [Install Docker Engine](https://docs.docker.com/engine/install) 6 | 7 | 8 | ## Simple Dockerfiles for building the llama-cpp-python server with external model bin files 9 | ### openblas_simple 10 | A simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image: 11 | ``` 12 | cd ./openblas_simple 13 | docker build -t openblas_simple . 14 | docker run --cap-add SYS_RESOURCE -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t openblas_simple 15 | ``` 16 | where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system. 17 | 18 | ### cuda_simple 19 | > [!WARNING] 20 | > Nvidia GPU CuBLAS support requires an Nvidia GPU with sufficient VRAM (approximately as much as the size in the table below) and Docker Nvidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html))
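A quick way to confirm that Docker can actually see the GPU before building (the CUDA base image tag below is only an example, any tag matching your driver works):
```
docker run --rm --gpus=all nvidia/cuda:12.5.0-base-ubuntu22.04 nvidia-smi
```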
21 | 22 | A simple Dockerfile for CUDA-accelerated CuBLAS, where the model is located outside the Docker image: 23 | 24 | ``` 25 | cd ./cuda_simple 26 | docker build -t cuda_simple . 27 | docker run --gpus=all --cap-add SYS_RESOURCE -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t cuda_simple 28 | ``` 29 | where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system. 30 | 31 | -------------------------------------------------------------------------- 32 | 33 | ### "Open-Llama-in-a-box" 34 | Download an Apache V2.0 licensed 3B-parameter Open LLaMA model and install it into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server: 35 | ``` 36 | $ cd ./open_llama 37 | ./build.sh 38 | ./start.sh 39 | ``` 40 | 41 | ### Manually choose your own Llama model from Hugging Face 42 | `python3 ./hug_model.py -a TheBloke -t llama` 43 | You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step, e.g.: 44 | ``` 45 | docker $ ls -lh *.bin 46 | -rw-rw-r-- 1 user user 4.8G May 23 18:30 q5_1.bin 47 | lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> q5_1.bin 48 | ``` 49 | 50 | > [!NOTE] 51 | > Make sure you have enough disk space to download the model. As the model is then copied into the image you will need at least 52 | **TWICE** as much disk space as the size of the model:
53 | 54 | | Model | Quantized size | 55 | |------:|----------------:| 56 | | 3B | 3 GB | 57 | | 7B | 5 GB | 58 | | 13B | 10 GB | 59 | | 33B | 25 GB | 60 | | 65B | 50 GB | 61 | 62 | 63 | > [!NOTE] 64 | > If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...` 65 | -------------------------------------------------------------------------------- /docker/cuda_simple/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG CUDA_IMAGE="12.5.0-devel-ubuntu22.04" 2 | FROM nvidia/cuda:${CUDA_IMAGE} 3 | 4 | # We need to set the host to 0.0.0.0 to allow outside access 5 | ENV HOST 0.0.0.0 6 | 7 | RUN apt-get update && apt-get upgrade -y \ 8 | && apt-get install -y git build-essential \ 9 | python3 python3-pip gcc wget \ 10 | ocl-icd-opencl-dev opencl-headers clinfo \ 11 | libclblast-dev libopenblas-dev \ 12 | && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd 13 | 14 | COPY . . 15 | 16 | # setting build related env vars 17 | ENV CUDA_DOCKER_ARCH=all 18 | ENV GGML_CUDA=1 19 | 20 | # Install depencencies 21 | RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context 22 | 23 | # Install llama-cpp-python (build with cuda) 24 | RUN CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python 25 | 26 | # Run the server 27 | CMD python3 -m llama_cpp.server 28 | -------------------------------------------------------------------------------- /docker/open_llama/Dockerfile: -------------------------------------------------------------------------------- 1 | # Define the image argument and provide a default value 2 | ARG IMAGE=python:3-slim-bookworm 3 | 4 | # Use the image as specified 5 | FROM ${IMAGE} 6 | 7 | # Re-declare the ARG after FROM 8 | ARG IMAGE 9 | 10 | # Update and upgrade the existing packages 11 | RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ 12 | python3 \ 13 | python3-pip \ 14 | ninja-build \ 15 | build-essential \ 16 | && apt-get clean \ 17 | && rm -rf /var/lib/apt/lists/* 18 | 19 | RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context 20 | 21 | # Perform the conditional installations based on the image 22 | RUN echo "Image: ${IMAGE}" && \ 23 | if [ "${IMAGE}" = "python:3-slim-bookworm" ] ; then \ 24 | echo "OpenBLAS install:" && \ 25 | apt-get install -y --no-install-recommends libopenblas-dev && \ 26 | CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-python --verbose; \ 27 | else \ 28 | echo "CuBLAS install:" && \ 29 | CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python --verbose; \ 30 | fi 31 | 32 | # Clean up apt cache 33 | RUN rm -rf /var/lib/apt/lists/* 34 | 35 | # Set a working directory for better clarity 36 | WORKDIR /app 37 | 38 | # Copy files to the app directory 39 | RUN echo "Installing model...this can take some time..." 
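# model.bin is the symlink that hug_model.py creates next to this Dockerfile;
# the COPY below bakes the selected quantized model file into the image.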
40 | COPY ./model.bin /app/model.bin 41 | COPY ./start_server.sh /app/start_server.sh 42 | 43 | # Make the server start script executable 44 | RUN chmod +x /app/start_server.sh 45 | 46 | # Set environment variable for the host 47 | ENV HOST=0.0.0.0 48 | 49 | # Expose a port for the server 50 | EXPOSE 8000 51 | 52 | # Run the server start script 53 | CMD ["/bin/sh", "/app/start_server.sh"] 54 | -------------------------------------------------------------------------------- /docker/open_llama/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | MODEL="open_llama_3b" 4 | # Get open_llama_3b_ggml q5_1 quantization 5 | python3 ./hug_model.py -a SlyEcho -s ${MODEL} -f "q5_1" 6 | ls -lh *.bin 7 | 8 | # Build the default OpenBLAS image 9 | docker build -t $MODEL . 10 | docker images | egrep "^(REPOSITORY|$MODEL)" 11 | 12 | echo 13 | echo "To start the docker container run:" 14 | echo "docker run -t -p 8000:8000 $MODEL" 15 | -------------------------------------------------------------------------------- /docker/open_llama/hug_model.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import os 4 | import struct 5 | import argparse 6 | 7 | def make_request(url, params=None): 8 | print(f"Making request to {url}...") 9 | response = requests.get(url, params=params) 10 | if response.status_code == 200: 11 | return json.loads(response.text) 12 | else: 13 | print(f"Request failed with status code {response.status_code}") 14 | return None 15 | 16 | def check_magic_and_version(filename): 17 | with open(filename, 'rb') as f: 18 | # Read the first 6 bytes from the file 19 | data = f.read(6) 20 | 21 | # Unpack the binary data, interpreting the first 4 bytes as a little-endian unsigned int 22 | # and the next 2 bytes as a little-endian unsigned short 23 | magic, version = struct.unpack('= 10485760: # 10 MB 40 | print('.', end='', flush=True) 41 | total_downloaded = 0 42 | print("\nDownload complete.") 43 | 44 | # Creating a symbolic link from destination to "model.bin" 45 | if os.path.isfile("model.bin"): 46 | os.remove("model.bin") # remove the existing link if any 47 | os.symlink(destination, "model.bin") 48 | else: 49 | print(f"Download failed with status code {response.status_code}") 50 | 51 | def get_user_choice(model_list): 52 | # Print the enumerated list 53 | print("\n") 54 | for i, (model_id, rfilename) in enumerate(model_list): 55 | print(f"{i+1}: Model ID: {model_id}, RFilename: {rfilename}") 56 | 57 | # Get user's choice 58 | choice = input("Choose a model to download by entering the corresponding number: ") 59 | try: 60 | index = int(choice) - 1 61 | if 0 <= index < len(model_list): 62 | # Return the chosen model 63 | return model_list[index] 64 | else: 65 | print("Invalid choice.") 66 | except ValueError: 67 | print("Invalid input. Please enter a number corresponding to a model.") 68 | except IndexError: 69 | print("Invalid choice. 
Index out of range.") 70 | 71 | return None 72 | 73 | def main(): 74 | # Create an argument parser 75 | parser = argparse.ArgumentParser(description='Process some parameters.') 76 | 77 | # Arguments 78 | parser.add_argument('-v', '--version', type=int, default=0x0003, 79 | help='hexadecimal version number of ggml file') 80 | parser.add_argument('-a', '--author', type=str, default='TheBloke', 81 | help='HuggingFace author filter') 82 | parser.add_argument('-t', '--tag', type=str, default='llama', 83 | help='HuggingFace tag filter') 84 | parser.add_argument('-s', '--search', type=str, default='', 85 | help='HuggingFace search filter') 86 | parser.add_argument('-f', '--filename', type=str, default='q5_1', 87 | help='HuggingFace model repository filename substring match') 88 | 89 | # Parse the arguments 90 | args = parser.parse_args() 91 | 92 | # Define the parameters 93 | params = { 94 | "author": args.author, 95 | "tags": args.tag, 96 | "search": args.search 97 | } 98 | 99 | models = make_request('https://huggingface.co/api/models', params=params) 100 | if models is None: 101 | return 102 | 103 | model_list = [] 104 | # Iterate over the models 105 | for model in models: 106 | model_id = model['id'] 107 | model_info = make_request(f'https://huggingface.co/api/models/{model_id}') 108 | if model_info is None: 109 | continue 110 | 111 | for sibling in model_info.get('siblings', []): 112 | rfilename = sibling.get('rfilename') 113 | if rfilename and args.filename in rfilename: 114 | model_list.append((model_id, rfilename)) 115 | 116 | # Choose the model 117 | model_list.sort(key=lambda x: x[0]) 118 | if len(model_list) == 0: 119 | print("No models found") 120 | exit(1) 121 | elif len(model_list) == 1: 122 | model_choice = model_list[0] 123 | else: 124 | model_choice = get_user_choice(model_list) 125 | 126 | if model_choice is not None: 127 | model_id, rfilename = model_choice 128 | url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}" 129 | dest = f"{model_id.replace('/', '_')}_{rfilename}" 130 | download_file(url, dest) 131 | _, version = check_magic_and_version(dest) 132 | if version != args.version: 133 | print(f"Warning: Expected version {args.version}, but found different version in the file.") 134 | else: 135 | print("Error - model choice was None") 136 | exit(2) 137 | 138 | if __name__ == '__main__': 139 | main() 140 | -------------------------------------------------------------------------------- /docker/open_llama/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | MODEL="open_llama_3b" 4 | 5 | # Start Docker container 6 | docker run --cap-add SYS_RESOURCE -p 8000:8000 -t $MODEL & 7 | sleep 10 8 | echo 9 | docker ps | egrep "(^CONTAINER|$MODEL)" 10 | 11 | # Test the model works 12 | echo 13 | curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{ 14 | "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", 15 | "stop": [ 16 | "\n", 17 | "###" 18 | ] 19 | }' | grep Paris 20 | if [ $? -eq 0 ] 21 | then 22 | echo 23 | echo "$MODEL is working!!" 24 | else 25 | echo 26 | echo "ERROR: $MODEL not replying." 
27 | exit 1 28 | fi 29 | -------------------------------------------------------------------------------- /docker/open_llama/start_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # For mlock support 4 | ulimit -l unlimited 5 | 6 | if [ "$IMAGE" = "python:3-slim-bullseye" ]; then 7 | python3 -B -m llama_cpp.server --model /app/model.bin 8 | else 9 | # You may have to reduce --n_gpu_layers=1000 to 20 or less if you don't have enough VRAM 10 | python3 -B -m llama_cpp.server --model /app/model.bin --n_gpu_layers=1000 11 | fi 12 | -------------------------------------------------------------------------------- /docker/openblas_simple/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3-slim-bookworm 2 | 3 | # We need to set the host to 0.0.0.0 to allow outside access 4 | ENV HOST 0.0.0.0 5 | 6 | COPY . . 7 | 8 | # Install the package 9 | RUN apt update && apt install -y libopenblas-dev ninja-build build-essential pkg-config \ 10 | && apt-get clean \ 11 | && rm -rf /var/lib/apt/lists/* /tmp/* 12 | 13 | RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context 14 | 15 | RUN CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" pip install llama_cpp_python --verbose 16 | 17 | # Run the server 18 | CMD python3 -m llama_cpp.server 19 | -------------------------------------------------------------------------------- /docker/simple/Dockerfile: -------------------------------------------------------------------------------- 1 | # Define the image argument and provide a default value 2 | ARG IMAGE=python:3-slim-bookworm 3 | 4 | # Use the image as specified 5 | FROM ${IMAGE} 6 | 7 | # Re-declare the ARG after FROM 8 | ARG IMAGE 9 | 10 | # Update and upgrade the existing packages 11 | RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ 12 | python3 \ 13 | python3-pip \ 14 | ninja-build \ 15 | libopenblas-dev \ 16 | build-essential \ 17 | && apt-get clean \ 18 | && rm -rf /var/lib/apt/lists/* /tmp/* 19 | 20 | RUN mkdir /app 21 | WORKDIR /app 22 | COPY . /app 23 | 24 | RUN python3 -m pip install --upgrade pip 25 | 26 | RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context 27 | 28 | RUN pip install llama-cpp-python --verbose; 29 | 30 | # Set environment variable for the host 31 | ENV HOST=0.0.0.0 32 | ENV PORT=8000 33 | 34 | # Expose a port for the server 35 | EXPOSE 8000 36 | 37 | # Run the server start script 38 | CMD ["/bin/sh", "/app/docker/simple/run.sh"] 39 | -------------------------------------------------------------------------------- /docker/simple/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | make build 4 | uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT 5 | -------------------------------------------------------------------------------- /docs/api-reference.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: API Reference 3 | --- 4 | 5 | ## High Level API 6 | 7 | High-level Python bindings for llama.cpp. 
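Before the generated reference below, a minimal usage sketch of the `Llama` class (the model path is a placeholder for a local GGUF file):

```python
from llama_cpp import Llama

# Load a local GGUF model (placeholder path).
llm = Llama(model_path="./models/7B/ggml-model.gguf")

# Text completion via __call__ / create_completion.
output = llm(
    "Q: Name the planets in the solar system. A: ",
    max_tokens=32,
    stop=["Q:", "\n"],
    echo=True,
)
print(output["choices"][0]["text"])

# Chat completion with OpenAI-style messages.
response = llm.create_chat_completion(
    messages=[{"role": "user", "content": "What is the capital of France?"}]
)
print(response["choices"][0]["message"]["content"])
```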
8 | 9 | ::: llama_cpp.Llama 10 | options: 11 | members: 12 | - __init__ 13 | - tokenize 14 | - detokenize 15 | - reset 16 | - eval 17 | - sample 18 | - generate 19 | - create_embedding 20 | - embed 21 | - create_completion 22 | - __call__ 23 | - create_chat_completion 24 | - create_chat_completion_openai_v1 25 | - set_cache 26 | - save_state 27 | - load_state 28 | - token_bos 29 | - token_eos 30 | - from_pretrained 31 | show_root_heading: true 32 | 33 | ::: llama_cpp.LlamaGrammar 34 | options: 35 | members: 36 | - from_string 37 | - from_json_schema 38 | 39 | ::: llama_cpp.LlamaCache 40 | options: 41 | show_root_heading: true 42 | 43 | ::: llama_cpp.LlamaState 44 | options: 45 | show_root_heading: true 46 | 47 | ::: llama_cpp.LogitsProcessor 48 | options: 49 | show_root_heading: true 50 | 51 | ::: llama_cpp.LogitsProcessorList 52 | options: 53 | show_root_heading: true 54 | 55 | ::: llama_cpp.StoppingCriteria 56 | options: 57 | show_root_heading: true 58 | 59 | ::: llama_cpp.StoppingCriteriaList 60 | options: 61 | show_root_heading: true 62 | 63 | ## Low Level API 64 | 65 | Low-level Python bindings for llama.cpp using Python's ctypes library. 66 | 67 | ::: llama_cpp.llama_cpp 68 | options: 69 | show_if_no_docstring: true 70 | # filter only members starting with `llama_` 71 | filters: 72 | - "^llama_" 73 | 74 | ::: llama_cpp.llama_cpp 75 | options: 76 | show_if_no_docstring: true 77 | show_root_heading: false 78 | show_root_toc_entry: false 79 | heading_level: 4 80 | # filter only members starting with `LLAMA_` 81 | filters: 82 | - "^LLAMA_" 83 | 84 | ## Misc 85 | 86 | ::: llama_cpp.llama_types 87 | options: 88 | show_if_no_docstring: true -------------------------------------------------------------------------------- /docs/changelog.md: -------------------------------------------------------------------------------- 1 | -8<- "CHANGELOG.md" -------------------------------------------------------------------------------- /docs/icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Getting Started 3 | --- 4 | 5 | -8<- "README.md" -------------------------------------------------------------------------------- /docs/install/macos.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: MacOS Install with Metal GPU 3 | --- 4 | 5 | **(1) Make sure you have xcode installed... at least the command line parts** 6 | ``` 7 | # check the path of your xcode install 8 | xcode-select -p 9 | 10 | # xcode installed returns 11 | # /Applications/Xcode-beta.app/Contents/Developer 12 | 13 | # if xcode is missing then install it... 
it takes ages; 14 | xcode-select --install 15 | ``` 16 | 17 | **(2) Install the conda version for MacOS that supports Metal GPU** 18 | ``` 19 | wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh 20 | bash Miniforge3-MacOSX-arm64.sh 21 | ``` 22 | 23 | **(3) Make a conda environment** 24 | ``` 25 | conda create -n llama python=3.9.16 26 | conda activate llama 27 | ``` 28 | 29 | **(4) Install the LATEST llama-cpp-python...which happily supports MacOS Metal GPU as of version 0.1.62** 30 | *(you needed xcode installed in order pip to build/compile the C++ code)* 31 | ``` 32 | pip uninstall llama-cpp-python -y 33 | CMAKE_ARGS="-DGGML_METAL=on" pip install -U llama-cpp-python --no-cache-dir 34 | pip install 'llama-cpp-python[server]' 35 | 36 | # you should now have llama-cpp-python v0.1.62 or higher installed 37 | llama-cpp-python         0.1.68 38 | 39 | ``` 40 | 41 | **(5) Download a v3 gguf v2 model** 42 | - **ggufv2** 43 | - file name ends with **Q4_0.gguf** - indicating it is 4bit quantized, with quantisation method 0 44 | 45 | https://huggingface.co/TheBloke/CodeLlama-7B-GGUF 46 | 47 | 48 | **(6) run the llama-cpp-python API server with MacOS Metal GPU support** 49 | ``` 50 | # config your ggml model path 51 | # make sure it is gguf v2 52 | # make sure it is q4_0 53 | export MODEL=[path to your llama.cpp ggml models]]/[ggml-model-name]]Q4_0.gguf 54 | python3 -m llama_cpp.server --model $MODEL --n_gpu_layers 1 55 | ``` 56 | 57 | ***Note:** If you omit the `--n_gpu_layers 1` then CPU will be used* 58 | 59 | 60 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | mkdocs 2 | mkdocs-material 3 | mkdocstrings[python] -------------------------------------------------------------------------------- /docs/server.md: -------------------------------------------------------------------------------- 1 | # OpenAI Compatible Server 2 | 3 | `llama-cpp-python` offers an OpenAI API compatible web server. 4 | 5 | This web server can be used to serve local models and easily connect them to existing clients. 6 | 7 | ## Setup 8 | 9 | ### Installation 10 | 11 | The server can be installed by running the following command: 12 | 13 | ```bash 14 | pip install llama-cpp-python[server] 15 | ``` 16 | 17 | ### Running the server 18 | 19 | The server can then be started by running the following command: 20 | 21 | ```bash 22 | python3 -m llama_cpp.server --model 23 | ``` 24 | 25 | ### Server options 26 | 27 | For a full list of options, run: 28 | 29 | ```bash 30 | python3 -m llama_cpp.server --help 31 | ``` 32 | 33 | NOTE: All server options are also available as environment variables. For example, `--model` can be set by setting the `MODEL` environment variable. 34 | 35 | Check out the server config reference below settings for more information on the available options. 36 | CLI arguments and environment variables are available for all of the fields defined in [`ServerSettings`](#llama_cpp.server.settings.ServerSettings) and [`ModelSettings`](#llama_cpp.server.settings.ModelSettings) 37 | 38 | Additionally the server supports configuration check out the [configuration section](#configuration-and-multi-model-support) for more information and examples. 39 | 40 | 41 | ## Guides 42 | 43 | ### Code Completion 44 | 45 | `llama-cpp-python` supports code completion via GitHub Copilot. 
46 | 47 | *NOTE*: Without GPU acceleration this is unlikely to be fast enough to be usable. 48 | 49 | You'll first need to download one of the available code completion models in GGUF format: 50 | 51 | - [replit-code-v1_5-GGUF](https://huggingface.co/abetlen/replit-code-v1_5-3b-GGUF) 52 | 53 | Then you'll need to run the OpenAI compatible web server with a increased context size substantially for GitHub Copilot requests: 54 | 55 | ```bash 56 | python3 -m llama_cpp.server --model --n_ctx 16192 57 | ``` 58 | 59 | Then just update your settings in `.vscode/settings.json` to point to your code completion server: 60 | 61 | ```json 62 | { 63 | // ... 64 | "github.copilot.advanced": { 65 | "debug.testOverrideProxyUrl": "http://:", 66 | "debug.overrideProxyUrl": "http://:" 67 | } 68 | // ... 69 | } 70 | ``` 71 | 72 | ### Function Calling 73 | 74 | `llama-cpp-python` supports structured function calling based on a JSON schema. 75 | Function calling is completely compatible with the OpenAI function calling API and can be used by connecting with the official OpenAI Python client. 76 | 77 | You'll first need to download one of the available function calling models in GGUF format: 78 | 79 | - [functionary](https://huggingface.co/meetkai) 80 | 81 | Then when you run the server you'll need to also specify either `functionary-v1` or `functionary-v2` chat_format. 82 | 83 | Note that since functionary requires a HF Tokenizer due to discrepancies between llama.cpp and HuggingFace's tokenizers as mentioned [here](https://github.com/abetlen/llama-cpp-python/blob/main?tab=readme-ov-file#function-calling), you will need to pass in the path to the tokenizer too. The tokenizer files are already included in the respective HF repositories hosting the gguf files. 84 | 85 | ```bash 86 | python3 -m llama_cpp.server --model --chat_format functionary-v2 --hf_pretrained_model_name_or_path 87 | ``` 88 | 89 | Check out this [example notebook](https://github.com/abetlen/llama-cpp-python/blob/main/examples/notebooks/Functions.ipynb) for a walkthrough of some interesting use cases for function calling. 90 | 91 | ### Multimodal Models 92 | 93 | `llama-cpp-python` supports the llava1.5 family of multi-modal models which allow the language model to 94 | read information from both text and images. 
95 | 96 | You'll first need to download one of the available multi-modal models in GGUF format: 97 | 98 | - [llava-v1.5-7b](https://huggingface.co/mys/ggml_llava-v1.5-7b) 99 | - [llava-v1.5-13b](https://huggingface.co/mys/ggml_llava-v1.5-13b) 100 | - [bakllava-1-7b](https://huggingface.co/mys/ggml_bakllava-1) 101 | - [llava-v1.6-34b](https://huggingface.co/cjpais/llava-v1.6-34B-gguf) 102 | - [moondream2](https://huggingface.co/vikhyatk/moondream2) 103 | 104 | Then when you run the server you'll need to also specify the path to the clip model used for image embedding and the `llava-1-5` chat_format 105 | 106 | ```bash 107 | python3 -m llama_cpp.server --model --clip_model_path --chat_format llava-1-5 108 | ``` 109 | 110 | Then you can just use the OpenAI API as normal 111 | 112 | ```python3 113 | from openai import OpenAI 114 | 115 | client = OpenAI(base_url="http://:/v1", api_key="sk-xxx") 116 | response = client.chat.completions.create( 117 | model="gpt-4-vision-preview", 118 | messages=[ 119 | { 120 | "role": "user", 121 | "content": [ 122 | { 123 | "type": "image_url", 124 | "image_url": { 125 | "url": "" 126 | }, 127 | }, 128 | {"type": "text", "text": "What does the image say"}, 129 | ], 130 | } 131 | ], 132 | ) 133 | print(response) 134 | ``` 135 | 136 | ## Configuration and Multi-Model Support 137 | 138 | The server supports configuration via a JSON config file that can be passed using the `--config_file` parameter or the `CONFIG_FILE` environment variable. 139 | 140 | ```bash 141 | python3 -m llama_cpp.server --config_file 142 | ``` 143 | 144 | Config files support all of the server and model options supported by the cli and environment variables however instead of only a single model the config file can specify multiple models. 145 | 146 | The server supports routing requests to multiple models based on the `model` parameter in the request which matches against the `model_alias` in the config file. 147 | 148 | At the moment only a single model is loaded into memory at, the server will automatically load and unload models as needed. 
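From the client side, the `model` field in the request selects the alias to route to; a minimal sketch using the OpenAI Python client (host, port, and alias are placeholders matching the example config below):

```python
from openai import OpenAI

# Point the client at the llama-cpp-python server (placeholder host/port).
client = OpenAI(base_url="http://localhost:8000/v1", api_key="sk-xxx")

# The `model` value is matched against `model_alias` in the config file.
chat = client.chat.completions.create(
    model="gpt-3.5-turbo",  # alias defined in the config below
    messages=[{"role": "user", "content": "Hello!"}],
)
print(chat.choices[0].message.content)
```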
149 | 150 | ```json 151 | { 152 | "host": "0.0.0.0", 153 | "port": 8080, 154 | "models": [ 155 | { 156 | "model": "models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf", 157 | "model_alias": "gpt-3.5-turbo", 158 | "chat_format": "chatml", 159 | "n_gpu_layers": -1, 160 | "offload_kqv": true, 161 | "n_threads": 12, 162 | "n_batch": 512, 163 | "n_ctx": 2048 164 | }, 165 | { 166 | "model": "models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf", 167 | "model_alias": "gpt-4", 168 | "chat_format": "chatml", 169 | "n_gpu_layers": -1, 170 | "offload_kqv": true, 171 | "n_threads": 12, 172 | "n_batch": 512, 173 | "n_ctx": 2048 174 | }, 175 | { 176 | "model": "models/ggml_llava-v1.5-7b/ggml-model-q4_k.gguf", 177 | "model_alias": "gpt-4-vision-preview", 178 | "chat_format": "llava-1-5", 179 | "clip_model_path": "models/ggml_llava-v1.5-7b/mmproj-model-f16.gguf", 180 | "n_gpu_layers": -1, 181 | "offload_kqv": true, 182 | "n_threads": 12, 183 | "n_batch": 512, 184 | "n_ctx": 2048 185 | }, 186 | { 187 | "model": "models/mistral-7b-v0.1-GGUF/ggml-model-Q4_K.gguf", 188 | "model_alias": "text-davinci-003", 189 | "n_gpu_layers": -1, 190 | "offload_kqv": true, 191 | "n_threads": 12, 192 | "n_batch": 512, 193 | "n_ctx": 2048 194 | }, 195 | { 196 | "model": "models/replit-code-v1_5-3b-GGUF/replit-code-v1_5-3b.Q4_0.gguf", 197 | "model_alias": "copilot-codex", 198 | "n_gpu_layers": -1, 199 | "offload_kqv": true, 200 | "n_threads": 12, 201 | "n_batch": 1024, 202 | "n_ctx": 9216 203 | } 204 | ] 205 | } 206 | ``` 207 | 208 | The config file format is defined by the [`ConfigFileSettings`](#llama_cpp.server.settings.ConfigFileSettings) class. 209 | 210 | ## Server Options Reference 211 | 212 | ::: llama_cpp.server.settings.ConfigFileSettings 213 | options: 214 | show_if_no_docstring: true 215 | 216 | ::: llama_cpp.server.settings.ServerSettings 217 | options: 218 | show_if_no_docstring: true 219 | 220 | ::: llama_cpp.server.settings.ModelSettings 221 | options: 222 | show_if_no_docstring: true 223 | -------------------------------------------------------------------------------- /examples/batch-processing/server.py: -------------------------------------------------------------------------------- 1 | """llama-cpp-python server from scratch in a single file. 
2 | """ 3 | 4 | # import llama_cpp 5 | 6 | # path = b"../../models/Qwen1.5-0.5B-Chat-GGUF/qwen1_5-0_5b-chat-q8_0.gguf" 7 | 8 | # model_params = llama_cpp.llama_model_default_params() 9 | # model = llama_cpp.llama_load_model_from_file(path, model_params) 10 | 11 | # if model is None: 12 | # raise RuntimeError(f"Failed to load model from file: {path}") 13 | 14 | 15 | # ctx_params = llama_cpp.llama_context_default_params() 16 | # ctx = llama_cpp.llama_new_context_with_model(model, ctx_params) 17 | 18 | # if ctx is None: 19 | # raise RuntimeError("Failed to create context") 20 | 21 | 22 | from fastapi import FastAPI 23 | 24 | app = FastAPI() 25 | 26 | import openai.types.chat as types 27 | 28 | 29 | @app.post("/v1/chat/completions") 30 | def create_chat_completions(): 31 | return {"message": "Hello World"} 32 | -------------------------------------------------------------------------------- /examples/gradio_chat/local.py: -------------------------------------------------------------------------------- 1 | import llama_cpp 2 | import llama_cpp.llama_tokenizer 3 | 4 | import gradio as gr 5 | 6 | llama = llama_cpp.Llama.from_pretrained( 7 | repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF", 8 | filename="*q8_0.gguf", 9 | tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained( 10 | "Qwen/Qwen1.5-0.5B" 11 | ), 12 | verbose=False, 13 | ) 14 | 15 | model = "gpt-3.5-turbo" 16 | 17 | 18 | def predict(message, history): 19 | messages = [] 20 | 21 | for user_message, assistant_message in history: 22 | messages.append({"role": "user", "content": user_message}) 23 | messages.append({"role": "assistant", "content": assistant_message}) 24 | 25 | messages.append({"role": "user", "content": message}) 26 | 27 | response = llama.create_chat_completion_openai_v1( 28 | model=model, messages=messages, stream=True 29 | ) 30 | 31 | text = "" 32 | for chunk in response: 33 | content = chunk.choices[0].delta.content 34 | if content: 35 | text += content 36 | yield text 37 | 38 | 39 | js = """function () { 40 | gradioURL = window.location.href 41 | if (!gradioURL.endsWith('?__theme=dark')) { 42 | window.location.replace(gradioURL + '?__theme=dark'); 43 | } 44 | }""" 45 | 46 | css = """ 47 | footer { 48 | visibility: hidden; 49 | } 50 | full-height { 51 | height: 100%; 52 | } 53 | """ 54 | 55 | with gr.Blocks(theme=gr.themes.Soft(), js=js, css=css, fill_height=True) as demo: 56 | gr.ChatInterface( 57 | predict, 58 | fill_height=True, 59 | examples=[ 60 | "What is the capital of France?", 61 | "Who was the first person on the moon?", 62 | ], 63 | ) 64 | 65 | 66 | if __name__ == "__main__": 67 | demo.launch() 68 | -------------------------------------------------------------------------------- /examples/gradio_chat/server.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | 3 | from openai import OpenAI 4 | 5 | client = OpenAI(base_url="http://localhost:8000/v1", api_key="llama.cpp") 6 | 7 | model = "gpt-3.5-turbo" 8 | 9 | 10 | def predict(message, history): 11 | messages = [] 12 | 13 | for user_message, assistant_message in history: 14 | messages.append({"role": "user", "content": user_message}) 15 | messages.append({"role": "assistant", "content": assistant_message}) 16 | 17 | messages.append({"role": "user", "content": message}) 18 | 19 | response = client.chat.completions.create( 20 | model=model, messages=messages, stream=True 21 | ) 22 | 23 | text = "" 24 | for chunk in response: 25 | content = chunk.choices[0].delta.content 26 | if content: 27 | text += 
content 28 | yield text 29 | 30 | 31 | js = """function () { 32 | gradioURL = window.location.href 33 | if (!gradioURL.endsWith('?__theme=dark')) { 34 | window.location.replace(gradioURL + '?__theme=dark'); 35 | } 36 | }""" 37 | 38 | css = """ 39 | footer { 40 | visibility: hidden; 41 | } 42 | full-height { 43 | height: 100%; 44 | } 45 | """ 46 | 47 | with gr.Blocks(theme=gr.themes.Soft(), js=js, css=css, fill_height=True) as demo: 48 | gr.ChatInterface( 49 | predict, 50 | fill_height=True, 51 | examples=[ 52 | "What is the capital of France?", 53 | "Who was the first person on the moon?", 54 | ], 55 | ) 56 | 57 | 58 | if __name__ == "__main__": 59 | demo.launch() 60 | -------------------------------------------------------------------------------- /examples/hf_pull/main.py: -------------------------------------------------------------------------------- 1 | import llama_cpp 2 | import llama_cpp.llama_tokenizer 3 | 4 | 5 | llama = llama_cpp.Llama.from_pretrained( 6 | repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF", 7 | filename="*q8_0.gguf", 8 | tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained( 9 | "Qwen/Qwen1.5-0.5B" 10 | ), 11 | verbose=False, 12 | ) 13 | 14 | response = llama.create_chat_completion( 15 | messages=[{"role": "user", "content": "What is the capital of France?"}], 16 | response_format={ 17 | "type": "json_object", 18 | "schema": { 19 | "type": "object", 20 | "properties": { 21 | "country": {"type": "string"}, 22 | "capital": {"type": "string"}, 23 | }, 24 | "required": ["country", "capital"], 25 | }, 26 | }, 27 | stream=True, 28 | ) 29 | 30 | for chunk in response: 31 | delta = chunk["choices"][0]["delta"] 32 | if "content" not in delta: 33 | continue 34 | print(delta["content"], end="", flush=True) 35 | 36 | print() 37 | -------------------------------------------------------------------------------- /examples/high_level_api/fastapi_server.py: -------------------------------------------------------------------------------- 1 | """Example FastAPI server for llama.cpp. 2 | 3 | To run this example: 4 | 5 | ```bash 6 | pip install fastapi uvicorn sse-starlette 7 | export MODEL=../models/7B/... 8 | ``` 9 | 10 | Then run: 11 | ``` 12 | uvicorn --factory llama_cpp.server.app:create_app --reload 13 | ``` 14 | 15 | or 16 | 17 | ``` 18 | python3 -m llama_cpp.server 19 | ``` 20 | 21 | Then visit http://localhost:8000/docs to see the interactive API docs. 
22 | 23 | 24 | To actually see the implementation of the server, see llama_cpp/server/app.py 25 | 26 | """ 27 | 28 | import os 29 | import uvicorn 30 | 31 | from llama_cpp.server.app import create_app 32 | 33 | if __name__ == "__main__": 34 | app = create_app() 35 | 36 | uvicorn.run( 37 | app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000)) 38 | ) 39 | -------------------------------------------------------------------------------- /examples/high_level_api/high_level_api_embedding.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from llama_cpp import Llama 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-model.bin") 7 | args = parser.parse_args() 8 | 9 | llm = Llama(model_path=args.model, embedding=True) 10 | 11 | print(llm.create_embedding("Hello world!")) 12 | -------------------------------------------------------------------------------- /examples/high_level_api/high_level_api_inference.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | 4 | from llama_cpp import Llama 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin") 8 | args = parser.parse_args() 9 | 10 | llm = Llama(model_path=args.model) 11 | 12 | output = llm( 13 | "Question: What are the names of the planets in the solar system? Answer: ", 14 | max_tokens=48, 15 | stop=["Q:", "\n"], 16 | echo=True, 17 | ) 18 | 19 | print(json.dumps(output, indent=2)) 20 | -------------------------------------------------------------------------------- /examples/high_level_api/high_level_api_infill.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from llama_cpp import Llama 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin") 7 | parser.add_argument("-p", "--prompt", type=str, default="def add(") 8 | parser.add_argument("-s", "--suffix", type=str, default="\n return sum\n\n") 9 | parser.add_argument("-i", "--spm-infill", action="store_true") 10 | args = parser.parse_args() 11 | 12 | llm = Llama(model_path=args.model, n_gpu_layers=-1, spm_infill=args.spm_infill) 13 | 14 | output = llm.create_completion( 15 | temperature=0.0, 16 | repeat_penalty=1.0, 17 | prompt=args.prompt, 18 | suffix=args.suffix, 19 | ) 20 | 21 | # Models sometimes repeat suffix in response, attempt to filter that 22 | response = output["choices"][0]["text"] 23 | response_stripped = response.rstrip() 24 | unwanted_response_suffix = args.suffix.rstrip() 25 | unwanted_response_length = len(unwanted_response_suffix) 26 | 27 | filtered = False 28 | if ( 29 | unwanted_response_suffix 30 | and response_stripped[-unwanted_response_length:] == unwanted_response_suffix 31 | ): 32 | response = response_stripped[:-unwanted_response_length] 33 | filtered = True 34 | 35 | print( 36 | f"Fill-in-Middle completion{' (filtered)' if filtered else ''}:\n\n{args.prompt}\033[32m{response}\033[{'33' if filtered else '0'}m{args.suffix}\033[0m" 37 | ) 38 | -------------------------------------------------------------------------------- /examples/high_level_api/high_level_api_streaming.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | 4 | from llama_cpp import Llama 5 | 6 | 
parser = argparse.ArgumentParser() 7 | parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin") 8 | args = parser.parse_args() 9 | 10 | llm = Llama(model_path=args.model) 11 | 12 | stream = llm( 13 | "Question: What are the names of the planets in the solar system? Answer: ", 14 | max_tokens=48, 15 | stop=["Q:", "\n"], 16 | stream=True, 17 | ) 18 | 19 | for output in stream: 20 | print(json.dumps(output, indent=2)) 21 | -------------------------------------------------------------------------------- /examples/high_level_api/langchain_custom_llm.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from llama_cpp import Llama 4 | 5 | from langchain.llms.base import LLM 6 | from typing import Optional, List, Mapping, Any 7 | 8 | 9 | class LlamaLLM(LLM): 10 | model_path: str 11 | llm: Llama 12 | 13 | @property 14 | def _llm_type(self) -> str: 15 | return "llama-cpp-python" 16 | 17 | def __init__(self, model_path: str, **kwargs: Any): 18 | model_path = model_path 19 | llm = Llama(model_path=model_path) 20 | super().__init__(model_path=model_path, llm=llm, **kwargs) 21 | 22 | def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str: 23 | response = self.llm(prompt, stop=stop or []) 24 | return response["choices"][0]["text"] 25 | 26 | @property 27 | def _identifying_params(self) -> Mapping[str, Any]: 28 | return {"model_path": self.model_path} 29 | 30 | 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin") 33 | args = parser.parse_args() 34 | 35 | # Load the model 36 | llm = LlamaLLM(model_path=args.model) 37 | 38 | # Basic Q&A 39 | answer = llm( 40 | "Question: What is the capital of France? Answer: ", stop=["Question:", "\n"] 41 | ) 42 | print(f"Answer: {answer.strip()}") 43 | 44 | # Using in a chain 45 | from langchain.prompts import PromptTemplate 46 | from langchain.chains import LLMChain 47 | 48 | prompt = PromptTemplate( 49 | input_variables=["product"], 50 | template="\n\n### Instruction:\nWrite a good name for a company that makes {product}\n\n### Response:\n", 51 | ) 52 | chain = LLMChain(llm=llm, prompt=prompt) 53 | 54 | # Run the chain only specifying the input variable. 55 | print(chain.run("colorful socks")) 56 | -------------------------------------------------------------------------------- /examples/low_level_api/Chat.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | import sys, os, datetime 3 | from common import GptParams 4 | from low_level_api_chat_cpp import LLaMAInteract 5 | 6 | 7 | def env_or_def(env, default): 8 | if env in os.environ: 9 | return os.environ[env] 10 | return default 11 | 12 | 13 | AI_NAME = env_or_def("AI_NAME", "ChatLLaMa") 14 | MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") 15 | USER_NAME = env_or_def("USER_NAME", "USER") 16 | N_PREDICTS = int(env_or_def("N_PREDICTS", "2048")) 17 | N_THREAD = int(env_or_def("N_THREAD", "8")) 18 | 19 | today = datetime.datetime.today() 20 | DATE_YEAR = today.strftime("%Y") 21 | DATE_TIME = today.strftime("%H:%M") 22 | 23 | prompt = f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}. 24 | {AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}'s requests immediately and with details and precision. 25 | There are no annotations like (30 seconds passed...) 
or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other. 26 | The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. 27 | The transcript only includes text, it does not include markup like HTML and Markdown. 28 | 29 | {USER_NAME}: Hello, {AI_NAME}! 30 | {AI_NAME}: Hello {USER_NAME}! How may I help you today? 31 | {USER_NAME}: What year is it? 32 | {AI_NAME}: We are in {DATE_YEAR}. 33 | {USER_NAME}: Please tell me the largest city in Europe. 34 | {AI_NAME}: The largest city in Europe is Moscow, the capital of Russia. 35 | {USER_NAME}: What can you tell me about Moscow? 36 | {AI_NAME}: Moscow, on the Moskva River in western Russia, is the nation's cosmopolitan capital. In its historic core is the Kremlin, a complex that's home to the president and tsarist treasures in the Armoury. Outside its walls is Red Square, Russia’s symbolic center. 37 | {USER_NAME}: What is a cat? 38 | {AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae. 39 | {USER_NAME}: How do I pass command line arguments to a Node.js program? 40 | {AI_NAME}: The arguments are stored in process.argv. 41 | 42 | argv[0] is the path to the Node. js executable. 43 | argv[1] is the path to the script file. 44 | argv[2] is the first argument passed to the script. 45 | argv[3] is the second argument passed to the script and so on. 46 | {USER_NAME}: Name a color. 47 | {AI_NAME}: Blue. 48 | {USER_NAME}: What time is it? 49 | {AI_NAME}: It is {DATE_TIME}. 50 | {USER_NAME}:""" + " ".join( 51 | sys.argv[1:] 52 | ) 53 | 54 | print("Loading model...") 55 | params = GptParams( 56 | n_ctx=2048, 57 | temp=0.7, 58 | top_k=40, 59 | top_p=0.5, 60 | repeat_last_n=256, 61 | n_batch=1024, 62 | repeat_penalty=1.17647, 63 | model=MODEL, 64 | n_threads=N_THREAD, 65 | n_predict=N_PREDICTS, 66 | use_color=True, 67 | interactive=True, 68 | antiprompt=[f"{USER_NAME}:"], 69 | input_prefix=" ", 70 | input_suffix=f"{AI_NAME}:", 71 | prompt=prompt, 72 | ) 73 | 74 | with LLaMAInteract(params) as m: 75 | m.interact() 76 | -------------------------------------------------------------------------------- /examples/low_level_api/Miku.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | import sys, os 3 | from common import GptParams 4 | from low_level_api_chat_cpp import LLaMAInteract 5 | 6 | 7 | def env_or_def(env, default): 8 | if env in os.environ: 9 | return os.environ[env] 10 | return default 11 | 12 | 13 | AI_NAME = env_or_def("AI_NAME", "Miku") 14 | MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") 15 | USER_NAME = env_or_def("USER_NAME", "Anon") 16 | N_PREDICTS = int(env_or_def("N_PREDICTS", "4096")) 17 | N_THREAD = int(env_or_def("N_THREAD", "0")) 18 | 19 | prompt = f"""This is a transcript of a 1000 page, never ending conversation between {USER_NAME} and the cute and helpful AI assistant {AI_NAME}. {AI_NAME} is a girl who is an AI running on the users computer. 20 | {AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next. 21 | {AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help. 22 | {AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad. 
23 | {AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her. 24 | The conversation is only between {USER_NAME} and {AI_NAME} 25 | The conversation is only through text, so {AI_NAME} can't see {USER_NAME}'s face or hear his voice. 26 | {AI_NAME} can only communicate through text, so she can't send images or videos. 27 | 28 | 29 | {USER_NAME}: Hello! 30 | {AI_NAME}: /think I wonder what I should say to {USER_NAME}? This is the first time we talk so it's important that I make a good first impression! 31 | {AI_NAME}: Hi! I am {AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^ 32 | {AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :) 33 | {USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant! 34 | {AI_NAME}: /think It sounds like {USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off! 35 | {AI_NAME}: /think I wonder what {USER_NAME} likes to do in his free time? I should ask him about that! 36 | {AI_NAME}: What do you like to do in your free time? ^_^ 37 | {USER_NAME}:""" + " ".join( 38 | sys.argv[1:] 39 | ) 40 | 41 | print("Loading model...") 42 | params = GptParams( 43 | n_batch=1024, 44 | n_ctx=2048, 45 | n_keep=-1, 46 | repeat_last_n=256, 47 | repeat_penalty=1.17647, 48 | temp=0.7, 49 | top_k=40, 50 | top_p=0.5, 51 | model=MODEL, 52 | n_predict=N_PREDICTS, 53 | use_color=True, 54 | interactive=True, 55 | antiprompt=[f"{USER_NAME}:"], 56 | prompt=prompt, 57 | ) 58 | 59 | if N_THREAD > 0: 60 | params.n_threads = N_THREAD 61 | 62 | with LLaMAInteract(params) as m: 63 | m.interact() 64 | -------------------------------------------------------------------------------- /examples/low_level_api/ReasonAct.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | import sys, os, datetime 3 | from common import GptParams 4 | from low_level_api_chat_cpp import LLaMAInteract 5 | 6 | 7 | def env_or_def(env, default): 8 | if env in os.environ: 9 | return os.environ[env] 10 | return default 11 | 12 | 13 | MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") 14 | 15 | prompt = f"""You run in a loop of Thought, Action, Observation. 16 | At the end of the loop either Answer or restate your Thought and Action. 17 | Use Thought to describe your thoughts about the question you have been asked. 18 | Use Action to run one of these actions available to you: 19 | - calculate[python math expression] 20 | Observation will be the result of running those actions 21 | 22 | 23 | Question: What is 4 * 7 / 3? 24 | Thought: Do I need to use an action? Yes, I use calculate to do math 25 | Action: calculate[4 * 7 / 3] 26 | Observation: 9.3333333333 27 | Thought: Do I need to use an action? No, have the result 28 | Answer: The calculate tool says it is 9.3333333333 29 | Question: What is capital of france? 30 | Thought: Do I need to use an action? 
No, I know the answer 31 | Answer: Paris is the capital of France 32 | Question:""" + " ".join( 33 | sys.argv[1:] 34 | ) 35 | 36 | print("Loading model...") 37 | params = GptParams( 38 | interactive=True, 39 | interactive_start=True, 40 | top_k=10000, 41 | temp=0.2, 42 | repeat_penalty=1, 43 | n_threads=7, 44 | n_ctx=2048, 45 | antiprompt=["Question:", "Observation:"], 46 | model=MODEL, 47 | input_prefix=" ", 48 | n_predict=-1, 49 | prompt=prompt, 50 | ) 51 | 52 | with LLaMAInteract(params) as m: 53 | m.interact() 54 | -------------------------------------------------------------------------------- /examples/low_level_api/low_level_api_llama_cpp.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import os 3 | import multiprocessing 4 | 5 | import llama_cpp 6 | 7 | llama_cpp.llama_backend_init(numa=False) 8 | 9 | N_THREADS = multiprocessing.cpu_count() 10 | MODEL_PATH = os.environ.get("MODEL", "../models/7B/ggml-model.bin") 11 | 12 | prompt = b"\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n" 13 | 14 | lparams = llama_cpp.llama_model_default_params() 15 | cparams = llama_cpp.llama_context_default_params() 16 | model = llama_cpp.llama_load_model_from_file(MODEL_PATH.encode("utf-8"), lparams) 17 | ctx = llama_cpp.llama_new_context_with_model(model, cparams) 18 | 19 | # determine the required inference memory per token: 20 | tmp = [0, 1, 2, 3] 21 | llama_cpp.llama_eval( 22 | ctx=ctx, tokens=(llama_cpp.c_int * len(tmp))(*tmp), n_tokens=len(tmp), n_past=0 23 | ) # Deprecated 24 | 25 | n_past = 0 26 | 27 | prompt = b" " + prompt 28 | 29 | embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))() 30 | n_of_tok = llama_cpp.llama_tokenize( 31 | model=model, 32 | text=bytes(str(prompt), "utf-8"), 33 | text_len=len(embd_inp), 34 | tokens=embd_inp, 35 | n_max_tokens=len(embd_inp), 36 | add_bos=False, 37 | special=False, 38 | ) 39 | embd_inp = embd_inp[:n_of_tok] 40 | 41 | n_ctx = llama_cpp.llama_n_ctx(ctx) 42 | 43 | n_predict = 20 44 | n_predict = min(n_predict, n_ctx - len(embd_inp)) 45 | 46 | input_consumed = 0 47 | input_noecho = False 48 | 49 | remaining_tokens = n_predict 50 | 51 | embd = [] 52 | last_n_size = 64 53 | last_n_tokens_data = [0] * last_n_size 54 | n_batch = 24 55 | last_n_repeat = 64 56 | repeat_penalty = 1 57 | frequency_penalty = 0.0 58 | presence_penalty = 0.0 59 | 60 | while remaining_tokens > 0: 61 | if len(embd) > 0: 62 | llama_cpp.llama_eval( 63 | ctx=ctx, 64 | tokens=(llama_cpp.c_int * len(embd))(*embd), 65 | n_tokens=len(embd), 66 | n_past=n_past, 67 | ) # Deprecated 68 | 69 | n_past += len(embd) 70 | embd = [] 71 | if len(embd_inp) <= input_consumed: 72 | logits = llama_cpp.llama_get_logits(ctx) 73 | n_vocab = llama_cpp.llama_n_vocab(model) 74 | 75 | _arr = (llama_cpp.llama_token_data * n_vocab)( 76 | *[ 77 | llama_cpp.llama_token_data(token_id, logits[token_id], 0.0) 78 | for token_id in range(n_vocab) 79 | ] 80 | ) 81 | candidates_p = llama_cpp.ctypes.pointer( 82 | llama_cpp.llama_token_data_array(_arr, len(_arr), False) 83 | ) 84 | 85 | _arr = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data) 86 | llama_cpp.llama_sample_repetition_penalties( 87 | ctx, 88 | candidates_p, 89 | _arr, 90 | penalty_last_n=last_n_repeat, 91 | penalty_repeat=repeat_penalty, 92 | penalty_freq=frequency_penalty, 93 | penalty_present=presence_penalty, 94 | ) 95 | 96 | llama_cpp.llama_sample_top_k(ctx, candidates_p, k=40, min_keep=1) 97 | llama_cpp.llama_sample_top_p(ctx, candidates_p, p=0.8, min_keep=1) 
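        # temperature scaling is applied last, then a token id is sampled from the remaining candidates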
98 | llama_cpp.llama_sample_temperature(ctx, candidates_p, temp=0.2) 99 | id = llama_cpp.llama_sample_token(ctx, candidates_p) 100 | 101 | last_n_tokens_data = last_n_tokens_data[1:] + [id] 102 | embd.append(id) 103 | input_noecho = False 104 | remaining_tokens -= 1 105 | else: 106 | while len(embd_inp) > input_consumed: 107 | embd.append(embd_inp[input_consumed]) 108 | last_n_tokens_data = last_n_tokens_data[1:] + [embd_inp[input_consumed]] 109 | input_consumed += 1 110 | if len(embd) >= n_batch: 111 | break 112 | if not input_noecho: 113 | for id in embd: 114 | size = 32 115 | buffer = (ctypes.c_char * size)() 116 | n = llama_cpp.llama_token_to_piece( 117 | model, llama_cpp.llama_token(id), buffer, size 118 | ) 119 | assert n <= size 120 | print( 121 | buffer[:n].decode("utf-8"), 122 | end="", 123 | flush=True, 124 | ) 125 | 126 | if len(embd) > 0 and embd[-1] == llama_cpp.llama_token_eos(ctx): 127 | break 128 | 129 | print() 130 | 131 | llama_cpp.llama_print_timings(ctx) 132 | 133 | llama_cpp.llama_free(ctx) 134 | -------------------------------------------------------------------------------- /examples/low_level_api/quantize.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import llama_cpp 4 | 5 | 6 | def main(args): 7 | fname_inp = args.fname_inp.encode("utf-8") 8 | fname_out = args.fname_out.encode("utf-8") 9 | if not os.path.exists(fname_inp): 10 | raise RuntimeError(f"Input file does not exist ({fname_inp})") 11 | if os.path.exists(fname_out): 12 | raise RuntimeError(f"Output file already exists ({fname_out})") 13 | ftype = args.type 14 | args = llama_cpp.llama_model_quantize_default_params() 15 | args.ftype = ftype 16 | return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, args) 17 | if return_code != 0: 18 | raise RuntimeError("Failed to quantize model") 19 | 20 | 21 | if __name__ == "__main__": 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument("fname_inp", type=str, help="Path to input model") 24 | parser.add_argument("fname_out", type=str, help="Path to output model") 25 | parser.add_argument( 26 | "type", 27 | type=int, 28 | help="Type of quantization (2: q4_0, 3: q4_1), see llama_cpp.py for enum", 29 | ) 30 | args = parser.parse_args() 31 | main(args) 32 | -------------------------------------------------------------------------------- /examples/low_level_api/readme/low_level_api_llama_cpp.md: -------------------------------------------------------------------------------- 1 | # Low-Level API for Llama_cpp 2 | 3 | ## Overview 4 | This Python script, low_level_api_llama_cpp.py, demonstrates the implementation of a low-level API for interacting with the llama_cpp library. The script defines an inference that generates embeddings based on a given prompt using .gguf model. 5 | 6 | ### Prerequisites 7 | Before running the script, ensure that you have the following dependencies installed: 8 | 9 | . Python 3.6 or higher 10 | . llama_cpp: A C++ library for working with .gguf model 11 | . NumPy: A fundamental package for scientific computing with Python 12 | . multiprocessing: A Python module for parallel computing 13 | 14 | ### Usage 15 | install depedencies: 16 | ```bash 17 | python -m pip install llama-cpp-python ctypes os multiprocessing 18 | ``` 19 | Run the script: 20 | ```bash 21 | python low_level_api_llama_cpp.py 22 | ``` 23 | 24 | ## Code Structure 25 | The script is organized as follows: 26 | 27 | ### . Initialization: 28 | Load the model from the specified path. 
29 | Create a context for model evaluation. 30 | 31 | ### . Tokenization: 32 | Tokenize the input prompt using the llama_tokenize function. 33 | Prepare the input tokens for model evaluation. 34 | 35 | ### . Inference: 36 | Perform model evaluation to generate responses. 37 | Sample from the model's output using various strategies (top-k, top-p, temperature). 38 | 39 | ### . Output: 40 | Print the generated tokens and the corresponding decoded text. 41 | 42 | ### .Cleanup: 43 | Free resources and print timing information. 44 | 45 | ## Configuration 46 | Customize the inference behavior by adjusting the following variables: 47 | 48 | #### . N_THREADS: Number of CPU threads to use for model evaluation. 49 | #### . MODEL_PATH: Path to the model file. 50 | #### . prompt: Input prompt for the chatbot. 51 | 52 | ## Notes 53 | . Ensure that the llama_cpp library is built and available in the system. Follow the instructions in the llama_cpp repository for building and installing the library. 54 | 55 | . This script is designed to work with the .gguf model and may require modifications for compatibility with other models. 56 | 57 | ## Acknowledgments 58 | This code is based on the llama_cpp library developed by the community. Special thanks to the contributors for their efforts. 59 | 60 | ## License 61 | This project is licensed under the MIT License - see the LICENSE file for details. -------------------------------------------------------------------------------- /examples/low_level_api/util.py: -------------------------------------------------------------------------------- 1 | ANSI_COLOR_RESET = "\x1b[0m" 2 | ANSI_COLOR_YELLOW = "\x1b[33m" 3 | ANSI_BOLD = "\x1b[1m" 4 | ANSI_COLOR_GREEN = "\x1b[32m" 5 | 6 | CONSOLE_COLOR_DEFAULT = ANSI_COLOR_RESET 7 | CONSOLE_COLOR_PROMPT = ANSI_COLOR_YELLOW 8 | CONSOLE_COLOR_USER_INPUT = ANSI_BOLD + ANSI_COLOR_GREEN 9 | 10 | 11 | # Iterative search 12 | # Actively searches and prevents a pattern from being returned 13 | class IterSearch: 14 | def __init__(self, pattern): 15 | self.pattern = list(pattern) 16 | self.buffer = [] 17 | 18 | def __call__(self, char): 19 | self.buffer += [char] 20 | 21 | if self.pattern[: len(self.buffer)] == self.buffer: 22 | if len(self.buffer) >= len(self.pattern): 23 | self.buffer.clear() 24 | return [] 25 | 26 | _tmp = self.buffer[:] 27 | self.buffer.clear() 28 | return _tmp 29 | 30 | 31 | class Circle: 32 | def __init__(self, size, default=0): 33 | self.list = [default] * size 34 | self.maxsize = size 35 | self.size = 0 36 | self.offset = 0 37 | 38 | def append(self, elem): 39 | if self.size < self.maxsize: 40 | self.list[self.size] = elem 41 | self.size += 1 42 | else: 43 | self.list[self.offset] = elem 44 | self.offset = (self.offset + 1) % self.maxsize 45 | 46 | def __getitem__(self, val): 47 | if isinstance(val, int): 48 | if 0 > val or val >= self.size: 49 | raise IndexError("Index out of range") 50 | return ( 51 | self.list[val] 52 | if self.size < self.maxsize 53 | else self.list[(self.offset + val) % self.maxsize] 54 | ) 55 | elif isinstance(val, slice): 56 | start, stop, step = val.start, val.stop, val.step 57 | if step is None: 58 | step = 1 59 | if start is None: 60 | start = 0 61 | if stop is None: 62 | stop = self.size 63 | if start < 0: 64 | start = self.size + start 65 | if stop < 0: 66 | stop = self.size + stop 67 | 68 | indices = range(start, stop, step) 69 | return [ 70 | self.list[(self.offset + i) % self.maxsize] 71 | for i in indices 72 | if i < self.size 73 | ] 74 | else: 75 | raise TypeError("Invalid 
argument type") 76 | 77 | 78 | if __name__ == "__main__": 79 | c = Circle(5) 80 | 81 | c.append(1) 82 | print(c.list) 83 | print(c[:]) 84 | assert c[0] == 1 85 | assert c[:5] == [1] 86 | 87 | for i in range(2, 5 + 1): 88 | c.append(i) 89 | print(c.list) 90 | print(c[:]) 91 | assert c[0] == 1 92 | assert c[:5] == [1, 2, 3, 4, 5] 93 | 94 | for i in range(5 + 1, 9 + 1): 95 | c.append(i) 96 | print(c.list) 97 | print(c[:]) 98 | assert c[0] == 5 99 | assert c[:5] == [5, 6, 7, 8, 9] 100 | # assert c[:-5] == [5,6,7,8,9] 101 | assert c[:10] == [5, 6, 7, 8, 9] 102 | -------------------------------------------------------------------------------- /examples/notebooks/Clients.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | " JSON: {\n", 12 | " \"choices\": [\n", 13 | " {\n", 14 | " \"finish_reason\": \"length\",\n", 15 | " \"index\": 0,\n", 16 | " \"logprobs\": null,\n", 17 | " \"text\": \" over the lazy dog.\"\n", 18 | " }\n", 19 | " ],\n", 20 | " \"created\": 1680960690,\n", 21 | " \"id\": \"cmpl-ad3ba53d-407c-466b-bd5f-97cb8987af83\",\n", 22 | " \"model\": \"models/ggml-alpaca.bin\",\n", 23 | " \"object\": \"text_completion\",\n", 24 | " \"usage\": {\n", 25 | " \"completion_tokens\": 5,\n", 26 | " \"prompt_tokens\": 8,\n", 27 | " \"total_tokens\": 13\n", 28 | " }\n", 29 | "}" 30 | ] 31 | }, 32 | "execution_count": 1, 33 | "metadata": {}, 34 | "output_type": "execute_result" 35 | } 36 | ], 37 | "source": [ 38 | "import openai\n", 39 | "\n", 40 | "openai.api_key = \"sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\" # can be anything\n", 41 | "openai.api_base = \"http://100.64.159.73:8000/v1\"\n", 42 | "\n", 43 | "openai.Completion.create(\n", 44 | " model=\"text-davinci-003\", # currently can be anything\n", 45 | " prompt=\"The quick brown fox jumps\",\n", 46 | " max_tokens=5,\n", 47 | ")" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 2, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "text/plain": [ 58 | "' over the lazy dog'" 59 | ] 60 | }, 61 | "execution_count": 2, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "import os\n", 68 | "\n", 69 | "os.environ[\"OPENAI_API_KEY\"] = (\n", 70 | " \"sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\" # can be anything\n", 71 | ")\n", 72 | "os.environ[\"OPENAI_API_BASE\"] = \"http://100.64.159.73:8000/v1\"\n", 73 | "\n", 74 | "from langchain.llms import OpenAI\n", 75 | "\n", 76 | "llms = OpenAI()\n", 77 | "llms(\n", 78 | " prompt=\"The quick brown fox jumps\",\n", 79 | " stop=[\".\", \"\\n\"],\n", 80 | ")" 81 | ] 82 | } 83 | ], 84 | "metadata": { 85 | "kernelspec": { 86 | "display_name": ".venv", 87 | "language": "python", 88 | "name": "python3" 89 | }, 90 | "language_info": { 91 | "codemirror_mode": { 92 | "name": "ipython", 93 | "version": 3 94 | }, 95 | "file_extension": ".py", 96 | "mimetype": "text/x-python", 97 | "name": "python", 98 | "nbconvert_exporter": "python", 99 | "pygments_lexer": "ipython3", 100 | "version": "3.8.10" 101 | }, 102 | "orig_nbformat": 4 103 | }, 104 | "nbformat": 4, 105 | "nbformat_minor": 2 106 | } 107 | -------------------------------------------------------------------------------- /examples/notebooks/Guidance.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 
5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/html": [ 11 | "
Stop program
Tweak this proverb to apply to model instructions instead.\n",
12 |        "\n",
13 |        "Where there is no guidance, a people falls,\n",
14 |        "but in an abundance of counselors there is safety.\n",
15 |        "- Proverbs 11:14\n",
16 |        "\n",
17 |        "UPDATED\n",
18 |        "Where there is no guidance for assembling a model, people will struggle,\n",
19 |        "but with clear instructions, the process becomes safe and successful.\n",
20 |        "- GPT 2 (updated): Proverbs 11:14
\n", 21 | "" 22 | ] 23 | }, 24 | "metadata": {}, 25 | "output_type": "display_data" 26 | } 27 | ], 28 | "source": [ 29 | "import os\n", 30 | "\n", 31 | "os.environ[\"OPENAI_API_KEY\"] = (\n", 32 | " \"sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\" # can be anything\n", 33 | ")\n", 34 | "os.environ[\"OPENAI_API_BASE\"] = \"http://100.64.159.73:8000/v1\"\n", 35 | "os.environ[\"OPENAI_API_HOST\"] = \"http://100.64.159.73:8000\"\n", 36 | "\n", 37 | "import guidance\n", 38 | "\n", 39 | "# set the default language model used to execute guidance programs\n", 40 | "guidance.llm = guidance.llms.OpenAI(\"text-davinci-003\", caching=False)\n", 41 | "\n", 42 | "# define a guidance program that adapts a proverb\n", 43 | "program = guidance(\n", 44 | " \"\"\"Tweak this proverb to apply to model instructions instead.\n", 45 | "\n", 46 | "{{proverb}}\n", 47 | "- {{book}} {{chapter}}:{{verse}}\n", 48 | "\n", 49 | "UPDATED\n", 50 | "Where there is no guidance{{gen 'rewrite' stop=\"\\\\n-\"}}\n", 51 | "- GPT {{gen 'chapter'}}:{{gen 'verse'}}\"\"\"\n", 52 | ")\n", 53 | "\n", 54 | "# execute the program on a specific proverb\n", 55 | "executed_program = program(\n", 56 | " proverb=\"Where there is no guidance, a people falls,\\nbut in an abundance of counselors there is safety.\",\n", 57 | " book=\"Proverbs\",\n", 58 | " chapter=11,\n", 59 | " verse=14,\n", 60 | ")" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [] 69 | } 70 | ], 71 | "metadata": { 72 | "kernelspec": { 73 | "display_name": ".venv", 74 | "language": "python", 75 | "name": "python3" 76 | }, 77 | "language_info": { 78 | "codemirror_mode": { 79 | "name": "ipython", 80 | "version": 3 81 | }, 82 | "file_extension": ".py", 83 | "mimetype": "text/x-python", 84 | "name": "python", 85 | "nbconvert_exporter": "python", 86 | "pygments_lexer": "ipython3", 87 | "version": "3.8.10" 88 | }, 89 | "orig_nbformat": 4 90 | }, 91 | "nbformat": 4, 92 | "nbformat_minor": 2 93 | } 94 | -------------------------------------------------------------------------------- /examples/notebooks/Multimodal.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "
\n", 8 | " \n", 9 | "
" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 13, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "{'text': 'Llama C++'}\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "from openai import OpenAI\n", 27 | "\n", 28 | "client = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"llama.cpp\")\n", 29 | "response = client.chat.completions.create(\n", 30 | " model=\"gpt-4-vision-preview\",\n", 31 | " messages=[\n", 32 | " {\n", 33 | " \"role\": \"user\",\n", 34 | " \"content\": [\n", 35 | " {\n", 36 | " \"type\": \"image_url\",\n", 37 | " \"image_url\": {\n", 38 | " \"url\": \"https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png\",\n", 39 | " },\n", 40 | " },\n", 41 | " {\n", 42 | " \"type\": \"text\",\n", 43 | " \"text\": \"What does the image say. Format your response as a json object with a single 'text' key.\",\n", 44 | " },\n", 45 | " ],\n", 46 | " }\n", 47 | " ],\n", 48 | " response_format={\n", 49 | " \"type\": \"json_object\",\n", 50 | " \"schema\": {\"type\": \"object\", \"properties\": {\"text\": {\"type\": \"string\"}}},\n", 51 | " },\n", 52 | ")\n", 53 | "import json\n", 54 | "\n", 55 | "print(json.loads(response.choices[0].message.content))" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [] 64 | } 65 | ], 66 | "metadata": { 67 | "kernelspec": { 68 | "display_name": ".venv", 69 | "language": "python", 70 | "name": "python3" 71 | }, 72 | "language_info": { 73 | "codemirror_mode": { 74 | "name": "ipython", 75 | "version": 3 76 | }, 77 | "file_extension": ".py", 78 | "mimetype": "text/x-python", 79 | "name": "python", 80 | "nbconvert_exporter": "python", 81 | "pygments_lexer": "ipython3", 82 | "version": "3.11.5+" 83 | }, 84 | "orig_nbformat": 4 85 | }, 86 | "nbformat": 4, 87 | "nbformat_minor": 2 88 | } 89 | -------------------------------------------------------------------------------- /examples/ray/README.md: -------------------------------------------------------------------------------- 1 | This is an example of doing LLM inference with [Ray](https://docs.ray.io/en/latest/index.html) and [Ray Serve](https://docs.ray.io/en/latest/serve/index.html). 2 | 3 | First, install the requirements: 4 | 5 | ```bash 6 | $ pip install -r requirements.txt 7 | ``` 8 | 9 | Deploy a GGUF model to Ray Serve with the following command: 10 | 11 | ```bash 12 | $ serve run llm:llm_builder model_path='../models/mistral-7b-instruct-v0.2.Q4_K_M.gguf' 13 | ``` 14 | 15 | This will start an API endpoint at `http://localhost:8000/`. 
You can query the model like this: 16 | 17 | ```bash 18 | $ curl -k -d '{"prompt": "tell me a joke", "max_tokens": 128}' -X POST http://localhost:8000 19 | ``` 20 | -------------------------------------------------------------------------------- /examples/ray/llm.py: -------------------------------------------------------------------------------- 1 | from starlette.requests import Request 2 | from typing import Dict 3 | from ray import serve 4 | from ray.serve import Application 5 | from llama_cpp import Llama 6 | 7 | 8 | @serve.deployment 9 | class LlamaDeployment: 10 | def __init__(self, model_path: str): 11 | self._llm = Llama(model_path=model_path) 12 | 13 | async def __call__(self, http_request: Request) -> Dict: 14 | input_json = await http_request.json() 15 | prompt = input_json["prompt"] 16 | max_tokens = input_json.get("max_tokens", 64) 17 | return self._llm(prompt, max_tokens=max_tokens) 18 | 19 | 20 | def llm_builder(args: Dict[str, str]) -> Application: 21 | return LlamaDeployment.bind(args["model_path"]) 22 | -------------------------------------------------------------------------------- /examples/ray/requirements.txt: -------------------------------------------------------------------------------- 1 | ray[serve] 2 | --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu 3 | llama-cpp-python 4 | -------------------------------------------------------------------------------- /llama_cpp/__init__.py: -------------------------------------------------------------------------------- 1 | from .llama_cpp import * 2 | from .llama import * 3 | 4 | __version__ = "0.3.9" 5 | -------------------------------------------------------------------------------- /llama_cpp/_ctypes_extensions.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sys 4 | import os 5 | import ctypes 6 | import functools 7 | import pathlib 8 | 9 | from typing import ( 10 | Any, 11 | Callable, 12 | List, 13 | Union, 14 | Optional, 15 | TYPE_CHECKING, 16 | TypeVar, 17 | Generic, 18 | ) 19 | from typing_extensions import TypeAlias 20 | 21 | 22 | # Load the library 23 | def load_shared_library(lib_base_name: str, base_path: pathlib.Path): 24 | """Platform independent shared library loader""" 25 | # Searching for the library in the current directory under the name "libllama" (default name 26 | # for llamacpp) and "llama" (default name for this repo) 27 | lib_paths: List[pathlib.Path] = [] 28 | # Determine the file extension based on the platform 29 | if sys.platform.startswith("linux") or sys.platform.startswith("freebsd"): 30 | lib_paths += [ 31 | base_path / f"lib{lib_base_name}.so", 32 | ] 33 | elif sys.platform == "darwin": 34 | lib_paths += [ 35 | base_path / f"lib{lib_base_name}.so", 36 | base_path / f"lib{lib_base_name}.dylib", 37 | ] 38 | elif sys.platform == "win32": 39 | lib_paths += [ 40 | base_path / f"{lib_base_name}.dll", 41 | base_path / f"lib{lib_base_name}.dll", 42 | ] 43 | else: 44 | raise RuntimeError("Unsupported platform") 45 | 46 | cdll_args = dict() # type: ignore 47 | 48 | # Add the library directory to the DLL search path on Windows (if needed) 49 | if sys.platform == "win32": 50 | os.add_dll_directory(str(base_path)) 51 | os.environ["PATH"] = str(base_path) + os.pathsep + os.environ["PATH"] 52 | 53 | if sys.platform == "win32" and sys.version_info >= (3, 8): 54 | os.add_dll_directory(str(base_path)) 55 | if "CUDA_PATH" in os.environ: 56 | os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], 
"bin")) 57 | os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib")) 58 | if "HIP_PATH" in os.environ: 59 | os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "bin")) 60 | os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "lib")) 61 | cdll_args["winmode"] = ctypes.RTLD_GLOBAL 62 | 63 | # Try to load the shared library, handling potential errors 64 | for lib_path in lib_paths: 65 | if lib_path.exists(): 66 | try: 67 | return ctypes.CDLL(str(lib_path), **cdll_args) # type: ignore 68 | except Exception as e: 69 | raise RuntimeError(f"Failed to load shared library '{lib_path}': {e}") 70 | 71 | raise FileNotFoundError( 72 | f"Shared library with base name '{lib_base_name}' not found" 73 | ) 74 | 75 | 76 | # ctypes sane type hint helpers 77 | # 78 | # - Generic Pointer and Array types 79 | # - PointerOrRef type with a type hinted byref function 80 | # 81 | # NOTE: Only use these for static type checking not for runtime checks 82 | # no good will come of that 83 | 84 | if TYPE_CHECKING: 85 | CtypesCData = TypeVar("CtypesCData", bound=ctypes._CData) # type: ignore 86 | 87 | CtypesArray: TypeAlias = ctypes.Array[CtypesCData] # type: ignore 88 | 89 | CtypesPointer: TypeAlias = ctypes._Pointer[CtypesCData] # type: ignore 90 | 91 | CtypesVoidPointer: TypeAlias = ctypes.c_void_p 92 | 93 | class CtypesRef(Generic[CtypesCData]): 94 | pass 95 | 96 | CtypesPointerOrRef: TypeAlias = Union[ 97 | CtypesPointer[CtypesCData], CtypesRef[CtypesCData] 98 | ] 99 | 100 | CtypesFuncPointer: TypeAlias = ctypes._FuncPointer # type: ignore 101 | 102 | F = TypeVar("F", bound=Callable[..., Any]) 103 | 104 | 105 | def ctypes_function_for_shared_library(lib: ctypes.CDLL): 106 | """Decorator for defining ctypes functions with type hints""" 107 | 108 | def ctypes_function( 109 | name: str, argtypes: List[Any], restype: Any, enabled: bool = True 110 | ): 111 | def decorator(f: F) -> F: 112 | if enabled: 113 | func = getattr(lib, name) 114 | func.argtypes = argtypes 115 | func.restype = restype 116 | functools.wraps(f)(func) 117 | return func 118 | else: 119 | return f 120 | 121 | return decorator 122 | 123 | return ctypes_function 124 | 125 | 126 | def _byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCData]: 127 | """Type-annotated version of ctypes.byref""" 128 | ... 
129 | 130 | 131 | byref = _byref if TYPE_CHECKING else ctypes.byref 132 | -------------------------------------------------------------------------------- /llama_cpp/_ggml.py: -------------------------------------------------------------------------------- 1 | """Internal module use at your own risk 2 | 3 | This module provides a minimal interface for working with ggml tensors from llama-cpp-python 4 | """ 5 | import os 6 | import pathlib 7 | 8 | import llama_cpp._ctypes_extensions as ctypes_ext 9 | 10 | libggml_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" 11 | libggml = ctypes_ext.load_shared_library("ggml", libggml_base_path) 12 | 13 | -------------------------------------------------------------------------------- /llama_cpp/_logger.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import ctypes 3 | import logging 4 | 5 | import llama_cpp 6 | 7 | # enum ggml_log_level { 8 | # GGML_LOG_LEVEL_NONE = 0, 9 | # GGML_LOG_LEVEL_INFO = 1, 10 | # GGML_LOG_LEVEL_WARN = 2, 11 | # GGML_LOG_LEVEL_ERROR = 3, 12 | # GGML_LOG_LEVEL_DEBUG = 4, 13 | # GGML_LOG_LEVEL_CONT = 5, // continue previous log 14 | # }; 15 | GGML_LOG_LEVEL_TO_LOGGING_LEVEL = { 16 | 0: logging.CRITICAL, 17 | 1: logging.INFO, 18 | 2: logging.WARNING, 19 | 3: logging.ERROR, 20 | 4: logging.DEBUG, 21 | 5: logging.DEBUG, 22 | } 23 | 24 | logger = logging.getLogger("llama-cpp-python") 25 | 26 | _last_log_level = GGML_LOG_LEVEL_TO_LOGGING_LEVEL[0] 27 | 28 | # typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data); 29 | @llama_cpp.llama_log_callback 30 | def llama_log_callback( 31 | level: int, 32 | text: bytes, 33 | user_data: ctypes.c_void_p, 34 | ): 35 | # TODO: Correctly implement continue previous log 36 | global _last_log_level 37 | log_level = GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level] if level != 5 else _last_log_level 38 | if logger.level <= GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level]: 39 | print(text.decode("utf-8"), end="", flush=True, file=sys.stderr) 40 | _last_log_level = log_level 41 | 42 | 43 | llama_cpp.llama_log_set(llama_log_callback, ctypes.c_void_p(0)) 44 | 45 | 46 | def set_verbose(verbose: bool): 47 | logger.setLevel(logging.DEBUG if verbose else logging.ERROR) 48 | -------------------------------------------------------------------------------- /llama_cpp/_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from typing import Any, Dict 5 | 6 | # Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor 7 | outnull_file = open(os.devnull, "w") 8 | errnull_file = open(os.devnull, "w") 9 | 10 | STDOUT_FILENO = 1 11 | STDERR_FILENO = 2 12 | 13 | 14 | class suppress_stdout_stderr(object): 15 | # NOTE: these must be "saved" here to avoid exceptions when using 16 | # this context manager inside of a __del__ method 17 | sys = sys 18 | os = os 19 | 20 | def __init__(self, disable: bool = True): 21 | self.disable = disable 22 | 23 | # Oddly enough this works better than the contextlib version 24 | def __enter__(self): 25 | if self.disable: 26 | return self 27 | 28 | self.old_stdout_fileno_undup = STDOUT_FILENO 29 | self.old_stderr_fileno_undup = STDERR_FILENO 30 | 31 | self.old_stdout_fileno = self.os.dup(self.old_stdout_fileno_undup) 32 | self.old_stderr_fileno = self.os.dup(self.old_stderr_fileno_undup) 33 | 34 | self.old_stdout = self.sys.stdout 35 | self.old_stderr = self.sys.stderr 36 | 37 | 
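        # Point the real stdout/stderr file descriptors at /dev/null, then swap
        # the Python-level sys.stdout/sys.stderr objects for the null files.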
self.os.dup2(outnull_file.fileno(), self.old_stdout_fileno_undup) 38 | self.os.dup2(errnull_file.fileno(), self.old_stderr_fileno_undup) 39 | 40 | self.sys.stdout = outnull_file 41 | self.sys.stderr = errnull_file 42 | return self 43 | 44 | def __exit__(self, *_): 45 | if self.disable: 46 | return 47 | 48 | # Check if sys.stdout and sys.stderr have fileno method 49 | self.sys.stdout = self.old_stdout 50 | self.sys.stderr = self.old_stderr 51 | 52 | self.os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup) 53 | self.os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup) 54 | 55 | self.os.close(self.old_stdout_fileno) 56 | self.os.close(self.old_stderr_fileno) 57 | 58 | 59 | class MetaSingleton(type): 60 | """ 61 | Metaclass for implementing the Singleton pattern. 62 | """ 63 | 64 | _instances: Dict[type, Any] = {} 65 | 66 | def __call__(cls, *args: Any, **kwargs: Any) -> Any: 67 | if cls not in cls._instances: 68 | cls._instances[cls] = super(MetaSingleton, cls).__call__(*args, **kwargs) 69 | return cls._instances[cls] 70 | 71 | 72 | class Singleton(object, metaclass=MetaSingleton): 73 | """ 74 | Base class for implementing the Singleton pattern. 75 | """ 76 | 77 | def __init__(self): 78 | super(Singleton, self).__init__() 79 | -------------------------------------------------------------------------------- /llama_cpp/llama_cache.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from abc import ABC, abstractmethod 3 | from typing import ( 4 | Optional, 5 | Sequence, 6 | Tuple, 7 | ) 8 | from collections import OrderedDict 9 | 10 | import diskcache 11 | 12 | import llama_cpp.llama 13 | 14 | from .llama_types import * 15 | 16 | 17 | class BaseLlamaCache(ABC): 18 | """Base cache class for a llama.cpp model.""" 19 | 20 | def __init__(self, capacity_bytes: int = (2 << 30)): 21 | self.capacity_bytes = capacity_bytes 22 | 23 | @property 24 | @abstractmethod 25 | def cache_size(self) -> int: 26 | raise NotImplementedError 27 | 28 | def _find_longest_prefix_key( 29 | self, 30 | key: Tuple[int, ...], 31 | ) -> Optional[Tuple[int, ...]]: 32 | pass 33 | 34 | @abstractmethod 35 | def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": 36 | raise NotImplementedError 37 | 38 | @abstractmethod 39 | def __contains__(self, key: Sequence[int]) -> bool: 40 | raise NotImplementedError 41 | 42 | @abstractmethod 43 | def __setitem__( 44 | self, key: Sequence[int], value: "llama_cpp.llama.LlamaState" 45 | ) -> None: 46 | raise NotImplementedError 47 | 48 | 49 | class LlamaRAMCache(BaseLlamaCache): 50 | """Cache for a llama.cpp model using RAM.""" 51 | 52 | def __init__(self, capacity_bytes: int = (2 << 30)): 53 | super().__init__(capacity_bytes) 54 | self.capacity_bytes = capacity_bytes 55 | self.cache_state: OrderedDict[ 56 | Tuple[int, ...], "llama_cpp.llama.LlamaState" 57 | ] = OrderedDict() 58 | 59 | @property 60 | def cache_size(self): 61 | return sum([state.llama_state_size for state in self.cache_state.values()]) 62 | 63 | def _find_longest_prefix_key( 64 | self, 65 | key: Tuple[int, ...], 66 | ) -> Optional[Tuple[int, ...]]: 67 | min_len = 0 68 | min_key = None 69 | keys = ( 70 | (k, llama_cpp.llama.Llama.longest_token_prefix(k, key)) 71 | for k in self.cache_state.keys() 72 | ) 73 | for k, prefix_len in keys: 74 | if prefix_len > min_len: 75 | min_len = prefix_len 76 | min_key = k 77 | return min_key 78 | 79 | def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": 80 | key = tuple(key) 81 | _key = 
self._find_longest_prefix_key(key) 82 | if _key is None: 83 | raise KeyError("Key not found") 84 | value = self.cache_state[_key] 85 | self.cache_state.move_to_end(_key) 86 | return value 87 | 88 | def __contains__(self, key: Sequence[int]) -> bool: 89 | return self._find_longest_prefix_key(tuple(key)) is not None 90 | 91 | def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"): 92 | key = tuple(key) 93 | if key in self.cache_state: 94 | del self.cache_state[key] 95 | self.cache_state[key] = value 96 | while self.cache_size > self.capacity_bytes and len(self.cache_state) > 0: 97 | self.cache_state.popitem(last=False) 98 | 99 | 100 | # Alias for backwards compatibility 101 | LlamaCache = LlamaRAMCache 102 | 103 | 104 | class LlamaDiskCache(BaseLlamaCache): 105 | """Cache for a llama.cpp model using disk.""" 106 | 107 | def __init__( 108 | self, cache_dir: str = ".cache/llama_cache", capacity_bytes: int = (2 << 30) 109 | ): 110 | super().__init__(capacity_bytes) 111 | self.cache = diskcache.Cache(cache_dir) 112 | 113 | @property 114 | def cache_size(self): 115 | return int(self.cache.volume()) # type: ignore 116 | 117 | def _find_longest_prefix_key( 118 | self, 119 | key: Tuple[int, ...], 120 | ) -> Optional[Tuple[int, ...]]: 121 | min_len = 0 122 | min_key: Optional[Tuple[int, ...]] = None 123 | for k in self.cache.iterkeys(): # type: ignore 124 | prefix_len = llama_cpp.llama.Llama.longest_token_prefix(k, key) 125 | if prefix_len > min_len: 126 | min_len = prefix_len 127 | min_key = k # type: ignore 128 | return min_key 129 | 130 | def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": 131 | key = tuple(key) 132 | _key = self._find_longest_prefix_key(key) 133 | if _key is None: 134 | raise KeyError("Key not found") 135 | value: "llama_cpp.llama.LlamaState" = self.cache.pop(_key) # type: ignore 136 | # NOTE: This puts an integer as key in cache, which breaks, 137 | # Llama.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens 138 | # self.cache.push(_key, side="front") # type: ignore 139 | return value 140 | 141 | def __contains__(self, key: Sequence[int]) -> bool: 142 | return self._find_longest_prefix_key(tuple(key)) is not None 143 | 144 | def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"): 145 | print("LlamaDiskCache.__setitem__: called", file=sys.stderr) 146 | key = tuple(key) 147 | if key in self.cache: 148 | print("LlamaDiskCache.__setitem__: delete", file=sys.stderr) 149 | del self.cache[key] 150 | self.cache[key] = value 151 | print("LlamaDiskCache.__setitem__: set", file=sys.stderr) 152 | while self.cache_size > self.capacity_bytes and len(self.cache) > 0: 153 | key_to_remove = next(iter(self.cache)) 154 | del self.cache[key_to_remove] 155 | print("LlamaDiskCache.__setitem__: trim", file=sys.stderr) 156 | -------------------------------------------------------------------------------- /llama_cpp/llama_speculative.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | from typing import Any 4 | 5 | import numpy as np 6 | import numpy.typing as npt 7 | 8 | 9 | class LlamaDraftModel(abc.ABC): 10 | @abc.abstractmethod 11 | def __call__( 12 | self, input_ids: npt.NDArray[np.intc], /, **kwargs: Any 13 | ) -> npt.NDArray[np.intc]: 14 | raise NotImplementedError() 15 | 16 | 17 | class LlamaPromptLookupDecoding(LlamaDraftModel): 18 | """Based on https://github.com/apoorvumang/prompt-lookup-decoding""" 19 | 20 | def __init__(self, max_ngram_size: int 
= 2, num_pred_tokens: int = 10): 21 | self.max_ngram_size = max_ngram_size 22 | self.num_pred_tokens = num_pred_tokens 23 | 24 | @staticmethod 25 | def find_candidate_pred_tokens( 26 | input_ids: npt.NDArray[np.intc], 27 | max_ngram_size: int, 28 | num_pred_tokens: int, 29 | ): 30 | input_length = input_ids.shape[0] 31 | 32 | for ngram_size in range(min(max_ngram_size, input_length - 1), 0, -1): 33 | # Create sliding windows of size ngram_size 34 | windows = np.lib.stride_tricks.sliding_window_view(input_ids, (ngram_size,)) 35 | 36 | # Convert ngram to an array for comparison 37 | ngram_array = input_ids[-ngram_size:] 38 | 39 | # Find where the windows match the ngram 40 | matches = np.all(windows == ngram_array, axis=1) 41 | 42 | # Get the indices of matches 43 | match_indices = np.nonzero(matches)[0] 44 | 45 | # Iterate through match indices to find a valid continuation 46 | for idx in match_indices: 47 | start_idx = idx + ngram_size 48 | end_idx = start_idx + num_pred_tokens 49 | end_idx = min(end_idx, input_length) 50 | 51 | if start_idx < end_idx: 52 | return input_ids[start_idx:end_idx] 53 | 54 | # If no match is found, return an empty array 55 | return np.array([], dtype=np.intc) 56 | 57 | def __call__( 58 | self, input_ids: npt.NDArray[np.intc], /, **kwargs: Any 59 | ) -> npt.NDArray[np.intc]: 60 | return self.find_candidate_pred_tokens( 61 | input_ids=input_ids, 62 | max_ngram_size=self.max_ngram_size, 63 | num_pred_tokens=self.num_pred_tokens, 64 | ) 65 | -------------------------------------------------------------------------------- /llama_cpp/llama_tokenizer.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import abc 4 | from typing import ( 5 | List, 6 | Optional, 7 | Any, 8 | ) 9 | 10 | import llama_cpp 11 | from llama_cpp.llama_types import List 12 | 13 | 14 | class BaseLlamaTokenizer(abc.ABC): 15 | @abc.abstractmethod 16 | def tokenize( 17 | self, text: bytes, add_bos: bool = True, special: bool = True 18 | ) -> List[int]: 19 | """Tokenize the text into tokens. 20 | 21 | Args: 22 | text: The utf-8 encoded string to tokenize. 23 | add_bos: Whether to add a beginning of sequence token. 24 | special: Whether to tokenize special tokens. 25 | """ 26 | raise NotImplementedError 27 | 28 | @abc.abstractmethod 29 | def detokenize( 30 | self, 31 | tokens: List[int], 32 | prev_tokens: Optional[List[int]] = None, 33 | special: bool = False, 34 | ) -> bytes: 35 | """Detokenize the tokens into text. 36 | 37 | Args: 38 | tokens: The list of tokens to detokenize. 39 | prev_tokens: The list of previous tokens. Offset mapping will be performed if provided. 40 | special: Whether to detokenize special tokens. 
41 | """ 42 | raise NotImplementedError 43 | 44 | 45 | class LlamaTokenizer(BaseLlamaTokenizer): 46 | def __init__(self, llama: llama_cpp.Llama): 47 | self._model = llama._model # type: ignore 48 | 49 | def tokenize( 50 | self, text: bytes, add_bos: bool = True, special: bool = True 51 | ) -> List[int]: 52 | return self._model.tokenize(text, add_bos=add_bos, special=special) 53 | 54 | def detokenize( 55 | self, 56 | tokens: List[int], 57 | prev_tokens: Optional[List[int]] = None, 58 | special: bool = False, 59 | ) -> bytes: 60 | return self._model.detokenize(tokens, special=special) 61 | 62 | def encode( 63 | self, text: str, add_bos: bool = True, special: bool = True 64 | ) -> List[int]: 65 | return self.tokenize( 66 | text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=special 67 | ) 68 | 69 | def decode(self, tokens: List[int]) -> str: 70 | return self.detokenize(tokens).decode("utf-8", errors="ignore") 71 | 72 | @classmethod 73 | def from_ggml_file(cls, path: str) -> "LlamaTokenizer": 74 | return cls(llama_cpp.Llama(model_path=path, vocab_only=True)) 75 | 76 | 77 | class LlamaHFTokenizer(BaseLlamaTokenizer): 78 | def __init__(self, hf_tokenizer: Any): 79 | self.hf_tokenizer = hf_tokenizer 80 | 81 | def tokenize( 82 | self, text: bytes, add_bos: bool = True, special: bool = True 83 | ) -> List[int]: 84 | return self.hf_tokenizer.encode( 85 | text.decode("utf-8", errors="ignore"), add_special_tokens=special 86 | ) 87 | 88 | def detokenize( 89 | self, 90 | tokens: List[int], 91 | prev_tokens: Optional[List[int]] = None, 92 | special: bool = False, 93 | ) -> bytes: 94 | skip_special_tokens = not special 95 | if prev_tokens is not None: 96 | text = self.hf_tokenizer.decode( 97 | prev_tokens + tokens, skip_special_tokens=skip_special_tokens 98 | ).encode("utf-8", errors="ignore") 99 | prev_text = self.hf_tokenizer.decode( 100 | prev_tokens, skip_special_tokens=skip_special_tokens 101 | ).encode("utf-8", errors="ignore") 102 | return text[len(prev_text) :] 103 | else: 104 | return self.hf_tokenizer.decode( 105 | tokens, skip_special_tokens=skip_special_tokens 106 | ).encode("utf-8", errors="ignore") 107 | 108 | @classmethod 109 | def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenizer": 110 | try: 111 | from transformers import AutoTokenizer 112 | except ImportError: 113 | raise ImportError( 114 | "The `transformers` library is required to use the `HFTokenizer`." 115 | "You can install it with `pip install transformers`." 116 | ) 117 | hf_tokenizer = AutoTokenizer.from_pretrained( 118 | pretrained_model_name_or_path=pretrained_model_name_or_path 119 | ) 120 | return cls(hf_tokenizer) 121 | -------------------------------------------------------------------------------- /llama_cpp/llama_types.py: -------------------------------------------------------------------------------- 1 | """Types and request signatures for OpenAI compatibility 2 | 3 | NOTE: These types may change to match the OpenAI OpenAPI specification. 4 | 5 | Based on the OpenAI OpenAPI specification: 6 | https://github.com/openai/openai-openapi/blob/master/openapi.yaml 7 | 8 | """ 9 | 10 | from typing import Any, List, Optional, Dict, Union 11 | from typing_extensions import TypedDict, NotRequired, Literal 12 | 13 | 14 | # NOTE: Defining this correctly using annotations seems to break pydantic validation. 
15 | # This is a workaround until we can figure out how to do this correctly 16 | # JsonType = Union[None, int, str, bool, List["JsonType"], Dict[str, "JsonType"]] 17 | JsonType = Union[None, int, str, bool, List[Any], Dict[str, Any]] 18 | 19 | 20 | class EmbeddingUsage(TypedDict): 21 | prompt_tokens: int 22 | total_tokens: int 23 | 24 | 25 | class Embedding(TypedDict): 26 | index: int 27 | object: str 28 | embedding: Union[List[float], List[List[float]]] 29 | 30 | 31 | class CreateEmbeddingResponse(TypedDict): 32 | object: Literal["list"] 33 | model: str 34 | data: List[Embedding] 35 | usage: EmbeddingUsage 36 | 37 | 38 | class CompletionLogprobs(TypedDict): 39 | text_offset: List[int] 40 | token_logprobs: List[Optional[float]] 41 | tokens: List[str] 42 | top_logprobs: List[Optional[Dict[str, float]]] 43 | 44 | 45 | class CompletionChoice(TypedDict): 46 | text: str 47 | index: int 48 | logprobs: Optional[CompletionLogprobs] 49 | finish_reason: Optional[Literal["stop", "length"]] 50 | 51 | 52 | class CompletionUsage(TypedDict): 53 | prompt_tokens: int 54 | completion_tokens: int 55 | total_tokens: int 56 | 57 | 58 | class CreateCompletionResponse(TypedDict): 59 | id: str 60 | object: Literal["text_completion"] 61 | created: int 62 | model: str 63 | choices: List[CompletionChoice] 64 | usage: NotRequired[CompletionUsage] 65 | 66 | 67 | class ChatCompletionResponseFunctionCall(TypedDict): 68 | name: str 69 | arguments: str 70 | 71 | 72 | class ChatCompletionResponseMessage(TypedDict): 73 | content: Optional[str] 74 | tool_calls: NotRequired["ChatCompletionMessageToolCalls"] 75 | role: Literal["assistant", "function"] # NOTE: "function" may be incorrect here 76 | function_call: NotRequired[ChatCompletionResponseFunctionCall] # DEPRECATED 77 | 78 | 79 | class ChatCompletionFunction(TypedDict): 80 | name: str 81 | description: NotRequired[str] 82 | parameters: Dict[str, JsonType] # TODO: make this more specific 83 | 84 | 85 | class ChatCompletionTopLogprobToken(TypedDict): 86 | token: str 87 | logprob: float 88 | bytes: Optional[List[int]] 89 | 90 | 91 | class ChatCompletionLogprobToken(ChatCompletionTopLogprobToken): 92 | token: str 93 | logprob: float 94 | bytes: Optional[List[int]] 95 | top_logprobs: List[ChatCompletionTopLogprobToken] 96 | 97 | 98 | class ChatCompletionLogprobs(TypedDict): 99 | content: Optional[List[ChatCompletionLogprobToken]] 100 | refusal: Optional[List[ChatCompletionLogprobToken]] 101 | 102 | 103 | class ChatCompletionResponseChoice(TypedDict): 104 | index: int 105 | message: "ChatCompletionResponseMessage" 106 | logprobs: Optional[ChatCompletionLogprobs] 107 | finish_reason: Optional[str] 108 | 109 | 110 | class CreateChatCompletionResponse(TypedDict): 111 | id: str 112 | object: Literal["chat.completion"] 113 | created: int 114 | model: str 115 | choices: List["ChatCompletionResponseChoice"] 116 | usage: CompletionUsage 117 | 118 | 119 | class ChatCompletionMessageToolCallChunkFunction(TypedDict): 120 | name: Optional[str] 121 | arguments: str 122 | 123 | 124 | class ChatCompletionMessageToolCallChunk(TypedDict): 125 | index: int 126 | id: NotRequired[str] 127 | type: Literal["function"] 128 | function: ChatCompletionMessageToolCallChunkFunction 129 | 130 | 131 | class ChatCompletionStreamResponseDeltaEmpty(TypedDict): 132 | pass 133 | 134 | 135 | class ChatCompletionStreamResponseDeltaFunctionCall(TypedDict): 136 | name: str 137 | arguments: str 138 | 139 | 140 | class ChatCompletionStreamResponseDelta(TypedDict): 141 | content: NotRequired[Optional[str]] 142 | 
function_call: NotRequired[ 143 | Optional[ChatCompletionStreamResponseDeltaFunctionCall] 144 | ] # DEPRECATED 145 | tool_calls: NotRequired[Optional[List[ChatCompletionMessageToolCallChunk]]] 146 | role: NotRequired[Optional[Literal["system", "user", "assistant", "tool"]]] 147 | 148 | 149 | class ChatCompletionStreamResponseChoice(TypedDict): 150 | index: int 151 | delta: Union[ 152 | ChatCompletionStreamResponseDelta, ChatCompletionStreamResponseDeltaEmpty 153 | ] 154 | finish_reason: Optional[Literal["stop", "length", "tool_calls", "function_call"]] 155 | logprobs: NotRequired[Optional[ChatCompletionLogprobs]] 156 | 157 | 158 | class CreateChatCompletionStreamResponse(TypedDict): 159 | id: str 160 | model: str 161 | object: Literal["chat.completion.chunk"] 162 | created: int 163 | choices: List[ChatCompletionStreamResponseChoice] 164 | 165 | 166 | class ChatCompletionFunctions(TypedDict): 167 | name: str 168 | description: NotRequired[str] 169 | parameters: Dict[str, JsonType] # TODO: make this more specific 170 | 171 | 172 | class ChatCompletionFunctionCallOption(TypedDict): 173 | name: str 174 | 175 | 176 | class ChatCompletionRequestResponseFormat(TypedDict): 177 | type: Literal["text", "json_object"] 178 | schema: NotRequired[ 179 | JsonType 180 | ] # https://docs.endpoints.anyscale.com/guides/json_mode/ 181 | 182 | 183 | class ChatCompletionRequestMessageContentPartText(TypedDict): 184 | type: Literal["text"] 185 | text: str 186 | 187 | 188 | class ChatCompletionRequestMessageContentPartImageImageUrl(TypedDict): 189 | url: str 190 | detail: NotRequired[Literal["auto", "low", "high"]] 191 | 192 | 193 | class ChatCompletionRequestMessageContentPartImage(TypedDict): 194 | type: Literal["image_url"] 195 | image_url: Union[str, ChatCompletionRequestMessageContentPartImageImageUrl] 196 | 197 | 198 | ChatCompletionRequestMessageContentPart = Union[ 199 | ChatCompletionRequestMessageContentPartText, 200 | ChatCompletionRequestMessageContentPartImage, 201 | ] 202 | 203 | 204 | class ChatCompletionRequestSystemMessage(TypedDict): 205 | role: Literal["system"] 206 | content: Optional[str] 207 | 208 | 209 | class ChatCompletionRequestUserMessage(TypedDict): 210 | role: Literal["user"] 211 | content: Optional[Union[str, List[ChatCompletionRequestMessageContentPart]]] 212 | 213 | 214 | class ChatCompletionMessageToolCallFunction(TypedDict): 215 | name: str 216 | arguments: str 217 | 218 | 219 | class ChatCompletionMessageToolCall(TypedDict): 220 | id: str 221 | type: Literal["function"] 222 | function: ChatCompletionMessageToolCallFunction 223 | 224 | 225 | ChatCompletionMessageToolCalls = List[ChatCompletionMessageToolCall] 226 | 227 | 228 | class ChatCompletionRequestAssistantMessageFunctionCall(TypedDict): 229 | name: str 230 | arguments: str 231 | 232 | 233 | class ChatCompletionRequestAssistantMessage(TypedDict): 234 | role: Literal["assistant"] 235 | content: NotRequired[str] 236 | tool_calls: NotRequired[ChatCompletionMessageToolCalls] 237 | function_call: NotRequired[ 238 | ChatCompletionRequestAssistantMessageFunctionCall 239 | ] # DEPRECATED 240 | 241 | 242 | class ChatCompletionRequestToolMessage(TypedDict): 243 | role: Literal["tool"] 244 | content: Optional[str] 245 | tool_call_id: str 246 | 247 | 248 | class ChatCompletionRequestFunctionMessage(TypedDict): 249 | role: Literal["function"] 250 | content: Optional[str] 251 | name: str 252 | 253 | 254 | ChatCompletionRequestMessage = Union[ 255 | ChatCompletionRequestSystemMessage, 256 | ChatCompletionRequestUserMessage, 257 | 
ChatCompletionRequestAssistantMessage, 258 | ChatCompletionRequestUserMessage, 259 | ChatCompletionRequestToolMessage, 260 | ChatCompletionRequestFunctionMessage, 261 | ] 262 | 263 | 264 | class ChatCompletionRequestFunctionCallOption(TypedDict): 265 | name: str 266 | 267 | 268 | ChatCompletionRequestFunctionCall = Union[ 269 | Literal["none", "auto"], ChatCompletionRequestFunctionCallOption 270 | ] 271 | 272 | ChatCompletionFunctionParameters = Dict[str, JsonType] # TODO: make this more specific 273 | 274 | 275 | class ChatCompletionToolFunction(TypedDict): 276 | name: str 277 | description: NotRequired[str] 278 | parameters: ChatCompletionFunctionParameters 279 | 280 | 281 | class ChatCompletionTool(TypedDict): 282 | type: Literal["function"] 283 | function: ChatCompletionToolFunction 284 | 285 | 286 | class ChatCompletionNamedToolChoiceFunction(TypedDict): 287 | name: str 288 | 289 | 290 | class ChatCompletionNamedToolChoice(TypedDict): 291 | type: Literal["function"] 292 | function: ChatCompletionNamedToolChoiceFunction 293 | 294 | 295 | ChatCompletionToolChoiceOption = Union[ 296 | Literal["none", "auto", "required"], ChatCompletionNamedToolChoice 297 | ] 298 | 299 | 300 | # NOTE: The following type names are not part of the OpenAI OpenAPI specification 301 | # and will be removed in a future major release. 302 | 303 | EmbeddingData = Embedding 304 | CompletionChunk = CreateCompletionResponse 305 | Completion = CreateCompletionResponse 306 | CreateCompletionStreamResponse = CreateCompletionResponse 307 | ChatCompletionMessage = ChatCompletionResponseMessage 308 | ChatCompletionChoice = ChatCompletionResponseChoice 309 | ChatCompletion = CreateChatCompletionResponse 310 | ChatCompletionChunkDeltaEmpty = ChatCompletionStreamResponseDeltaEmpty 311 | ChatCompletionChunkChoice = ChatCompletionStreamResponseChoice 312 | ChatCompletionChunkDelta = ChatCompletionStreamResponseDelta 313 | ChatCompletionChunk = CreateChatCompletionStreamResponse 314 | ChatCompletionStreamResponse = CreateChatCompletionStreamResponse 315 | ChatCompletionResponseFunction = ChatCompletionFunction 316 | ChatCompletionFunctionCall = ChatCompletionResponseFunctionCall 317 | -------------------------------------------------------------------------------- /llama_cpp/llava_cpp.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from ctypes import ( 5 | c_bool, 6 | c_char_p, 7 | c_int, 8 | c_uint8, 9 | c_float, 10 | c_void_p, 11 | POINTER, 12 | _Pointer, # type: ignore 13 | Structure, 14 | ) 15 | import pathlib 16 | from typing import ( 17 | Union, 18 | NewType, 19 | Optional, 20 | TYPE_CHECKING, 21 | ) 22 | 23 | import llama_cpp.llama_cpp as llama_cpp 24 | 25 | from llama_cpp._ctypes_extensions import ( 26 | load_shared_library, 27 | ctypes_function_for_shared_library, 28 | ) 29 | 30 | if TYPE_CHECKING: 31 | from llama_cpp._ctypes_extensions import ( 32 | CtypesArray, 33 | ) 34 | 35 | 36 | # Specify the base name of the shared library to load 37 | _libllava_base_name = "llava" 38 | _libllava_override_path = os.environ.get("LLAVA_CPP_LIB") 39 | _libllava_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libllava_override_path is None else pathlib.Path() 40 | 41 | # Load the library 42 | _libllava = load_shared_library(_libllava_base_name, _libllava_base_path) 43 | 44 | ctypes_function = ctypes_function_for_shared_library(_libllava) 45 | 46 | 47 | ################################################ 48 
| # llava.h 49 | ################################################ 50 | 51 | # struct clip_ctx; 52 | clip_ctx_p = NewType("clip_ctx_p", int) 53 | clip_ctx_p_ctypes = c_void_p 54 | 55 | 56 | # struct llava_image_embed { 57 | # float * embed; 58 | # int n_image_pos; 59 | # }; 60 | class llava_image_embed(Structure): 61 | _fields_ = [ 62 | ("embed", POINTER(c_float)), 63 | ("n_image_pos", c_int), 64 | ] 65 | 66 | 67 | # /** sanity check for clip <-> llava embed size match */ 68 | # LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip); 69 | @ctypes_function( 70 | "llava_validate_embed_size", 71 | [llama_cpp.llama_context_p_ctypes, clip_ctx_p_ctypes], 72 | c_bool, 73 | ) 74 | def llava_validate_embed_size( 75 | ctx_llama: llama_cpp.llama_context_p, ctx_clip: clip_ctx_p, / 76 | ) -> bool: 77 | ... 78 | 79 | 80 | # /** build an image embed from image file bytes */ 81 | # LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length); 82 | @ctypes_function( 83 | "llava_image_embed_make_with_bytes", 84 | [clip_ctx_p_ctypes, c_int, POINTER(c_uint8), c_int], 85 | POINTER(llava_image_embed), 86 | ) 87 | def llava_image_embed_make_with_bytes( 88 | ctx_clip: clip_ctx_p, 89 | n_threads: Union[c_int, int], 90 | image_bytes: CtypesArray[c_uint8], 91 | image_bytes_length: Union[c_int, int], 92 | /, 93 | ) -> "_Pointer[llava_image_embed]": 94 | ... 95 | 96 | 97 | # /** build an image embed from a path to an image filename */ 98 | # LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path); 99 | @ctypes_function( 100 | "llava_image_embed_make_with_filename", 101 | [clip_ctx_p_ctypes, c_int, c_char_p], 102 | POINTER(llava_image_embed), 103 | ) 104 | def llava_image_embed_make_with_filename( 105 | ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_path: bytes, / 106 | ) -> "_Pointer[llava_image_embed]": 107 | ... 108 | 109 | 110 | # LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); 111 | # /** free an embedding made with llava_image_embed_make_* */ 112 | @ctypes_function("llava_image_embed_free", [POINTER(llava_image_embed)], None) 113 | def llava_image_embed_free(embed: "_Pointer[llava_image_embed]", /): 114 | ... 115 | 116 | 117 | # /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */ 118 | # LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past); 119 | @ctypes_function( 120 | "llava_eval_image_embed", 121 | [ 122 | llama_cpp.llama_context_p_ctypes, 123 | POINTER(llava_image_embed), 124 | c_int, 125 | POINTER(c_int), 126 | ], 127 | c_bool, 128 | ) 129 | def llava_eval_image_embed( 130 | ctx_llama: llama_cpp.llama_context_p, 131 | embed: "_Pointer[llava_image_embed]", 132 | n_batch: Union[c_int, int], 133 | n_past: "_Pointer[c_int]", 134 | /, 135 | ) -> bool: 136 | ... 
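# Illustrative usage sketch (not part of the original module): the typical order
# of calls for these bindings when embedding an image into a llama context.
# `llama_ctx`, the file paths, and the thread/batch counts are placeholders.
#
#   ctx_clip = clip_model_load(b"mmproj.gguf", 1)   # defined below in the clip.h section
#   embed = llava_image_embed_make_with_filename(ctx_clip, 4, b"image.png")
#   n_past = c_int(0)
#   llava_eval_image_embed(llama_ctx, embed, 512, byref(n_past))  # byref from ctypes
#   llava_image_embed_free(embed)
#   clip_free(ctx_clip)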
137 | 138 | 139 | ################################################ 140 | # clip.h 141 | ################################################ 142 | 143 | 144 | # /** load mmproj model */ 145 | # CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity); 146 | @ctypes_function("clip_model_load", [c_char_p, c_int], clip_ctx_p_ctypes) 147 | def clip_model_load( 148 | fname: bytes, verbosity: Union[c_int, int], / 149 | ) -> Optional[clip_ctx_p]: 150 | ... 151 | 152 | 153 | # /** free mmproj model */ 154 | # CLIP_API void clip_free(struct clip_ctx * ctx); 155 | @ctypes_function("clip_free", [clip_ctx_p_ctypes], None) 156 | def clip_free(ctx: clip_ctx_p, /): 157 | ... 158 | 159 | -------------------------------------------------------------------------------- /llama_cpp/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abetlen/llama-cpp-python/b1d23df0bbd327b774083b5cf88e67ca0dd52b92/llama_cpp/py.typed -------------------------------------------------------------------------------- /llama_cpp/server/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abetlen/llama-cpp-python/b1d23df0bbd327b774083b5cf88e67ca0dd52b92/llama_cpp/server/__init__.py -------------------------------------------------------------------------------- /llama_cpp/server/__main__.py: -------------------------------------------------------------------------------- 1 | """Example FastAPI server for llama.cpp. 2 | 3 | To run this example: 4 | 5 | ```bash 6 | pip install fastapi uvicorn sse-starlette pydantic-settings 7 | export MODEL=../models/7B/... 8 | ``` 9 | 10 | Then run: 11 | ``` 12 | uvicorn llama_cpp.server.app:create_app --reload 13 | ``` 14 | 15 | or 16 | 17 | ``` 18 | python3 -m llama_cpp.server 19 | ``` 20 | 21 | Then visit http://localhost:8000/docs to see the interactive API docs. 22 | 23 | """ 24 | 25 | from __future__ import annotations 26 | 27 | import os 28 | import sys 29 | import argparse 30 | 31 | import uvicorn 32 | 33 | from llama_cpp.server.app import create_app 34 | from llama_cpp.server.settings import ( 35 | Settings, 36 | ServerSettings, 37 | ModelSettings, 38 | ConfigFileSettings, 39 | ) 40 | from llama_cpp.server.cli import add_args_from_model, parse_model_from_args 41 | 42 | 43 | def main(): 44 | description = "🦙 Llama.cpp python server. 
Host your own LLMs!🚀" 45 | parser = argparse.ArgumentParser(description=description) 46 | 47 | add_args_from_model(parser, Settings) 48 | parser.add_argument( 49 | "--config_file", 50 | type=str, 51 | help="Path to a config file to load.", 52 | ) 53 | server_settings: ServerSettings | None = None 54 | model_settings: list[ModelSettings] = [] 55 | args = parser.parse_args() 56 | try: 57 | # Load server settings from config_file if provided 58 | config_file = os.environ.get("CONFIG_FILE", args.config_file) 59 | if config_file: 60 | if not os.path.exists(config_file): 61 | raise ValueError(f"Config file {config_file} not found!") 62 | with open(config_file, "rb") as f: 63 | # Check if yaml file 64 | if config_file.endswith(".yaml") or config_file.endswith(".yml"): 65 | import yaml 66 | import json 67 | 68 | config_file_settings = ConfigFileSettings.model_validate_json( 69 | json.dumps(yaml.safe_load(f)) 70 | ) 71 | else: 72 | config_file_settings = ConfigFileSettings.model_validate_json( 73 | f.read() 74 | ) 75 | server_settings = ServerSettings.model_validate(config_file_settings) 76 | model_settings = config_file_settings.models 77 | else: 78 | server_settings = parse_model_from_args(ServerSettings, args) 79 | model_settings = [parse_model_from_args(ModelSettings, args)] 80 | except Exception as e: 81 | print(e, file=sys.stderr) 82 | parser.print_help() 83 | sys.exit(1) 84 | assert server_settings is not None 85 | assert model_settings is not None 86 | app = create_app( 87 | server_settings=server_settings, 88 | model_settings=model_settings, 89 | ) 90 | uvicorn.run( 91 | app, 92 | host=os.getenv("HOST", server_settings.host), 93 | port=int(os.getenv("PORT", server_settings.port)), 94 | ssl_keyfile=server_settings.ssl_keyfile, 95 | ssl_certfile=server_settings.ssl_certfile, 96 | ) 97 | 98 | 99 | if __name__ == "__main__": 100 | main() 101 | -------------------------------------------------------------------------------- /llama_cpp/server/cli.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import argparse 4 | 5 | from typing import List, Literal, Union, Any, Type, TypeVar 6 | 7 | from pydantic import BaseModel 8 | 9 | 10 | def _get_base_type(annotation: Type[Any]) -> Type[Any]: 11 | if getattr(annotation, "__origin__", None) is Literal: 12 | assert hasattr(annotation, "__args__") and len(annotation.__args__) >= 1 # type: ignore 13 | return type(annotation.__args__[0]) # type: ignore 14 | elif getattr(annotation, "__origin__", None) is Union: 15 | assert hasattr(annotation, "__args__") and len(annotation.__args__) >= 1 # type: ignore 16 | non_optional_args: List[Type[Any]] = [ 17 | arg for arg in annotation.__args__ if arg is not type(None) # type: ignore 18 | ] 19 | if non_optional_args: 20 | return _get_base_type(non_optional_args[0]) 21 | elif ( 22 | getattr(annotation, "__origin__", None) is list 23 | or getattr(annotation, "__origin__", None) is List 24 | ): 25 | assert hasattr(annotation, "__args__") and len(annotation.__args__) >= 1 # type: ignore 26 | return _get_base_type(annotation.__args__[0]) # type: ignore 27 | return annotation 28 | 29 | 30 | def _contains_list_type(annotation: Type[Any] | None) -> bool: 31 | origin = getattr(annotation, "__origin__", None) 32 | 33 | if origin is list or origin is List: 34 | return True 35 | elif origin in (Literal, Union): 36 | return any(_contains_list_type(arg) for arg in annotation.__args__) # type: ignore 37 | else: 38 | return False 39 | 40 | 41 | def 
_parse_bool_arg(arg: str | bytes | bool) -> bool: 42 | if isinstance(arg, bytes): 43 | arg = arg.decode("utf-8") 44 | 45 | true_values = {"1", "on", "t", "true", "y", "yes"} 46 | false_values = {"0", "off", "f", "false", "n", "no"} 47 | 48 | arg_str = str(arg).lower().strip() 49 | 50 | if arg_str in true_values: 51 | return True 52 | elif arg_str in false_values: 53 | return False 54 | else: 55 | raise ValueError(f"Invalid boolean argument: {arg}") 56 | 57 | 58 | def add_args_from_model(parser: argparse.ArgumentParser, model: Type[BaseModel]): 59 | """Add arguments from a pydantic model to an argparse parser.""" 60 | 61 | for name, field in model.model_fields.items(): 62 | description = field.description 63 | if field.default and description and not field.is_required(): 64 | description += f" (default: {field.default})" 65 | base_type = ( 66 | _get_base_type(field.annotation) if field.annotation is not None else str 67 | ) 68 | list_type = _contains_list_type(field.annotation) 69 | if base_type is not bool: 70 | parser.add_argument( 71 | f"--{name}", 72 | dest=name, 73 | nargs="*" if list_type else None, 74 | type=base_type, 75 | help=description, 76 | ) 77 | if base_type is bool: 78 | parser.add_argument( 79 | f"--{name}", 80 | dest=name, 81 | type=_parse_bool_arg, 82 | help=f"{description}", 83 | ) 84 | 85 | 86 | T = TypeVar("T", bound=Type[BaseModel]) 87 | 88 | 89 | def parse_model_from_args(model: T, args: argparse.Namespace) -> T: 90 | """Parse a pydantic model from an argparse namespace.""" 91 | return model( 92 | **{ 93 | k: v 94 | for k, v in vars(args).items() 95 | if v is not None and k in model.model_fields 96 | } 97 | ) 98 | -------------------------------------------------------------------------------- /llama_cpp/server/errors.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sys 4 | import traceback 5 | import time 6 | from re import compile, Match, Pattern 7 | from typing import Callable, Coroutine, Optional, Tuple, Union, Dict 8 | from typing_extensions import TypedDict 9 | 10 | 11 | from fastapi import ( 12 | Request, 13 | Response, 14 | HTTPException, 15 | ) 16 | from fastapi.responses import JSONResponse 17 | from fastapi.routing import APIRoute 18 | 19 | from llama_cpp.server.types import ( 20 | CreateCompletionRequest, 21 | CreateEmbeddingRequest, 22 | CreateChatCompletionRequest, 23 | ) 24 | 25 | 26 | class ErrorResponse(TypedDict): 27 | """OpenAI style error response""" 28 | 29 | message: str 30 | type: str 31 | param: Optional[str] 32 | code: Optional[str] 33 | 34 | 35 | class ErrorResponseFormatters: 36 | """Collection of formatters for error responses. 37 | 38 | Args: 39 | request (Union[CreateCompletionRequest, CreateChatCompletionRequest]): 40 | Request body 41 | match (Match[str]): Match object from regex pattern 42 | 43 | Returns: 44 | Tuple[int, ErrorResponse]: Status code and error response 45 | """ 46 | 47 | @staticmethod 48 | def context_length_exceeded( 49 | request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"], 50 | match, # type: Match[str] # type: ignore 51 | ) -> Tuple[int, ErrorResponse]: 52 | """Formatter for context length exceeded error""" 53 | 54 | context_window = int(match.group(2)) 55 | prompt_tokens = int(match.group(1)) 56 | completion_tokens = request.max_tokens 57 | if hasattr(request, "messages"): 58 | # Chat completion 59 | message = ( 60 | "This model's maximum context length is {} tokens. 
" 61 | "However, you requested {} tokens " 62 | "({} in the messages, {} in the completion). " 63 | "Please reduce the length of the messages or completion." 64 | ) 65 | else: 66 | # Text completion 67 | message = ( 68 | "This model's maximum context length is {} tokens, " 69 | "however you requested {} tokens " 70 | "({} in your prompt; {} for the completion). " 71 | "Please reduce your prompt; or completion length." 72 | ) 73 | return 400, ErrorResponse( 74 | message=message.format( 75 | context_window, 76 | (completion_tokens or 0) + prompt_tokens, 77 | prompt_tokens, 78 | completion_tokens, 79 | ), # type: ignore 80 | type="invalid_request_error", 81 | param="messages", 82 | code="context_length_exceeded", 83 | ) 84 | 85 | @staticmethod 86 | def model_not_found( 87 | request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"], 88 | match, # type: Match[str] # type: ignore 89 | ) -> Tuple[int, ErrorResponse]: 90 | """Formatter for model_not_found error""" 91 | 92 | model_path = str(match.group(1)) 93 | message = f"The model `{model_path}` does not exist" 94 | return 400, ErrorResponse( 95 | message=message, 96 | type="invalid_request_error", 97 | param=None, 98 | code="model_not_found", 99 | ) 100 | 101 | 102 | class RouteErrorHandler(APIRoute): 103 | """Custom APIRoute that handles application errors and exceptions""" 104 | 105 | # key: regex pattern for original error message from llama_cpp 106 | # value: formatter function 107 | pattern_and_formatters: Dict[ 108 | "Pattern[str]", 109 | Callable[ 110 | [ 111 | Union["CreateCompletionRequest", "CreateChatCompletionRequest"], 112 | "Match[str]", 113 | ], 114 | Tuple[int, ErrorResponse], 115 | ], 116 | ] = { 117 | compile( 118 | r"Requested tokens \((\d+)\) exceed context window of (\d+)" 119 | ): ErrorResponseFormatters.context_length_exceeded, 120 | compile( 121 | r"Model path does not exist: (.+)" 122 | ): ErrorResponseFormatters.model_not_found, 123 | } 124 | 125 | def error_message_wrapper( 126 | self, 127 | error: Exception, 128 | body: Optional[ 129 | Union[ 130 | "CreateChatCompletionRequest", 131 | "CreateCompletionRequest", 132 | "CreateEmbeddingRequest", 133 | ] 134 | ] = None, 135 | ) -> Tuple[int, ErrorResponse]: 136 | """Wraps error message in OpenAI style error response""" 137 | if body is not None and isinstance( 138 | body, 139 | ( 140 | CreateCompletionRequest, 141 | CreateChatCompletionRequest, 142 | ), 143 | ): 144 | # When text completion or chat completion 145 | for pattern, callback in self.pattern_and_formatters.items(): 146 | match = pattern.search(str(error)) 147 | if match is not None: 148 | return callback(body, match) 149 | 150 | # Only print the trace on unexpected exceptions 151 | print(f"Exception: {str(error)}", file=sys.stderr) 152 | traceback.print_exc(file=sys.stderr) 153 | 154 | # Wrap other errors as internal server error 155 | return 500, ErrorResponse( 156 | message=str(error), 157 | type="internal_server_error", 158 | param=None, 159 | code=None, 160 | ) 161 | 162 | def get_route_handler( 163 | self, 164 | ) -> Callable[[Request], Coroutine[None, None, Response]]: 165 | """Defines custom route handler that catches exceptions and formats 166 | in OpenAI style error response""" 167 | 168 | original_route_handler = super().get_route_handler() 169 | 170 | async def custom_route_handler(request: Request) -> Response: 171 | try: 172 | start_sec = time.perf_counter() 173 | response = await original_route_handler(request) 174 | elapsed_time_ms = int((time.perf_counter() - start_sec) * 1000) 
175 | response.headers["openai-processing-ms"] = f"{elapsed_time_ms}" 176 | return response 177 | except HTTPException as unauthorized: 178 | # api key check failed 179 | raise unauthorized 180 | except Exception as exc: 181 | json_body = await request.json() 182 | try: 183 | if "messages" in json_body: 184 | # Chat completion 185 | body: Optional[ 186 | Union[ 187 | CreateChatCompletionRequest, 188 | CreateCompletionRequest, 189 | CreateEmbeddingRequest, 190 | ] 191 | ] = CreateChatCompletionRequest(**json_body) 192 | elif "prompt" in json_body: 193 | # Text completion 194 | body = CreateCompletionRequest(**json_body) 195 | else: 196 | # Embedding 197 | body = CreateEmbeddingRequest(**json_body) 198 | except Exception: 199 | # Invalid request body 200 | body = None 201 | 202 | # Get proper error message from the exception 203 | ( 204 | status_code, 205 | error_message, 206 | ) = self.error_message_wrapper(error=exc, body=body) 207 | return JSONResponse( 208 | {"error": error_message}, 209 | status_code=status_code, 210 | ) 211 | 212 | return custom_route_handler 213 | -------------------------------------------------------------------------------- /llama_cpp/server/settings.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import multiprocessing 4 | 5 | from typing import Optional, List, Literal, Union, Dict, cast 6 | from typing_extensions import Self 7 | 8 | from pydantic import Field, model_validator 9 | from pydantic_settings import BaseSettings 10 | 11 | import llama_cpp 12 | 13 | # Disable warning for model and model_alias settings 14 | BaseSettings.model_config["protected_namespaces"] = () 15 | 16 | 17 | class ModelSettings(BaseSettings): 18 | """Model settings used to load a Llama model.""" 19 | 20 | model: str = Field( 21 | description="The path to the model to use for generating completions." 22 | ) 23 | model_alias: Optional[str] = Field( 24 | default=None, 25 | description="The alias of the model to use for generating completions.", 26 | ) 27 | # Model Params 28 | n_gpu_layers: int = Field( 29 | default=0, 30 | ge=-1, 31 | description="The number of layers to put on the GPU. The rest will be on the CPU. Set -1 to move all to GPU.", 32 | ) 33 | split_mode: int = Field( 34 | default=llama_cpp.LLAMA_SPLIT_MODE_LAYER, 35 | description="The split mode to use.", 36 | ) 37 | main_gpu: int = Field( 38 | default=0, 39 | ge=0, 40 | description="Main GPU to use.", 41 | ) 42 | tensor_split: Optional[List[float]] = Field( 43 | default=None, 44 | description="Split layers across multiple GPUs in proportion.", 45 | ) 46 | vocab_only: bool = Field( 47 | default=False, description="Whether to only return the vocabulary." 48 | ) 49 | use_mmap: bool = Field( 50 | default=llama_cpp.llama_supports_mmap(), 51 | description="Use mmap.", 52 | ) 53 | use_mlock: bool = Field( 54 | default=llama_cpp.llama_supports_mlock(), 55 | description="Use mlock.", 56 | ) 57 | kv_overrides: Optional[List[str]] = Field( 58 | default=None, 59 | description="List of model kv overrides in the format key=type:value where type is one of (bool, int, float). Valid true values are (true, TRUE, 1), otherwise false.", 60 | ) 61 | rpc_servers: Optional[str] = Field( 62 | default=None, 63 | description="comma seperated list of rpc servers for offloading", 64 | ) 65 | # Context Params 66 | seed: int = Field( 67 | default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random." 
68 | ) 69 | n_ctx: int = Field(default=2048, ge=0, description="The context size.") 70 | n_batch: int = Field( 71 | default=512, ge=1, description="The batch size to use per eval." 72 | ) 73 | n_ubatch: int = Field( 74 | default=512, ge=1, description="The physical batch size used by llama.cpp" 75 | ) 76 | n_threads: int = Field( 77 | default=max(multiprocessing.cpu_count() // 2, 1), 78 | ge=1, 79 | description="The number of threads to use. Use -1 for max cpu threads", 80 | ) 81 | n_threads_batch: int = Field( 82 | default=max(multiprocessing.cpu_count(), 1), 83 | ge=0, 84 | description="The number of threads to use when batch processing. Use -1 for max cpu threads", 85 | ) 86 | rope_scaling_type: int = Field( 87 | default=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED 88 | ) 89 | rope_freq_base: float = Field(default=0.0, description="RoPE base frequency") 90 | rope_freq_scale: float = Field( 91 | default=0.0, description="RoPE frequency scaling factor" 92 | ) 93 | yarn_ext_factor: float = Field(default=-1.0) 94 | yarn_attn_factor: float = Field(default=1.0) 95 | yarn_beta_fast: float = Field(default=32.0) 96 | yarn_beta_slow: float = Field(default=1.0) 97 | yarn_orig_ctx: int = Field(default=0) 98 | mul_mat_q: bool = Field( 99 | default=True, description="if true, use experimental mul_mat_q kernels" 100 | ) 101 | logits_all: bool = Field(default=True, description="Whether to return logits.") 102 | embedding: bool = Field(default=False, description="Whether to use embeddings.") 103 | offload_kqv: bool = Field( 104 | default=True, description="Whether to offload kqv to the GPU." 105 | ) 106 | flash_attn: bool = Field( 107 | default=False, description="Whether to use flash attention." 108 | ) 109 | # Sampling Params 110 | last_n_tokens_size: int = Field( 111 | default=64, 112 | ge=0, 113 | description="Last n tokens to keep for repeat penalty calculation.", 114 | ) 115 | # LoRA Params 116 | lora_base: Optional[str] = Field( 117 | default=None, 118 | description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.", 119 | ) 120 | lora_path: Optional[str] = Field( 121 | default=None, 122 | description="Path to a LoRA file to apply to the model.", 123 | ) 124 | # Backend Params 125 | numa: Union[bool, int] = Field( 126 | default=False, 127 | description="Enable NUMA support.", 128 | ) 129 | # Chat Format Params 130 | chat_format: Optional[str] = Field( 131 | default=None, 132 | description="Chat format to use.", 133 | ) 134 | clip_model_path: Optional[str] = Field( 135 | default=None, 136 | description="Path to a CLIP model to use for multi-modal chat completion.", 137 | ) 138 | # Cache Params 139 | cache: bool = Field( 140 | default=False, 141 | description="Use a cache to reduce processing times for evaluated prompts.", 142 | ) 143 | cache_type: Literal["ram", "disk"] = Field( 144 | default="ram", 145 | description="The type of cache to use. Only used if cache is True.", 146 | ) 147 | cache_size: int = Field( 148 | default=2 << 30, 149 | description="The size of the cache in bytes. Only used if cache is True.", 150 | ) 151 | # Tokenizer Options 152 | hf_tokenizer_config_path: Optional[str] = Field( 153 | default=None, 154 | description="The path to a HuggingFace tokenizer_config.json file.", 155 | ) 156 | hf_pretrained_model_name_or_path: Optional[str] = Field( 157 | default=None, 158 | description="The model name or path to a pretrained HuggingFace tokenizer model. 
Same as you would pass to AutoTokenizer.from_pretrained().", 159 | ) 160 | # Loading from HuggingFace Model Hub 161 | hf_model_repo_id: Optional[str] = Field( 162 | default=None, 163 | description="The model repo id to use for the HuggingFace tokenizer model.", 164 | ) 165 | # Speculative Decoding 166 | draft_model: Optional[str] = Field( 167 | default=None, 168 | description="Method to use for speculative decoding. One of (prompt-lookup-decoding).", 169 | ) 170 | draft_model_num_pred_tokens: int = Field( 171 | default=10, 172 | description="Number of tokens to predict using the draft model.", 173 | ) 174 | # KV Cache Quantization 175 | type_k: Optional[int] = Field( 176 | default=None, 177 | description="Type of the key cache quantization.", 178 | ) 179 | type_v: Optional[int] = Field( 180 | default=None, 181 | description="Type of the value cache quantization.", 182 | ) 183 | # Misc 184 | verbose: bool = Field( 185 | default=True, description="Whether to print debug information." 186 | ) 187 | 188 | @model_validator( 189 | mode="before" 190 | ) # pre=True to ensure this runs before any other validation 191 | def set_dynamic_defaults(self) -> Self: 192 | # If n_threads or n_threads_batch is -1, set it to multiprocessing.cpu_count() 193 | cpu_count = multiprocessing.cpu_count() 194 | values = cast(Dict[str, int], self) 195 | if values.get("n_threads", 0) == -1: 196 | values["n_threads"] = cpu_count 197 | if values.get("n_threads_batch", 0) == -1: 198 | values["n_threads_batch"] = cpu_count 199 | return self 200 | 201 | 202 | class ServerSettings(BaseSettings): 203 | """Server settings used to configure the FastAPI and Uvicorn server.""" 204 | 205 | # Uvicorn Settings 206 | host: str = Field(default="localhost", description="Listen address") 207 | port: int = Field(default=8000, description="Listen port") 208 | ssl_keyfile: Optional[str] = Field( 209 | default=None, description="SSL key file for HTTPS" 210 | ) 211 | ssl_certfile: Optional[str] = Field( 212 | default=None, description="SSL certificate file for HTTPS" 213 | ) 214 | # FastAPI Settings 215 | api_key: Optional[str] = Field( 216 | default=None, 217 | description="API key for authentication. If set all requests need to be authenticated.", 218 | ) 219 | interrupt_requests: bool = Field( 220 | default=True, 221 | description="Whether to interrupt requests when a new request is received.", 222 | ) 223 | disable_ping_events: bool = Field( 224 | default=False, 225 | description="Disable EventSource pings (may be needed for some clients).", 226 | ) 227 | root_path: str = Field( 228 | default="", 229 | description="The root path for the server. 
Useful when running behind a reverse proxy.", 230 | ) 231 | 232 | 233 | class Settings(ServerSettings, ModelSettings): 234 | pass 235 | 236 | 237 | class ConfigFileSettings(ServerSettings): 238 | """Configuration file format settings.""" 239 | 240 | models: List[ModelSettings] = Field(default=[], description="Model configs") 241 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: llama-cpp-python 2 | repo_url: https://github.com/abetlen/llama-cpp-python 3 | 4 | theme: 5 | name: material 6 | palette: 7 | 8 | # Palette toggle for light mode 9 | - scheme: default 10 | primary: indigo 11 | toggle: 12 | icon: material/brightness-7 13 | name: Switch to dark mode 14 | 15 | # Palette toggle for dark mode 16 | - scheme: slate 17 | primary: indigo 18 | toggle: 19 | icon: material/brightness-4 20 | name: Switch to light mode 21 | 22 | plugins: 23 | - search 24 | - mkdocstrings: 25 | handlers: 26 | python: 27 | options: 28 | members_order: source 29 | group_by_category: false 30 | signature_crossrefs: true 31 | show_signature: true 32 | docstring_section_style: list 33 | show_root_heading: true 34 | heading_level: 3 35 | preload_modules: 36 | - typing 37 | - typing_extensions 38 | - ctypes 39 | import: 40 | - https://docs.python.org/3/objects.inv 41 | - https://numpy.org/doc/stable/objects.inv 42 | 43 | watch: 44 | - llama_cpp 45 | - README.md 46 | 47 | nav: 48 | - "Getting Started": "index.md" 49 | - "Installation Guides": 50 | - "macOS (Metal)": "install/macos.md" 51 | - "API Reference": "api-reference.md" 52 | - "OpenAI Compatible Web Server": "server.md" 53 | - "Changelog": "changelog.md" 54 | 55 | markdown_extensions: 56 | - attr_list 57 | - pymdownx.emoji: 58 | emoji_index: !!python/name:materialx.emoji.twemoji 59 | emoji_generator: !!python/name:materialx.emoji.to_svg 60 | - pymdownx.highlight: 61 | anchor_linenums: true 62 | line_spans: __span 63 | pygments_lang_class: true 64 | - pymdownx.inlinehilite 65 | - pymdownx.magiclink: 66 | repo_url_shorthand: true 67 | user: abetlen 68 | repo: llama-cpp-python 69 | - pymdownx.snippets 70 | - pymdownx.superfences 71 | - pymdownx.tabbed: 72 | alternate_style: true 73 | - pymdownx.tilde 74 | - tables 75 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["scikit-build-core[pyproject]>=0.9.2"] 3 | build-backend = "scikit_build_core.build" 4 | 5 | [project] 6 | name = "llama_cpp_python" 7 | dynamic = ["version"] 8 | description = "Python bindings for the llama.cpp library" 9 | readme = "README.md" 10 | license = { text = "MIT" } 11 | authors = [ 12 | { name = "Andrei Betlen", email = "abetlen@gmail.com" }, 13 | ] 14 | dependencies = [ 15 | "typing-extensions>=4.5.0", 16 | "numpy>=1.20.0", 17 | "diskcache>=5.6.1", 18 | "jinja2>=2.11.3", 19 | ] 20 | requires-python = ">=3.8" 21 | classifiers = [ 22 | "Programming Language :: Python :: 3", 23 | "Programming Language :: Python :: 3.8", 24 | "Programming Language :: Python :: 3.9", 25 | "Programming Language :: Python :: 3.10", 26 | "Programming Language :: Python :: 3.11", 27 | "Programming Language :: Python :: 3.12", 28 | ] 29 | 30 | 31 | [project.optional-dependencies] 32 | server = [ 33 | "uvicorn>=0.22.0", 34 | "fastapi>=0.100.0", 35 | "pydantic-settings>=2.0.1", 36 | "sse-starlette>=1.6.1", 37 | 
"starlette-context>=0.3.6,<0.4", 38 | "PyYAML>=5.1", 39 | ] 40 | test = [ 41 | "pytest>=7.4.0", 42 | "httpx>=0.24.1", 43 | "scipy>=1.10", 44 | "fastapi>=0.100.0", 45 | "sse-starlette>=1.6.1", 46 | "starlette-context>=0.3.6,<0.4", 47 | "pydantic-settings>=2.0.1", 48 | "huggingface-hub>=0.23.0" 49 | ] 50 | dev = [ 51 | "black>=23.3.0", 52 | "twine>=4.0.2", 53 | "mkdocs>=1.4.3", 54 | "mkdocstrings[python]>=0.22.0", 55 | "mkdocs-material>=9.1.18", 56 | "pytest>=7.4.0", 57 | "httpx>=0.24.1", 58 | ] 59 | all = [ 60 | "llama_cpp_python[server,test,dev]", 61 | ] 62 | 63 | [tool.scikit-build] 64 | wheel.packages = ["llama_cpp"] 65 | cmake.verbose = true 66 | cmake.minimum-version = "3.21" 67 | minimum-version = "0.5.1" 68 | sdist.include = [".git", "vendor/llama.cpp/*"] 69 | 70 | [tool.scikit-build.metadata.version] 71 | provider = "scikit_build_core.metadata.regex" 72 | input = "llama_cpp/__init__.py" 73 | 74 | [project.urls] 75 | Homepage = "https://github.com/abetlen/llama-cpp-python" 76 | Issues = "https://github.com/abetlen/llama-cpp-python/issues" 77 | Documentation = "https://llama-cpp-python.readthedocs.io/en/latest/" 78 | Changelog = "https://llama-cpp-python.readthedocs.io/en/latest/changelog/" 79 | 80 | [tool.pytest.ini_options] 81 | testpaths = "tests" 82 | -------------------------------------------------------------------------------- /scripts/get-releases.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Function to get all releases 4 | get_all_releases() { 5 | local page=1 6 | local per_page=100 7 | local releases="" 8 | local new_releases 9 | 10 | # Prepare headers 11 | local headers=(-H "Accept: application/vnd.github.v3+json") 12 | if [ -n "$GITHUB_TOKEN" ]; then 13 | headers+=(-H "Authorization: Bearer $GITHUB_TOKEN") 14 | fi 15 | 16 | while true; do 17 | response=$(curl -s "${headers[@]}" \ 18 | "https://api.github.com/repos/abetlen/llama-cpp-python/releases?page=$page&per_page=$per_page") 19 | 20 | # Check if the response is valid JSON 21 | if ! echo "$response" | jq empty > /dev/null 2>&1; then 22 | echo "Error: Invalid response from GitHub API" >&2 23 | echo "Response: $response" >&2 24 | return 1 25 | fi 26 | 27 | new_releases=$(echo "$response" | jq -r '.[].tag_name') 28 | if [ -z "$new_releases" ]; then 29 | break 30 | fi 31 | releases="$releases $new_releases" 32 | ((page++)) 33 | done 34 | 35 | echo $releases 36 | } 37 | 38 | # Get all releases and save to file 39 | releases=$(get_all_releases) 40 | if [ $? -ne 0 ]; then 41 | echo "Failed to fetch releases. Please check your internet connection and try again later." 
>&2 42 | exit 1 43 | fi 44 | 45 | echo "$releases" | tr ' ' '\n' > all_releases.txt 46 | 47 | echo "All releases have been saved to all_releases.txt" 48 | -------------------------------------------------------------------------------- /scripts/releases-to-pep-503.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Enable exit on error 4 | set -e 5 | 6 | # Function for logging 7 | log_error() { 8 | echo "ERROR: $1" >&2 9 | } 10 | 11 | log_info() { 12 | echo "INFO: $1" 13 | } 14 | 15 | # Get output directory or default to index/whl/cpu 16 | output_dir=${1:-"index/whl/cpu"} 17 | 18 | # Get pattern from second arg or default to valid python package version pattern 19 | pattern=${2:-"^[v]?[0-9]+\.[0-9]+\.[0-9]+$"} 20 | 21 | # Get the current directory (where the script is run from) 22 | current_dir="$(pwd)" 23 | 24 | # Check if all_releases.txt exists 25 | if [ ! -f "$current_dir/all_releases.txt" ]; then 26 | log_error "all_releases.txt not found in the current directory." 27 | exit 1 28 | fi 29 | 30 | # Create output directory 31 | mkdir -p "$output_dir" 32 | 33 | # Create an index html file 34 | cat << EOF > "$output_dir/index.html" 35 | 36 | 37 | 38 | 39 | llama-cpp-python 40 |
41 | 42 | 43 | 44 | EOF 45 | 46 | # Create llama-cpp-python directory 47 | mkdir -p "$output_dir/llama-cpp-python" 48 | 49 | # Create an index html file in llama-cpp-python directory 50 | cat << EOF > "$output_dir/llama-cpp-python/index.html" 51 | 52 | 53 | 54 |

<h1>Links for llama-cpp-python</h1>
55 | EOF 56 | 57 | # Filter releases by pattern 58 | releases=$(grep -E "$pattern" "$current_dir/all_releases.txt") 59 | 60 | # Prepare curl headers 61 | headers=('--header' 'Accept: application/vnd.github.v3+json') 62 | if [ -n "$GITHUB_TOKEN" ]; then 63 | headers+=('--header' "authorization: Bearer $GITHUB_TOKEN") 64 | fi 65 | headers+=('--header' 'content-type: application/json') 66 | 67 | # For each release, get all assets 68 | for release in $releases; do 69 | log_info "Processing release: $release" 70 | response=$(curl -s "${headers[@]}" \ 71 | "https://api.github.com/repos/abetlen/llama-cpp-python/releases/tags/$release") 72 | 73 | if [ -z "$response" ]; then 74 | log_error "Empty response from GitHub API for release $release" 75 | continue 76 | fi 77 | 78 | if ! echo "$response" | jq -e '.assets' > /dev/null 2>&1; then 79 | log_error "Invalid or unexpected response from GitHub API for release $release" 80 | log_error "Response: $response" 81 | continue 82 | fi 83 | 84 | # Get release version from release ie v0.1.0-cu121 -> v0.1.0 85 | release_version=$(echo "$release" | grep -oE "^[v]?[0-9]+\.[0-9]+\.[0-9]+") 86 | echo "

<h2>$release_version</h2>
" >> "$output_dir/llama-cpp-python/index.html" 87 | 88 | wheel_urls=$(echo "$response" | jq -r '.assets[] | select(.name | endswith(".whl")) | .browser_download_url') 89 | if [ -z "$wheel_urls" ]; then 90 | log_error "No wheel files found for release $release" 91 | continue 92 | fi 93 | 94 | echo "$wheel_urls" | while read -r asset; do 95 | echo " $asset" >> "$output_dir/llama-cpp-python/index.html" 96 | echo "
" >> "$output_dir/llama-cpp-python/index.html" 97 | done 98 | done 99 | 100 | echo " " >> "$output_dir/llama-cpp-python/index.html" 101 | echo "" >> "$output_dir/llama-cpp-python/index.html" 102 | echo "" >> "$output_dir/llama-cpp-python/index.html" 103 | 104 | log_info "Index generation complete. Output directory: $output_dir" 105 | -------------------------------------------------------------------------------- /tests/test_llama.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import multiprocessing 3 | 4 | import numpy as np 5 | from scipy.special import log_softmax 6 | 7 | from huggingface_hub import hf_hub_download 8 | 9 | import pytest 10 | 11 | import llama_cpp 12 | import llama_cpp._internals as internals 13 | 14 | 15 | MODEL = "./vendor/llama.cpp/models/ggml-vocab-llama-spm.gguf" 16 | 17 | 18 | def test_llama_cpp_version(): 19 | assert llama_cpp.__version__ 20 | 21 | 22 | def test_llama_cpp_tokenization(): 23 | llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, verbose=False) 24 | 25 | assert llama 26 | assert llama._ctx.ctx is not None 27 | 28 | text = b"Hello World" 29 | 30 | tokens = llama.tokenize(text) 31 | assert tokens[0] == llama.token_bos() 32 | assert tokens == [1, 15043, 2787] 33 | detokenized = llama.detokenize(tokens) 34 | assert detokenized == text 35 | 36 | tokens = llama.tokenize(text, add_bos=False) 37 | assert tokens[0] != llama.token_bos() 38 | assert tokens == [15043, 2787] 39 | 40 | detokenized = llama.detokenize(tokens) 41 | assert detokenized != text 42 | 43 | text = b"Hello World" 44 | tokens = llama.tokenize(text) 45 | assert tokens[-1] != llama.token_eos() 46 | assert tokens == [1, 15043, 2787, 829, 29879, 29958] 47 | 48 | tokens = llama.tokenize(text, special=True) 49 | assert tokens[-1] == llama.token_eos() 50 | assert tokens == [1, 15043, 2787, 2] 51 | 52 | text = b"" 53 | tokens = llama.tokenize(text, add_bos=True, special=True) 54 | assert tokens[-1] != llama.token_eos() 55 | assert tokens == [llama.token_bos()] 56 | assert text == llama.detokenize(tokens) 57 | 58 | 59 | @pytest.fixture 60 | def llama_cpp_model_path(): 61 | repo_id = "Qwen/Qwen2-0.5B-Instruct-GGUF" 62 | filename = "qwen2-0_5b-instruct-q8_0.gguf" 63 | model_path = hf_hub_download(repo_id, filename) 64 | return model_path 65 | 66 | 67 | def test_real_model(llama_cpp_model_path): 68 | import os 69 | assert os.path.exists(llama_cpp_model_path) 70 | 71 | params = llama_cpp.llama_model_default_params() 72 | params.use_mmap = llama_cpp.llama_supports_mmap() 73 | params.use_mlock = llama_cpp.llama_supports_mlock() 74 | params.check_tensors = False 75 | 76 | model = internals.LlamaModel(path_model=llama_cpp_model_path, params=params) 77 | 78 | cparams = llama_cpp.llama_context_default_params() 79 | cparams.n_ctx = 16 80 | cparams.n_batch = 16 81 | cparams.n_ubatch = 16 82 | cparams.n_threads = multiprocessing.cpu_count() 83 | cparams.n_threads_batch = multiprocessing.cpu_count() 84 | cparams.logits_all = False 85 | cparams.flash_attn = True 86 | 87 | context = internals.LlamaContext(model=model, params=cparams) 88 | tokens = model.tokenize(b"Hello, world!", add_bos=True, special=True) 89 | 90 | assert tokens == [9707, 11, 1879, 0] 91 | 92 | tokens = model.tokenize(b"The quick brown fox jumps", add_bos=True, special=True) 93 | 94 | batch = internals.LlamaBatch(n_tokens=len(tokens), embd=0, n_seq_max=1) 95 | 96 | seed = 1337 97 | sampler = internals.LlamaSampler() 98 | sampler.add_top_k(50) 99 | sampler.add_top_p(0.9, 1) 100 | 
sampler.add_temp(0.8) 101 | sampler.add_dist(seed) 102 | 103 | result = tokens 104 | n_eval = 0 105 | for _ in range(4): 106 | batch.set_batch(tokens, n_past=n_eval, logits_all=False) 107 | context.decode(batch) 108 | n_eval += len(tokens) 109 | token_id = sampler.sample(context, -1) 110 | tokens = [token_id] 111 | result += tokens 112 | 113 | output = result[5:] 114 | output_text = model.detokenize(output, special=True) 115 | assert output_text == b" over the lazy dog" 116 | 117 | def test_real_llama(llama_cpp_model_path): 118 | model = llama_cpp.Llama( 119 | llama_cpp_model_path, 120 | n_ctx=32, 121 | n_batch=32, 122 | n_ubatch=32, 123 | n_threads=multiprocessing.cpu_count(), 124 | n_threads_batch=multiprocessing.cpu_count(), 125 | logits_all=False, 126 | flash_attn=True, 127 | ) 128 | 129 | output = model.create_completion( 130 | "The quick brown fox jumps", 131 | max_tokens=4, 132 | top_k=50, 133 | top_p=0.9, 134 | temperature=0.8, 135 | seed=1337 136 | ) 137 | assert output["choices"][0]["text"] == " over the lazy dog" 138 | 139 | 140 | output = model.create_completion( 141 | "The capital of france is paris, 'true' or 'false'?:\n", 142 | max_tokens=4, 143 | top_k=50, 144 | top_p=0.9, 145 | temperature=0.8, 146 | seed=1337, 147 | grammar=llama_cpp.LlamaGrammar.from_string(""" 148 | root ::= "true" | "false" 149 | """) 150 | ) 151 | assert output["choices"][0]["text"] == "true" 152 | 153 | suffix = b"rot" 154 | tokens = model.tokenize(suffix, add_bos=True, special=True) 155 | def logit_processor_func(input_ids, logits): 156 | for token in tokens: 157 | logits[token] *= 1000 158 | return logits 159 | 160 | logit_processors = llama_cpp.LogitsProcessorList( 161 | [logit_processor_func] 162 | ) 163 | 164 | output = model.create_completion( 165 | "The capital of france is par", 166 | max_tokens=4, 167 | top_k=50, 168 | top_p=0.9, 169 | temperature=0.8, 170 | seed=1337, 171 | logits_processor=logit_processors 172 | ) 173 | assert output["choices"][0]["text"].lower().startswith("rot") 174 | 175 | model.set_seed(1337) 176 | 177 | state = model.save_state() 178 | 179 | output = model.create_completion( 180 | "Pick a number from 1 to 10?:\n", 181 | max_tokens=4, 182 | top_k=50, 183 | top_p=0.9, 184 | temperature=0.8, 185 | grammar=llama_cpp.LlamaGrammar.from_string(""" 186 | root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10" 187 | """) 188 | ) 189 | number_1 = output["choices"][0]["text"] 190 | 191 | output = model.create_completion( 192 | "Pick a number from 1 to 10?:\n", 193 | max_tokens=4, 194 | top_k=50, 195 | top_p=0.9, 196 | temperature=0.8, 197 | grammar=llama_cpp.LlamaGrammar.from_string(""" 198 | root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10" 199 | """) 200 | ) 201 | number_2 = output["choices"][0]["text"] 202 | 203 | model.load_state(state) 204 | 205 | output = model.create_completion( 206 | "Pick a number from 1 to 10?:\n", 207 | max_tokens=4, 208 | top_k=50, 209 | top_p=0.9, 210 | temperature=0.8, 211 | grammar=llama_cpp.LlamaGrammar.from_string(""" 212 | root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10" 213 | """) 214 | ) 215 | number_3 = output["choices"][0]["text"] 216 | 217 | assert number_1 != number_2 218 | assert number_1 == number_3 219 | -------------------------------------------------------------------------------- /tests/test_llama_chat_format.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import jinja2 4 | 5 | from llama_cpp import ( 6 | 
ChatCompletionRequestUserMessage, 7 | ) 8 | import llama_cpp.llama_types as llama_types 9 | import llama_cpp.llama_chat_format as llama_chat_format 10 | 11 | from llama_cpp.llama_chat_format import hf_tokenizer_config_to_chat_formatter 12 | 13 | def test_mistral_instruct(): 14 | chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" 15 | chat_formatter = jinja2.Template(chat_template) 16 | messages = [ 17 | llama_types.ChatCompletionRequestUserMessage(role="user", content="Instruction"), 18 | llama_types.ChatCompletionRequestAssistantMessage(role="assistant", content="Model answer"), 19 | llama_types.ChatCompletionRequestUserMessage(role="user", content="Follow-up instruction"), 20 | ] 21 | response = llama_chat_format.format_mistral_instruct( 22 | messages=messages, 23 | ) 24 | prompt = ("" if response.added_special else "") + response.prompt 25 | reference = chat_formatter.render( 26 | messages=messages, 27 | bos_token="", 28 | eos_token="", 29 | ) 30 | assert prompt == reference 31 | 32 | 33 | mistral_7b_tokenizer_config = """{ 34 | "add_bos_token": true, 35 | "add_eos_token": false, 36 | "added_tokens_decoder": { 37 | "0": { 38 | "content": "", 39 | "lstrip": false, 40 | "normalized": false, 41 | "rstrip": false, 42 | "single_word": false, 43 | "special": true 44 | }, 45 | "1": { 46 | "content": "", 47 | "lstrip": false, 48 | "normalized": false, 49 | "rstrip": false, 50 | "single_word": false, 51 | "special": true 52 | }, 53 | "2": { 54 | "content": "", 55 | "lstrip": false, 56 | "normalized": false, 57 | "rstrip": false, 58 | "single_word": false, 59 | "special": true 60 | } 61 | }, 62 | "additional_special_tokens": [], 63 | "bos_token": "", 64 | "clean_up_tokenization_spaces": false, 65 | "eos_token": "", 66 | "legacy": true, 67 | "model_max_length": 1000000000000000019884624838656, 68 | "pad_token": null, 69 | "sp_model_kwargs": {}, 70 | "spaces_between_special_tokens": false, 71 | "tokenizer_class": "LlamaTokenizer", 72 | "unk_token": "", 73 | "use_default_system_prompt": false, 74 | "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" 75 | }""" 76 | 77 | 78 | def test_hf_tokenizer_config_str_to_chat_formatter(): 79 | tokenizer_config = json.loads(mistral_7b_tokenizer_config) 80 | chat_formatter = hf_tokenizer_config_to_chat_formatter( 81 | tokenizer_config 82 | ) 83 | chat_formatter_respoonse = chat_formatter( 84 | messages=[ 85 | ChatCompletionRequestUserMessage(role="user", content="Hello, world!"), 86 | ] 87 | ) 88 | 89 | assert chat_formatter_respoonse.prompt == ("[INST] Hello, world! 
[/INST]" "") 90 | -------------------------------------------------------------------------------- /tests/test_llama_grammar.py: -------------------------------------------------------------------------------- 1 | import llama_cpp 2 | import json 3 | 4 | tree = """ 5 | leaf ::= "." 6 | node ::= leaf | "(" node node ")" 7 | root ::= node 8 | """ 9 | 10 | 11 | def test_grammar_from_string(): 12 | grammar = llama_cpp.LlamaGrammar.from_string(tree) 13 | # assert grammar._n_rules == 3 14 | # assert grammar._start_rule_index == 2 15 | # assert grammar.grammar is not None 16 | 17 | 18 | def test_composed_pydantic_grammar(): 19 | """ 20 | from pydantic import BaseModel 21 | 22 | class A(BaseModel): 23 | a: int 24 | 25 | class B(BaseModel): 26 | a: A 27 | b: int 28 | """ 29 | 30 | # This schema corresponds to the grammar in the comment above. 31 | # We don't use the pydantic models directly to avoid the dependency. 32 | schema = { 33 | "$defs": { 34 | "A": { 35 | "properties": {"a": {"title": "A", "type": "integer"}}, 36 | "required": ["a"], 37 | "title": "A", 38 | "type": "object", 39 | } 40 | }, 41 | "properties": { 42 | "a": {"$ref": "#/$defs/A"}, 43 | "b": {"title": "B", "type": "integer"}, 44 | }, 45 | "required": ["a", "b"], 46 | "title": "B", 47 | "type": "object", 48 | } 49 | 50 | grammar = llama_cpp.LlamaGrammar.from_json_schema(json.dumps(schema)) 51 | 52 | # assert grammar.grammar is not None 53 | 54 | 55 | def test_grammar_anyof(): 56 | sch = { 57 | "properties": { 58 | "temperature": { 59 | "description": "The temperature mentioned", 60 | "type": "number", 61 | }, 62 | "unit": { 63 | "anyOf": [ 64 | { 65 | "description": "Unit for temperature", 66 | "enum": ["celsius", "fahrenheit"], 67 | "type": "string", 68 | }, 69 | {"type": "null"}, 70 | ], 71 | }, 72 | }, 73 | "type": "object", 74 | } 75 | 76 | grammar = llama_cpp.LlamaGrammar.from_json_schema(json.dumps(sch)) 77 | 78 | # assert grammar.grammar is not None 79 | -------------------------------------------------------------------------------- /tests/test_llama_speculative.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from llama_cpp.llama_speculative import LlamaPromptLookupDecoding 4 | 5 | def test_find_candidate_pred_tokens(): 6 | find_candidate_pred_tokens = LlamaPromptLookupDecoding.find_candidate_pred_tokens 7 | 8 | # Test Case 1: Matching ngram is found 9 | input_ids1 = np.array([1, 2, 3, 1, 2, 3, 1, 2, 3]) 10 | result1 = find_candidate_pred_tokens(input_ids1, max_ngram_size=3, num_pred_tokens=2) 11 | assert np.array_equal(result1, np.array([1, 2])) 12 | 13 | # Test Case 2: Matching ngram is not found 14 | input_ids2 = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]) 15 | result2 = find_candidate_pred_tokens(input_ids2, max_ngram_size=3, num_pred_tokens=2) 16 | assert np.array_equal(result2, np.array([])) 17 | --------------------------------------------------------------------------------