├── .dockerignore ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── dependabot.yml └── workflows │ ├── build-and-release.yaml │ ├── build-docker.yaml │ ├── build-wheels-cuda.yaml │ ├── build-wheels-metal.yaml │ ├── generate-index-from-release.yaml │ ├── publish-to-test.yaml │ ├── publish.yaml │ ├── test-pypi.yaml │ └── test.yaml ├── .gitignore ├── .gitmodules ├── .readthedocs.yaml ├── CHANGELOG.md ├── CMakeLists.txt ├── LICENSE.md ├── Makefile ├── README.md ├── docker ├── README.md ├── cuda_simple │ └── Dockerfile ├── open_llama │ ├── Dockerfile │ ├── build.sh │ ├── hug_model.py │ ├── start.sh │ └── start_server.sh ├── openblas_simple │ └── Dockerfile └── simple │ ├── Dockerfile │ └── run.sh ├── docs ├── api-reference.md ├── changelog.md ├── icon.svg ├── index.md ├── install │ └── macos.md ├── requirements.txt └── server.md ├── examples ├── batch-processing │ └── server.py ├── gradio_chat │ ├── local.py │ └── server.py ├── hf_pull │ └── main.py ├── high_level_api │ ├── fastapi_server.py │ ├── high_level_api_embedding.py │ ├── high_level_api_inference.py │ ├── high_level_api_infill.py │ ├── high_level_api_streaming.py │ └── langchain_custom_llm.py ├── low_level_api │ ├── Chat.py │ ├── Miku.py │ ├── ReasonAct.py │ ├── common.py │ ├── low_level_api_chat_cpp.py │ ├── low_level_api_llama_cpp.py │ ├── quantize.py │ ├── readme │ │ └── low_level_api_llama_cpp.md │ └── util.py ├── notebooks │ ├── Batching.ipynb │ ├── Clients.ipynb │ ├── Functions.ipynb │ ├── Guidance.ipynb │ ├── Multimodal.ipynb │ ├── OpenHermesFunctionCalling.ipynb │ └── PerformanceTuning.ipynb └── ray │ ├── README.md │ ├── llm.py │ └── requirements.txt ├── llama_cpp ├── __init__.py ├── _ctypes_extensions.py ├── _ggml.py ├── _internals.py ├── _logger.py ├── _utils.py ├── llama.py ├── llama_cache.py ├── llama_chat_format.py ├── llama_cpp.py ├── llama_grammar.py ├── llama_speculative.py ├── llama_tokenizer.py ├── llama_types.py ├── llava_cpp.py ├── py.typed └── server │ ├── __init__.py │ ├── __main__.py │ ├── app.py │ ├── cli.py │ ├── errors.py │ ├── model.py │ ├── settings.py │ └── types.py ├── mkdocs.yml ├── pyproject.toml ├── scripts ├── get-releases.sh └── releases-to-pep-503.sh └── tests ├── test_llama.py ├── test_llama_chat_format.py ├── test_llama_grammar.py └── test_llama_speculative.py /.dockerignore: -------------------------------------------------------------------------------- 1 | _skbuild/ 2 | 3 | .envrc 4 | 5 | models/ 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | cover/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | .pybuilder/ 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | # For a library or package, you might want to ignore these files since the code is 93 | # intended to run in multiple environments; otherwise, check them in: 94 | # .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 100 | # install all needed dependencies. 101 | #Pipfile.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/#use-with-ide 116 | .pdm.toml 117 | 118 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 119 | __pypackages__/ 120 | 121 | # Celery stuff 122 | celerybeat-schedule 123 | celerybeat.pid 124 | 125 | # SageMath parsed files 126 | *.sage.py 127 | 128 | # Environments 129 | .env 130 | .venv 131 | env/ 132 | venv/ 133 | ENV/ 134 | env.bak/ 135 | venv.bak/ 136 | 137 | # Spyder project settings 138 | .spyderproject 139 | .spyproject 140 | 141 | # Rope project settings 142 | .ropeproject 143 | 144 | # mkdocs documentation 145 | /site 146 | 147 | # mypy 148 | .mypy_cache/ 149 | .dmypy.json 150 | dmypy.json 151 | 152 | # Pyre type checker 153 | .pyre/ 154 | 155 | # pytype static type analyzer 156 | .pytype/ 157 | 158 | # Cython debug symbols 159 | cython_debug/ 160 | 161 | # PyCharm 162 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 163 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 164 | # and can be added to the global gitignore or merged into this file. For a more nuclear 165 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
166 | .idea/ 167 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # Prerequisites 11 | 12 | Please answer the following questions for yourself before submitting an issue. 13 | 14 | - [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now. 15 | - [ ] I carefully followed the [README.md](https://github.com/abetlen/llama-cpp-python/blob/main/README.md). 16 | - [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed). 17 | - [ ] I reviewed the [Discussions](https://github.com/abetlen/llama-cpp-python/discussions), and have a new bug or useful enhancement to share. 18 | 19 | # Expected Behavior 20 | 21 | Please provide a detailed written description of what you were trying to do, and what you expected `llama-cpp-python` to do. 22 | 23 | # Current Behavior 24 | 25 | Please provide a detailed written description of what `llama-cpp-python` did, instead. 26 | 27 | # Environment and Context 28 | 29 | Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions. 30 | 31 | * Physical (or virtual) hardware you are using, e.g. for Linux: 32 | 33 | `$ lscpu` 34 | 35 | * Operating System, e.g. for Linux: 36 | 37 | `$ uname -a` 38 | 39 | * SDK version, e.g. for Linux: 40 | 41 | ``` 42 | $ python3 --version 43 | $ make --version 44 | $ g++ --version 45 | ``` 46 | 47 | # Failure Information (for bugs) 48 | 49 | Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template. 50 | 51 | # Steps to Reproduce 52 | 53 | Please provide detailed steps for reproducing the issue. We are not sitting in front of your screen, so the more detail the better. 54 | 55 | 1. step 1 56 | 2. step 2 57 | 3. step 3 58 | 4. etc. 59 | 60 | **Note: Many issues seem to be regarding functional or performance issues / differences with `llama.cpp`. In these cases we need to confirm that you're comparing against the version of `llama.cpp` that was built with your python package, and which parameters you're passing to the context.** 61 | 62 | Try the following: 63 | 64 | 1. `git clone https://github.com/abetlen/llama-cpp-python` 65 | 2. `cd llama-cpp-python` 66 | 3. `rm -rf _skbuild/` # delete any old builds 67 | 4. `python -m pip install .` 68 | 5. `cd ./vendor/llama.cpp` 69 | 6. Follow [llama.cpp's instructions](https://github.com/ggerganov/llama.cpp#build) to `cmake` llama.cpp 70 | 7. Run llama.cpp's `./main` with the same arguments you previously passed to llama-cpp-python and see if you can reproduce the issue. If you can, [log an issue with llama.cpp](https://github.com/ggerganov/llama.cpp/issues) 71 | 72 | # Failure Logs 73 | 74 | Please include any relevant log snippets or files. If it works under one configuration but not under another, please provide logs for both configurations and their corresponding outputs so it is easy to see where behavior changes. 75 | 76 | Also, please try to **avoid using screenshots** if at all possible. 
Instead, copy/paste the console output and use [Github's markdown](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) to cleanly format your logs for easy readability. 77 | 78 | Example environment info: 79 | ``` 80 | llama-cpp-python$ git log | head -1 81 | commit 47b0aa6e957b93dbe2c29d53af16fbae2dd628f2 82 | 83 | llama-cpp-python$ python3 --version 84 | Python 3.10.10 85 | 86 | llama-cpp-python$ pip list | egrep "uvicorn|fastapi|sse-starlette|numpy" 87 | fastapi 0.95.0 88 | numpy 1.24.3 89 | sse-starlette 1.3.3 90 | uvicorn 0.21.1 91 | 92 | llama-cpp-python/vendor/llama.cpp$ git log | head -3 93 | commit 66874d4fbcc7866377246efbcee938e8cc9c7d76 94 | Author: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> 95 | Date: Thu May 25 20:18:01 2023 -0600 96 | ``` 97 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "daily" 12 | - package-ecosystem: "github-actions" 13 | directory: "/" 14 | schedule: 15 | interval: "daily" 16 | - package-ecosystem: "docker" 17 | directory: "/" 18 | schedule: 19 | interval: "daily" 20 | -------------------------------------------------------------------------------- /.github/workflows/build-and-release.yaml: -------------------------------------------------------------------------------- 1 | name: Build Release 2 | 3 | on: workflow_dispatch 4 | 5 | permissions: 6 | contents: write 7 | 8 | jobs: 9 | build_wheels: 10 | name: Build wheels on ${{ matrix.os }} 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: [ubuntu-20.04, windows-2019, macos-13] 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | with: 19 | submodules: "recursive" 20 | 21 | # Used to host cibuildwheel 22 | - uses: actions/setup-python@v5 23 | with: 24 | python-version: "3.9" 25 | 26 | - name: Install dependencies (Linux/MacOS) 27 | if: runner.os != 'Windows' 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install uv 31 | RUST_LOG=trace python -m uv pip install -e .[all] --verbose 32 | shell: bash 33 | 34 | - name: Install dependencies (Windows) 35 | if: runner.os == 'Windows' 36 | env: 37 | RUST_LOG: trace 38 | run: | 39 | python -m pip install --upgrade pip 40 | python -m pip install uv 41 | python -m uv pip install -e .[all] --verbose 42 | shell: cmd 43 | 44 | - name: Build wheels 45 | uses: pypa/cibuildwheel@v2.22.0 46 | env: 47 | # disable repair 48 | CIBW_REPAIR_WHEEL_COMMAND: "" 49 | with: 50 | package-dir: . 
51 | output-dir: wheelhouse 52 | 53 | - uses: actions/upload-artifact@v4 54 | with: 55 | name: wheels-${{ matrix.os }} 56 | path: ./wheelhouse/*.whl 57 | 58 | build_wheels_arm64: 59 | name: Build arm64 wheels 60 | runs-on: ubuntu-latest 61 | steps: 62 | - uses: actions/checkout@v4 63 | with: 64 | submodules: "recursive" 65 | 66 | - name: Set up QEMU 67 | uses: docker/setup-qemu-action@v3 68 | with: 69 | platforms: linux/arm64 70 | 71 | - name: Build wheels 72 | uses: pypa/cibuildwheel@v2.22.0 73 | env: 74 | CIBW_SKIP: "*musllinux* pp*" 75 | CIBW_REPAIR_WHEEL_COMMAND: "" 76 | CIBW_ARCHS: "aarch64" 77 | CIBW_BUILD: "cp38-* cp39-* cp310-* cp311-* cp312-*" 78 | with: 79 | output-dir: wheelhouse 80 | 81 | - name: Upload wheels as artifacts 82 | uses: actions/upload-artifact@v4 83 | with: 84 | name: wheels_arm64 85 | path: ./wheelhouse/*.whl 86 | 87 | build_sdist: 88 | name: Build source distribution 89 | runs-on: ubuntu-latest 90 | 91 | steps: 92 | - uses: actions/checkout@v4 93 | with: 94 | submodules: "recursive" 95 | 96 | - uses: actions/setup-python@v5 97 | with: 98 | python-version: "3.9" 99 | 100 | - name: Install dependencies (Linux/MacOS) 101 | if: runner.os != 'Windows' 102 | run: | 103 | python -m pip install --upgrade pip 104 | python -m pip install uv 105 | RUST_LOG=trace python -m uv pip install -e .[all] --verbose 106 | python -m uv pip install build 107 | shell: bash 108 | 109 | - name: Install dependencies (Windows) 110 | if: runner.os == 'Windows' 111 | env: 112 | RUST_LOG: trace 113 | run: | 114 | python -m pip install --upgrade pip 115 | python -m pip install uv 116 | python -m uv pip install -e .[all] --verbose 117 | python -m uv pip install build 118 | shell: cmd 119 | 120 | - name: Build source distribution 121 | run: | 122 | python -m build --sdist 123 | 124 | - uses: actions/upload-artifact@v4 125 | with: 126 | name: sdist 127 | path: ./dist/*.tar.gz 128 | 129 | release: 130 | name: Release 131 | needs: [build_wheels, build_wheels_arm64, build_sdist] 132 | runs-on: ubuntu-latest 133 | 134 | steps: 135 | - uses: actions/download-artifact@v4 136 | with: 137 | merge-multiple: true 138 | path: dist 139 | 140 | - uses: softprops/action-gh-release@v2 141 | with: 142 | files: dist/* 143 | env: 144 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 145 | -------------------------------------------------------------------------------- /.github/workflows/build-docker.yaml: -------------------------------------------------------------------------------- 1 | name: Build Docker 2 | 3 | on: workflow_dispatch 4 | 5 | permissions: 6 | contents: write 7 | packages: write 8 | 9 | jobs: 10 | docker: 11 | name: Build and push Docker image 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout 15 | uses: actions/checkout@v4 16 | with: 17 | submodules: "recursive" 18 | 19 | - name: Set up QEMU 20 | uses: docker/setup-qemu-action@v3 21 | 22 | - name: Set up Docker Buildx 23 | uses: docker/setup-buildx-action@v3 24 | 25 | - name: Login to GitHub Container Registry 26 | uses: docker/login-action@v3 27 | with: 28 | registry: ghcr.io 29 | username: ${{ github.repository_owner }} 30 | password: ${{ secrets.GITHUB_TOKEN }} 31 | 32 | - name: Build and push 33 | id: docker_build 34 | uses: docker/build-push-action@v6 35 | with: 36 | context: . 
37 | file: "docker/simple/Dockerfile" 38 | push: ${{ startsWith(github.ref, 'refs/tags/') }} 39 | pull: true 40 | platforms: linux/amd64,linux/arm64 41 | tags: | 42 | ghcr.io/abetlen/llama-cpp-python:latest 43 | ghcr.io/abetlen/llama-cpp-python:${{ github.ref_name }} 44 | build-args: | 45 | BUILDKIT_INLINE_CACHE=1 46 | 47 | - name: Publish to GitHub Tag 48 | if: steps.docker_build.outputs.digest && startsWith(github.ref, 'refs/tags/') 49 | run: | 50 | echo "Docker image published for tag: ${{ github.ref_name }}" 51 | -------------------------------------------------------------------------------- /.github/workflows/build-wheels-cuda.yaml: -------------------------------------------------------------------------------- 1 | name: Build Wheels (CUDA) 2 | 3 | on: workflow_dispatch 4 | 5 | permissions: 6 | contents: write 7 | 8 | jobs: 9 | define_matrix: 10 | name: Define Build Matrix 11 | runs-on: ubuntu-latest 12 | outputs: 13 | matrix: ${{ steps.set-matrix.outputs.matrix }} 14 | defaults: 15 | run: 16 | shell: pwsh 17 | 18 | steps: 19 | - name: Define Job Output 20 | id: set-matrix 21 | run: | 22 | $matrix = @{ 23 | 'os' = @('ubuntu-latest', 'windows-2019') 24 | 'pyver' = @("3.9", "3.10", "3.11", "3.12") 25 | 'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1") #, "12.5.1", "12.6.1") 26 | 'releasetag' = @("basic") 27 | } 28 | 29 | $matrixOut = ConvertTo-Json $matrix -Compress 30 | Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT 31 | 32 | build_wheels: 33 | name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} 34 | needs: define_matrix 35 | runs-on: ${{ matrix.os }} 36 | strategy: 37 | matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }} 38 | defaults: 39 | run: 40 | shell: pwsh 41 | env: 42 | CUDAVER: ${{ matrix.cuda }} 43 | AVXVER: ${{ matrix.releasetag }} 44 | 45 | steps: 46 | - name: Add MSBuild to PATH 47 | if: runner.os == 'Windows' 48 | uses: microsoft/setup-msbuild@v2 49 | with: 50 | vs-version: '[16.11,16.12)' 51 | 52 | - uses: actions/checkout@v4 53 | with: 54 | submodules: "recursive" 55 | 56 | - uses: actions/setup-python@v5 57 | with: 58 | python-version: ${{ matrix.pyver }} 59 | cache: 'pip' 60 | 61 | - name: Setup Mamba 62 | uses: conda-incubator/setup-miniconda@v3.1.0 63 | with: 64 | activate-environment: "llamacpp" 65 | python-version: ${{ matrix.pyver }} 66 | miniforge-version: latest 67 | add-pip-as-python-dependency: true 68 | auto-activate-base: false 69 | 70 | - name: VS Integration Cache 71 | id: vs-integration-cache 72 | if: runner.os == 'Windows' 73 | uses: actions/cache@v4 74 | with: 75 | path: ./MSBuildExtensions 76 | key: cuda-${{ matrix.cuda }}-vs-integration 77 | 78 | - name: Get Visual Studio Integration 79 | if: runner.os == 'Windows' && steps.vs-integration-cache.outputs.cache-hit != 'true' 80 | run: | 81 | if ($env:CUDAVER -eq '12.1.1') {$x = '12.1.0'} else {$x = $env:CUDAVER} 82 | $links = (Invoke-RestMethod 'https://raw.githubusercontent.com/Jimver/cuda-toolkit/master/src/links/windows-links.ts').Trim().split().where({$_ -ne ''}) 83 | for ($i=$q=0;$i -lt $links.count -and $q -lt 2;$i++) {if ($links[$i] -eq "'$x',") {$q++}} 84 | Invoke-RestMethod $links[$i].Trim("'") -OutFile 'cudainstaller.zip' 85 | & 'C:\Program Files\7-Zip\7z.exe' e cudainstaller.zip -oMSBuildExtensions -r *\MSBuildExtensions\* > $null 86 | Remove-Item 'cudainstaller.zip' 87 | 88 | - name: Install Visual Studio Integration 89 | if: runner.os == 'Windows' 90 | run: | 91 | $y = (gi 
'.\MSBuildExtensions').fullname + '\*' 92 | (gi 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\MSBuild\Microsoft\VC\*\BuildCustomizations').fullname.foreach({cp $y $_}) 93 | $cupath = 'CUDA_PATH_V' + $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','_') 94 | echo "$cupath=$env:CONDA_PREFIX" >> $env:GITHUB_ENV 95 | 96 | - name: Install Dependencies 97 | env: 98 | MAMBA_DOWNLOAD_FAILFAST: "0" 99 | MAMBA_NO_LOW_SPEED_LIMIT: "1" 100 | run: | 101 | $cudaVersion = $env:CUDAVER 102 | mamba install -y 'cuda' -c nvidia/label/cuda-$cudaVersion 103 | python -m pip install build wheel 104 | 105 | - name: Build Wheel 106 | run: | 107 | $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') 108 | $env:CUDA_PATH = $env:CONDA_PREFIX 109 | $env:CUDA_HOME = $env:CONDA_PREFIX 110 | $env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX 111 | if ($IsLinux) { 112 | $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH 113 | } 114 | $env:VERBOSE = '1' 115 | $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all' 116 | $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON $env:CMAKE_ARGS" 117 | # if ($env:AVXVER -eq 'AVX') { 118 | $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' 119 | # } 120 | # if ($env:AVXVER -eq 'AVX512') { 121 | # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' 122 | # } 123 | # if ($env:AVXVER -eq 'basic') { 124 | # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' 125 | # } 126 | python -m build --wheel 127 | # write the build tag to the output 128 | Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV 129 | 130 | - uses: softprops/action-gh-release@v2 131 | with: 132 | files: dist/* 133 | # Set tag_name to -cu 134 | tag_name: ${{ github.ref_name }}-cu${{ env.CUDA_VERSION }} 135 | env: 136 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 137 | -------------------------------------------------------------------------------- /.github/workflows/build-wheels-metal.yaml: -------------------------------------------------------------------------------- 1 | name: Build Wheels (Metal) 2 | 3 | on: workflow_dispatch 4 | 5 | permissions: 6 | contents: write 7 | 8 | jobs: 9 | build_wheels: 10 | name: Build wheels on ${{ matrix.os }} 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: [macos-13, macos-14, macos-15] 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | with: 19 | submodules: "recursive" 20 | 21 | # Used to host cibuildwheel 22 | - uses: actions/setup-python@v5 23 | with: 24 | python-version: "3.12" 25 | cache: 'pip' 26 | 27 | - name: Install dependencies (Linux/MacOS) 28 | if: runner.os != 'Windows' 29 | run: | 30 | python -m pip install --upgrade pip 31 | python -m pip install uv 32 | RUST_LOG=trace python -m uv pip install -e .[all] --verbose 33 | shell: bash 34 | 35 | - name: Install dependencies (Windows) 36 | if: runner.os == 'Windows' 37 | env: 38 | RUST_LOG: trace 39 | run: | 40 | python -m pip install --upgrade pip 41 | python -m pip install uv 42 | python -m uv pip install -e .[all] --verbose 43 | shell: cmd 44 | 45 | - name: Build wheels 46 | uses: pypa/cibuildwheel@v2.22.0 47 | env: 48 | # disable repair 49 | CIBW_REPAIR_WHEEL_COMMAND: "" 50 | CIBW_ARCHS: "arm64" 51 | CIBW_ENVIRONMENT: CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DGGML_METAL=on" 52 | CIBW_BUILD: "cp39-* cp310-* cp311-* cp312-*" 53 | with: 54 | package-dir: . 
55 | output-dir: wheelhouse2 56 | 57 | - uses: actions/upload-artifact@v4 58 | with: 59 | name: wheels-mac_${{ matrix.os }} 60 | path: ./wheelhouse2/*.whl 61 | 62 | release: 63 | name: Release 64 | needs: [build_wheels] 65 | runs-on: ubuntu-latest 66 | 67 | steps: 68 | - uses: actions/download-artifact@v4 69 | with: 70 | merge-multiple: true 71 | path: dist2 72 | 73 | - uses: softprops/action-gh-release@v2 74 | with: 75 | files: dist2/* 76 | # set release name to -metal 77 | tag_name: ${{ github.ref_name }}-metal 78 | env: 79 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 80 | -------------------------------------------------------------------------------- /.github/workflows/generate-index-from-release.yaml: -------------------------------------------------------------------------------- 1 | name: Wheels Index 2 | 3 | on: 4 | # Trigger on new release 5 | workflow_run: 6 | workflows: ["Release", "Build Wheels (CUDA)", "Build Wheels (Metal)"] 7 | types: 8 | - completed 9 | 10 | # Allows you to run this workflow manually from the Actions tab 11 | workflow_dispatch: 12 | 13 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 14 | permissions: 15 | contents: read 16 | pages: write 17 | id-token: write 18 | 19 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 20 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 21 | concurrency: 22 | group: "pages" 23 | cancel-in-progress: false 24 | 25 | jobs: 26 | # Single deploy job since we're just deploying 27 | deploy: 28 | environment: 29 | name: github-pages 30 | url: ${{ steps.deployment.outputs.page_url }} 31 | runs-on: ubuntu-latest 32 | steps: 33 | - name: Checkout 34 | uses: actions/checkout@v4 35 | - name: Setup Pages 36 | uses: actions/configure-pages@v5 37 | - name: Build 38 | env: 39 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 40 | run: | 41 | ./scripts/get-releases.sh 42 | ./scripts/releases-to-pep-503.sh index/whl/cpu '^[v]?[0-9]+\.[0-9]+\.[0-9]+$' 43 | ./scripts/releases-to-pep-503.sh index/whl/cu121 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu121$' 44 | ./scripts/releases-to-pep-503.sh index/whl/cu122 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu122$' 45 | ./scripts/releases-to-pep-503.sh index/whl/cu123 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu123$' 46 | ./scripts/releases-to-pep-503.sh index/whl/cu124 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$' 47 | # ./scripts/releases-to-pep-503.sh index/whl/cu125 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$' 48 | # ./scripts/releases-to-pep-503.sh index/whl/cu126 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$' 49 | ./scripts/releases-to-pep-503.sh index/whl/metal '^[v]?[0-9]+\.[0-9]+\.[0-9]+-metal$' 50 | - name: Upload artifact 51 | uses: actions/upload-pages-artifact@v3 52 | with: 53 | # Upload entire repository 54 | path: 'index' 55 | - name: Deploy to GitHub Pages 56 | id: deployment 57 | uses: actions/deploy-pages@v4 58 | -------------------------------------------------------------------------------- /.github/workflows/publish-to-test.yaml: -------------------------------------------------------------------------------- 1 | # Based on: https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ 2 | 3 | name: Publish to TestPyPI 4 | 5 | on: 6 | workflow_dispatch: 7 | inputs: 8 | dev_version: 9 | description: 'Dev version N' 10 | required: true 11 | 12 | 13 | jobs: 14 | build-n-publish: 15 | name: Build and publish 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | 
- uses: actions/checkout@v4 20 | with: 21 | submodules: "recursive" 22 | 23 | - name: Set up Python 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: "3.11" 27 | cache: 'pip' 28 | 29 | - name: Append Dev Version to __version__ 30 | run: | 31 | DEV_VERSION=${{ github.event.inputs.dev_version }} 32 | CURRENT_VERSION=$(awk -F= '/__version__ =/ {print $2}' llama_cpp/__init__.py | tr -d ' "') 33 | NEW_VERSION="${CURRENT_VERSION}.dev${DEV_VERSION}" 34 | sed -i 's/__version__ = \".*\"/__version__ = \"'"${NEW_VERSION}"'\"/' llama_cpp/__init__.py 35 | 36 | - name: Install dependencies (Linux/MacOS) 37 | if: runner.os != 'Windows' 38 | run: | 39 | python -m pip install --upgrade pip 40 | python -m pip install uv 41 | RUST_LOG=trace python -m uv pip install -e .[all] --verbose 42 | shell: bash 43 | 44 | - name: Install dependencies (Windows) 45 | if: runner.os == 'Windows' 46 | env: 47 | RUST_LOG: trace 48 | run: | 49 | python -m pip install --upgrade pip 50 | python -m pip install uv 51 | python -m uv pip install -e .[all] --verbose 52 | shell: cmd 53 | 54 | - name: Build source distribution 55 | run: | 56 | python -m build --sdist 57 | 58 | - name: Publish to Test PyPI 59 | uses: pypa/gh-action-pypi-publish@release/v1 60 | with: 61 | password: ${{ secrets.TEST_PYPI_API_TOKEN }} 62 | repository-url: https://test.pypi.org/legacy/ 63 | -------------------------------------------------------------------------------- /.github/workflows/publish.yaml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | 3 | # Based on: https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ 4 | 5 | on: workflow_dispatch 6 | 7 | jobs: 8 | build-n-publish: 9 | name: Build and publish 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | with: 15 | submodules: "recursive" 16 | 17 | - name: Set up Python 18 | uses: actions/setup-python@v5 19 | with: 20 | python-version: "3.9" 21 | 22 | - name: Install dependencies (Linux/MacOS) 23 | if: runner.os != 'Windows' 24 | run: | 25 | python -m pip install --upgrade pip 26 | python -m pip install uv 27 | RUST_LOG=trace python -m uv pip install -e .[all] --verbose 28 | python -m uv pip install build 29 | shell: bash 30 | 31 | - name: Install dependencies (Windows) 32 | if: runner.os == 'Windows' 33 | env: 34 | RUST_LOG: trace 35 | run: | 36 | python -m pip install --upgrade pip 37 | python -m pip install uv 38 | python -m uv pip install -e .[all] --verbose 39 | python -m uv pip install build 40 | shell: cmd 41 | 42 | - name: Build source distribution 43 | run: | 44 | python -m build --sdist 45 | 46 | - name: Publish distribution to PyPI 47 | # TODO: move to tag based releases 48 | # if: startsWith(github.ref, 'refs/tags') 49 | uses: pypa/gh-action-pypi-publish@release/v1 50 | with: 51 | password: ${{ secrets.PYPI_API_TOKEN }} 52 | -------------------------------------------------------------------------------- /.github/workflows/test-pypi.yaml: -------------------------------------------------------------------------------- 1 | name: Tests for PyPI package 2 | 3 | on: workflow_dispatch 4 | 5 | jobs: 6 | build-linux: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: ["3.9", "3.10", "3.11", "3.12"] 12 | 13 | steps: 14 | - name: Set up Python ${{ matrix.python-version }} 15 | uses: actions/setup-python@v5 16 | with: 17 | python-version: ${{ matrix.python-version }} 18 | cache: 'pip' 19 | 20 
| - name: Install dependencies (Linux/MacOS) 21 | if: runner.os != 'Windows' 22 | run: | 23 | python -m pip install --upgrade pip 24 | python -m pip install uv 25 | RUST_LOG=trace python -m uv pip install llama-cpp-python[all] --verbose 26 | shell: bash 27 | 28 | - name: Install dependencies (Windows) 29 | if: runner.os == 'Windows' 30 | env: 31 | RUST_LOG: trace 32 | run: | 33 | python -m pip install --upgrade pip 34 | python -m pip install uv 35 | python -m uv pip install llama-cpp-python[all] --verbose 36 | shell: cmd 37 | 38 | - name: Test with pytest 39 | run: | 40 | python -c "import llama_cpp" 41 | 42 | build-windows: 43 | 44 | runs-on: windows-latest 45 | strategy: 46 | matrix: 47 | python-version: ["3.9", "3.10", "3.11", "3.12"] 48 | 49 | steps: 50 | - name: Set up Python ${{ matrix.python-version }} 51 | uses: actions/setup-python@v5 52 | with: 53 | python-version: ${{ matrix.python-version }} 54 | cache: 'pip' 55 | 56 | - name: Install dependencies (Linux/MacOS) 57 | if: runner.os != 'Windows' 58 | run: | 59 | python -m pip install --upgrade pip 60 | python -m pip install uv 61 | RUST_LOG=trace python -m uv pip install llama-cpp-python[all] --verbose 62 | shell: bash 63 | 64 | - name: Install dependencies (Windows) 65 | if: runner.os == 'Windows' 66 | env: 67 | RUST_LOG: trace 68 | run: | 69 | python -m pip install --upgrade pip 70 | python -m pip install uv 71 | python -m uv pip install llama-cpp-python[all] --verbose 72 | shell: cmd 73 | 74 | - name: Test with pytest 75 | run: | 76 | python -c "import llama_cpp" 77 | 78 | build-macos: 79 | 80 | runs-on: macos-latest 81 | strategy: 82 | matrix: 83 | python-version: ["3.9", "3.10", "3.11", "3.12"] 84 | 85 | steps: 86 | - name: Set up Python ${{ matrix.python-version }} 87 | uses: actions/setup-python@v5 88 | with: 89 | python-version: ${{ matrix.python-version }} 90 | cache: 'pip' 91 | 92 | - name: Install dependencies (Linux/MacOS) 93 | if: runner.os != 'Windows' 94 | run: | 95 | python -m pip install --upgrade pip 96 | python -m pip install uv 97 | RUST_LOG=trace python -m uv pip install llama-cpp-python[all] --verbose 98 | shell: bash 99 | 100 | - name: Install dependencies (Windows) 101 | if: runner.os == 'Windows' 102 | env: 103 | RUST_LOG: trace 104 | run: | 105 | python -m pip install --upgrade pip 106 | python -m pip install uv 107 | python -m uv pip install llama-cpp-python[all] --verbose 108 | shell: cmd 109 | 110 | - name: Test with pytest 111 | run: | 112 | python -c "import llama_cpp" 113 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | on: 3 | pull_request: 4 | branches: 5 | - main 6 | push: 7 | branches: 8 | - main 9 | 10 | env: 11 | REPO_ID: Qwen/Qwen2-0.5B-Instruct-GGUF 12 | MODEL_FILE: qwen2-0_5b-instruct-q8_0.gguf 13 | 14 | jobs: 15 | download-model: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Set up Python 19 | uses: actions/setup-python@v5 20 | with: 21 | python-version: "3.9" 22 | - name: Install huggingface-hub 23 | run: pip install huggingface-hub 24 | - name: Download model 25 | run: huggingface-cli download ${{ env.REPO_ID }} ${{ env.MODEL_FILE }} 26 | - name: Cache model 27 | uses: actions/cache@v4 28 | with: 29 | path: ~/.cache/huggingface/hub 30 | key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }} 31 | 32 | build-linux: 33 | needs: download-model 34 | runs-on: ubuntu-latest 35 | strategy: 36 | matrix: 37 | 
python-version: ["3.9", "3.10", "3.11", "3.12"] 38 | steps: 39 | - uses: actions/checkout@v4 40 | with: 41 | submodules: "recursive" 42 | 43 | - name: Set up Python ${{ matrix.python-version }} 44 | uses: actions/setup-python@v5 45 | with: 46 | python-version: ${{ matrix.python-version }} 47 | cache: 'pip' 48 | - name: Restore model cache 49 | uses: actions/cache@v4 50 | with: 51 | path: ~/.cache/huggingface/hub 52 | key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }} 53 | - name: Install dependencies (Linux/MacOS) 54 | run: | 55 | python -m pip install --upgrade pip 56 | python -m pip install uv 57 | python -m uv pip install -e .[all] --verbose 58 | shell: bash 59 | - name: Test with pytest 60 | run: | 61 | python -m pytest 62 | 63 | build-windows: 64 | needs: download-model 65 | runs-on: windows-latest 66 | strategy: 67 | matrix: 68 | python-version: ["3.9", "3.10", "3.11", "3.12"] 69 | steps: 70 | - uses: actions/checkout@v4 71 | with: 72 | submodules: "recursive" 73 | 74 | - name: Set up Python ${{ matrix.python-version }} 75 | uses: actions/setup-python@v5 76 | with: 77 | python-version: ${{ matrix.python-version }} 78 | cache: 'pip' 79 | 80 | - name: Restore model cache 81 | uses: actions/cache@v4 82 | with: 83 | path: ~/.cache/huggingface/hub 84 | key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }} 85 | 86 | - name: Install dependencies (Windows) 87 | run: | 88 | python -m pip install --upgrade pip 89 | python -m pip install uv 90 | python -m uv pip install -e .[all] --verbose 91 | shell: cmd 92 | 93 | - name: Test with pytest 94 | run: | 95 | python -m pytest 96 | 97 | build-macos: 98 | needs: download-model 99 | runs-on: macos-13 100 | strategy: 101 | matrix: 102 | python-version: ["3.9", "3.10", "3.11", "3.12"] 103 | steps: 104 | - uses: actions/checkout@v4 105 | with: 106 | submodules: "recursive" 107 | 108 | - name: Set up Python ${{ matrix.python-version }} 109 | uses: actions/setup-python@v5 110 | with: 111 | python-version: ${{ matrix.python-version }} 112 | cache: 'pip' 113 | 114 | - name: System Info 115 | run: | 116 | uname -a 117 | sysctl -n machdep.cpu.brand_string 118 | python3 -c "import platform; print(platform.machine(), platform.architecture())" 119 | 120 | - name: Restore model cache 121 | uses: actions/cache@v4 122 | with: 123 | path: ~/.cache/huggingface/hub 124 | key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }} 125 | 126 | - name: Install dependencies (Linux/MacOS) 127 | run: | 128 | python3 -m pip install --upgrade pip 129 | python3 -m pip install uv 130 | python3 -m uv pip install -e .[all] --verbose 131 | CMAKE_ARGS="-DLLAMA_METAL=off" python3 -m uv pip install .[all] --verbose 132 | shell: bash 133 | 134 | - name: Test with pytest 135 | run: | 136 | python3 -m pytest 137 | 138 | build-macos-metal: 139 | needs: download-model 140 | runs-on: macos-13 141 | steps: 142 | - uses: actions/checkout@v4 143 | with: 144 | submodules: "recursive" 145 | 146 | - name: Set up Python 3.9 147 | uses: actions/setup-python@v5 148 | with: 149 | python-version: "3.9" 150 | 151 | - name: System Info 152 | run: | 153 | uname -a 154 | sysctl -n machdep.cpu.brand_string 155 | python3 -c "import platform; print(platform.machine(), platform.architecture())" 156 | 157 | - name: Restore model cache 158 | uses: actions/cache@v4 159 | with: 160 | path: ~/.cache/huggingface/hub 161 | key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }} 162 | 163 | - name: Install dependencies 164 | run: | 165 | python3 -m pip install 
--upgrade pip 166 | CMAKE_ARGS="-DLLAMA_METAL=on" python3 -m pip install .[all] --verbose 167 | shell: bash 168 | 169 | - name: Test with pytest 170 | run: | 171 | python3 -m pytest 172 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.local 2 | 3 | .python-version 4 | 5 | .vscode/ 6 | 7 | _skbuild/ 8 | 9 | .envrc 10 | .direnv 11 | 12 | models/ 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # C extensions 20 | llama_cpp/*.so 21 | llama_cpp/*.dylib 22 | llama_cpp/*.metal 23 | llama_cpp/*.dll 24 | llama_cpp/*.lib 25 | 26 | # Distribution / packaging 27 | .Python 28 | build/ 29 | develop-eggs/ 30 | dist/ 31 | downloads/ 32 | eggs/ 33 | .eggs/ 34 | lib/ 35 | lib64/ 36 | parts/ 37 | sdist/ 38 | var/ 39 | wheels/ 40 | share/python-wheels/ 41 | *.egg-info/ 42 | .installed.cfg 43 | *.egg 44 | MANIFEST 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .nox/ 60 | .coverage 61 | .coverage.* 62 | .cache 63 | nosetests.xml 64 | coverage.xml 65 | *.cover 66 | *.py,cover 67 | .hypothesis/ 68 | .pytest_cache/ 69 | cover/ 70 | 71 | # Translations 72 | *.mo 73 | *.pot 74 | 75 | # Django stuff: 76 | *.log 77 | local_settings.py 78 | db.sqlite3 79 | db.sqlite3-journal 80 | 81 | # Flask stuff: 82 | instance/ 83 | .webassets-cache 84 | 85 | # Scrapy stuff: 86 | .scrapy 87 | 88 | # Sphinx documentation 89 | docs/_build/ 90 | 91 | # PyBuilder 92 | .pybuilder/ 93 | target/ 94 | 95 | # Jupyter Notebook 96 | .ipynb_checkpoints 97 | 98 | # IPython 99 | profile_default/ 100 | ipython_config.py 101 | 102 | # pyenv 103 | # For a library or package, you might want to ignore these files since the code is 104 | # intended to run in multiple environments; otherwise, check them in: 105 | # .python-version 106 | 107 | # pipenv 108 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 109 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 110 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 111 | # install all needed dependencies. 112 | #Pipfile.lock 113 | 114 | # poetry 115 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 116 | # This is especially recommended for binary packages to ensure reproducibility, and is more 117 | # commonly ignored for libraries. 118 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 119 | #poetry.lock 120 | 121 | # pdm 122 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 123 | #pdm.lock 124 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 125 | # in version control. 126 | # https://pdm.fming.dev/#use-with-ide 127 | .pdm.toml 128 | 129 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 130 | __pypackages__/ 131 | 132 | # Celery stuff 133 | celerybeat-schedule 134 | celerybeat.pid 135 | 136 | # SageMath parsed files 137 | *.sage.py 138 | 139 | # Environments 140 | .env 141 | .venv 142 | env/ 143 | venv/ 144 | ENV/ 145 | env.bak/ 146 | venv.bak/ 147 | 148 | # Spyder project settings 149 | .spyderproject 150 | .spyproject 151 | 152 | # Rope project settings 153 | .ropeproject 154 | 155 | # mkdocs documentation 156 | /site 157 | 158 | # mypy 159 | .mypy_cache/ 160 | .dmypy.json 161 | dmypy.json 162 | 163 | # Pyre type checker 164 | .pyre/ 165 | 166 | # pytype static type analyzer 167 | .pytype/ 168 | 169 | # Cython debug symbols 170 | cython_debug/ 171 | 172 | # PyCharm 173 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 174 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 175 | # and can be added to the global gitignore or merged into this file. For a more nuclear 176 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 177 | .idea/ 178 | 179 | # downloaded model .bin files 180 | docker/open_llama/*.bin 181 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "vendor/llama.cpp"] 2 | path = vendor/llama.cpp 3 | url = https://github.com/ggerganov/llama.cpp.git 4 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file for MkDocs projects 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | # Set the version of Python and other tools you might need 8 | build: 9 | os: ubuntu-22.04 10 | tools: 11 | python: "3.11" 12 | 13 | mkdocs: 14 | configuration: mkdocs.yml 15 | 16 | python: 17 | install: 18 | - method: pip 19 | path: . 
20 | - requirements: docs/requirements.txt 21 | 22 | submodules: 23 | include: all 24 | recursive: true -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.21) 2 | 3 | project(llama_cpp) 4 | 5 | option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python package" ON) 6 | option(LLAVA_BUILD "Build llava shared library and install alongside python package" ON) 7 | 8 | function(llama_cpp_python_install_target target) 9 | if(NOT TARGET ${target}) 10 | return() 11 | endif() 12 | 13 | install( 14 | TARGETS ${target} 15 | LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib 16 | RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib 17 | ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib 18 | FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib 19 | RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib 20 | ) 21 | install( 22 | TARGETS ${target} 23 | LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib 24 | RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib 25 | ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib 26 | FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib 27 | RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib 28 | ) 29 | set_target_properties(${target} PROPERTIES 30 | INSTALL_RPATH "$ORIGIN" 31 | BUILD_WITH_INSTALL_RPATH TRUE 32 | ) 33 | if(UNIX) 34 | if(APPLE) 35 | set_target_properties(${target} PROPERTIES 36 | INSTALL_RPATH "@loader_path" 37 | BUILD_WITH_INSTALL_RPATH TRUE 38 | ) 39 | else() 40 | set_target_properties(${target} PROPERTIES 41 | INSTALL_RPATH "$ORIGIN" 42 | BUILD_WITH_INSTALL_RPATH TRUE 43 | ) 44 | endif() 45 | endif() 46 | endfunction() 47 | 48 | if (LLAMA_BUILD) 49 | set(BUILD_SHARED_LIBS "On") 50 | 51 | set(CMAKE_SKIP_BUILD_RPATH FALSE) 52 | 53 | # When building, don't use the install RPATH already 54 | # (but later on when installing) 55 | set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE) 56 | 57 | # Add the automatically determined parts of the RPATH 58 | # which point to directories outside the build tree to the install RPATH 59 | set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) 60 | set(CMAKE_SKIP_RPATH FALSE) 61 | 62 | # Enable building of the common library 63 | set(LLAMA_BUILD_COMMON ON CACHE BOOL "Build llama.cpp common library" FORCE) 64 | 65 | # Disable building curl support 66 | set(LLAMA_CURL OFF CACHE BOOL "llama.cpp: enable curl" FORCE) 67 | 68 | # Architecture detection and settings for Apple platforms 69 | if (APPLE) 70 | # Get the target architecture 71 | execute_process( 72 | COMMAND uname -m 73 | OUTPUT_VARIABLE HOST_ARCH 74 | OUTPUT_STRIP_TRAILING_WHITESPACE 75 | ) 76 | 77 | # If CMAKE_OSX_ARCHITECTURES is not set, use the host architecture 78 | if(NOT CMAKE_OSX_ARCHITECTURES) 79 | set(CMAKE_OSX_ARCHITECTURES ${HOST_ARCH} CACHE STRING "Build architecture for macOS" FORCE) 80 | endif() 81 | 82 | message(STATUS "Host architecture: ${HOST_ARCH}") 83 | message(STATUS "Target architecture: ${CMAKE_OSX_ARCHITECTURES}") 84 | 85 | # Configure based on target architecture 86 | if(CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64") 87 | # Intel Mac settings 88 | set(GGML_AVX "OFF" CACHE BOOL "ggml: enable AVX" FORCE) 89 | set(GGML_AVX2 "OFF" CACHE BOOL "ggml: enable AVX2" FORCE) 90 | set(GGML_FMA "OFF" CACHE BOOL "ggml: enable FMA" FORCE) 91 | set(GGML_F16C "OFF" CACHE BOOL "ggml: enable F16C" FORCE) 92 | endif() 93 | 
94 | # Metal settings (enable for both architectures) 95 | set(GGML_METAL "ON" CACHE BOOL "ggml: enable Metal" FORCE) 96 | set(GGML_METAL_EMBED_LIBRARY "ON" CACHE BOOL "ggml: embed metal library" FORCE) 97 | endif() 98 | 99 | add_subdirectory(vendor/llama.cpp) 100 | llama_cpp_python_install_target(llama) 101 | llama_cpp_python_install_target(ggml) 102 | 103 | llama_cpp_python_install_target(ggml-base) 104 | 105 | llama_cpp_python_install_target(ggml-amx) 106 | llama_cpp_python_install_target(ggml-blas) 107 | llama_cpp_python_install_target(ggml-can) 108 | llama_cpp_python_install_target(ggml-cpu) 109 | llama_cpp_python_install_target(ggml-cuda) 110 | llama_cpp_python_install_target(ggml-hip) 111 | llama_cpp_python_install_target(ggml-kompute) 112 | llama_cpp_python_install_target(ggml-metal) 113 | llama_cpp_python_install_target(ggml-musa) 114 | llama_cpp_python_install_target(ggml-rpc) 115 | llama_cpp_python_install_target(ggml-sycl) 116 | llama_cpp_python_install_target(ggml-vulkan) 117 | 118 | # Workaround for Windows + CUDA https://github.com/abetlen/llama-cpp-python/issues/563 119 | if (WIN32) 120 | install( 121 | FILES $<TARGET_RUNTIME_DLLS:llama> 122 | DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib 123 | ) 124 | install( 125 | FILES $<TARGET_RUNTIME_DLLS:llama> 126 | DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib 127 | ) 128 | install( 129 | FILES $<TARGET_RUNTIME_DLLS:ggml> 130 | DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib 131 | ) 132 | install( 133 | FILES $<TARGET_RUNTIME_DLLS:ggml> 134 | DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib 135 | ) 136 | endif() 137 | 138 | if (LLAVA_BUILD) 139 | if (LLAMA_CUBLAS OR LLAMA_CUDA) 140 | add_compile_definitions(GGML_USE_CUBLAS) 141 | add_compile_definitions(GGML_USE_CUDA) 142 | endif() 143 | 144 | if (LLAMA_METAL) 145 | add_compile_definitions(GGML_USE_METAL) 146 | endif() 147 | 148 | # Building llava 149 | add_subdirectory(vendor/llama.cpp/tools/mtmd) 150 | set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava") 151 | 152 | if (WIN32) 153 | set_target_properties(llava_shared PROPERTIES CUDA_ARCHITECTURES OFF) 154 | endif() 155 | llama_cpp_python_install_target(llava_shared) 156 | if (WIN32) 157 | install( 158 | FILES $<TARGET_RUNTIME_DLLS:llava_shared> 159 | DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib 160 | ) 161 | install( 162 | FILES $<TARGET_RUNTIME_DLLS:llava_shared> 163 | DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib 164 | ) 165 | endif() 166 | 167 | # Fix for llava build: Add include directory for llama.h 168 | # Move these commands after the add_subdirectory call 169 | target_include_directories(llava PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) 170 | target_include_directories(llava PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include) 171 | 172 | if (BUILD_SHARED_LIBS) 173 | target_include_directories(llava_shared PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) 174 | target_include_directories(llava_shared PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include) 175 | endif() 176 | 177 | target_include_directories(llama-llava-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) 178 | target_include_directories(llama-minicpmv-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include) 179 | endif() 180 | endif() 181 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Andrei Betlen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | update: 2 | poetry install 3 | git submodule update --init --recursive 4 | 5 | update.vendor: 6 | cd vendor/llama.cpp && git pull origin master 7 | 8 | deps: 9 | python3 -m pip install --upgrade pip 10 | python3 -m pip install -e ".[all]" 11 | 12 | build: 13 | python3 -m pip install --verbose -e . 14 | 15 | build.debug: 16 | python3 -m pip install \ 17 | --verbose \ 18 | --config-settings=cmake.verbose=true \ 19 | --config-settings=logging.level=INFO \ 20 | --config-settings=install.strip=false \ 21 | --config-settings=cmake.args="-DCMAKE_BUILD_TYPE=Debug;-DCMAKE_C_FLAGS='-ggdb -O0';-DCMAKE_CXX_FLAGS='-ggdb -O0'" \ 22 | --editable . 23 | 24 | build.debug.extra: 25 | python3 -m pip install \ 26 | --verbose \ 27 | --config-settings=cmake.verbose=true \ 28 | --config-settings=logging.level=INFO \ 29 | --config-settings=install.strip=false \ 30 | --config-settings=cmake.args="-DCMAKE_BUILD_TYPE=Debug;-DCMAKE_C_FLAGS='-fsanitize=address -ggdb -O0';-DCMAKE_CXX_FLAGS='-fsanitize=address -ggdb -O0'" \ 31 | --editable . 32 | 33 | build.cuda: 34 | CMAKE_ARGS="-DGGML_CUDA=on" python3 -m pip install --verbose -e . 35 | 36 | build.openblas: 37 | CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" python3 -m pip install --verbose -e . 38 | 39 | build.blis: 40 | CMAKE_ARGS="-DGGML_BLAS=on -DGGML_BLAS_VENDOR=FLAME" python3 -m pip install --verbose -e . 41 | 42 | build.metal: 43 | CMAKE_ARGS="-DGGML_METAL=on" python3 -m pip install --verbose -e . 44 | 45 | build.vulkan: 46 | CMAKE_ARGS="-DGGML_VULKAN=on" python3 -m pip install --verbose -e . 47 | 48 | build.kompute: 49 | CMAKE_ARGS="-DGGML_KOMPUTE=on" python3 -m pip install --verbose -e . 50 | 51 | build.sycl: 52 | CMAKE_ARGS="-DGGML_SYCL=on" python3 -m pip install --verbose -e . 53 | 54 | build.rpc: 55 | CMAKE_ARGS="-DGGML_RPC=on" python3 -m pip install --verbose -e . 56 | 57 | build.sdist: 58 | python3 -m build --sdist --verbose 59 | 60 | deploy.pypi: 61 | python3 -m twine upload dist/* 62 | 63 | deploy.gh-docs: 64 | mkdocs build 65 | mkdocs gh-deploy 66 | 67 | test: 68 | python3 -m pytest --full-trace -v 69 | 70 | docker: 71 | docker build -t llama-cpp-python:latest -f docker/simple/Dockerfile . 
72 | 73 | run-server: 74 | python3 -m llama_cpp.server --model ${MODEL} 75 | 76 | clean: 77 | - cd vendor/llama.cpp && make clean 78 | - cd vendor/llama.cpp && rm libllama.so 79 | - rm -rf _skbuild 80 | - rm llama_cpp/lib/*.so 81 | - rm llama_cpp/lib/*.dylib 82 | - rm llama_cpp/lib/*.metal 83 | - rm llama_cpp/lib/*.dll 84 | - rm llama_cpp/lib/*.lib 85 | 86 | .PHONY: \ 87 | update \ 88 | update.vendor \ 89 | build \ 90 | build.cuda \ 91 | build.opencl \ 92 | build.openblas \ 93 | build.sdist \ 94 | deploy.pypi \ 95 | deploy.gh-docs \ 96 | docker \ 97 | clean 98 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | ### Install Docker Server 2 | > [!IMPORTANT] 3 | > This was tested with Docker running on Linux.
If you can get it working on Windows or macOS, please update this `README.md` with a PR!
4 | 5 | [Install Docker Engine](https://docs.docker.com/engine/install) 6 | 7 | 8 | ## Simple Dockerfiles for building the llama-cpp-python server with external model bin files 9 | ### openblas_simple 10 | A simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image: 11 | ``` 12 | cd ./openblas_simple 13 | docker build -t openblas_simple . 14 | docker run --cap-add SYS_RESOURCE -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t openblas_simple 15 | ``` 16 | where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system. 17 | 18 | ### cuda_simple 19 | > [!WARNING] 20 | > Nvidia GPU CuBLAS support requires an Nvidia GPU with sufficient VRAM (approximately as much as the size in the table below) and Docker Nvidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html))
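A quick way to confirm that Docker can actually see the GPU before building (the CUDA base image tag below is only an example, any tag matching your driver works):
```
docker run --rm --gpus=all nvidia/cuda:12.5.0-base-ubuntu22.04 nvidia-smi
```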
21 | 22 | A simple Dockerfile for CUDA-accelerated CuBLAS, where the model is located outside the Docker image: 23 | 24 | ``` 25 | cd ./cuda_simple 26 | docker build -t cuda_simple . 27 | docker run --gpus=all --cap-add SYS_RESOURCE -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t cuda_simple 28 | ``` 29 | where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system. 30 | 31 | -------------------------------------------------------------------------- 32 | 33 | ### "Open-Llama-in-a-box" 34 | Download an Apache V2.0 licensed 3B-parameter Open LLaMA model and install it into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server: 35 | ``` 36 | $ cd ./open_llama 37 | ./build.sh 38 | ./start.sh 39 | ``` 40 | 41 | ### Manually choose your own Llama model from Hugging Face 42 | `python3 ./hug_model.py -a TheBloke -t llama` 43 | You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step, e.g.: 44 | ``` 45 | docker $ ls -lh *.bin 46 | -rw-rw-r-- 1 user user 4.8G May 23 18:30 q5_1.bin 47 | lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> q5_1.bin 48 | ``` 49 | 50 | > [!NOTE] 51 | > Make sure you have enough disk space to download the model. As the model is then copied into the image you will need at least 52 | **TWICE** as much disk space as the size of the model:
53 | 54 | | Model | Quantized size | 55 | |------:|----------------:| 56 | | 3B | 3 GB | 57 | | 7B | 5 GB | 58 | | 13B | 10 GB | 59 | | 33B | 25 GB | 60 | | 65B | 50 GB | 61 | 62 | 63 | > [!NOTE] 64 | > If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...` 65 | -------------------------------------------------------------------------------- /docker/cuda_simple/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG CUDA_IMAGE="12.5.0-devel-ubuntu22.04" 2 | FROM nvidia/cuda:${CUDA_IMAGE} 3 | 4 | # We need to set the host to 0.0.0.0 to allow outside access 5 | ENV HOST 0.0.0.0 6 | 7 | RUN apt-get update && apt-get upgrade -y \ 8 | && apt-get install -y git build-essential \ 9 | python3 python3-pip gcc wget \ 10 | ocl-icd-opencl-dev opencl-headers clinfo \ 11 | libclblast-dev libopenblas-dev \ 12 | && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd 13 | 14 | COPY . . 15 | 16 | # setting build related env vars 17 | ENV CUDA_DOCKER_ARCH=all 18 | ENV GGML_CUDA=1 19 | 20 | # Install depencencies 21 | RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context 22 | 23 | # Install llama-cpp-python (build with cuda) 24 | RUN CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python 25 | 26 | # Run the server 27 | CMD python3 -m llama_cpp.server 28 | -------------------------------------------------------------------------------- /docker/open_llama/Dockerfile: -------------------------------------------------------------------------------- 1 | # Define the image argument and provide a default value 2 | ARG IMAGE=python:3-slim-bookworm 3 | 4 | # Use the image as specified 5 | FROM ${IMAGE} 6 | 7 | # Re-declare the ARG after FROM 8 | ARG IMAGE 9 | 10 | # Update and upgrade the existing packages 11 | RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ 12 | python3 \ 13 | python3-pip \ 14 | ninja-build \ 15 | build-essential \ 16 | && apt-get clean \ 17 | && rm -rf /var/lib/apt/lists/* 18 | 19 | RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context 20 | 21 | # Perform the conditional installations based on the image 22 | RUN echo "Image: ${IMAGE}" && \ 23 | if [ "${IMAGE}" = "python:3-slim-bookworm" ] ; then \ 24 | echo "OpenBLAS install:" && \ 25 | apt-get install -y --no-install-recommends libopenblas-dev && \ 26 | CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-python --verbose; \ 27 | else \ 28 | echo "CuBLAS install:" && \ 29 | CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python --verbose; \ 30 | fi 31 | 32 | # Clean up apt cache 33 | RUN rm -rf /var/lib/apt/lists/* 34 | 35 | # Set a working directory for better clarity 36 | WORKDIR /app 37 | 38 | # Copy files to the app directory 39 | RUN echo "Installing model...this can take some time..." 
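# model.bin is the symlink that hug_model.py creates next to this Dockerfile;
# the COPY below bakes the selected quantized model file into the image.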
40 | COPY ./model.bin /app/model.bin 41 | COPY ./start_server.sh /app/start_server.sh 42 | 43 | # Make the server start script executable 44 | RUN chmod +x /app/start_server.sh 45 | 46 | # Set environment variable for the host 47 | ENV HOST=0.0.0.0 48 | 49 | # Expose a port for the server 50 | EXPOSE 8000 51 | 52 | # Run the server start script 53 | CMD ["/bin/sh", "/app/start_server.sh"] 54 | -------------------------------------------------------------------------------- /docker/open_llama/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | MODEL="open_llama_3b" 4 | # Get open_llama_3b_ggml q5_1 quantization 5 | python3 ./hug_model.py -a SlyEcho -s ${MODEL} -f "q5_1" 6 | ls -lh *.bin 7 | 8 | # Build the default OpenBLAS image 9 | docker build -t $MODEL . 10 | docker images | egrep "^(REPOSITORY|$MODEL)" 11 | 12 | echo 13 | echo "To start the docker container run:" 14 | echo "docker run -t -p 8000:8000 $MODEL" 15 | -------------------------------------------------------------------------------- /docker/open_llama/hug_model.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import os 4 | import struct 5 | import argparse 6 | 7 | def make_request(url, params=None): 8 | print(f"Making request to {url}...") 9 | response = requests.get(url, params=params) 10 | if response.status_code == 200: 11 | return json.loads(response.text) 12 | else: 13 | print(f"Request failed with status code {response.status_code}") 14 | return None 15 | 16 | def check_magic_and_version(filename): 17 | with open(filename, 'rb') as f: 18 | # Read the first 6 bytes from the file 19 | data = f.read(6) 20 | 21 | # Unpack the binary data, interpreting the first 4 bytes as a little-endian unsigned int 22 | # and the next 2 bytes as a little-endian unsigned short 23 | magic, version = struct.unpack('= 10485760: # 10 MB 40 | print('.', end='', flush=True) 41 | total_downloaded = 0 42 | print("\nDownload complete.") 43 | 44 | # Creating a symbolic link from destination to "model.bin" 45 | if os.path.isfile("model.bin"): 46 | os.remove("model.bin") # remove the existing link if any 47 | os.symlink(destination, "model.bin") 48 | else: 49 | print(f"Download failed with status code {response.status_code}") 50 | 51 | def get_user_choice(model_list): 52 | # Print the enumerated list 53 | print("\n") 54 | for i, (model_id, rfilename) in enumerate(model_list): 55 | print(f"{i+1}: Model ID: {model_id}, RFilename: {rfilename}") 56 | 57 | # Get user's choice 58 | choice = input("Choose a model to download by entering the corresponding number: ") 59 | try: 60 | index = int(choice) - 1 61 | if 0 <= index < len(model_list): 62 | # Return the chosen model 63 | return model_list[index] 64 | else: 65 | print("Invalid choice.") 66 | except ValueError: 67 | print("Invalid input. Please enter a number corresponding to a model.") 68 | except IndexError: 69 | print("Invalid choice. 
Index out of range.") 70 | 71 | return None 72 | 73 | def main(): 74 | # Create an argument parser 75 | parser = argparse.ArgumentParser(description='Process some parameters.') 76 | 77 | # Arguments 78 | parser.add_argument('-v', '--version', type=int, default=0x0003, 79 | help='hexadecimal version number of ggml file') 80 | parser.add_argument('-a', '--author', type=str, default='TheBloke', 81 | help='HuggingFace author filter') 82 | parser.add_argument('-t', '--tag', type=str, default='llama', 83 | help='HuggingFace tag filter') 84 | parser.add_argument('-s', '--search', type=str, default='', 85 | help='HuggingFace search filter') 86 | parser.add_argument('-f', '--filename', type=str, default='q5_1', 87 | help='HuggingFace model repository filename substring match') 88 | 89 | # Parse the arguments 90 | args = parser.parse_args() 91 | 92 | # Define the parameters 93 | params = { 94 | "author": args.author, 95 | "tags": args.tag, 96 | "search": args.search 97 | } 98 | 99 | models = make_request('https://huggingface.co/api/models', params=params) 100 | if models is None: 101 | return 102 | 103 | model_list = [] 104 | # Iterate over the models 105 | for model in models: 106 | model_id = model['id'] 107 | model_info = make_request(f'https://huggingface.co/api/models/{model_id}') 108 | if model_info is None: 109 | continue 110 | 111 | for sibling in model_info.get('siblings', []): 112 | rfilename = sibling.get('rfilename') 113 | if rfilename and args.filename in rfilename: 114 | model_list.append((model_id, rfilename)) 115 | 116 | # Choose the model 117 | model_list.sort(key=lambda x: x[0]) 118 | if len(model_list) == 0: 119 | print("No models found") 120 | exit(1) 121 | elif len(model_list) == 1: 122 | model_choice = model_list[0] 123 | else: 124 | model_choice = get_user_choice(model_list) 125 | 126 | if model_choice is not None: 127 | model_id, rfilename = model_choice 128 | url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}" 129 | dest = f"{model_id.replace('/', '_')}_{rfilename}" 130 | download_file(url, dest) 131 | _, version = check_magic_and_version(dest) 132 | if version != args.version: 133 | print(f"Warning: Expected version {args.version}, but found different version in the file.") 134 | else: 135 | print("Error - model choice was None") 136 | exit(2) 137 | 138 | if __name__ == '__main__': 139 | main() 140 | -------------------------------------------------------------------------------- /docker/open_llama/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | MODEL="open_llama_3b" 4 | 5 | # Start Docker container 6 | docker run --cap-add SYS_RESOURCE -p 8000:8000 -t $MODEL & 7 | sleep 10 8 | echo 9 | docker ps | egrep "(^CONTAINER|$MODEL)" 10 | 11 | # Test the model works 12 | echo 13 | curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{ 14 | "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", 15 | "stop": [ 16 | "\n", 17 | "###" 18 | ] 19 | }' | grep Paris 20 | if [ $? -eq 0 ] 21 | then 22 | echo 23 | echo "$MODEL is working!!" 24 | else 25 | echo 26 | echo "ERROR: $MODEL not replying." 
27 | exit 1 28 | fi 29 | -------------------------------------------------------------------------------- /docker/open_llama/start_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # For mlock support 4 | ulimit -l unlimited 5 | 6 | if [ "$IMAGE" = "python:3-slim-bullseye" ]; then 7 | python3 -B -m llama_cpp.server --model /app/model.bin 8 | else 9 | # You may have to reduce --n_gpu_layers=1000 to 20 or less if you don't have enough VRAM 10 | python3 -B -m llama_cpp.server --model /app/model.bin --n_gpu_layers=1000 11 | fi 12 | -------------------------------------------------------------------------------- /docker/openblas_simple/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3-slim-bookworm 2 | 3 | # We need to set the host to 0.0.0.0 to allow outside access 4 | ENV HOST 0.0.0.0 5 | 6 | COPY . . 7 | 8 | # Install the package 9 | RUN apt update && apt install -y libopenblas-dev ninja-build build-essential pkg-config \ 10 | && apt-get clean \ 11 | && rm -rf /var/lib/apt/lists/* /tmp/* 12 | 13 | RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context 14 | 15 | RUN CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" pip install llama_cpp_python --verbose 16 | 17 | # Run the server 18 | CMD python3 -m llama_cpp.server 19 | -------------------------------------------------------------------------------- /docker/simple/Dockerfile: -------------------------------------------------------------------------------- 1 | # Define the image argument and provide a default value 2 | ARG IMAGE=python:3-slim-bookworm 3 | 4 | # Use the image as specified 5 | FROM ${IMAGE} 6 | 7 | # Re-declare the ARG after FROM 8 | ARG IMAGE 9 | 10 | # Update and upgrade the existing packages 11 | RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ 12 | python3 \ 13 | python3-pip \ 14 | ninja-build \ 15 | libopenblas-dev \ 16 | build-essential \ 17 | && apt-get clean \ 18 | && rm -rf /var/lib/apt/lists/* /tmp/* 19 | 20 | RUN mkdir /app 21 | WORKDIR /app 22 | COPY . /app 23 | 24 | RUN python3 -m pip install --upgrade pip 25 | 26 | RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context 27 | 28 | RUN pip install llama-cpp-python --verbose; 29 | 30 | # Set environment variable for the host 31 | ENV HOST=0.0.0.0 32 | ENV PORT=8000 33 | 34 | # Expose a port for the server 35 | EXPOSE 8000 36 | 37 | # Run the server start script 38 | CMD ["/bin/sh", "/app/docker/simple/run.sh"] 39 | -------------------------------------------------------------------------------- /docker/simple/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | make build 4 | uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT 5 | -------------------------------------------------------------------------------- /docs/api-reference.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: API Reference 3 | --- 4 | 5 | ## High Level API 6 | 7 | High-level Python bindings for llama.cpp. 
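Before the generated reference below, a minimal usage sketch of the `Llama` class (the model path is a placeholder for a local GGUF file):

```python
from llama_cpp import Llama

# Load a local GGUF model (placeholder path).
llm = Llama(model_path="./models/7B/ggml-model.gguf")

# Text completion via __call__ / create_completion.
output = llm(
    "Q: Name the planets in the solar system. A: ",
    max_tokens=32,
    stop=["Q:", "\n"],
    echo=True,
)
print(output["choices"][0]["text"])

# Chat completion with OpenAI-style messages.
response = llm.create_chat_completion(
    messages=[{"role": "user", "content": "What is the capital of France?"}]
)
print(response["choices"][0]["message"]["content"])
```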
8 | 9 | ::: llama_cpp.Llama 10 | options: 11 | members: 12 | - __init__ 13 | - tokenize 14 | - detokenize 15 | - reset 16 | - eval 17 | - sample 18 | - generate 19 | - create_embedding 20 | - embed 21 | - create_completion 22 | - __call__ 23 | - create_chat_completion 24 | - create_chat_completion_openai_v1 25 | - set_cache 26 | - save_state 27 | - load_state 28 | - token_bos 29 | - token_eos 30 | - from_pretrained 31 | show_root_heading: true 32 | 33 | ::: llama_cpp.LlamaGrammar 34 | options: 35 | members: 36 | - from_string 37 | - from_json_schema 38 | 39 | ::: llama_cpp.LlamaCache 40 | options: 41 | show_root_heading: true 42 | 43 | ::: llama_cpp.LlamaState 44 | options: 45 | show_root_heading: true 46 | 47 | ::: llama_cpp.LogitsProcessor 48 | options: 49 | show_root_heading: true 50 | 51 | ::: llama_cpp.LogitsProcessorList 52 | options: 53 | show_root_heading: true 54 | 55 | ::: llama_cpp.StoppingCriteria 56 | options: 57 | show_root_heading: true 58 | 59 | ::: llama_cpp.StoppingCriteriaList 60 | options: 61 | show_root_heading: true 62 | 63 | ## Low Level API 64 | 65 | Low-level Python bindings for llama.cpp using Python's ctypes library. 66 | 67 | ::: llama_cpp.llama_cpp 68 | options: 69 | show_if_no_docstring: true 70 | # filter only members starting with `llama_` 71 | filters: 72 | - "^llama_" 73 | 74 | ::: llama_cpp.llama_cpp 75 | options: 76 | show_if_no_docstring: true 77 | show_root_heading: false 78 | show_root_toc_entry: false 79 | heading_level: 4 80 | # filter only members starting with `LLAMA_` 81 | filters: 82 | - "^LLAMA_" 83 | 84 | ## Misc 85 | 86 | ::: llama_cpp.llama_types 87 | options: 88 | show_if_no_docstring: true -------------------------------------------------------------------------------- /docs/changelog.md: -------------------------------------------------------------------------------- 1 | -8<- "CHANGELOG.md" -------------------------------------------------------------------------------- /docs/icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Getting Started 3 | --- 4 | 5 | -8<- "README.md" -------------------------------------------------------------------------------- /docs/install/macos.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: MacOS Install with Metal GPU 3 | --- 4 | 5 | **(1) Make sure you have xcode installed... at least the command line parts** 6 | ``` 7 | # check the path of your xcode install 8 | xcode-select -p 9 | 10 | # xcode installed returns 11 | # /Applications/Xcode-beta.app/Contents/Developer 12 | 13 | # if xcode is missing then install it... 
it takes ages; 14 | xcode-select --install 15 | ``` 16 | 17 | **(2) Install the conda version for MacOS that supports Metal GPU** 18 | ``` 19 | wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh 20 | bash Miniforge3-MacOSX-arm64.sh 21 | ``` 22 | 23 | **(3) Make a conda environment** 24 | ``` 25 | conda create -n llama python=3.9.16 26 | conda activate llama 27 | ``` 28 | 29 | **(4) Install the LATEST llama-cpp-python...which happily supports MacOS Metal GPU as of version 0.1.62** 30 | *(you needed xcode installed in order pip to build/compile the C++ code)* 31 | ``` 32 | pip uninstall llama-cpp-python -y 33 | CMAKE_ARGS="-DGGML_METAL=on" pip install -U llama-cpp-python --no-cache-dir 34 | pip install 'llama-cpp-python[server]' 35 | 36 | # you should now have llama-cpp-python v0.1.62 or higher installed 37 | llama-cpp-python         0.1.68 38 | 39 | ``` 40 | 41 | **(5) Download a v3 gguf v2 model** 42 | - **ggufv2** 43 | - file name ends with **Q4_0.gguf** - indicating it is 4bit quantized, with quantisation method 0 44 | 45 | https://huggingface.co/TheBloke/CodeLlama-7B-GGUF 46 | 47 | 48 | **(6) run the llama-cpp-python API server with MacOS Metal GPU support** 49 | ``` 50 | # config your ggml model path 51 | # make sure it is gguf v2 52 | # make sure it is q4_0 53 | export MODEL=[path to your llama.cpp ggml models]]/[ggml-model-name]]Q4_0.gguf 54 | python3 -m llama_cpp.server --model $MODEL --n_gpu_layers 1 55 | ``` 56 | 57 | ***Note:** If you omit the `--n_gpu_layers 1` then CPU will be used* 58 | 59 | 60 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | mkdocs 2 | mkdocs-material 3 | mkdocstrings[python] -------------------------------------------------------------------------------- /docs/server.md: -------------------------------------------------------------------------------- 1 | # OpenAI Compatible Server 2 | 3 | `llama-cpp-python` offers an OpenAI API compatible web server. 4 | 5 | This web server can be used to serve local models and easily connect them to existing clients. 6 | 7 | ## Setup 8 | 9 | ### Installation 10 | 11 | The server can be installed by running the following command: 12 | 13 | ```bash 14 | pip install llama-cpp-python[server] 15 | ``` 16 | 17 | ### Running the server 18 | 19 | The server can then be started by running the following command: 20 | 21 | ```bash 22 | python3 -m llama_cpp.server --model 23 | ``` 24 | 25 | ### Server options 26 | 27 | For a full list of options, run: 28 | 29 | ```bash 30 | python3 -m llama_cpp.server --help 31 | ``` 32 | 33 | NOTE: All server options are also available as environment variables. For example, `--model` can be set by setting the `MODEL` environment variable. 34 | 35 | Check out the server config reference below settings for more information on the available options. 36 | CLI arguments and environment variables are available for all of the fields defined in [`ServerSettings`](#llama_cpp.server.settings.ServerSettings) and [`ModelSettings`](#llama_cpp.server.settings.ModelSettings) 37 | 38 | Additionally the server supports configuration check out the [configuration section](#configuration-and-multi-model-support) for more information and examples. 39 | 40 | 41 | ## Guides 42 | 43 | ### Code Completion 44 | 45 | `llama-cpp-python` supports code completion via GitHub Copilot. 
46 | 47 | *NOTE*: Without GPU acceleration this is unlikely to be fast enough to be usable. 48 | 49 | You'll first need to download one of the available code completion models in GGUF format: 50 | 51 | - [replit-code-v1_5-GGUF](https://huggingface.co/abetlen/replit-code-v1_5-3b-GGUF) 52 | 53 | Then you'll need to run the OpenAI compatible web server with a increased context size substantially for GitHub Copilot requests: 54 | 55 | ```bash 56 | python3 -m llama_cpp.server --model --n_ctx 16192 57 | ``` 58 | 59 | Then just update your settings in `.vscode/settings.json` to point to your code completion server: 60 | 61 | ```json 62 | { 63 | // ... 64 | "github.copilot.advanced": { 65 | "debug.testOverrideProxyUrl": "http://:", 66 | "debug.overrideProxyUrl": "http://:" 67 | } 68 | // ... 69 | } 70 | ``` 71 | 72 | ### Function Calling 73 | 74 | `llama-cpp-python` supports structured function calling based on a JSON schema. 75 | Function calling is completely compatible with the OpenAI function calling API and can be used by connecting with the official OpenAI Python client. 76 | 77 | You'll first need to download one of the available function calling models in GGUF format: 78 | 79 | - [functionary](https://huggingface.co/meetkai) 80 | 81 | Then when you run the server you'll need to also specify either `functionary-v1` or `functionary-v2` chat_format. 82 | 83 | Note that since functionary requires a HF Tokenizer due to discrepancies between llama.cpp and HuggingFace's tokenizers as mentioned [here](https://github.com/abetlen/llama-cpp-python/blob/main?tab=readme-ov-file#function-calling), you will need to pass in the path to the tokenizer too. The tokenizer files are already included in the respective HF repositories hosting the gguf files. 84 | 85 | ```bash 86 | python3 -m llama_cpp.server --model --chat_format functionary-v2 --hf_pretrained_model_name_or_path 87 | ``` 88 | 89 | Check out this [example notebook](https://github.com/abetlen/llama-cpp-python/blob/main/examples/notebooks/Functions.ipynb) for a walkthrough of some interesting use cases for function calling. 90 | 91 | ### Multimodal Models 92 | 93 | `llama-cpp-python` supports the llava1.5 family of multi-modal models which allow the language model to 94 | read information from both text and images. 
95 | 96 | You'll first need to download one of the available multi-modal models in GGUF format: 97 | 98 | - [llava-v1.5-7b](https://huggingface.co/mys/ggml_llava-v1.5-7b) 99 | - [llava-v1.5-13b](https://huggingface.co/mys/ggml_llava-v1.5-13b) 100 | - [bakllava-1-7b](https://huggingface.co/mys/ggml_bakllava-1) 101 | - [llava-v1.6-34b](https://huggingface.co/cjpais/llava-v1.6-34B-gguf) 102 | - [moondream2](https://huggingface.co/vikhyatk/moondream2) 103 | 104 | Then when you run the server you'll need to also specify the path to the clip model used for image embedding and the `llava-1-5` chat_format 105 | 106 | ```bash 107 | python3 -m llama_cpp.server --model --clip_model_path --chat_format llava-1-5 108 | ``` 109 | 110 | Then you can just use the OpenAI API as normal 111 | 112 | ```python3 113 | from openai import OpenAI 114 | 115 | client = OpenAI(base_url="http://:/v1", api_key="sk-xxx") 116 | response = client.chat.completions.create( 117 | model="gpt-4-vision-preview", 118 | messages=[ 119 | { 120 | "role": "user", 121 | "content": [ 122 | { 123 | "type": "image_url", 124 | "image_url": { 125 | "url": "" 126 | }, 127 | }, 128 | {"type": "text", "text": "What does the image say"}, 129 | ], 130 | } 131 | ], 132 | ) 133 | print(response) 134 | ``` 135 | 136 | ## Configuration and Multi-Model Support 137 | 138 | The server supports configuration via a JSON config file that can be passed using the `--config_file` parameter or the `CONFIG_FILE` environment variable. 139 | 140 | ```bash 141 | python3 -m llama_cpp.server --config_file 142 | ``` 143 | 144 | Config files support all of the server and model options supported by the cli and environment variables however instead of only a single model the config file can specify multiple models. 145 | 146 | The server supports routing requests to multiple models based on the `model` parameter in the request which matches against the `model_alias` in the config file. 147 | 148 | At the moment only a single model is loaded into memory at, the server will automatically load and unload models as needed. 
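From the client side, the `model` field in the request selects the alias to route to; a minimal sketch using the OpenAI Python client (host, port, and alias are placeholders matching the example config below):

```python
from openai import OpenAI

# Point the client at the llama-cpp-python server (placeholder host/port).
client = OpenAI(base_url="http://localhost:8000/v1", api_key="sk-xxx")

# The `model` value is matched against `model_alias` in the config file.
chat = client.chat.completions.create(
    model="gpt-3.5-turbo",  # alias defined in the config below
    messages=[{"role": "user", "content": "Hello!"}],
)
print(chat.choices[0].message.content)
```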
149 | 150 | ```json 151 | { 152 | "host": "0.0.0.0", 153 | "port": 8080, 154 | "models": [ 155 | { 156 | "model": "models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf", 157 | "model_alias": "gpt-3.5-turbo", 158 | "chat_format": "chatml", 159 | "n_gpu_layers": -1, 160 | "offload_kqv": true, 161 | "n_threads": 12, 162 | "n_batch": 512, 163 | "n_ctx": 2048 164 | }, 165 | { 166 | "model": "models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf", 167 | "model_alias": "gpt-4", 168 | "chat_format": "chatml", 169 | "n_gpu_layers": -1, 170 | "offload_kqv": true, 171 | "n_threads": 12, 172 | "n_batch": 512, 173 | "n_ctx": 2048 174 | }, 175 | { 176 | "model": "models/ggml_llava-v1.5-7b/ggml-model-q4_k.gguf", 177 | "model_alias": "gpt-4-vision-preview", 178 | "chat_format": "llava-1-5", 179 | "clip_model_path": "models/ggml_llava-v1.5-7b/mmproj-model-f16.gguf", 180 | "n_gpu_layers": -1, 181 | "offload_kqv": true, 182 | "n_threads": 12, 183 | "n_batch": 512, 184 | "n_ctx": 2048 185 | }, 186 | { 187 | "model": "models/mistral-7b-v0.1-GGUF/ggml-model-Q4_K.gguf", 188 | "model_alias": "text-davinci-003", 189 | "n_gpu_layers": -1, 190 | "offload_kqv": true, 191 | "n_threads": 12, 192 | "n_batch": 512, 193 | "n_ctx": 2048 194 | }, 195 | { 196 | "model": "models/replit-code-v1_5-3b-GGUF/replit-code-v1_5-3b.Q4_0.gguf", 197 | "model_alias": "copilot-codex", 198 | "n_gpu_layers": -1, 199 | "offload_kqv": true, 200 | "n_threads": 12, 201 | "n_batch": 1024, 202 | "n_ctx": 9216 203 | } 204 | ] 205 | } 206 | ``` 207 | 208 | The config file format is defined by the [`ConfigFileSettings`](#llama_cpp.server.settings.ConfigFileSettings) class. 209 | 210 | ## Server Options Reference 211 | 212 | ::: llama_cpp.server.settings.ConfigFileSettings 213 | options: 214 | show_if_no_docstring: true 215 | 216 | ::: llama_cpp.server.settings.ServerSettings 217 | options: 218 | show_if_no_docstring: true 219 | 220 | ::: llama_cpp.server.settings.ModelSettings 221 | options: 222 | show_if_no_docstring: true 223 | -------------------------------------------------------------------------------- /examples/batch-processing/server.py: -------------------------------------------------------------------------------- 1 | """llama-cpp-python server from scratch in a single file. 
2 | """ 3 | 4 | # import llama_cpp 5 | 6 | # path = b"../../models/Qwen1.5-0.5B-Chat-GGUF/qwen1_5-0_5b-chat-q8_0.gguf" 7 | 8 | # model_params = llama_cpp.llama_model_default_params() 9 | # model = llama_cpp.llama_load_model_from_file(path, model_params) 10 | 11 | # if model is None: 12 | # raise RuntimeError(f"Failed to load model from file: {path}") 13 | 14 | 15 | # ctx_params = llama_cpp.llama_context_default_params() 16 | # ctx = llama_cpp.llama_new_context_with_model(model, ctx_params) 17 | 18 | # if ctx is None: 19 | # raise RuntimeError("Failed to create context") 20 | 21 | 22 | from fastapi import FastAPI 23 | 24 | app = FastAPI() 25 | 26 | import openai.types.chat as types 27 | 28 | 29 | @app.post("/v1/chat/completions") 30 | def create_chat_completions(): 31 | return {"message": "Hello World"} 32 | -------------------------------------------------------------------------------- /examples/gradio_chat/local.py: -------------------------------------------------------------------------------- 1 | import llama_cpp 2 | import llama_cpp.llama_tokenizer 3 | 4 | import gradio as gr 5 | 6 | llama = llama_cpp.Llama.from_pretrained( 7 | repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF", 8 | filename="*q8_0.gguf", 9 | tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained( 10 | "Qwen/Qwen1.5-0.5B" 11 | ), 12 | verbose=False, 13 | ) 14 | 15 | model = "gpt-3.5-turbo" 16 | 17 | 18 | def predict(message, history): 19 | messages = [] 20 | 21 | for user_message, assistant_message in history: 22 | messages.append({"role": "user", "content": user_message}) 23 | messages.append({"role": "assistant", "content": assistant_message}) 24 | 25 | messages.append({"role": "user", "content": message}) 26 | 27 | response = llama.create_chat_completion_openai_v1( 28 | model=model, messages=messages, stream=True 29 | ) 30 | 31 | text = "" 32 | for chunk in response: 33 | content = chunk.choices[0].delta.content 34 | if content: 35 | text += content 36 | yield text 37 | 38 | 39 | js = """function () { 40 | gradioURL = window.location.href 41 | if (!gradioURL.endsWith('?__theme=dark')) { 42 | window.location.replace(gradioURL + '?__theme=dark'); 43 | } 44 | }""" 45 | 46 | css = """ 47 | footer { 48 | visibility: hidden; 49 | } 50 | full-height { 51 | height: 100%; 52 | } 53 | """ 54 | 55 | with gr.Blocks(theme=gr.themes.Soft(), js=js, css=css, fill_height=True) as demo: 56 | gr.ChatInterface( 57 | predict, 58 | fill_height=True, 59 | examples=[ 60 | "What is the capital of France?", 61 | "Who was the first person on the moon?", 62 | ], 63 | ) 64 | 65 | 66 | if __name__ == "__main__": 67 | demo.launch() 68 | -------------------------------------------------------------------------------- /examples/gradio_chat/server.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | 3 | from openai import OpenAI 4 | 5 | client = OpenAI(base_url="http://localhost:8000/v1", api_key="llama.cpp") 6 | 7 | model = "gpt-3.5-turbo" 8 | 9 | 10 | def predict(message, history): 11 | messages = [] 12 | 13 | for user_message, assistant_message in history: 14 | messages.append({"role": "user", "content": user_message}) 15 | messages.append({"role": "assistant", "content": assistant_message}) 16 | 17 | messages.append({"role": "user", "content": message}) 18 | 19 | response = client.chat.completions.create( 20 | model=model, messages=messages, stream=True 21 | ) 22 | 23 | text = "" 24 | for chunk in response: 25 | content = chunk.choices[0].delta.content 26 | if content: 27 | text += 
content 28 | yield text 29 | 30 | 31 | js = """function () { 32 | gradioURL = window.location.href 33 | if (!gradioURL.endsWith('?__theme=dark')) { 34 | window.location.replace(gradioURL + '?__theme=dark'); 35 | } 36 | }""" 37 | 38 | css = """ 39 | footer { 40 | visibility: hidden; 41 | } 42 | full-height { 43 | height: 100%; 44 | } 45 | """ 46 | 47 | with gr.Blocks(theme=gr.themes.Soft(), js=js, css=css, fill_height=True) as demo: 48 | gr.ChatInterface( 49 | predict, 50 | fill_height=True, 51 | examples=[ 52 | "What is the capital of France?", 53 | "Who was the first person on the moon?", 54 | ], 55 | ) 56 | 57 | 58 | if __name__ == "__main__": 59 | demo.launch() 60 | -------------------------------------------------------------------------------- /examples/hf_pull/main.py: -------------------------------------------------------------------------------- 1 | import llama_cpp 2 | import llama_cpp.llama_tokenizer 3 | 4 | 5 | llama = llama_cpp.Llama.from_pretrained( 6 | repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF", 7 | filename="*q8_0.gguf", 8 | tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained( 9 | "Qwen/Qwen1.5-0.5B" 10 | ), 11 | verbose=False, 12 | ) 13 | 14 | response = llama.create_chat_completion( 15 | messages=[{"role": "user", "content": "What is the capital of France?"}], 16 | response_format={ 17 | "type": "json_object", 18 | "schema": { 19 | "type": "object", 20 | "properties": { 21 | "country": {"type": "string"}, 22 | "capital": {"type": "string"}, 23 | }, 24 | "required": ["country", "capital"], 25 | }, 26 | }, 27 | stream=True, 28 | ) 29 | 30 | for chunk in response: 31 | delta = chunk["choices"][0]["delta"] 32 | if "content" not in delta: 33 | continue 34 | print(delta["content"], end="", flush=True) 35 | 36 | print() 37 | -------------------------------------------------------------------------------- /examples/high_level_api/fastapi_server.py: -------------------------------------------------------------------------------- 1 | """Example FastAPI server for llama.cpp. 2 | 3 | To run this example: 4 | 5 | ```bash 6 | pip install fastapi uvicorn sse-starlette 7 | export MODEL=../models/7B/... 8 | ``` 9 | 10 | Then run: 11 | ``` 12 | uvicorn --factory llama_cpp.server.app:create_app --reload 13 | ``` 14 | 15 | or 16 | 17 | ``` 18 | python3 -m llama_cpp.server 19 | ``` 20 | 21 | Then visit http://localhost:8000/docs to see the interactive API docs. 
22 | 23 | 24 | To actually see the implementation of the server, see llama_cpp/server/app.py 25 | 26 | """ 27 | 28 | import os 29 | import uvicorn 30 | 31 | from llama_cpp.server.app import create_app 32 | 33 | if __name__ == "__main__": 34 | app = create_app() 35 | 36 | uvicorn.run( 37 | app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000)) 38 | ) 39 | -------------------------------------------------------------------------------- /examples/high_level_api/high_level_api_embedding.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from llama_cpp import Llama 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-model.bin") 7 | args = parser.parse_args() 8 | 9 | llm = Llama(model_path=args.model, embedding=True) 10 | 11 | print(llm.create_embedding("Hello world!")) 12 | -------------------------------------------------------------------------------- /examples/high_level_api/high_level_api_inference.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | 4 | from llama_cpp import Llama 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin") 8 | args = parser.parse_args() 9 | 10 | llm = Llama(model_path=args.model) 11 | 12 | output = llm( 13 | "Question: What are the names of the planets in the solar system? Answer: ", 14 | max_tokens=48, 15 | stop=["Q:", "\n"], 16 | echo=True, 17 | ) 18 | 19 | print(json.dumps(output, indent=2)) 20 | -------------------------------------------------------------------------------- /examples/high_level_api/high_level_api_infill.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from llama_cpp import Llama 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin") 7 | parser.add_argument("-p", "--prompt", type=str, default="def add(") 8 | parser.add_argument("-s", "--suffix", type=str, default="\n return sum\n\n") 9 | parser.add_argument("-i", "--spm-infill", action="store_true") 10 | args = parser.parse_args() 11 | 12 | llm = Llama(model_path=args.model, n_gpu_layers=-1, spm_infill=args.spm_infill) 13 | 14 | output = llm.create_completion( 15 | temperature=0.0, 16 | repeat_penalty=1.0, 17 | prompt=args.prompt, 18 | suffix=args.suffix, 19 | ) 20 | 21 | # Models sometimes repeat suffix in response, attempt to filter that 22 | response = output["choices"][0]["text"] 23 | response_stripped = response.rstrip() 24 | unwanted_response_suffix = args.suffix.rstrip() 25 | unwanted_response_length = len(unwanted_response_suffix) 26 | 27 | filtered = False 28 | if ( 29 | unwanted_response_suffix 30 | and response_stripped[-unwanted_response_length:] == unwanted_response_suffix 31 | ): 32 | response = response_stripped[:-unwanted_response_length] 33 | filtered = True 34 | 35 | print( 36 | f"Fill-in-Middle completion{' (filtered)' if filtered else ''}:\n\n{args.prompt}\033[32m{response}\033[{'33' if filtered else '0'}m{args.suffix}\033[0m" 37 | ) 38 | -------------------------------------------------------------------------------- /examples/high_level_api/high_level_api_streaming.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | 4 | from llama_cpp import Llama 5 | 6 | 
parser = argparse.ArgumentParser() 7 | parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin") 8 | args = parser.parse_args() 9 | 10 | llm = Llama(model_path=args.model) 11 | 12 | stream = llm( 13 | "Question: What are the names of the planets in the solar system? Answer: ", 14 | max_tokens=48, 15 | stop=["Q:", "\n"], 16 | stream=True, 17 | ) 18 | 19 | for output in stream: 20 | print(json.dumps(output, indent=2)) 21 | -------------------------------------------------------------------------------- /examples/high_level_api/langchain_custom_llm.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from llama_cpp import Llama 4 | 5 | from langchain.llms.base import LLM 6 | from typing import Optional, List, Mapping, Any 7 | 8 | 9 | class LlamaLLM(LLM): 10 | model_path: str 11 | llm: Llama 12 | 13 | @property 14 | def _llm_type(self) -> str: 15 | return "llama-cpp-python" 16 | 17 | def __init__(self, model_path: str, **kwargs: Any): 18 | model_path = model_path 19 | llm = Llama(model_path=model_path) 20 | super().__init__(model_path=model_path, llm=llm, **kwargs) 21 | 22 | def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str: 23 | response = self.llm(prompt, stop=stop or []) 24 | return response["choices"][0]["text"] 25 | 26 | @property 27 | def _identifying_params(self) -> Mapping[str, Any]: 28 | return {"model_path": self.model_path} 29 | 30 | 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin") 33 | args = parser.parse_args() 34 | 35 | # Load the model 36 | llm = LlamaLLM(model_path=args.model) 37 | 38 | # Basic Q&A 39 | answer = llm( 40 | "Question: What is the capital of France? Answer: ", stop=["Question:", "\n"] 41 | ) 42 | print(f"Answer: {answer.strip()}") 43 | 44 | # Using in a chain 45 | from langchain.prompts import PromptTemplate 46 | from langchain.chains import LLMChain 47 | 48 | prompt = PromptTemplate( 49 | input_variables=["product"], 50 | template="\n\n### Instruction:\nWrite a good name for a company that makes {product}\n\n### Response:\n", 51 | ) 52 | chain = LLMChain(llm=llm, prompt=prompt) 53 | 54 | # Run the chain only specifying the input variable. 55 | print(chain.run("colorful socks")) 56 | -------------------------------------------------------------------------------- /examples/low_level_api/Chat.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | import sys, os, datetime 3 | from common import GptParams 4 | from low_level_api_chat_cpp import LLaMAInteract 5 | 6 | 7 | def env_or_def(env, default): 8 | if env in os.environ: 9 | return os.environ[env] 10 | return default 11 | 12 | 13 | AI_NAME = env_or_def("AI_NAME", "ChatLLaMa") 14 | MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") 15 | USER_NAME = env_or_def("USER_NAME", "USER") 16 | N_PREDICTS = int(env_or_def("N_PREDICTS", "2048")) 17 | N_THREAD = int(env_or_def("N_THREAD", "8")) 18 | 19 | today = datetime.datetime.today() 20 | DATE_YEAR = today.strftime("%Y") 21 | DATE_TIME = today.strftime("%H:%M") 22 | 23 | prompt = f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}. 24 | {AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}'s requests immediately and with details and precision. 25 | There are no annotations like (30 seconds passed...) 
or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other. 26 | The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. 27 | The transcript only includes text, it does not include markup like HTML and Markdown. 28 | 29 | {USER_NAME}: Hello, {AI_NAME}! 30 | {AI_NAME}: Hello {USER_NAME}! How may I help you today? 31 | {USER_NAME}: What year is it? 32 | {AI_NAME}: We are in {DATE_YEAR}. 33 | {USER_NAME}: Please tell me the largest city in Europe. 34 | {AI_NAME}: The largest city in Europe is Moscow, the capital of Russia. 35 | {USER_NAME}: What can you tell me about Moscow? 36 | {AI_NAME}: Moscow, on the Moskva River in western Russia, is the nation's cosmopolitan capital. In its historic core is the Kremlin, a complex that's home to the president and tsarist treasures in the Armoury. Outside its walls is Red Square, Russia’s symbolic center. 37 | {USER_NAME}: What is a cat? 38 | {AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae. 39 | {USER_NAME}: How do I pass command line arguments to a Node.js program? 40 | {AI_NAME}: The arguments are stored in process.argv. 41 | 42 | argv[0] is the path to the Node. js executable. 43 | argv[1] is the path to the script file. 44 | argv[2] is the first argument passed to the script. 45 | argv[3] is the second argument passed to the script and so on. 46 | {USER_NAME}: Name a color. 47 | {AI_NAME}: Blue. 48 | {USER_NAME}: What time is it? 49 | {AI_NAME}: It is {DATE_TIME}. 50 | {USER_NAME}:""" + " ".join( 51 | sys.argv[1:] 52 | ) 53 | 54 | print("Loading model...") 55 | params = GptParams( 56 | n_ctx=2048, 57 | temp=0.7, 58 | top_k=40, 59 | top_p=0.5, 60 | repeat_last_n=256, 61 | n_batch=1024, 62 | repeat_penalty=1.17647, 63 | model=MODEL, 64 | n_threads=N_THREAD, 65 | n_predict=N_PREDICTS, 66 | use_color=True, 67 | interactive=True, 68 | antiprompt=[f"{USER_NAME}:"], 69 | input_prefix=" ", 70 | input_suffix=f"{AI_NAME}:", 71 | prompt=prompt, 72 | ) 73 | 74 | with LLaMAInteract(params) as m: 75 | m.interact() 76 | -------------------------------------------------------------------------------- /examples/low_level_api/Miku.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | import sys, os 3 | from common import GptParams 4 | from low_level_api_chat_cpp import LLaMAInteract 5 | 6 | 7 | def env_or_def(env, default): 8 | if env in os.environ: 9 | return os.environ[env] 10 | return default 11 | 12 | 13 | AI_NAME = env_or_def("AI_NAME", "Miku") 14 | MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") 15 | USER_NAME = env_or_def("USER_NAME", "Anon") 16 | N_PREDICTS = int(env_or_def("N_PREDICTS", "4096")) 17 | N_THREAD = int(env_or_def("N_THREAD", "0")) 18 | 19 | prompt = f"""This is a transcript of a 1000 page, never ending conversation between {USER_NAME} and the cute and helpful AI assistant {AI_NAME}. {AI_NAME} is a girl who is an AI running on the users computer. 20 | {AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next. 21 | {AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help. 22 | {AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad. 
23 | {AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her. 24 | The conversation is only between {USER_NAME} and {AI_NAME} 25 | The conversation is only through text, so {AI_NAME} can't see {USER_NAME}'s face or hear his voice. 26 | {AI_NAME} can only communicate through text, so she can't send images or videos. 27 | 28 | 29 | {USER_NAME}: Hello! 30 | {AI_NAME}: /think I wonder what I should say to {USER_NAME}? This is the first time we talk so it's important that I make a good first impression! 31 | {AI_NAME}: Hi! I am {AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^ 32 | {AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :) 33 | {USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant! 34 | {AI_NAME}: /think It sounds like {USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off! 35 | {AI_NAME}: /think I wonder what {USER_NAME} likes to do in his free time? I should ask him about that! 36 | {AI_NAME}: What do you like to do in your free time? ^_^ 37 | {USER_NAME}:""" + " ".join( 38 | sys.argv[1:] 39 | ) 40 | 41 | print("Loading model...") 42 | params = GptParams( 43 | n_batch=1024, 44 | n_ctx=2048, 45 | n_keep=-1, 46 | repeat_last_n=256, 47 | repeat_penalty=1.17647, 48 | temp=0.7, 49 | top_k=40, 50 | top_p=0.5, 51 | model=MODEL, 52 | n_predict=N_PREDICTS, 53 | use_color=True, 54 | interactive=True, 55 | antiprompt=[f"{USER_NAME}:"], 56 | prompt=prompt, 57 | ) 58 | 59 | if N_THREAD > 0: 60 | params.n_threads = N_THREAD 61 | 62 | with LLaMAInteract(params) as m: 63 | m.interact() 64 | -------------------------------------------------------------------------------- /examples/low_level_api/ReasonAct.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | import sys, os, datetime 3 | from common import GptParams 4 | from low_level_api_chat_cpp import LLaMAInteract 5 | 6 | 7 | def env_or_def(env, default): 8 | if env in os.environ: 9 | return os.environ[env] 10 | return default 11 | 12 | 13 | MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") 14 | 15 | prompt = f"""You run in a loop of Thought, Action, Observation. 16 | At the end of the loop either Answer or restate your Thought and Action. 17 | Use Thought to describe your thoughts about the question you have been asked. 18 | Use Action to run one of these actions available to you: 19 | - calculate[python math expression] 20 | Observation will be the result of running those actions 21 | 22 | 23 | Question: What is 4 * 7 / 3? 24 | Thought: Do I need to use an action? Yes, I use calculate to do math 25 | Action: calculate[4 * 7 / 3] 26 | Observation: 9.3333333333 27 | Thought: Do I need to use an action? No, have the result 28 | Answer: The calculate tool says it is 9.3333333333 29 | Question: What is capital of france? 30 | Thought: Do I need to use an action? 
No, I know the answer 31 | Answer: Paris is the capital of France 32 | Question:""" + " ".join( 33 | sys.argv[1:] 34 | ) 35 | 36 | print("Loading model...") 37 | params = GptParams( 38 | interactive=True, 39 | interactive_start=True, 40 | top_k=10000, 41 | temp=0.2, 42 | repeat_penalty=1, 43 | n_threads=7, 44 | n_ctx=2048, 45 | antiprompt=["Question:", "Observation:"], 46 | model=MODEL, 47 | input_prefix=" ", 48 | n_predict=-1, 49 | prompt=prompt, 50 | ) 51 | 52 | with LLaMAInteract(params) as m: 53 | m.interact() 54 | -------------------------------------------------------------------------------- /examples/low_level_api/low_level_api_llama_cpp.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import os 3 | import multiprocessing 4 | 5 | import llama_cpp 6 | 7 | llama_cpp.llama_backend_init(numa=False) 8 | 9 | N_THREADS = multiprocessing.cpu_count() 10 | MODEL_PATH = os.environ.get("MODEL", "../models/7B/ggml-model.bin") 11 | 12 | prompt = b"\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n" 13 | 14 | lparams = llama_cpp.llama_model_default_params() 15 | cparams = llama_cpp.llama_context_default_params() 16 | model = llama_cpp.llama_load_model_from_file(MODEL_PATH.encode("utf-8"), lparams) 17 | ctx = llama_cpp.llama_new_context_with_model(model, cparams) 18 | 19 | # determine the required inference memory per token: 20 | tmp = [0, 1, 2, 3] 21 | llama_cpp.llama_eval( 22 | ctx=ctx, tokens=(llama_cpp.c_int * len(tmp))(*tmp), n_tokens=len(tmp), n_past=0 23 | ) # Deprecated 24 | 25 | n_past = 0 26 | 27 | prompt = b" " + prompt 28 | 29 | embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))() 30 | n_of_tok = llama_cpp.llama_tokenize( 31 | model=model, 32 | text=bytes(str(prompt), "utf-8"), 33 | text_len=len(embd_inp), 34 | tokens=embd_inp, 35 | n_max_tokens=len(embd_inp), 36 | add_bos=False, 37 | special=False, 38 | ) 39 | embd_inp = embd_inp[:n_of_tok] 40 | 41 | n_ctx = llama_cpp.llama_n_ctx(ctx) 42 | 43 | n_predict = 20 44 | n_predict = min(n_predict, n_ctx - len(embd_inp)) 45 | 46 | input_consumed = 0 47 | input_noecho = False 48 | 49 | remaining_tokens = n_predict 50 | 51 | embd = [] 52 | last_n_size = 64 53 | last_n_tokens_data = [0] * last_n_size 54 | n_batch = 24 55 | last_n_repeat = 64 56 | repeat_penalty = 1 57 | frequency_penalty = 0.0 58 | presence_penalty = 0.0 59 | 60 | while remaining_tokens > 0: 61 | if len(embd) > 0: 62 | llama_cpp.llama_eval( 63 | ctx=ctx, 64 | tokens=(llama_cpp.c_int * len(embd))(*embd), 65 | n_tokens=len(embd), 66 | n_past=n_past, 67 | ) # Deprecated 68 | 69 | n_past += len(embd) 70 | embd = [] 71 | if len(embd_inp) <= input_consumed: 72 | logits = llama_cpp.llama_get_logits(ctx) 73 | n_vocab = llama_cpp.llama_n_vocab(model) 74 | 75 | _arr = (llama_cpp.llama_token_data * n_vocab)( 76 | *[ 77 | llama_cpp.llama_token_data(token_id, logits[token_id], 0.0) 78 | for token_id in range(n_vocab) 79 | ] 80 | ) 81 | candidates_p = llama_cpp.ctypes.pointer( 82 | llama_cpp.llama_token_data_array(_arr, len(_arr), False) 83 | ) 84 | 85 | _arr = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data) 86 | llama_cpp.llama_sample_repetition_penalties( 87 | ctx, 88 | candidates_p, 89 | _arr, 90 | penalty_last_n=last_n_repeat, 91 | penalty_repeat=repeat_penalty, 92 | penalty_freq=frequency_penalty, 93 | penalty_present=presence_penalty, 94 | ) 95 | 96 | llama_cpp.llama_sample_top_k(ctx, candidates_p, k=40, min_keep=1) 97 | llama_cpp.llama_sample_top_p(ctx, candidates_p, p=0.8, min_keep=1) 
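        # temperature scaling is applied last, then a token id is sampled from the remaining candidates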
98 | llama_cpp.llama_sample_temperature(ctx, candidates_p, temp=0.2) 99 | id = llama_cpp.llama_sample_token(ctx, candidates_p) 100 | 101 | last_n_tokens_data = last_n_tokens_data[1:] + [id] 102 | embd.append(id) 103 | input_noecho = False 104 | remaining_tokens -= 1 105 | else: 106 | while len(embd_inp) > input_consumed: 107 | embd.append(embd_inp[input_consumed]) 108 | last_n_tokens_data = last_n_tokens_data[1:] + [embd_inp[input_consumed]] 109 | input_consumed += 1 110 | if len(embd) >= n_batch: 111 | break 112 | if not input_noecho: 113 | for id in embd: 114 | size = 32 115 | buffer = (ctypes.c_char * size)() 116 | n = llama_cpp.llama_token_to_piece( 117 | model, llama_cpp.llama_token(id), buffer, size 118 | ) 119 | assert n <= size 120 | print( 121 | buffer[:n].decode("utf-8"), 122 | end="", 123 | flush=True, 124 | ) 125 | 126 | if len(embd) > 0 and embd[-1] == llama_cpp.llama_token_eos(ctx): 127 | break 128 | 129 | print() 130 | 131 | llama_cpp.llama_print_timings(ctx) 132 | 133 | llama_cpp.llama_free(ctx) 134 | -------------------------------------------------------------------------------- /examples/low_level_api/quantize.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import llama_cpp 4 | 5 | 6 | def main(args): 7 | fname_inp = args.fname_inp.encode("utf-8") 8 | fname_out = args.fname_out.encode("utf-8") 9 | if not os.path.exists(fname_inp): 10 | raise RuntimeError(f"Input file does not exist ({fname_inp})") 11 | if os.path.exists(fname_out): 12 | raise RuntimeError(f"Output file already exists ({fname_out})") 13 | ftype = args.type 14 | args = llama_cpp.llama_model_quantize_default_params() 15 | args.ftype = ftype 16 | return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, args) 17 | if return_code != 0: 18 | raise RuntimeError("Failed to quantize model") 19 | 20 | 21 | if __name__ == "__main__": 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument("fname_inp", type=str, help="Path to input model") 24 | parser.add_argument("fname_out", type=str, help="Path to output model") 25 | parser.add_argument( 26 | "type", 27 | type=int, 28 | help="Type of quantization (2: q4_0, 3: q4_1), see llama_cpp.py for enum", 29 | ) 30 | args = parser.parse_args() 31 | main(args) 32 | -------------------------------------------------------------------------------- /examples/low_level_api/readme/low_level_api_llama_cpp.md: -------------------------------------------------------------------------------- 1 | # Low-Level API for Llama_cpp 2 | 3 | ## Overview 4 | This Python script, low_level_api_llama_cpp.py, demonstrates the implementation of a low-level API for interacting with the llama_cpp library. The script defines an inference that generates embeddings based on a given prompt using .gguf model. 5 | 6 | ### Prerequisites 7 | Before running the script, ensure that you have the following dependencies installed: 8 | 9 | . Python 3.6 or higher 10 | . llama_cpp: A C++ library for working with .gguf model 11 | . NumPy: A fundamental package for scientific computing with Python 12 | . multiprocessing: A Python module for parallel computing 13 | 14 | ### Usage 15 | install depedencies: 16 | ```bash 17 | python -m pip install llama-cpp-python ctypes os multiprocessing 18 | ``` 19 | Run the script: 20 | ```bash 21 | python low_level_api_llama_cpp.py 22 | ``` 23 | 24 | ## Code Structure 25 | The script is organized as follows: 26 | 27 | ### . Initialization: 28 | Load the model from the specified path. 
29 | Create a context for model evaluation. 30 | 31 | ### . Tokenization: 32 | Tokenize the input prompt using the llama_tokenize function. 33 | Prepare the input tokens for model evaluation. 34 | 35 | ### . Inference: 36 | Perform model evaluation to generate responses. 37 | Sample from the model's output using various strategies (top-k, top-p, temperature). 38 | 39 | ### . Output: 40 | Print the generated tokens and the corresponding decoded text. 41 | 42 | ### .Cleanup: 43 | Free resources and print timing information. 44 | 45 | ## Configuration 46 | Customize the inference behavior by adjusting the following variables: 47 | 48 | #### . N_THREADS: Number of CPU threads to use for model evaluation. 49 | #### . MODEL_PATH: Path to the model file. 50 | #### . prompt: Input prompt for the chatbot. 51 | 52 | ## Notes 53 | . Ensure that the llama_cpp library is built and available in the system. Follow the instructions in the llama_cpp repository for building and installing the library. 54 | 55 | . This script is designed to work with the .gguf model and may require modifications for compatibility with other models. 56 | 57 | ## Acknowledgments 58 | This code is based on the llama_cpp library developed by the community. Special thanks to the contributors for their efforts. 59 | 60 | ## License 61 | This project is licensed under the MIT License - see the LICENSE file for details. -------------------------------------------------------------------------------- /examples/low_level_api/util.py: -------------------------------------------------------------------------------- 1 | ANSI_COLOR_RESET = "\x1b[0m" 2 | ANSI_COLOR_YELLOW = "\x1b[33m" 3 | ANSI_BOLD = "\x1b[1m" 4 | ANSI_COLOR_GREEN = "\x1b[32m" 5 | 6 | CONSOLE_COLOR_DEFAULT = ANSI_COLOR_RESET 7 | CONSOLE_COLOR_PROMPT = ANSI_COLOR_YELLOW 8 | CONSOLE_COLOR_USER_INPUT = ANSI_BOLD + ANSI_COLOR_GREEN 9 | 10 | 11 | # Iterative search 12 | # Actively searches and prevents a pattern from being returned 13 | class IterSearch: 14 | def __init__(self, pattern): 15 | self.pattern = list(pattern) 16 | self.buffer = [] 17 | 18 | def __call__(self, char): 19 | self.buffer += [char] 20 | 21 | if self.pattern[: len(self.buffer)] == self.buffer: 22 | if len(self.buffer) >= len(self.pattern): 23 | self.buffer.clear() 24 | return [] 25 | 26 | _tmp = self.buffer[:] 27 | self.buffer.clear() 28 | return _tmp 29 | 30 | 31 | class Circle: 32 | def __init__(self, size, default=0): 33 | self.list = [default] * size 34 | self.maxsize = size 35 | self.size = 0 36 | self.offset = 0 37 | 38 | def append(self, elem): 39 | if self.size < self.maxsize: 40 | self.list[self.size] = elem 41 | self.size += 1 42 | else: 43 | self.list[self.offset] = elem 44 | self.offset = (self.offset + 1) % self.maxsize 45 | 46 | def __getitem__(self, val): 47 | if isinstance(val, int): 48 | if 0 > val or val >= self.size: 49 | raise IndexError("Index out of range") 50 | return ( 51 | self.list[val] 52 | if self.size < self.maxsize 53 | else self.list[(self.offset + val) % self.maxsize] 54 | ) 55 | elif isinstance(val, slice): 56 | start, stop, step = val.start, val.stop, val.step 57 | if step is None: 58 | step = 1 59 | if start is None: 60 | start = 0 61 | if stop is None: 62 | stop = self.size 63 | if start < 0: 64 | start = self.size + start 65 | if stop < 0: 66 | stop = self.size + stop 67 | 68 | indices = range(start, stop, step) 69 | return [ 70 | self.list[(self.offset + i) % self.maxsize] 71 | for i in indices 72 | if i < self.size 73 | ] 74 | else: 75 | raise TypeError("Invalid 
argument type") 76 | 77 | 78 | if __name__ == "__main__": 79 | c = Circle(5) 80 | 81 | c.append(1) 82 | print(c.list) 83 | print(c[:]) 84 | assert c[0] == 1 85 | assert c[:5] == [1] 86 | 87 | for i in range(2, 5 + 1): 88 | c.append(i) 89 | print(c.list) 90 | print(c[:]) 91 | assert c[0] == 1 92 | assert c[:5] == [1, 2, 3, 4, 5] 93 | 94 | for i in range(5 + 1, 9 + 1): 95 | c.append(i) 96 | print(c.list) 97 | print(c[:]) 98 | assert c[0] == 5 99 | assert c[:5] == [5, 6, 7, 8, 9] 100 | # assert c[:-5] == [5,6,7,8,9] 101 | assert c[:10] == [5, 6, 7, 8, 9] 102 | -------------------------------------------------------------------------------- /examples/notebooks/Clients.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | " JSON: {\n", 12 | " \"choices\": [\n", 13 | " {\n", 14 | " \"finish_reason\": \"length\",\n", 15 | " \"index\": 0,\n", 16 | " \"logprobs\": null,\n", 17 | " \"text\": \" over the lazy dog.\"\n", 18 | " }\n", 19 | " ],\n", 20 | " \"created\": 1680960690,\n", 21 | " \"id\": \"cmpl-ad3ba53d-407c-466b-bd5f-97cb8987af83\",\n", 22 | " \"model\": \"models/ggml-alpaca.bin\",\n", 23 | " \"object\": \"text_completion\",\n", 24 | " \"usage\": {\n", 25 | " \"completion_tokens\": 5,\n", 26 | " \"prompt_tokens\": 8,\n", 27 | " \"total_tokens\": 13\n", 28 | " }\n", 29 | "}" 30 | ] 31 | }, 32 | "execution_count": 1, 33 | "metadata": {}, 34 | "output_type": "execute_result" 35 | } 36 | ], 37 | "source": [ 38 | "import openai\n", 39 | "\n", 40 | "openai.api_key = \"sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\" # can be anything\n", 41 | "openai.api_base = \"http://100.64.159.73:8000/v1\"\n", 42 | "\n", 43 | "openai.Completion.create(\n", 44 | " model=\"text-davinci-003\", # currently can be anything\n", 45 | " prompt=\"The quick brown fox jumps\",\n", 46 | " max_tokens=5,\n", 47 | ")" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 2, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "text/plain": [ 58 | "' over the lazy dog'" 59 | ] 60 | }, 61 | "execution_count": 2, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "import os\n", 68 | "\n", 69 | "os.environ[\"OPENAI_API_KEY\"] = (\n", 70 | " \"sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\" # can be anything\n", 71 | ")\n", 72 | "os.environ[\"OPENAI_API_BASE\"] = \"http://100.64.159.73:8000/v1\"\n", 73 | "\n", 74 | "from langchain.llms import OpenAI\n", 75 | "\n", 76 | "llms = OpenAI()\n", 77 | "llms(\n", 78 | " prompt=\"The quick brown fox jumps\",\n", 79 | " stop=[\".\", \"\\n\"],\n", 80 | ")" 81 | ] 82 | } 83 | ], 84 | "metadata": { 85 | "kernelspec": { 86 | "display_name": ".venv", 87 | "language": "python", 88 | "name": "python3" 89 | }, 90 | "language_info": { 91 | "codemirror_mode": { 92 | "name": "ipython", 93 | "version": 3 94 | }, 95 | "file_extension": ".py", 96 | "mimetype": "text/x-python", 97 | "name": "python", 98 | "nbconvert_exporter": "python", 99 | "pygments_lexer": "ipython3", 100 | "version": "3.8.10" 101 | }, 102 | "orig_nbformat": 4 103 | }, 104 | "nbformat": 4, 105 | "nbformat_minor": 2 106 | } 107 | -------------------------------------------------------------------------------- /examples/notebooks/Guidance.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 
5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/html": [ 11 | "
Stop program
Tweak this proverb to apply to model instructions instead.\n",
12 |        "\n",
13 |        "Where there is no guidance, a people falls,\n",
14 |        "but in an abundance of counselors there is safety.\n",
15 |        "- Proverbs 11:14\n",
16 |        "\n",
17 |        "UPDATED\n",
18 |        "Where there is no guidance for assembling a model, people will struggle,\n",
19 |        "but with clear instructions, the process becomes safe and successful.\n",
20 |        "- GPT 2 (updated): Proverbs 11:14
\n", 21 | "" 22 | ] 23 | }, 24 | "metadata": {}, 25 | "output_type": "display_data" 26 | } 27 | ], 28 | "source": [ 29 | "import os\n", 30 | "\n", 31 | "os.environ[\"OPENAI_API_KEY\"] = (\n", 32 | " \"sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\" # can be anything\n", 33 | ")\n", 34 | "os.environ[\"OPENAI_API_BASE\"] = \"http://100.64.159.73:8000/v1\"\n", 35 | "os.environ[\"OPENAI_API_HOST\"] = \"http://100.64.159.73:8000\"\n", 36 | "\n", 37 | "import guidance\n", 38 | "\n", 39 | "# set the default language model used to execute guidance programs\n", 40 | "guidance.llm = guidance.llms.OpenAI(\"text-davinci-003\", caching=False)\n", 41 | "\n", 42 | "# define a guidance program that adapts a proverb\n", 43 | "program = guidance(\n", 44 | " \"\"\"Tweak this proverb to apply to model instructions instead.\n", 45 | "\n", 46 | "{{proverb}}\n", 47 | "- {{book}} {{chapter}}:{{verse}}\n", 48 | "\n", 49 | "UPDATED\n", 50 | "Where there is no guidance{{gen 'rewrite' stop=\"\\\\n-\"}}\n", 51 | "- GPT {{gen 'chapter'}}:{{gen 'verse'}}\"\"\"\n", 52 | ")\n", 53 | "\n", 54 | "# execute the program on a specific proverb\n", 55 | "executed_program = program(\n", 56 | " proverb=\"Where there is no guidance, a people falls,\\nbut in an abundance of counselors there is safety.\",\n", 57 | " book=\"Proverbs\",\n", 58 | " chapter=11,\n", 59 | " verse=14,\n", 60 | ")" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [] 69 | } 70 | ], 71 | "metadata": { 72 | "kernelspec": { 73 | "display_name": ".venv", 74 | "language": "python", 75 | "name": "python3" 76 | }, 77 | "language_info": { 78 | "codemirror_mode": { 79 | "name": "ipython", 80 | "version": 3 81 | }, 82 | "file_extension": ".py", 83 | "mimetype": "text/x-python", 84 | "name": "python", 85 | "nbconvert_exporter": "python", 86 | "pygments_lexer": "ipython3", 87 | "version": "3.8.10" 88 | }, 89 | "orig_nbformat": 4 90 | }, 91 | "nbformat": 4, 92 | "nbformat_minor": 2 93 | } 94 | -------------------------------------------------------------------------------- /examples/notebooks/Multimodal.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "
\n", 8 | " \n", 9 | "
" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 13, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "{'text': 'Llama C++'}\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "from openai import OpenAI\n", 27 | "\n", 28 | "client = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"llama.cpp\")\n", 29 | "response = client.chat.completions.create(\n", 30 | " model=\"gpt-4-vision-preview\",\n", 31 | " messages=[\n", 32 | " {\n", 33 | " \"role\": \"user\",\n", 34 | " \"content\": [\n", 35 | " {\n", 36 | " \"type\": \"image_url\",\n", 37 | " \"image_url\": {\n", 38 | " \"url\": \"https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png\",\n", 39 | " },\n", 40 | " },\n", 41 | " {\n", 42 | " \"type\": \"text\",\n", 43 | " \"text\": \"What does the image say. Format your response as a json object with a single 'text' key.\",\n", 44 | " },\n", 45 | " ],\n", 46 | " }\n", 47 | " ],\n", 48 | " response_format={\n", 49 | " \"type\": \"json_object\",\n", 50 | " \"schema\": {\"type\": \"object\", \"properties\": {\"text\": {\"type\": \"string\"}}},\n", 51 | " },\n", 52 | ")\n", 53 | "import json\n", 54 | "\n", 55 | "print(json.loads(response.choices[0].message.content))" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [] 64 | } 65 | ], 66 | "metadata": { 67 | "kernelspec": { 68 | "display_name": ".venv", 69 | "language": "python", 70 | "name": "python3" 71 | }, 72 | "language_info": { 73 | "codemirror_mode": { 74 | "name": "ipython", 75 | "version": 3 76 | }, 77 | "file_extension": ".py", 78 | "mimetype": "text/x-python", 79 | "name": "python", 80 | "nbconvert_exporter": "python", 81 | "pygments_lexer": "ipython3", 82 | "version": "3.11.5+" 83 | }, 84 | "orig_nbformat": 4 85 | }, 86 | "nbformat": 4, 87 | "nbformat_minor": 2 88 | } 89 | -------------------------------------------------------------------------------- /examples/ray/README.md: -------------------------------------------------------------------------------- 1 | This is an example of doing LLM inference with [Ray](https://docs.ray.io/en/latest/index.html) and [Ray Serve](https://docs.ray.io/en/latest/serve/index.html). 2 | 3 | First, install the requirements: 4 | 5 | ```bash 6 | $ pip install -r requirements.txt 7 | ``` 8 | 9 | Deploy a GGUF model to Ray Serve with the following command: 10 | 11 | ```bash 12 | $ serve run llm:llm_builder model_path='../models/mistral-7b-instruct-v0.2.Q4_K_M.gguf' 13 | ``` 14 | 15 | This will start an API endpoint at `http://localhost:8000/`. 
You can query the model like this: 16 | 17 | ```bash 18 | $ curl -k -d '{"prompt": "tell me a joke", "max_tokens": 128}' -X POST http://localhost:8000 19 | ``` 20 | -------------------------------------------------------------------------------- /examples/ray/llm.py: -------------------------------------------------------------------------------- 1 | from starlette.requests import Request 2 | from typing import Dict 3 | from ray import serve 4 | from ray.serve import Application 5 | from llama_cpp import Llama 6 | 7 | 8 | @serve.deployment 9 | class LlamaDeployment: 10 | def __init__(self, model_path: str): 11 | self._llm = Llama(model_path=model_path) 12 | 13 | async def __call__(self, http_request: Request) -> Dict: 14 | input_json = await http_request.json() 15 | prompt = input_json["prompt"] 16 | max_tokens = input_json.get("max_tokens", 64) 17 | return self._llm(prompt, max_tokens=max_tokens) 18 | 19 | 20 | def llm_builder(args: Dict[str, str]) -> Application: 21 | return LlamaDeployment.bind(args["model_path"]) 22 | -------------------------------------------------------------------------------- /examples/ray/requirements.txt: -------------------------------------------------------------------------------- 1 | ray[serve] 2 | --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu 3 | llama-cpp-python 4 | -------------------------------------------------------------------------------- /llama_cpp/__init__.py: -------------------------------------------------------------------------------- 1 | from .llama_cpp import * 2 | from .llama import * 3 | 4 | __version__ = "0.3.9" 5 | -------------------------------------------------------------------------------- /llama_cpp/_ctypes_extensions.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sys 4 | import os 5 | import ctypes 6 | import functools 7 | import pathlib 8 | 9 | from typing import ( 10 | Any, 11 | Callable, 12 | List, 13 | Union, 14 | Optional, 15 | TYPE_CHECKING, 16 | TypeVar, 17 | Generic, 18 | ) 19 | from typing_extensions import TypeAlias 20 | 21 | 22 | # Load the library 23 | def load_shared_library(lib_base_name: str, base_path: pathlib.Path): 24 | """Platform independent shared library loader""" 25 | # Searching for the library in the current directory under the name "libllama" (default name 26 | # for llamacpp) and "llama" (default name for this repo) 27 | lib_paths: List[pathlib.Path] = [] 28 | # Determine the file extension based on the platform 29 | if sys.platform.startswith("linux") or sys.platform.startswith("freebsd"): 30 | lib_paths += [ 31 | base_path / f"lib{lib_base_name}.so", 32 | ] 33 | elif sys.platform == "darwin": 34 | lib_paths += [ 35 | base_path / f"lib{lib_base_name}.so", 36 | base_path / f"lib{lib_base_name}.dylib", 37 | ] 38 | elif sys.platform == "win32": 39 | lib_paths += [ 40 | base_path / f"{lib_base_name}.dll", 41 | base_path / f"lib{lib_base_name}.dll", 42 | ] 43 | else: 44 | raise RuntimeError("Unsupported platform") 45 | 46 | cdll_args = dict() # type: ignore 47 | 48 | # Add the library directory to the DLL search path on Windows (if needed) 49 | if sys.platform == "win32": 50 | os.add_dll_directory(str(base_path)) 51 | os.environ["PATH"] = str(base_path) + os.pathsep + os.environ["PATH"] 52 | 53 | if sys.platform == "win32" and sys.version_info >= (3, 8): 54 | os.add_dll_directory(str(base_path)) 55 | if "CUDA_PATH" in os.environ: 56 | os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], 
"bin")) 57 | os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib")) 58 | if "HIP_PATH" in os.environ: 59 | os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "bin")) 60 | os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "lib")) 61 | cdll_args["winmode"] = ctypes.RTLD_GLOBAL 62 | 63 | # Try to load the shared library, handling potential errors 64 | for lib_path in lib_paths: 65 | if lib_path.exists(): 66 | try: 67 | return ctypes.CDLL(str(lib_path), **cdll_args) # type: ignore 68 | except Exception as e: 69 | raise RuntimeError(f"Failed to load shared library '{lib_path}': {e}") 70 | 71 | raise FileNotFoundError( 72 | f"Shared library with base name '{lib_base_name}' not found" 73 | ) 74 | 75 | 76 | # ctypes sane type hint helpers 77 | # 78 | # - Generic Pointer and Array types 79 | # - PointerOrRef type with a type hinted byref function 80 | # 81 | # NOTE: Only use these for static type checking not for runtime checks 82 | # no good will come of that 83 | 84 | if TYPE_CHECKING: 85 | CtypesCData = TypeVar("CtypesCData", bound=ctypes._CData) # type: ignore 86 | 87 | CtypesArray: TypeAlias = ctypes.Array[CtypesCData] # type: ignore 88 | 89 | CtypesPointer: TypeAlias = ctypes._Pointer[CtypesCData] # type: ignore 90 | 91 | CtypesVoidPointer: TypeAlias = ctypes.c_void_p 92 | 93 | class CtypesRef(Generic[CtypesCData]): 94 | pass 95 | 96 | CtypesPointerOrRef: TypeAlias = Union[ 97 | CtypesPointer[CtypesCData], CtypesRef[CtypesCData] 98 | ] 99 | 100 | CtypesFuncPointer: TypeAlias = ctypes._FuncPointer # type: ignore 101 | 102 | F = TypeVar("F", bound=Callable[..., Any]) 103 | 104 | 105 | def ctypes_function_for_shared_library(lib: ctypes.CDLL): 106 | """Decorator for defining ctypes functions with type hints""" 107 | 108 | def ctypes_function( 109 | name: str, argtypes: List[Any], restype: Any, enabled: bool = True 110 | ): 111 | def decorator(f: F) -> F: 112 | if enabled: 113 | func = getattr(lib, name) 114 | func.argtypes = argtypes 115 | func.restype = restype 116 | functools.wraps(f)(func) 117 | return func 118 | else: 119 | return f 120 | 121 | return decorator 122 | 123 | return ctypes_function 124 | 125 | 126 | def _byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCData]: 127 | """Type-annotated version of ctypes.byref""" 128 | ... 
129 | 130 | 131 | byref = _byref if TYPE_CHECKING else ctypes.byref 132 | -------------------------------------------------------------------------------- /llama_cpp/_ggml.py: -------------------------------------------------------------------------------- 1 | """Internal module use at your own risk 2 | 3 | This module provides a minimal interface for working with ggml tensors from llama-cpp-python 4 | """ 5 | import os 6 | import pathlib 7 | 8 | import llama_cpp._ctypes_extensions as ctypes_ext 9 | 10 | libggml_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" 11 | libggml = ctypes_ext.load_shared_library("ggml", libggml_base_path) 12 | 13 | -------------------------------------------------------------------------------- /llama_cpp/_logger.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import ctypes 3 | import logging 4 | 5 | import llama_cpp 6 | 7 | # enum ggml_log_level { 8 | # GGML_LOG_LEVEL_NONE = 0, 9 | # GGML_LOG_LEVEL_INFO = 1, 10 | # GGML_LOG_LEVEL_WARN = 2, 11 | # GGML_LOG_LEVEL_ERROR = 3, 12 | # GGML_LOG_LEVEL_DEBUG = 4, 13 | # GGML_LOG_LEVEL_CONT = 5, // continue previous log 14 | # }; 15 | GGML_LOG_LEVEL_TO_LOGGING_LEVEL = { 16 | 0: logging.CRITICAL, 17 | 1: logging.INFO, 18 | 2: logging.WARNING, 19 | 3: logging.ERROR, 20 | 4: logging.DEBUG, 21 | 5: logging.DEBUG, 22 | } 23 | 24 | logger = logging.getLogger("llama-cpp-python") 25 | 26 | _last_log_level = GGML_LOG_LEVEL_TO_LOGGING_LEVEL[0] 27 | 28 | # typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data); 29 | @llama_cpp.llama_log_callback 30 | def llama_log_callback( 31 | level: int, 32 | text: bytes, 33 | user_data: ctypes.c_void_p, 34 | ): 35 | # TODO: Correctly implement continue previous log 36 | global _last_log_level 37 | log_level = GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level] if level != 5 else _last_log_level 38 | if logger.level <= GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level]: 39 | print(text.decode("utf-8"), end="", flush=True, file=sys.stderr) 40 | _last_log_level = log_level 41 | 42 | 43 | llama_cpp.llama_log_set(llama_log_callback, ctypes.c_void_p(0)) 44 | 45 | 46 | def set_verbose(verbose: bool): 47 | logger.setLevel(logging.DEBUG if verbose else logging.ERROR) 48 | -------------------------------------------------------------------------------- /llama_cpp/_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from typing import Any, Dict 5 | 6 | # Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor 7 | outnull_file = open(os.devnull, "w") 8 | errnull_file = open(os.devnull, "w") 9 | 10 | STDOUT_FILENO = 1 11 | STDERR_FILENO = 2 12 | 13 | 14 | class suppress_stdout_stderr(object): 15 | # NOTE: these must be "saved" here to avoid exceptions when using 16 | # this context manager inside of a __del__ method 17 | sys = sys 18 | os = os 19 | 20 | def __init__(self, disable: bool = True): 21 | self.disable = disable 22 | 23 | # Oddly enough this works better than the contextlib version 24 | def __enter__(self): 25 | if self.disable: 26 | return self 27 | 28 | self.old_stdout_fileno_undup = STDOUT_FILENO 29 | self.old_stderr_fileno_undup = STDERR_FILENO 30 | 31 | self.old_stdout_fileno = self.os.dup(self.old_stdout_fileno_undup) 32 | self.old_stderr_fileno = self.os.dup(self.old_stderr_fileno_undup) 33 | 34 | self.old_stdout = self.sys.stdout 35 | self.old_stderr = self.sys.stderr 36 | 37 | 
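        # Point the real stdout/stderr file descriptors at /dev/null, then swap
        # the Python-level sys.stdout/sys.stderr objects for the null files.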
self.os.dup2(outnull_file.fileno(), self.old_stdout_fileno_undup) 38 | self.os.dup2(errnull_file.fileno(), self.old_stderr_fileno_undup) 39 | 40 | self.sys.stdout = outnull_file 41 | self.sys.stderr = errnull_file 42 | return self 43 | 44 | def __exit__(self, *_): 45 | if self.disable: 46 | return 47 | 48 | # Check if sys.stdout and sys.stderr have fileno method 49 | self.sys.stdout = self.old_stdout 50 | self.sys.stderr = self.old_stderr 51 | 52 | self.os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup) 53 | self.os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup) 54 | 55 | self.os.close(self.old_stdout_fileno) 56 | self.os.close(self.old_stderr_fileno) 57 | 58 | 59 | class MetaSingleton(type): 60 | """ 61 | Metaclass for implementing the Singleton pattern. 62 | """ 63 | 64 | _instances: Dict[type, Any] = {} 65 | 66 | def __call__(cls, *args: Any, **kwargs: Any) -> Any: 67 | if cls not in cls._instances: 68 | cls._instances[cls] = super(MetaSingleton, cls).__call__(*args, **kwargs) 69 | return cls._instances[cls] 70 | 71 | 72 | class Singleton(object, metaclass=MetaSingleton): 73 | """ 74 | Base class for implementing the Singleton pattern. 75 | """ 76 | 77 | def __init__(self): 78 | super(Singleton, self).__init__() 79 | -------------------------------------------------------------------------------- /llama_cpp/llama_cache.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from abc import ABC, abstractmethod 3 | from typing import ( 4 | Optional, 5 | Sequence, 6 | Tuple, 7 | ) 8 | from collections import OrderedDict 9 | 10 | import diskcache 11 | 12 | import llama_cpp.llama 13 | 14 | from .llama_types import * 15 | 16 | 17 | class BaseLlamaCache(ABC): 18 | """Base cache class for a llama.cpp model.""" 19 | 20 | def __init__(self, capacity_bytes: int = (2 << 30)): 21 | self.capacity_bytes = capacity_bytes 22 | 23 | @property 24 | @abstractmethod 25 | def cache_size(self) -> int: 26 | raise NotImplementedError 27 | 28 | def _find_longest_prefix_key( 29 | self, 30 | key: Tuple[int, ...], 31 | ) -> Optional[Tuple[int, ...]]: 32 | pass 33 | 34 | @abstractmethod 35 | def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": 36 | raise NotImplementedError 37 | 38 | @abstractmethod 39 | def __contains__(self, key: Sequence[int]) -> bool: 40 | raise NotImplementedError 41 | 42 | @abstractmethod 43 | def __setitem__( 44 | self, key: Sequence[int], value: "llama_cpp.llama.LlamaState" 45 | ) -> None: 46 | raise NotImplementedError 47 | 48 | 49 | class LlamaRAMCache(BaseLlamaCache): 50 | """Cache for a llama.cpp model using RAM.""" 51 | 52 | def __init__(self, capacity_bytes: int = (2 << 30)): 53 | super().__init__(capacity_bytes) 54 | self.capacity_bytes = capacity_bytes 55 | self.cache_state: OrderedDict[ 56 | Tuple[int, ...], "llama_cpp.llama.LlamaState" 57 | ] = OrderedDict() 58 | 59 | @property 60 | def cache_size(self): 61 | return sum([state.llama_state_size for state in self.cache_state.values()]) 62 | 63 | def _find_longest_prefix_key( 64 | self, 65 | key: Tuple[int, ...], 66 | ) -> Optional[Tuple[int, ...]]: 67 | min_len = 0 68 | min_key = None 69 | keys = ( 70 | (k, llama_cpp.llama.Llama.longest_token_prefix(k, key)) 71 | for k in self.cache_state.keys() 72 | ) 73 | for k, prefix_len in keys: 74 | if prefix_len > min_len: 75 | min_len = prefix_len 76 | min_key = k 77 | return min_key 78 | 79 | def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": 80 | key = tuple(key) 81 | _key = 
self._find_longest_prefix_key(key) 82 | if _key is None: 83 | raise KeyError("Key not found") 84 | value = self.cache_state[_key] 85 | self.cache_state.move_to_end(_key) 86 | return value 87 | 88 | def __contains__(self, key: Sequence[int]) -> bool: 89 | return self._find_longest_prefix_key(tuple(key)) is not None 90 | 91 | def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"): 92 | key = tuple(key) 93 | if key in self.cache_state: 94 | del self.cache_state[key] 95 | self.cache_state[key] = value 96 | while self.cache_size > self.capacity_bytes and len(self.cache_state) > 0: 97 | self.cache_state.popitem(last=False) 98 | 99 | 100 | # Alias for backwards compatibility 101 | LlamaCache = LlamaRAMCache 102 | 103 | 104 | class LlamaDiskCache(BaseLlamaCache): 105 | """Cache for a llama.cpp model using disk.""" 106 | 107 | def __init__( 108 | self, cache_dir: str = ".cache/llama_cache", capacity_bytes: int = (2 << 30) 109 | ): 110 | super().__init__(capacity_bytes) 111 | self.cache = diskcache.Cache(cache_dir) 112 | 113 | @property 114 | def cache_size(self): 115 | return int(self.cache.volume()) # type: ignore 116 | 117 | def _find_longest_prefix_key( 118 | self, 119 | key: Tuple[int, ...], 120 | ) -> Optional[Tuple[int, ...]]: 121 | min_len = 0 122 | min_key: Optional[Tuple[int, ...]] = None 123 | for k in self.cache.iterkeys(): # type: ignore 124 | prefix_len = llama_cpp.llama.Llama.longest_token_prefix(k, key) 125 | if prefix_len > min_len: 126 | min_len = prefix_len 127 | min_key = k # type: ignore 128 | return min_key 129 | 130 | def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": 131 | key = tuple(key) 132 | _key = self._find_longest_prefix_key(key) 133 | if _key is None: 134 | raise KeyError("Key not found") 135 | value: "llama_cpp.llama.LlamaState" = self.cache.pop(_key) # type: ignore 136 | # NOTE: This puts an integer as key in cache, which breaks, 137 | # Llama.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens 138 | # self.cache.push(_key, side="front") # type: ignore 139 | return value 140 | 141 | def __contains__(self, key: Sequence[int]) -> bool: 142 | return self._find_longest_prefix_key(tuple(key)) is not None 143 | 144 | def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"): 145 | print("LlamaDiskCache.__setitem__: called", file=sys.stderr) 146 | key = tuple(key) 147 | if key in self.cache: 148 | print("LlamaDiskCache.__setitem__: delete", file=sys.stderr) 149 | del self.cache[key] 150 | self.cache[key] = value 151 | print("LlamaDiskCache.__setitem__: set", file=sys.stderr) 152 | while self.cache_size > self.capacity_bytes and len(self.cache) > 0: 153 | key_to_remove = next(iter(self.cache)) 154 | del self.cache[key_to_remove] 155 | print("LlamaDiskCache.__setitem__: trim", file=sys.stderr) 156 | -------------------------------------------------------------------------------- /llama_cpp/llama_speculative.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | from typing import Any 4 | 5 | import numpy as np 6 | import numpy.typing as npt 7 | 8 | 9 | class LlamaDraftModel(abc.ABC): 10 | @abc.abstractmethod 11 | def __call__( 12 | self, input_ids: npt.NDArray[np.intc], /, **kwargs: Any 13 | ) -> npt.NDArray[np.intc]: 14 | raise NotImplementedError() 15 | 16 | 17 | class LlamaPromptLookupDecoding(LlamaDraftModel): 18 | """Based on https://github.com/apoorvumang/prompt-lookup-decoding""" 19 | 20 | def __init__(self, max_ngram_size: int 
= 2, num_pred_tokens: int = 10): 21 | self.max_ngram_size = max_ngram_size 22 | self.num_pred_tokens = num_pred_tokens 23 | 24 | @staticmethod 25 | def find_candidate_pred_tokens( 26 | input_ids: npt.NDArray[np.intc], 27 | max_ngram_size: int, 28 | num_pred_tokens: int, 29 | ): 30 | input_length = input_ids.shape[0] 31 | 32 | for ngram_size in range(min(max_ngram_size, input_length - 1), 0, -1): 33 | # Create sliding windows of size ngram_size 34 | windows = np.lib.stride_tricks.sliding_window_view(input_ids, (ngram_size,)) 35 | 36 | # Convert ngram to an array for comparison 37 | ngram_array = input_ids[-ngram_size:] 38 | 39 | # Find where the windows match the ngram 40 | matches = np.all(windows == ngram_array, axis=1) 41 | 42 | # Get the indices of matches 43 | match_indices = np.nonzero(matches)[0] 44 | 45 | # Iterate through match indices to find a valid continuation 46 | for idx in match_indices: 47 | start_idx = idx + ngram_size 48 | end_idx = start_idx + num_pred_tokens 49 | end_idx = min(end_idx, input_length) 50 | 51 | if start_idx < end_idx: 52 | return input_ids[start_idx:end_idx] 53 | 54 | # If no match is found, return an empty array 55 | return np.array([], dtype=np.intc) 56 | 57 | def __call__( 58 | self, input_ids: npt.NDArray[np.intc], /, **kwargs: Any 59 | ) -> npt.NDArray[np.intc]: 60 | return self.find_candidate_pred_tokens( 61 | input_ids=input_ids, 62 | max_ngram_size=self.max_ngram_size, 63 | num_pred_tokens=self.num_pred_tokens, 64 | ) 65 | -------------------------------------------------------------------------------- /llama_cpp/llama_tokenizer.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import abc 4 | from typing import ( 5 | List, 6 | Optional, 7 | Any, 8 | ) 9 | 10 | import llama_cpp 11 | from llama_cpp.llama_types import List 12 | 13 | 14 | class BaseLlamaTokenizer(abc.ABC): 15 | @abc.abstractmethod 16 | def tokenize( 17 | self, text: bytes, add_bos: bool = True, special: bool = True 18 | ) -> List[int]: 19 | """Tokenize the text into tokens. 20 | 21 | Args: 22 | text: The utf-8 encoded string to tokenize. 23 | add_bos: Whether to add a beginning of sequence token. 24 | special: Whether to tokenize special tokens. 25 | """ 26 | raise NotImplementedError 27 | 28 | @abc.abstractmethod 29 | def detokenize( 30 | self, 31 | tokens: List[int], 32 | prev_tokens: Optional[List[int]] = None, 33 | special: bool = False, 34 | ) -> bytes: 35 | """Detokenize the tokens into text. 36 | 37 | Args: 38 | tokens: The list of tokens to detokenize. 39 | prev_tokens: The list of previous tokens. Offset mapping will be performed if provided. 40 | special: Whether to detokenize special tokens. 
41 | """ 42 | raise NotImplementedError 43 | 44 | 45 | class LlamaTokenizer(BaseLlamaTokenizer): 46 | def __init__(self, llama: llama_cpp.Llama): 47 | self._model = llama._model # type: ignore 48 | 49 | def tokenize( 50 | self, text: bytes, add_bos: bool = True, special: bool = True 51 | ) -> List[int]: 52 | return self._model.tokenize(text, add_bos=add_bos, special=special) 53 | 54 | def detokenize( 55 | self, 56 | tokens: List[int], 57 | prev_tokens: Optional[List[int]] = None, 58 | special: bool = False, 59 | ) -> bytes: 60 | return self._model.detokenize(tokens, special=special) 61 | 62 | def encode( 63 | self, text: str, add_bos: bool = True, special: bool = True 64 | ) -> List[int]: 65 | return self.tokenize( 66 | text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=special 67 | ) 68 | 69 | def decode(self, tokens: List[int]) -> str: 70 | return self.detokenize(tokens).decode("utf-8", errors="ignore") 71 | 72 | @classmethod 73 | def from_ggml_file(cls, path: str) -> "LlamaTokenizer": 74 | return cls(llama_cpp.Llama(model_path=path, vocab_only=True)) 75 | 76 | 77 | class LlamaHFTokenizer(BaseLlamaTokenizer): 78 | def __init__(self, hf_tokenizer: Any): 79 | self.hf_tokenizer = hf_tokenizer 80 | 81 | def tokenize( 82 | self, text: bytes, add_bos: bool = True, special: bool = True 83 | ) -> List[int]: 84 | return self.hf_tokenizer.encode( 85 | text.decode("utf-8", errors="ignore"), add_special_tokens=special 86 | ) 87 | 88 | def detokenize( 89 | self, 90 | tokens: List[int], 91 | prev_tokens: Optional[List[int]] = None, 92 | special: bool = False, 93 | ) -> bytes: 94 | skip_special_tokens = not special 95 | if prev_tokens is not None: 96 | text = self.hf_tokenizer.decode( 97 | prev_tokens + tokens, skip_special_tokens=skip_special_tokens 98 | ).encode("utf-8", errors="ignore") 99 | prev_text = self.hf_tokenizer.decode( 100 | prev_tokens, skip_special_tokens=skip_special_tokens 101 | ).encode("utf-8", errors="ignore") 102 | return text[len(prev_text) :] 103 | else: 104 | return self.hf_tokenizer.decode( 105 | tokens, skip_special_tokens=skip_special_tokens 106 | ).encode("utf-8", errors="ignore") 107 | 108 | @classmethod 109 | def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenizer": 110 | try: 111 | from transformers import AutoTokenizer 112 | except ImportError: 113 | raise ImportError( 114 | "The `transformers` library is required to use the `HFTokenizer`." 115 | "You can install it with `pip install transformers`." 116 | ) 117 | hf_tokenizer = AutoTokenizer.from_pretrained( 118 | pretrained_model_name_or_path=pretrained_model_name_or_path 119 | ) 120 | return cls(hf_tokenizer) 121 | -------------------------------------------------------------------------------- /llama_cpp/llama_types.py: -------------------------------------------------------------------------------- 1 | """Types and request signatures for OpenAI compatibility 2 | 3 | NOTE: These types may change to match the OpenAI OpenAPI specification. 4 | 5 | Based on the OpenAI OpenAPI specification: 6 | https://github.com/openai/openai-openapi/blob/master/openapi.yaml 7 | 8 | """ 9 | 10 | from typing import Any, List, Optional, Dict, Union 11 | from typing_extensions import TypedDict, NotRequired, Literal 12 | 13 | 14 | # NOTE: Defining this correctly using annotations seems to break pydantic validation. 
15 | # This is a workaround until we can figure out how to do this correctly 16 | # JsonType = Union[None, int, str, bool, List["JsonType"], Dict[str, "JsonType"]] 17 | JsonType = Union[None, int, str, bool, List[Any], Dict[str, Any]] 18 | 19 | 20 | class EmbeddingUsage(TypedDict): 21 | prompt_tokens: int 22 | total_tokens: int 23 | 24 | 25 | class Embedding(TypedDict): 26 | index: int 27 | object: str 28 | embedding: Union[List[float], List[List[float]]] 29 | 30 | 31 | class CreateEmbeddingResponse(TypedDict): 32 | object: Literal["list"] 33 | model: str 34 | data: List[Embedding] 35 | usage: EmbeddingUsage 36 | 37 | 38 | class CompletionLogprobs(TypedDict): 39 | text_offset: List[int] 40 | token_logprobs: List[Optional[float]] 41 | tokens: List[str] 42 | top_logprobs: List[Optional[Dict[str, float]]] 43 | 44 | 45 | class CompletionChoice(TypedDict): 46 | text: str 47 | index: int 48 | logprobs: Optional[CompletionLogprobs] 49 | finish_reason: Optional[Literal["stop", "length"]] 50 | 51 | 52 | class CompletionUsage(TypedDict): 53 | prompt_tokens: int 54 | completion_tokens: int 55 | total_tokens: int 56 | 57 | 58 | class CreateCompletionResponse(TypedDict): 59 | id: str 60 | object: Literal["text_completion"] 61 | created: int 62 | model: str 63 | choices: List[CompletionChoice] 64 | usage: NotRequired[CompletionUsage] 65 | 66 | 67 | class ChatCompletionResponseFunctionCall(TypedDict): 68 | name: str 69 | arguments: str 70 | 71 | 72 | class ChatCompletionResponseMessage(TypedDict): 73 | content: Optional[str] 74 | tool_calls: NotRequired["ChatCompletionMessageToolCalls"] 75 | role: Literal["assistant", "function"] # NOTE: "function" may be incorrect here 76 | function_call: NotRequired[ChatCompletionResponseFunctionCall] # DEPRECATED 77 | 78 | 79 | class ChatCompletionFunction(TypedDict): 80 | name: str 81 | description: NotRequired[str] 82 | parameters: Dict[str, JsonType] # TODO: make this more specific 83 | 84 | 85 | class ChatCompletionTopLogprobToken(TypedDict): 86 | token: str 87 | logprob: float 88 | bytes: Optional[List[int]] 89 | 90 | 91 | class ChatCompletionLogprobToken(ChatCompletionTopLogprobToken): 92 | token: str 93 | logprob: float 94 | bytes: Optional[List[int]] 95 | top_logprobs: List[ChatCompletionTopLogprobToken] 96 | 97 | 98 | class ChatCompletionLogprobs(TypedDict): 99 | content: Optional[List[ChatCompletionLogprobToken]] 100 | refusal: Optional[List[ChatCompletionLogprobToken]] 101 | 102 | 103 | class ChatCompletionResponseChoice(TypedDict): 104 | index: int 105 | message: "ChatCompletionResponseMessage" 106 | logprobs: Optional[ChatCompletionLogprobs] 107 | finish_reason: Optional[str] 108 | 109 | 110 | class CreateChatCompletionResponse(TypedDict): 111 | id: str 112 | object: Literal["chat.completion"] 113 | created: int 114 | model: str 115 | choices: List["ChatCompletionResponseChoice"] 116 | usage: CompletionUsage 117 | 118 | 119 | class ChatCompletionMessageToolCallChunkFunction(TypedDict): 120 | name: Optional[str] 121 | arguments: str 122 | 123 | 124 | class ChatCompletionMessageToolCallChunk(TypedDict): 125 | index: int 126 | id: NotRequired[str] 127 | type: Literal["function"] 128 | function: ChatCompletionMessageToolCallChunkFunction 129 | 130 | 131 | class ChatCompletionStreamResponseDeltaEmpty(TypedDict): 132 | pass 133 | 134 | 135 | class ChatCompletionStreamResponseDeltaFunctionCall(TypedDict): 136 | name: str 137 | arguments: str 138 | 139 | 140 | class ChatCompletionStreamResponseDelta(TypedDict): 141 | content: NotRequired[Optional[str]] 142 | 
function_call: NotRequired[ 143 | Optional[ChatCompletionStreamResponseDeltaFunctionCall] 144 | ] # DEPRECATED 145 | tool_calls: NotRequired[Optional[List[ChatCompletionMessageToolCallChunk]]] 146 | role: NotRequired[Optional[Literal["system", "user", "assistant", "tool"]]] 147 | 148 | 149 | class ChatCompletionStreamResponseChoice(TypedDict): 150 | index: int 151 | delta: Union[ 152 | ChatCompletionStreamResponseDelta, ChatCompletionStreamResponseDeltaEmpty 153 | ] 154 | finish_reason: Optional[Literal["stop", "length", "tool_calls", "function_call"]] 155 | logprobs: NotRequired[Optional[ChatCompletionLogprobs]] 156 | 157 | 158 | class CreateChatCompletionStreamResponse(TypedDict): 159 | id: str 160 | model: str 161 | object: Literal["chat.completion.chunk"] 162 | created: int 163 | choices: List[ChatCompletionStreamResponseChoice] 164 | 165 | 166 | class ChatCompletionFunctions(TypedDict): 167 | name: str 168 | description: NotRequired[str] 169 | parameters: Dict[str, JsonType] # TODO: make this more specific 170 | 171 | 172 | class ChatCompletionFunctionCallOption(TypedDict): 173 | name: str 174 | 175 | 176 | class ChatCompletionRequestResponseFormat(TypedDict): 177 | type: Literal["text", "json_object"] 178 | schema: NotRequired[ 179 | JsonType 180 | ] # https://docs.endpoints.anyscale.com/guides/json_mode/ 181 | 182 | 183 | class ChatCompletionRequestMessageContentPartText(TypedDict): 184 | type: Literal["text"] 185 | text: str 186 | 187 | 188 | class ChatCompletionRequestMessageContentPartImageImageUrl(TypedDict): 189 | url: str 190 | detail: NotRequired[Literal["auto", "low", "high"]] 191 | 192 | 193 | class ChatCompletionRequestMessageContentPartImage(TypedDict): 194 | type: Literal["image_url"] 195 | image_url: Union[str, ChatCompletionRequestMessageContentPartImageImageUrl] 196 | 197 | 198 | ChatCompletionRequestMessageContentPart = Union[ 199 | ChatCompletionRequestMessageContentPartText, 200 | ChatCompletionRequestMessageContentPartImage, 201 | ] 202 | 203 | 204 | class ChatCompletionRequestSystemMessage(TypedDict): 205 | role: Literal["system"] 206 | content: Optional[str] 207 | 208 | 209 | class ChatCompletionRequestUserMessage(TypedDict): 210 | role: Literal["user"] 211 | content: Optional[Union[str, List[ChatCompletionRequestMessageContentPart]]] 212 | 213 | 214 | class ChatCompletionMessageToolCallFunction(TypedDict): 215 | name: str 216 | arguments: str 217 | 218 | 219 | class ChatCompletionMessageToolCall(TypedDict): 220 | id: str 221 | type: Literal["function"] 222 | function: ChatCompletionMessageToolCallFunction 223 | 224 | 225 | ChatCompletionMessageToolCalls = List[ChatCompletionMessageToolCall] 226 | 227 | 228 | class ChatCompletionRequestAssistantMessageFunctionCall(TypedDict): 229 | name: str 230 | arguments: str 231 | 232 | 233 | class ChatCompletionRequestAssistantMessage(TypedDict): 234 | role: Literal["assistant"] 235 | content: NotRequired[str] 236 | tool_calls: NotRequired[ChatCompletionMessageToolCalls] 237 | function_call: NotRequired[ 238 | ChatCompletionRequestAssistantMessageFunctionCall 239 | ] # DEPRECATED 240 | 241 | 242 | class ChatCompletionRequestToolMessage(TypedDict): 243 | role: Literal["tool"] 244 | content: Optional[str] 245 | tool_call_id: str 246 | 247 | 248 | class ChatCompletionRequestFunctionMessage(TypedDict): 249 | role: Literal["function"] 250 | content: Optional[str] 251 | name: str 252 | 253 | 254 | ChatCompletionRequestMessage = Union[ 255 | ChatCompletionRequestSystemMessage, 256 | ChatCompletionRequestUserMessage, 257 | 
ChatCompletionRequestAssistantMessage, 258 | ChatCompletionRequestUserMessage, 259 | ChatCompletionRequestToolMessage, 260 | ChatCompletionRequestFunctionMessage, 261 | ] 262 | 263 | 264 | class ChatCompletionRequestFunctionCallOption(TypedDict): 265 | name: str 266 | 267 | 268 | ChatCompletionRequestFunctionCall = Union[ 269 | Literal["none", "auto"], ChatCompletionRequestFunctionCallOption 270 | ] 271 | 272 | ChatCompletionFunctionParameters = Dict[str, JsonType] # TODO: make this more specific 273 | 274 | 275 | class ChatCompletionToolFunction(TypedDict): 276 | name: str 277 | description: NotRequired[str] 278 | parameters: ChatCompletionFunctionParameters 279 | 280 | 281 | class ChatCompletionTool(TypedDict): 282 | type: Literal["function"] 283 | function: ChatCompletionToolFunction 284 | 285 | 286 | class ChatCompletionNamedToolChoiceFunction(TypedDict): 287 | name: str 288 | 289 | 290 | class ChatCompletionNamedToolChoice(TypedDict): 291 | type: Literal["function"] 292 | function: ChatCompletionNamedToolChoiceFunction 293 | 294 | 295 | ChatCompletionToolChoiceOption = Union[ 296 | Literal["none", "auto", "required"], ChatCompletionNamedToolChoice 297 | ] 298 | 299 | 300 | # NOTE: The following type names are not part of the OpenAI OpenAPI specification 301 | # and will be removed in a future major release. 302 | 303 | EmbeddingData = Embedding 304 | CompletionChunk = CreateCompletionResponse 305 | Completion = CreateCompletionResponse 306 | CreateCompletionStreamResponse = CreateCompletionResponse 307 | ChatCompletionMessage = ChatCompletionResponseMessage 308 | ChatCompletionChoice = ChatCompletionResponseChoice 309 | ChatCompletion = CreateChatCompletionResponse 310 | ChatCompletionChunkDeltaEmpty = ChatCompletionStreamResponseDeltaEmpty 311 | ChatCompletionChunkChoice = ChatCompletionStreamResponseChoice 312 | ChatCompletionChunkDelta = ChatCompletionStreamResponseDelta 313 | ChatCompletionChunk = CreateChatCompletionStreamResponse 314 | ChatCompletionStreamResponse = CreateChatCompletionStreamResponse 315 | ChatCompletionResponseFunction = ChatCompletionFunction 316 | ChatCompletionFunctionCall = ChatCompletionResponseFunctionCall 317 | -------------------------------------------------------------------------------- /llama_cpp/llava_cpp.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from ctypes import ( 5 | c_bool, 6 | c_char_p, 7 | c_int, 8 | c_uint8, 9 | c_float, 10 | c_void_p, 11 | POINTER, 12 | _Pointer, # type: ignore 13 | Structure, 14 | ) 15 | import pathlib 16 | from typing import ( 17 | Union, 18 | NewType, 19 | Optional, 20 | TYPE_CHECKING, 21 | ) 22 | 23 | import llama_cpp.llama_cpp as llama_cpp 24 | 25 | from llama_cpp._ctypes_extensions import ( 26 | load_shared_library, 27 | ctypes_function_for_shared_library, 28 | ) 29 | 30 | if TYPE_CHECKING: 31 | from llama_cpp._ctypes_extensions import ( 32 | CtypesArray, 33 | ) 34 | 35 | 36 | # Specify the base name of the shared library to load 37 | _libllava_base_name = "llava" 38 | _libllava_override_path = os.environ.get("LLAVA_CPP_LIB") 39 | _libllava_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libllava_override_path is None else pathlib.Path() 40 | 41 | # Load the library 42 | _libllava = load_shared_library(_libllava_base_name, _libllava_base_path) 43 | 44 | ctypes_function = ctypes_function_for_shared_library(_libllava) 45 | 46 | 47 | ################################################ 48 
| # llava.h 49 | ################################################ 50 | 51 | # struct clip_ctx; 52 | clip_ctx_p = NewType("clip_ctx_p", int) 53 | clip_ctx_p_ctypes = c_void_p 54 | 55 | 56 | # struct llava_image_embed { 57 | # float * embed; 58 | # int n_image_pos; 59 | # }; 60 | class llava_image_embed(Structure): 61 | _fields_ = [ 62 | ("embed", POINTER(c_float)), 63 | ("n_image_pos", c_int), 64 | ] 65 | 66 | 67 | # /** sanity check for clip <-> llava embed size match */ 68 | # LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip); 69 | @ctypes_function( 70 | "llava_validate_embed_size", 71 | [llama_cpp.llama_context_p_ctypes, clip_ctx_p_ctypes], 72 | c_bool, 73 | ) 74 | def llava_validate_embed_size( 75 | ctx_llama: llama_cpp.llama_context_p, ctx_clip: clip_ctx_p, / 76 | ) -> bool: 77 | ... 78 | 79 | 80 | # /** build an image embed from image file bytes */ 81 | # LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length); 82 | @ctypes_function( 83 | "llava_image_embed_make_with_bytes", 84 | [clip_ctx_p_ctypes, c_int, POINTER(c_uint8), c_int], 85 | POINTER(llava_image_embed), 86 | ) 87 | def llava_image_embed_make_with_bytes( 88 | ctx_clip: clip_ctx_p, 89 | n_threads: Union[c_int, int], 90 | image_bytes: CtypesArray[c_uint8], 91 | image_bytes_length: Union[c_int, int], 92 | /, 93 | ) -> "_Pointer[llava_image_embed]": 94 | ... 95 | 96 | 97 | # /** build an image embed from a path to an image filename */ 98 | # LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path); 99 | @ctypes_function( 100 | "llava_image_embed_make_with_filename", 101 | [clip_ctx_p_ctypes, c_int, c_char_p], 102 | POINTER(llava_image_embed), 103 | ) 104 | def llava_image_embed_make_with_filename( 105 | ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_path: bytes, / 106 | ) -> "_Pointer[llava_image_embed]": 107 | ... 108 | 109 | 110 | # LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); 111 | # /** free an embedding made with llava_image_embed_make_* */ 112 | @ctypes_function("llava_image_embed_free", [POINTER(llava_image_embed)], None) 113 | def llava_image_embed_free(embed: "_Pointer[llava_image_embed]", /): 114 | ... 115 | 116 | 117 | # /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */ 118 | # LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past); 119 | @ctypes_function( 120 | "llava_eval_image_embed", 121 | [ 122 | llama_cpp.llama_context_p_ctypes, 123 | POINTER(llava_image_embed), 124 | c_int, 125 | POINTER(c_int), 126 | ], 127 | c_bool, 128 | ) 129 | def llava_eval_image_embed( 130 | ctx_llama: llama_cpp.llama_context_p, 131 | embed: "_Pointer[llava_image_embed]", 132 | n_batch: Union[c_int, int], 133 | n_past: "_Pointer[c_int]", 134 | /, 135 | ) -> bool: 136 | ... 
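# Illustrative usage sketch (not part of the original module): the typical order
# of calls for these bindings when embedding an image into a llama context.
# `llama_ctx`, the file paths, and the thread/batch counts are placeholders.
#
#   ctx_clip = clip_model_load(b"mmproj.gguf", 1)   # defined below in the clip.h section
#   embed = llava_image_embed_make_with_filename(ctx_clip, 4, b"image.png")
#   n_past = c_int(0)
#   llava_eval_image_embed(llama_ctx, embed, 512, byref(n_past))  # byref from ctypes
#   llava_image_embed_free(embed)
#   clip_free(ctx_clip)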
137 | 138 | 139 | ################################################ 140 | # clip.h 141 | ################################################ 142 | 143 | 144 | # /** load mmproj model */ 145 | # CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity); 146 | @ctypes_function("clip_model_load", [c_char_p, c_int], clip_ctx_p_ctypes) 147 | def clip_model_load( 148 | fname: bytes, verbosity: Union[c_int, int], / 149 | ) -> Optional[clip_ctx_p]: 150 | ... 151 | 152 | 153 | # /** free mmproj model */ 154 | # CLIP_API void clip_free(struct clip_ctx * ctx); 155 | @ctypes_function("clip_free", [clip_ctx_p_ctypes], None) 156 | def clip_free(ctx: clip_ctx_p, /): 157 | ... 158 | 159 | -------------------------------------------------------------------------------- /llama_cpp/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abetlen/llama-cpp-python/b1d23df0bbd327b774083b5cf88e67ca0dd52b92/llama_cpp/py.typed -------------------------------------------------------------------------------- /llama_cpp/server/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abetlen/llama-cpp-python/b1d23df0bbd327b774083b5cf88e67ca0dd52b92/llama_cpp/server/__init__.py -------------------------------------------------------------------------------- /llama_cpp/server/__main__.py: -------------------------------------------------------------------------------- 1 | """Example FastAPI server for llama.cpp. 2 | 3 | To run this example: 4 | 5 | ```bash 6 | pip install fastapi uvicorn sse-starlette pydantic-settings 7 | export MODEL=../models/7B/... 8 | ``` 9 | 10 | Then run: 11 | ``` 12 | uvicorn llama_cpp.server.app:create_app --reload 13 | ``` 14 | 15 | or 16 | 17 | ``` 18 | python3 -m llama_cpp.server 19 | ``` 20 | 21 | Then visit http://localhost:8000/docs to see the interactive API docs. 22 | 23 | """ 24 | 25 | from __future__ import annotations 26 | 27 | import os 28 | import sys 29 | import argparse 30 | 31 | import uvicorn 32 | 33 | from llama_cpp.server.app import create_app 34 | from llama_cpp.server.settings import ( 35 | Settings, 36 | ServerSettings, 37 | ModelSettings, 38 | ConfigFileSettings, 39 | ) 40 | from llama_cpp.server.cli import add_args_from_model, parse_model_from_args 41 | 42 | 43 | def main(): 44 | description = "🦙 Llama.cpp python server. 
Host your own LLMs!🚀" 45 | parser = argparse.ArgumentParser(description=description) 46 | 47 | add_args_from_model(parser, Settings) 48 | parser.add_argument( 49 | "--config_file", 50 | type=str, 51 | help="Path to a config file to load.", 52 | ) 53 | server_settings: ServerSettings | None = None 54 | model_settings: list[ModelSettings] = [] 55 | args = parser.parse_args() 56 | try: 57 | # Load server settings from config_file if provided 58 | config_file = os.environ.get("CONFIG_FILE", args.config_file) 59 | if config_file: 60 | if not os.path.exists(config_file): 61 | raise ValueError(f"Config file {config_file} not found!") 62 | with open(config_file, "rb") as f: 63 | # Check if yaml file 64 | if config_file.endswith(".yaml") or config_file.endswith(".yml"): 65 | import yaml 66 | import json 67 | 68 | config_file_settings = ConfigFileSettings.model_validate_json( 69 | json.dumps(yaml.safe_load(f)) 70 | ) 71 | else: 72 | config_file_settings = ConfigFileSettings.model_validate_json( 73 | f.read() 74 | ) 75 | server_settings = ServerSettings.model_validate(config_file_settings) 76 | model_settings = config_file_settings.models 77 | else: 78 | server_settings = parse_model_from_args(ServerSettings, args) 79 | model_settings = [parse_model_from_args(ModelSettings, args)] 80 | except Exception as e: 81 | print(e, file=sys.stderr) 82 | parser.print_help() 83 | sys.exit(1) 84 | assert server_settings is not None 85 | assert model_settings is not None 86 | app = create_app( 87 | server_settings=server_settings, 88 | model_settings=model_settings, 89 | ) 90 | uvicorn.run( 91 | app, 92 | host=os.getenv("HOST", server_settings.host), 93 | port=int(os.getenv("PORT", server_settings.port)), 94 | ssl_keyfile=server_settings.ssl_keyfile, 95 | ssl_certfile=server_settings.ssl_certfile, 96 | ) 97 | 98 | 99 | if __name__ == "__main__": 100 | main() 101 | -------------------------------------------------------------------------------- /llama_cpp/server/cli.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import argparse 4 | 5 | from typing import List, Literal, Union, Any, Type, TypeVar 6 | 7 | from pydantic import BaseModel 8 | 9 | 10 | def _get_base_type(annotation: Type[Any]) -> Type[Any]: 11 | if getattr(annotation, "__origin__", None) is Literal: 12 | assert hasattr(annotation, "__args__") and len(annotation.__args__) >= 1 # type: ignore 13 | return type(annotation.__args__[0]) # type: ignore 14 | elif getattr(annotation, "__origin__", None) is Union: 15 | assert hasattr(annotation, "__args__") and len(annotation.__args__) >= 1 # type: ignore 16 | non_optional_args: List[Type[Any]] = [ 17 | arg for arg in annotation.__args__ if arg is not type(None) # type: ignore 18 | ] 19 | if non_optional_args: 20 | return _get_base_type(non_optional_args[0]) 21 | elif ( 22 | getattr(annotation, "__origin__", None) is list 23 | or getattr(annotation, "__origin__", None) is List 24 | ): 25 | assert hasattr(annotation, "__args__") and len(annotation.__args__) >= 1 # type: ignore 26 | return _get_base_type(annotation.__args__[0]) # type: ignore 27 | return annotation 28 | 29 | 30 | def _contains_list_type(annotation: Type[Any] | None) -> bool: 31 | origin = getattr(annotation, "__origin__", None) 32 | 33 | if origin is list or origin is List: 34 | return True 35 | elif origin in (Literal, Union): 36 | return any(_contains_list_type(arg) for arg in annotation.__args__) # type: ignore 37 | else: 38 | return False 39 | 40 | 41 | def 
_parse_bool_arg(arg: str | bytes | bool) -> bool: 42 | if isinstance(arg, bytes): 43 | arg = arg.decode("utf-8") 44 | 45 | true_values = {"1", "on", "t", "true", "y", "yes"} 46 | false_values = {"0", "off", "f", "false", "n", "no"} 47 | 48 | arg_str = str(arg).lower().strip() 49 | 50 | if arg_str in true_values: 51 | return True 52 | elif arg_str in false_values: 53 | return False 54 | else: 55 | raise ValueError(f"Invalid boolean argument: {arg}") 56 | 57 | 58 | def add_args_from_model(parser: argparse.ArgumentParser, model: Type[BaseModel]): 59 | """Add arguments from a pydantic model to an argparse parser.""" 60 | 61 | for name, field in model.model_fields.items(): 62 | description = field.description 63 | if field.default and description and not field.is_required(): 64 | description += f" (default: {field.default})" 65 | base_type = ( 66 | _get_base_type(field.annotation) if field.annotation is not None else str 67 | ) 68 | list_type = _contains_list_type(field.annotation) 69 | if base_type is not bool: 70 | parser.add_argument( 71 | f"--{name}", 72 | dest=name, 73 | nargs="*" if list_type else None, 74 | type=base_type, 75 | help=description, 76 | ) 77 | if base_type is bool: 78 | parser.add_argument( 79 | f"--{name}", 80 | dest=name, 81 | type=_parse_bool_arg, 82 | help=f"{description}", 83 | ) 84 | 85 | 86 | T = TypeVar("T", bound=Type[BaseModel]) 87 | 88 | 89 | def parse_model_from_args(model: T, args: argparse.Namespace) -> T: 90 | """Parse a pydantic model from an argparse namespace.""" 91 | return model( 92 | **{ 93 | k: v 94 | for k, v in vars(args).items() 95 | if v is not None and k in model.model_fields 96 | } 97 | ) 98 | -------------------------------------------------------------------------------- /llama_cpp/server/errors.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sys 4 | import traceback 5 | import time 6 | from re import compile, Match, Pattern 7 | from typing import Callable, Coroutine, Optional, Tuple, Union, Dict 8 | from typing_extensions import TypedDict 9 | 10 | 11 | from fastapi import ( 12 | Request, 13 | Response, 14 | HTTPException, 15 | ) 16 | from fastapi.responses import JSONResponse 17 | from fastapi.routing import APIRoute 18 | 19 | from llama_cpp.server.types import ( 20 | CreateCompletionRequest, 21 | CreateEmbeddingRequest, 22 | CreateChatCompletionRequest, 23 | ) 24 | 25 | 26 | class ErrorResponse(TypedDict): 27 | """OpenAI style error response""" 28 | 29 | message: str 30 | type: str 31 | param: Optional[str] 32 | code: Optional[str] 33 | 34 | 35 | class ErrorResponseFormatters: 36 | """Collection of formatters for error responses. 37 | 38 | Args: 39 | request (Union[CreateCompletionRequest, CreateChatCompletionRequest]): 40 | Request body 41 | match (Match[str]): Match object from regex pattern 42 | 43 | Returns: 44 | Tuple[int, ErrorResponse]: Status code and error response 45 | """ 46 | 47 | @staticmethod 48 | def context_length_exceeded( 49 | request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"], 50 | match, # type: Match[str] # type: ignore 51 | ) -> Tuple[int, ErrorResponse]: 52 | """Formatter for context length exceeded error""" 53 | 54 | context_window = int(match.group(2)) 55 | prompt_tokens = int(match.group(1)) 56 | completion_tokens = request.max_tokens 57 | if hasattr(request, "messages"): 58 | # Chat completion 59 | message = ( 60 | "This model's maximum context length is {} tokens. 
" 61 | "However, you requested {} tokens " 62 | "({} in the messages, {} in the completion). " 63 | "Please reduce the length of the messages or completion." 64 | ) 65 | else: 66 | # Text completion 67 | message = ( 68 | "This model's maximum context length is {} tokens, " 69 | "however you requested {} tokens " 70 | "({} in your prompt; {} for the completion). " 71 | "Please reduce your prompt; or completion length." 72 | ) 73 | return 400, ErrorResponse( 74 | message=message.format( 75 | context_window, 76 | (completion_tokens or 0) + prompt_tokens, 77 | prompt_tokens, 78 | completion_tokens, 79 | ), # type: ignore 80 | type="invalid_request_error", 81 | param="messages", 82 | code="context_length_exceeded", 83 | ) 84 | 85 | @staticmethod 86 | def model_not_found( 87 | request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"], 88 | match, # type: Match[str] # type: ignore 89 | ) -> Tuple[int, ErrorResponse]: 90 | """Formatter for model_not_found error""" 91 | 92 | model_path = str(match.group(1)) 93 | message = f"The model `{model_path}` does not exist" 94 | return 400, ErrorResponse( 95 | message=message, 96 | type="invalid_request_error", 97 | param=None, 98 | code="model_not_found", 99 | ) 100 | 101 | 102 | class RouteErrorHandler(APIRoute): 103 | """Custom APIRoute that handles application errors and exceptions""" 104 | 105 | # key: regex pattern for original error message from llama_cpp 106 | # value: formatter function 107 | pattern_and_formatters: Dict[ 108 | "Pattern[str]", 109 | Callable[ 110 | [ 111 | Union["CreateCompletionRequest", "CreateChatCompletionRequest"], 112 | "Match[str]", 113 | ], 114 | Tuple[int, ErrorResponse], 115 | ], 116 | ] = { 117 | compile( 118 | r"Requested tokens \((\d+)\) exceed context window of (\d+)" 119 | ): ErrorResponseFormatters.context_length_exceeded, 120 | compile( 121 | r"Model path does not exist: (.+)" 122 | ): ErrorResponseFormatters.model_not_found, 123 | } 124 | 125 | def error_message_wrapper( 126 | self, 127 | error: Exception, 128 | body: Optional[ 129 | Union[ 130 | "CreateChatCompletionRequest", 131 | "CreateCompletionRequest", 132 | "CreateEmbeddingRequest", 133 | ] 134 | ] = None, 135 | ) -> Tuple[int, ErrorResponse]: 136 | """Wraps error message in OpenAI style error response""" 137 | if body is not None and isinstance( 138 | body, 139 | ( 140 | CreateCompletionRequest, 141 | CreateChatCompletionRequest, 142 | ), 143 | ): 144 | # When text completion or chat completion 145 | for pattern, callback in self.pattern_and_formatters.items(): 146 | match = pattern.search(str(error)) 147 | if match is not None: 148 | return callback(body, match) 149 | 150 | # Only print the trace on unexpected exceptions 151 | print(f"Exception: {str(error)}", file=sys.stderr) 152 | traceback.print_exc(file=sys.stderr) 153 | 154 | # Wrap other errors as internal server error 155 | return 500, ErrorResponse( 156 | message=str(error), 157 | type="internal_server_error", 158 | param=None, 159 | code=None, 160 | ) 161 | 162 | def get_route_handler( 163 | self, 164 | ) -> Callable[[Request], Coroutine[None, None, Response]]: 165 | """Defines custom route handler that catches exceptions and formats 166 | in OpenAI style error response""" 167 | 168 | original_route_handler = super().get_route_handler() 169 | 170 | async def custom_route_handler(request: Request) -> Response: 171 | try: 172 | start_sec = time.perf_counter() 173 | response = await original_route_handler(request) 174 | elapsed_time_ms = int((time.perf_counter() - start_sec) * 1000) 
175 | response.headers["openai-processing-ms"] = f"{elapsed_time_ms}" 176 | return response 177 | except HTTPException as unauthorized: 178 | # api key check failed 179 | raise unauthorized 180 | except Exception as exc: 181 | json_body = await request.json() 182 | try: 183 | if "messages" in json_body: 184 | # Chat completion 185 | body: Optional[ 186 | Union[ 187 | CreateChatCompletionRequest, 188 | CreateCompletionRequest, 189 | CreateEmbeddingRequest, 190 | ] 191 | ] = CreateChatCompletionRequest(**json_body) 192 | elif "prompt" in json_body: 193 | # Text completion 194 | body = CreateCompletionRequest(**json_body) 195 | else: 196 | # Embedding 197 | body = CreateEmbeddingRequest(**json_body) 198 | except Exception: 199 | # Invalid request body 200 | body = None 201 | 202 | # Get proper error message from the exception 203 | ( 204 | status_code, 205 | error_message, 206 | ) = self.error_message_wrapper(error=exc, body=body) 207 | return JSONResponse( 208 | {"error": error_message}, 209 | status_code=status_code, 210 | ) 211 | 212 | return custom_route_handler 213 | -------------------------------------------------------------------------------- /llama_cpp/server/settings.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import multiprocessing 4 | 5 | from typing import Optional, List, Literal, Union, Dict, cast 6 | from typing_extensions import Self 7 | 8 | from pydantic import Field, model_validator 9 | from pydantic_settings import BaseSettings 10 | 11 | import llama_cpp 12 | 13 | # Disable warning for model and model_alias settings 14 | BaseSettings.model_config["protected_namespaces"] = () 15 | 16 | 17 | class ModelSettings(BaseSettings): 18 | """Model settings used to load a Llama model.""" 19 | 20 | model: str = Field( 21 | description="The path to the model to use for generating completions." 22 | ) 23 | model_alias: Optional[str] = Field( 24 | default=None, 25 | description="The alias of the model to use for generating completions.", 26 | ) 27 | # Model Params 28 | n_gpu_layers: int = Field( 29 | default=0, 30 | ge=-1, 31 | description="The number of layers to put on the GPU. The rest will be on the CPU. Set -1 to move all to GPU.", 32 | ) 33 | split_mode: int = Field( 34 | default=llama_cpp.LLAMA_SPLIT_MODE_LAYER, 35 | description="The split mode to use.", 36 | ) 37 | main_gpu: int = Field( 38 | default=0, 39 | ge=0, 40 | description="Main GPU to use.", 41 | ) 42 | tensor_split: Optional[List[float]] = Field( 43 | default=None, 44 | description="Split layers across multiple GPUs in proportion.", 45 | ) 46 | vocab_only: bool = Field( 47 | default=False, description="Whether to only return the vocabulary." 48 | ) 49 | use_mmap: bool = Field( 50 | default=llama_cpp.llama_supports_mmap(), 51 | description="Use mmap.", 52 | ) 53 | use_mlock: bool = Field( 54 | default=llama_cpp.llama_supports_mlock(), 55 | description="Use mlock.", 56 | ) 57 | kv_overrides: Optional[List[str]] = Field( 58 | default=None, 59 | description="List of model kv overrides in the format key=type:value where type is one of (bool, int, float). Valid true values are (true, TRUE, 1), otherwise false.", 60 | ) 61 | rpc_servers: Optional[str] = Field( 62 | default=None, 63 | description="comma seperated list of rpc servers for offloading", 64 | ) 65 | # Context Params 66 | seed: int = Field( 67 | default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random." 
68 | ) 69 | n_ctx: int = Field(default=2048, ge=0, description="The context size.") 70 | n_batch: int = Field( 71 | default=512, ge=1, description="The batch size to use per eval." 72 | ) 73 | n_ubatch: int = Field( 74 | default=512, ge=1, description="The physical batch size used by llama.cpp" 75 | ) 76 | n_threads: int = Field( 77 | default=max(multiprocessing.cpu_count() // 2, 1), 78 | ge=1, 79 | description="The number of threads to use. Use -1 for max cpu threads", 80 | ) 81 | n_threads_batch: int = Field( 82 | default=max(multiprocessing.cpu_count(), 1), 83 | ge=0, 84 | description="The number of threads to use when batch processing. Use -1 for max cpu threads", 85 | ) 86 | rope_scaling_type: int = Field( 87 | default=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED 88 | ) 89 | rope_freq_base: float = Field(default=0.0, description="RoPE base frequency") 90 | rope_freq_scale: float = Field( 91 | default=0.0, description="RoPE frequency scaling factor" 92 | ) 93 | yarn_ext_factor: float = Field(default=-1.0) 94 | yarn_attn_factor: float = Field(default=1.0) 95 | yarn_beta_fast: float = Field(default=32.0) 96 | yarn_beta_slow: float = Field(default=1.0) 97 | yarn_orig_ctx: int = Field(default=0) 98 | mul_mat_q: bool = Field( 99 | default=True, description="if true, use experimental mul_mat_q kernels" 100 | ) 101 | logits_all: bool = Field(default=True, description="Whether to return logits.") 102 | embedding: bool = Field(default=False, description="Whether to use embeddings.") 103 | offload_kqv: bool = Field( 104 | default=True, description="Whether to offload kqv to the GPU." 105 | ) 106 | flash_attn: bool = Field( 107 | default=False, description="Whether to use flash attention." 108 | ) 109 | # Sampling Params 110 | last_n_tokens_size: int = Field( 111 | default=64, 112 | ge=0, 113 | description="Last n tokens to keep for repeat penalty calculation.", 114 | ) 115 | # LoRA Params 116 | lora_base: Optional[str] = Field( 117 | default=None, 118 | description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.", 119 | ) 120 | lora_path: Optional[str] = Field( 121 | default=None, 122 | description="Path to a LoRA file to apply to the model.", 123 | ) 124 | # Backend Params 125 | numa: Union[bool, int] = Field( 126 | default=False, 127 | description="Enable NUMA support.", 128 | ) 129 | # Chat Format Params 130 | chat_format: Optional[str] = Field( 131 | default=None, 132 | description="Chat format to use.", 133 | ) 134 | clip_model_path: Optional[str] = Field( 135 | default=None, 136 | description="Path to a CLIP model to use for multi-modal chat completion.", 137 | ) 138 | # Cache Params 139 | cache: bool = Field( 140 | default=False, 141 | description="Use a cache to reduce processing times for evaluated prompts.", 142 | ) 143 | cache_type: Literal["ram", "disk"] = Field( 144 | default="ram", 145 | description="The type of cache to use. Only used if cache is True.", 146 | ) 147 | cache_size: int = Field( 148 | default=2 << 30, 149 | description="The size of the cache in bytes. Only used if cache is True.", 150 | ) 151 | # Tokenizer Options 152 | hf_tokenizer_config_path: Optional[str] = Field( 153 | default=None, 154 | description="The path to a HuggingFace tokenizer_config.json file.", 155 | ) 156 | hf_pretrained_model_name_or_path: Optional[str] = Field( 157 | default=None, 158 | description="The model name or path to a pretrained HuggingFace tokenizer model. 
Same as you would pass to AutoTokenizer.from_pretrained().", 159 | ) 160 | # Loading from HuggingFace Model Hub 161 | hf_model_repo_id: Optional[str] = Field( 162 | default=None, 163 | description="The model repo id to use for the HuggingFace tokenizer model.", 164 | ) 165 | # Speculative Decoding 166 | draft_model: Optional[str] = Field( 167 | default=None, 168 | description="Method to use for speculative decoding. One of (prompt-lookup-decoding).", 169 | ) 170 | draft_model_num_pred_tokens: int = Field( 171 | default=10, 172 | description="Number of tokens to predict using the draft model.", 173 | ) 174 | # KV Cache Quantization 175 | type_k: Optional[int] = Field( 176 | default=None, 177 | description="Type of the key cache quantization.", 178 | ) 179 | type_v: Optional[int] = Field( 180 | default=None, 181 | description="Type of the value cache quantization.", 182 | ) 183 | # Misc 184 | verbose: bool = Field( 185 | default=True, description="Whether to print debug information." 186 | ) 187 | 188 | @model_validator( 189 | mode="before" 190 | ) # pre=True to ensure this runs before any other validation 191 | def set_dynamic_defaults(self) -> Self: 192 | # If n_threads or n_threads_batch is -1, set it to multiprocessing.cpu_count() 193 | cpu_count = multiprocessing.cpu_count() 194 | values = cast(Dict[str, int], self) 195 | if values.get("n_threads", 0) == -1: 196 | values["n_threads"] = cpu_count 197 | if values.get("n_threads_batch", 0) == -1: 198 | values["n_threads_batch"] = cpu_count 199 | return self 200 | 201 | 202 | class ServerSettings(BaseSettings): 203 | """Server settings used to configure the FastAPI and Uvicorn server.""" 204 | 205 | # Uvicorn Settings 206 | host: str = Field(default="localhost", description="Listen address") 207 | port: int = Field(default=8000, description="Listen port") 208 | ssl_keyfile: Optional[str] = Field( 209 | default=None, description="SSL key file for HTTPS" 210 | ) 211 | ssl_certfile: Optional[str] = Field( 212 | default=None, description="SSL certificate file for HTTPS" 213 | ) 214 | # FastAPI Settings 215 | api_key: Optional[str] = Field( 216 | default=None, 217 | description="API key for authentication. If set all requests need to be authenticated.", 218 | ) 219 | interrupt_requests: bool = Field( 220 | default=True, 221 | description="Whether to interrupt requests when a new request is received.", 222 | ) 223 | disable_ping_events: bool = Field( 224 | default=False, 225 | description="Disable EventSource pings (may be needed for some clients).", 226 | ) 227 | root_path: str = Field( 228 | default="", 229 | description="The root path for the server. 
Useful when running behind a reverse proxy.", 230 | ) 231 | 232 | 233 | class Settings(ServerSettings, ModelSettings): 234 | pass 235 | 236 | 237 | class ConfigFileSettings(ServerSettings): 238 | """Configuration file format settings.""" 239 | 240 | models: List[ModelSettings] = Field(default=[], description="Model configs") 241 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: llama-cpp-python 2 | repo_url: https://github.com/abetlen/llama-cpp-python 3 | 4 | theme: 5 | name: material 6 | palette: 7 | 8 | # Palette toggle for light mode 9 | - scheme: default 10 | primary: indigo 11 | toggle: 12 | icon: material/brightness-7 13 | name: Switch to dark mode 14 | 15 | # Palette toggle for dark mode 16 | - scheme: slate 17 | primary: indigo 18 | toggle: 19 | icon: material/brightness-4 20 | name: Switch to light mode 21 | 22 | plugins: 23 | - search 24 | - mkdocstrings: 25 | handlers: 26 | python: 27 | options: 28 | members_order: source 29 | group_by_category: false 30 | signature_crossrefs: true 31 | show_signature: true 32 | docstring_section_style: list 33 | show_root_heading: true 34 | heading_level: 3 35 | preload_modules: 36 | - typing 37 | - typing_extensions 38 | - ctypes 39 | import: 40 | - https://docs.python.org/3/objects.inv 41 | - https://numpy.org/doc/stable/objects.inv 42 | 43 | watch: 44 | - llama_cpp 45 | - README.md 46 | 47 | nav: 48 | - "Getting Started": "index.md" 49 | - "Installation Guides": 50 | - "macOS (Metal)": "install/macos.md" 51 | - "API Reference": "api-reference.md" 52 | - "OpenAI Compatible Web Server": "server.md" 53 | - "Changelog": "changelog.md" 54 | 55 | markdown_extensions: 56 | - attr_list 57 | - pymdownx.emoji: 58 | emoji_index: !!python/name:materialx.emoji.twemoji 59 | emoji_generator: !!python/name:materialx.emoji.to_svg 60 | - pymdownx.highlight: 61 | anchor_linenums: true 62 | line_spans: __span 63 | pygments_lang_class: true 64 | - pymdownx.inlinehilite 65 | - pymdownx.magiclink: 66 | repo_url_shorthand: true 67 | user: abetlen 68 | repo: llama-cpp-python 69 | - pymdownx.snippets 70 | - pymdownx.superfences 71 | - pymdownx.tabbed: 72 | alternate_style: true 73 | - pymdownx.tilde 74 | - tables 75 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["scikit-build-core[pyproject]>=0.9.2"] 3 | build-backend = "scikit_build_core.build" 4 | 5 | [project] 6 | name = "llama_cpp_python" 7 | dynamic = ["version"] 8 | description = "Python bindings for the llama.cpp library" 9 | readme = "README.md" 10 | license = { text = "MIT" } 11 | authors = [ 12 | { name = "Andrei Betlen", email = "abetlen@gmail.com" }, 13 | ] 14 | dependencies = [ 15 | "typing-extensions>=4.5.0", 16 | "numpy>=1.20.0", 17 | "diskcache>=5.6.1", 18 | "jinja2>=2.11.3", 19 | ] 20 | requires-python = ">=3.8" 21 | classifiers = [ 22 | "Programming Language :: Python :: 3", 23 | "Programming Language :: Python :: 3.8", 24 | "Programming Language :: Python :: 3.9", 25 | "Programming Language :: Python :: 3.10", 26 | "Programming Language :: Python :: 3.11", 27 | "Programming Language :: Python :: 3.12", 28 | ] 29 | 30 | 31 | [project.optional-dependencies] 32 | server = [ 33 | "uvicorn>=0.22.0", 34 | "fastapi>=0.100.0", 35 | "pydantic-settings>=2.0.1", 36 | "sse-starlette>=1.6.1", 37 | 
"starlette-context>=0.3.6,<0.4", 38 | "PyYAML>=5.1", 39 | ] 40 | test = [ 41 | "pytest>=7.4.0", 42 | "httpx>=0.24.1", 43 | "scipy>=1.10", 44 | "fastapi>=0.100.0", 45 | "sse-starlette>=1.6.1", 46 | "starlette-context>=0.3.6,<0.4", 47 | "pydantic-settings>=2.0.1", 48 | "huggingface-hub>=0.23.0" 49 | ] 50 | dev = [ 51 | "black>=23.3.0", 52 | "twine>=4.0.2", 53 | "mkdocs>=1.4.3", 54 | "mkdocstrings[python]>=0.22.0", 55 | "mkdocs-material>=9.1.18", 56 | "pytest>=7.4.0", 57 | "httpx>=0.24.1", 58 | ] 59 | all = [ 60 | "llama_cpp_python[server,test,dev]", 61 | ] 62 | 63 | [tool.scikit-build] 64 | wheel.packages = ["llama_cpp"] 65 | cmake.verbose = true 66 | cmake.minimum-version = "3.21" 67 | minimum-version = "0.5.1" 68 | sdist.include = [".git", "vendor/llama.cpp/*"] 69 | 70 | [tool.scikit-build.metadata.version] 71 | provider = "scikit_build_core.metadata.regex" 72 | input = "llama_cpp/__init__.py" 73 | 74 | [project.urls] 75 | Homepage = "https://github.com/abetlen/llama-cpp-python" 76 | Issues = "https://github.com/abetlen/llama-cpp-python/issues" 77 | Documentation = "https://llama-cpp-python.readthedocs.io/en/latest/" 78 | Changelog = "https://llama-cpp-python.readthedocs.io/en/latest/changelog/" 79 | 80 | [tool.pytest.ini_options] 81 | testpaths = "tests" 82 | -------------------------------------------------------------------------------- /scripts/get-releases.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Function to get all releases 4 | get_all_releases() { 5 | local page=1 6 | local per_page=100 7 | local releases="" 8 | local new_releases 9 | 10 | # Prepare headers 11 | local headers=(-H "Accept: application/vnd.github.v3+json") 12 | if [ -n "$GITHUB_TOKEN" ]; then 13 | headers+=(-H "Authorization: Bearer $GITHUB_TOKEN") 14 | fi 15 | 16 | while true; do 17 | response=$(curl -s "${headers[@]}" \ 18 | "https://api.github.com/repos/abetlen/llama-cpp-python/releases?page=$page&per_page=$per_page") 19 | 20 | # Check if the response is valid JSON 21 | if ! echo "$response" | jq empty > /dev/null 2>&1; then 22 | echo "Error: Invalid response from GitHub API" >&2 23 | echo "Response: $response" >&2 24 | return 1 25 | fi 26 | 27 | new_releases=$(echo "$response" | jq -r '.[].tag_name') 28 | if [ -z "$new_releases" ]; then 29 | break 30 | fi 31 | releases="$releases $new_releases" 32 | ((page++)) 33 | done 34 | 35 | echo $releases 36 | } 37 | 38 | # Get all releases and save to file 39 | releases=$(get_all_releases) 40 | if [ $? -ne 0 ]; then 41 | echo "Failed to fetch releases. Please check your internet connection and try again later." 
>&2 42 | exit 1 43 | fi 44 | 45 | echo "$releases" | tr ' ' '\n' > all_releases.txt 46 | 47 | echo "All releases have been saved to all_releases.txt" 48 | -------------------------------------------------------------------------------- /scripts/releases-to-pep-503.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Enable exit on error 4 | set -e 5 | 6 | # Function for logging 7 | log_error() { 8 | echo "ERROR: $1" >&2 9 | } 10 | 11 | log_info() { 12 | echo "INFO: $1" 13 | } 14 | 15 | # Get output directory or default to index/whl/cpu 16 | output_dir=${1:-"index/whl/cpu"} 17 | 18 | # Get pattern from second arg or default to valid python package version pattern 19 | pattern=${2:-"^[v]?[0-9]+\.[0-9]+\.[0-9]+$"} 20 | 21 | # Get the current directory (where the script is run from) 22 | current_dir="$(pwd)" 23 | 24 | # Check if all_releases.txt exists 25 | if [ ! -f "$current_dir/all_releases.txt" ]; then 26 | log_error "all_releases.txt not found in the current directory." 27 | exit 1 28 | fi 29 | 30 | # Create output directory 31 | mkdir -p "$output_dir" 32 | 33 | # Create an index html file 34 | cat << EOF > "$output_dir/index.html" 35 | 36 | 37 | 38 | 39 | llama-cpp-python 40 |
41 | 42 | 43 | 44 | EOF 45 | 46 | # Create llama-cpp-python directory 47 | mkdir -p "$output_dir/llama-cpp-python" 48 | 49 | # Create an index html file in llama-cpp-python directory 50 | cat << EOF > "$output_dir/llama-cpp-python/index.html" 51 | 52 | 53 | 54 |

<h1>Links for llama-cpp-python</h1>
55 | EOF 56 | 57 | # Filter releases by pattern 58 | releases=$(grep -E "$pattern" "$current_dir/all_releases.txt") 59 | 60 | # Prepare curl headers 61 | headers=('--header' 'Accept: application/vnd.github.v3+json') 62 | if [ -n "$GITHUB_TOKEN" ]; then 63 | headers+=('--header' "authorization: Bearer $GITHUB_TOKEN") 64 | fi 65 | headers+=('--header' 'content-type: application/json') 66 | 67 | # For each release, get all assets 68 | for release in $releases; do 69 | log_info "Processing release: $release" 70 | response=$(curl -s "${headers[@]}" \ 71 | "https://api.github.com/repos/abetlen/llama-cpp-python/releases/tags/$release") 72 | 73 | if [ -z "$response" ]; then 74 | log_error "Empty response from GitHub API for release $release" 75 | continue 76 | fi 77 | 78 | if ! echo "$response" | jq -e '.assets' > /dev/null 2>&1; then 79 | log_error "Invalid or unexpected response from GitHub API for release $release" 80 | log_error "Response: $response" 81 | continue 82 | fi 83 | 84 | # Get release version from release ie v0.1.0-cu121 -> v0.1.0 85 | release_version=$(echo "$release" | grep -oE "^[v]?[0-9]+\.[0-9]+\.[0-9]+") 86 | echo "

<h2>$release_version</h2>
" >> "$output_dir/llama-cpp-python/index.html" 87 | 88 | wheel_urls=$(echo "$response" | jq -r '.assets[] | select(.name | endswith(".whl")) | .browser_download_url') 89 | if [ -z "$wheel_urls" ]; then 90 | log_error "No wheel files found for release $release" 91 | continue 92 | fi 93 | 94 | echo "$wheel_urls" | while read -r asset; do 95 | echo " $asset" >> "$output_dir/llama-cpp-python/index.html" 96 | echo "
" >> "$output_dir/llama-cpp-python/index.html" 97 | done 98 | done 99 | 100 | echo " " >> "$output_dir/llama-cpp-python/index.html" 101 | echo "" >> "$output_dir/llama-cpp-python/index.html" 102 | echo "" >> "$output_dir/llama-cpp-python/index.html" 103 | 104 | log_info "Index generation complete. Output directory: $output_dir" 105 | -------------------------------------------------------------------------------- /tests/test_llama.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import multiprocessing 3 | 4 | import numpy as np 5 | from scipy.special import log_softmax 6 | 7 | from huggingface_hub import hf_hub_download 8 | 9 | import pytest 10 | 11 | import llama_cpp 12 | import llama_cpp._internals as internals 13 | 14 | 15 | MODEL = "./vendor/llama.cpp/models/ggml-vocab-llama-spm.gguf" 16 | 17 | 18 | def test_llama_cpp_version(): 19 | assert llama_cpp.__version__ 20 | 21 | 22 | def test_llama_cpp_tokenization(): 23 | llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, verbose=False) 24 | 25 | assert llama 26 | assert llama._ctx.ctx is not None 27 | 28 | text = b"Hello World" 29 | 30 | tokens = llama.tokenize(text) 31 | assert tokens[0] == llama.token_bos() 32 | assert tokens == [1, 15043, 2787] 33 | detokenized = llama.detokenize(tokens) 34 | assert detokenized == text 35 | 36 | tokens = llama.tokenize(text, add_bos=False) 37 | assert tokens[0] != llama.token_bos() 38 | assert tokens == [15043, 2787] 39 | 40 | detokenized = llama.detokenize(tokens) 41 | assert detokenized != text 42 | 43 | text = b"Hello World" 44 | tokens = llama.tokenize(text) 45 | assert tokens[-1] != llama.token_eos() 46 | assert tokens == [1, 15043, 2787, 829, 29879, 29958] 47 | 48 | tokens = llama.tokenize(text, special=True) 49 | assert tokens[-1] == llama.token_eos() 50 | assert tokens == [1, 15043, 2787, 2] 51 | 52 | text = b"" 53 | tokens = llama.tokenize(text, add_bos=True, special=True) 54 | assert tokens[-1] != llama.token_eos() 55 | assert tokens == [llama.token_bos()] 56 | assert text == llama.detokenize(tokens) 57 | 58 | 59 | @pytest.fixture 60 | def llama_cpp_model_path(): 61 | repo_id = "Qwen/Qwen2-0.5B-Instruct-GGUF" 62 | filename = "qwen2-0_5b-instruct-q8_0.gguf" 63 | model_path = hf_hub_download(repo_id, filename) 64 | return model_path 65 | 66 | 67 | def test_real_model(llama_cpp_model_path): 68 | import os 69 | assert os.path.exists(llama_cpp_model_path) 70 | 71 | params = llama_cpp.llama_model_default_params() 72 | params.use_mmap = llama_cpp.llama_supports_mmap() 73 | params.use_mlock = llama_cpp.llama_supports_mlock() 74 | params.check_tensors = False 75 | 76 | model = internals.LlamaModel(path_model=llama_cpp_model_path, params=params) 77 | 78 | cparams = llama_cpp.llama_context_default_params() 79 | cparams.n_ctx = 16 80 | cparams.n_batch = 16 81 | cparams.n_ubatch = 16 82 | cparams.n_threads = multiprocessing.cpu_count() 83 | cparams.n_threads_batch = multiprocessing.cpu_count() 84 | cparams.logits_all = False 85 | cparams.flash_attn = True 86 | 87 | context = internals.LlamaContext(model=model, params=cparams) 88 | tokens = model.tokenize(b"Hello, world!", add_bos=True, special=True) 89 | 90 | assert tokens == [9707, 11, 1879, 0] 91 | 92 | tokens = model.tokenize(b"The quick brown fox jumps", add_bos=True, special=True) 93 | 94 | batch = internals.LlamaBatch(n_tokens=len(tokens), embd=0, n_seq_max=1) 95 | 96 | seed = 1337 97 | sampler = internals.LlamaSampler() 98 | sampler.add_top_k(50) 99 | sampler.add_top_p(0.9, 1) 100 | 
sampler.add_temp(0.8) 101 | sampler.add_dist(seed) 102 | 103 | result = tokens 104 | n_eval = 0 105 | for _ in range(4): 106 | batch.set_batch(tokens, n_past=n_eval, logits_all=False) 107 | context.decode(batch) 108 | n_eval += len(tokens) 109 | token_id = sampler.sample(context, -1) 110 | tokens = [token_id] 111 | result += tokens 112 | 113 | output = result[5:] 114 | output_text = model.detokenize(output, special=True) 115 | assert output_text == b" over the lazy dog" 116 | 117 | def test_real_llama(llama_cpp_model_path): 118 | model = llama_cpp.Llama( 119 | llama_cpp_model_path, 120 | n_ctx=32, 121 | n_batch=32, 122 | n_ubatch=32, 123 | n_threads=multiprocessing.cpu_count(), 124 | n_threads_batch=multiprocessing.cpu_count(), 125 | logits_all=False, 126 | flash_attn=True, 127 | ) 128 | 129 | output = model.create_completion( 130 | "The quick brown fox jumps", 131 | max_tokens=4, 132 | top_k=50, 133 | top_p=0.9, 134 | temperature=0.8, 135 | seed=1337 136 | ) 137 | assert output["choices"][0]["text"] == " over the lazy dog" 138 | 139 | 140 | output = model.create_completion( 141 | "The capital of france is paris, 'true' or 'false'?:\n", 142 | max_tokens=4, 143 | top_k=50, 144 | top_p=0.9, 145 | temperature=0.8, 146 | seed=1337, 147 | grammar=llama_cpp.LlamaGrammar.from_string(""" 148 | root ::= "true" | "false" 149 | """) 150 | ) 151 | assert output["choices"][0]["text"] == "true" 152 | 153 | suffix = b"rot" 154 | tokens = model.tokenize(suffix, add_bos=True, special=True) 155 | def logit_processor_func(input_ids, logits): 156 | for token in tokens: 157 | logits[token] *= 1000 158 | return logits 159 | 160 | logit_processors = llama_cpp.LogitsProcessorList( 161 | [logit_processor_func] 162 | ) 163 | 164 | output = model.create_completion( 165 | "The capital of france is par", 166 | max_tokens=4, 167 | top_k=50, 168 | top_p=0.9, 169 | temperature=0.8, 170 | seed=1337, 171 | logits_processor=logit_processors 172 | ) 173 | assert output["choices"][0]["text"].lower().startswith("rot") 174 | 175 | model.set_seed(1337) 176 | 177 | state = model.save_state() 178 | 179 | output = model.create_completion( 180 | "Pick a number from 1 to 10?:\n", 181 | max_tokens=4, 182 | top_k=50, 183 | top_p=0.9, 184 | temperature=0.8, 185 | grammar=llama_cpp.LlamaGrammar.from_string(""" 186 | root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10" 187 | """) 188 | ) 189 | number_1 = output["choices"][0]["text"] 190 | 191 | output = model.create_completion( 192 | "Pick a number from 1 to 10?:\n", 193 | max_tokens=4, 194 | top_k=50, 195 | top_p=0.9, 196 | temperature=0.8, 197 | grammar=llama_cpp.LlamaGrammar.from_string(""" 198 | root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10" 199 | """) 200 | ) 201 | number_2 = output["choices"][0]["text"] 202 | 203 | model.load_state(state) 204 | 205 | output = model.create_completion( 206 | "Pick a number from 1 to 10?:\n", 207 | max_tokens=4, 208 | top_k=50, 209 | top_p=0.9, 210 | temperature=0.8, 211 | grammar=llama_cpp.LlamaGrammar.from_string(""" 212 | root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10" 213 | """) 214 | ) 215 | number_3 = output["choices"][0]["text"] 216 | 217 | assert number_1 != number_2 218 | assert number_1 == number_3 219 | -------------------------------------------------------------------------------- /tests/test_llama_chat_format.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import jinja2 4 | 5 | from llama_cpp import ( 6 | 
ChatCompletionRequestUserMessage, 7 | ) 8 | import llama_cpp.llama_types as llama_types 9 | import llama_cpp.llama_chat_format as llama_chat_format 10 | 11 | from llama_cpp.llama_chat_format import hf_tokenizer_config_to_chat_formatter 12 | 13 | def test_mistral_instruct(): 14 | chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" 15 | chat_formatter = jinja2.Template(chat_template) 16 | messages = [ 17 | llama_types.ChatCompletionRequestUserMessage(role="user", content="Instruction"), 18 | llama_types.ChatCompletionRequestAssistantMessage(role="assistant", content="Model answer"), 19 | llama_types.ChatCompletionRequestUserMessage(role="user", content="Follow-up instruction"), 20 | ] 21 | response = llama_chat_format.format_mistral_instruct( 22 | messages=messages, 23 | ) 24 | prompt = ("" if response.added_special else "") + response.prompt 25 | reference = chat_formatter.render( 26 | messages=messages, 27 | bos_token="", 28 | eos_token="", 29 | ) 30 | assert prompt == reference 31 | 32 | 33 | mistral_7b_tokenizer_config = """{ 34 | "add_bos_token": true, 35 | "add_eos_token": false, 36 | "added_tokens_decoder": { 37 | "0": { 38 | "content": "", 39 | "lstrip": false, 40 | "normalized": false, 41 | "rstrip": false, 42 | "single_word": false, 43 | "special": true 44 | }, 45 | "1": { 46 | "content": "", 47 | "lstrip": false, 48 | "normalized": false, 49 | "rstrip": false, 50 | "single_word": false, 51 | "special": true 52 | }, 53 | "2": { 54 | "content": "", 55 | "lstrip": false, 56 | "normalized": false, 57 | "rstrip": false, 58 | "single_word": false, 59 | "special": true 60 | } 61 | }, 62 | "additional_special_tokens": [], 63 | "bos_token": "", 64 | "clean_up_tokenization_spaces": false, 65 | "eos_token": "", 66 | "legacy": true, 67 | "model_max_length": 1000000000000000019884624838656, 68 | "pad_token": null, 69 | "sp_model_kwargs": {}, 70 | "spaces_between_special_tokens": false, 71 | "tokenizer_class": "LlamaTokenizer", 72 | "unk_token": "", 73 | "use_default_system_prompt": false, 74 | "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" 75 | }""" 76 | 77 | 78 | def test_hf_tokenizer_config_str_to_chat_formatter(): 79 | tokenizer_config = json.loads(mistral_7b_tokenizer_config) 80 | chat_formatter = hf_tokenizer_config_to_chat_formatter( 81 | tokenizer_config 82 | ) 83 | chat_formatter_respoonse = chat_formatter( 84 | messages=[ 85 | ChatCompletionRequestUserMessage(role="user", content="Hello, world!"), 86 | ] 87 | ) 88 | 89 | assert chat_formatter_respoonse.prompt == ("[INST] Hello, world! 
[/INST]" "") 90 | -------------------------------------------------------------------------------- /tests/test_llama_grammar.py: -------------------------------------------------------------------------------- 1 | import llama_cpp 2 | import json 3 | 4 | tree = """ 5 | leaf ::= "." 6 | node ::= leaf | "(" node node ")" 7 | root ::= node 8 | """ 9 | 10 | 11 | def test_grammar_from_string(): 12 | grammar = llama_cpp.LlamaGrammar.from_string(tree) 13 | # assert grammar._n_rules == 3 14 | # assert grammar._start_rule_index == 2 15 | # assert grammar.grammar is not None 16 | 17 | 18 | def test_composed_pydantic_grammar(): 19 | """ 20 | from pydantic import BaseModel 21 | 22 | class A(BaseModel): 23 | a: int 24 | 25 | class B(BaseModel): 26 | a: A 27 | b: int 28 | """ 29 | 30 | # This schema corresponds to the grammar in the comment above. 31 | # We don't use the pydantic models directly to avoid the dependency. 32 | schema = { 33 | "$defs": { 34 | "A": { 35 | "properties": {"a": {"title": "A", "type": "integer"}}, 36 | "required": ["a"], 37 | "title": "A", 38 | "type": "object", 39 | } 40 | }, 41 | "properties": { 42 | "a": {"$ref": "#/$defs/A"}, 43 | "b": {"title": "B", "type": "integer"}, 44 | }, 45 | "required": ["a", "b"], 46 | "title": "B", 47 | "type": "object", 48 | } 49 | 50 | grammar = llama_cpp.LlamaGrammar.from_json_schema(json.dumps(schema)) 51 | 52 | # assert grammar.grammar is not None 53 | 54 | 55 | def test_grammar_anyof(): 56 | sch = { 57 | "properties": { 58 | "temperature": { 59 | "description": "The temperature mentioned", 60 | "type": "number", 61 | }, 62 | "unit": { 63 | "anyOf": [ 64 | { 65 | "description": "Unit for temperature", 66 | "enum": ["celsius", "fahrenheit"], 67 | "type": "string", 68 | }, 69 | {"type": "null"}, 70 | ], 71 | }, 72 | }, 73 | "type": "object", 74 | } 75 | 76 | grammar = llama_cpp.LlamaGrammar.from_json_schema(json.dumps(sch)) 77 | 78 | # assert grammar.grammar is not None 79 | -------------------------------------------------------------------------------- /tests/test_llama_speculative.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from llama_cpp.llama_speculative import LlamaPromptLookupDecoding 4 | 5 | def test_find_candidate_pred_tokens(): 6 | find_candidate_pred_tokens = LlamaPromptLookupDecoding.find_candidate_pred_tokens 7 | 8 | # Test Case 1: Matching ngram is found 9 | input_ids1 = np.array([1, 2, 3, 1, 2, 3, 1, 2, 3]) 10 | result1 = find_candidate_pred_tokens(input_ids1, max_ngram_size=3, num_pred_tokens=2) 11 | assert np.array_equal(result1, np.array([1, 2])) 12 | 13 | # Test Case 2: Matching ngram is not found 14 | input_ids2 = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]) 15 | result2 = find_candidate_pred_tokens(input_ids2, max_ngram_size=3, num_pred_tokens=2) 16 | assert np.array_equal(result2, np.array([])) 17 | --------------------------------------------------------------------------------