├── .github ├── dependabot.yaml └── workflows │ ├── publish.yaml │ ├── test.yaml │ ├── wheels-cuda.yaml │ ├── wheels-index.yaml │ ├── wheels-metal.yaml │ └── wheels.yaml ├── .gitignore ├── .gitmodules ├── .readthedocs.yaml ├── CMakeLists.txt ├── LICENSE.md ├── Makefile ├── README.md ├── docs ├── api-reference.md └── index.md ├── examples ├── clip │ ├── README.md │ ├── convert-pt-to-ggml.py │ ├── model.py │ ├── requirements.txt │ └── utils.py ├── custom-operators │ └── example_jax.py ├── optimizer │ └── simple.py ├── replit │ ├── README.md │ ├── app.py │ ├── main.py │ └── requirements.txt └── rpc │ ├── main.py │ └── worker.py ├── ggml ├── __init__.py ├── ggml.py ├── py.typed └── utils.py ├── mkdocs.yml ├── pyproject.toml ├── scripts └── releases-to-pep-503.sh └── tests ├── __init__.py ├── test_ggml.py ├── test_ggml_cuda.py ├── test_ggml_metal.py └── test_utils.py /.github/dependabot.yaml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | - package-ecosystem: "github-actions" 13 | directory: "/" 14 | schedule: 15 | interval: "weekly" 16 | -------------------------------------------------------------------------------- /.github/workflows/publish.yaml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | 3 | # Based on: https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ 4 | 5 | on: workflow_dispatch 6 | 7 | jobs: 8 | build-n-publish: 9 | name: Build and publish 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | with: 15 | submodules: "true" 16 | - name: Set up Python 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: "3.8" 20 | - name: Install dependencies 21 | run: | 22 | python3 -m pip install --upgrade pip 23 | python3 -m pip install -e .[publish] 24 | - name: Build source distribution 25 | run: | 26 | python3 -m build --sdist 27 | - name: Publish distribution to PyPI 28 | # TODO: move to tag based releases 29 | # if: startsWith(github.ref, 'refs/tags') 30 | uses: pypa/gh-action-pypi-publish@release/v1 31 | with: 32 | password: ${{ secrets.PYPI_API_TOKEN }} 33 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | push: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | build-linux: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | with: 21 | submodules: "true" 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v5 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | - name: Install dependencies 27 | run: | 28 | python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools 29 | 
python3 -m pip install \ 30 | --verbose \ 31 | --config-settings cmake.args='-DCMAKE_BUILD_TYPE=Debug;-DCMAKE_CXX_FLAGS=-g3;-DCMAKE_C_FLAGS=-g3' \ 32 | --config-settings cmake.verbose=true \ 33 | --config-settings logging.level=INFO \ 34 | --config-settings install.strip=false \ 35 | --editable . 36 | - name: Test with pytest 37 | run: | 38 | python -m pytest -s -vvvv 39 | 40 | build-windows: 41 | runs-on: windows-latest 42 | strategy: 43 | matrix: 44 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 45 | 46 | steps: 47 | - uses: actions/checkout@v4 48 | with: 49 | submodules: "true" 50 | - name: Set up Python ${{ matrix.python-version }} 51 | uses: actions/setup-python@v5 52 | with: 53 | python-version: ${{ matrix.python-version }} 54 | 55 | - name: Install dependencies 56 | run: | 57 | python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools 58 | python3 -m pip install --verbose --config-settings cmake.args='-DCMAKE_BUILD_TYPE=Debug;-DCMAKE_CXX_FLAGS=-g3;-DCMAKE_C_FLAGS=-g3' --config-settings cmake.verbose=true --config-settings logging.level=INFO --config-settings install.strip=false --editable . 59 | - name: Test with pytest 60 | run: | 61 | python -m pytest -s -vvvv 62 | 63 | build-macos: 64 | runs-on: macos-13 65 | strategy: 66 | matrix: 67 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 68 | 69 | steps: 70 | - uses: actions/checkout@v4 71 | with: 72 | submodules: "true" 73 | - name: Set up Python ${{ matrix.python-version }} 74 | uses: actions/setup-python@v5 75 | with: 76 | python-version: ${{ matrix.python-version }} 77 | - name: Install dependencies 78 | run: | 79 | python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools 80 | python3 -m pip install \ 81 | --verbose \ 82 | --config-settings cmake.args='-DCMAKE_BUILD_TYPE=Debug;-DCMAKE_CXX_FLAGS=-g3;-DCMAKE_C_FLAGS=-g3' \ 83 | --config-settings cmake.verbose=true \ 84 | --config-settings logging.level=INFO \ 85 | --config-settings install.strip=false \ 86 | --editable . 87 | - name: Test with pytest 88 | run: | 89 | python -m pytest -s -vvvv 90 | 91 | build-macos-metal: 92 | runs-on: macos-13 93 | 94 | steps: 95 | - uses: actions/checkout@v4 96 | with: 97 | submodules: "true" 98 | - name: Set up Python 99 | uses: actions/setup-python@v5 100 | with: 101 | python-version: "3.8" 102 | - name: Install dependencies 103 | run: | 104 | python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools 105 | python3 -m pip install \ 106 | --verbose \ 107 | --config-settings cmake.args='-DCMAKE_BUILD_TYPE=Debug;-DCMAKE_CXX_FLAGS=-g3;-DCMAKE_C_FLAGS=-g3;-DGGML_METAL=On' \ 108 | --config-settings cmake.verbose=true \ 109 | --config-settings logging.level=INFO \ 110 | --config-settings install.strip=false \ 111 | --editable . 
112 | - name: Test with pytest 113 | run: | 114 | python -m pytest -s -vvvv 115 | -------------------------------------------------------------------------------- /.github/workflows/wheels-cuda.yaml: -------------------------------------------------------------------------------- 1 | name: Wheels CUDA 2 | 3 | on: workflow_dispatch 4 | 5 | permissions: 6 | contents: write 7 | 8 | jobs: 9 | define_matrix: 10 | name: Define Build Matrix 11 | runs-on: ubuntu-latest 12 | outputs: 13 | matrix: ${{ steps.set-matrix.outputs.matrix }} 14 | defaults: 15 | run: 16 | shell: pwsh 17 | 18 | steps: 19 | - name: Define Job Output 20 | id: set-matrix 21 | run: | 22 | $matrix = @{ 23 | 'os' = @('ubuntu-20.04', 'windows-latest') 24 | 'pyver' = @("3.10", "3.11", "3.12") 25 | 'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1") 26 | 'releasetag' = @("basic") 27 | } 28 | 29 | $matrixOut = ConvertTo-Json $matrix -Compress 30 | Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT 31 | 32 | build_wheels: 33 | name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} 34 | needs: define_matrix 35 | runs-on: ${{ matrix.os }} 36 | strategy: 37 | matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }} 38 | defaults: 39 | run: 40 | shell: pwsh 41 | env: 42 | CUDAVER: ${{ matrix.cuda }} 43 | AVXVER: ${{ matrix.releasetag }} 44 | 45 | steps: 46 | - uses: actions/checkout@v4 47 | with: 48 | submodules: "recursive" 49 | 50 | - uses: actions/setup-python@v5 51 | with: 52 | python-version: ${{ matrix.pyver }} 53 | 54 | - name: Setup Mamba 55 | uses: conda-incubator/setup-miniconda@v3.0.4 56 | with: 57 | activate-environment: "build" 58 | python-version: ${{ matrix.pyver }} 59 | miniforge-variant: Mambaforge 60 | miniforge-version: latest 61 | use-mamba: true 62 | add-pip-as-python-dependency: true 63 | auto-activate-base: false 64 | 65 | - name: VS Integration Cache 66 | id: vs-integration-cache 67 | if: runner.os == 'Windows' 68 | uses: actions/cache@v4.0.2 69 | with: 70 | path: ./MSBuildExtensions 71 | key: cuda-${{ matrix.cuda }}-vs-integration 72 | 73 | - name: Get Visual Studio Integration 74 | if: runner.os == 'Windows' && steps.vs-integration-cache.outputs.cache-hit != 'true' 75 | run: | 76 | if ($env:CUDAVER -eq '12.1.1') {$x = '12.1.0'} else {$x = $env:CUDAVER} 77 | $links = (Invoke-RestMethod 'https://raw.githubusercontent.com/Jimver/cuda-toolkit/master/src/links/windows-links.ts').Trim().split().where({$_ -ne ''}) 78 | for ($i=$q=0;$i -lt $links.count -and $q -lt 2;$i++) {if ($links[$i] -eq "'$x',") {$q++}} 79 | Invoke-RestMethod $links[$i].Trim("'") -OutFile 'cudainstaller.zip' 80 | & 'C:\Program Files\7-Zip\7z.exe' e cudainstaller.zip -oMSBuildExtensions -r *\MSBuildExtensions\* > $null 81 | Remove-Item 'cudainstaller.zip' 82 | 83 | - name: Install Visual Studio Integration 84 | if: runner.os == 'Windows' 85 | run: | 86 | $y = (gi '.\MSBuildExtensions').fullname + '\*' 87 | (gi 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Microsoft\VC\*\BuildCustomizations').fullname.foreach({cp $y $_}) 88 | $cupath = 'CUDA_PATH_V' + $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','_') 89 | echo "$cupath=$env:CONDA_PREFIX" >> $env:GITHUB_ENV 90 | 91 | - name: Install Dependencies 92 | env: 93 | MAMBA_DOWNLOAD_FAILFAST: "0" 94 | MAMBA_NO_LOW_SPEED_LIMIT: "1" 95 | run: | 96 | $cudaVersion = $env:CUDAVER 97 | mamba install -y 'cuda' -c nvidia/label/cuda-$cudaVersion 98 | python -m pip install build wheel 
99 | 100 | - name: Build Wheel 101 | run: | 102 | $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') 103 | $env:CUDA_PATH = $env:CONDA_PREFIX 104 | $env:CUDA_HOME = $env:CONDA_PREFIX 105 | $env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX 106 | if ($IsLinux) { 107 | $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH 108 | } 109 | $env:VERBOSE = '1' 110 | $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all' 111 | $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON $env:CMAKE_ARGS" 112 | if ($env:AVXVER -eq 'AVX') { 113 | $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' 114 | } 115 | if ($env:AVXVER -eq 'AVX512') { 116 | $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' 117 | } 118 | if ($env:AVXVER -eq 'basic') { 119 | $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' 120 | } 121 | $buildtag = "-cu$cudaVersion" 122 | python -m build --wheel 123 | Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV 124 | 125 | - uses: softprops/action-gh-release@v2 126 | with: 127 | files: dist/* 128 | tag_name: ${{ github.ref_name }}-cu${{ env.CUDA_VERSION }} 129 | env: 130 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 131 | -------------------------------------------------------------------------------- /.github/workflows/wheels-index.yaml: -------------------------------------------------------------------------------- 1 | name: Wheels Index 2 | 3 | on: 4 | # Trigger on any new release 5 | release: 6 | types: [published] 7 | 8 | # Allows you to run this workflow manually from the Actions tab 9 | workflow_dispatch: 10 | 11 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 12 | permissions: 13 | contents: read 14 | pages: write 15 | id-token: write 16 | 17 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 18 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 
19 | concurrency: 20 | group: "pages" 21 | cancel-in-progress: false 22 | 23 | jobs: 24 | # Single deploy job since we're just deploying 25 | deploy: 26 | environment: 27 | name: github-pages 28 | url: ${{ steps.deployment.outputs.page_url }} 29 | runs-on: ubuntu-latest 30 | steps: 31 | - name: Checkout 32 | uses: actions/checkout@v4 33 | - name: Setup Pages 34 | uses: actions/configure-pages@v5 35 | - name: Build 36 | run: | 37 | ./scripts/releases-to-pep-503.sh index/whl/cpu '^[v]?[0-9]+\.[0-9]+\.[0-9]+$' 38 | ./scripts/releases-to-pep-503.sh index/whl/cu121 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu121$' 39 | ./scripts/releases-to-pep-503.sh index/whl/cu122 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu122$' 40 | ./scripts/releases-to-pep-503.sh index/whl/cu123 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu123$' 41 | ./scripts/releases-to-pep-503.sh index/whl/cu124 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$' 42 | ./scripts/releases-to-pep-503.sh index/whl/metal '^[v]?[0-9]+\.[0-9]+\.[0-9]+-metal$' 43 | - name: Upload artifact 44 | uses: actions/upload-pages-artifact@v3 45 | with: 46 | # Upload entire repository 47 | path: "index" 48 | - name: Deploy to GitHub Pages 49 | id: deployment 50 | uses: actions/deploy-pages@v4 51 | -------------------------------------------------------------------------------- /.github/workflows/wheels-metal.yaml: -------------------------------------------------------------------------------- 1 | name: Wheels Metal 2 | 3 | on: workflow_dispatch 4 | 5 | permissions: 6 | contents: write 7 | 8 | jobs: 9 | define_matrix: 10 | name: Define Build Matrix 11 | runs-on: ubuntu-latest 12 | outputs: 13 | matrix: ${{ steps.set-matrix.outputs.matrix }} 14 | defaults: 15 | run: 16 | shell: pwsh 17 | 18 | steps: 19 | - name: Define Job Output 20 | id: set-matrix 21 | run: | 22 | $matrix = @{ 23 | 'os' = @('macos-11', 'macos-12', 'macos-13') 24 | 'pyver' = @('3.10', '3.11', '3.12') 25 | } 26 | 27 | $matrixOut = ConvertTo-Json $matrix -Compress 28 | Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT 29 | 30 | build_wheels: 31 | name: ${{ matrix.os }} Python ${{ matrix.pyver }} 32 | needs: define_matrix 33 | runs-on: ${{ matrix.os }} 34 | strategy: 35 | matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }} 36 | env: 37 | OSVER: ${{ matrix.os }} 38 | 39 | steps: 40 | - uses: actions/checkout@v4 41 | with: 42 | submodules: "recursive" 43 | 44 | - uses: actions/setup-python@v5 45 | with: 46 | python-version: ${{ matrix.pyver }} 47 | 48 | - name: Install Dependencies 49 | run: | 50 | python -m pip install build wheel cmake 51 | 52 | - name: Build Wheel 53 | run: | 54 | XCODE15PATH="/Applications/Xcode_15.0.app/Contents/Developer" 55 | XCODE15BINPATH="${XCODE15PATH}/Toolchains/XcodeDefault.xctoolchain/usr/bin" 56 | export CMAKE_ARGS="-DGGML_NATIVE=off -DGGML_METAL=on" 57 | [[ "$OSVER" == "macos-13" ]] && export CC="${XCODE15BINPATH}/cc" && export CXX="${XCODE15BINPATH}/c++" && export MACOSX_DEPLOYMENT_TARGET="13.0" 58 | [[ "$OSVER" == "macos-12" ]] && export MACOSX_DEPLOYMENT_TARGET="12.0" 59 | [[ "$OSVER" == "macos-11" ]] && export MACOSX_DEPLOYMENT_TARGET="11.0" 60 | 61 | export CMAKE_OSX_ARCHITECTURES="arm64" && export ARCHFLAGS="-arch arm64" 62 | VERBOSE=1 python -m build --wheel 63 | 64 | if [[ "$OSVER" == "macos-13" ]]; then 65 | export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk" 66 | export MACOSX_DEPLOYMENT_TARGET="14.0" 67 | VERBOSE=1 python -m build --wheel 68 | fi 69 | 70 | for file in ./dist/*.whl; do cp "$file" "${file/arm64.whl/aarch64.whl}"; done 71 | 72 | export 
CMAKE_OSX_ARCHITECTURES="x86_64" && export CMAKE_ARGS="-DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_METAL=on" && export ARCHFLAGS="-arch x86_64" 73 | VERBOSE=1 python -m build --wheel 74 | 75 | if [[ "$OSVER" == "macos-13" ]]; then 76 | export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk" 77 | export MACOSX_DEPLOYMENT_TARGET="14.0" 78 | VERBOSE=1 python -m build --wheel 79 | fi 80 | 81 | - uses: softprops/action-gh-release@v2 82 | with: 83 | files: dist/* 84 | # set release name to -metal 85 | tag_name: ${{ github.ref_name }}-metal 86 | env: 87 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 88 | -------------------------------------------------------------------------------- /.github/workflows/wheels.yaml: -------------------------------------------------------------------------------- 1 | name: Wheels 2 | 3 | on: workflow_dispatch 4 | 5 | permissions: 6 | contents: write 7 | 8 | jobs: 9 | build_wheels: 10 | name: Build wheels on ${{ matrix.os }} 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: [ubuntu-20.04, macos-11, windows-2022] 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | with: 19 | submodules: "recursive" 20 | 21 | - name: Build wheels 22 | uses: pypa/cibuildwheel@v2.18.1 23 | env: 24 | # disable repair 25 | CIBW_REPAIR_WHEEL_COMMAND: "" 26 | # skip building wheels for these platforms 27 | CIBW_SKIP: pp* cp36-* cp37-* *-musllinux* 28 | CMAKE_ARGS: -DGGML_METAL=OFF 29 | with: 30 | package-dir: . 31 | output-dir: wheelhouse 32 | 33 | - uses: actions/upload-artifact@v4 34 | with: 35 | name: wheels-${{ matrix.os }} 36 | path: ./wheelhouse/*.whl 37 | 38 | build_wheels_arm64: 39 | name: Build arm64 wheels 40 | runs-on: ubuntu-latest 41 | steps: 42 | - uses: actions/checkout@v4 43 | with: 44 | submodules: "recursive" 45 | 46 | - name: Set up QEMU 47 | uses: docker/setup-qemu-action@v3 48 | with: 49 | platforms: linux/arm64 50 | 51 | - name: Build wheels 52 | uses: pypa/cibuildwheel@v2.18.1 53 | env: 54 | CIBW_SKIP: "*musllinux* pp*" 55 | CIBW_REPAIR_WHEEL_COMMAND: "" 56 | CIBW_ARCHS: "aarch64" 57 | CIBW_BUILD: "cp38-* cp39-* cp310-* cp311-* cp312-*" 58 | with: 59 | output-dir: wheelhouse 60 | 61 | - name: Upload wheels as artifacts 62 | uses: actions/upload-artifact@v4 63 | with: 64 | name: wheels_arm64 65 | path: ./wheelhouse/*.whl 66 | 67 | release: 68 | name: Release 69 | needs: [build_wheels] 70 | runs-on: ubuntu-latest 71 | 72 | steps: 73 | - uses: actions/download-artifact@v4 74 | with: 75 | merge-multiple: true 76 | path: dist 77 | 78 | - uses: softprops/action-gh-release@v2 79 | with: 80 | files: dist/* 81 | env: 82 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 83 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .local/ 2 | 3 | .vscode/ 4 | 5 | _skbuild/ 6 | 7 | .envrc 8 | 9 | models/ 10 | 11 | MANIFEST.in 12 | 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | *$py.class 17 | 18 | # C extensions 19 | *.so 20 | *.dll 21 | *.dylib 22 | 23 | # Distribution / packaging 24 | .Python 25 | build/ 26 | develop-eggs/ 27 | dist/ 28 | downloads/ 29 | eggs/ 30 | .eggs/ 31 | lib/ 32 | lib64/ 33 | parts/ 34 | sdist/ 35 | var/ 36 | wheels/ 37 | share/python-wheels/ 38 | *.egg-info/ 39 | .installed.cfg 40 | *.egg 41 | MANIFEST 42 | 43 | # PyInstaller 44 | # Usually these files are written by a python script from a template 45 | # before 
PyInstaller builds the exe, so as to inject date/other infos into it. 46 | *.manifest 47 | *.spec 48 | 49 | # Installer logs 50 | pip-log.txt 51 | pip-delete-this-directory.txt 52 | 53 | # Unit test / coverage reports 54 | htmlcov/ 55 | .tox/ 56 | .nox/ 57 | .coverage 58 | .coverage.* 59 | .cache 60 | nosetests.xml 61 | coverage.xml 62 | *.cover 63 | *.py,cover 64 | .hypothesis/ 65 | .pytest_cache/ 66 | cover/ 67 | 68 | # Translations 69 | *.mo 70 | *.pot 71 | 72 | # Django stuff: 73 | *.log 74 | local_settings.py 75 | db.sqlite3 76 | db.sqlite3-journal 77 | 78 | # Flask stuff: 79 | instance/ 80 | .webassets-cache 81 | 82 | # Scrapy stuff: 83 | .scrapy 84 | 85 | # Sphinx documentation 86 | docs/_build/ 87 | 88 | # PyBuilder 89 | .pybuilder/ 90 | target/ 91 | 92 | # Jupyter Notebook 93 | .ipynb_checkpoints 94 | 95 | # IPython 96 | profile_default/ 97 | ipython_config.py 98 | 99 | # pyenv 100 | # For a library or package, you might want to ignore these files since the code is 101 | # intended to run in multiple environments; otherwise, check them in: 102 | # .python-version 103 | 104 | # pipenv 105 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 106 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 107 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 108 | # install all needed dependencies. 109 | #Pipfile.lock 110 | 111 | # poetry 112 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 113 | # This is especially recommended for binary packages to ensure reproducibility, and is more 114 | # commonly ignored for libraries. 115 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 116 | #poetry.lock 117 | 118 | # pdm 119 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 120 | #pdm.lock 121 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 122 | # in version control. 123 | # https://pdm.fming.dev/#use-with-ide 124 | .pdm.toml 125 | 126 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 127 | __pypackages__/ 128 | 129 | # Celery stuff 130 | celerybeat-schedule 131 | celerybeat.pid 132 | 133 | # SageMath parsed files 134 | *.sage.py 135 | 136 | # Environments 137 | .env 138 | .venv 139 | env/ 140 | venv/ 141 | ENV/ 142 | env.bak/ 143 | venv.bak/ 144 | 145 | # Spyder project settings 146 | .spyderproject 147 | .spyproject 148 | 149 | # Rope project settings 150 | .ropeproject 151 | 152 | # mkdocs documentation 153 | /site 154 | 155 | # mypy 156 | .mypy_cache/ 157 | .dmypy.json 158 | dmypy.json 159 | 160 | # Pyre type checker 161 | .pyre/ 162 | 163 | # pytype static type analyzer 164 | .pytype/ 165 | 166 | # Cython debug symbols 167 | cython_debug/ 168 | 169 | # PyCharm 170 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 171 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 172 | # and can be added to the global gitignore or merged into this file. For a more nuclear 173 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
174 | .idea/ 175 | 176 | .direnv/ -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "vendor/ggml"] 2 | path = vendor/ggml 3 | url = https://github.com/ggerganov/ggml 4 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file for MkDocs projects 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | # Set the version of Python and other tools you might need 8 | build: 9 | os: ubuntu-22.04 10 | tools: 11 | python: "3.11" 12 | 13 | mkdocs: 14 | configuration: mkdocs.yml 15 | 16 | python: 17 | install: 18 | - method: pip 19 | path: . 20 | extra_requirements: 21 | - docs 22 | 23 | submodules: 24 | include: all 25 | recursive: true -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.21) 2 | 3 | project( 4 | ${SKBUILD_PROJECT_NAME} 5 | VERSION ${SKBUILD_PROJECT_VERSION} 6 | ) 7 | 8 | message(SKBUILD_STATE="${SKBUILD_STATE}") 9 | 10 | if(SKBUILD_STATE STREQUAL "editable") 11 | # Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374 12 | set(GGML_PYTHON_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/ggml/lib) 13 | else() 14 | set(GGML_PYTHON_INSTALL_DIR ${SKBUILD_PLATLIB_DIR}/ggml/lib) 15 | endif() 16 | 17 | set(BUILD_SHARED_LIBS "On") 18 | set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) 19 | if (APPLE) 20 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm64") 21 | set(GGML_AVX "Off" CACHE BOOL "ggml: enable AVX" FORCE) 22 | set(GGML_AVX2 "Off" CACHE BOOL "ggml: enable AVX2" FORCE) 23 | set(GGML_FMA "Off" CACHE BOOL "ggml: enable FMA" FORCE) 24 | set(GGML_F16C "Off" CACHE BOOL "ggml: enable F16C" FORCE) 25 | endif() 26 | 27 | set(GGML_METAL_EMBED_LIBRARY "On" CACHE BOOL "ggml: embed metal library" FORCE) 28 | endif() 29 | add_subdirectory(vendor/ggml) 30 | install( 31 | TARGETS ggml 32 | ARCHIVE DESTINATION ${GGML_PYTHON_INSTALL_DIR} 33 | LIBRARY DESTINATION ${GGML_PYTHON_INSTALL_DIR} 34 | RUNTIME DESTINATION ${GGML_PYTHON_INSTALL_DIR} 35 | FRAMEWORK DESTINATION ${GGML_PYTHON_INSTALL_DIR} 36 | RESOURCE DESTINATION ${GGML_PYTHON_INSTALL_DIR} 37 | ) 38 | install( 39 | FILES $ 40 | DESTINATION ${GGML_PYTHON_INSTALL_DIR} 41 | ) 42 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Andrei Betlen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | submodules = vendor/ggml 2 | 3 | all: build 4 | 5 | ${submodules}: 6 | git submodule update --init --recursive 7 | 8 | update-pip: 9 | python3 -m pip install --upgrade pip 10 | 11 | build: ${submodules} update-pip ## Build ggml-python with cpu support 12 | python3 -m pip install --verbose --editable . 13 | 14 | build.debug: ${submodules} update-pip ## Build ggml-python with cpu support, debug symbols, and lines 15 | python3 -m pip install \ 16 | --verbose \ 17 | --config-settings cmake.args='-DCMAKE_BUILD_TYPE=Debug;-DCMAKE_CXX_FLAGS=-g3;-DCMAKE_C_FLAGS=-g3' \ 18 | --config-settings cmake.verbose=true \ 19 | --config-settings logging.level=INFO \ 20 | --config-settings install.strip=false \ 21 | --editable . 22 | 23 | build.openblas: ${submodules} update-pip ## Build ggml-python with openblas support 24 | python3 -m pip install \ 25 | --verbose \ 26 | --config-settings cmake.args='-DGGML_OPENBLAS=On' \ 27 | --editable . 28 | 29 | build.cuda: ${submodules} update-pip ## Build ggml-python with cublas / cuda support 30 | python3 -m pip install \ 31 | --verbose \ 32 | --config-settings cmake.args='-DGGML_CUDA=On' \ 33 | --editable . 34 | 35 | build.clblast: ${submodules} update-pip ## Build ggml-python with clblast / opencl support 36 | python3 -m pip install \ 37 | --verbose \ 38 | --config-settings cmake.args='-DGGML_CLBLAST=On' \ 39 | --editable . 
40 | 41 | sdist: ## Build source distribution 42 | python3 -m build --sdist 43 | 44 | deploy: ## Deploy to pypi 45 | twine upload dist/* 46 | 47 | test: ## Run tests 48 | python3 -m pytest 49 | 50 | test.gdb: ## Run tests with gdb 51 | gdb -ex "set pagination off" -ex r -ex "bt 5" --args python -m pytest -s -vvvv 52 | 53 | docs: ## Build documentation using mkdocs and serve it 54 | mkdocs serve 55 | 56 | clean: ## Clean build artifacts 57 | - rm -rf build 58 | - rm -rf dist 59 | - rm ggml/*.so 60 | - rm ggml/*.dll 61 | - rm ggml/*.dylib 62 | - rm ${submodules}/*.so 63 | - rm ${submodules}/*.dll 64 | - rm ${submodules}/*.dylib 65 | - cd ${submodules} && make clean 66 | 67 | help: ## Prints help menu 68 | @grep -E '^[\.a-zA-Z_-]+:.*?## .*$$' Makefile | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 69 | 70 | .PHONY: \ 71 | all \ 72 | build \ 73 | build.debug \ 74 | build.openblas \ 75 | build.cuda \ 76 | build.clblast \ 77 | sdist \ 78 | deploy \ 79 | test \ 80 | test.gdb \ 81 | docs \ 82 | clean \ 83 | help -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python bindings for [`ggml`](https://github.com/ggerganov/ggml) 2 | 3 | [![Documentation Status](https://readthedocs.org/projects/ggml-python/badge/?version=latest)](https://ggml-python.readthedocs.io/en/latest/?badge=latest) 4 | [![Tests](https://github.com/abetlen/ggml-python/actions/workflows/test.yaml/badge.svg)](https://github.com/abetlen/ggml-python/actions/workflows/test.yaml) 5 | [![PyPI](https://img.shields.io/pypi/v/ggml-python)](https://pypi.org/project/ggml-python/) 6 | [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/ggml-python)](https://pypi.org/project/ggml-python/) 7 | [![PyPI - License](https://img.shields.io/pypi/l/ggml-python)](https://pypi.org/project/ggml-python/) 8 | [![PyPI - Downloads](https://img.shields.io/pypi/dm/ggml-python)](https://pypi.org/project/ggml-python/) 9 | 10 | 11 | Python bindings for the [`ggml`](https://github.com/ggerganov/ggml) tensor library for machine learning. 12 | 13 | > ⚠️ Neither this project nor `ggml` currently guarantee backwards-compatibility, if you are using this library in other applications I strongly recommend pinning to specific releases in your `requirements.txt` file. 14 | 15 | # Documentation 16 | 17 | - [Getting Started](https://ggml-python.readthedocs.io/en/latest/) 18 | - [API Reference](https://ggml-python.readthedocs.io/en/latest/api-reference/) 19 | - [Examples](https://github.com/abetlen/ggml-python/tree/main/examples) 20 | 21 | # Installation 22 | 23 | 24 | Requirements 25 | - Python 3.8+ 26 | - C compiler (gcc, clang, msvc, etc) 27 | 28 | You can install `ggml-python` using `pip`: 29 | 30 | ```bash 31 | pip install ggml-python 32 | ``` 33 | 34 | This will compile ggml using cmake which requires a c compiler installed on your system. 35 | To build ggml with specific features (ie. OpenBLAS, GPU Support, etc) you can pass specific cmake options through the `cmake.args` pip install configuration setting. 
For example to install ggml-python with cuBLAS support you can run: 36 | 37 | ```bash 38 | pip install --upgrade pip 39 | pip install ggml-python --config-settings=cmake.args='-DGGML_CUDA=ON' 40 | ``` 41 | 42 | ## Options 43 | 44 | | Option | Description | Default | 45 | | --- | --- | --- | 46 | | `GGML_CUDA` | Enable cuBLAS support | `OFF` | 47 | | `GGML_CLBLAST` | Enable CLBlast support | `OFF` | 48 | | `GGML_OPENBLAS` | Enable OpenBLAS support | `OFF` | 49 | | `GGML_METAL` | Enable Metal support | `OFF` | 50 | | `GGML_RPC` | Enable RPC support | `OFF` | 51 | 52 | # Usage 53 | 54 | ```python 55 | import ggml 56 | import ctypes 57 | 58 | # Allocate a new context with 16 MB of memory 59 | params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024, mem_buffer=None) 60 | ctx = ggml.ggml_init(params) 61 | 62 | # Instantiate tensors 63 | x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 64 | a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 65 | b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 66 | 67 | # Use ggml operations to build a computational graph 68 | x2 = ggml.ggml_mul(ctx, x, x) 69 | f = ggml.ggml_add(ctx, ggml.ggml_mul(ctx, a, x2), b) 70 | 71 | gf = ggml.ggml_new_graph(ctx) 72 | ggml.ggml_build_forward_expand(gf, f) 73 | 74 | # Set the input values 75 | ggml.ggml_set_f32(x, 2.0) 76 | ggml.ggml_set_f32(a, 3.0) 77 | ggml.ggml_set_f32(b, 4.0) 78 | 79 | # Compute the graph 80 | ggml.ggml_graph_compute_with_ctx(ctx, gf, 1) 81 | 82 | # Get the output value 83 | output = ggml.ggml_get_f32_1d(f, 0) 84 | assert output == 16.0 85 | 86 | # Free the context 87 | ggml.ggml_free(ctx) 88 | ``` 89 | 90 | # Troubleshooting 91 | 92 | If you are having trouble installing `ggml-python` or activating specific features please try to install it with the `--verbose` and `--no-cache-dir` flags to get more information about any issues: 93 | 94 | ```bash 95 | pip install ggml-python --verbose --no-cache-dir --force-reinstall --upgrade 96 | ``` 97 | 98 | # License 99 | 100 | This project is licensed under the terms of the MIT license. 101 | -------------------------------------------------------------------------------- /docs/api-reference.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: API Reference 3 | --- 4 | 5 | ::: ggml.ggml 6 | options: 7 | show_root_full_path: false 8 | filters: 9 | - "^ggml_" 10 | - "^gguf_" 11 | - "^GGML_" 12 | - "^GGUF_" 13 | 14 | ::: ggml.utils -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Getting Started 3 | --- 4 | 5 | ## Introduction 6 | 7 | ggml-python is a python library for working with [ggml](https://github.com/ggerganov/ggml). 8 | 9 | ggml is a tensor library for machine learning developed by Georgi Gerganov, the library has been used to run models like Whisper and LLaMa on a wide range of devices. 10 | ggml is written in C/C++ and is designed to be fast, portable and easily embeddable; making use of various hardware acceleration systems like BLAS, CUDA, OpenCL, and Metal. 11 | ggml supports quantized inference for reduced memory footprint and faster inference. 12 | 13 | You can use ggml-python to: 14 | 15 | - Convert and quantize model weights from Python-based ML frameworks (Pytorch, Tensorflow, etc) to ggml. 16 | - Port existing ML models to ggml and run them from Python. 
17 | 18 | ## Installation 19 | 20 | Requirements 21 | 22 | - Python 3.7+ 23 | - C compiler (gcc, clang, msvc, etc) 24 | 25 | You can install `ggml-python` using `pip`: 26 | 27 | ```bash 28 | pip install ggml-python 29 | ``` 30 | 31 | This will compile ggml using cmake which requires a c compiler installed on your system. 32 | 33 | Below are the available options for building ggml-python with additional options for optimized inference. 34 | 35 | === "**BLAS**" 36 | 37 | ```bash 38 | CMAKE_ARGS="-DGGML_OPENBLAS=ON" pip install ggml-python 39 | ``` 40 | 41 | === "**CUDA**" 42 | 43 | ```bash 44 | CMAKE_ARGS="-DGGML_CUBLAS=ON" pip install ggml-python 45 | ``` 46 | 47 | === "**Metal**" 48 | 49 | ```bash 50 | CMAKE_ARGS="-DGGML_METAL=ON" pip install ggml-python 51 | ``` 52 | 53 | === "**OpenCL**" 54 | 55 | ```bash 56 | CMAKE_ARGS="-DGGML_CLBLAST=ON" pip install ggml-python 57 | ``` 58 | 59 | ## Basic Example 60 | 61 | Below is a simple example of using ggml-python low level api to compute the value of a function. 62 | 63 | ```python 64 | import ggml 65 | import ctypes 66 | 67 | # Allocate a new context with 16 MB of memory 68 | params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024, mem_buffer=None) 69 | ctx = ggml.ggml_init(params) 70 | 71 | # Instantiate tensors 72 | x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 73 | a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 74 | b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 75 | 76 | # Use ggml operations to build a computational graph 77 | x2 = ggml.ggml_mul(ctx, x, x) 78 | f = ggml.ggml_add(ctx, ggml.ggml_mul(ctx, a, x2), b) 79 | 80 | gf = ggml.ggml_new_graph(ctx) 81 | ggml.ggml_build_forward_expand(gf, f) 82 | 83 | # Set the input values 84 | ggml.ggml_set_f32(x, 2.0) 85 | ggml.ggml_set_f32(a, 3.0) 86 | ggml.ggml_set_f32(b, 4.0) 87 | 88 | # Compute the graph 89 | ggml.ggml_graph_compute_with_ctx(ctx, gf, 1) 90 | 91 | # Get the output value 92 | output = ggml.ggml_get_f32_1d(f, 0) 93 | assert output == 16.0 94 | 95 | # Free the context 96 | ggml.ggml_free(ctx) 97 | ``` 98 | 99 | ## Next Steps 100 | 101 | To learn more about ggml-python, check out the following resources: 102 | 103 | - [API Reference](api-reference.md) 104 | - Examples 105 | - [Code Completion Server](https://github.com/abetlen/ggml-python/tree/main/examples/replit) - A code completion server using ggml-python and the replit-code-v1-3b model that you can drop into your editor as a local Github Copilot replacement. 106 | - [CLIP Embeddings](https://github.com/abetlen/ggml-python/tree/main/examples/clip) - A simple example of using ggml-python to implement CLIP text / image embeddings. 107 | 108 | ## Development 109 | 110 | ```bash 111 | git clone https://github.com/abetlen/ggml-python.git 112 | cd ggml-python 113 | # (Optional) Create a virtual environment 114 | python -m venv venv 115 | source venv/bin/activate 116 | # Install dependencies 117 | make build 118 | ``` 119 | 120 | ## Contributing 121 | 122 | If you would like to contribute to ggml-python, please open an issue or submit a pull request on [GitHub](https://github.com/abetlen/ggml-python). 123 | 124 | 125 | ## License 126 | 127 | This project is licensed under the terms of the MIT license. -------------------------------------------------------------------------------- /examples/clip/README.md: -------------------------------------------------------------------------------- 1 | # CLIP Example 2 | 3 | # Setup 4 | 5 | Create a virtual environment and install requirements. 
6 | 7 | ```bash 8 | python3 -m venv venv 9 | source venv/bin/activate 10 | pip install -r requirements.txt 11 | ``` 12 | 13 | Convert the original CLIP model to GGML format. 14 | 15 | ```bash 16 | python convert-pt-to-ggml.py ViT-B/32 ./models 17 | ``` 18 | 19 | The other CLIP vision transformers should work, but have not been tested. Namely: 20 | 21 | - ViT-B/16 22 | - ViT-L/14 23 | - ViT-L/14@336px 24 | 25 | # Usage 26 | 27 | ```python 28 | # This implements the same example as the original project: https://github.com/openai/CLIP#usage 29 | from model import ClipModel 30 | from scipy.special import softmax 31 | from PIL import Image 32 | from utils import tokenize, transform 33 | 34 | 35 | preprocess = transform(224) 36 | # Example image: https://github.com/openai/CLIP/blob/main/CLIP.png 37 | image = preprocess(Image.open("CLIP.png")).unsqueeze(0) 38 | text = tokenize(["a diagram", "a dog", "a cat"]) 39 | 40 | # Initialize Model 41 | model_file = "models/ViT-B-32.ggml" 42 | model = ClipModel.init_from_file(model_file, n_threads=1) 43 | 44 | # Features are computed one at a time, batching not supported yet 45 | text_features = model.encode_text(text) 46 | 47 | # Only single image supported in ggml right now 48 | image_features = model.encode_image(image) 49 | 50 | logits_per_image, logits_per_text = model(image, text) 51 | 52 | probs = softmax(logits_per_image) 53 | 54 | print("Label probs:", probs) # prints: [[0.9927937 0.00421068 0.00299572]] 55 | ``` 56 | -------------------------------------------------------------------------------- /examples/clip/convert-pt-to-ggml.py: -------------------------------------------------------------------------------- 1 | # Convert CLIP model from PyTorch to ggml format 2 | # 3 | # Usage: python convert-pt-to-ggml.py ViT-B-32 ./models 4 | # 5 | # This script loads the specified model and clip assets and saves them in ggml format. 
6 | # The output is a single binary file containing the following information: 7 | # 8 | # - hparams 9 | # - tokenizer vocab 10 | # - model variables 11 | # 12 | # For each variable, write the following: 13 | # 14 | # - Number of dimensions (int) 15 | # - Name length (int) 16 | # - Dimensions (int[n_dims]) 17 | # - Name (char[name_length]) 18 | # - Data (float[n_dims]) 19 | # 20 | 21 | import os 22 | import sys 23 | import struct 24 | import gzip 25 | import numpy as np 26 | import clip 27 | 28 | if len(sys.argv) < 3: 29 | print("Usage: convert-pt-to-ggml.py clip_model dir-output\n") 30 | sys.exit(1) 31 | 32 | clip_model = sys.argv[1] 33 | dir_out = sys.argv[2] 34 | 35 | # CLIP repo needs to exist at the root directory 36 | MODELS = clip.clip._MODELS 37 | model_filename = os.path.basename(MODELS[clip_model]).replace(".pt", "") 38 | 39 | model = clip.load(clip_model, device="cpu") 40 | state_dict = model[0].state_dict() 41 | 42 | # output in the same directory as the model 43 | fname_out = os.path.join(dir_out, model_filename + ".ggml") 44 | os.makedirs(dir_out, exist_ok=True) 45 | 46 | fout = open(fname_out, "wb") 47 | 48 | # Get HParams 49 | # Only ViT models supported for now 50 | vit = True 51 | if vit: 52 | vision_width = state_dict["visual.conv1.weight"].shape[0] 53 | vision_layers = len( 54 | [ 55 | k 56 | for k in state_dict.keys() 57 | if k.startswith("visual.") and k.endswith(".attn.in_proj_weight") 58 | ] 59 | ) 60 | vision_patch_size = state_dict["visual.conv1.weight"].shape[-1] 61 | grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5) 62 | image_resolution = vision_patch_size * grid_size 63 | embed_dim = state_dict["text_projection"].shape[1] 64 | context_length = state_dict["positional_embedding"].shape[0] 65 | vocab_size = state_dict["token_embedding.weight"].shape[0] 66 | transformer_width = state_dict["ln_final.weight"].shape[0] 67 | transformer_heads = transformer_width // 64 68 | transformer_layers = len( 69 | set(k.split(".")[2] for k in state_dict if k.startswith("transformer.resblocks")) 70 | ) 71 | print("HParams:") 72 | print(" vision_width:", vision_width) 73 | print(" vision_layers:", vision_layers) 74 | print(" vision_patch_size:", vision_patch_size) 75 | print(" grid_size:", grid_size) 76 | print(" image_resolution:", image_resolution) 77 | print(" embed_dim:", embed_dim) 78 | print(" context_length:", context_length) 79 | print(" vocab_size:", vocab_size) 80 | print(" transformer_width:", transformer_width) 81 | print(" transformer_heads:", transformer_heads) 82 | print(" transformer_layers:", transformer_layers) 83 | 84 | 85 | ftype = 0 86 | 87 | # Write hparams 88 | fout.write(struct.pack("i", 0x67676D6C)) # magic: ggml in hex 89 | fout.write(struct.pack("i", vision_width)) 90 | fout.write(struct.pack("i", vision_layers)) 91 | fout.write(struct.pack("i", vision_patch_size)) 92 | fout.write(struct.pack("i", grid_size)) 93 | fout.write(struct.pack("i", image_resolution)) 94 | fout.write(struct.pack("i", embed_dim)) 95 | fout.write(struct.pack("i", context_length)) 96 | fout.write(struct.pack("i", transformer_width)) 97 | fout.write(struct.pack("i", transformer_heads)) 98 | fout.write(struct.pack("i", transformer_layers)) 99 | fout.write(struct.pack("i", ftype)) # ftype: 0 = float32, 1 = float16 100 | 101 | bpe_path = os.path.join(os.path.dirname(clip.__file__), "bpe_simple_vocab_16e6.txt.gz") 102 | merges = gzip.open(bpe_path).read().decode("utf-8").split("\n") 103 | merges = merges[1 : 49152 - 256 - 2 + 1] 104 | merges = 
[tuple(merge.split()) for merge in merges] 105 | 106 | vocab = list(clip.simple_tokenizer.bytes_to_unicode().values()) 107 | tokens = vocab + [v + "" for v in vocab] 108 | for merge in merges: 109 | tokens.append("".join(merge)) 110 | tokens.extend(["<|startoftext|>", "<|endoftext|>"]) 111 | # byte_decoder = {v: k for k, v in clip.simple_tokenizer.bytes_to_unicode().items()} 112 | 113 | fout.write(struct.pack("i", len(tokens))) 114 | 115 | for key in tokens: 116 | text = key.encode("utf-8") 117 | fout.write(struct.pack("i", len(text))) 118 | fout.write(text) 119 | 120 | for name in state_dict.keys(): 121 | data = state_dict[name].squeeze().numpy() 122 | print("Processing variable: " + name + " with shape: ", data.shape) 123 | # ftype == 0 -> float32, ftype == 1 -> float16 124 | ftype = 0 125 | 126 | if name == "visual.conv1.weight": 127 | data = data.astype(np.float16) 128 | ftype = 1 129 | n_dims = len(data.shape) 130 | 131 | # header 132 | str = name.encode("utf-8") 133 | fout.write(struct.pack("iii", n_dims, len(str), ftype)) 134 | for i in range(n_dims): 135 | fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) 136 | fout.write(str) 137 | 138 | # data 139 | data.tofile(fout) 140 | 141 | fout.close() 142 | 143 | print("Done. Output file: " + fname_out) 144 | print("") 145 | -------------------------------------------------------------------------------- /examples/clip/model.py: -------------------------------------------------------------------------------- 1 | """ggml-python implemention of the CLIP model 2 | """ 3 | import io 4 | import os 5 | import ctypes 6 | import struct 7 | import argparse 8 | import numpy as np 9 | from typing import List, Tuple, Dict 10 | import ggml 11 | from ggml.experimental import GGML_FTYPE, Context, InitParams, Tensor, GGML_TYPE, CGraph 12 | 13 | 14 | def compute_ctx_size(fin: io.BufferedReader) -> int: 15 | # Save current position in file and get file size, then return 16 | position = fin.tell() 17 | 18 | ctx_size = 0 19 | while True: 20 | nbytes = struct.calcsize("iii") 21 | data = fin.read(nbytes) 22 | if len(data) != nbytes: 23 | break 24 | (n_dims, s_len, ftype) = struct.unpack("iii", data) 25 | dims = struct.unpack("i" * n_dims, fin.read(struct.calcsize("i" * n_dims))) 26 | if ftype == 0: 27 | _format = "f" 28 | if ftype == 1: 29 | _format = "e" 30 | n_bytes = struct.calcsize(_format * int(np.prod(dims))) 31 | ctx_size += n_bytes 32 | ctx_size += 256 # Padding? 
33 | name = fin.read(s_len).decode("utf-8") 34 | # print(f"Name: {name}, dims: {dims}, n_bytes: {n_bytes}") 35 | 36 | fin.seek(n_bytes, os.SEEK_CUR) 37 | 38 | # Seek back to saved position 39 | fin.seek(position) 40 | return ctx_size 41 | 42 | 43 | class ResidualAttentionBlock: 44 | def __init__( 45 | self, 46 | ctx: Context, 47 | wtype: GGML_TYPE, 48 | embed_dim: int, 49 | heads: int, 50 | use_attn_mask: bool = False, 51 | ): 52 | self.tensors: Dict[str, Tensor] = {} 53 | self.n_head = heads 54 | self.embed_dim = embed_dim 55 | self.use_attn_mask = use_attn_mask 56 | # Layer Norm 1 (ln_1) 57 | self.ln_1_weight = Tensor.new_tensor_1d(wtype, embed_dim, ctx=ctx) 58 | self.ln_1_bias = Tensor.new_tensor_1d(wtype, embed_dim, ctx=ctx) 59 | self.tensors["ln_1.weight"] = self.ln_1_weight 60 | self.tensors["ln_1.bias"] = self.ln_1_bias 61 | 62 | # Attention Block (attn) 63 | self.in_proj_weight = Tensor.new_tensor_2d( 64 | wtype, embed_dim, 3 * embed_dim, ctx=ctx 65 | ) 66 | self.in_proj_bias = Tensor.new_tensor_1d(wtype, 3 * embed_dim, ctx=ctx) 67 | self.out_proj_weight = Tensor.new_tensor_2d( 68 | wtype, embed_dim, embed_dim, ctx=ctx 69 | ) 70 | self.out_proj_bias = Tensor.new_tensor_1d(wtype, embed_dim, ctx=ctx) 71 | self.tensors["attn.in_proj_weight"] = self.in_proj_weight 72 | self.tensors["attn.in_proj_bias"] = self.in_proj_bias 73 | self.tensors["attn.out_proj.weight"] = self.out_proj_weight 74 | self.tensors["attn.out_proj.bias"] = self.out_proj_bias 75 | 76 | # Layer Norm 2 (ln_2) 77 | self.ln_2_weight = Tensor.new_tensor_1d(wtype, embed_dim, ctx=ctx) 78 | self.ln_2_bias = Tensor.new_tensor_1d(wtype, embed_dim, ctx=ctx) 79 | self.tensors["ln_2.weight"] = self.ln_2_weight 80 | self.tensors["ln_2.bias"] = self.ln_2_bias 81 | 82 | # MLP (mlp) 83 | self.mlp_c_fc_weight = Tensor.new_tensor_2d( 84 | wtype, embed_dim, embed_dim * 4, ctx=ctx 85 | ) 86 | self.mlp_c_fc_bias = Tensor.new_tensor_1d(wtype, embed_dim * 4, ctx=ctx) 87 | self.mlp_c_proj_weight = Tensor.new_tensor_2d( 88 | wtype, embed_dim * 4, embed_dim, ctx=ctx 89 | ) 90 | self.mlp_c_proj_bias = Tensor.new_tensor_1d(wtype, embed_dim, ctx=ctx) 91 | self.tensors["mlp.c_fc.weight"] = self.mlp_c_fc_weight 92 | self.tensors["mlp.c_fc.bias"] = self.mlp_c_fc_bias 93 | self.tensors["mlp.c_proj.weight"] = self.mlp_c_proj_weight 94 | self.tensors["mlp.c_proj.bias"] = self.mlp_c_proj_bias 95 | 96 | @staticmethod 97 | def compute_forward_mem_size( 98 | N: int, width: int, n_heads: int, use_attn_mask: bool = False 99 | ) -> int: 100 | e_size = 4 101 | ggml_overhead = 256 102 | mem_size = 0 103 | mem_size += ( 104 | e_size * width * N + ggml_overhead 105 | ) * 5 # ln_1: repeat, repeat, mul, add, norm 106 | mem_size += ( 107 | e_size * width * 3 * N + ggml_overhead 108 | ) * 3 # in_proj: mul_mat, repeat, add 109 | mem_size += ggml_overhead * 3 # view_2d: Qcur, Kcur, Vcur 110 | mem_size += ( 111 | (e_size * (width // n_heads) * n_heads * N + ggml_overhead) 112 | + 2 * ggml_overhead 113 | ) * 2 # K,Q: new_tensor, cpy, permute 114 | mem_size += e_size * N * N * n_heads + ggml_overhead # KQ 115 | mem_size += e_size * 4 + 256 # KQ_scaled: new_f32 116 | mem_size += e_size * N * N * n_heads + ggml_overhead # KQ_scaled 117 | if use_attn_mask: 118 | mem_size += ( 119 | e_size * N * N * n_heads + ggml_overhead + e_size * 4 + ggml_overhead 120 | ) # diag_mask_inf 121 | mem_size += e_size * N * N * n_heads + ggml_overhead # KQ_soft_max 122 | mem_size += ( 123 | e_size * (width // n_heads) * n_heads * N + ggml_overhead 124 | ) # V_trans: new_tensor_3d 125 | 
mem_size += ggml_overhead * 2 # V_trans: cpy and permute 126 | mem_size += ( 127 | e_size * (width // n_heads) * n_heads * N + ggml_overhead 128 | ) # V_trans: new_tensor_3d 129 | mem_size += ggml_overhead # V_trans: cpy 130 | mem_size += ( 131 | e_size * (width // n_heads) * n_heads * N + ggml_overhead 132 | ) # KQV: mul_mat 133 | mem_size += ggml_overhead # KQV_merged: permute 134 | mem_size += e_size * width * N + ggml_overhead # KQV_merged: new_tensor_2d 135 | mem_size += ggml_overhead # KQV_merged: cpy 136 | mem_size += ( 137 | e_size * width * N + ggml_overhead 138 | ) * 3 # out_proj: mul_mat, repeat, add 139 | mem_size += e_size * width * N + ggml_overhead # Add residual 140 | mem_size += ( 141 | e_size * width * N + ggml_overhead 142 | ) * 5 # ln_2: norm, add, repeat, repeat, mul 143 | mem_size += ( 144 | e_size * width * 4 * N + ggml_overhead 145 | ) * 3 # MLP: mul_mat, repeat, add 146 | mem_size += (e_size * 4 + 256) * 2 # SiLU: sf_in, sf_out 147 | mem_size += ( 148 | e_size * width * 4 * N + ggml_overhead 149 | ) * 3 # SiLU: scale, silu, scale 150 | mem_size += ( 151 | e_size * width * N + ggml_overhead 152 | ) * 3 # mlp_c_proj: mul_mat, repeat, add 153 | mem_size += e_size * width * N + ggml_overhead # Add Residual 154 | 155 | return mem_size 156 | 157 | def forward(self, inpL: Tensor, ctx: Context, gf: CGraph) -> Tensor: 158 | N = inpL.shape[1] 159 | 160 | # [768, N] 161 | cur = Tensor.norm(inpL, ctx=ctx) 162 | # cur = ln_1_weight * cur + ln_1_bias 163 | # [768, N] 164 | cur = Tensor.add( 165 | Tensor.mul(Tensor.repeat(self.ln_1_weight, cur, ctx=ctx), cur, ctx=ctx), 166 | Tensor.repeat(self.ln_1_bias, cur, ctx=ctx), 167 | ctx=ctx, 168 | ) 169 | 170 | # cur = in_proj_weight * cur + in_proj_bias 171 | # [768, N] - cur (in) 172 | # [2304, 768] - in_proj_weight 173 | # [2304, 1] - in_proj_bias 174 | # [2304, N] - cur (out) 175 | cur = Tensor.mul_mat(self.in_proj_weight, cur, ctx=ctx) 176 | 177 | cur = Tensor.add(Tensor.repeat(self.in_proj_bias, cur, ctx=ctx), cur, ctx=ctx) 178 | 179 | # Self-Attention 180 | n_embd = cur.shape[0] // 3 181 | 182 | Qcur = Tensor.view_2d( 183 | cur, 184 | n_embd, 185 | N, 186 | cur.tensor.contents.nb[1], 187 | 0 * ctypes.sizeof(ctypes.c_float) * n_embd, 188 | ctx=ctx, 189 | ) 190 | 191 | Kcur = Tensor.view_2d( 192 | cur, 193 | n_embd, 194 | N, 195 | cur.tensor.contents.nb[1], 196 | 1 * ctypes.sizeof(ctypes.c_float) * n_embd, 197 | ctx=ctx, 198 | ) 199 | 200 | Vcur = Tensor.view_2d( 201 | cur, 202 | n_embd, 203 | N, 204 | cur.tensor.contents.nb[1], 205 | 2 * ctypes.sizeof(ctypes.c_float) * n_embd, 206 | ctx=ctx, 207 | ) 208 | 209 | Q = Tensor.permute( 210 | Tensor.cpy( 211 | Qcur, 212 | Tensor.new_tensor_3d( 213 | GGML_TYPE.F32, n_embd // self.n_head, self.n_head, N, ctx=ctx 214 | ), 215 | ctx=ctx, 216 | ), 217 | 0, 218 | 2, 219 | 1, 220 | 3, 221 | ctx=ctx, 222 | ) 223 | 224 | K = Tensor.permute( 225 | Tensor.cpy( 226 | Kcur, 227 | Tensor.new_tensor_3d( 228 | GGML_TYPE.F32, n_embd // self.n_head, self.n_head, N, ctx=ctx 229 | ), 230 | ctx=ctx, 231 | ), 232 | 0, 233 | 2, 234 | 1, 235 | 3, 236 | ctx=ctx, 237 | ) 238 | 239 | KQ = Tensor.mul_mat(K, Q, ctx=ctx) 240 | 241 | KQ_scaled = Tensor.scale( 242 | KQ, 243 | Tensor.new_f32( 244 | 1.0 / np.sqrt(float(n_embd) / self.n_head), 245 | ctx=ctx, 246 | ), 247 | ctx=ctx, 248 | ) 249 | if self.use_attn_mask: 250 | KQ_masked = Tensor.diag_mask_inf(KQ_scaled, 0, ctx=ctx) 251 | KQ_soft_max = Tensor.soft_max(KQ_masked, ctx=ctx) 252 | else: 253 | KQ_soft_max = Tensor.soft_max(KQ_scaled, ctx=ctx) 254 | 255 | V_trans = 
Tensor.cpy( 256 | Tensor.permute( 257 | Tensor.cpy( 258 | Vcur, 259 | Tensor.new_tensor_3d( 260 | GGML_TYPE.F32, n_embd // self.n_head, self.n_head, N, ctx=ctx 261 | ), 262 | ctx=ctx, 263 | ), 264 | 1, 265 | 2, 266 | 0, 267 | 3, 268 | ctx=ctx, 269 | ), 270 | Tensor.new_tensor_3d( 271 | GGML_TYPE.F32, N, n_embd // self.n_head, self.n_head, ctx=ctx 272 | ), 273 | ctx=ctx, 274 | ) 275 | 276 | KQV = Tensor.mul_mat(V_trans, KQ_soft_max, ctx=ctx) 277 | 278 | KQV_merged = Tensor.permute( 279 | KQV, 280 | 0, 281 | 2, 282 | 1, 283 | 3, 284 | ctx=ctx, 285 | ) 286 | 287 | cur = Tensor.cpy( 288 | KQV_merged, 289 | Tensor.new_tensor_2d( 290 | GGML_TYPE.F32, 291 | n_embd, 292 | N, 293 | ctx=ctx, 294 | ), 295 | ctx=ctx, 296 | ) 297 | 298 | cur = Tensor.mul_mat( 299 | self.out_proj_weight, 300 | cur, 301 | ctx=ctx, 302 | ) 303 | 304 | cur = Tensor.add(Tensor.repeat(self.out_proj_bias, cur, ctx=ctx), cur, ctx=ctx) 305 | 306 | # Add Residual 307 | inpL = Tensor.add(inpL, cur, ctx=ctx) 308 | 309 | # LN2 310 | cur = Tensor.norm(inpL, ctx=ctx) 311 | cur = Tensor.add( 312 | Tensor.mul(Tensor.repeat(self.ln_2_weight, cur, ctx=ctx), cur, ctx=ctx), 313 | Tensor.repeat(self.ln_2_bias, cur, ctx=ctx), 314 | ctx=ctx, 315 | ) 316 | 317 | # MLP 318 | # c_fc 319 | cur = Tensor.mul_mat(self.mlp_c_fc_weight, cur, ctx=ctx) 320 | cur = Tensor.add(Tensor.repeat(self.mlp_c_fc_bias, cur, ctx=ctx), cur, ctx=ctx) 321 | 322 | # QuickGELU - x * sigmoid(1.702 * x) 323 | cur = Tensor.scale(cur, Tensor.new_f32(1.702, ctx=ctx), ctx=ctx) 324 | 325 | cur = Tensor.silu(cur, ctx=ctx) 326 | 327 | cur = Tensor.scale(cur, Tensor.new_f32(1 / 1.702, ctx=ctx), ctx=ctx) 328 | 329 | # c_proj 330 | cur = Tensor.mul_mat(self.mlp_c_proj_weight, cur, ctx=ctx) 331 | cur = Tensor.add( 332 | Tensor.repeat(self.mlp_c_proj_bias, cur, ctx=ctx), cur, ctx=ctx 333 | ) 334 | 335 | # Add Residual 336 | cur = Tensor.add(inpL, cur, ctx=ctx) 337 | return cur 338 | 339 | 340 | class VisionTransformer: 341 | def __init__( 342 | self, 343 | ctx: Context, 344 | wtype: GGML_TYPE, 345 | input_resolution: int, 346 | patch_size: int, 347 | width: int, 348 | heads: int, 349 | layers: int, 350 | output_dim: int, 351 | ): 352 | self.layers = layers 353 | self.tensors: Dict[str, Tensor] = {} 354 | 355 | # Class Embedding (visual.class_embedding) 356 | self.visual_class_embedding = Tensor.new_tensor_1d(wtype, width, ctx=ctx) 357 | self.tensors["visual.class_embedding"] = self.visual_class_embedding 358 | 359 | # Positional Embedding (visual.positional_embedding) 360 | self.visual_positional_embedding = Tensor.new_tensor_2d( 361 | wtype, width, (input_resolution // patch_size) ** 2 + 1, ctx=ctx 362 | ) 363 | self.tensors["visual.positional_embedding"] = self.visual_positional_embedding 364 | 365 | # Convolutional Layer (visual.conv1.weight) 366 | wtype_f16 = GGML_TYPE(ggml.ggml_ftype_to_ggml_type(ctypes.c_int(1))) 367 | self.visual_conv1_weight = Tensor.new_tensor_4d( 368 | wtype_f16, patch_size, patch_size, 3, width, ctx=ctx 369 | ) 370 | self.tensors["visual.conv1.weight"] = self.visual_conv1_weight 371 | 372 | # pre Layer Norm Weight (visual.ln_pre.weight) 373 | self.visual_ln_pre_weight = Tensor.new_tensor_1d(wtype, width, ctx=ctx) 374 | self.tensors["visual.ln_pre.weight"] = self.visual_ln_pre_weight 375 | 376 | # pre Layer Norm Bias (visual.ln_pre.bias) 377 | self.visual_ln_pre_bias = Tensor.new_tensor_1d(wtype, width, ctx=ctx) 378 | self.tensors["visual.ln_pre.bias"] = self.visual_ln_pre_bias 379 | self.resblocks = [] 380 | for i in range(layers): 381 | resblock = 
ResidualAttentionBlock( 382 | ctx=ctx, wtype=wtype, embed_dim=width, heads=heads, use_attn_mask=False 383 | ) 384 | self.resblocks.append(resblock) 385 | self.tensors.update( 386 | { 387 | f"visual.transformer.resblocks.{i}." + k: v 388 | for k, v in resblock.tensors.items() 389 | } 390 | ) 391 | 392 | # post Layer Norm (visual.ln_post) 393 | self.visual_ln_post_weight = Tensor.new_tensor_1d(wtype, width, ctx=ctx) 394 | self.visual_ln_post_bias = Tensor.new_tensor_1d(wtype, width, ctx=ctx) 395 | self.tensors["visual.ln_post.weight"] = self.visual_ln_post_weight 396 | self.tensors["visual.ln_post.bias"] = self.visual_ln_post_bias 397 | 398 | # Visual Projection (visual.proj) 399 | self.visual_proj = Tensor.new_tensor_2d(wtype, output_dim, width, ctx=ctx) 400 | self.tensors["visual.proj"] = self.visual_proj 401 | 402 | 403 | class ClipModel: 404 | def __init__( 405 | self, 406 | ctx: Context, 407 | wtype: GGML_TYPE, 408 | vision_width: int, 409 | vision_layers: int, 410 | vision_patch_size: int, 411 | image_resolution: int, 412 | embed_dim: int, 413 | context_length: int, 414 | vocab_size: int, 415 | transformer_width: int, 416 | transformer_heads: int, 417 | transformer_layers: int, 418 | n_threads: int, 419 | ): 420 | self.n_threads = n_threads 421 | self.tensors: Dict[str, Tensor] = {} 422 | 423 | # Vision Transformer 424 | self.vision_layers = vision_layers 425 | self.vision_patch_size = vision_patch_size 426 | self.vision_width = vision_width 427 | self.vision_heads = vision_width // 64 428 | self.image_resolution = image_resolution 429 | self.grid_size = image_resolution // vision_patch_size 430 | 431 | # Text Transformer 432 | self.context_length = context_length 433 | self.transformer_width = transformer_width 434 | self.transformer_heads = transformer_heads 435 | self.transformer_layers = transformer_layers 436 | 437 | self.embed_dim = embed_dim 438 | 439 | # Positional Embedding (position_embedding) 440 | self.positional_embedding = Tensor.new_tensor_2d( 441 | wtype, transformer_width, context_length, ctx=ctx 442 | ) 443 | self.tensors["positional_embedding"] = self.positional_embedding 444 | 445 | # Text Projection (text_projection) 446 | self.text_projection = Tensor.new_tensor_2d( 447 | wtype, transformer_width, embed_dim, ctx=ctx 448 | ) 449 | self.tensors["text_projection"] = self.text_projection 450 | 451 | # Logit Scale (logit_scale) 452 | self.logit_scale = Tensor.new_tensor_1d(wtype, 1, ctx=ctx) 453 | self.tensors["logit_scale"] = self.logit_scale 454 | 455 | # Visual Transformer (visual.) 456 | self.visual = VisionTransformer( 457 | ctx=ctx, 458 | wtype=wtype, 459 | input_resolution=image_resolution, 460 | patch_size=vision_patch_size, 461 | width=vision_width, 462 | layers=vision_layers, 463 | heads=self.vision_heads, 464 | output_dim=embed_dim, 465 | ) 466 | self.tensors.update(self.visual.tensors) 467 | 468 | # Transformer (transformer.) 469 | self.transformer_res_blocks = [] 470 | for i in range(transformer_layers): 471 | res_block = ResidualAttentionBlock( 472 | ctx=ctx, 473 | wtype=wtype, 474 | embed_dim=transformer_width, 475 | heads=transformer_heads, 476 | use_attn_mask=True, 477 | ) 478 | self.transformer_res_blocks.append(res_block) 479 | self.tensors.update( 480 | { 481 | f"transformer.resblocks.{i}." 
+ k: v 482 | for k, v in res_block.tensors.items() 483 | } 484 | ) 485 | 486 | # Token Embedding (token_embedding.weight) 487 | self.token_embedding = Tensor.new_tensor_2d( 488 | wtype, transformer_width, vocab_size, ctx=ctx 489 | ) 490 | self.tensors["token_embedding.weight"] = self.token_embedding 491 | 492 | # Final Layer Norm (ln_final.weight) 493 | self.ln_final_weight = Tensor.new_tensor_1d(wtype, transformer_width, ctx=ctx) 494 | self.tensors["ln_final.weight"] = self.ln_final_weight 495 | 496 | # Final Layer Norm (ln_final.bias) 497 | self.ln_final_bias = Tensor.new_tensor_1d(wtype, transformer_width, ctx=ctx) 498 | self.tensors["ln_final.bias"] = self.ln_final_bias 499 | 500 | def encode_image(self, image): 501 | tensor = self._encode_image_internal(image) 502 | return tensor.numpy().copy().reshape(1, -1) 503 | 504 | def encode_text(self, text_embds): 505 | encodings = [] 506 | # TODO: batchify 507 | for text_embd in text_embds: 508 | tensor = self._encode_text_internal(text_embd) 509 | encodings.append(tensor.numpy().copy().reshape(1, -1)) 510 | return np.concatenate(encodings, axis=0) 511 | 512 | def __call__(self, image, text): 513 | image_features = self.encode_image(image) 514 | text_features = self.encode_text(text) 515 | 516 | # normalized features 517 | image_features = image_features / np.linalg.norm( 518 | image_features, axis=1, keepdims=True 519 | ) 520 | text_features = text_features / np.linalg.norm( 521 | text_features, axis=1, keepdims=True 522 | ) 523 | 524 | # cosine similarity as logits 525 | logit_scale = np.exp(self.logit_scale.numpy().copy()) 526 | logits_per_image = logit_scale * image_features @ text_features.T 527 | logits_per_text = logits_per_image.T 528 | 529 | # shape = [global_batch_size, global_batch_size] 530 | return logits_per_image, logits_per_text 531 | 532 | def _text_encoder_compute_forward_memsize(self): 533 | mem_size = 0 534 | e_size = 4 535 | ggml_overhead = 256 536 | mem_size += e_size * self.context_length + ggml_overhead # input embd 537 | 538 | mem_size += ( 539 | e_size * self.context_length * self.embed_dim + ggml_overhead 540 | ) # token embedding 541 | 542 | mem_size += ( 543 | e_size * self.context_length * self.embed_dim + ggml_overhead 544 | ) # add positional embedding 545 | res_block_mem_size = ResidualAttentionBlock.compute_forward_mem_size( 546 | self.context_length, 547 | self.transformer_width, 548 | self.transformer_heads, 549 | use_attn_mask=True, 550 | ) 551 | mem_size += res_block_mem_size * self.transformer_layers 552 | mem_size += ( 553 | e_size * self.transformer_width * self.context_length + ggml_overhead 554 | ) * 5 # ln_final 555 | 556 | mem_size += ggml_overhead # view 557 | 558 | mem_size += e_size * self.embed_dim + ggml_overhead # Text Proj: output 559 | mem_size += ggml_overhead # Text Proj: Transpose 560 | mem_size += ( 561 | e_size * self.embed_dim * self.embed_dim * ggml_overhead 562 | ) # Text Proj: cpy 563 | return mem_size 564 | 565 | def _encode_text_internal(self, embd_inp: np.ndarray): 566 | wtype = GGML_TYPE(ggml.ggml_ftype_to_ggml_type(ctypes.c_int(0))) 567 | N = self.context_length 568 | mem_size = self._text_encoder_compute_forward_memsize() 569 | mem_buffer = np.empty(mem_size, dtype=np.uint8) 570 | init_params = InitParams( 571 | mem_size=mem_size, mem_buffer=mem_buffer.ctypes.data_as(ctypes.c_void_p) 572 | ) 573 | ctx0 = Context(init_params=init_params) 574 | 575 | gf = CGraph(cgraph=ggml.ggml_cgraph(n_threads=self.n_threads), ctx=ctx0) 576 | 577 | embd = Tensor.new_tensor_1d(GGML_TYPE.I32, 
N, ctx=ctx0) 578 | embd.numpy()[:] = np.array(embd_inp, dtype=np.int32) 579 | inpL = Tensor.get_rows(self.token_embedding, embd, ctx=ctx0) 580 | cur = Tensor.add(inpL, self.positional_embedding, ctx=ctx0) 581 | 582 | for il in range(self.transformer_layers): 583 | resblock = self.transformer_res_blocks[il] 584 | cur = resblock.forward(cur, ctx=ctx0, gf=gf) 585 | 586 | cur = Tensor.norm(cur, ctx=ctx0) 587 | cur = Tensor.add( 588 | Tensor.mul( 589 | Tensor.repeat(self.ln_final_weight, cur, ctx=ctx0), 590 | cur, 591 | ctx=ctx0, 592 | ), 593 | Tensor.repeat(self.ln_final_bias, cur, ctx=ctx0), 594 | ctx=ctx0, 595 | ) 596 | 597 | # Use the embedding from the EOT token 598 | eot_idx = embd_inp.argmax() 599 | cur = Tensor.view_2d( 600 | cur, 601 | self.embed_dim, 602 | 1, 603 | cur.tensor.contents.nb[1], 604 | eot_idx * cur.tensor.contents.nb[1], 605 | ctx=ctx0, 606 | ) 607 | 608 | cur = Tensor.mul_mat( 609 | Tensor.cpy( 610 | Tensor.transpose(self.text_projection, ctx=ctx0), 611 | Tensor.new_tensor_2d(wtype, self.embed_dim, self.embed_dim, ctx=ctx0), 612 | ), 613 | cur, 614 | ctx=ctx0, 615 | ) 616 | gf.build_forward_expand(cur) 617 | gf.compute() 618 | return cur 619 | 620 | def _image_encoder_compute_forward_memsize(self): 621 | e_size = 4 622 | N = self.grid_size * self.grid_size + 1 623 | ggml_overhead = 256 624 | 625 | mem_size = 0 626 | mem_size += 256 627 | mem_size += ( 628 | e_size * self.image_resolution * self.image_resolution * 3 + ggml_overhead 629 | ) # image 630 | mem_size += ( 631 | e_size * self.grid_size * self.grid_size * self.vision_width + ggml_overhead 632 | ) # conv 633 | mem_size += e_size * self.vision_width * N + ggml_overhead # concat 634 | 635 | mem_size += ( 636 | e_size * self.vision_width * N + ggml_overhead 637 | ) * 2 # Copy in visual features 638 | mem_size += e_size * 8 + 256 639 | mem_size += 2 * ggml_overhead # cpy and transpose 640 | mem_size += e_size * 8 + 256 # ??? 641 | mem_size += ( 642 | e_size * self.vision_width * N + ggml_overhead 643 | ) # Copy in positional embeddings (new tensor 2d) 644 | mem_size += ( 645 | e_size * self.vision_width * N + ggml_overhead 646 | ) # copy visual features: ret 647 | 648 | mem_size += e_size * self.vision_width * N + ggml_overhead # add 649 | 650 | mem_size += e_size * self.vision_width * N + ggml_overhead # ln_pre: norm 651 | mem_size += e_size * self.vision_width * N + ggml_overhead # ln_pre: repeat 652 | mem_size += e_size * self.vision_width * N + ggml_overhead # ln_pre: repeat 653 | mem_size += e_size * self.vision_width * N + ggml_overhead # ln_pre: mul 654 | mem_size += e_size * self.vision_width * N + ggml_overhead # ln_pre: add 655 | 656 | res_block_mem_size = ResidualAttentionBlock.compute_forward_mem_size( 657 | N, self.vision_width, self.vision_heads, use_attn_mask=False 658 | ) 659 | 660 | mem_size += res_block_mem_size * self.vision_layers 661 | mem_size += ggml_overhead # ln_post: transpose 662 | mem_size += (e_size * self.vision_width + ggml_overhead) * 3 # ln_post 663 | 664 | mem_size += e_size * self.vision_width * self.embed_dim + ggml_overhead 665 | mem_size += ggml_overhead # cpy 666 | mem_size += 159808 # Compute Overhead ?? 
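        # NOTE: the figures above are rough upper bounds rather than exact
        # sizes: every intermediate tensor is costed as F32 (e_size = 4 bytes
        # per element) plus a fixed ~256-byte ggml tensor header
        # (ggml_overhead), and the final constant appears to be an empirically
        # chosen cushion for graph bookkeeping rather than a derived value.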
667 | return mem_size 668 | 669 | def _encode_image_internal(self, image): 670 | wtype = GGML_TYPE(ggml.ggml_ftype_to_ggml_type(ctypes.c_int(0))) 671 | 672 | mem_size = self._image_encoder_compute_forward_memsize() 673 | mem_buffer = np.empty(mem_size, dtype=np.uint8) 674 | init_params = InitParams( 675 | mem_size=mem_size, mem_buffer=mem_buffer.ctypes.data_as(ctypes.c_void_p) 676 | ) 677 | ctx0 = Context(init_params=init_params) 678 | 679 | gf = CGraph(cgraph=ggml.ggml_cgraph(n_threads=self.n_threads), ctx=ctx0) 680 | 681 | img_tensor = Tensor.new_tensor_4d( 682 | wtype, 683 | image.shape[3], 684 | image.shape[2], 685 | image.shape[1], 686 | image.shape[0], 687 | ctx=ctx0, 688 | ) 689 | img_tensor.numpy()[:] = image.permute(3, 2, 1, 0) 690 | cur = Tensor.conv_2d_sk_p0( 691 | self.visual.visual_conv1_weight, img_tensor, ctx=ctx0 692 | ) 693 | 694 | cur = Tensor.reshape_2d( 695 | cur, 696 | cur.shape[0] * cur.shape[1], 697 | cur.shape[2], 698 | ctx=ctx0, 699 | ) 700 | 701 | concat = Tensor.new_tensor_2d(wtype, cur.shape[0] + 1, cur.shape[1], ctx=ctx0) 702 | 703 | concat = Tensor.set_1d( 704 | concat, 705 | Tensor.view_1d( 706 | self.visual.visual_class_embedding, 707 | self.visual.visual_class_embedding.shape[0], 708 | 0, 709 | ), 710 | 0, 711 | ctx=ctx0, 712 | ) 713 | 714 | # Copy in the visual features 715 | concat = Tensor.set_2d( 716 | concat, 717 | Tensor.cpy( 718 | Tensor.transpose(cur, ctx=ctx0), 719 | Tensor.new_tensor_2d(wtype, cur.shape[0], cur.shape[1]), 720 | ctx=ctx0, 721 | ), 722 | cur.tensor.contents.nb[1], 723 | self.visual.visual_class_embedding.nbytes(), 724 | ctx=ctx0, 725 | ) 726 | 727 | # Copy in the positional embeddings 728 | cur = Tensor.cpy( 729 | concat, 730 | Tensor.new_tensor_2d(wtype, concat.shape[1], concat.shape[0], ctx=ctx0), 731 | ctx=ctx0, 732 | ) 733 | 734 | pEmb = self.visual.visual_positional_embedding 735 | 736 | cur = Tensor.add(cur, pEmb, ctx=ctx0) 737 | 738 | # ln_pre 739 | cur = Tensor.norm(cur, ctx=ctx0) 740 | 741 | cur = Tensor.add( 742 | Tensor.mul( 743 | Tensor.repeat(self.visual.visual_ln_pre_weight, cur, ctx=ctx0), 744 | cur, 745 | ctx=ctx0, 746 | ), 747 | Tensor.repeat(self.visual.visual_ln_pre_bias, cur, ctx=ctx0), 748 | ctx=ctx0, 749 | ) 750 | 751 | # Transformer 752 | for il in range(self.visual.layers): 753 | resblock = self.visual.resblocks[il] 754 | cur = resblock.forward(cur, ctx=ctx0, gf=gf) 755 | 756 | # ln_post 757 | cur = Tensor.norm( 758 | Tensor.view_2d(Tensor.transpose(cur, ctx=ctx0), cur.shape[0], 1, 1, 0), 759 | ctx=ctx0, 760 | ) 761 | 762 | cur = Tensor.add( 763 | Tensor.mul( 764 | Tensor.repeat(self.visual.visual_ln_post_weight, cur, ctx=ctx0), 765 | cur, 766 | ctx=ctx0, 767 | ), 768 | Tensor.repeat(self.visual.visual_ln_post_bias, cur, ctx=ctx0), 769 | ctx=ctx0, 770 | ) 771 | 772 | # Token Projection 773 | cur = Tensor.mul_mat( 774 | Tensor.cpy( 775 | Tensor.transpose(self.visual.visual_proj), 776 | Tensor.new_tensor_2d( 777 | wtype, 778 | self.visual.visual_proj.shape[1], 779 | self.visual.visual_proj.shape[0], 780 | ctx=ctx0, 781 | ), 782 | ctx=ctx0, 783 | ), 784 | Tensor.reshape_2d(cur, cur.shape[0], 1), 785 | ctx=ctx0, 786 | ) 787 | 788 | gf.build_forward_expand(cur) 789 | gf.compute() 790 | 791 | return cur 792 | 793 | @staticmethod 794 | def init_from_file(model_file: str, verbose=True, n_threads=1): 795 | with open(model_file, "rb") as fin: 796 | # Magic Number 797 | (magic,) = struct.unpack("i", (fin.read(struct.calcsize("i")))) 798 | 799 | assert magic == ggml.GGML_FILE_MAGIC 800 | if verbose: 801 | print("magic 
number =", hex(magic)) 802 | # Hyperparameters 803 | ( 804 | vision_width, 805 | vision_layers, 806 | vision_patch_size, 807 | grid_size, 808 | image_resolution, 809 | embed_dim, 810 | context_length, 811 | transformer_width, 812 | transformer_heads, 813 | transformer_layers, 814 | ftype, 815 | vocab_size, 816 | ) = struct.unpack("iiiiiiiiiiii", fin.read(struct.calcsize("iiiiiiiiiiii"))) 817 | 818 | qntvr = ftype // ggml.GGML_QNT_VERSION_FACTOR 819 | if verbose: 820 | print("vision_width =", vision_width) 821 | print("vision_layers =", vision_layers) 822 | print("vision_patch_size =", vision_patch_size) 823 | print("grid_size =", grid_size) 824 | print("image_resolution =", image_resolution) 825 | print("embed_dim =", embed_dim) 826 | print("context_length =", context_length) 827 | print("transformer_width =", transformer_width) 828 | print("transformer_heads =", transformer_heads) 829 | print("transformer_layers =", transformer_layers) 830 | print("ftype =", ftype) 831 | print("qntvr =", qntvr) 832 | print("vocab_size =", vocab_size) 833 | ftype %= ggml.GGML_QNT_VERSION_FACTOR 834 | ftype = GGML_FTYPE(int(ftype)) 835 | 836 | # Vocabulary 837 | vocab: List[Tuple[int, str]] = [] 838 | for i in range(vocab_size): 839 | (s_len,) = struct.unpack("i", fin.read(struct.calcsize("i"))) 840 | s = fin.read(s_len).decode("utf-8") 841 | vocab.append((i, s)) 842 | 843 | # Model Weights 844 | wtype = GGML_TYPE(ggml.ggml_ftype_to_ggml_type(ctypes.c_int(ftype.value))) 845 | 846 | ctx_size = compute_ctx_size(fin) 847 | 848 | mem_buffer = np.empty(ctx_size, dtype=np.uint8) 849 | init_params = InitParams( 850 | mem_size=ctx_size, 851 | mem_buffer=mem_buffer.ctypes.data_as(ctypes.c_void_p), 852 | ) 853 | ctx = Context(init_params=init_params) 854 | 855 | # Create Model 856 | model = ClipModel( 857 | ctx=ctx, 858 | wtype=wtype, 859 | vision_width=vision_width, 860 | vision_layers=vision_layers, 861 | vision_patch_size=vision_patch_size, 862 | image_resolution=image_resolution, 863 | embed_dim=embed_dim, 864 | context_length=context_length, 865 | vocab_size=vocab_size, 866 | transformer_width=transformer_width, 867 | transformer_heads=transformer_heads, 868 | transformer_layers=transformer_layers, 869 | n_threads=n_threads, 870 | ) 871 | 872 | # Load Weights 873 | while True: 874 | nbytes = struct.calcsize("iii") 875 | data = fin.read(nbytes) 876 | if len(data) != nbytes: 877 | break 878 | (n_dims, s_len, ftype) = struct.unpack("iii", data) 879 | dims = struct.unpack( 880 | "i" * n_dims, fin.read(struct.calcsize("i" * n_dims)) 881 | ) 882 | tensor_name = fin.read(s_len).decode("utf-8") 883 | tensor = model.tensors[tensor_name] 884 | n_elements = tensor.nelements() 885 | expected_n_elements = np.prod(dims) 886 | if n_elements != expected_n_elements: 887 | raise ValueError( 888 | f"tensor {tensor_name} has {n_elements} elements, but {expected_n_elements} were expected" 889 | ) 890 | 891 | buf = (ctypes.c_char * tensor.nbytes()).from_address(tensor.data) 892 | offset = fin.tell() 893 | fname = fin.name.encode("utf-8") 894 | fin.readinto(buf) 895 | 896 | return model 897 | 898 | 899 | if __name__ == "__main__": 900 | parser = argparse.ArgumentParser() 901 | parser.add_argument("-m", "--model", type=str, default=None) 902 | parser.add_argument("--use-gpu", action="store_true") 903 | args = parser.parse_args() 904 | 905 | model_file = args.model 906 | model = ClipModel.init_from_file(model_file, n_threads=1, use_gpu=args.use_gpu) 907 | image = np.random.rand(3, 224, 224).astype(np.float32) 908 | output = 
model.eval([image, image]) 909 | print(output) 910 | -------------------------------------------------------------------------------- /examples/clip/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cpu 2 | certifi==2023.7.22 3 | charset-normalizer==2.1.1 4 | clip==1.0 5 | filelock==3.9.0 6 | ftfy==6.1.1 7 | ggml-python @ git+https://github.com/abetlen/ggml-python@main 8 | idna==3.7 9 | Jinja2==3.1.4 10 | MarkupSafe==2.1.2 11 | mpmath==1.3.0 12 | networkx==3.0 13 | numpy==1.24.1 14 | Pillow==10.3.0 15 | regex==2023.6.3 16 | requests==2.32.0 17 | scipy==1.10.1 18 | sympy==1.11.1 19 | torch==2.0.1+cpu 20 | torchvision==0.15.2+cpu 21 | tqdm==4.66.3 22 | typing-extensions==4.6.3 23 | urllib3==1.26.18 24 | wcwidth==0.2.6 25 | -------------------------------------------------------------------------------- /examples/clip/utils.py: -------------------------------------------------------------------------------- 1 | # These functions were copied directly from 2 | # https://github.com/openai/CLIP/blob/main/clip/simple_tokenizer.py 3 | # and 4 | # https://github.com/openai/CLIP/blob/main/clip/clip.py 5 | 6 | import torch 7 | import gzip 8 | import html 9 | import os 10 | from functools import lru_cache 11 | from typing import Union, List 12 | from pkg_resources import packaging 13 | import ftfy 14 | import regex as re 15 | import clip 16 | from PIL import Image 17 | from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize 18 | 19 | try: 20 | from torchvision.transforms import InterpolationMode 21 | 22 | BICUBIC = InterpolationMode.BICUBIC 23 | except ImportError: 24 | BICUBIC = Image.BICUBIC 25 | 26 | 27 | def _convert_image_to_rgb(image): 28 | return image.convert("RGB") 29 | 30 | 31 | def transform(n_px): 32 | return Compose( 33 | [ 34 | Resize(n_px, interpolation=BICUBIC), 35 | CenterCrop(n_px), 36 | _convert_image_to_rgb, 37 | ToTensor(), 38 | Normalize( 39 | (0.48145466, 0.4578275, 0.40821073), 40 | (0.26862954, 0.26130258, 0.27577711), 41 | ), 42 | ] 43 | ) 44 | 45 | 46 | @lru_cache() 47 | def default_bpe(): 48 | return os.path.join( 49 | os.path.dirname(os.path.abspath(clip.__file__)), "bpe_simple_vocab_16e6.txt.gz" 50 | ) 51 | 52 | 53 | @lru_cache() 54 | def bytes_to_unicode(): 55 | """ 56 | Returns list of utf-8 byte and a corresponding list of unicode strings. 57 | The reversible bpe codes work on unicode strings. 58 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 59 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 60 | This is a signficant percentage of your normal, say, 32K bpe vocab. 61 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 62 | And avoids mapping to whitespace/control characters the bpe code barfs on. 63 | """ 64 | bs = ( 65 | list(range(ord("!"), ord("~") + 1)) 66 | + list(range(ord("¡"), ord("¬") + 1)) 67 | + list(range(ord("®"), ord("ÿ") + 1)) 68 | ) 69 | cs = bs[:] 70 | n = 0 71 | for b in range(2**8): 72 | if b not in bs: 73 | bs.append(b) 74 | cs.append(2**8 + n) 75 | n += 1 76 | cs = [chr(n) for n in cs] 77 | return dict(zip(bs, cs)) 78 | 79 | 80 | def get_pairs(word): 81 | """Return set of symbol pairs in a word. 82 | Word is represented as tuple of symbols (symbols being variable-length strings). 
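    For example, ('h', 'e', 'll', 'o') yields {('h', 'e'), ('e', 'll'), ('ll', 'o')}.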
83 | """ 84 | pairs = set() 85 | prev_char = word[0] 86 | for char in word[1:]: 87 | pairs.add((prev_char, char)) 88 | prev_char = char 89 | return pairs 90 | 91 | 92 | def basic_clean(text): 93 | text = ftfy.fix_text(text) 94 | text = html.unescape(html.unescape(text)) 95 | return text.strip() 96 | 97 | 98 | def whitespace_clean(text): 99 | text = re.sub(r"\s+", " ", text) 100 | text = text.strip() 101 | return text 102 | 103 | 104 | class SimpleTokenizer(object): 105 | def __init__(self, bpe_path: str = default_bpe()): 106 | self.byte_encoder = bytes_to_unicode() 107 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 108 | merges = gzip.open(bpe_path).read().decode("utf-8").split("\n") 109 | merges = merges[1 : 49152 - 256 - 2 + 1] 110 | merges = [tuple(merge.split()) for merge in merges] 111 | vocab = list(bytes_to_unicode().values()) 112 | vocab = vocab + [v + "" for v in vocab] 113 | for merge in merges: 114 | vocab.append("".join(merge)) 115 | vocab.extend(["<|startoftext|>", "<|endoftext|>"]) 116 | self.encoder = dict(zip(vocab, range(len(vocab)))) 117 | self.decoder = {v: k for k, v in self.encoder.items()} 118 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 119 | self.cache = { 120 | "<|startoftext|>": "<|startoftext|>", 121 | "<|endoftext|>": "<|endoftext|>", 122 | } 123 | self.pat = re.compile( 124 | r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", 125 | re.IGNORECASE, 126 | ) 127 | 128 | def bpe(self, token): 129 | if token in self.cache: 130 | return self.cache[token] 131 | word = tuple(token[:-1]) + (token[-1] + "",) 132 | pairs = get_pairs(word) 133 | 134 | if not pairs: 135 | return token + "" 136 | 137 | while True: 138 | bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) 139 | if bigram not in self.bpe_ranks: 140 | break 141 | first, second = bigram 142 | new_word = [] 143 | i = 0 144 | while i < len(word): 145 | try: 146 | j = word.index(first, i) 147 | new_word.extend(word[i:j]) 148 | i = j 149 | except: 150 | new_word.extend(word[i:]) 151 | break 152 | 153 | if word[i] == first and i < len(word) - 1 and word[i + 1] == second: 154 | new_word.append(first + second) 155 | i += 2 156 | else: 157 | new_word.append(word[i]) 158 | i += 1 159 | new_word = tuple(new_word) 160 | word = new_word 161 | if len(word) == 1: 162 | break 163 | else: 164 | pairs = get_pairs(word) 165 | word = " ".join(word) 166 | self.cache[token] = word 167 | return word 168 | 169 | def encode(self, text): 170 | bpe_tokens = [] 171 | text = whitespace_clean(basic_clean(text)).lower() 172 | for token in re.findall(self.pat, text): 173 | token = "".join(self.byte_encoder[b] for b in token.encode("utf-8")) 174 | bpe_tokens.extend( 175 | self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" ") 176 | ) 177 | return bpe_tokens 178 | 179 | def decode(self, tokens): 180 | text = "".join([self.decoder[token] for token in tokens]) 181 | text = ( 182 | bytearray([self.byte_decoder[c] for c in text]) 183 | .decode("utf-8", errors="replace") 184 | .replace("", " ") 185 | ) 186 | return text 187 | 188 | 189 | _tokenizer = SimpleTokenizer() 190 | 191 | 192 | def tokenize( 193 | texts: Union[str, List[str]], context_length: int = 77, truncate: bool = False 194 | ) -> Union[torch.IntTensor, torch.LongTensor]: 195 | """ 196 | Returns the tokenized representation of given input string(s) 197 | 198 | Parameters 199 | ---------- 200 | texts : Union[str, List[str]] 201 | An input string or a list of input strings 
to tokenize 202 | 203 | context_length : int 204 | The context length to use; all CLIP models use 77 as the context length 205 | 206 | truncate: bool 207 | Whether to truncate the text in case its encoding is longer than the context length 208 | 209 | Returns 210 | ------- 211 | A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]. 212 | We return LongTensor when torch version is <1.8.0, since older index_select requires indices to be long. 213 | """ 214 | 215 | if isinstance(texts, str): 216 | texts = [texts] 217 | 218 | sot_token = _tokenizer.encoder["<|startoftext|>"] 219 | eot_token = _tokenizer.encoder["<|endoftext|>"] 220 | all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts] 221 | if packaging.version.parse(torch.__version__) < packaging.version.parse("1.8.0"): 222 | result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) 223 | else: 224 | result = torch.zeros(len(all_tokens), context_length, dtype=torch.int) 225 | 226 | for i, tokens in enumerate(all_tokens): 227 | if len(tokens) > context_length: 228 | if truncate: 229 | tokens = tokens[:context_length] 230 | tokens[-1] = eot_token 231 | else: 232 | raise RuntimeError( 233 | f"Input {texts[i]} is too long for context length {context_length}" 234 | ) 235 | result[i, : len(tokens)] = torch.tensor(tokens) 236 | 237 | return result 238 | -------------------------------------------------------------------------------- /examples/custom-operators/example_jax.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | 3 | import ggml 4 | import ggml.utils 5 | 6 | import jax 7 | 8 | from typing import Optional 9 | 10 | params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024, mem_buffer=None) 11 | ctx = ggml.ggml_init(params) 12 | x_in = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 13 | 14 | @ggml.ggml_custom1_op_t 15 | def double( 16 | tensor_out: ggml.ggml_tensor_p, 17 | tensor_in: ggml.ggml_tensor_p, 18 | ith: int, 19 | nth: int, 20 | userdata: Optional[ctypes.c_void_p], 21 | ): 22 | x = jax.device_put(ggml.utils.to_numpy(tensor_in)) 23 | x *= 2 24 | ggml.utils.to_numpy(tensor_out)[:] = jax.device_get(x) 25 | 26 | x_out = ggml.ggml_map_custom1(ctx, x_in, double, 1, None) 27 | gf = ggml.ggml_build_forward(x_out) 28 | 29 | ggml.ggml_set_f32(x_in, 21.0) 30 | 31 | ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1) 32 | output = ggml.ggml_get_f32_1d(x_out, 0) 33 | assert output == 42.0 34 | print("GGML output: ", output) 35 | ggml.ggml_free(ctx) -------------------------------------------------------------------------------- /examples/optimizer/simple.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | """ 3 | # Single-batch stochastic gradient descent example using ggml 4 | 5 | This example demonstrates how to use ggml to implement a simple SGD optimizer. 
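It fits the scalar model `f(x) = a * x + b` to samples generated from
`a_real = 3`, `b_real = 4` by minimizing the squared error `(f(x) - f_true)**2`,
updating `a` and `b` with their gradients under a slowly decaying learning rate.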
6 | """ 7 | # %% 8 | import ggml 9 | import random 10 | 11 | a_real = 3.0 12 | b_real = 4.0 13 | 14 | ctx0 = ggml.ggml_init(ggml.ggml_init_params( 15 | mem_size=128 * 1024 * 1024, mem_buffer=None, no_alloc=False 16 | )) 17 | 18 | assert ctx0 is not None 19 | 20 | # define parameters 21 | a = ggml.ggml_new_tensor_1d(ctx0, ggml.GGML_TYPE_F32, 1) 22 | ggml.ggml_set_param(ctx0, a) 23 | 24 | b = ggml.ggml_new_tensor_1d(ctx0, ggml.GGML_TYPE_F32, 1) 25 | ggml.ggml_set_param(ctx0, b) 26 | 27 | # define input and output 28 | x = ggml.ggml_new_tensor_1d(ctx0, ggml.GGML_TYPE_F32, 1) 29 | ggml.ggml_set_input(x) 30 | 31 | tmp = ggml.ggml_mul(ctx0, a, x) 32 | f = ggml.ggml_add(ctx0, tmp, b) 33 | 34 | # define loss 35 | f_true = ggml.ggml_new_tensor_1d(ctx0, ggml.GGML_TYPE_F32, 1) 36 | ggml.ggml_set_input(f_true) 37 | 38 | tmp = ggml.ggml_sub(ctx0, f, f_true) 39 | loss = ggml.ggml_mul(ctx0, tmp, tmp) 40 | 41 | # build forward and backward graph 42 | gf = ggml.ggml_new_graph_custom(ctx0, ggml.GGML_DEFAULT_GRAPH_SIZE, True) 43 | ggml.ggml_build_forward_expand(gf, loss) 44 | gb = ggml.ggml_graph_dup(ctx0, gf) 45 | ggml.ggml_build_backward_expand(ctx0, gf, gb, False) 46 | 47 | # initialize parameters 48 | ggml.ggml_set_f32(a, 1.0) 49 | ggml.ggml_set_f32(b, 1.0) 50 | 51 | # SGD 52 | lr = 1e-2 53 | nsteps = 1000 54 | decay = 1e-3 55 | 56 | for i in range(nsteps): 57 | # sample data 58 | x_sample = random.uniform(-10, 10) 59 | f_sample = a_real * x_sample + b_real 60 | 61 | # set input 62 | ggml.ggml_set_f32(x, x_sample) 63 | ggml.ggml_set_f32(f_true, f_sample) 64 | 65 | # reset graph 66 | ggml.ggml_graph_reset(gf) 67 | ggml.ggml_set_f32(loss.contents.grad, 1.0) 68 | 69 | # compute forward and backward 70 | ggml.ggml_graph_compute_with_ctx(ctx0, gb, 1) 71 | 72 | # print loss 73 | loss_ = ggml.ggml_get_f32_1d(loss, 0) 74 | print(f"step {i}: loss = {loss_}") 75 | 76 | # decay learning rate 77 | lr *= (1.0 - decay) 78 | 79 | # update parameters 80 | ggml.ggml_set_f32(a, ggml.ggml_get_f32_1d(a, 0) - lr * ggml.ggml_get_f32_1d(a.contents.grad, 0)) 81 | ggml.ggml_set_f32(b, ggml.ggml_get_f32_1d(b, 0) - lr * ggml.ggml_get_f32_1d(b.contents.grad, 0)) 82 | 83 | # print parameters 84 | print(f"a = {ggml.ggml_get_f32_1d(a, 0):.2f}, b = {ggml.ggml_get_f32_1d(b, 0):.2f}") 85 | 86 | 87 | ggml.ggml_free(ctx0) 88 | 89 | # %% 90 | -------------------------------------------------------------------------------- /examples/replit/README.md: -------------------------------------------------------------------------------- 1 | # Replit Code Completion Server 2 | 3 | This example is a local-first Github Copilot drop-in replacement using the replit-code-v1-3b model written entirely in ggml-python. 4 | 5 | For best performance (likely still slower than copilot) please run with CUDA, OpenCL, or Metal support. 
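
The server exposes an OpenAI-style completions API (`/v1/completions`, plus `/v1/engines/copilot-codex/completions` for Copilot clients), so you can also exercise it directly once it is running. A minimal sketch, assuming the server is listening on the default `http://localhost:8000` address used in the editor setup below:

```bash
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"prompt": "def fib(n):", "max_tokens": 34, "temperature": 0, "stop": ["\n\n"]}'
```

Add `"stream": true` to the request body to receive the completion as server-sent events instead of a single response.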
6 | 7 | 8 | ## Installation 9 | 10 | ```bash 11 | # Clone the repo 12 | git clone https://github.com/abetlen/ggml-python.git 13 | cd ggml-python/examples/replit 14 | # (Optional) Create a virtual environment 15 | python3 -m venv venv 16 | source venv/bin/activate 17 | # Install dependencies 18 | pip install -r requirements.txt 19 | ``` 20 | 21 | ## Model Weights 22 | 23 | You can download the quantized model weights from [here](https://huggingface.co/abetlen/replit-code-v1-3b-ggml) 24 | 25 | ## Running the Server 26 | 27 | ```bash 28 | # Start the server 29 | MODEL=/path/to/model uvicorn server:app --reload 30 | ``` 31 | 32 | ## Editor Setup 33 | 34 | ### VSCode 35 | 36 | Add the following to your `settings.json`: 37 | 38 | ```json 39 | { 40 | "github.copilot.advanced": { 41 | "debug.testOverrideProxyUrl": "http://localhost:8000", 42 | "debug.overrideProxyUrl": "http://localhost:8000" 43 | } 44 | } 45 | ``` 46 | 47 | ### Vim / Neovim 48 | 49 | Add the following to your vimrc or init.vim: 50 | 51 | ``` 52 | let g:copilot_proxy = 'localhost:8000' 53 | let g:copilot_strict_ssl = 0 54 | ``` -------------------------------------------------------------------------------- /examples/replit/app.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import time 4 | import uuid 5 | import json 6 | import multiprocessing 7 | from functools import partial 8 | from threading import Lock 9 | from typing import ( 10 | Callable, 11 | Dict, 12 | List, 13 | Optional, 14 | Union, 15 | Iterator, 16 | AsyncIterator, 17 | Sequence, 18 | ) 19 | from os import environ 20 | 21 | from typing_extensions import TypedDict, Literal 22 | 23 | import numpy as np 24 | import numpy.typing as npt 25 | 26 | import anyio 27 | from anyio.streams.memory import MemoryObjectSendStream 28 | from starlette.concurrency import run_in_threadpool, iterate_in_threadpool 29 | from fastapi import FastAPI, Request, Depends 30 | from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict 31 | from sse_starlette.sse import EventSourceResponse 32 | 33 | from main import ReplitModel, ReplitSentencepieceTokenizer 34 | 35 | 36 | ## Types 37 | class CompletionLogprobs(TypedDict): 38 | text_offset: List[int] 39 | token_logprobs: List[Optional[float]] 40 | tokens: List[str] 41 | top_logprobs: List[Optional[Dict[str, float]]] 42 | 43 | 44 | class CompletionChoice(TypedDict): 45 | text: str 46 | index: int 47 | logprobs: Optional[CompletionLogprobs] 48 | finish_reason: Optional[str] 49 | 50 | 51 | class CompletionUsage(TypedDict): 52 | prompt_tokens: int 53 | completion_tokens: int 54 | total_tokens: int 55 | 56 | 57 | class CompletionChunk(TypedDict): 58 | id: str 59 | object: Literal["text_completion"] 60 | created: int 61 | model: str 62 | choices: List[CompletionChoice] 63 | 64 | 65 | class Completion(TypedDict): 66 | id: str 67 | object: Literal["text_completion"] 68 | created: int 69 | model: str 70 | choices: List[CompletionChoice] 71 | usage: CompletionUsage 72 | 73 | 74 | class OpenAIify: 75 | def __init__( 76 | self, 77 | model: ReplitModel, 78 | cancel_callback: Optional[Callable[[], bool]] = None, 79 | ): 80 | self.model = model 81 | self.cancel_callback = cancel_callback 82 | 83 | def tokenize(self, text: str) -> List[int]: 84 | return self.model.tokenize(text) 85 | 86 | def detokenize(self, tokens: List[int]) -> str: 87 | return self.model.detokenize(tokens) 88 | 89 | def generate( 90 | self, 91 | tokens: Sequence[int], 92 | top_p: float = 
0.95, 93 | temperature: float = 0.80, 94 | frequency_penalty: float = 0.0, 95 | presence_penalty: float = 0.0, 96 | ) -> Iterator[int]: 97 | return self.model.generate( 98 | tokens, 99 | top_p=top_p, 100 | temperature=temperature, 101 | frequency_penalty=frequency_penalty, 102 | presence_penalty=presence_penalty, 103 | ) 104 | 105 | def _create_completion( 106 | self, 107 | prompt: str, 108 | suffix: Optional[str] = None, 109 | max_tokens: int = 16, 110 | temperature: float = 0.8, 111 | top_p: float = 0.95, 112 | logprobs: Optional[int] = None, 113 | echo: bool = False, 114 | stop: Optional[Union[str, List[str]]] = [], 115 | frequency_penalty: float = 0.0, 116 | presence_penalty: float = 0.0, 117 | stream: bool = False, 118 | model: Optional[str] = None, 119 | ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: 120 | completion_id: str = f"cmpl-{str(uuid.uuid4())}" 121 | created: int = int(time.time()) 122 | completion_tokens: List[int] = [] 123 | # Add blank space to start of prompt to match OG llama tokenizer 124 | prompt_tokens: List[int] = self.tokenize(prompt) 125 | text: str = "" 126 | returned_tokens: int = 0 127 | stop = ( 128 | stop if isinstance(stop, list) else [stop] if isinstance(stop, str) else [] 129 | ) 130 | model_name: str = model if model is not None else "replit-code-v1-3b" 131 | 132 | # Truncate prompt if it is too long 133 | max_tokens = min( 134 | max_tokens, max(0, self.model.max_seq_len - len(prompt_tokens) - 1) 135 | ) 136 | if len(prompt_tokens) + max_tokens > self.model.max_seq_len: 137 | raise ValueError( 138 | f"Requested tokens exceed context window of {self.model.max_seq_len}" 139 | ) 140 | 141 | stop_sequences = stop if stop != [] else [] 142 | finish_reason = "length" 143 | for token in self.generate( 144 | prompt_tokens, 145 | top_p=top_p, 146 | temperature=temperature, 147 | frequency_penalty=frequency_penalty, 148 | presence_penalty=presence_penalty, 149 | ): 150 | if token == self.eos_token(): 151 | text = self.detokenize(completion_tokens) 152 | finish_reason = "stop" 153 | break 154 | 155 | if self.cancel_callback is not None and self.cancel_callback(): 156 | text = self.detokenize(completion_tokens) 157 | finish_reason = "stop" 158 | break 159 | 160 | completion_tokens.append(token) 161 | 162 | all_text = self.detokenize(completion_tokens) 163 | any_stop = [s for s in stop_sequences if s in all_text] 164 | if len(any_stop) > 0: 165 | first_stop = any_stop[0] 166 | text = all_text[: all_text.index(first_stop)] 167 | finish_reason = "stop" 168 | break 169 | 170 | if stream: 171 | remaining_tokens = completion_tokens[returned_tokens:] 172 | remaining_text = self.detokenize(remaining_tokens) 173 | remaining_length = len(remaining_text) 174 | 175 | # We want to avoid yielding any characters from 176 | # the generated text if they are part of a stop 177 | # sequence. 
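                # first_stop_position ends up holding the length of the
                # longest stop-sequence prefix that the text generated so far
                # ends with; that many trailing characters are held back until
                # we know whether they complete a full stop sequence.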
178 | first_stop_position = 0 179 | for s in stop_sequences: 180 | for i in range(min(len(s), remaining_length), 0, -1): 181 | if remaining_text.endswith(s[:i]): 182 | if i > first_stop_position: 183 | first_stop_position = i 184 | break 185 | 186 | token_end_position = 0 187 | for token in remaining_tokens: 188 | token_end_position += len(self.detokenize([token])) 189 | # Check if stop sequence is in the token 190 | if token_end_position >= ( 191 | remaining_length - first_stop_position - 1 192 | ): 193 | break 194 | logprobs_or_none: Optional[CompletionLogprobs] = None 195 | if logprobs is not None: 196 | token_str = self.detokenize([token]) 197 | text_offset = len(prompt) + len( 198 | self.detokenize(completion_tokens[:returned_tokens]) 199 | ) 200 | token_offset = len(prompt_tokens) + returned_tokens 201 | logits = self.model.scores[token_offset - 1, :].tolist() 202 | current_logprobs = self.logits_to_logprobs(logits) 203 | sorted_logprobs = list( 204 | sorted( 205 | zip(current_logprobs, range(len(current_logprobs))), 206 | reverse=True, 207 | ) 208 | ) 209 | top_logprob = { 210 | self.detokenize([i]): logprob 211 | for logprob, i in sorted_logprobs[:logprobs] 212 | } 213 | top_logprob[token_str] = current_logprobs[int(token)] 214 | logprobs_or_none = { 215 | "tokens": [self.detokenize([token])], 216 | "text_offset": [text_offset], 217 | "token_logprobs": [sorted_logprobs[int(token)][0]], 218 | "top_logprobs": [top_logprob], 219 | } 220 | returned_tokens += 1 221 | yield { 222 | "id": completion_id, 223 | "object": "text_completion", 224 | "created": created, 225 | "model": model_name, 226 | "choices": [ 227 | { 228 | "text": self.detokenize([token]), 229 | "index": 0, 230 | "logprobs": logprobs_or_none, 231 | "finish_reason": None, 232 | } 233 | ], 234 | } 235 | 236 | if len(completion_tokens) >= max_tokens: 237 | text = self.detokenize(completion_tokens) 238 | finish_reason = "length" 239 | break 240 | 241 | if stream: 242 | remaining_tokens = completion_tokens[returned_tokens:] 243 | all_text = self.detokenize(remaining_tokens) 244 | any_stop = [s for s in stop_sequences if s in all_text] 245 | if len(any_stop) > 0: 246 | end = min(all_text.index(stop) for stop in any_stop) 247 | else: 248 | end = len(all_text) 249 | 250 | token_end_position = 0 251 | for token in remaining_tokens: 252 | token_end_position += len(self.detokenize([token])) 253 | 254 | logprobs_or_none: Optional[CompletionLogprobs] = None 255 | if logprobs is not None: 256 | token_str = self.detokenize([token]) 257 | text_offset = len(prompt) + len( 258 | self.detokenize(completion_tokens[:returned_tokens]) 259 | ) 260 | token_offset = len(prompt_tokens) + returned_tokens - 1 261 | logits = self.model.scores[token_offset, :].tolist() 262 | current_logprobs = self.logits_to_logprobs(logits) 263 | sorted_logprobs = list( 264 | sorted( 265 | zip(current_logprobs, range(len(current_logprobs))), 266 | reverse=True, 267 | ) 268 | ) 269 | top_logprob = { 270 | self.detokenize([i]): logprob 271 | for logprob, i in sorted_logprobs[:logprobs] 272 | } 273 | top_logprob[token_str] = current_logprobs[int(token)] 274 | logprobs_or_none = { 275 | "tokens": [self.detokenize([token])], 276 | "text_offset": [text_offset], 277 | "token_logprobs": [sorted_logprobs[int(token)][0]], 278 | "top_logprobs": [top_logprob], 279 | } 280 | 281 | if token_end_position >= end: 282 | last_text = self.detokenize([token]) 283 | if token_end_position == end - 1: 284 | break 285 | returned_tokens += 1 286 | yield { 287 | "id": completion_id, 288 | 
"object": "text_completion", 289 | "created": created, 290 | "model": model_name, 291 | "choices": [ 292 | { 293 | "text": last_text[ 294 | : len(last_text) - (token_end_position - end) 295 | ], 296 | "index": 0, 297 | "logprobs": logprobs_or_none, 298 | "finish_reason": finish_reason, 299 | } 300 | ], 301 | } 302 | break 303 | returned_tokens += 1 304 | yield { 305 | "id": completion_id, 306 | "object": "text_completion", 307 | "created": created, 308 | "model": model_name, 309 | "choices": [ 310 | { 311 | "text": self.detokenize([token]), 312 | "index": 0, 313 | "logprobs": logprobs_or_none, 314 | "finish_reason": finish_reason 315 | if returned_tokens == len(completion_tokens) 316 | else None, 317 | } 318 | ], 319 | } 320 | return 321 | 322 | text_str = text 323 | 324 | if echo: 325 | text_str = prompt + text_str 326 | 327 | if suffix is not None: 328 | text_str = text_str + suffix 329 | 330 | logprobs_or_none: Optional[CompletionLogprobs] = None 331 | if logprobs is not None: 332 | text_offset = 0 if echo else len(prompt) 333 | token_offset = 0 if echo else len(prompt_tokens[1:]) 334 | text_offsets: List[int] = [] 335 | token_logprobs: List[Optional[float]] = [] 336 | tokens: List[str] = [] 337 | top_logprobs: List[Optional[Dict[str, float]]] = [] 338 | 339 | if echo: 340 | # Remove leading BOS token 341 | all_tokens = prompt_tokens[1:] + completion_tokens 342 | else: 343 | all_tokens = completion_tokens 344 | 345 | all_token_strs = [self.detokenize([token]) for token in all_tokens] 346 | all_logprobs = [ 347 | self.logits_to_logprobs(row.tolist()) for row in self.model.scores 348 | ][token_offset:] 349 | for token, token_str, logprobs_token in zip( 350 | all_tokens, all_token_strs, all_logprobs 351 | ): 352 | text_offsets.append(text_offset) 353 | text_offset += len(token_str) 354 | tokens.append(token_str) 355 | sorted_logprobs = list( 356 | sorted( 357 | zip(logprobs_token, range(len(logprobs_token))), reverse=True 358 | ) 359 | ) 360 | token_logprobs.append(sorted_logprobs[int(token)][0]) 361 | top_logprob: Optional[Dict[str, float]] = { 362 | self.detokenize([i]): logprob 363 | for logprob, i in sorted_logprobs[:logprobs] 364 | } 365 | top_logprob.update({token_str: logprobs_token[int(token)]}) 366 | top_logprobs.append(top_logprob) 367 | # Weird idosincracy of the OpenAI API where 368 | # token_logprobs and top_logprobs are null for 369 | # the first token. 
370 | if echo and len(all_tokens) > 0: 371 | token_logprobs[0] = None 372 | top_logprobs[0] = None 373 | logprobs_or_none = { 374 | "tokens": tokens, 375 | "text_offset": text_offsets, 376 | "token_logprobs": token_logprobs, 377 | "top_logprobs": top_logprobs, 378 | } 379 | 380 | yield { 381 | "id": completion_id, 382 | "object": "text_completion", 383 | "created": created, 384 | "model": model_name, 385 | "choices": [ 386 | { 387 | "text": text_str, 388 | "index": 0, 389 | "logprobs": logprobs_or_none, 390 | "finish_reason": finish_reason, 391 | } 392 | ], 393 | "usage": { 394 | "prompt_tokens": len(prompt_tokens), 395 | "completion_tokens": len(completion_tokens), 396 | "total_tokens": len(prompt_tokens) + len(completion_tokens), 397 | }, 398 | } 399 | 400 | def create_completion( 401 | self, 402 | prompt: str, 403 | suffix: Optional[str] = None, 404 | max_tokens: int = 128, 405 | temperature: float = 0.8, 406 | top_p: float = 0.95, 407 | logprobs: Optional[int] = None, 408 | echo: bool = False, 409 | stop: Optional[Union[str, List[str]]] = [], 410 | frequency_penalty: float = 0.0, 411 | presence_penalty: float = 0.0, 412 | stream: bool = False, 413 | model: Optional[str] = None, 414 | ) -> Union[Completion, Iterator[CompletionChunk]]: 415 | """Generate text from a prompt. 416 | 417 | Args: 418 | prompt: The prompt to generate text from. 419 | suffix: A suffix to append to the generated text. If None, no suffix is appended. 420 | max_tokens: The maximum number of tokens to generate. 421 | temperature: The temperature to use for sampling. 422 | top_p: The top-p value to use for sampling. 423 | logprobs: The number of logprobs to return. If None, no logprobs are returned. 424 | echo: Whether to echo the prompt. 425 | stop: A list of strings to stop generation when encountered. 426 | repeat_penalty: The penalty to apply to repeated tokens. 427 | top_k: The top-k value to use for sampling. 428 | stream: Whether to stream the results. 429 | 430 | Raises: 431 | ValueError: If the requested tokens exceed the context window. 432 | RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt. 433 | 434 | Returns: 435 | Response object containing the generated text. 436 | """ 437 | completion_or_chunks = self._create_completion( 438 | prompt=prompt, 439 | suffix=suffix, 440 | max_tokens=max_tokens, 441 | temperature=temperature, 442 | top_p=top_p, 443 | logprobs=logprobs, 444 | echo=echo, 445 | stop=stop, 446 | frequency_penalty=frequency_penalty, 447 | presence_penalty=presence_penalty, 448 | stream=stream, 449 | model=model, 450 | ) 451 | if stream: 452 | chunks: Iterator[CompletionChunk] = completion_or_chunks 453 | return chunks 454 | completion: Completion = next(completion_or_chunks) # type: ignore 455 | return completion 456 | 457 | def eos_token(self): 458 | return self.model.eos_token() 459 | 460 | def logits_to_logprobs( 461 | self, logits: npt.NDArray[np.float32] 462 | ) -> npt.NDArray[np.float32]: 463 | return np.exp(logits) / (np.sum(np.exp(logits))) # type: ignore 464 | 465 | 466 | class Settings(BaseSettings): 467 | model_file: str 468 | n_gpu_layers: int = 32 469 | n_batch: int = 2048 470 | n_threads: int = max(multiprocessing.cpu_count() // 2, 1) 471 | sentencepiece_model: Optional[str] = None 472 | 473 | 474 | class CreateCompletionRequest(BaseModel): 475 | prompt: Union[str, List[str]] = Field( 476 | default="", description="The prompt to generate completions for." 
477 | ) 478 | suffix: Optional[str] = Field( 479 | default=None, 480 | description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots.", 481 | ) 482 | max_tokens: int = Field( 483 | default=16, 484 | ge=1, 485 | le=2048, 486 | description="The maximum number of tokens to generate.", 487 | ) 488 | temperature: float = Field( 489 | default=0.8, 490 | ge=0.0, 491 | le=2.0, 492 | description="Adjust the randomness of the generated text.\n\n" 493 | + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.", 494 | ) 495 | top_p: float = Field( 496 | default=0.95, 497 | ge=0.0, 498 | le=1.0, 499 | description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" 500 | + "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text.", 501 | ) 502 | echo: bool = Field( 503 | default=False, 504 | description="Whether to echo the prompt in the generated text. Useful for chatbots.", 505 | ) 506 | stop: Optional[Union[str, List[str]]] = Field( 507 | default=None, 508 | description="A list of tokens at which to stop generation. If None, no stop tokens are used.", 509 | ) 510 | stream: bool = Field( 511 | default=False, 512 | description="Whether to stream the results as they are generated. Useful for chatbots.", 513 | ) 514 | logprobs: Optional[int] = Field( 515 | default=None, 516 | ge=0, 517 | description="The number of logprobs to generate. If None, no logprobs are generated.", 518 | ) 519 | presence_penalty: Optional[float] = Field( 520 | default=0.0, 521 | ge=-2.0, 522 | le=2.0, 523 | description="Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.", 524 | ) 525 | frequency_penalty: Optional[float] = Field( 526 | default=0.0, 527 | ge=-2.0, 528 | le=2.0, 529 | description="Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.", 530 | ) 531 | logprobs: Optional[int] = Field(None) 532 | 533 | # ignored or currently unsupported 534 | model: Optional[str] = Field( 535 | description="The model to use for generating completions." 
536 | ) 537 | n: Optional[int] = 1 538 | best_of: Optional[int] = 1 539 | logit_bias: Optional[Dict[str, float]] = Field(None) 540 | user: Optional[str] = Field(None) 541 | 542 | class Config: 543 | schema_extra = { 544 | "example": { 545 | "prompt": "def fib(n):", 546 | "stop": ["\n\n"], 547 | "temperature": 0, 548 | "max_tokens": 34, 549 | } 550 | } 551 | 552 | 553 | settings = Settings(model_file=environ.get("MODEL")) # type: ignore 554 | app = FastAPI( 555 | title="Code Completion API", 556 | description=""" 557 | ## Editor Setup 558 | 559 | ### VSCode 560 | 561 | Add the following to your `settings.json`: 562 | 563 | ```json 564 | { 565 | "github.copilot.advanced": { 566 | "debug.testOverrideProxyUrl": "http://localhost:8000", 567 | "debug.overrideProxyUrl": "http://localhost:8000" 568 | } 569 | } 570 | ``` 571 | 572 | ### Vim / Neovim 573 | 574 | Add the following to your vimrc or init.vim: 575 | 576 | ``` 577 | let g:copilot_proxy = 'localhost:8000' 578 | let g:copilot_strict_ssl = 0 579 | ``` 580 | """, 581 | ) 582 | outer_lock = Lock() 583 | inner_lock = Lock() 584 | 585 | tokenizer = ( 586 | ReplitSentencepieceTokenizer(settings.sentencepiece_model) 587 | if settings.sentencepiece_model 588 | else None 589 | ) 590 | 591 | 592 | def cancel_callback(): 593 | return outer_lock.locked() 594 | 595 | 596 | model = OpenAIify( 597 | ReplitModel.init_from_file( 598 | model_file=settings.model_file, 599 | n_gpu_layers=settings.n_gpu_layers, 600 | tokenizer=tokenizer, 601 | cancel_callback=cancel_callback, 602 | ), 603 | # check if any other requests are pending in the same thread and cancel the stream if so 604 | cancel_callback=cancel_callback, 605 | ) 606 | 607 | 608 | def get_model(): 609 | # NOTE: This double lock allows the currently streaming model to check 610 | # if any other requests are pending in the same thread and cancel the 611 | # stream if so. 
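    # The sequence is: take the outer lock (new requests queue here), then
    # the inner lock (held for the whole generation), then release the outer
    # lock so the next request can register itself. cancel_callback() reports
    # outer_lock.locked(), i.e. whether another request is already waiting.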
612 | outer_lock.acquire() 613 | release_outer_lock = True 614 | try: 615 | inner_lock.acquire() 616 | try: 617 | outer_lock.release() 618 | release_outer_lock = False 619 | yield model 620 | finally: 621 | inner_lock.release() 622 | finally: 623 | if release_outer_lock: 624 | outer_lock.release() 625 | 626 | 627 | # Used to support copilot.vim 628 | @app.get("/copilot_internal/v2/token") 629 | def get_copilot_token(): 630 | content = {"token": "1", "expires_at": 2600000000, "refresh_in": 900} 631 | return dict(status_code=200, content=content) 632 | 633 | 634 | CreateCompletionResponse = create_model_from_typeddict(Completion) 635 | 636 | 637 | # Used to support copilot.vim 638 | @app.post( 639 | "/v1/engines/copilot-codex/completions", 640 | # response_model=CreateCompletionResponse, 641 | ) 642 | @app.post( 643 | "/v1/completions", 644 | # response_model=CreateCompletionResponse, 645 | ) 646 | async def create_completion( 647 | request: Request, 648 | body: CreateCompletionRequest, 649 | model: ReplitModel = Depends(get_model), 650 | ): 651 | if isinstance(body.prompt, list): 652 | assert len(body.prompt) <= 1 653 | body.prompt = body.prompt[0] if len(body.prompt) > 0 else "" 654 | 655 | exclude = { 656 | "n", 657 | "best_of", 658 | "logit_bias", 659 | "user", 660 | } 661 | kwargs = body.dict(exclude=exclude) 662 | if body.stream: 663 | send_chan, recv_chan = anyio.create_memory_object_stream(10) 664 | 665 | async def event_publisher( 666 | inner_send_chan: MemoryObjectSendStream[Dict[str, Union[str, bool]]] 667 | ): 668 | async with inner_send_chan: 669 | try: 670 | iterator: Iterator[CompletionChunk] = await run_in_threadpool(model.create_completion, **kwargs) # type: ignore 671 | async_iterator: AsyncIterator[ 672 | CompletionChunk 673 | ] = iterate_in_threadpool(iterator) 674 | async for chunk in async_iterator: 675 | await inner_send_chan.send(dict(data=json.dumps(chunk))) 676 | if await request.is_disconnected(): 677 | raise anyio.get_cancelled_exc_class()() 678 | await inner_send_chan.send(dict(data="[DONE]")) 679 | except anyio.get_cancelled_exc_class() as e: 680 | print("disconnected") 681 | with anyio.move_on_after(1, shield=True): 682 | print( 683 | f"Disconnected from client (via refresh/close) {request.client}" 684 | ) 685 | await inner_send_chan.send(dict(closing=True)) 686 | raise e 687 | 688 | return EventSourceResponse( 689 | recv_chan, data_sender_callable=partial(event_publisher, send_chan) 690 | ) 691 | else: 692 | completion: Completion = await run_in_threadpool(model.create_completion, **kwargs) # type: ignore 693 | return completion 694 | -------------------------------------------------------------------------------- /examples/replit/main.py: -------------------------------------------------------------------------------- 1 | """ggml-python implemention of the Replit code model 2 | 3 | Model is available at: 4 | https://huggingface.co/replit/replit-code-v1-3b 5 | 6 | This implementation is based on the example model code and ggml model file format from: 7 | https://github.com/ggerganov/ggml/tree/master/examples/replit 8 | """ 9 | from __future__ import annotations 10 | import abc 11 | import math 12 | import struct 13 | import ctypes 14 | import argparse 15 | import multiprocessing 16 | from collections import deque 17 | 18 | from typing import ( 19 | Callable, 20 | Deque, 21 | Iterator, 22 | List, 23 | Optional, 24 | Sequence, 25 | Tuple, 26 | Dict, 27 | Union, 28 | ) 29 | 30 | import numpy as np 31 | import numpy.typing as npt 32 | 33 | import ggml 34 | 35 | 
from ggml.utils import to_numpy 36 | 37 | from contextlib import ExitStack 38 | 39 | 40 | class ReplitAbortException(Exception): 41 | pass 42 | 43 | ## Generic Sampling Functions 44 | 45 | 46 | def sample( 47 | logits: npt.NDArray[np.float32], 48 | last_tokens: Optional[List[int]] = None, 49 | presence_penalty: float = 0.0, 50 | frequency_penalty: float = 0.0, 51 | temperature: float = 1.0, 52 | top_p: float = 0.0, 53 | ) -> int: 54 | if last_tokens is None: 55 | last_tokens = [] 56 | if temperature == 0.0: 57 | return int(np.argmax(logits)) 58 | logits = frequency_and_presence_penalties( 59 | logits, last_tokens, frequency_penalty, presence_penalty 60 | ) 61 | return nucleus_sampling(logits, top_p=top_p, temperature=temperature) 62 | 63 | 64 | # TODO: this is likely incorrect 65 | def frequency_and_presence_penalties( 66 | logits: npt.NDArray[np.float32], 67 | last_tokens: Sequence[int], 68 | alpha_frequency: float, 69 | alpha_presence: float, 70 | ): 71 | if len(last_tokens) == 0: 72 | return logits 73 | 74 | if alpha_frequency == 0.0 and alpha_presence == 0.0: 75 | return logits 76 | 77 | # Calculate the frequency penalty contribution 78 | frequency_penalty = alpha_frequency * np.log(np.unique(last_tokens).size + 1) 79 | 80 | # Calculate the presence penalty contribution 81 | presence_penalty = alpha_presence * np.array( 82 | [float(token in last_tokens) for token in range(len(logits))] 83 | ) 84 | 85 | # Apply penalties to the logits 86 | penalized_logits = logits - frequency_penalty - presence_penalty 87 | 88 | return penalized_logits 89 | 90 | 91 | def nucleus_sampling( 92 | logits: npt.NDArray[np.float32], top_p: float, temperature: float = 1.0 93 | ): 94 | # Apply temperature to logits 95 | logits /= temperature 96 | 97 | # Subtract the maximum value for numerical stability 98 | logits -= logits.max() # type: ignore 99 | 100 | # Calculate probabilities using softmax function with epsilon 101 | epsilon = 1e-8 102 | probabilities = np.exp(logits) / ((np.exp(logits)).sum() + epsilon) # type: ignore 103 | 104 | # Filter out NaN values from probabilities 105 | probabilities = np.nan_to_num(probabilities) 106 | 107 | # Sort the probabilities in descending order and get the corresponding indices 108 | sorted_indices = np.argsort(probabilities)[::-1] 109 | 110 | # Select the indices within the nucleus 111 | nucleus_indices = sorted_indices[: int(len(sorted_indices) * top_p)] 112 | 113 | # Calculate the updated probabilities within the nucleus 114 | nucleus_probabilities = probabilities[nucleus_indices] 115 | 116 | # Normalize the probabilities within the nucleus 117 | nucleus_probabilities /= nucleus_probabilities.sum() # type: ignore 118 | 119 | # Sample from the updated probabilities 120 | selected_token = np.random.choice(nucleus_indices, p=nucleus_probabilities) 121 | 122 | return selected_token 123 | 124 | 125 | ### Context Buffer 126 | 127 | 128 | class ContextBuffer(abc.ABC): 129 | @abc.abstractmethod 130 | def resize(self, new_size: int) -> None: 131 | raise NotImplementedError 132 | 133 | @property 134 | @abc.abstractmethod 135 | def buffer(self) -> ctypes.c_void_p: 136 | raise NotImplementedError 137 | 138 | 139 | class CpuContextBuffer(ContextBuffer): 140 | def __init__(self, buffer_size: int = 256 * 1024 * 1024): 141 | self.buffer_size = buffer_size 142 | self._buffer = (ctypes.c_uint8 * self.buffer_size)() 143 | 144 | def resize(self, new_size: int): 145 | assert new_size > self.buffer_size 146 | 147 | self.buffer_size = new_size 148 | ctypes.resize(self._buffer, 
self.buffer_size) 149 | 150 | @property 151 | def buffer(self) -> ctypes.c_void_p: 152 | return ctypes.c_void_p(ctypes.addressof(self._buffer)) 153 | 154 | 155 | ### Tokenizer 156 | 157 | 158 | class Tokenizer(abc.ABC): 159 | @abc.abstractmethod 160 | def tokenize(self, text: str) -> List[int]: 161 | raise NotImplementedError 162 | 163 | @abc.abstractmethod 164 | def detokenize(self, tokens: List[int]) -> str: 165 | raise NotImplementedError 166 | 167 | 168 | class ReplitTokenizer(Tokenizer): 169 | def __init__(self, vocab: List[Tuple[int, str, float]]): 170 | self.vocab = vocab 171 | self.piece_map = {piece: (i, -score) for i, piece, score in self.vocab} 172 | self.ws_symbol = b"\342\226\201" 173 | 174 | def tokenize(self, text: str) -> List[int]: 175 | normalized_text = text.replace(" ", self.ws_symbol.decode("utf-8")) 176 | tokenized, _ = ReplitTokenizer.encode_word(normalized_text, self.piece_map) 177 | return tokenized 178 | 179 | def detokenize(self, tokens: List[int]) -> str: 180 | text = "".join(self.vocab[token][1] for token in tokens) 181 | detokenized = text.replace(self.ws_symbol.decode("utf-8"), " ") 182 | return detokenized 183 | 184 | @staticmethod 185 | def encode_word( 186 | word: str, model: Dict[str, Tuple[int, float]] 187 | ) -> Tuple[List[int], float]: 188 | len_word = len(word) 189 | best_segmentation_starts = [-1] * (len_word + 1) 190 | best_segmentation_scores = [math.inf] * (len_word + 1) 191 | best_segmentation_starts[0], best_segmentation_scores[0] = 0, 0.0 192 | 193 | for idx in range(len_word): 194 | if best_segmentation_starts[idx] != -1: 195 | end_idx = idx + 1 196 | while end_idx <= len_word: 197 | token = word[idx:end_idx] 198 | if token in model: 199 | token_score = model[token][1] 200 | if ( 201 | best_segmentation_scores[idx] + token_score 202 | < best_segmentation_scores[end_idx] 203 | ): 204 | best_segmentation_starts[end_idx] = idx 205 | best_segmentation_scores[end_idx] = ( 206 | best_segmentation_scores[idx] + token_score 207 | ) 208 | end_idx += 1 209 | 210 | if best_segmentation_scores[-1] == math.inf: 211 | return [], 0.0 212 | 213 | tokens: Deque[int] = deque() 214 | idx = len_word 215 | while idx > 0: 216 | start_idx = best_segmentation_starts[idx] 217 | token = word[start_idx:idx] 218 | token_id = model[token][0] 219 | tokens.appendleft(token_id) 220 | idx = start_idx 221 | 222 | return list(tokens), best_segmentation_scores[-1] 223 | 224 | 225 | class ReplitSentencepieceTokenizer(Tokenizer): 226 | def __init__(self, model_path: str): 227 | import sentencepiece 228 | 229 | self.tokenizer = sentencepiece.SentencePieceProcessor(model_file=model_path) 230 | 231 | def tokenize(self, text: str) -> List[int]: 232 | return self.tokenizer.encode(text) 233 | 234 | def detokenize(self, tokens: List[int]) -> str: 235 | return self.tokenizer.decode(tokens) 236 | 237 | 238 | ### Replit Model Definition 239 | 240 | 241 | class ReplitLayer: 242 | def __init__(self, wtype: int, n_embd: int, ctx: ggml.ggml_context_p): 243 | self.norm_1_weight = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, n_embd) 244 | self.c_attn_wqkv_weight = ggml.ggml_new_tensor_2d( 245 | ctx, wtype, n_embd, 3 * n_embd 246 | ) 247 | self.c_attn_out_proj_weight = ggml.ggml_new_tensor_2d( 248 | ctx, wtype, n_embd, n_embd 249 | ) 250 | self.norm_2_weight = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, n_embd) 251 | self.c_ffn_up_proj_weight = ggml.ggml_new_tensor_2d( 252 | ctx, wtype, n_embd, 4 * n_embd 253 | ) 254 | self.c_ffn_down_proj_weight = ggml.ggml_new_tensor_2d( 255 | ctx, 
wtype, 4 * n_embd, n_embd 256 | ) 257 | 258 | 259 | class ReplitModel: 260 | def __init__( 261 | self, 262 | d_model: int, 263 | max_seq_len: int, 264 | n_heads: int, 265 | n_layers: int, 266 | vocab_size: int, 267 | ftype: int, 268 | vocab: List[Tuple[int, str, float]], 269 | tokenizer: Tokenizer, 270 | n_batch: int, 271 | n_threads: int, 272 | weights_buffer: ContextBuffer, 273 | ctx: ggml.ggml_context_p, 274 | cancel_callback: Optional[Callable[[], bool]] = None, 275 | ): 276 | self.d_model = d_model 277 | self.max_seq_len = max_seq_len 278 | self.n_heads = n_heads 279 | self.n_layers = n_layers 280 | self.vocab_size = vocab_size 281 | self.ftype = ftype 282 | self.ctx = ctx 283 | self.layers: List[ReplitLayer] = [] 284 | self.tensors: Dict[str, ggml.ggml_tensor_p] = {} 285 | self.vocab = vocab 286 | self.tokenizer = tokenizer 287 | self.n_batch = n_batch 288 | self.n_threads = n_threads 289 | self.weights_buffer = weights_buffer 290 | self.cancel_callback = cancel_callback 291 | 292 | n_layer = self.n_layers 293 | n_embd = self.d_model 294 | n_ctx = self.max_seq_len 295 | n_vocab = self.vocab_size 296 | wtype = ggml.ggml_ftype_to_ggml_type(ftype) 297 | 298 | n_mem = n_layer * n_ctx 299 | n_elements = n_embd * n_mem 300 | 301 | self.memory_k = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F16, n_elements) 302 | self.memory_v = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F16, n_elements) 303 | 304 | self.wte_weight = ggml.ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab) 305 | self.norm_f_weight = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, n_embd) 306 | self.tensors["transformer.wte.weight"] = self.wte_weight 307 | self.tensors["transformer.norm_f.weight"] = self.norm_f_weight 308 | 309 | self.mem_per_token = 0 310 | self.eval_buffer = CpuContextBuffer() 311 | 312 | for i in range(n_layer): 313 | layer = ReplitLayer( 314 | wtype=wtype, 315 | n_embd=n_embd, 316 | ctx=ctx, 317 | ) 318 | self.layers.append(layer) 319 | 320 | self.tensors[f"transformer.blocks.{i}.norm_1.weight"] = layer.norm_1_weight 321 | self.tensors[ 322 | f"transformer.blocks.{i}.attn.Wqkv.weight" 323 | ] = layer.c_attn_wqkv_weight 324 | self.tensors[ 325 | f"transformer.blocks.{i}.attn.out_proj.weight" 326 | ] = layer.c_attn_out_proj_weight 327 | self.tensors[f"transformer.blocks.{i}.norm_2.weight"] = layer.norm_2_weight 328 | self.tensors[ 329 | f"transformer.blocks.{i}.ffn.up_proj.weight" 330 | ] = layer.c_ffn_up_proj_weight 331 | self.tensors[ 332 | f"transformer.blocks.{i}.ffn.down_proj.weight" 333 | ] = layer.c_ffn_down_proj_weight 334 | 335 | self.n_tokens = 0 336 | self.input_ids: npt.NDArray[np.intc] = np.ndarray( 337 | (self.max_seq_len), dtype=np.intc 338 | ) 339 | self.scores: npt.NDArray[np.single] = np.ndarray( 340 | (self.max_seq_len, n_vocab), dtype=np.single 341 | ) 342 | 343 | def __del__(self): 344 | ggml.ggml_free(self.ctx) 345 | 346 | @staticmethod 347 | def encode_word( 348 | word: str, model: Dict[str, Tuple[int, float]] 349 | ) -> Tuple[List[int], float]: 350 | len_word = len(word) 351 | best_segmentation_starts = [-1] * (len_word + 1) 352 | best_segmentation_scores = [math.inf] * (len_word + 1) 353 | best_segmentation_starts[0], best_segmentation_scores[0] = 0, 0.0 354 | 355 | for idx in range(len_word): 356 | if best_segmentation_starts[idx] != -1: 357 | end_idx = idx + 1 358 | while end_idx <= len_word: 359 | token = word[idx:end_idx] 360 | if token in model: 361 | token_score = model[token][1] 362 | if ( 363 | best_segmentation_scores[idx] + token_score 364 | < 
best_segmentation_scores[end_idx] 365 | ): 366 | best_segmentation_starts[end_idx] = idx 367 | best_segmentation_scores[end_idx] = ( 368 | best_segmentation_scores[idx] + token_score 369 | ) 370 | end_idx += 1 371 | 372 | if best_segmentation_scores[-1] == math.inf: 373 | return [], 0.0 374 | 375 | tokens: Deque[int] = deque() 376 | idx = len_word 377 | while idx > 0: 378 | start_idx = best_segmentation_starts[idx] 379 | token = word[start_idx:idx] 380 | token_id = model[token][0] 381 | tokens.appendleft(token_id) 382 | idx = start_idx 383 | 384 | return list(tokens), best_segmentation_scores[-1] 385 | 386 | def tokenize(self, text: str) -> List[int]: 387 | return self.tokenizer.tokenize(text) 388 | 389 | def detokenize(self, tokens: List[int]) -> str: 390 | return self.tokenizer.detokenize(tokens) 391 | 392 | def reset(self): 393 | self.n_tokens = 0 394 | 395 | def _build_forward( 396 | self, 397 | ctx0: ggml.ggml_context_p, 398 | n_tokens: int, 399 | n_past: int, 400 | n_threads: int, 401 | ): 402 | N = n_tokens 403 | n_embd = self.d_model 404 | n_layer = self.n_layers 405 | n_ctx = self.max_seq_len 406 | n_head = self.n_heads 407 | 408 | gf = ggml.ggml_cgraph(n_threads=n_threads) 409 | 410 | embd = ggml.ggml_new_tensor_1d( 411 | ctx0, 412 | ggml.GGML_TYPE_I32, 413 | N, 414 | ) 415 | ggml.ggml_set_name(embd, b"embd") 416 | 417 | inpL = ggml.ggml_get_rows(ctx0, self.wte_weight, embd) 418 | 419 | for il in range(n_layer): 420 | # // lctx.use_buf(ctx0, 0) 421 | 422 | # // a = self.ln_1(x) 423 | cur = ggml.ggml_norm(ctx0, inpL, 1e-5) 424 | # offload_func(cur) 425 | ggml.ggml_set_name(cur, b"norm_0") 426 | cur = ggml.ggml_mul( 427 | ctx0, 428 | ggml.ggml_repeat(ctx0, self.layers[il].norm_1_weight, cur), 429 | cur, 430 | ) 431 | ggml.ggml_set_name(cur, b"attention_norm_0") 432 | 433 | # // self-attention 434 | # // b, _, past_key_value = self.attn(a, past_key_value=past_key_value, 435 | # // attn_bias=attn_bias, attention_mask=attention_mask, 436 | # // is_causal=is_causal) 437 | 438 | # // compute QKV 439 | cur = ggml.ggml_mul_mat(ctx0, self.layers[il].c_attn_wqkv_weight, cur) 440 | ggml.ggml_set_name(cur, b"tmpkqv") 441 | 442 | Qcur = ggml.ggml_view_2d( 443 | ctx0, 444 | cur, 445 | n_embd, 446 | N, 447 | cur.contents.nb[1], 448 | 0 * ctypes.sizeof(ctypes.c_float) * n_embd, 449 | ) 450 | ggml.ggml_set_name(Qcur, b"Qcur") 451 | Kcur = ggml.ggml_view_2d( 452 | ctx0, 453 | cur, 454 | n_embd, 455 | N, 456 | cur.contents.nb[1], 457 | 1 * ctypes.sizeof(ctypes.c_float) * n_embd, 458 | ) 459 | ggml.ggml_set_name(Kcur, b"Kcur") 460 | Vcur = ggml.ggml_view_2d( 461 | ctx0, 462 | cur, 463 | n_embd, 464 | N, 465 | cur.contents.nb[1], 466 | 2 * ctypes.sizeof(ctypes.c_float) * n_embd, 467 | ) 468 | ggml.ggml_set_name(Vcur, b"Vcur") 469 | 470 | # // store key and value to memory 471 | k = ggml.ggml_view_1d( 472 | ctx0, 473 | self.memory_k, 474 | N * n_embd, 475 | (ggml.ggml_element_size(self.memory_k) * n_embd) 476 | * (il * n_ctx + n_past), 477 | ) 478 | ggml.ggml_set_name(k, b"k") 479 | v = ggml.ggml_view_1d( 480 | ctx0, 481 | self.memory_v, 482 | N * n_embd, 483 | (ggml.ggml_element_size(self.memory_v) * n_embd) 484 | * (il * n_ctx + n_past), 485 | ) 486 | ggml.ggml_set_name(v, b"v") 487 | 488 | ggml.ggml_build_forward_expand( 489 | ctypes.pointer(gf), 490 | ggml.ggml_cpy( 491 | ctx0, 492 | Kcur, 493 | k, 494 | ), 495 | ) 496 | ggml.ggml_build_forward_expand( 497 | ctypes.pointer(gf), 498 | ggml.ggml_cpy( 499 | ctx0, 500 | Vcur, 501 | v, 502 | ), 503 | ) 504 | 505 | # // Q = 
Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 506 | # // 2, 1, 3) [64, N, 12] 507 | Q = ggml.ggml_permute( 508 | ctx0, 509 | ggml.ggml_cpy( 510 | ctx0, 511 | Qcur, 512 | ggml.ggml_new_tensor_3d( 513 | ctx0, 514 | ggml.GGML_TYPE_F32, 515 | n_embd // n_head, 516 | n_head, 517 | N, 518 | ), 519 | ), 520 | 0, 521 | 2, 522 | 1, 523 | 3, 524 | ) 525 | ggml.ggml_set_name(Q, b"Q") 526 | 527 | # // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 528 | # // 3) [64, n_past + N, 12] 529 | K = ggml.ggml_permute( 530 | ctx0, 531 | ggml.ggml_reshape_3d( 532 | ctx0, 533 | ggml.ggml_view_1d( 534 | ctx0, 535 | self.memory_k, 536 | (n_past + N) * n_embd, 537 | il * n_ctx * ggml.ggml_element_size(self.memory_k) * n_embd, 538 | ), 539 | n_embd // n_head, 540 | n_head, 541 | n_past + N, 542 | ), 543 | 0, 544 | 2, 545 | 1, 546 | 3, 547 | ) 548 | ggml.ggml_set_name(K, b"K") 549 | 550 | # // K * Q 551 | KQ = ggml.ggml_mul_mat(ctx0, K, Q) 552 | ggml.ggml_set_name(KQ, b"KQ") 553 | 554 | # // KQ_scaled = KQ / sqrt(n_embd/n_head) 555 | KQ_scaled = ggml.ggml_scale( 556 | ctx0, 557 | KQ, 558 | 1.0 / np.sqrt(float(n_embd) / n_head), 559 | ) 560 | ggml.ggml_set_name(KQ_scaled, b"KQ_scaled") 561 | 562 | KQ_scaled_alibi = ggml.ggml_alibi( 563 | ctx0, 564 | KQ_scaled, 565 | n_past, 566 | n_head, 567 | 8.0, 568 | ) 569 | ggml.ggml_set_name(KQ_scaled_alibi, b"KQ_scaled_alibi") 570 | 571 | # // KQ_masked = mask_past(KQ_scaled) 572 | KQ_masked = ggml.ggml_diag_mask_inf( 573 | ctx0, 574 | KQ_scaled_alibi, 575 | n_past, 576 | ) 577 | ggml.ggml_set_name(KQ_masked, b"KQ_masked") 578 | 579 | # // KQ = soft_max(KQ_masked) 580 | KQ_soft_max = ggml.ggml_soft_max( 581 | ctx0, 582 | KQ_masked, 583 | ) 584 | ggml.ggml_set_name(KQ_soft_max, b"KQ_soft_max") 585 | 586 | # // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 587 | # // 2, 0, 3).contiguous() [n_past + N, 64, 12] 588 | V_trans = ggml.ggml_cpy( 589 | ctx0, 590 | ggml.ggml_permute( 591 | ctx0, 592 | ggml.ggml_reshape_3d( 593 | ctx0, 594 | ggml.ggml_view_1d( 595 | ctx0, 596 | self.memory_v, 597 | (n_past + N) * n_embd, 598 | il * n_ctx * ggml.ggml_element_size(self.memory_v) * n_embd, 599 | ), 600 | n_embd // n_head, 601 | n_head, 602 | n_past + N, 603 | ), 604 | 1, 605 | 2, 606 | 0, 607 | 3, 608 | ), 609 | ggml.ggml_new_tensor_3d( 610 | ctx0, 611 | self.memory_v.contents.type, 612 | n_past + N, 613 | n_embd // n_head, 614 | n_head, 615 | ), 616 | ) 617 | # offload_func_v(V_trans) 618 | ggml.ggml_set_name(V_trans, b"V_trans") 619 | 620 | # // KQV = transpose(V) * KQ_soft_max 621 | KQV = ggml.ggml_mul_mat(ctx0, V_trans, KQ_soft_max) 622 | # offload_func_v(KQV) 623 | ggml.ggml_set_name(KQV, b"KQV") 624 | 625 | # // KQV_merged = KQV.permute(0, 2, 1, 3) 626 | KQV_merged = ggml.ggml_permute( 627 | ctx0, 628 | KQV, 629 | 0, 630 | 2, 631 | 1, 632 | 3, 633 | ) 634 | ggml.ggml_set_name(KQV_merged, b"KQV_merged") 635 | 636 | # // cur = KQV_merged.contiguous().view(n_embd, N) 637 | cur = ggml.ggml_cpy( 638 | ctx0, 639 | KQV_merged, 640 | ggml.ggml_new_tensor_2d( 641 | ctx0, 642 | ggml.GGML_TYPE_F32, 643 | n_embd, 644 | N, 645 | ), 646 | ) 647 | ggml.ggml_set_name(cur, b"KQV_merged_contiguous") 648 | 649 | # // projection 650 | cur = ggml.ggml_mul_mat( 651 | ctx0, 652 | self.layers[il].c_attn_out_proj_weight, 653 | cur, 654 | ) 655 | ggml.ggml_set_name(cur, b"result_wo") 656 | 657 | # // lctx.use_buf(ctx0, 1) 658 | 659 | inpL = ggml.ggml_add( 660 | ctx0, 661 | inpL, 662 | cur, 663 | ) 664 | ggml.ggml_set_name(cur, b"inpFF") 665 | 666 | # // m = 
self.ln_2(x) 667 | cur = ggml.ggml_norm(ctx0, inpL, 1e-5) 668 | ggml.ggml_set_name(cur, b"norm_1") 669 | cur = ggml.ggml_mul( 670 | ctx0, 671 | ggml.ggml_repeat(ctx0, self.layers[il].norm_2_weight, cur), 672 | cur, 673 | ) 674 | ggml.ggml_set_name(cur, b"norm") 675 | 676 | # // n = self.mlp(m) 677 | cur = ggml.ggml_mul_mat( 678 | ctx0, 679 | self.layers[il].c_ffn_up_proj_weight, 680 | cur, 681 | ) 682 | ggml.ggml_set_name(cur, b"result_mlp_up") 683 | 684 | # // GELU activation 685 | cur = ggml.ggml_gelu( 686 | ctx0, 687 | cur, 688 | ) 689 | ggml.ggml_set_name(cur, b"gelu") 690 | # // projection 691 | # // cur = proj_w*cur + proj_b 692 | cur = ggml.ggml_mul_mat( 693 | ctx0, 694 | self.layers[il].c_ffn_down_proj_weight, 695 | cur, 696 | ) 697 | ggml.ggml_set_name(cur, b"result_mlp_down") 698 | 699 | # // x = x + n 700 | inpL = ggml.ggml_add( 701 | ctx0, 702 | inpL, 703 | cur, 704 | ) 705 | ggml.ggml_set_name(cur, b"inpFF_+_result_mlp_down") 706 | 707 | # // lctx.use_buf(ctx0, 0) 708 | 709 | # // norm 710 | inpL = ggml.ggml_norm(ctx0, inpL, 1e-5) 711 | ggml.ggml_set_name(inpL, b"norm_f") 712 | 713 | # // inpL = ln_f_g*inpL 714 | inpL = ggml.ggml_mul( 715 | ctx0, 716 | ggml.ggml_repeat(ctx0, self.norm_f_weight, inpL), 717 | inpL, 718 | ) 719 | ggml.ggml_set_name(inpL, b"norm_f_mul") 720 | 721 | # // output embedding weight tied to input embedding 722 | inpL = ggml.ggml_mul_mat( 723 | ctx0, 724 | self.wte_weight, 725 | inpL, 726 | ) 727 | ggml.ggml_set_name(inpL, b"result_output") 728 | 729 | # // lctx.use_buf(ctx0, -1) 730 | 731 | ggml.ggml_build_forward_expand(ctypes.pointer(gf), inpL) 732 | 733 | return gf 734 | 735 | def _eval_internal(self, embd_inp: Sequence[int], n_past: int, n_threads: int): 736 | N = len(embd_inp) 737 | n_vocab = self.vocab_size 738 | required_buffer_size = int(self.mem_per_token * N * 2.0) 739 | if ( 740 | self.mem_per_token > 0 741 | and self.eval_buffer.buffer_size < required_buffer_size 742 | ): 743 | self.eval_buffer.resize(required_buffer_size) 744 | 745 | init_params = ggml.ggml_init_params( 746 | mem_size=self.eval_buffer.buffer_size, 747 | mem_buffer=self.eval_buffer.buffer, 748 | no_alloc=False, 749 | ) 750 | exit_stack = ExitStack() 751 | ctx0 = ggml.ggml_init(init_params) 752 | if ctx0 is None: 753 | raise RuntimeError("Failed to initialize GGML context") 754 | exit_stack.callback(ggml.ggml_free, self.ctx) 755 | gf = self._build_forward(ctx0, len(embd_inp), n_past, n_threads) 756 | embd = ggml.ggml_graph_get_tensor(ctypes.pointer(gf), b"embd") 757 | assert embd is not None 758 | inpL = ggml.ggml_graph_get_tensor(ctypes.pointer(gf), b"result_output") 759 | assert inpL is not None 760 | to_numpy(embd)[:] = np.array(embd_inp, dtype=np.int32) 761 | gp = ggml.ggml_graph_plan(ctypes.pointer(gf), self.n_threads) 762 | work_data = (ctypes.c_uint8 * gp.work_size)() 763 | gp.work_data = ctypes.cast(work_data, ctypes.POINTER(ctypes.c_uint8)) 764 | if self.cancel_callback is not None: 765 | 766 | @ggml.ggml_abort_callback 767 | def abort_callback(data: ctypes.c_void_p) -> Union[ctypes.c_bool, bool]: 768 | assert self.cancel_callback is not None 769 | return self.cancel_callback() 770 | 771 | self._abort_callback = abort_callback # NOTE: keep reference 772 | gp.abort_callback = abort_callback 773 | rc = ggml.ggml_graph_compute(ctypes.pointer(gf), ctypes.pointer(gp)) 774 | if rc != ggml.GGML_EXIT_SUCCESS: 775 | raise ReplitAbortException("Execution aborted") 776 | embd_w = to_numpy(inpL).reshape( 777 | -1, n_vocab 778 | ) # .copy() # NOTE: likely wrong to not copy here 
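        # Note: embd_w is a view into the shared eval buffer (ctx0 memory), not a copy.
        # This is only safe because eval() copies these rows into self.scores before the
        # next forward pass reuses the same buffer and overwrites the data.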
779 | if self.mem_per_token == 0: 780 | self.mem_per_token = int(ggml.ggml_used_mem(ctx0) / N) 781 | return embd_w 782 | 783 | def eval(self, tokens: Sequence[int]): 784 | if self.mem_per_token == 0: 785 | try: 786 | self._eval_internal([1, 2, 3, 4], n_past=0, n_threads=self.n_threads) 787 | except ReplitAbortException as e: 788 | self.n_tokens = 0 789 | raise e 790 | n_ctx = self.max_seq_len 791 | for i in range(0, len(tokens), self.n_batch): 792 | batch = tokens[i : min(len(tokens), i + self.n_batch)] 793 | n_past = min(n_ctx - len(batch), self.n_tokens) 794 | try: 795 | scores = self._eval_internal( 796 | batch, 797 | n_past, 798 | self.n_threads, 799 | ) 800 | # Save tokens 801 | self.input_ids[self.n_tokens : self.n_tokens + len(batch)] = batch 802 | # Save logits 803 | self.scores[self.n_tokens : self.n_tokens + len(batch), :] = scores 804 | # Update token count 805 | self.n_tokens += len(batch) 806 | except ReplitAbortException as e: 807 | self.n_tokens = n_past 808 | raise e 809 | return self.scores[: self.n_tokens, :] 810 | 811 | def generate( 812 | self, 813 | tokens: Sequence[int], 814 | top_p: float = 0.95, 815 | temperature: float = 0.80, 816 | frequency_penalty: float = 0.0, 817 | presence_penalty: float = 0.0, 818 | ) -> Iterator[int]: 819 | reset = True 820 | if self.n_tokens > 0: 821 | longest_prefix = 0 822 | for a, b in zip(self.input_ids[: self.n_tokens], tokens[:-1]): 823 | if a == b: 824 | longest_prefix += 1 825 | else: 826 | break 827 | if longest_prefix > 0: 828 | reset = False 829 | tokens = tokens[longest_prefix:] 830 | self.n_tokens = longest_prefix 831 | 832 | if reset: 833 | self.reset() 834 | 835 | while True: 836 | scores = self.eval(tokens) 837 | logits = scores[-1, :] 838 | token = sample( 839 | logits, 840 | top_p=top_p, 841 | temperature=temperature, 842 | frequency_penalty=frequency_penalty, 843 | presence_penalty=presence_penalty, 844 | ) 845 | yield token 846 | tokens = [token] 847 | 848 | @staticmethod 849 | def eos_token(): 850 | return 1 851 | 852 | @staticmethod 853 | def init_from_file( 854 | model_file: str, 855 | n_gpu_layers: int = 0, 856 | n_batch: int = 1, 857 | n_threads: int = 1, 858 | tokenizer: Optional[Tokenizer] = None, 859 | verbose: bool = True, 860 | cancel_callback: Optional[Callable[[], bool]] = None, 861 | ) -> ReplitModel: 862 | with open(model_file, "rb") as fin: 863 | # Magic Number 864 | (magic,) = struct.unpack("i", (fin.read(struct.calcsize("i")))) 865 | assert magic == ggml.GGML_FILE_MAGIC 866 | if verbose: 867 | print("magic number =", hex(magic)) 868 | # Hyperparameters 869 | d_model, max_seq_len, n_heads, n_layers, vocab_size, ftype = struct.unpack( 870 | "iiiiii", (fin.read(struct.calcsize("iiiiii"))) 871 | ) 872 | qntvr = ftype // ggml.GGML_QNT_VERSION_FACTOR 873 | if verbose: 874 | print("d_model =", d_model) 875 | print("max_seq_len =", max_seq_len) 876 | print("n_heads =", n_heads) 877 | print("n_layers =", n_layers) 878 | print("vocab_size =", vocab_size) 879 | print("ftype =", ftype) 880 | print("qntvr =", qntvr) 881 | ftype %= ggml.GGML_QNT_VERSION_FACTOR 882 | # Vocabulary 883 | vocab: List[Tuple[int, str, float]] = [] 884 | for i in range(vocab_size): 885 | (s_len,) = struct.unpack("i", (fin.read(struct.calcsize("i")))) 886 | s = fin.read(s_len).decode("utf-8") 887 | (score,) = struct.unpack("f", (fin.read(struct.calcsize("f")))) 888 | vocab.append((i, s, score)) 889 | # Model Weights 890 | wtype = ggml.ggml_ftype_to_ggml_type(ftype) 891 | 892 | n_embd = d_model 893 | n_layer = n_layers 894 | n_ctx = 
max_seq_len 895 | n_vocab = vocab_size 896 | 897 | ctx_size = ReplitModel.compute_ctx_size( 898 | n_embd=n_embd, 899 | n_layer=n_layer, 900 | n_ctx=n_ctx, 901 | n_vocab=n_vocab, 902 | wtype=wtype, 903 | ) 904 | 905 | if verbose: 906 | print("ctx size =", ctx_size // (1024 * 1024), "MB") 907 | 908 | # create context 909 | weights_buffer = CpuContextBuffer(ctx_size) 910 | init_params = ggml.ggml_init_params( 911 | mem_size=ctx_size, 912 | mem_buffer=weights_buffer.buffer, 913 | no_alloc=False, 914 | ) 915 | ctx = ggml.ggml_init(init_params) 916 | if ctx is None: 917 | raise RuntimeError("Failed to initialize GGML context") 918 | 919 | model = ReplitModel( 920 | # hyperparameters 921 | d_model=d_model, 922 | max_seq_len=max_seq_len, 923 | n_heads=n_heads, 924 | n_layers=n_layers, 925 | vocab_size=vocab_size, 926 | ftype=ftype, 927 | # vocabulary 928 | vocab=vocab, 929 | tokenizer=ReplitTokenizer(vocab) if tokenizer is None else tokenizer, 930 | ctx=ctx, 931 | n_batch=n_batch, 932 | n_threads=n_threads, 933 | weights_buffer=weights_buffer, 934 | # misc 935 | cancel_callback=cancel_callback, 936 | ) 937 | 938 | n_tensors = 0 939 | total_size = 0 940 | 941 | while True: 942 | nbytes = struct.calcsize("iii") 943 | data = fin.read(nbytes) 944 | if len(data) != nbytes: 945 | break 946 | n_dims, length, ttype = struct.unpack("iii", data) 947 | nelements = 1 948 | ne = [1, 1] 949 | for i in range(n_dims): 950 | (dim,) = struct.unpack("i", (fin.read(struct.calcsize("i")))) 951 | ne[i] = dim 952 | nelements *= ne[i] 953 | name = fin.read(length).decode("utf-8") 954 | if name not in model.tensors: 955 | raise ValueError(f"Tensor {name} not found in model") 956 | tensor = model.tensors[name] 957 | if ggml.ggml_nelements(tensor) != nelements: 958 | raise ValueError( 959 | f"Tensor {name} has {ggml.ggml_nelements(tensor)} elements, but {nelements} expected" 960 | ) 961 | if tensor.contents.ne[0] != ne[0] or tensor.contents.ne[1] != ne[1]: 962 | raise ValueError( 963 | f"Tensor {name} has {tensor.contents.ne[0]}x{tensor.contents.ne[1]} shape, but {ne[0]}x{ne[1]} expected" 964 | ) 965 | bpe = ggml.ggml_type_size(ttype) 966 | if ( 967 | (nelements * bpe) / ggml.ggml_blck_size(tensor.contents.type) 968 | ) != ggml.ggml_nbytes(tensor): 969 | raise ValueError( 970 | f"Tensor {name} has {ggml.ggml_nbytes(tensor)} bytes, but {(nelements * bpe) / ggml.ggml_blck_size(tensor.contents.type)} expected" 971 | ) 972 | tensor_data = ggml.ggml_get_data(tensor) 973 | if tensor_data is None: 974 | raise ValueError(f"Failed to get data for tensor {name}") 975 | fin.readinto( 976 | (ctypes.c_uint8 * ggml.ggml_nbytes(tensor)).from_address( 977 | tensor_data 978 | ) 979 | ) 980 | 981 | total_size += ggml.ggml_nbytes(tensor) 982 | if n_tensors % 8 == 0: 983 | print(".", end="", flush=True) 984 | n_tensors += 1 985 | print("done") 986 | print( 987 | "model size =", 988 | total_size // (1024 * 1024), 989 | "MB", 990 | "num tensors =", 991 | n_tensors, 992 | ) 993 | return model 994 | 995 | @staticmethod 996 | def compute_ctx_size( 997 | n_embd: int, 998 | n_layer: int, 999 | n_ctx: int, 1000 | n_vocab: int, 1001 | wtype: int, 1002 | ) -> int: 1003 | wtype_sizef = ggml.ggml_type_sizef(wtype) 1004 | f32_sizef = ggml.ggml_type_sizef(ggml.GGML_TYPE_F32) 1005 | f16_sizef = ggml.ggml_type_sizef(ggml.GGML_TYPE_F16) 1006 | 1007 | ctx_size = 0 1008 | ctx_size += n_embd * n_vocab * wtype_sizef 1009 | ctx_size += n_embd * f32_sizef 1010 | 1011 | ctx_size += n_layer * (n_embd * f32_sizef) 1012 | ctx_size += n_layer * (3 * n_embd * n_embd * 
wtype_sizef) 1013 | ctx_size += n_layer * (n_embd**2 * wtype_sizef) 1014 | ctx_size += n_layer * (n_embd * f32_sizef) 1015 | ctx_size += n_layer * (4 * n_embd * n_embd * wtype_sizef) 1016 | ctx_size += n_layer * (n_embd**2 * 4 * wtype_sizef) 1017 | 1018 | ctx_size += n_ctx * n_layer * n_embd * f16_sizef 1019 | ctx_size += n_ctx * n_layer * n_embd * f16_sizef 1020 | 1021 | ctx_size += (1 + 6 * n_layer) * 512 1022 | ctx_size = int(ctx_size) 1023 | return ctx_size 1024 | 1025 | 1026 | if __name__ == "__main__": 1027 | parser = argparse.ArgumentParser() 1028 | parser.add_argument("-m", "--model", type=str, default=None) 1029 | parser.add_argument("-p", "--prompt", type=str, default="def fib(n):") 1030 | parser.add_argument( 1031 | "--n_threads", type=int, default=max(1, multiprocessing.cpu_count() // 2) 1032 | ) 1033 | parser.add_argument("--n_batch", type=int, default=512) 1034 | parser.add_argument("--n_gpu_layers", type=int, default=0) 1035 | parser.add_argument("--max_tokens", type=int, default=32) 1036 | parser.add_argument("--temperature", type=float, default=1.0) 1037 | parser.add_argument("--top_p", type=float, default=1.0) 1038 | parser.add_argument("--presence_penalty", type=float, default=0.0) 1039 | parser.add_argument("--frequency_penalty", type=float, default=0.0) 1040 | args = parser.parse_args() 1041 | 1042 | model_file = args.model 1043 | n_threads = args.n_threads 1044 | n_batch = args.n_batch 1045 | n_gpu_layers = args.n_gpu_layers 1046 | max_tokens = args.max_tokens 1047 | temperature = args.temperature 1048 | top_p = args.top_p 1049 | presence_penalty = args.presence_penalty 1050 | frequency_penalty = args.frequency_penalty 1051 | 1052 | model = ReplitModel.init_from_file( 1053 | model_file, n_gpu_layers=n_gpu_layers, n_threads=n_threads, n_batch=n_batch 1054 | ) 1055 | 1056 | prompt = args.prompt 1057 | prompt_tokens = model.tokenize(prompt) 1058 | all_tokens: List[int] = prompt_tokens[:] # type: ignore 1059 | n_past = 0 1060 | tokens: List[int] = prompt_tokens[:] # type: ignore 1061 | 1062 | print("number of tokens in prompt =", len(prompt_tokens)) 1063 | for i, token_id in enumerate(prompt_tokens): 1064 | print(f"token[{i}] =", token_id) 1065 | 1066 | print() 1067 | print(prompt, end="", flush=True) 1068 | for _ in range(max_tokens): 1069 | # eval 1070 | scores = model.eval(tokens) 1071 | logits = scores[-1, :] 1072 | # sample 1073 | token_id = sample( 1074 | logits, 1075 | last_tokens=all_tokens, 1076 | temperature=temperature, 1077 | top_p=top_p, 1078 | presence_penalty=presence_penalty, 1079 | frequency_penalty=frequency_penalty, 1080 | ) 1081 | if token_id == model.eos_token(): 1082 | break 1083 | # update 1084 | all_tokens.append(token_id) 1085 | print(model.detokenize([token_id]), end="", flush=True) 1086 | n_past += len(tokens) 1087 | tokens = [token_id] 1088 | print() 1089 | -------------------------------------------------------------------------------- /examples/replit/requirements.txt: -------------------------------------------------------------------------------- 1 | ggml-python==0.0.7 2 | fastapi==0.109.1 3 | sse-starlette==1.6.1 4 | uvicorn==0.22.0 5 | -------------------------------------------------------------------------------- /examples/rpc/main.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import argparse 3 | import contextlib 4 | 5 | import ggml 6 | 7 | def main(): 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("--host", type=str, default="127.0.0.1") 10 | 
parser.add_argument("--port", type=int, default=9091) 11 | args = parser.parse_args() 12 | 13 | with contextlib.ExitStack() as stack: 14 | backend = ggml.ggml_backend_rpc_init(f"{args.host}:{args.port}".encode("utf-8")) 15 | assert backend is not None 16 | stack.callback(ggml.ggml_backend_free, backend) 17 | 18 | params = ggml.ggml_init_params( 19 | mem_size=ggml.ggml_tensor_overhead() * 6 + ggml.ggml_graph_overhead() + 10000, 20 | no_alloc=True, 21 | ) 22 | ctx = ggml.ggml_init(params) 23 | assert ctx is not None 24 | stack.callback(ggml.ggml_free, ctx) 25 | 26 | x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 27 | a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 28 | b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 29 | x2 = ggml.ggml_mul(ctx, x, x) 30 | f = ggml.ggml_add(ctx, ggml.ggml_mul(ctx, a, x2), b) 31 | gf = ggml.ggml_new_graph(ctx) 32 | 33 | ggml.ggml_build_forward_expand(gf, f) 34 | 35 | buffer = ggml.ggml_backend_alloc_ctx_tensors(ctx, backend) 36 | assert buffer is not None 37 | stack.callback(ggml.ggml_backend_buffer_free, buffer) 38 | 39 | x_data = (ctypes.c_float * 1)(2.0) 40 | ggml.ggml_backend_tensor_set( 41 | x, # tensor 42 | x_data, # data 43 | 0, # offset 44 | ctypes.sizeof(x_data), # size 45 | ) 46 | 47 | a_data = (ctypes.c_float * 1)(3.0) 48 | ggml.ggml_backend_tensor_set( 49 | a, # tensor 50 | a_data, # data 51 | 0, # offset 52 | ctypes.sizeof(a_data), # size 53 | ) 54 | 55 | b_data = (ctypes.c_float * 1)(4.0) 56 | ggml.ggml_backend_tensor_set( 57 | b, # tensor 58 | b_data, # data 59 | 0, # offset 60 | ctypes.sizeof(b_data), # size 61 | ) 62 | 63 | ggml.ggml_backend_graph_compute(backend, gf) 64 | 65 | output = ctypes.c_float() 66 | ggml.ggml_backend_tensor_get( 67 | f, # tensor 68 | ctypes.byref(output), # data 69 | 0, # offset 70 | ctypes.sizeof(output), # size 71 | ) 72 | 73 | print(f"Output: {output.value}") 74 | 75 | assert output.value == 16.0 76 | 77 | if __name__ == "__main__": 78 | main() -------------------------------------------------------------------------------- /examples/rpc/worker.py: -------------------------------------------------------------------------------- 1 | import ggml 2 | import argparse 3 | 4 | def main(): 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--host", type=str, default="127.0.0.1") 7 | parser.add_argument("--port", type=int, default=9091) 8 | parser.add_argument("--free_mem", type=int, default=1 << 30) 9 | parser.add_argument("--total_mem", type=int, default=1 << 30) 10 | parser.add_argument("--backend", type=str, default="cpu", choices=["cpu", "cuda", "metal"]) 11 | parser.add_argument("--backend-cuda-device", type=int, default=0) 12 | args = parser.parse_args() 13 | 14 | print(f"Starting worker on {args.host}:{args.port}") 15 | print(f"Free memory: {args.free_mem} bytes") 16 | print(f"Total memory: {args.total_mem} bytes") 17 | print(f"Backend: {args.backend}") 18 | 19 | if args.backend == "cpu": 20 | backend = ggml.ggml_backend_cpu_init() 21 | elif args.backend == "cuda": 22 | backend = ggml.ggml_backend_cuda_init(args.backend_cuda_device) 23 | elif args.backend == "metal": 24 | backend = ggml.ggml_backend_metal_init() 25 | else: 26 | raise ValueError(f"Unknown backend: {args.backend}") 27 | 28 | assert backend is not None, "Failed to initialize CPU backend" 29 | 30 | endpoints = "{}:{}".format(args.host, args.port).encode("utf-8") 31 | 32 | free_mem = args.free_mem 33 | total_mem = args.total_mem 34 | 35 | ggml.start_rpc_server(backend, endpoints, free_mem, total_mem) 36 | 37 | 38 | 
if __name__ == "__main__": 39 | main() -------------------------------------------------------------------------------- /ggml/__init__.py: -------------------------------------------------------------------------------- 1 | from .ggml import * 2 | 3 | __version__ = "0.0.37" -------------------------------------------------------------------------------- /ggml/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abetlen/ggml-python/90ffdc076df76f290227052b285e71a94f29f865/ggml/py.typed -------------------------------------------------------------------------------- /ggml/utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions for ggml-python. 2 | """ 3 | from __future__ import annotations 4 | 5 | import enum 6 | import ctypes 7 | import signal 8 | import platform 9 | import traceback 10 | 11 | from typing import Any, Optional, Sequence, Tuple 12 | 13 | from ggml import ggml 14 | 15 | import numpy as np 16 | import numpy.typing as npt 17 | 18 | 19 | class GGML_TYPE(enum.IntEnum): 20 | F32 = ggml.GGML_TYPE_F32 21 | F16 = ggml.GGML_TYPE_F16 22 | Q4_0 = ggml.GGML_TYPE_Q4_0 23 | Q4_1 = ggml.GGML_TYPE_Q4_1 24 | Q5_0 = ggml.GGML_TYPE_Q5_0 25 | Q5_1 = ggml.GGML_TYPE_Q5_1 26 | Q8_0 = ggml.GGML_TYPE_Q8_0 27 | Q8_1 = ggml.GGML_TYPE_Q8_1 28 | I8 = ggml.GGML_TYPE_I8 29 | I16 = ggml.GGML_TYPE_I16 30 | I32 = ggml.GGML_TYPE_I32 31 | 32 | 33 | NUMPY_DTYPE_TO_GGML_TYPE = { 34 | np.float16: GGML_TYPE.F16, 35 | np.float32: GGML_TYPE.F32, 36 | np.int8: GGML_TYPE.I8, 37 | np.int16: GGML_TYPE.I16, 38 | np.int32: GGML_TYPE.I32, 39 | } 40 | 41 | GGML_TYPE_TO_NUMPY_DTYPE = {v: k for k, v in NUMPY_DTYPE_TO_GGML_TYPE.items()} 42 | 43 | 44 | def to_numpy( 45 | tensor: ggml.ggml_tensor_p, 46 | shape: Optional[Tuple[int, ...]] = None, 47 | ) -> npt.NDArray[Any]: 48 | """Get the data of a ggml tensor as a numpy array. 49 | 50 | Parameters: 51 | tensor: ggml tensor 52 | 53 | Returns: 54 | Numpy array with a view of data from tensor 55 | """ 56 | ggml_type = GGML_TYPE(tensor.contents.type) 57 | if ggml_type == GGML_TYPE.F16: 58 | ctypes_type = ctypes.c_uint16 59 | else: 60 | ctypes_type = np.ctypeslib.as_ctypes_type(GGML_TYPE_TO_NUMPY_DTYPE[ggml_type]) 61 | 62 | data = ggml.ggml_get_data(tensor) 63 | if data is None: 64 | raise ValueError("tensor data is None") 65 | array = (ctypes_type * ggml.ggml_nelements(tensor)).from_address(data) 66 | n_dims = ggml.ggml_n_dims(tensor) 67 | shape_ = tuple(reversed(tensor.contents.ne[:n_dims])) 68 | strides = tuple(reversed(tensor.contents.nb[:n_dims])) 69 | output = np.ctypeslib.as_array(array) 70 | if ggml_type == GGML_TYPE.F16: 71 | output.dtype = np.float16 # type: ignore 72 | return np.lib.stride_tricks.as_strided( 73 | output, shape=shape if shape is not None else shape_, strides=strides 74 | ) 75 | 76 | 77 | def from_numpy(x: npt.NDArray[Any], ctx: ggml.ggml_context_p) -> ggml.ggml_tensor_p: 78 | """Create a new ggml tensor with data copied from a numpy array. 
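    Example (illustrative sketch; assumes a small float32 array and a CPU context
    created with ``ggml.ggml_init``):

        params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024)
        ctx = ggml.ggml_init(params)
        x = np.ones((2, 3), dtype=np.float32)
        t = from_numpy(x, ctx)  # copies x's data into a new ggml tensor
        assert to_numpy(t).shape == (2, 3)
        ggml.ggml_free(ctx)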
79 | 80 | Parameters: 81 | x: numpy array 82 | ctx: ggml context 83 | 84 | Returns: 85 | New ggml tensor with data copied from x 86 | """ 87 | ggml_type = NUMPY_DTYPE_TO_GGML_TYPE[x.dtype.type] 88 | shape = tuple(reversed(x.shape)) 89 | tensor = ggml.ggml_new_tensor( 90 | ctx, 91 | ggml_type.value, 92 | len(shape), 93 | (ctypes.c_int64 * len(shape))(*shape), 94 | ) 95 | tensor.contents.nb[: len(shape)] = (ctypes.c_int64 * len(shape))( 96 | *tuple(reversed(x.strides)) 97 | ) 98 | if ggml.ggml_get_data(tensor) is not None: 99 | to_numpy(tensor)[:] = x 100 | return tensor 101 | 102 | 103 | def copy_to_cpu( 104 | ctx: ggml.ggml_context_p, tensor: ggml.ggml_tensor_p 105 | ) -> ggml.ggml_tensor_p: 106 | """Copy a ggml tensor from a GPU backend to CPU. 107 | 108 | Parameters: 109 | ctx: ggml context 110 | tensor: ggml tensor 111 | 112 | Returns: 113 | New ggml tensor with data copied from tensor on CPU backend""" 114 | tmp = ggml.ggml_dup_tensor(ctx, tensor) 115 | to_numpy(tmp)[:] = 0 116 | return ggml.ggml_add_inplace(ctx, tmp, tensor) 117 | 118 | 119 | def quantize_0( 120 | data_f32: ggml.CtypesArray[ctypes.c_float], 121 | nelements: int, 122 | ne0: int, 123 | ttype: GGML_TYPE, 124 | work: Optional[ggml.CtypesArray[ctypes.c_float]] = None, 125 | imatrix: Optional[ggml.CtypesArray[ctypes.c_float]] = None, 126 | ): 127 | """Quantize a float32 array. 128 | 129 | Parameters: 130 | data_f32: float32 array 131 | nelements: number of elements in data_f32 132 | ne0: number of elements in data_f32 that are zero 133 | ttype: ggml type to quantize to 134 | work: work buffer 135 | imatrix: quantization matrix 136 | 137 | Returns: 138 | (work, cur_size): outpuut buffer, histogram, number of bytes in work buffer 139 | """ 140 | work = work or (ctypes.c_float * nelements)() 141 | cur_size = ggml.ggml_quantize_chunk( 142 | ttype, 143 | data_f32, 144 | ctypes.cast(work, ctypes.c_void_p), 145 | 0, 146 | nelements, 147 | ne0, 148 | imatrix, 149 | ) 150 | return ctypes.cast(work, ctypes.c_void_p), cur_size 151 | 152 | 153 | def quantize_row( 154 | data_f32: ggml.CtypesArray[ctypes.c_float], 155 | nelements: int, 156 | ttype: GGML_TYPE, 157 | work: Optional[ctypes.c_void_p] = None, 158 | ): 159 | """Quantize a row of a ggml tensor. 160 | 161 | Parameters: 162 | data_f32: float32 array 163 | nelements: number of elements in data_f32 164 | ttype: ggml type to quantize to 165 | work: work buffer 166 | 167 | Returns: 168 | output buffer""" 169 | type_traits = ggml.ggml_internal_get_type_traits(ttype.value) 170 | from_float = type_traits.from_float 171 | work = work or ctypes.cast((ctypes.c_float * nelements)(), ctypes.c_void_p) 172 | from_float(data_f32, work, nelements) 173 | return work 174 | 175 | 176 | def dequantize_row( 177 | data_q: ctypes.c_void_p, 178 | nelements: int, 179 | ttype: GGML_TYPE, 180 | work: Optional[ctypes.c_void_p] = None, 181 | ): 182 | """Dequantize a row of a ggml tensor. 183 | 184 | Parameters: 185 | data_q: quantized data 186 | nelements: number of elements in data_q 187 | ttype: ggml type to dequantize from 188 | work: work buffer 189 | 190 | Returns: 191 | output buffer""" 192 | type_traits = ggml.ggml_internal_get_type_traits(ttype.value) 193 | to_float = type_traits.to_float 194 | work = work or ctypes.cast((ctypes.c_float * nelements)(), ctypes.c_void_p) 195 | to_float(data_q, work, nelements) 196 | return work 197 | 198 | 199 | def get_ndims(tensor: ggml.ggml_tensor_p) -> int: 200 | """Get the number of dimensions of a ggml tensor. 
201 | 202 | Parameters: 203 | tensor: ggml tensor 204 | 205 | Returns: 206 | Number of dimensions of tensor 207 | """ 208 | return ggml.ggml_n_dims(tensor) 209 | 210 | 211 | def get_shape(tensor: ggml.ggml_tensor_p) -> Tuple[int, ...]: 212 | """Get the shape of a ggml tensor. 213 | 214 | Parameters: 215 | tensor: ggml tensor 216 | 217 | Returns: 218 | Shape of tensor 219 | """ 220 | return tuple(tensor.contents.ne[: ggml.ggml_n_dims(tensor)]) 221 | 222 | 223 | def get_strides(tensor: ggml.ggml_tensor_p) -> Tuple[int, ...]: 224 | """Get the strides of a ggml tensor. 225 | 226 | Parameters: 227 | tensor: ggml tensor 228 | 229 | Returns: 230 | Strides of tensor 231 | """ 232 | return tuple(tensor.contents.nb[: ggml.ggml_n_dims(tensor)]) 233 | 234 | 235 | def slice_tensor( 236 | ctx: ggml.ggml_context_p, tensor: ggml.ggml_tensor_p, indices: Sequence[slice] 237 | ): 238 | """Slice a ggml tensor along multiple dimensions. 239 | 240 | The slice is a view of the original tensor with the same number of dimensions. 241 | 242 | Parameters: 243 | ctx: ggml context 244 | tensor: ggml tensor 245 | indices: indices to slice along 246 | 247 | Returns: 248 | New ggml tensor slice view""" 249 | ndims = ggml.ggml_n_dims(tensor) 250 | 251 | # check that the number of dimensions match 252 | if len(indices) != ndims: 253 | raise ValueError( 254 | f"tensor has {ndims} dimensions but {len(indices)} indices were given" 255 | ) 256 | 257 | # calculate slice 258 | start = tuple(idx.start or 0 for idx in indices) 259 | end = tuple(idx.stop or get_shape(tensor)[i] for i, idx in enumerate(indices)) 260 | step = tuple(idx.step or 1 for idx in indices) 261 | 262 | # get the shape of the slice 263 | shape = tuple((end[i] - start[i] + step[i] - 1) // step[i] for i in range(ndims)) 264 | 265 | # get the strides of the slice 266 | strides = tuple(get_strides(tensor)[i] * step[i] for i in range(ndims)) 267 | 268 | # get the offset of the slice 269 | offset = sum(get_strides(tensor)[i] * start[i] for i in range(ndims)) 270 | 271 | if ndims == 1: 272 | return ggml.ggml_view_1d( 273 | ctx, 274 | tensor, 275 | shape[0], 276 | offset, 277 | ) 278 | elif ndims == 2: 279 | return ggml.ggml_view_2d( 280 | ctx, 281 | tensor, 282 | shape[0], 283 | shape[1], 284 | strides[1], 285 | offset, 286 | ) 287 | elif ndims == 3: 288 | return ggml.ggml_view_3d( 289 | ctx, 290 | tensor, 291 | shape[0], 292 | shape[1], 293 | shape[2], 294 | strides[1], 295 | strides[2], 296 | offset, 297 | ) 298 | elif ndims == 4: 299 | return ggml.ggml_view_4d( 300 | ctx, 301 | tensor, 302 | shape[0], 303 | shape[1], 304 | shape[2], 305 | shape[3], 306 | strides[1], 307 | strides[2], 308 | strides[3], 309 | offset, 310 | ) 311 | else: 312 | raise NotImplementedError( 313 | f"ggml tensors with {ndims} dimensions are not supported" 314 | ) 315 | 316 | 317 | def setup_sigabrt_handler(): 318 | if platform.system() == "Windows": 319 | return 320 | 321 | c_globals = ctypes.CDLL(None) # POSIX 322 | signal_type = signal.SIGABRT 323 | 324 | @ctypes.CFUNCTYPE(None, ctypes.c_int) 325 | def sigabrt_handler(sig): # type: ignore 326 | traceback.print_stack() 327 | raise Exception("GGML SIGABRT") 328 | 329 | c_globals.signal(signal_type, sigabrt_handler) 330 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: ggml-python 2 | site_url: https://ggml-python.readthedocs.io 3 | repo_url: https://github.com/abetlen/ggml-python 4 | 5 | theme: 6 | 
name: material 7 | palette: 8 | 9 | # Palette toggle for light mode 10 | - scheme: default 11 | primary: white 12 | toggle: 13 | icon: material/brightness-7 14 | name: Switch to dark mode 15 | 16 | # Palette toggle for dark mode 17 | - scheme: slate 18 | primary: black 19 | toggle: 20 | icon: material/brightness-4 21 | name: Switch to light mode 22 | 23 | features: 24 | - navigation.tabs 25 | - navigation.tabs.sticky 26 | - toc.integrate 27 | - navigation.footer 28 | 29 | plugins: 30 | - mkdocstrings: 31 | handlers: 32 | python: 33 | options: 34 | members_order: source 35 | group_by_category: false 36 | signature_crossrefs: true 37 | show_signature: true 38 | docstring_section_style: list 39 | show_root_heading: true 40 | import: 41 | - https://docs.python.org/3/objects.inv 42 | - https://numpy.org/doc/stable/objects.inv 43 | - search 44 | - social 45 | 46 | markdown_extensions: 47 | - pymdownx.superfences 48 | - pymdownx.inlinehilite 49 | - pymdownx.snippets 50 | - pymdownx.tabbed: 51 | alternate_style: true 52 | - pymdownx.highlight: 53 | anchor_linenums: true 54 | line_spans: __span 55 | pygments_lang_class: true 56 | 57 | watch: 58 | - ggml -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["scikit-build-core[pyproject]>=0.5.1"] 3 | build-backend = "scikit_build_core.build" 4 | 5 | [project] 6 | name = "ggml_python" 7 | dynamic = ["version"] 8 | description = "Python bindings for ggml" 9 | readme = "README.md" 10 | license = { text = "MIT" } 11 | authors = [ 12 | { name = "Andrei Betlen", email = "abetlen@gmail.com" }, 13 | ] 14 | requires-python = ">=3.7" 15 | dependencies = [ 16 | "numpy>=1.20.0", 17 | "typing_extensions>=4.6.3", 18 | "importlib_resources>=6.4.0; python_version < '3.9'", 19 | ] 20 | classifiers = [ 21 | "Programming Language :: Python :: 3", 22 | "Programming Language :: Python :: 3.8", 23 | "Programming Language :: Python :: 3.9", 24 | "Programming Language :: Python :: 3.10", 25 | "Programming Language :: Python :: 3.11", 26 | "Programming Language :: Python :: 3.12", 27 | ] 28 | 29 | [tool.scikit-build] 30 | wheel.packages = ["ggml"] 31 | wheel.expand-macos-universal-tags = true 32 | cmake.verbose = true 33 | cmake.minimum-version = "3.21" 34 | minimum-version = "0.5.1" 35 | 36 | [tool.scikit-build.metadata.version] 37 | provider = "scikit_build_core.metadata.regex" 38 | input = "ggml/__init__.py" 39 | 40 | [tool.pytest.ini_options] 41 | addopts = "--ignore=vendor" 42 | testpaths = "tests" 43 | 44 | [project.optional-dependencies] 45 | test = ["pytest"] 46 | docs = ["mkdocs", "mkdocstrings[python]", "mkdocs-material", "pillow", "cairosvg"] 47 | publish = ["build"] 48 | convert = [ 49 | "accelerate==0.30.1", 50 | "numpy==1.26.4", 51 | "sentencepiece==0.2.0", 52 | "torch==2.3.0", 53 | "torchaudio==2.3.0", 54 | "torchvision==0.18.0", 55 | "transformers==4.41.2" 56 | ] 57 | 58 | [project.urls] 59 | Homepage = "https://github.com/abetlen/ggml-python" 60 | Documentation = "https://ggml-python.readthedocs.io/en/latest/" 61 | Issues = "https://github.com/abetlen/ggml-python/issues" 62 | -------------------------------------------------------------------------------- /scripts/releases-to-pep-503.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Get output directory or default to index/whl/cpu 4 | output_dir=${1:-"index/whl/cpu"} 5 | 6 | # Create output 
directory 7 | mkdir -p $output_dir 8 | 9 | # Change to output directory 10 | pushd $output_dir 11 | 12 | # Create an index html file 13 | echo "" > index.html 14 | echo "" >> index.html 15 | echo " " >> index.html 16 | echo " " >> index.html 17 | echo " ggml-python" >> index.html 18 | echo "
" >> index.html 19 | echo " " >> index.html 20 | echo "" >> index.html 21 | echo "" >> index.html 22 | 23 | # Create ggml-python directory 24 | mkdir -p ggml-python 25 | 26 | # Change to ggml-python directory 27 | pushd ggml-python 28 | 29 | # Create an index html file 30 | echo "" > index.html 31 | echo "" >> index.html 32 | echo " " >> index.html 33 | echo "

Links for ggml-python

" >> index.html 34 | 35 | # Get all releases 36 | releases=$(curl -s https://api.github.com/repos/abetlen/ggml-python/releases | jq -r .[].tag_name) 37 | 38 | # Get pattern from second arg or default to valid python package version pattern 39 | pattern=${2:-"^[v]?[0-9]+\.[0-9]+\.[0-9]+$"} 40 | 41 | # Filter releases by pattern 42 | releases=$(echo $releases | tr ' ' '\n' | grep -E $pattern) 43 | 44 | # For each release, get all assets 45 | for release in $releases; do 46 | assets=$(curl -s https://api.github.com/repos/abetlen/ggml-python/releases/tags/$release | jq -r .assets) 47 | echo "

$release

" >> index.html 48 | for asset in $(echo $assets | jq -r .[].browser_download_url); do 49 | if [[ $asset == *".whl" ]]; then 50 | echo " $asset" >> index.html 51 | echo "
" >> index.html 52 | fi 53 | done 54 | done 55 | 56 | echo " " >> index.html 57 | echo "" >> index.html 58 | echo "" >> index.html -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abetlen/ggml-python/90ffdc076df76f290227052b285e71a94f29f865/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_ggml.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | 3 | from typing import Optional 4 | 5 | import ggml 6 | 7 | import numpy as np 8 | 9 | 10 | def test_ggml(): 11 | assert ggml.GGML_FILE_VERSION == 1 12 | 13 | params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024, mem_buffer=None) 14 | ctx = ggml.ggml_init(params) 15 | assert ctx is not None 16 | assert ggml.ggml_used_mem(ctx) == 0 17 | x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 18 | a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 19 | b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 20 | x2 = ggml.ggml_mul(ctx, x, x) 21 | f = ggml.ggml_add(ctx, ggml.ggml_mul(ctx, a, x2), b) 22 | gf = ggml.ggml_new_graph(ctx) 23 | ggml.ggml_build_forward_expand(gf, f) 24 | 25 | ggml.ggml_set_f32(x, 2.0) 26 | ggml.ggml_set_f32(a, 3.0) 27 | ggml.ggml_set_f32(b, 4.0) 28 | 29 | ggml.ggml_graph_compute_with_ctx(ctx, gf, 1) 30 | output = ggml.ggml_get_f32_1d(f, 0) 31 | assert output == 16.0 32 | ggml.ggml_free(ctx) 33 | 34 | 35 | def test_ggml_pythonic(): 36 | import contextlib 37 | 38 | with contextlib.ExitStack() as stack: 39 | backend = ggml.ggml_backend_cpu_init() 40 | assert backend is not None 41 | stack.callback(ggml.ggml_backend_free, backend) 42 | 43 | params = ggml.ggml_init_params( 44 | mem_size=ggml.ggml_tensor_overhead() * 6 + ggml.ggml_graph_overhead(), 45 | no_alloc=True, 46 | ) 47 | ctx = ggml.ggml_init(params) 48 | assert ctx is not None 49 | stack.callback(ggml.ggml_free, ctx) 50 | 51 | x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 52 | a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 53 | b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 54 | x2 = ggml.ggml_mul(ctx, x, x) 55 | f = ggml.ggml_add(ctx, ggml.ggml_mul(ctx, a, x2), b) 56 | gf = ggml.ggml_new_graph(ctx) 57 | 58 | ggml.ggml_build_forward_expand(gf, f) 59 | 60 | buffer = ggml.ggml_backend_alloc_ctx_tensors(ctx, backend) 61 | assert buffer is not None 62 | stack.callback(ggml.ggml_backend_buffer_free, buffer) 63 | 64 | ggml.ggml_set_f32(x, 2.0) 65 | ggml.ggml_set_f32(a, 3.0) 66 | ggml.ggml_set_f32(b, 4.0) 67 | 68 | ggml.ggml_backend_graph_compute(backend, gf) 69 | 70 | output = ggml.ggml_get_f32_1d(f, 0) 71 | 72 | assert output == 16.0 73 | 74 | 75 | def test_ggml_custom_op(): 76 | params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024, mem_buffer=None) 77 | ctx = ggml.ggml_init(params) 78 | assert ctx is not None 79 | x_in = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 80 | 81 | @ggml.ggml_custom1_op_t 82 | def double( 83 | tensor_out: ggml.ggml_tensor_p, 84 | tensor_in: ggml.ggml_tensor_p, 85 | ith: int, 86 | nth: int, 87 | userdata: Optional[ctypes.c_void_p], 88 | ): 89 | value = ggml.ggml_get_f32_1d(tensor_in, 0) 90 | ggml.ggml_set_f32(tensor_out, 2 * value) 91 | 92 | x_out = ggml.ggml_map_custom1(ctx, x_in, double, 1, None) 93 | gf = ggml.ggml_new_graph(ctx) 94 | ggml.ggml_build_forward_expand(gf, x_out) 95 | 96 | ggml.ggml_set_f32(x_in, 21.0) 97 | 98 
| ggml.ggml_graph_compute_with_ctx(ctx, gf, 1) 99 | output = ggml.ggml_get_f32_1d(x_out, 0) 100 | assert output == 42.0 101 | ggml.ggml_free(ctx) 102 | 103 | 104 | def test_quantize(): 105 | ne0 = 32 106 | ne1 = 1 107 | nelements = ne0 * ne1 108 | data = [float(i) for i in range(nelements)] 109 | data_f32 = (ctypes.c_float * len(data))(*data) 110 | work = (ctypes.c_float * nelements)(0) 111 | # TODO: convert to ggml.ggml_quantize_chunk 112 | # cur_size = ggml.ggml_quantize_q8_0( 113 | cur_size = ggml.ggml_quantize_chunk( 114 | ggml.GGML_TYPE_Q8_0, 115 | data_f32, 116 | ctypes.cast(work, ctypes.c_void_p), 117 | 0, 118 | nelements // ne0, 119 | ne0, 120 | None, 121 | ) 122 | assert cur_size == 34 123 | 124 | type_traits = ggml.ggml_internal_get_type_traits(ggml.GGML_TYPE_Q8_0) 125 | work2 = (ctypes.c_float * nelements)(0) 126 | type_traits.to_float( 127 | ctypes.cast(work, ctypes.c_void_p), 128 | ctypes.cast(work2, ctypes.POINTER(ctypes.c_float)), 129 | nelements, 130 | ) 131 | 132 | eps = 0.5 133 | for i in range(nelements): 134 | assert abs(work2[i] - data[i]) < eps 135 | 136 | 137 | def test_ggml_cpu_backend(): 138 | n_tensors = 1 + 2 # input (x) and weights (a, b) 139 | params = ggml.ggml_init_params( 140 | mem_size=ggml.ggml_tensor_overhead() * n_tensors, mem_buffer=None, no_alloc=True 141 | ) 142 | ctx = ggml.ggml_init(params) 143 | assert ctx is not None 144 | 145 | backend = ggml.ggml_backend_cpu_init() 146 | 147 | assert backend is not None 148 | 149 | # create the tensors for input and weights 150 | x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 151 | 152 | a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 153 | b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 154 | 155 | # allocate the tensors in the backend 156 | buffer = ggml.ggml_backend_alloc_ctx_tensors(ctx, backend) 157 | assert buffer is not None 158 | 159 | # set the values of the weights 160 | ggml.ggml_backend_tensor_set( 161 | a, 162 | ctypes.cast(np.array([3.0], dtype=np.single).ctypes.data, ctypes.c_void_p), 163 | 0, 164 | ggml.ggml_nbytes(a), 165 | ) 166 | ggml.ggml_backend_tensor_set( 167 | b, 168 | ctypes.cast(np.array([4.0], dtype=np.single).ctypes.data, ctypes.c_void_p), 169 | 0, 170 | ggml.ggml_nbytes(a), 171 | ) 172 | 173 | max_nodes = 4096 174 | 175 | buf_size = ( 176 | ggml.ggml_tensor_overhead() * max_nodes 177 | + ggml.ggml_graph_overhead_custom(max_nodes, False) 178 | ) 179 | buf = (ctypes.c_uint8 * buf_size)() 180 | 181 | def build_graph( 182 | x: ggml.ggml_tensor_p, a: ggml.ggml_tensor_p, b: ggml.ggml_tensor_p 183 | ): 184 | params = ggml.ggml_init_params( 185 | mem_size=buf_size, 186 | mem_buffer=ctypes.cast(buf, ctypes.c_void_p), 187 | no_alloc=True, 188 | ) 189 | ctx0 = ggml.ggml_init(params) 190 | 191 | assert ctx0 is not None 192 | 193 | gf = ggml.ggml_new_graph_custom(ctx0, max_nodes, False) 194 | 195 | x2 = ggml.ggml_mul(ctx0, x, x) 196 | ax2 = ggml.ggml_mul(ctx0, a, x2) 197 | f = ggml.ggml_add(ctx0, ax2, b) 198 | 199 | ggml.ggml_set_name(x2, b"x2") 200 | ggml.ggml_set_name(ax2, b"ax2") 201 | ggml.ggml_set_name(f, b"f") 202 | 203 | ggml.ggml_build_forward_expand(gf, f) 204 | 205 | ggml.ggml_free(ctx0) 206 | 207 | return gf 208 | 209 | buffer_type = ggml.ggml_backend_get_default_buffer_type(backend) 210 | assert buffer_type is not None 211 | allocr = ggml.ggml_gallocr_new(buffer_type) 212 | assert allocr is not None 213 | 214 | gf = build_graph(x, a, b) 215 | 216 | ggml.ggml_gallocr_reserve(allocr, gf) 217 | 218 | gf = build_graph(x, a, b) 219 | 220 | 
ggml.ggml_gallocr_alloc_graph(allocr, gf) 221 | 222 | ggml.ggml_backend_tensor_set( 223 | x, 224 | ctypes.cast(np.array([2.0], dtype=np.single).ctypes.data, ctypes.c_void_p), 225 | 0, 226 | ggml.ggml_nbytes(x), 227 | ) 228 | 229 | ggml.ggml_backend_graph_compute(backend, gf) 230 | 231 | f = ggml.ggml_graph_get_tensor(gf, b"f") 232 | 233 | output = np.zeros(1, dtype=np.single) 234 | ggml.ggml_backend_tensor_get( 235 | f, ctypes.cast(output.ctypes.data, ctypes.c_void_p), 0, ggml.ggml_nbytes(x) 236 | ) 237 | 238 | assert output[0] == 16.0 239 | 240 | ggml.ggml_gallocr_free(allocr) 241 | ggml.ggml_backend_buffer_free(buffer) 242 | ggml.ggml_backend_free(backend) 243 | ggml.ggml_free(ctx) 244 | 245 | 246 | def test_grad(): 247 | nthreads = 1 248 | params = ggml.ggml_init_params( 249 | mem_size=128 * 1024 * 1024, mem_buffer=None, no_alloc=False 250 | ) 251 | ctx0 = ggml.ggml_init(params) 252 | assert ctx0 is not None 253 | 254 | x = ggml.ggml_new_tensor_1d(ctx0, ggml.GGML_TYPE_F32, 1) 255 | 256 | ggml.ggml_set_param(ctx0, x) 257 | 258 | a = ggml.ggml_new_tensor_1d(ctx0, ggml.GGML_TYPE_F32, 1) 259 | b = ggml.ggml_mul(ctx0, x, x) 260 | f = ggml.ggml_mul(ctx0, a, b) 261 | 262 | gf = ggml.ggml_new_graph_custom(ctx0, ggml.GGML_DEFAULT_GRAPH_SIZE, True) 263 | ggml.ggml_build_forward_expand(gf, f) 264 | 265 | gb = ggml.ggml_graph_dup(ctx0, gf) 266 | 267 | ggml.ggml_build_backward_expand(ctx0, gf, gb, False) 268 | 269 | ggml.ggml_set_f32(x, 2.0) 270 | ggml.ggml_set_f32(a, 3.0) 271 | 272 | ggml.ggml_graph_reset(gf) 273 | ggml.ggml_set_f32(f.contents.grad, 1.0) 274 | 275 | ggml.ggml_graph_compute_with_ctx(ctx0, gb, nthreads) 276 | 277 | assert ggml.ggml_get_f32_1d(f, 0) == 12.0 278 | assert ggml.ggml_get_f32_1d(x.contents.grad, 0) == 12.0 279 | 280 | ggml.ggml_free(ctx0) 281 | -------------------------------------------------------------------------------- /tests/test_ggml_cuda.py: -------------------------------------------------------------------------------- 1 | import ggml 2 | import ggml.utils 3 | import ctypes 4 | import pytest 5 | import numpy as np 6 | 7 | ggml_cuda_available = ggml.GGML_USE_CUDA 8 | 9 | run_if_ggml_cuda_available = pytest.mark.skipif( 10 | not ggml_cuda_available, 11 | reason="CUDA not available", 12 | ) 13 | 14 | 15 | @run_if_ggml_cuda_available 16 | def test_cuda(): 17 | n_tensors = 1 + 2 # input (x) and weights (a, b) 18 | params = ggml.ggml_init_params( 19 | mem_size=ggml.ggml_tensor_overhead() * n_tensors, mem_buffer=None, no_alloc=True 20 | ) 21 | ctx = ggml.ggml_init(params) 22 | assert ctx is not None 23 | 24 | backend = ggml.ggml_backend_cuda_init(0) 25 | 26 | assert backend is not None 27 | 28 | # create the tensors for input and weights 29 | x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 30 | 31 | a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 32 | b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 33 | 34 | # allocate the tensors in the backend 35 | buffer = ggml.ggml_backend_alloc_ctx_tensors(ctx, backend) 36 | 37 | # set the values of the weights 38 | ggml.ggml_backend_tensor_set( 39 | a, 40 | ctypes.cast(np.array([3.0], dtype=np.single).ctypes.data, ctypes.c_void_p), 41 | 0, 42 | ggml.ggml_nbytes(a), 43 | ) 44 | ggml.ggml_backend_tensor_set( 45 | b, 46 | ctypes.cast(np.array([4.0], dtype=np.single).ctypes.data, ctypes.c_void_p), 47 | 0, 48 | ggml.ggml_nbytes(a), 49 | ) 50 | 51 | max_nodes = 4096 52 | 53 | buf_size = ( 54 | ggml.ggml_tensor_overhead() * max_nodes 55 | + ggml.ggml_graph_overhead_custom(max_nodes, False) 56 | ) 57 | buf = 
(ctypes.c_uint8 * buf_size)() 58 | 59 | def build_graph( 60 | x: ggml.ggml_tensor_p, a: ggml.ggml_tensor_p, b: ggml.ggml_tensor_p 61 | ): 62 | params = ggml.ggml_init_params( 63 | mem_size=buf_size, 64 | mem_buffer=ctypes.cast(buf, ctypes.c_void_p), 65 | no_alloc=True, 66 | ) 67 | ctx0 = ggml.ggml_init(params) 68 | 69 | assert ctx0 is not None 70 | 71 | gf = ggml.ggml_new_graph_custom(ctx0, max_nodes, False) 72 | 73 | x2 = ggml.ggml_mul(ctx0, x, x) 74 | ax2 = ggml.ggml_mul(ctx0, a, x2) 75 | f = ggml.ggml_add(ctx0, ax2, b) 76 | 77 | ggml.ggml_set_name(x2, b"x2") 78 | ggml.ggml_set_name(ax2, b"ax2") 79 | ggml.ggml_set_name(f, b"f") 80 | 81 | ggml.ggml_build_forward_expand(gf, f) 82 | 83 | ggml.ggml_free(ctx0) 84 | 85 | return gf 86 | 87 | allocr = ggml.ggml_gallocr_new(ggml.ggml_backend_get_default_buffer_type(backend)) 88 | 89 | gf = build_graph(x, a, b) 90 | 91 | ggml.ggml_gallocr_reserve(allocr, gf) 92 | 93 | gf = build_graph(x, a, b) 94 | 95 | ggml.ggml_gallocr_alloc_graph(allocr, gf) 96 | 97 | ggml.ggml_backend_tensor_set( 98 | x, 99 | ctypes.cast(np.array([2.0], dtype=np.single).ctypes.data, ctypes.c_void_p), 100 | 0, 101 | ggml.ggml_nbytes(x), 102 | ) 103 | 104 | ggml.ggml_backend_graph_compute(backend, gf) 105 | 106 | f = ggml.ggml_graph_get_tensor(gf, b"f") 107 | 108 | output = np.zeros(1, dtype=np.single) 109 | ggml.ggml_backend_tensor_get( 110 | f, ctypes.cast(output.ctypes.data, ctypes.c_void_p), 0, ggml.ggml_nbytes(x) 111 | ) 112 | 113 | assert output[0] == 16.0 114 | 115 | ggml.ggml_gallocr_free(allocr) 116 | ggml.ggml_backend_buffer_free(buffer) 117 | ggml.ggml_backend_free(backend) 118 | ggml.ggml_free(ctx) 119 | -------------------------------------------------------------------------------- /tests/test_ggml_metal.py: -------------------------------------------------------------------------------- 1 | import ggml 2 | import ggml.utils 3 | import ctypes 4 | import pytest 5 | import numpy as np 6 | 7 | from ggml.utils import setup_sigabrt_handler 8 | 9 | setup_sigabrt_handler() 10 | 11 | ggml_metal_available = ggml.GGML_USE_METAL 12 | 13 | run_if_ggml_metal_available = pytest.mark.skipif( 14 | not ggml_metal_available, 15 | reason="METAL not available", 16 | ) 17 | 18 | @run_if_ggml_metal_available 19 | def test_metal(): 20 | n_tensors = 1 + 2 # input (x) and weights (a, b) 21 | params = ggml.ggml_init_params( 22 | mem_size=ggml.ggml_tensor_overhead() * n_tensors, mem_buffer=None, no_alloc=True 23 | ) 24 | ctx = ggml.ggml_init(params) 25 | assert ctx is not None 26 | 27 | backend = ggml.ggml_backend_metal_init() 28 | 29 | assert backend is not None 30 | 31 | # create the tensors for input and weights 32 | x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 33 | 34 | a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 35 | b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 36 | 37 | # allocate the tensors in the backend 38 | buffer = ggml.ggml_backend_alloc_ctx_tensors(ctx, backend) 39 | 40 | # set the values of the weights 41 | ggml.ggml_backend_tensor_set( 42 | a, 43 | ctypes.cast(np.array([3.0], dtype=np.single).ctypes.data, ctypes.c_void_p), 44 | 0, 45 | ggml.ggml_nbytes(a), 46 | ) 47 | ggml.ggml_backend_tensor_set( 48 | b, 49 | ctypes.cast(np.array([4.0], dtype=np.single).ctypes.data, ctypes.c_void_p), 50 | 0, 51 | ggml.ggml_nbytes(a), 52 | ) 53 | 54 | max_nodes = 4096 55 | 56 | buf_size = ( 57 | ggml.ggml_tensor_overhead() * max_nodes 58 | + ggml.ggml_graph_overhead_custom(max_nodes, False) 59 | ) 60 | buf = (ctypes.c_uint8 * buf_size)() 61 | 62 | def 
63 |         x: ggml.ggml_tensor_p, a: ggml.ggml_tensor_p, b: ggml.ggml_tensor_p
64 |     ):
65 |         params = ggml.ggml_init_params(
66 |             mem_size=buf_size,
67 |             mem_buffer=ctypes.cast(buf, ctypes.c_void_p),
68 |             no_alloc=True,
69 |         )
70 |         ctx0 = ggml.ggml_init(params)
71 | 
72 |         assert ctx0 is not None
73 | 
74 |         gf = ggml.ggml_new_graph_custom(ctx0, max_nodes, False)
75 | 
76 |         x2 = ggml.ggml_mul(ctx0, x, x)
77 |         ax2 = ggml.ggml_mul(ctx0, a, x2)
78 |         f = ggml.ggml_add(ctx0, ax2, b)
79 | 
80 |         ggml.ggml_set_name(x2, b"x2")
81 |         ggml.ggml_set_name(ax2, b"ax2")
82 |         ggml.ggml_set_name(f, b"f")
83 | 
84 |         ggml.ggml_build_forward_expand(gf, f)
85 | 
86 |         ggml.ggml_free(ctx0)
87 | 
88 |         return gf
89 | 
90 |     allocr = ggml.ggml_gallocr_new(ggml.ggml_backend_get_default_buffer_type(backend))
91 | 
92 |     gf = build_graph(x, a, b)
93 | 
94 |     ggml.ggml_gallocr_reserve(allocr, gf)
95 | 
96 |     gf = build_graph(x, a, b)
97 | 
98 |     ggml.ggml_gallocr_alloc_graph(allocr, gf)
99 | 
100 |     ggml.ggml_backend_tensor_set(
101 |         x,
102 |         ctypes.cast(np.array([2.0], dtype=np.single).ctypes.data, ctypes.c_void_p),
103 |         0,
104 |         ggml.ggml_nbytes(x),
105 |     )
106 | 
107 |     ggml.ggml_backend_graph_compute(backend, gf)
108 | 
109 |     f = ggml.ggml_graph_get_tensor(gf, b"f")
110 | 
111 |     output = np.zeros(1, dtype=np.single)
112 |     ggml.ggml_backend_tensor_get(
113 |         f, ctypes.cast(output.ctypes.data, ctypes.c_void_p), 0, ggml.ggml_nbytes(x)
114 |     )
115 | 
116 |     assert output[0] == 16.0
117 | 
118 |     ggml.ggml_gallocr_free(allocr)
119 |     ggml.ggml_backend_buffer_free(buffer)
120 |     ggml.ggml_backend_free(backend)
121 |     ggml.ggml_free(ctx)
122 | 
--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | import ggml
2 | import ggml.utils
3 | 
4 | import pytest
5 | 
6 | import numpy as np
7 | import numpy.typing as npt
8 | 
9 | 
10 | def test_utils():
11 |     params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024)
12 |     ctx = ggml.ggml_init(params)
13 |     assert ctx is not None
14 |     x = np.ones((3,), dtype=np.float32)
15 |     assert x.shape == (3,)
16 |     t = ggml.utils.from_numpy(x, ctx)
17 |     assert t.contents.ne[:1] == [3]
18 |     assert t.contents.type == ggml.GGML_TYPE_F32
19 |     assert np.allclose(ggml.utils.to_numpy(t), x)
20 |     ggml.ggml_free(ctx)
21 | 
22 | 
23 | def test_numpy_arrays():
24 |     params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024)
25 |     ctx = ggml.ggml_init(params)
26 |     assert ctx is not None
27 |     x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32, order="F")
28 |     assert x.shape == (2, 3)
29 |     t = ggml.utils.from_numpy(x, ctx)
30 |     assert t.contents.ne[:2] == [3, 2]
31 |     y = ggml.utils.to_numpy(t)
32 |     assert y.shape == (2, 3)
33 |     ggml.ggml_free(ctx)
34 | 
35 | 
36 | def test_numpy_arrays_transposed():
37 |     params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024)
38 |     ctx = ggml.ggml_init(params)
39 |     assert ctx is not None
40 |     x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)
41 |     t = ggml.utils.from_numpy(x, ctx)
42 | 
43 |     t_T = ggml.ggml_transpose(ctx, t)
44 | 
45 |     # ggml_transpose currently modifies the original tensor in place, input must be
46 |     # set _after_ the transpose operation
47 |     ggml.utils.to_numpy(t)[:] = x
48 | 
49 |     assert ggml.utils.get_shape(t_T) == (2, 3)
50 |     assert ggml.utils.get_strides(t_T) == (12, 4)
51 | 
52 |     assert np.array_equal(ggml.utils.to_numpy(t_T, shape=x.T.shape), x.T)
53 | 
54 |     ggml.ggml_free(ctx)
55 | 
56 | 
57 | def test_numpy_arrays_transposed_diff_ctx():
58 |     params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024)
59 |     ctx = ggml.ggml_init(params)
60 |     assert ctx is not None
61 |     x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)
62 |     t = ggml.utils.from_numpy(x, ctx)
63 | 
64 |     ggml.utils.to_numpy(t)[:] = x
65 | 
66 |     params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024)
67 |     ctx2 = ggml.ggml_init(params)
68 |     assert ctx2 is not None
69 | 
70 |     t_T = ggml.ggml_transpose(ctx2, t)
71 | 
72 |     assert ggml.utils.get_shape(t_T) == (2, 3)
73 |     assert ggml.utils.get_strides(t_T) == (12, 4)
74 | 
75 |     assert np.array_equal(ggml.utils.to_numpy(t_T, shape=x.T.shape), x.T)
76 | 
77 |     ggml.ggml_free(ctx)
78 |     ggml.ggml_free(ctx2)
79 | 
80 | 
81 | def test_numpy_arrays_permute_transpose():
82 |     params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024)
83 |     ctx = ggml.ggml_init(params)
84 |     assert ctx is not None
85 | 
86 |     x = np.array(
87 |         [[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]], dtype=np.int32
88 |     )
89 |     t = ggml.utils.from_numpy(x, ctx)
90 | 
91 |     t_T = ggml.ggml_permute(ctx, t, 2, 1, 0, 3)
92 | 
93 |     ggml.utils.to_numpy(t)[:] = x
94 | 
95 |     x_T = ggml.utils.to_numpy(t_T)
96 |     assert np.array_equal(x_T, x.T)
97 | 
98 |     ggml.ggml_free(ctx)
99 | 
100 | 
101 | def test_slice_tensor():
102 |     params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024)
103 |     ctx = ggml.ggml_init(params)
104 |     assert ctx is not None
105 |     x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)
106 |     t = ggml.utils.from_numpy(x, ctx)
107 |     t_slice = ggml.utils.slice_tensor(ctx, t, [
108 |         slice(0, 1),
109 |         slice(0, 2)
110 |     ])
111 |     x_slice = x[:2, :1]
112 |     t_slice_array = ggml.utils.to_numpy(t_slice)
113 |     assert np.array_equal(t_slice_array, x_slice)
114 |     ggml.ggml_free(ctx)
115 | 
116 | 
117 | @pytest.mark.parametrize("a, b", [
118 |     [np.array([1], dtype=np.float32), np.array([1], dtype=np.float32)],
119 |     [np.array([1, 1], dtype=np.float32), np.array([1], dtype=np.float32)],
120 |     [np.array([1, 1], dtype=np.float32), np.array([[1, 2]], dtype=np.float32)],
121 | ])
122 | def test_broadcast_tensor(a: npt.NDArray[np.float32], b: npt.NDArray[np.float32]):
123 |     params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024)
124 |     ctx = ggml.ggml_init(params)
125 |     assert ctx is not None
126 |     params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024)
127 |     ctx2 = ggml.ggml_init(params)
128 |     assert ctx2 is not None
129 |     t_a = ggml.utils.from_numpy(a, ctx)
130 |     t_b = ggml.utils.from_numpy(b, ctx)
131 |     t_sum = ggml.ggml_add(ctx2, t_a, t_b)
132 |     gf = ggml.ggml_new_graph(ctx2)
133 |     ggml.ggml_build_forward_expand(gf, t_sum)
134 |     ggml.ggml_graph_compute_with_ctx(ctx2, gf, 1)
135 |     expected = a + b
136 |     result = ggml.utils.to_numpy(t_sum).reshape(expected.shape)
137 |     assert np.array_equal(result, expected)
138 |     ggml.ggml_free(ctx)
139 | 
--------------------------------------------------------------------------------
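For quick reference, the backend tests above all exercise the same computation, f = a * x^2 + b with x = 2, a = 3, b = 4. A minimal CPU-only sketch of that computation, using only calls that already appear in these tests (ggml.utils.from_numpy / to_numpy and ggml_graph_compute_with_ctx), could look like the following. This is an illustrative sketch, not a file from the repository.

import ggml
import ggml.utils
import numpy as np

# single context with data allocation enabled (no backend buffers needed on CPU)
params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024)
ctx = ggml.ggml_init(params)
assert ctx is not None

# inputs: x = 2, a = 3, b = 4
x = ggml.utils.from_numpy(np.array([2.0], dtype=np.float32), ctx)
a = ggml.utils.from_numpy(np.array([3.0], dtype=np.float32), ctx)
b = ggml.utils.from_numpy(np.array([4.0], dtype=np.float32), ctx)

# f = a * x^2 + b
x2 = ggml.ggml_mul(ctx, x, x)
ax2 = ggml.ggml_mul(ctx, a, x2)
f = ggml.ggml_add(ctx, ax2, b)

# build the forward graph and compute it single-threaded
gf = ggml.ggml_new_graph(ctx)
ggml.ggml_build_forward_expand(gf, f)
ggml.ggml_graph_compute_with_ctx(ctx, gf, 1)

print(ggml.utils.to_numpy(f))  # expected: [16.]
ggml.ggml_free(ctx)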