├── .github ├── dependabot.yaml └── workflows │ ├── publish.yaml │ ├── test.yaml │ ├── wheels-cuda.yaml │ ├── wheels-index.yaml │ ├── wheels-metal.yaml │ └── wheels.yaml ├── .gitignore ├── .gitmodules ├── .readthedocs.yaml ├── CMakeLists.txt ├── LICENSE.md ├── Makefile ├── README.md ├── docs ├── api-reference.md └── index.md ├── examples ├── clip │ ├── README.md │ ├── convert-pt-to-ggml.py │ ├── model.py │ ├── requirements.txt │ └── utils.py ├── custom-operators │ └── example_jax.py ├── optimizer │ └── simple.py ├── replit │ ├── README.md │ ├── app.py │ ├── main.py │ └── requirements.txt └── rpc │ ├── main.py │ └── worker.py ├── ggml ├── __init__.py ├── ggml.py ├── py.typed └── utils.py ├── mkdocs.yml ├── pyproject.toml ├── scripts └── releases-to-pep-503.sh └── tests ├── __init__.py ├── test_ggml.py ├── test_ggml_cuda.py ├── test_ggml_metal.py └── test_utils.py /.github/dependabot.yaml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | - package-ecosystem: "github-actions" 13 | directory: "/" 14 | schedule: 15 | interval: "weekly" 16 | -------------------------------------------------------------------------------- /.github/workflows/publish.yaml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | 3 | # Based on: https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ 4 | 5 | on: workflow_dispatch 6 | 7 | jobs: 8 | build-n-publish: 9 | name: Build and publish 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | with: 15 | submodules: "true" 16 | - name: Set up Python 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: "3.8" 20 | - name: Install dependencies 21 | run: | 22 | python3 -m pip install --upgrade pip 23 | python3 -m pip install -e .[publish] 24 | - name: Build source distribution 25 | run: | 26 | python3 -m build --sdist 27 | - name: Publish distribution to PyPI 28 | # TODO: move to tag based releases 29 | # if: startsWith(github.ref, 'refs/tags') 30 | uses: pypa/gh-action-pypi-publish@release/v1 31 | with: 32 | password: ${{ secrets.PYPI_API_TOKEN }} 33 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | push: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | build-linux: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | with: 21 | submodules: "true" 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v5 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | - name: Install dependencies 27 | run: | 28 | python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools 29 | 
python3 -m pip install \ 30 | --verbose \ 31 | --config-settings cmake.args='-DCMAKE_BUILD_TYPE=Debug;-DCMAKE_CXX_FLAGS=-g3;-DCMAKE_C_FLAGS=-g3' \ 32 | --config-settings cmake.verbose=true \ 33 | --config-settings logging.level=INFO \ 34 | --config-settings install.strip=false \ 35 | --editable . 36 | - name: Test with pytest 37 | run: | 38 | python -m pytest -s -vvvv 39 | 40 | build-windows: 41 | runs-on: windows-latest 42 | strategy: 43 | matrix: 44 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 45 | 46 | steps: 47 | - uses: actions/checkout@v4 48 | with: 49 | submodules: "true" 50 | - name: Set up Python ${{ matrix.python-version }} 51 | uses: actions/setup-python@v5 52 | with: 53 | python-version: ${{ matrix.python-version }} 54 | 55 | - name: Install dependencies 56 | run: | 57 | python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools 58 | python3 -m pip install --verbose --config-settings cmake.args='-DCMAKE_BUILD_TYPE=Debug;-DCMAKE_CXX_FLAGS=-g3;-DCMAKE_C_FLAGS=-g3' --config-settings cmake.verbose=true --config-settings logging.level=INFO --config-settings install.strip=false --editable . 59 | - name: Test with pytest 60 | run: | 61 | python -m pytest -s -vvvv 62 | 63 | build-macos: 64 | runs-on: macos-13 65 | strategy: 66 | matrix: 67 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 68 | 69 | steps: 70 | - uses: actions/checkout@v4 71 | with: 72 | submodules: "true" 73 | - name: Set up Python ${{ matrix.python-version }} 74 | uses: actions/setup-python@v5 75 | with: 76 | python-version: ${{ matrix.python-version }} 77 | - name: Install dependencies 78 | run: | 79 | python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools 80 | python3 -m pip install \ 81 | --verbose \ 82 | --config-settings cmake.args='-DCMAKE_BUILD_TYPE=Debug;-DCMAKE_CXX_FLAGS=-g3;-DCMAKE_C_FLAGS=-g3' \ 83 | --config-settings cmake.verbose=true \ 84 | --config-settings logging.level=INFO \ 85 | --config-settings install.strip=false \ 86 | --editable . 87 | - name: Test with pytest 88 | run: | 89 | python -m pytest -s -vvvv 90 | 91 | build-macos-metal: 92 | runs-on: macos-13 93 | 94 | steps: 95 | - uses: actions/checkout@v4 96 | with: 97 | submodules: "true" 98 | - name: Set up Python 99 | uses: actions/setup-python@v5 100 | with: 101 | python-version: "3.8" 102 | - name: Install dependencies 103 | run: | 104 | python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools 105 | python3 -m pip install \ 106 | --verbose \ 107 | --config-settings cmake.args='-DCMAKE_BUILD_TYPE=Debug;-DCMAKE_CXX_FLAGS=-g3;-DCMAKE_C_FLAGS=-g3;-DGGML_METAL=On' \ 108 | --config-settings cmake.verbose=true \ 109 | --config-settings logging.level=INFO \ 110 | --config-settings install.strip=false \ 111 | --editable . 
112 | - name: Test with pytest 113 | run: | 114 | python -m pytest -s -vvvv 115 | -------------------------------------------------------------------------------- /.github/workflows/wheels-cuda.yaml: -------------------------------------------------------------------------------- 1 | name: Wheels CUDA 2 | 3 | on: workflow_dispatch 4 | 5 | permissions: 6 | contents: write 7 | 8 | jobs: 9 | define_matrix: 10 | name: Define Build Matrix 11 | runs-on: ubuntu-latest 12 | outputs: 13 | matrix: ${{ steps.set-matrix.outputs.matrix }} 14 | defaults: 15 | run: 16 | shell: pwsh 17 | 18 | steps: 19 | - name: Define Job Output 20 | id: set-matrix 21 | run: | 22 | $matrix = @{ 23 | 'os' = @('ubuntu-20.04', 'windows-latest') 24 | 'pyver' = @("3.10", "3.11", "3.12") 25 | 'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1") 26 | 'releasetag' = @("basic") 27 | } 28 | 29 | $matrixOut = ConvertTo-Json $matrix -Compress 30 | Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT 31 | 32 | build_wheels: 33 | name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }} 34 | needs: define_matrix 35 | runs-on: ${{ matrix.os }} 36 | strategy: 37 | matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }} 38 | defaults: 39 | run: 40 | shell: pwsh 41 | env: 42 | CUDAVER: ${{ matrix.cuda }} 43 | AVXVER: ${{ matrix.releasetag }} 44 | 45 | steps: 46 | - uses: actions/checkout@v4 47 | with: 48 | submodules: "recursive" 49 | 50 | - uses: actions/setup-python@v5 51 | with: 52 | python-version: ${{ matrix.pyver }} 53 | 54 | - name: Setup Mamba 55 | uses: conda-incubator/setup-miniconda@v3.0.4 56 | with: 57 | activate-environment: "build" 58 | python-version: ${{ matrix.pyver }} 59 | miniforge-variant: Mambaforge 60 | miniforge-version: latest 61 | use-mamba: true 62 | add-pip-as-python-dependency: true 63 | auto-activate-base: false 64 | 65 | - name: VS Integration Cache 66 | id: vs-integration-cache 67 | if: runner.os == 'Windows' 68 | uses: actions/cache@v4.0.2 69 | with: 70 | path: ./MSBuildExtensions 71 | key: cuda-${{ matrix.cuda }}-vs-integration 72 | 73 | - name: Get Visual Studio Integration 74 | if: runner.os == 'Windows' && steps.vs-integration-cache.outputs.cache-hit != 'true' 75 | run: | 76 | if ($env:CUDAVER -eq '12.1.1') {$x = '12.1.0'} else {$x = $env:CUDAVER} 77 | $links = (Invoke-RestMethod 'https://raw.githubusercontent.com/Jimver/cuda-toolkit/master/src/links/windows-links.ts').Trim().split().where({$_ -ne ''}) 78 | for ($i=$q=0;$i -lt $links.count -and $q -lt 2;$i++) {if ($links[$i] -eq "'$x',") {$q++}} 79 | Invoke-RestMethod $links[$i].Trim("'") -OutFile 'cudainstaller.zip' 80 | & 'C:\Program Files\7-Zip\7z.exe' e cudainstaller.zip -oMSBuildExtensions -r *\MSBuildExtensions\* > $null 81 | Remove-Item 'cudainstaller.zip' 82 | 83 | - name: Install Visual Studio Integration 84 | if: runner.os == 'Windows' 85 | run: | 86 | $y = (gi '.\MSBuildExtensions').fullname + '\*' 87 | (gi 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Microsoft\VC\*\BuildCustomizations').fullname.foreach({cp $y $_}) 88 | $cupath = 'CUDA_PATH_V' + $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','_') 89 | echo "$cupath=$env:CONDA_PREFIX" >> $env:GITHUB_ENV 90 | 91 | - name: Install Dependencies 92 | env: 93 | MAMBA_DOWNLOAD_FAILFAST: "0" 94 | MAMBA_NO_LOW_SPEED_LIMIT: "1" 95 | run: | 96 | $cudaVersion = $env:CUDAVER 97 | mamba install -y 'cuda' -c nvidia/label/cuda-$cudaVersion 98 | python -m pip install build wheel 
99 | 100 | - name: Build Wheel 101 | run: | 102 | $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') 103 | $env:CUDA_PATH = $env:CONDA_PREFIX 104 | $env:CUDA_HOME = $env:CONDA_PREFIX 105 | $env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX 106 | if ($IsLinux) { 107 | $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH 108 | } 109 | $env:VERBOSE = '1' 110 | $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all' 111 | $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON $env:CMAKE_ARGS" 112 | if ($env:AVXVER -eq 'AVX') { 113 | $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' 114 | } 115 | if ($env:AVXVER -eq 'AVX512') { 116 | $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX512=on' 117 | } 118 | if ($env:AVXVER -eq 'basic') { 119 | $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' 120 | } 121 | $buildtag = "-cu$cudaVersion" 122 | python -m build --wheel 123 | Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV 124 | 125 | - uses: softprops/action-gh-release@v2 126 | with: 127 | files: dist/* 128 | tag_name: ${{ github.ref_name }}-cu${{ env.CUDA_VERSION }} 129 | env: 130 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 131 | -------------------------------------------------------------------------------- /.github/workflows/wheels-index.yaml: -------------------------------------------------------------------------------- 1 | name: Wheels Index 2 | 3 | on: 4 | # Trigger on any new release 5 | release: 6 | types: [published] 7 | 8 | # Allows you to run this workflow manually from the Actions tab 9 | workflow_dispatch: 10 | 11 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 12 | permissions: 13 | contents: read 14 | pages: write 15 | id-token: write 16 | 17 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 18 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 
19 | concurrency: 20 | group: "pages" 21 | cancel-in-progress: false 22 | 23 | jobs: 24 | # Single deploy job since we're just deploying 25 | deploy: 26 | environment: 27 | name: github-pages 28 | url: ${{ steps.deployment.outputs.page_url }} 29 | runs-on: ubuntu-latest 30 | steps: 31 | - name: Checkout 32 | uses: actions/checkout@v4 33 | - name: Setup Pages 34 | uses: actions/configure-pages@v5 35 | - name: Build 36 | run: | 37 | ./scripts/releases-to-pep-503.sh index/whl/cpu '^[v]?[0-9]+\.[0-9]+\.[0-9]+$' 38 | ./scripts/releases-to-pep-503.sh index/whl/cu121 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu121$' 39 | ./scripts/releases-to-pep-503.sh index/whl/cu122 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu122$' 40 | ./scripts/releases-to-pep-503.sh index/whl/cu123 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu123$' 41 | ./scripts/releases-to-pep-503.sh index/whl/cu124 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$' 42 | ./scripts/releases-to-pep-503.sh index/whl/metal '^[v]?[0-9]+\.[0-9]+\.[0-9]+-metal$' 43 | - name: Upload artifact 44 | uses: actions/upload-pages-artifact@v3 45 | with: 46 | # Upload entire repository 47 | path: "index" 48 | - name: Deploy to GitHub Pages 49 | id: deployment 50 | uses: actions/deploy-pages@v4 51 | -------------------------------------------------------------------------------- /.github/workflows/wheels-metal.yaml: -------------------------------------------------------------------------------- 1 | name: Wheels Metal 2 | 3 | on: workflow_dispatch 4 | 5 | permissions: 6 | contents: write 7 | 8 | jobs: 9 | define_matrix: 10 | name: Define Build Matrix 11 | runs-on: ubuntu-latest 12 | outputs: 13 | matrix: ${{ steps.set-matrix.outputs.matrix }} 14 | defaults: 15 | run: 16 | shell: pwsh 17 | 18 | steps: 19 | - name: Define Job Output 20 | id: set-matrix 21 | run: | 22 | $matrix = @{ 23 | 'os' = @('macos-11', 'macos-12', 'macos-13') 24 | 'pyver' = @('3.10', '3.11', '3.12') 25 | } 26 | 27 | $matrixOut = ConvertTo-Json $matrix -Compress 28 | Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT 29 | 30 | build_wheels: 31 | name: ${{ matrix.os }} Python ${{ matrix.pyver }} 32 | needs: define_matrix 33 | runs-on: ${{ matrix.os }} 34 | strategy: 35 | matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }} 36 | env: 37 | OSVER: ${{ matrix.os }} 38 | 39 | steps: 40 | - uses: actions/checkout@v4 41 | with: 42 | submodules: "recursive" 43 | 44 | - uses: actions/setup-python@v5 45 | with: 46 | python-version: ${{ matrix.pyver }} 47 | 48 | - name: Install Dependencies 49 | run: | 50 | python -m pip install build wheel cmake 51 | 52 | - name: Build Wheel 53 | run: | 54 | XCODE15PATH="/Applications/Xcode_15.0.app/Contents/Developer" 55 | XCODE15BINPATH="${XCODE15PATH}/Toolchains/XcodeDefault.xctoolchain/usr/bin" 56 | export CMAKE_ARGS="-DGGML_NATIVE=off -DGGML_METAL=on" 57 | [[ "$OSVER" == "macos-13" ]] && export CC="${XCODE15BINPATH}/cc" && export CXX="${XCODE15BINPATH}/c++" && export MACOSX_DEPLOYMENT_TARGET="13.0" 58 | [[ "$OSVER" == "macos-12" ]] && export MACOSX_DEPLOYMENT_TARGET="12.0" 59 | [[ "$OSVER" == "macos-11" ]] && export MACOSX_DEPLOYMENT_TARGET="11.0" 60 | 61 | export CMAKE_OSX_ARCHITECTURES="arm64" && export ARCHFLAGS="-arch arm64" 62 | VERBOSE=1 python -m build --wheel 63 | 64 | if [[ "$OSVER" == "macos-13" ]]; then 65 | export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk" 66 | export MACOSX_DEPLOYMENT_TARGET="14.0" 67 | VERBOSE=1 python -m build --wheel 68 | fi 69 | 70 | for file in ./dist/*.whl; do cp "$file" "${file/arm64.whl/aarch64.whl}"; done 71 | 72 | export 
CMAKE_OSX_ARCHITECTURES="x86_64" && export CMAKE_ARGS="-DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_METAL=on" && export ARCHFLAGS="-arch x86_64" 73 | VERBOSE=1 python -m build --wheel 74 | 75 | if [[ "$OSVER" == "macos-13" ]]; then 76 | export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk" 77 | export MACOSX_DEPLOYMENT_TARGET="14.0" 78 | VERBOSE=1 python -m build --wheel 79 | fi 80 | 81 | - uses: softprops/action-gh-release@v2 82 | with: 83 | files: dist/* 84 | # set release name to -metal 85 | tag_name: ${{ github.ref_name }}-metal 86 | env: 87 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 88 | -------------------------------------------------------------------------------- /.github/workflows/wheels.yaml: -------------------------------------------------------------------------------- 1 | name: Wheels 2 | 3 | on: workflow_dispatch 4 | 5 | permissions: 6 | contents: write 7 | 8 | jobs: 9 | build_wheels: 10 | name: Build wheels on ${{ matrix.os }} 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: [ubuntu-20.04, macos-11, windows-2022] 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | with: 19 | submodules: "recursive" 20 | 21 | - name: Build wheels 22 | uses: pypa/cibuildwheel@v2.18.1 23 | env: 24 | # disable repair 25 | CIBW_REPAIR_WHEEL_COMMAND: "" 26 | # skip building wheels for these platforms 27 | CIBW_SKIP: pp* cp36-* cp37-* *-musllinux* 28 | CMAKE_ARGS: -DGGML_METAL=OFF 29 | with: 30 | package-dir: . 31 | output-dir: wheelhouse 32 | 33 | - uses: actions/upload-artifact@v4 34 | with: 35 | name: wheels-${{ matrix.os }} 36 | path: ./wheelhouse/*.whl 37 | 38 | build_wheels_arm64: 39 | name: Build arm64 wheels 40 | runs-on: ubuntu-latest 41 | steps: 42 | - uses: actions/checkout@v4 43 | with: 44 | submodules: "recursive" 45 | 46 | - name: Set up QEMU 47 | uses: docker/setup-qemu-action@v3 48 | with: 49 | platforms: linux/arm64 50 | 51 | - name: Build wheels 52 | uses: pypa/cibuildwheel@v2.18.1 53 | env: 54 | CIBW_SKIP: "*musllinux* pp*" 55 | CIBW_REPAIR_WHEEL_COMMAND: "" 56 | CIBW_ARCHS: "aarch64" 57 | CIBW_BUILD: "cp38-* cp39-* cp310-* cp311-* cp312-*" 58 | with: 59 | output-dir: wheelhouse 60 | 61 | - name: Upload wheels as artifacts 62 | uses: actions/upload-artifact@v4 63 | with: 64 | name: wheels_arm64 65 | path: ./wheelhouse/*.whl 66 | 67 | release: 68 | name: Release 69 | needs: [build_wheels] 70 | runs-on: ubuntu-latest 71 | 72 | steps: 73 | - uses: actions/download-artifact@v4 74 | with: 75 | merge-multiple: true 76 | path: dist 77 | 78 | - uses: softprops/action-gh-release@v2 79 | with: 80 | files: dist/* 81 | env: 82 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 83 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .local/ 2 | 3 | .vscode/ 4 | 5 | _skbuild/ 6 | 7 | .envrc 8 | 9 | models/ 10 | 11 | MANIFEST.in 12 | 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | *$py.class 17 | 18 | # C extensions 19 | *.so 20 | *.dll 21 | *.dylib 22 | 23 | # Distribution / packaging 24 | .Python 25 | build/ 26 | develop-eggs/ 27 | dist/ 28 | downloads/ 29 | eggs/ 30 | .eggs/ 31 | lib/ 32 | lib64/ 33 | parts/ 34 | sdist/ 35 | var/ 36 | wheels/ 37 | share/python-wheels/ 38 | *.egg-info/ 39 | .installed.cfg 40 | *.egg 41 | MANIFEST 42 | 43 | # PyInstaller 44 | # Usually these files are written by a python script from a template 45 | # before 
PyInstaller builds the exe, so as to inject date/other infos into it. 46 | *.manifest 47 | *.spec 48 | 49 | # Installer logs 50 | pip-log.txt 51 | pip-delete-this-directory.txt 52 | 53 | # Unit test / coverage reports 54 | htmlcov/ 55 | .tox/ 56 | .nox/ 57 | .coverage 58 | .coverage.* 59 | .cache 60 | nosetests.xml 61 | coverage.xml 62 | *.cover 63 | *.py,cover 64 | .hypothesis/ 65 | .pytest_cache/ 66 | cover/ 67 | 68 | # Translations 69 | *.mo 70 | *.pot 71 | 72 | # Django stuff: 73 | *.log 74 | local_settings.py 75 | db.sqlite3 76 | db.sqlite3-journal 77 | 78 | # Flask stuff: 79 | instance/ 80 | .webassets-cache 81 | 82 | # Scrapy stuff: 83 | .scrapy 84 | 85 | # Sphinx documentation 86 | docs/_build/ 87 | 88 | # PyBuilder 89 | .pybuilder/ 90 | target/ 91 | 92 | # Jupyter Notebook 93 | .ipynb_checkpoints 94 | 95 | # IPython 96 | profile_default/ 97 | ipython_config.py 98 | 99 | # pyenv 100 | # For a library or package, you might want to ignore these files since the code is 101 | # intended to run in multiple environments; otherwise, check them in: 102 | # .python-version 103 | 104 | # pipenv 105 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 106 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 107 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 108 | # install all needed dependencies. 109 | #Pipfile.lock 110 | 111 | # poetry 112 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 113 | # This is especially recommended for binary packages to ensure reproducibility, and is more 114 | # commonly ignored for libraries. 115 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 116 | #poetry.lock 117 | 118 | # pdm 119 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 120 | #pdm.lock 121 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 122 | # in version control. 123 | # https://pdm.fming.dev/#use-with-ide 124 | .pdm.toml 125 | 126 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 127 | __pypackages__/ 128 | 129 | # Celery stuff 130 | celerybeat-schedule 131 | celerybeat.pid 132 | 133 | # SageMath parsed files 134 | *.sage.py 135 | 136 | # Environments 137 | .env 138 | .venv 139 | env/ 140 | venv/ 141 | ENV/ 142 | env.bak/ 143 | venv.bak/ 144 | 145 | # Spyder project settings 146 | .spyderproject 147 | .spyproject 148 | 149 | # Rope project settings 150 | .ropeproject 151 | 152 | # mkdocs documentation 153 | /site 154 | 155 | # mypy 156 | .mypy_cache/ 157 | .dmypy.json 158 | dmypy.json 159 | 160 | # Pyre type checker 161 | .pyre/ 162 | 163 | # pytype static type analyzer 164 | .pytype/ 165 | 166 | # Cython debug symbols 167 | cython_debug/ 168 | 169 | # PyCharm 170 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 171 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 172 | # and can be added to the global gitignore or merged into this file. For a more nuclear 173 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
174 | .idea/ 175 | 176 | .direnv/ -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "vendor/ggml"] 2 | path = vendor/ggml 3 | url = https://github.com/ggerganov/ggml 4 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file for MkDocs projects 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | # Set the version of Python and other tools you might need 8 | build: 9 | os: ubuntu-22.04 10 | tools: 11 | python: "3.11" 12 | 13 | mkdocs: 14 | configuration: mkdocs.yml 15 | 16 | python: 17 | install: 18 | - method: pip 19 | path: . 20 | extra_requirements: 21 | - docs 22 | 23 | submodules: 24 | include: all 25 | recursive: true -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.21) 2 | 3 | project( 4 | ${SKBUILD_PROJECT_NAME} 5 | VERSION ${SKBUILD_PROJECT_VERSION} 6 | ) 7 | 8 | message(SKBUILD_STATE="${SKBUILD_STATE}") 9 | 10 | if(SKBUILD_STATE STREQUAL "editable") 11 | # Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374 12 | set(GGML_PYTHON_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/ggml/lib) 13 | else() 14 | set(GGML_PYTHON_INSTALL_DIR ${SKBUILD_PLATLIB_DIR}/ggml/lib) 15 | endif() 16 | 17 | set(BUILD_SHARED_LIBS "On") 18 | set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) 19 | if (APPLE) 20 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm64") 21 | set(GGML_AVX "Off" CACHE BOOL "ggml: enable AVX" FORCE) 22 | set(GGML_AVX2 "Off" CACHE BOOL "ggml: enable AVX2" FORCE) 23 | set(GGML_FMA "Off" CACHE BOOL "ggml: enable FMA" FORCE) 24 | set(GGML_F16C "Off" CACHE BOOL "ggml: enable F16C" FORCE) 25 | endif() 26 | 27 | set(GGML_METAL_EMBED_LIBRARY "On" CACHE BOOL "ggml: embed metal library" FORCE) 28 | endif() 29 | add_subdirectory(vendor/ggml) 30 | install( 31 | TARGETS ggml 32 | ARCHIVE DESTINATION ${GGML_PYTHON_INSTALL_DIR} 33 | LIBRARY DESTINATION ${GGML_PYTHON_INSTALL_DIR} 34 | RUNTIME DESTINATION ${GGML_PYTHON_INSTALL_DIR} 35 | FRAMEWORK DESTINATION ${GGML_PYTHON_INSTALL_DIR} 36 | RESOURCE DESTINATION ${GGML_PYTHON_INSTALL_DIR} 37 | ) 38 | install( 39 | FILES $ 40 | DESTINATION ${GGML_PYTHON_INSTALL_DIR} 41 | ) 42 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Andrei Betlen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | submodules = vendor/ggml 2 | 3 | all: build 4 | 5 | ${submodules}: 6 | git submodule update --init --recursive 7 | 8 | update-pip: 9 | python3 -m pip install --upgrade pip 10 | 11 | build: ${submodules} update-pip ## Build ggml-python with cpu support 12 | python3 -m pip install --verbose --editable . 13 | 14 | build.debug: ${submodules} update-pip ## Build ggml-python with cpu support, debug symbols, and lines 15 | python3 -m pip install \ 16 | --verbose \ 17 | --config-settings cmake.args='-DCMAKE_BUILD_TYPE=Debug;-DCMAKE_CXX_FLAGS=-g3;-DCMAKE_C_FLAGS=-g3' \ 18 | --config-settings cmake.verbose=true \ 19 | --config-settings logging.level=INFO \ 20 | --config-settings install.strip=false \ 21 | --editable . 22 | 23 | build.openblas: ${submodules} update-pip ## Build ggml-python with openblas support 24 | python3 -m pip install \ 25 | --verbose \ 26 | --config-settings cmake.args='-DGGML_OPENBLAS=On' \ 27 | --editable . 28 | 29 | build.cuda: ${submodules} update-pip ## Build ggml-python with cublas / cuda support 30 | python3 -m pip install \ 31 | --verbose \ 32 | --config-settings cmake.args='-DGGML_CUDA=On' \ 33 | --editable . 34 | 35 | build.clblast: ${submodules} update-pip ## Build ggml-python with clblast / opencl support 36 | python3 -m pip install \ 37 | --verbose \ 38 | --config-settings cmake.args='-DGGML_CLBLAST=On' \ 39 | --editable . 
40 | 41 | sdist: ## Build source distribution 42 | python3 -m build --sdist 43 | 44 | deploy: ## Deploy to pypi 45 | twine upload dist/* 46 | 47 | test: ## Run tests 48 | python3 -m pytest 49 | 50 | test.gdb: ## Run tests with gdb 51 | gdb -ex "set pagination off" -ex r -ex "bt 5" --args python -m pytest -s -vvvv 52 | 53 | docs: ## Build documentation using mkdocs and serve it 54 | mkdocs serve 55 | 56 | clean: ## Clean build artifacts 57 | - rm -rf build 58 | - rm -rf dist 59 | - rm ggml/*.so 60 | - rm ggml/*.dll 61 | - rm ggml/*.dylib 62 | - rm ${submodules}/*.so 63 | - rm ${submodules}/*.dll 64 | - rm ${submodules}/*.dylib 65 | - cd ${submodules} && make clean 66 | 67 | help: ## Prints help menu 68 | @grep -E '^[\.a-zA-Z_-]+:.*?## .*$$' Makefile | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 69 | 70 | .PHONY: \ 71 | all \ 72 | build \ 73 | build.debug \ 74 | build.openblas \ 75 | build.cuda \ 76 | build.clblast \ 77 | sdist \ 78 | deploy \ 79 | test \ 80 | test.gdb \ 81 | docs \ 82 | clean \ 83 | help -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python bindings for [`ggml`](https://github.com/ggerganov/ggml) 2 | 3 | [![Documentation Status](https://readthedocs.org/projects/ggml-python/badge/?version=latest)](https://ggml-python.readthedocs.io/en/latest/?badge=latest) 4 | [![Tests](https://github.com/abetlen/ggml-python/actions/workflows/test.yaml/badge.svg)](https://github.com/abetlen/ggml-python/actions/workflows/test.yaml) 5 | [![PyPI](https://img.shields.io/pypi/v/ggml-python)](https://pypi.org/project/ggml-python/) 6 | [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/ggml-python)](https://pypi.org/project/ggml-python/) 7 | [![PyPI - License](https://img.shields.io/pypi/l/ggml-python)](https://pypi.org/project/ggml-python/) 8 | [![PyPI - Downloads](https://img.shields.io/pypi/dm/ggml-python)](https://pypi.org/project/ggml-python/) 9 | 10 | 11 | Python bindings for the [`ggml`](https://github.com/ggerganov/ggml) tensor library for machine learning. 12 | 13 | > ⚠️ Neither this project nor `ggml` currently guarantee backwards-compatibility, if you are using this library in other applications I strongly recommend pinning to specific releases in your `requirements.txt` file. 14 | 15 | # Documentation 16 | 17 | - [Getting Started](https://ggml-python.readthedocs.io/en/latest/) 18 | - [API Reference](https://ggml-python.readthedocs.io/en/latest/api-reference/) 19 | - [Examples](https://github.com/abetlen/ggml-python/tree/main/examples) 20 | 21 | # Installation 22 | 23 | 24 | Requirements 25 | - Python 3.8+ 26 | - C compiler (gcc, clang, msvc, etc) 27 | 28 | You can install `ggml-python` using `pip`: 29 | 30 | ```bash 31 | pip install ggml-python 32 | ``` 33 | 34 | This will compile ggml using cmake which requires a c compiler installed on your system. 35 | To build ggml with specific features (ie. OpenBLAS, GPU Support, etc) you can pass specific cmake options through the `cmake.args` pip install configuration setting. 
For example to install ggml-python with cuBLAS support you can run: 36 | 37 | ```bash 38 | pip install --upgrade pip 39 | pip install ggml-python --config-settings=cmake.args='-DGGML_CUDA=ON' 40 | ``` 41 | 42 | ## Options 43 | 44 | | Option | Description | Default | 45 | | --- | --- | --- | 46 | | `GGML_CUDA` | Enable cuBLAS support | `OFF` | 47 | | `GGML_CLBLAST` | Enable CLBlast support | `OFF` | 48 | | `GGML_OPENBLAS` | Enable OpenBLAS support | `OFF` | 49 | | `GGML_METAL` | Enable Metal support | `OFF` | 50 | | `GGML_RPC` | Enable RPC support | `OFF` | 51 | 52 | # Usage 53 | 54 | ```python 55 | import ggml 56 | import ctypes 57 | 58 | # Allocate a new context with 16 MB of memory 59 | params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024, mem_buffer=None) 60 | ctx = ggml.ggml_init(params) 61 | 62 | # Instantiate tensors 63 | x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 64 | a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 65 | b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 66 | 67 | # Use ggml operations to build a computational graph 68 | x2 = ggml.ggml_mul(ctx, x, x) 69 | f = ggml.ggml_add(ctx, ggml.ggml_mul(ctx, a, x2), b) 70 | 71 | gf = ggml.ggml_new_graph(ctx) 72 | ggml.ggml_build_forward_expand(gf, f) 73 | 74 | # Set the input values 75 | ggml.ggml_set_f32(x, 2.0) 76 | ggml.ggml_set_f32(a, 3.0) 77 | ggml.ggml_set_f32(b, 4.0) 78 | 79 | # Compute the graph 80 | ggml.ggml_graph_compute_with_ctx(ctx, gf, 1) 81 | 82 | # Get the output value 83 | output = ggml.ggml_get_f32_1d(f, 0) 84 | assert output == 16.0 85 | 86 | # Free the context 87 | ggml.ggml_free(ctx) 88 | ``` 89 | 90 | # Troubleshooting 91 | 92 | If you are having trouble installing `ggml-python` or activating specific features please try to install it with the `--verbose` and `--no-cache-dir` flags to get more information about any issues: 93 | 94 | ```bash 95 | pip install ggml-python --verbose --no-cache-dir --force-reinstall --upgrade 96 | ``` 97 | 98 | # License 99 | 100 | This project is licensed under the terms of the MIT license. 101 | -------------------------------------------------------------------------------- /docs/api-reference.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: API Reference 3 | --- 4 | 5 | ::: ggml.ggml 6 | options: 7 | show_root_full_path: false 8 | filters: 9 | - "^ggml_" 10 | - "^gguf_" 11 | - "^GGML_" 12 | - "^GGUF_" 13 | 14 | ::: ggml.utils -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Getting Started 3 | --- 4 | 5 | ## Introduction 6 | 7 | ggml-python is a python library for working with [ggml](https://github.com/ggerganov/ggml). 8 | 9 | ggml is a tensor library for machine learning developed by Georgi Gerganov, the library has been used to run models like Whisper and LLaMa on a wide range of devices. 10 | ggml is written in C/C++ and is designed to be fast, portable and easily embeddable; making use of various hardware acceleration systems like BLAS, CUDA, OpenCL, and Metal. 11 | ggml supports quantized inference for reduced memory footprint and faster inference. 12 | 13 | You can use ggml-python to: 14 | 15 | - Convert and quantize model weights from Python-based ML frameworks (Pytorch, Tensorflow, etc) to ggml. 16 | - Port existing ML models to ggml and run them from Python. 
17 | 18 | ## Installation 19 | 20 | Requirements 21 | 22 | - Python 3.7+ 23 | - C compiler (gcc, clang, msvc, etc) 24 | 25 | You can install `ggml-python` using `pip`: 26 | 27 | ```bash 28 | pip install ggml-python 29 | ``` 30 | 31 | This will compile ggml using cmake which requires a c compiler installed on your system. 32 | 33 | Below are the available options for building ggml-python with additional options for optimized inference. 34 | 35 | === "**BLAS**" 36 | 37 | ```bash 38 | CMAKE_ARGS="-DGGML_OPENBLAS=ON" pip install ggml-python 39 | ``` 40 | 41 | === "**CUDA**" 42 | 43 | ```bash 44 | CMAKE_ARGS="-DGGML_CUBLAS=ON" pip install ggml-python 45 | ``` 46 | 47 | === "**Metal**" 48 | 49 | ```bash 50 | CMAKE_ARGS="-DGGML_METAL=ON" pip install ggml-python 51 | ``` 52 | 53 | === "**OpenCL**" 54 | 55 | ```bash 56 | CMAKE_ARGS="-DGGML_CLBLAST=ON" pip install ggml-python 57 | ``` 58 | 59 | ## Basic Example 60 | 61 | Below is a simple example of using ggml-python low level api to compute the value of a function. 62 | 63 | ```python 64 | import ggml 65 | import ctypes 66 | 67 | # Allocate a new context with 16 MB of memory 68 | params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024, mem_buffer=None) 69 | ctx = ggml.ggml_init(params) 70 | 71 | # Instantiate tensors 72 | x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 73 | a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 74 | b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 75 | 76 | # Use ggml operations to build a computational graph 77 | x2 = ggml.ggml_mul(ctx, x, x) 78 | f = ggml.ggml_add(ctx, ggml.ggml_mul(ctx, a, x2), b) 79 | 80 | gf = ggml.ggml_new_graph(ctx) 81 | ggml.ggml_build_forward_expand(gf, f) 82 | 83 | # Set the input values 84 | ggml.ggml_set_f32(x, 2.0) 85 | ggml.ggml_set_f32(a, 3.0) 86 | ggml.ggml_set_f32(b, 4.0) 87 | 88 | # Compute the graph 89 | ggml.ggml_graph_compute_with_ctx(ctx, gf, 1) 90 | 91 | # Get the output value 92 | output = ggml.ggml_get_f32_1d(f, 0) 93 | assert output == 16.0 94 | 95 | # Free the context 96 | ggml.ggml_free(ctx) 97 | ``` 98 | 99 | ## Next Steps 100 | 101 | To learn more about ggml-python, check out the following resources: 102 | 103 | - [API Reference](api-reference.md) 104 | - Examples 105 | - [Code Completion Server](https://github.com/abetlen/ggml-python/tree/main/examples/replit) - A code completion server using ggml-python and the replit-code-v1-3b model that you can drop into your editor as a local Github Copilot replacement. 106 | - [CLIP Embeddings](https://github.com/abetlen/ggml-python/tree/main/examples/clip) - A simple example of using ggml-python to implement CLIP text / image embeddings. 107 | 108 | ## Development 109 | 110 | ```bash 111 | git clone https://github.com/abetlen/ggml-python.git 112 | cd ggml-python 113 | # (Optional) Create a virtual environment 114 | python -m venv venv 115 | source venv/bin/activate 116 | # Install dependencies 117 | make build 118 | ``` 119 | 120 | ## Contributing 121 | 122 | If you would like to contribute to ggml-python, please open an issue or submit a pull request on [GitHub](https://github.com/abetlen/ggml-python). 123 | 124 | 125 | ## License 126 | 127 | This project is licensed under the terms of the MIT license. -------------------------------------------------------------------------------- /examples/clip/README.md: -------------------------------------------------------------------------------- 1 | # CLIP Example 2 | 3 | # Setup 4 | 5 | Create a virtual environment and install requirements. 
6 | 7 | ```bash 8 | python3 -m venv venv 9 | source venv/bin/activate 10 | pip install -r requirements.txt 11 | ``` 12 | 13 | Convert the original CLIP model to GGML format. 14 | 15 | ```bash 16 | python convert-pt-to-ggml.py ViT-B/32 ./models 17 | ``` 18 | 19 | The other CLIP vision transformers should work, but have not been tested. Namely: 20 | 21 | - ViT-B/16 22 | - ViT-L/14 23 | - ViT-L/14@336px 24 | 25 | # Usage 26 | 27 | ```python 28 | # This implements the same example as the original project: https://github.com/openai/CLIP#usage 29 | from model import ClipModel 30 | from scipy.special import softmax 31 | from PIL import Image 32 | from utils import tokenize, transform 33 | 34 | 35 | preprocess = transform(224) 36 | # Example image: https://github.com/openai/CLIP/blob/main/CLIP.png 37 | image = preprocess(Image.open("CLIP.png")).unsqueeze(0) 38 | text = tokenize(["a diagram", "a dog", "a cat"]) 39 | 40 | # Initialize Model 41 | model_file = "models/ViT-B-32.ggml" 42 | model = ClipModel.init_from_file(model_file, n_threads=1) 43 | 44 | # Features are computed one at a time, batching not supported yet 45 | text_features = model.encode_text(text) 46 | 47 | # Only single image supported in ggml right now 48 | image_features = model.encode_image(image) 49 | 50 | logits_per_image, logits_per_text = model(image, text) 51 | 52 | probs = softmax(logits_per_image) 53 | 54 | print("Label probs:", probs) # prints: [[0.9927937 0.00421068 0.00299572]] 55 | ``` 56 | -------------------------------------------------------------------------------- /examples/clip/convert-pt-to-ggml.py: -------------------------------------------------------------------------------- 1 | # Convert CLIP model from PyTorch to ggml format 2 | # 3 | # Usage: python convert-pt-to-ggml.py ViT-B-32 ./models 4 | # 5 | # This script loads the specified model and clip assets and saves them in ggml format. 
6 | # The output is a single binary file containing the following information: 7 | # 8 | # - hparams 9 | # - tokenizer vocab 10 | # - model variables 11 | # 12 | # For each variable, write the following: 13 | # 14 | # - Number of dimensions (int) 15 | # - Name length (int) 16 | # - Dimensions (int[n_dims]) 17 | # - Name (char[name_length]) 18 | # - Data (float[n_dims]) 19 | # 20 | 21 | import os 22 | import sys 23 | import struct 24 | import gzip 25 | import numpy as np 26 | import clip 27 | 28 | if len(sys.argv) < 3: 29 | print("Usage: convert-pt-to-ggml.py clip_model dir-output\n") 30 | sys.exit(1) 31 | 32 | clip_model = sys.argv[1] 33 | dir_out = sys.argv[2] 34 | 35 | # CLIP repo needs to exist at the root directory 36 | MODELS = clip.clip._MODELS 37 | model_filename = os.path.basename(MODELS[clip_model]).replace(".pt", "") 38 | 39 | model = clip.load(clip_model, device="cpu") 40 | state_dict = model[0].state_dict() 41 | 42 | # output in the same directory as the model 43 | fname_out = os.path.join(dir_out, model_filename + ".ggml") 44 | os.makedirs(dir_out, exist_ok=True) 45 | 46 | fout = open(fname_out, "wb") 47 | 48 | # Get HParams 49 | # Only ViT models supported for now 50 | vit = True 51 | if vit: 52 | vision_width = state_dict["visual.conv1.weight"].shape[0] 53 | vision_layers = len( 54 | [ 55 | k 56 | for k in state_dict.keys() 57 | if k.startswith("visual.") and k.endswith(".attn.in_proj_weight") 58 | ] 59 | ) 60 | vision_patch_size = state_dict["visual.conv1.weight"].shape[-1] 61 | grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5) 62 | image_resolution = vision_patch_size * grid_size 63 | embed_dim = state_dict["text_projection"].shape[1] 64 | context_length = state_dict["positional_embedding"].shape[0] 65 | vocab_size = state_dict["token_embedding.weight"].shape[0] 66 | transformer_width = state_dict["ln_final.weight"].shape[0] 67 | transformer_heads = transformer_width // 64 68 | transformer_layers = len( 69 | set(k.split(".")[2] for k in state_dict if k.startswith("transformer.resblocks")) 70 | ) 71 | print("HParams:") 72 | print(" vision_width:", vision_width) 73 | print(" vision_layers:", vision_layers) 74 | print(" vision_patch_size:", vision_patch_size) 75 | print(" grid_size:", grid_size) 76 | print(" image_resolution:", image_resolution) 77 | print(" embed_dim:", embed_dim) 78 | print(" context_length:", context_length) 79 | print(" vocab_size:", vocab_size) 80 | print(" transformer_width:", transformer_width) 81 | print(" transformer_heads:", transformer_heads) 82 | print(" transformer_layers:", transformer_layers) 83 | 84 | 85 | ftype = 0 86 | 87 | # Write hparams 88 | fout.write(struct.pack("i", 0x67676D6C)) # magic: ggml in hex 89 | fout.write(struct.pack("i", vision_width)) 90 | fout.write(struct.pack("i", vision_layers)) 91 | fout.write(struct.pack("i", vision_patch_size)) 92 | fout.write(struct.pack("i", grid_size)) 93 | fout.write(struct.pack("i", image_resolution)) 94 | fout.write(struct.pack("i", embed_dim)) 95 | fout.write(struct.pack("i", context_length)) 96 | fout.write(struct.pack("i", transformer_width)) 97 | fout.write(struct.pack("i", transformer_heads)) 98 | fout.write(struct.pack("i", transformer_layers)) 99 | fout.write(struct.pack("i", ftype)) # ftype: 0 = float32, 1 = float16 100 | 101 | bpe_path = os.path.join(os.path.dirname(clip.__file__), "bpe_simple_vocab_16e6.txt.gz") 102 | merges = gzip.open(bpe_path).read().decode("utf-8").split("\n") 103 | merges = merges[1 : 49152 - 256 - 2 + 1] 104 | merges = 
[tuple(merge.split()) for merge in merges] 105 | 106 | vocab = list(clip.simple_tokenizer.bytes_to_unicode().values()) 107 | tokens = vocab + [v + "" for v in vocab] 108 | for merge in merges: 109 | tokens.append("".join(merge)) 110 | tokens.extend(["<|startoftext|>", "<|endoftext|>"]) 111 | # byte_decoder = {v: k for k, v in clip.simple_tokenizer.bytes_to_unicode().items()} 112 | 113 | fout.write(struct.pack("i", len(tokens))) 114 | 115 | for key in tokens: 116 | text = key.encode("utf-8") 117 | fout.write(struct.pack("i", len(text))) 118 | fout.write(text) 119 | 120 | for name in state_dict.keys(): 121 | data = state_dict[name].squeeze().numpy() 122 | print("Processing variable: " + name + " with shape: ", data.shape) 123 | # ftype == 0 -> float32, ftype == 1 -> float16 124 | ftype = 0 125 | 126 | if name == "visual.conv1.weight": 127 | data = data.astype(np.float16) 128 | ftype = 1 129 | n_dims = len(data.shape) 130 | 131 | # header 132 | str = name.encode("utf-8") 133 | fout.write(struct.pack("iii", n_dims, len(str), ftype)) 134 | for i in range(n_dims): 135 | fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) 136 | fout.write(str) 137 | 138 | # data 139 | data.tofile(fout) 140 | 141 | fout.close() 142 | 143 | print("Done. Output file: " + fname_out) 144 | print("") 145 | -------------------------------------------------------------------------------- /examples/clip/model.py: -------------------------------------------------------------------------------- 1 | """ggml-python implemention of the CLIP model 2 | """ 3 | import io 4 | import os 5 | import ctypes 6 | import struct 7 | import argparse 8 | import numpy as np 9 | from typing import List, Tuple, Dict 10 | import ggml 11 | from ggml.experimental import GGML_FTYPE, Context, InitParams, Tensor, GGML_TYPE, CGraph 12 | 13 | 14 | def compute_ctx_size(fin: io.BufferedReader) -> int: 15 | # Save current position in file and get file size, then return 16 | position = fin.tell() 17 | 18 | ctx_size = 0 19 | while True: 20 | nbytes = struct.calcsize("iii") 21 | data = fin.read(nbytes) 22 | if len(data) != nbytes: 23 | break 24 | (n_dims, s_len, ftype) = struct.unpack("iii", data) 25 | dims = struct.unpack("i" * n_dims, fin.read(struct.calcsize("i" * n_dims))) 26 | if ftype == 0: 27 | _format = "f" 28 | if ftype == 1: 29 | _format = "e" 30 | n_bytes = struct.calcsize(_format * int(np.prod(dims))) 31 | ctx_size += n_bytes 32 | ctx_size += 256 # Padding? 
33 | name = fin.read(s_len).decode("utf-8") 34 | # print(f"Name: {name}, dims: {dims}, n_bytes: {n_bytes}") 35 | 36 | fin.seek(n_bytes, os.SEEK_CUR) 37 | 38 | # Seek back to saved position 39 | fin.seek(position) 40 | return ctx_size 41 | 42 | 43 | class ResidualAttentionBlock: 44 | def __init__( 45 | self, 46 | ctx: Context, 47 | wtype: GGML_TYPE, 48 | embed_dim: int, 49 | heads: int, 50 | use_attn_mask: bool = False, 51 | ): 52 | self.tensors: Dict[str, Tensor] = {} 53 | self.n_head = heads 54 | self.embed_dim = embed_dim 55 | self.use_attn_mask = use_attn_mask 56 | # Layer Norm 1 (ln_1) 57 | self.ln_1_weight = Tensor.new_tensor_1d(wtype, embed_dim, ctx=ctx) 58 | self.ln_1_bias = Tensor.new_tensor_1d(wtype, embed_dim, ctx=ctx) 59 | self.tensors["ln_1.weight"] = self.ln_1_weight 60 | self.tensors["ln_1.bias"] = self.ln_1_bias 61 | 62 | # Attention Block (attn) 63 | self.in_proj_weight = Tensor.new_tensor_2d( 64 | wtype, embed_dim, 3 * embed_dim, ctx=ctx 65 | ) 66 | self.in_proj_bias = Tensor.new_tensor_1d(wtype, 3 * embed_dim, ctx=ctx) 67 | self.out_proj_weight = Tensor.new_tensor_2d( 68 | wtype, embed_dim, embed_dim, ctx=ctx 69 | ) 70 | self.out_proj_bias = Tensor.new_tensor_1d(wtype, embed_dim, ctx=ctx) 71 | self.tensors["attn.in_proj_weight"] = self.in_proj_weight 72 | self.tensors["attn.in_proj_bias"] = self.in_proj_bias 73 | self.tensors["attn.out_proj.weight"] = self.out_proj_weight 74 | self.tensors["attn.out_proj.bias"] = self.out_proj_bias 75 | 76 | # Layer Norm 2 (ln_2) 77 | self.ln_2_weight = Tensor.new_tensor_1d(wtype, embed_dim, ctx=ctx) 78 | self.ln_2_bias = Tensor.new_tensor_1d(wtype, embed_dim, ctx=ctx) 79 | self.tensors["ln_2.weight"] = self.ln_2_weight 80 | self.tensors["ln_2.bias"] = self.ln_2_bias 81 | 82 | # MLP (mlp) 83 | self.mlp_c_fc_weight = Tensor.new_tensor_2d( 84 | wtype, embed_dim, embed_dim * 4, ctx=ctx 85 | ) 86 | self.mlp_c_fc_bias = Tensor.new_tensor_1d(wtype, embed_dim * 4, ctx=ctx) 87 | self.mlp_c_proj_weight = Tensor.new_tensor_2d( 88 | wtype, embed_dim * 4, embed_dim, ctx=ctx 89 | ) 90 | self.mlp_c_proj_bias = Tensor.new_tensor_1d(wtype, embed_dim, ctx=ctx) 91 | self.tensors["mlp.c_fc.weight"] = self.mlp_c_fc_weight 92 | self.tensors["mlp.c_fc.bias"] = self.mlp_c_fc_bias 93 | self.tensors["mlp.c_proj.weight"] = self.mlp_c_proj_weight 94 | self.tensors["mlp.c_proj.bias"] = self.mlp_c_proj_bias 95 | 96 | @staticmethod 97 | def compute_forward_mem_size( 98 | N: int, width: int, n_heads: int, use_attn_mask: bool = False 99 | ) -> int: 100 | e_size = 4 101 | ggml_overhead = 256 102 | mem_size = 0 103 | mem_size += ( 104 | e_size * width * N + ggml_overhead 105 | ) * 5 # ln_1: repeat, repeat, mul, add, norm 106 | mem_size += ( 107 | e_size * width * 3 * N + ggml_overhead 108 | ) * 3 # in_proj: mul_mat, repeat, add 109 | mem_size += ggml_overhead * 3 # view_2d: Qcur, Kcur, Vcur 110 | mem_size += ( 111 | (e_size * (width // n_heads) * n_heads * N + ggml_overhead) 112 | + 2 * ggml_overhead 113 | ) * 2 # K,Q: new_tensor, cpy, permute 114 | mem_size += e_size * N * N * n_heads + ggml_overhead # KQ 115 | mem_size += e_size * 4 + 256 # KQ_scaled: new_f32 116 | mem_size += e_size * N * N * n_heads + ggml_overhead # KQ_scaled 117 | if use_attn_mask: 118 | mem_size += ( 119 | e_size * N * N * n_heads + ggml_overhead + e_size * 4 + ggml_overhead 120 | ) # diag_mask_inf 121 | mem_size += e_size * N * N * n_heads + ggml_overhead # KQ_soft_max 122 | mem_size += ( 123 | e_size * (width // n_heads) * n_heads * N + ggml_overhead 124 | ) # V_trans: new_tensor_3d 125 | 
mem_size += ggml_overhead * 2 # V_trans: cpy and permute 126 | mem_size += ( 127 | e_size * (width // n_heads) * n_heads * N + ggml_overhead 128 | ) # V_trans: new_tensor_3d 129 | mem_size += ggml_overhead # V_trans: cpy 130 | mem_size += ( 131 | e_size * (width // n_heads) * n_heads * N + ggml_overhead 132 | ) # KQV: mul_mat 133 | mem_size += ggml_overhead # KQV_merged: permute 134 | mem_size += e_size * width * N + ggml_overhead # KQV_merged: new_tensor_2d 135 | mem_size += ggml_overhead # KQV_merged: cpy 136 | mem_size += ( 137 | e_size * width * N + ggml_overhead 138 | ) * 3 # out_proj: mul_mat, repeat, add 139 | mem_size += e_size * width * N + ggml_overhead # Add residual 140 | mem_size += ( 141 | e_size * width * N + ggml_overhead 142 | ) * 5 # ln_2: norm, add, repeat, repeat, mul 143 | mem_size += ( 144 | e_size * width * 4 * N + ggml_overhead 145 | ) * 3 # MLP: mul_mat, repeat, add 146 | mem_size += (e_size * 4 + 256) * 2 # SiLU: sf_in, sf_out 147 | mem_size += ( 148 | e_size * width * 4 * N + ggml_overhead 149 | ) * 3 # SiLU: scale, silu, scale 150 | mem_size += ( 151 | e_size * width * N + ggml_overhead 152 | ) * 3 # mlp_c_proj: mul_mat, repeat, add 153 | mem_size += e_size * width * N + ggml_overhead # Add Residual 154 | 155 | return mem_size 156 | 157 | def forward(self, inpL: Tensor, ctx: Context, gf: CGraph) -> Tensor: 158 | N = inpL.shape[1] 159 | 160 | # [768, N] 161 | cur = Tensor.norm(inpL, ctx=ctx) 162 | # cur = ln_1_weight * cur + ln_1_bias 163 | # [768, N] 164 | cur = Tensor.add( 165 | Tensor.mul(Tensor.repeat(self.ln_1_weight, cur, ctx=ctx), cur, ctx=ctx), 166 | Tensor.repeat(self.ln_1_bias, cur, ctx=ctx), 167 | ctx=ctx, 168 | ) 169 | 170 | # cur = in_proj_weight * cur + in_proj_bias 171 | # [768, N] - cur (in) 172 | # [2304, 768] - in_proj_weight 173 | # [2304, 1] - in_proj_bias 174 | # [2304, N] - cur (out) 175 | cur = Tensor.mul_mat(self.in_proj_weight, cur, ctx=ctx) 176 | 177 | cur = Tensor.add(Tensor.repeat(self.in_proj_bias, cur, ctx=ctx), cur, ctx=ctx) 178 | 179 | # Self-Attention 180 | n_embd = cur.shape[0] // 3 181 | 182 | Qcur = Tensor.view_2d( 183 | cur, 184 | n_embd, 185 | N, 186 | cur.tensor.contents.nb[1], 187 | 0 * ctypes.sizeof(ctypes.c_float) * n_embd, 188 | ctx=ctx, 189 | ) 190 | 191 | Kcur = Tensor.view_2d( 192 | cur, 193 | n_embd, 194 | N, 195 | cur.tensor.contents.nb[1], 196 | 1 * ctypes.sizeof(ctypes.c_float) * n_embd, 197 | ctx=ctx, 198 | ) 199 | 200 | Vcur = Tensor.view_2d( 201 | cur, 202 | n_embd, 203 | N, 204 | cur.tensor.contents.nb[1], 205 | 2 * ctypes.sizeof(ctypes.c_float) * n_embd, 206 | ctx=ctx, 207 | ) 208 | 209 | Q = Tensor.permute( 210 | Tensor.cpy( 211 | Qcur, 212 | Tensor.new_tensor_3d( 213 | GGML_TYPE.F32, n_embd // self.n_head, self.n_head, N, ctx=ctx 214 | ), 215 | ctx=ctx, 216 | ), 217 | 0, 218 | 2, 219 | 1, 220 | 3, 221 | ctx=ctx, 222 | ) 223 | 224 | K = Tensor.permute( 225 | Tensor.cpy( 226 | Kcur, 227 | Tensor.new_tensor_3d( 228 | GGML_TYPE.F32, n_embd // self.n_head, self.n_head, N, ctx=ctx 229 | ), 230 | ctx=ctx, 231 | ), 232 | 0, 233 | 2, 234 | 1, 235 | 3, 236 | ctx=ctx, 237 | ) 238 | 239 | KQ = Tensor.mul_mat(K, Q, ctx=ctx) 240 | 241 | KQ_scaled = Tensor.scale( 242 | KQ, 243 | Tensor.new_f32( 244 | 1.0 / np.sqrt(float(n_embd) / self.n_head), 245 | ctx=ctx, 246 | ), 247 | ctx=ctx, 248 | ) 249 | if self.use_attn_mask: 250 | KQ_masked = Tensor.diag_mask_inf(KQ_scaled, 0, ctx=ctx) 251 | KQ_soft_max = Tensor.soft_max(KQ_masked, ctx=ctx) 252 | else: 253 | KQ_soft_max = Tensor.soft_max(KQ_scaled, ctx=ctx) 254 | 255 | V_trans = 
Tensor.cpy( 256 | Tensor.permute( 257 | Tensor.cpy( 258 | Vcur, 259 | Tensor.new_tensor_3d( 260 | GGML_TYPE.F32, n_embd // self.n_head, self.n_head, N, ctx=ctx 261 | ), 262 | ctx=ctx, 263 | ), 264 | 1, 265 | 2, 266 | 0, 267 | 3, 268 | ctx=ctx, 269 | ), 270 | Tensor.new_tensor_3d( 271 | GGML_TYPE.F32, N, n_embd // self.n_head, self.n_head, ctx=ctx 272 | ), 273 | ctx=ctx, 274 | ) 275 | 276 | KQV = Tensor.mul_mat(V_trans, KQ_soft_max, ctx=ctx) 277 | 278 | KQV_merged = Tensor.permute( 279 | KQV, 280 | 0, 281 | 2, 282 | 1, 283 | 3, 284 | ctx=ctx, 285 | ) 286 | 287 | cur = Tensor.cpy( 288 | KQV_merged, 289 | Tensor.new_tensor_2d( 290 | GGML_TYPE.F32, 291 | n_embd, 292 | N, 293 | ctx=ctx, 294 | ), 295 | ctx=ctx, 296 | ) 297 | 298 | cur = Tensor.mul_mat( 299 | self.out_proj_weight, 300 | cur, 301 | ctx=ctx, 302 | ) 303 | 304 | cur = Tensor.add(Tensor.repeat(self.out_proj_bias, cur, ctx=ctx), cur, ctx=ctx) 305 | 306 | # Add Residual 307 | inpL = Tensor.add(inpL, cur, ctx=ctx) 308 | 309 | # LN2 310 | cur = Tensor.norm(inpL, ctx=ctx) 311 | cur = Tensor.add( 312 | Tensor.mul(Tensor.repeat(self.ln_2_weight, cur, ctx=ctx), cur, ctx=ctx), 313 | Tensor.repeat(self.ln_2_bias, cur, ctx=ctx), 314 | ctx=ctx, 315 | ) 316 | 317 | # MLP 318 | # c_fc 319 | cur = Tensor.mul_mat(self.mlp_c_fc_weight, cur, ctx=ctx) 320 | cur = Tensor.add(Tensor.repeat(self.mlp_c_fc_bias, cur, ctx=ctx), cur, ctx=ctx) 321 | 322 | # QuickGELU - x * sigmoid(1.702 * x) 323 | cur = Tensor.scale(cur, Tensor.new_f32(1.702, ctx=ctx), ctx=ctx) 324 | 325 | cur = Tensor.silu(cur, ctx=ctx) 326 | 327 | cur = Tensor.scale(cur, Tensor.new_f32(1 / 1.702, ctx=ctx), ctx=ctx) 328 | 329 | # c_proj 330 | cur = Tensor.mul_mat(self.mlp_c_proj_weight, cur, ctx=ctx) 331 | cur = Tensor.add( 332 | Tensor.repeat(self.mlp_c_proj_bias, cur, ctx=ctx), cur, ctx=ctx 333 | ) 334 | 335 | # Add Residual 336 | cur = Tensor.add(inpL, cur, ctx=ctx) 337 | return cur 338 | 339 | 340 | class VisionTransformer: 341 | def __init__( 342 | self, 343 | ctx: Context, 344 | wtype: GGML_TYPE, 345 | input_resolution: int, 346 | patch_size: int, 347 | width: int, 348 | heads: int, 349 | layers: int, 350 | output_dim: int, 351 | ): 352 | self.layers = layers 353 | self.tensors: Dict[str, Tensor] = {} 354 | 355 | # Class Embedding (visual.class_embedding) 356 | self.visual_class_embedding = Tensor.new_tensor_1d(wtype, width, ctx=ctx) 357 | self.tensors["visual.class_embedding"] = self.visual_class_embedding 358 | 359 | # Positional Embedding (visual.positional_embedding) 360 | self.visual_positional_embedding = Tensor.new_tensor_2d( 361 | wtype, width, (input_resolution // patch_size) ** 2 + 1, ctx=ctx 362 | ) 363 | self.tensors["visual.positional_embedding"] = self.visual_positional_embedding 364 | 365 | # Convolutional Layer (visual.conv1.weight) 366 | wtype_f16 = GGML_TYPE(ggml.ggml_ftype_to_ggml_type(ctypes.c_int(1))) 367 | self.visual_conv1_weight = Tensor.new_tensor_4d( 368 | wtype_f16, patch_size, patch_size, 3, width, ctx=ctx 369 | ) 370 | self.tensors["visual.conv1.weight"] = self.visual_conv1_weight 371 | 372 | # pre Layer Norm Weight (visual.ln_pre.weight) 373 | self.visual_ln_pre_weight = Tensor.new_tensor_1d(wtype, width, ctx=ctx) 374 | self.tensors["visual.ln_pre.weight"] = self.visual_ln_pre_weight 375 | 376 | # pre Layer Norm Bias (visual.ln_pre.bias) 377 | self.visual_ln_pre_bias = Tensor.new_tensor_1d(wtype, width, ctx=ctx) 378 | self.tensors["visual.ln_pre.bias"] = self.visual_ln_pre_bias 379 | self.resblocks = [] 380 | for i in range(layers): 381 | resblock = 
ResidualAttentionBlock( 382 | ctx=ctx, wtype=wtype, embed_dim=width, heads=heads, use_attn_mask=False 383 | ) 384 | self.resblocks.append(resblock) 385 | self.tensors.update( 386 | { 387 | f"visual.transformer.resblocks.{i}." + k: v 388 | for k, v in resblock.tensors.items() 389 | } 390 | ) 391 | 392 | # post Layer Norm (visual.ln_post) 393 | self.visual_ln_post_weight = Tensor.new_tensor_1d(wtype, width, ctx=ctx) 394 | self.visual_ln_post_bias = Tensor.new_tensor_1d(wtype, width, ctx=ctx) 395 | self.tensors["visual.ln_post.weight"] = self.visual_ln_post_weight 396 | self.tensors["visual.ln_post.bias"] = self.visual_ln_post_bias 397 | 398 | # Visual Projection (visual.proj) 399 | self.visual_proj = Tensor.new_tensor_2d(wtype, output_dim, width, ctx=ctx) 400 | self.tensors["visual.proj"] = self.visual_proj 401 | 402 | 403 | class ClipModel: 404 | def __init__( 405 | self, 406 | ctx: Context, 407 | wtype: GGML_TYPE, 408 | vision_width: int, 409 | vision_layers: int, 410 | vision_patch_size: int, 411 | image_resolution: int, 412 | embed_dim: int, 413 | context_length: int, 414 | vocab_size: int, 415 | transformer_width: int, 416 | transformer_heads: int, 417 | transformer_layers: int, 418 | n_threads: int, 419 | ): 420 | self.n_threads = n_threads 421 | self.tensors: Dict[str, Tensor] = {} 422 | 423 | # Vision Transformer 424 | self.vision_layers = vision_layers 425 | self.vision_patch_size = vision_patch_size 426 | self.vision_width = vision_width 427 | self.vision_heads = vision_width // 64 428 | self.image_resolution = image_resolution 429 | self.grid_size = image_resolution // vision_patch_size 430 | 431 | # Text Transformer 432 | self.context_length = context_length 433 | self.transformer_width = transformer_width 434 | self.transformer_heads = transformer_heads 435 | self.transformer_layers = transformer_layers 436 | 437 | self.embed_dim = embed_dim 438 | 439 | # Positional Embedding (position_embedding) 440 | self.positional_embedding = Tensor.new_tensor_2d( 441 | wtype, transformer_width, context_length, ctx=ctx 442 | ) 443 | self.tensors["positional_embedding"] = self.positional_embedding 444 | 445 | # Text Projection (text_projection) 446 | self.text_projection = Tensor.new_tensor_2d( 447 | wtype, transformer_width, embed_dim, ctx=ctx 448 | ) 449 | self.tensors["text_projection"] = self.text_projection 450 | 451 | # Logit Scale (logit_scale) 452 | self.logit_scale = Tensor.new_tensor_1d(wtype, 1, ctx=ctx) 453 | self.tensors["logit_scale"] = self.logit_scale 454 | 455 | # Visual Transformer (visual.) 456 | self.visual = VisionTransformer( 457 | ctx=ctx, 458 | wtype=wtype, 459 | input_resolution=image_resolution, 460 | patch_size=vision_patch_size, 461 | width=vision_width, 462 | layers=vision_layers, 463 | heads=self.vision_heads, 464 | output_dim=embed_dim, 465 | ) 466 | self.tensors.update(self.visual.tensors) 467 | 468 | # Transformer (transformer.) 469 | self.transformer_res_blocks = [] 470 | for i in range(transformer_layers): 471 | res_block = ResidualAttentionBlock( 472 | ctx=ctx, 473 | wtype=wtype, 474 | embed_dim=transformer_width, 475 | heads=transformer_heads, 476 | use_attn_mask=True, 477 | ) 478 | self.transformer_res_blocks.append(res_block) 479 | self.tensors.update( 480 | { 481 | f"transformer.resblocks.{i}." 
+ k: v 482 | for k, v in res_block.tensors.items() 483 | } 484 | ) 485 | 486 | # Token Embedding (token_embedding.weight) 487 | self.token_embedding = Tensor.new_tensor_2d( 488 | wtype, transformer_width, vocab_size, ctx=ctx 489 | ) 490 | self.tensors["token_embedding.weight"] = self.token_embedding 491 | 492 | # Final Layer Norm (ln_final.weight) 493 | self.ln_final_weight = Tensor.new_tensor_1d(wtype, transformer_width, ctx=ctx) 494 | self.tensors["ln_final.weight"] = self.ln_final_weight 495 | 496 | # Final Layer Norm (ln_final.bias) 497 | self.ln_final_bias = Tensor.new_tensor_1d(wtype, transformer_width, ctx=ctx) 498 | self.tensors["ln_final.bias"] = self.ln_final_bias 499 | 500 | def encode_image(self, image): 501 | tensor = self._encode_image_internal(image) 502 | return tensor.numpy().copy().reshape(1, -1) 503 | 504 | def encode_text(self, text_embds): 505 | encodings = [] 506 | # TODO: batchify 507 | for text_embd in text_embds: 508 | tensor = self._encode_text_internal(text_embd) 509 | encodings.append(tensor.numpy().copy().reshape(1, -1)) 510 | return np.concatenate(encodings, axis=0) 511 | 512 | def __call__(self, image, text): 513 | image_features = self.encode_image(image) 514 | text_features = self.encode_text(text) 515 | 516 | # normalized features 517 | image_features = image_features / np.linalg.norm( 518 | image_features, axis=1, keepdims=True 519 | ) 520 | text_features = text_features / np.linalg.norm( 521 | text_features, axis=1, keepdims=True 522 | ) 523 | 524 | # cosine similarity as logits 525 | logit_scale = np.exp(self.logit_scale.numpy().copy()) 526 | logits_per_image = logit_scale * image_features @ text_features.T 527 | logits_per_text = logits_per_image.T 528 | 529 | # shape = [global_batch_size, global_batch_size] 530 | return logits_per_image, logits_per_text 531 | 532 | def _text_encoder_compute_forward_memsize(self): 533 | mem_size = 0 534 | e_size = 4 535 | ggml_overhead = 256 536 | mem_size += e_size * self.context_length + ggml_overhead # input embd 537 | 538 | mem_size += ( 539 | e_size * self.context_length * self.embed_dim + ggml_overhead 540 | ) # token embedding 541 | 542 | mem_size += ( 543 | e_size * self.context_length * self.embed_dim + ggml_overhead 544 | ) # add positional embedding 545 | res_block_mem_size = ResidualAttentionBlock.compute_forward_mem_size( 546 | self.context_length, 547 | self.transformer_width, 548 | self.transformer_heads, 549 | use_attn_mask=True, 550 | ) 551 | mem_size += res_block_mem_size * self.transformer_layers 552 | mem_size += ( 553 | e_size * self.transformer_width * self.context_length + ggml_overhead 554 | ) * 5 # ln_final 555 | 556 | mem_size += ggml_overhead # view 557 | 558 | mem_size += e_size * self.embed_dim + ggml_overhead # Text Proj: output 559 | mem_size += ggml_overhead # Text Proj: Transpose 560 | mem_size += ( 561 | e_size * self.embed_dim * self.embed_dim * ggml_overhead 562 | ) # Text Proj: cpy 563 | return mem_size 564 | 565 | def _encode_text_internal(self, embd_inp: np.ndarray): 566 | wtype = GGML_TYPE(ggml.ggml_ftype_to_ggml_type(ctypes.c_int(0))) 567 | N = self.context_length 568 | mem_size = self._text_encoder_compute_forward_memsize() 569 | mem_buffer = np.empty(mem_size, dtype=np.uint8) 570 | init_params = InitParams( 571 | mem_size=mem_size, mem_buffer=mem_buffer.ctypes.data_as(ctypes.c_void_p) 572 | ) 573 | ctx0 = Context(init_params=init_params) 574 | 575 | gf = CGraph(cgraph=ggml.ggml_cgraph(n_threads=self.n_threads), ctx=ctx0) 576 | 577 | embd = Tensor.new_tensor_1d(GGML_TYPE.I32, 
N, ctx=ctx0) 578 | embd.numpy()[:] = np.array(embd_inp, dtype=np.int32) 579 | inpL = Tensor.get_rows(self.token_embedding, embd, ctx=ctx0) 580 | cur = Tensor.add(inpL, self.positional_embedding, ctx=ctx0) 581 | 582 | for il in range(self.transformer_layers): 583 | resblock = self.transformer_res_blocks[il] 584 | cur = resblock.forward(cur, ctx=ctx0, gf=gf) 585 | 586 | cur = Tensor.norm(cur, ctx=ctx0) 587 | cur = Tensor.add( 588 | Tensor.mul( 589 | Tensor.repeat(self.ln_final_weight, cur, ctx=ctx0), 590 | cur, 591 | ctx=ctx0, 592 | ), 593 | Tensor.repeat(self.ln_final_bias, cur, ctx=ctx0), 594 | ctx=ctx0, 595 | ) 596 | 597 | # Use the embedding from the EOT token 598 | eot_idx = embd_inp.argmax() 599 | cur = Tensor.view_2d( 600 | cur, 601 | self.embed_dim, 602 | 1, 603 | cur.tensor.contents.nb[1], 604 | eot_idx * cur.tensor.contents.nb[1], 605 | ctx=ctx0, 606 | ) 607 | 608 | cur = Tensor.mul_mat( 609 | Tensor.cpy( 610 | Tensor.transpose(self.text_projection, ctx=ctx0), 611 | Tensor.new_tensor_2d(wtype, self.embed_dim, self.embed_dim, ctx=ctx0), 612 | ), 613 | cur, 614 | ctx=ctx0, 615 | ) 616 | gf.build_forward_expand(cur) 617 | gf.compute() 618 | return cur 619 | 620 | def _image_encoder_compute_forward_memsize(self): 621 | e_size = 4 622 | N = self.grid_size * self.grid_size + 1 623 | ggml_overhead = 256 624 | 625 | mem_size = 0 626 | mem_size += 256 627 | mem_size += ( 628 | e_size * self.image_resolution * self.image_resolution * 3 + ggml_overhead 629 | ) # image 630 | mem_size += ( 631 | e_size * self.grid_size * self.grid_size * self.vision_width + ggml_overhead 632 | ) # conv 633 | mem_size += e_size * self.vision_width * N + ggml_overhead # concat 634 | 635 | mem_size += ( 636 | e_size * self.vision_width * N + ggml_overhead 637 | ) * 2 # Copy in visual features 638 | mem_size += e_size * 8 + 256 639 | mem_size += 2 * ggml_overhead # cpy and transpose 640 | mem_size += e_size * 8 + 256 # ??? 641 | mem_size += ( 642 | e_size * self.vision_width * N + ggml_overhead 643 | ) # Copy in positional embeddings (new tensor 2d) 644 | mem_size += ( 645 | e_size * self.vision_width * N + ggml_overhead 646 | ) # copy visual features: ret 647 | 648 | mem_size += e_size * self.vision_width * N + ggml_overhead # add 649 | 650 | mem_size += e_size * self.vision_width * N + ggml_overhead # ln_pre: norm 651 | mem_size += e_size * self.vision_width * N + ggml_overhead # ln_pre: repeat 652 | mem_size += e_size * self.vision_width * N + ggml_overhead # ln_pre: repeat 653 | mem_size += e_size * self.vision_width * N + ggml_overhead # ln_pre: mul 654 | mem_size += e_size * self.vision_width * N + ggml_overhead # ln_pre: add 655 | 656 | res_block_mem_size = ResidualAttentionBlock.compute_forward_mem_size( 657 | N, self.vision_width, self.vision_heads, use_attn_mask=False 658 | ) 659 | 660 | mem_size += res_block_mem_size * self.vision_layers 661 | mem_size += ggml_overhead # ln_post: transpose 662 | mem_size += (e_size * self.vision_width + ggml_overhead) * 3 # ln_post 663 | 664 | mem_size += e_size * self.vision_width * self.embed_dim + ggml_overhead 665 | mem_size += ggml_overhead # cpy 666 | mem_size += 159808 # Compute Overhead ?? 
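        # NOTE: the figures above are rough upper bounds rather than exact
        # sizes: every intermediate tensor is costed as F32 (e_size = 4 bytes
        # per element) plus a fixed ~256-byte ggml tensor header
        # (ggml_overhead), and the final constant appears to be an empirically
        # chosen cushion for graph bookkeeping rather than a derived value.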
667 | return mem_size 668 | 669 | def _encode_image_internal(self, image): 670 | wtype = GGML_TYPE(ggml.ggml_ftype_to_ggml_type(ctypes.c_int(0))) 671 | 672 | mem_size = self._image_encoder_compute_forward_memsize() 673 | mem_buffer = np.empty(mem_size, dtype=np.uint8) 674 | init_params = InitParams( 675 | mem_size=mem_size, mem_buffer=mem_buffer.ctypes.data_as(ctypes.c_void_p) 676 | ) 677 | ctx0 = Context(init_params=init_params) 678 | 679 | gf = CGraph(cgraph=ggml.ggml_cgraph(n_threads=self.n_threads), ctx=ctx0) 680 | 681 | img_tensor = Tensor.new_tensor_4d( 682 | wtype, 683 | image.shape[3], 684 | image.shape[2], 685 | image.shape[1], 686 | image.shape[0], 687 | ctx=ctx0, 688 | ) 689 | img_tensor.numpy()[:] = image.permute(3, 2, 1, 0) 690 | cur = Tensor.conv_2d_sk_p0( 691 | self.visual.visual_conv1_weight, img_tensor, ctx=ctx0 692 | ) 693 | 694 | cur = Tensor.reshape_2d( 695 | cur, 696 | cur.shape[0] * cur.shape[1], 697 | cur.shape[2], 698 | ctx=ctx0, 699 | ) 700 | 701 | concat = Tensor.new_tensor_2d(wtype, cur.shape[0] + 1, cur.shape[1], ctx=ctx0) 702 | 703 | concat = Tensor.set_1d( 704 | concat, 705 | Tensor.view_1d( 706 | self.visual.visual_class_embedding, 707 | self.visual.visual_class_embedding.shape[0], 708 | 0, 709 | ), 710 | 0, 711 | ctx=ctx0, 712 | ) 713 | 714 | # Copy in the visual features 715 | concat = Tensor.set_2d( 716 | concat, 717 | Tensor.cpy( 718 | Tensor.transpose(cur, ctx=ctx0), 719 | Tensor.new_tensor_2d(wtype, cur.shape[0], cur.shape[1]), 720 | ctx=ctx0, 721 | ), 722 | cur.tensor.contents.nb[1], 723 | self.visual.visual_class_embedding.nbytes(), 724 | ctx=ctx0, 725 | ) 726 | 727 | # Copy in the positional embeddings 728 | cur = Tensor.cpy( 729 | concat, 730 | Tensor.new_tensor_2d(wtype, concat.shape[1], concat.shape[0], ctx=ctx0), 731 | ctx=ctx0, 732 | ) 733 | 734 | pEmb = self.visual.visual_positional_embedding 735 | 736 | cur = Tensor.add(cur, pEmb, ctx=ctx0) 737 | 738 | # ln_pre 739 | cur = Tensor.norm(cur, ctx=ctx0) 740 | 741 | cur = Tensor.add( 742 | Tensor.mul( 743 | Tensor.repeat(self.visual.visual_ln_pre_weight, cur, ctx=ctx0), 744 | cur, 745 | ctx=ctx0, 746 | ), 747 | Tensor.repeat(self.visual.visual_ln_pre_bias, cur, ctx=ctx0), 748 | ctx=ctx0, 749 | ) 750 | 751 | # Transformer 752 | for il in range(self.visual.layers): 753 | resblock = self.visual.resblocks[il] 754 | cur = resblock.forward(cur, ctx=ctx0, gf=gf) 755 | 756 | # ln_post 757 | cur = Tensor.norm( 758 | Tensor.view_2d(Tensor.transpose(cur, ctx=ctx0), cur.shape[0], 1, 1, 0), 759 | ctx=ctx0, 760 | ) 761 | 762 | cur = Tensor.add( 763 | Tensor.mul( 764 | Tensor.repeat(self.visual.visual_ln_post_weight, cur, ctx=ctx0), 765 | cur, 766 | ctx=ctx0, 767 | ), 768 | Tensor.repeat(self.visual.visual_ln_post_bias, cur, ctx=ctx0), 769 | ctx=ctx0, 770 | ) 771 | 772 | # Token Projection 773 | cur = Tensor.mul_mat( 774 | Tensor.cpy( 775 | Tensor.transpose(self.visual.visual_proj), 776 | Tensor.new_tensor_2d( 777 | wtype, 778 | self.visual.visual_proj.shape[1], 779 | self.visual.visual_proj.shape[0], 780 | ctx=ctx0, 781 | ), 782 | ctx=ctx0, 783 | ), 784 | Tensor.reshape_2d(cur, cur.shape[0], 1), 785 | ctx=ctx0, 786 | ) 787 | 788 | gf.build_forward_expand(cur) 789 | gf.compute() 790 | 791 | return cur 792 | 793 | @staticmethod 794 | def init_from_file(model_file: str, verbose=True, n_threads=1): 795 | with open(model_file, "rb") as fin: 796 | # Magic Number 797 | (magic,) = struct.unpack("i", (fin.read(struct.calcsize("i")))) 798 | 799 | assert magic == ggml.GGML_FILE_MAGIC 800 | if verbose: 801 | print("magic 
number =", hex(magic)) 802 | # Hyperparameters 803 | ( 804 | vision_width, 805 | vision_layers, 806 | vision_patch_size, 807 | grid_size, 808 | image_resolution, 809 | embed_dim, 810 | context_length, 811 | transformer_width, 812 | transformer_heads, 813 | transformer_layers, 814 | ftype, 815 | vocab_size, 816 | ) = struct.unpack("iiiiiiiiiiii", fin.read(struct.calcsize("iiiiiiiiiiii"))) 817 | 818 | qntvr = ftype // ggml.GGML_QNT_VERSION_FACTOR 819 | if verbose: 820 | print("vision_width =", vision_width) 821 | print("vision_layers =", vision_layers) 822 | print("vision_patch_size =", vision_patch_size) 823 | print("grid_size =", grid_size) 824 | print("image_resolution =", image_resolution) 825 | print("embed_dim =", embed_dim) 826 | print("context_length =", context_length) 827 | print("transformer_width =", transformer_width) 828 | print("transformer_heads =", transformer_heads) 829 | print("transformer_layers =", transformer_layers) 830 | print("ftype =", ftype) 831 | print("qntvr =", qntvr) 832 | print("vocab_size =", vocab_size) 833 | ftype %= ggml.GGML_QNT_VERSION_FACTOR 834 | ftype = GGML_FTYPE(int(ftype)) 835 | 836 | # Vocabulary 837 | vocab: List[Tuple[int, str]] = [] 838 | for i in range(vocab_size): 839 | (s_len,) = struct.unpack("i", fin.read(struct.calcsize("i"))) 840 | s = fin.read(s_len).decode("utf-8") 841 | vocab.append((i, s)) 842 | 843 | # Model Weights 844 | wtype = GGML_TYPE(ggml.ggml_ftype_to_ggml_type(ctypes.c_int(ftype.value))) 845 | 846 | ctx_size = compute_ctx_size(fin) 847 | 848 | mem_buffer = np.empty(ctx_size, dtype=np.uint8) 849 | init_params = InitParams( 850 | mem_size=ctx_size, 851 | mem_buffer=mem_buffer.ctypes.data_as(ctypes.c_void_p), 852 | ) 853 | ctx = Context(init_params=init_params) 854 | 855 | # Create Model 856 | model = ClipModel( 857 | ctx=ctx, 858 | wtype=wtype, 859 | vision_width=vision_width, 860 | vision_layers=vision_layers, 861 | vision_patch_size=vision_patch_size, 862 | image_resolution=image_resolution, 863 | embed_dim=embed_dim, 864 | context_length=context_length, 865 | vocab_size=vocab_size, 866 | transformer_width=transformer_width, 867 | transformer_heads=transformer_heads, 868 | transformer_layers=transformer_layers, 869 | n_threads=n_threads, 870 | ) 871 | 872 | # Load Weights 873 | while True: 874 | nbytes = struct.calcsize("iii") 875 | data = fin.read(nbytes) 876 | if len(data) != nbytes: 877 | break 878 | (n_dims, s_len, ftype) = struct.unpack("iii", data) 879 | dims = struct.unpack( 880 | "i" * n_dims, fin.read(struct.calcsize("i" * n_dims)) 881 | ) 882 | tensor_name = fin.read(s_len).decode("utf-8") 883 | tensor = model.tensors[tensor_name] 884 | n_elements = tensor.nelements() 885 | expected_n_elements = np.prod(dims) 886 | if n_elements != expected_n_elements: 887 | raise ValueError( 888 | f"tensor {tensor_name} has {n_elements} elements, but {expected_n_elements} were expected" 889 | ) 890 | 891 | buf = (ctypes.c_char * tensor.nbytes()).from_address(tensor.data) 892 | offset = fin.tell() 893 | fname = fin.name.encode("utf-8") 894 | fin.readinto(buf) 895 | 896 | return model 897 | 898 | 899 | if __name__ == "__main__": 900 | parser = argparse.ArgumentParser() 901 | parser.add_argument("-m", "--model", type=str, default=None) 902 | parser.add_argument("--use-gpu", action="store_true") 903 | args = parser.parse_args() 904 | 905 | model_file = args.model 906 | model = ClipModel.init_from_file(model_file, n_threads=1, use_gpu=args.use_gpu) 907 | image = np.random.rand(3, 224, 224).astype(np.float32) 908 | output = 
model.eval([image, image]) 909 | print(output) 910 | -------------------------------------------------------------------------------- /examples/clip/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cpu 2 | certifi==2023.7.22 3 | charset-normalizer==2.1.1 4 | clip==1.0 5 | filelock==3.9.0 6 | ftfy==6.1.1 7 | ggml-python @ git+https://github.com/abetlen/ggml-python@main 8 | idna==3.7 9 | Jinja2==3.1.4 10 | MarkupSafe==2.1.2 11 | mpmath==1.3.0 12 | networkx==3.0 13 | numpy==1.24.1 14 | Pillow==10.3.0 15 | regex==2023.6.3 16 | requests==2.32.0 17 | scipy==1.10.1 18 | sympy==1.11.1 19 | torch==2.0.1+cpu 20 | torchvision==0.15.2+cpu 21 | tqdm==4.66.3 22 | typing-extensions==4.6.3 23 | urllib3==1.26.18 24 | wcwidth==0.2.6 25 | -------------------------------------------------------------------------------- /examples/clip/utils.py: -------------------------------------------------------------------------------- 1 | # These functions were copied directly from 2 | # https://github.com/openai/CLIP/blob/main/clip/simple_tokenizer.py 3 | # and 4 | # https://github.com/openai/CLIP/blob/main/clip/clip.py 5 | 6 | import torch 7 | import gzip 8 | import html 9 | import os 10 | from functools import lru_cache 11 | from typing import Union, List 12 | from pkg_resources import packaging 13 | import ftfy 14 | import regex as re 15 | import clip 16 | from PIL import Image 17 | from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize 18 | 19 | try: 20 | from torchvision.transforms import InterpolationMode 21 | 22 | BICUBIC = InterpolationMode.BICUBIC 23 | except ImportError: 24 | BICUBIC = Image.BICUBIC 25 | 26 | 27 | def _convert_image_to_rgb(image): 28 | return image.convert("RGB") 29 | 30 | 31 | def transform(n_px): 32 | return Compose( 33 | [ 34 | Resize(n_px, interpolation=BICUBIC), 35 | CenterCrop(n_px), 36 | _convert_image_to_rgb, 37 | ToTensor(), 38 | Normalize( 39 | (0.48145466, 0.4578275, 0.40821073), 40 | (0.26862954, 0.26130258, 0.27577711), 41 | ), 42 | ] 43 | ) 44 | 45 | 46 | @lru_cache() 47 | def default_bpe(): 48 | return os.path.join( 49 | os.path.dirname(os.path.abspath(clip.__file__)), "bpe_simple_vocab_16e6.txt.gz" 50 | ) 51 | 52 | 53 | @lru_cache() 54 | def bytes_to_unicode(): 55 | """ 56 | Returns list of utf-8 byte and a corresponding list of unicode strings. 57 | The reversible bpe codes work on unicode strings. 58 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 59 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 60 | This is a signficant percentage of your normal, say, 32K bpe vocab. 61 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 62 | And avoids mapping to whitespace/control characters the bpe code barfs on. 63 | """ 64 | bs = ( 65 | list(range(ord("!"), ord("~") + 1)) 66 | + list(range(ord("¡"), ord("¬") + 1)) 67 | + list(range(ord("®"), ord("ÿ") + 1)) 68 | ) 69 | cs = bs[:] 70 | n = 0 71 | for b in range(2**8): 72 | if b not in bs: 73 | bs.append(b) 74 | cs.append(2**8 + n) 75 | n += 1 76 | cs = [chr(n) for n in cs] 77 | return dict(zip(bs, cs)) 78 | 79 | 80 | def get_pairs(word): 81 | """Return set of symbol pairs in a word. 82 | Word is represented as tuple of symbols (symbols being variable-length strings). 
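    For example, ('h', 'e', 'll', 'o') yields {('h', 'e'), ('e', 'll'), ('ll', 'o')}.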
83 | """ 84 | pairs = set() 85 | prev_char = word[0] 86 | for char in word[1:]: 87 | pairs.add((prev_char, char)) 88 | prev_char = char 89 | return pairs 90 | 91 | 92 | def basic_clean(text): 93 | text = ftfy.fix_text(text) 94 | text = html.unescape(html.unescape(text)) 95 | return text.strip() 96 | 97 | 98 | def whitespace_clean(text): 99 | text = re.sub(r"\s+", " ", text) 100 | text = text.strip() 101 | return text 102 | 103 | 104 | class SimpleTokenizer(object): 105 | def __init__(self, bpe_path: str = default_bpe()): 106 | self.byte_encoder = bytes_to_unicode() 107 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 108 | merges = gzip.open(bpe_path).read().decode("utf-8").split("\n") 109 | merges = merges[1 : 49152 - 256 - 2 + 1] 110 | merges = [tuple(merge.split()) for merge in merges] 111 | vocab = list(bytes_to_unicode().values()) 112 | vocab = vocab + [v + "" for v in vocab] 113 | for merge in merges: 114 | vocab.append("".join(merge)) 115 | vocab.extend(["<|startoftext|>", "<|endoftext|>"]) 116 | self.encoder = dict(zip(vocab, range(len(vocab)))) 117 | self.decoder = {v: k for k, v in self.encoder.items()} 118 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 119 | self.cache = { 120 | "<|startoftext|>": "<|startoftext|>", 121 | "<|endoftext|>": "<|endoftext|>", 122 | } 123 | self.pat = re.compile( 124 | r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", 125 | re.IGNORECASE, 126 | ) 127 | 128 | def bpe(self, token): 129 | if token in self.cache: 130 | return self.cache[token] 131 | word = tuple(token[:-1]) + (token[-1] + "",) 132 | pairs = get_pairs(word) 133 | 134 | if not pairs: 135 | return token + "" 136 | 137 | while True: 138 | bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) 139 | if bigram not in self.bpe_ranks: 140 | break 141 | first, second = bigram 142 | new_word = [] 143 | i = 0 144 | while i < len(word): 145 | try: 146 | j = word.index(first, i) 147 | new_word.extend(word[i:j]) 148 | i = j 149 | except: 150 | new_word.extend(word[i:]) 151 | break 152 | 153 | if word[i] == first and i < len(word) - 1 and word[i + 1] == second: 154 | new_word.append(first + second) 155 | i += 2 156 | else: 157 | new_word.append(word[i]) 158 | i += 1 159 | new_word = tuple(new_word) 160 | word = new_word 161 | if len(word) == 1: 162 | break 163 | else: 164 | pairs = get_pairs(word) 165 | word = " ".join(word) 166 | self.cache[token] = word 167 | return word 168 | 169 | def encode(self, text): 170 | bpe_tokens = [] 171 | text = whitespace_clean(basic_clean(text)).lower() 172 | for token in re.findall(self.pat, text): 173 | token = "".join(self.byte_encoder[b] for b in token.encode("utf-8")) 174 | bpe_tokens.extend( 175 | self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" ") 176 | ) 177 | return bpe_tokens 178 | 179 | def decode(self, tokens): 180 | text = "".join([self.decoder[token] for token in tokens]) 181 | text = ( 182 | bytearray([self.byte_decoder[c] for c in text]) 183 | .decode("utf-8", errors="replace") 184 | .replace("", " ") 185 | ) 186 | return text 187 | 188 | 189 | _tokenizer = SimpleTokenizer() 190 | 191 | 192 | def tokenize( 193 | texts: Union[str, List[str]], context_length: int = 77, truncate: bool = False 194 | ) -> Union[torch.IntTensor, torch.LongTensor]: 195 | """ 196 | Returns the tokenized representation of given input string(s) 197 | 198 | Parameters 199 | ---------- 200 | texts : Union[str, List[str]] 201 | An input string or a list of input strings 
to tokenize 202 | 203 | context_length : int 204 | The context length to use; all CLIP models use 77 as the context length 205 | 206 | truncate: bool 207 | Whether to truncate the text in case its encoding is longer than the context length 208 | 209 | Returns 210 | ------- 211 | A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]. 212 | We return LongTensor when torch version is <1.8.0, since older index_select requires indices to be long. 213 | """ 214 | 215 | if isinstance(texts, str): 216 | texts = [texts] 217 | 218 | sot_token = _tokenizer.encoder["<|startoftext|>"] 219 | eot_token = _tokenizer.encoder["<|endoftext|>"] 220 | all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts] 221 | if packaging.version.parse(torch.__version__) < packaging.version.parse("1.8.0"): 222 | result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) 223 | else: 224 | result = torch.zeros(len(all_tokens), context_length, dtype=torch.int) 225 | 226 | for i, tokens in enumerate(all_tokens): 227 | if len(tokens) > context_length: 228 | if truncate: 229 | tokens = tokens[:context_length] 230 | tokens[-1] = eot_token 231 | else: 232 | raise RuntimeError( 233 | f"Input {texts[i]} is too long for context length {context_length}" 234 | ) 235 | result[i, : len(tokens)] = torch.tensor(tokens) 236 | 237 | return result 238 | -------------------------------------------------------------------------------- /examples/custom-operators/example_jax.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | 3 | import ggml 4 | import ggml.utils 5 | 6 | import jax 7 | 8 | from typing import Optional 9 | 10 | params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024, mem_buffer=None) 11 | ctx = ggml.ggml_init(params) 12 | x_in = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 13 | 14 | @ggml.ggml_custom1_op_t 15 | def double( 16 | tensor_out: ggml.ggml_tensor_p, 17 | tensor_in: ggml.ggml_tensor_p, 18 | ith: int, 19 | nth: int, 20 | userdata: Optional[ctypes.c_void_p], 21 | ): 22 | x = jax.device_put(ggml.utils.to_numpy(tensor_in)) 23 | x *= 2 24 | ggml.utils.to_numpy(tensor_out)[:] = jax.device_get(x) 25 | 26 | x_out = ggml.ggml_map_custom1(ctx, x_in, double, 1, None) 27 | gf = ggml.ggml_build_forward(x_out) 28 | 29 | ggml.ggml_set_f32(x_in, 21.0) 30 | 31 | ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1) 32 | output = ggml.ggml_get_f32_1d(x_out, 0) 33 | assert output == 42.0 34 | print("GGML output: ", output) 35 | ggml.ggml_free(ctx) -------------------------------------------------------------------------------- /examples/optimizer/simple.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | """ 3 | # Single-batch stochastic gradient descent example using ggml 4 | 5 | This example demonstrates how to use ggml to implement a simple SGD optimizer. 
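It fits the scalar model `f(x) = a * x + b` to samples generated from
`a_real = 3`, `b_real = 4` by minimizing the squared error `(f(x) - f_true)**2`,
updating `a` and `b` with their gradients under a slowly decaying learning rate.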
6 | """ 7 | # %% 8 | import ggml 9 | import random 10 | 11 | a_real = 3.0 12 | b_real = 4.0 13 | 14 | ctx0 = ggml.ggml_init(ggml.ggml_init_params( 15 | mem_size=128 * 1024 * 1024, mem_buffer=None, no_alloc=False 16 | )) 17 | 18 | assert ctx0 is not None 19 | 20 | # define parameters 21 | a = ggml.ggml_new_tensor_1d(ctx0, ggml.GGML_TYPE_F32, 1) 22 | ggml.ggml_set_param(ctx0, a) 23 | 24 | b = ggml.ggml_new_tensor_1d(ctx0, ggml.GGML_TYPE_F32, 1) 25 | ggml.ggml_set_param(ctx0, b) 26 | 27 | # define input and output 28 | x = ggml.ggml_new_tensor_1d(ctx0, ggml.GGML_TYPE_F32, 1) 29 | ggml.ggml_set_input(x) 30 | 31 | tmp = ggml.ggml_mul(ctx0, a, x) 32 | f = ggml.ggml_add(ctx0, tmp, b) 33 | 34 | # define loss 35 | f_true = ggml.ggml_new_tensor_1d(ctx0, ggml.GGML_TYPE_F32, 1) 36 | ggml.ggml_set_input(f_true) 37 | 38 | tmp = ggml.ggml_sub(ctx0, f, f_true) 39 | loss = ggml.ggml_mul(ctx0, tmp, tmp) 40 | 41 | # build forward and backward graph 42 | gf = ggml.ggml_new_graph_custom(ctx0, ggml.GGML_DEFAULT_GRAPH_SIZE, True) 43 | ggml.ggml_build_forward_expand(gf, loss) 44 | gb = ggml.ggml_graph_dup(ctx0, gf) 45 | ggml.ggml_build_backward_expand(ctx0, gf, gb, False) 46 | 47 | # initialize parameters 48 | ggml.ggml_set_f32(a, 1.0) 49 | ggml.ggml_set_f32(b, 1.0) 50 | 51 | # SGD 52 | lr = 1e-2 53 | nsteps = 1000 54 | decay = 1e-3 55 | 56 | for i in range(nsteps): 57 | # sample data 58 | x_sample = random.uniform(-10, 10) 59 | f_sample = a_real * x_sample + b_real 60 | 61 | # set input 62 | ggml.ggml_set_f32(x, x_sample) 63 | ggml.ggml_set_f32(f_true, f_sample) 64 | 65 | # reset graph 66 | ggml.ggml_graph_reset(gf) 67 | ggml.ggml_set_f32(loss.contents.grad, 1.0) 68 | 69 | # compute forward and backward 70 | ggml.ggml_graph_compute_with_ctx(ctx0, gb, 1) 71 | 72 | # print loss 73 | loss_ = ggml.ggml_get_f32_1d(loss, 0) 74 | print(f"step {i}: loss = {loss_}") 75 | 76 | # decay learning rate 77 | lr *= (1.0 - decay) 78 | 79 | # update parameters 80 | ggml.ggml_set_f32(a, ggml.ggml_get_f32_1d(a, 0) - lr * ggml.ggml_get_f32_1d(a.contents.grad, 0)) 81 | ggml.ggml_set_f32(b, ggml.ggml_get_f32_1d(b, 0) - lr * ggml.ggml_get_f32_1d(b.contents.grad, 0)) 82 | 83 | # print parameters 84 | print(f"a = {ggml.ggml_get_f32_1d(a, 0):.2f}, b = {ggml.ggml_get_f32_1d(b, 0):.2f}") 85 | 86 | 87 | ggml.ggml_free(ctx0) 88 | 89 | # %% 90 | -------------------------------------------------------------------------------- /examples/replit/README.md: -------------------------------------------------------------------------------- 1 | # Replit Code Completion Server 2 | 3 | This example is a local-first Github Copilot drop-in replacement using the replit-code-v1-3b model written entirely in ggml-python. 4 | 5 | For best performance (likely still slower than copilot) please run with CUDA, OpenCL, or Metal support. 
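
The server exposes an OpenAI-style completions API (`/v1/completions`, plus `/v1/engines/copilot-codex/completions` for Copilot clients), so you can also exercise it directly once it is running. A minimal sketch, assuming the server is listening on the default `http://localhost:8000` address used in the editor setup below:

```bash
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"prompt": "def fib(n):", "max_tokens": 34, "temperature": 0, "stop": ["\n\n"]}'
```

Add `"stream": true` to the request body to receive the completion as server-sent events instead of a single response.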
6 | 7 | 8 | ## Installation 9 | 10 | ```bash 11 | # Clone the repo 12 | git clone https://github.com/abetlen/ggml-python.git 13 | cd ggml-python/examples/replit 14 | # (Optional) Create a virtual environment 15 | python3 -m venv venv 16 | source venv/bin/activate 17 | # Install dependencies 18 | pip install -r requirements.txt 19 | ``` 20 | 21 | ## Model Weights 22 | 23 | You can download the quantized model weights from [here](https://huggingface.co/abetlen/replit-code-v1-3b-ggml) 24 | 25 | ## Running the Server 26 | 27 | ```bash 28 | # Start the server 29 | MODEL=/path/to/model uvicorn server:app --reload 30 | ``` 31 | 32 | ## Editor Setup 33 | 34 | ### VSCode 35 | 36 | Add the following to your `settings.json`: 37 | 38 | ```json 39 | { 40 | "github.copilot.advanced": { 41 | "debug.testOverrideProxyUrl": "http://localhost:8000", 42 | "debug.overrideProxyUrl": "http://localhost:8000" 43 | } 44 | } 45 | ``` 46 | 47 | ### Vim / Neovim 48 | 49 | Add the following to your vimrc or init.vim: 50 | 51 | ``` 52 | let g:copilot_proxy = 'localhost:8000' 53 | let g:copilot_strict_ssl = 0 54 | ``` -------------------------------------------------------------------------------- /examples/replit/app.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import time 4 | import uuid 5 | import json 6 | import multiprocessing 7 | from functools import partial 8 | from threading import Lock 9 | from typing import ( 10 | Callable, 11 | Dict, 12 | List, 13 | Optional, 14 | Union, 15 | Iterator, 16 | AsyncIterator, 17 | Sequence, 18 | ) 19 | from os import environ 20 | 21 | from typing_extensions import TypedDict, Literal 22 | 23 | import numpy as np 24 | import numpy.typing as npt 25 | 26 | import anyio 27 | from anyio.streams.memory import MemoryObjectSendStream 28 | from starlette.concurrency import run_in_threadpool, iterate_in_threadpool 29 | from fastapi import FastAPI, Request, Depends 30 | from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict 31 | from sse_starlette.sse import EventSourceResponse 32 | 33 | from main import ReplitModel, ReplitSentencepieceTokenizer 34 | 35 | 36 | ## Types 37 | class CompletionLogprobs(TypedDict): 38 | text_offset: List[int] 39 | token_logprobs: List[Optional[float]] 40 | tokens: List[str] 41 | top_logprobs: List[Optional[Dict[str, float]]] 42 | 43 | 44 | class CompletionChoice(TypedDict): 45 | text: str 46 | index: int 47 | logprobs: Optional[CompletionLogprobs] 48 | finish_reason: Optional[str] 49 | 50 | 51 | class CompletionUsage(TypedDict): 52 | prompt_tokens: int 53 | completion_tokens: int 54 | total_tokens: int 55 | 56 | 57 | class CompletionChunk(TypedDict): 58 | id: str 59 | object: Literal["text_completion"] 60 | created: int 61 | model: str 62 | choices: List[CompletionChoice] 63 | 64 | 65 | class Completion(TypedDict): 66 | id: str 67 | object: Literal["text_completion"] 68 | created: int 69 | model: str 70 | choices: List[CompletionChoice] 71 | usage: CompletionUsage 72 | 73 | 74 | class OpenAIify: 75 | def __init__( 76 | self, 77 | model: ReplitModel, 78 | cancel_callback: Optional[Callable[[], bool]] = None, 79 | ): 80 | self.model = model 81 | self.cancel_callback = cancel_callback 82 | 83 | def tokenize(self, text: str) -> List[int]: 84 | return self.model.tokenize(text) 85 | 86 | def detokenize(self, tokens: List[int]) -> str: 87 | return self.model.detokenize(tokens) 88 | 89 | def generate( 90 | self, 91 | tokens: Sequence[int], 92 | top_p: float = 
0.95, 93 | temperature: float = 0.80, 94 | frequency_penalty: float = 0.0, 95 | presence_penalty: float = 0.0, 96 | ) -> Iterator[int]: 97 | return self.model.generate( 98 | tokens, 99 | top_p=top_p, 100 | temperature=temperature, 101 | frequency_penalty=frequency_penalty, 102 | presence_penalty=presence_penalty, 103 | ) 104 | 105 | def _create_completion( 106 | self, 107 | prompt: str, 108 | suffix: Optional[str] = None, 109 | max_tokens: int = 16, 110 | temperature: float = 0.8, 111 | top_p: float = 0.95, 112 | logprobs: Optional[int] = None, 113 | echo: bool = False, 114 | stop: Optional[Union[str, List[str]]] = [], 115 | frequency_penalty: float = 0.0, 116 | presence_penalty: float = 0.0, 117 | stream: bool = False, 118 | model: Optional[str] = None, 119 | ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: 120 | completion_id: str = f"cmpl-{str(uuid.uuid4())}" 121 | created: int = int(time.time()) 122 | completion_tokens: List[int] = [] 123 | # Add blank space to start of prompt to match OG llama tokenizer 124 | prompt_tokens: List[int] = self.tokenize(prompt) 125 | text: str = "" 126 | returned_tokens: int = 0 127 | stop = ( 128 | stop if isinstance(stop, list) else [stop] if isinstance(stop, str) else [] 129 | ) 130 | model_name: str = model if model is not None else "replit-code-v1-3b" 131 | 132 | # Truncate prompt if it is too long 133 | max_tokens = min( 134 | max_tokens, max(0, self.model.max_seq_len - len(prompt_tokens) - 1) 135 | ) 136 | if len(prompt_tokens) + max_tokens > self.model.max_seq_len: 137 | raise ValueError( 138 | f"Requested tokens exceed context window of {self.model.max_seq_len}" 139 | ) 140 | 141 | stop_sequences = stop if stop != [] else [] 142 | finish_reason = "length" 143 | for token in self.generate( 144 | prompt_tokens, 145 | top_p=top_p, 146 | temperature=temperature, 147 | frequency_penalty=frequency_penalty, 148 | presence_penalty=presence_penalty, 149 | ): 150 | if token == self.eos_token(): 151 | text = self.detokenize(completion_tokens) 152 | finish_reason = "stop" 153 | break 154 | 155 | if self.cancel_callback is not None and self.cancel_callback(): 156 | text = self.detokenize(completion_tokens) 157 | finish_reason = "stop" 158 | break 159 | 160 | completion_tokens.append(token) 161 | 162 | all_text = self.detokenize(completion_tokens) 163 | any_stop = [s for s in stop_sequences if s in all_text] 164 | if len(any_stop) > 0: 165 | first_stop = any_stop[0] 166 | text = all_text[: all_text.index(first_stop)] 167 | finish_reason = "stop" 168 | break 169 | 170 | if stream: 171 | remaining_tokens = completion_tokens[returned_tokens:] 172 | remaining_text = self.detokenize(remaining_tokens) 173 | remaining_length = len(remaining_text) 174 | 175 | # We want to avoid yielding any characters from 176 | # the generated text if they are part of a stop 177 | # sequence. 
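                # first_stop_position ends up holding the length of the
                # longest stop-sequence prefix that the text generated so far
                # ends with; that many trailing characters are held back until
                # we know whether they complete a full stop sequence.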
178 | first_stop_position = 0 179 | for s in stop_sequences: 180 | for i in range(min(len(s), remaining_length), 0, -1): 181 | if remaining_text.endswith(s[:i]): 182 | if i > first_stop_position: 183 | first_stop_position = i 184 | break 185 | 186 | token_end_position = 0 187 | for token in remaining_tokens: 188 | token_end_position += len(self.detokenize([token])) 189 | # Check if stop sequence is in the token 190 | if token_end_position >= ( 191 | remaining_length - first_stop_position - 1 192 | ): 193 | break 194 | logprobs_or_none: Optional[CompletionLogprobs] = None 195 | if logprobs is not None: 196 | token_str = self.detokenize([token]) 197 | text_offset = len(prompt) + len( 198 | self.detokenize(completion_tokens[:returned_tokens]) 199 | ) 200 | token_offset = len(prompt_tokens) + returned_tokens 201 | logits = self.model.scores[token_offset - 1, :].tolist() 202 | current_logprobs = self.logits_to_logprobs(logits) 203 | sorted_logprobs = list( 204 | sorted( 205 | zip(current_logprobs, range(len(current_logprobs))), 206 | reverse=True, 207 | ) 208 | ) 209 | top_logprob = { 210 | self.detokenize([i]): logprob 211 | for logprob, i in sorted_logprobs[:logprobs] 212 | } 213 | top_logprob[token_str] = current_logprobs[int(token)] 214 | logprobs_or_none = { 215 | "tokens": [self.detokenize([token])], 216 | "text_offset": [text_offset], 217 | "token_logprobs": [sorted_logprobs[int(token)][0]], 218 | "top_logprobs": [top_logprob], 219 | } 220 | returned_tokens += 1 221 | yield { 222 | "id": completion_id, 223 | "object": "text_completion", 224 | "created": created, 225 | "model": model_name, 226 | "choices": [ 227 | { 228 | "text": self.detokenize([token]), 229 | "index": 0, 230 | "logprobs": logprobs_or_none, 231 | "finish_reason": None, 232 | } 233 | ], 234 | } 235 | 236 | if len(completion_tokens) >= max_tokens: 237 | text = self.detokenize(completion_tokens) 238 | finish_reason = "length" 239 | break 240 | 241 | if stream: 242 | remaining_tokens = completion_tokens[returned_tokens:] 243 | all_text = self.detokenize(remaining_tokens) 244 | any_stop = [s for s in stop_sequences if s in all_text] 245 | if len(any_stop) > 0: 246 | end = min(all_text.index(stop) for stop in any_stop) 247 | else: 248 | end = len(all_text) 249 | 250 | token_end_position = 0 251 | for token in remaining_tokens: 252 | token_end_position += len(self.detokenize([token])) 253 | 254 | logprobs_or_none: Optional[CompletionLogprobs] = None 255 | if logprobs is not None: 256 | token_str = self.detokenize([token]) 257 | text_offset = len(prompt) + len( 258 | self.detokenize(completion_tokens[:returned_tokens]) 259 | ) 260 | token_offset = len(prompt_tokens) + returned_tokens - 1 261 | logits = self.model.scores[token_offset, :].tolist() 262 | current_logprobs = self.logits_to_logprobs(logits) 263 | sorted_logprobs = list( 264 | sorted( 265 | zip(current_logprobs, range(len(current_logprobs))), 266 | reverse=True, 267 | ) 268 | ) 269 | top_logprob = { 270 | self.detokenize([i]): logprob 271 | for logprob, i in sorted_logprobs[:logprobs] 272 | } 273 | top_logprob[token_str] = current_logprobs[int(token)] 274 | logprobs_or_none = { 275 | "tokens": [self.detokenize([token])], 276 | "text_offset": [text_offset], 277 | "token_logprobs": [sorted_logprobs[int(token)][0]], 278 | "top_logprobs": [top_logprob], 279 | } 280 | 281 | if token_end_position >= end: 282 | last_text = self.detokenize([token]) 283 | if token_end_position == end - 1: 284 | break 285 | returned_tokens += 1 286 | yield { 287 | "id": completion_id, 288 | 
"object": "text_completion", 289 | "created": created, 290 | "model": model_name, 291 | "choices": [ 292 | { 293 | "text": last_text[ 294 | : len(last_text) - (token_end_position - end) 295 | ], 296 | "index": 0, 297 | "logprobs": logprobs_or_none, 298 | "finish_reason": finish_reason, 299 | } 300 | ], 301 | } 302 | break 303 | returned_tokens += 1 304 | yield { 305 | "id": completion_id, 306 | "object": "text_completion", 307 | "created": created, 308 | "model": model_name, 309 | "choices": [ 310 | { 311 | "text": self.detokenize([token]), 312 | "index": 0, 313 | "logprobs": logprobs_or_none, 314 | "finish_reason": finish_reason 315 | if returned_tokens == len(completion_tokens) 316 | else None, 317 | } 318 | ], 319 | } 320 | return 321 | 322 | text_str = text 323 | 324 | if echo: 325 | text_str = prompt + text_str 326 | 327 | if suffix is not None: 328 | text_str = text_str + suffix 329 | 330 | logprobs_or_none: Optional[CompletionLogprobs] = None 331 | if logprobs is not None: 332 | text_offset = 0 if echo else len(prompt) 333 | token_offset = 0 if echo else len(prompt_tokens[1:]) 334 | text_offsets: List[int] = [] 335 | token_logprobs: List[Optional[float]] = [] 336 | tokens: List[str] = [] 337 | top_logprobs: List[Optional[Dict[str, float]]] = [] 338 | 339 | if echo: 340 | # Remove leading BOS token 341 | all_tokens = prompt_tokens[1:] + completion_tokens 342 | else: 343 | all_tokens = completion_tokens 344 | 345 | all_token_strs = [self.detokenize([token]) for token in all_tokens] 346 | all_logprobs = [ 347 | self.logits_to_logprobs(row.tolist()) for row in self.model.scores 348 | ][token_offset:] 349 | for token, token_str, logprobs_token in zip( 350 | all_tokens, all_token_strs, all_logprobs 351 | ): 352 | text_offsets.append(text_offset) 353 | text_offset += len(token_str) 354 | tokens.append(token_str) 355 | sorted_logprobs = list( 356 | sorted( 357 | zip(logprobs_token, range(len(logprobs_token))), reverse=True 358 | ) 359 | ) 360 | token_logprobs.append(sorted_logprobs[int(token)][0]) 361 | top_logprob: Optional[Dict[str, float]] = { 362 | self.detokenize([i]): logprob 363 | for logprob, i in sorted_logprobs[:logprobs] 364 | } 365 | top_logprob.update({token_str: logprobs_token[int(token)]}) 366 | top_logprobs.append(top_logprob) 367 | # Weird idosincracy of the OpenAI API where 368 | # token_logprobs and top_logprobs are null for 369 | # the first token. 
370 | if echo and len(all_tokens) > 0: 371 | token_logprobs[0] = None 372 | top_logprobs[0] = None 373 | logprobs_or_none = { 374 | "tokens": tokens, 375 | "text_offset": text_offsets, 376 | "token_logprobs": token_logprobs, 377 | "top_logprobs": top_logprobs, 378 | } 379 | 380 | yield { 381 | "id": completion_id, 382 | "object": "text_completion", 383 | "created": created, 384 | "model": model_name, 385 | "choices": [ 386 | { 387 | "text": text_str, 388 | "index": 0, 389 | "logprobs": logprobs_or_none, 390 | "finish_reason": finish_reason, 391 | } 392 | ], 393 | "usage": { 394 | "prompt_tokens": len(prompt_tokens), 395 | "completion_tokens": len(completion_tokens), 396 | "total_tokens": len(prompt_tokens) + len(completion_tokens), 397 | }, 398 | } 399 | 400 | def create_completion( 401 | self, 402 | prompt: str, 403 | suffix: Optional[str] = None, 404 | max_tokens: int = 128, 405 | temperature: float = 0.8, 406 | top_p: float = 0.95, 407 | logprobs: Optional[int] = None, 408 | echo: bool = False, 409 | stop: Optional[Union[str, List[str]]] = [], 410 | frequency_penalty: float = 0.0, 411 | presence_penalty: float = 0.0, 412 | stream: bool = False, 413 | model: Optional[str] = None, 414 | ) -> Union[Completion, Iterator[CompletionChunk]]: 415 | """Generate text from a prompt. 416 | 417 | Args: 418 | prompt: The prompt to generate text from. 419 | suffix: A suffix to append to the generated text. If None, no suffix is appended. 420 | max_tokens: The maximum number of tokens to generate. 421 | temperature: The temperature to use for sampling. 422 | top_p: The top-p value to use for sampling. 423 | logprobs: The number of logprobs to return. If None, no logprobs are returned. 424 | echo: Whether to echo the prompt. 425 | stop: A list of strings to stop generation when encountered. 426 | repeat_penalty: The penalty to apply to repeated tokens. 427 | top_k: The top-k value to use for sampling. 428 | stream: Whether to stream the results. 429 | 430 | Raises: 431 | ValueError: If the requested tokens exceed the context window. 432 | RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt. 433 | 434 | Returns: 435 | Response object containing the generated text. 436 | """ 437 | completion_or_chunks = self._create_completion( 438 | prompt=prompt, 439 | suffix=suffix, 440 | max_tokens=max_tokens, 441 | temperature=temperature, 442 | top_p=top_p, 443 | logprobs=logprobs, 444 | echo=echo, 445 | stop=stop, 446 | frequency_penalty=frequency_penalty, 447 | presence_penalty=presence_penalty, 448 | stream=stream, 449 | model=model, 450 | ) 451 | if stream: 452 | chunks: Iterator[CompletionChunk] = completion_or_chunks 453 | return chunks 454 | completion: Completion = next(completion_or_chunks) # type: ignore 455 | return completion 456 | 457 | def eos_token(self): 458 | return self.model.eos_token() 459 | 460 | def logits_to_logprobs( 461 | self, logits: npt.NDArray[np.float32] 462 | ) -> npt.NDArray[np.float32]: 463 | return np.exp(logits) / (np.sum(np.exp(logits))) # type: ignore 464 | 465 | 466 | class Settings(BaseSettings): 467 | model_file: str 468 | n_gpu_layers: int = 32 469 | n_batch: int = 2048 470 | n_threads: int = max(multiprocessing.cpu_count() // 2, 1) 471 | sentencepiece_model: Optional[str] = None 472 | 473 | 474 | class CreateCompletionRequest(BaseModel): 475 | prompt: Union[str, List[str]] = Field( 476 | default="", description="The prompt to generate completions for." 
477 | ) 478 | suffix: Optional[str] = Field( 479 | default=None, 480 | description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots.", 481 | ) 482 | max_tokens: int = Field( 483 | default=16, 484 | ge=1, 485 | le=2048, 486 | description="The maximum number of tokens to generate.", 487 | ) 488 | temperature: float = Field( 489 | default=0.8, 490 | ge=0.0, 491 | le=2.0, 492 | description="Adjust the randomness of the generated text.\n\n" 493 | + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.", 494 | ) 495 | top_p: float = Field( 496 | default=0.95, 497 | ge=0.0, 498 | le=1.0, 499 | description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" 500 | + "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text.", 501 | ) 502 | echo: bool = Field( 503 | default=False, 504 | description="Whether to echo the prompt in the generated text. Useful for chatbots.", 505 | ) 506 | stop: Optional[Union[str, List[str]]] = Field( 507 | default=None, 508 | description="A list of tokens at which to stop generation. If None, no stop tokens are used.", 509 | ) 510 | stream: bool = Field( 511 | default=False, 512 | description="Whether to stream the results as they are generated. Useful for chatbots.", 513 | ) 514 | logprobs: Optional[int] = Field( 515 | default=None, 516 | ge=0, 517 | description="The number of logprobs to generate. If None, no logprobs are generated.", 518 | ) 519 | presence_penalty: Optional[float] = Field( 520 | default=0.0, 521 | ge=-2.0, 522 | le=2.0, 523 | description="Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.", 524 | ) 525 | frequency_penalty: Optional[float] = Field( 526 | default=0.0, 527 | ge=-2.0, 528 | le=2.0, 529 | description="Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.", 530 | ) 531 | logprobs: Optional[int] = Field(None) 532 | 533 | # ignored or currently unsupported 534 | model: Optional[str] = Field( 535 | description="The model to use for generating completions." 
536 | ) 537 | n: Optional[int] = 1 538 | best_of: Optional[int] = 1 539 | logit_bias: Optional[Dict[str, float]] = Field(None) 540 | user: Optional[str] = Field(None) 541 | 542 | class Config: 543 | schema_extra = { 544 | "example": { 545 | "prompt": "def fib(n):", 546 | "stop": ["\n\n"], 547 | "temperature": 0, 548 | "max_tokens": 34, 549 | } 550 | } 551 | 552 | 553 | settings = Settings(model_file=environ.get("MODEL")) # type: ignore 554 | app = FastAPI( 555 | title="Code Completion API", 556 | description=""" 557 | ## Editor Setup 558 | 559 | ### VSCode 560 | 561 | Add the following to your `settings.json`: 562 | 563 | ```json 564 | { 565 | "github.copilot.advanced": { 566 | "debug.testOverrideProxyUrl": "http://localhost:8000", 567 | "debug.overrideProxyUrl": "http://localhost:8000" 568 | } 569 | } 570 | ``` 571 | 572 | ### Vim / Neovim 573 | 574 | Add the following to your vimrc or init.vim: 575 | 576 | ``` 577 | let g:copilot_proxy = 'localhost:8000' 578 | let g:copilot_strict_ssl = 0 579 | ``` 580 | """, 581 | ) 582 | outer_lock = Lock() 583 | inner_lock = Lock() 584 | 585 | tokenizer = ( 586 | ReplitSentencepieceTokenizer(settings.sentencepiece_model) 587 | if settings.sentencepiece_model 588 | else None 589 | ) 590 | 591 | 592 | def cancel_callback(): 593 | return outer_lock.locked() 594 | 595 | 596 | model = OpenAIify( 597 | ReplitModel.init_from_file( 598 | model_file=settings.model_file, 599 | n_gpu_layers=settings.n_gpu_layers, 600 | tokenizer=tokenizer, 601 | cancel_callback=cancel_callback, 602 | ), 603 | # check if any other requests are pending in the same thread and cancel the stream if so 604 | cancel_callback=cancel_callback, 605 | ) 606 | 607 | 608 | def get_model(): 609 | # NOTE: This double lock allows the currently streaming model to check 610 | # if any other requests are pending in the same thread and cancel the 611 | # stream if so. 
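    # The sequence is: take the outer lock (new requests queue here), then
    # the inner lock (held for the whole generation), then release the outer
    # lock so the next request can register itself. cancel_callback() reports
    # outer_lock.locked(), i.e. whether another request is already waiting.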
612 | outer_lock.acquire() 613 | release_outer_lock = True 614 | try: 615 | inner_lock.acquire() 616 | try: 617 | outer_lock.release() 618 | release_outer_lock = False 619 | yield model 620 | finally: 621 | inner_lock.release() 622 | finally: 623 | if release_outer_lock: 624 | outer_lock.release() 625 | 626 | 627 | # Used to support copilot.vim 628 | @app.get("/copilot_internal/v2/token") 629 | def get_copilot_token(): 630 | content = {"token": "1", "expires_at": 2600000000, "refresh_in": 900} 631 | return dict(status_code=200, content=content) 632 | 633 | 634 | CreateCompletionResponse = create_model_from_typeddict(Completion) 635 | 636 | 637 | # Used to support copilot.vim 638 | @app.post( 639 | "/v1/engines/copilot-codex/completions", 640 | # response_model=CreateCompletionResponse, 641 | ) 642 | @app.post( 643 | "/v1/completions", 644 | # response_model=CreateCompletionResponse, 645 | ) 646 | async def create_completion( 647 | request: Request, 648 | body: CreateCompletionRequest, 649 | model: ReplitModel = Depends(get_model), 650 | ): 651 | if isinstance(body.prompt, list): 652 | assert len(body.prompt) <= 1 653 | body.prompt = body.prompt[0] if len(body.prompt) > 0 else "" 654 | 655 | exclude = { 656 | "n", 657 | "best_of", 658 | "logit_bias", 659 | "user", 660 | } 661 | kwargs = body.dict(exclude=exclude) 662 | if body.stream: 663 | send_chan, recv_chan = anyio.create_memory_object_stream(10) 664 | 665 | async def event_publisher( 666 | inner_send_chan: MemoryObjectSendStream[Dict[str, Union[str, bool]]] 667 | ): 668 | async with inner_send_chan: 669 | try: 670 | iterator: Iterator[CompletionChunk] = await run_in_threadpool(model.create_completion, **kwargs) # type: ignore 671 | async_iterator: AsyncIterator[ 672 | CompletionChunk 673 | ] = iterate_in_threadpool(iterator) 674 | async for chunk in async_iterator: 675 | await inner_send_chan.send(dict(data=json.dumps(chunk))) 676 | if await request.is_disconnected(): 677 | raise anyio.get_cancelled_exc_class()() 678 | await inner_send_chan.send(dict(data="[DONE]")) 679 | except anyio.get_cancelled_exc_class() as e: 680 | print("disconnected") 681 | with anyio.move_on_after(1, shield=True): 682 | print( 683 | f"Disconnected from client (via refresh/close) {request.client}" 684 | ) 685 | await inner_send_chan.send(dict(closing=True)) 686 | raise e 687 | 688 | return EventSourceResponse( 689 | recv_chan, data_sender_callable=partial(event_publisher, send_chan) 690 | ) 691 | else: 692 | completion: Completion = await run_in_threadpool(model.create_completion, **kwargs) # type: ignore 693 | return completion 694 | -------------------------------------------------------------------------------- /examples/replit/main.py: -------------------------------------------------------------------------------- 1 | """ggml-python implemention of the Replit code model 2 | 3 | Model is available at: 4 | https://huggingface.co/replit/replit-code-v1-3b 5 | 6 | This implementation is based on the example model code and ggml model file format from: 7 | https://github.com/ggerganov/ggml/tree/master/examples/replit 8 | """ 9 | from __future__ import annotations 10 | import abc 11 | import math 12 | import struct 13 | import ctypes 14 | import argparse 15 | import multiprocessing 16 | from collections import deque 17 | 18 | from typing import ( 19 | Callable, 20 | Deque, 21 | Iterator, 22 | List, 23 | Optional, 24 | Sequence, 25 | Tuple, 26 | Dict, 27 | Union, 28 | ) 29 | 30 | import numpy as np 31 | import numpy.typing as npt 32 | 33 | import ggml 34 | 35 | 
from ggml.utils import to_numpy 36 | 37 | from contextlib import ExitStack 38 | 39 | 40 | class ReplitAbortException(Exception): 41 | pass 42 | 43 | ## Generic Sampling Functions 44 | 45 | 46 | def sample( 47 | logits: npt.NDArray[np.float32], 48 | last_tokens: Optional[List[int]] = None, 49 | presence_penalty: float = 0.0, 50 | frequency_penalty: float = 0.0, 51 | temperature: float = 1.0, 52 | top_p: float = 0.0, 53 | ) -> int: 54 | if last_tokens is None: 55 | last_tokens = [] 56 | if temperature == 0.0: 57 | return int(np.argmax(logits)) 58 | logits = frequency_and_presence_penalties( 59 | logits, last_tokens, frequency_penalty, presence_penalty 60 | ) 61 | return nucleus_sampling(logits, top_p=top_p, temperature=temperature) 62 | 63 | 64 | # TODO: this is likely incorrect 65 | def frequency_and_presence_penalties( 66 | logits: npt.NDArray[np.float32], 67 | last_tokens: Sequence[int], 68 | alpha_frequency: float, 69 | alpha_presence: float, 70 | ): 71 | if len(last_tokens) == 0: 72 | return logits 73 | 74 | if alpha_frequency == 0.0 and alpha_presence == 0.0: 75 | return logits 76 | 77 | # Calculate the frequency penalty contribution 78 | frequency_penalty = alpha_frequency * np.log(np.unique(last_tokens).size + 1) 79 | 80 | # Calculate the presence penalty contribution 81 | presence_penalty = alpha_presence * np.array( 82 | [float(token in last_tokens) for token in range(len(logits))] 83 | ) 84 | 85 | # Apply penalties to the logits 86 | penalized_logits = logits - frequency_penalty - presence_penalty 87 | 88 | return penalized_logits 89 | 90 | 91 | def nucleus_sampling( 92 | logits: npt.NDArray[np.float32], top_p: float, temperature: float = 1.0 93 | ): 94 | # Apply temperature to logits 95 | logits /= temperature 96 | 97 | # Subtract the maximum value for numerical stability 98 | logits -= logits.max() # type: ignore 99 | 100 | # Calculate probabilities using softmax function with epsilon 101 | epsilon = 1e-8 102 | probabilities = np.exp(logits) / ((np.exp(logits)).sum() + epsilon) # type: ignore 103 | 104 | # Filter out NaN values from probabilities 105 | probabilities = np.nan_to_num(probabilities) 106 | 107 | # Sort the probabilities in descending order and get the corresponding indices 108 | sorted_indices = np.argsort(probabilities)[::-1] 109 | 110 | # Select the indices within the nucleus 111 | nucleus_indices = sorted_indices[: int(len(sorted_indices) * top_p)] 112 | 113 | # Calculate the updated probabilities within the nucleus 114 | nucleus_probabilities = probabilities[nucleus_indices] 115 | 116 | # Normalize the probabilities within the nucleus 117 | nucleus_probabilities /= nucleus_probabilities.sum() # type: ignore 118 | 119 | # Sample from the updated probabilities 120 | selected_token = np.random.choice(nucleus_indices, p=nucleus_probabilities) 121 | 122 | return selected_token 123 | 124 | 125 | ### Context Buffer 126 | 127 | 128 | class ContextBuffer(abc.ABC): 129 | @abc.abstractmethod 130 | def resize(self, new_size: int) -> None: 131 | raise NotImplementedError 132 | 133 | @property 134 | @abc.abstractmethod 135 | def buffer(self) -> ctypes.c_void_p: 136 | raise NotImplementedError 137 | 138 | 139 | class CpuContextBuffer(ContextBuffer): 140 | def __init__(self, buffer_size: int = 256 * 1024 * 1024): 141 | self.buffer_size = buffer_size 142 | self._buffer = (ctypes.c_uint8 * self.buffer_size)() 143 | 144 | def resize(self, new_size: int): 145 | assert new_size > self.buffer_size 146 | 147 | self.buffer_size = new_size 148 | ctypes.resize(self._buffer, 
self.buffer_size) 149 | 150 | @property 151 | def buffer(self) -> ctypes.c_void_p: 152 | return ctypes.c_void_p(ctypes.addressof(self._buffer)) 153 | 154 | 155 | ### Tokenizer 156 | 157 | 158 | class Tokenizer(abc.ABC): 159 | @abc.abstractmethod 160 | def tokenize(self, text: str) -> List[int]: 161 | raise NotImplementedError 162 | 163 | @abc.abstractmethod 164 | def detokenize(self, tokens: List[int]) -> str: 165 | raise NotImplementedError 166 | 167 | 168 | class ReplitTokenizer(Tokenizer): 169 | def __init__(self, vocab: List[Tuple[int, str, float]]): 170 | self.vocab = vocab 171 | self.piece_map = {piece: (i, -score) for i, piece, score in self.vocab} 172 | self.ws_symbol = b"\342\226\201" 173 | 174 | def tokenize(self, text: str) -> List[int]: 175 | normalized_text = text.replace(" ", self.ws_symbol.decode("utf-8")) 176 | tokenized, _ = ReplitTokenizer.encode_word(normalized_text, self.piece_map) 177 | return tokenized 178 | 179 | def detokenize(self, tokens: List[int]) -> str: 180 | text = "".join(self.vocab[token][1] for token in tokens) 181 | detokenized = text.replace(self.ws_symbol.decode("utf-8"), " ") 182 | return detokenized 183 | 184 | @staticmethod 185 | def encode_word( 186 | word: str, model: Dict[str, Tuple[int, float]] 187 | ) -> Tuple[List[int], float]: 188 | len_word = len(word) 189 | best_segmentation_starts = [-1] * (len_word + 1) 190 | best_segmentation_scores = [math.inf] * (len_word + 1) 191 | best_segmentation_starts[0], best_segmentation_scores[0] = 0, 0.0 192 | 193 | for idx in range(len_word): 194 | if best_segmentation_starts[idx] != -1: 195 | end_idx = idx + 1 196 | while end_idx <= len_word: 197 | token = word[idx:end_idx] 198 | if token in model: 199 | token_score = model[token][1] 200 | if ( 201 | best_segmentation_scores[idx] + token_score 202 | < best_segmentation_scores[end_idx] 203 | ): 204 | best_segmentation_starts[end_idx] = idx 205 | best_segmentation_scores[end_idx] = ( 206 | best_segmentation_scores[idx] + token_score 207 | ) 208 | end_idx += 1 209 | 210 | if best_segmentation_scores[-1] == math.inf: 211 | return [], 0.0 212 | 213 | tokens: Deque[int] = deque() 214 | idx = len_word 215 | while idx > 0: 216 | start_idx = best_segmentation_starts[idx] 217 | token = word[start_idx:idx] 218 | token_id = model[token][0] 219 | tokens.appendleft(token_id) 220 | idx = start_idx 221 | 222 | return list(tokens), best_segmentation_scores[-1] 223 | 224 | 225 | class ReplitSentencepieceTokenizer(Tokenizer): 226 | def __init__(self, model_path: str): 227 | import sentencepiece 228 | 229 | self.tokenizer = sentencepiece.SentencePieceProcessor(model_file=model_path) 230 | 231 | def tokenize(self, text: str) -> List[int]: 232 | return self.tokenizer.encode(text) 233 | 234 | def detokenize(self, tokens: List[int]) -> str: 235 | return self.tokenizer.decode(tokens) 236 | 237 | 238 | ### Replit Model Definition 239 | 240 | 241 | class ReplitLayer: 242 | def __init__(self, wtype: int, n_embd: int, ctx: ggml.ggml_context_p): 243 | self.norm_1_weight = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, n_embd) 244 | self.c_attn_wqkv_weight = ggml.ggml_new_tensor_2d( 245 | ctx, wtype, n_embd, 3 * n_embd 246 | ) 247 | self.c_attn_out_proj_weight = ggml.ggml_new_tensor_2d( 248 | ctx, wtype, n_embd, n_embd 249 | ) 250 | self.norm_2_weight = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, n_embd) 251 | self.c_ffn_up_proj_weight = ggml.ggml_new_tensor_2d( 252 | ctx, wtype, n_embd, 4 * n_embd 253 | ) 254 | self.c_ffn_down_proj_weight = ggml.ggml_new_tensor_2d( 255 | ctx, 
wtype, 4 * n_embd, n_embd 256 | ) 257 | 258 | 259 | class ReplitModel: 260 | def __init__( 261 | self, 262 | d_model: int, 263 | max_seq_len: int, 264 | n_heads: int, 265 | n_layers: int, 266 | vocab_size: int, 267 | ftype: int, 268 | vocab: List[Tuple[int, str, float]], 269 | tokenizer: Tokenizer, 270 | n_batch: int, 271 | n_threads: int, 272 | weights_buffer: ContextBuffer, 273 | ctx: ggml.ggml_context_p, 274 | cancel_callback: Optional[Callable[[], bool]] = None, 275 | ): 276 | self.d_model = d_model 277 | self.max_seq_len = max_seq_len 278 | self.n_heads = n_heads 279 | self.n_layers = n_layers 280 | self.vocab_size = vocab_size 281 | self.ftype = ftype 282 | self.ctx = ctx 283 | self.layers: List[ReplitLayer] = [] 284 | self.tensors: Dict[str, ggml.ggml_tensor_p] = {} 285 | self.vocab = vocab 286 | self.tokenizer = tokenizer 287 | self.n_batch = n_batch 288 | self.n_threads = n_threads 289 | self.weights_buffer = weights_buffer 290 | self.cancel_callback = cancel_callback 291 | 292 | n_layer = self.n_layers 293 | n_embd = self.d_model 294 | n_ctx = self.max_seq_len 295 | n_vocab = self.vocab_size 296 | wtype = ggml.ggml_ftype_to_ggml_type(ftype) 297 | 298 | n_mem = n_layer * n_ctx 299 | n_elements = n_embd * n_mem 300 | 301 | self.memory_k = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F16, n_elements) 302 | self.memory_v = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F16, n_elements) 303 | 304 | self.wte_weight = ggml.ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab) 305 | self.norm_f_weight = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, n_embd) 306 | self.tensors["transformer.wte.weight"] = self.wte_weight 307 | self.tensors["transformer.norm_f.weight"] = self.norm_f_weight 308 | 309 | self.mem_per_token = 0 310 | self.eval_buffer = CpuContextBuffer() 311 | 312 | for i in range(n_layer): 313 | layer = ReplitLayer( 314 | wtype=wtype, 315 | n_embd=n_embd, 316 | ctx=ctx, 317 | ) 318 | self.layers.append(layer) 319 | 320 | self.tensors[f"transformer.blocks.{i}.norm_1.weight"] = layer.norm_1_weight 321 | self.tensors[ 322 | f"transformer.blocks.{i}.attn.Wqkv.weight" 323 | ] = layer.c_attn_wqkv_weight 324 | self.tensors[ 325 | f"transformer.blocks.{i}.attn.out_proj.weight" 326 | ] = layer.c_attn_out_proj_weight 327 | self.tensors[f"transformer.blocks.{i}.norm_2.weight"] = layer.norm_2_weight 328 | self.tensors[ 329 | f"transformer.blocks.{i}.ffn.up_proj.weight" 330 | ] = layer.c_ffn_up_proj_weight 331 | self.tensors[ 332 | f"transformer.blocks.{i}.ffn.down_proj.weight" 333 | ] = layer.c_ffn_down_proj_weight 334 | 335 | self.n_tokens = 0 336 | self.input_ids: npt.NDArray[np.intc] = np.ndarray( 337 | (self.max_seq_len), dtype=np.intc 338 | ) 339 | self.scores: npt.NDArray[np.single] = np.ndarray( 340 | (self.max_seq_len, n_vocab), dtype=np.single 341 | ) 342 | 343 | def __del__(self): 344 | ggml.ggml_free(self.ctx) 345 | 346 | @staticmethod 347 | def encode_word( 348 | word: str, model: Dict[str, Tuple[int, float]] 349 | ) -> Tuple[List[int], float]: 350 | len_word = len(word) 351 | best_segmentation_starts = [-1] * (len_word + 1) 352 | best_segmentation_scores = [math.inf] * (len_word + 1) 353 | best_segmentation_starts[0], best_segmentation_scores[0] = 0, 0.0 354 | 355 | for idx in range(len_word): 356 | if best_segmentation_starts[idx] != -1: 357 | end_idx = idx + 1 358 | while end_idx <= len_word: 359 | token = word[idx:end_idx] 360 | if token in model: 361 | token_score = model[token][1] 362 | if ( 363 | best_segmentation_scores[idx] + token_score 364 | < 
best_segmentation_scores[end_idx] 365 | ): 366 | best_segmentation_starts[end_idx] = idx 367 | best_segmentation_scores[end_idx] = ( 368 | best_segmentation_scores[idx] + token_score 369 | ) 370 | end_idx += 1 371 | 372 | if best_segmentation_scores[-1] == math.inf: 373 | return [], 0.0 374 | 375 | tokens: Deque[int] = deque() 376 | idx = len_word 377 | while idx > 0: 378 | start_idx = best_segmentation_starts[idx] 379 | token = word[start_idx:idx] 380 | token_id = model[token][0] 381 | tokens.appendleft(token_id) 382 | idx = start_idx 383 | 384 | return list(tokens), best_segmentation_scores[-1] 385 | 386 | def tokenize(self, text: str) -> List[int]: 387 | return self.tokenizer.tokenize(text) 388 | 389 | def detokenize(self, tokens: List[int]) -> str: 390 | return self.tokenizer.detokenize(tokens) 391 | 392 | def reset(self): 393 | self.n_tokens = 0 394 | 395 | def _build_forward( 396 | self, 397 | ctx0: ggml.ggml_context_p, 398 | n_tokens: int, 399 | n_past: int, 400 | n_threads: int, 401 | ): 402 | N = n_tokens 403 | n_embd = self.d_model 404 | n_layer = self.n_layers 405 | n_ctx = self.max_seq_len 406 | n_head = self.n_heads 407 | 408 | gf = ggml.ggml_cgraph(n_threads=n_threads) 409 | 410 | embd = ggml.ggml_new_tensor_1d( 411 | ctx0, 412 | ggml.GGML_TYPE_I32, 413 | N, 414 | ) 415 | ggml.ggml_set_name(embd, b"embd") 416 | 417 | inpL = ggml.ggml_get_rows(ctx0, self.wte_weight, embd) 418 | 419 | for il in range(n_layer): 420 | # // lctx.use_buf(ctx0, 0) 421 | 422 | # // a = self.ln_1(x) 423 | cur = ggml.ggml_norm(ctx0, inpL, 1e-5) 424 | # offload_func(cur) 425 | ggml.ggml_set_name(cur, b"norm_0") 426 | cur = ggml.ggml_mul( 427 | ctx0, 428 | ggml.ggml_repeat(ctx0, self.layers[il].norm_1_weight, cur), 429 | cur, 430 | ) 431 | ggml.ggml_set_name(cur, b"attention_norm_0") 432 | 433 | # // self-attention 434 | # // b, _, past_key_value = self.attn(a, past_key_value=past_key_value, 435 | # // attn_bias=attn_bias, attention_mask=attention_mask, 436 | # // is_causal=is_causal) 437 | 438 | # // compute QKV 439 | cur = ggml.ggml_mul_mat(ctx0, self.layers[il].c_attn_wqkv_weight, cur) 440 | ggml.ggml_set_name(cur, b"tmpkqv") 441 | 442 | Qcur = ggml.ggml_view_2d( 443 | ctx0, 444 | cur, 445 | n_embd, 446 | N, 447 | cur.contents.nb[1], 448 | 0 * ctypes.sizeof(ctypes.c_float) * n_embd, 449 | ) 450 | ggml.ggml_set_name(Qcur, b"Qcur") 451 | Kcur = ggml.ggml_view_2d( 452 | ctx0, 453 | cur, 454 | n_embd, 455 | N, 456 | cur.contents.nb[1], 457 | 1 * ctypes.sizeof(ctypes.c_float) * n_embd, 458 | ) 459 | ggml.ggml_set_name(Kcur, b"Kcur") 460 | Vcur = ggml.ggml_view_2d( 461 | ctx0, 462 | cur, 463 | n_embd, 464 | N, 465 | cur.contents.nb[1], 466 | 2 * ctypes.sizeof(ctypes.c_float) * n_embd, 467 | ) 468 | ggml.ggml_set_name(Vcur, b"Vcur") 469 | 470 | # // store key and value to memory 471 | k = ggml.ggml_view_1d( 472 | ctx0, 473 | self.memory_k, 474 | N * n_embd, 475 | (ggml.ggml_element_size(self.memory_k) * n_embd) 476 | * (il * n_ctx + n_past), 477 | ) 478 | ggml.ggml_set_name(k, b"k") 479 | v = ggml.ggml_view_1d( 480 | ctx0, 481 | self.memory_v, 482 | N * n_embd, 483 | (ggml.ggml_element_size(self.memory_v) * n_embd) 484 | * (il * n_ctx + n_past), 485 | ) 486 | ggml.ggml_set_name(v, b"v") 487 | 488 | ggml.ggml_build_forward_expand( 489 | ctypes.pointer(gf), 490 | ggml.ggml_cpy( 491 | ctx0, 492 | Kcur, 493 | k, 494 | ), 495 | ) 496 | ggml.ggml_build_forward_expand( 497 | ctypes.pointer(gf), 498 | ggml.ggml_cpy( 499 | ctx0, 500 | Vcur, 501 | v, 502 | ), 503 | ) 504 | 505 | # // Q = 
Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 506 | # // 2, 1, 3) [64, N, 12] 507 | Q = ggml.ggml_permute( 508 | ctx0, 509 | ggml.ggml_cpy( 510 | ctx0, 511 | Qcur, 512 | ggml.ggml_new_tensor_3d( 513 | ctx0, 514 | ggml.GGML_TYPE_F32, 515 | n_embd // n_head, 516 | n_head, 517 | N, 518 | ), 519 | ), 520 | 0, 521 | 2, 522 | 1, 523 | 3, 524 | ) 525 | ggml.ggml_set_name(Q, b"Q") 526 | 527 | # // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 528 | # // 3) [64, n_past + N, 12] 529 | K = ggml.ggml_permute( 530 | ctx0, 531 | ggml.ggml_reshape_3d( 532 | ctx0, 533 | ggml.ggml_view_1d( 534 | ctx0, 535 | self.memory_k, 536 | (n_past + N) * n_embd, 537 | il * n_ctx * ggml.ggml_element_size(self.memory_k) * n_embd, 538 | ), 539 | n_embd // n_head, 540 | n_head, 541 | n_past + N, 542 | ), 543 | 0, 544 | 2, 545 | 1, 546 | 3, 547 | ) 548 | ggml.ggml_set_name(K, b"K") 549 | 550 | # // K * Q 551 | KQ = ggml.ggml_mul_mat(ctx0, K, Q) 552 | ggml.ggml_set_name(KQ, b"KQ") 553 | 554 | # // KQ_scaled = KQ / sqrt(n_embd/n_head) 555 | KQ_scaled = ggml.ggml_scale( 556 | ctx0, 557 | KQ, 558 | 1.0 / np.sqrt(float(n_embd) / n_head), 559 | ) 560 | ggml.ggml_set_name(KQ_scaled, b"KQ_scaled") 561 | 562 | KQ_scaled_alibi = ggml.ggml_alibi( 563 | ctx0, 564 | KQ_scaled, 565 | n_past, 566 | n_head, 567 | 8.0, 568 | ) 569 | ggml.ggml_set_name(KQ_scaled_alibi, b"KQ_scaled_alibi") 570 | 571 | # // KQ_masked = mask_past(KQ_scaled) 572 | KQ_masked = ggml.ggml_diag_mask_inf( 573 | ctx0, 574 | KQ_scaled_alibi, 575 | n_past, 576 | ) 577 | ggml.ggml_set_name(KQ_masked, b"KQ_masked") 578 | 579 | # // KQ = soft_max(KQ_masked) 580 | KQ_soft_max = ggml.ggml_soft_max( 581 | ctx0, 582 | KQ_masked, 583 | ) 584 | ggml.ggml_set_name(KQ_soft_max, b"KQ_soft_max") 585 | 586 | # // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 587 | # // 2, 0, 3).contiguous() [n_past + N, 64, 12] 588 | V_trans = ggml.ggml_cpy( 589 | ctx0, 590 | ggml.ggml_permute( 591 | ctx0, 592 | ggml.ggml_reshape_3d( 593 | ctx0, 594 | ggml.ggml_view_1d( 595 | ctx0, 596 | self.memory_v, 597 | (n_past + N) * n_embd, 598 | il * n_ctx * ggml.ggml_element_size(self.memory_v) * n_embd, 599 | ), 600 | n_embd // n_head, 601 | n_head, 602 | n_past + N, 603 | ), 604 | 1, 605 | 2, 606 | 0, 607 | 3, 608 | ), 609 | ggml.ggml_new_tensor_3d( 610 | ctx0, 611 | self.memory_v.contents.type, 612 | n_past + N, 613 | n_embd // n_head, 614 | n_head, 615 | ), 616 | ) 617 | # offload_func_v(V_trans) 618 | ggml.ggml_set_name(V_trans, b"V_trans") 619 | 620 | # // KQV = transpose(V) * KQ_soft_max 621 | KQV = ggml.ggml_mul_mat(ctx0, V_trans, KQ_soft_max) 622 | # offload_func_v(KQV) 623 | ggml.ggml_set_name(KQV, b"KQV") 624 | 625 | # // KQV_merged = KQV.permute(0, 2, 1, 3) 626 | KQV_merged = ggml.ggml_permute( 627 | ctx0, 628 | KQV, 629 | 0, 630 | 2, 631 | 1, 632 | 3, 633 | ) 634 | ggml.ggml_set_name(KQV_merged, b"KQV_merged") 635 | 636 | # // cur = KQV_merged.contiguous().view(n_embd, N) 637 | cur = ggml.ggml_cpy( 638 | ctx0, 639 | KQV_merged, 640 | ggml.ggml_new_tensor_2d( 641 | ctx0, 642 | ggml.GGML_TYPE_F32, 643 | n_embd, 644 | N, 645 | ), 646 | ) 647 | ggml.ggml_set_name(cur, b"KQV_merged_contiguous") 648 | 649 | # // projection 650 | cur = ggml.ggml_mul_mat( 651 | ctx0, 652 | self.layers[il].c_attn_out_proj_weight, 653 | cur, 654 | ) 655 | ggml.ggml_set_name(cur, b"result_wo") 656 | 657 | # // lctx.use_buf(ctx0, 1) 658 | 659 | inpL = ggml.ggml_add( 660 | ctx0, 661 | inpL, 662 | cur, 663 | ) 664 | ggml.ggml_set_name(cur, b"inpFF") 665 | 666 | # // m = 
self.ln_2(x) 667 | cur = ggml.ggml_norm(ctx0, inpL, 1e-5) 668 | ggml.ggml_set_name(cur, b"norm_1") 669 | cur = ggml.ggml_mul( 670 | ctx0, 671 | ggml.ggml_repeat(ctx0, self.layers[il].norm_2_weight, cur), 672 | cur, 673 | ) 674 | ggml.ggml_set_name(cur, b"norm") 675 | 676 | # // n = self.mlp(m) 677 | cur = ggml.ggml_mul_mat( 678 | ctx0, 679 | self.layers[il].c_ffn_up_proj_weight, 680 | cur, 681 | ) 682 | ggml.ggml_set_name(cur, b"result_mlp_up") 683 | 684 | # // GELU activation 685 | cur = ggml.ggml_gelu( 686 | ctx0, 687 | cur, 688 | ) 689 | ggml.ggml_set_name(cur, b"gelu") 690 | # // projection 691 | # // cur = proj_w*cur + proj_b 692 | cur = ggml.ggml_mul_mat( 693 | ctx0, 694 | self.layers[il].c_ffn_down_proj_weight, 695 | cur, 696 | ) 697 | ggml.ggml_set_name(cur, b"result_mlp_down") 698 | 699 | # // x = x + n 700 | inpL = ggml.ggml_add( 701 | ctx0, 702 | inpL, 703 | cur, 704 | ) 705 | ggml.ggml_set_name(cur, b"inpFF_+_result_mlp_down") 706 | 707 | # // lctx.use_buf(ctx0, 0) 708 | 709 | # // norm 710 | inpL = ggml.ggml_norm(ctx0, inpL, 1e-5) 711 | ggml.ggml_set_name(inpL, b"norm_f") 712 | 713 | # // inpL = ln_f_g*inpL 714 | inpL = ggml.ggml_mul( 715 | ctx0, 716 | ggml.ggml_repeat(ctx0, self.norm_f_weight, inpL), 717 | inpL, 718 | ) 719 | ggml.ggml_set_name(inpL, b"norm_f_mul") 720 | 721 | # // output embedding weight tied to input embedding 722 | inpL = ggml.ggml_mul_mat( 723 | ctx0, 724 | self.wte_weight, 725 | inpL, 726 | ) 727 | ggml.ggml_set_name(inpL, b"result_output") 728 | 729 | # // lctx.use_buf(ctx0, -1) 730 | 731 | ggml.ggml_build_forward_expand(ctypes.pointer(gf), inpL) 732 | 733 | return gf 734 | 735 | def _eval_internal(self, embd_inp: Sequence[int], n_past: int, n_threads: int): 736 | N = len(embd_inp) 737 | n_vocab = self.vocab_size 738 | required_buffer_size = int(self.mem_per_token * N * 2.0) 739 | if ( 740 | self.mem_per_token > 0 741 | and self.eval_buffer.buffer_size < required_buffer_size 742 | ): 743 | self.eval_buffer.resize(required_buffer_size) 744 | 745 | init_params = ggml.ggml_init_params( 746 | mem_size=self.eval_buffer.buffer_size, 747 | mem_buffer=self.eval_buffer.buffer, 748 | no_alloc=False, 749 | ) 750 | exit_stack = ExitStack() 751 | ctx0 = ggml.ggml_init(init_params) 752 | if ctx0 is None: 753 | raise RuntimeError("Failed to initialize GGML context") 754 | exit_stack.callback(ggml.ggml_free, self.ctx) 755 | gf = self._build_forward(ctx0, len(embd_inp), n_past, n_threads) 756 | embd = ggml.ggml_graph_get_tensor(ctypes.pointer(gf), b"embd") 757 | assert embd is not None 758 | inpL = ggml.ggml_graph_get_tensor(ctypes.pointer(gf), b"result_output") 759 | assert inpL is not None 760 | to_numpy(embd)[:] = np.array(embd_inp, dtype=np.int32) 761 | gp = ggml.ggml_graph_plan(ctypes.pointer(gf), self.n_threads) 762 | work_data = (ctypes.c_uint8 * gp.work_size)() 763 | gp.work_data = ctypes.cast(work_data, ctypes.POINTER(ctypes.c_uint8)) 764 | if self.cancel_callback is not None: 765 | 766 | @ggml.ggml_abort_callback 767 | def abort_callback(data: ctypes.c_void_p) -> Union[ctypes.c_bool, bool]: 768 | assert self.cancel_callback is not None 769 | return self.cancel_callback() 770 | 771 | self._abort_callback = abort_callback # NOTE: keep reference 772 | gp.abort_callback = abort_callback 773 | rc = ggml.ggml_graph_compute(ctypes.pointer(gf), ctypes.pointer(gp)) 774 | if rc != ggml.GGML_EXIT_SUCCESS: 775 | raise ReplitAbortException("Execution aborted") 776 | embd_w = to_numpy(inpL).reshape( 777 | -1, n_vocab 778 | ) # .copy() # NOTE: likely wrong to not copy here 
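        # Note: embd_w is a view into the shared eval buffer (ctx0 memory), not a copy.
        # This is only safe because eval() copies these rows into self.scores before the
        # next forward pass reuses the same buffer and overwrites the data.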
779 | if self.mem_per_token == 0: 780 | self.mem_per_token = int(ggml.ggml_used_mem(ctx0) / N) 781 | return embd_w 782 | 783 | def eval(self, tokens: Sequence[int]): 784 | if self.mem_per_token == 0: 785 | try: 786 | self._eval_internal([1, 2, 3, 4], n_past=0, n_threads=self.n_threads) 787 | except ReplitAbortException as e: 788 | self.n_tokens = 0 789 | raise e 790 | n_ctx = self.max_seq_len 791 | for i in range(0, len(tokens), self.n_batch): 792 | batch = tokens[i : min(len(tokens), i + self.n_batch)] 793 | n_past = min(n_ctx - len(batch), self.n_tokens) 794 | try: 795 | scores = self._eval_internal( 796 | batch, 797 | n_past, 798 | self.n_threads, 799 | ) 800 | # Save tokens 801 | self.input_ids[self.n_tokens : self.n_tokens + len(batch)] = batch 802 | # Save logits 803 | self.scores[self.n_tokens : self.n_tokens + len(batch), :] = scores 804 | # Update token count 805 | self.n_tokens += len(batch) 806 | except ReplitAbortException as e: 807 | self.n_tokens = n_past 808 | raise e 809 | return self.scores[: self.n_tokens, :] 810 | 811 | def generate( 812 | self, 813 | tokens: Sequence[int], 814 | top_p: float = 0.95, 815 | temperature: float = 0.80, 816 | frequency_penalty: float = 0.0, 817 | presence_penalty: float = 0.0, 818 | ) -> Iterator[int]: 819 | reset = True 820 | if self.n_tokens > 0: 821 | longest_prefix = 0 822 | for a, b in zip(self.input_ids[: self.n_tokens], tokens[:-1]): 823 | if a == b: 824 | longest_prefix += 1 825 | else: 826 | break 827 | if longest_prefix > 0: 828 | reset = False 829 | tokens = tokens[longest_prefix:] 830 | self.n_tokens = longest_prefix 831 | 832 | if reset: 833 | self.reset() 834 | 835 | while True: 836 | scores = self.eval(tokens) 837 | logits = scores[-1, :] 838 | token = sample( 839 | logits, 840 | top_p=top_p, 841 | temperature=temperature, 842 | frequency_penalty=frequency_penalty, 843 | presence_penalty=presence_penalty, 844 | ) 845 | yield token 846 | tokens = [token] 847 | 848 | @staticmethod 849 | def eos_token(): 850 | return 1 851 | 852 | @staticmethod 853 | def init_from_file( 854 | model_file: str, 855 | n_gpu_layers: int = 0, 856 | n_batch: int = 1, 857 | n_threads: int = 1, 858 | tokenizer: Optional[Tokenizer] = None, 859 | verbose: bool = True, 860 | cancel_callback: Optional[Callable[[], bool]] = None, 861 | ) -> ReplitModel: 862 | with open(model_file, "rb") as fin: 863 | # Magic Number 864 | (magic,) = struct.unpack("i", (fin.read(struct.calcsize("i")))) 865 | assert magic == ggml.GGML_FILE_MAGIC 866 | if verbose: 867 | print("magic number =", hex(magic)) 868 | # Hyperparameters 869 | d_model, max_seq_len, n_heads, n_layers, vocab_size, ftype = struct.unpack( 870 | "iiiiii", (fin.read(struct.calcsize("iiiiii"))) 871 | ) 872 | qntvr = ftype // ggml.GGML_QNT_VERSION_FACTOR 873 | if verbose: 874 | print("d_model =", d_model) 875 | print("max_seq_len =", max_seq_len) 876 | print("n_heads =", n_heads) 877 | print("n_layers =", n_layers) 878 | print("vocab_size =", vocab_size) 879 | print("ftype =", ftype) 880 | print("qntvr =", qntvr) 881 | ftype %= ggml.GGML_QNT_VERSION_FACTOR 882 | # Vocabulary 883 | vocab: List[Tuple[int, str, float]] = [] 884 | for i in range(vocab_size): 885 | (s_len,) = struct.unpack("i", (fin.read(struct.calcsize("i")))) 886 | s = fin.read(s_len).decode("utf-8") 887 | (score,) = struct.unpack("f", (fin.read(struct.calcsize("f")))) 888 | vocab.append((i, s, score)) 889 | # Model Weights 890 | wtype = ggml.ggml_ftype_to_ggml_type(ftype) 891 | 892 | n_embd = d_model 893 | n_layer = n_layers 894 | n_ctx = 
max_seq_len 895 | n_vocab = vocab_size 896 | 897 | ctx_size = ReplitModel.compute_ctx_size( 898 | n_embd=n_embd, 899 | n_layer=n_layer, 900 | n_ctx=n_ctx, 901 | n_vocab=n_vocab, 902 | wtype=wtype, 903 | ) 904 | 905 | if verbose: 906 | print("ctx size =", ctx_size // (1024 * 1024), "MB") 907 | 908 | # create context 909 | weights_buffer = CpuContextBuffer(ctx_size) 910 | init_params = ggml.ggml_init_params( 911 | mem_size=ctx_size, 912 | mem_buffer=weights_buffer.buffer, 913 | no_alloc=False, 914 | ) 915 | ctx = ggml.ggml_init(init_params) 916 | if ctx is None: 917 | raise RuntimeError("Failed to initialize GGML context") 918 | 919 | model = ReplitModel( 920 | # hyperparameters 921 | d_model=d_model, 922 | max_seq_len=max_seq_len, 923 | n_heads=n_heads, 924 | n_layers=n_layers, 925 | vocab_size=vocab_size, 926 | ftype=ftype, 927 | # vocabulary 928 | vocab=vocab, 929 | tokenizer=ReplitTokenizer(vocab) if tokenizer is None else tokenizer, 930 | ctx=ctx, 931 | n_batch=n_batch, 932 | n_threads=n_threads, 933 | weights_buffer=weights_buffer, 934 | # misc 935 | cancel_callback=cancel_callback, 936 | ) 937 | 938 | n_tensors = 0 939 | total_size = 0 940 | 941 | while True: 942 | nbytes = struct.calcsize("iii") 943 | data = fin.read(nbytes) 944 | if len(data) != nbytes: 945 | break 946 | n_dims, length, ttype = struct.unpack("iii", data) 947 | nelements = 1 948 | ne = [1, 1] 949 | for i in range(n_dims): 950 | (dim,) = struct.unpack("i", (fin.read(struct.calcsize("i")))) 951 | ne[i] = dim 952 | nelements *= ne[i] 953 | name = fin.read(length).decode("utf-8") 954 | if name not in model.tensors: 955 | raise ValueError(f"Tensor {name} not found in model") 956 | tensor = model.tensors[name] 957 | if ggml.ggml_nelements(tensor) != nelements: 958 | raise ValueError( 959 | f"Tensor {name} has {ggml.ggml_nelements(tensor)} elements, but {nelements} expected" 960 | ) 961 | if tensor.contents.ne[0] != ne[0] or tensor.contents.ne[1] != ne[1]: 962 | raise ValueError( 963 | f"Tensor {name} has {tensor.contents.ne[0]}x{tensor.contents.ne[1]} shape, but {ne[0]}x{ne[1]} expected" 964 | ) 965 | bpe = ggml.ggml_type_size(ttype) 966 | if ( 967 | (nelements * bpe) / ggml.ggml_blck_size(tensor.contents.type) 968 | ) != ggml.ggml_nbytes(tensor): 969 | raise ValueError( 970 | f"Tensor {name} has {ggml.ggml_nbytes(tensor)} bytes, but {(nelements * bpe) / ggml.ggml_blck_size(tensor.contents.type)} expected" 971 | ) 972 | tensor_data = ggml.ggml_get_data(tensor) 973 | if tensor_data is None: 974 | raise ValueError(f"Failed to get data for tensor {name}") 975 | fin.readinto( 976 | (ctypes.c_uint8 * ggml.ggml_nbytes(tensor)).from_address( 977 | tensor_data 978 | ) 979 | ) 980 | 981 | total_size += ggml.ggml_nbytes(tensor) 982 | if n_tensors % 8 == 0: 983 | print(".", end="", flush=True) 984 | n_tensors += 1 985 | print("done") 986 | print( 987 | "model size =", 988 | total_size // (1024 * 1024), 989 | "MB", 990 | "num tensors =", 991 | n_tensors, 992 | ) 993 | return model 994 | 995 | @staticmethod 996 | def compute_ctx_size( 997 | n_embd: int, 998 | n_layer: int, 999 | n_ctx: int, 1000 | n_vocab: int, 1001 | wtype: int, 1002 | ) -> int: 1003 | wtype_sizef = ggml.ggml_type_sizef(wtype) 1004 | f32_sizef = ggml.ggml_type_sizef(ggml.GGML_TYPE_F32) 1005 | f16_sizef = ggml.ggml_type_sizef(ggml.GGML_TYPE_F16) 1006 | 1007 | ctx_size = 0 1008 | ctx_size += n_embd * n_vocab * wtype_sizef 1009 | ctx_size += n_embd * f32_sizef 1010 | 1011 | ctx_size += n_layer * (n_embd * f32_sizef) 1012 | ctx_size += n_layer * (3 * n_embd * n_embd * 
wtype_sizef) 1013 | ctx_size += n_layer * (n_embd**2 * wtype_sizef) 1014 | ctx_size += n_layer * (n_embd * f32_sizef) 1015 | ctx_size += n_layer * (4 * n_embd * n_embd * wtype_sizef) 1016 | ctx_size += n_layer * (n_embd**2 * 4 * wtype_sizef) 1017 | 1018 | ctx_size += n_ctx * n_layer * n_embd * f16_sizef 1019 | ctx_size += n_ctx * n_layer * n_embd * f16_sizef 1020 | 1021 | ctx_size += (1 + 6 * n_layer) * 512 1022 | ctx_size = int(ctx_size) 1023 | return ctx_size 1024 | 1025 | 1026 | if __name__ == "__main__": 1027 | parser = argparse.ArgumentParser() 1028 | parser.add_argument("-m", "--model", type=str, default=None) 1029 | parser.add_argument("-p", "--prompt", type=str, default="def fib(n):") 1030 | parser.add_argument( 1031 | "--n_threads", type=int, default=max(1, multiprocessing.cpu_count() // 2) 1032 | ) 1033 | parser.add_argument("--n_batch", type=int, default=512) 1034 | parser.add_argument("--n_gpu_layers", type=int, default=0) 1035 | parser.add_argument("--max_tokens", type=int, default=32) 1036 | parser.add_argument("--temperature", type=float, default=1.0) 1037 | parser.add_argument("--top_p", type=float, default=1.0) 1038 | parser.add_argument("--presence_penalty", type=float, default=0.0) 1039 | parser.add_argument("--frequency_penalty", type=float, default=0.0) 1040 | args = parser.parse_args() 1041 | 1042 | model_file = args.model 1043 | n_threads = args.n_threads 1044 | n_batch = args.n_batch 1045 | n_gpu_layers = args.n_gpu_layers 1046 | max_tokens = args.max_tokens 1047 | temperature = args.temperature 1048 | top_p = args.top_p 1049 | presence_penalty = args.presence_penalty 1050 | frequency_penalty = args.frequency_penalty 1051 | 1052 | model = ReplitModel.init_from_file( 1053 | model_file, n_gpu_layers=n_gpu_layers, n_threads=n_threads, n_batch=n_batch 1054 | ) 1055 | 1056 | prompt = args.prompt 1057 | prompt_tokens = model.tokenize(prompt) 1058 | all_tokens: List[int] = prompt_tokens[:] # type: ignore 1059 | n_past = 0 1060 | tokens: List[int] = prompt_tokens[:] # type: ignore 1061 | 1062 | print("number of tokens in prompt =", len(prompt_tokens)) 1063 | for i, token_id in enumerate(prompt_tokens): 1064 | print(f"token[{i}] =", token_id) 1065 | 1066 | print() 1067 | print(prompt, end="", flush=True) 1068 | for _ in range(max_tokens): 1069 | # eval 1070 | scores = model.eval(tokens) 1071 | logits = scores[-1, :] 1072 | # sample 1073 | token_id = sample( 1074 | logits, 1075 | last_tokens=all_tokens, 1076 | temperature=temperature, 1077 | top_p=top_p, 1078 | presence_penalty=presence_penalty, 1079 | frequency_penalty=frequency_penalty, 1080 | ) 1081 | if token_id == model.eos_token(): 1082 | break 1083 | # update 1084 | all_tokens.append(token_id) 1085 | print(model.detokenize([token_id]), end="", flush=True) 1086 | n_past += len(tokens) 1087 | tokens = [token_id] 1088 | print() 1089 | -------------------------------------------------------------------------------- /examples/replit/requirements.txt: -------------------------------------------------------------------------------- 1 | ggml-python==0.0.7 2 | fastapi==0.109.1 3 | sse-starlette==1.6.1 4 | uvicorn==0.22.0 5 | -------------------------------------------------------------------------------- /examples/rpc/main.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import argparse 3 | import contextlib 4 | 5 | import ggml 6 | 7 | def main(): 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("--host", type=str, default="127.0.0.1") 10 | 
parser.add_argument("--port", type=int, default=9091) 11 | args = parser.parse_args() 12 | 13 | with contextlib.ExitStack() as stack: 14 | backend = ggml.ggml_backend_rpc_init(f"{args.host}:{args.port}".encode("utf-8")) 15 | assert backend is not None 16 | stack.callback(ggml.ggml_backend_free, backend) 17 | 18 | params = ggml.ggml_init_params( 19 | mem_size=ggml.ggml_tensor_overhead() * 6 + ggml.ggml_graph_overhead() + 10000, 20 | no_alloc=True, 21 | ) 22 | ctx = ggml.ggml_init(params) 23 | assert ctx is not None 24 | stack.callback(ggml.ggml_free, ctx) 25 | 26 | x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 27 | a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 28 | b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 29 | x2 = ggml.ggml_mul(ctx, x, x) 30 | f = ggml.ggml_add(ctx, ggml.ggml_mul(ctx, a, x2), b) 31 | gf = ggml.ggml_new_graph(ctx) 32 | 33 | ggml.ggml_build_forward_expand(gf, f) 34 | 35 | buffer = ggml.ggml_backend_alloc_ctx_tensors(ctx, backend) 36 | assert buffer is not None 37 | stack.callback(ggml.ggml_backend_buffer_free, buffer) 38 | 39 | x_data = (ctypes.c_float * 1)(2.0) 40 | ggml.ggml_backend_tensor_set( 41 | x, # tensor 42 | x_data, # data 43 | 0, # offset 44 | ctypes.sizeof(x_data), # size 45 | ) 46 | 47 | a_data = (ctypes.c_float * 1)(3.0) 48 | ggml.ggml_backend_tensor_set( 49 | a, # tensor 50 | a_data, # data 51 | 0, # offset 52 | ctypes.sizeof(a_data), # size 53 | ) 54 | 55 | b_data = (ctypes.c_float * 1)(4.0) 56 | ggml.ggml_backend_tensor_set( 57 | b, # tensor 58 | b_data, # data 59 | 0, # offset 60 | ctypes.sizeof(b_data), # size 61 | ) 62 | 63 | ggml.ggml_backend_graph_compute(backend, gf) 64 | 65 | output = ctypes.c_float() 66 | ggml.ggml_backend_tensor_get( 67 | f, # tensor 68 | ctypes.byref(output), # data 69 | 0, # offset 70 | ctypes.sizeof(output), # size 71 | ) 72 | 73 | print(f"Output: {output.value}") 74 | 75 | assert output.value == 16.0 76 | 77 | if __name__ == "__main__": 78 | main() -------------------------------------------------------------------------------- /examples/rpc/worker.py: -------------------------------------------------------------------------------- 1 | import ggml 2 | import argparse 3 | 4 | def main(): 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--host", type=str, default="127.0.0.1") 7 | parser.add_argument("--port", type=int, default=9091) 8 | parser.add_argument("--free_mem", type=int, default=1 << 30) 9 | parser.add_argument("--total_mem", type=int, default=1 << 30) 10 | parser.add_argument("--backend", type=str, default="cpu", choices=["cpu", "cuda", "metal"]) 11 | parser.add_argument("--backend-cuda-device", type=int, default=0) 12 | args = parser.parse_args() 13 | 14 | print(f"Starting worker on {args.host}:{args.port}") 15 | print(f"Free memory: {args.free_mem} bytes") 16 | print(f"Total memory: {args.total_mem} bytes") 17 | print(f"Backend: {args.backend}") 18 | 19 | if args.backend == "cpu": 20 | backend = ggml.ggml_backend_cpu_init() 21 | elif args.backend == "cuda": 22 | backend = ggml.ggml_backend_cuda_init(args.backend_cuda_device) 23 | elif args.backend == "metal": 24 | backend = ggml.ggml_backend_metal_init() 25 | else: 26 | raise ValueError(f"Unknown backend: {args.backend}") 27 | 28 | assert backend is not None, "Failed to initialize CPU backend" 29 | 30 | endpoints = "{}:{}".format(args.host, args.port).encode("utf-8") 31 | 32 | free_mem = args.free_mem 33 | total_mem = args.total_mem 34 | 35 | ggml.start_rpc_server(backend, endpoints, free_mem, total_mem) 36 | 37 | 38 | 
if __name__ == "__main__": 39 | main() -------------------------------------------------------------------------------- /ggml/__init__.py: -------------------------------------------------------------------------------- 1 | from .ggml import * 2 | 3 | __version__ = "0.0.37" -------------------------------------------------------------------------------- /ggml/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abetlen/ggml-python/90ffdc076df76f290227052b285e71a94f29f865/ggml/py.typed -------------------------------------------------------------------------------- /ggml/utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions for ggml-python. 2 | """ 3 | from __future__ import annotations 4 | 5 | import enum 6 | import ctypes 7 | import signal 8 | import platform 9 | import traceback 10 | 11 | from typing import Any, Optional, Sequence, Tuple 12 | 13 | from ggml import ggml 14 | 15 | import numpy as np 16 | import numpy.typing as npt 17 | 18 | 19 | class GGML_TYPE(enum.IntEnum): 20 | F32 = ggml.GGML_TYPE_F32 21 | F16 = ggml.GGML_TYPE_F16 22 | Q4_0 = ggml.GGML_TYPE_Q4_0 23 | Q4_1 = ggml.GGML_TYPE_Q4_1 24 | Q5_0 = ggml.GGML_TYPE_Q5_0 25 | Q5_1 = ggml.GGML_TYPE_Q5_1 26 | Q8_0 = ggml.GGML_TYPE_Q8_0 27 | Q8_1 = ggml.GGML_TYPE_Q8_1 28 | I8 = ggml.GGML_TYPE_I8 29 | I16 = ggml.GGML_TYPE_I16 30 | I32 = ggml.GGML_TYPE_I32 31 | 32 | 33 | NUMPY_DTYPE_TO_GGML_TYPE = { 34 | np.float16: GGML_TYPE.F16, 35 | np.float32: GGML_TYPE.F32, 36 | np.int8: GGML_TYPE.I8, 37 | np.int16: GGML_TYPE.I16, 38 | np.int32: GGML_TYPE.I32, 39 | } 40 | 41 | GGML_TYPE_TO_NUMPY_DTYPE = {v: k for k, v in NUMPY_DTYPE_TO_GGML_TYPE.items()} 42 | 43 | 44 | def to_numpy( 45 | tensor: ggml.ggml_tensor_p, 46 | shape: Optional[Tuple[int, ...]] = None, 47 | ) -> npt.NDArray[Any]: 48 | """Get the data of a ggml tensor as a numpy array. 49 | 50 | Parameters: 51 | tensor: ggml tensor 52 | 53 | Returns: 54 | Numpy array with a view of data from tensor 55 | """ 56 | ggml_type = GGML_TYPE(tensor.contents.type) 57 | if ggml_type == GGML_TYPE.F16: 58 | ctypes_type = ctypes.c_uint16 59 | else: 60 | ctypes_type = np.ctypeslib.as_ctypes_type(GGML_TYPE_TO_NUMPY_DTYPE[ggml_type]) 61 | 62 | data = ggml.ggml_get_data(tensor) 63 | if data is None: 64 | raise ValueError("tensor data is None") 65 | array = (ctypes_type * ggml.ggml_nelements(tensor)).from_address(data) 66 | n_dims = ggml.ggml_n_dims(tensor) 67 | shape_ = tuple(reversed(tensor.contents.ne[:n_dims])) 68 | strides = tuple(reversed(tensor.contents.nb[:n_dims])) 69 | output = np.ctypeslib.as_array(array) 70 | if ggml_type == GGML_TYPE.F16: 71 | output.dtype = np.float16 # type: ignore 72 | return np.lib.stride_tricks.as_strided( 73 | output, shape=shape if shape is not None else shape_, strides=strides 74 | ) 75 | 76 | 77 | def from_numpy(x: npt.NDArray[Any], ctx: ggml.ggml_context_p) -> ggml.ggml_tensor_p: 78 | """Create a new ggml tensor with data copied from a numpy array. 
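    Example (illustrative sketch; assumes a small float32 array and a CPU context
    created with ``ggml.ggml_init``):

        params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024)
        ctx = ggml.ggml_init(params)
        x = np.ones((2, 3), dtype=np.float32)
        t = from_numpy(x, ctx)  # copies x's data into a new ggml tensor
        assert to_numpy(t).shape == (2, 3)
        ggml.ggml_free(ctx)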
79 | 80 | Parameters: 81 | x: numpy array 82 | ctx: ggml context 83 | 84 | Returns: 85 | New ggml tensor with data copied from x 86 | """ 87 | ggml_type = NUMPY_DTYPE_TO_GGML_TYPE[x.dtype.type] 88 | shape = tuple(reversed(x.shape)) 89 | tensor = ggml.ggml_new_tensor( 90 | ctx, 91 | ggml_type.value, 92 | len(shape), 93 | (ctypes.c_int64 * len(shape))(*shape), 94 | ) 95 | tensor.contents.nb[: len(shape)] = (ctypes.c_int64 * len(shape))( 96 | *tuple(reversed(x.strides)) 97 | ) 98 | if ggml.ggml_get_data(tensor) is not None: 99 | to_numpy(tensor)[:] = x 100 | return tensor 101 | 102 | 103 | def copy_to_cpu( 104 | ctx: ggml.ggml_context_p, tensor: ggml.ggml_tensor_p 105 | ) -> ggml.ggml_tensor_p: 106 | """Copy a ggml tensor from a GPU backend to CPU. 107 | 108 | Parameters: 109 | ctx: ggml context 110 | tensor: ggml tensor 111 | 112 | Returns: 113 | New ggml tensor with data copied from tensor on CPU backend""" 114 | tmp = ggml.ggml_dup_tensor(ctx, tensor) 115 | to_numpy(tmp)[:] = 0 116 | return ggml.ggml_add_inplace(ctx, tmp, tensor) 117 | 118 | 119 | def quantize_0( 120 | data_f32: ggml.CtypesArray[ctypes.c_float], 121 | nelements: int, 122 | ne0: int, 123 | ttype: GGML_TYPE, 124 | work: Optional[ggml.CtypesArray[ctypes.c_float]] = None, 125 | imatrix: Optional[ggml.CtypesArray[ctypes.c_float]] = None, 126 | ): 127 | """Quantize a float32 array. 128 | 129 | Parameters: 130 | data_f32: float32 array 131 | nelements: number of elements in data_f32 132 | ne0: number of elements in data_f32 that are zero 133 | ttype: ggml type to quantize to 134 | work: work buffer 135 | imatrix: quantization matrix 136 | 137 | Returns: 138 | (work, cur_size): outpuut buffer, histogram, number of bytes in work buffer 139 | """ 140 | work = work or (ctypes.c_float * nelements)() 141 | cur_size = ggml.ggml_quantize_chunk( 142 | ttype, 143 | data_f32, 144 | ctypes.cast(work, ctypes.c_void_p), 145 | 0, 146 | nelements, 147 | ne0, 148 | imatrix, 149 | ) 150 | return ctypes.cast(work, ctypes.c_void_p), cur_size 151 | 152 | 153 | def quantize_row( 154 | data_f32: ggml.CtypesArray[ctypes.c_float], 155 | nelements: int, 156 | ttype: GGML_TYPE, 157 | work: Optional[ctypes.c_void_p] = None, 158 | ): 159 | """Quantize a row of a ggml tensor. 160 | 161 | Parameters: 162 | data_f32: float32 array 163 | nelements: number of elements in data_f32 164 | ttype: ggml type to quantize to 165 | work: work buffer 166 | 167 | Returns: 168 | output buffer""" 169 | type_traits = ggml.ggml_internal_get_type_traits(ttype.value) 170 | from_float = type_traits.from_float 171 | work = work or ctypes.cast((ctypes.c_float * nelements)(), ctypes.c_void_p) 172 | from_float(data_f32, work, nelements) 173 | return work 174 | 175 | 176 | def dequantize_row( 177 | data_q: ctypes.c_void_p, 178 | nelements: int, 179 | ttype: GGML_TYPE, 180 | work: Optional[ctypes.c_void_p] = None, 181 | ): 182 | """Dequantize a row of a ggml tensor. 183 | 184 | Parameters: 185 | data_q: quantized data 186 | nelements: number of elements in data_q 187 | ttype: ggml type to dequantize from 188 | work: work buffer 189 | 190 | Returns: 191 | output buffer""" 192 | type_traits = ggml.ggml_internal_get_type_traits(ttype.value) 193 | to_float = type_traits.to_float 194 | work = work or ctypes.cast((ctypes.c_float * nelements)(), ctypes.c_void_p) 195 | to_float(data_q, work, nelements) 196 | return work 197 | 198 | 199 | def get_ndims(tensor: ggml.ggml_tensor_p) -> int: 200 | """Get the number of dimensions of a ggml tensor. 
201 | 202 | Parameters: 203 | tensor: ggml tensor 204 | 205 | Returns: 206 | Number of dimensions of tensor 207 | """ 208 | return ggml.ggml_n_dims(tensor) 209 | 210 | 211 | def get_shape(tensor: ggml.ggml_tensor_p) -> Tuple[int, ...]: 212 | """Get the shape of a ggml tensor. 213 | 214 | Parameters: 215 | tensor: ggml tensor 216 | 217 | Returns: 218 | Shape of tensor 219 | """ 220 | return tuple(tensor.contents.ne[: ggml.ggml_n_dims(tensor)]) 221 | 222 | 223 | def get_strides(tensor: ggml.ggml_tensor_p) -> Tuple[int, ...]: 224 | """Get the strides of a ggml tensor. 225 | 226 | Parameters: 227 | tensor: ggml tensor 228 | 229 | Returns: 230 | Strides of tensor 231 | """ 232 | return tuple(tensor.contents.nb[: ggml.ggml_n_dims(tensor)]) 233 | 234 | 235 | def slice_tensor( 236 | ctx: ggml.ggml_context_p, tensor: ggml.ggml_tensor_p, indices: Sequence[slice] 237 | ): 238 | """Slice a ggml tensor along multiple dimensions. 239 | 240 | The slice is a view of the original tensor with the same number of dimensions. 241 | 242 | Parameters: 243 | ctx: ggml context 244 | tensor: ggml tensor 245 | indices: indices to slice along 246 | 247 | Returns: 248 | New ggml tensor slice view""" 249 | ndims = ggml.ggml_n_dims(tensor) 250 | 251 | # check that the number of dimensions match 252 | if len(indices) != ndims: 253 | raise ValueError( 254 | f"tensor has {ndims} dimensions but {len(indices)} indices were given" 255 | ) 256 | 257 | # calculate slice 258 | start = tuple(idx.start or 0 for idx in indices) 259 | end = tuple(idx.stop or get_shape(tensor)[i] for i, idx in enumerate(indices)) 260 | step = tuple(idx.step or 1 for idx in indices) 261 | 262 | # get the shape of the slice 263 | shape = tuple((end[i] - start[i] + step[i] - 1) // step[i] for i in range(ndims)) 264 | 265 | # get the strides of the slice 266 | strides = tuple(get_strides(tensor)[i] * step[i] for i in range(ndims)) 267 | 268 | # get the offset of the slice 269 | offset = sum(get_strides(tensor)[i] * start[i] for i in range(ndims)) 270 | 271 | if ndims == 1: 272 | return ggml.ggml_view_1d( 273 | ctx, 274 | tensor, 275 | shape[0], 276 | offset, 277 | ) 278 | elif ndims == 2: 279 | return ggml.ggml_view_2d( 280 | ctx, 281 | tensor, 282 | shape[0], 283 | shape[1], 284 | strides[1], 285 | offset, 286 | ) 287 | elif ndims == 3: 288 | return ggml.ggml_view_3d( 289 | ctx, 290 | tensor, 291 | shape[0], 292 | shape[1], 293 | shape[2], 294 | strides[1], 295 | strides[2], 296 | offset, 297 | ) 298 | elif ndims == 4: 299 | return ggml.ggml_view_4d( 300 | ctx, 301 | tensor, 302 | shape[0], 303 | shape[1], 304 | shape[2], 305 | shape[3], 306 | strides[1], 307 | strides[2], 308 | strides[3], 309 | offset, 310 | ) 311 | else: 312 | raise NotImplementedError( 313 | f"ggml tensors with {ndims} dimensions are not supported" 314 | ) 315 | 316 | 317 | def setup_sigabrt_handler(): 318 | if platform.system() == "Windows": 319 | return 320 | 321 | c_globals = ctypes.CDLL(None) # POSIX 322 | signal_type = signal.SIGABRT 323 | 324 | @ctypes.CFUNCTYPE(None, ctypes.c_int) 325 | def sigabrt_handler(sig): # type: ignore 326 | traceback.print_stack() 327 | raise Exception("GGML SIGABRT") 328 | 329 | c_globals.signal(signal_type, sigabrt_handler) 330 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: ggml-python 2 | site_url: https://ggml-python.readthedocs.io 3 | repo_url: https://github.com/abetlen/ggml-python 4 | 5 | theme: 6 | 
name: material 7 | palette: 8 | 9 | # Palette toggle for light mode 10 | - scheme: default 11 | primary: white 12 | toggle: 13 | icon: material/brightness-7 14 | name: Switch to dark mode 15 | 16 | # Palette toggle for dark mode 17 | - scheme: slate 18 | primary: black 19 | toggle: 20 | icon: material/brightness-4 21 | name: Switch to light mode 22 | 23 | features: 24 | - navigation.tabs 25 | - navigation.tabs.sticky 26 | - toc.integrate 27 | - navigation.footer 28 | 29 | plugins: 30 | - mkdocstrings: 31 | handlers: 32 | python: 33 | options: 34 | members_order: source 35 | group_by_category: false 36 | signature_crossrefs: true 37 | show_signature: true 38 | docstring_section_style: list 39 | show_root_heading: true 40 | import: 41 | - https://docs.python.org/3/objects.inv 42 | - https://numpy.org/doc/stable/objects.inv 43 | - search 44 | - social 45 | 46 | markdown_extensions: 47 | - pymdownx.superfences 48 | - pymdownx.inlinehilite 49 | - pymdownx.snippets 50 | - pymdownx.tabbed: 51 | alternate_style: true 52 | - pymdownx.highlight: 53 | anchor_linenums: true 54 | line_spans: __span 55 | pygments_lang_class: true 56 | 57 | watch: 58 | - ggml -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["scikit-build-core[pyproject]>=0.5.1"] 3 | build-backend = "scikit_build_core.build" 4 | 5 | [project] 6 | name = "ggml_python" 7 | dynamic = ["version"] 8 | description = "Python bindings for ggml" 9 | readme = "README.md" 10 | license = { text = "MIT" } 11 | authors = [ 12 | { name = "Andrei Betlen", email = "abetlen@gmail.com" }, 13 | ] 14 | requires-python = ">=3.7" 15 | dependencies = [ 16 | "numpy>=1.20.0", 17 | "typing_extensions>=4.6.3", 18 | "importlib_resources>=6.4.0; python_version < '3.9'", 19 | ] 20 | classifiers = [ 21 | "Programming Language :: Python :: 3", 22 | "Programming Language :: Python :: 3.8", 23 | "Programming Language :: Python :: 3.9", 24 | "Programming Language :: Python :: 3.10", 25 | "Programming Language :: Python :: 3.11", 26 | "Programming Language :: Python :: 3.12", 27 | ] 28 | 29 | [tool.scikit-build] 30 | wheel.packages = ["ggml"] 31 | wheel.expand-macos-universal-tags = true 32 | cmake.verbose = true 33 | cmake.minimum-version = "3.21" 34 | minimum-version = "0.5.1" 35 | 36 | [tool.scikit-build.metadata.version] 37 | provider = "scikit_build_core.metadata.regex" 38 | input = "ggml/__init__.py" 39 | 40 | [tool.pytest.ini_options] 41 | addopts = "--ignore=vendor" 42 | testpaths = "tests" 43 | 44 | [project.optional-dependencies] 45 | test = ["pytest"] 46 | docs = ["mkdocs", "mkdocstrings[python]", "mkdocs-material", "pillow", "cairosvg"] 47 | publish = ["build"] 48 | convert = [ 49 | "accelerate==0.30.1", 50 | "numpy==1.26.4", 51 | "sentencepiece==0.2.0", 52 | "torch==2.3.0", 53 | "torchaudio==2.3.0", 54 | "torchvision==0.18.0", 55 | "transformers==4.41.2" 56 | ] 57 | 58 | [project.urls] 59 | Homepage = "https://github.com/abetlen/ggml-python" 60 | Documentation = "https://ggml-python.readthedocs.io/en/latest/" 61 | Issues = "https://github.com/abetlen/ggml-python/issues" 62 | -------------------------------------------------------------------------------- /scripts/releases-to-pep-503.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Get output directory or default to index/whl/cpu 4 | output_dir=${1:-"index/whl/cpu"} 5 | 6 | # Create output 
directory 7 | mkdir -p $output_dir 8 | 9 | # Change to output directory 10 | pushd $output_dir 11 | 12 | # Create an index html file 13 | echo "" > index.html 14 | echo "" >> index.html 15 | echo " " >> index.html 16 | echo " " >> index.html 17 | echo " ggml-python" >> index.html 18 | echo "
" >> index.html 19 | echo " " >> index.html 20 | echo "" >> index.html 21 | echo "" >> index.html 22 | 23 | # Create ggml-python directory 24 | mkdir -p ggml-python 25 | 26 | # Change to ggml-python directory 27 | pushd ggml-python 28 | 29 | # Create an index html file 30 | echo "" > index.html 31 | echo "" >> index.html 32 | echo " " >> index.html 33 | echo "

Links for ggml-python

" >> index.html 34 | 35 | # Get all releases 36 | releases=$(curl -s https://api.github.com/repos/abetlen/ggml-python/releases | jq -r .[].tag_name) 37 | 38 | # Get pattern from second arg or default to valid python package version pattern 39 | pattern=${2:-"^[v]?[0-9]+\.[0-9]+\.[0-9]+$"} 40 | 41 | # Filter releases by pattern 42 | releases=$(echo $releases | tr ' ' '\n' | grep -E $pattern) 43 | 44 | # For each release, get all assets 45 | for release in $releases; do 46 | assets=$(curl -s https://api.github.com/repos/abetlen/ggml-python/releases/tags/$release | jq -r .assets) 47 | echo "

$release

" >> index.html 48 | for asset in $(echo $assets | jq -r .[].browser_download_url); do 49 | if [[ $asset == *".whl" ]]; then 50 | echo " $asset" >> index.html 51 | echo "
" >> index.html 52 | fi 53 | done 54 | done 55 | 56 | echo " " >> index.html 57 | echo "" >> index.html 58 | echo "" >> index.html -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abetlen/ggml-python/90ffdc076df76f290227052b285e71a94f29f865/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_ggml.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | 3 | from typing import Optional 4 | 5 | import ggml 6 | 7 | import numpy as np 8 | 9 | 10 | def test_ggml(): 11 | assert ggml.GGML_FILE_VERSION == 1 12 | 13 | params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024, mem_buffer=None) 14 | ctx = ggml.ggml_init(params) 15 | assert ctx is not None 16 | assert ggml.ggml_used_mem(ctx) == 0 17 | x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 18 | a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 19 | b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 20 | x2 = ggml.ggml_mul(ctx, x, x) 21 | f = ggml.ggml_add(ctx, ggml.ggml_mul(ctx, a, x2), b) 22 | gf = ggml.ggml_new_graph(ctx) 23 | ggml.ggml_build_forward_expand(gf, f) 24 | 25 | ggml.ggml_set_f32(x, 2.0) 26 | ggml.ggml_set_f32(a, 3.0) 27 | ggml.ggml_set_f32(b, 4.0) 28 | 29 | ggml.ggml_graph_compute_with_ctx(ctx, gf, 1) 30 | output = ggml.ggml_get_f32_1d(f, 0) 31 | assert output == 16.0 32 | ggml.ggml_free(ctx) 33 | 34 | 35 | def test_ggml_pythonic(): 36 | import contextlib 37 | 38 | with contextlib.ExitStack() as stack: 39 | backend = ggml.ggml_backend_cpu_init() 40 | assert backend is not None 41 | stack.callback(ggml.ggml_backend_free, backend) 42 | 43 | params = ggml.ggml_init_params( 44 | mem_size=ggml.ggml_tensor_overhead() * 6 + ggml.ggml_graph_overhead(), 45 | no_alloc=True, 46 | ) 47 | ctx = ggml.ggml_init(params) 48 | assert ctx is not None 49 | stack.callback(ggml.ggml_free, ctx) 50 | 51 | x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 52 | a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 53 | b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 54 | x2 = ggml.ggml_mul(ctx, x, x) 55 | f = ggml.ggml_add(ctx, ggml.ggml_mul(ctx, a, x2), b) 56 | gf = ggml.ggml_new_graph(ctx) 57 | 58 | ggml.ggml_build_forward_expand(gf, f) 59 | 60 | buffer = ggml.ggml_backend_alloc_ctx_tensors(ctx, backend) 61 | assert buffer is not None 62 | stack.callback(ggml.ggml_backend_buffer_free, buffer) 63 | 64 | ggml.ggml_set_f32(x, 2.0) 65 | ggml.ggml_set_f32(a, 3.0) 66 | ggml.ggml_set_f32(b, 4.0) 67 | 68 | ggml.ggml_backend_graph_compute(backend, gf) 69 | 70 | output = ggml.ggml_get_f32_1d(f, 0) 71 | 72 | assert output == 16.0 73 | 74 | 75 | def test_ggml_custom_op(): 76 | params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024, mem_buffer=None) 77 | ctx = ggml.ggml_init(params) 78 | assert ctx is not None 79 | x_in = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 80 | 81 | @ggml.ggml_custom1_op_t 82 | def double( 83 | tensor_out: ggml.ggml_tensor_p, 84 | tensor_in: ggml.ggml_tensor_p, 85 | ith: int, 86 | nth: int, 87 | userdata: Optional[ctypes.c_void_p], 88 | ): 89 | value = ggml.ggml_get_f32_1d(tensor_in, 0) 90 | ggml.ggml_set_f32(tensor_out, 2 * value) 91 | 92 | x_out = ggml.ggml_map_custom1(ctx, x_in, double, 1, None) 93 | gf = ggml.ggml_new_graph(ctx) 94 | ggml.ggml_build_forward_expand(gf, x_out) 95 | 96 | ggml.ggml_set_f32(x_in, 21.0) 97 | 98 
| ggml.ggml_graph_compute_with_ctx(ctx, gf, 1) 99 | output = ggml.ggml_get_f32_1d(x_out, 0) 100 | assert output == 42.0 101 | ggml.ggml_free(ctx) 102 | 103 | 104 | def test_quantize(): 105 | ne0 = 32 106 | ne1 = 1 107 | nelements = ne0 * ne1 108 | data = [float(i) for i in range(nelements)] 109 | data_f32 = (ctypes.c_float * len(data))(*data) 110 | work = (ctypes.c_float * nelements)(0) 111 | # TODO: convert to ggml.ggml_quantize_chunk 112 | # cur_size = ggml.ggml_quantize_q8_0( 113 | cur_size = ggml.ggml_quantize_chunk( 114 | ggml.GGML_TYPE_Q8_0, 115 | data_f32, 116 | ctypes.cast(work, ctypes.c_void_p), 117 | 0, 118 | nelements // ne0, 119 | ne0, 120 | None, 121 | ) 122 | assert cur_size == 34 123 | 124 | type_traits = ggml.ggml_internal_get_type_traits(ggml.GGML_TYPE_Q8_0) 125 | work2 = (ctypes.c_float * nelements)(0) 126 | type_traits.to_float( 127 | ctypes.cast(work, ctypes.c_void_p), 128 | ctypes.cast(work2, ctypes.POINTER(ctypes.c_float)), 129 | nelements, 130 | ) 131 | 132 | eps = 0.5 133 | for i in range(nelements): 134 | assert abs(work2[i] - data[i]) < eps 135 | 136 | 137 | def test_ggml_cpu_backend(): 138 | n_tensors = 1 + 2 # input (x) and weights (a, b) 139 | params = ggml.ggml_init_params( 140 | mem_size=ggml.ggml_tensor_overhead() * n_tensors, mem_buffer=None, no_alloc=True 141 | ) 142 | ctx = ggml.ggml_init(params) 143 | assert ctx is not None 144 | 145 | backend = ggml.ggml_backend_cpu_init() 146 | 147 | assert backend is not None 148 | 149 | # create the tensors for input and weights 150 | x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 151 | 152 | a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 153 | b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 154 | 155 | # allocate the tensors in the backend 156 | buffer = ggml.ggml_backend_alloc_ctx_tensors(ctx, backend) 157 | assert buffer is not None 158 | 159 | # set the values of the weights 160 | ggml.ggml_backend_tensor_set( 161 | a, 162 | ctypes.cast(np.array([3.0], dtype=np.single).ctypes.data, ctypes.c_void_p), 163 | 0, 164 | ggml.ggml_nbytes(a), 165 | ) 166 | ggml.ggml_backend_tensor_set( 167 | b, 168 | ctypes.cast(np.array([4.0], dtype=np.single).ctypes.data, ctypes.c_void_p), 169 | 0, 170 | ggml.ggml_nbytes(a), 171 | ) 172 | 173 | max_nodes = 4096 174 | 175 | buf_size = ( 176 | ggml.ggml_tensor_overhead() * max_nodes 177 | + ggml.ggml_graph_overhead_custom(max_nodes, False) 178 | ) 179 | buf = (ctypes.c_uint8 * buf_size)() 180 | 181 | def build_graph( 182 | x: ggml.ggml_tensor_p, a: ggml.ggml_tensor_p, b: ggml.ggml_tensor_p 183 | ): 184 | params = ggml.ggml_init_params( 185 | mem_size=buf_size, 186 | mem_buffer=ctypes.cast(buf, ctypes.c_void_p), 187 | no_alloc=True, 188 | ) 189 | ctx0 = ggml.ggml_init(params) 190 | 191 | assert ctx0 is not None 192 | 193 | gf = ggml.ggml_new_graph_custom(ctx0, max_nodes, False) 194 | 195 | x2 = ggml.ggml_mul(ctx0, x, x) 196 | ax2 = ggml.ggml_mul(ctx0, a, x2) 197 | f = ggml.ggml_add(ctx0, ax2, b) 198 | 199 | ggml.ggml_set_name(x2, b"x2") 200 | ggml.ggml_set_name(ax2, b"ax2") 201 | ggml.ggml_set_name(f, b"f") 202 | 203 | ggml.ggml_build_forward_expand(gf, f) 204 | 205 | ggml.ggml_free(ctx0) 206 | 207 | return gf 208 | 209 | buffer_type = ggml.ggml_backend_get_default_buffer_type(backend) 210 | assert buffer_type is not None 211 | allocr = ggml.ggml_gallocr_new(buffer_type) 212 | assert allocr is not None 213 | 214 | gf = build_graph(x, a, b) 215 | 216 | ggml.ggml_gallocr_reserve(allocr, gf) 217 | 218 | gf = build_graph(x, a, b) 219 | 220 | 
ggml.ggml_gallocr_alloc_graph(allocr, gf) 221 | 222 | ggml.ggml_backend_tensor_set( 223 | x, 224 | ctypes.cast(np.array([2.0], dtype=np.single).ctypes.data, ctypes.c_void_p), 225 | 0, 226 | ggml.ggml_nbytes(x), 227 | ) 228 | 229 | ggml.ggml_backend_graph_compute(backend, gf) 230 | 231 | f = ggml.ggml_graph_get_tensor(gf, b"f") 232 | 233 | output = np.zeros(1, dtype=np.single) 234 | ggml.ggml_backend_tensor_get( 235 | f, ctypes.cast(output.ctypes.data, ctypes.c_void_p), 0, ggml.ggml_nbytes(x) 236 | ) 237 | 238 | assert output[0] == 16.0 239 | 240 | ggml.ggml_gallocr_free(allocr) 241 | ggml.ggml_backend_buffer_free(buffer) 242 | ggml.ggml_backend_free(backend) 243 | ggml.ggml_free(ctx) 244 | 245 | 246 | def test_grad(): 247 | nthreads = 1 248 | params = ggml.ggml_init_params( 249 | mem_size=128 * 1024 * 1024, mem_buffer=None, no_alloc=False 250 | ) 251 | ctx0 = ggml.ggml_init(params) 252 | assert ctx0 is not None 253 | 254 | x = ggml.ggml_new_tensor_1d(ctx0, ggml.GGML_TYPE_F32, 1) 255 | 256 | ggml.ggml_set_param(ctx0, x) 257 | 258 | a = ggml.ggml_new_tensor_1d(ctx0, ggml.GGML_TYPE_F32, 1) 259 | b = ggml.ggml_mul(ctx0, x, x) 260 | f = ggml.ggml_mul(ctx0, a, b) 261 | 262 | gf = ggml.ggml_new_graph_custom(ctx0, ggml.GGML_DEFAULT_GRAPH_SIZE, True) 263 | ggml.ggml_build_forward_expand(gf, f) 264 | 265 | gb = ggml.ggml_graph_dup(ctx0, gf) 266 | 267 | ggml.ggml_build_backward_expand(ctx0, gf, gb, False) 268 | 269 | ggml.ggml_set_f32(x, 2.0) 270 | ggml.ggml_set_f32(a, 3.0) 271 | 272 | ggml.ggml_graph_reset(gf) 273 | ggml.ggml_set_f32(f.contents.grad, 1.0) 274 | 275 | ggml.ggml_graph_compute_with_ctx(ctx0, gb, nthreads) 276 | 277 | assert ggml.ggml_get_f32_1d(f, 0) == 12.0 278 | assert ggml.ggml_get_f32_1d(x.contents.grad, 0) == 12.0 279 | 280 | ggml.ggml_free(ctx0) 281 | -------------------------------------------------------------------------------- /tests/test_ggml_cuda.py: -------------------------------------------------------------------------------- 1 | import ggml 2 | import ggml.utils 3 | import ctypes 4 | import pytest 5 | import numpy as np 6 | 7 | ggml_cuda_available = ggml.GGML_USE_CUDA 8 | 9 | run_if_ggml_cuda_available = pytest.mark.skipif( 10 | not ggml_cuda_available, 11 | reason="CUDA not available", 12 | ) 13 | 14 | 15 | @run_if_ggml_cuda_available 16 | def test_cuda(): 17 | n_tensors = 1 + 2 # input (x) and weights (a, b) 18 | params = ggml.ggml_init_params( 19 | mem_size=ggml.ggml_tensor_overhead() * n_tensors, mem_buffer=None, no_alloc=True 20 | ) 21 | ctx = ggml.ggml_init(params) 22 | assert ctx is not None 23 | 24 | backend = ggml.ggml_backend_cuda_init(0) 25 | 26 | assert backend is not None 27 | 28 | # create the tensors for input and weights 29 | x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 30 | 31 | a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 32 | b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 33 | 34 | # allocate the tensors in the backend 35 | buffer = ggml.ggml_backend_alloc_ctx_tensors(ctx, backend) 36 | 37 | # set the values of the weights 38 | ggml.ggml_backend_tensor_set( 39 | a, 40 | ctypes.cast(np.array([3.0], dtype=np.single).ctypes.data, ctypes.c_void_p), 41 | 0, 42 | ggml.ggml_nbytes(a), 43 | ) 44 | ggml.ggml_backend_tensor_set( 45 | b, 46 | ctypes.cast(np.array([4.0], dtype=np.single).ctypes.data, ctypes.c_void_p), 47 | 0, 48 | ggml.ggml_nbytes(a), 49 | ) 50 | 51 | max_nodes = 4096 52 | 53 | buf_size = ( 54 | ggml.ggml_tensor_overhead() * max_nodes 55 | + ggml.ggml_graph_overhead_custom(max_nodes, False) 56 | ) 57 | buf = 
(ctypes.c_uint8 * buf_size)() 58 | 59 | def build_graph( 60 | x: ggml.ggml_tensor_p, a: ggml.ggml_tensor_p, b: ggml.ggml_tensor_p 61 | ): 62 | params = ggml.ggml_init_params( 63 | mem_size=buf_size, 64 | mem_buffer=ctypes.cast(buf, ctypes.c_void_p), 65 | no_alloc=True, 66 | ) 67 | ctx0 = ggml.ggml_init(params) 68 | 69 | assert ctx0 is not None 70 | 71 | gf = ggml.ggml_new_graph_custom(ctx0, max_nodes, False) 72 | 73 | x2 = ggml.ggml_mul(ctx0, x, x) 74 | ax2 = ggml.ggml_mul(ctx0, a, x2) 75 | f = ggml.ggml_add(ctx0, ax2, b) 76 | 77 | ggml.ggml_set_name(x2, b"x2") 78 | ggml.ggml_set_name(ax2, b"ax2") 79 | ggml.ggml_set_name(f, b"f") 80 | 81 | ggml.ggml_build_forward_expand(gf, f) 82 | 83 | ggml.ggml_free(ctx0) 84 | 85 | return gf 86 | 87 | allocr = ggml.ggml_gallocr_new(ggml.ggml_backend_get_default_buffer_type(backend)) 88 | 89 | gf = build_graph(x, a, b) 90 | 91 | ggml.ggml_gallocr_reserve(allocr, gf) 92 | 93 | gf = build_graph(x, a, b) 94 | 95 | ggml.ggml_gallocr_alloc_graph(allocr, gf) 96 | 97 | ggml.ggml_backend_tensor_set( 98 | x, 99 | ctypes.cast(np.array([2.0], dtype=np.single).ctypes.data, ctypes.c_void_p), 100 | 0, 101 | ggml.ggml_nbytes(x), 102 | ) 103 | 104 | ggml.ggml_backend_graph_compute(backend, gf) 105 | 106 | f = ggml.ggml_graph_get_tensor(gf, b"f") 107 | 108 | output = np.zeros(1, dtype=np.single) 109 | ggml.ggml_backend_tensor_get( 110 | f, ctypes.cast(output.ctypes.data, ctypes.c_void_p), 0, ggml.ggml_nbytes(x) 111 | ) 112 | 113 | assert output[0] == 16.0 114 | 115 | ggml.ggml_gallocr_free(allocr) 116 | ggml.ggml_backend_buffer_free(buffer) 117 | ggml.ggml_backend_free(backend) 118 | ggml.ggml_free(ctx) 119 | -------------------------------------------------------------------------------- /tests/test_ggml_metal.py: -------------------------------------------------------------------------------- 1 | import ggml 2 | import ggml.utils 3 | import ctypes 4 | import pytest 5 | import numpy as np 6 | 7 | from ggml.utils import setup_sigabrt_handler 8 | 9 | setup_sigabrt_handler() 10 | 11 | ggml_metal_available = ggml.GGML_USE_METAL 12 | 13 | run_if_ggml_metal_available = pytest.mark.skipif( 14 | not ggml_metal_available, 15 | reason="METAL not available", 16 | ) 17 | 18 | @run_if_ggml_metal_available 19 | def test_metal(): 20 | n_tensors = 1 + 2 # input (x) and weights (a, b) 21 | params = ggml.ggml_init_params( 22 | mem_size=ggml.ggml_tensor_overhead() * n_tensors, mem_buffer=None, no_alloc=True 23 | ) 24 | ctx = ggml.ggml_init(params) 25 | assert ctx is not None 26 | 27 | backend = ggml.ggml_backend_metal_init() 28 | 29 | assert backend is not None 30 | 31 | # create the tensors for input and weights 32 | x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 33 | 34 | a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 35 | b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) 36 | 37 | # allocate the tensors in the backend 38 | buffer = ggml.ggml_backend_alloc_ctx_tensors(ctx, backend) 39 | 40 | # set the values of the weights 41 | ggml.ggml_backend_tensor_set( 42 | a, 43 | ctypes.cast(np.array([3.0], dtype=np.single).ctypes.data, ctypes.c_void_p), 44 | 0, 45 | ggml.ggml_nbytes(a), 46 | ) 47 | ggml.ggml_backend_tensor_set( 48 | b, 49 | ctypes.cast(np.array([4.0], dtype=np.single).ctypes.data, ctypes.c_void_p), 50 | 0, 51 | ggml.ggml_nbytes(a), 52 | ) 53 | 54 | max_nodes = 4096 55 | 56 | buf_size = ( 57 | ggml.ggml_tensor_overhead() * max_nodes 58 | + ggml.ggml_graph_overhead_custom(max_nodes, False) 59 | ) 60 | buf = (ctypes.c_uint8 * buf_size)() 61 | 62 | def 
63 |         x: ggml.ggml_tensor_p, a: ggml.ggml_tensor_p, b: ggml.ggml_tensor_p
64 |     ):
65 |         params = ggml.ggml_init_params(
66 |             mem_size=buf_size,
67 |             mem_buffer=ctypes.cast(buf, ctypes.c_void_p),
68 |             no_alloc=True,
69 |         )
70 |         ctx0 = ggml.ggml_init(params)
71 | 
72 |         assert ctx0 is not None
73 | 
74 |         gf = ggml.ggml_new_graph_custom(ctx0, max_nodes, False)
75 | 
76 |         x2 = ggml.ggml_mul(ctx0, x, x)
77 |         ax2 = ggml.ggml_mul(ctx0, a, x2)
78 |         f = ggml.ggml_add(ctx0, ax2, b)
79 | 
80 |         ggml.ggml_set_name(x2, b"x2")
81 |         ggml.ggml_set_name(ax2, b"ax2")
82 |         ggml.ggml_set_name(f, b"f")
83 | 
84 |         ggml.ggml_build_forward_expand(gf, f)
85 | 
86 |         ggml.ggml_free(ctx0)
87 | 
88 |         return gf
89 | 
90 |     allocr = ggml.ggml_gallocr_new(ggml.ggml_backend_get_default_buffer_type(backend))
91 | 
92 |     gf = build_graph(x, a, b)
93 | 
94 |     ggml.ggml_gallocr_reserve(allocr, gf)
95 | 
96 |     gf = build_graph(x, a, b)
97 | 
98 |     ggml.ggml_gallocr_alloc_graph(allocr, gf)
99 | 
100 |     ggml.ggml_backend_tensor_set(
101 |         x,
102 |         ctypes.cast(np.array([2.0], dtype=np.single).ctypes.data, ctypes.c_void_p),
103 |         0,
104 |         ggml.ggml_nbytes(x),
105 |     )
106 | 
107 |     ggml.ggml_backend_graph_compute(backend, gf)
108 | 
109 |     f = ggml.ggml_graph_get_tensor(gf, b"f")
110 | 
111 |     output = np.zeros(1, dtype=np.single)
112 |     ggml.ggml_backend_tensor_get(
113 |         f, ctypes.cast(output.ctypes.data, ctypes.c_void_p), 0, ggml.ggml_nbytes(x)
114 |     )
115 | 
116 |     assert output[0] == 16.0
117 | 
118 |     ggml.ggml_gallocr_free(allocr)
119 |     ggml.ggml_backend_buffer_free(buffer)
120 |     ggml.ggml_backend_free(backend)
121 |     ggml.ggml_free(ctx)
122 | 
--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | import ggml
2 | import ggml.utils
3 | 
4 | import pytest
5 | 
6 | import numpy as np
7 | import numpy.typing as npt
8 | 
9 | 
10 | def test_utils():
11 |     params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024)
12 |     ctx = ggml.ggml_init(params)
13 |     assert ctx is not None
14 |     x = np.ones((3,), dtype=np.float32)
15 |     assert x.shape == (3,)
16 |     t = ggml.utils.from_numpy(x, ctx)
17 |     assert t.contents.ne[:1] == [3]
18 |     assert t.contents.type == ggml.GGML_TYPE_F32
19 |     assert np.allclose(ggml.utils.to_numpy(t), x)
20 |     ggml.ggml_free(ctx)
21 | 
22 | 
23 | def test_numpy_arrays():
24 |     params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024)
25 |     ctx = ggml.ggml_init(params)
26 |     assert ctx is not None
27 |     x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32, order="F")
28 |     assert x.shape == (2, 3)
29 |     t = ggml.utils.from_numpy(x, ctx)
30 |     assert t.contents.ne[:2] == [3, 2]
31 |     y = ggml.utils.to_numpy(t)
32 |     assert y.shape == (2, 3)
33 |     ggml.ggml_free(ctx)
34 | 
35 | 
36 | def test_numpy_arrays_transposed():
37 |     params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024)
38 |     ctx = ggml.ggml_init(params)
39 |     assert ctx is not None
40 |     x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)
41 |     t = ggml.utils.from_numpy(x, ctx)
42 | 
43 |     t_T = ggml.ggml_transpose(ctx, t)
44 | 
45 |     # ggml_transpose currently modifies the original tensor in place, input must be
46 |     # set _after_ the transpose operation
47 |     ggml.utils.to_numpy(t)[:] = x
48 | 
49 |     assert ggml.utils.get_shape(t_T) == (2, 3)
50 |     assert ggml.utils.get_strides(t_T) == (12, 4)
51 | 
52 |     assert np.array_equal(ggml.utils.to_numpy(t_T, shape=x.T.shape), x.T)
53 | 
54 |     ggml.ggml_free(ctx)
55 | 
56 | 
57 | def test_numpy_arrays_transposed_diff_ctx():
58 |     params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024)
59 |     ctx = ggml.ggml_init(params)
60 |     assert ctx is not None
61 |     x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)
62 |     t = ggml.utils.from_numpy(x, ctx)
63 | 
64 |     ggml.utils.to_numpy(t)[:] = x
65 | 
66 |     params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024)
67 |     ctx2 = ggml.ggml_init(params)
68 |     assert ctx2 is not None
69 | 
70 |     t_T = ggml.ggml_transpose(ctx2, t)
71 | 
72 |     assert ggml.utils.get_shape(t_T) == (2, 3)
73 |     assert ggml.utils.get_strides(t_T) == (12, 4)
74 | 
75 |     assert np.array_equal(ggml.utils.to_numpy(t_T, shape=x.T.shape), x.T)
76 | 
77 |     ggml.ggml_free(ctx)
78 |     ggml.ggml_free(ctx2)
79 | 
80 | 
81 | def test_numpy_arrays_permute_transpose():
82 |     params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024)
83 |     ctx = ggml.ggml_init(params)
84 |     assert ctx is not None
85 | 
86 |     x = np.array(
87 |         [[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]], dtype=np.int32
88 |     )
89 |     t = ggml.utils.from_numpy(x, ctx)
90 | 
91 |     t_T = ggml.ggml_permute(ctx, t, 2, 1, 0, 3)
92 | 
93 |     ggml.utils.to_numpy(t)[:] = x
94 | 
95 |     x_T = ggml.utils.to_numpy(t_T)
96 |     assert np.array_equal(x_T, x.T)
97 | 
98 |     ggml.ggml_free(ctx)
99 | 
100 | 
101 | def test_slice_tensor():
102 |     params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024)
103 |     ctx = ggml.ggml_init(params)
104 |     assert ctx is not None
105 |     x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)
106 |     t = ggml.utils.from_numpy(x, ctx)
107 |     t_slice = ggml.utils.slice_tensor(ctx, t, [
108 |         slice(0, 1),
109 |         slice(0, 2)
110 |     ])
111 |     x_slice = x[:2, :1]
112 |     t_slice_array = ggml.utils.to_numpy(t_slice)
113 |     assert np.array_equal(t_slice_array, x_slice)
114 |     ggml.ggml_free(ctx)
115 | 
116 | 
117 | @pytest.mark.parametrize("a, b", [
118 |     [np.array([1], dtype=np.float32), np.array([1], dtype=np.float32)],
119 |     [np.array([1, 1], dtype=np.float32), np.array([1], dtype=np.float32)],
120 |     [np.array([1, 1], dtype=np.float32), np.array([[1, 2]], dtype=np.float32)],
121 | ])
122 | def test_broadcast_tensor(a: npt.NDArray[np.float32], b: npt.NDArray[np.float32]):
123 |     params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024)
124 |     ctx = ggml.ggml_init(params)
125 |     assert ctx is not None
126 |     params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024)
127 |     ctx2 = ggml.ggml_init(params)
128 |     assert ctx2 is not None
129 |     t_a = ggml.utils.from_numpy(a, ctx)
130 |     t_b = ggml.utils.from_numpy(b, ctx)
131 |     t_sum = ggml.ggml_add(ctx2, t_a, t_b)
132 |     gf = ggml.ggml_new_graph(ctx2)
133 |     ggml.ggml_build_forward_expand(gf, t_sum)
134 |     ggml.ggml_graph_compute_with_ctx(ctx2, gf, 1)
135 |     expected = a + b
136 |     result = ggml.utils.to_numpy(t_sum).reshape(expected.shape)
137 |     assert np.array_equal(result, expected)
138 |     ggml.ggml_free(ctx)
139 | 
--------------------------------------------------------------------------------
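For quick reference, the backend tests above all exercise the same computation, f = a * x^2 + b with x = 2, a = 3, b = 4. A minimal CPU-only sketch of that computation, using only calls that already appear in these tests (ggml.utils.from_numpy / to_numpy and ggml_graph_compute_with_ctx), could look like the following. This is an illustrative sketch, not a file from the repository.

import ggml
import ggml.utils
import numpy as np

# single context with data allocation enabled (no backend buffers needed on CPU)
params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024)
ctx = ggml.ggml_init(params)
assert ctx is not None

# inputs: x = 2, a = 3, b = 4
x = ggml.utils.from_numpy(np.array([2.0], dtype=np.float32), ctx)
a = ggml.utils.from_numpy(np.array([3.0], dtype=np.float32), ctx)
b = ggml.utils.from_numpy(np.array([4.0], dtype=np.float32), ctx)

# f = a * x^2 + b
x2 = ggml.ggml_mul(ctx, x, x)
ax2 = ggml.ggml_mul(ctx, a, x2)
f = ggml.ggml_add(ctx, ax2, b)

# build the forward graph and compute it single-threaded
gf = ggml.ggml_new_graph(ctx)
ggml.ggml_build_forward_expand(gf, f)
ggml.ggml_graph_compute_with_ctx(ctx, gf, 1)

print(ggml.utils.to_numpy(f))  # expected: [16.]
ggml.ggml_free(ctx)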