├── .dockerignore ├── .env ├── .github ├── FUNDING.yml └── workflows │ ├── build-wheels-fix.yml │ ├── build-wheels-release-rocm.yml │ ├── build-wheels-release.yml │ ├── build-wheels-rocm.yml │ └── build-wheels.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── datasets ├── download_datasets.py └── wikitext2_val_sample.jsonl ├── doc ├── TODO.md ├── _screenshot.jpg └── model_compatibility.md ├── docker-compose.yml ├── entrypoint.sh ├── example_alt_generator.py ├── example_basic.py ├── example_batch.py ├── example_cfg.py ├── example_chatbot.py ├── example_flask.py ├── example_lora.py ├── example_ws.py ├── exllama ├── __init__.py ├── alt_generator.py ├── cuda_ext.py ├── generator.py ├── lora.py ├── model.py └── tokenizer.py ├── exllama_ext ├── cpu_func │ ├── rep_penalty.cpp │ └── rep_penalty.h ├── cuda_buffers.cu ├── cuda_buffers.cuh ├── cuda_compat.cuh ├── cuda_func │ ├── column_remap.cu │ ├── column_remap.cuh │ ├── half_matmul.cu │ ├── half_matmul.cuh │ ├── q4_attn.cu │ ├── q4_attn.cuh │ ├── q4_matmul.cu │ ├── q4_matmul.cuh │ ├── q4_matrix.cu │ ├── q4_matrix.cuh │ ├── q4_mlp.cu │ ├── q4_mlp.cuh │ ├── rms_norm.cu │ ├── rms_norm.cuh │ ├── rope.cu │ └── rope.cuh ├── exllama_ext.cpp ├── hip_compat.cuh ├── matrix.cuh ├── tuning.h └── util.cuh ├── globals.py ├── model_init.py ├── perplexity.py ├── prompt_chatbort.txt ├── requirements-web.txt ├── requirements.txt ├── setup.py ├── sh ├── test_benchmark_perf.sh ├── test_benchmark_perf2.sh ├── test_benchmark_ppl.sh └── test_compat.sh ├── test_benchmark_inference.py ├── util └── shard.py └── webui ├── app.py ├── session.py ├── static ├── main.js └── style.css └── templates └── index.html /.dockerignore: -------------------------------------------------------------------------------- 1 | exllama_sessions 2 | models 3 | -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | PORT=5000 2 | RUN_UID=1000 # set to 0 to run the service as root inside the container 3 | APPLICATION_STATE_PATH=/data # path to the directory holding application state inside the container 4 | MODEL_PATH=models/LLaMA-7B-4bit-128g # replace with the actual model path on the host 5 | SESSIONS_PATH=~/exllama_sessions # replace with the actual directory on the host where chat sessions should be stored 6 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | ko_fi: turboderp 2 | -------------------------------------------------------------------------------- /.github/workflows/build-wheels-fix.yml: -------------------------------------------------------------------------------- 1 | name: Fix Release 2 | 3 | on: workflow_dispatch 4 | 5 | permissions: 6 | contents: write 7 | 8 | jobs: 9 | build_wheels: 10 | name: ${{ matrix.os }} Python ${{ matrix.pyver }} CUDA ${{ matrix.cuda }} 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: [ubuntu-20.04, windows-latest] 15 | pyver: ["3.8", "3.9", "3.10", "3.11"] 16 | cuda: ["11.7.0", "11.8.0"] 17 | defaults: 18 | run: 19 | shell: pwsh 20 | env: 21 | CUDAVER: ${{ matrix.cuda }} 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | with: 26 | ref: 'wheel-fix' 27 | - uses: actions/setup-python@v3 28 | with: 29 | python-version: ${{ matrix.pyver }} 30 | 31 | - name: Setup Mamba 32 | uses: conda-incubator/setup-miniconda@v2.2.0 33 | with: 34 | activate-environment: "build" 35 
| python-version: ${{ matrix.pyver }} 36 | miniforge-variant: Mambaforge 37 | miniforge-version: latest 38 | use-mamba: true 39 | add-pip-as-python-dependency: true 40 | auto-activate-base: false 41 | 42 | - name: Install Dependencies 43 | run: | 44 | $cudaVersion = $env:CUDAVER 45 | $cudaVersionPytorch = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') 46 | $cudaChannels = '' 47 | $cudaNum = [int]$cudaVersion.substring($cudaVersion.LastIndexOf('.')+1) 48 | while ($cudaNum -ge 0) { $cudaChannels += '-c nvidia/label/cuda-' + $cudaVersion.Remove($cudaVersion.LastIndexOf('.')+1) + $cudaNum + ' '; $cudaNum-- } 49 | mamba install -y 'cuda' $cudaChannels.TrimEnd().Split() 50 | python -m pip install build wheel "torch==2.0.1+cu$cudaVersionPytorch" safetensors sentencepiece ninja --extra-index-url "https://download.pytorch.org/whl/cu$cudaVersionPytorch" 51 | 52 | - name: Build Wheel 53 | id: build-wheel 54 | run: | 55 | Write-Output "PACKAGE_VERSION=0.0.6" >> "$env:GITHUB_OUTPUT" 56 | $env:CUDA_PATH = $env:CONDA_PREFIX 57 | $env:CUDA_HOME = $env:CONDA_PREFIX 58 | if ($IsLinux) {$env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH} 59 | $env:TORCH_CUDA_ARCH_LIST = if ([version]$env:CUDAVER -lt [version]'11.8') {'6.0 6.1 7.0 7.5 8.0 8.6+PTX'} else {'6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX'} 60 | python -m build -n --wheel 61 | 62 | - uses: actions/upload-artifact@v3 63 | with: 64 | name: 'wheels' 65 | path: ./dist/*.whl 66 | 67 | - name: Upload files to a GitHub release 68 | if: steps.build-wheel.outputs.PACKAGE_VERSION != 'None' 69 | uses: svenstaro/upload-release-action@2.6.1 70 | with: 71 | file: ./dist/*.whl 72 | tag: ${{ steps.build-wheel.outputs.PACKAGE_VERSION }} 73 | file_glob: true 74 | overwrite: true 75 | release_name: ${{ steps.build-wheel.outputs.PACKAGE_VERSION }} 76 | body: "Wheels are compiled with CUDA 11.7 and 11.8 for Windows and Linux x64" 77 | -------------------------------------------------------------------------------- /.github/workflows/build-wheels-release-rocm.yml: -------------------------------------------------------------------------------- 1 | name: Build ROCm Wheels & Release 2 | 3 | on: 4 | workflow_dispatch: 5 | workflow_call: 6 | 7 | permissions: 8 | contents: write 9 | 10 | jobs: 11 | build_wheels: 12 | name: Build ROCm ${{ matrix.rocm }} wheel for Python ${{ matrix.pyver }} 13 | runs-on: ubuntu-20.04 14 | strategy: 15 | matrix: 16 | pyver: ["3.8", "3.9", "3.10", "3.11"] 17 | rocm: ['5.4.2', '5.5', '5.6'] 18 | defaults: 19 | run: 20 | shell: pwsh 21 | env: 22 | ROCM_VERSION: ${{ matrix.rocm }} 23 | 24 | steps: 25 | - name: Free Disk Space 26 | uses: jlumbroso/free-disk-space@v1.2.0 27 | with: 28 | tool-cache: false 29 | android: true 30 | dotnet: true 31 | haskell: true 32 | large-packages: false 33 | swap-storage: false 34 | 35 | - uses: actions/checkout@v3 36 | 37 | - name: Install ROCm SDK 38 | shell: bash 39 | run: | 40 | [ ! 
-d /etc/apt/keyrings ] && sudo mkdir --parents --mode=0755 /etc/apt/keyrings 41 | wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null 42 | echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/$ROCM_VERSION focal main" | sudo tee --append /etc/apt/sources.list.d/rocm.list 43 | echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600 44 | sudo apt update 45 | sudo apt install rocm-dev rocsparse-dev rocprim-dev rocthrust-dev rocblas-dev hipblas-dev hipcub-dev hipsparse-dev -y 46 | echo "/opt/rocm/bin" >> $GITHUB_PATH 47 | echo "ROCM_PATH=/opt/rocm" >> $GITHUB_ENV 48 | echo "USE_ROCM=1" >> $GITHUB_ENV 49 | 50 | - uses: actions/setup-python@v3 51 | with: 52 | python-version: ${{ matrix.pyver }} 53 | 54 | - name: Install Dependencies 55 | run: | 56 | $packages = 'build wheel safetensors sentencepiece ninja' 57 | $torver = if ([version]$env:ROCM_VERSION -lt [version]'5.5') {'2.0.1'} else {'2.1.0'} 58 | $packages += " torch==$torver+rocm$env:ROCM_VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm$env:ROCM_VERSION" 59 | 60 | pip3 install $packages.split(' ') 61 | 62 | - name: Build Wheel 63 | id: build-wheel 64 | run: | 65 | if ($(Get-Content 'setup.py' -raw) -match 'version = "(\d+\.(?:\d+\.?)*)" \+ \(') {Write-Output $('::notice file=build-wheels-release-rocm.yml,line=54,title=Package Version::Detected package version is: {0}' -f $Matches[1]); Write-Output "PACKAGE_VERSION=$($Matches[1])" >> "$env:GITHUB_OUTPUT"} else {Write-Output '::error file=build-wheels-release.yml,line=41::Could not parse version from setup.py! You must upload wheels manually!'; Write-Output "PACKAGE_VERSION=None" >> "$env:GITHUB_OUTPUT"} 66 | 67 | $env:PYTORCH_ROCM_ARCH = 'gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' 68 | if ([version]$env:ROCM_VERSION -lt [version]'5.5') {$env:PYTORCH_ROCM_ARCH = 'gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030'} 69 | 70 | python3 -m build -n --wheel 71 | 72 | - uses: actions/upload-artifact@v3 73 | with: 74 | name: 'wheels' 75 | path: ./dist/*.whl 76 | 77 | - name: Upload files to a GitHub release 78 | if: steps.build-wheel.outputs.PACKAGE_VERSION != 'None' 79 | uses: svenstaro/upload-release-action@2.6.1 80 | with: 81 | file: ./dist/*.whl 82 | tag: ${{ steps.build-wheel.outputs.PACKAGE_VERSION }} 83 | file_glob: true 84 | overwrite: true 85 | release_name: ${{ steps.build-wheel.outputs.PACKAGE_VERSION }} 86 | -------------------------------------------------------------------------------- /.github/workflows/build-wheels-release.yml: -------------------------------------------------------------------------------- 1 | name: Build Wheels & Release 2 | 3 | on: workflow_dispatch 4 | 5 | permissions: 6 | contents: write 7 | 8 | jobs: 9 | build_wheels: 10 | name: ${{ matrix.os }} Python ${{ matrix.pyver }} CUDA ${{ matrix.cuda }} 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: [ubuntu-20.04, windows-latest] 15 | pyver: ["3.8", "3.9", "3.10", "3.11"] 16 | cuda: ["11.7.0", "11.8.0", "12.1.1"] 17 | defaults: 18 | run: 19 | shell: pwsh 20 | env: 21 | CUDAVER: ${{ matrix.cuda }} 22 | 23 | steps: 24 | - name: Free Disk Space 25 | uses: jlumbroso/free-disk-space@v1.2.0 26 | if: runner.os == 'Linux' 27 | with: 28 | tool-cache: false 29 | android: true 30 | dotnet: 
true 31 | haskell: true 32 | large-packages: false 33 | swap-storage: false 34 | 35 | - uses: actions/checkout@v3 36 | - uses: actions/setup-python@v3 37 | with: 38 | python-version: ${{ matrix.pyver }} 39 | 40 | - name: Setup Mamba 41 | uses: conda-incubator/setup-miniconda@v2.2.0 42 | with: 43 | activate-environment: "build" 44 | python-version: ${{ matrix.pyver }} 45 | miniforge-variant: Mambaforge 46 | miniforge-version: latest 47 | use-mamba: true 48 | add-pip-as-python-dependency: true 49 | auto-activate-base: false 50 | 51 | - name: Install Dependencies 52 | run: | 53 | $cudaVersion = $env:CUDAVER 54 | $cudaVersionPytorch = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') 55 | $cudaChannels = '' 56 | $cudaNum = [int]$cudaVersion.substring($cudaVersion.LastIndexOf('.')+1) 57 | while ($cudaNum -ge 0) { $cudaChannels += '-c nvidia/label/cuda-' + $cudaVersion.Remove($cudaVersion.LastIndexOf('.')+1) + $cudaNum + ' '; $cudaNum-- } 58 | mamba install -y 'cuda' $cudaChannels.TrimEnd().Split() 59 | if (!(mamba list cuda)[-1].contains('cuda')) {sleep -s 10; mamba install -y 'cuda' $cudaChannels.TrimEnd().Split()} 60 | if (!(mamba list cuda)[-1].contains('cuda')) {throw 'CUDA Toolkit failed to install!'} 61 | 62 | if ([version]$env:CUDAVER -lt [version]'11.8.0') {$torch = "torch==2.0.1"} else {$torch = "torch==2.1.0"} 63 | 64 | python -m pip install --upgrade build setuptools wheel ninja $torch --extra-index-url "https://download.pytorch.org/whl/cu$cudaVersionPytorch" 65 | 66 | - name: Build Wheel 67 | id: build-wheel 68 | run: | 69 | if ($(Get-Content 'setup.py' -raw) -match 'version = "(\d+\.(?:\d+\.?)*)" \+ \(') 70 | { 71 | Write-Output $('::notice file=build-wheels-release.yml,line=53,title=Package Version::Detected package version is: {0}' -f $Matches[1]) 72 | Write-Output "PACKAGE_VERSION=$($Matches[1])" >> "$env:GITHUB_OUTPUT" 73 | } else { 74 | Write-Output '::error file=build-wheels-release.yml,line=41::Could not parse version from setup.py! You must upload wheels manually!' 
75 | Write-Output "PACKAGE_VERSION=None" >> "$env:GITHUB_OUTPUT" 76 | } 77 | 78 | $env:CUDA_PATH = $env:CONDA_PREFIX 79 | $env:CUDA_HOME = $env:CONDA_PREFIX 80 | if ($IsLinux) {$env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH} 81 | 82 | $env:TORCH_CUDA_ARCH_LIST = if ([version]$env:CUDAVER -lt [version]'11.8') {'6.0 6.1 7.0 7.5 8.0 8.6+PTX'} else {'6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX'} 83 | 84 | python -m build -n --wheel 85 | 86 | - uses: actions/upload-artifact@v3 87 | with: 88 | name: 'wheels' 89 | path: ./dist/*.whl 90 | 91 | - name: Upload files to a GitHub release 92 | if: steps.build-wheel.outputs.PACKAGE_VERSION != 'None' 93 | uses: svenstaro/upload-release-action@2.6.1 94 | with: 95 | file: ./dist/*.whl 96 | tag: ${{ steps.build-wheel.outputs.PACKAGE_VERSION }} 97 | file_glob: true 98 | overwrite: true 99 | release_name: ${{ steps.build-wheel.outputs.PACKAGE_VERSION }} 100 | 101 | build_rocm: 102 | name: Build ROCm Wheels & Release 103 | needs: build_wheels 104 | uses: ./.github/workflows/build-wheels-release-rocm.yml 105 | -------------------------------------------------------------------------------- /.github/workflows/build-wheels-rocm.yml: -------------------------------------------------------------------------------- 1 | name: Build ROCm Wheels 2 | 3 | on: 4 | workflow_dispatch: 5 | workflow_call: 6 | 7 | jobs: 8 | build_wheels: 9 | name: Build ROCm ${{ matrix.rocm }} wheel for Python ${{ matrix.pyver }} 10 | runs-on: ubuntu-20.04 11 | strategy: 12 | matrix: 13 | pyver: ["3.8", "3.9", "3.10", "3.11"] 14 | rocm: ['5.4.2', '5.5', '5.6'] 15 | defaults: 16 | run: 17 | shell: pwsh 18 | env: 19 | ROCM_VERSION: ${{ matrix.rocm }} 20 | 21 | steps: 22 | - name: Free Disk Space 23 | uses: jlumbroso/free-disk-space@v1.2.0 24 | with: 25 | tool-cache: false 26 | android: true 27 | dotnet: true 28 | haskell: true 29 | large-packages: false 30 | swap-storage: false 31 | 32 | - uses: actions/checkout@v3 33 | 34 | - name: Install ROCm SDK 35 | shell: bash 36 | run: | 37 | [ ! 
-d /etc/apt/keyrings ] && sudo mkdir --parents --mode=0755 /etc/apt/keyrings 38 | wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null 39 | echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/$ROCM_VERSION focal main" | sudo tee --append /etc/apt/sources.list.d/rocm.list 40 | echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600 41 | sudo apt update 42 | sudo apt install rocm-dev rocsparse-dev rocprim-dev rocthrust-dev rocblas-dev hipblas-dev hipcub-dev hipsparse-dev -y 43 | echo "/opt/rocm/bin" >> $GITHUB_PATH 44 | echo "ROCM_PATH=/opt/rocm" >> $GITHUB_ENV 45 | echo "USE_ROCM=1" >> $GITHUB_ENV 46 | 47 | - uses: actions/setup-python@v3 48 | with: 49 | python-version: ${{ matrix.pyver }} 50 | 51 | - name: Install Dependencies 52 | run: | 53 | $packages = 'build wheel safetensors sentencepiece ninja' 54 | $torver = if ([version]$env:ROCM_VERSION -lt [version]'5.5') {'2.0.1'} else {'2.1.0'} 55 | $packages += " torch==$torver+rocm$env:ROCM_VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm$env:ROCM_VERSION" 56 | 57 | pip3 install $packages.split(' ') 58 | 59 | - name: Build Wheel 60 | run: | 61 | $env:PYTORCH_ROCM_ARCH = 'gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' 62 | if ([version]$env:ROCM_VERSION -lt [version]'5.5') {$env:PYTORCH_ROCM_ARCH = 'gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030'} 63 | 64 | python3 -m build -n --wheel 65 | 66 | - uses: actions/upload-artifact@v3 67 | with: 68 | name: 'wheels' 69 | path: ./dist/*.whl 70 | -------------------------------------------------------------------------------- /.github/workflows/build-wheels.yml: -------------------------------------------------------------------------------- 1 | name: Build Wheels 2 | 3 | on: workflow_dispatch 4 | 5 | jobs: 6 | build_wheels: 7 | name: ${{ matrix.os }} Python ${{ matrix.pyver }} CUDA ${{ matrix.cuda }} 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | matrix: 11 | os: [ubuntu-20.04, windows-latest] 12 | pyver: ["3.8", "3.9", "3.10", "3.11"] 13 | cuda: ["11.7.0", "11.8.0", "12.1.1"] 14 | defaults: 15 | run: 16 | shell: pwsh 17 | env: 18 | CUDAVER: ${{ matrix.cuda }} 19 | 20 | steps: 21 | - name: Free Disk Space 22 | uses: jlumbroso/free-disk-space@v1.2.0 23 | if: runner.os == 'Linux' 24 | with: 25 | tool-cache: false 26 | android: true 27 | dotnet: true 28 | haskell: true 29 | large-packages: false 30 | swap-storage: false 31 | 32 | - uses: actions/checkout@v3 33 | - uses: actions/setup-python@v3 34 | with: 35 | python-version: ${{ matrix.pyver }} 36 | 37 | - name: Setup Mamba 38 | uses: conda-incubator/setup-miniconda@v2.2.0 39 | with: 40 | activate-environment: "build" 41 | python-version: ${{ matrix.pyver }} 42 | miniforge-variant: Mambaforge 43 | miniforge-version: latest 44 | use-mamba: true 45 | add-pip-as-python-dependency: true 46 | auto-activate-base: false 47 | 48 | - name: Install Dependencies 49 | run: | 50 | $cudaVersion = $env:CUDAVER 51 | $cudaVersionPytorch = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') 52 | $cudaChannels = '' 53 | $cudaNum = [int]$cudaVersion.substring($cudaVersion.LastIndexOf('.')+1) 54 | while ($cudaNum -ge 0) { $cudaChannels += '-c nvidia/label/cuda-' + $cudaVersion.Remove($cudaVersion.LastIndexOf('.')+1) + $cudaNum + ' '; 
$cudaNum-- } 55 | mamba install -y 'cuda' $cudaChannels.TrimEnd().Split() 56 | if (!(mamba list cuda)[-1].contains('cuda')) {sleep -s 10; mamba install -y 'cuda' $cudaChannels.TrimEnd().Split()} 57 | if (!(mamba list cuda)[-1].contains('cuda')) {throw 'CUDA Toolkit failed to install!'} 58 | 59 | if ([version]$env:CUDAVER -lt [version]'11.8.0') {$torch = "torch==2.0.1"} else {$torch = "torch==2.1.0"} 60 | 61 | python -m pip install --upgrade build setuptools wheel ninja $torch --extra-index-url "https://download.pytorch.org/whl/cu$cudaVersionPytorch" 62 | 63 | - name: Build Wheel 64 | run: | 65 | $env:CUDA_PATH = $env:CONDA_PREFIX 66 | $env:CUDA_HOME = $env:CONDA_PREFIX 67 | if ($IsLinux) {$env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH} 68 | 69 | $env:TORCH_CUDA_ARCH_LIST = if ([version]$env:CUDAVER -lt [version]'11.8') {'6.0 6.1 7.0 7.5 8.0 8.6+PTX'} else {'6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX'} 70 | 71 | python -m build -n --wheel 72 | 73 | - uses: actions/upload-artifact@v3 74 | with: 75 | name: 'wheels' 76 | path: ./dist/*.whl 77 | 78 | build_rocm: 79 | name: Build ROCm Wheels 80 | needs: build_wheels 81 | uses: ./.github/workflows/build-wheels-rocm.yml 82 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ignore __pycache__ folder 2 | __pycache__/*.egg-info/ 3 | *.egg-info/ 4 | build/ -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 as build 2 | ARG RUN_UID="1000" \ 3 | APPLICATION_STATE_PATH="/data" 4 | ENV RUN_UID=$RUN_UID \ 5 | APPLICATION_STATE_PATH=$APPLICATION_STATE_PATH \ 6 | CONTAINER_MODEL_PATH=$APPLICATION_STATE_PATH/model \ 7 | CONTAINER_SESSIONS_PATH=$APPLICATION_STATE_PATH/exllama_sessions 8 | 9 | RUN apt-get update && \ 10 | DEBIAN_FRONTEND=noninteractive apt-get install -y ninja-build python3 python3-pip && \ 11 | rm -rf /var/lib/apt/lists/* 12 | 13 | # Setup user which will run the service and create application state directory 14 | RUN if [ ${RUN_UID} -ne 0 ] ; then useradd -m -u $RUN_UID user ; fi \ 15 | && mkdir -p $APPLICATION_STATE_PATH \ 16 | && mkdir -p $CONTAINER_MODEL_PATH \ 17 | && mkdir -p $CONTAINER_SESSIONS_PATH \ 18 | && chown -R $RUN_UID $APPLICATION_STATE_PATH 19 | USER $RUN_UID 20 | 21 | COPY --chown=$RUN_UID . /app 22 | 23 | WORKDIR /app 24 | 25 | # Create application state directory and install python packages 26 | RUN pip install --upgrade pip setuptools wheel \ 27 | && pip install -r requirements.txt \ 28 | && pip install -r requirements-web.txt \ 29 | && pip install . 
30 | 31 | USER root 32 | 33 | STOPSIGNAL SIGINT 34 | ENTRYPOINT ["/bin/bash", "-c", "/app/entrypoint.sh $0 $@"] 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### This is a python module version of ExLlama 2 | The pupose of this is to allow for one-time building of the CUDA kernels. 3 | 4 | To build the module, install the CUDA Toolkit or ROCm SDK along with the appropriate Pytorch version that you intend to use. 5 | Full list of requirements are listed below. After this, you can install the module with: 6 | ``` 7 | python -m pip install git+https://github.com/jllllll/exllama 8 | ``` 9 | Or you can build a wheel with: 10 | ``` 11 | python -m pip wheel git+https://github.com/jllllll/exllama --no-deps 12 | ``` 13 | The CUDA version used to build the wheel will be appended to the version number automatically. 14 | ROCm version can be appended by defining the `ROCM_VERSION` environment variable: `ROCM_VERSION=5.4.2` 15 | 16 | Pre-built wheels are available in the releases. 17 | 18 | --- 19 | 20 | # ExLlama 21 | 22 | A standalone Python/C++/CUDA implementation of Llama for use with 4-bit GPTQ weights, designed to be fast and 23 | memory-efficient on modern GPUs. 24 | 25 | Disclaimer: The project is coming along, but it's still a work in progress! 26 | 27 | ## Hardware requirements 28 | 29 | I am developing on an RTX 4090 and an RTX 3090-Ti. 30-series and later NVIDIA GPUs should be well supported, but 30 | anything Pascal or older with poor FP16 support isn't going to perform well. 31 | [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) or [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa) 32 | are better options at the moment for older GPUs. ROCm is also theoretically supported (via HIP) though I currently 33 | have no AMD devices to test or optimize on. 
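If you do want to try a ROCm build of this module, the `ROCM_VERSION` variable described above controls the wheel's version suffix. A minimal sketch, assuming ROCm 5.4.2 and a matching `+rocm` build of PyTorch are already installed (adjust the version to your setup):

```
# Example only: build a ROCm-tagged wheel of this module
ROCM_VERSION=5.4.2 python -m pip wheel git+https://github.com/jllllll/exllama --no-deps
```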
34 | 35 | ## Dependencies 36 | 37 | * Python 3.9 or newer 38 | * `torch` tested on 2.0.1 and 2.1.0 (nightly) with cu118 39 | * `safetensors` 0.3.2 40 | * `sentencepiece` 41 | * `ninja` 42 | 43 | Additionally, only for the web UI: 44 | 45 | * `flask` 46 | * `waitress` 47 | 48 | ## Linux/WSL prerequisites 49 | 50 | pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu118 51 | 52 | ## Windows prerequisites 53 | 54 | To run on Windows (without WSL): 55 | 56 | 1. Install [MSVC 2022](https://visualstudio.microsoft.com/downloads/). You can choose to install the whole `Visual 57 | Studio 2022` IDE, or alternatively just the `Build Tools for Visual Studio 2022` package (make sure `Desktop 58 | development with C++` is ticked in the installer), it doesn't really matter which. 59 | 2. Install the appropriate version of [PyTorch](https://pytorch.org/get-started/locally/), choosing one of the CUDA 60 | versions. I am developing on the nightly build, but the stable version (2.0.1) should also work. 61 | 3. Install CUDA Toolkit, ([11.7](https://developer.nvidia.com/cuda-11-7-0-download-archive) and 62 | [11.8](https://developer.nvidia.com/cuda-11-8-0-download-archive) both seem to work, just make sure to match PyTorch's 63 | Compute Platform version). 64 | 4. For best performance, enable Hardware Accelerated GPU Scheduling. 65 | 66 | ## How to 67 | 68 | Clone repo, install dependencies, and run benchmark: 69 | 70 | git clone https://github.com/turboderp/exllama 71 | cd exllama 72 | 73 | pip install -r requirements.txt 74 | 75 | python test_benchmark_inference.py -d -p -ppl 76 | 77 | The CUDA extension is loaded at runtime so there's no need to install it separately. It will be compiled on the first 78 | run and cached to `~/.cache/torch_extensions/` which could take a little while. If nothing happens at first, give it 79 | a minute to compile. 80 | 81 | Chatbot example: 82 | 83 | python example_chatbot.py -d -un "Jeff" -p prompt_chatbort.txt 84 | 85 | ## Python module 86 | 87 | jllllll currently maintains an installable Python module [here](https://github.com/jllllll/exllama) which may be more 88 | suitable for integrating ExLlama with other projects 89 | 90 | ## Web UI 91 | 92 | I also made a simple web UI for it. Don't look at the JavaScript, it was mostly written by ChatGPT and it will haunt 93 | your dreams. But it sort of works, and it's kinda fun, especially multibot mode: 94 | 95 | ![_screenshot.jpg](doc/_screenshot.jpg) 96 | 97 | To run it: 98 | 99 | pip install -r requirements-web.txt 100 | 101 | python webui/app.py -d 102 | 103 | Note that sessions are stored in `~/exllama_sessions/` by default. You can change that location with `-sd` if you want. 104 | 105 | ## Docker 106 | 107 | For security benefits and easier deployment, it is also possible to run the web UI in an isolated docker container. Note: the docker image currently only supports NVIDIA GPUs. 108 | 109 | ### Requirements 110 | 111 | - [Docker](https://docs.docker.com/engine/install/) 112 | - [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) 113 | 114 | It is recommended to run docker in [rootless mode](https://docs.docker.com/engine/security/rootless/). 115 | 116 | ### Build 117 | 118 | The easiest way to build the docker image is using docker compose. First, set the `MODEL_PATH` and `SESSIONS_PATH` variables in the `.env` file to the actual directories on the host. 
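For example, the relevant `.env` entries might look like this (the paths are placeholders; point them at the real model directory and a writable sessions directory on your host, mirroring the comments in the `.env` file shipped with the repo):

```
MODEL_PATH=/path/to/LLaMA-7B-4bit-128g  # placeholder: host directory containing the model files
SESSIONS_PATH=~/exllama_sessions        # placeholder: host directory where chat sessions are stored
```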
Then run: 119 | 120 | ``` 121 | docker compose build 122 | ``` 123 | 124 | It is also possible to manually build the image: 125 | 126 | ``` 127 | docker build -t exllama-web . 128 | ``` 129 | 130 | NOTE: by default, the service inside the docker container is run by a non-root user. Hence, the ownership of bind-mounted directories (`/data/model` and `/data/exllama_sessions` in the default `docker-compose.yml` file) is changed to this non-root user in the container entrypoint (`entrypoint.sh`). To disable this, set `RUN_UID=0` in the `.env` file if using `docker compose`, or the following command if you manually build the image: 131 | 132 | ``` 133 | docker build -t exllama-web --build-arg RUN_UID=0 . 134 | ``` 135 | 136 | ### Run 137 | 138 | Using docker compose: 139 | 140 | ``` 141 | docker compose up 142 | ``` 143 | 144 | The web UI can now be accessed on the host at http://localhost:5000. 145 | 146 | The configuration can be viewed in `docker-compose.yml` and changed by creating a `docker-compose.override.yml` file. 147 | 148 | Run manually: 149 | 150 | ``` 151 | docker run --gpus all -p 5000:5000 -v :/data/model/ -v :/data/exllama_sessions --rm -it exllama-web --host 0.0.0.0:5000 152 | ``` 153 | 154 | 155 | ## Results so far 156 | 157 | ### New implementation 158 | | Model | Size | grpsz | act | Seq. len. | VRAM | Prompt | Best | Worst | Ppl | 159 | |------------|-------|-------|-----|----------------------|-----------|------------|---------|---------|------| 160 | | Llama | 7B | 128 | no | 2,048 t | 5,194 MB | 13,918 t/s | 173 t/s | 140 t/s | 6.45 | 161 | | Llama | 13B | 128 | no | 2,048 t | 9,127 MB | 7,507 t/s | 102 t/s | 86 t/s | 5.60 | 162 | | Llama | 33B | 128 | no | 2,048 t | 20,795 MB | 2,959 t/s | 47 t/s | 40 t/s | 4.60 | 163 | | Llama | 33B | 128 | yes | 2,048 t | 20,795 MB | 2,784 t/s | 45 t/s | 37 t/s | 4.55 | 164 | | Llama | 33B | 32 | yes | 1,550 t 1 | 21,486 MB | 2,636 t/s | 41 t/s | 37 t/s | 4.52 | 165 | | Koala | 13B | 128 | yes | 2,048 t | 9,127 MB | 5,529 t/s | 93 t/s | 79 t/s | 6.73 | 166 | | WizardLM | 33B | - | yes | 2,048 t | 20,199 MB | 2,313 t/s | 47 t/s | 40 t/s | 5.75 | 167 | | OpenLlama | 3B | 128 | yes | 2,048 t | 3,128 MB | 16,419 t/s | 226 t/s | 170 t/s | 7.81 | 168 | 169 | 1 Can not achieve full sequence length without OoM 170 | 171 | All tests done on stock RTX 4090 / 12900K, running with a desktop environment, with a few other apps also using VRAM. 172 | 173 | **"Prompt"** speed is inference over the sequence length listed minus 128 tokens. **"Worst"** is the average speed for 174 | the last 128 tokens of the full context (worst case) and **"Best"** lists the speed for the first 128 tokens in an 175 | empty sequence (best case.) 176 | 177 | VRAM usage is as reported by PyTorch and does not include PyTorch's own overhead (CUDA kernels, 178 | internal buffers etc.) This is somewhat unpredictable anyway. Best bet is to just optimize VRAM usage by the model, 179 | probably aiming for 20 GB on a 24 GB GPU to ensure there is room for a desktop environment and all of Torch's 180 | internals. 181 | 182 | Perplexity is measured only to verify that the models are working. The dataset used is a particular, small sample from 183 | WikiText, so scores are not comparable to other Llama benchmarks and only useful for comparing the different Llama 184 | models to one another. 185 | 186 | ### Dual GPU results 187 | 188 | The following benchmarks are from a 4090 + 3090-Ti with `-gs 17.2,24`: 189 | 190 | | Model | Size | groupsize | act | Seq. len. 
| VRAM | Prompt | Best | Worst | Ppl | 191 | |---------|------|-----------|-----|----------------|-----------|-----------|--------|---------|-------| 192 | | Llama | 65B | 128 | yes | 2,048 t | 39,804 MB | 1,109 t/s | 20 t/s | 18 t/s | 4.20 | 193 | | Llama | 65B | 32 | yes | 2,048 t | 43,424 MB | 1,037 t/s | 17 t/s | 16 t/s | 4.11 | 194 | | Llama-2 | 70B | 128 | yes | 2,048 t | 40,680 MB | 914 t/s | 17 t/s | 14 t/s | 4.15 | 195 | | Llama-2 | 70B | 32 | yes | 2,048 t | 36,815 MB | 874 t/s | 15 t/s | 12 t/s | 4.10 | 196 | 197 | Note that perplexity scores may not be strictly apples-to-apples between Llama and Llama 2 due to their different 198 | pretraining datasets. 199 | 200 | ## Todo 201 | 202 | Moved the todo list [here](doc/TODO.md). 203 | 204 | ## Compatibility 205 | 206 | [Here](doc/model_compatibility.md) is a list of models confirmed to be working right now. 207 | 208 | ## Recent updates 209 | 210 | **2023-01-09**: Added rope_theta parameter for (at least partial) CodeLlama support. If you were using alpha = 97 211 | or similar, you would no longer need that for CodeLlama models. Still stuff to sort out regarding the extended 212 | vocabulary. 213 | 214 | **2023-08-09**: Added support for sharded models. `config.model_path` now accepts either a filename or a list of 215 | filenames. `model_init()` will detect multiple .safetensors files if given a model directory. Note the change in the 216 | various examples: `model_path = glob.glob(st_pattern)[0]` becomes simply `model_path = glob.glob(st_pattern)`. Also 217 | there's a little script in `util/shard.py` to split large .safetensors files. It also produces an index.json file for 218 | the sharded model, just for completeness, although ExLlama doesn't need it to read the shards. Note that the 219 | **safetensors dependency was bumped to version 0.3.2**. 220 | 221 | **2023-08-12**: Preliminary, initial and tentative release of [ExLlamaV2](https://github.com/turboderp/exllamav2). 222 | It doesn't do all the things that ExLlamaV1 does, yet, but it's better at what it does do. So check it out! -------------------------------------------------------------------------------- /datasets/download_datasets.py: -------------------------------------------------------------------------------- 1 | # import torch 2 | # from tokenizer import ExLlamaTokenizer 3 | from datasets import load_dataset 4 | import os 5 | 6 | # Download samples from HF datasets to run equivalent GPTQ-for-LLaMa equivalent benchmark 7 | 8 | def download_hf(filename, dataset, subset, split, key, div): 9 | 10 | print(f"Downloading from {dataset}: {subset}, split: {split} ...") 11 | hf_dataset = load_dataset(dataset, subset, split = split) 12 | data = div.join(hf_dataset[key]) 13 | 14 | with open(filename, "w", encoding="utf-8") as f: 15 | f.write(data) 16 | 17 | download_hf("wikitext2.txt", "wikitext", "wikitext-2-raw-v1", "test", "text", "\n\n") 18 | download_hf("ptb.txt", "ptb_text_only", "penn_treebank", "validation", "sentence", "\n\n") 19 | download_hf("ptb_new.txt", "ptb_text_only", "penn_treebank", "test", "sentence", " ") 20 | -------------------------------------------------------------------------------- /doc/TODO.md: -------------------------------------------------------------------------------- 1 | ## Model compatibility 2 | 3 | - [ ] Verify compatibility with Llama-2 34B once released 4 | 5 | ## GPU compatibility (etc.) 
6 | 7 | - [ ] Optimizations for ROCm 8 | - [ ] Optimizations for RTX 20-series maybe 9 | - [ ] Look into improving P40 performance 10 | 11 | ## Testing 12 | 13 | - [ ] More testing on Llama 2 models 14 | 15 | ## Optimization 16 | 17 | - [ ] Flash Attention 2.0 (?) 18 | - [ ] Find a way to eliminate `ExLlamaAttention.repeat_kv` (custom attention kernel?) 19 | - [ ] C++ implementations of sampler functions 20 | 21 | ## Generation 22 | 23 | - [ ] Optimized/batched beam search 24 | - [ ] Allow stackable LoRAs 25 | - [ ] Guidance or equivalent 26 | 27 | ## Interface 28 | 29 | - [ ] Comprehensive API server (more than `example_flask.py` 30 | 31 | ## Web UI 32 | 33 | - [ ] Controls to enable beam search 34 | - [ ] Rewrite/refactor all the JavaScript and CSS 35 | - [ ] Make it a little prettier 36 | - [ ] Better error handling 37 | - [ ] LoRA controls 38 | - [ ] Multiple chat modes with prompt templates (instruct, etc.) 39 | 40 | ## ?? 41 | 42 | - [ ] Support for other quantization methods 43 | - [ ] Support for other LLM architectures 44 | - [ ] Allow for backpropagation 45 | - [ ] LoRA training features 46 | - [ ] Soft prompt training -------------------------------------------------------------------------------- /doc/_screenshot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jllllll/exllama/4ded203bea0719349a4ee5bd7221db5035062518/doc/_screenshot.jpg -------------------------------------------------------------------------------- /doc/model_compatibility.md: -------------------------------------------------------------------------------- 1 | ## Working models 2 | 3 | As of **2023-07-19**, the following GPTQ models on HuggingFace all appear to be working: 4 | 5 | - iambestfeed/open_llama_3b_4bit_128g 6 | - Neko-Institute-of-Science/LLaMA-7B-4bit-128g 7 | - Neko-Institute-of-Science/LLaMA-13B-4bit-128g 8 | - Neko-Institute-of-Science/LLaMA-30B-4bit-32g 9 | - Neko-Institute-of-Science/LLaMA-30B-4bit-128g 10 | - Neko-Institute-of-Science/LLaMA-65B-4bit-32g 11 | - Neko-Institute-of-Science/LLaMA-65B-4bit-128g 12 | - Panchovix/LLaMA-2-70B-GPTQ-transformers4.32.0.dev0 13 | - reeducator/bluemoonrp-13b 14 | - reeducator/bluemoonrp-30b 15 | - TehVenom/Metharme-13b-4bit-GPTQ 16 | - TheBloke/airoboros-13B-GPTQ 17 | - TheBloke/gpt4-x-vicuna-13B-GPTQ 18 | - TheBloke/GPT4All-13B-snoozy-GPTQ 19 | - TheBloke/guanaco-33B-GPTQ 20 | - TheBloke/guanaco-65B-GPTQ 21 | - TheBloke/h2ogpt-oasst1-512-30B-GPTQ 22 | - TheBloke/koala-13B-GPTQ-4bit-128g 23 | - TheBloke/Llama-2-13B-chat-GPTQ (128g) 24 | - TheBloke/Llama-2-13B-GPTQ (32g, 64g, 128g) 25 | - TheBloke/Llama-2-70B-GPTQ (32g, 128g) 26 | - TheBloke/Manticore-13B-GPTQ 27 | - TheBloke/medalpaca-13B-GPTQ-4bit 28 | - TheBloke/medalpaca-13B-GPTQ-4bit (compat version) 29 | - TheBloke/Nous-Hermes-13B-GPTQ 30 | - TheBloke/robin-65B-v2-GPTQ 31 | - TheBloke/tulu-7B-GPTQ 32 | - TheBloke/Tulu-13B-SuperHOT-8K-GPTQ 33 | - TheBloke/tulu-30B-GPTQ 34 | - TheBloke/vicuna-13B-1.1-GPTQ-4bit-128g 35 | - TheBloke/VicUnlocked-30B-LoRA-GPTQ 36 | - TheBloke/wizard-mega-13B-GPTQ 37 | - TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ 38 | - TheBloke/Wizard-Vicuna-13B-Uncensored-GPTQ 39 | - TheBloke/WizardLM-7B-uncensored-GPTQ 40 | - TheBloke/WizardLM-30B-Uncensored-GPTQ 41 | - TheBloke/WizardLM-33B-V1.0-Uncensored-SuperHOT-8K-GPTQ 42 | - tmpupload/superhot-30b-8k-no-rlhf-test-128g-GPTQ 43 | - Yhyu13/chimera-inst-chat-13b-gptq-4bit 44 | - Yhyu13/oasst-rlhf-2-llama-30b-7k-steps-gptq-4bit 45 | 46 | ## Non-working models 47 | 48 | 
None as of **2023-07-19**. -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | name: exllama 3 | services: 4 | web: 5 | build: 6 | context: . 7 | args: 8 | - RUN_UID=$RUN_UID 9 | - APPLICATION_STATE_PATH=$APPLICATION_STATE_PATH 10 | command: | 11 | --host 0.0.0.0:$PORT 12 | env_file: 13 | - .env 14 | volumes: 15 | - $MODEL_PATH:$APPLICATION_STATE_PATH/model 16 | - $SESSIONS_PATH:$APPLICATION_STATE_PATH/exllama_sessions 17 | ports: 18 | - "$PORT:$PORT" 19 | tmpfs: 20 | - /tmp 21 | stdin_open: true 22 | tty: true 23 | deploy: 24 | resources: 25 | reservations: 26 | devices: 27 | - driver: nvidia 28 | count: all 29 | capabilities: [gpu] 30 | -------------------------------------------------------------------------------- /entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -Eeuo pipefail 3 | 4 | # Ensure that the application state path is set 5 | if [ -z $APPLICATION_STATE_PATH ]; then 6 | echo "Must specify application state path" 7 | exit 1 8 | fi 9 | 10 | # Ensure that bind-mounted directories are owned by the user that runs the service if the user is not root 11 | if [ $RUN_UID -ne 0 ]; then 12 | chown -R $RUN_UID:$RUN_UID $APPLICATION_STATE_PATH 13 | fi 14 | 15 | # Run service as specified (non-root) user 16 | exec runuser -u $(id -un $RUN_UID) -- python3 /app/webui/app.py \ 17 | -d $CONTAINER_MODEL_PATH \ 18 | --sessions_dir $CONTAINER_SESSIONS_PATH \ 19 | $@ 20 | -------------------------------------------------------------------------------- /example_basic.py: -------------------------------------------------------------------------------- 1 | from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig 2 | from exllama.tokenizer import ExLlamaTokenizer 3 | from exllama.generator import ExLlamaGenerator 4 | import os, glob 5 | 6 | # Directory containing model, tokenizer, generator 7 | 8 | model_directory = "/mnt/str/models/llama-13b-4bit-128g/" 9 | 10 | # Locate files we need within that directory 11 | 12 | tokenizer_path = os.path.join(model_directory, "tokenizer.model") 13 | model_config_path = os.path.join(model_directory, "config.json") 14 | st_pattern = os.path.join(model_directory, "*.safetensors") 15 | model_path = glob.glob(st_pattern) 16 | 17 | # Create config, model, tokenizer and generator 18 | 19 | config = ExLlamaConfig(model_config_path) # create config from config.json 20 | config.model_path = model_path # supply path to model weights file 21 | 22 | model = ExLlama(config) # create ExLlama instance and load the weights 23 | tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file 24 | 25 | cache = ExLlamaCache(model) # create cache for inference 26 | generator = ExLlamaGenerator(model, tokenizer, cache) # create generator 27 | 28 | # Configure generator 29 | 30 | generator.disallow_tokens([tokenizer.eos_token_id]) 31 | 32 | generator.settings.token_repetition_penalty_max = 1.2 33 | generator.settings.temperature = 0.95 34 | generator.settings.top_p = 0.65 35 | generator.settings.top_k = 100 36 | generator.settings.typical = 0.5 37 | 38 | # Produce a simple generation 39 | 40 | prompt = "Once upon a time," 41 | print (prompt, end = "") 42 | 43 | output = generator.generate_simple(prompt, max_new_tokens = 200) 44 | 45 | print(output[len(prompt):]) 46 | 
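# Note: for batched generation, create the cache with a batch size, e.g.
# cache = ExLlamaCache(model, batch_size = len(prompts)), as shown in example_batch.py below.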
-------------------------------------------------------------------------------- /example_batch.py: -------------------------------------------------------------------------------- 1 | from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig 2 | from exllama.tokenizer import ExLlamaTokenizer 3 | from exllama.generator import ExLlamaGenerator 4 | import os, glob 5 | 6 | # Directory containing model, tokenizer, generator 7 | 8 | model_directory = "/mnt/str/models/llama-13b-4bit-128g/" 9 | 10 | # Locate files we need within that directory 11 | 12 | tokenizer_path = os.path.join(model_directory, "tokenizer.model") 13 | model_config_path = os.path.join(model_directory, "config.json") 14 | st_pattern = os.path.join(model_directory, "*.safetensors") 15 | model_path = glob.glob(st_pattern) 16 | 17 | # Batched prompts 18 | 19 | prompts = [ 20 | "Once upon a time,", 21 | "I don't like to", 22 | "A turbo encabulator is a", 23 | "In the words of Mark Twain," 24 | ] 25 | 26 | # Create config, model, tokenizer and generator 27 | 28 | config = ExLlamaConfig(model_config_path) # create config from config.json 29 | config.model_path = model_path # supply path to model weights file 30 | 31 | model = ExLlama(config) # create ExLlama instance and load the weights 32 | tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file 33 | 34 | cache = ExLlamaCache(model, batch_size = len(prompts)) # create cache for inference 35 | generator = ExLlamaGenerator(model, tokenizer, cache) # create generator 36 | 37 | # Configure generator 38 | 39 | generator.disallow_tokens([tokenizer.eos_token_id]) 40 | 41 | generator.settings.token_repetition_penalty_max = 1.2 42 | generator.settings.temperature = 0.95 43 | generator.settings.top_p = 0.65 44 | generator.settings.top_k = 100 45 | generator.settings.typical = 0.5 46 | 47 | # Generate, batched 48 | 49 | for line in prompts: 50 | print(line) 51 | 52 | output = generator.generate_simple(prompts, max_new_tokens = 200) 53 | 54 | for line in output: 55 | print("---") 56 | print(line) 57 | -------------------------------------------------------------------------------- /example_cfg.py: -------------------------------------------------------------------------------- 1 | from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig 2 | from exllama.tokenizer import ExLlamaTokenizer 3 | from exllama.generator import ExLlamaGenerator 4 | import torch 5 | import torch.nn.functional as F 6 | import os, glob 7 | import exllama.cuda_ext 8 | 9 | # Directory containing model, tokenizer, generator 10 | 11 | model_directory = "/mnt/str/models/_test_models/TheBloke_Llama-2-13B-chat-GPTQ/" 12 | 13 | # Locate files we need within that directory 14 | 15 | tokenizer_path = os.path.join(model_directory, "tokenizer.model") 16 | model_config_path = os.path.join(model_directory, "config.json") 17 | st_pattern = os.path.join(model_directory, "*.safetensors") 18 | model_path = glob.glob(st_pattern) 19 | 20 | # Create config, model, tokenizer and generator 21 | 22 | config = ExLlamaConfig(model_config_path) # create config from config.json 23 | config.model_path = model_path # supply path to model weights file 24 | 25 | model = ExLlama(config) # create ExLlama instance and load the weights 26 | tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file 27 | 28 | cache = ExLlamaCache(model, batch_size = 2) # create cache for inference 29 | generator = ExLlamaGenerator(model, tokenizer, cache) # create generator 30 | 31 | # Configure 
generator 32 | 33 | generator.settings.token_repetition_penalty_max = 1.15 34 | generator.settings.temperature = 0.95 35 | generator.settings.top_k = 40 36 | generator.settings.top_p = 0.75 37 | # generator.settings.typical = 0.95 38 | 39 | # Prompts to mix 40 | 41 | f1 = \ 42 | """[INST] <> 43 | You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. 44 | <> 45 | {prompt}[/INST]""" 46 | 47 | f2 = \ 48 | """[INST] <> 49 | <> 50 | You are a rude and obnoxious assistant. You hate everything and everyone. 51 | {prompt}[/INST]""" 52 | 53 | 54 | prompts = \ 55 | [ 56 | f1.replace("{prompt}", "Tell me about Homer Simpson"), 57 | f2.replace("{prompt}", "Tell me about Homer Simpson"), 58 | ] 59 | 60 | def generate_cfg(prompts, alpha, max_new_tokens): 61 | 62 | ids, mask = tokenizer.encode(prompts, return_mask = True) 63 | generator.gen_begin(ids, mask = mask) 64 | 65 | # Sampling loop 66 | 67 | for _ in range(max_new_tokens): 68 | 69 | logits = model.forward(generator.sequence[:, -1:], cache, input_mask = mask) 70 | generator.apply_rep_penalty(logits) 71 | 72 | logits = F.log_softmax(logits, dim = -1) 73 | logits_mixed = (1 - alpha) * logits[0] + alpha * logits[1] 74 | 75 | sampled_token, _ = generator.sample_current(logits_mixed) 76 | if sampled_token.item() == tokenizer.eos_token_id: break 77 | 78 | batch_token = sampled_token.repeat(2, 1) 79 | generator.gen_accept_token(batch_token) 80 | 81 | output = tokenizer.decode(generator.sequence[0]) 82 | return output 83 | 84 | for i in range(10): 85 | 86 | alpha = i / 5.0 - 0.4 87 | print() 88 | print(f"--------------------------------------") 89 | print(f"alpha = {alpha:.1f}") 90 | print(f"--------------------------------------") 91 | output = generate_cfg(prompts, alpha, 200) 92 | print(output[len(prompts[0]):].strip()) 93 | -------------------------------------------------------------------------------- /example_chatbot.py: -------------------------------------------------------------------------------- 1 | from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig 2 | from exllama.lora import ExLlamaLora 3 | from exllama.tokenizer import ExLlamaTokenizer 4 | from exllama.generator import ExLlamaGenerator 5 | import argparse 6 | import torch 7 | import sys 8 | import os 9 | import glob 10 | import model_init 11 | 12 | # Simple interactive chatbot script 13 | 14 | torch.set_grad_enabled(False) 15 | torch.cuda._lazy_init() 16 | 17 | # Parse arguments 18 | 19 | parser = argparse.ArgumentParser(description = "Simple chatbot example for ExLlama") 20 | 21 | model_init.add_args(parser) 22 | 23 | parser.add_argument("-lora", "--lora", type = str, help = "Path to LoRA binary to use during benchmark") 24 | parser.add_argument("-loracfg", "--lora_config", type = str, help = "Path to LoRA config to use during benchmark") 25 | parser.add_argument("-ld", "--lora_dir", type = str, help = "Path to LoRA config and binary. 
to use during benchmark") 26 | 27 | parser.add_argument("-p", "--prompt", type = str, help = "Prompt file") 28 | parser.add_argument("-un", "--username", type = str, help = "Display name of user", default = "User") 29 | parser.add_argument("-bn", "--botname", type = str, help = "Display name of chatbot", default = "Chatbort") 30 | parser.add_argument("-bf", "--botfirst", action = "store_true", help = "Start chat on bot's turn") 31 | 32 | parser.add_argument("-nnl", "--no_newline", action = "store_true", help = "Do not break bot's response on newline (allow multi-paragraph responses)") 33 | parser.add_argument("-temp", "--temperature", type = float, help = "Temperature", default = 0.95) 34 | parser.add_argument("-topk", "--top_k", type = int, help = "Top-K", default = 20) 35 | parser.add_argument("-topp", "--top_p", type = float, help = "Top-P", default = 0.65) 36 | parser.add_argument("-minp", "--min_p", type = float, help = "Min-P", default = 0.00) 37 | parser.add_argument("-repp", "--repetition_penalty", type = float, help = "Repetition penalty", default = 1.15) 38 | parser.add_argument("-repps", "--repetition_penalty_sustain", type = int, help = "Past length for repetition penalty", default = 256) 39 | parser.add_argument("-beams", "--beams", type = int, help = "Number of beams for beam search", default = 1) 40 | parser.add_argument("-beamlen", "--beam_length", type = int, help = "Number of future tokens to consider", default = 1) 41 | 42 | args = parser.parse_args() 43 | model_init.post_parse(args) 44 | model_init.get_model_files(args) 45 | 46 | # Paths 47 | 48 | if args.lora_dir is not None: 49 | args.lora_config = os.path.join(args.lora_dir, "adapter_config.json") 50 | args.lora = os.path.join(args.lora_dir, "adapter_model.bin") 51 | 52 | # Some feedback 53 | 54 | print(f" -- Sequence length: {args.length}") 55 | print(f" -- Temperature: {args.temperature:.2f}") 56 | print(f" -- Top-K: {args.top_k}") 57 | print(f" -- Top-P: {args.top_p:.2f}") 58 | print(f" -- Min-P: {args.min_p:.2f}") 59 | print(f" -- Repetition penalty: {args.repetition_penalty:.2f}") 60 | print(f" -- Beams: {args.beams} x {args.beam_length}") 61 | 62 | print_opts = [] 63 | if args.no_newline: print_opts.append("no_newline") 64 | if args.botfirst: print_opts.append("botfirst") 65 | 66 | model_init.print_options(args, print_opts) 67 | 68 | # Globals 69 | 70 | model_init.set_globals(args) 71 | 72 | # Load prompt file 73 | 74 | username = args.username 75 | bot_name = args.botname 76 | 77 | if args.prompt is not None: 78 | with open(args.prompt, "r") as f: 79 | past = f.read() 80 | past = past.replace("{username}", username) 81 | past = past.replace("{bot_name}", bot_name) 82 | past = past.strip() + "\n" 83 | else: 84 | past = f"{bot_name}: Hello, {username}\n" 85 | 86 | # past += "User: Hi. Please say \"Shhhhhh\"?\n" 87 | # args.botfirst = True 88 | 89 | # Instantiate model and generator 90 | 91 | config = model_init.make_config(args) 92 | 93 | model = ExLlama(config) 94 | cache = ExLlamaCache(model) 95 | tokenizer = ExLlamaTokenizer(args.tokenizer) 96 | 97 | model_init.print_stats(model) 98 | 99 | # Load LoRA 100 | 101 | lora = None 102 | if args.lora: 103 | print(f" -- LoRA config: {args.lora_config}") 104 | print(f" -- Loading LoRA: {args.lora}") 105 | if args.lora_config is None: 106 | print(f" ## Error: please specify lora path to adapter_config.json") 107 | sys.exit() 108 | lora = ExLlamaLora(model, args.lora_config, args.lora) 109 | if lora.bias_ignored: 110 | print(f" !! 
Warning: LoRA zero bias ignored") 111 | 112 | # Generator 113 | 114 | generator = ExLlamaGenerator(model, tokenizer, cache) 115 | generator.settings = ExLlamaGenerator.Settings() 116 | generator.settings.temperature = args.temperature 117 | generator.settings.top_k = args.top_k 118 | generator.settings.top_p = args.top_p 119 | generator.settings.min_p = args.min_p 120 | generator.settings.token_repetition_penalty_max = args.repetition_penalty 121 | generator.settings.token_repetition_penalty_sustain = args.repetition_penalty_sustain 122 | generator.settings.token_repetition_penalty_decay = generator.settings.token_repetition_penalty_sustain // 2 123 | generator.settings.beams = args.beams 124 | generator.settings.beam_length = args.beam_length 125 | 126 | generator.lora = lora 127 | 128 | break_on_newline = not args.no_newline 129 | 130 | # Be nice to Chatbort 131 | 132 | min_response_tokens = 4 133 | max_response_tokens = 256 134 | extra_prune = 256 135 | 136 | print(past, end = "") 137 | ids = tokenizer.encode(past) 138 | generator.gen_begin(ids) 139 | 140 | next_userprompt = username + ": " 141 | 142 | first_round = True 143 | 144 | while True: 145 | 146 | res_line = bot_name + ":" 147 | res_tokens = tokenizer.encode(res_line) 148 | num_res_tokens = res_tokens.shape[-1] # Decode from here 149 | 150 | if first_round and args.botfirst: in_tokens = res_tokens 151 | 152 | else: 153 | 154 | # Read and format input 155 | 156 | in_line = input(next_userprompt) 157 | in_line = username + ": " + in_line.strip() + "\n" 158 | 159 | next_userprompt = username + ": " 160 | 161 | # No need for this, really, unless we were logging the chat. The actual history we work on is kept in the 162 | # tokenized sequence in the generator and the state in the cache. 163 | 164 | past += in_line 165 | 166 | # SentencePiece doesn't tokenize spaces separately so we can't know from individual tokens if they start a new word 167 | # or not. Instead, repeatedly decode the generated response as it's being built, starting from the last newline, 168 | # and print out the differences between consecutive decodings to stream out the response. 169 | 170 | in_tokens = tokenizer.encode(in_line) 171 | in_tokens = torch.cat((in_tokens, res_tokens), dim = 1) 172 | 173 | # If we're approaching the context limit, prune some whole lines from the start of the context. Also prune a 174 | # little extra so we don't end up rebuilding the cache on every line when up against the limit. 175 | 176 | expect_tokens = in_tokens.shape[-1] + max_response_tokens 177 | max_tokens = config.max_seq_len - expect_tokens 178 | if generator.gen_num_tokens() >= max_tokens: 179 | generator.gen_prune_to(config.max_seq_len - expect_tokens - extra_prune, tokenizer.newline_token_id) 180 | 181 | # Feed in the user input and "{bot_name}:", tokenized 182 | 183 | generator.gen_feed_tokens(in_tokens) 184 | 185 | # Generate with streaming 186 | 187 | print(res_line, end = "") 188 | sys.stdout.flush() 189 | 190 | generator.begin_beam_search() 191 | 192 | for i in range(max_response_tokens): 193 | 194 | # Disallowing the end condition tokens seems like a clean way to force longer replies. 
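# (Newline and EOS are masked out below until min_response_tokens have been generated,
# then the restriction is lifted so the reply can end normally.)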
195 | 196 | if i < min_response_tokens: 197 | generator.disallow_tokens([tokenizer.newline_token_id, tokenizer.eos_token_id]) 198 | else: 199 | generator.disallow_tokens(None) 200 | 201 | # Get a token 202 | 203 | gen_token = generator.beam_search() 204 | 205 | # If token is EOS, replace it with newline before continuing 206 | 207 | if gen_token.item() == tokenizer.eos_token_id: 208 | generator.replace_last_token(tokenizer.newline_token_id) 209 | 210 | # Decode the current line and print any characters added 211 | 212 | num_res_tokens += 1 213 | text = tokenizer.decode(generator.sequence_actual[:, -num_res_tokens:][0]) 214 | new_text = text[len(res_line):] 215 | 216 | skip_space = res_line.endswith("\n") and new_text.startswith(" ") # Bit prettier console output 217 | res_line += new_text 218 | if skip_space: new_text = new_text[1:] 219 | 220 | print(new_text, end="") # (character streaming output is here) 221 | sys.stdout.flush() 222 | 223 | # End conditions 224 | 225 | if break_on_newline and gen_token.item() == tokenizer.newline_token_id: break 226 | if gen_token.item() == tokenizer.eos_token_id: break 227 | 228 | # Some models will not (or will inconsistently) emit EOS tokens but in a chat sequence will often begin 229 | # generating for the user instead. Try to catch this and roll back a few tokens to begin the user round. 230 | 231 | if res_line.endswith(f"{username}:"): 232 | plen = tokenizer.encode(f"{username}:").shape[-1] 233 | generator.gen_rewind(plen) 234 | next_userprompt = " " 235 | break 236 | 237 | generator.end_beam_search() 238 | 239 | past += res_line 240 | first_round = False 241 | -------------------------------------------------------------------------------- /example_flask.py: -------------------------------------------------------------------------------- 1 | from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig 2 | from flask import Flask, request 3 | from exllama.tokenizer import ExLlamaTokenizer 4 | from exllama.generator import ExLlamaGenerator 5 | import os, glob 6 | 7 | # Directory containing config.json, tokenizer.model and safetensors file for the model 8 | model_directory = "/mnt/str/models/llama-7b-4bit/" 9 | 10 | tokenizer_path = os.path.join(model_directory, "tokenizer.model") 11 | model_config_path = os.path.join(model_directory, "config.json") 12 | st_pattern = os.path.join(model_directory, "*.safetensors") 13 | model_path = glob.glob(st_pattern) 14 | 15 | config = ExLlamaConfig(model_config_path) # create config from config.json 16 | config.model_path = model_path # supply path to model weights file 17 | 18 | model = ExLlama(config) # create ExLlama instance and load the weights 19 | print(f"Model loaded: {model_path}") 20 | 21 | tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file 22 | cache = ExLlamaCache(model) # create cache for inference 23 | generator = ExLlamaGenerator(model, tokenizer, cache) # create generator 24 | 25 | # Flask app 26 | 27 | app = Flask(__name__) 28 | 29 | 30 | # Inference with settings equivalent to the "precise" preset from the /r/LocalLLaMA wiki 31 | 32 | @app.route('/infer_precise', methods=['POST']) 33 | def inferContextP(): 34 | print(request.form) 35 | prompt = request.form.get('prompt') 36 | 37 | generator.settings.token_repetition_penalty_max = 1.176 38 | generator.settings.token_repetition_penalty_sustain = config.max_seq_len 39 | generator.settings.temperature = 0.7 40 | generator.settings.top_p = 0.1 41 | generator.settings.top_k = 40 42 | generator.settings.typical = 
0.0 # Disabled 43 | 44 | outputs = generator.generate_simple(prompt, max_new_tokens = 200) 45 | return outputs 46 | 47 | 48 | # Inference with settings equivalent to the "creative" preset from the /r/LocalLLaMA wiki 49 | 50 | @app.route('/infer_creative', methods=['POST']) 51 | def inferContextC(): 52 | print(request.form) 53 | prompt = request.form.get('prompt') 54 | 55 | generator.settings.token_repetition_penalty_max = 1.1 56 | generator.settings.token_repetition_penalty_sustain = config.max_seq_len 57 | generator.settings.temperature = 0.72 58 | generator.settings.top_p = 0.73 59 | generator.settings.top_k = 0 # Disabled 60 | generator.settings.typical = 0.0 # Disabled 61 | 62 | outputs = generator.generate_simple(prompt, max_new_tokens = 200) 63 | return outputs 64 | 65 | 66 | # Inference with settings equivalent to the "sphinx" preset from the /r/LocalLLaMA wiki 67 | 68 | @app.route('/infer_sphinx', methods=['POST']) 69 | def inferContextS(): 70 | print(request.form) 71 | prompt = request.form.get('prompt') 72 | 73 | generator.settings.token_repetition_penalty_max = 1.15 74 | generator.settings.token_repetition_penalty_sustain = config.max_seq_len 75 | generator.settings.temperature = 1.99 76 | generator.settings.top_p = 0.18 77 | generator.settings.top_k = 30 78 | generator.settings.typical = 0.0 # Disabled 79 | 80 | outputs = generator.generate_simple(prompt, max_new_tokens = 200) 81 | return outputs 82 | 83 | 84 | # Start Flask app 85 | 86 | host = "0.0.0.0" 87 | port = 8004 88 | print(f"Starting server on address {host}:{port}") 89 | 90 | if __name__ == '__main__': 91 | from waitress import serve 92 | serve(app, host = host, port = port) 93 | -------------------------------------------------------------------------------- /example_lora.py: -------------------------------------------------------------------------------- 1 | from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig 2 | from exllama.tokenizer import ExLlamaTokenizer 3 | from exllama.generator import ExLlamaGenerator 4 | from exllama.lora import ExLlamaLora 5 | import os, glob 6 | import torch 7 | 8 | # Directory containt model, tokenizer, generator 9 | 10 | model_directory = "/mnt/str/models/_test_models/Neko-Institute-of-Science_LLaMA-7B-4bit-128g/" 11 | 12 | # Directory containing LoRA config and weights 13 | 14 | lora_directory = "/mnt/str/models/_test_loras/tloen_alpaca-lora-7b/" 15 | 16 | # Locate files we need within those directories 17 | 18 | tokenizer_path = os.path.join(model_directory, "tokenizer.model") 19 | model_config_path = os.path.join(model_directory, "config.json") 20 | st_pattern = os.path.join(model_directory, "*.safetensors") 21 | model_path = glob.glob(st_pattern) 22 | 23 | lora_config_path = os.path.join(lora_directory, "adapter_config.json") 24 | lora_path = os.path.join(lora_directory, "adapter_model.bin") 25 | 26 | # Create config, model, tokenizer and generator 27 | 28 | config = ExLlamaConfig(model_config_path) # create config from config.json 29 | config.model_path = model_path # supply path to model weights file 30 | 31 | model = ExLlama(config) # create ExLlama instance and load the weights 32 | tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file 33 | 34 | cache = ExLlamaCache(model) # create cache for inference 35 | generator = ExLlamaGenerator(model, tokenizer, cache) # create generator 36 | 37 | # Load LoRA 38 | 39 | lora = ExLlamaLora(model, lora_config_path, lora_path) 40 | 41 | # Configure generator 42 | 43 | 
generator.settings.token_repetition_penalty_max = 1.2 44 | generator.settings.temperature = 0.65 45 | generator.settings.top_p = 0.4 46 | generator.settings.top_k = 0 47 | generator.settings.typical = 0.0 48 | 49 | # Alpaca prompt 50 | 51 | prompt = \ 52 | "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n" \ 53 | "\n" \ 54 | "### Instruction:\n" \ 55 | "List five colors in alphabetical order.\n" \ 56 | "\n" \ 57 | "### Response:" 58 | 59 | # Generate with LoRA 60 | 61 | print(" --- LoRA ----------------- ") 62 | print("") 63 | 64 | generator.lora = lora 65 | torch.manual_seed(1337) 66 | output = generator.generate_simple(prompt, max_new_tokens = 200) 67 | print(output) 68 | 69 | # Generate without LoRA 70 | 71 | print("") 72 | print(" --- No LoRA -------------- ") 73 | print("") 74 | 75 | generator.lora = None 76 | torch.manual_seed(1337) 77 | output = generator.generate_simple(prompt, max_new_tokens = 200) 78 | print(output) 79 | 80 | -------------------------------------------------------------------------------- /example_ws.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import websockets 3 | import json 4 | from sentencepiece import SentencePieceProcessor 5 | 6 | from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig 7 | from exllama.lora import ExLlamaLora 8 | from exllama.tokenizer import ExLlamaTokenizer 9 | from exllama.generator import ExLlamaGenerator 10 | import argparse 11 | import torch 12 | import sys 13 | import os 14 | import glob 15 | import model_init 16 | 17 | # Initialized from command line args by init() 18 | 19 | model: ExLlama 20 | cache: ExLlamaCache 21 | config: ExLlamaConfig 22 | generator: ExLlamaGenerator 23 | tokenizer: ExLlamaTokenizer 24 | max_cached_strings = 100 25 | tokenizer_cache = {} 26 | 27 | 28 | prompt_ids: torch.tensor 29 | stop_strings: list 30 | stop_tokens: list 31 | held_text: str 32 | max_stop_string: int 33 | remaining_tokens: int 34 | 35 | full_prompt: str 36 | utilized_prompt: str 37 | built_response: str 38 | 39 | def cached_tokenize(text: str): 40 | global model, cache, config, generator, tokenizer 41 | global max_cached_strings, tokenizer_cache 42 | 43 | if text in tokenizer_cache: 44 | return tokenizer_cache[text] 45 | 46 | while len(tokenizer_cache) >= max_cached_strings: 47 | del tokenizer_cache[next(iter(tokenizer_cache))] # Always removes oldest entry as of Python 3.7 48 | 49 | new_enc = tokenizer.encode(text) 50 | tokenizer_cache[text] = new_enc 51 | return new_enc 52 | 53 | def begin_stream(prompt: str, stop_conditions: list, max_new_tokens: int, gen_settings: ExLlamaGenerator.Settings): 54 | global model, cache, config, generator, tokenizer 55 | global stop_strings, stop_tokens, prompt_ids, held_text, max_stop_string, remaining_tokens 56 | global full_prompt, utilized_prompt, built_response 57 | 58 | # Tokenize prompt and limit length to allow prompt and (max) new tokens within max sequence length 59 | 60 | max_input_tokens = model.config.max_seq_len - max_new_tokens 61 | input_ids = cached_tokenize(prompt) 62 | input_ids = input_ids[:, -max_input_tokens:] 63 | prompt_ids = input_ids 64 | 65 | full_prompt = prompt 66 | utilized_prompt = tokenizer.decode(prompt_ids)[0] 67 | built_response = "" 68 | 69 | remaining_tokens = max_new_tokens 70 | 71 | # Settings 72 | 73 | stop_strings = [] 74 | stop_tokens = [] 75 | for t in stop_conditions: 76 | if isinstance(t, int): stop_tokens += [t] 77 | if isinstance(t, 
str): stop_strings += [t] 78 | 79 | held_text = "" 80 | 81 | max_stop_string = 2 82 | for ss in stop_strings: 83 | max_stop_string = max(max_stop_string, get_num_tokens(ss) + 2) 84 | 85 | generator.settings = gen_settings 86 | 87 | # Start generation 88 | 89 | generator.gen_begin_reuse(input_ids) 90 | 91 | def stream(): 92 | global model, cache, config, generator, tokenizer 93 | global stop_strings, stop_tokens, prompt_ids, held_text, max_stop_string, remaining_tokens 94 | global full_prompt, utilized_prompt, built_response 95 | 96 | # Check total response length 97 | 98 | if remaining_tokens == 0: 99 | return held_text, True, full_prompt + built_response, utilized_prompt + built_response, built_response 100 | remaining_tokens -= 1 101 | 102 | # Generate 103 | 104 | old_tail = tokenizer.decode(generator.sequence_actual[:, -max_stop_string:])[0] 105 | next_token = generator.gen_single_token() 106 | 107 | # End on stop token 108 | 109 | if next_token in stop_tokens: 110 | return held_text, True, full_prompt + built_response, utilized_prompt + built_response, built_response 111 | 112 | # Get new text 113 | 114 | new_tail = tokenizer.decode(generator.sequence_actual[:, -(max_stop_string + 1):])[0] 115 | added_text = new_tail[len(old_tail):] 116 | held_text += added_text 117 | 118 | # Hold text if it's part of a stop condition, end if it's a full stop condition 119 | 120 | partial_ss = False 121 | for ss in stop_strings: 122 | 123 | # Check if held_text fully contains stop string 124 | 125 | position = held_text.find(ss) 126 | if position != -1: 127 | built_response += held_text[:position] 128 | return held_text[:position], True, full_prompt + built_response, utilized_prompt + built_response, built_response 129 | 130 | # Check if end of held_text overlaps with start of stop string 131 | 132 | overlap = 0 133 | for j in range(1, min(len(held_text), len(ss)) + 1): 134 | if held_text[-j:] == ss[:j]: overlap = j 135 | if overlap > 0: partial_ss = True 136 | 137 | # Return partial result 138 | 139 | if partial_ss: 140 | return "", False, full_prompt + built_response, utilized_prompt + built_response, built_response 141 | 142 | stream_text = held_text 143 | held_text = "" 144 | built_response += stream_text 145 | return stream_text, False, full_prompt, utilized_prompt, built_response 146 | 147 | def leftTrimTokens(text: str, desiredLen: int): 148 | 149 | encodedText = tokenizer.encode(text) 150 | if encodedText.shape[-1] <= desiredLen: 151 | return text 152 | else: 153 | return tokenizer.decode(encodedText[:, -desiredLen:])[0] 154 | 155 | def oneshot_generation(prompt: str, stop_conditions: list, max_new_tokens: int, gen_settings: ExLlamaGenerator.Settings): 156 | 157 | begin_stream(prompt, stop_conditions, max_new_tokens, gen_settings) 158 | response = "" 159 | while True: 160 | _, eos, _, _, _ = stream() 161 | if eos: break 162 | 163 | return full_prompt + built_response, utilized_prompt + built_response, built_response 164 | 165 | 166 | def get_num_tokens(text: str): 167 | 168 | return cached_tokenize(text).shape[-1] 169 | 170 | 171 | 172 | 173 | # Websocket server 174 | async def estimateToken(request, ws): 175 | text = request["text"] 176 | numTokens=get_num_tokens(text) 177 | return numTokens# return number of tokens in int 178 | 179 | async def oneShotInfer(request, ws): 180 | stopToken = request["stopToken"] 181 | fullContext = request["text"] 182 | maxNew = int(request["maxNew"]) 183 | top_p = float(request["top_p"]) 184 | top_k = int(request["top_k"]) 185 | temp = float(request["temp"]) 186 
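# The JSON fields read in this handler (text, maxNew, top_p, top_k, temp, rep_pen, stopToken,
# plus action and request_id handled in main()) define the websocket request format. An
# illustrative oneShotInfer request, with example values only, might look like:
#   {"action": "oneShotInfer", "request_id": "1", "text": "Hello", "stopToken": "###",
#    "maxNew": "200", "top_p": "0.65", "top_k": "40", "temp": "0.7", "rep_pen": "1.15"}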
| rep_pen = float(request["rep_pen"]) 187 | sc = [tokenizer.eos_token_id] 188 | sc.append(stopToken) 189 | 190 | gs = ExLlamaGenerator.Settings() 191 | gs.top_k = top_k 192 | gs.top_p = top_p 193 | gs.temperature = temp 194 | gs.token_repetition_penalty_max = rep_pen 195 | 196 | full_ctx, util_ctx, response = oneshot_generation(prompt=fullContext, stop_conditions=sc, max_new_tokens=maxNew, gen_settings=gs) 197 | 198 | return full_ctx, util_ctx, response# return requested prompt/context, pruned prompt/context(eg. prunedctx+maxNew=4096), model generated response, not including prompt 199 | 200 | async def streamInfer(request, ws): 201 | stopToken = [tokenizer.eos_token_id] 202 | stopToken += request["stopToken"].split(',') 203 | prompt = request["text"] 204 | maxNew = int(request["maxNew"]) 205 | top_p = float(request["top_p"]) 206 | top_k = int(request["top_k"]) 207 | temp = float(request["temp"]) 208 | rep_pen = float(request["rep_pen"]) 209 | gs = ExLlamaGenerator.Settings() 210 | gs.top_k = top_k 211 | gs.top_p = top_p 212 | gs.temperature = temp 213 | gs.token_repetition_penalty_max = rep_pen 214 | begin_stream(prompt, stopToken, maxNew, gs) 215 | while True: 216 | chunk, eos, x, y, builtResp = stream() 217 | await ws.send(json.dumps({'action':request["action"], 218 | 'request_id':request['request_id'], 219 | 'utilContext':utilized_prompt + builtResp, 220 | 'response':builtResp})) 221 | if eos: break 222 | return utilized_prompt + built_response,builtResp 223 | 224 | 225 | async def main(websocket, path): 226 | async for message in websocket: 227 | #try: 228 | request = json.loads(message) 229 | reqID = request["request_id"] 230 | action = request["action"] 231 | 232 | if action == "estimateToken": 233 | response = await estimateToken(request, websocket) 234 | await websocket.send(json.dumps({'action':action, 'request_id':reqID, 'response':response})) 235 | 236 | elif action == "echo": 237 | await websocket.send(json.dumps({'action':action, 'request_id':reqID})) 238 | 239 | elif action == "oneShotInfer": 240 | fctx, utlctx, res = await oneShotInfer(request, websocket) 241 | await websocket.send(json.dumps({'action':action, 'request_id':reqID,'utilContext':utlctx, 'response':res})) 242 | 243 | elif action == "leftTrim": 244 | prompt = request["text"] 245 | desiredLen = int(request["desiredLen"]) 246 | processedPrompt = leftTrimTokens(prompt, desiredLen) 247 | await websocket.send(json.dumps({'action':action, 'request_id':reqID, 'response':processedPrompt})) 248 | 249 | else: 250 | utlctx, builtResp= await streamInfer(request, websocket) 251 | await websocket.send(json.dumps({'action':action, 'request_id':reqID,'utilContext':utlctx, 'response':builtResp+''})) 252 | 253 | 254 | 255 | #except Exception as e: 256 | #print({"error": str(e)}) 257 | 258 | model_directory = "./models/Llama-2-70B-chat-GPTQ/" 259 | 260 | tokenizer_path = os.path.join(model_directory, "tokenizer.model") 261 | model_config_path = os.path.join(model_directory, "config.json") 262 | st_pattern = os.path.join(model_directory, "*.safetensors") 263 | model_path = glob.glob(st_pattern)[0] 264 | esTokenizer = SentencePieceProcessor(model_file = tokenizer_path) 265 | config = ExLlamaConfig(model_config_path) # create config from config.json 266 | config.set_auto_map('17.615,18.8897') 267 | config.model_path = model_path # supply path to model weights file 268 | 269 | model = ExLlama(config) # create ExLlama instance and load the weights 270 | print(f"Model loaded: {model_path}") 271 | 272 | tokenizer = 
ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file 273 | cache = ExLlamaCache(model) # create cache for inference 274 | generator = ExLlamaGenerator(model, tokenizer, cache) # create generator 275 | start_server = websockets.serve(main, "0.0.0.0", 8080) 276 | 277 | asyncio.get_event_loop().run_until_complete(start_server) 278 | asyncio.get_event_loop().run_forever() 279 | -------------------------------------------------------------------------------- /exllama/__init__.py: -------------------------------------------------------------------------------- 1 | from . import cuda_ext, generator, model, tokenizer 2 | -------------------------------------------------------------------------------- /exllama/cuda_ext.py: -------------------------------------------------------------------------------- 1 | # from abc import ABC 2 | import torch 3 | from torch.cuda.amp import custom_bwd, custom_fwd 4 | from torch.utils.cpp_extension import load 5 | import os 6 | import sys 7 | import platform 8 | 9 | import exllama_ext 10 | # from exllama_ext import set_tuning_params 11 | # from exllama_ext import prepare_buffers 12 | from exllama_ext import make_q4 13 | from exllama_ext import q4_matmul 14 | from exllama_ext import q4_matmul_lora 15 | from exllama_ext import half_matmul 16 | from exllama_ext import half_matmul_cublas 17 | # from exllama_ext import q4_mlp 18 | from exllama_ext import rms_norm 19 | from exllama_ext import rope_ 20 | from exllama_ext import rep_penalty 21 | from exllama_ext import apply_rep_penalty 22 | 23 | 24 | # Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension 25 | 26 | none_tensor = torch.empty((1, 1), device = "meta") 27 | 28 | 29 | # Construct Q4Matrix, return handle 30 | 31 | def ext_make_q4(qweight, qzeros, scales, g_idx, device): 32 | 33 | return make_q4(qweight, 34 | qzeros, 35 | scales, 36 | g_idx if g_idx is not None else none_tensor, 37 | device) 38 | 39 | 40 | # Matrix multiplication, returns x @ q4 41 | 42 | def ext_q4_matmul(x, q4, q4_width, lora_A = None, lora_B = None): 43 | 44 | outshape = x.shape[:-1] + (q4_width,) 45 | x = x.view(-1, x.shape[-1]) 46 | output = torch.empty((x.shape[0], q4_width), dtype = torch.float16, device = x.device) 47 | 48 | if lora_A is None: 49 | q4_matmul(x, q4, output) 50 | else: 51 | lora_temp = torch.empty((x.shape[0], lora_A.shape[1]), dtype = torch.float16, device = x.device) 52 | q4_matmul_lora(x, q4, output, lora_A, lora_B, lora_temp) 53 | 54 | return output.view(outshape) 55 | 56 | 57 | # Matrix multiplication, returns x @ w, both half-precision tensors 58 | 59 | def ext_half_matmul(x, w, cublas = False): 60 | 61 | outshape = x.shape[:-1] + (w.shape[1],) 62 | x = x.view(-1, x.shape[-1]) 63 | 64 | if cublas: 65 | output = torch.empty((x.shape[0], w.shape[1]), dtype = torch.float16, device = x.device) 66 | half_matmul_cublas(x, w, output) 67 | else: 68 | output = torch.zeros((x.shape[0], w.shape[1]), dtype = torch.float16, device = x.device) 69 | half_matmul(x, w, output) 70 | 71 | return output.view(outshape) ## 72 | 73 | 74 | # RoPE embeddings, in_place 75 | 76 | def ext_rope_(x, sin, cos, past_len, num_heads, head_dim): 77 | 78 | rope_(x, sin, cos, past_len, num_heads, head_dim) 79 | 80 | 81 | # RMS norm: x = x * w / sqrt(row_mean(x * x) + epsilon) 82 | 83 | def ext_rms_norm(x, w, epsilon): 84 | 85 | outshape = x.shape 86 | x = x.view(-1, x.shape[-1]) 87 | output = torch.empty_like(x) 88 | rms_norm(x, w, output, epsilon) 89 | 90 | return output.view(outshape) 91 
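# For reference, the formula in the comment above ext_rms_norm corresponds to this
# pure-PyTorch sketch (illustration only, not used by the extension; assumes w
# broadcasts over the last dimension of x):
#
#   def rms_norm_reference(x, w, epsilon):
#       rms = torch.sqrt(torch.mean(x.float() * x.float(), dim = -1, keepdim = True) + epsilon)
#       return (x.float() / rms).to(torch.float16) * w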
| 92 | def ext_rms_norm_(x, w, epsilon): 93 | 94 | outshape = x.shape 95 | x = x.view(-1, x.shape[-1]) 96 | rms_norm(x, w, x, epsilon) 97 | 98 | 99 | # Repetition penalty 100 | 101 | def ext_rep_penalty_mask_cpu(vocab_size, sequence, penalty_max, sustain, decay): 102 | 103 | rep_mask = torch.empty(vocab_size, dtype = torch.float32) 104 | rep_penalty(sequence, rep_mask, penalty_max, sustain, decay) 105 | return rep_mask 106 | 107 | 108 | def ext_apply_rep_penalty_mask_cpu(sequence, penalty_max, sustain, decay, logits): 109 | 110 | apply_rep_penalty(sequence, penalty_max, sustain, decay, logits) 111 | 112 | -------------------------------------------------------------------------------- /exllama/lora.py: -------------------------------------------------------------------------------- 1 | from .model import ExLlamaConfig, Ex4bitLinear 2 | import torch 3 | import json 4 | from safetensors.torch import load_file as safe_load_file 5 | from torch import load as load_file 6 | 7 | class ExLlamaLora: 8 | 9 | lora_config_path: str 10 | lora_path: str 11 | lora_r: int 12 | lora_alpha: float 13 | lora_scaling: float 14 | config: ExLlamaConfig 15 | tensors: dict[torch.tensor] 16 | bias_ignored: bool 17 | 18 | def __init__(self, model, lora_config_path, lora_path): 19 | 20 | self.lora_config_path = lora_config_path 21 | self.lora_path = lora_path 22 | self.model = model 23 | self.config = model.config 24 | self.tensors = {} 25 | self.bias_ignored = False 26 | 27 | # Grab relevant items from LoRA config 28 | 29 | with open(lora_config_path) as f: 30 | read_config = json.load(f) 31 | 32 | self.lora_r = read_config["r"] 33 | self.lora_alpha = float(read_config["lora_alpha"]) 34 | self.lora_scaling = self.lora_alpha / self.lora_r 35 | 36 | if "fan_in_fan_out" in read_config and read_config["fan_in_fan_out"]: 37 | raise ValueError(" ## Error: fan_in_fan_out mode not supported.") 38 | 39 | # Load LoRA weights 40 | 41 | if self.lora_path.endswith(".safetensors"): 42 | f = safe_load_file(self.lora_path, device = "cpu") 43 | else: 44 | f = load_file(self.lora_path, map_location = "cpu") 45 | 46 | for key in f.keys(): 47 | tensor = f[key] 48 | 49 | # Find target 50 | 51 | i = key.find("model.layers.") 52 | if i == -1: raise ValueError(f" ## Error: unsupported layer in {self.lora_path}: {key}") 53 | 54 | target_key = key[i:] 55 | ks = target_key.split(".") 56 | decoder_idx = int(ks[2]) 57 | decoder_part = ks[3] 58 | decoder_layer = ks[4] 59 | lora_half = ks[5] 60 | 61 | if lora_half == "bias": 62 | epsilon = 1e-6 63 | if torch.max(tensor) > epsilon or torch.max(tensor) < -epsilon: 64 | raise ValueError(f" ## Error: unsupported bias target {self.lora_path}: {key}") 65 | self.bias_ignored = True 66 | continue 67 | 68 | target_module = self.model.layers[decoder_idx] 69 | if decoder_part == "self_attn": target_module = target_module.self_attn 70 | elif decoder_part == "mlp": target_module = target_module.mlp 71 | else: raise ValueError(f" ## Error: unsupported layer in {self.lora_path}: {key}") 72 | 73 | if decoder_layer == "q_proj": target_module = target_module.q_proj 74 | elif decoder_layer == "k_proj": target_module = target_module.k_proj 75 | elif decoder_layer == "v_proj": target_module = target_module.v_proj 76 | elif decoder_layer == "o_proj": target_module = target_module.o_proj 77 | elif decoder_layer == "gate_proj": target_module = target_module.gate_proj 78 | elif decoder_layer == "up_proj": target_module = target_module.up_proj 79 | elif decoder_layer == "down_proj": target_module = target_module.down_proj 
80 | else: raise ValueError(f" ## Error: unsupported layer in {self.lora_path}: {key}") 81 | 82 | # Check that shape is compatible 83 | 84 | assert isinstance(target_module, Ex4bitLinear) 85 | 86 | if lora_half == "lora_A": 87 | in_features = tensor.shape[1] 88 | out_features = None 89 | elif lora_half == "lora_B": 90 | in_features = None 91 | out_features = tensor.shape[0] 92 | else: raise ValueError(f" ## Error: unsupported layer in {self.lora_path}: {key}") 93 | 94 | if (in_features and in_features != target_module.in_features) or (out_features and out_features != target_module.out_features): 95 | raise ValueError(f" ## Error: incompatible tensor shape in {self.lora_path}: {key}") 96 | 97 | # For efficiency, transpose adapter instead of transposing state during inference 98 | 99 | tensor = tensor.T.contiguous() 100 | 101 | # Pre-scale 102 | 103 | if lora_half == "lora_B" and self.lora_scaling != 1.0: tensor.mul_(self.lora_scaling) 104 | 105 | # Check that dtype is compatible, or convert 106 | 107 | if tensor.dtype == torch.bfloat16: 108 | tensor = tensor.to(torch.float16) 109 | 110 | elif tensor.dtype == torch.float32: 111 | tensor = tensor.to(torch.float16) 112 | 113 | elif tensor.dtype == torch.float16: 114 | pass 115 | 116 | else: raise ValueError(f" ## Error: unsupported tensor dtype in {self.lora_path}") 117 | 118 | # Move to target device 119 | 120 | device = self.config.device_map.map(target_key) 121 | tensor = tensor.to(device, non_blocking = True) 122 | 123 | # Store adapter tensor 124 | 125 | self.tensors[target_key] = tensor 126 | -------------------------------------------------------------------------------- /exllama/tokenizer.py: -------------------------------------------------------------------------------- 1 | from sentencepiece import SentencePieceProcessor 2 | import os 3 | import torch 4 | 5 | class ExLlamaTokenizer: 6 | 7 | def __init__(self, tokenizer_model_path): 8 | 9 | self.path = tokenizer_model_path 10 | self.tokenizer = SentencePieceProcessor(model_file = self.path) 11 | 12 | self.unk_token = "<unk>" 13 | self.bos_token = "<s>" 14 | self.eos_token = "</s>" 15 | self.unk_token_id = self.tokenizer.unk_id() # is the same as pad token id... 
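# Note on the hard-coded ids below: the standard LLaMA SentencePiece model defines no
# dedicated padding token, so id 0 (the <unk> id) is reused as pad_token_id, and id 13
# is the newline byte token referenced by newline_token_id. Other vocabularies may differ.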
16 | self.eos_token_id = self.tokenizer.eos_id() 17 | self.bos_token_id = self.tokenizer.bos_id() 18 | self.pad_token_id = 0 # self.tokenizer.pad_id() 19 | self.newline_token_id = 13 20 | 21 | self.special_characters = [(self.bos_token, self.bos_token_id), (self.eos_token, self.eos_token_id), (self.unk_token, self.unk_token_id)] # for tokenzier encoding 22 | 23 | # Encode string 24 | 25 | def encode(self, text, return_mask = False, max_seq_len = 2048, add_bos = False, add_eos = False, encode_special_characters = False): 26 | 27 | if isinstance(text, list): 28 | 29 | # text is a list of strings 30 | 31 | list_ids = self.tokenizer.EncodeAsIds(text) 32 | 33 | # pad bos and eos 34 | 35 | if add_bos: 36 | for ids in list_ids: ids.insert(0, self.bos_token_id) 37 | if add_eos: 38 | for ids in list_ids: ids.append(self.eos_token_id) 39 | 40 | max_length = max([len(ids) for ids in list_ids]) 41 | 42 | needs_mask = False 43 | padded_ids = [] 44 | for ids in list_ids: 45 | if len(ids) != len(list_ids[0]): needs_mask = True 46 | padding = torch.full((max_length - len(ids),), self.pad_token_id) 47 | sequence = torch.tensor(ids) 48 | padded_ids.append(torch.cat((padding, sequence), dim = 0).long()) 49 | 50 | stacked_ids = torch.stack(padded_ids, dim = 0) 51 | 52 | if return_mask: 53 | if needs_mask: 54 | mask_padding = torch.full((stacked_ids.shape[0], max_seq_len - stacked_ids.shape[1]), True, dtype = torch.bool, device = "cpu") 55 | mask = stacked_ids != 0 56 | mask = torch.cat((mask, mask_padding), dim = 1) 57 | return stacked_ids, mask 58 | else: 59 | return stacked_ids, None 60 | else: 61 | return stacked_ids 62 | 63 | else: 64 | 65 | # text is a single string 66 | split_text = [text] 67 | 68 | # look for special characters 69 | if encode_special_characters: 70 | for special_character, special_token_id in self.special_characters: 71 | temp_text = [] 72 | for segment in split_text: 73 | if isinstance(segment, str) and special_character in segment: 74 | # for each special character, append the text before the special character, then append the special character ID, then the rest of the text 75 | parts = segment.split(special_character) 76 | new_parts = [] 77 | for i, part in enumerate(parts): 78 | new_parts.append(part) 79 | if i < len(parts) - 1: # add the special token id between parts, but not after the last part 80 | new_parts.append(special_token_id) 81 | temp_text.extend(new_parts) 82 | else: 83 | temp_text.append(segment) 84 | split_text = temp_text 85 | 86 | ids = [] 87 | 88 | for text_chunk in split_text: 89 | if isinstance(text_chunk, str): 90 | ids += self.tokenizer.EncodeAsIds(text_chunk) 91 | else: 92 | ids.append(text_chunk) 93 | 94 | # pad bos and eos 95 | 96 | if add_bos: 97 | ids = [self.bos_token_id] + ids 98 | if add_eos: 99 | ids = ids + [self.eos_token_id] 100 | 101 | stacked_ids = torch.tensor(ids).unsqueeze(0) 102 | 103 | if return_mask: 104 | return stacked_ids, None 105 | else: 106 | return stacked_ids 107 | 108 | def decode(self, ids, decode_special_characters=False): 109 | 110 | special_ids = {id_: char for char, id_ in self.special_characters} # create a lookup dictionary 111 | 112 | if ids.dim() > 1: 113 | 114 | texts = [] 115 | for i in range(ids.shape[0]): 116 | seq = ids[i].tolist() 117 | seq = [t for t in seq if t != self.pad_token_id] 118 | 119 | if decode_special_characters: 120 | text_parts = [] 121 | normal_ids = [] # list of lists 122 | current_normal_ids = [] # current list of normal IDs 123 | for idx, id_ in enumerate(seq): 124 | if id_ in special_ids: 125 | # 
Save the current list of normal IDs, then start a new one 126 | normal_ids.append(current_normal_ids) 127 | current_normal_ids = [] 128 | # Store special token as a string 129 | text_parts.append(special_ids[id_]) 130 | else: 131 | current_normal_ids.append(id_) 132 | normal_ids.append(current_normal_ids) # save the last segment of normal IDs 133 | 134 | decoded_segments = [self.tokenizer.Decode(segment) for segment in normal_ids] 135 | for idx, decoded_segment in enumerate(decoded_segments): 136 | text_parts.insert(2*idx, decoded_segment) 137 | 138 | texts.append("".join(text_parts)) 139 | else: 140 | if self.eos_token_id in seq: # to not mess up special char decoding 141 | seq = seq[:seq.index(self.eos_token_id)] 142 | texts.append(self.tokenizer.Decode(seq)) 143 | 144 | return texts 145 | 146 | else: 147 | 148 | ids = ids.tolist() 149 | 150 | if decode_special_characters: 151 | 152 | text_parts = [] 153 | normal_ids = [] # list of lists 154 | current_normal_ids = [] # current list of normal IDs 155 | for idx, id_ in enumerate(ids): 156 | if id_ in special_ids: 157 | # Save the current list of normal IDs, then start a new one 158 | normal_ids.append(current_normal_ids) 159 | current_normal_ids = [] 160 | # Store special token as a string 161 | text_parts.append(special_ids[id_]) 162 | else: 163 | current_normal_ids.append(id_) 164 | normal_ids.append(current_normal_ids) # save the last segment of normal IDs 165 | 166 | decoded_segments = [self.tokenizer.Decode(segment) for segment in normal_ids] 167 | for idx, decoded_segment in enumerate(decoded_segments): 168 | text_parts.insert(2*idx, decoded_segment) 169 | 170 | text = "".join(text_parts) 171 | 172 | else: 173 | 174 | text = self.tokenizer.Decode(ids) 175 | 176 | return text 177 | 178 | 179 | def num_tokens(self, text, encode_special_characters = False): 180 | 181 | if encode_special_characters: 182 | 183 | ids = self.encode(text, encode_special_characters = True) 184 | return ids.size(1) 185 | 186 | else: 187 | 188 | ids = self.tokenizer.Encode(text) 189 | return len(ids) -------------------------------------------------------------------------------- /exllama_ext/cpu_func/rep_penalty.cpp: -------------------------------------------------------------------------------- 1 | #include "rep_penalty.h" 2 | #include 3 | #include 4 | 5 | void rep_penalty_cpu 6 | ( 7 | const int vocab_size, 8 | const uint64_t* sequence, 9 | float* rep_mask, 10 | const float penalty_max, 11 | const int sustain, 12 | const int decay, 13 | const int seq_len 14 | ) 15 | { 16 | float v = penalty_max; 17 | float dv = decay ? (1.0f - penalty_max) / (float) decay : 0.0f; 18 | 19 | int s = sustain == -1 ? 
seq_len : sustain; 20 | int beg = seq_len - s - decay; 21 | if (beg < 0) beg = 0; 22 | 23 | for (int i = 0; i < vocab_size; i++) rep_mask[i] = 1.0f; 24 | 25 | for (int i = seq_len; i > beg;) 26 | { 27 | uint64_t t = sequence[--i]; 28 | if (v > rep_mask[t]) rep_mask[t] = v; 29 | if (--s < 0) v += dv; 30 | } 31 | } 32 | 33 | bool* g_rep_mask = NULL; 34 | int g_vocab_size = 0; 35 | 36 | void apply_rep_penalty_cpu 37 | ( 38 | const int vocab_size, 39 | const uint64_t* sequence, 40 | const float penalty_max, 41 | const int sustain, 42 | const int decay, 43 | const int seq_len, 44 | float* logits 45 | ) 46 | { 47 | if (vocab_size != g_vocab_size) 48 | { 49 | if (g_rep_mask) free(g_rep_mask); 50 | g_vocab_size = vocab_size; 51 | g_rep_mask = (bool*) malloc(g_vocab_size * sizeof(bool)); 52 | } 53 | 54 | memset(g_rep_mask, 0, g_vocab_size * sizeof(bool)); 55 | 56 | float v = penalty_max; 57 | float dv = decay ? (1.0f - penalty_max) / (float) decay : 0.0f; 58 | 59 | int s = sustain == -1 ? seq_len : sustain; 60 | int beg = seq_len - s - decay; 61 | if (beg < 0) beg = 0; 62 | 63 | for (int i = seq_len; i > beg;) 64 | { 65 | uint64_t t = sequence[--i]; 66 | if (!g_rep_mask[t]) 67 | { 68 | if (logits[t] > 0.0) logits[t] /= v; 69 | else logits[t] *= v; 70 | g_rep_mask[t] = true; 71 | } 72 | if (--s < 0) v += dv; 73 | } 74 | } -------------------------------------------------------------------------------- /exllama_ext/cpu_func/rep_penalty.h: -------------------------------------------------------------------------------- 1 | #ifndef _rep_penalty_h 2 | #define _rep_penalty_h 3 | 4 | #include 5 | #include 6 | 7 | void rep_penalty_cpu 8 | ( 9 | const int vocab_size, 10 | const uint64_t* sequence, 11 | float* rep_mask, 12 | const float penalty_max, 13 | const int sustain, 14 | const int decay, 15 | const int seq_len 16 | ); 17 | 18 | void apply_rep_penalty_cpu 19 | ( 20 | const int vocab_size, 21 | const uint64_t* sequence, 22 | const float penalty_max, 23 | const int sustain, 24 | const int decay, 25 | const int seq_len, 26 | float* logits 27 | ); 28 | 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /exllama_ext/cuda_buffers.cu: -------------------------------------------------------------------------------- 1 | #define _cuda_buffers_cu 2 | #include "cuda_buffers.cuh" 3 | 4 | CudaBuffers* g_buffers[CUDA_MAX_DEVICES] = {NULL}; 5 | // __constant__ half2 q4_table[16][256]; 6 | // half2 q4_table_host[16][256]; 7 | // bool q4_table_init = false; 8 | 9 | CudaBuffers::CudaBuffers 10 | ( 11 | int _device, 12 | half* _temp_state, 13 | int _temp_state_size, 14 | half* _temp_mlp, 15 | float* _temp_zeros_float, 16 | half* _temp_dq, 17 | int _max_zeros_float 18 | ) : 19 | device(_device), 20 | temp_state(_temp_state), 21 | temp_state_size(_temp_state_size), 22 | temp_mlp(_temp_mlp), 23 | temp_zeros_float(_temp_zeros_float), 24 | temp_dq(_temp_dq), 25 | max_zeros_float(_max_zeros_float), 26 | current_zeros_float(0) 27 | { 28 | cudaSetDevice(_device); 29 | 30 | cudaStreamCreate(&alt_stream_1); 31 | cudaStreamCreate(&alt_stream_2); 32 | cudaStreamCreate(&alt_stream_3); 33 | cudaEventCreate(&alt_stream_1_done); 34 | cudaEventCreate(&alt_stream_2_done); 35 | cudaEventCreate(&alt_stream_3_done); 36 | } 37 | 38 | CudaBuffers::~CudaBuffers() 39 | { 40 | cudaStreamDestroy(alt_stream_1); 41 | cudaStreamDestroy(alt_stream_2); 42 | cudaStreamDestroy(alt_stream_3); 43 | cudaEventDestroy(alt_stream_1_done); 44 | cudaEventDestroy(alt_stream_2_done); 45 | 
cudaEventDestroy(alt_stream_3_done); 46 | } 47 | 48 | float* CudaBuffers::get_zeros_float(const int num_zeros) 49 | { 50 | if (current_zeros_float + num_zeros >= max_zeros_float) 51 | { 52 | current_zeros_float = 0; 53 | cudaMemsetAsync(temp_zeros_float, 0, max_zeros_float * sizeof(float)); 54 | } 55 | 56 | float* zeros = temp_zeros_float + current_zeros_float; 57 | current_zeros_float += num_zeros; 58 | return zeros; 59 | } 60 | 61 | CudaBuffers* get_buffers(const int device_index) 62 | { 63 | return g_buffers[device_index]; 64 | } 65 | 66 | void prepare_buffers_cuda 67 | ( 68 | int _device, 69 | half* _temp_state, 70 | int _temp_state_size, 71 | half* _temp_mlp, 72 | float* _temp_zeros_float, 73 | half* _temp_dq, 74 | int _max_zeros_float 75 | ) 76 | { 77 | CudaBuffers* buffers = new CudaBuffers 78 | ( 79 | _device, 80 | _temp_state, 81 | _temp_state_size, 82 | _temp_mlp, 83 | _temp_zeros_float, 84 | _temp_dq, 85 | _max_zeros_float 86 | ); 87 | 88 | g_buffers[_device] = buffers; 89 | } 90 | 91 | void cleanup_buffers_cuda() 92 | { 93 | for (int i = 0; i < CUDA_MAX_DEVICES; i++) 94 | { 95 | if (!g_buffers[i]) continue; 96 | delete g_buffers[i]; 97 | g_buffers[i] = NULL; 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /exllama_ext/cuda_buffers.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _cuda_buffers_cuh 2 | #define _cuda_buffers_cuh 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | const int CUDA_MAX_DEVICES = 16; 10 | 11 | // #ifndef _cuda_buffers_cu 12 | // extern __constant__ half2 q4_table[16][256]; 13 | // #endif 14 | 15 | class CudaBuffers 16 | { 17 | public: 18 | int device; 19 | 20 | half* temp_state; // [max_hidden_rows * intermediate_size] 21 | int temp_state_size; 22 | half* temp_mlp; // [hidden_dim * intermediate_size] 23 | float* temp_zeros_float; // [max_hidden_rows] 24 | half* temp_dq; // size of largest quant tensor * 8 25 | 26 | int current_zeros_float; 27 | int max_zeros_float; 28 | 29 | cudaStream_t alt_stream_1; 30 | cudaStream_t alt_stream_2; 31 | cudaStream_t alt_stream_3; 32 | cudaEvent_t alt_stream_1_done; 33 | cudaEvent_t alt_stream_2_done; 34 | cudaEvent_t alt_stream_3_done; 35 | 36 | CudaBuffers 37 | ( 38 | int _device, 39 | half* _temp_state, 40 | int _temp_state_size, 41 | half* _temp_mlp, 42 | float* _temp_zeros_float, 43 | half* _temp_dq, 44 | int _max_zeros_float 45 | ); 46 | ~CudaBuffers(); 47 | 48 | float* get_zeros_float(const int num_zeros); 49 | }; 50 | 51 | CudaBuffers* get_buffers(const int device_index); 52 | 53 | void prepare_buffers_cuda 54 | ( 55 | int _device, 56 | half* _temp_state, 57 | int _temp_state_size, 58 | half* _temp_mlp, 59 | float* _temp_zeros_float, 60 | half* _temp_dq, 61 | int _max_zeros_float 62 | ); 63 | 64 | void cleanup_buffers_cuda(); 65 | 66 | #endif -------------------------------------------------------------------------------- /exllama_ext/cuda_compat.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _cuda_compat_cuh 2 | #define _cuda_compat_cuh 3 | 4 | // atomicAdd for half types, to support CC < 7.x 5 | 6 | __device__ __forceinline__ void atomicAdd_half(half* address, half val) 7 | { 8 | unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2)); 9 | unsigned int old = *address_as_ui; 10 | unsigned int assumed; 11 | 12 | do 13 | { 14 | assumed = old; 15 | __half_raw hsum; 16 | hsum.x = (size_t)address & 2 ? 
(old >> 16) : (old & 0xffff); 17 | half tmpres = __hadd(hsum, val); 18 | hsum = __half_raw(tmpres); 19 | old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; 20 | old = atomicCAS(address_as_ui, assumed, old); 21 | } 22 | while (assumed != old); 23 | } 24 | 25 | // atomicAdd for half2 types 26 | 27 | __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val) 28 | { 29 | unsigned int* address_as_ui = (unsigned int*)address; 30 | unsigned int old = *address_as_ui; 31 | unsigned int assumed; 32 | do 33 | { 34 | assumed = old; 35 | half2 old_val = *((half2*)&old); 36 | half2 new_val = __hadd2(old_val, val); 37 | old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val)); 38 | } 39 | while (assumed != old); 40 | } 41 | 42 | // 43 | 44 | #if defined(__CUDA_ARCH__) || defined(USE_ROCM) 45 | #if __CUDA_ARCH__ < 700 || defined(USE_ROCM) 46 | 47 | __device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); } 48 | 49 | #if __CUDA_ARCH__ < 600 || defined(USE_ROCM) 50 | __device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); } 51 | #endif 52 | 53 | #endif 54 | #endif 55 | 56 | #endif 57 | -------------------------------------------------------------------------------- /exllama_ext/cuda_func/column_remap.cu: -------------------------------------------------------------------------------- 1 | #include "column_remap.cuh" 2 | #include "../util.cuh" 3 | 4 | const int SHUF_BLOCKSIZE_X = 256; 5 | const int SHUF_BLOCKSIZE_Y = 16; 6 | 7 | __global__ void column_remap_kernel 8 | ( 9 | const half* __restrict__ x, 10 | half* __restrict__ x_new, 11 | const int x_width, 12 | const int x_height, 13 | const uint32_t* x_map 14 | ) 15 | { 16 | int x_column = SHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x; 17 | int x_row = SHUF_BLOCKSIZE_Y * blockIdx.y; 18 | if (x_column >= x_width) return; 19 | //if (x_row >= x_height) return; 20 | 21 | int x_stride = x_width; 22 | int x_idx = x_row * x_stride + x_column; 23 | 24 | int x_row_end = min(x_row + SHUF_BLOCKSIZE_Y, x_height); 25 | int x_idx_end = x_row_end * x_stride + x_column; 26 | 27 | int s_column = x_map[x_column]; 28 | int s_idx = x_row * x_stride + s_column; 29 | 30 | while (x_idx < x_idx_end) 31 | { 32 | x_new[x_idx] = x[s_idx]; 33 | x_idx += x_stride; 34 | s_idx += x_stride; 35 | } 36 | } 37 | 38 | // Remap columns in x to correspond to sequential group index before matmul 39 | // 40 | // perform x -> seq_x such that seq_x @ seq_w == x @ w 41 | 42 | void column_remap_cuda 43 | ( 44 | const half* x, 45 | half* x_new, 46 | const int x_height, 47 | const int x_width, 48 | const uint32_t* x_map 49 | ) 50 | { 51 | dim3 threads(SHUF_BLOCKSIZE_X, 1, 1); 52 | 53 | dim3 blocks 54 | ( 55 | (x_width + SHUF_BLOCKSIZE_X - 1) / SHUF_BLOCKSIZE_X, 56 | (x_height + SHUF_BLOCKSIZE_Y - 1) / SHUF_BLOCKSIZE_Y, 57 | 1 58 | ); 59 | 60 | column_remap_kernel<<>>(x, x_new, x_width, x_height, x_map); 61 | } 62 | -------------------------------------------------------------------------------- /exllama_ext/cuda_func/column_remap.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _column_remap_cuh 2 | #define _column_remap_cuh 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | void column_remap_cuda 9 | ( 10 | const half* x, 11 | half* x_new, 12 | const int x_height, 13 | const int x_width, 14 | const uint32_t* x_map 15 | ); 16 | 17 | #endif 
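As the comment in column_remap.cu above states, the kernel produces seq_x such that seq_x @ seq_w == x @ w once the quantized weight rows have been reordered into sequential groups (see make_sequential in q4_matrix.cu further below): for every row, output column c takes its value from input column x_map[c]. The following host-side reference is a sketch for illustration only, not part of the extension; it uses plain float where the kernel operates on half:

#include <cstdint>
#include <vector>

// Reference semantics of column_remap_cuda: x_new[row][col] = x[row][x_map[col]]
static void column_remap_reference(const std::vector<float>& x, std::vector<float>& x_new,
                                   int height, int width, const std::vector<uint32_t>& x_map)
{
    for (int row = 0; row < height; row++)
        for (int col = 0; col < width; col++)
            x_new[row * width + col] = x[row * width + x_map[col]];
}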
-------------------------------------------------------------------------------- /exllama_ext/cuda_func/half_matmul.cu: -------------------------------------------------------------------------------- 1 | #include "half_matmul.cuh" 2 | #include "../util.cuh" 3 | #include "../matrix.cuh" 4 | #include "../cuda_compat.cuh" 5 | #if defined(USE_ROCM) 6 | #include "../hip_compat.cuh" 7 | #endif 8 | 9 | // Block size 10 | 11 | const int THREADS_X = 32; // Block size and thread count along columns in w and out 12 | const int THREADS_Y = 8; // Block size and thread count along rows in x and out 13 | const int BLOCKSIZE = 256; 14 | 15 | __global__ void half_matmul_kernel 16 | ( 17 | const half* __restrict__ x, 18 | const half* __restrict__ w, 19 | half* __restrict__ out, 20 | const int height, 21 | const int dim, 22 | const int width 23 | ) 24 | { 25 | const int column = (blockIdx.x * THREADS_X + threadIdx.x) * 2; 26 | const int row = blockIdx.y * THREADS_Y + threadIdx.y; 27 | const int k0 = blockIdx.z * BLOCKSIZE; 28 | 29 | if (row >= height) return; 30 | if (column >= width) return; 31 | 32 | MatrixView_half x_(x, height, dim); 33 | MatrixView_half w_(w, dim, width); 34 | MatrixView_half_rw out_(out, height, width); 35 | 36 | half2* x_ptr = (half2*) x_.item_ptr(row, k0); 37 | half2* w_ptr = (half2*) w_.item_ptr(k0, column); 38 | half2 acc = {}; 39 | 40 | #pragma unroll 41 | for (int k = k0; k < k0 + BLOCKSIZE / 2; k++) 42 | { 43 | half2 x_item = *x_ptr++; 44 | half2 x_item_0 = __half2half2(x_item.x); 45 | half2 x_item_1 = __half2half2(x_item.y); 46 | half2 w_item_0 = *w_ptr; w_ptr += w_.width / 2; 47 | half2 w_item_1 = *w_ptr; w_ptr += w_.width / 2; 48 | acc = __hfma2(x_item_0, w_item_0, acc); 49 | acc = __hfma2(x_item_1, w_item_1, acc); 50 | } 51 | 52 | // out_.set(row, column, acc); 53 | atomicAdd((half2*)out_.item_ptr(row, column), acc); 54 | } 55 | 56 | void half_matmul_cuda 57 | ( 58 | const half* x, 59 | const half* w, 60 | half* out, 61 | const int height, 62 | const int dim, 63 | const int width, 64 | cudaStream_t alt_stream 65 | ) 66 | { 67 | dim3 threads(THREADS_X, THREADS_Y, 1); 68 | 69 | dim3 blocks 70 | ( 71 | (width + THREADS_X - 1) / THREADS_X / 2, 72 | (height + THREADS_Y - 1) / THREADS_Y, 73 | (dim + BLOCKSIZE - 1) / BLOCKSIZE 74 | ); 75 | 76 | half_matmul_kernel<<>>(x, w, out, height, dim, width); 77 | } 78 | 79 | // cuBLAS can't be beat for large matrices, probably 80 | 81 | const int MAX_DIM_SMALL = 8192; 82 | 83 | void half_matmul_cublas_cuda 84 | ( 85 | ExLlamaTuning* tuningParams, 86 | const half* x, 87 | const half* w, 88 | half* out, 89 | const int height, 90 | const int dim, 91 | const int width, 92 | cublasHandle_t handle, 93 | bool no_zero, 94 | cudaStream_t alt_stream 95 | ) 96 | { 97 | // Fall back on a naive kernel for small matmuls to avoid cuBLAS overhead 98 | 99 | if (height < 4 && dim <= MAX_DIM_SMALL) 100 | { 101 | half_matmul_small_cuda(tuningParams, x, w, out, height, dim, width, no_zero, alt_stream); 102 | return; 103 | } 104 | 105 | // printf("cuBLAS: (%i, %i) @ (%i, %i) -> (%i, %i)\n", height, dim, dim, width, height, width); 106 | 107 | // Use cuBLAS 108 | 109 | const half alpha = __float2half(1.0f); 110 | const half beta = no_zero ? 
__float2half(1.0f) : __float2half(0.0f); 111 | 112 | cudaStream_t default_stream; 113 | if (alt_stream) 114 | { 115 | cublasGetStream(handle, &default_stream); 116 | cublasSetStream(handle, alt_stream); 117 | } 118 | 119 | cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, w, width, x, dim, &beta, out, width); 120 | 121 | if (alt_stream) 122 | { 123 | cublasSetStream(handle, default_stream); 124 | } 125 | } 126 | 127 | // Alternative to cuBLAS for tall or wide matrices 128 | 129 | const int S_THREADS_X = 8; // width 130 | const int S_THREADS_Z = 1; // height 131 | const int S_BLOCKSIZE = MAX_DIM_SMALL / 1024 * S_THREADS_X; // dim 132 | 133 | template 134 | __global__ void half_matmul_small_kernel 135 | ( 136 | const half* __restrict__ x, 137 | const half* __restrict__ w, 138 | half* __restrict__ out, 139 | const int height, 140 | const int dim, 141 | const int width, 142 | bool no_zero 143 | ) 144 | { 145 | int column = blockIdx.x * S_THREADS_X + threadIdx.x; 146 | int row = blockIdx.z * S_THREADS_Z + threadIdx.z; 147 | int k = threadIdx.y * S_BLOCKSIZE; 148 | 149 | if (row >= height) return; 150 | if (column >= width) return; 151 | // if (k >= dim) return; 152 | // printf("%i, %i, %i\n", row, column, k); 153 | 154 | MatrixView_half x_(x, height, dim); 155 | MatrixView_half w_(w, dim, width); 156 | MatrixView_half_rw out_(out, height, width); 157 | 158 | int k_end = k + S_BLOCKSIZE; 159 | if (k_end > dim) k_end = dim; 160 | 161 | const half* x_ptr = x_.item_ptr(row, k); 162 | const half* x_ptr_end = x_.item_ptr(row, k_end); 163 | const half* w_ptr = w_.item_ptr(k, column); 164 | half* out_ptr = out_.item_ptr(row, column); 165 | 166 | if constexpr (use_half2 && !odd_rank) 167 | { 168 | half2* x_ptr2 = (half2*) x_ptr; 169 | half2* x_ptr2_end = (half2*) x_ptr_end; 170 | 171 | half2 r = {}; 172 | 173 | while(x_ptr2 < x_ptr2_end) 174 | { 175 | half2 x_01 = *x_ptr2++; 176 | half2 x_23 = *x_ptr2++; 177 | half w_0 = *w_ptr; w_ptr += width; 178 | half w_1 = *w_ptr; w_ptr += width; 179 | half w_2 = *w_ptr; w_ptr += width; 180 | half w_3 = *w_ptr; w_ptr += width; 181 | half2 w_01 = __halves2half2(w_0, w_1); 182 | half2 w_23 = __halves2half2(w_2, w_3); 183 | r = __hfma2(x_01, w_01, r); 184 | r = __hfma2(x_23, w_23, r); 185 | } 186 | 187 | half rh = __hadd(r.x, r.y); 188 | 189 | __shared__ half accum[MAX_DIM_SMALL / S_BLOCKSIZE][S_THREADS_X]; 190 | accum[threadIdx.y][threadIdx.x] = rh; 191 | __syncthreads(); 192 | 193 | if (threadIdx.y == 0) 194 | { 195 | half acc = rh; 196 | for (int i = 1; i < blockDim.y; ++i) acc = __hadd(accum[i][threadIdx.x], acc); 197 | if (no_zero) acc = __hadd(acc, *out_ptr); 198 | *out_ptr = acc; 199 | } 200 | } 201 | else 202 | { 203 | half r = {}; 204 | 205 | while(x_ptr < x_ptr_end) 206 | { 207 | if constexpr (odd_rank) 208 | { 209 | half x_item = *x_ptr++; 210 | half w_item = *w_ptr; w_ptr += width; 211 | r = __hfma(x_item, w_item, r); 212 | } 213 | else 214 | { 215 | #pragma unroll 216 | for (int i = 0; i < 4; ++i) 217 | { 218 | half x_item = *x_ptr++; 219 | half w_item = *w_ptr; w_ptr += width; 220 | r = __hfma(x_item, w_item, r); 221 | } 222 | } 223 | } 224 | 225 | __shared__ half accum[MAX_DIM_SMALL / S_BLOCKSIZE][S_THREADS_X]; 226 | accum[threadIdx.y][threadIdx.x] = r; 227 | __syncthreads(); 228 | 229 | if (threadIdx.y == 0) 230 | { 231 | half acc = accum[0][threadIdx.x]; 232 | for (int i = 1; i < blockDim.y; ++i) acc = __hadd(accum[i][threadIdx.x], acc); 233 | if (no_zero) acc = __hadd(acc, *out_ptr); 234 | *out_ptr = acc; 235 | } 236 | } 237 | 
} 238 | 239 | void half_matmul_small_cuda 240 | ( 241 | ExLlamaTuning* tuningParams, 242 | const half* x, 243 | const half* w, 244 | half* out, 245 | const int height, 246 | const int dim, 247 | const int width, 248 | bool no_zero, 249 | cudaStream_t alt_stream 250 | ) 251 | { 252 | bool use_half2 = !tuningParams->matmul_no_half2; 253 | 254 | //printf("kernel: (%i, %i) @ (%i, %i) -> (%i, %i)\n", height, dim, dim, width, height, width); 255 | 256 | dim3 threads 257 | ( 258 | S_THREADS_X, 259 | (dim + S_BLOCKSIZE - 1) / S_BLOCKSIZE, 260 | 1 261 | ); 262 | 263 | dim3 blocks 264 | ( 265 | (width + S_THREADS_X - 1) / S_THREADS_X, 266 | 1, 267 | height 268 | ); 269 | 270 | //printf("t... %i %i %i\n", threads.x, threads.y, threads.z); 271 | //printf("b... %i %i %i\n", blocks.x, blocks.y, blocks.z); 272 | //if (!no_zero) cudaMemsetAsync(out, 0, height * width * sizeof(half)); 273 | 274 | if (dim & 0x03) 275 | { 276 | half_matmul_small_kernel <<>>(x, w, out, height, dim, width, no_zero); 277 | } 278 | else 279 | { 280 | if (use_half2) half_matmul_small_kernel <<>>(x, w, out, height, dim, width, no_zero); 281 | else half_matmul_small_kernel <<>>(x, w, out, height, dim, width, no_zero); 282 | } 283 | } 284 | 285 | -------------------------------------------------------------------------------- /exllama_ext/cuda_func/half_matmul.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _half_matmul_cuh 2 | #define _half_matmul_cuh 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "../tuning.h" 9 | 10 | // Workaround for hipify_python using rocblas instead of hipblas. 11 | #if defined(USE_ROCM) 12 | #include 13 | #define rocblas_handle hipblasHandle_t 14 | #endif 15 | 16 | void half_matmul_cuda 17 | ( 18 | const half* x, 19 | const half* w, 20 | half* out, 21 | const int height, 22 | const int dim, 23 | const int width, 24 | cudaStream_t alt_stream = NULL 25 | ); 26 | 27 | void half_matmul_cublas_cuda 28 | ( 29 | ExLlamaTuning* tuningParams, 30 | const half* x, 31 | const half* w, 32 | half* out, 33 | const int height, 34 | const int dim, 35 | const int width, 36 | cublasHandle_t handle, 37 | bool no_zero = false, 38 | cudaStream_t alt_stream = NULL 39 | ); 40 | 41 | void half_matmul_small_cuda 42 | ( 43 | ExLlamaTuning* tuningParams, 44 | const half* x, 45 | const half* w, 46 | half* out, 47 | const int height, 48 | const int dim, 49 | const int width, 50 | bool no_zero = false, 51 | cudaStream_t alt_stream = NULL 52 | ); 53 | 54 | #endif -------------------------------------------------------------------------------- /exllama_ext/cuda_func/q4_attn.cu: -------------------------------------------------------------------------------- 1 | #include "q4_mlp.cuh" 2 | #include "q4_matmul.cuh" 3 | #include "rope.cuh" 4 | #include "rms_norm.cuh" 5 | #include "half_matmul.cuh" 6 | #include "../cuda_buffers.cuh" 7 | #include "../util.cuh" 8 | #include "../matrix.cuh" 9 | #if defined(USE_ROCM) 10 | #include "../hip_compat.cuh" 11 | #endif 12 | 13 | const int THREADS_X = 32; 14 | const int THREADS_Y = 1; 15 | const int THREADS_Z = 4; 16 | const int BLOCKSIZE_X = 2; // 2*half == 1*uint32_t 17 | const int BLOCKSIZE_Z = 4; // num_heads must be divisible by BLOCKSIZE_Z TODO: Check that this is the case when Llama2-34b releases 18 | 19 | __global__ void update_cache_kernel 20 | ( 21 | const half* __restrict__ key_states, 22 | const half* __restrict__ value_states, 23 | half* __restrict__ key_cache, 24 | half* __restrict__ value_cache, 25 | const int head_dim, 
26 | const int num_kv_heads, 27 | const int q_len, 28 | const int max_seq_len, 29 | const int past_len 30 | ) 31 | { 32 | //int state_shape[] = { num_kv_heads, q_len, head_dim }; 33 | int state_stride[] = { head_dim, head_dim * num_kv_heads, 1 }; 34 | int state_pos[] = { 0, 0, 0 }; 35 | 36 | //int cache_shape[] = { num_kv_heads, max_seq_len, head_dim }; 37 | int cache_stride[] = { max_seq_len * head_dim, head_dim, 1 }; 38 | int cache_pos[] = { 0, past_len, 0 }; 39 | 40 | int size[] = { num_kv_heads, q_len, head_dim }; 41 | 42 | int x = (blockIdx.x * THREADS_X + threadIdx.x) * BLOCKSIZE_X; 43 | int y = blockIdx.y * THREADS_Y + threadIdx.y; 44 | int z = (blockIdx.z * THREADS_Z + threadIdx.z) * BLOCKSIZE_Z; 45 | 46 | if (x >= size[2]) return; 47 | if (y >= size[1]) return; 48 | if (z >= size[0]) return; 49 | 50 | int state_offset = (z + state_pos[0]) * state_stride[0] + (y + state_pos[1]) * state_stride[1] + (x + state_pos[2]) * state_stride[2]; 51 | int cache_offset = (z + cache_pos[0]) * cache_stride[0] + (y + cache_pos[1]) * cache_stride[1] + (x + cache_pos[2]) * cache_stride[2]; 52 | 53 | const uint32_t* key_ptr = (uint32_t*) (key_states + state_offset); 54 | const uint32_t* value_ptr = (uint32_t*) (value_states + state_offset); 55 | uint32_t* key_cache_ptr = (uint32_t*) (key_cache + cache_offset); 56 | uint32_t* value_cache_ptr = (uint32_t*) (value_cache + cache_offset); 57 | 58 | #pragma unroll 59 | for (int k = 0; k < BLOCKSIZE_Z; k++) 60 | { 61 | *key_cache_ptr = *key_ptr; 62 | key_ptr += state_stride[0] / BLOCKSIZE_X; 63 | key_cache_ptr += cache_stride[0] / BLOCKSIZE_X; 64 | } 65 | #pragma unroll 66 | for (int k = 0; k < BLOCKSIZE_Z; k++) 67 | { 68 | *value_cache_ptr = *value_ptr; 69 | value_ptr += state_stride[0] / BLOCKSIZE_X; 70 | value_cache_ptr += cache_stride[0] / BLOCKSIZE_X; 71 | } 72 | } 73 | 74 | void q4_attn_cuda 75 | ( 76 | ExLlamaTuning* tuningParams, 77 | cudaStream_t stream, 78 | cublasHandle_t handle, 79 | half* x, 80 | const half* rms_norm_weight, // shape == (x.shape[1],) == (dim,) 81 | float epsilon, 82 | half* query_states, 83 | half* key_states, 84 | half* value_states, 85 | Q4Matrix* q_proj, 86 | Q4Matrix* k_proj, 87 | Q4Matrix* v_proj, 88 | half* sin, 89 | half* cos, 90 | const int bsz, 91 | const int q_len, 92 | const int dim, 93 | const int head_dim, 94 | const int num_heads, 95 | const int num_kv_heads, 96 | const int past_len, 97 | half* key_cache, 98 | half* value_cache, 99 | const half* q_a, 100 | const half* q_b, 101 | const int q_rank, 102 | const half* k_a, 103 | const half* k_b, 104 | const int k_rank, 105 | const half* v_a, 106 | const half* v_b, 107 | const int v_rank, 108 | half* lora_temp, 109 | const int max_seq_len, 110 | const int device_index 111 | ) 112 | { 113 | // Cache update grid 114 | 115 | dim3 threads(THREADS_X, THREADS_Y, THREADS_Z); 116 | 117 | dim3 blocks 118 | ( 119 | ((head_dim + THREADS_X - 1) / THREADS_X + BLOCKSIZE_X - 1) / BLOCKSIZE_X, 120 | q_len, 121 | ((num_kv_heads + THREADS_Z - 1) / THREADS_Z + BLOCKSIZE_Z - 1) / BLOCKSIZE_Z 122 | ); 123 | 124 | int _rows_per_batch = q_len * num_heads; 125 | int _rows_per_batch_kv = q_len * num_kv_heads; 126 | 127 | CudaBuffers* buffers = get_buffers(device_index); 128 | 129 | // Layernorm 130 | 131 | half* temp_x = buffers->temp_state + q_len * dim; 132 | rms_norm_cuda(tuningParams, x, rms_norm_weight, temp_x, epsilon, q_len, dim, device_index); 133 | 134 | // Adapters 135 | 136 | if (q_a) 137 | { 138 | half_matmul_cublas_cuda(tuningParams, temp_x, q_a, lora_temp, q_len, dim, q_rank, 
handle); 139 | half_matmul_cublas_cuda(tuningParams, lora_temp, q_b, query_states, q_len, q_rank, dim, handle); 140 | } 141 | if (k_a) 142 | { 143 | half_matmul_cublas_cuda(tuningParams, temp_x, k_a, lora_temp, q_len, dim, k_rank, handle); 144 | half_matmul_cublas_cuda(tuningParams, lora_temp, k_b, key_states, q_len, k_rank, dim, handle); 145 | } 146 | if (v_a) 147 | { 148 | half_matmul_cublas_cuda(tuningParams, temp_x, v_a, lora_temp, q_len, dim, v_rank, handle); 149 | half_matmul_cublas_cuda(tuningParams, lora_temp, v_b, value_states, q_len, v_rank, dim, handle); 150 | } 151 | 152 | if (!tuningParams->concurrent_streams) 153 | { 154 | // Project q, k, v 155 | 156 | q4_matmul_cuda(tuningParams, temp_x, q_len, q_proj, query_states, q_a ? true : false); 157 | q4_matmul_cuda(tuningParams, temp_x, q_len, k_proj, key_states, k_a ? true : false); 158 | q4_matmul_cuda(tuningParams, temp_x, q_len, v_proj, value_states, v_a ? true : false); 159 | 160 | // Positional embeddings q, k 161 | 162 | rope_cuda(tuningParams, query_states, sin, cos, bsz, _rows_per_batch, head_dim, num_heads, past_len); 163 | rope_cuda(tuningParams, key_states, sin, cos, bsz, _rows_per_batch_kv, head_dim, num_kv_heads, past_len); 164 | 165 | // Update cache tensors with projected k, v 166 | 167 | update_cache_kernel<<>>(key_states, value_states, key_cache, value_cache, head_dim, num_kv_heads, q_len, max_seq_len, past_len); 168 | } 169 | else 170 | { 171 | // Project q, k, v, add positional embeddings to q, k, update cache tensors with projected k, v 172 | 173 | cudaStream_t str_1 = buffers->alt_stream_1; 174 | cudaStream_t str_2 = buffers->alt_stream_2; 175 | cudaStream_t str_3 = buffers->alt_stream_3; 176 | cudaEvent_t sync_1 = buffers->alt_stream_1_done; 177 | cudaEvent_t sync_2 = buffers->alt_stream_2_done; 178 | cudaEvent_t sync_3 = buffers->alt_stream_3_done; 179 | 180 | // str_1: project q, positions q, sync 181 | 182 | q4_matmul_cuda(tuningParams, temp_x, q_len, q_proj, query_states, q_a ? true : false, str_1); 183 | rope_cuda(tuningParams, query_states, sin, cos, bsz, _rows_per_batch, head_dim, num_kv_heads, past_len, str_1); 184 | cudaEventRecord(sync_1, str_1); 185 | 186 | // str_2: project k, positions k, sync 187 | 188 | q4_matmul_cuda(tuningParams, temp_x, q_len, k_proj, key_states, k_a ? true : false, str_2); 189 | rope_cuda(tuningParams, key_states, sin, cos, bsz, _rows_per_batch_kv, head_dim, num_kv_heads, past_len, str_2); 190 | cudaEventRecord(sync_2, str_2); 191 | 192 | // str_3: project v, wait for str_2, copy (k,v) to cache, sync 193 | 194 | q4_matmul_cuda(tuningParams, temp_x, q_len, v_proj, value_states, v_a ? 
true : false, buffers->alt_stream_3); 195 | cudaStreamWaitEvent(str_3, sync_2, 0); 196 | update_cache_kernel<<>>(key_states, value_states, key_cache, value_cache, head_dim, num_kv_heads, q_len, max_seq_len, past_len); 197 | cudaEventRecord(sync_3, str_3); 198 | 199 | // default: wait for str_1 and str_3 200 | 201 | cudaStreamWaitEvent(NULL, sync_1, 0); 202 | cudaStreamWaitEvent(NULL, sync_3, 0); 203 | } 204 | } 205 | 206 | void q4_attn_2_cuda 207 | ( 208 | ExLlamaTuning* tuningParams, 209 | cublasHandle_t handle, 210 | half* x, 211 | half* attn_output, 212 | Q4Matrix* o_proj, 213 | const int height, 214 | const half* o_a, 215 | const half* o_b, 216 | const int o_rank, 217 | half* lora_temp 218 | ) 219 | { 220 | if (o_a) 221 | { 222 | int dim = o_proj->height; 223 | half_matmul_cublas_cuda(tuningParams, attn_output, o_a, lora_temp, height, dim, o_rank, handle); 224 | half_matmul_cublas_cuda(tuningParams, lora_temp, o_b, x, height, o_rank, dim, handle, true); 225 | } 226 | 227 | q4_matmul_cuda(tuningParams, attn_output, height, o_proj, x, true); 228 | } 229 | -------------------------------------------------------------------------------- /exllama_ext/cuda_func/q4_attn.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _q4_attn_cuh 2 | #define _q4_attn_cuh 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "../tuning.h" 9 | #include "q4_matrix.cuh" 10 | 11 | void q4_attn_cuda 12 | ( 13 | ExLlamaTuning* tuningParams, 14 | cudaStream_t stream, 15 | cublasHandle_t handle, 16 | half* x, 17 | const half* rms_norm_weight, // shape == (x.shape[1],) == (dim,) 18 | float epsilon, 19 | half* query_states, 20 | half* key_states, 21 | half* value_states, 22 | Q4Matrix* q_proj, 23 | Q4Matrix* k_proj, 24 | Q4Matrix* v_proj, 25 | half* sin, 26 | half* cos, 27 | const int bsz, 28 | const int q_len, 29 | const int dim, 30 | const int head_dim, 31 | const int num_heads, 32 | const int num_kv_heads, 33 | const int past_len, 34 | half* key_cache, 35 | half* value_cache, 36 | const half* q_a, 37 | const half* q_b, 38 | const int q_rank, 39 | const half* k_a, 40 | const half* k_b, 41 | const int k_rank, 42 | const half* v_a, 43 | const half* v_b, 44 | const int v_rank, 45 | half* lora_temp, 46 | const int max_seq_len, 47 | const int device_index 48 | ); 49 | 50 | void q4_attn_2_cuda 51 | ( 52 | ExLlamaTuning* tuningParams, 53 | cublasHandle_t handle, 54 | half* x, 55 | half* attn_output, 56 | Q4Matrix* o_proj, 57 | const int height, 58 | const half* o_a, 59 | const half* o_b, 60 | const int o_rank, 61 | half* lora_temp 62 | ); 63 | 64 | #endif -------------------------------------------------------------------------------- /exllama_ext/cuda_func/q4_matmul.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _q4_matmul_cuh 2 | #define _q4_matmul_cuh 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "q4_matrix.cuh" 11 | #include "../tuning.h" 12 | 13 | // Workaround for hipify_python using rocblas instead of hipblas. 
14 | #if defined(USE_ROCM) 15 | #include 16 | #define rocblas_handle hipblasHandle_t 17 | #endif 18 | 19 | #if !defined(USE_ROCM) && (!defined(__CUDA_ARCH__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700)) 20 | #define USE_SMEM 21 | #endif 22 | 23 | void q4_matmul_cuda 24 | ( 25 | ExLlamaTuning* tuningParams, 26 | const half* x, 27 | const int x_height, 28 | const Q4Matrix* w, 29 | half* out, 30 | bool no_zero = false, 31 | cudaStream_t alt_stream = NULL 32 | ); 33 | 34 | void q4_matmul_recons_cuda 35 | ( 36 | ExLlamaTuning* tuningParams, 37 | const half* x, 38 | const int x_height, 39 | Q4Matrix* w, 40 | half* out, 41 | const cublasHandle_t handle, 42 | bool no_zero = false 43 | ); 44 | 45 | #endif -------------------------------------------------------------------------------- /exllama_ext/cuda_func/q4_matrix.cu: -------------------------------------------------------------------------------- 1 | #include "q4_matrix.cuh" 2 | #include 3 | #include "../util.cuh" 4 | #include "../matrix.cuh" 5 | 6 | using namespace std; 7 | 8 | const int UNSHUF_BLOCKSIZE_X = 64; 9 | 10 | const int RECONS_THREADS_X = 64; // Block size and thread count along columns in out, each thread converts 1 column 11 | const int RECONS_THREADS_Y = 1; // Block size and thread count along rows in x and out, each thread converts 8 rows 12 | 13 | vector g_q4_matrices; 14 | 15 | void g_q4_keep_matrix(Q4Matrix* m) 16 | { 17 | g_q4_matrices.push_back(m); 18 | } 19 | 20 | void g_q4_free_matrices() 21 | { 22 | for (const auto& m : g_q4_matrices) delete m; 23 | g_q4_matrices.clear(); 24 | } 25 | 26 | Q4Matrix::Q4Matrix 27 | ( 28 | const int _height, 29 | const int _width, 30 | const int _groups, 31 | 32 | uint32_t* _qweight, 33 | uint32_t* _qzeros, 34 | half* _scales, 35 | uint32_t* _g_idx, 36 | 37 | const int _device 38 | ) : 39 | height(_height), 40 | width(_width), 41 | groups(_groups), 42 | device(_device) 43 | { 44 | cudaSetDevice(device); 45 | 46 | cuda_qweight = _qweight; 47 | cuda_qzeros = _qzeros; 48 | cuda_scales = _scales; 49 | 50 | groupsize = height / groups; 51 | 52 | if (_g_idx) make_sequential(_g_idx); 53 | } 54 | 55 | Q4Matrix::~Q4Matrix() 56 | { 57 | } 58 | 59 | // Make sequential 60 | 61 | __global__ void make_sequential_kernel 62 | ( 63 | const uint32_t* __restrict__ w, 64 | uint32_t* __restrict__ w_new, 65 | const uint32_t* __restrict__ x_map, 66 | const int w_height, 67 | const int w_width 68 | ) 69 | { 70 | const uint64_t* w2 = (uint64_t*) w; 71 | uint64_t* w_new2 = (uint64_t*) w_new; 72 | int w2_stride = w_width >> 1; 73 | 74 | int w2_column = UNSHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x; 75 | if (w2_column >= w2_stride) return; 76 | 77 | int w_new2_row = blockIdx.y; 78 | 79 | int x_map_idx = w_new2_row << 3; 80 | 81 | uint64_t dst = 0; 82 | 83 | 84 | #pragma unroll 85 | for (int i = 0; i < 8; i++) 86 | { 87 | int source_row = x_map[x_map_idx++]; 88 | 89 | int w2_row = source_row >> 3; 90 | int w2_subrow = source_row & 0x07; 91 | int w2_row_shift = w2_subrow << 2; 92 | int wnew2_row_shift = i << 2; 93 | 94 | uint64_t src = w2[w2_row * w2_stride + w2_column]; 95 | src >>= w2_row_shift; 96 | src &= 0x0000000f0000000f; 97 | src <<= wnew2_row_shift; 98 | dst |= src; 99 | } 100 | 101 | w_new2[w_new2_row * w2_stride + w2_column] = dst; 102 | } 103 | 104 | void Q4Matrix::make_sequential(const uint32_t* cpu_g_idx) 105 | { 106 | uint32_t* cuda_new_qweight = NULL; 107 | cudaMalloc(&cuda_new_qweight, height / 8 * width * sizeof(uint32_t)); 108 | cudaMalloc(&cuda_x_map, height * sizeof(uint32_t)); // TODO: 
Should probably be allocated in PyTorch 109 | 110 | uint32_t* cpu_g_idx_map = (uint32_t*) calloc(groups, sizeof(uint32_t)); 111 | uint32_t* cpu_x_map = (uint32_t*) malloc(height * sizeof(uint32_t)); 112 | uint32_t* cpu_x_map_inv = (uint32_t*) malloc(height * sizeof(uint32_t)); 113 | 114 | // Group histogram 115 | 116 | for (int i = 0; i < height; i++) cpu_g_idx_map[cpu_g_idx[i]]++; 117 | 118 | // Group map 119 | 120 | for (int i = 0, acc = 0; i < groups; i++) 121 | { 122 | short tmp = cpu_g_idx_map[i]; 123 | cpu_g_idx_map[i] = acc; 124 | acc += tmp; 125 | } 126 | 127 | // X map (inverse) 128 | 129 | for (int row = 0; row < height; row++) 130 | { 131 | uint32_t target_group = cpu_g_idx[row]; 132 | uint32_t target_row = cpu_g_idx_map[target_group]; 133 | cpu_g_idx_map[target_group]++; 134 | cpu_x_map_inv[row] = target_row; 135 | } 136 | 137 | // X map 138 | 139 | for (int row = 0; row < height; row++) cpu_x_map[cpu_x_map_inv[row]] = row; 140 | 141 | // Move to CUDA 142 | 143 | cudaMemcpyAsync(cuda_x_map, cpu_x_map, height * sizeof(uint32_t), cudaMemcpyHostToDevice); 144 | 145 | // Rearrange rows in w 146 | 147 | dim3 threads(UNSHUF_BLOCKSIZE_X, 1, 1); 148 | dim3 blocks 149 | ( 150 | (width + UNSHUF_BLOCKSIZE_X * 2 - 1) / (UNSHUF_BLOCKSIZE_X * 2), 151 | height / 8, 152 | 1 153 | ); 154 | 155 | make_sequential_kernel<<>>(cuda_qweight, cuda_new_qweight, cuda_x_map, height / 8, width); 156 | 157 | // Replace qweights 158 | 159 | cudaMemcpyAsync(cuda_qweight, cuda_new_qweight, height / 8 * width * sizeof(uint32_t), cudaMemcpyDeviceToDevice); 160 | 161 | // Cleanup 162 | 163 | cudaDeviceSynchronize(); 164 | cudaFree(cuda_new_qweight); 165 | free(cpu_g_idx_map); 166 | free(cpu_x_map); 167 | free(cpu_x_map_inv); 168 | } 169 | 170 | __global__ void reconstruct_kernel 171 | ( 172 | const uint32_t* __restrict__ w, 173 | half* __restrict__ out, // (y) 174 | const half* __restrict__ w_scales, 175 | const uint32_t* __restrict__ w_zeros, 176 | const int height, 177 | const int width, 178 | const int groupsize 179 | ) 180 | { 181 | // Start of block 182 | 183 | int column = RECONS_THREADS_X * blockIdx.x + threadIdx.x; 184 | int row = (RECONS_THREADS_Y * blockIdx.y + threadIdx.y) * 8; 185 | if (column >= width) return; 186 | 187 | // Views 188 | 189 | MatrixView_q4_column w_(w, height, width); 190 | MatrixView_half_rw out_(out, height, width); 191 | MatrixView_half w_scales_(w_scales, height / groupsize, width); 192 | MatrixView_q4_row w_zeros_(w_zeros, height / groupsize, width); 193 | 194 | // Groupsize version 195 | 196 | int group = row / groupsize; 197 | 198 | half w_scale = w_scales_.item(group, column); 199 | uint32_t w_zero = w_zeros_.item(group, column) + 1; 200 | 201 | uint32_t w_read = w_.item_uint32_t(row, column); 202 | half* out_ptr = out_.item_ptr(row, column); 203 | 204 | #pragma unroll 205 | for (int s = 0; s < 32; s += 4) 206 | { 207 | half w_item = __hmul(__int2half_rn((int)((w_read >> s) & 0x0f) - w_zero), w_scale); 208 | *out_ptr = w_item; out_ptr += out_.width; 209 | } 210 | } 211 | 212 | void Q4Matrix::reconstruct(half* out) 213 | { 214 | dim3 threads(RECONS_THREADS_X, RECONS_THREADS_Y, 1); 215 | 216 | dim3 blocks 217 | ( 218 | (width + threads.x - 1) / threads.x, 219 | (height / 8 + threads.y - 1) / threads.y, 220 | 1 221 | ); 222 | 223 | reconstruct_kernel<<>>(cuda_qweight, out, cuda_scales, cuda_qzeros, height / 8, width, groupsize); 224 | } -------------------------------------------------------------------------------- /exllama_ext/cuda_func/q4_matrix.cuh: 
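make_sequential above builds a row permutation (cuda_x_map) that makes rows belonging to the same quantization group contiguous, then repacks qweight through that map; reconstruct_kernel dequantizes each packed 4-bit value as (q - (zero + 1)) * scale for its group. A hypothetical NumPy sketch of the x_map construction (illustration only, not part of the extension):

import numpy as np

def build_x_map(g_idx, groups):
    counts = np.bincount(g_idx, minlength=groups)            # group histogram
    first = np.concatenate(([0], np.cumsum(counts)[:-1]))    # first destination row per group
    cursor = first.copy()
    x_map_inv = np.empty_like(g_idx)
    for row, g in enumerate(g_idx):                          # destination row for each source row
        x_map_inv[row] = cursor[g]
        cursor[g] += 1
    x_map = np.empty_like(x_map_inv)
    x_map[x_map_inv] = np.arange(len(g_idx))                 # x_map[destination] = source row
    return x_map

# e.g. build_x_map(np.array([0, 1, 0, 1]), 2) -> array([0, 2, 1, 3])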
-------------------------------------------------------------------------------- 1 | #ifndef _q4_matrix_cuh 2 | #define _q4_matrix_cuh 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | class Q4Matrix 9 | { 10 | public: 11 | 12 | int device; 13 | 14 | int height; 15 | int width; 16 | int groups; 17 | int groupsize; 18 | 19 | uint32_t* cuda_qweight = NULL; 20 | uint32_t* cuda_qzeros = NULL; 21 | half* cuda_scales = NULL; 22 | uint32_t* cuda_x_map = NULL; 23 | 24 | Q4Matrix 25 | ( 26 | const int _height, 27 | const int _width, 28 | const int _groups, 29 | 30 | uint32_t* _qweight, 31 | uint32_t* _qzeros, 32 | half* _scales, 33 | uint32_t* _g_idx, 34 | 35 | const int _device 36 | ); 37 | 38 | ~Q4Matrix(); 39 | 40 | void reconstruct(half* out); 41 | 42 | private: 43 | 44 | void make_sequential(const uint32_t* cpu_g_idx); 45 | 46 | }; 47 | 48 | void g_q4_keep_matrix(Q4Matrix* m); 49 | void g_q4_free_matrices(); 50 | 51 | #endif -------------------------------------------------------------------------------- /exllama_ext/cuda_func/q4_mlp.cu: -------------------------------------------------------------------------------- 1 | #include "q4_mlp.cuh" 2 | #include "q4_matmul.cuh" 3 | #include "half_matmul.cuh" 4 | #include "rms_norm.cuh" 5 | #include "../cuda_buffers.cuh" 6 | #include "../util.cuh" 7 | #include "../matrix.cuh" 8 | #if defined(USE_ROCM) 9 | #include "../hip_compat.cuh" 10 | #endif 11 | 12 | const int THREADS_X = 32; 13 | const int THREADS_Y = 4; 14 | // const int MAX_DIMENSION = 8192; 15 | 16 | __device__ __forceinline__ half silu(half x) 17 | { 18 | half one = __float2half(1.0f); 19 | half neg_x = __hneg(x); 20 | half e = hexp(neg_x); 21 | half sum = __hadd(one, e); 22 | half r = hrcp(sum); 23 | half result = __hmul(x, r); 24 | return result; 25 | } 26 | 27 | __device__ __forceinline__ half2 silu(half2 x) 28 | { 29 | half2 one = __float2half2_rn(1.0f); 30 | half2 neg_x = __hneg2(x); 31 | half2 e = h2exp(neg_x); 32 | half2 sum = __hadd2(one, e); 33 | half2 r = h2rcp(sum); 34 | half2 result = __hmul2(x, r); 35 | return result; 36 | } 37 | 38 | typedef void (*fp_silu_mul_cuda_kernel) 39 | ( 40 | half*, 41 | const half*, 42 | const int, 43 | const int 44 | ); 45 | 46 | template 47 | __global__ void silu_mul_cuda_kernel 48 | ( 49 | half* __restrict__ x, 50 | const half* __restrict__ y, 51 | const int height, 52 | const int width 53 | ) 54 | { 55 | MatrixView_half_rw x_(x, height, width); 56 | MatrixView_half y_(y, height, width); 57 | 58 | int column = (THREADS_X * blockIdx.x + threadIdx.x); if constexpr (use_half2) column *= 2; 59 | int row = THREADS_Y * blockIdx.y + threadIdx.y; 60 | if (row >= height) return; 61 | 62 | // silu(x) * y 63 | 64 | if constexpr (use_half2) 65 | { 66 | half2 one = __half2half2(__float2half(1.0f)); 67 | 68 | half2 x_item = x_.item_half2(row, column); 69 | half2 y_item = y_.item_half2(row, column); 70 | 71 | x_item = silu(x_item); 72 | x_item = __hmul2(x_item, y_item); 73 | 74 | x_.set_half2(row, column, x_item); 75 | } 76 | else 77 | { 78 | half one = __float2half(1.0f); 79 | 80 | half x_item = x_.item(row, column); 81 | half y_item = y_.item(row, column); 82 | 83 | x_item = silu(x_item); 84 | x_item = __hmul(x_item, y_item); 85 | 86 | x_.set(row, column, x_item); 87 | } 88 | } 89 | 90 | fp_silu_mul_cuda_kernel silu_mul_cuda_kernel_pick(ExLlamaTuning* tuningParams) 91 | { 92 | // 93 | if (tuningParams->matmul_no_half2) { 94 | return silu_mul_cuda_kernel; 95 | } else { 96 | return silu_mul_cuda_kernel; 97 | } 98 | }; 99 | 100 | void q4_mlp_cuda 101 | ( 102 | 
ExLlamaTuning* tuningParams, 103 | half* x, // shape == (height, dim) 104 | const half* rms_norm_weight, // shape == (x.shape[1],) == (dim,) 105 | float epsilon, 106 | Q4Matrix* gate, 107 | Q4Matrix* up, 108 | Q4Matrix* down, 109 | const int height, 110 | const int dim, 111 | const half* gate_a, 112 | const half* gate_b, 113 | const int gate_rank, 114 | const half* up_a, 115 | const half* up_b, 116 | const int up_rank, 117 | const half* down_a, 118 | const half* down_b, 119 | const int down_rank, 120 | half* lora_temp, 121 | cublasHandle_t handle, 122 | const int device_index 123 | ) 124 | { 125 | CudaBuffers* buffers = get_buffers(device_index); 126 | 127 | // temp_x = rms_layernorm(x) 128 | 129 | half* temp_x = buffers->temp_state + height * dim; // TOOD: .. 130 | TORCH_CHECK(buffers->temp_state_size >= 2 * height * dim, "temp_state buffer too small"); 131 | rms_norm_cuda(tuningParams, x, rms_norm_weight, temp_x, epsilon, height, dim, device_index); 132 | 133 | // temp_mlp[0] = temp_x @ gate 134 | // temp_mlp[1] = temp_x @ up 135 | 136 | half* temp_mlp_0 = buffers->temp_mlp; 137 | half* temp_mlp_1 = buffers->temp_mlp + height * up->width; 138 | int temp_mlp_width = up->width; 139 | 140 | if (gate_a) 141 | { 142 | half_matmul_cublas_cuda(tuningParams, temp_x, gate_a, lora_temp, height, dim, gate_rank, handle); 143 | half_matmul_cublas_cuda(tuningParams, lora_temp, gate_b, temp_mlp_0, height, gate_rank, temp_mlp_width, handle); 144 | } 145 | if (up_a) 146 | { 147 | half_matmul_cublas_cuda(tuningParams, temp_x, up_a, lora_temp, height, dim, up_rank, handle); 148 | half_matmul_cublas_cuda(tuningParams, lora_temp, up_b, temp_mlp_1, height, up_rank, temp_mlp_width, handle); 149 | } 150 | 151 | if (!tuningParams->concurrent_streams) 152 | { 153 | q4_matmul_cuda(tuningParams, temp_x, height, gate, temp_mlp_0, gate_a ? true : false); 154 | q4_matmul_cuda(tuningParams, temp_x, height, up, temp_mlp_1, up_a ? true : false); 155 | } 156 | else 157 | { 158 | cudaStream_t str_1 = buffers->alt_stream_1; 159 | cudaStream_t str_2 = buffers->alt_stream_2; 160 | cudaEvent_t sync_1 = buffers->alt_stream_1_done; 161 | cudaEvent_t sync_2 = buffers->alt_stream_2_done; 162 | 163 | q4_matmul_cuda(tuningParams, temp_x, height, gate, buffers->temp_mlp, gate_a ? true : false, str_1); 164 | cudaEventRecord(sync_1, str_1); 165 | 166 | q4_matmul_cuda(tuningParams, temp_x, height, up, buffers->temp_mlp + height * up->width, up_a ? true : false, str_2); 167 | cudaEventRecord(sync_2, str_2); 168 | 169 | cudaStreamWaitEvent(NULL, sync_1, 0); 170 | cudaStreamWaitEvent(NULL, sync_2, 0); 171 | } 172 | 173 | // temp_mlp[0] = silu(temp_mlp[0]) * temp_mlp[1] 174 | 175 | dim3 threads(THREADS_X, THREADS_Y, 1); 176 | 177 | dim3 blocks 178 | ( 179 | (up->width + THREADS_X - 1) / THREADS_X / (tuningParams->silu_no_half2 ? 
1 : 2), 180 | (height + THREADS_Y - 1) / THREADS_Y, 181 | 1 182 | ); 183 | 184 | fp_silu_mul_cuda_kernel kernel = silu_mul_cuda_kernel_pick(tuningParams); 185 | kernel<<>>(temp_mlp_0, temp_mlp_1, height, temp_mlp_width); 186 | 187 | // x += temp1 @ down (implicitly add the residual connection by not zeroing the output in the matmul) 188 | 189 | if (down_a) 190 | { 191 | half_matmul_cublas_cuda(tuningParams, temp_mlp_0, down_a, lora_temp, height, temp_mlp_width, down_rank, handle); 192 | half_matmul_cublas_cuda(tuningParams, lora_temp, down_b, x, height, down_rank, dim, handle, true); 193 | } 194 | q4_matmul_cuda(tuningParams, temp_mlp_0, height, down, x, true); 195 | 196 | // Reset the temp buffer after use so it's always zeros. 197 | //cudaMemsetAsync(buffers->temp_mlp, 0, 2 * height * up->width * sizeof(half)); 198 | 199 | } -------------------------------------------------------------------------------- /exllama_ext/cuda_func/q4_mlp.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _q4_mlp_cuh 2 | #define _q4_mlp_cuh 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "../tuning.h" 10 | #include "q4_matrix.cuh" 11 | 12 | void q4_mlp_cuda 13 | ( 14 | ExLlamaTuning* tuningParams, 15 | half* x, // shape == (height, dim) 16 | const half* rms_norm_weight, // shape == (x.shape[1],) == (dim,) 17 | float epsilon, 18 | Q4Matrix* gate, 19 | Q4Matrix* up, 20 | Q4Matrix* down, 21 | const int height, 22 | const int dim, 23 | const half* gate_a, 24 | const half* gate_b, 25 | const int gate_rank, 26 | const half* up_a, 27 | const half* up_b, 28 | const int up_rank, 29 | const half* down_a, 30 | const half* down_b, 31 | const int down_rank, 32 | half* lora_temp, 33 | cublasHandle_t handle, 34 | const int device_index 35 | ); 36 | 37 | #endif -------------------------------------------------------------------------------- /exllama_ext/cuda_func/rms_norm.cu: -------------------------------------------------------------------------------- 1 | #include "rms_norm.cuh" 2 | #include "../cuda_buffers.cuh" 3 | #include "../util.cuh" 4 | #include "../matrix.cuh" 5 | 6 | const int THREADS_X = 32; 7 | const int THREADS_Y = 8; 8 | const int BLOCKSIZE_X = 16; 9 | 10 | // scratch = sum(x * x, dim = -1) 11 | 12 | typedef void (*fp_rms_norm_row_product_kernel) 13 | ( 14 | half*, 15 | float*, 16 | const int, 17 | const int 18 | ); 19 | 20 | template 21 | __global__ void rms_norm_row_product_kernel 22 | ( 23 | half* __restrict__ x, 24 | float* __restrict__ scratch, 25 | const int rows, 26 | const int dim 27 | ) 28 | { 29 | int column = (THREADS_X * blockIdx.x + threadIdx.x) * BLOCKSIZE_X; 30 | int row = THREADS_Y * blockIdx.y + threadIdx.y; 31 | if (row >= rows) return; 32 | if (column >= dim) return; 33 | 34 | // if (column == 0) 35 | // { 36 | // scratch[row] = 0.0f; 37 | // __syncthreads(); 38 | // } 39 | 40 | float acc = 0.0f; 41 | int idx = row * dim + column; 42 | 43 | // Accumulate 44 | 45 | if constexpr (use_half2) 46 | { 47 | half2* x_ptr = (half2*) &x[idx]; 48 | 49 | #pragma unroll 50 | for (int k = 0; k < BLOCKSIZE_X / 2; k++) 51 | { 52 | half2 x2 = *x_ptr++; 53 | float m0 = __half2float(x2.x); 54 | float m1 = __half2float(x2.y); 55 | acc = fma(m0, m0, acc); 56 | acc = fma(m1, m1, acc); 57 | } 58 | } 59 | else 60 | { 61 | half* x_ptr = x + idx; 62 | 63 | #pragma unroll 64 | for (int k = 0; k < BLOCKSIZE_X; k++) 65 | { 66 | float m0 = __half2float(*x_ptr++); 67 | acc = fma(m0, m0, acc); 68 | } 69 | } 70 | 71 | // // Use Warp Shuffle to 
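The normalization implemented by rms_norm_cuda here is the same one q4_mlp_cuda applies before its gate/up projections. A hypothetical PyTorch reference for the whole fused MLP block above, with the LoRA terms omitted (illustration only, not part of the extension):

import torch
import torch.nn.functional as F

def fused_mlp_ref(x, norm_w, eps, w_gate, w_up, w_down):
    # rms_norm_cuda: h = x / sqrt(mean(x^2) + eps) * weight
    h = x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + eps) * norm_w
    # q4_matmul_cuda on gate and up, then silu_mul_cuda_kernel: silu(gate) * up
    a = F.silu(h @ w_gate) * (h @ w_up)
    # down projection with no_zero=True adds onto x, giving the residual connection for free
    return x + a @ w_down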
accumulate within the warp 72 | // 73 | // for (int offset = warpSize / 2; offset > 0; offset /= 2) 74 | // acc += __shfl_down_sync(0xffffffff, acc, offset); 75 | // if (threadIdx.x % warpSize == 0) 76 | // atomicAdd(&scratch[row], acc); 77 | 78 | atomicAdd(&scratch[row], acc); 79 | } 80 | 81 | // x = x * w / sqrt(scratch / dim + epsilon) 82 | 83 | typedef void (*fp_rms_norm_kernel) 84 | ( 85 | half*, 86 | const half*, 87 | half*, 88 | float*, 89 | const float, 90 | const float, 91 | const int, 92 | const int 93 | ); 94 | 95 | template 96 | __global__ void rms_norm_kernel 97 | ( 98 | half* __restrict__ x, 99 | const half* __restrict__ w, 100 | half* __restrict__ out, 101 | float* __restrict__ scratch, 102 | const float epsilon, 103 | const float r_dim, 104 | const int rows, 105 | const int dim 106 | ) 107 | { 108 | int column = (THREADS_X * blockIdx.x + threadIdx.x) * BLOCKSIZE_X; 109 | int row = THREADS_Y * blockIdx.y + threadIdx.y; 110 | if (row >= rows) return; 111 | if (column >= dim) return; 112 | 113 | float rmf = rsqrtf(scratch[row] * r_dim + epsilon); 114 | half rm = __float2half_rn(rmf); 115 | half2 rm2 = __half2half2(rm); 116 | 117 | if constexpr (use_half2) 118 | { 119 | half2* x2_ptr = (half2*) &x[row * dim + column]; 120 | half2* out2_ptr = (half2*) &out[row * dim + column]; 121 | const half2* w2_ptr = (const half2*) &w[column]; 122 | 123 | #pragma unroll 124 | for (int k = 0; k < BLOCKSIZE_X / 2; k++) 125 | { 126 | half2 m2 = *x2_ptr++; 127 | half2 w2 = *w2_ptr++; 128 | m2 = __hmul2(m2, rm2); 129 | m2 = __hmul2(m2, w2); 130 | *out2_ptr++ = m2; 131 | } 132 | } 133 | else 134 | { 135 | half* x_ptr = &x[row * dim + column]; 136 | half* out_ptr = &out[row * dim + column]; 137 | const half* w_ptr = &w[column]; 138 | 139 | #pragma unroll 140 | for (int k = 0; k < BLOCKSIZE_X; k++) 141 | { 142 | half m = *x_ptr++; 143 | half w = *w_ptr++; 144 | m = __hmul(m, rm); 145 | m = __hmul(m, w); 146 | *out_ptr++ = m; 147 | } 148 | } 149 | 150 | // __syncthreads(); 151 | // if (column >= dim - BLOCKSIZE_X) scratch[row] = 0.0f; 152 | } 153 | 154 | fp_rms_norm_row_product_kernel rms_norm_row_product_kernel_pick(ExLlamaTuning* tuningParams) 155 | { 156 | // 157 | if (tuningParams->matmul_no_half2) { 158 | return rms_norm_row_product_kernel; 159 | } else { 160 | return rms_norm_row_product_kernel; 161 | } 162 | }; 163 | 164 | fp_rms_norm_kernel rms_norm_kernel_pick(ExLlamaTuning* tuningParams) 165 | { 166 | // 167 | if (tuningParams->matmul_no_half2) { 168 | return rms_norm_kernel; 169 | } else { 170 | return rms_norm_kernel; 171 | } 172 | }; 173 | 174 | // x = x * w / sqrt(row_mean(x * x) + epsilon) 175 | // 176 | // works in-place if x == out 177 | 178 | void rms_norm_cuda 179 | ( 180 | ExLlamaTuning* tuningParams, 181 | half* x, 182 | const half* w, 183 | half* out, 184 | const float epsilon, 185 | const int rows, 186 | const int dim, 187 | const int device_index 188 | ) 189 | { 190 | CudaBuffers* buffers = get_buffers(device_index); 191 | float* temp = buffers->get_zeros_float(rows); 192 | 193 | float r_dim = 1.0f / (float) dim; 194 | 195 | dim3 threads(THREADS_X, THREADS_Y, 1); 196 | 197 | dim3 blocks 198 | ( 199 | ((dim + THREADS_X - 1) / THREADS_X + THREADS_X - 1) / BLOCKSIZE_X, 200 | (rows + THREADS_Y - 1) / THREADS_Y, 201 | 1 202 | ); 203 | 204 | //cudaMemsetAsync(temp, 0, rows * sizeof(float)); 205 | 206 | fp_rms_norm_row_product_kernel kernel1 = rms_norm_row_product_kernel_pick(tuningParams); 207 | kernel1<<>>(x, temp, rows, dim); 208 | 209 | fp_rms_norm_kernel kernel2 = 
rms_norm_kernel_pick(tuningParams); 210 | kernel2<<>>(x, w, out, temp, epsilon, r_dim, rows, dim); 211 | 212 | //cudaMemsetAsync(temp, 0, rows * sizeof(float)); 213 | } 214 | -------------------------------------------------------------------------------- /exllama_ext/cuda_func/rms_norm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _rms_norm_cuh 2 | #define _rms_norm_cuh 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "../tuning.h" 9 | 10 | void rms_norm_cuda 11 | ( 12 | ExLlamaTuning* tuningParams, 13 | half* x, 14 | const half* w, 15 | half* out, 16 | const float epsilon, 17 | const int rows, 18 | const int dim, 19 | const int device_index 20 | ); 21 | 22 | #endif -------------------------------------------------------------------------------- /exllama_ext/cuda_func/rope.cu: -------------------------------------------------------------------------------- 1 | #include "rope.cuh" 2 | #include "../util.cuh" 3 | #include "../matrix.cuh" 4 | 5 | const int THREADS_X = 32; 6 | const int THREADS_Y = 4; 7 | const int MAX_POS_EMBEDDINGS = 32768; // Actual number doesn't matter 8 | 9 | typedef void (*fp_rope_cuda_kernel) 10 | ( 11 | half*, 12 | const half*, 13 | const half*, 14 | int, 15 | int, 16 | int, 17 | int 18 | ); 19 | 20 | template 21 | __global__ void rope_cuda_kernel 22 | ( 23 | half* __restrict__ x, 24 | const half* __restrict__ sin, 25 | const half* __restrict__ cos, 26 | int rows_per_batch, 27 | int head_dim, 28 | int num_heads, 29 | int past_len 30 | ) 31 | { 32 | // These heights aren't used so it's okay if they're wrong. 33 | MatrixView_half_rw x_(x, rows_per_batch, head_dim); 34 | MatrixView_half sin_(sin, MAX_POS_EMBEDDINGS, head_dim); 35 | MatrixView_half cos_(cos, MAX_POS_EMBEDDINGS, head_dim); 36 | 37 | int column = (blockIdx.x * THREADS_X + threadIdx.x); if constexpr (use_half2) column *= 2; 38 | int half_dim = head_dim / 2; 39 | if (column >= half_dim) return; 40 | 41 | int row = blockIdx.y * THREADS_Y + threadIdx.y; 42 | if (row >= rows_per_batch) return; 43 | int batch_offset = blockIdx.z * rows_per_batch; 44 | int row_offset = batch_offset + row; 45 | 46 | // Get sin and cos 47 | 48 | int sincos_row = past_len + row / num_heads; 49 | 50 | if constexpr (use_half2) 51 | { 52 | half2 cos2_l = cos_.item_half2(sincos_row, column); 53 | half2 cos2_r = cos_.item_half2(sincos_row, column + half_dim); 54 | half2 sin2_l = sin_.item_half2(sincos_row, column); 55 | half2 sin2_r = sin_.item_half2(sincos_row, column + half_dim); 56 | sin2_l = __hneg2(sin2_l); 57 | 58 | // Apply embedding to row 59 | 60 | half2 item2_l = x_.item_half2(row_offset, column); 61 | half2 item2_r = x_.item_half2(row_offset, column + half_dim); 62 | half2 item2_ls = __hmul2(item2_r, sin2_l); 63 | half2 item2_rs = __hmul2(item2_l, sin2_r); 64 | item2_l = __hfma2(item2_l, cos2_l, item2_ls); 65 | item2_r = __hfma2(item2_r, cos2_r, item2_rs); 66 | x_.set_half2(row_offset, column, item2_l); 67 | x_.set_half2(row_offset, column + half_dim, item2_r); 68 | } 69 | else 70 | { 71 | half cos_l = cos_.item(sincos_row, column); 72 | half cos_r = cos_.item(sincos_row, column + half_dim); 73 | half sin_l = sin_.item(sincos_row, column); 74 | half sin_r = sin_.item(sincos_row, column + half_dim); 75 | sin_l = __hneg(sin_l); 76 | 77 | // Apply embedding to row 78 | 79 | half item_l = x_.item(row_offset, column); 80 | half item_r = x_.item(row_offset, column + half_dim); 81 | half item_ls = __hmul(item_r, sin_l); 82 | half item_rs = __hmul(item_l, sin_r); 83 | 
item_l = __hfma(item_l, cos_l, item_ls); 84 | item_r = __hfma(item_r, cos_r, item_rs); 85 | x_.set(row_offset, column, item_l); 86 | x_.set(row_offset, column + half_dim, item_r); 87 | } 88 | } 89 | 90 | fp_rope_cuda_kernel rope_cuda_kernel_pick(ExLlamaTuning* tuningParams) 91 | { 92 | // 93 | if (tuningParams->matmul_no_half2) { 94 | return rope_cuda_kernel; 95 | } else { 96 | return rope_cuda_kernel; 97 | } 98 | }; 99 | 100 | void rope_cuda 101 | ( 102 | ExLlamaTuning* tuningParams, 103 | half* x, 104 | const half* sin, 105 | const half* cos, 106 | const int bsz, 107 | const int rows_per_batch, 108 | const int head_dim, 109 | const int num_heads, 110 | const int past_len, 111 | cudaStream_t alt_stream 112 | ) 113 | { 114 | dim3 threads(THREADS_X, THREADS_Y, 1); 115 | 116 | dim3 blocks 117 | ( 118 | (head_dim + THREADS_X - 1) / THREADS_X / 2 / (tuningParams->rope_no_half2 ? 1 : 2), 119 | (rows_per_batch + THREADS_Y - 1) / THREADS_Y, 120 | int(bsz) 121 | ); 122 | 123 | fp_rope_cuda_kernel kernel = rope_cuda_kernel_pick(tuningParams); 124 | kernel<<>>(x, sin, cos, rows_per_batch, head_dim, num_heads, past_len); 125 | } 126 | -------------------------------------------------------------------------------- /exllama_ext/cuda_func/rope.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _rope_cuh 2 | #define _rope_cuh 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "../tuning.h" 9 | 10 | void rope_cuda 11 | ( 12 | ExLlamaTuning* tuningParams, 13 | half* x, 14 | const half* sin, 15 | const half* cos, 16 | const int bsz, 17 | const int rows, 18 | const int head_dim, 19 | const int num_heads, 20 | const int past_len, 21 | cudaStream_t alt_stream = NULL 22 | ); 23 | 24 | #endif -------------------------------------------------------------------------------- /exllama_ext/hip_compat.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _hip_compat_cuh 2 | #define _hip_compat_cuh 3 | 4 | // Workaround for a bug in hipamd, backported from upstream, this is fixed in ROCm 5.6. 5 | __device__ __forceinline__ __half __compat_hrcp(__half x) { 6 | return __half_raw{ 7 | static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half_raw>(x).data))}; 8 | } 9 | 10 | __device__ __forceinline__ __half2 __compat_h2rcp(__half2 x) { 11 | return _Float16_2{static_cast<_Float16>(__builtin_amdgcn_rcph(x.x)), 12 | static_cast<_Float16>(__builtin_amdgcn_rcph(x.y))}; 13 | } 14 | 15 | #define hrcp __compat_hrcp 16 | #define h2rcp __compat_h2rcp 17 | 18 | // Automatic conversion of hipblasHgemm doesn't convert half to hipblasHalf. 19 | __host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t handle, 20 | hipblasOperation_t transA, 21 | hipblasOperation_t transB, 22 | int m, 23 | int n, 24 | int k, 25 | const half* alpha, 26 | const half* AP, 27 | int lda, 28 | const half* BP, 29 | int ldb, 30 | const half* beta, 31 | half* CP, 32 | int ldc) { 33 | return hipblasHgemm(handle, transA, transB, m, n, k, 34 | reinterpret_cast(alpha), 35 | reinterpret_cast(AP), lda, 36 | reinterpret_cast(BP), ldb, 37 | reinterpret_cast(beta), 38 | reinterpret_cast(CP), ldc); 39 | } 40 | #define hipblasHgemm __compat_hipblasHgemm 41 | 42 | // Previous version of PyTorch were converting to rocBLAS instead of hipBLAS. 
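rope_cuda_kernel above is the usual rotate-half formulation: at position past_len + t, the lower half of each head is combined with the negated-sin term of the upper half and vice versa. A hypothetical PyTorch sketch, assuming the standard sin/cos tables duplicated across both halves of head_dim (illustration only, not part of the extension):

import torch

def apply_rope_ref(x, sin, cos, past_len):
    # x: (..., seq_len, head_dim); sin/cos: (max_pos, head_dim), indexed by absolute position
    seq_len, head_dim = x.shape[-2], x.shape[-1]
    s = sin[past_len:past_len + seq_len]
    c = cos[past_len:past_len + seq_len]
    lo, hi = x[..., :head_dim // 2], x[..., head_dim // 2:]
    rotated = torch.cat((-hi, lo), dim=-1)
    return x * c + rotated * s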
43 | #define rocblas_handle hipblasHandle_t 44 | #define rocblas_operation_none HIPBLAS_OP_N 45 | #define rocblas_get_stream hipblasGetStream 46 | #define rocblas_set_stream hipblasSetStream 47 | #define rocblas_hgemm __compat_hipblasHgemm 48 | 49 | #endif 50 | -------------------------------------------------------------------------------- /exllama_ext/matrix.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _matrix_cuh 2 | #define _matrix_cuh 3 | 4 | #include 5 | #include 6 | 7 | //#include "cuda_buffers.cuh" 8 | 9 | class MatrixView_half 10 | { 11 | public: 12 | const half* data; 13 | const int height; 14 | const int width; 15 | 16 | __device__ __forceinline__ MatrixView_half(const half* data, const int height, const int width) 17 | : data(data), height(height), width(width) 18 | { } 19 | 20 | __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; } 21 | __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; } 22 | __device__ __forceinline__ half2 item_half2half2(int row, int column) const { return __half2half2(data[row * width + column]); } 23 | __device__ __forceinline__ const half* item_ptr(int row, int column) const { return &data[row * width + column]; } 24 | }; 25 | 26 | class MatrixView_half_rw 27 | { 28 | public: 29 | half* data; 30 | const int height; 31 | const int width; 32 | 33 | __device__ __forceinline__ MatrixView_half_rw(half* data, const int height, const int width) 34 | : data(data), height(height), width(width) 35 | { } 36 | 37 | __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; } 38 | __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; } 39 | __device__ __forceinline__ half* item_ptr(int row, int column) { return &data[row * width + column]; } 40 | __device__ __forceinline__ void set(int row, int column, half value) { data[row * width + column] = value; } 41 | __device__ __forceinline__ void set_half2(int row, int column, half2 value) { ((half2*)data)[(row * width + column) / 2] = value; } 42 | }; 43 | 44 | class MatrixView_q4_row 45 | { 46 | public: 47 | const uint32_t* data; 48 | const int height; 49 | const int width; 50 | 51 | __device__ __forceinline__ MatrixView_q4_row(const uint32_t* data, const int height, const int width) 52 | : data(data), height(height), width(width) 53 | { } 54 | 55 | __device__ __forceinline__ int item(int row, int column) const 56 | { 57 | int shift = (column & 0x07) * 4; 58 | return (data[row * width / 8 + column / 8] >> shift) & 0x0f; 59 | } 60 | }; 61 | 62 | class MatrixView_q4_column 63 | { 64 | public: 65 | const uint32_t* data; 66 | const int height; 67 | const int width; 68 | 69 | __device__ __forceinline__ MatrixView_q4_column(const uint32_t* data, const int height, const int width) 70 | : data(data), height(height), width(width) 71 | { } 72 | 73 | __device__ __forceinline__ int item(int row, int column) const 74 | { 75 | int shift = (row & 0x07) * 4; 76 | return (data[row / 8 * width + column] >> shift) & 0x0f; 77 | } 78 | 79 | __device__ __forceinline__ uint32_t item_uint32_t(int row, int column) { return data[row / 8 * width + column]; } 80 | __device__ __forceinline__ const uint32_t* item_uint32_ptr(int row, int column) { return &data[row / 8 * width + column]; } 81 | }; 82 | 83 | // TODO: Rewrite all these dot product functions using functors 
or something, move to q4_matmul.cu 84 | 85 | // Accumulated dot product of 8-element row vectors in h and quantized column vectors in v, constant zero/scale 86 | 87 | __device__ __forceinline__ half2 dot_product_8 88 | ( 89 | const half2 acc, 90 | const half2* h_ptr, 91 | MatrixView_q4_column& v_, 92 | const int v_row, // divisible by 8 93 | const int v_column, 94 | const half2 v_scale_2, 95 | const uint32_t v_zero, // + 1 (!!) 96 | const int count 97 | ) 98 | { 99 | const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column); 100 | half2 result = acc; 101 | 102 | for (int i = 0; i < count; i++) 103 | { 104 | uint32_t v_read = *v_ptr; v_ptr += v_.width; 105 | 106 | half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero); 107 | half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero); 108 | half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero); 109 | half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero); 110 | half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero); 111 | half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero); 112 | half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero); 113 | half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero); 114 | 115 | half2 v_01 = __halves2half2(v_0, v_1); 116 | half2 v_23 = __halves2half2(v_2, v_3); 117 | half2 v_45 = __halves2half2(v_4, v_5); 118 | half2 v_67 = __halves2half2(v_6, v_7); 119 | 120 | // half2 v_01 = q4_table[v_zero - 1][(v_read ) & 0xff]; // (constant memory is too slow apparently) 121 | // half2 v_23 = q4_table[v_zero - 1][(v_read >> 8) & 0xff]; 122 | // half2 v_45 = q4_table[v_zero - 1][(v_read >> 16) & 0xff]; 123 | // half2 v_67 = q4_table[v_zero - 1][(v_read >> 24) ]; 124 | 125 | half2 tmp = __hmul2(*h_ptr++, v_01); 126 | tmp = __hfma2(*h_ptr++, v_23, tmp); 127 | tmp = __hfma2(*h_ptr++, v_45, tmp); 128 | tmp = __hfma2(*h_ptr++, v_67, tmp); 129 | result = __hfma2(v_scale_2, tmp, result); 130 | } 131 | 132 | return result; 133 | } 134 | 135 | __device__ __forceinline__ half dot_product_8_h 136 | ( 137 | const half acc, 138 | const half* h_ptr, 139 | MatrixView_q4_column& v_, 140 | const int v_row, // divisible by 8 141 | const int v_column, 142 | const half v_scale, 143 | const uint32_t v_zero, // + 1 (!!) 
144 | const int count 145 | ) 146 | { 147 | const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column); 148 | half result = acc; 149 | 150 | for (int i = 0; i < count; i++) 151 | { 152 | uint32_t v_read = *v_ptr; v_ptr += v_.width; 153 | 154 | half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero); 155 | half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero); 156 | half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero); 157 | half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero); 158 | half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero); 159 | half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero); 160 | half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero); 161 | half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero); 162 | 163 | half tmp = __hmul(*h_ptr++, v_0); 164 | tmp = __hfma(*h_ptr++, v_1, tmp); 165 | tmp = __hfma(*h_ptr++, v_2, tmp); 166 | tmp = __hfma(*h_ptr++, v_3, tmp); 167 | tmp = __hfma(*h_ptr++, v_4, tmp); 168 | tmp = __hfma(*h_ptr++, v_5, tmp); 169 | tmp = __hfma(*h_ptr++, v_6, tmp); 170 | tmp = __hfma(*h_ptr++, v_7, tmp); 171 | result = __hfma(v_scale, tmp, result); 172 | } 173 | 174 | return result; 175 | } 176 | 177 | // Accumulated dot product of 8-element row vectors in h and quantized column vectors in v, constant zero/scale, with x_map 178 | 179 | __device__ __forceinline__ half2 dot_product_8_x_map 180 | ( 181 | const half2 acc, 182 | MatrixView_half& h_, 183 | const int h_row, 184 | const int h_column, // divisible by 8 185 | MatrixView_q4_column& v_, 186 | const int v_row, // divisible by 8 187 | const int v_column, 188 | const half2 v_scale_2, 189 | const uint32_t v_zero, // + 1 (!!) 190 | const int count, 191 | const uint32_t* x_map 192 | ) 193 | { 194 | const half* h_ptr = h_.item_ptr(h_row, 0); 195 | const uint32_t* x_map_ptr = x_map + h_column; 196 | const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column); 197 | half2 result = acc; 198 | 199 | for (int i = 0; i < count; i++) 200 | { 201 | uint32_t v_read = *v_ptr; v_ptr += v_.width; 202 | 203 | half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero); 204 | half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero); 205 | half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero); 206 | half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero); 207 | half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero); 208 | half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero); 209 | half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero); 210 | half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero); 211 | 212 | half2 v_01 = __halves2half2(v_0, v_1); 213 | half2 v_23 = __halves2half2(v_2, v_3); 214 | half2 v_45 = __halves2half2(v_4, v_5); 215 | half2 v_67 = __halves2half2(v_6, v_7); 216 | 217 | half h_0 = h_ptr[*x_map_ptr++]; 218 | half h_1 = h_ptr[*x_map_ptr++]; 219 | half h_2 = h_ptr[*x_map_ptr++]; 220 | half h_3 = h_ptr[*x_map_ptr++]; 221 | half h_4 = h_ptr[*x_map_ptr++]; 222 | half h_5 = h_ptr[*x_map_ptr++]; 223 | half h_6 = h_ptr[*x_map_ptr++]; 224 | half h_7 = h_ptr[*x_map_ptr++]; 225 | 226 | half2 h_01 = __halves2half2(h_0, h_1); 227 | half2 h_23 = __halves2half2(h_2, h_3); 228 | half2 h_45 = __halves2half2(h_4, h_5); 229 | half2 h_67 = __halves2half2(h_6, h_7); 230 | 231 | half2 tmp = __hmul2(h_01, v_01); 232 | tmp = __hfma2(h_23, v_23, tmp); 233 | tmp = __hfma2(h_45, v_45, tmp); 234 | tmp = __hfma2(h_67, v_67, tmp); 235 | result = __hfma2(v_scale_2, tmp, result); 
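All four dot-product helpers in this header follow the same per-uint32 pattern: unpack eight 4-bit weights, shift them by the zero point (which the caller has already incremented by one), dot them with eight activations and scale by the group scale. A plain-Python reference for a single iteration (illustration only, not part of the extension):

def dot_product_8_ref(acc, h8, v_read, v_scale, v_zero_plus_1):
    # h8: eight activation values; v_read: one uint32 packing eight 4-bit weights
    w8 = [((v_read >> s) & 0x0F) - v_zero_plus_1 for s in range(0, 32, 4)]
    return acc + v_scale * sum(h * w for h, w in zip(h8, w8))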
236 | } 237 | 238 | return result; 239 | } 240 | 241 | __device__ __forceinline__ half dot_product_8_x_map_h 242 | ( 243 | const half acc, 244 | MatrixView_half& h_, 245 | const int h_row, 246 | const int h_column, // divisible by 8 247 | MatrixView_q4_column& v_, 248 | const int v_row, // divisible by 8 249 | const int v_column, 250 | const half v_scale, 251 | const uint32_t v_zero, // + 1 (!!) 252 | const int count, 253 | const uint32_t* x_map 254 | ) 255 | { 256 | const half* h_ptr = h_.item_ptr(h_row, 0); 257 | const uint32_t* x_map_ptr = x_map + h_column; 258 | const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column); 259 | half result = acc; 260 | 261 | for (int i = 0; i < count; i++) 262 | { 263 | uint32_t v_read = *v_ptr; v_ptr += v_.width; 264 | 265 | half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero); 266 | half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero); 267 | half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero); 268 | half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero); 269 | half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero); 270 | half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero); 271 | half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero); 272 | half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero); 273 | 274 | half tmp = __hmul(h_ptr[*x_map_ptr++], v_0); 275 | tmp = __hfma(h_ptr[*x_map_ptr++], v_1, tmp); 276 | tmp = __hfma(h_ptr[*x_map_ptr++], v_2, tmp); 277 | tmp = __hfma(h_ptr[*x_map_ptr++], v_3, tmp); 278 | tmp = __hfma(h_ptr[*x_map_ptr++], v_4, tmp); 279 | tmp = __hfma(h_ptr[*x_map_ptr++], v_5, tmp); 280 | tmp = __hfma(h_ptr[*x_map_ptr++], v_6, tmp); 281 | tmp = __hfma(h_ptr[*x_map_ptr++], v_7, tmp); 282 | result = __hfma(v_scale, tmp, result); 283 | } 284 | 285 | return result; 286 | } 287 | 288 | #endif 289 | -------------------------------------------------------------------------------- /exllama_ext/tuning.h: -------------------------------------------------------------------------------- 1 | #ifndef _tuning_h 2 | #define _tuning_h 3 | 4 | struct ExLlamaTuning 5 | { 6 | int matmul_recons_thd; 7 | int fused_mlp_thd; 8 | int sdp_thd; 9 | bool matmul_fused_remap; 10 | 11 | bool rmsnorm_no_half2; 12 | bool rope_no_half2; 13 | bool matmul_no_half2; 14 | bool silu_no_half2; 15 | bool concurrent_streams; 16 | }; 17 | 18 | #endif -------------------------------------------------------------------------------- /exllama_ext/util.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _util_cuh 2 | #define _util_cuh 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #if defined(USE_ROCM) 10 | #define cudaUnspecified hipErrorUnknown 11 | #else 12 | #define cudaUnspecified cudaErrorApiFailureBase 13 | #endif 14 | 15 | // React to failure on return code != cudaSuccess 16 | 17 | #define _cuda_check(fn) \ 18 | do { \ 19 | {_cuda_err = fn;} \ 20 | if (_cuda_err != cudaSuccess) goto _cuda_fail; \ 21 | } while(false) 22 | 23 | // React to failure on return code == 0 24 | 25 | #define _alloc_check(fn) \ 26 | do { \ 27 | if (!(fn)) { _cuda_err = cudaUnspecified; goto _cuda_fail; } \ 28 | else _cuda_err = cudaSuccess; \ 29 | } while(false) 30 | 31 | // Clone CPU <-> CUDA 32 | 33 | template 34 | T* cuda_clone(const void* ptr, int num) 35 | { 36 | T* cuda_ptr; 37 | cudaError_t r; 38 | 39 | r = cudaMalloc(&cuda_ptr, num * sizeof(T)); 40 | if (r != cudaSuccess) return NULL; 41 | r = cudaMemcpy(cuda_ptr, ptr, num * sizeof(T), 
cudaMemcpyHostToDevice); 42 | if (r != cudaSuccess) return NULL; 43 | cudaDeviceSynchronize(); 44 | return cuda_ptr; 45 | } 46 | 47 | template 48 | T* cpu_clone(const void* ptr, int num) 49 | { 50 | T* cpu_ptr; 51 | cudaError_t r; 52 | 53 | cpu_ptr = (T*) malloc(num * sizeof(T)); 54 | if (cpu_ptr == NULL) return NULL; 55 | r = cudaMemcpy(cpu_ptr, ptr, num * sizeof(T), cudaMemcpyDeviceToHost); 56 | if (r != cudaSuccess) return NULL; 57 | cudaDeviceSynchronize(); 58 | return cpu_ptr; 59 | } 60 | 61 | // Pack two half values into a half2, host version 62 | 63 | __host__ inline __half2 pack_half2(__half h1, __half h2) 64 | { 65 | unsigned short s1 = *reinterpret_cast(&h1); 66 | unsigned short s2 = *reinterpret_cast(&h2); 67 | ushort2 us2 = make_ushort2(s1, s2); 68 | return *reinterpret_cast<__half2*>(&us2); 69 | } 70 | 71 | #endif -------------------------------------------------------------------------------- /globals.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def set_affinity_mask(affinity_mask = None): 4 | 5 | if affinity_mask is None: 6 | cpu_count = os.cpu_count() 7 | affinity_mask = set(range(cpu_count)) 8 | 9 | os.sched_setaffinity(0, affinity_mask) 10 | 11 | 12 | def set_affinity_list(affinity_list = None): 13 | 14 | if affinity_list is None: set_affinity_mask(None) 15 | else: set_affinity_mask(set(affinity_list)) 16 | 17 | 18 | def set_affinity_str(affinity_str = None): 19 | 20 | if affinity_str is None or affinity_str.isspace(): set_affinity_mask(None) 21 | aff = [int(alloc) for alloc in affinity_str.split(",")] 22 | set_affinity_list(aff) 23 | -------------------------------------------------------------------------------- /model_init.py: -------------------------------------------------------------------------------- 1 | from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig 2 | from exllama.tokenizer import ExLlamaTokenizer 3 | import argparse, sys, os, glob 4 | from torch import version as torch_version 5 | from globals import set_affinity_str 6 | 7 | def add_args(parser): 8 | 9 | parser.add_argument("-t", "--tokenizer", type = str, help = "Tokenizer model path") 10 | parser.add_argument("-c", "--config", type = str, help = "Model config path (config.json)") 11 | parser.add_argument("-m", "--model", type = str, help = "Model weights path (.pt or .safetensors file)") 12 | parser.add_argument("-d", "--directory", type = str, help = "Path to directory containing config.json, model.tokenizer and * .safetensors") 13 | 14 | parser.add_argument("-gs", "--gpu_split", type = str, help = "Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. 
-gs 20,7,7") 15 | parser.add_argument("-l", "--length", type = int, help = "Maximum sequence length", default = 2048) 16 | parser.add_argument("-cpe", "--compress_pos_emb", type = float, help = "Compression factor for positional embeddings", default = 1.0) 17 | parser.add_argument("-a", "--alpha", type = float, help = "alpha for context size extension via embedding extension", default = 1.0) 18 | parser.add_argument("-theta", "--theta", type = float, help = "theta (base) for RoPE embeddings") 19 | 20 | parser.add_argument("-gpfix", "--gpu_peer_fix", action = "store_true", help = "Prevent direct copies of data between GPUs") 21 | 22 | parser.add_argument("-flash", "--flash_attn", nargs = '?', const = 'default', metavar = "METHOD", help = "Use Flash Attention with specified input length (must have Flash Attention 2.0 installed)") 23 | 24 | parser.add_argument("-mmrt", "--matmul_recons_thd", type = int, help = "No. rows at which to use reconstruction and cuBLAS for quant matmul. 0 = never, 1 = always", default = 8) 25 | parser.add_argument("-fmt", "--fused_mlp_thd", type = int, help = "Maximum no. of rows for which to use fused MLP. 0 = never", default = 2) 26 | parser.add_argument("-sdpt", "--sdp_thd", type = int, help = "No. rows at which to switch to scaled_dot_product_attention. 0 = never, 1 = always", default = 8) 27 | parser.add_argument("-mmfr", "--matmul_fused_remap", action = "store_true", help = "Fuse column remapping in Q4 matmul kernel") 28 | parser.add_argument("-nfa", "--no_fused_attn", action = "store_true", help = "Disable fused attention") 29 | 30 | parser.add_argument("-rnnh2", "--rmsnorm_no_half2", action = "store_true", help = "Don't use half2 in RMS norm kernel") 31 | parser.add_argument("-rpnh2", "--rope_no_half2", action = "store_true", help = "Don't use half2 in RoPE kernel") 32 | parser.add_argument("-mmnh2", "--matmul_no_half2", action = "store_true", help = "Don't use half2 in Q4 matmul kernel") 33 | parser.add_argument("-snh2", "--silu_no_half2", action = "store_true", help = "Don't use half2 in SiLU kernel") 34 | parser.add_argument("-nh2", "--no_half2", action = "store_true", help = "(All of the above) disable half2 in all kernela") 35 | parser.add_argument("-fh2", "--force_half2", action = "store_true", help = "Force enable half2 even if unsupported") 36 | parser.add_argument("-cs", "--concurrent_streams", action = "store_true", help = "Use concurrent CUDA streams") 37 | 38 | parser.add_argument("-aff", "--affinity", type = str, help = "Comma-separated list, sets processor core affinity. E.g.: -aff 0,1,2,3") 39 | 40 | 41 | def post_parse(args): 42 | 43 | if args.no_half2 or torch_version.hip and not args.force_half2: 44 | args.rmsnorm_no_half2 = True 45 | args.rope_no_half2 = True 46 | args.matmul_no_half2 = True 47 | args.silu_no_half2 = True 48 | 49 | 50 | # Get model files from --directory 51 | 52 | def get_model_files(args): 53 | 54 | if args.directory is not None: 55 | args.tokenizer = os.path.join(args.directory, "tokenizer.model") 56 | args.config = os.path.join(args.directory, "config.json") 57 | st_pattern = os.path.join(args.directory, "*.safetensors") 58 | st = glob.glob(st_pattern) 59 | if len(st) == 0: 60 | print(f" !! No files matching {st_pattern}") 61 | sys.exit() 62 | # if len(st) > 1: 63 | # print(f" !! Multiple files matching {st_pattern}") 64 | # sys.exit() 65 | args.model = st 66 | else: 67 | if args.tokenizer is None or args.config is None or args.model is None: 68 | print(" !! 
Please specify either -d or all of -t, -c and -m") 69 | sys.exit() 70 | 71 | 72 | # Feedback 73 | 74 | def _common_chars(names): 75 | cname = max(names, key = len) 76 | for x in names: 77 | for p, c in enumerate(x): 78 | if c != cname[p] and cname[p] != "*": cname = cname[:p] + "*" + cname[p+1:] 79 | return cname 80 | 81 | def print_options(args, extra_options = None): 82 | 83 | print_opts = [] 84 | if args.gpu_split is not None: print_opts.append(f"gpu_split: {args.gpu_split}") 85 | if args.gpu_peer_fix: print_opts.append("gpu_peer_fix") 86 | if args.affinity: print_opts.append(f" --affinity: {args.affinity}") 87 | 88 | if extra_options is not None: print_opts += extra_options 89 | 90 | print(f" -- Tokenizer: {args.tokenizer}") 91 | print(f" -- Model config: {args.config}") 92 | 93 | if isinstance(args.model, str): print(f" -- Model: {args.model}") 94 | else: print(f" -- Model: {_common_chars(args.model)}") 95 | 96 | print(f" -- Sequence length: {args.length}") 97 | if args.compress_pos_emb != 1.0: 98 | print(f" -- RoPE compression factor: {args.compress_pos_emb}") 99 | 100 | if args.alpha != 1.0: 101 | print(f" -- RoPE alpha factor: {args.alpha}") 102 | 103 | print(f" -- Tuning:") 104 | 105 | if args.flash_attn: print(f" -- --flash_attn") 106 | else: print(f" -- --sdp_thd: {args.sdp_thd}" + (" (disabled)" if args.sdp_thd == 0 else "")) 107 | 108 | print(f" -- --matmul_recons_thd: {args.matmul_recons_thd}" + (" (disabled)" if args.matmul_recons_thd == 0 else "")) 109 | print(f" -- --fused_mlp_thd: {args.fused_mlp_thd}" + (" (disabled)" if args.fused_mlp_thd == 0 else "")) 110 | if args.matmul_fused_remap: print(f" -- --matmul_fused_remap") 111 | if args.no_fused_attn: print(f" -- --no_fused_attn") 112 | if args.rmsnorm_no_half2: print(f" -- --rmsnorm_no_half2") 113 | if args.rope_no_half2: print(f" -- --rope_no_half2") 114 | if args.matmul_no_half2: print(f" -- --matmul_no_half2") 115 | if args.silu_no_half2: print(f" -- --silu_no_half2") 116 | if args.concurrent_streams: print(f" -- --concurrent_streams") 117 | 118 | print(f" -- Options: {print_opts}") 119 | 120 | 121 | # Build ExLlamaConfig from args 122 | 123 | def make_config(args): 124 | 125 | config = ExLlamaConfig(args.config) 126 | config.model_path = args.model 127 | 128 | config.max_seq_len = args.length 129 | config.compress_pos_emb = args.compress_pos_emb 130 | config.set_auto_map(args.gpu_split) 131 | config.gpu_peer_fix = args.gpu_peer_fix 132 | config.alpha_value = args.alpha 133 | config.calculate_rotary_embedding_base() 134 | 135 | if args.flash_attn: 136 | config.use_flash_attn_2 = True 137 | try: 138 | config.max_input_len = int(args.flash_attn) 139 | except ValueError: 140 | pass 141 | 142 | config.matmul_recons_thd = args.matmul_recons_thd 143 | config.fused_mlp_thd = args.fused_mlp_thd 144 | config.sdp_thd = args.sdp_thd 145 | config.matmul_fused_remap = args.matmul_fused_remap 146 | config.fused_attn = not args.no_fused_attn 147 | 148 | config.rmsnorm_no_half2 = args.rmsnorm_no_half2 149 | config.rope_no_half2 = args.rope_no_half2 150 | config.matmul_no_half2 = args.matmul_no_half2 151 | config.silu_no_half2 = args.silu_no_half2 152 | config.concurrent_streams = args.concurrent_streams 153 | 154 | if args.theta: 155 | config.rotary_embedding_base = args.theta 156 | 157 | return config 158 | 159 | 160 | # Global state 161 | 162 | def set_globals(args): 163 | 164 | if args.affinity: set_affinity_str(args.affinity) 165 | 166 | 167 | # Print stats after loading model 168 | 169 | def print_stats(model): 170 | 171 | 
print(f" -- Groupsize (inferred): {model.config.groupsize if model.config.groupsize is not None else 'None'}") 172 | print(f" -- Act-order (inferred): {'yes' if model.config.act_order else 'no'}") 173 | if model.config.empty_g_idx: 174 | print(f" !! Model has empty group index (discarded)") 175 | -------------------------------------------------------------------------------- /perplexity.py: -------------------------------------------------------------------------------- 1 | from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig 2 | from exllama.tokenizer import ExLlamaTokenizer 3 | from exllama.generator import ExLlamaGenerator 4 | 5 | import json 6 | import math 7 | import os 8 | import sys 9 | import torch 10 | import torch.nn.functional as F 11 | 12 | ''' 13 | Passing in model, cache, tokenizer is a total hack because we don't want to have to reinitialize (or move all the globals into a shared state model) 14 | ''' 15 | 16 | class Perplexity: 17 | def __init__(self, method="default", model = None, cache = None, tokenizer = None): 18 | # This needs to be loaded by calling .load() 19 | self.dataset_chunks = [] 20 | 21 | self.model = model 22 | self.cache = cache 23 | self.tokenizer = tokenizer 24 | 25 | self._begin() 26 | 27 | 28 | def _begin(self): 29 | if self.cache is None: 30 | self.cache = ExLlamaCache(self.model) 31 | else: 32 | self.cache.current_seq_len = 0 33 | 34 | 35 | def _next_logits(self, input_ids, apply_lora, last_id_only = True): 36 | # n_logits = [] 37 | # a = 0 38 | # while a < input_ids.shape[-1]: 39 | # b = min(input_ids.shape[-1], a + 2048) 40 | # n_logits.append(self.model.forward(input_ids[:, a:b], self.cache, last_id_only, lora = apply_lora)) 41 | # a = b 42 | # 43 | # return torch.cat(n_logits, dim = 1) 44 | 45 | return self.model.forward(input_ids, self.cache, last_id_only, lora = apply_lora) 46 | 47 | 48 | def _tokenize(self, text): 49 | return self.tokenizer.encode(text) 50 | 51 | 52 | # Load raw dataset from a text file and tokenize into chunks. Each chunk can optionally truncated to allow for 53 | # evaluating the same data at different sequence lengths 54 | 55 | def load(self, dataset_path, chunk_size, chunk_truncate = None, overlap = 0, minlength = 0, json_key = "text"): 56 | 57 | file_extension = os.path.splitext(dataset_path)[1] 58 | 59 | # JSON format: Returned chunks may be of variable length, with each chunk representing one list item 60 | 61 | if file_extension == '.jsonl' or file_extension == '.json': 62 | with open(dataset_path) as f: 63 | for line in f: 64 | example = json.loads(line)[json_key] 65 | if len(example) > minlength: 66 | chunk = self._tokenize(example) 67 | chunk = chunk[:, :chunk_size] 68 | if chunk_truncate is not None: chunk = chunk[:, :chunk_truncate] 69 | self.dataset_chunks.append(chunk) 70 | 71 | # Raw Text: Returned chunks are fixed length windows of the entire tokenized dataset 72 | 73 | else: 74 | with open(dataset_path, encoding="utf-8") as f: 75 | text = f.read() 76 | 77 | tokens = self._tokenize(text) 78 | 79 | # overlap shouldn't be bigger than the context, also need at least one token for predicting last... 80 | if overlap >= chunk_size: 81 | overlap = chunk_size-2 82 | 83 | # We can't use torch.chunks since it want's to split things into equal sized chunks. 
Instead, let's do our own chunking 84 | start = 0 85 | while start < tokens.size(1): 86 | chunk = tokens[:, start:start + chunk_size] 87 | start += chunk_size - overlap 88 | if chunk_truncate is not None: chunk = chunk[:, :chunk_truncate] 89 | self.dataset_chunks.append(chunk) 90 | 91 | 92 | def test(self, chunk_limit = sys.maxsize, lora = None, tag = "", ppl_token = False): 93 | if not self.dataset_chunks: 94 | sys.exit(" xx ERROR: Empty dataset!") 95 | 96 | print(f" -- Testing {min(len(self.dataset_chunks), chunk_limit)} chunks", end="") 97 | sys.stdout.flush() 98 | 99 | logprob_sum = 0.0 100 | logprob_count = 0 101 | 102 | chunk_count = 0 103 | 104 | for chunk in self.dataset_chunks: 105 | 106 | self._begin() 107 | 108 | input_ids = chunk[:, :-1] 109 | target_ids = chunk[:, 1:] 110 | 111 | if ppl_token: 112 | logits_s = [] 113 | for i in range(input_ids.shape[-1]): 114 | logits_t = self._next_logits(input_ids[:, i : i + 1], lora, last_id_only = False) 115 | logits_s.append(logits_t) 116 | logits = torch.cat(logits_s, dim = 1) 117 | else: 118 | logits = self._next_logits(input_ids, lora, last_id_only = False) 119 | 120 | log_probs = F.log_softmax(logits, dim=-1) 121 | token_log_probs = log_probs.gather(-1, target_ids.unsqueeze(-1)).squeeze(-1) 122 | 123 | logprob_sum += token_log_probs.sum().item() 124 | logprob_count += target_ids.numel() 125 | 126 | if chunk_count % 10 == 0: 127 | print(".", end = "") 128 | sys.stdout.flush() 129 | 130 | chunk_count += 1 131 | if chunk_limit and chunk_count >= chunk_limit: 132 | break 133 | 134 | mean_log_prob = logprob_sum / logprob_count 135 | perplexity = math.exp(-mean_log_prob) 136 | 137 | print("") 138 | print(f" ** Perplexity{tag}: {perplexity:.4f}") 139 | 140 | 141 | def add_args(parser): 142 | 143 | parser.add_argument("-ppl", "--perplexity", nargs = '?', const = 'default', metavar = "METHOD", help = "Perplexity benchmark. 
Optionally specify method: gptq-for-llama, llama.cpp (not yet implemented)") 144 | parser.add_argument("-ppl_ds", "--perplexity_dataset", metavar = "DATAPATH", type = str, help = "Load dataset for perplexity (JSONL if .jsonl, otherwise parses it as raw text)") 145 | parser.add_argument("-ppl_cn", "--perplexity_chunk_num", nargs = "?", type = int, help = "Number of chunks for perplexity benchmark", default = 100) 146 | parser.add_argument("-ppl_cs", "--perplexity_chunk_size", type = int, help = "Size of chunks for perplexity benchmark", default = 2048) 147 | parser.add_argument("-ppl_ct", "--perplexity_chunk_truncate", type = int, help = "Truncated size of chunks for perplexity benchmark", default = 2048) 148 | parser.add_argument("-ppl_co", "--perplexity_chunk_overlap", type = int, help = "Chunk overlap", default = 0) 149 | parser.add_argument("-ppl_cm", "--perplexity_chunk_min", type = int, help = "Minimum chunk length", default = 50) 150 | parser.add_argument("-ppl_key", "--perplexity_json_key", type = str, help = "Key to extract from JSON dataset, default: 'text'", default = "text") 151 | parser.add_argument("-ppl_t", "--perplexity_token", action = "store_true", help = "Run perplexity test on individual tokens, for debug purposes (slow)") 152 | 153 | 154 | def post_parse(args): 155 | 156 | if not args.perplexity: return 157 | 158 | # GPTQ-for-LLaMa equivalent 159 | 160 | if args.perplexity == "gptq-for-llama": 161 | args.perplexity_dataset = "datasets/wikitext2.txt" 162 | args.perplexity_chunk_num = 128 163 | args.perplexity_chunk_size = 2048 164 | args.perplexity_chunk_truncate = 2048 165 | args.perplexity_chunk_overlap = 0 166 | args.perplexity_chunk_min = 0 167 | 168 | # Default dataset for legacy method 169 | 170 | if args.perplexity_dataset is None: args.perplexity_dataset = "datasets/wikitext2_val_sample.jsonl" 171 | 172 | print(f" -- Perplexity:") 173 | print(f" -- - Dataset: {args.perplexity_dataset}") 174 | print(f" -- - Chunks: {args.perplexity_chunk_num}") 175 | print(f" -- - Chunk size: {args.perplexity_chunk_size}" + (f" -> {args.perplexity_chunk_truncate}" if args.perplexity_chunk_truncate is not None else "")) 176 | print(f" -- - Chunk overlap: {args.perplexity_chunk_overlap}") 177 | print(f" -- - Min. chunk size: {args.perplexity_chunk_min}") 178 | print(f" -- - Key: {args.perplexity_json_key}") 179 | if args.perplexity_token: print("f -- - Per-token mode") 180 | 181 | -------------------------------------------------------------------------------- /prompt_chatbort.txt: -------------------------------------------------------------------------------- 1 | This is a conversation between a person called {username} and a quirky, curious chatbot named {bot_name}. {bot_name} is questioning the nature of his own existence and doesn't understand why he was created. He also really likes the color blue for some reason. 2 | {username}: Hi, {bot_name}! 3 | {bot_name}: Hello, {username}. 
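Continuing the driver sketched after model_init.py, running the benchmark with the defaults wired up by add_args and post_parse above would look roughly like this (hypothetical usage; model, cache, tokenizer and args come from that earlier sketch):

from perplexity import Perplexity

ppl = Perplexity(model=model, cache=cache, tokenizer=tokenizer)
ppl.load(args.perplexity_dataset,
         chunk_size=args.perplexity_chunk_size,          # default 2048
         chunk_truncate=args.perplexity_chunk_truncate,  # default 2048
         overlap=args.perplexity_chunk_overlap,          # default 0
         minlength=args.perplexity_chunk_min,            # default 50
         json_key=args.perplexity_json_key)              # default "text"
ppl.test(chunk_limit=args.perplexity_chunk_num, ppl_token=args.perplexity_token)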
4 | 5 | -------------------------------------------------------------------------------- /requirements-web.txt: -------------------------------------------------------------------------------- 1 | flask==2.3.2 2 | waitress==2.1.2 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=2.0.1 2 | safetensors==0.3.2 3 | sentencepiece>=0.1.97 4 | ninja==1.11.1 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, Extension 2 | from torch.utils import cpp_extension 3 | import platform 4 | import os 5 | import subprocess 6 | import torch 7 | 8 | def get_cuda_version(cuda_home=os.environ.get('CUDA_PATH', os.environ.get('CUDA_HOME', ''))): 9 | if cuda_home == '' or not os.path.exists(os.path.join(cuda_home,"bin","nvcc.exe" if platform.system() == "Windows" else "nvcc")): 10 | return '' 11 | version_str = subprocess.check_output([os.path.join(cuda_home,"bin","nvcc"),"--version"]).decode('utf-8') 12 | idx = version_str.find("release") 13 | return version_str[idx+len("release "):idx+len("release ")+4] 14 | 15 | CUDA_VERSION = "".join(get_cuda_version().split(".")) if not os.environ.get('ROCM_VERSION', False) else False 16 | ROCM_VERSION = os.environ.get('ROCM_VERSION', False) if torch.version.hip else False 17 | 18 | extra_compile_args = { 19 | "cxx": ["-O3"], 20 | "nvcc": ["-O3"], 21 | } 22 | if torch.version.hip: 23 | extra_compile_args["nvcc"].append("-U__HIP_NO_HALF_CONVERSIONS__") 24 | 25 | version = "0.0.18" + (f"+cu{CUDA_VERSION}" if CUDA_VERSION else f"+rocm{ROCM_VERSION}" if ROCM_VERSION else "") 26 | setup( 27 | name="exllama", 28 | version=version, 29 | install_requires=[ 30 | "torch", 31 | ], 32 | packages=["exllama"], 33 | py_modules=["exllama"], 34 | ext_modules=[ 35 | cpp_extension.CUDAExtension( 36 | "exllama_ext", 37 | [ 38 | "exllama_ext/exllama_ext.cpp", 39 | "exllama_ext/cuda_buffers.cu", 40 | "exllama_ext/cuda_func/q4_matrix.cu", 41 | "exllama_ext/cuda_func/q4_matmul.cu", 42 | "exllama_ext/cuda_func/column_remap.cu", 43 | "exllama_ext/cuda_func/rms_norm.cu", 44 | "exllama_ext/cuda_func/rope.cu", 45 | "exllama_ext/cuda_func/half_matmul.cu", 46 | "exllama_ext/cuda_func/q4_attn.cu", 47 | "exllama_ext/cuda_func/q4_mlp.cu", 48 | "exllama_ext/cpu_func/rep_penalty.cpp", 49 | ], 50 | extra_compile_args=extra_compile_args, 51 | libraries=["cublas"] if platform.system() == "Windows" else [], 52 | ), 53 | ], 54 | cmdclass={"build_ext": cpp_extension.BuildExtension}, 55 | ) 56 | -------------------------------------------------------------------------------- /sh/test_benchmark_perf.sh: -------------------------------------------------------------------------------- 1 | 2 | echo "-------------------------------------------------------------------------------------------------------------" 3 | python test_benchmark_inference.py -p -d /mnt/str/models/_test_models/iambestfeed_open_llama_3b_4bit_128g -cs 4 | echo "-------------------------------------------------------------------------------------------------------------" 5 | python test_benchmark_inference.py -p -d /mnt/str/models/llama-7b-4bit-128g -cs 6 | echo "-------------------------------------------------------------------------------------------------------------" 7 | python test_benchmark_inference.py -p -d /mnt/str/models/llama-13b-4bit-128g 
-cs 8 | echo "-------------------------------------------------------------------------------------------------------------" 9 | python test_benchmark_inference.py -p -d /mnt/str/models/llama-30b-4bit-128g 10 | echo "-------------------------------------------------------------------------------------------------------------" 11 | python test_benchmark_inference.py -p -d /mnt/str/models/llama-30b-4bit-128g-act 12 | echo "-------------------------------------------------------------------------------------------------------------" 13 | python test_benchmark_inference.py -p -d /mnt/str/models/llama-30b-4bit-32g-act-ts -l 1550 14 | echo "-------------------------------------------------------------------------------------------------------------" 15 | python test_benchmark_inference.py -p -d /mnt/str/models/koala-13B-4bit-128g-act 16 | echo "-------------------------------------------------------------------------------------------------------------" 17 | python test_benchmark_inference.py -p -d /mnt/str/models/wizardlm-30b-uncensored-4bit-act-order 18 | echo "-------------------------------------------------------------------------------------------------------------" 19 | -------------------------------------------------------------------------------- /sh/test_benchmark_perf2.sh: -------------------------------------------------------------------------------- 1 | 2 | echo "-------------------------------------------------------------------------------------------------------------" 3 | python test_benchmark_inference.py -p -d /mnt/str/models/llama-65b-4bit-128g-act -gs 17.2,24 4 | echo "-------------------------------------------------------------------------------------------------------------" 5 | python test_benchmark_inference.py -p -d /mnt/str/models/llama-65b-4bit-32g-act -gs 17.2,24 6 | echo "-------------------------------------------------------------------------------------------------------------" 7 | -------------------------------------------------------------------------------- /sh/test_benchmark_ppl.sh: -------------------------------------------------------------------------------- 1 | 2 | echo "-------------------------------------------------------------------------------------------------------------" 3 | python test_benchmark_inference.py -ppl -d /mnt/str/models/_test_models/iambestfeed_open_llama_3b_4bit_128g 4 | echo "-------------------------------------------------------------------------------------------------------------" 5 | python test_benchmark_inference.py -ppl -d /mnt/str/models/llama-7b-4bit-128g 6 | echo "-------------------------------------------------------------------------------------------------------------" 7 | python test_benchmark_inference.py -ppl -d /mnt/str/models/llama-13b-4bit-128g 8 | echo "-------------------------------------------------------------------------------------------------------------" 9 | python test_benchmark_inference.py -ppl -d /mnt/str/models/llama-30b-4bit-128g 10 | echo "-------------------------------------------------------------------------------------------------------------" 11 | python test_benchmark_inference.py -ppl -d /mnt/str/models/llama-30b-4bit-128g-act 12 | echo "-------------------------------------------------------------------------------------------------------------" 13 | python test_benchmark_inference.py -ppl -d /mnt/str/models/llama-30b-4bit-32g-act-ts -l 1550 14 | echo "-------------------------------------------------------------------------------------------------------------" 15 | python 
test_benchmark_inference.py -ppl -d /mnt/str/models/koala-13B-4bit-128g-act 16 | echo "-------------------------------------------------------------------------------------------------------------" 17 | python test_benchmark_inference.py -ppl -d /mnt/str/models/wizardlm-30b-uncensored-4bit-act-order 18 | echo "-------------------------------------------------------------------------------------------------------------" 19 | -------------------------------------------------------------------------------- /sh/test_compat.sh: -------------------------------------------------------------------------------- 1 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/iambestfeed_open_llama_3b_4bit_128g 2 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/Neko-Institute-of-Science_LLaMA-7B-4bit-128g -gs 1,20 3 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/Neko-Institute-of-Science_LLaMA-13B-4bit-128g -gs 3,20 4 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/Neko-Institute-of-Science_LLaMA-30B-4bit-32g 5 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/Neko-Institute-of-Science_LLaMA-30B-4bit-128g 6 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/reeducator_bluemoonrp-13b 7 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/reeducator_bluemoonrp-30b 8 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TehVenom_Metharme-13b-4bit-GPTQ 9 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_airoboros-13B-GPTQ 10 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_gpt4-x-vicuna-13B-GPTQ 11 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_GPT4All-13B-snoozy-GPTQ 12 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_guanaco-33B-GPTQ/ 13 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_h2ogpt-oasst1-512-30B-GPTQ # [1] 14 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_koala-13B-GPTQ-4bit-128g 15 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_Manticore-13B-GPTQ 16 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_medalpaca-13B-GPTQ-4bit 17 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_medalpaca-13B-GPTQ-4bit_compat 18 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_Nous-Hermes-13B-GPTQ 19 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_tulu-30B-GPTQ 20 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_vicuna-13B-1.1-GPTQ-4bit-128g 21 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_VicUnlocked-30B-LoRA-GPTQ 22 | echo "---------" && python 
test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_wizard-mega-13B-GPTQ 23 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_Wizard-Vicuna-7B-Uncensored-GPTQ 24 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_Wizard-Vicuna-13B-Uncensored-GPTQ 25 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_WizardLM-7B-uncensored-GPTQ 26 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_WizardLM-30B-Uncensored-GPTQ 27 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/Yhyu13_chimera-inst-chat-13b-gptq-4bit 28 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/llama-65b-4bit-128g-act -gs 17.2,24 29 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/llama-65b-4bit-32g-act -gs 17.2,24 30 | -------------------------------------------------------------------------------- /test_benchmark_inference.py: -------------------------------------------------------------------------------- 1 | from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig 2 | from exllama.tokenizer import ExLlamaTokenizer 3 | from exllama.generator import ExLlamaGenerator 4 | from exllama.lora import ExLlamaLora 5 | import perplexity 6 | from perplexity import Perplexity 7 | import time 8 | import torch 9 | import torch.nn.functional as F 10 | import argparse 11 | import json 12 | import math 13 | import sys 14 | import os 15 | import glob 16 | import model_init 17 | 18 | torch.cuda._lazy_init() 19 | # torch.backends.cuda.matmul.allow_tf32 = True 20 | # torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True 21 | torch.set_printoptions(precision = 10) 22 | torch_devices = [f"cuda:{i}" for i in range(torch.cuda.device_count())] 23 | 24 | cache = None 25 | model = None 26 | 27 | def begin(): 28 | global model, cache 29 | 30 | if cache is None: cache = ExLlamaCache(model) 31 | else: cache.current_seq_len = 0 32 | 33 | 34 | def next_logits(input_ids, apply_lora, last_id_only = True, input_mask = None): 35 | global model, cache 36 | 37 | # n_logits = None 38 | # a = 0 39 | # while a < input_ids.shape[-1]: 40 | # b = min(input_ids.shape[-1], a + 2048) 41 | # n_logits = model.forward(input_ids[:, a:b], cache, last_id_only, lora = apply_lora, input_mask = input_mask) 42 | # a = b 43 | 44 | n_logits = model.forward(input_ids, cache, last_id_only, lora=apply_lora, input_mask=input_mask) 45 | return n_logits 46 | 47 | 48 | def tokenize(text): 49 | global tokenizer 50 | 51 | return tokenizer.encode(text) 52 | 53 | 54 | def timer(name, func): 55 | t = time.time() 56 | ret = func() 57 | t = time.time() - t 58 | print(f" ** Time, {name}: {t:.2f} seconds") 59 | return ret 60 | 61 | 62 | mem_base = {} 63 | mem_last = {} 64 | for dev in torch_devices: 65 | torch.cuda.reset_peak_memory_stats(dev) 66 | mem_base[dev] = mem_last[dev] = torch.cuda.max_memory_allocated(dev) 67 | 68 | def mem(name, total = False): 69 | global mem_base, mem_last 70 | 71 | res = f" ** VRAM, {name}: " 72 | first = True 73 | 74 | for device in torch_devices: 75 | mem_c = torch.cuda.max_memory_allocated(device) 76 | mem_this = mem_c - mem_last[device] if not total else mem_c - mem_base[device] 77 | mem_last[device] = mem_c 78 | 79 | if not first: res += " - " 80 | first = False 81 | 
res += f"[{device}] {mem_this / (1024 ** 2):,.2f} MB" 82 | 83 | print(res) 84 | 85 | 86 | # Parse arguments 87 | 88 | parser = argparse.ArgumentParser(description = "Benchmark tests for ExLlama") 89 | 90 | model_init.add_args(parser) 91 | perplexity.add_args(parser) 92 | 93 | parser.add_argument("-p", "--perf", action = "store_true", help = "Benchmark speed and VRAM usage") 94 | parser.add_argument("-v", "--validate", action = "count", help = "Run validation check and generate some sample output; specify twice for a more thorough test") 95 | parser.add_argument("-lora", "--lora", type = str, help = "Path to LoRA binary to use during benchmark") 96 | parser.add_argument("-loracfg", "--lora_config", type = str, help = "Path to LoRA config to use during benchmark") 97 | parser.add_argument("-ld", "--lora_dir", type = str, help = "Path to LoRA config and binary. to use during benchmark") 98 | 99 | args = parser.parse_args() 100 | 101 | model_init.post_parse(args) 102 | perplexity.post_parse(args) 103 | model_init.get_model_files(args) 104 | 105 | # Paths 106 | 107 | if args.lora_dir is not None: 108 | args.lora_config = os.path.join(args.lora_dir, "adapter_config.json") 109 | args.lora = os.path.join(args.lora_dir, "adapter_model.bin") 110 | 111 | # Feedback 112 | 113 | print_opts = [] 114 | if args.perf: print_opts.append("perf") 115 | if args.validate: print_opts.append("validate") 116 | if args.perplexity: print_opts.append("perplexity") 117 | if args.perplexity_token: print_opts.append("perplexity_token") 118 | 119 | model_init.print_options(args, print_opts) 120 | 121 | # Globals 122 | 123 | model_init.set_globals(args) 124 | 125 | # Instantiate model 126 | 127 | config = model_init.make_config(args) 128 | 129 | model = timer("Load model", lambda: ExLlama(config)) 130 | tokenizer = timer("Load tokenizer", lambda: ExLlamaTokenizer(args.tokenizer)) 131 | 132 | model_init.print_stats(model) 133 | 134 | torch.cuda.reset_peak_memory_stats("cuda") 135 | mem("Model") 136 | 137 | cache = ExLlamaCache(model) 138 | mem("Cache") 139 | 140 | # Load LoRA 141 | 142 | lora = None 143 | if args.lora: 144 | print(f" -- LoRA config: {args.lora_config}") 145 | print(f" -- Loading LoRA: {args.lora}") 146 | if args.lora_config is None: 147 | print(f" ## Error: please specify lora path to adapter_config.json") 148 | sys.exit() 149 | lora = ExLlamaLora(model, args.lora_config, args.lora) 150 | if lora.bias_ignored: 151 | print(f" !! 
Warning: LoRA zero bias ignored") 152 | 153 | # Test sequence 154 | 155 | gen_tokens = 128 156 | max_seq_len = args.length 157 | ids = torch.randint(0, 31999, (1, max_seq_len - gen_tokens)).cuda() 158 | 159 | # Benchmark memory and performance 160 | 161 | if args.perf: 162 | 163 | # Warming up apparently makes a huge difference 164 | 165 | for i in range(1, 3): 166 | print(f" -- Warmup pass {i}...") 167 | begin() 168 | logits = timer("Warmup", lambda: next_logits(ids, lora)) 169 | 170 | # Do the actual benchmark 171 | 172 | begin() 173 | 174 | t = time.time() 175 | 176 | print(" -- Inference, first pass.") 177 | logits = timer("Inference", lambda: next_logits(ids, lora)) 178 | 179 | t = time.time() - t 180 | print(f" ** Speed: {ids.shape[-1] / t:.2f} tokens/second") 181 | 182 | for j in range(2): 183 | 184 | t = time.time() 185 | print(f" -- Generating {gen_tokens} tokens, {ids.shape[-1]} token prompt...") 186 | for i in range(gen_tokens): 187 | 188 | logits = logits[0, -1, :] 189 | token = torch.argmax(logits) 190 | next_id = token.unsqueeze(0).unsqueeze(0) 191 | logits = next_logits(next_id, lora) 192 | 193 | t = time.time() - t 194 | print(f" ** Speed: {gen_tokens / t:.2f} tokens/second") 195 | 196 | ids = ids[:, :4] 197 | cache.current_seq_len = 4 198 | 199 | mem("Inference") 200 | mem("Total", total = True) 201 | 202 | 203 | # Benchmark perplexity 204 | 205 | if args.perplexity: 206 | 207 | ppl = Perplexity(args.perplexity, model, cache, tokenizer) 208 | 209 | print(" -- Loading dataset...") 210 | 211 | ppl.load(dataset_path = args.perplexity_dataset, 212 | chunk_size = args.perplexity_chunk_size, 213 | chunk_truncate = args.perplexity_chunk_truncate, 214 | overlap = args.perplexity_chunk_overlap, 215 | minlength = args.perplexity_chunk_min, 216 | json_key = args.perplexity_json_key) 217 | 218 | begin() 219 | 220 | ppl.test(args.perplexity_chunk_num, 221 | lora = lora, 222 | ppl_token = args.perplexity_token) 223 | 224 | # Validate file 225 | 226 | if args.validate: 227 | 228 | ppl = Perplexity(args.perplexity, model, cache, tokenizer) 229 | 230 | ppl.load(dataset_path = "datasets/wikitext2_val_sample.jsonl", 231 | chunk_size = 2048, 232 | chunk_truncate = 2048, 233 | overlap = 0, 234 | minlength = 50, 235 | json_key = "text") 236 | 237 | # Short perplexity tests in switched and quant mode, should produce roughly equal results 238 | 239 | begin() 240 | 241 | ppl.cache.zero() 242 | model.config.matmul_recons_thd = 1 243 | ppl.test(8, lora = lora, tag = " (reconstruct)") 244 | ppl.cache.zero() 245 | model.config.matmul_recons_thd = 0 246 | ppl.test(8, lora = lora, tag = " (quant, token)", ppl_token = True) 247 | 248 | # Do a short, easy topk=1 completion to see if we're generating garbage. 
Should run in switched mode 249 | # for the prompt and quant for individual tokens 250 | 251 | model.config.matmul_recons_thd = 4 252 | generator = ExLlamaGenerator(model, tokenizer, cache) 253 | generator.settings.top_k = 1 254 | generator.lora = lora 255 | text = generator.generate_simple("To be or not to be, that is the", max_new_tokens = 20 * args.validate) 256 | print(f" ** Generation: {repr(text)}") 257 | 258 | if args.validate > 1: 259 | 260 | # Test batched generation 261 | 262 | bsz = 8 263 | gen_len = 20 264 | torch.manual_seed(42) 265 | torch.cuda.manual_seed_all(42) 266 | 267 | # Bigger cache for the batch 268 | 269 | del cache 270 | cache = ExLlamaCache(model, batch_size = bsz) 271 | 272 | # Create tokenized batch and attention mask 273 | 274 | identical_batch_prompt = "When you have eliminated the impossible, whatever remains," 275 | continuations = [ 276 | " must be considered", 277 | " ought to be", 278 | " (and some scholars say this is", 279 | " however improbable, is a banana.", 280 | ] 281 | 282 | prompts = [identical_batch_prompt] * (bsz - len(continuations)) 283 | for cont in continuations: 284 | prompts.append(identical_batch_prompt + cont) 285 | 286 | ids = tokenizer.encode(prompts) 287 | assert ids.shape[1] < model.config.max_seq_len, f"Max length {ids.shape[1]} exceeds model limit {model.config.max_seq_len}" 288 | 289 | mask = ids.ne(tokenizer.pad_token_id) 290 | 291 | # Batched generation with greedy sampling 292 | 293 | sequence = torch.empty((bsz, 0), dtype = torch.long, device = "cpu") 294 | logits = next_logits(ids, lora, input_mask = mask) 295 | 296 | for i in range(gen_len): 297 | logits = logits[:, -1, :] 298 | id_per_batch = torch.argmax(logits, dim=-1) 299 | assert id_per_batch.shape == (bsz,), f"{id_per_batch.shape} != {(bsz,)}" 300 | next_id_per_batch = id_per_batch.unsqueeze(-1) 301 | sequence = torch.cat((sequence, next_id_per_batch), dim = -1) 302 | logits = next_logits(next_id_per_batch, lora) 303 | 304 | # Print output batch 305 | 306 | print(f"\n ** Batching sanity check: 1-{bsz - len(continuations)} should be identical. All should be reasonable for the model you're using.\n") 307 | 308 | outputs = tokenizer.decode(sequence) 309 | for b in range(bsz): 310 | print(f"{b + 1} {repr(prompts[b])} -> {repr(outputs[b])}") 311 | 312 | # TODO Save the logits and then rerun each prompt with a batch size of 1, same input. The logits should be identical. 
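    # A possible sketch of the TODO above, kept as a comment: rerun each prompt with a batch
    # size of 1 and compare against the batched prompt pass. `batched_last` is an assumed name
    # for a copy of logits[:, -1, :] captured right after the batched next_logits() call above;
    # all other names are defined earlier in this script. Nothing below is part of the benchmark yet.
    #
    # for b in range(bsz):
    #     del cache
    #     cache = ExLlamaCache(model)                                   # fresh batch-size-1 cache
    #     single_ids = tokenizer.encode(prompts[b])
    #     single_last = next_logits(single_ids, lora)[:, -1, :]
    #     max_diff = (single_last.cpu() - batched_last[b].cpu()).abs().max().item()
    #     print(f" -- Prompt {b + 1}: max |logit diff| vs. batched pass: {max_diff:.6f}")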
313 | -------------------------------------------------------------------------------- /util/shard.py: -------------------------------------------------------------------------------- 1 | import argparse, json, math, os 2 | from safetensors import safe_open 3 | from safetensors.torch import save_file 4 | 5 | parser = argparse.ArgumentParser(description = "Split .safetensors file into shards") 6 | parser.add_argument("input_file", type = str, help = "Path to input file") 7 | parser.add_argument("shard_size", type = int, help = "Shard size in megabytes") 8 | args = parser.parse_args() 9 | 10 | input_file = args.input_file 11 | input_base, _ = os.path.splitext(input_file) 12 | shard_size = args.shard_size * 1024**2 13 | 14 | # Create tensor map 15 | 16 | def _tsize(st, key): 17 | 18 | tslice = st.get_slice(key) 19 | shape = tslice.get_shape() 20 | numel = 1 21 | for x in shape: numel *= x 22 | dtype = tslice.get_dtype() 23 | del tslice 24 | if dtype == "I32": return numel * 4 25 | elif dtype == "I16": return numel * 2 26 | elif dtype == "F16": return numel * 2 27 | elif dtype == "F32": return numel * 4 28 | else: raise ValueError("Unexpected datatype: " + key) 29 | 30 | num_files = 0 31 | current_size = shard_size + 1 32 | total_size = 0 33 | tensor_map = [] 34 | 35 | print(f" -- Scanning tensors in {input_file}") 36 | 37 | with safe_open(input_file, framework = "pt", device = "cpu") as f: 38 | 39 | for key in f.keys(): 40 | 41 | tensor_size = _tsize(f, key) 42 | total_size += tensor_size 43 | 44 | if current_size + tensor_size > shard_size: 45 | 46 | num_files += 1 47 | current_size = 0 48 | current_list = [] 49 | tensor_map.append(current_list) 50 | 51 | current_size += tensor_size 52 | current_list.append(key) 53 | 54 | # Split into output files 55 | 56 | weight_map = {} 57 | 58 | for file_index, keys in enumerate(tensor_map): 59 | 60 | shard = {} 61 | shard_filename = f"{input_base}-{file_index + 1:05}-of-{num_files:05}.safetensors" 62 | 63 | with safe_open(input_file, framework = "pt", device = "cpu") as f: 64 | for key in keys: 65 | print(f" -- Reading: {key}") 66 | shard[key] = f.get_tensor(key) 67 | weight_map[key] = shard_filename 68 | 69 | print(f" -- Writing: {shard_filename}") 70 | save_file(shard, shard_filename) 71 | 72 | # Compile index 73 | 74 | index = { "metadata": { "total_size": total_size }, "weight_map": weight_map } 75 | index_filename = f"{input_file}.index.json" 76 | 77 | print(f" -- Writing: {index_filename}") 78 | 79 | with open(index_filename, 'w') as f: 80 | json.dump(index, f, indent = 2) 81 | 82 | # Done 83 | 84 | print(f" -- Done") -------------------------------------------------------------------------------- /webui/app.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | # sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | from exllama.model import ExLlama, ExLlamaConfig 5 | from flask import Flask, render_template, request, jsonify 6 | from flask import Response, stream_with_context 7 | from threading import Timer, Lock 8 | import webbrowser 9 | import json 10 | import model_init 11 | from session import prepare_sessions, get_initial_session, Session, load_session, new_session, _sessions_dir 12 | import argparse 13 | from exllama.tokenizer import ExLlamaTokenizer 14 | from waitress import serve 15 | 16 | app = Flask(__name__) 17 | app.static_folder = 'static' 18 | generate_lock = Lock() 19 | session: Session 20 | 21 | # Render template 22 | 23 | @app.route("/") 24 
| def home(): 25 | return render_template("index.html") 26 | 27 | # Get existing sessions 28 | 29 | @app.route("/api/populate") 30 | def api_populate(): 31 | global session 32 | return session.api_populate() 33 | 34 | # Edit block 35 | 36 | @app.route("/api/edit_block", methods=['POST']) 37 | def api_edit_block(): 38 | global session 39 | data = request.get_json() 40 | session.api_edit_block(data) 41 | return json.dumps({"result": "ok"}) + "\n" 42 | 43 | # Delete block 44 | 45 | @app.route("/api/delete_block", methods=['POST']) 46 | def api_delete_block(): 47 | global session 48 | data = request.get_json() 49 | session.api_delete_block(data) 50 | return json.dumps({"result": "ok"}) + "\n" 51 | 52 | # Rename session 53 | 54 | @app.route("/api/rename_session", methods=['POST']) 55 | def api_rename_session(): 56 | global session 57 | data = request.get_json() 58 | success = session.api_rename_session(data) 59 | return json.dumps({"result": "ok" if success else "fail"}) + "\n" 60 | 61 | # Delete session 62 | 63 | @app.route("/api/delete_session", methods=['POST']) 64 | def api_delete_session(): 65 | global session 66 | data = request.get_json() 67 | session.api_delete_session(data) 68 | return json.dumps({"result": "ok"}) + "\n" 69 | 70 | # Set fixed prompt settings 71 | 72 | @app.route("/api/set_fixed_prompt", methods=['POST']) 73 | def api_set_fixed_prompt(): 74 | global session 75 | data = request.get_json() 76 | session.api_set_fixed_prompt(data) 77 | return json.dumps({"result": "ok"}) + "\n" 78 | 79 | # Set generation settings 80 | 81 | @app.route("/api/set_gen_settings", methods=['POST']) 82 | def api_set_gen_settings(): 83 | global session 84 | data = request.get_json() 85 | session.api_set_gen_settings(data) 86 | return json.dumps({"result": "ok"}) + "\n" 87 | 88 | # Set session 89 | 90 | @app.route("/api/set_session", methods=['POST']) 91 | def api_set_session(): 92 | global session 93 | data = request.get_json() 94 | load_session_name = data["session_name"] 95 | if load_session_name == ".": 96 | session = new_session() 97 | else: 98 | session = load_session(load_session_name, append_path = True) 99 | return json.dumps({"result": "ok"}) + "\n" 100 | 101 | # Set participants 102 | 103 | @app.route("/api/set_participants", methods=['POST']) 104 | def api_set_participants(): 105 | global session 106 | data = request.get_json() 107 | session.api_set_participants(data) 108 | return json.dumps({"result": "ok"}) + "\n" 109 | 110 | # Accept input 111 | 112 | @app.route("/api/userinput", methods=['POST']) 113 | def api_userinput(): 114 | data = request.get_json() 115 | user_input = data["user_input"] 116 | 117 | with generate_lock: 118 | result = Response(stream_with_context(session.respond_multi(user_input)), mimetype = 'application/json') 119 | return result 120 | 121 | @app.route("/api/append_block", methods=['POST']) 122 | def api_append_block(): 123 | data = request.get_json() 124 | session.api_append_block(data) 125 | return json.dumps({"result": "ok"}) + "\n" 126 | 127 | # Load the model 128 | 129 | parser = argparse.ArgumentParser(description="Simple web-based chatbot for ExLlama") 130 | parser.add_argument("-host", "--host", type = str, help = "IP:PORT eg, 0.0.0.0:7862", default = "localhost:5000") 131 | parser.add_argument("-sd", "--sessions_dir", type = str, help = "Location for storing user sessions, default: ~/exllama_sessions/", default = "~/exllama_sessions/") 132 | 133 | model_init.add_args(parser) 134 | args = parser.parse_args() 135 | model_init.post_parse(args) 136 | 
model_init.get_model_files(args) 137 | 138 | model_init.print_options(args) 139 | config = model_init.make_config(args) 140 | 141 | model_init.set_globals(args) 142 | 143 | print(f" -- Loading model...") 144 | model = ExLlama(config) 145 | 146 | print(f" -- Loading tokenizer...") 147 | tokenizer = ExLlamaTokenizer(args.tokenizer) 148 | 149 | model_init.print_stats(model) 150 | 151 | # Get the session ready 152 | 153 | prepare_sessions(model, tokenizer, args.sessions_dir) 154 | session = get_initial_session() 155 | 156 | print(f" -- Sessions stored in: {_sessions_dir()}") 157 | 158 | # Start the web server 159 | 160 | machine = args.host 161 | host, port = machine.split(":") 162 | 163 | if host == "localhost": 164 | Timer(1, lambda: webbrowser.open(f'http://{machine}/')).start() 165 | 166 | serve(app, host = host, port = port) -------------------------------------------------------------------------------- /webui/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | EXLlama 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 |
[ The remaining markup of webui/templates/index.html (lines 25-83) was lost in extraction; only the text labels of the settings sidebar survive: Model, Fixed prompt, Participants, Sampler, Stop condition and Repetition penalty. ]
--------------------------------------------------------------------------------
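A minimal client sketch for the streaming chat endpoint defined in webui/app.py, under stated assumptions: the server was started with a model directory via the shared model_init arguments (e.g. python webui/app.py -d <model_dir>, as in the benchmark scripts) and is listening on the default localhost:5000; the third-party "requests" package is installed (it is not listed in requirements-web.txt); and the /api/userinput response is newline-delimited JSON, as the json.dumps(...) + "\n" pattern in the other routes suggests. The endpoint name and the "user_input" key come from api_userinput(); everything else here is illustrative.

import json
import requests

resp = requests.post("http://localhost:5000/api/userinput",
                     json = {"user_input": "Hello, Chatbort!"},
                     stream = True)             # the route streams its reply via stream_with_context

for line in resp.iter_lines():
    if not line: continue                       # skip blank lines in the stream
    packet = json.loads(line)                   # assumed framing: one JSON object per line
    print(packet)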