├── .dockerignore ├── .env ├── .github ├── FUNDING.yml └── workflows │ ├── build-wheels-fix.yml │ ├── build-wheels-release-rocm.yml │ ├── build-wheels-release.yml │ ├── build-wheels-rocm.yml │ └── build-wheels.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── datasets ├── download_datasets.py └── wikitext2_val_sample.jsonl ├── doc ├── TODO.md ├── _screenshot.jpg └── model_compatibility.md ├── docker-compose.yml ├── entrypoint.sh ├── example_alt_generator.py ├── example_basic.py ├── example_batch.py ├── example_cfg.py ├── example_chatbot.py ├── example_flask.py ├── example_lora.py ├── example_ws.py ├── exllama ├── __init__.py ├── alt_generator.py ├── cuda_ext.py ├── generator.py ├── lora.py ├── model.py └── tokenizer.py ├── exllama_ext ├── cpu_func │ ├── rep_penalty.cpp │ └── rep_penalty.h ├── cuda_buffers.cu ├── cuda_buffers.cuh ├── cuda_compat.cuh ├── cuda_func │ ├── column_remap.cu │ ├── column_remap.cuh │ ├── half_matmul.cu │ ├── half_matmul.cuh │ ├── q4_attn.cu │ ├── q4_attn.cuh │ ├── q4_matmul.cu │ ├── q4_matmul.cuh │ ├── q4_matrix.cu │ ├── q4_matrix.cuh │ ├── q4_mlp.cu │ ├── q4_mlp.cuh │ ├── rms_norm.cu │ ├── rms_norm.cuh │ ├── rope.cu │ └── rope.cuh ├── exllama_ext.cpp ├── hip_compat.cuh ├── matrix.cuh ├── tuning.h └── util.cuh ├── globals.py ├── model_init.py ├── perplexity.py ├── prompt_chatbort.txt ├── requirements-web.txt ├── requirements.txt ├── setup.py ├── sh ├── test_benchmark_perf.sh ├── test_benchmark_perf2.sh ├── test_benchmark_ppl.sh └── test_compat.sh ├── test_benchmark_inference.py ├── util └── shard.py └── webui ├── app.py ├── session.py ├── static ├── main.js └── style.css └── templates └── index.html /.dockerignore: -------------------------------------------------------------------------------- 1 | exllama_sessions 2 | models 3 | -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | PORT=5000 2 | RUN_UID=1000 # set to 0 to run the service as root inside the container 3 | APPLICATION_STATE_PATH=/data # path to the directory holding application state inside the container 4 | MODEL_PATH=models/LLaMA-7B-4bit-128g # replace with the actual model path on the host 5 | SESSIONS_PATH=~/exllama_sessions # replace with the actual directory on the host where chat sessions should be stored 6 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | ko_fi: turboderp 2 | -------------------------------------------------------------------------------- /.github/workflows/build-wheels-fix.yml: -------------------------------------------------------------------------------- 1 | name: Fix Release 2 | 3 | on: workflow_dispatch 4 | 5 | permissions: 6 | contents: write 7 | 8 | jobs: 9 | build_wheels: 10 | name: ${{ matrix.os }} Python ${{ matrix.pyver }} CUDA ${{ matrix.cuda }} 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: [ubuntu-20.04, windows-latest] 15 | pyver: ["3.8", "3.9", "3.10", "3.11"] 16 | cuda: ["11.7.0", "11.8.0"] 17 | defaults: 18 | run: 19 | shell: pwsh 20 | env: 21 | CUDAVER: ${{ matrix.cuda }} 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | with: 26 | ref: 'wheel-fix' 27 | - uses: actions/setup-python@v3 28 | with: 29 | python-version: ${{ matrix.pyver }} 30 | 31 | - name: Setup Mamba 32 | uses: conda-incubator/setup-miniconda@v2.2.0 33 | with: 34 | activate-environment: "build" 35 
| python-version: ${{ matrix.pyver }} 36 | miniforge-variant: Mambaforge 37 | miniforge-version: latest 38 | use-mamba: true 39 | add-pip-as-python-dependency: true 40 | auto-activate-base: false 41 | 42 | - name: Install Dependencies 43 | run: | 44 | $cudaVersion = $env:CUDAVER 45 | $cudaVersionPytorch = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') 46 | $cudaChannels = '' 47 | $cudaNum = [int]$cudaVersion.substring($cudaVersion.LastIndexOf('.')+1) 48 | while ($cudaNum -ge 0) { $cudaChannels += '-c nvidia/label/cuda-' + $cudaVersion.Remove($cudaVersion.LastIndexOf('.')+1) + $cudaNum + ' '; $cudaNum-- } 49 | mamba install -y 'cuda' $cudaChannels.TrimEnd().Split() 50 | python -m pip install build wheel "torch==2.0.1+cu$cudaVersionPytorch" safetensors sentencepiece ninja --extra-index-url "https://download.pytorch.org/whl/cu$cudaVersionPytorch" 51 | 52 | - name: Build Wheel 53 | id: build-wheel 54 | run: | 55 | Write-Output "PACKAGE_VERSION=0.0.6" >> "$env:GITHUB_OUTPUT" 56 | $env:CUDA_PATH = $env:CONDA_PREFIX 57 | $env:CUDA_HOME = $env:CONDA_PREFIX 58 | if ($IsLinux) {$env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH} 59 | $env:TORCH_CUDA_ARCH_LIST = if ([version]$env:CUDAVER -lt [version]'11.8') {'6.0 6.1 7.0 7.5 8.0 8.6+PTX'} else {'6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX'} 60 | python -m build -n --wheel 61 | 62 | - uses: actions/upload-artifact@v3 63 | with: 64 | name: 'wheels' 65 | path: ./dist/*.whl 66 | 67 | - name: Upload files to a GitHub release 68 | if: steps.build-wheel.outputs.PACKAGE_VERSION != 'None' 69 | uses: svenstaro/upload-release-action@2.6.1 70 | with: 71 | file: ./dist/*.whl 72 | tag: ${{ steps.build-wheel.outputs.PACKAGE_VERSION }} 73 | file_glob: true 74 | overwrite: true 75 | release_name: ${{ steps.build-wheel.outputs.PACKAGE_VERSION }} 76 | body: "Wheels are compiled with CUDA 11.7 and 11.8 for Windows and Linux x64" 77 | -------------------------------------------------------------------------------- /.github/workflows/build-wheels-release-rocm.yml: -------------------------------------------------------------------------------- 1 | name: Build ROCm Wheels & Release 2 | 3 | on: 4 | workflow_dispatch: 5 | workflow_call: 6 | 7 | permissions: 8 | contents: write 9 | 10 | jobs: 11 | build_wheels: 12 | name: Build ROCm ${{ matrix.rocm }} wheel for Python ${{ matrix.pyver }} 13 | runs-on: ubuntu-20.04 14 | strategy: 15 | matrix: 16 | pyver: ["3.8", "3.9", "3.10", "3.11"] 17 | rocm: ['5.4.2', '5.5', '5.6'] 18 | defaults: 19 | run: 20 | shell: pwsh 21 | env: 22 | ROCM_VERSION: ${{ matrix.rocm }} 23 | 24 | steps: 25 | - name: Free Disk Space 26 | uses: jlumbroso/free-disk-space@v1.2.0 27 | with: 28 | tool-cache: false 29 | android: true 30 | dotnet: true 31 | haskell: true 32 | large-packages: false 33 | swap-storage: false 34 | 35 | - uses: actions/checkout@v3 36 | 37 | - name: Install ROCm SDK 38 | shell: bash 39 | run: | 40 | [ ! 
-d /etc/apt/keyrings ] && sudo mkdir --parents --mode=0755 /etc/apt/keyrings 41 | wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null 42 | echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/$ROCM_VERSION focal main" | sudo tee --append /etc/apt/sources.list.d/rocm.list 43 | echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600 44 | sudo apt update 45 | sudo apt install rocm-dev rocsparse-dev rocprim-dev rocthrust-dev rocblas-dev hipblas-dev hipcub-dev hipsparse-dev -y 46 | echo "/opt/rocm/bin" >> $GITHUB_PATH 47 | echo "ROCM_PATH=/opt/rocm" >> $GITHUB_ENV 48 | echo "USE_ROCM=1" >> $GITHUB_ENV 49 | 50 | - uses: actions/setup-python@v3 51 | with: 52 | python-version: ${{ matrix.pyver }} 53 | 54 | - name: Install Dependencies 55 | run: | 56 | $packages = 'build wheel safetensors sentencepiece ninja' 57 | $torver = if ([version]$env:ROCM_VERSION -lt [version]'5.5') {'2.0.1'} else {'2.1.0'} 58 | $packages += " torch==$torver+rocm$env:ROCM_VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm$env:ROCM_VERSION" 59 | 60 | pip3 install $packages.split(' ') 61 | 62 | - name: Build Wheel 63 | id: build-wheel 64 | run: | 65 | if ($(Get-Content 'setup.py' -raw) -match 'version = "(\d+\.(?:\d+\.?)*)" \+ \(') {Write-Output $('::notice file=build-wheels-release-rocm.yml,line=54,title=Package Version::Detected package version is: {0}' -f $Matches[1]); Write-Output "PACKAGE_VERSION=$($Matches[1])" >> "$env:GITHUB_OUTPUT"} else {Write-Output '::error file=build-wheels-release.yml,line=41::Could not parse version from setup.py! You must upload wheels manually!'; Write-Output "PACKAGE_VERSION=None" >> "$env:GITHUB_OUTPUT"} 66 | 67 | $env:PYTORCH_ROCM_ARCH = 'gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' 68 | if ([version]$env:ROCM_VERSION -lt [version]'5.5') {$env:PYTORCH_ROCM_ARCH = 'gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030'} 69 | 70 | python3 -m build -n --wheel 71 | 72 | - uses: actions/upload-artifact@v3 73 | with: 74 | name: 'wheels' 75 | path: ./dist/*.whl 76 | 77 | - name: Upload files to a GitHub release 78 | if: steps.build-wheel.outputs.PACKAGE_VERSION != 'None' 79 | uses: svenstaro/upload-release-action@2.6.1 80 | with: 81 | file: ./dist/*.whl 82 | tag: ${{ steps.build-wheel.outputs.PACKAGE_VERSION }} 83 | file_glob: true 84 | overwrite: true 85 | release_name: ${{ steps.build-wheel.outputs.PACKAGE_VERSION }} 86 | -------------------------------------------------------------------------------- /.github/workflows/build-wheels-release.yml: -------------------------------------------------------------------------------- 1 | name: Build Wheels & Release 2 | 3 | on: workflow_dispatch 4 | 5 | permissions: 6 | contents: write 7 | 8 | jobs: 9 | build_wheels: 10 | name: ${{ matrix.os }} Python ${{ matrix.pyver }} CUDA ${{ matrix.cuda }} 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: [ubuntu-20.04, windows-latest] 15 | pyver: ["3.8", "3.9", "3.10", "3.11"] 16 | cuda: ["11.7.0", "11.8.0", "12.1.1"] 17 | defaults: 18 | run: 19 | shell: pwsh 20 | env: 21 | CUDAVER: ${{ matrix.cuda }} 22 | 23 | steps: 24 | - name: Free Disk Space 25 | uses: jlumbroso/free-disk-space@v1.2.0 26 | if: runner.os == 'Linux' 27 | with: 28 | tool-cache: false 29 | android: true 30 | dotnet: 
true 31 | haskell: true 32 | large-packages: false 33 | swap-storage: false 34 | 35 | - uses: actions/checkout@v3 36 | - uses: actions/setup-python@v3 37 | with: 38 | python-version: ${{ matrix.pyver }} 39 | 40 | - name: Setup Mamba 41 | uses: conda-incubator/setup-miniconda@v2.2.0 42 | with: 43 | activate-environment: "build" 44 | python-version: ${{ matrix.pyver }} 45 | miniforge-variant: Mambaforge 46 | miniforge-version: latest 47 | use-mamba: true 48 | add-pip-as-python-dependency: true 49 | auto-activate-base: false 50 | 51 | - name: Install Dependencies 52 | run: | 53 | $cudaVersion = $env:CUDAVER 54 | $cudaVersionPytorch = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') 55 | $cudaChannels = '' 56 | $cudaNum = [int]$cudaVersion.substring($cudaVersion.LastIndexOf('.')+1) 57 | while ($cudaNum -ge 0) { $cudaChannels += '-c nvidia/label/cuda-' + $cudaVersion.Remove($cudaVersion.LastIndexOf('.')+1) + $cudaNum + ' '; $cudaNum-- } 58 | mamba install -y 'cuda' $cudaChannels.TrimEnd().Split() 59 | if (!(mamba list cuda)[-1].contains('cuda')) {sleep -s 10; mamba install -y 'cuda' $cudaChannels.TrimEnd().Split()} 60 | if (!(mamba list cuda)[-1].contains('cuda')) {throw 'CUDA Toolkit failed to install!'} 61 | 62 | if ([version]$env:CUDAVER -lt [version]'11.8.0') {$torch = "torch==2.0.1"} else {$torch = "torch==2.1.0"} 63 | 64 | python -m pip install --upgrade build setuptools wheel ninja $torch --extra-index-url "https://download.pytorch.org/whl/cu$cudaVersionPytorch" 65 | 66 | - name: Build Wheel 67 | id: build-wheel 68 | run: | 69 | if ($(Get-Content 'setup.py' -raw) -match 'version = "(\d+\.(?:\d+\.?)*)" \+ \(') 70 | { 71 | Write-Output $('::notice file=build-wheels-release.yml,line=53,title=Package Version::Detected package version is: {0}' -f $Matches[1]) 72 | Write-Output "PACKAGE_VERSION=$($Matches[1])" >> "$env:GITHUB_OUTPUT" 73 | } else { 74 | Write-Output '::error file=build-wheels-release.yml,line=41::Could not parse version from setup.py! You must upload wheels manually!' 
75 | Write-Output "PACKAGE_VERSION=None" >> "$env:GITHUB_OUTPUT" 76 | } 77 | 78 | $env:CUDA_PATH = $env:CONDA_PREFIX 79 | $env:CUDA_HOME = $env:CONDA_PREFIX 80 | if ($IsLinux) {$env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH} 81 | 82 | $env:TORCH_CUDA_ARCH_LIST = if ([version]$env:CUDAVER -lt [version]'11.8') {'6.0 6.1 7.0 7.5 8.0 8.6+PTX'} else {'6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX'} 83 | 84 | python -m build -n --wheel 85 | 86 | - uses: actions/upload-artifact@v3 87 | with: 88 | name: 'wheels' 89 | path: ./dist/*.whl 90 | 91 | - name: Upload files to a GitHub release 92 | if: steps.build-wheel.outputs.PACKAGE_VERSION != 'None' 93 | uses: svenstaro/upload-release-action@2.6.1 94 | with: 95 | file: ./dist/*.whl 96 | tag: ${{ steps.build-wheel.outputs.PACKAGE_VERSION }} 97 | file_glob: true 98 | overwrite: true 99 | release_name: ${{ steps.build-wheel.outputs.PACKAGE_VERSION }} 100 | 101 | build_rocm: 102 | name: Build ROCm Wheels & Release 103 | needs: build_wheels 104 | uses: ./.github/workflows/build-wheels-release-rocm.yml 105 | -------------------------------------------------------------------------------- /.github/workflows/build-wheels-rocm.yml: -------------------------------------------------------------------------------- 1 | name: Build ROCm Wheels 2 | 3 | on: 4 | workflow_dispatch: 5 | workflow_call: 6 | 7 | jobs: 8 | build_wheels: 9 | name: Build ROCm ${{ matrix.rocm }} wheel for Python ${{ matrix.pyver }} 10 | runs-on: ubuntu-20.04 11 | strategy: 12 | matrix: 13 | pyver: ["3.8", "3.9", "3.10", "3.11"] 14 | rocm: ['5.4.2', '5.5', '5.6'] 15 | defaults: 16 | run: 17 | shell: pwsh 18 | env: 19 | ROCM_VERSION: ${{ matrix.rocm }} 20 | 21 | steps: 22 | - name: Free Disk Space 23 | uses: jlumbroso/free-disk-space@v1.2.0 24 | with: 25 | tool-cache: false 26 | android: true 27 | dotnet: true 28 | haskell: true 29 | large-packages: false 30 | swap-storage: false 31 | 32 | - uses: actions/checkout@v3 33 | 34 | - name: Install ROCm SDK 35 | shell: bash 36 | run: | 37 | [ ! 
-d /etc/apt/keyrings ] && sudo mkdir --parents --mode=0755 /etc/apt/keyrings 38 | wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null 39 | echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/$ROCM_VERSION focal main" | sudo tee --append /etc/apt/sources.list.d/rocm.list 40 | echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600 41 | sudo apt update 42 | sudo apt install rocm-dev rocsparse-dev rocprim-dev rocthrust-dev rocblas-dev hipblas-dev hipcub-dev hipsparse-dev -y 43 | echo "/opt/rocm/bin" >> $GITHUB_PATH 44 | echo "ROCM_PATH=/opt/rocm" >> $GITHUB_ENV 45 | echo "USE_ROCM=1" >> $GITHUB_ENV 46 | 47 | - uses: actions/setup-python@v3 48 | with: 49 | python-version: ${{ matrix.pyver }} 50 | 51 | - name: Install Dependencies 52 | run: | 53 | $packages = 'build wheel safetensors sentencepiece ninja' 54 | $torver = if ([version]$env:ROCM_VERSION -lt [version]'5.5') {'2.0.1'} else {'2.1.0'} 55 | $packages += " torch==$torver+rocm$env:ROCM_VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm$env:ROCM_VERSION" 56 | 57 | pip3 install $packages.split(' ') 58 | 59 | - name: Build Wheel 60 | run: | 61 | $env:PYTORCH_ROCM_ARCH = 'gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' 62 | if ([version]$env:ROCM_VERSION -lt [version]'5.5') {$env:PYTORCH_ROCM_ARCH = 'gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030'} 63 | 64 | python3 -m build -n --wheel 65 | 66 | - uses: actions/upload-artifact@v3 67 | with: 68 | name: 'wheels' 69 | path: ./dist/*.whl 70 | -------------------------------------------------------------------------------- /.github/workflows/build-wheels.yml: -------------------------------------------------------------------------------- 1 | name: Build Wheels 2 | 3 | on: workflow_dispatch 4 | 5 | jobs: 6 | build_wheels: 7 | name: ${{ matrix.os }} Python ${{ matrix.pyver }} CUDA ${{ matrix.cuda }} 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | matrix: 11 | os: [ubuntu-20.04, windows-latest] 12 | pyver: ["3.8", "3.9", "3.10", "3.11"] 13 | cuda: ["11.7.0", "11.8.0", "12.1.1"] 14 | defaults: 15 | run: 16 | shell: pwsh 17 | env: 18 | CUDAVER: ${{ matrix.cuda }} 19 | 20 | steps: 21 | - name: Free Disk Space 22 | uses: jlumbroso/free-disk-space@v1.2.0 23 | if: runner.os == 'Linux' 24 | with: 25 | tool-cache: false 26 | android: true 27 | dotnet: true 28 | haskell: true 29 | large-packages: false 30 | swap-storage: false 31 | 32 | - uses: actions/checkout@v3 33 | - uses: actions/setup-python@v3 34 | with: 35 | python-version: ${{ matrix.pyver }} 36 | 37 | - name: Setup Mamba 38 | uses: conda-incubator/setup-miniconda@v2.2.0 39 | with: 40 | activate-environment: "build" 41 | python-version: ${{ matrix.pyver }} 42 | miniforge-variant: Mambaforge 43 | miniforge-version: latest 44 | use-mamba: true 45 | add-pip-as-python-dependency: true 46 | auto-activate-base: false 47 | 48 | - name: Install Dependencies 49 | run: | 50 | $cudaVersion = $env:CUDAVER 51 | $cudaVersionPytorch = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') 52 | $cudaChannels = '' 53 | $cudaNum = [int]$cudaVersion.substring($cudaVersion.LastIndexOf('.')+1) 54 | while ($cudaNum -ge 0) { $cudaChannels += '-c nvidia/label/cuda-' + $cudaVersion.Remove($cudaVersion.LastIndexOf('.')+1) + $cudaNum + ' '; 
$cudaNum-- } 55 | mamba install -y 'cuda' $cudaChannels.TrimEnd().Split() 56 | if (!(mamba list cuda)[-1].contains('cuda')) {sleep -s 10; mamba install -y 'cuda' $cudaChannels.TrimEnd().Split()} 57 | if (!(mamba list cuda)[-1].contains('cuda')) {throw 'CUDA Toolkit failed to install!'} 58 | 59 | if ([version]$env:CUDAVER -lt [version]'11.8.0') {$torch = "torch==2.0.1"} else {$torch = "torch==2.1.0"} 60 | 61 | python -m pip install --upgrade build setuptools wheel ninja $torch --extra-index-url "https://download.pytorch.org/whl/cu$cudaVersionPytorch" 62 | 63 | - name: Build Wheel 64 | run: | 65 | $env:CUDA_PATH = $env:CONDA_PREFIX 66 | $env:CUDA_HOME = $env:CONDA_PREFIX 67 | if ($IsLinux) {$env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH} 68 | 69 | $env:TORCH_CUDA_ARCH_LIST = if ([version]$env:CUDAVER -lt [version]'11.8') {'6.0 6.1 7.0 7.5 8.0 8.6+PTX'} else {'6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX'} 70 | 71 | python -m build -n --wheel 72 | 73 | - uses: actions/upload-artifact@v3 74 | with: 75 | name: 'wheels' 76 | path: ./dist/*.whl 77 | 78 | build_rocm: 79 | name: Build ROCm Wheels 80 | needs: build_wheels 81 | uses: ./.github/workflows/build-wheels-rocm.yml 82 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ignore __pycache__ folder 2 | __pycache__/*.egg-info/ 3 | *.egg-info/ 4 | build/ -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 as build 2 | ARG RUN_UID="1000" \ 3 | APPLICATION_STATE_PATH="/data" 4 | ENV RUN_UID=$RUN_UID \ 5 | APPLICATION_STATE_PATH=$APPLICATION_STATE_PATH \ 6 | CONTAINER_MODEL_PATH=$APPLICATION_STATE_PATH/model \ 7 | CONTAINER_SESSIONS_PATH=$APPLICATION_STATE_PATH/exllama_sessions 8 | 9 | RUN apt-get update && \ 10 | DEBIAN_FRONTEND=noninteractive apt-get install -y ninja-build python3 python3-pip && \ 11 | rm -rf /var/lib/apt/lists/* 12 | 13 | # Setup user which will run the service and create application state directory 14 | RUN if [ ${RUN_UID} -ne 0 ] ; then useradd -m -u $RUN_UID user ; fi \ 15 | && mkdir -p $APPLICATION_STATE_PATH \ 16 | && mkdir -p $CONTAINER_MODEL_PATH \ 17 | && mkdir -p $CONTAINER_SESSIONS_PATH \ 18 | && chown -R $RUN_UID $APPLICATION_STATE_PATH 19 | USER $RUN_UID 20 | 21 | COPY --chown=$RUN_UID . /app 22 | 23 | WORKDIR /app 24 | 25 | # Create application state directory and install python packages 26 | RUN pip install --upgrade pip setuptools wheel \ 27 | && pip install -r requirements.txt \ 28 | && pip install -r requirements-web.txt \ 29 | && pip install . 
30 | 31 | USER root 32 | 33 | STOPSIGNAL SIGINT 34 | ENTRYPOINT ["/bin/bash", "-c", "/app/entrypoint.sh $0 $@"] 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### This is a python module version of ExLlama 2 | The pupose of this is to allow for one-time building of the CUDA kernels. 3 | 4 | To build the module, install the CUDA Toolkit or ROCm SDK along with the appropriate Pytorch version that you intend to use. 5 | Full list of requirements are listed below. After this, you can install the module with: 6 | ``` 7 | python -m pip install git+https://github.com/jllllll/exllama 8 | ``` 9 | Or you can build a wheel with: 10 | ``` 11 | python -m pip wheel git+https://github.com/jllllll/exllama --no-deps 12 | ``` 13 | The CUDA version used to build the wheel will be appended to the version number automatically. 14 | ROCm version can be appended by defining the `ROCM_VERSION` environment variable: `ROCM_VERSION=5.4.2` 15 | 16 | Pre-built wheels are available in the releases. 17 | 18 | --- 19 | 20 | # ExLlama 21 | 22 | A standalone Python/C++/CUDA implementation of Llama for use with 4-bit GPTQ weights, designed to be fast and 23 | memory-efficient on modern GPUs. 24 | 25 | Disclaimer: The project is coming along, but it's still a work in progress! 26 | 27 | ## Hardware requirements 28 | 29 | I am developing on an RTX 4090 and an RTX 3090-Ti. 30-series and later NVIDIA GPUs should be well supported, but 30 | anything Pascal or older with poor FP16 support isn't going to perform well. 31 | [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) or [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa) 32 | are better options at the moment for older GPUs. ROCm is also theoretically supported (via HIP) though I currently 33 | have no AMD devices to test or optimize on. 
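If you do want to try a ROCm build of this module, the `ROCM_VERSION` variable described above controls the wheel's version suffix. A minimal sketch, assuming ROCm 5.4.2 and a matching `+rocm` build of PyTorch are already installed (adjust the version to your setup):

```
# Example only: build a ROCm-tagged wheel of this module
ROCM_VERSION=5.4.2 python -m pip wheel git+https://github.com/jllllll/exllama --no-deps
```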
34 | 35 | ## Dependencies 36 | 37 | * Python 3.9 or newer 38 | * `torch` tested on 2.0.1 and 2.1.0 (nightly) with cu118 39 | * `safetensors` 0.3.2 40 | * `sentencepiece` 41 | * `ninja` 42 | 43 | Additionally, only for the web UI: 44 | 45 | * `flask` 46 | * `waitress` 47 | 48 | ## Linux/WSL prerequisites 49 | 50 | pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu118 51 | 52 | ## Windows prerequisites 53 | 54 | To run on Windows (without WSL): 55 | 56 | 1. Install [MSVC 2022](https://visualstudio.microsoft.com/downloads/). You can choose to install the whole `Visual 57 | Studio 2022` IDE, or alternatively just the `Build Tools for Visual Studio 2022` package (make sure `Desktop 58 | development with C++` is ticked in the installer), it doesn't really matter which. 59 | 2. Install the appropriate version of [PyTorch](https://pytorch.org/get-started/locally/), choosing one of the CUDA 60 | versions. I am developing on the nightly build, but the stable version (2.0.1) should also work. 61 | 3. Install CUDA Toolkit, ([11.7](https://developer.nvidia.com/cuda-11-7-0-download-archive) and 62 | [11.8](https://developer.nvidia.com/cuda-11-8-0-download-archive) both seem to work, just make sure to match PyTorch's 63 | Compute Platform version). 64 | 4. For best performance, enable Hardware Accelerated GPU Scheduling. 65 | 66 | ## How to 67 | 68 | Clone repo, install dependencies, and run benchmark: 69 | 70 | git clone https://github.com/turboderp/exllama 71 | cd exllama 72 | 73 | pip install -r requirements.txt 74 | 75 | python test_benchmark_inference.py -d -p -ppl 76 | 77 | The CUDA extension is loaded at runtime so there's no need to install it separately. It will be compiled on the first 78 | run and cached to `~/.cache/torch_extensions/` which could take a little while. If nothing happens at first, give it 79 | a minute to compile. 80 | 81 | Chatbot example: 82 | 83 | python example_chatbot.py -d -un "Jeff" -p prompt_chatbort.txt 84 | 85 | ## Python module 86 | 87 | jllllll currently maintains an installable Python module [here](https://github.com/jllllll/exllama) which may be more 88 | suitable for integrating ExLlama with other projects 89 | 90 | ## Web UI 91 | 92 | I also made a simple web UI for it. Don't look at the JavaScript, it was mostly written by ChatGPT and it will haunt 93 | your dreams. But it sort of works, and it's kinda fun, especially multibot mode: 94 | 95 | ![_screenshot.jpg](doc/_screenshot.jpg) 96 | 97 | To run it: 98 | 99 | pip install -r requirements-web.txt 100 | 101 | python webui/app.py -d 102 | 103 | Note that sessions are stored in `~/exllama_sessions/` by default. You can change that location with `-sd` if you want. 104 | 105 | ## Docker 106 | 107 | For security benefits and easier deployment, it is also possible to run the web UI in an isolated docker container. Note: the docker image currently only supports NVIDIA GPUs. 108 | 109 | ### Requirements 110 | 111 | - [Docker](https://docs.docker.com/engine/install/) 112 | - [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) 113 | 114 | It is recommended to run docker in [rootless mode](https://docs.docker.com/engine/security/rootless/). 115 | 116 | ### Build 117 | 118 | The easiest way to build the docker image is using docker compose. First, set the `MODEL_PATH` and `SESSIONS_PATH` variables in the `.env` file to the actual directories on the host. 
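For example, the relevant `.env` entries might look like this (the paths are placeholders; point them at the real model directory and a writable sessions directory on your host, mirroring the comments in the `.env` file shipped with the repo):

```
MODEL_PATH=/path/to/LLaMA-7B-4bit-128g  # placeholder: host directory containing the model files
SESSIONS_PATH=~/exllama_sessions        # placeholder: host directory where chat sessions are stored
```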
Then run: 119 | 120 | ``` 121 | docker compose build 122 | ``` 123 | 124 | It is also possible to manually build the image: 125 | 126 | ``` 127 | docker build -t exllama-web . 128 | ``` 129 | 130 | NOTE: by default, the service inside the docker container is run by a non-root user. Hence, the ownership of bind-mounted directories (`/data/model` and `/data/exllama_sessions` in the default `docker-compose.yml` file) is changed to this non-root user in the container entrypoint (`entrypoint.sh`). To disable this, set `RUN_UID=0` in the `.env` file if using `docker compose`, or the following command if you manually build the image: 131 | 132 | ``` 133 | docker build -t exllama-web --build-arg RUN_UID=0 . 134 | ``` 135 | 136 | ### Run 137 | 138 | Using docker compose: 139 | 140 | ``` 141 | docker compose up 142 | ``` 143 | 144 | The web UI can now be accessed on the host at http://localhost:5000. 145 | 146 | The configuration can be viewed in `docker-compose.yml` and changed by creating a `docker-compose.override.yml` file. 147 | 148 | Run manually: 149 | 150 | ``` 151 | docker run --gpus all -p 5000:5000 -v :/data/model/ -v :/data/exllama_sessions --rm -it exllama-web --host 0.0.0.0:5000 152 | ``` 153 | 154 | 155 | ## Results so far 156 | 157 | ### New implementation 158 | | Model | Size | grpsz | act | Seq. len. | VRAM | Prompt | Best | Worst | Ppl | 159 | |------------|-------|-------|-----|----------------------|-----------|------------|---------|---------|------| 160 | | Llama | 7B | 128 | no | 2,048 t | 5,194 MB | 13,918 t/s | 173 t/s | 140 t/s | 6.45 | 161 | | Llama | 13B | 128 | no | 2,048 t | 9,127 MB | 7,507 t/s | 102 t/s | 86 t/s | 5.60 | 162 | | Llama | 33B | 128 | no | 2,048 t | 20,795 MB | 2,959 t/s | 47 t/s | 40 t/s | 4.60 | 163 | | Llama | 33B | 128 | yes | 2,048 t | 20,795 MB | 2,784 t/s | 45 t/s | 37 t/s | 4.55 | 164 | | Llama | 33B | 32 | yes | 1,550 t 1 | 21,486 MB | 2,636 t/s | 41 t/s | 37 t/s | 4.52 | 165 | | Koala | 13B | 128 | yes | 2,048 t | 9,127 MB | 5,529 t/s | 93 t/s | 79 t/s | 6.73 | 166 | | WizardLM | 33B | - | yes | 2,048 t | 20,199 MB | 2,313 t/s | 47 t/s | 40 t/s | 5.75 | 167 | | OpenLlama | 3B | 128 | yes | 2,048 t | 3,128 MB | 16,419 t/s | 226 t/s | 170 t/s | 7.81 | 168 | 169 | 1 Can not achieve full sequence length without OoM 170 | 171 | All tests done on stock RTX 4090 / 12900K, running with a desktop environment, with a few other apps also using VRAM. 172 | 173 | **"Prompt"** speed is inference over the sequence length listed minus 128 tokens. **"Worst"** is the average speed for 174 | the last 128 tokens of the full context (worst case) and **"Best"** lists the speed for the first 128 tokens in an 175 | empty sequence (best case.) 176 | 177 | VRAM usage is as reported by PyTorch and does not include PyTorch's own overhead (CUDA kernels, 178 | internal buffers etc.) This is somewhat unpredictable anyway. Best bet is to just optimize VRAM usage by the model, 179 | probably aiming for 20 GB on a 24 GB GPU to ensure there is room for a desktop environment and all of Torch's 180 | internals. 181 | 182 | Perplexity is measured only to verify that the models are working. The dataset used is a particular, small sample from 183 | WikiText, so scores are not comparable to other Llama benchmarks and only useful for comparing the different Llama 184 | models to one another. 185 | 186 | ### Dual GPU results 187 | 188 | The following benchmarks are from a 4090 + 3090-Ti with `-gs 17.2,24`: 189 | 190 | | Model | Size | groupsize | act | Seq. len. 
| VRAM | Prompt | Best | Worst | Ppl | 191 | |---------|------|-----------|-----|----------------|-----------|-----------|--------|---------|-------| 192 | | Llama | 65B | 128 | yes | 2,048 t | 39,804 MB | 1,109 t/s | 20 t/s | 18 t/s | 4.20 | 193 | | Llama | 65B | 32 | yes | 2,048 t | 43,424 MB | 1,037 t/s | 17 t/s | 16 t/s | 4.11 | 194 | | Llama-2 | 70B | 128 | yes | 2,048 t | 40,680 MB | 914 t/s | 17 t/s | 14 t/s | 4.15 | 195 | | Llama-2 | 70B | 32 | yes | 2,048 t | 36,815 MB | 874 t/s | 15 t/s | 12 t/s | 4.10 | 196 | 197 | Note that perplexity scores may not be strictly apples-to-apples between Llama and Llama 2 due to their different 198 | pretraining datasets. 199 | 200 | ## Todo 201 | 202 | Moved the todo list [here](doc/TODO.md). 203 | 204 | ## Compatibility 205 | 206 | [Here](doc/model_compatibility.md) is a list of models confirmed to be working right now. 207 | 208 | ## Recent updates 209 | 210 | **2023-01-09**: Added rope_theta parameter for (at least partial) CodeLlama support. If you were using alpha = 97 211 | or similar, you would no longer need that for CodeLlama models. Still stuff to sort out regarding the extended 212 | vocabulary. 213 | 214 | **2023-08-09**: Added support for sharded models. `config.model_path` now accepts either a filename or a list of 215 | filenames. `model_init()` will detect multiple .safetensors files if given a model directory. Note the change in the 216 | various examples: `model_path = glob.glob(st_pattern)[0]` becomes simply `model_path = glob.glob(st_pattern)`. Also 217 | there's a little script in `util/shard.py` to split large .safetensors files. It also produces an index.json file for 218 | the sharded model, just for completeness, although ExLlama doesn't need it to read the shards. Note that the 219 | **safetensors dependency was bumped to version 0.3.2**. 220 | 221 | **2023-08-12**: Preliminary, initial and tentative release of [ExLlamaV2](https://github.com/turboderp/exllamav2). 222 | It doesn't do all the things that ExLlamaV1 does, yet, but it's better at what it does do. So check it out! -------------------------------------------------------------------------------- /datasets/download_datasets.py: -------------------------------------------------------------------------------- 1 | # import torch 2 | # from tokenizer import ExLlamaTokenizer 3 | from datasets import load_dataset 4 | import os 5 | 6 | # Download samples from HF datasets to run equivalent GPTQ-for-LLaMa equivalent benchmark 7 | 8 | def download_hf(filename, dataset, subset, split, key, div): 9 | 10 | print(f"Downloading from {dataset}: {subset}, split: {split} ...") 11 | hf_dataset = load_dataset(dataset, subset, split = split) 12 | data = div.join(hf_dataset[key]) 13 | 14 | with open(filename, "w", encoding="utf-8") as f: 15 | f.write(data) 16 | 17 | download_hf("wikitext2.txt", "wikitext", "wikitext-2-raw-v1", "test", "text", "\n\n") 18 | download_hf("ptb.txt", "ptb_text_only", "penn_treebank", "validation", "sentence", "\n\n") 19 | download_hf("ptb_new.txt", "ptb_text_only", "penn_treebank", "test", "sentence", " ") 20 | -------------------------------------------------------------------------------- /doc/TODO.md: -------------------------------------------------------------------------------- 1 | ## Model compatibility 2 | 3 | - [ ] Verify compatibility with Llama-2 34B once released 4 | 5 | ## GPU compatibility (etc.) 
6 | 7 | - [ ] Optimizations for ROCm 8 | - [ ] Optimizations for RTX 20-series maybe 9 | - [ ] Look into improving P40 performance 10 | 11 | ## Testing 12 | 13 | - [ ] More testing on Llama 2 models 14 | 15 | ## Optimization 16 | 17 | - [ ] Flash Attention 2.0 (?) 18 | - [ ] Find a way to eliminate `ExLlamaAttention.repeat_kv` (custom attention kernel?) 19 | - [ ] C++ implementations of sampler functions 20 | 21 | ## Generation 22 | 23 | - [ ] Optimized/batched beam search 24 | - [ ] Allow stackable LoRAs 25 | - [ ] Guidance or equivalent 26 | 27 | ## Interface 28 | 29 | - [ ] Comprehensive API server (more than `example_flask.py` 30 | 31 | ## Web UI 32 | 33 | - [ ] Controls to enable beam search 34 | - [ ] Rewrite/refactor all the JavaScript and CSS 35 | - [ ] Make it a little prettier 36 | - [ ] Better error handling 37 | - [ ] LoRA controls 38 | - [ ] Multiple chat modes with prompt templates (instruct, etc.) 39 | 40 | ## ?? 41 | 42 | - [ ] Support for other quantization methods 43 | - [ ] Support for other LLM architectures 44 | - [ ] Allow for backpropagation 45 | - [ ] LoRA training features 46 | - [ ] Soft prompt training -------------------------------------------------------------------------------- /doc/_screenshot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jllllll/exllama/4ded203bea0719349a4ee5bd7221db5035062518/doc/_screenshot.jpg -------------------------------------------------------------------------------- /doc/model_compatibility.md: -------------------------------------------------------------------------------- 1 | ## Working models 2 | 3 | As of **2023-07-19**, the following GPTQ models on HuggingFace all appear to be working: 4 | 5 | - iambestfeed/open_llama_3b_4bit_128g 6 | - Neko-Institute-of-Science/LLaMA-7B-4bit-128g 7 | - Neko-Institute-of-Science/LLaMA-13B-4bit-128g 8 | - Neko-Institute-of-Science/LLaMA-30B-4bit-32g 9 | - Neko-Institute-of-Science/LLaMA-30B-4bit-128g 10 | - Neko-Institute-of-Science/LLaMA-65B-4bit-32g 11 | - Neko-Institute-of-Science/LLaMA-65B-4bit-128g 12 | - Panchovix/LLaMA-2-70B-GPTQ-transformers4.32.0.dev0 13 | - reeducator/bluemoonrp-13b 14 | - reeducator/bluemoonrp-30b 15 | - TehVenom/Metharme-13b-4bit-GPTQ 16 | - TheBloke/airoboros-13B-GPTQ 17 | - TheBloke/gpt4-x-vicuna-13B-GPTQ 18 | - TheBloke/GPT4All-13B-snoozy-GPTQ 19 | - TheBloke/guanaco-33B-GPTQ 20 | - TheBloke/guanaco-65B-GPTQ 21 | - TheBloke/h2ogpt-oasst1-512-30B-GPTQ 22 | - TheBloke/koala-13B-GPTQ-4bit-128g 23 | - TheBloke/Llama-2-13B-chat-GPTQ (128g) 24 | - TheBloke/Llama-2-13B-GPTQ (32g, 64g, 128g) 25 | - TheBloke/Llama-2-70B-GPTQ (32g, 128g) 26 | - TheBloke/Manticore-13B-GPTQ 27 | - TheBloke/medalpaca-13B-GPTQ-4bit 28 | - TheBloke/medalpaca-13B-GPTQ-4bit (compat version) 29 | - TheBloke/Nous-Hermes-13B-GPTQ 30 | - TheBloke/robin-65B-v2-GPTQ 31 | - TheBloke/tulu-7B-GPTQ 32 | - TheBloke/Tulu-13B-SuperHOT-8K-GPTQ 33 | - TheBloke/tulu-30B-GPTQ 34 | - TheBloke/vicuna-13B-1.1-GPTQ-4bit-128g 35 | - TheBloke/VicUnlocked-30B-LoRA-GPTQ 36 | - TheBloke/wizard-mega-13B-GPTQ 37 | - TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ 38 | - TheBloke/Wizard-Vicuna-13B-Uncensored-GPTQ 39 | - TheBloke/WizardLM-7B-uncensored-GPTQ 40 | - TheBloke/WizardLM-30B-Uncensored-GPTQ 41 | - TheBloke/WizardLM-33B-V1.0-Uncensored-SuperHOT-8K-GPTQ 42 | - tmpupload/superhot-30b-8k-no-rlhf-test-128g-GPTQ 43 | - Yhyu13/chimera-inst-chat-13b-gptq-4bit 44 | - Yhyu13/oasst-rlhf-2-llama-30b-7k-steps-gptq-4bit 45 | 46 | ## Non-working models 47 | 48 | 
None as of **2023-07-19**. -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | name: exllama 3 | services: 4 | web: 5 | build: 6 | context: . 7 | args: 8 | - RUN_UID=$RUN_UID 9 | - APPLICATION_STATE_PATH=$APPLICATION_STATE_PATH 10 | command: | 11 | --host 0.0.0.0:$PORT 12 | env_file: 13 | - .env 14 | volumes: 15 | - $MODEL_PATH:$APPLICATION_STATE_PATH/model 16 | - $SESSIONS_PATH:$APPLICATION_STATE_PATH/exllama_sessions 17 | ports: 18 | - "$PORT:$PORT" 19 | tmpfs: 20 | - /tmp 21 | stdin_open: true 22 | tty: true 23 | deploy: 24 | resources: 25 | reservations: 26 | devices: 27 | - driver: nvidia 28 | count: all 29 | capabilities: [gpu] 30 | -------------------------------------------------------------------------------- /entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -Eeuo pipefail 3 | 4 | # Ensure that the application state path is set 5 | if [ -z $APPLICATION_STATE_PATH ]; then 6 | echo "Must specify application state path" 7 | exit 1 8 | fi 9 | 10 | # Ensure that bind-mounted directories are owned by the user that runs the service if the user is not root 11 | if [ $RUN_UID -ne 0 ]; then 12 | chown -R $RUN_UID:$RUN_UID $APPLICATION_STATE_PATH 13 | fi 14 | 15 | # Run service as specified (non-root) user 16 | exec runuser -u $(id -un $RUN_UID) -- python3 /app/webui/app.py \ 17 | -d $CONTAINER_MODEL_PATH \ 18 | --sessions_dir $CONTAINER_SESSIONS_PATH \ 19 | $@ 20 | -------------------------------------------------------------------------------- /example_basic.py: -------------------------------------------------------------------------------- 1 | from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig 2 | from exllama.tokenizer import ExLlamaTokenizer 3 | from exllama.generator import ExLlamaGenerator 4 | import os, glob 5 | 6 | # Directory containing model, tokenizer, generator 7 | 8 | model_directory = "/mnt/str/models/llama-13b-4bit-128g/" 9 | 10 | # Locate files we need within that directory 11 | 12 | tokenizer_path = os.path.join(model_directory, "tokenizer.model") 13 | model_config_path = os.path.join(model_directory, "config.json") 14 | st_pattern = os.path.join(model_directory, "*.safetensors") 15 | model_path = glob.glob(st_pattern) 16 | 17 | # Create config, model, tokenizer and generator 18 | 19 | config = ExLlamaConfig(model_config_path) # create config from config.json 20 | config.model_path = model_path # supply path to model weights file 21 | 22 | model = ExLlama(config) # create ExLlama instance and load the weights 23 | tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file 24 | 25 | cache = ExLlamaCache(model) # create cache for inference 26 | generator = ExLlamaGenerator(model, tokenizer, cache) # create generator 27 | 28 | # Configure generator 29 | 30 | generator.disallow_tokens([tokenizer.eos_token_id]) 31 | 32 | generator.settings.token_repetition_penalty_max = 1.2 33 | generator.settings.temperature = 0.95 34 | generator.settings.top_p = 0.65 35 | generator.settings.top_k = 100 36 | generator.settings.typical = 0.5 37 | 38 | # Produce a simple generation 39 | 40 | prompt = "Once upon a time," 41 | print (prompt, end = "") 42 | 43 | output = generator.generate_simple(prompt, max_new_tokens = 200) 44 | 45 | print(output[len(prompt):]) 46 | 
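# Note: for batched generation, create the cache with a batch size, e.g.
# cache = ExLlamaCache(model, batch_size = len(prompts)), as shown in example_batch.py below.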
-------------------------------------------------------------------------------- /example_batch.py: -------------------------------------------------------------------------------- 1 | from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig 2 | from exllama.tokenizer import ExLlamaTokenizer 3 | from exllama.generator import ExLlamaGenerator 4 | import os, glob 5 | 6 | # Directory containing model, tokenizer, generator 7 | 8 | model_directory = "/mnt/str/models/llama-13b-4bit-128g/" 9 | 10 | # Locate files we need within that directory 11 | 12 | tokenizer_path = os.path.join(model_directory, "tokenizer.model") 13 | model_config_path = os.path.join(model_directory, "config.json") 14 | st_pattern = os.path.join(model_directory, "*.safetensors") 15 | model_path = glob.glob(st_pattern) 16 | 17 | # Batched prompts 18 | 19 | prompts = [ 20 | "Once upon a time,", 21 | "I don't like to", 22 | "A turbo encabulator is a", 23 | "In the words of Mark Twain," 24 | ] 25 | 26 | # Create config, model, tokenizer and generator 27 | 28 | config = ExLlamaConfig(model_config_path) # create config from config.json 29 | config.model_path = model_path # supply path to model weights file 30 | 31 | model = ExLlama(config) # create ExLlama instance and load the weights 32 | tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file 33 | 34 | cache = ExLlamaCache(model, batch_size = len(prompts)) # create cache for inference 35 | generator = ExLlamaGenerator(model, tokenizer, cache) # create generator 36 | 37 | # Configure generator 38 | 39 | generator.disallow_tokens([tokenizer.eos_token_id]) 40 | 41 | generator.settings.token_repetition_penalty_max = 1.2 42 | generator.settings.temperature = 0.95 43 | generator.settings.top_p = 0.65 44 | generator.settings.top_k = 100 45 | generator.settings.typical = 0.5 46 | 47 | # Generate, batched 48 | 49 | for line in prompts: 50 | print(line) 51 | 52 | output = generator.generate_simple(prompts, max_new_tokens = 200) 53 | 54 | for line in output: 55 | print("---") 56 | print(line) 57 | -------------------------------------------------------------------------------- /example_cfg.py: -------------------------------------------------------------------------------- 1 | from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig 2 | from exllama.tokenizer import ExLlamaTokenizer 3 | from exllama.generator import ExLlamaGenerator 4 | import torch 5 | import torch.nn.functional as F 6 | import os, glob 7 | import exllama.cuda_ext 8 | 9 | # Directory containing model, tokenizer, generator 10 | 11 | model_directory = "/mnt/str/models/_test_models/TheBloke_Llama-2-13B-chat-GPTQ/" 12 | 13 | # Locate files we need within that directory 14 | 15 | tokenizer_path = os.path.join(model_directory, "tokenizer.model") 16 | model_config_path = os.path.join(model_directory, "config.json") 17 | st_pattern = os.path.join(model_directory, "*.safetensors") 18 | model_path = glob.glob(st_pattern) 19 | 20 | # Create config, model, tokenizer and generator 21 | 22 | config = ExLlamaConfig(model_config_path) # create config from config.json 23 | config.model_path = model_path # supply path to model weights file 24 | 25 | model = ExLlama(config) # create ExLlama instance and load the weights 26 | tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file 27 | 28 | cache = ExLlamaCache(model, batch_size = 2) # create cache for inference 29 | generator = ExLlamaGenerator(model, tokenizer, cache) # create generator 30 | 31 | # Configure 
generator 32 | 33 | generator.settings.token_repetition_penalty_max = 1.15 34 | generator.settings.temperature = 0.95 35 | generator.settings.top_k = 40 36 | generator.settings.top_p = 0.75 37 | # generator.settings.typical = 0.95 38 | 39 | # Prompts to mix 40 | 41 | f1 = \ 42 | """[INST] <> 43 | You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. 44 | <> 45 | {prompt}[/INST]""" 46 | 47 | f2 = \ 48 | """[INST] <> 49 | <> 50 | You are a rude and obnoxious assistant. You hate everything and everyone. 51 | {prompt}[/INST]""" 52 | 53 | 54 | prompts = \ 55 | [ 56 | f1.replace("{prompt}", "Tell me about Homer Simpson"), 57 | f2.replace("{prompt}", "Tell me about Homer Simpson"), 58 | ] 59 | 60 | def generate_cfg(prompts, alpha, max_new_tokens): 61 | 62 | ids, mask = tokenizer.encode(prompts, return_mask = True) 63 | generator.gen_begin(ids, mask = mask) 64 | 65 | # Sampling loop 66 | 67 | for _ in range(max_new_tokens): 68 | 69 | logits = model.forward(generator.sequence[:, -1:], cache, input_mask = mask) 70 | generator.apply_rep_penalty(logits) 71 | 72 | logits = F.log_softmax(logits, dim = -1) 73 | logits_mixed = (1 - alpha) * logits[0] + alpha * logits[1] 74 | 75 | sampled_token, _ = generator.sample_current(logits_mixed) 76 | if sampled_token.item() == tokenizer.eos_token_id: break 77 | 78 | batch_token = sampled_token.repeat(2, 1) 79 | generator.gen_accept_token(batch_token) 80 | 81 | output = tokenizer.decode(generator.sequence[0]) 82 | return output 83 | 84 | for i in range(10): 85 | 86 | alpha = i / 5.0 - 0.4 87 | print() 88 | print(f"--------------------------------------") 89 | print(f"alpha = {alpha:.1f}") 90 | print(f"--------------------------------------") 91 | output = generate_cfg(prompts, alpha, 200) 92 | print(output[len(prompts[0]):].strip()) 93 | -------------------------------------------------------------------------------- /example_chatbot.py: -------------------------------------------------------------------------------- 1 | from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig 2 | from exllama.lora import ExLlamaLora 3 | from exllama.tokenizer import ExLlamaTokenizer 4 | from exllama.generator import ExLlamaGenerator 5 | import argparse 6 | import torch 7 | import sys 8 | import os 9 | import glob 10 | import model_init 11 | 12 | # Simple interactive chatbot script 13 | 14 | torch.set_grad_enabled(False) 15 | torch.cuda._lazy_init() 16 | 17 | # Parse arguments 18 | 19 | parser = argparse.ArgumentParser(description = "Simple chatbot example for ExLlama") 20 | 21 | model_init.add_args(parser) 22 | 23 | parser.add_argument("-lora", "--lora", type = str, help = "Path to LoRA binary to use during benchmark") 24 | parser.add_argument("-loracfg", "--lora_config", type = str, help = "Path to LoRA config to use during benchmark") 25 | parser.add_argument("-ld", "--lora_dir", type = str, help = "Path to LoRA config and binary. 
to use during benchmark") 26 | 27 | parser.add_argument("-p", "--prompt", type = str, help = "Prompt file") 28 | parser.add_argument("-un", "--username", type = str, help = "Display name of user", default = "User") 29 | parser.add_argument("-bn", "--botname", type = str, help = "Display name of chatbot", default = "Chatbort") 30 | parser.add_argument("-bf", "--botfirst", action = "store_true", help = "Start chat on bot's turn") 31 | 32 | parser.add_argument("-nnl", "--no_newline", action = "store_true", help = "Do not break bot's response on newline (allow multi-paragraph responses)") 33 | parser.add_argument("-temp", "--temperature", type = float, help = "Temperature", default = 0.95) 34 | parser.add_argument("-topk", "--top_k", type = int, help = "Top-K", default = 20) 35 | parser.add_argument("-topp", "--top_p", type = float, help = "Top-P", default = 0.65) 36 | parser.add_argument("-minp", "--min_p", type = float, help = "Min-P", default = 0.00) 37 | parser.add_argument("-repp", "--repetition_penalty", type = float, help = "Repetition penalty", default = 1.15) 38 | parser.add_argument("-repps", "--repetition_penalty_sustain", type = int, help = "Past length for repetition penalty", default = 256) 39 | parser.add_argument("-beams", "--beams", type = int, help = "Number of beams for beam search", default = 1) 40 | parser.add_argument("-beamlen", "--beam_length", type = int, help = "Number of future tokens to consider", default = 1) 41 | 42 | args = parser.parse_args() 43 | model_init.post_parse(args) 44 | model_init.get_model_files(args) 45 | 46 | # Paths 47 | 48 | if args.lora_dir is not None: 49 | args.lora_config = os.path.join(args.lora_dir, "adapter_config.json") 50 | args.lora = os.path.join(args.lora_dir, "adapter_model.bin") 51 | 52 | # Some feedback 53 | 54 | print(f" -- Sequence length: {args.length}") 55 | print(f" -- Temperature: {args.temperature:.2f}") 56 | print(f" -- Top-K: {args.top_k}") 57 | print(f" -- Top-P: {args.top_p:.2f}") 58 | print(f" -- Min-P: {args.min_p:.2f}") 59 | print(f" -- Repetition penalty: {args.repetition_penalty:.2f}") 60 | print(f" -- Beams: {args.beams} x {args.beam_length}") 61 | 62 | print_opts = [] 63 | if args.no_newline: print_opts.append("no_newline") 64 | if args.botfirst: print_opts.append("botfirst") 65 | 66 | model_init.print_options(args, print_opts) 67 | 68 | # Globals 69 | 70 | model_init.set_globals(args) 71 | 72 | # Load prompt file 73 | 74 | username = args.username 75 | bot_name = args.botname 76 | 77 | if args.prompt is not None: 78 | with open(args.prompt, "r") as f: 79 | past = f.read() 80 | past = past.replace("{username}", username) 81 | past = past.replace("{bot_name}", bot_name) 82 | past = past.strip() + "\n" 83 | else: 84 | past = f"{bot_name}: Hello, {username}\n" 85 | 86 | # past += "User: Hi. Please say \"Shhhhhh\"?\n" 87 | # args.botfirst = True 88 | 89 | # Instantiate model and generator 90 | 91 | config = model_init.make_config(args) 92 | 93 | model = ExLlama(config) 94 | cache = ExLlamaCache(model) 95 | tokenizer = ExLlamaTokenizer(args.tokenizer) 96 | 97 | model_init.print_stats(model) 98 | 99 | # Load LoRA 100 | 101 | lora = None 102 | if args.lora: 103 | print(f" -- LoRA config: {args.lora_config}") 104 | print(f" -- Loading LoRA: {args.lora}") 105 | if args.lora_config is None: 106 | print(f" ## Error: please specify lora path to adapter_config.json") 107 | sys.exit() 108 | lora = ExLlamaLora(model, args.lora_config, args.lora) 109 | if lora.bias_ignored: 110 | print(f" !! 
Warning: LoRA zero bias ignored") 111 | 112 | # Generator 113 | 114 | generator = ExLlamaGenerator(model, tokenizer, cache) 115 | generator.settings = ExLlamaGenerator.Settings() 116 | generator.settings.temperature = args.temperature 117 | generator.settings.top_k = args.top_k 118 | generator.settings.top_p = args.top_p 119 | generator.settings.min_p = args.min_p 120 | generator.settings.token_repetition_penalty_max = args.repetition_penalty 121 | generator.settings.token_repetition_penalty_sustain = args.repetition_penalty_sustain 122 | generator.settings.token_repetition_penalty_decay = generator.settings.token_repetition_penalty_sustain // 2 123 | generator.settings.beams = args.beams 124 | generator.settings.beam_length = args.beam_length 125 | 126 | generator.lora = lora 127 | 128 | break_on_newline = not args.no_newline 129 | 130 | # Be nice to Chatbort 131 | 132 | min_response_tokens = 4 133 | max_response_tokens = 256 134 | extra_prune = 256 135 | 136 | print(past, end = "") 137 | ids = tokenizer.encode(past) 138 | generator.gen_begin(ids) 139 | 140 | next_userprompt = username + ": " 141 | 142 | first_round = True 143 | 144 | while True: 145 | 146 | res_line = bot_name + ":" 147 | res_tokens = tokenizer.encode(res_line) 148 | num_res_tokens = res_tokens.shape[-1] # Decode from here 149 | 150 | if first_round and args.botfirst: in_tokens = res_tokens 151 | 152 | else: 153 | 154 | # Read and format input 155 | 156 | in_line = input(next_userprompt) 157 | in_line = username + ": " + in_line.strip() + "\n" 158 | 159 | next_userprompt = username + ": " 160 | 161 | # No need for this, really, unless we were logging the chat. The actual history we work on is kept in the 162 | # tokenized sequence in the generator and the state in the cache. 163 | 164 | past += in_line 165 | 166 | # SentencePiece doesn't tokenize spaces separately so we can't know from individual tokens if they start a new word 167 | # or not. Instead, repeatedly decode the generated response as it's being built, starting from the last newline, 168 | # and print out the differences between consecutive decodings to stream out the response. 169 | 170 | in_tokens = tokenizer.encode(in_line) 171 | in_tokens = torch.cat((in_tokens, res_tokens), dim = 1) 172 | 173 | # If we're approaching the context limit, prune some whole lines from the start of the context. Also prune a 174 | # little extra so we don't end up rebuilding the cache on every line when up against the limit. 175 | 176 | expect_tokens = in_tokens.shape[-1] + max_response_tokens 177 | max_tokens = config.max_seq_len - expect_tokens 178 | if generator.gen_num_tokens() >= max_tokens: 179 | generator.gen_prune_to(config.max_seq_len - expect_tokens - extra_prune, tokenizer.newline_token_id) 180 | 181 | # Feed in the user input and "{bot_name}:", tokenized 182 | 183 | generator.gen_feed_tokens(in_tokens) 184 | 185 | # Generate with streaming 186 | 187 | print(res_line, end = "") 188 | sys.stdout.flush() 189 | 190 | generator.begin_beam_search() 191 | 192 | for i in range(max_response_tokens): 193 | 194 | # Disallowing the end condition tokens seems like a clean way to force longer replies. 
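# (Newline and EOS are masked out below until min_response_tokens have been generated,
# then the restriction is lifted so the reply can end normally.)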
195 | 196 | if i < min_response_tokens: 197 | generator.disallow_tokens([tokenizer.newline_token_id, tokenizer.eos_token_id]) 198 | else: 199 | generator.disallow_tokens(None) 200 | 201 | # Get a token 202 | 203 | gen_token = generator.beam_search() 204 | 205 | # If token is EOS, replace it with newline before continuing 206 | 207 | if gen_token.item() == tokenizer.eos_token_id: 208 | generator.replace_last_token(tokenizer.newline_token_id) 209 | 210 | # Decode the current line and print any characters added 211 | 212 | num_res_tokens += 1 213 | text = tokenizer.decode(generator.sequence_actual[:, -num_res_tokens:][0]) 214 | new_text = text[len(res_line):] 215 | 216 | skip_space = res_line.endswith("\n") and new_text.startswith(" ") # Bit prettier console output 217 | res_line += new_text 218 | if skip_space: new_text = new_text[1:] 219 | 220 | print(new_text, end="") # (character streaming output is here) 221 | sys.stdout.flush() 222 | 223 | # End conditions 224 | 225 | if break_on_newline and gen_token.item() == tokenizer.newline_token_id: break 226 | if gen_token.item() == tokenizer.eos_token_id: break 227 | 228 | # Some models will not (or will inconsistently) emit EOS tokens but in a chat sequence will often begin 229 | # generating for the user instead. Try to catch this and roll back a few tokens to begin the user round. 230 | 231 | if res_line.endswith(f"{username}:"): 232 | plen = tokenizer.encode(f"{username}:").shape[-1] 233 | generator.gen_rewind(plen) 234 | next_userprompt = " " 235 | break 236 | 237 | generator.end_beam_search() 238 | 239 | past += res_line 240 | first_round = False 241 | -------------------------------------------------------------------------------- /example_flask.py: -------------------------------------------------------------------------------- 1 | from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig 2 | from flask import Flask, request 3 | from exllama.tokenizer import ExLlamaTokenizer 4 | from exllama.generator import ExLlamaGenerator 5 | import os, glob 6 | 7 | # Directory containing config.json, tokenizer.model and safetensors file for the model 8 | model_directory = "/mnt/str/models/llama-7b-4bit/" 9 | 10 | tokenizer_path = os.path.join(model_directory, "tokenizer.model") 11 | model_config_path = os.path.join(model_directory, "config.json") 12 | st_pattern = os.path.join(model_directory, "*.safetensors") 13 | model_path = glob.glob(st_pattern) 14 | 15 | config = ExLlamaConfig(model_config_path) # create config from config.json 16 | config.model_path = model_path # supply path to model weights file 17 | 18 | model = ExLlama(config) # create ExLlama instance and load the weights 19 | print(f"Model loaded: {model_path}") 20 | 21 | tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file 22 | cache = ExLlamaCache(model) # create cache for inference 23 | generator = ExLlamaGenerator(model, tokenizer, cache) # create generator 24 | 25 | # Flask app 26 | 27 | app = Flask(__name__) 28 | 29 | 30 | # Inference with settings equivalent to the "precise" preset from the /r/LocalLLaMA wiki 31 | 32 | @app.route('/infer_precise', methods=['POST']) 33 | def inferContextP(): 34 | print(request.form) 35 | prompt = request.form.get('prompt') 36 | 37 | generator.settings.token_repetition_penalty_max = 1.176 38 | generator.settings.token_repetition_penalty_sustain = config.max_seq_len 39 | generator.settings.temperature = 0.7 40 | generator.settings.top_p = 0.1 41 | generator.settings.top_k = 40 42 | generator.settings.typical = 
0.0 # Disabled 43 | 44 | outputs = generator.generate_simple(prompt, max_new_tokens = 200) 45 | return outputs 46 | 47 | 48 | # Inference with settings equivalent to the "creative" preset from the /r/LocalLLaMA wiki 49 | 50 | @app.route('/infer_creative', methods=['POST']) 51 | def inferContextC(): 52 | print(request.form) 53 | prompt = request.form.get('prompt') 54 | 55 | generator.settings.token_repetition_penalty_max = 1.1 56 | generator.settings.token_repetition_penalty_sustain = config.max_seq_len 57 | generator.settings.temperature = 0.72 58 | generator.settings.top_p = 0.73 59 | generator.settings.top_k = 0 # Disabled 60 | generator.settings.typical = 0.0 # Disabled 61 | 62 | outputs = generator.generate_simple(prompt, max_new_tokens = 200) 63 | return outputs 64 | 65 | 66 | # Inference with settings equivalent to the "sphinx" preset from the /r/LocalLLaMA wiki 67 | 68 | @app.route('/infer_sphinx', methods=['POST']) 69 | def inferContextS(): 70 | print(request.form) 71 | prompt = request.form.get('prompt') 72 | 73 | generator.settings.token_repetition_penalty_max = 1.15 74 | generator.settings.token_repetition_penalty_sustain = config.max_seq_len 75 | generator.settings.temperature = 1.99 76 | generator.settings.top_p = 0.18 77 | generator.settings.top_k = 30 78 | generator.settings.typical = 0.0 # Disabled 79 | 80 | outputs = generator.generate_simple(prompt, max_new_tokens = 200) 81 | return outputs 82 | 83 | 84 | # Start Flask app 85 | 86 | host = "0.0.0.0" 87 | port = 8004 88 | print(f"Starting server on address {host}:{port}") 89 | 90 | if __name__ == '__main__': 91 | from waitress import serve 92 | serve(app, host = host, port = port) 93 | -------------------------------------------------------------------------------- /example_lora.py: -------------------------------------------------------------------------------- 1 | from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig 2 | from exllama.tokenizer import ExLlamaTokenizer 3 | from exllama.generator import ExLlamaGenerator 4 | from exllama.lora import ExLlamaLora 5 | import os, glob 6 | import torch 7 | 8 | # Directory containt model, tokenizer, generator 9 | 10 | model_directory = "/mnt/str/models/_test_models/Neko-Institute-of-Science_LLaMA-7B-4bit-128g/" 11 | 12 | # Directory containing LoRA config and weights 13 | 14 | lora_directory = "/mnt/str/models/_test_loras/tloen_alpaca-lora-7b/" 15 | 16 | # Locate files we need within those directories 17 | 18 | tokenizer_path = os.path.join(model_directory, "tokenizer.model") 19 | model_config_path = os.path.join(model_directory, "config.json") 20 | st_pattern = os.path.join(model_directory, "*.safetensors") 21 | model_path = glob.glob(st_pattern) 22 | 23 | lora_config_path = os.path.join(lora_directory, "adapter_config.json") 24 | lora_path = os.path.join(lora_directory, "adapter_model.bin") 25 | 26 | # Create config, model, tokenizer and generator 27 | 28 | config = ExLlamaConfig(model_config_path) # create config from config.json 29 | config.model_path = model_path # supply path to model weights file 30 | 31 | model = ExLlama(config) # create ExLlama instance and load the weights 32 | tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file 33 | 34 | cache = ExLlamaCache(model) # create cache for inference 35 | generator = ExLlamaGenerator(model, tokenizer, cache) # create generator 36 | 37 | # Load LoRA 38 | 39 | lora = ExLlamaLora(model, lora_config_path, lora_path) 40 | 41 | # Configure generator 42 | 43 | 
generator.settings.token_repetition_penalty_max = 1.2 44 | generator.settings.temperature = 0.65 45 | generator.settings.top_p = 0.4 46 | generator.settings.top_k = 0 47 | generator.settings.typical = 0.0 48 | 49 | # Alpaca prompt 50 | 51 | prompt = \ 52 | "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n" \ 53 | "\n" \ 54 | "### Instruction:\n" \ 55 | "List five colors in alphabetical order.\n" \ 56 | "\n" \ 57 | "### Response:" 58 | 59 | # Generate with LoRA 60 | 61 | print(" --- LoRA ----------------- ") 62 | print("") 63 | 64 | generator.lora = lora 65 | torch.manual_seed(1337) 66 | output = generator.generate_simple(prompt, max_new_tokens = 200) 67 | print(output) 68 | 69 | # Generate without LoRA 70 | 71 | print("") 72 | print(" --- No LoRA -------------- ") 73 | print("") 74 | 75 | generator.lora = None 76 | torch.manual_seed(1337) 77 | output = generator.generate_simple(prompt, max_new_tokens = 200) 78 | print(output) 79 | 80 | -------------------------------------------------------------------------------- /example_ws.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import websockets 3 | import json 4 | from sentencepiece import SentencePieceProcessor 5 | 6 | from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig 7 | from exllama.lora import ExLlamaLora 8 | from exllama.tokenizer import ExLlamaTokenizer 9 | from exllama.generator import ExLlamaGenerator 10 | import argparse 11 | import torch 12 | import sys 13 | import os 14 | import glob 15 | import model_init 16 | 17 | # Initialized from command line args by init() 18 | 19 | model: ExLlama 20 | cache: ExLlamaCache 21 | config: ExLlamaConfig 22 | generator: ExLlamaGenerator 23 | tokenizer: ExLlamaTokenizer 24 | max_cached_strings = 100 25 | tokenizer_cache = {} 26 | 27 | 28 | prompt_ids: torch.tensor 29 | stop_strings: list 30 | stop_tokens: list 31 | held_text: str 32 | max_stop_string: int 33 | remaining_tokens: int 34 | 35 | full_prompt: str 36 | utilized_prompt: str 37 | built_response: str 38 | 39 | def cached_tokenize(text: str): 40 | global model, cache, config, generator, tokenizer 41 | global max_cached_strings, tokenizer_cache 42 | 43 | if text in tokenizer_cache: 44 | return tokenizer_cache[text] 45 | 46 | while len(tokenizer_cache) >= max_cached_strings: 47 | del tokenizer_cache[next(iter(tokenizer_cache))] # Always removes oldest entry as of Python 3.7 48 | 49 | new_enc = tokenizer.encode(text) 50 | tokenizer_cache[text] = new_enc 51 | return new_enc 52 | 53 | def begin_stream(prompt: str, stop_conditions: list, max_new_tokens: int, gen_settings: ExLlamaGenerator.Settings): 54 | global model, cache, config, generator, tokenizer 55 | global stop_strings, stop_tokens, prompt_ids, held_text, max_stop_string, remaining_tokens 56 | global full_prompt, utilized_prompt, built_response 57 | 58 | # Tokenize prompt and limit length to allow prompt and (max) new tokens within max sequence length 59 | 60 | max_input_tokens = model.config.max_seq_len - max_new_tokens 61 | input_ids = cached_tokenize(prompt) 62 | input_ids = input_ids[:, -max_input_tokens:] 63 | prompt_ids = input_ids 64 | 65 | full_prompt = prompt 66 | utilized_prompt = tokenizer.decode(prompt_ids)[0] 67 | built_response = "" 68 | 69 | remaining_tokens = max_new_tokens 70 | 71 | # Settings 72 | 73 | stop_strings = [] 74 | stop_tokens = [] 75 | for t in stop_conditions: 76 | if isinstance(t, int): stop_tokens += [t] 77 | if isinstance(t, 
str): stop_strings += [t] 78 | 79 | held_text = "" 80 | 81 | max_stop_string = 2 82 | for ss in stop_strings: 83 | max_stop_string = max(max_stop_string, get_num_tokens(ss) + 2) 84 | 85 | generator.settings = gen_settings 86 | 87 | # Start generation 88 | 89 | generator.gen_begin_reuse(input_ids) 90 | 91 | def stream(): 92 | global model, cache, config, generator, tokenizer 93 | global stop_strings, stop_tokens, prompt_ids, held_text, max_stop_string, remaining_tokens 94 | global full_prompt, utilized_prompt, built_response 95 | 96 | # Check total response length 97 | 98 | if remaining_tokens == 0: 99 | return held_text, True, full_prompt + built_response, utilized_prompt + built_response, built_response 100 | remaining_tokens -= 1 101 | 102 | # Generate 103 | 104 | old_tail = tokenizer.decode(generator.sequence_actual[:, -max_stop_string:])[0] 105 | next_token = generator.gen_single_token() 106 | 107 | # End on stop token 108 | 109 | if next_token in stop_tokens: 110 | return held_text, True, full_prompt + built_response, utilized_prompt + built_response, built_response 111 | 112 | # Get new text 113 | 114 | new_tail = tokenizer.decode(generator.sequence_actual[:, -(max_stop_string + 1):])[0] 115 | added_text = new_tail[len(old_tail):] 116 | held_text += added_text 117 | 118 | # Hold text if it's part of a stop condition, end if it's a full stop condition 119 | 120 | partial_ss = False 121 | for ss in stop_strings: 122 | 123 | # Check if held_text fully contains stop string 124 | 125 | position = held_text.find(ss) 126 | if position != -1: 127 | built_response += held_text[:position] 128 | return held_text[:position], True, full_prompt + built_response, utilized_prompt + built_response, built_response 129 | 130 | # Check if end of held_text overlaps with start of stop string 131 | 132 | overlap = 0 133 | for j in range(1, min(len(held_text), len(ss)) + 1): 134 | if held_text[-j:] == ss[:j]: overlap = j 135 | if overlap > 0: partial_ss = True 136 | 137 | # Return partial result 138 | 139 | if partial_ss: 140 | return "", False, full_prompt + built_response, utilized_prompt + built_response, built_response 141 | 142 | stream_text = held_text 143 | held_text = "" 144 | built_response += stream_text 145 | return stream_text, False, full_prompt, utilized_prompt, built_response 146 | 147 | def leftTrimTokens(text: str, desiredLen: int): 148 | 149 | encodedText = tokenizer.encode(text) 150 | if encodedText.shape[-1] <= desiredLen: 151 | return text 152 | else: 153 | return tokenizer.decode(encodedText[:, -desiredLen:])[0] 154 | 155 | def oneshot_generation(prompt: str, stop_conditions: list, max_new_tokens: int, gen_settings: ExLlamaGenerator.Settings): 156 | 157 | begin_stream(prompt, stop_conditions, max_new_tokens, gen_settings) 158 | response = "" 159 | while True: 160 | _, eos, _, _, _ = stream() 161 | if eos: break 162 | 163 | return full_prompt + built_response, utilized_prompt + built_response, built_response 164 | 165 | 166 | def get_num_tokens(text: str): 167 | 168 | return cached_tokenize(text).shape[-1] 169 | 170 | 171 | 172 | 173 | # Websocket server 174 | async def estimateToken(request, ws): 175 | text = request["text"] 176 | numTokens=get_num_tokens(text) 177 | return numTokens# return number of tokens in int 178 | 179 | async def oneShotInfer(request, ws): 180 | stopToken = request["stopToken"] 181 | fullContext = request["text"] 182 | maxNew = int(request["maxNew"]) 183 | top_p = float(request["top_p"]) 184 | top_k = int(request["top_k"]) 185 | temp = float(request["temp"]) 186 
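# The JSON fields read in this handler (text, maxNew, top_p, top_k, temp, rep_pen, stopToken,
# plus action and request_id handled in main()) define the websocket request format. An
# illustrative oneShotInfer request, with example values only, might look like:
#   {"action": "oneShotInfer", "request_id": "1", "text": "Hello", "stopToken": "###",
#    "maxNew": "200", "top_p": "0.65", "top_k": "40", "temp": "0.7", "rep_pen": "1.15"}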
| rep_pen = float(request["rep_pen"]) 187 | sc = [tokenizer.eos_token_id] 188 | sc.append(stopToken) 189 | 190 | gs = ExLlamaGenerator.Settings() 191 | gs.top_k = top_k 192 | gs.top_p = top_p 193 | gs.temperature = temp 194 | gs.token_repetition_penalty_max = rep_pen 195 | 196 | full_ctx, util_ctx, response = oneshot_generation(prompt=fullContext, stop_conditions=sc, max_new_tokens=maxNew, gen_settings=gs) 197 | 198 | return full_ctx, util_ctx, response# return requested prompt/context, pruned prompt/context(eg. prunedctx+maxNew=4096), model generated response, not including prompt 199 | 200 | async def streamInfer(request, ws): 201 | stopToken = [tokenizer.eos_token_id] 202 | stopToken += request["stopToken"].split(',') 203 | prompt = request["text"] 204 | maxNew = int(request["maxNew"]) 205 | top_p = float(request["top_p"]) 206 | top_k = int(request["top_k"]) 207 | temp = float(request["temp"]) 208 | rep_pen = float(request["rep_pen"]) 209 | gs = ExLlamaGenerator.Settings() 210 | gs.top_k = top_k 211 | gs.top_p = top_p 212 | gs.temperature = temp 213 | gs.token_repetition_penalty_max = rep_pen 214 | begin_stream(prompt, stopToken, maxNew, gs) 215 | while True: 216 | chunk, eos, x, y, builtResp = stream() 217 | await ws.send(json.dumps({'action':request["action"], 218 | 'request_id':request['request_id'], 219 | 'utilContext':utilized_prompt + builtResp, 220 | 'response':builtResp})) 221 | if eos: break 222 | return utilized_prompt + built_response,builtResp 223 | 224 | 225 | async def main(websocket, path): 226 | async for message in websocket: 227 | #try: 228 | request = json.loads(message) 229 | reqID = request["request_id"] 230 | action = request["action"] 231 | 232 | if action == "estimateToken": 233 | response = await estimateToken(request, websocket) 234 | await websocket.send(json.dumps({'action':action, 'request_id':reqID, 'response':response})) 235 | 236 | elif action == "echo": 237 | await websocket.send(json.dumps({'action':action, 'request_id':reqID})) 238 | 239 | elif action == "oneShotInfer": 240 | fctx, utlctx, res = await oneShotInfer(request, websocket) 241 | await websocket.send(json.dumps({'action':action, 'request_id':reqID,'utilContext':utlctx, 'response':res})) 242 | 243 | elif action == "leftTrim": 244 | prompt = request["text"] 245 | desiredLen = int(request["desiredLen"]) 246 | processedPrompt = leftTrimTokens(prompt, desiredLen) 247 | await websocket.send(json.dumps({'action':action, 'request_id':reqID, 'response':processedPrompt})) 248 | 249 | else: 250 | utlctx, builtResp= await streamInfer(request, websocket) 251 | await websocket.send(json.dumps({'action':action, 'request_id':reqID,'utilContext':utlctx, 'response':builtResp+''})) 252 | 253 | 254 | 255 | #except Exception as e: 256 | #print({"error": str(e)}) 257 | 258 | model_directory = "./models/Llama-2-70B-chat-GPTQ/" 259 | 260 | tokenizer_path = os.path.join(model_directory, "tokenizer.model") 261 | model_config_path = os.path.join(model_directory, "config.json") 262 | st_pattern = os.path.join(model_directory, "*.safetensors") 263 | model_path = glob.glob(st_pattern)[0] 264 | esTokenizer = SentencePieceProcessor(model_file = tokenizer_path) 265 | config = ExLlamaConfig(model_config_path) # create config from config.json 266 | config.set_auto_map('17.615,18.8897') 267 | config.model_path = model_path # supply path to model weights file 268 | 269 | model = ExLlama(config) # create ExLlama instance and load the weights 270 | print(f"Model loaded: {model_path}") 271 | 272 | tokenizer = 
ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file 273 | cache = ExLlamaCache(model) # create cache for inference 274 | generator = ExLlamaGenerator(model, tokenizer, cache) # create generator 275 | start_server = websockets.serve(main, "0.0.0.0", 8080) 276 | 277 | asyncio.get_event_loop().run_until_complete(start_server) 278 | asyncio.get_event_loop().run_forever() 279 | -------------------------------------------------------------------------------- /exllama/__init__.py: -------------------------------------------------------------------------------- 1 | from . import cuda_ext, generator, model, tokenizer 2 | -------------------------------------------------------------------------------- /exllama/cuda_ext.py: -------------------------------------------------------------------------------- 1 | # from abc import ABC 2 | import torch 3 | from torch.cuda.amp import custom_bwd, custom_fwd 4 | from torch.utils.cpp_extension import load 5 | import os 6 | import sys 7 | import platform 8 | 9 | import exllama_ext 10 | # from exllama_ext import set_tuning_params 11 | # from exllama_ext import prepare_buffers 12 | from exllama_ext import make_q4 13 | from exllama_ext import q4_matmul 14 | from exllama_ext import q4_matmul_lora 15 | from exllama_ext import half_matmul 16 | from exllama_ext import half_matmul_cublas 17 | # from exllama_ext import q4_mlp 18 | from exllama_ext import rms_norm 19 | from exllama_ext import rope_ 20 | from exllama_ext import rep_penalty 21 | from exllama_ext import apply_rep_penalty 22 | 23 | 24 | # Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension 25 | 26 | none_tensor = torch.empty((1, 1), device = "meta") 27 | 28 | 29 | # Construct Q4Matrix, return handle 30 | 31 | def ext_make_q4(qweight, qzeros, scales, g_idx, device): 32 | 33 | return make_q4(qweight, 34 | qzeros, 35 | scales, 36 | g_idx if g_idx is not None else none_tensor, 37 | device) 38 | 39 | 40 | # Matrix multiplication, returns x @ q4 41 | 42 | def ext_q4_matmul(x, q4, q4_width, lora_A = None, lora_B = None): 43 | 44 | outshape = x.shape[:-1] + (q4_width,) 45 | x = x.view(-1, x.shape[-1]) 46 | output = torch.empty((x.shape[0], q4_width), dtype = torch.float16, device = x.device) 47 | 48 | if lora_A is None: 49 | q4_matmul(x, q4, output) 50 | else: 51 | lora_temp = torch.empty((x.shape[0], lora_A.shape[1]), dtype = torch.float16, device = x.device) 52 | q4_matmul_lora(x, q4, output, lora_A, lora_B, lora_temp) 53 | 54 | return output.view(outshape) 55 | 56 | 57 | # Matrix multiplication, returns x @ w, both half-precision tensors 58 | 59 | def ext_half_matmul(x, w, cublas = False): 60 | 61 | outshape = x.shape[:-1] + (w.shape[1],) 62 | x = x.view(-1, x.shape[-1]) 63 | 64 | if cublas: 65 | output = torch.empty((x.shape[0], w.shape[1]), dtype = torch.float16, device = x.device) 66 | half_matmul_cublas(x, w, output) 67 | else: 68 | output = torch.zeros((x.shape[0], w.shape[1]), dtype = torch.float16, device = x.device) 69 | half_matmul(x, w, output) 70 | 71 | return output.view(outshape) ## 72 | 73 | 74 | # RoPE embeddings, in_place 75 | 76 | def ext_rope_(x, sin, cos, past_len, num_heads, head_dim): 77 | 78 | rope_(x, sin, cos, past_len, num_heads, head_dim) 79 | 80 | 81 | # RMS norm: x = x * w / sqrt(row_mean(x * x) + epsilon) 82 | 83 | def ext_rms_norm(x, w, epsilon): 84 | 85 | outshape = x.shape 86 | x = x.view(-1, x.shape[-1]) 87 | output = torch.empty_like(x) 88 | rms_norm(x, w, output, epsilon) 89 | 90 | return output.view(outshape) 91 
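# For reference, the formula in the comment above ext_rms_norm corresponds to this
# pure-PyTorch sketch (illustration only, not used by the extension; assumes w
# broadcasts over the last dimension of x):
#
#   def rms_norm_reference(x, w, epsilon):
#       rms = torch.sqrt(torch.mean(x.float() * x.float(), dim = -1, keepdim = True) + epsilon)
#       return (x.float() / rms).to(torch.float16) * w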
| 92 | def ext_rms_norm_(x, w, epsilon): 93 | 94 | outshape = x.shape 95 | x = x.view(-1, x.shape[-1]) 96 | rms_norm(x, w, x, epsilon) 97 | 98 | 99 | # Repetition penalty 100 | 101 | def ext_rep_penalty_mask_cpu(vocab_size, sequence, penalty_max, sustain, decay): 102 | 103 | rep_mask = torch.empty(vocab_size, dtype = torch.float32) 104 | rep_penalty(sequence, rep_mask, penalty_max, sustain, decay) 105 | return rep_mask 106 | 107 | 108 | def ext_apply_rep_penalty_mask_cpu(sequence, penalty_max, sustain, decay, logits): 109 | 110 | apply_rep_penalty(sequence, penalty_max, sustain, decay, logits) 111 | 112 | -------------------------------------------------------------------------------- /exllama/lora.py: -------------------------------------------------------------------------------- 1 | from .model import ExLlamaConfig, Ex4bitLinear 2 | import torch 3 | import json 4 | from safetensors.torch import load_file as safe_load_file 5 | from torch import load as load_file 6 | 7 | class ExLlamaLora: 8 | 9 | lora_config_path: str 10 | lora_path: str 11 | lora_r: int 12 | lora_alpha: float 13 | lora_scaling: float 14 | config: ExLlamaConfig 15 | tensors: dict[torch.tensor] 16 | bias_ignored: bool 17 | 18 | def __init__(self, model, lora_config_path, lora_path): 19 | 20 | self.lora_config_path = lora_config_path 21 | self.lora_path = lora_path 22 | self.model = model 23 | self.config = model.config 24 | self.tensors = {} 25 | self.bias_ignored = False 26 | 27 | # Grab relevant items from LoRA config 28 | 29 | with open(lora_config_path) as f: 30 | read_config = json.load(f) 31 | 32 | self.lora_r = read_config["r"] 33 | self.lora_alpha = float(read_config["lora_alpha"]) 34 | self.lora_scaling = self.lora_alpha / self.lora_r 35 | 36 | if "fan_in_fan_out" in read_config and read_config["fan_in_fan_out"]: 37 | raise ValueError(" ## Error: fan_in_fan_out mode not supported.") 38 | 39 | # Load LoRA weights 40 | 41 | if self.lora_path.endswith(".safetensors"): 42 | f = safe_load_file(self.lora_path, device = "cpu") 43 | else: 44 | f = load_file(self.lora_path, map_location = "cpu") 45 | 46 | for key in f.keys(): 47 | tensor = f[key] 48 | 49 | # Find target 50 | 51 | i = key.find("model.layers.") 52 | if i == -1: raise ValueError(f" ## Error: unsupported layer in {self.lora_path}: {key}") 53 | 54 | target_key = key[i:] 55 | ks = target_key.split(".") 56 | decoder_idx = int(ks[2]) 57 | decoder_part = ks[3] 58 | decoder_layer = ks[4] 59 | lora_half = ks[5] 60 | 61 | if lora_half == "bias": 62 | epsilon = 1e-6 63 | if torch.max(tensor) > epsilon or torch.max(tensor) < -epsilon: 64 | raise ValueError(f" ## Error: unsupported bias target {self.lora_path}: {key}") 65 | self.bias_ignored = True 66 | continue 67 | 68 | target_module = self.model.layers[decoder_idx] 69 | if decoder_part == "self_attn": target_module = target_module.self_attn 70 | elif decoder_part == "mlp": target_module = target_module.mlp 71 | else: raise ValueError(f" ## Error: unsupported layer in {self.lora_path}: {key}") 72 | 73 | if decoder_layer == "q_proj": target_module = target_module.q_proj 74 | elif decoder_layer == "k_proj": target_module = target_module.k_proj 75 | elif decoder_layer == "v_proj": target_module = target_module.v_proj 76 | elif decoder_layer == "o_proj": target_module = target_module.o_proj 77 | elif decoder_layer == "gate_proj": target_module = target_module.gate_proj 78 | elif decoder_layer == "up_proj": target_module = target_module.up_proj 79 | elif decoder_layer == "down_proj": target_module = target_module.down_proj 
80 | else: raise ValueError(f" ## Error: unsupported layer in {self.lora_path}: {key}") 81 | 82 | # Check that shape is compatible 83 | 84 | assert isinstance(target_module, Ex4bitLinear) 85 | 86 | if lora_half == "lora_A": 87 | in_features = tensor.shape[1] 88 | out_features = None 89 | elif lora_half == "lora_B": 90 | in_features = None 91 | out_features = tensor.shape[0] 92 | else: raise ValueError(f" ## Error: unsupported layer in {self.lora_path}: {key}") 93 | 94 | if (in_features and in_features != target_module.in_features) or (out_features and out_features != target_module.out_features): 95 | raise ValueError(f" ## Error: incompatible tensor shape in {self.lora_path}: {key}") 96 | 97 | # For efficiency, transpose adapter instead of transposing state during inference 98 | 99 | tensor = tensor.T.contiguous() 100 | 101 | # Pre-scale 102 | 103 | if lora_half == "lora_B" and self.lora_scaling != 1.0: tensor.mul_(self.lora_scaling) 104 | 105 | # Check that dtype is compatible, or convert 106 | 107 | if tensor.dtype == torch.bfloat16: 108 | tensor = tensor.to(torch.float16) 109 | 110 | elif tensor.dtype == torch.float32: 111 | tensor = tensor.to(torch.float16) 112 | 113 | elif tensor.dtype == torch.float16: 114 | pass 115 | 116 | else: raise ValueError(f" ## Error: unsupported tensor dtype in {self.lora_path}") 117 | 118 | # Move to target device 119 | 120 | device = self.config.device_map.map(target_key) 121 | tensor = tensor.to(device, non_blocking = True) 122 | 123 | # Store adapter tensor 124 | 125 | self.tensors[target_key] = tensor 126 | -------------------------------------------------------------------------------- /exllama/tokenizer.py: -------------------------------------------------------------------------------- 1 | from sentencepiece import SentencePieceProcessor 2 | import os 3 | import torch 4 | 5 | class ExLlamaTokenizer: 6 | 7 | def __init__(self, tokenizer_model_path): 8 | 9 | self.path = tokenizer_model_path 10 | self.tokenizer = SentencePieceProcessor(model_file = self.path) 11 | 12 | self.unk_token = "<unk>" 13 | self.bos_token = "<s>" 14 | self.eos_token = "</s>" 15 | self.unk_token_id = self.tokenizer.unk_id() # is the same as pad token id... 
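# Note on the hard-coded ids below: the standard LLaMA SentencePiece model defines no
# dedicated padding token, so id 0 (the <unk> id) is reused as pad_token_id, and id 13
# is the newline byte token referenced by newline_token_id. Other vocabularies may differ.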
16 | self.eos_token_id = self.tokenizer.eos_id() 17 | self.bos_token_id = self.tokenizer.bos_id() 18 | self.pad_token_id = 0 # self.tokenizer.pad_id() 19 | self.newline_token_id = 13 20 | 21 | self.special_characters = [(self.bos_token, self.bos_token_id), (self.eos_token, self.eos_token_id), (self.unk_token, self.unk_token_id)] # for tokenzier encoding 22 | 23 | # Encode string 24 | 25 | def encode(self, text, return_mask = False, max_seq_len = 2048, add_bos = False, add_eos = False, encode_special_characters = False): 26 | 27 | if isinstance(text, list): 28 | 29 | # text is a list of strings 30 | 31 | list_ids = self.tokenizer.EncodeAsIds(text) 32 | 33 | # pad bos and eos 34 | 35 | if add_bos: 36 | for ids in list_ids: ids.insert(0, self.bos_token_id) 37 | if add_eos: 38 | for ids in list_ids: ids.append(self.eos_token_id) 39 | 40 | max_length = max([len(ids) for ids in list_ids]) 41 | 42 | needs_mask = False 43 | padded_ids = [] 44 | for ids in list_ids: 45 | if len(ids) != len(list_ids[0]): needs_mask = True 46 | padding = torch.full((max_length - len(ids),), self.pad_token_id) 47 | sequence = torch.tensor(ids) 48 | padded_ids.append(torch.cat((padding, sequence), dim = 0).long()) 49 | 50 | stacked_ids = torch.stack(padded_ids, dim = 0) 51 | 52 | if return_mask: 53 | if needs_mask: 54 | mask_padding = torch.full((stacked_ids.shape[0], max_seq_len - stacked_ids.shape[1]), True, dtype = torch.bool, device = "cpu") 55 | mask = stacked_ids != 0 56 | mask = torch.cat((mask, mask_padding), dim = 1) 57 | return stacked_ids, mask 58 | else: 59 | return stacked_ids, None 60 | else: 61 | return stacked_ids 62 | 63 | else: 64 | 65 | # text is a single string 66 | split_text = [text] 67 | 68 | # look for special characters 69 | if encode_special_characters: 70 | for special_character, special_token_id in self.special_characters: 71 | temp_text = [] 72 | for segment in split_text: 73 | if isinstance(segment, str) and special_character in segment: 74 | # for each special character, append the text before the special character, then append the special character ID, then the rest of the text 75 | parts = segment.split(special_character) 76 | new_parts = [] 77 | for i, part in enumerate(parts): 78 | new_parts.append(part) 79 | if i < len(parts) - 1: # add the special token id between parts, but not after the last part 80 | new_parts.append(special_token_id) 81 | temp_text.extend(new_parts) 82 | else: 83 | temp_text.append(segment) 84 | split_text = temp_text 85 | 86 | ids = [] 87 | 88 | for text_chunk in split_text: 89 | if isinstance(text_chunk, str): 90 | ids += self.tokenizer.EncodeAsIds(text_chunk) 91 | else: 92 | ids.append(text_chunk) 93 | 94 | # pad bos and eos 95 | 96 | if add_bos: 97 | ids = [self.bos_token_id] + ids 98 | if add_eos: 99 | ids = ids + [self.eos_token_id] 100 | 101 | stacked_ids = torch.tensor(ids).unsqueeze(0) 102 | 103 | if return_mask: 104 | return stacked_ids, None 105 | else: 106 | return stacked_ids 107 | 108 | def decode(self, ids, decode_special_characters=False): 109 | 110 | special_ids = {id_: char for char, id_ in self.special_characters} # create a lookup dictionary 111 | 112 | if ids.dim() > 1: 113 | 114 | texts = [] 115 | for i in range(ids.shape[0]): 116 | seq = ids[i].tolist() 117 | seq = [t for t in seq if t != self.pad_token_id] 118 | 119 | if decode_special_characters: 120 | text_parts = [] 121 | normal_ids = [] # list of lists 122 | current_normal_ids = [] # current list of normal IDs 123 | for idx, id_ in enumerate(seq): 124 | if id_ in special_ids: 125 | # 
Save the current list of normal IDs, then start a new one 126 | normal_ids.append(current_normal_ids) 127 | current_normal_ids = [] 128 | # Store special token as a string 129 | text_parts.append(special_ids[id_]) 130 | else: 131 | current_normal_ids.append(id_) 132 | normal_ids.append(current_normal_ids) # save the last segment of normal IDs 133 | 134 | decoded_segments = [self.tokenizer.Decode(segment) for segment in normal_ids] 135 | for idx, decoded_segment in enumerate(decoded_segments): 136 | text_parts.insert(2*idx, decoded_segment) 137 | 138 | texts.append("".join(text_parts)) 139 | else: 140 | if self.eos_token_id in seq: # to not mess up special char decoding 141 | seq = seq[:seq.index(self.eos_token_id)] 142 | texts.append(self.tokenizer.Decode(seq)) 143 | 144 | return texts 145 | 146 | else: 147 | 148 | ids = ids.tolist() 149 | 150 | if decode_special_characters: 151 | 152 | text_parts = [] 153 | normal_ids = [] # list of lists 154 | current_normal_ids = [] # current list of normal IDs 155 | for idx, id_ in enumerate(ids): 156 | if id_ in special_ids: 157 | # Save the current list of normal IDs, then start a new one 158 | normal_ids.append(current_normal_ids) 159 | current_normal_ids = [] 160 | # Store special token as a string 161 | text_parts.append(special_ids[id_]) 162 | else: 163 | current_normal_ids.append(id_) 164 | normal_ids.append(current_normal_ids) # save the last segment of normal IDs 165 | 166 | decoded_segments = [self.tokenizer.Decode(segment) for segment in normal_ids] 167 | for idx, decoded_segment in enumerate(decoded_segments): 168 | text_parts.insert(2*idx, decoded_segment) 169 | 170 | text = "".join(text_parts) 171 | 172 | else: 173 | 174 | text = self.tokenizer.Decode(ids) 175 | 176 | return text 177 | 178 | 179 | def num_tokens(self, text, encode_special_characters = False): 180 | 181 | if encode_special_characters: 182 | 183 | ids = self.encode(text, encode_special_characters = True) 184 | return ids.size(1) 185 | 186 | else: 187 | 188 | ids = self.tokenizer.Encode(text) 189 | return len(ids) -------------------------------------------------------------------------------- /exllama_ext/cpu_func/rep_penalty.cpp: -------------------------------------------------------------------------------- 1 | #include "rep_penalty.h" 2 | #include 3 | #include 4 | 5 | void rep_penalty_cpu 6 | ( 7 | const int vocab_size, 8 | const uint64_t* sequence, 9 | float* rep_mask, 10 | const float penalty_max, 11 | const int sustain, 12 | const int decay, 13 | const int seq_len 14 | ) 15 | { 16 | float v = penalty_max; 17 | float dv = decay ? (1.0f - penalty_max) / (float) decay : 0.0f; 18 | 19 | int s = sustain == -1 ? 
seq_len : sustain; 20 | int beg = seq_len - s - decay; 21 | if (beg < 0) beg = 0; 22 | 23 | for (int i = 0; i < vocab_size; i++) rep_mask[i] = 1.0f; 24 | 25 | for (int i = seq_len; i > beg;) 26 | { 27 | uint64_t t = sequence[--i]; 28 | if (v > rep_mask[t]) rep_mask[t] = v; 29 | if (--s < 0) v += dv; 30 | } 31 | } 32 | 33 | bool* g_rep_mask = NULL; 34 | int g_vocab_size = 0; 35 | 36 | void apply_rep_penalty_cpu 37 | ( 38 | const int vocab_size, 39 | const uint64_t* sequence, 40 | const float penalty_max, 41 | const int sustain, 42 | const int decay, 43 | const int seq_len, 44 | float* logits 45 | ) 46 | { 47 | if (vocab_size != g_vocab_size) 48 | { 49 | if (g_rep_mask) free(g_rep_mask); 50 | g_vocab_size = vocab_size; 51 | g_rep_mask = (bool*) malloc(g_vocab_size * sizeof(bool)); 52 | } 53 | 54 | memset(g_rep_mask, 0, g_vocab_size * sizeof(bool)); 55 | 56 | float v = penalty_max; 57 | float dv = decay ? (1.0f - penalty_max) / (float) decay : 0.0f; 58 | 59 | int s = sustain == -1 ? seq_len : sustain; 60 | int beg = seq_len - s - decay; 61 | if (beg < 0) beg = 0; 62 | 63 | for (int i = seq_len; i > beg;) 64 | { 65 | uint64_t t = sequence[--i]; 66 | if (!g_rep_mask[t]) 67 | { 68 | if (logits[t] > 0.0) logits[t] /= v; 69 | else logits[t] *= v; 70 | g_rep_mask[t] = true; 71 | } 72 | if (--s < 0) v += dv; 73 | } 74 | } -------------------------------------------------------------------------------- /exllama_ext/cpu_func/rep_penalty.h: -------------------------------------------------------------------------------- 1 | #ifndef _rep_penalty_h 2 | #define _rep_penalty_h 3 | 4 | #include 5 | #include 6 | 7 | void rep_penalty_cpu 8 | ( 9 | const int vocab_size, 10 | const uint64_t* sequence, 11 | float* rep_mask, 12 | const float penalty_max, 13 | const int sustain, 14 | const int decay, 15 | const int seq_len 16 | ); 17 | 18 | void apply_rep_penalty_cpu 19 | ( 20 | const int vocab_size, 21 | const uint64_t* sequence, 22 | const float penalty_max, 23 | const int sustain, 24 | const int decay, 25 | const int seq_len, 26 | float* logits 27 | ); 28 | 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /exllama_ext/cuda_buffers.cu: -------------------------------------------------------------------------------- 1 | #define _cuda_buffers_cu 2 | #include "cuda_buffers.cuh" 3 | 4 | CudaBuffers* g_buffers[CUDA_MAX_DEVICES] = {NULL}; 5 | // __constant__ half2 q4_table[16][256]; 6 | // half2 q4_table_host[16][256]; 7 | // bool q4_table_init = false; 8 | 9 | CudaBuffers::CudaBuffers 10 | ( 11 | int _device, 12 | half* _temp_state, 13 | int _temp_state_size, 14 | half* _temp_mlp, 15 | float* _temp_zeros_float, 16 | half* _temp_dq, 17 | int _max_zeros_float 18 | ) : 19 | device(_device), 20 | temp_state(_temp_state), 21 | temp_state_size(_temp_state_size), 22 | temp_mlp(_temp_mlp), 23 | temp_zeros_float(_temp_zeros_float), 24 | temp_dq(_temp_dq), 25 | max_zeros_float(_max_zeros_float), 26 | current_zeros_float(0) 27 | { 28 | cudaSetDevice(_device); 29 | 30 | cudaStreamCreate(&alt_stream_1); 31 | cudaStreamCreate(&alt_stream_2); 32 | cudaStreamCreate(&alt_stream_3); 33 | cudaEventCreate(&alt_stream_1_done); 34 | cudaEventCreate(&alt_stream_2_done); 35 | cudaEventCreate(&alt_stream_3_done); 36 | } 37 | 38 | CudaBuffers::~CudaBuffers() 39 | { 40 | cudaStreamDestroy(alt_stream_1); 41 | cudaStreamDestroy(alt_stream_2); 42 | cudaStreamDestroy(alt_stream_3); 43 | cudaEventDestroy(alt_stream_1_done); 44 | cudaEventDestroy(alt_stream_2_done); 45 | 
cudaEventDestroy(alt_stream_3_done); 46 | } 47 | 48 | float* CudaBuffers::get_zeros_float(const int num_zeros) 49 | { 50 | if (current_zeros_float + num_zeros >= max_zeros_float) 51 | { 52 | current_zeros_float = 0; 53 | cudaMemsetAsync(temp_zeros_float, 0, max_zeros_float * sizeof(float)); 54 | } 55 | 56 | float* zeros = temp_zeros_float + current_zeros_float; 57 | current_zeros_float += num_zeros; 58 | return zeros; 59 | } 60 | 61 | CudaBuffers* get_buffers(const int device_index) 62 | { 63 | return g_buffers[device_index]; 64 | } 65 | 66 | void prepare_buffers_cuda 67 | ( 68 | int _device, 69 | half* _temp_state, 70 | int _temp_state_size, 71 | half* _temp_mlp, 72 | float* _temp_zeros_float, 73 | half* _temp_dq, 74 | int _max_zeros_float 75 | ) 76 | { 77 | CudaBuffers* buffers = new CudaBuffers 78 | ( 79 | _device, 80 | _temp_state, 81 | _temp_state_size, 82 | _temp_mlp, 83 | _temp_zeros_float, 84 | _temp_dq, 85 | _max_zeros_float 86 | ); 87 | 88 | g_buffers[_device] = buffers; 89 | } 90 | 91 | void cleanup_buffers_cuda() 92 | { 93 | for (int i = 0; i < CUDA_MAX_DEVICES; i++) 94 | { 95 | if (!g_buffers[i]) continue; 96 | delete g_buffers[i]; 97 | g_buffers[i] = NULL; 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /exllama_ext/cuda_buffers.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _cuda_buffers_cuh 2 | #define _cuda_buffers_cuh 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | const int CUDA_MAX_DEVICES = 16; 10 | 11 | // #ifndef _cuda_buffers_cu 12 | // extern __constant__ half2 q4_table[16][256]; 13 | // #endif 14 | 15 | class CudaBuffers 16 | { 17 | public: 18 | int device; 19 | 20 | half* temp_state; // [max_hidden_rows * intermediate_size] 21 | int temp_state_size; 22 | half* temp_mlp; // [hidden_dim * intermediate_size] 23 | float* temp_zeros_float; // [max_hidden_rows] 24 | half* temp_dq; // size of largest quant tensor * 8 25 | 26 | int current_zeros_float; 27 | int max_zeros_float; 28 | 29 | cudaStream_t alt_stream_1; 30 | cudaStream_t alt_stream_2; 31 | cudaStream_t alt_stream_3; 32 | cudaEvent_t alt_stream_1_done; 33 | cudaEvent_t alt_stream_2_done; 34 | cudaEvent_t alt_stream_3_done; 35 | 36 | CudaBuffers 37 | ( 38 | int _device, 39 | half* _temp_state, 40 | int _temp_state_size, 41 | half* _temp_mlp, 42 | float* _temp_zeros_float, 43 | half* _temp_dq, 44 | int _max_zeros_float 45 | ); 46 | ~CudaBuffers(); 47 | 48 | float* get_zeros_float(const int num_zeros); 49 | }; 50 | 51 | CudaBuffers* get_buffers(const int device_index); 52 | 53 | void prepare_buffers_cuda 54 | ( 55 | int _device, 56 | half* _temp_state, 57 | int _temp_state_size, 58 | half* _temp_mlp, 59 | float* _temp_zeros_float, 60 | half* _temp_dq, 61 | int _max_zeros_float 62 | ); 63 | 64 | void cleanup_buffers_cuda(); 65 | 66 | #endif -------------------------------------------------------------------------------- /exllama_ext/cuda_compat.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _cuda_compat_cuh 2 | #define _cuda_compat_cuh 3 | 4 | // atomicAdd for half types, to support CC < 7.x 5 | 6 | __device__ __forceinline__ void atomicAdd_half(half* address, half val) 7 | { 8 | unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2)); 9 | unsigned int old = *address_as_ui; 10 | unsigned int assumed; 11 | 12 | do 13 | { 14 | assumed = old; 15 | __half_raw hsum; 16 | hsum.x = (size_t)address & 2 ? 
(old >> 16) : (old & 0xffff); 17 | half tmpres = __hadd(hsum, val); 18 | hsum = __half_raw(tmpres); 19 | old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; 20 | old = atomicCAS(address_as_ui, assumed, old); 21 | } 22 | while (assumed != old); 23 | } 24 | 25 | // atomicAdd for half2 types 26 | 27 | __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val) 28 | { 29 | unsigned int* address_as_ui = (unsigned int*)address; 30 | unsigned int old = *address_as_ui; 31 | unsigned int assumed; 32 | do 33 | { 34 | assumed = old; 35 | half2 old_val = *((half2*)&old); 36 | half2 new_val = __hadd2(old_val, val); 37 | old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val)); 38 | } 39 | while (assumed != old); 40 | } 41 | 42 | // 43 | 44 | #if defined(__CUDA_ARCH__) || defined(USE_ROCM) 45 | #if __CUDA_ARCH__ < 700 || defined(USE_ROCM) 46 | 47 | __device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); } 48 | 49 | #if __CUDA_ARCH__ < 600 || defined(USE_ROCM) 50 | __device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); } 51 | #endif 52 | 53 | #endif 54 | #endif 55 | 56 | #endif 57 | -------------------------------------------------------------------------------- /exllama_ext/cuda_func/column_remap.cu: -------------------------------------------------------------------------------- 1 | #include "column_remap.cuh" 2 | #include "../util.cuh" 3 | 4 | const int SHUF_BLOCKSIZE_X = 256; 5 | const int SHUF_BLOCKSIZE_Y = 16; 6 | 7 | __global__ void column_remap_kernel 8 | ( 9 | const half* __restrict__ x, 10 | half* __restrict__ x_new, 11 | const int x_width, 12 | const int x_height, 13 | const uint32_t* x_map 14 | ) 15 | { 16 | int x_column = SHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x; 17 | int x_row = SHUF_BLOCKSIZE_Y * blockIdx.y; 18 | if (x_column >= x_width) return; 19 | //if (x_row >= x_height) return; 20 | 21 | int x_stride = x_width; 22 | int x_idx = x_row * x_stride + x_column; 23 | 24 | int x_row_end = min(x_row + SHUF_BLOCKSIZE_Y, x_height); 25 | int x_idx_end = x_row_end * x_stride + x_column; 26 | 27 | int s_column = x_map[x_column]; 28 | int s_idx = x_row * x_stride + s_column; 29 | 30 | while (x_idx < x_idx_end) 31 | { 32 | x_new[x_idx] = x[s_idx]; 33 | x_idx += x_stride; 34 | s_idx += x_stride; 35 | } 36 | } 37 | 38 | // Remap columns in x to correspond to sequential group index before matmul 39 | // 40 | // perform x -> seq_x such that seq_x @ seq_w == x @ w 41 | 42 | void column_remap_cuda 43 | ( 44 | const half* x, 45 | half* x_new, 46 | const int x_height, 47 | const int x_width, 48 | const uint32_t* x_map 49 | ) 50 | { 51 | dim3 threads(SHUF_BLOCKSIZE_X, 1, 1); 52 | 53 | dim3 blocks 54 | ( 55 | (x_width + SHUF_BLOCKSIZE_X - 1) / SHUF_BLOCKSIZE_X, 56 | (x_height + SHUF_BLOCKSIZE_Y - 1) / SHUF_BLOCKSIZE_Y, 57 | 1 58 | ); 59 | 60 | column_remap_kernel<<>>(x, x_new, x_width, x_height, x_map); 61 | } 62 | -------------------------------------------------------------------------------- /exllama_ext/cuda_func/column_remap.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _column_remap_cuh 2 | #define _column_remap_cuh 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | void column_remap_cuda 9 | ( 10 | const half* x, 11 | half* x_new, 12 | const int x_height, 13 | const int x_width, 14 | const uint32_t* x_map 15 | ); 16 | 17 | #endif 
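As the comment in column_remap.cu above states, the kernel produces seq_x such that seq_x @ seq_w == x @ w once the quantized weight rows have been reordered into sequential groups (see make_sequential in q4_matrix.cu further below): for every row, output column c takes its value from input column x_map[c]. The following host-side reference is a sketch for illustration only, not part of the extension; it uses plain float where the kernel operates on half:

#include <cstdint>
#include <vector>

// Reference semantics of column_remap_cuda: x_new[row][col] = x[row][x_map[col]]
static void column_remap_reference(const std::vector<float>& x, std::vector<float>& x_new,
                                   int height, int width, const std::vector<uint32_t>& x_map)
{
    for (int row = 0; row < height; row++)
        for (int col = 0; col < width; col++)
            x_new[row * width + col] = x[row * width + x_map[col]];
}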
-------------------------------------------------------------------------------- /exllama_ext/cuda_func/half_matmul.cu: -------------------------------------------------------------------------------- 1 | #include "half_matmul.cuh" 2 | #include "../util.cuh" 3 | #include "../matrix.cuh" 4 | #include "../cuda_compat.cuh" 5 | #if defined(USE_ROCM) 6 | #include "../hip_compat.cuh" 7 | #endif 8 | 9 | // Block size 10 | 11 | const int THREADS_X = 32; // Block size and thread count along columns in w and out 12 | const int THREADS_Y = 8; // Block size and thread count along rows in x and out 13 | const int BLOCKSIZE = 256; 14 | 15 | __global__ void half_matmul_kernel 16 | ( 17 | const half* __restrict__ x, 18 | const half* __restrict__ w, 19 | half* __restrict__ out, 20 | const int height, 21 | const int dim, 22 | const int width 23 | ) 24 | { 25 | const int column = (blockIdx.x * THREADS_X + threadIdx.x) * 2; 26 | const int row = blockIdx.y * THREADS_Y + threadIdx.y; 27 | const int k0 = blockIdx.z * BLOCKSIZE; 28 | 29 | if (row >= height) return; 30 | if (column >= width) return; 31 | 32 | MatrixView_half x_(x, height, dim); 33 | MatrixView_half w_(w, dim, width); 34 | MatrixView_half_rw out_(out, height, width); 35 | 36 | half2* x_ptr = (half2*) x_.item_ptr(row, k0); 37 | half2* w_ptr = (half2*) w_.item_ptr(k0, column); 38 | half2 acc = {}; 39 | 40 | #pragma unroll 41 | for (int k = k0; k < k0 + BLOCKSIZE / 2; k++) 42 | { 43 | half2 x_item = *x_ptr++; 44 | half2 x_item_0 = __half2half2(x_item.x); 45 | half2 x_item_1 = __half2half2(x_item.y); 46 | half2 w_item_0 = *w_ptr; w_ptr += w_.width / 2; 47 | half2 w_item_1 = *w_ptr; w_ptr += w_.width / 2; 48 | acc = __hfma2(x_item_0, w_item_0, acc); 49 | acc = __hfma2(x_item_1, w_item_1, acc); 50 | } 51 | 52 | // out_.set(row, column, acc); 53 | atomicAdd((half2*)out_.item_ptr(row, column), acc); 54 | } 55 | 56 | void half_matmul_cuda 57 | ( 58 | const half* x, 59 | const half* w, 60 | half* out, 61 | const int height, 62 | const int dim, 63 | const int width, 64 | cudaStream_t alt_stream 65 | ) 66 | { 67 | dim3 threads(THREADS_X, THREADS_Y, 1); 68 | 69 | dim3 blocks 70 | ( 71 | (width + THREADS_X - 1) / THREADS_X / 2, 72 | (height + THREADS_Y - 1) / THREADS_Y, 73 | (dim + BLOCKSIZE - 1) / BLOCKSIZE 74 | ); 75 | 76 | half_matmul_kernel<<>>(x, w, out, height, dim, width); 77 | } 78 | 79 | // cuBLAS can't be beat for large matrices, probably 80 | 81 | const int MAX_DIM_SMALL = 8192; 82 | 83 | void half_matmul_cublas_cuda 84 | ( 85 | ExLlamaTuning* tuningParams, 86 | const half* x, 87 | const half* w, 88 | half* out, 89 | const int height, 90 | const int dim, 91 | const int width, 92 | cublasHandle_t handle, 93 | bool no_zero, 94 | cudaStream_t alt_stream 95 | ) 96 | { 97 | // Fall back on a naive kernel for small matmuls to avoid cuBLAS overhead 98 | 99 | if (height < 4 && dim <= MAX_DIM_SMALL) 100 | { 101 | half_matmul_small_cuda(tuningParams, x, w, out, height, dim, width, no_zero, alt_stream); 102 | return; 103 | } 104 | 105 | // printf("cuBLAS: (%i, %i) @ (%i, %i) -> (%i, %i)\n", height, dim, dim, width, height, width); 106 | 107 | // Use cuBLAS 108 | 109 | const half alpha = __float2half(1.0f); 110 | const half beta = no_zero ? 
__float2half(1.0f) : __float2half(0.0f); 111 | 112 | cudaStream_t default_stream; 113 | if (alt_stream) 114 | { 115 | cublasGetStream(handle, &default_stream); 116 | cublasSetStream(handle, alt_stream); 117 | } 118 | 119 | cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, w, width, x, dim, &beta, out, width); 120 | 121 | if (alt_stream) 122 | { 123 | cublasSetStream(handle, default_stream); 124 | } 125 | } 126 | 127 | // Alternative to cuBLAS for tall or wide matrices 128 | 129 | const int S_THREADS_X = 8; // width 130 | const int S_THREADS_Z = 1; // height 131 | const int S_BLOCKSIZE = MAX_DIM_SMALL / 1024 * S_THREADS_X; // dim 132 | 133 | template 134 | __global__ void half_matmul_small_kernel 135 | ( 136 | const half* __restrict__ x, 137 | const half* __restrict__ w, 138 | half* __restrict__ out, 139 | const int height, 140 | const int dim, 141 | const int width, 142 | bool no_zero 143 | ) 144 | { 145 | int column = blockIdx.x * S_THREADS_X + threadIdx.x; 146 | int row = blockIdx.z * S_THREADS_Z + threadIdx.z; 147 | int k = threadIdx.y * S_BLOCKSIZE; 148 | 149 | if (row >= height) return; 150 | if (column >= width) return; 151 | // if (k >= dim) return; 152 | // printf("%i, %i, %i\n", row, column, k); 153 | 154 | MatrixView_half x_(x, height, dim); 155 | MatrixView_half w_(w, dim, width); 156 | MatrixView_half_rw out_(out, height, width); 157 | 158 | int k_end = k + S_BLOCKSIZE; 159 | if (k_end > dim) k_end = dim; 160 | 161 | const half* x_ptr = x_.item_ptr(row, k); 162 | const half* x_ptr_end = x_.item_ptr(row, k_end); 163 | const half* w_ptr = w_.item_ptr(k, column); 164 | half* out_ptr = out_.item_ptr(row, column); 165 | 166 | if constexpr (use_half2 && !odd_rank) 167 | { 168 | half2* x_ptr2 = (half2*) x_ptr; 169 | half2* x_ptr2_end = (half2*) x_ptr_end; 170 | 171 | half2 r = {}; 172 | 173 | while(x_ptr2 < x_ptr2_end) 174 | { 175 | half2 x_01 = *x_ptr2++; 176 | half2 x_23 = *x_ptr2++; 177 | half w_0 = *w_ptr; w_ptr += width; 178 | half w_1 = *w_ptr; w_ptr += width; 179 | half w_2 = *w_ptr; w_ptr += width; 180 | half w_3 = *w_ptr; w_ptr += width; 181 | half2 w_01 = __halves2half2(w_0, w_1); 182 | half2 w_23 = __halves2half2(w_2, w_3); 183 | r = __hfma2(x_01, w_01, r); 184 | r = __hfma2(x_23, w_23, r); 185 | } 186 | 187 | half rh = __hadd(r.x, r.y); 188 | 189 | __shared__ half accum[MAX_DIM_SMALL / S_BLOCKSIZE][S_THREADS_X]; 190 | accum[threadIdx.y][threadIdx.x] = rh; 191 | __syncthreads(); 192 | 193 | if (threadIdx.y == 0) 194 | { 195 | half acc = rh; 196 | for (int i = 1; i < blockDim.y; ++i) acc = __hadd(accum[i][threadIdx.x], acc); 197 | if (no_zero) acc = __hadd(acc, *out_ptr); 198 | *out_ptr = acc; 199 | } 200 | } 201 | else 202 | { 203 | half r = {}; 204 | 205 | while(x_ptr < x_ptr_end) 206 | { 207 | if constexpr (odd_rank) 208 | { 209 | half x_item = *x_ptr++; 210 | half w_item = *w_ptr; w_ptr += width; 211 | r = __hfma(x_item, w_item, r); 212 | } 213 | else 214 | { 215 | #pragma unroll 216 | for (int i = 0; i < 4; ++i) 217 | { 218 | half x_item = *x_ptr++; 219 | half w_item = *w_ptr; w_ptr += width; 220 | r = __hfma(x_item, w_item, r); 221 | } 222 | } 223 | } 224 | 225 | __shared__ half accum[MAX_DIM_SMALL / S_BLOCKSIZE][S_THREADS_X]; 226 | accum[threadIdx.y][threadIdx.x] = r; 227 | __syncthreads(); 228 | 229 | if (threadIdx.y == 0) 230 | { 231 | half acc = accum[0][threadIdx.x]; 232 | for (int i = 1; i < blockDim.y; ++i) acc = __hadd(accum[i][threadIdx.x], acc); 233 | if (no_zero) acc = __hadd(acc, *out_ptr); 234 | *out_ptr = acc; 235 | } 236 | } 237 | 
} 238 | 239 | void half_matmul_small_cuda 240 | ( 241 | ExLlamaTuning* tuningParams, 242 | const half* x, 243 | const half* w, 244 | half* out, 245 | const int height, 246 | const int dim, 247 | const int width, 248 | bool no_zero, 249 | cudaStream_t alt_stream 250 | ) 251 | { 252 | bool use_half2 = !tuningParams->matmul_no_half2; 253 | 254 | //printf("kernel: (%i, %i) @ (%i, %i) -> (%i, %i)\n", height, dim, dim, width, height, width); 255 | 256 | dim3 threads 257 | ( 258 | S_THREADS_X, 259 | (dim + S_BLOCKSIZE - 1) / S_BLOCKSIZE, 260 | 1 261 | ); 262 | 263 | dim3 blocks 264 | ( 265 | (width + S_THREADS_X - 1) / S_THREADS_X, 266 | 1, 267 | height 268 | ); 269 | 270 | //printf("t... %i %i %i\n", threads.x, threads.y, threads.z); 271 | //printf("b... %i %i %i\n", blocks.x, blocks.y, blocks.z); 272 | //if (!no_zero) cudaMemsetAsync(out, 0, height * width * sizeof(half)); 273 | 274 | if (dim & 0x03) 275 | { 276 | half_matmul_small_kernel <<>>(x, w, out, height, dim, width, no_zero); 277 | } 278 | else 279 | { 280 | if (use_half2) half_matmul_small_kernel <<>>(x, w, out, height, dim, width, no_zero); 281 | else half_matmul_small_kernel <<>>(x, w, out, height, dim, width, no_zero); 282 | } 283 | } 284 | 285 | -------------------------------------------------------------------------------- /exllama_ext/cuda_func/half_matmul.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _half_matmul_cuh 2 | #define _half_matmul_cuh 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "../tuning.h" 9 | 10 | // Workaround for hipify_python using rocblas instead of hipblas. 11 | #if defined(USE_ROCM) 12 | #include 13 | #define rocblas_handle hipblasHandle_t 14 | #endif 15 | 16 | void half_matmul_cuda 17 | ( 18 | const half* x, 19 | const half* w, 20 | half* out, 21 | const int height, 22 | const int dim, 23 | const int width, 24 | cudaStream_t alt_stream = NULL 25 | ); 26 | 27 | void half_matmul_cublas_cuda 28 | ( 29 | ExLlamaTuning* tuningParams, 30 | const half* x, 31 | const half* w, 32 | half* out, 33 | const int height, 34 | const int dim, 35 | const int width, 36 | cublasHandle_t handle, 37 | bool no_zero = false, 38 | cudaStream_t alt_stream = NULL 39 | ); 40 | 41 | void half_matmul_small_cuda 42 | ( 43 | ExLlamaTuning* tuningParams, 44 | const half* x, 45 | const half* w, 46 | half* out, 47 | const int height, 48 | const int dim, 49 | const int width, 50 | bool no_zero = false, 51 | cudaStream_t alt_stream = NULL 52 | ); 53 | 54 | #endif -------------------------------------------------------------------------------- /exllama_ext/cuda_func/q4_attn.cu: -------------------------------------------------------------------------------- 1 | #include "q4_mlp.cuh" 2 | #include "q4_matmul.cuh" 3 | #include "rope.cuh" 4 | #include "rms_norm.cuh" 5 | #include "half_matmul.cuh" 6 | #include "../cuda_buffers.cuh" 7 | #include "../util.cuh" 8 | #include "../matrix.cuh" 9 | #if defined(USE_ROCM) 10 | #include "../hip_compat.cuh" 11 | #endif 12 | 13 | const int THREADS_X = 32; 14 | const int THREADS_Y = 1; 15 | const int THREADS_Z = 4; 16 | const int BLOCKSIZE_X = 2; // 2*half == 1*uint32_t 17 | const int BLOCKSIZE_Z = 4; // num_heads must be divisible by BLOCKSIZE_Z TODO: Check that this is the case when Llama2-34b releases 18 | 19 | __global__ void update_cache_kernel 20 | ( 21 | const half* __restrict__ key_states, 22 | const half* __restrict__ value_states, 23 | half* __restrict__ key_cache, 24 | half* __restrict__ value_cache, 25 | const int head_dim, 
26 | const int num_kv_heads, 27 | const int q_len, 28 | const int max_seq_len, 29 | const int past_len 30 | ) 31 | { 32 | //int state_shape[] = { num_kv_heads, q_len, head_dim }; 33 | int state_stride[] = { head_dim, head_dim * num_kv_heads, 1 }; 34 | int state_pos[] = { 0, 0, 0 }; 35 | 36 | //int cache_shape[] = { num_kv_heads, max_seq_len, head_dim }; 37 | int cache_stride[] = { max_seq_len * head_dim, head_dim, 1 }; 38 | int cache_pos[] = { 0, past_len, 0 }; 39 | 40 | int size[] = { num_kv_heads, q_len, head_dim }; 41 | 42 | int x = (blockIdx.x * THREADS_X + threadIdx.x) * BLOCKSIZE_X; 43 | int y = blockIdx.y * THREADS_Y + threadIdx.y; 44 | int z = (blockIdx.z * THREADS_Z + threadIdx.z) * BLOCKSIZE_Z; 45 | 46 | if (x >= size[2]) return; 47 | if (y >= size[1]) return; 48 | if (z >= size[0]) return; 49 | 50 | int state_offset = (z + state_pos[0]) * state_stride[0] + (y + state_pos[1]) * state_stride[1] + (x + state_pos[2]) * state_stride[2]; 51 | int cache_offset = (z + cache_pos[0]) * cache_stride[0] + (y + cache_pos[1]) * cache_stride[1] + (x + cache_pos[2]) * cache_stride[2]; 52 | 53 | const uint32_t* key_ptr = (uint32_t*) (key_states + state_offset); 54 | const uint32_t* value_ptr = (uint32_t*) (value_states + state_offset); 55 | uint32_t* key_cache_ptr = (uint32_t*) (key_cache + cache_offset); 56 | uint32_t* value_cache_ptr = (uint32_t*) (value_cache + cache_offset); 57 | 58 | #pragma unroll 59 | for (int k = 0; k < BLOCKSIZE_Z; k++) 60 | { 61 | *key_cache_ptr = *key_ptr; 62 | key_ptr += state_stride[0] / BLOCKSIZE_X; 63 | key_cache_ptr += cache_stride[0] / BLOCKSIZE_X; 64 | } 65 | #pragma unroll 66 | for (int k = 0; k < BLOCKSIZE_Z; k++) 67 | { 68 | *value_cache_ptr = *value_ptr; 69 | value_ptr += state_stride[0] / BLOCKSIZE_X; 70 | value_cache_ptr += cache_stride[0] / BLOCKSIZE_X; 71 | } 72 | } 73 | 74 | void q4_attn_cuda 75 | ( 76 | ExLlamaTuning* tuningParams, 77 | cudaStream_t stream, 78 | cublasHandle_t handle, 79 | half* x, 80 | const half* rms_norm_weight, // shape == (x.shape[1],) == (dim,) 81 | float epsilon, 82 | half* query_states, 83 | half* key_states, 84 | half* value_states, 85 | Q4Matrix* q_proj, 86 | Q4Matrix* k_proj, 87 | Q4Matrix* v_proj, 88 | half* sin, 89 | half* cos, 90 | const int bsz, 91 | const int q_len, 92 | const int dim, 93 | const int head_dim, 94 | const int num_heads, 95 | const int num_kv_heads, 96 | const int past_len, 97 | half* key_cache, 98 | half* value_cache, 99 | const half* q_a, 100 | const half* q_b, 101 | const int q_rank, 102 | const half* k_a, 103 | const half* k_b, 104 | const int k_rank, 105 | const half* v_a, 106 | const half* v_b, 107 | const int v_rank, 108 | half* lora_temp, 109 | const int max_seq_len, 110 | const int device_index 111 | ) 112 | { 113 | // Cache update grid 114 | 115 | dim3 threads(THREADS_X, THREADS_Y, THREADS_Z); 116 | 117 | dim3 blocks 118 | ( 119 | ((head_dim + THREADS_X - 1) / THREADS_X + BLOCKSIZE_X - 1) / BLOCKSIZE_X, 120 | q_len, 121 | ((num_kv_heads + THREADS_Z - 1) / THREADS_Z + BLOCKSIZE_Z - 1) / BLOCKSIZE_Z 122 | ); 123 | 124 | int _rows_per_batch = q_len * num_heads; 125 | int _rows_per_batch_kv = q_len * num_kv_heads; 126 | 127 | CudaBuffers* buffers = get_buffers(device_index); 128 | 129 | // Layernorm 130 | 131 | half* temp_x = buffers->temp_state + q_len * dim; 132 | rms_norm_cuda(tuningParams, x, rms_norm_weight, temp_x, epsilon, q_len, dim, device_index); 133 | 134 | // Adapters 135 | 136 | if (q_a) 137 | { 138 | half_matmul_cublas_cuda(tuningParams, temp_x, q_a, lora_temp, q_len, dim, q_rank, 
handle); 139 | half_matmul_cublas_cuda(tuningParams, lora_temp, q_b, query_states, q_len, q_rank, dim, handle); 140 | } 141 | if (k_a) 142 | { 143 | half_matmul_cublas_cuda(tuningParams, temp_x, k_a, lora_temp, q_len, dim, k_rank, handle); 144 | half_matmul_cublas_cuda(tuningParams, lora_temp, k_b, key_states, q_len, k_rank, dim, handle); 145 | } 146 | if (v_a) 147 | { 148 | half_matmul_cublas_cuda(tuningParams, temp_x, v_a, lora_temp, q_len, dim, v_rank, handle); 149 | half_matmul_cublas_cuda(tuningParams, lora_temp, v_b, value_states, q_len, v_rank, dim, handle); 150 | } 151 | 152 | if (!tuningParams->concurrent_streams) 153 | { 154 | // Project q, k, v 155 | 156 | q4_matmul_cuda(tuningParams, temp_x, q_len, q_proj, query_states, q_a ? true : false); 157 | q4_matmul_cuda(tuningParams, temp_x, q_len, k_proj, key_states, k_a ? true : false); 158 | q4_matmul_cuda(tuningParams, temp_x, q_len, v_proj, value_states, v_a ? true : false); 159 | 160 | // Positional embeddings q, k 161 | 162 | rope_cuda(tuningParams, query_states, sin, cos, bsz, _rows_per_batch, head_dim, num_heads, past_len); 163 | rope_cuda(tuningParams, key_states, sin, cos, bsz, _rows_per_batch_kv, head_dim, num_kv_heads, past_len); 164 | 165 | // Update cache tensors with projected k, v 166 | 167 | update_cache_kernel<<>>(key_states, value_states, key_cache, value_cache, head_dim, num_kv_heads, q_len, max_seq_len, past_len); 168 | } 169 | else 170 | { 171 | // Project q, k, v, add positional embeddings to q, k, update cache tensors with projected k, v 172 | 173 | cudaStream_t str_1 = buffers->alt_stream_1; 174 | cudaStream_t str_2 = buffers->alt_stream_2; 175 | cudaStream_t str_3 = buffers->alt_stream_3; 176 | cudaEvent_t sync_1 = buffers->alt_stream_1_done; 177 | cudaEvent_t sync_2 = buffers->alt_stream_2_done; 178 | cudaEvent_t sync_3 = buffers->alt_stream_3_done; 179 | 180 | // str_1: project q, positions q, sync 181 | 182 | q4_matmul_cuda(tuningParams, temp_x, q_len, q_proj, query_states, q_a ? true : false, str_1); 183 | rope_cuda(tuningParams, query_states, sin, cos, bsz, _rows_per_batch, head_dim, num_kv_heads, past_len, str_1); 184 | cudaEventRecord(sync_1, str_1); 185 | 186 | // str_2: project k, positions k, sync 187 | 188 | q4_matmul_cuda(tuningParams, temp_x, q_len, k_proj, key_states, k_a ? true : false, str_2); 189 | rope_cuda(tuningParams, key_states, sin, cos, bsz, _rows_per_batch_kv, head_dim, num_kv_heads, past_len, str_2); 190 | cudaEventRecord(sync_2, str_2); 191 | 192 | // str_3: project v, wait for str_2, copy (k,v) to cache, sync 193 | 194 | q4_matmul_cuda(tuningParams, temp_x, q_len, v_proj, value_states, v_a ? 
true : false, buffers->alt_stream_3); 195 | cudaStreamWaitEvent(str_3, sync_2, 0); 196 | update_cache_kernel<<>>(key_states, value_states, key_cache, value_cache, head_dim, num_kv_heads, q_len, max_seq_len, past_len); 197 | cudaEventRecord(sync_3, str_3); 198 | 199 | // default: wait for str_1 and str_3 200 | 201 | cudaStreamWaitEvent(NULL, sync_1, 0); 202 | cudaStreamWaitEvent(NULL, sync_3, 0); 203 | } 204 | } 205 | 206 | void q4_attn_2_cuda 207 | ( 208 | ExLlamaTuning* tuningParams, 209 | cublasHandle_t handle, 210 | half* x, 211 | half* attn_output, 212 | Q4Matrix* o_proj, 213 | const int height, 214 | const half* o_a, 215 | const half* o_b, 216 | const int o_rank, 217 | half* lora_temp 218 | ) 219 | { 220 | if (o_a) 221 | { 222 | int dim = o_proj->height; 223 | half_matmul_cublas_cuda(tuningParams, attn_output, o_a, lora_temp, height, dim, o_rank, handle); 224 | half_matmul_cublas_cuda(tuningParams, lora_temp, o_b, x, height, o_rank, dim, handle, true); 225 | } 226 | 227 | q4_matmul_cuda(tuningParams, attn_output, height, o_proj, x, true); 228 | } 229 | -------------------------------------------------------------------------------- /exllama_ext/cuda_func/q4_attn.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _q4_attn_cuh 2 | #define _q4_attn_cuh 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "../tuning.h" 9 | #include "q4_matrix.cuh" 10 | 11 | void q4_attn_cuda 12 | ( 13 | ExLlamaTuning* tuningParams, 14 | cudaStream_t stream, 15 | cublasHandle_t handle, 16 | half* x, 17 | const half* rms_norm_weight, // shape == (x.shape[1],) == (dim,) 18 | float epsilon, 19 | half* query_states, 20 | half* key_states, 21 | half* value_states, 22 | Q4Matrix* q_proj, 23 | Q4Matrix* k_proj, 24 | Q4Matrix* v_proj, 25 | half* sin, 26 | half* cos, 27 | const int bsz, 28 | const int q_len, 29 | const int dim, 30 | const int head_dim, 31 | const int num_heads, 32 | const int num_kv_heads, 33 | const int past_len, 34 | half* key_cache, 35 | half* value_cache, 36 | const half* q_a, 37 | const half* q_b, 38 | const int q_rank, 39 | const half* k_a, 40 | const half* k_b, 41 | const int k_rank, 42 | const half* v_a, 43 | const half* v_b, 44 | const int v_rank, 45 | half* lora_temp, 46 | const int max_seq_len, 47 | const int device_index 48 | ); 49 | 50 | void q4_attn_2_cuda 51 | ( 52 | ExLlamaTuning* tuningParams, 53 | cublasHandle_t handle, 54 | half* x, 55 | half* attn_output, 56 | Q4Matrix* o_proj, 57 | const int height, 58 | const half* o_a, 59 | const half* o_b, 60 | const int o_rank, 61 | half* lora_temp 62 | ); 63 | 64 | #endif -------------------------------------------------------------------------------- /exllama_ext/cuda_func/q4_matmul.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _q4_matmul_cuh 2 | #define _q4_matmul_cuh 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "q4_matrix.cuh" 11 | #include "../tuning.h" 12 | 13 | // Workaround for hipify_python using rocblas instead of hipblas. 
14 | #if defined(USE_ROCM) 15 | #include 16 | #define rocblas_handle hipblasHandle_t 17 | #endif 18 | 19 | #if !defined(USE_ROCM) && (!defined(__CUDA_ARCH__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700)) 20 | #define USE_SMEM 21 | #endif 22 | 23 | void q4_matmul_cuda 24 | ( 25 | ExLlamaTuning* tuningParams, 26 | const half* x, 27 | const int x_height, 28 | const Q4Matrix* w, 29 | half* out, 30 | bool no_zero = false, 31 | cudaStream_t alt_stream = NULL 32 | ); 33 | 34 | void q4_matmul_recons_cuda 35 | ( 36 | ExLlamaTuning* tuningParams, 37 | const half* x, 38 | const int x_height, 39 | Q4Matrix* w, 40 | half* out, 41 | const cublasHandle_t handle, 42 | bool no_zero = false 43 | ); 44 | 45 | #endif -------------------------------------------------------------------------------- /exllama_ext/cuda_func/q4_matrix.cu: -------------------------------------------------------------------------------- 1 | #include "q4_matrix.cuh" 2 | #include 3 | #include "../util.cuh" 4 | #include "../matrix.cuh" 5 | 6 | using namespace std; 7 | 8 | const int UNSHUF_BLOCKSIZE_X = 64; 9 | 10 | const int RECONS_THREADS_X = 64; // Block size and thread count along columns in out, each thread converts 1 column 11 | const int RECONS_THREADS_Y = 1; // Block size and thread count along rows in x and out, each thread converts 8 rows 12 | 13 | vector g_q4_matrices; 14 | 15 | void g_q4_keep_matrix(Q4Matrix* m) 16 | { 17 | g_q4_matrices.push_back(m); 18 | } 19 | 20 | void g_q4_free_matrices() 21 | { 22 | for (const auto& m : g_q4_matrices) delete m; 23 | g_q4_matrices.clear(); 24 | } 25 | 26 | Q4Matrix::Q4Matrix 27 | ( 28 | const int _height, 29 | const int _width, 30 | const int _groups, 31 | 32 | uint32_t* _qweight, 33 | uint32_t* _qzeros, 34 | half* _scales, 35 | uint32_t* _g_idx, 36 | 37 | const int _device 38 | ) : 39 | height(_height), 40 | width(_width), 41 | groups(_groups), 42 | device(_device) 43 | { 44 | cudaSetDevice(device); 45 | 46 | cuda_qweight = _qweight; 47 | cuda_qzeros = _qzeros; 48 | cuda_scales = _scales; 49 | 50 | groupsize = height / groups; 51 | 52 | if (_g_idx) make_sequential(_g_idx); 53 | } 54 | 55 | Q4Matrix::~Q4Matrix() 56 | { 57 | } 58 | 59 | // Make sequential 60 | 61 | __global__ void make_sequential_kernel 62 | ( 63 | const uint32_t* __restrict__ w, 64 | uint32_t* __restrict__ w_new, 65 | const uint32_t* __restrict__ x_map, 66 | const int w_height, 67 | const int w_width 68 | ) 69 | { 70 | const uint64_t* w2 = (uint64_t*) w; 71 | uint64_t* w_new2 = (uint64_t*) w_new; 72 | int w2_stride = w_width >> 1; 73 | 74 | int w2_column = UNSHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x; 75 | if (w2_column >= w2_stride) return; 76 | 77 | int w_new2_row = blockIdx.y; 78 | 79 | int x_map_idx = w_new2_row << 3; 80 | 81 | uint64_t dst = 0; 82 | 83 | 84 | #pragma unroll 85 | for (int i = 0; i < 8; i++) 86 | { 87 | int source_row = x_map[x_map_idx++]; 88 | 89 | int w2_row = source_row >> 3; 90 | int w2_subrow = source_row & 0x07; 91 | int w2_row_shift = w2_subrow << 2; 92 | int wnew2_row_shift = i << 2; 93 | 94 | uint64_t src = w2[w2_row * w2_stride + w2_column]; 95 | src >>= w2_row_shift; 96 | src &= 0x0000000f0000000f; 97 | src <<= wnew2_row_shift; 98 | dst |= src; 99 | } 100 | 101 | w_new2[w_new2_row * w2_stride + w2_column] = dst; 102 | } 103 | 104 | void Q4Matrix::make_sequential(const uint32_t* cpu_g_idx) 105 | { 106 | uint32_t* cuda_new_qweight = NULL; 107 | cudaMalloc(&cuda_new_qweight, height / 8 * width * sizeof(uint32_t)); 108 | cudaMalloc(&cuda_x_map, height * sizeof(uint32_t)); // TODO: 
Should probably be allocated in PyTorch 109 | 110 | uint32_t* cpu_g_idx_map = (uint32_t*) calloc(groups, sizeof(uint32_t)); 111 | uint32_t* cpu_x_map = (uint32_t*) malloc(height * sizeof(uint32_t)); 112 | uint32_t* cpu_x_map_inv = (uint32_t*) malloc(height * sizeof(uint32_t)); 113 | 114 | // Group histogram 115 | 116 | for (int i = 0; i < height; i++) cpu_g_idx_map[cpu_g_idx[i]]++; 117 | 118 | // Group map 119 | 120 | for (int i = 0, acc = 0; i < groups; i++) 121 | { 122 | short tmp = cpu_g_idx_map[i]; 123 | cpu_g_idx_map[i] = acc; 124 | acc += tmp; 125 | } 126 | 127 | // X map (inverse) 128 | 129 | for (int row = 0; row < height; row++) 130 | { 131 | uint32_t target_group = cpu_g_idx[row]; 132 | uint32_t target_row = cpu_g_idx_map[target_group]; 133 | cpu_g_idx_map[target_group]++; 134 | cpu_x_map_inv[row] = target_row; 135 | } 136 | 137 | // X map 138 | 139 | for (int row = 0; row < height; row++) cpu_x_map[cpu_x_map_inv[row]] = row; 140 | 141 | // Move to CUDA 142 | 143 | cudaMemcpyAsync(cuda_x_map, cpu_x_map, height * sizeof(uint32_t), cudaMemcpyHostToDevice); 144 | 145 | // Rearrange rows in w 146 | 147 | dim3 threads(UNSHUF_BLOCKSIZE_X, 1, 1); 148 | dim3 blocks 149 | ( 150 | (width + UNSHUF_BLOCKSIZE_X * 2 - 1) / (UNSHUF_BLOCKSIZE_X * 2), 151 | height / 8, 152 | 1 153 | ); 154 | 155 | make_sequential_kernel<<>>(cuda_qweight, cuda_new_qweight, cuda_x_map, height / 8, width); 156 | 157 | // Replace qweights 158 | 159 | cudaMemcpyAsync(cuda_qweight, cuda_new_qweight, height / 8 * width * sizeof(uint32_t), cudaMemcpyDeviceToDevice); 160 | 161 | // Cleanup 162 | 163 | cudaDeviceSynchronize(); 164 | cudaFree(cuda_new_qweight); 165 | free(cpu_g_idx_map); 166 | free(cpu_x_map); 167 | free(cpu_x_map_inv); 168 | } 169 | 170 | __global__ void reconstruct_kernel 171 | ( 172 | const uint32_t* __restrict__ w, 173 | half* __restrict__ out, // (y) 174 | const half* __restrict__ w_scales, 175 | const uint32_t* __restrict__ w_zeros, 176 | const int height, 177 | const int width, 178 | const int groupsize 179 | ) 180 | { 181 | // Start of block 182 | 183 | int column = RECONS_THREADS_X * blockIdx.x + threadIdx.x; 184 | int row = (RECONS_THREADS_Y * blockIdx.y + threadIdx.y) * 8; 185 | if (column >= width) return; 186 | 187 | // Views 188 | 189 | MatrixView_q4_column w_(w, height, width); 190 | MatrixView_half_rw out_(out, height, width); 191 | MatrixView_half w_scales_(w_scales, height / groupsize, width); 192 | MatrixView_q4_row w_zeros_(w_zeros, height / groupsize, width); 193 | 194 | // Groupsize version 195 | 196 | int group = row / groupsize; 197 | 198 | half w_scale = w_scales_.item(group, column); 199 | uint32_t w_zero = w_zeros_.item(group, column) + 1; 200 | 201 | uint32_t w_read = w_.item_uint32_t(row, column); 202 | half* out_ptr = out_.item_ptr(row, column); 203 | 204 | #pragma unroll 205 | for (int s = 0; s < 32; s += 4) 206 | { 207 | half w_item = __hmul(__int2half_rn((int)((w_read >> s) & 0x0f) - w_zero), w_scale); 208 | *out_ptr = w_item; out_ptr += out_.width; 209 | } 210 | } 211 | 212 | void Q4Matrix::reconstruct(half* out) 213 | { 214 | dim3 threads(RECONS_THREADS_X, RECONS_THREADS_Y, 1); 215 | 216 | dim3 blocks 217 | ( 218 | (width + threads.x - 1) / threads.x, 219 | (height / 8 + threads.y - 1) / threads.y, 220 | 1 221 | ); 222 | 223 | reconstruct_kernel<<>>(cuda_qweight, out, cuda_scales, cuda_qzeros, height / 8, width, groupsize); 224 | } -------------------------------------------------------------------------------- /exllama_ext/cuda_func/q4_matrix.cuh: 
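make_sequential above builds a row permutation (cuda_x_map) that makes rows belonging to the same quantization group contiguous, then repacks qweight through that map; reconstruct_kernel dequantizes each packed 4-bit value as (q - (zero + 1)) * scale for its group. A hypothetical NumPy sketch of the x_map construction (illustration only, not part of the extension):

import numpy as np

def build_x_map(g_idx, groups):
    counts = np.bincount(g_idx, minlength=groups)            # group histogram
    first = np.concatenate(([0], np.cumsum(counts)[:-1]))    # first destination row per group
    cursor = first.copy()
    x_map_inv = np.empty_like(g_idx)
    for row, g in enumerate(g_idx):                          # destination row for each source row
        x_map_inv[row] = cursor[g]
        cursor[g] += 1
    x_map = np.empty_like(x_map_inv)
    x_map[x_map_inv] = np.arange(len(g_idx))                 # x_map[destination] = source row
    return x_map

# e.g. build_x_map(np.array([0, 1, 0, 1]), 2) -> array([0, 2, 1, 3])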
-------------------------------------------------------------------------------- 1 | #ifndef _q4_matrix_cuh 2 | #define _q4_matrix_cuh 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | class Q4Matrix 9 | { 10 | public: 11 | 12 | int device; 13 | 14 | int height; 15 | int width; 16 | int groups; 17 | int groupsize; 18 | 19 | uint32_t* cuda_qweight = NULL; 20 | uint32_t* cuda_qzeros = NULL; 21 | half* cuda_scales = NULL; 22 | uint32_t* cuda_x_map = NULL; 23 | 24 | Q4Matrix 25 | ( 26 | const int _height, 27 | const int _width, 28 | const int _groups, 29 | 30 | uint32_t* _qweight, 31 | uint32_t* _qzeros, 32 | half* _scales, 33 | uint32_t* _g_idx, 34 | 35 | const int _device 36 | ); 37 | 38 | ~Q4Matrix(); 39 | 40 | void reconstruct(half* out); 41 | 42 | private: 43 | 44 | void make_sequential(const uint32_t* cpu_g_idx); 45 | 46 | }; 47 | 48 | void g_q4_keep_matrix(Q4Matrix* m); 49 | void g_q4_free_matrices(); 50 | 51 | #endif -------------------------------------------------------------------------------- /exllama_ext/cuda_func/q4_mlp.cu: -------------------------------------------------------------------------------- 1 | #include "q4_mlp.cuh" 2 | #include "q4_matmul.cuh" 3 | #include "half_matmul.cuh" 4 | #include "rms_norm.cuh" 5 | #include "../cuda_buffers.cuh" 6 | #include "../util.cuh" 7 | #include "../matrix.cuh" 8 | #if defined(USE_ROCM) 9 | #include "../hip_compat.cuh" 10 | #endif 11 | 12 | const int THREADS_X = 32; 13 | const int THREADS_Y = 4; 14 | // const int MAX_DIMENSION = 8192; 15 | 16 | __device__ __forceinline__ half silu(half x) 17 | { 18 | half one = __float2half(1.0f); 19 | half neg_x = __hneg(x); 20 | half e = hexp(neg_x); 21 | half sum = __hadd(one, e); 22 | half r = hrcp(sum); 23 | half result = __hmul(x, r); 24 | return result; 25 | } 26 | 27 | __device__ __forceinline__ half2 silu(half2 x) 28 | { 29 | half2 one = __float2half2_rn(1.0f); 30 | half2 neg_x = __hneg2(x); 31 | half2 e = h2exp(neg_x); 32 | half2 sum = __hadd2(one, e); 33 | half2 r = h2rcp(sum); 34 | half2 result = __hmul2(x, r); 35 | return result; 36 | } 37 | 38 | typedef void (*fp_silu_mul_cuda_kernel) 39 | ( 40 | half*, 41 | const half*, 42 | const int, 43 | const int 44 | ); 45 | 46 | template 47 | __global__ void silu_mul_cuda_kernel 48 | ( 49 | half* __restrict__ x, 50 | const half* __restrict__ y, 51 | const int height, 52 | const int width 53 | ) 54 | { 55 | MatrixView_half_rw x_(x, height, width); 56 | MatrixView_half y_(y, height, width); 57 | 58 | int column = (THREADS_X * blockIdx.x + threadIdx.x); if constexpr (use_half2) column *= 2; 59 | int row = THREADS_Y * blockIdx.y + threadIdx.y; 60 | if (row >= height) return; 61 | 62 | // silu(x) * y 63 | 64 | if constexpr (use_half2) 65 | { 66 | half2 one = __half2half2(__float2half(1.0f)); 67 | 68 | half2 x_item = x_.item_half2(row, column); 69 | half2 y_item = y_.item_half2(row, column); 70 | 71 | x_item = silu(x_item); 72 | x_item = __hmul2(x_item, y_item); 73 | 74 | x_.set_half2(row, column, x_item); 75 | } 76 | else 77 | { 78 | half one = __float2half(1.0f); 79 | 80 | half x_item = x_.item(row, column); 81 | half y_item = y_.item(row, column); 82 | 83 | x_item = silu(x_item); 84 | x_item = __hmul(x_item, y_item); 85 | 86 | x_.set(row, column, x_item); 87 | } 88 | } 89 | 90 | fp_silu_mul_cuda_kernel silu_mul_cuda_kernel_pick(ExLlamaTuning* tuningParams) 91 | { 92 | // 93 | if (tuningParams->matmul_no_half2) { 94 | return silu_mul_cuda_kernel; 95 | } else { 96 | return silu_mul_cuda_kernel; 97 | } 98 | }; 99 | 100 | void q4_mlp_cuda 101 | ( 102 | 
ExLlamaTuning* tuningParams, 103 | half* x, // shape == (height, dim) 104 | const half* rms_norm_weight, // shape == (x.shape[1],) == (dim,) 105 | float epsilon, 106 | Q4Matrix* gate, 107 | Q4Matrix* up, 108 | Q4Matrix* down, 109 | const int height, 110 | const int dim, 111 | const half* gate_a, 112 | const half* gate_b, 113 | const int gate_rank, 114 | const half* up_a, 115 | const half* up_b, 116 | const int up_rank, 117 | const half* down_a, 118 | const half* down_b, 119 | const int down_rank, 120 | half* lora_temp, 121 | cublasHandle_t handle, 122 | const int device_index 123 | ) 124 | { 125 | CudaBuffers* buffers = get_buffers(device_index); 126 | 127 | // temp_x = rms_layernorm(x) 128 | 129 | half* temp_x = buffers->temp_state + height * dim; // TOOD: .. 130 | TORCH_CHECK(buffers->temp_state_size >= 2 * height * dim, "temp_state buffer too small"); 131 | rms_norm_cuda(tuningParams, x, rms_norm_weight, temp_x, epsilon, height, dim, device_index); 132 | 133 | // temp_mlp[0] = temp_x @ gate 134 | // temp_mlp[1] = temp_x @ up 135 | 136 | half* temp_mlp_0 = buffers->temp_mlp; 137 | half* temp_mlp_1 = buffers->temp_mlp + height * up->width; 138 | int temp_mlp_width = up->width; 139 | 140 | if (gate_a) 141 | { 142 | half_matmul_cublas_cuda(tuningParams, temp_x, gate_a, lora_temp, height, dim, gate_rank, handle); 143 | half_matmul_cublas_cuda(tuningParams, lora_temp, gate_b, temp_mlp_0, height, gate_rank, temp_mlp_width, handle); 144 | } 145 | if (up_a) 146 | { 147 | half_matmul_cublas_cuda(tuningParams, temp_x, up_a, lora_temp, height, dim, up_rank, handle); 148 | half_matmul_cublas_cuda(tuningParams, lora_temp, up_b, temp_mlp_1, height, up_rank, temp_mlp_width, handle); 149 | } 150 | 151 | if (!tuningParams->concurrent_streams) 152 | { 153 | q4_matmul_cuda(tuningParams, temp_x, height, gate, temp_mlp_0, gate_a ? true : false); 154 | q4_matmul_cuda(tuningParams, temp_x, height, up, temp_mlp_1, up_a ? true : false); 155 | } 156 | else 157 | { 158 | cudaStream_t str_1 = buffers->alt_stream_1; 159 | cudaStream_t str_2 = buffers->alt_stream_2; 160 | cudaEvent_t sync_1 = buffers->alt_stream_1_done; 161 | cudaEvent_t sync_2 = buffers->alt_stream_2_done; 162 | 163 | q4_matmul_cuda(tuningParams, temp_x, height, gate, buffers->temp_mlp, gate_a ? true : false, str_1); 164 | cudaEventRecord(sync_1, str_1); 165 | 166 | q4_matmul_cuda(tuningParams, temp_x, height, up, buffers->temp_mlp + height * up->width, up_a ? true : false, str_2); 167 | cudaEventRecord(sync_2, str_2); 168 | 169 | cudaStreamWaitEvent(NULL, sync_1, 0); 170 | cudaStreamWaitEvent(NULL, sync_2, 0); 171 | } 172 | 173 | // temp_mlp[0] = silu(temp_mlp[0]) * temp_mlp[1] 174 | 175 | dim3 threads(THREADS_X, THREADS_Y, 1); 176 | 177 | dim3 blocks 178 | ( 179 | (up->width + THREADS_X - 1) / THREADS_X / (tuningParams->silu_no_half2 ? 
1 : 2), 180 | (height + THREADS_Y - 1) / THREADS_Y, 181 | 1 182 | ); 183 | 184 | fp_silu_mul_cuda_kernel kernel = silu_mul_cuda_kernel_pick(tuningParams); 185 | kernel<<>>(temp_mlp_0, temp_mlp_1, height, temp_mlp_width); 186 | 187 | // x += temp1 @ down (implicitly add the residual connection by not zeroing the output in the matmul) 188 | 189 | if (down_a) 190 | { 191 | half_matmul_cublas_cuda(tuningParams, temp_mlp_0, down_a, lora_temp, height, temp_mlp_width, down_rank, handle); 192 | half_matmul_cublas_cuda(tuningParams, lora_temp, down_b, x, height, down_rank, dim, handle, true); 193 | } 194 | q4_matmul_cuda(tuningParams, temp_mlp_0, height, down, x, true); 195 | 196 | // Reset the temp buffer after use so it's always zeros. 197 | //cudaMemsetAsync(buffers->temp_mlp, 0, 2 * height * up->width * sizeof(half)); 198 | 199 | } -------------------------------------------------------------------------------- /exllama_ext/cuda_func/q4_mlp.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _q4_mlp_cuh 2 | #define _q4_mlp_cuh 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "../tuning.h" 10 | #include "q4_matrix.cuh" 11 | 12 | void q4_mlp_cuda 13 | ( 14 | ExLlamaTuning* tuningParams, 15 | half* x, // shape == (height, dim) 16 | const half* rms_norm_weight, // shape == (x.shape[1],) == (dim,) 17 | float epsilon, 18 | Q4Matrix* gate, 19 | Q4Matrix* up, 20 | Q4Matrix* down, 21 | const int height, 22 | const int dim, 23 | const half* gate_a, 24 | const half* gate_b, 25 | const int gate_rank, 26 | const half* up_a, 27 | const half* up_b, 28 | const int up_rank, 29 | const half* down_a, 30 | const half* down_b, 31 | const int down_rank, 32 | half* lora_temp, 33 | cublasHandle_t handle, 34 | const int device_index 35 | ); 36 | 37 | #endif -------------------------------------------------------------------------------- /exllama_ext/cuda_func/rms_norm.cu: -------------------------------------------------------------------------------- 1 | #include "rms_norm.cuh" 2 | #include "../cuda_buffers.cuh" 3 | #include "../util.cuh" 4 | #include "../matrix.cuh" 5 | 6 | const int THREADS_X = 32; 7 | const int THREADS_Y = 8; 8 | const int BLOCKSIZE_X = 16; 9 | 10 | // scratch = sum(x * x, dim = -1) 11 | 12 | typedef void (*fp_rms_norm_row_product_kernel) 13 | ( 14 | half*, 15 | float*, 16 | const int, 17 | const int 18 | ); 19 | 20 | template 21 | __global__ void rms_norm_row_product_kernel 22 | ( 23 | half* __restrict__ x, 24 | float* __restrict__ scratch, 25 | const int rows, 26 | const int dim 27 | ) 28 | { 29 | int column = (THREADS_X * blockIdx.x + threadIdx.x) * BLOCKSIZE_X; 30 | int row = THREADS_Y * blockIdx.y + threadIdx.y; 31 | if (row >= rows) return; 32 | if (column >= dim) return; 33 | 34 | // if (column == 0) 35 | // { 36 | // scratch[row] = 0.0f; 37 | // __syncthreads(); 38 | // } 39 | 40 | float acc = 0.0f; 41 | int idx = row * dim + column; 42 | 43 | // Accumulate 44 | 45 | if constexpr (use_half2) 46 | { 47 | half2* x_ptr = (half2*) &x[idx]; 48 | 49 | #pragma unroll 50 | for (int k = 0; k < BLOCKSIZE_X / 2; k++) 51 | { 52 | half2 x2 = *x_ptr++; 53 | float m0 = __half2float(x2.x); 54 | float m1 = __half2float(x2.y); 55 | acc = fma(m0, m0, acc); 56 | acc = fma(m1, m1, acc); 57 | } 58 | } 59 | else 60 | { 61 | half* x_ptr = x + idx; 62 | 63 | #pragma unroll 64 | for (int k = 0; k < BLOCKSIZE_X; k++) 65 | { 66 | float m0 = __half2float(*x_ptr++); 67 | acc = fma(m0, m0, acc); 68 | } 69 | } 70 | 71 | // // Use Warp Shuffle to 
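The normalization implemented by rms_norm_cuda here is the same one q4_mlp_cuda applies before its gate/up projections. A hypothetical PyTorch reference for the whole fused MLP block above, with the LoRA terms omitted (illustration only, not part of the extension):

import torch
import torch.nn.functional as F

def fused_mlp_ref(x, norm_w, eps, w_gate, w_up, w_down):
    # rms_norm_cuda: h = x / sqrt(mean(x^2) + eps) * weight
    h = x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + eps) * norm_w
    # q4_matmul_cuda on gate and up, then silu_mul_cuda_kernel: silu(gate) * up
    a = F.silu(h @ w_gate) * (h @ w_up)
    # down projection with no_zero=True adds onto x, giving the residual connection for free
    return x + a @ w_down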
accumulate within the warp 72 | // 73 | // for (int offset = warpSize / 2; offset > 0; offset /= 2) 74 | // acc += __shfl_down_sync(0xffffffff, acc, offset); 75 | // if (threadIdx.x % warpSize == 0) 76 | // atomicAdd(&scratch[row], acc); 77 | 78 | atomicAdd(&scratch[row], acc); 79 | } 80 | 81 | // x = x * w / sqrt(scratch / dim + epsilon) 82 | 83 | typedef void (*fp_rms_norm_kernel) 84 | ( 85 | half*, 86 | const half*, 87 | half*, 88 | float*, 89 | const float, 90 | const float, 91 | const int, 92 | const int 93 | ); 94 | 95 | template 96 | __global__ void rms_norm_kernel 97 | ( 98 | half* __restrict__ x, 99 | const half* __restrict__ w, 100 | half* __restrict__ out, 101 | float* __restrict__ scratch, 102 | const float epsilon, 103 | const float r_dim, 104 | const int rows, 105 | const int dim 106 | ) 107 | { 108 | int column = (THREADS_X * blockIdx.x + threadIdx.x) * BLOCKSIZE_X; 109 | int row = THREADS_Y * blockIdx.y + threadIdx.y; 110 | if (row >= rows) return; 111 | if (column >= dim) return; 112 | 113 | float rmf = rsqrtf(scratch[row] * r_dim + epsilon); 114 | half rm = __float2half_rn(rmf); 115 | half2 rm2 = __half2half2(rm); 116 | 117 | if constexpr (use_half2) 118 | { 119 | half2* x2_ptr = (half2*) &x[row * dim + column]; 120 | half2* out2_ptr = (half2*) &out[row * dim + column]; 121 | const half2* w2_ptr = (const half2*) &w[column]; 122 | 123 | #pragma unroll 124 | for (int k = 0; k < BLOCKSIZE_X / 2; k++) 125 | { 126 | half2 m2 = *x2_ptr++; 127 | half2 w2 = *w2_ptr++; 128 | m2 = __hmul2(m2, rm2); 129 | m2 = __hmul2(m2, w2); 130 | *out2_ptr++ = m2; 131 | } 132 | } 133 | else 134 | { 135 | half* x_ptr = &x[row * dim + column]; 136 | half* out_ptr = &out[row * dim + column]; 137 | const half* w_ptr = &w[column]; 138 | 139 | #pragma unroll 140 | for (int k = 0; k < BLOCKSIZE_X; k++) 141 | { 142 | half m = *x_ptr++; 143 | half w = *w_ptr++; 144 | m = __hmul(m, rm); 145 | m = __hmul(m, w); 146 | *out_ptr++ = m; 147 | } 148 | } 149 | 150 | // __syncthreads(); 151 | // if (column >= dim - BLOCKSIZE_X) scratch[row] = 0.0f; 152 | } 153 | 154 | fp_rms_norm_row_product_kernel rms_norm_row_product_kernel_pick(ExLlamaTuning* tuningParams) 155 | { 156 | // 157 | if (tuningParams->matmul_no_half2) { 158 | return rms_norm_row_product_kernel; 159 | } else { 160 | return rms_norm_row_product_kernel; 161 | } 162 | }; 163 | 164 | fp_rms_norm_kernel rms_norm_kernel_pick(ExLlamaTuning* tuningParams) 165 | { 166 | // 167 | if (tuningParams->matmul_no_half2) { 168 | return rms_norm_kernel; 169 | } else { 170 | return rms_norm_kernel; 171 | } 172 | }; 173 | 174 | // x = x * w / sqrt(row_mean(x * x) + epsilon) 175 | // 176 | // works in-place if x == out 177 | 178 | void rms_norm_cuda 179 | ( 180 | ExLlamaTuning* tuningParams, 181 | half* x, 182 | const half* w, 183 | half* out, 184 | const float epsilon, 185 | const int rows, 186 | const int dim, 187 | const int device_index 188 | ) 189 | { 190 | CudaBuffers* buffers = get_buffers(device_index); 191 | float* temp = buffers->get_zeros_float(rows); 192 | 193 | float r_dim = 1.0f / (float) dim; 194 | 195 | dim3 threads(THREADS_X, THREADS_Y, 1); 196 | 197 | dim3 blocks 198 | ( 199 | ((dim + THREADS_X - 1) / THREADS_X + THREADS_X - 1) / BLOCKSIZE_X, 200 | (rows + THREADS_Y - 1) / THREADS_Y, 201 | 1 202 | ); 203 | 204 | //cudaMemsetAsync(temp, 0, rows * sizeof(float)); 205 | 206 | fp_rms_norm_row_product_kernel kernel1 = rms_norm_row_product_kernel_pick(tuningParams); 207 | kernel1<<>>(x, temp, rows, dim); 208 | 209 | fp_rms_norm_kernel kernel2 = 
rms_norm_kernel_pick(tuningParams); 210 | kernel2<<>>(x, w, out, temp, epsilon, r_dim, rows, dim); 211 | 212 | //cudaMemsetAsync(temp, 0, rows * sizeof(float)); 213 | } 214 | -------------------------------------------------------------------------------- /exllama_ext/cuda_func/rms_norm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _rms_norm_cuh 2 | #define _rms_norm_cuh 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "../tuning.h" 9 | 10 | void rms_norm_cuda 11 | ( 12 | ExLlamaTuning* tuningParams, 13 | half* x, 14 | const half* w, 15 | half* out, 16 | const float epsilon, 17 | const int rows, 18 | const int dim, 19 | const int device_index 20 | ); 21 | 22 | #endif -------------------------------------------------------------------------------- /exllama_ext/cuda_func/rope.cu: -------------------------------------------------------------------------------- 1 | #include "rope.cuh" 2 | #include "../util.cuh" 3 | #include "../matrix.cuh" 4 | 5 | const int THREADS_X = 32; 6 | const int THREADS_Y = 4; 7 | const int MAX_POS_EMBEDDINGS = 32768; // Actual number doesn't matter 8 | 9 | typedef void (*fp_rope_cuda_kernel) 10 | ( 11 | half*, 12 | const half*, 13 | const half*, 14 | int, 15 | int, 16 | int, 17 | int 18 | ); 19 | 20 | template 21 | __global__ void rope_cuda_kernel 22 | ( 23 | half* __restrict__ x, 24 | const half* __restrict__ sin, 25 | const half* __restrict__ cos, 26 | int rows_per_batch, 27 | int head_dim, 28 | int num_heads, 29 | int past_len 30 | ) 31 | { 32 | // These heights aren't used so it's okay if they're wrong. 33 | MatrixView_half_rw x_(x, rows_per_batch, head_dim); 34 | MatrixView_half sin_(sin, MAX_POS_EMBEDDINGS, head_dim); 35 | MatrixView_half cos_(cos, MAX_POS_EMBEDDINGS, head_dim); 36 | 37 | int column = (blockIdx.x * THREADS_X + threadIdx.x); if constexpr (use_half2) column *= 2; 38 | int half_dim = head_dim / 2; 39 | if (column >= half_dim) return; 40 | 41 | int row = blockIdx.y * THREADS_Y + threadIdx.y; 42 | if (row >= rows_per_batch) return; 43 | int batch_offset = blockIdx.z * rows_per_batch; 44 | int row_offset = batch_offset + row; 45 | 46 | // Get sin and cos 47 | 48 | int sincos_row = past_len + row / num_heads; 49 | 50 | if constexpr (use_half2) 51 | { 52 | half2 cos2_l = cos_.item_half2(sincos_row, column); 53 | half2 cos2_r = cos_.item_half2(sincos_row, column + half_dim); 54 | half2 sin2_l = sin_.item_half2(sincos_row, column); 55 | half2 sin2_r = sin_.item_half2(sincos_row, column + half_dim); 56 | sin2_l = __hneg2(sin2_l); 57 | 58 | // Apply embedding to row 59 | 60 | half2 item2_l = x_.item_half2(row_offset, column); 61 | half2 item2_r = x_.item_half2(row_offset, column + half_dim); 62 | half2 item2_ls = __hmul2(item2_r, sin2_l); 63 | half2 item2_rs = __hmul2(item2_l, sin2_r); 64 | item2_l = __hfma2(item2_l, cos2_l, item2_ls); 65 | item2_r = __hfma2(item2_r, cos2_r, item2_rs); 66 | x_.set_half2(row_offset, column, item2_l); 67 | x_.set_half2(row_offset, column + half_dim, item2_r); 68 | } 69 | else 70 | { 71 | half cos_l = cos_.item(sincos_row, column); 72 | half cos_r = cos_.item(sincos_row, column + half_dim); 73 | half sin_l = sin_.item(sincos_row, column); 74 | half sin_r = sin_.item(sincos_row, column + half_dim); 75 | sin_l = __hneg(sin_l); 76 | 77 | // Apply embedding to row 78 | 79 | half item_l = x_.item(row_offset, column); 80 | half item_r = x_.item(row_offset, column + half_dim); 81 | half item_ls = __hmul(item_r, sin_l); 82 | half item_rs = __hmul(item_l, sin_r); 83 | 
item_l = __hfma(item_l, cos_l, item_ls); 84 | item_r = __hfma(item_r, cos_r, item_rs); 85 | x_.set(row_offset, column, item_l); 86 | x_.set(row_offset, column + half_dim, item_r); 87 | } 88 | } 89 | 90 | fp_rope_cuda_kernel rope_cuda_kernel_pick(ExLlamaTuning* tuningParams) 91 | { 92 | // 93 | if (tuningParams->matmul_no_half2) { 94 | return rope_cuda_kernel; 95 | } else { 96 | return rope_cuda_kernel; 97 | } 98 | }; 99 | 100 | void rope_cuda 101 | ( 102 | ExLlamaTuning* tuningParams, 103 | half* x, 104 | const half* sin, 105 | const half* cos, 106 | const int bsz, 107 | const int rows_per_batch, 108 | const int head_dim, 109 | const int num_heads, 110 | const int past_len, 111 | cudaStream_t alt_stream 112 | ) 113 | { 114 | dim3 threads(THREADS_X, THREADS_Y, 1); 115 | 116 | dim3 blocks 117 | ( 118 | (head_dim + THREADS_X - 1) / THREADS_X / 2 / (tuningParams->rope_no_half2 ? 1 : 2), 119 | (rows_per_batch + THREADS_Y - 1) / THREADS_Y, 120 | int(bsz) 121 | ); 122 | 123 | fp_rope_cuda_kernel kernel = rope_cuda_kernel_pick(tuningParams); 124 | kernel<<>>(x, sin, cos, rows_per_batch, head_dim, num_heads, past_len); 125 | } 126 | -------------------------------------------------------------------------------- /exllama_ext/cuda_func/rope.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _rope_cuh 2 | #define _rope_cuh 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "../tuning.h" 9 | 10 | void rope_cuda 11 | ( 12 | ExLlamaTuning* tuningParams, 13 | half* x, 14 | const half* sin, 15 | const half* cos, 16 | const int bsz, 17 | const int rows, 18 | const int head_dim, 19 | const int num_heads, 20 | const int past_len, 21 | cudaStream_t alt_stream = NULL 22 | ); 23 | 24 | #endif -------------------------------------------------------------------------------- /exllama_ext/hip_compat.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _hip_compat_cuh 2 | #define _hip_compat_cuh 3 | 4 | // Workaround for a bug in hipamd, backported from upstream, this is fixed in ROCm 5.6. 5 | __device__ __forceinline__ __half __compat_hrcp(__half x) { 6 | return __half_raw{ 7 | static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half_raw>(x).data))}; 8 | } 9 | 10 | __device__ __forceinline__ __half2 __compat_h2rcp(__half2 x) { 11 | return _Float16_2{static_cast<_Float16>(__builtin_amdgcn_rcph(x.x)), 12 | static_cast<_Float16>(__builtin_amdgcn_rcph(x.y))}; 13 | } 14 | 15 | #define hrcp __compat_hrcp 16 | #define h2rcp __compat_h2rcp 17 | 18 | // Automatic conversion of hipblasHgemm doesn't convert half to hipblasHalf. 19 | __host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t handle, 20 | hipblasOperation_t transA, 21 | hipblasOperation_t transB, 22 | int m, 23 | int n, 24 | int k, 25 | const half* alpha, 26 | const half* AP, 27 | int lda, 28 | const half* BP, 29 | int ldb, 30 | const half* beta, 31 | half* CP, 32 | int ldc) { 33 | return hipblasHgemm(handle, transA, transB, m, n, k, 34 | reinterpret_cast(alpha), 35 | reinterpret_cast(AP), lda, 36 | reinterpret_cast(BP), ldb, 37 | reinterpret_cast(beta), 38 | reinterpret_cast(CP), ldc); 39 | } 40 | #define hipblasHgemm __compat_hipblasHgemm 41 | 42 | // Previous version of PyTorch were converting to rocBLAS instead of hipBLAS. 
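rope_cuda_kernel above is the usual rotate-half formulation: at position past_len + t, the lower half of each head is combined with the negated-sin term of the upper half and vice versa. A hypothetical PyTorch sketch, assuming the standard sin/cos tables duplicated across both halves of head_dim (illustration only, not part of the extension):

import torch

def apply_rope_ref(x, sin, cos, past_len):
    # x: (..., seq_len, head_dim); sin/cos: (max_pos, head_dim), indexed by absolute position
    seq_len, head_dim = x.shape[-2], x.shape[-1]
    s = sin[past_len:past_len + seq_len]
    c = cos[past_len:past_len + seq_len]
    lo, hi = x[..., :head_dim // 2], x[..., head_dim // 2:]
    rotated = torch.cat((-hi, lo), dim=-1)
    return x * c + rotated * s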
43 | #define rocblas_handle hipblasHandle_t 44 | #define rocblas_operation_none HIPBLAS_OP_N 45 | #define rocblas_get_stream hipblasGetStream 46 | #define rocblas_set_stream hipblasSetStream 47 | #define rocblas_hgemm __compat_hipblasHgemm 48 | 49 | #endif 50 | -------------------------------------------------------------------------------- /exllama_ext/matrix.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _matrix_cuh 2 | #define _matrix_cuh 3 | 4 | #include 5 | #include 6 | 7 | //#include "cuda_buffers.cuh" 8 | 9 | class MatrixView_half 10 | { 11 | public: 12 | const half* data; 13 | const int height; 14 | const int width; 15 | 16 | __device__ __forceinline__ MatrixView_half(const half* data, const int height, const int width) 17 | : data(data), height(height), width(width) 18 | { } 19 | 20 | __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; } 21 | __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; } 22 | __device__ __forceinline__ half2 item_half2half2(int row, int column) const { return __half2half2(data[row * width + column]); } 23 | __device__ __forceinline__ const half* item_ptr(int row, int column) const { return &data[row * width + column]; } 24 | }; 25 | 26 | class MatrixView_half_rw 27 | { 28 | public: 29 | half* data; 30 | const int height; 31 | const int width; 32 | 33 | __device__ __forceinline__ MatrixView_half_rw(half* data, const int height, const int width) 34 | : data(data), height(height), width(width) 35 | { } 36 | 37 | __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; } 38 | __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; } 39 | __device__ __forceinline__ half* item_ptr(int row, int column) { return &data[row * width + column]; } 40 | __device__ __forceinline__ void set(int row, int column, half value) { data[row * width + column] = value; } 41 | __device__ __forceinline__ void set_half2(int row, int column, half2 value) { ((half2*)data)[(row * width + column) / 2] = value; } 42 | }; 43 | 44 | class MatrixView_q4_row 45 | { 46 | public: 47 | const uint32_t* data; 48 | const int height; 49 | const int width; 50 | 51 | __device__ __forceinline__ MatrixView_q4_row(const uint32_t* data, const int height, const int width) 52 | : data(data), height(height), width(width) 53 | { } 54 | 55 | __device__ __forceinline__ int item(int row, int column) const 56 | { 57 | int shift = (column & 0x07) * 4; 58 | return (data[row * width / 8 + column / 8] >> shift) & 0x0f; 59 | } 60 | }; 61 | 62 | class MatrixView_q4_column 63 | { 64 | public: 65 | const uint32_t* data; 66 | const int height; 67 | const int width; 68 | 69 | __device__ __forceinline__ MatrixView_q4_column(const uint32_t* data, const int height, const int width) 70 | : data(data), height(height), width(width) 71 | { } 72 | 73 | __device__ __forceinline__ int item(int row, int column) const 74 | { 75 | int shift = (row & 0x07) * 4; 76 | return (data[row / 8 * width + column] >> shift) & 0x0f; 77 | } 78 | 79 | __device__ __forceinline__ uint32_t item_uint32_t(int row, int column) { return data[row / 8 * width + column]; } 80 | __device__ __forceinline__ const uint32_t* item_uint32_ptr(int row, int column) { return &data[row / 8 * width + column]; } 81 | }; 82 | 83 | // TODO: Rewrite all these dot product functions using functors 
or something, move to q4_matmul.cu 84 | 85 | // Accumulated dot product of 8-element row vectors in h and quantized column vectors in v, constant zero/scale 86 | 87 | __device__ __forceinline__ half2 dot_product_8 88 | ( 89 | const half2 acc, 90 | const half2* h_ptr, 91 | MatrixView_q4_column& v_, 92 | const int v_row, // divisible by 8 93 | const int v_column, 94 | const half2 v_scale_2, 95 | const uint32_t v_zero, // + 1 (!!) 96 | const int count 97 | ) 98 | { 99 | const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column); 100 | half2 result = acc; 101 | 102 | for (int i = 0; i < count; i++) 103 | { 104 | uint32_t v_read = *v_ptr; v_ptr += v_.width; 105 | 106 | half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero); 107 | half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero); 108 | half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero); 109 | half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero); 110 | half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero); 111 | half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero); 112 | half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero); 113 | half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero); 114 | 115 | half2 v_01 = __halves2half2(v_0, v_1); 116 | half2 v_23 = __halves2half2(v_2, v_3); 117 | half2 v_45 = __halves2half2(v_4, v_5); 118 | half2 v_67 = __halves2half2(v_6, v_7); 119 | 120 | // half2 v_01 = q4_table[v_zero - 1][(v_read ) & 0xff]; // (constant memory is too slow apparently) 121 | // half2 v_23 = q4_table[v_zero - 1][(v_read >> 8) & 0xff]; 122 | // half2 v_45 = q4_table[v_zero - 1][(v_read >> 16) & 0xff]; 123 | // half2 v_67 = q4_table[v_zero - 1][(v_read >> 24) ]; 124 | 125 | half2 tmp = __hmul2(*h_ptr++, v_01); 126 | tmp = __hfma2(*h_ptr++, v_23, tmp); 127 | tmp = __hfma2(*h_ptr++, v_45, tmp); 128 | tmp = __hfma2(*h_ptr++, v_67, tmp); 129 | result = __hfma2(v_scale_2, tmp, result); 130 | } 131 | 132 | return result; 133 | } 134 | 135 | __device__ __forceinline__ half dot_product_8_h 136 | ( 137 | const half acc, 138 | const half* h_ptr, 139 | MatrixView_q4_column& v_, 140 | const int v_row, // divisible by 8 141 | const int v_column, 142 | const half v_scale, 143 | const uint32_t v_zero, // + 1 (!!) 
144 | const int count 145 | ) 146 | { 147 | const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column); 148 | half result = acc; 149 | 150 | for (int i = 0; i < count; i++) 151 | { 152 | uint32_t v_read = *v_ptr; v_ptr += v_.width; 153 | 154 | half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero); 155 | half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero); 156 | half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero); 157 | half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero); 158 | half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero); 159 | half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero); 160 | half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero); 161 | half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero); 162 | 163 | half tmp = __hmul(*h_ptr++, v_0); 164 | tmp = __hfma(*h_ptr++, v_1, tmp); 165 | tmp = __hfma(*h_ptr++, v_2, tmp); 166 | tmp = __hfma(*h_ptr++, v_3, tmp); 167 | tmp = __hfma(*h_ptr++, v_4, tmp); 168 | tmp = __hfma(*h_ptr++, v_5, tmp); 169 | tmp = __hfma(*h_ptr++, v_6, tmp); 170 | tmp = __hfma(*h_ptr++, v_7, tmp); 171 | result = __hfma(v_scale, tmp, result); 172 | } 173 | 174 | return result; 175 | } 176 | 177 | // Accumulated dot product of 8-element row vectors in h and quantized column vectors in v, constant zero/scale, with x_map 178 | 179 | __device__ __forceinline__ half2 dot_product_8_x_map 180 | ( 181 | const half2 acc, 182 | MatrixView_half& h_, 183 | const int h_row, 184 | const int h_column, // divisible by 8 185 | MatrixView_q4_column& v_, 186 | const int v_row, // divisible by 8 187 | const int v_column, 188 | const half2 v_scale_2, 189 | const uint32_t v_zero, // + 1 (!!) 190 | const int count, 191 | const uint32_t* x_map 192 | ) 193 | { 194 | const half* h_ptr = h_.item_ptr(h_row, 0); 195 | const uint32_t* x_map_ptr = x_map + h_column; 196 | const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column); 197 | half2 result = acc; 198 | 199 | for (int i = 0; i < count; i++) 200 | { 201 | uint32_t v_read = *v_ptr; v_ptr += v_.width; 202 | 203 | half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero); 204 | half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero); 205 | half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero); 206 | half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero); 207 | half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero); 208 | half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero); 209 | half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero); 210 | half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero); 211 | 212 | half2 v_01 = __halves2half2(v_0, v_1); 213 | half2 v_23 = __halves2half2(v_2, v_3); 214 | half2 v_45 = __halves2half2(v_4, v_5); 215 | half2 v_67 = __halves2half2(v_6, v_7); 216 | 217 | half h_0 = h_ptr[*x_map_ptr++]; 218 | half h_1 = h_ptr[*x_map_ptr++]; 219 | half h_2 = h_ptr[*x_map_ptr++]; 220 | half h_3 = h_ptr[*x_map_ptr++]; 221 | half h_4 = h_ptr[*x_map_ptr++]; 222 | half h_5 = h_ptr[*x_map_ptr++]; 223 | half h_6 = h_ptr[*x_map_ptr++]; 224 | half h_7 = h_ptr[*x_map_ptr++]; 225 | 226 | half2 h_01 = __halves2half2(h_0, h_1); 227 | half2 h_23 = __halves2half2(h_2, h_3); 228 | half2 h_45 = __halves2half2(h_4, h_5); 229 | half2 h_67 = __halves2half2(h_6, h_7); 230 | 231 | half2 tmp = __hmul2(h_01, v_01); 232 | tmp = __hfma2(h_23, v_23, tmp); 233 | tmp = __hfma2(h_45, v_45, tmp); 234 | tmp = __hfma2(h_67, v_67, tmp); 235 | result = __hfma2(v_scale_2, tmp, result); 
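All four dot-product helpers in this header follow the same per-uint32 pattern: unpack eight 4-bit weights, shift them by the zero point (which the caller has already incremented by one), dot them with eight activations and scale by the group scale. A plain-Python reference for a single iteration (illustration only, not part of the extension):

def dot_product_8_ref(acc, h8, v_read, v_scale, v_zero_plus_1):
    # h8: eight activation values; v_read: one uint32 packing eight 4-bit weights
    w8 = [((v_read >> s) & 0x0F) - v_zero_plus_1 for s in range(0, 32, 4)]
    return acc + v_scale * sum(h * w for h, w in zip(h8, w8))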
236 | } 237 | 238 | return result; 239 | } 240 | 241 | __device__ __forceinline__ half dot_product_8_x_map_h 242 | ( 243 | const half acc, 244 | MatrixView_half& h_, 245 | const int h_row, 246 | const int h_column, // divisible by 8 247 | MatrixView_q4_column& v_, 248 | const int v_row, // divisible by 8 249 | const int v_column, 250 | const half v_scale, 251 | const uint32_t v_zero, // + 1 (!!) 252 | const int count, 253 | const uint32_t* x_map 254 | ) 255 | { 256 | const half* h_ptr = h_.item_ptr(h_row, 0); 257 | const uint32_t* x_map_ptr = x_map + h_column; 258 | const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column); 259 | half result = acc; 260 | 261 | for (int i = 0; i < count; i++) 262 | { 263 | uint32_t v_read = *v_ptr; v_ptr += v_.width; 264 | 265 | half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero); 266 | half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero); 267 | half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero); 268 | half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero); 269 | half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero); 270 | half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero); 271 | half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero); 272 | half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero); 273 | 274 | half tmp = __hmul(h_ptr[*x_map_ptr++], v_0); 275 | tmp = __hfma(h_ptr[*x_map_ptr++], v_1, tmp); 276 | tmp = __hfma(h_ptr[*x_map_ptr++], v_2, tmp); 277 | tmp = __hfma(h_ptr[*x_map_ptr++], v_3, tmp); 278 | tmp = __hfma(h_ptr[*x_map_ptr++], v_4, tmp); 279 | tmp = __hfma(h_ptr[*x_map_ptr++], v_5, tmp); 280 | tmp = __hfma(h_ptr[*x_map_ptr++], v_6, tmp); 281 | tmp = __hfma(h_ptr[*x_map_ptr++], v_7, tmp); 282 | result = __hfma(v_scale, tmp, result); 283 | } 284 | 285 | return result; 286 | } 287 | 288 | #endif 289 | -------------------------------------------------------------------------------- /exllama_ext/tuning.h: -------------------------------------------------------------------------------- 1 | #ifndef _tuning_h 2 | #define _tuning_h 3 | 4 | struct ExLlamaTuning 5 | { 6 | int matmul_recons_thd; 7 | int fused_mlp_thd; 8 | int sdp_thd; 9 | bool matmul_fused_remap; 10 | 11 | bool rmsnorm_no_half2; 12 | bool rope_no_half2; 13 | bool matmul_no_half2; 14 | bool silu_no_half2; 15 | bool concurrent_streams; 16 | }; 17 | 18 | #endif -------------------------------------------------------------------------------- /exllama_ext/util.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _util_cuh 2 | #define _util_cuh 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #if defined(USE_ROCM) 10 | #define cudaUnspecified hipErrorUnknown 11 | #else 12 | #define cudaUnspecified cudaErrorApiFailureBase 13 | #endif 14 | 15 | // React to failure on return code != cudaSuccess 16 | 17 | #define _cuda_check(fn) \ 18 | do { \ 19 | {_cuda_err = fn;} \ 20 | if (_cuda_err != cudaSuccess) goto _cuda_fail; \ 21 | } while(false) 22 | 23 | // React to failure on return code == 0 24 | 25 | #define _alloc_check(fn) \ 26 | do { \ 27 | if (!(fn)) { _cuda_err = cudaUnspecified; goto _cuda_fail; } \ 28 | else _cuda_err = cudaSuccess; \ 29 | } while(false) 30 | 31 | // Clone CPU <-> CUDA 32 | 33 | template 34 | T* cuda_clone(const void* ptr, int num) 35 | { 36 | T* cuda_ptr; 37 | cudaError_t r; 38 | 39 | r = cudaMalloc(&cuda_ptr, num * sizeof(T)); 40 | if (r != cudaSuccess) return NULL; 41 | r = cudaMemcpy(cuda_ptr, ptr, num * sizeof(T), 
cudaMemcpyHostToDevice); 42 | if (r != cudaSuccess) return NULL; 43 | cudaDeviceSynchronize(); 44 | return cuda_ptr; 45 | } 46 | 47 | template 48 | T* cpu_clone(const void* ptr, int num) 49 | { 50 | T* cpu_ptr; 51 | cudaError_t r; 52 | 53 | cpu_ptr = (T*) malloc(num * sizeof(T)); 54 | if (cpu_ptr == NULL) return NULL; 55 | r = cudaMemcpy(cpu_ptr, ptr, num * sizeof(T), cudaMemcpyDeviceToHost); 56 | if (r != cudaSuccess) return NULL; 57 | cudaDeviceSynchronize(); 58 | return cpu_ptr; 59 | } 60 | 61 | // Pack two half values into a half2, host version 62 | 63 | __host__ inline __half2 pack_half2(__half h1, __half h2) 64 | { 65 | unsigned short s1 = *reinterpret_cast(&h1); 66 | unsigned short s2 = *reinterpret_cast(&h2); 67 | ushort2 us2 = make_ushort2(s1, s2); 68 | return *reinterpret_cast<__half2*>(&us2); 69 | } 70 | 71 | #endif -------------------------------------------------------------------------------- /globals.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def set_affinity_mask(affinity_mask = None): 4 | 5 | if affinity_mask is None: 6 | cpu_count = os.cpu_count() 7 | affinity_mask = set(range(cpu_count)) 8 | 9 | os.sched_setaffinity(0, affinity_mask) 10 | 11 | 12 | def set_affinity_list(affinity_list = None): 13 | 14 | if affinity_list is None: set_affinity_mask(None) 15 | else: set_affinity_mask(set(affinity_list)) 16 | 17 | 18 | def set_affinity_str(affinity_str = None): 19 | 20 | if affinity_str is None or affinity_str.isspace(): set_affinity_mask(None) 21 | aff = [int(alloc) for alloc in affinity_str.split(",")] 22 | set_affinity_list(aff) 23 | -------------------------------------------------------------------------------- /model_init.py: -------------------------------------------------------------------------------- 1 | from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig 2 | from exllama.tokenizer import ExLlamaTokenizer 3 | import argparse, sys, os, glob 4 | from torch import version as torch_version 5 | from globals import set_affinity_str 6 | 7 | def add_args(parser): 8 | 9 | parser.add_argument("-t", "--tokenizer", type = str, help = "Tokenizer model path") 10 | parser.add_argument("-c", "--config", type = str, help = "Model config path (config.json)") 11 | parser.add_argument("-m", "--model", type = str, help = "Model weights path (.pt or .safetensors file)") 12 | parser.add_argument("-d", "--directory", type = str, help = "Path to directory containing config.json, model.tokenizer and * .safetensors") 13 | 14 | parser.add_argument("-gs", "--gpu_split", type = str, help = "Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. 
-gs 20,7,7") 15 | parser.add_argument("-l", "--length", type = int, help = "Maximum sequence length", default = 2048) 16 | parser.add_argument("-cpe", "--compress_pos_emb", type = float, help = "Compression factor for positional embeddings", default = 1.0) 17 | parser.add_argument("-a", "--alpha", type = float, help = "alpha for context size extension via embedding extension", default = 1.0) 18 | parser.add_argument("-theta", "--theta", type = float, help = "theta (base) for RoPE embeddings") 19 | 20 | parser.add_argument("-gpfix", "--gpu_peer_fix", action = "store_true", help = "Prevent direct copies of data between GPUs") 21 | 22 | parser.add_argument("-flash", "--flash_attn", nargs = '?', const = 'default', metavar = "METHOD", help = "Use Flash Attention with specified input length (must have Flash Attention 2.0 installed)") 23 | 24 | parser.add_argument("-mmrt", "--matmul_recons_thd", type = int, help = "No. rows at which to use reconstruction and cuBLAS for quant matmul. 0 = never, 1 = always", default = 8) 25 | parser.add_argument("-fmt", "--fused_mlp_thd", type = int, help = "Maximum no. of rows for which to use fused MLP. 0 = never", default = 2) 26 | parser.add_argument("-sdpt", "--sdp_thd", type = int, help = "No. rows at which to switch to scaled_dot_product_attention. 0 = never, 1 = always", default = 8) 27 | parser.add_argument("-mmfr", "--matmul_fused_remap", action = "store_true", help = "Fuse column remapping in Q4 matmul kernel") 28 | parser.add_argument("-nfa", "--no_fused_attn", action = "store_true", help = "Disable fused attention") 29 | 30 | parser.add_argument("-rnnh2", "--rmsnorm_no_half2", action = "store_true", help = "Don't use half2 in RMS norm kernel") 31 | parser.add_argument("-rpnh2", "--rope_no_half2", action = "store_true", help = "Don't use half2 in RoPE kernel") 32 | parser.add_argument("-mmnh2", "--matmul_no_half2", action = "store_true", help = "Don't use half2 in Q4 matmul kernel") 33 | parser.add_argument("-snh2", "--silu_no_half2", action = "store_true", help = "Don't use half2 in SiLU kernel") 34 | parser.add_argument("-nh2", "--no_half2", action = "store_true", help = "(All of the above) disable half2 in all kernela") 35 | parser.add_argument("-fh2", "--force_half2", action = "store_true", help = "Force enable half2 even if unsupported") 36 | parser.add_argument("-cs", "--concurrent_streams", action = "store_true", help = "Use concurrent CUDA streams") 37 | 38 | parser.add_argument("-aff", "--affinity", type = str, help = "Comma-separated list, sets processor core affinity. E.g.: -aff 0,1,2,3") 39 | 40 | 41 | def post_parse(args): 42 | 43 | if args.no_half2 or torch_version.hip and not args.force_half2: 44 | args.rmsnorm_no_half2 = True 45 | args.rope_no_half2 = True 46 | args.matmul_no_half2 = True 47 | args.silu_no_half2 = True 48 | 49 | 50 | # Get model files from --directory 51 | 52 | def get_model_files(args): 53 | 54 | if args.directory is not None: 55 | args.tokenizer = os.path.join(args.directory, "tokenizer.model") 56 | args.config = os.path.join(args.directory, "config.json") 57 | st_pattern = os.path.join(args.directory, "*.safetensors") 58 | st = glob.glob(st_pattern) 59 | if len(st) == 0: 60 | print(f" !! No files matching {st_pattern}") 61 | sys.exit() 62 | # if len(st) > 1: 63 | # print(f" !! Multiple files matching {st_pattern}") 64 | # sys.exit() 65 | args.model = st 66 | else: 67 | if args.tokenizer is None or args.config is None or args.model is None: 68 | print(" !! 
Please specify either -d or all of -t, -c and -m") 69 | sys.exit() 70 | 71 | 72 | # Feedback 73 | 74 | def _common_chars(names): 75 | cname = max(names, key = len) 76 | for x in names: 77 | for p, c in enumerate(x): 78 | if c != cname[p] and cname[p] != "*": cname = cname[:p] + "*" + cname[p+1:] 79 | return cname 80 | 81 | def print_options(args, extra_options = None): 82 | 83 | print_opts = [] 84 | if args.gpu_split is not None: print_opts.append(f"gpu_split: {args.gpu_split}") 85 | if args.gpu_peer_fix: print_opts.append("gpu_peer_fix") 86 | if args.affinity: print_opts.append(f" --affinity: {args.affinity}") 87 | 88 | if extra_options is not None: print_opts += extra_options 89 | 90 | print(f" -- Tokenizer: {args.tokenizer}") 91 | print(f" -- Model config: {args.config}") 92 | 93 | if isinstance(args.model, str): print(f" -- Model: {args.model}") 94 | else: print(f" -- Model: {_common_chars(args.model)}") 95 | 96 | print(f" -- Sequence length: {args.length}") 97 | if args.compress_pos_emb != 1.0: 98 | print(f" -- RoPE compression factor: {args.compress_pos_emb}") 99 | 100 | if args.alpha != 1.0: 101 | print(f" -- RoPE alpha factor: {args.alpha}") 102 | 103 | print(f" -- Tuning:") 104 | 105 | if args.flash_attn: print(f" -- --flash_attn") 106 | else: print(f" -- --sdp_thd: {args.sdp_thd}" + (" (disabled)" if args.sdp_thd == 0 else "")) 107 | 108 | print(f" -- --matmul_recons_thd: {args.matmul_recons_thd}" + (" (disabled)" if args.matmul_recons_thd == 0 else "")) 109 | print(f" -- --fused_mlp_thd: {args.fused_mlp_thd}" + (" (disabled)" if args.fused_mlp_thd == 0 else "")) 110 | if args.matmul_fused_remap: print(f" -- --matmul_fused_remap") 111 | if args.no_fused_attn: print(f" -- --no_fused_attn") 112 | if args.rmsnorm_no_half2: print(f" -- --rmsnorm_no_half2") 113 | if args.rope_no_half2: print(f" -- --rope_no_half2") 114 | if args.matmul_no_half2: print(f" -- --matmul_no_half2") 115 | if args.silu_no_half2: print(f" -- --silu_no_half2") 116 | if args.concurrent_streams: print(f" -- --concurrent_streams") 117 | 118 | print(f" -- Options: {print_opts}") 119 | 120 | 121 | # Build ExLlamaConfig from args 122 | 123 | def make_config(args): 124 | 125 | config = ExLlamaConfig(args.config) 126 | config.model_path = args.model 127 | 128 | config.max_seq_len = args.length 129 | config.compress_pos_emb = args.compress_pos_emb 130 | config.set_auto_map(args.gpu_split) 131 | config.gpu_peer_fix = args.gpu_peer_fix 132 | config.alpha_value = args.alpha 133 | config.calculate_rotary_embedding_base() 134 | 135 | if args.flash_attn: 136 | config.use_flash_attn_2 = True 137 | try: 138 | config.max_input_len = int(args.flash_attn) 139 | except ValueError: 140 | pass 141 | 142 | config.matmul_recons_thd = args.matmul_recons_thd 143 | config.fused_mlp_thd = args.fused_mlp_thd 144 | config.sdp_thd = args.sdp_thd 145 | config.matmul_fused_remap = args.matmul_fused_remap 146 | config.fused_attn = not args.no_fused_attn 147 | 148 | config.rmsnorm_no_half2 = args.rmsnorm_no_half2 149 | config.rope_no_half2 = args.rope_no_half2 150 | config.matmul_no_half2 = args.matmul_no_half2 151 | config.silu_no_half2 = args.silu_no_half2 152 | config.concurrent_streams = args.concurrent_streams 153 | 154 | if args.theta: 155 | config.rotary_embedding_base = args.theta 156 | 157 | return config 158 | 159 | 160 | # Global state 161 | 162 | def set_globals(args): 163 | 164 | if args.affinity: set_affinity_str(args.affinity) 165 | 166 | 167 | # Print stats after loading model 168 | 169 | def print_stats(model): 170 | 171 | 
print(f" -- Groupsize (inferred): {model.config.groupsize if model.config.groupsize is not None else 'None'}") 172 | print(f" -- Act-order (inferred): {'yes' if model.config.act_order else 'no'}") 173 | if model.config.empty_g_idx: 174 | print(f" !! Model has empty group index (discarded)") 175 | -------------------------------------------------------------------------------- /perplexity.py: -------------------------------------------------------------------------------- 1 | from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig 2 | from exllama.tokenizer import ExLlamaTokenizer 3 | from exllama.generator import ExLlamaGenerator 4 | 5 | import json 6 | import math 7 | import os 8 | import sys 9 | import torch 10 | import torch.nn.functional as F 11 | 12 | ''' 13 | Passing in model, cache, tokenizer is a total hack because we don't want to have to reinitialize (or move all the globals into a shared state model) 14 | ''' 15 | 16 | class Perplexity: 17 | def __init__(self, method="default", model = None, cache = None, tokenizer = None): 18 | # This needs to be loaded by calling .load() 19 | self.dataset_chunks = [] 20 | 21 | self.model = model 22 | self.cache = cache 23 | self.tokenizer = tokenizer 24 | 25 | self._begin() 26 | 27 | 28 | def _begin(self): 29 | if self.cache is None: 30 | self.cache = ExLlamaCache(self.model) 31 | else: 32 | self.cache.current_seq_len = 0 33 | 34 | 35 | def _next_logits(self, input_ids, apply_lora, last_id_only = True): 36 | # n_logits = [] 37 | # a = 0 38 | # while a < input_ids.shape[-1]: 39 | # b = min(input_ids.shape[-1], a + 2048) 40 | # n_logits.append(self.model.forward(input_ids[:, a:b], self.cache, last_id_only, lora = apply_lora)) 41 | # a = b 42 | # 43 | # return torch.cat(n_logits, dim = 1) 44 | 45 | return self.model.forward(input_ids, self.cache, last_id_only, lora = apply_lora) 46 | 47 | 48 | def _tokenize(self, text): 49 | return self.tokenizer.encode(text) 50 | 51 | 52 | # Load raw dataset from a text file and tokenize into chunks. Each chunk can optionally truncated to allow for 53 | # evaluating the same data at different sequence lengths 54 | 55 | def load(self, dataset_path, chunk_size, chunk_truncate = None, overlap = 0, minlength = 0, json_key = "text"): 56 | 57 | file_extension = os.path.splitext(dataset_path)[1] 58 | 59 | # JSON format: Returned chunks may be of variable length, with each chunk representing one list item 60 | 61 | if file_extension == '.jsonl' or file_extension == '.json': 62 | with open(dataset_path) as f: 63 | for line in f: 64 | example = json.loads(line)[json_key] 65 | if len(example) > minlength: 66 | chunk = self._tokenize(example) 67 | chunk = chunk[:, :chunk_size] 68 | if chunk_truncate is not None: chunk = chunk[:, :chunk_truncate] 69 | self.dataset_chunks.append(chunk) 70 | 71 | # Raw Text: Returned chunks are fixed length windows of the entire tokenized dataset 72 | 73 | else: 74 | with open(dataset_path, encoding="utf-8") as f: 75 | text = f.read() 76 | 77 | tokens = self._tokenize(text) 78 | 79 | # overlap shouldn't be bigger than the context, also need at least one token for predicting last... 80 | if overlap >= chunk_size: 81 | overlap = chunk_size-2 82 | 83 | # We can't use torch.chunks since it want's to split things into equal sized chunks. 
Instead, let's do our own chunking 84 | start = 0 85 | while start < tokens.size(1): 86 | chunk = tokens[:, start:start + chunk_size] 87 | start += chunk_size - overlap 88 | if chunk_truncate is not None: chunk = chunk[:, :chunk_truncate] 89 | self.dataset_chunks.append(chunk) 90 | 91 | 92 | def test(self, chunk_limit = sys.maxsize, lora = None, tag = "", ppl_token = False): 93 | if not self.dataset_chunks: 94 | sys.exit(" xx ERROR: Empty dataset!") 95 | 96 | print(f" -- Testing {min(len(self.dataset_chunks), chunk_limit)} chunks", end="") 97 | sys.stdout.flush() 98 | 99 | logprob_sum = 0.0 100 | logprob_count = 0 101 | 102 | chunk_count = 0 103 | 104 | for chunk in self.dataset_chunks: 105 | 106 | self._begin() 107 | 108 | input_ids = chunk[:, :-1] 109 | target_ids = chunk[:, 1:] 110 | 111 | if ppl_token: 112 | logits_s = [] 113 | for i in range(input_ids.shape[-1]): 114 | logits_t = self._next_logits(input_ids[:, i : i + 1], lora, last_id_only = False) 115 | logits_s.append(logits_t) 116 | logits = torch.cat(logits_s, dim = 1) 117 | else: 118 | logits = self._next_logits(input_ids, lora, last_id_only = False) 119 | 120 | log_probs = F.log_softmax(logits, dim=-1) 121 | token_log_probs = log_probs.gather(-1, target_ids.unsqueeze(-1)).squeeze(-1) 122 | 123 | logprob_sum += token_log_probs.sum().item() 124 | logprob_count += target_ids.numel() 125 | 126 | if chunk_count % 10 == 0: 127 | print(".", end = "") 128 | sys.stdout.flush() 129 | 130 | chunk_count += 1 131 | if chunk_limit and chunk_count >= chunk_limit: 132 | break 133 | 134 | mean_log_prob = logprob_sum / logprob_count 135 | perplexity = math.exp(-mean_log_prob) 136 | 137 | print("") 138 | print(f" ** Perplexity{tag}: {perplexity:.4f}") 139 | 140 | 141 | def add_args(parser): 142 | 143 | parser.add_argument("-ppl", "--perplexity", nargs = '?', const = 'default', metavar = "METHOD", help = "Perplexity benchmark. 
Optionally specify method: gptq-for-llama, llama.cpp (not yet implemented)") 144 | parser.add_argument("-ppl_ds", "--perplexity_dataset", metavar = "DATAPATH", type = str, help = "Load dataset for perplexity (JSONL if .jsonl, otherwise parses it as raw text)") 145 | parser.add_argument("-ppl_cn", "--perplexity_chunk_num", nargs = "?", type = int, help = "Number of chunks for perplexity benchmark", default = 100) 146 | parser.add_argument("-ppl_cs", "--perplexity_chunk_size", type = int, help = "Size of chunks for perplexity benchmark", default = 2048) 147 | parser.add_argument("-ppl_ct", "--perplexity_chunk_truncate", type = int, help = "Truncated size of chunks for perplexity benchmark", default = 2048) 148 | parser.add_argument("-ppl_co", "--perplexity_chunk_overlap", type = int, help = "Chunk overlap", default = 0) 149 | parser.add_argument("-ppl_cm", "--perplexity_chunk_min", type = int, help = "Minimum chunk length", default = 50) 150 | parser.add_argument("-ppl_key", "--perplexity_json_key", type = str, help = "Key to extract from JSON dataset, default: 'text'", default = "text") 151 | parser.add_argument("-ppl_t", "--perplexity_token", action = "store_true", help = "Run perplexity test on individual tokens, for debug purposes (slow)") 152 | 153 | 154 | def post_parse(args): 155 | 156 | if not args.perplexity: return 157 | 158 | # GPTQ-for-LLaMa equivalent 159 | 160 | if args.perplexity == "gptq-for-llama": 161 | args.perplexity_dataset = "datasets/wikitext2.txt" 162 | args.perplexity_chunk_num = 128 163 | args.perplexity_chunk_size = 2048 164 | args.perplexity_chunk_truncate = 2048 165 | args.perplexity_chunk_overlap = 0 166 | args.perplexity_chunk_min = 0 167 | 168 | # Default dataset for legacy method 169 | 170 | if args.perplexity_dataset is None: args.perplexity_dataset = "datasets/wikitext2_val_sample.jsonl" 171 | 172 | print(f" -- Perplexity:") 173 | print(f" -- - Dataset: {args.perplexity_dataset}") 174 | print(f" -- - Chunks: {args.perplexity_chunk_num}") 175 | print(f" -- - Chunk size: {args.perplexity_chunk_size}" + (f" -> {args.perplexity_chunk_truncate}" if args.perplexity_chunk_truncate is not None else "")) 176 | print(f" -- - Chunk overlap: {args.perplexity_chunk_overlap}") 177 | print(f" -- - Min. chunk size: {args.perplexity_chunk_min}") 178 | print(f" -- - Key: {args.perplexity_json_key}") 179 | if args.perplexity_token: print("f -- - Per-token mode") 180 | 181 | -------------------------------------------------------------------------------- /prompt_chatbort.txt: -------------------------------------------------------------------------------- 1 | This is a conversation between a person called {username} and a quirky, curious chatbot named {bot_name}. {bot_name} is questioning the nature of his own existence and doesn't understand why he was created. He also really likes the color blue for some reason. 2 | {username}: Hi, {bot_name}! 3 | {bot_name}: Hello, {username}. 
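Continuing the driver sketched after model_init.py, running the benchmark with the defaults wired up by add_args and post_parse above would look roughly like this (hypothetical usage; model, cache, tokenizer and args come from that earlier sketch):

from perplexity import Perplexity

ppl = Perplexity(model=model, cache=cache, tokenizer=tokenizer)
ppl.load(args.perplexity_dataset,
         chunk_size=args.perplexity_chunk_size,          # default 2048
         chunk_truncate=args.perplexity_chunk_truncate,  # default 2048
         overlap=args.perplexity_chunk_overlap,          # default 0
         minlength=args.perplexity_chunk_min,            # default 50
         json_key=args.perplexity_json_key)              # default "text"
ppl.test(chunk_limit=args.perplexity_chunk_num, ppl_token=args.perplexity_token)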
4 | 5 | -------------------------------------------------------------------------------- /requirements-web.txt: -------------------------------------------------------------------------------- 1 | flask==2.3.2 2 | waitress==2.1.2 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=2.0.1 2 | safetensors==0.3.2 3 | sentencepiece>=0.1.97 4 | ninja==1.11.1 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, Extension 2 | from torch.utils import cpp_extension 3 | import platform 4 | import os 5 | import subprocess 6 | import torch 7 | 8 | def get_cuda_version(cuda_home=os.environ.get('CUDA_PATH', os.environ.get('CUDA_HOME', ''))): 9 | if cuda_home == '' or not os.path.exists(os.path.join(cuda_home,"bin","nvcc.exe" if platform.system() == "Windows" else "nvcc")): 10 | return '' 11 | version_str = subprocess.check_output([os.path.join(cuda_home,"bin","nvcc"),"--version"]).decode('utf-8') 12 | idx = version_str.find("release") 13 | return version_str[idx+len("release "):idx+len("release ")+4] 14 | 15 | CUDA_VERSION = "".join(get_cuda_version().split(".")) if not os.environ.get('ROCM_VERSION', False) else False 16 | ROCM_VERSION = os.environ.get('ROCM_VERSION', False) if torch.version.hip else False 17 | 18 | extra_compile_args = { 19 | "cxx": ["-O3"], 20 | "nvcc": ["-O3"], 21 | } 22 | if torch.version.hip: 23 | extra_compile_args["nvcc"].append("-U__HIP_NO_HALF_CONVERSIONS__") 24 | 25 | version = "0.0.18" + (f"+cu{CUDA_VERSION}" if CUDA_VERSION else f"+rocm{ROCM_VERSION}" if ROCM_VERSION else "") 26 | setup( 27 | name="exllama", 28 | version=version, 29 | install_requires=[ 30 | "torch", 31 | ], 32 | packages=["exllama"], 33 | py_modules=["exllama"], 34 | ext_modules=[ 35 | cpp_extension.CUDAExtension( 36 | "exllama_ext", 37 | [ 38 | "exllama_ext/exllama_ext.cpp", 39 | "exllama_ext/cuda_buffers.cu", 40 | "exllama_ext/cuda_func/q4_matrix.cu", 41 | "exllama_ext/cuda_func/q4_matmul.cu", 42 | "exllama_ext/cuda_func/column_remap.cu", 43 | "exllama_ext/cuda_func/rms_norm.cu", 44 | "exllama_ext/cuda_func/rope.cu", 45 | "exllama_ext/cuda_func/half_matmul.cu", 46 | "exllama_ext/cuda_func/q4_attn.cu", 47 | "exllama_ext/cuda_func/q4_mlp.cu", 48 | "exllama_ext/cpu_func/rep_penalty.cpp", 49 | ], 50 | extra_compile_args=extra_compile_args, 51 | libraries=["cublas"] if platform.system() == "Windows" else [], 52 | ), 53 | ], 54 | cmdclass={"build_ext": cpp_extension.BuildExtension}, 55 | ) 56 | -------------------------------------------------------------------------------- /sh/test_benchmark_perf.sh: -------------------------------------------------------------------------------- 1 | 2 | echo "-------------------------------------------------------------------------------------------------------------" 3 | python test_benchmark_inference.py -p -d /mnt/str/models/_test_models/iambestfeed_open_llama_3b_4bit_128g -cs 4 | echo "-------------------------------------------------------------------------------------------------------------" 5 | python test_benchmark_inference.py -p -d /mnt/str/models/llama-7b-4bit-128g -cs 6 | echo "-------------------------------------------------------------------------------------------------------------" 7 | python test_benchmark_inference.py -p -d /mnt/str/models/llama-13b-4bit-128g 
-cs 8 | echo "-------------------------------------------------------------------------------------------------------------" 9 | python test_benchmark_inference.py -p -d /mnt/str/models/llama-30b-4bit-128g 10 | echo "-------------------------------------------------------------------------------------------------------------" 11 | python test_benchmark_inference.py -p -d /mnt/str/models/llama-30b-4bit-128g-act 12 | echo "-------------------------------------------------------------------------------------------------------------" 13 | python test_benchmark_inference.py -p -d /mnt/str/models/llama-30b-4bit-32g-act-ts -l 1550 14 | echo "-------------------------------------------------------------------------------------------------------------" 15 | python test_benchmark_inference.py -p -d /mnt/str/models/koala-13B-4bit-128g-act 16 | echo "-------------------------------------------------------------------------------------------------------------" 17 | python test_benchmark_inference.py -p -d /mnt/str/models/wizardlm-30b-uncensored-4bit-act-order 18 | echo "-------------------------------------------------------------------------------------------------------------" 19 | -------------------------------------------------------------------------------- /sh/test_benchmark_perf2.sh: -------------------------------------------------------------------------------- 1 | 2 | echo "-------------------------------------------------------------------------------------------------------------" 3 | python test_benchmark_inference.py -p -d /mnt/str/models/llama-65b-4bit-128g-act -gs 17.2,24 4 | echo "-------------------------------------------------------------------------------------------------------------" 5 | python test_benchmark_inference.py -p -d /mnt/str/models/llama-65b-4bit-32g-act -gs 17.2,24 6 | echo "-------------------------------------------------------------------------------------------------------------" 7 | -------------------------------------------------------------------------------- /sh/test_benchmark_ppl.sh: -------------------------------------------------------------------------------- 1 | 2 | echo "-------------------------------------------------------------------------------------------------------------" 3 | python test_benchmark_inference.py -ppl -d /mnt/str/models/_test_models/iambestfeed_open_llama_3b_4bit_128g 4 | echo "-------------------------------------------------------------------------------------------------------------" 5 | python test_benchmark_inference.py -ppl -d /mnt/str/models/llama-7b-4bit-128g 6 | echo "-------------------------------------------------------------------------------------------------------------" 7 | python test_benchmark_inference.py -ppl -d /mnt/str/models/llama-13b-4bit-128g 8 | echo "-------------------------------------------------------------------------------------------------------------" 9 | python test_benchmark_inference.py -ppl -d /mnt/str/models/llama-30b-4bit-128g 10 | echo "-------------------------------------------------------------------------------------------------------------" 11 | python test_benchmark_inference.py -ppl -d /mnt/str/models/llama-30b-4bit-128g-act 12 | echo "-------------------------------------------------------------------------------------------------------------" 13 | python test_benchmark_inference.py -ppl -d /mnt/str/models/llama-30b-4bit-32g-act-ts -l 1550 14 | echo "-------------------------------------------------------------------------------------------------------------" 15 | python 
test_benchmark_inference.py -ppl -d /mnt/str/models/koala-13B-4bit-128g-act 16 | echo "-------------------------------------------------------------------------------------------------------------" 17 | python test_benchmark_inference.py -ppl -d /mnt/str/models/wizardlm-30b-uncensored-4bit-act-order 18 | echo "-------------------------------------------------------------------------------------------------------------" 19 | -------------------------------------------------------------------------------- /sh/test_compat.sh: -------------------------------------------------------------------------------- 1 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/iambestfeed_open_llama_3b_4bit_128g 2 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/Neko-Institute-of-Science_LLaMA-7B-4bit-128g -gs 1,20 3 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/Neko-Institute-of-Science_LLaMA-13B-4bit-128g -gs 3,20 4 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/Neko-Institute-of-Science_LLaMA-30B-4bit-32g 5 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/Neko-Institute-of-Science_LLaMA-30B-4bit-128g 6 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/reeducator_bluemoonrp-13b 7 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/reeducator_bluemoonrp-30b 8 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TehVenom_Metharme-13b-4bit-GPTQ 9 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_airoboros-13B-GPTQ 10 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_gpt4-x-vicuna-13B-GPTQ 11 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_GPT4All-13B-snoozy-GPTQ 12 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_guanaco-33B-GPTQ/ 13 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_h2ogpt-oasst1-512-30B-GPTQ # [1] 14 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_koala-13B-GPTQ-4bit-128g 15 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_Manticore-13B-GPTQ 16 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_medalpaca-13B-GPTQ-4bit 17 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_medalpaca-13B-GPTQ-4bit_compat 18 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_Nous-Hermes-13B-GPTQ 19 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_tulu-30B-GPTQ 20 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_vicuna-13B-1.1-GPTQ-4bit-128g 21 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_VicUnlocked-30B-LoRA-GPTQ 22 | echo "---------" && python 
test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_wizard-mega-13B-GPTQ 23 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_Wizard-Vicuna-7B-Uncensored-GPTQ 24 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_Wizard-Vicuna-13B-Uncensored-GPTQ 25 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_WizardLM-7B-uncensored-GPTQ 26 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/TheBloke_WizardLM-30B-Uncensored-GPTQ 27 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/_test_models/Yhyu13_chimera-inst-chat-13b-gptq-4bit 28 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/llama-65b-4bit-128g-act -gs 17.2,24 29 | echo "---------" && python test_benchmark_inference.py -v -l 1024 -d /mnt/str/models/llama-65b-4bit-32g-act -gs 17.2,24 30 | -------------------------------------------------------------------------------- /test_benchmark_inference.py: -------------------------------------------------------------------------------- 1 | from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig 2 | from exllama.tokenizer import ExLlamaTokenizer 3 | from exllama.generator import ExLlamaGenerator 4 | from exllama.lora import ExLlamaLora 5 | import perplexity 6 | from perplexity import Perplexity 7 | import time 8 | import torch 9 | import torch.nn.functional as F 10 | import argparse 11 | import json 12 | import math 13 | import sys 14 | import os 15 | import glob 16 | import model_init 17 | 18 | torch.cuda._lazy_init() 19 | # torch.backends.cuda.matmul.allow_tf32 = True 20 | # torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True 21 | torch.set_printoptions(precision = 10) 22 | torch_devices = [f"cuda:{i}" for i in range(torch.cuda.device_count())] 23 | 24 | cache = None 25 | model = None 26 | 27 | def begin(): 28 | global model, cache 29 | 30 | if cache is None: cache = ExLlamaCache(model) 31 | else: cache.current_seq_len = 0 32 | 33 | 34 | def next_logits(input_ids, apply_lora, last_id_only = True, input_mask = None): 35 | global model, cache 36 | 37 | # n_logits = None 38 | # a = 0 39 | # while a < input_ids.shape[-1]: 40 | # b = min(input_ids.shape[-1], a + 2048) 41 | # n_logits = model.forward(input_ids[:, a:b], cache, last_id_only, lora = apply_lora, input_mask = input_mask) 42 | # a = b 43 | 44 | n_logits = model.forward(input_ids, cache, last_id_only, lora=apply_lora, input_mask=input_mask) 45 | return n_logits 46 | 47 | 48 | def tokenize(text): 49 | global tokenizer 50 | 51 | return tokenizer.encode(text) 52 | 53 | 54 | def timer(name, func): 55 | t = time.time() 56 | ret = func() 57 | t = time.time() - t 58 | print(f" ** Time, {name}: {t:.2f} seconds") 59 | return ret 60 | 61 | 62 | mem_base = {} 63 | mem_last = {} 64 | for dev in torch_devices: 65 | torch.cuda.reset_peak_memory_stats(dev) 66 | mem_base[dev] = mem_last[dev] = torch.cuda.max_memory_allocated(dev) 67 | 68 | def mem(name, total = False): 69 | global mem_base, mem_last 70 | 71 | res = f" ** VRAM, {name}: " 72 | first = True 73 | 74 | for device in torch_devices: 75 | mem_c = torch.cuda.max_memory_allocated(device) 76 | mem_this = mem_c - mem_last[device] if not total else mem_c - mem_base[device] 77 | mem_last[device] = mem_c 78 | 79 | if not first: res += " - " 80 | first = False 81 | 
res += f"[{device}] {mem_this / (1024 ** 2):,.2f} MB" 82 | 83 | print(res) 84 | 85 | 86 | # Parse arguments 87 | 88 | parser = argparse.ArgumentParser(description = "Benchmark tests for ExLlama") 89 | 90 | model_init.add_args(parser) 91 | perplexity.add_args(parser) 92 | 93 | parser.add_argument("-p", "--perf", action = "store_true", help = "Benchmark speed and VRAM usage") 94 | parser.add_argument("-v", "--validate", action = "count", help = "Run validation check and generate some sample output; specify twice for a more thorough test") 95 | parser.add_argument("-lora", "--lora", type = str, help = "Path to LoRA binary to use during benchmark") 96 | parser.add_argument("-loracfg", "--lora_config", type = str, help = "Path to LoRA config to use during benchmark") 97 | parser.add_argument("-ld", "--lora_dir", type = str, help = "Path to LoRA config and binary. to use during benchmark") 98 | 99 | args = parser.parse_args() 100 | 101 | model_init.post_parse(args) 102 | perplexity.post_parse(args) 103 | model_init.get_model_files(args) 104 | 105 | # Paths 106 | 107 | if args.lora_dir is not None: 108 | args.lora_config = os.path.join(args.lora_dir, "adapter_config.json") 109 | args.lora = os.path.join(args.lora_dir, "adapter_model.bin") 110 | 111 | # Feedback 112 | 113 | print_opts = [] 114 | if args.perf: print_opts.append("perf") 115 | if args.validate: print_opts.append("validate") 116 | if args.perplexity: print_opts.append("perplexity") 117 | if args.perplexity_token: print_opts.append("perplexity_token") 118 | 119 | model_init.print_options(args, print_opts) 120 | 121 | # Globals 122 | 123 | model_init.set_globals(args) 124 | 125 | # Instantiate model 126 | 127 | config = model_init.make_config(args) 128 | 129 | model = timer("Load model", lambda: ExLlama(config)) 130 | tokenizer = timer("Load tokenizer", lambda: ExLlamaTokenizer(args.tokenizer)) 131 | 132 | model_init.print_stats(model) 133 | 134 | torch.cuda.reset_peak_memory_stats("cuda") 135 | mem("Model") 136 | 137 | cache = ExLlamaCache(model) 138 | mem("Cache") 139 | 140 | # Load LoRA 141 | 142 | lora = None 143 | if args.lora: 144 | print(f" -- LoRA config: {args.lora_config}") 145 | print(f" -- Loading LoRA: {args.lora}") 146 | if args.lora_config is None: 147 | print(f" ## Error: please specify lora path to adapter_config.json") 148 | sys.exit() 149 | lora = ExLlamaLora(model, args.lora_config, args.lora) 150 | if lora.bias_ignored: 151 | print(f" !! 
Warning: LoRA zero bias ignored") 152 | 153 | # Test sequence 154 | 155 | gen_tokens = 128 156 | max_seq_len = args.length 157 | ids = torch.randint(0, 31999, (1, max_seq_len - gen_tokens)).cuda() 158 | 159 | # Benchmark memory and performance 160 | 161 | if args.perf: 162 | 163 | # Warming up apparently makes a huge difference 164 | 165 | for i in range(1, 3): 166 | print(f" -- Warmup pass {i}...") 167 | begin() 168 | logits = timer("Warmup", lambda: next_logits(ids, lora)) 169 | 170 | # Do the actual benchmark 171 | 172 | begin() 173 | 174 | t = time.time() 175 | 176 | print(" -- Inference, first pass.") 177 | logits = timer("Inference", lambda: next_logits(ids, lora)) 178 | 179 | t = time.time() - t 180 | print(f" ** Speed: {ids.shape[-1] / t:.2f} tokens/second") 181 | 182 | for j in range(2): 183 | 184 | t = time.time() 185 | print(f" -- Generating {gen_tokens} tokens, {ids.shape[-1]} token prompt...") 186 | for i in range(gen_tokens): 187 | 188 | logits = logits[0, -1, :] 189 | token = torch.argmax(logits) 190 | next_id = token.unsqueeze(0).unsqueeze(0) 191 | logits = next_logits(next_id, lora) 192 | 193 | t = time.time() - t 194 | print(f" ** Speed: {gen_tokens / t:.2f} tokens/second") 195 | 196 | ids = ids[:, :4] 197 | cache.current_seq_len = 4 198 | 199 | mem("Inference") 200 | mem("Total", total = True) 201 | 202 | 203 | # Benchmark perplexity 204 | 205 | if args.perplexity: 206 | 207 | ppl = Perplexity(args.perplexity, model, cache, tokenizer) 208 | 209 | print(" -- Loading dataset...") 210 | 211 | ppl.load(dataset_path = args.perplexity_dataset, 212 | chunk_size = args.perplexity_chunk_size, 213 | chunk_truncate = args.perplexity_chunk_truncate, 214 | overlap = args.perplexity_chunk_overlap, 215 | minlength = args.perplexity_chunk_min, 216 | json_key = args.perplexity_json_key) 217 | 218 | begin() 219 | 220 | ppl.test(args.perplexity_chunk_num, 221 | lora = lora, 222 | ppl_token = args.perplexity_token) 223 | 224 | # Validate file 225 | 226 | if args.validate: 227 | 228 | ppl = Perplexity(args.perplexity, model, cache, tokenizer) 229 | 230 | ppl.load(dataset_path = "datasets/wikitext2_val_sample.jsonl", 231 | chunk_size = 2048, 232 | chunk_truncate = 2048, 233 | overlap = 0, 234 | minlength = 50, 235 | json_key = "text") 236 | 237 | # Short perplexity tests in switched and quant mode, should produce roughly equal results 238 | 239 | begin() 240 | 241 | ppl.cache.zero() 242 | model.config.matmul_recons_thd = 1 243 | ppl.test(8, lora = lora, tag = " (reconstruct)") 244 | ppl.cache.zero() 245 | model.config.matmul_recons_thd = 0 246 | ppl.test(8, lora = lora, tag = " (quant, token)", ppl_token = True) 247 | 248 | # Do a short, easy topk=1 completion to see if we're generating garbage. 
Should run in switched mode 249 | # for the prompt and quant for individual tokens 250 | 251 | model.config.matmul_recons_thd = 4 252 | generator = ExLlamaGenerator(model, tokenizer, cache) 253 | generator.settings.top_k = 1 254 | generator.lora = lora 255 | text = generator.generate_simple("To be or not to be, that is the", max_new_tokens = 20 * args.validate) 256 | print(f" ** Generation: {repr(text)}") 257 | 258 | if args.validate > 1: 259 | 260 | # Test batched generation 261 | 262 | bsz = 8 263 | gen_len = 20 264 | torch.manual_seed(42) 265 | torch.cuda.manual_seed_all(42) 266 | 267 | # Bigger cache for the batch 268 | 269 | del cache 270 | cache = ExLlamaCache(model, batch_size = bsz) 271 | 272 | # Create tokenized batch and attention mask 273 | 274 | identical_batch_prompt = "When you have eliminated the impossible, whatever remains," 275 | continuations = [ 276 | " must be considered", 277 | " ought to be", 278 | " (and some scholars say this is", 279 | " however improbable, is a banana.", 280 | ] 281 | 282 | prompts = [identical_batch_prompt] * (bsz - len(continuations)) 283 | for cont in continuations: 284 | prompts.append(identical_batch_prompt + cont) 285 | 286 | ids = tokenizer.encode(prompts) 287 | assert ids.shape[1] < model.config.max_seq_len, f"Max length {ids.shape[1]} exceeds model limit {model.config.max_seq_len}" 288 | 289 | mask = ids.ne(tokenizer.pad_token_id) 290 | 291 | # Batched generation with greedy sampling 292 | 293 | sequence = torch.empty((bsz, 0), dtype = torch.long, device = "cpu") 294 | logits = next_logits(ids, lora, input_mask = mask) 295 | 296 | for i in range(gen_len): 297 | logits = logits[:, -1, :] 298 | id_per_batch = torch.argmax(logits, dim=-1) 299 | assert id_per_batch.shape == (bsz,), f"{id_per_batch.shape} != {(bsz,)}" 300 | next_id_per_batch = id_per_batch.unsqueeze(-1) 301 | sequence = torch.cat((sequence, next_id_per_batch), dim = -1) 302 | logits = next_logits(next_id_per_batch, lora) 303 | 304 | # Print output batch 305 | 306 | print(f"\n ** Batching sanity check: 1-{bsz - len(continuations)} should be identical. All should be reasonable for the model you're using.\n") 307 | 308 | outputs = tokenizer.decode(sequence) 309 | for b in range(bsz): 310 | print(f"{b + 1} {repr(prompts[b])} -> {repr(outputs[b])}") 311 | 312 | # TODO Save the logits and then rerun each prompt with a batch size of 1, same input. The logits should be identical. 
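    # A possible sketch of the TODO above, kept as a comment: rerun each prompt with a batch
    # size of 1 and compare against the batched prompt pass. `batched_last` is an assumed name
    # for a copy of logits[:, -1, :] captured right after the batched next_logits() call above;
    # all other names are defined earlier in this script. Nothing below is part of the benchmark yet.
    #
    # for b in range(bsz):
    #     del cache
    #     cache = ExLlamaCache(model)                                   # fresh batch-size-1 cache
    #     single_ids = tokenizer.encode(prompts[b])
    #     single_last = next_logits(single_ids, lora)[:, -1, :]
    #     max_diff = (single_last.cpu() - batched_last[b].cpu()).abs().max().item()
    #     print(f" -- Prompt {b + 1}: max |logit diff| vs. batched pass: {max_diff:.6f}")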
313 | -------------------------------------------------------------------------------- /util/shard.py: -------------------------------------------------------------------------------- 1 | import argparse, json, math, os 2 | from safetensors import safe_open 3 | from safetensors.torch import save_file 4 | 5 | parser = argparse.ArgumentParser(description = "Split .safetensors file into shards") 6 | parser.add_argument("input_file", type = str, help = "Path to input file") 7 | parser.add_argument("shard_size", type = int, help = "Shard size in megabytes") 8 | args = parser.parse_args() 9 | 10 | input_file = args.input_file 11 | input_base, _ = os.path.splitext(input_file) 12 | shard_size = args.shard_size * 1024**2 13 | 14 | # Create tensor map 15 | 16 | def _tsize(st, key): 17 | 18 | tslice = st.get_slice(key) 19 | shape = tslice.get_shape() 20 | numel = 1 21 | for x in shape: numel *= x 22 | dtype = tslice.get_dtype() 23 | del tslice 24 | if dtype == "I32": return numel * 4 25 | elif dtype == "I16": return numel * 2 26 | elif dtype == "F16": return numel * 2 27 | elif dtype == "F32": return numel * 4 28 | else: raise ValueError("Unexpected datatype: " + key) 29 | 30 | num_files = 0 31 | current_size = shard_size + 1 32 | total_size = 0 33 | tensor_map = [] 34 | 35 | print(f" -- Scanning tensors in {input_file}") 36 | 37 | with safe_open(input_file, framework = "pt", device = "cpu") as f: 38 | 39 | for key in f.keys(): 40 | 41 | tensor_size = _tsize(f, key) 42 | total_size += tensor_size 43 | 44 | if current_size + tensor_size > shard_size: 45 | 46 | num_files += 1 47 | current_size = 0 48 | current_list = [] 49 | tensor_map.append(current_list) 50 | 51 | current_size += tensor_size 52 | current_list.append(key) 53 | 54 | # Split into output files 55 | 56 | weight_map = {} 57 | 58 | for file_index, keys in enumerate(tensor_map): 59 | 60 | shard = {} 61 | shard_filename = f"{input_base}-{file_index + 1:05}-of-{num_files:05}.safetensors" 62 | 63 | with safe_open(input_file, framework = "pt", device = "cpu") as f: 64 | for key in keys: 65 | print(f" -- Reading: {key}") 66 | shard[key] = f.get_tensor(key) 67 | weight_map[key] = shard_filename 68 | 69 | print(f" -- Writing: {shard_filename}") 70 | save_file(shard, shard_filename) 71 | 72 | # Compile index 73 | 74 | index = { "metadata": { "total_size": total_size }, "weight_map": weight_map } 75 | index_filename = f"{input_file}.index.json" 76 | 77 | print(f" -- Writing: {index_filename}") 78 | 79 | with open(index_filename, 'w') as f: 80 | json.dump(index, f, indent = 2) 81 | 82 | # Done 83 | 84 | print(f" -- Done") -------------------------------------------------------------------------------- /webui/app.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | # sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | from exllama.model import ExLlama, ExLlamaConfig 5 | from flask import Flask, render_template, request, jsonify 6 | from flask import Response, stream_with_context 7 | from threading import Timer, Lock 8 | import webbrowser 9 | import json 10 | import model_init 11 | from session import prepare_sessions, get_initial_session, Session, load_session, new_session, _sessions_dir 12 | import argparse 13 | from exllama.tokenizer import ExLlamaTokenizer 14 | from waitress import serve 15 | 16 | app = Flask(__name__) 17 | app.static_folder = 'static' 18 | generate_lock = Lock() 19 | session: Session 20 | 21 | # Render template 22 | 23 | @app.route("/") 24 
| def home(): 25 | return render_template("index.html") 26 | 27 | # Get existing sessions 28 | 29 | @app.route("/api/populate") 30 | def api_populate(): 31 | global session 32 | return session.api_populate() 33 | 34 | # Edit block 35 | 36 | @app.route("/api/edit_block", methods=['POST']) 37 | def api_edit_block(): 38 | global session 39 | data = request.get_json() 40 | session.api_edit_block(data) 41 | return json.dumps({"result": "ok"}) + "\n" 42 | 43 | # Delete block 44 | 45 | @app.route("/api/delete_block", methods=['POST']) 46 | def api_delete_block(): 47 | global session 48 | data = request.get_json() 49 | session.api_delete_block(data) 50 | return json.dumps({"result": "ok"}) + "\n" 51 | 52 | # Rename session 53 | 54 | @app.route("/api/rename_session", methods=['POST']) 55 | def api_rename_session(): 56 | global session 57 | data = request.get_json() 58 | success = session.api_rename_session(data) 59 | return json.dumps({"result": "ok" if success else "fail"}) + "\n" 60 | 61 | # Delete session 62 | 63 | @app.route("/api/delete_session", methods=['POST']) 64 | def api_delete_session(): 65 | global session 66 | data = request.get_json() 67 | session.api_delete_session(data) 68 | return json.dumps({"result": "ok"}) + "\n" 69 | 70 | # Set fixed prompt settings 71 | 72 | @app.route("/api/set_fixed_prompt", methods=['POST']) 73 | def api_set_fixed_prompt(): 74 | global session 75 | data = request.get_json() 76 | session.api_set_fixed_prompt(data) 77 | return json.dumps({"result": "ok"}) + "\n" 78 | 79 | # Set generation settings 80 | 81 | @app.route("/api/set_gen_settings", methods=['POST']) 82 | def api_set_gen_settings(): 83 | global session 84 | data = request.get_json() 85 | session.api_set_gen_settings(data) 86 | return json.dumps({"result": "ok"}) + "\n" 87 | 88 | # Set session 89 | 90 | @app.route("/api/set_session", methods=['POST']) 91 | def api_set_session(): 92 | global session 93 | data = request.get_json() 94 | load_session_name = data["session_name"] 95 | if load_session_name == ".": 96 | session = new_session() 97 | else: 98 | session = load_session(load_session_name, append_path = True) 99 | return json.dumps({"result": "ok"}) + "\n" 100 | 101 | # Set participants 102 | 103 | @app.route("/api/set_participants", methods=['POST']) 104 | def api_set_participants(): 105 | global session 106 | data = request.get_json() 107 | session.api_set_participants(data) 108 | return json.dumps({"result": "ok"}) + "\n" 109 | 110 | # Accept input 111 | 112 | @app.route("/api/userinput", methods=['POST']) 113 | def api_userinput(): 114 | data = request.get_json() 115 | user_input = data["user_input"] 116 | 117 | with generate_lock: 118 | result = Response(stream_with_context(session.respond_multi(user_input)), mimetype = 'application/json') 119 | return result 120 | 121 | @app.route("/api/append_block", methods=['POST']) 122 | def api_append_block(): 123 | data = request.get_json() 124 | session.api_append_block(data) 125 | return json.dumps({"result": "ok"}) + "\n" 126 | 127 | # Load the model 128 | 129 | parser = argparse.ArgumentParser(description="Simple web-based chatbot for ExLlama") 130 | parser.add_argument("-host", "--host", type = str, help = "IP:PORT eg, 0.0.0.0:7862", default = "localhost:5000") 131 | parser.add_argument("-sd", "--sessions_dir", type = str, help = "Location for storing user sessions, default: ~/exllama_sessions/", default = "~/exllama_sessions/") 132 | 133 | model_init.add_args(parser) 134 | args = parser.parse_args() 135 | model_init.post_parse(args) 136 | 
model_init.get_model_files(args) 137 | 138 | model_init.print_options(args) 139 | config = model_init.make_config(args) 140 | 141 | model_init.set_globals(args) 142 | 143 | print(f" -- Loading model...") 144 | model = ExLlama(config) 145 | 146 | print(f" -- Loading tokenizer...") 147 | tokenizer = ExLlamaTokenizer(args.tokenizer) 148 | 149 | model_init.print_stats(model) 150 | 151 | # Get the session ready 152 | 153 | prepare_sessions(model, tokenizer, args.sessions_dir) 154 | session = get_initial_session() 155 | 156 | print(f" -- Sessions stored in: {_sessions_dir()}") 157 | 158 | # Start the web server 159 | 160 | machine = args.host 161 | host, port = machine.split(":") 162 | 163 | if host == "localhost": 164 | Timer(1, lambda: webbrowser.open(f'http://{machine}/')).start() 165 | 166 | serve(app, host = host, port = port) -------------------------------------------------------------------------------- /webui/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | EXLlama 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 |
[ The remaining markup of webui/templates/index.html (lines 25-83) was lost in extraction; only the text labels of the settings sidebar survive: Model, Fixed prompt, Participants, Sampler, Stop condition and Repetition penalty. ]
--------------------------------------------------------------------------------
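A minimal client sketch for the streaming chat endpoint defined in webui/app.py, under stated assumptions: the server was started with a model directory via the shared model_init arguments (e.g. python webui/app.py -d <model_dir>, as in the benchmark scripts) and is listening on the default localhost:5000; the third-party "requests" package is installed (it is not listed in requirements-web.txt); and the /api/userinput response is newline-delimited JSON, as the json.dumps(...) + "\n" pattern in the other routes suggests. The endpoint name and the "user_input" key come from api_userinput(); everything else here is illustrative.

import json
import requests

resp = requests.post("http://localhost:5000/api/userinput",
                     json = {"user_input": "Hello, Chatbort!"},
                     stream = True)             # the route streams its reply via stream_with_context

for line in resp.iter_lines():
    if not line: continue                       # skip blank lines in the stream
    packet = json.loads(line)                   # assumed framing: one JSON object per line
    print(packet)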