├── .github ├── CODEOWNERS ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── check-commits.yml │ ├── linux-cpu-tests.yml │ ├── linux-cuda-tests.yml │ ├── linux-examples.yml │ ├── python-quality.yml │ ├── security.yml │ └── stale.yml ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── bench ├── generation │ ├── README.md │ ├── charts │ │ ├── google-gemma-2b_bf16_Accuracy.png │ │ ├── google-gemma-2b_bf16_Latency__ms_.png │ │ ├── google-gemma-2b_bf16_Perplexity.png │ │ ├── meta-llama-Meta-Llama-3.1-8B_bf16_Accuracy.png │ │ ├── meta-llama-Meta-Llama-3.1-8B_bf16_Latency__ms_.png │ │ ├── meta-llama-Meta-Llama-3.1-8B_bf16_Perplexity.png │ │ ├── mistralai-Mistral-7B-Instruct-v0.3_bf16_Accuracy.png │ │ ├── mistralai-Mistral-7B-Instruct-v0.3_bf16_Latency__ms_.png │ │ └── mistralai-Mistral-7B-Instruct-v0.3_bf16_Perplexity.png │ ├── evaluate_configurations.py │ ├── evaluate_many_models.sh │ ├── evaluate_model.py │ ├── gen_barchart.py │ ├── metrics │ │ ├── __init__.py │ │ ├── latency.py │ │ ├── perplexity.py │ │ └── prediction.py │ └── setup │ │ ├── __init__.py │ │ ├── awq.py │ │ ├── bnb.py │ │ ├── hqq.py │ │ └── quanto.py ├── kernels │ ├── benchmark.py │ ├── benchmark_marlin_fp8.py │ └── benchmark_w4a16.py └── torch_kernels │ ├── README.md │ ├── test_int_mm.py │ ├── test_int_mm_inductor.py │ ├── test_weight_int4pack_mm.py │ └── test_weight_int8pack_mm.py ├── examples ├── nlp │ ├── text-classification │ │ └── sst2 │ │ │ └── quantize_sst2_model.py │ └── text-generation │ │ └── quantize_causal_lm_model.py ├── speech │ └── speech_recognition │ │ ├── quantize_asr_model.py │ │ └── requirements.txt └── vision │ ├── StableDiffusion │ ├── README.md │ ├── quantize_StableDiffusion.py │ └── requirements.txt │ ├── image-classification │ ├── mnist │ │ └── quantize_mnist_model.py │ └── pets │ │ └── quantize_vit_model.py │ ├── object-detection │ └── quantize_owl_model.py │ └── text-to-image │ └── quantize_pixart_sigma.py ├── external ├── awq │ ├── conftest.py │ ├── pack_intweight.py │ ├── packing_utils.py │ ├── test_awq_kernels.py │ ├── test_awq_packing.py │ └── test_awq_quantize.py └── smoothquant │ ├── README.md │ └── smoothquant.py ├── optimum └── quanto │ ├── __init__.py │ ├── calibrate.py │ ├── library │ ├── README.md │ ├── __init__.py │ ├── extensions │ │ ├── README.md │ │ ├── __init__.py │ │ ├── cpp │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── pybind_module.cpp │ │ │ ├── unpack.cpp │ │ │ └── unpack.h │ │ ├── cuda │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── awq │ │ │ │ ├── dequantize.cuh │ │ │ │ └── v2 │ │ │ │ │ ├── gemm_cuda.cu │ │ │ │ │ ├── gemm_cuda.h │ │ │ │ │ ├── gemv_cuda.cu │ │ │ │ │ ├── gemv_cuda.h │ │ │ │ │ └── semaphore.h │ │ │ ├── marlin │ │ │ │ ├── COPYRIGHT │ │ │ │ ├── fp8_marlin.cu │ │ │ │ ├── fp8_marlin.cuh │ │ │ │ ├── gptq_marlin.cuh │ │ │ │ ├── gptq_marlin_dtypes.cuh │ │ │ │ ├── gptq_marlin_repack.cu │ │ │ │ ├── gptq_marlin_repack.cuh │ │ │ │ ├── marlin_cuda.cpp │ │ │ │ ├── marlin_cuda.h │ │ │ │ ├── marlin_cuda_kernel.cu │ │ │ │ └── marlin_cuda_kernel.cuh │ │ │ ├── pybind_module.cpp │ │ │ ├── unpack.cu │ │ │ └── unpack.h │ │ ├── extension.py │ │ ├── hip │ │ │ ├── __init__.py │ │ │ ├── pybind_module.cpp │ │ │ ├── unpack.cu │ │ │ └── unpack.h │ │ ├── mps │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── pybind_module.cpp │ │ │ ├── unpack.h │ │ │ └── unpack.mm │ │ └── xpu │ │ │ ├── __init__.py │ │ │ ├── pybind_module.cpp │ │ │ ├── unpack.h │ │ │ └── unpack.sycl │ ├── qbytes_mm.py │ ├── quantize.py │ └── unpack.py │ ├── models │ ├── __init__.py │ ├── 
diffusers_models.py │ ├── shared_dict.py │ └── transformers_models.py │ ├── nn │ ├── __init__.py │ ├── qconv2d.py │ ├── qlayernorm.py │ ├── qlinear.py │ └── qmodule.py │ ├── quantize.py │ ├── subpackage │ ├── __init__.py │ └── commands │ │ ├── __init__.py │ │ ├── base.py │ │ └── quantize.py │ └── tensor │ ├── __init__.py │ ├── activations │ ├── __init__.py │ ├── qbytes.py │ ├── qbytes_ops.py │ └── quantization.py │ ├── core.py │ ├── function.py │ ├── grouped.py │ ├── optimizers │ ├── __init__.py │ ├── absmax_optimizer.py │ ├── affine_optimizer.py │ ├── hqq_optimizer.py │ ├── max_optimizer.py │ ├── optimizer.py │ └── symmetric_optimizer.py │ ├── packed.py │ ├── qbits.py │ ├── qbytes.py │ ├── qtensor.py │ ├── qtype.py │ └── weights │ ├── __init__.py │ ├── awq │ ├── __init__.py │ ├── packed.py │ └── qbits.py │ ├── marlin │ ├── __init__.py │ ├── fp8 │ │ ├── __init__.py │ │ ├── packed.py │ │ └── qbits.py │ ├── int4 │ │ ├── __init__.py │ │ ├── packed.py │ │ └── qbits.py │ └── permutations.py │ ├── packing.py │ ├── qbits.py │ ├── qbytes.py │ ├── quantization.py │ ├── reordering.py │ └── tinygemm │ ├── __init__.py │ ├── packed.py │ └── qbits.py ├── pyproject.toml ├── setup.sh └── tests ├── cli ├── cli_helpers.py └── test_quantize_cli.py ├── conftest.py ├── helpers.py ├── library ├── test_extensions.py ├── test_mm.py ├── test_quantize.py └── test_unpack.py ├── models ├── conftest.py ├── test_quantized_model_for_causal_lm.py └── test_quantized_model_for_pixart.py ├── nn ├── test_calibrate.py ├── test_qattention.py ├── test_qconv2d.py ├── test_qlayernorm.py ├── test_qlinear.py └── test_qmodule.py ├── quantize ├── test_quantize_mlp.py ├── test_quantize_patterns.py └── test_requantize.py └── tensor ├── activations ├── test_activations_compile.py ├── test_activations_dispatch.py └── test_activations_quantize.py ├── ops ├── test_linear_dispatch.py └── test_mm_dispatch.py ├── optimizers └── test_hqq_optimizer.py ├── test_absmax.py ├── test_packed_tensor.py └── weights ├── optimized ├── test_awq_packed_tensor.py ├── test_awq_weight_qbits_tensor.py ├── test_marlin_fp8_packed_tensor.py ├── test_marlin_int4_packed_tensor.py ├── test_marlin_int4_weight_qbits_tensor.py ├── test_marlin_qbytes_tensor.py ├── test_tinygemm_packed_tensor.py └── test_tinygemm_weight_qbits_tensor.py ├── test_weight_qbits_tensor.py ├── test_weight_qbits_tensor_dispatch.py ├── test_weight_qbits_tensor_instantiate.py ├── test_weight_qbits_tensor_quantize.py ├── test_weight_qbytes_tensor_backward.py ├── test_weight_qbytes_tensor_dispatch.py ├── test_weight_qbytes_tensor_instantiate.py ├── test_weight_qbytes_tensor_quantize.py ├── test_weight_qbytes_tensor_serialization.py └── weight_helpers.py /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @dacorvo @sunmarc 2 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | # What does this PR do? 2 | 3 | 12 | 13 | 14 | 15 | Fixes # (issue) 16 | 17 | 18 | ## Before submitting 19 | - [ ] Did you read the [contributor guideline](https://github.com/huggingface/optimum-quanto/blob/main/CONTRIBUTING.md#create-a-pull-request), Pull Request section? 20 | 21 | - [ ] Was this discussed/approved via a GitHub issue or the [forum](https://discuss.huggingface.co/)? Please add a link 22 | to it if that's the case. 23 | - [ ] Did you run all tests locally and make sure they pass?
24 | - [ ] Did you write any new necessary tests? 25 | 26 | 27 | ## Who can review? 28 | 29 | Anyone in the community is free to review the PR once the tests have passed. Feel free to tag 30 | members/contributors who may be interested in your PR. 31 | -------------------------------------------------------------------------------- /.github/workflows/check-commits.yml: -------------------------------------------------------------------------------- 1 | name: Check Commits 2 | 3 | on: [workflow_call] 4 | 5 | jobs: 6 | build: 7 | name: Check commits 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v3 11 | 12 | - uses: huggingface/action-check-commits@v1.0.0 13 | with: 14 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 15 | max-commits: "10" 16 | min-words: "3" 17 | forbidden-words: "fixup" 18 | -------------------------------------------------------------------------------- /.github/workflows/linux-cpu-tests.yml: -------------------------------------------------------------------------------- 1 | name: Linux CPU tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths: 8 | - "optimum/quanto/**" 9 | - "tests/**" 10 | - "pyproject.toml" 11 | pull_request: 12 | types: [assigned, opened, synchronize, reopened] 13 | paths: 14 | - "optimum/quanto/**" 15 | - "tests/**" 16 | - "pyproject.toml" 17 | 18 | jobs: 19 | check-commits: 20 | uses: ./.github/workflows/check-commits.yml 21 | python-quality: 22 | uses: ./.github/workflows/python-quality.yml 23 | test-ubuntu-cpu: 24 | needs: [check-commits, python-quality] 25 | runs-on: ubuntu-latest 26 | strategy: 27 | fail-fast: false 28 | matrix: 29 | python-version: ["3.9", "3.11"] 30 | 31 | steps: 32 | - uses: actions/checkout@v2 33 | - name: Set up Python ${{ matrix.python-version }} 34 | uses: actions/setup-python@v2 35 | with: 36 | python-version: ${{ matrix.python-version }} 37 | 38 | - name: Build and install quanto 39 | run: | 40 | pip install --upgrade pip 41 | pip install -e .[dev] 42 | 43 | - name: Run base tests 44 | run: | 45 | python -m pytest tests --ignore=tests/models --ignore=tests/cli 46 | 47 | - name: Run models tests 48 | run: | 49 | pip install accelerate transformers diffusers 50 | python -m pytest tests/models 51 | 52 | 53 | - name: Run CLI tests 54 | run: | 55 | pip install optimum 56 | python -m pytest tests/cli 57 | 58 | run_staging_tests: 59 | needs: [check-commits, python-quality] 60 | runs-on: ubuntu-latest 61 | strategy: 62 | fail-fast: false 63 | matrix: 64 | python-version: ["3.9", "3.11"] 65 | 66 | steps: 67 | - uses: actions/checkout@v2 68 | - name: Set up Python ${{ matrix.python-version }} 69 | uses: actions/setup-python@v2 70 | with: 71 | python-version: ${{ matrix.python-version }} 72 | 73 | - name: Build and install quanto 74 | run: | 75 | pip install --upgrade pip 76 | pip install -e .[dev] 77 | 78 | - name: Run models hub tests 79 | run: | 80 | pip install accelerate transformers diffusers 81 | HUGGINGFACE_CO_STAGING=true python -m pytest tests/models -k "hub" 82 | -------------------------------------------------------------------------------- /.github/workflows/linux-cuda-tests.yml: -------------------------------------------------------------------------------- 1 | name: Linux CUDA tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths: 8 | - "optimum/quanto/**" 9 | - "tests/**" 10 | - "pyproject.toml" 11 | pull_request: 12 | types: [assigned, opened, synchronize, reopened] 13 | paths: 14 | - "optimum/quanto/**" 15 | - "tests/**" 16 | - "pyproject.toml" 17 | 18 | jobs: 19 | 
check-commits: 20 | uses: ./.github/workflows/check-commits.yml 21 | python-quality: 22 | uses: ./.github/workflows/python-quality.yml 23 | test-ubuntu-cuda: 24 | needs: [check-commits, python-quality] 25 | runs-on: 26 | group: aws-g5-4xlarge-plus 27 | strategy: 28 | fail-fast: false 29 | matrix: 30 | cuda-version: ["11.8", "12.4", "12.6"] 31 | container: 32 | image: pytorch/pytorch:2.6.0-cuda${{ matrix.cuda-version }}-cudnn9-devel 33 | options: --gpus 0 34 | 35 | steps: 36 | - uses: actions/checkout@v2 37 | - name: Check CUDA installation 38 | run: | 39 | nvcc -V 40 | 41 | - name: Build and install quanto 42 | run: | 43 | pip install --upgrade pip 44 | pip install -e .[dev] 45 | 46 | - name: Run base tests 47 | run: | 48 | python -m pytest tests --ignore=tests/models --ignore=tests/cli 49 | 50 | - name: Run models tests 51 | run: | 52 | pip install accelerate transformers diffusers 53 | python -m pytest tests/models 54 | 55 | - name: Run CLI tests 56 | run: | 57 | pip install optimum 58 | python -m pytest tests/cli 59 | -------------------------------------------------------------------------------- /.github/workflows/linux-examples.yml: -------------------------------------------------------------------------------- 1 | name: Linux examples (CPU, CUDA) 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths: 8 | - "optimum/quanto/**" 9 | - "examples/**" 10 | - "pyproject.toml" 11 | pull_request: 12 | types: [assigned, opened, synchronize, reopened] 13 | paths: 14 | - "optimum/quanto/**" 15 | - "examples/**" 16 | - "pyproject.toml" 17 | 18 | jobs: 19 | check-commits: 20 | uses: ./.github/workflows/check-commits.yml 21 | python-quality: 22 | uses: ./.github/workflows/python-quality.yml 23 | run-examples: 24 | needs: [check-commits, python-quality] 25 | runs-on: 26 | group: aws-g5-4xlarge-plus 27 | strategy: 28 | fail-fast: false 29 | matrix: 30 | device: ["cpu", "cuda"] 31 | container: 32 | image: pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel 33 | options: --gpus 0 34 | 35 | steps: 36 | - uses: actions/checkout@v2 37 | - name: Check CUDA installation 38 | run: | 39 | nvcc -V 40 | 41 | - name: Build and install packages 42 | run: | 43 | pip install --upgrade pip 44 | pip install -e .[examples] 45 | 46 | # Run examples 47 | - name: Run MNIST classification example 48 | run: | 49 | for w in int4 int8 float8; do \ 50 | for a in none int8 float8; do \ 51 | python examples/vision/image-classification/mnist/quantize_mnist_model.py \ 52 | --weights $w --activations $a --device ${{ matrix.device }}; \ 53 | done; \ 54 | done 55 | - name: Run OWL detection example 56 | run: | 57 | for w in int4 int8 float8; do \ 58 | python examples/vision/object-detection/quantize_owl_model.py \ 59 | --image http://images.cocodataset.org/val2017/000000039769.jpg \ 60 | --texts "a photo of a cat" "a remote" \ 61 | --weights $w --device ${{ matrix.device }}; \ 62 | done 63 | - name: Run text-classification example 64 | run: | 65 | for w in int4 int8; do \ 66 | for a in none int8; do \ 67 | python examples/nlp/text-classification/sst2/quantize_sst2_model.py \ 68 | --weights $w --activations $a --device ${{ matrix.device }}; \ 69 | done; \ 70 | done 71 | - name: Run text-to-image example 72 | if: ${{ matrix.device == 'cuda'}} 73 | run: | 74 | for w in int4 int8 fp8; do \ 75 | python examples/vision/text-to-image/quantize_pixart_sigma.py \ 76 | --qtype $w --device ${{ matrix.device }}; \ 77 | done 78 | -------------------------------------------------------------------------------- /.github/workflows/python-quality.yml: 
-------------------------------------------------------------------------------- 1 | name: Python code quality 2 | 3 | on: [workflow_call] 4 | 5 | jobs: 6 | check_code_quality: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - uses: actions/checkout@v2 11 | - name: Set up Python 12 | uses: actions/setup-python@v2 13 | with: 14 | python-version: 3.9 15 | - name: Install dependencies 16 | run: | 17 | pip install --upgrade pip 18 | pip install .[dev] 19 | - run: ruff format bench examples optimum tests --diff 20 | - run: ruff check --show-fixes bench examples optimum tests 21 | -------------------------------------------------------------------------------- /.github/workflows/security.yml: -------------------------------------------------------------------------------- 1 | name: Security Checks 2 | 3 | on: 4 | push: 5 | 6 | permissions: 7 | contents: read 8 | 9 | jobs: 10 | secrets: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - shell: bash 14 | run: | 15 | if [ "${{ github.event_name }}" == "push" ]; then 16 | echo "depth=$(($(jq length <<< '${{ toJson(github.event.commits) }}') + 2))" >> $GITHUB_ENV 17 | echo "branch=${{ github.ref_name }}" >> $GITHUB_ENV 18 | fi 19 | if [ "${{ github.event_name }}" == "pull_request" ]; then 20 | echo "depth=$((${{ github.event.pull_request.commits }}+2))" >> $GITHUB_ENV 21 | echo "branch=${{ github.event.pull_request.head.ref }}" >> $GITHUB_ENV 22 | fi 23 | - name: Checkout code 24 | uses: actions/checkout@v4 25 | with: 26 | ref: ${{env.branch}} 27 | fetch-depth: ${{env.depth}} 28 | - name: Scan for secrets 29 | uses: trufflesecurity/trufflehog@main 30 | -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | name: 'Close stale issues and PRs' 2 | on: 3 | schedule: 4 | - cron: '30 1 * * *' 5 | workflow_dispatch: 6 | 7 | permissions: 8 | issues: write 9 | pull-requests: write 10 | 11 | jobs: 12 | stale: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/stale@v9 16 | with: 17 | stale-issue-message: 'This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days.' 18 | stale-pr-message: 'This PR is stale because it has been open 15 days with no activity. Remove stale label or comment or this will be closed in 5 days.' 19 | close-issue-message: 'This issue was closed because it has been stalled for 5 days with no activity.' 20 | close-pr-message: 'This PR was closed because it has been stalled for 5 days with no activity.' 
21 | days-before-issue-stale: 30 22 | days-before-pr-stale: 15 23 | days-before-issue-close: 5 24 | days-before-pr-close: 5 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .pytest_cache 3 | *.egg-info 4 | dist 5 | .venv 6 | build/ -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: check test style 2 | 3 | check_dirs := optimum tests bench examples 4 | 5 | check: 6 | ruff check --show-fixes ${check_dirs} 7 | ruff format ${check_dirs} --diff 8 | 9 | style: 10 | ruff check ${check_dirs} --fix 11 | ruff format ${check_dirs} 12 | 13 | test: 14 | python -m pytest -sv tests 15 | -------------------------------------------------------------------------------- /bench/generation/charts/google-gemma-2b_bf16_Accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/optimum-quanto/157f419c2c5c72fe5dadd302255380c0644e5e78/bench/generation/charts/google-gemma-2b_bf16_Accuracy.png -------------------------------------------------------------------------------- /bench/generation/charts/google-gemma-2b_bf16_Latency__ms_.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/optimum-quanto/157f419c2c5c72fe5dadd302255380c0644e5e78/bench/generation/charts/google-gemma-2b_bf16_Latency__ms_.png -------------------------------------------------------------------------------- /bench/generation/charts/google-gemma-2b_bf16_Perplexity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/optimum-quanto/157f419c2c5c72fe5dadd302255380c0644e5e78/bench/generation/charts/google-gemma-2b_bf16_Perplexity.png -------------------------------------------------------------------------------- /bench/generation/charts/meta-llama-Meta-Llama-3.1-8B_bf16_Accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/optimum-quanto/157f419c2c5c72fe5dadd302255380c0644e5e78/bench/generation/charts/meta-llama-Meta-Llama-3.1-8B_bf16_Accuracy.png -------------------------------------------------------------------------------- /bench/generation/charts/meta-llama-Meta-Llama-3.1-8B_bf16_Latency__ms_.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/optimum-quanto/157f419c2c5c72fe5dadd302255380c0644e5e78/bench/generation/charts/meta-llama-Meta-Llama-3.1-8B_bf16_Latency__ms_.png -------------------------------------------------------------------------------- /bench/generation/charts/meta-llama-Meta-Llama-3.1-8B_bf16_Perplexity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/optimum-quanto/157f419c2c5c72fe5dadd302255380c0644e5e78/bench/generation/charts/meta-llama-Meta-Llama-3.1-8B_bf16_Perplexity.png -------------------------------------------------------------------------------- /bench/generation/charts/mistralai-Mistral-7B-Instruct-v0.3_bf16_Accuracy.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/huggingface/optimum-quanto/157f419c2c5c72fe5dadd302255380c0644e5e78/bench/generation/charts/mistralai-Mistral-7B-Instruct-v0.3_bf16_Accuracy.png -------------------------------------------------------------------------------- /bench/generation/charts/mistralai-Mistral-7B-Instruct-v0.3_bf16_Latency__ms_.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/optimum-quanto/157f419c2c5c72fe5dadd302255380c0644e5e78/bench/generation/charts/mistralai-Mistral-7B-Instruct-v0.3_bf16_Latency__ms_.png -------------------------------------------------------------------------------- /bench/generation/charts/mistralai-Mistral-7B-Instruct-v0.3_bf16_Perplexity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/optimum-quanto/157f419c2c5c72fe5dadd302255380c0644e5e78/bench/generation/charts/mistralai-Mistral-7B-Instruct-v0.3_bf16_Perplexity.png -------------------------------------------------------------------------------- /bench/generation/evaluate_many_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Absolute path to this script, e.g. /home/user/bin/foo.sh 3 | SCRIPT=$(readlink -f "$0") 4 | # Absolute path this script is in, thus /home/user/bin 5 | SCRIPT_PATH=$(dirname "$SCRIPT") 6 | 7 | models=( 8 | google/gemma-2b 9 | meta-llama/Meta-Llama-3.1-8B 10 | mistralai/Mistral-7B-Instruct-v0.3 11 | ) 12 | 13 | for m in ${models[@]}; do 14 | python ${SCRIPT_PATH}/evaluate_configurations.py --model $m --metric prediction --png --json --batch_size 16 15 | python ${SCRIPT_PATH}/evaluate_configurations.py --model $m --metric perplexity --png --json --batch_size 16 16 | python ${SCRIPT_PATH}/evaluate_configurations.py --model $m --metric latency --png --json --batch_size 16 17 | done 18 | -------------------------------------------------------------------------------- /bench/generation/gen_barchart.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
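# (Added note, inferred from gen_barchart() and main() below; the exact key format is
# an assumption based on how `results` is indexed.) The benchmark JSON passed to this
# script is expected to map a model id to per-configuration metric values keyed as
# f"W{weights}A{activations}", e.g.:
#   {"meta-llama/Meta-Llama-3.1-8B": {"Wf16Af16": 0.71, "Wi4Af16": 0.69, "Wi8Af8": 0.70}}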
14 | 15 | import argparse 16 | import json 17 | 18 | import matplotlib.pyplot as plt 19 | import numpy as np 20 | import torch 21 | 22 | 23 | def save_bar_chart(title, labels, ylabel, series, save_path): 24 | x = np.arange(len(labels)) # the label locations 25 | width = 0.15 # the width of the bars 26 | multiplier = 0 27 | 28 | fig, ax = plt.subplots(layout="constrained") 29 | fig.set_figwidth(10) 30 | 31 | max_value = 0 32 | 33 | for attribute, measurement in series.items(): 34 | max_value = max(max_value, max(measurement)) 35 | offset = width * multiplier 36 | rects = ax.bar(x + offset, measurement, width, label=attribute) 37 | ax.bar_label(rects, padding=5) 38 | multiplier += 1 39 | 40 | # Add some text for labels, title and custom x-axis tick labels, etc. 41 | ax.set_ylabel(ylabel) 42 | ax.set_title(title) 43 | ax.set_xticks(x + width, labels) 44 | ax.legend(loc="upper left", ncols=4) 45 | ax.set_ylim(0, max_value * 1.2) 46 | 47 | plt.savefig(save_path) 48 | 49 | 50 | def gen_barchart(model_id, title, label, results, dtype=torch.float16):  # default added: main() does not pass a dtype 51 | dtype_str = "f16" if dtype is torch.float16 else "bf16" 52 | activations = (dtype_str, "f8") 53 | weights = ("i4", "i8", "f8") 54 | series = {} 55 | reference = round(results[f"W{dtype_str}A{dtype_str}"], 2) 56 | series[f"Weights {dtype_str}"] = [ 57 | reference, 58 | ] * len(activations) 59 | for w in weights: 60 | name = f"Weights {w}" 61 | series[name] = [] 62 | for a in activations: 63 | result = results[f"W{w}A{a}"] 64 | series[name].append(round(result, 2)) 65 | model_name = model_id.replace("/", "-") 66 | metric_name = label.replace(" ", "_").replace("(", "_").replace(")", "_") 67 | save_bar_chart( 68 | title=title, 69 | labels=[f"Activations {a}" for a in activations], 70 | series=series, 71 | ylabel=label, 72 | save_path=f"{model_name}_{dtype_str}_{metric_name}.png", 73 | ) 74 | 75 | 76 | def main(): 77 | parser = argparse.ArgumentParser() 78 | parser.add_argument("benchmark", type=str, help="A benchmark result file (.json).") 79 | parser.add_argument("--title", type=str, required=True, help="The graph title.") 80 | parser.add_argument("--label", type=str, required=True, help="The graph vertical label.") 81 | args = parser.parse_args() 82 | with open(args.benchmark) as f: 83 | benchmark = json.load(f) 84 | for model_id, results in benchmark.items(): 85 | gen_barchart(model_id, args.title, args.label, results) 86 | 87 | 88 | if __name__ == "__main__": 89 | main() 90 | -------------------------------------------------------------------------------- /bench/generation/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | -------------------------------------------------------------------------------- /bench/generation/metrics/prediction.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import time 16 | 17 | import torch 18 | from datasets import load_dataset 19 | 20 | 21 | @torch.no_grad() 22 | def prediction_accuracy(model, tokenizer, batch_size, samples=None): 23 | test_dataset = load_dataset("lambada", split=["test"])[0] 24 | model.eval() 25 | # The task is to predict the last token of the input. 26 | total, hit = 0, 0 27 | start = time.time() 28 | for batch in test_dataset.iter(batch_size=batch_size): 29 | inputs = tokenizer(batch["text"], return_tensors="pt", padding=True) 30 | input_ids = inputs.input_ids.to(model.device) 31 | attention_mask = inputs.attention_mask.to(model.device) 32 | labels = input_ids[:, -1] 33 | # Pass only the first tokens 34 | outputs = model(input_ids[:, :-1], attention_mask=attention_mask[:, :-1]) 35 | preds = outputs.logits[:, -1, :].argmax(dim=-1) 36 | total += labels.size(0) 37 | hit += (preds == labels).sum().item() 38 | if samples is not None and total >= samples: 39 | break 40 | end = time.time() 41 | acc = hit / total 42 | print(f"{total} sequences evaluated in {end - start:.2f} s. accuracy = {acc:.2f}") 43 | return acc 44 | -------------------------------------------------------------------------------- /bench/generation/setup/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /bench/generation/setup/bnb.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig 17 | 18 | 19 | def setup( 20 | model_id: str, 21 | weights: str, 22 | activations: str, 23 | device: torch.device, 24 | ): 25 | if activations != "none": 26 | raise ValueError("Activation quantization is not supported by BitsAndBytes") 27 | if weights == "int4": 28 | quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="fp4") 29 | elif weights == "int8": 30 | quantization_config = BitsAndBytesConfig(load_in_8bit=True) 31 | else: 32 | raise ValueError("BitsAndBytes only supports int4 and int8 weights.") 33 | dtype = torch.float32 if device.type == "cpu" else torch.float16 34 | tokenizer = AutoTokenizer.from_pretrained(model_id) 35 | tokenizer.pad_token_id = tokenizer.eos_token_id 36 | tokenizer.padding_side = "left" 37 | quantization_config.bnb_4bit_compute_dtype = dtype 38 | model = AutoModelForCausalLM.from_pretrained( 39 | model_id, torch_dtype=dtype, low_cpu_mem_usage=True, quantization_config=quantization_config 40 | ) 41 | 42 | return model, tokenizer 43 | -------------------------------------------------------------------------------- /bench/generation/setup/hqq.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import torch 16 | from hqq.core.quantize import BaseQuantizeConfig 17 | from hqq.engine.hf import HQQModelForCausalLM 18 | from transformers import AutoTokenizer 19 | 20 | 21 | def setup(model_id: str, weights: str, activations: str, device: torch.device, group_size: int = 64): 22 | if activations != "none": 23 | raise ValueError("Activation quantization is not supported by HQQ") 24 | if weights == "int4": 25 | quant_config = BaseQuantizeConfig(nbits=4, group_size=group_size) 26 | elif weights == "int8": 27 | quant_config = BaseQuantizeConfig(nbits=8, group_size=group_size) 28 | else: 29 | raise ValueError("HQQ only supports int4 and int8 weights.") 30 | # Load model 31 | model = HQQModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16) 32 | # Quantize 33 | model.quantize_model(quant_config=quant_config, compute_dtype=torch.float16, device=device) 34 | tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) 35 | tokenizer.pad_token_id = tokenizer.eos_token_id 36 | tokenizer.padding_side = "left" 37 | return model, tokenizer 38 | -------------------------------------------------------------------------------- /bench/generation/setup/quanto.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
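# (Added usage sketch; the model id is a hypothetical example, not part of the benchmark.)
# The evaluation scripts are expected to call setup() along these lines:
#   model, tokenizer = setup(
#       "facebook/opt-350m", weights="int4", activations="none",
#       batch_size=16, device=torch.device("cuda"), dtype=torch.float16,
#   )
# with weights/activations in {"none", "int4", "int8", "float8"} (see keyword_to_qtype below).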
14 | 15 | import time 16 | 17 | import torch 18 | from datasets import load_dataset 19 | from transformers import AutoModelForCausalLM, AutoTokenizer 20 | 21 | from optimum.quanto import Calibration, freeze, qfloat8, qint4, qint8, quantize 22 | 23 | 24 | @torch.no_grad() 25 | def calibrate(model, tokenizer, batch_size, batches): 26 | samples = batch_size * batches 27 | cal_dataset = load_dataset("lambada", split=["validation"])[0] 28 | model.eval() 29 | total = 0 30 | for batch in cal_dataset.iter(batch_size=batch_size): 31 | inputs = tokenizer(batch["text"], return_tensors="pt", padding=True) 32 | input_ids = inputs.input_ids.to(model.device) 33 | attention_mask = inputs.attention_mask.to(model.device) 34 | model(input_ids, attention_mask=attention_mask) 35 | total += input_ids.size(0) 36 | if total >= samples: 37 | break 38 | 39 | 40 | def setup( 41 | model_id: str, 42 | weights: str, 43 | activations: str, 44 | batch_size: int, 45 | device: torch.device, 46 | dtype: torch.dtype, 47 | ): 48 | weights = keyword_to_qtype(weights) 49 | activations = keyword_to_qtype(activations) 50 | tokenizer = AutoTokenizer.from_pretrained(model_id) 51 | tokenizer.pad_token_id = tokenizer.eos_token_id 52 | tokenizer.padding_side = "left" 53 | model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype, low_cpu_mem_usage=True).to(device) 54 | if weights is not None or activations is not None: 55 | print("Quantizing") 56 | start = time.time() 57 | quantization_root = model 58 | if hasattr(model, "model"): 59 | quantization_root = model.model 60 | quantize(quantization_root, weights=weights, activations=activations) 61 | if activations is not None: 62 | print("Calibrating") 63 | with Calibration(): 64 | calibrate(model, tokenizer, batch_size, batches=4) 65 | print("Freezing") 66 | freeze(model) 67 | print(f"Finished: {time.time() - start:.2f}") 68 | return model, tokenizer 69 | 70 | 71 | def keyword_to_qtype(k): 72 | return { 73 | "none": None, 74 | "int4": qint4, 75 | "int8": qint8, 76 | "float8": qfloat8, 77 | }[k] 78 | -------------------------------------------------------------------------------- /bench/torch_kernels/README.md: -------------------------------------------------------------------------------- 1 | This contains a few scripts to test pytorch kernels that are relevant for quantization. 2 | -------------------------------------------------------------------------------- /bench/torch_kernels/test_int_mm.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import argparse 16 | import timeit 17 | 18 | import torch 19 | 20 | 21 | def main(): 22 | parser = argparse.ArgumentParser(description="Torch integer matmul benchmark") 23 | parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)") 24 | parser.add_argument("--device", type=str, default=None, help="The device to use for the test.") 25 | parser.add_argument("--it", type=int, default=100, help="Number of iterations for average") 26 | args = parser.parse_args() 27 | 28 | torch.manual_seed(args.seed) 29 | 30 | if args.device is None: 31 | if torch.cuda.is_available(): 32 | device = torch.device("cuda") 33 | elif torch.backends.mps.is_available(): 34 | device = torch.device("mps") 35 | elif torch.xpu.is_available(): 36 | device = torch.device("xpu") 37 | else: 38 | device = torch.device("cpu") 39 | else: 40 | device = torch.device(args.device) 41 | 42 | def avg_time(f, it): 43 | return timeit.Timer(f).timeit(it) / it 44 | 45 | # Restrictions for accelerated integer matmul: 46 | # - input matrices must be 2D 47 | # - the collapsing dimension must be a multiple of 8 48 | A = torch.randint(1, 10, [2400, 3200]).type(torch.int8).to(device) 49 | B = torch.randint(1, 10, [3200, 4800]).type(torch.int8).to(device) 50 | 51 | print(f"Evaluating integer matmul on {device.type}:") 52 | # Warmup (slow) 53 | torch._int_mm(A, B) 54 | # Average on several calls 55 | t = avg_time(lambda: torch._int_mm(A, B), args.it) * 1000 56 | print(f"Average inference on {args.it} iterations: {t:.4f} ms") 57 | 58 | # Convert inputs to float 59 | 60 | def to_float(x): 61 | if x.device.type == "cpu": 62 | # matrix multiplication is not supported for float16 on CPU 63 | return x.to(torch.float32) 64 | return x.to(torch.float16) 65 | 66 | A = to_float(A) 67 | B = to_float(B) 68 | print(f"Evaluating {A.dtype} matmul on {device.type}:") 69 | 70 | # Warmup (slow) 71 | torch.matmul(A, B) 72 | # Average on several calls 73 | t = avg_time(lambda: torch.matmul(A, B), args.it) * 1000 74 | print(f"Average inference on {args.it} iterations: {t:.4f} ms") 75 | 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /bench/torch_kernels/test_int_mm_inductor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | import timeit 16 | 17 | import torch 18 | 19 | 20 | def mm(a, b): 21 | return torch._int_mm(a, b) 22 | 23 | 24 | A = torch.randint(1, 10, [2400, 2400]).type(torch.int8).cuda() 25 | B = torch.randint(1, 10, [2400, 2400]).type(torch.int8).cuda() 26 | it = 100 27 | 28 | # Warmup (slow) 29 | mm(A, B) 30 | # Get a reference 31 | print(timeit.Timer(lambda: mm(A, B)).timeit(it) / it) 32 | 33 | cmm = torch.compile(mm, backend="inductor") 34 | # First invocation will trigger the actual compilation 35 | cmm(A, B) 36 | # Now compare execution time 37 | print(timeit.Timer(lambda: cmm(A, B)).timeit(it) / it) 38 | -------------------------------------------------------------------------------- /bench/torch_kernels/test_weight_int8pack_mm.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import timeit 17 | 18 | import torch 19 | 20 | 21 | def main(): 22 | parser = argparse.ArgumentParser(description="Torch quantized int8 weight matmul benchmark") 23 | parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)") 24 | parser.add_argument("--device", type=str, default=None, help="The device to use for the test.") 25 | parser.add_argument("--it", type=int, default=10, help="Number of iterations for average") 26 | args = parser.parse_args() 27 | 28 | torch.manual_seed(args.seed) 29 | 30 | if args.device is None: 31 | if torch.cuda.is_available(): 32 | device = torch.device("cuda") 33 | elif torch.backends.mps.is_available(): 34 | device = torch.device("mps") 35 | elif torch.xpu.is_available(): 36 | device = torch.device("xpu") 37 | else: 38 | device = torch.device("cpu") 39 | else: 40 | device = torch.device(args.device) 41 | 42 | def avg_time(f, it): 43 | return timeit.Timer(f).timeit(it) / it 44 | 45 | A = torch.rand([2400, 3200], dtype=torch.bfloat16, device=device) 46 | B = torch.randint(-128, 127, [4800, 3200], dtype=torch.int8, device=device) 47 | B_scale = torch.rand([4800], dtype=torch.bfloat16, device=device) 48 | 49 | print(f"Evaluating quantized int8 matmul on {device.type}:") 50 | # Warmup (slow) 51 | torch._weight_int8pack_mm(A, B, B_scale) 52 | # Average on several calls 53 | t = avg_time(lambda: torch._weight_int8pack_mm(A, B, B_scale), args.it) * 1000 54 | print(f"Average inference on {args.it} iterations: {t:.4f} ms") 55 | 56 | # Convert weights to float 57 | 58 | B = B.to(torch.bfloat16).t() 59 | print(f"Evaluating {A.dtype} matmul on {device.type}:") 60 | 61 | # Warmup (slow) 62 | torch.matmul(A, B) * B_scale 63 | # Average on several calls 64 | t = avg_time(lambda: torch.matmul(A, B) * B_scale, args.it) * 1000 65 | print(f"Average inference on {args.it} iterations: {t:.4f} ms") 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- 
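The benchmark above times `torch._weight_int8pack_mm` against a dequantize-then-matmul reference. A minimal correctness sketch of that equivalence is below; it is an addition, assuming the private op keeps its current `(input, weight_int8, scales)` signature and per-output-channel scales (private ATen ops can change between PyTorch releases):

```python
import torch

# Input [M, K] in bfloat16, weights [N, K] in int8, one scale per output channel [N].
A = torch.rand([8, 64], dtype=torch.bfloat16)
B = torch.randint(-128, 127, [16, 64], dtype=torch.int8)
scales = torch.rand([16], dtype=torch.bfloat16)

# Quantized matmul: implicitly dequantizes B with its per-channel scales.
quantized = torch._weight_int8pack_mm(A, B, scales)

# Reference: dequantize the weights explicitly, then run a regular matmul.
reference = torch.matmul(A, B.to(torch.bfloat16).t()) * scales

# Loose tolerances: the two paths accumulate in different precisions.
print(torch.allclose(quantized, reference, atol=1e-1, rtol=1e-2))
```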
/examples/speech/speech_recognition/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | evaluate 3 | librosa 4 | soundfile 5 | jiwer 6 | -------------------------------------------------------------------------------- /examples/vision/StableDiffusion/README.md: -------------------------------------------------------------------------------- 1 | # Quantize Stable Diffusion examples 2 | 3 | ## Running locally with PyTorch 4 | 5 | ### Installing the dependencies 6 | 7 | Before running the scripts, make sure to install the library's dependencies: 8 | 9 | **Important** 10 | 11 | To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment: 12 | ```bash 13 | git clone https://github.com/huggingface/quanto 14 | cd quanto 15 | pip install -e . 16 | ``` 17 | 18 | Then cd into the `examples/vision/StableDiffusion` folder and run 19 | ```bash 20 | pip install -r requirements.txt 21 | ``` 22 | 23 | **Now, we can launch the image generation script:** 24 | 25 | ```bash 26 | python quantize_StableDiffusion.py --batch_size=1 --torch_dtype="fp32" 27 | ``` 28 | 29 | The flags in the command above configure the generation run: 30 | 31 | * `batch_size` The number of images generated in one inference iteration. 32 | 33 | * `torch_dtype` {fp32,fp16,bf16} 34 | * `unet_qtype` {fp8,int8,int4,none} 35 | 36 | Our experiments were conducted on a single 24GB A10 GPU. 37 | ```bash 38 | fp16-fp16 39 | 40 | batch_size: 1, torch_dtype: fp16, unet_dtype: none -> 3.307 seconds. Memory: 3.192GB. 41 | ``` 42 | 43 | ```bash 44 | bf16-int8 45 | 46 | batch_size: 1, torch_dtype: bf16, unet_dtype: int8 -> 3.918 seconds. Memory: 2.644GB. 47 | ``` 48 | 49 | ```bash 50 | fp16-int8 51 | 52 | batch_size: 1, torch_dtype: fp16, unet_dtype: int8 -> 3.920 seconds. Memory: 2.634GB.
53 | ``` 54 | 55 | Both int8 configurations produce high-quality images with fast generation. -------------------------------------------------------------------------------- /examples/vision/StableDiffusion/requirements.txt: -------------------------------------------------------------------------------- 1 | quanto 2 | diffusers 3 | torch 4 | transformers 5 | accelerate 6 | wandb -------------------------------------------------------------------------------- /examples/vision/text-to-image/quantize_pixart_sigma.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gc 3 | 4 | import torch 5 | from diffusers import DiffusionPipeline 6 | 7 | from optimum.quanto import freeze, qfloat8, qint4, qint8, quantize 8 | 9 | 10 | NUM_INFERENCE_STEPS = 50 11 | 12 | TORCH_DTYPES = {"fp16": torch.float16, "bf16": torch.bfloat16} 13 | QTYPES = { 14 | "fp8": qfloat8, 15 | "int8": qint8, 16 | "int4": qint4, 17 | "none": None, 18 | } 19 | 20 | 21 | def load_pipeline(model_id, torch_dtype, qtype=None, device="cpu"): 22 | pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch_dtype, use_safetensors=True).to(device) 23 | 24 | if qtype: 25 | quantize(pipe.transformer, weights=qtype) 26 | freeze(pipe.transformer) 27 | quantize(pipe.text_encoder, weights=qtype) 28 | freeze(pipe.text_encoder) 29 | 30 | pipe.set_progress_bar_config(disable=True) 31 | return pipe 32 | 33 | 34 | def get_device_memory(device): 35 | gc.collect() 36 | if device.type == "cuda": 37 | torch.cuda.empty_cache() 38 | return torch.cuda.memory_allocated() 39 | elif device.type == "mps": 40 | torch.mps.empty_cache() 41 | return torch.mps.current_allocated_memory() 42 | elif device.type == "xpu": 43 | torch.xpu.empty_cache() 44 | return torch.xpu.memory_allocated() 45 | return None 46 | 47 | 48 | if __name__ == "__main__": 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument("--model_id", type=str, default="PixArt-alpha/PixArt-Sigma-XL-2-1024-MS") 51 | parser.add_argument("--prompt", type=str, default="ghibli style, a fantasy landscape with castles") 52 | parser.add_argument("--torch_dtype", type=str, default="fp16", choices=list(TORCH_DTYPES.keys())) 53 | parser.add_argument("--qtype", type=str, default=None, choices=list(QTYPES.keys())) 54 | parser.add_argument("--device", type=str, default=None, help="The device to use for generation.") 55 | args = parser.parse_args() 56 | 57 | if args.device is None: 58 | if torch.cuda.is_available(): 59 | device = torch.device("cuda") 60 | elif torch.backends.mps.is_available(): 61 | device = torch.device("mps") 62 | elif torch.xpu.is_available(): 63 | device = torch.device("xpu") 64 | else: 65 | device = torch.device("cpu") 66 | else: 67 | device = torch.device(args.device) 68 | 69 | pipeline = load_pipeline( 70 | args.model_id, TORCH_DTYPES[args.torch_dtype], QTYPES[args.qtype] if args.qtype else None, device 71 | ) 72 | 73 | print(f"torch_dtype: {args.torch_dtype}, qtype: {args.qtype}.") 74 | memory = get_device_memory(device) 75 | if memory is not None: 76 | memory_gb = memory / 2**30 77 | print(f"{device.type} device memory: {memory_gb:.2f} GB.") 78 | 79 | if args.qtype == "int4" and device.type == "cuda":  # device.type is always lower-case 80 | raise ValueError("This example does not work (yet) for int4 on CUDA") 81 | 82 | img_name = f"pixart-sigma-dtype@{args.torch_dtype}-qtype@{args.qtype}.png" 83 | image = pipeline( 84 | prompt=args.prompt, 85 | num_inference_steps=NUM_INFERENCE_STEPS, 86 | num_images_per_prompt=1, 87 | generator=torch.manual_seed(0), 88 |
).images[0] 89 | image.save(img_name) 90 | -------------------------------------------------------------------------------- /external/awq/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import pytest 16 | import torch 17 | 18 | 19 | devices = ["cpu"] 20 | if torch.cuda.is_available(): 21 | devices += ["cuda"] 22 | elif torch.backends.mps.is_available(): 23 | devices += ["mps"] 24 | 25 | 26 | @pytest.fixture(scope="module", params=devices) 27 | def device(request): 28 | return torch.device(request.param) 29 | 30 | 31 | def pytest_configure(config): 32 | # register additional markers 33 | config.addinivalue_line("markers", "skip_device(type): mark test to be skipped for the specified device type") 34 | 35 | 36 | def pytest_runtest_call(item): 37 | fixture_name = "device" 38 | if fixture_name in item.fixturenames: 39 | # TODO: should be able to recover the fixture id instead of the actual value 40 | fixture_arg = item.funcargs[fixture_name].type 41 | skip_marks = {mark.args[0] for mark in item.iter_markers(name=f"skip_{fixture_name}")} 42 | if fixture_arg in skip_marks: 43 | pytest.skip(f"Test skipped for {fixture_name} {fixture_arg}") 44 | -------------------------------------------------------------------------------- /external/awq/pack_intweight.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2023 MIT HAN Lab 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
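# (Added summary; inferred from the code below, not from upstream AWQ documentation.)
# pack_intweight() packs unpacked 4-bit weights of shape [N, K] (one uint value per
# weight) into an int16 tensor of shape [N // interleave, K] laid out for the AWQ
# GEMM/GEMV kernels:
#   1. inside every 32-column block, a (4, 4, 2) view is transposed so threads read
#      contiguous values,
#   2. each group of 8 weights is reordered [0, 2, 4, 6, 1, 3, 5, 7] for fast
#      dequantization,
#   3. every `interleave` output rows (4 in practice) are interleaved along K in
#      chunks of `kstride` (64 in practice), and
#   4. four 4-bit values are packed into each 16-bit word.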
22 | import torch 23 | 24 | 25 | def pack_intweight(unpacked_qweight, interleave, kstride): 26 | # unpacked_qweight: [N, K] 27 | N = unpacked_qweight.shape[0] 28 | K = unpacked_qweight.shape[1] 29 | 30 | Packed_Kernel = unpacked_qweight.cpu().numpy().reshape(N, K // 32, 32) 31 | # np.arange(32).reshape(4, 4, 2).transpose(1, 0, 2) => [0, 1, 8, 9, 16, 17, 24, 25, ...] 32 | Packed_Kernel = Packed_Kernel.reshape(N, K // 32, 4, 4, 2).transpose(0, 1, 3, 2, 4) 33 | Packed_Kernel = Packed_Kernel.reshape(N, K // 32, 32) 34 | 35 | # reorder each 8 weights for fast dequantization 36 | # [0, 1, 2, 3, 4, 5, 6, 7] => [0, 2, 4, 6, 1, 3, 5, 7] 37 | Packed_Kernel = Packed_Kernel.reshape(N, K // 32, 4, 8) 38 | Packed_Kernel = Packed_Kernel.reshape(N, K // 32, 4, 4, 2).transpose(0, 1, 2, 4, 3) 39 | Packed_Kernel = Packed_Kernel.reshape(N, K) 40 | 41 | # interleaving every four rows 42 | Packed_Kernel = Packed_Kernel.reshape( 43 | N // interleave, interleave, K // kstride, kstride 44 | ) 45 | # N // 4, K // 64, 4, 64 46 | Packed_Kernel = Packed_Kernel.transpose(0, 2, 1, 3) 47 | Packed_Kernel = Packed_Kernel.reshape( 48 | N // interleave, K // kstride, kstride, interleave 49 | ) 50 | # Packing -> (N // 4, K // 64, 64) 51 | Packed_Kernel = ( 52 | Packed_Kernel[..., 0] 53 | | (Packed_Kernel[..., 1] << 4) 54 | | (Packed_Kernel[..., 2] << 8) 55 | | (Packed_Kernel[..., 3] << 12) 56 | ) 57 | # reshape to (N // 4, K), FP16 format 58 | Packed_Kernel = Packed_Kernel.reshape(N // interleave, K) 59 | qweight = ( 60 | torch.tensor(Packed_Kernel.astype("int16")) 61 | .to(unpacked_qweight.device) 62 | .contiguous() 63 | ) 64 | return qweight 65 | -------------------------------------------------------------------------------- /external/awq/test_awq_quantize.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from optimum.quanto import AffineQuantizer, MaxOptimizer, qint4, ungroup 5 | 6 | 7 | def awq_quantize(base, scales, zeros, group_size): 8 | _, in_features = base.shape 9 | scale_zeros = scales * zeros 10 | intweight = [] 11 | # From https://github.com/casper-hansen/AutoAWQ/blob/main/awq/modules/linear/gemv_fast.py#L165 12 | for idx in range(in_features): 13 | intweight.append( 14 | torch.round( 15 | (base[:, idx] + scale_zeros[:, idx // group_size]) 16 | / scales[:, idx // group_size] 17 | ).to(torch.uint8)[:, None] 18 | ) 19 | intweight = torch.cat(intweight, dim=1) 20 | return intweight 21 | 22 | 23 | @pytest.mark.parametrize("in_features, out_features", [(256, 512), (1024, 1024)]) 24 | def test_awq_quantize(in_features, out_features): 25 | """Verify that AWQ quantization is equivalent to quanto affine quantization 26 | """ 27 | shape = (out_features, in_features) 28 | base = torch.rand(shape, dtype=torch.float16) 29 | group_size = 128 30 | 31 | # Quantize using quanto 32 | scale, zeropoint = MaxOptimizer()(base, bits=4, axis=0, group_size=128) 33 | quanto_base = AffineQuantizer.apply(base, qint4, 0, group_size, scale, zeropoint) 34 | # Extract quantized data, unpack and ungroup to recover original shape 35 | quanto_data = ungroup(quanto_base._data.unpack(), axis=0, orig_shape=shape) 36 | 37 | # Reshape scale and zeropoint as expected by awq 38 | awq_shape = (out_features, in_features // group_size) 39 | scale = scale.reshape(awq_shape) 40 | zeropoint = zeropoint.reshape(awq_shape) 41 | 42 | # Compare with awq quantization 43 | awq_data = awq_quantize(base, scale, zeropoint, group_size) 44 | # FIX: AWQ does not clamp values before packing 45 | 
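# (Added note) torch.round() in awq_quantize() can produce 2**4 (= 16) for values at
# the very top of the quantization range, because the zero-point is folded in before
# rounding. quanto clamps to the valid 4-bit range [0, 15] before packing, while the
# AWQ reference code does not, so clamp here to compare like with like.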
qmax = 2 ** 4 - 1 46 | awq_data = torch.clamp(awq_data, 0, qmax) 47 | 48 | mismatches = quanto_data != awq_data 49 | n = torch.sum(mismatches).numpy() 50 | rate = n / base.numel() 51 | print(f"Mismatches: {n}/{base.numel()} ({100 * rate:.6f} %)") 52 | # Extract mismatches 53 | display = 10 54 | quanto_values = torch.masked_select(quanto_data, mismatches)[:display] 55 | awq_values = torch.masked_select(awq_data, mismatches)[:display] 56 | print(f"First {display} mismatches") 57 | print(list(quanto_values.numpy())) 58 | print(list(awq_values.numpy())) 59 | # Due to a slightly different order of operations (zero is multiplied by scale before subtracting it), 60 | # there are some mismatches 61 | assert rate < 5e-4 62 | -------------------------------------------------------------------------------- /external/smoothquant/README.md: -------------------------------------------------------------------------------- 1 | # SmoothQuant original conversion script 2 | 3 | This converts an OPT or Bloom [🤗 transformers](https://github.com/huggingface/transformers) model to a "smoothed" version, as described in 4 | [SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models](https://arxiv.org/abs/2211.10438). 5 | 6 | ```bash 7 | $ python smoothquant.py --model facebook/opt-1.3b --save-path smoothed-models/facebook/opt-1.3b 8 | ``` 9 | 10 | Note: due to hard-coded assumptions on model architecture in the script, this only works for OPT models that apply the layer_norm 11 | before the attention (`do_layer_norm_before=true` in `config.json`). This means all models but `facebook/opt-350m`. 12 | -------------------------------------------------------------------------------- /optimum/quanto/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | __version__ = "0.2.7dev" 16 | 17 | from .calibrate import * 18 | from .library import * 19 | from .models import * 20 | from .nn import * 21 | from .quantize import * 22 | from .tensor import * 23 | -------------------------------------------------------------------------------- /optimum/quanto/library/README.md: -------------------------------------------------------------------------------- 1 | # Quanto operations library 2 | 3 | This contains the `quanto::` operations, available in python under `torch.ops.quanto`. 4 | 5 | To add a new operation: 6 | 7 | - add a definition for the operation in `library/ops.py`, 8 | - provide a default implementation using pytorch operators only under `library/python`, 9 | - provide optimized kernels for all devices under `library/extensions`. 10 | -------------------------------------------------------------------------------- /optimum/quanto/library/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from .extensions import *
16 | from .qbytes_mm import *
17 | from .quantize import *
18 | from .unpack import *
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/README.md: --------------------------------------------------------------------------------
1 | # Quanto library extensions
2 | 
3 | This folder contains the device-specific `quanto::` operations.
4 | 
5 | Implementations can be provided as part of:
6 | 
7 | - the generic C++ pytorch extension under `cpp`,
8 | - the CUDA extension under `cuda`,
9 | - the HIP (ROCm) extension under `hip`,
10 | - the Metal Performance Shader extension under `mps`,
11 | - the XPU SYCL extension under `xpu`.
12 | 
13 | To provide a device-specific implementation of an operation that already has a default implementation (such as unpack), use the following syntax:
14 | 
15 | ```python
16 | @torch.library.impl("quanto::unpack", ["CPU", "CUDA"])
17 | def unpack(packed: torch.Tensor, bits: int) -> torch.Tensor:
18 |     return ext.unpack(packed, bits)
19 | ```
20 | 
21 | To declare a new device-specific operation, you need to add it to the library:
22 | 
23 | ```python
24 | torch.library.define(
25 |     "quanto::gemm_f16i4",
26 |     "(Tensor input,"
27 |     " Tensor other,"
28 |     " Tensor other_scale,"
29 |     " Tensor other_shift,"
30 |     " int group_size)"
31 |     " -> Tensor",
32 | )
33 | ```
34 | 
35 | Then you can provide its implementation:
36 | 
37 | ```python
38 | @torch.library.impl("quanto::gemm_f16i4", ["CUDA"])
39 | def gemm_f16i4(
40 |     input: torch.Tensor,
41 |     other: torch.Tensor,
42 |     scales: torch.Tensor,
43 |     shift: torch.Tensor,
44 |     group_size: int,
45 | ) -> torch.Tensor:
46 |     ...
47 | ```
48 | 
49 | Please refer to each extension folder for examples.
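50 | 
51 | Once registered, calls are dispatched automatically based on the device of the input tensors. As a minimal sketch of how a caller sees this (assuming `optimum.quanto` has been imported, so that the operations are registered):
52 | 
53 | ```python
54 | import torch
55 | 
56 | packed = torch.randint(256, (16, 16), dtype=torch.uint8)
57 | # Dispatched to the CPU kernel (or the default implementation)
58 | unpacked = torch.ops.quanto.unpack(packed, 4)
59 | if torch.cuda.is_available():
60 |     # Same call, dispatched to the CUDA kernel
61 |     unpacked_cuda = torch.ops.quanto.unpack(packed.to("cuda"), 4)
62 | ```
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/__init__.py: --------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.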
14 | 
15 | import platform
16 | 
17 | import torch
18 | from packaging import version
19 | 
20 | from .cpp import *
21 | from .extension import *
22 | 
23 | 
24 | if torch.cuda.is_available() and platform.system() == "Linux":
25 |     if torch.version.cuda:
26 |         from .cuda import *
27 |     elif torch.version.hip:
28 |         from .hip import *
29 | 
30 | if torch.backends.mps.is_available():
31 |     from .mps import *
32 | 
33 | 
34 | def _is_xpu_available():
35 |     # SYCL extension support was added in torch>=2.7 on Linux
36 |     if platform.system() != "Linux":
37 |         return False
38 |     if version.parse(torch.__version__).release < version.parse("2.7").release:
39 |         return False
40 |     return torch.xpu.is_available()
41 | 
42 | 
43 | if _is_xpu_available():
44 |     from .xpu import *
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cpp/README.md: --------------------------------------------------------------------------------
1 | # Quanto generic C++ extension
2 | 
3 | Kernels in this extension must be written in plain C++.
4 | 
5 | They can use any pytorch operation defined under `aten::` or `c10::`.
6 | 
7 | To add a new implementation for an operation defined in the library (see `library/README.md`):
8 | 
9 | - add the corresponding `.cpp` file to the list of sources in `__init__.py`,
10 | - add a binding to `pybind_module.cpp`,
11 | - provide an implementation calling the binding in `__init__.py`.
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cpp/__init__.py: --------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import os
16 | 
17 | import torch
18 | 
19 | from ..extension import Extension, register_extension
20 | 
21 | 
22 | __all__ = []
23 | 
24 | 
25 | ext = Extension(
26 |     "quanto_cpp",
27 |     root_dir=os.path.dirname(__file__),
28 |     sources=["unpack.cpp", "pybind_module.cpp"],
29 |     extra_cflags=["-O3"],
30 | )
31 | register_extension(ext)
32 | 
33 | 
34 | @torch.library.impl("quanto::unpack", ["CPU"])
35 | def unpack_cpp(t: torch.Tensor, bits: int):
36 |     return ext.lib.unpack(t, bits)
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cpp/pybind_module.cpp: --------------------------------------------------------------------------------
1 | // Copyright 2024 The HuggingFace Team. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <torch/extension.h>
16 | #include "unpack.h"
17 | 
18 | // !IMPORTANT! Some python objects such as dtype, device, are not mapped to C++ types,
19 | // and need to be explicitly converted using dedicated helpers before calling a C++ method.
20 | // As a consequence, when an operation takes such an object as parameter, instead
21 | // of creating a binding directly to the C++ method, you must create a binding to a
22 | // lambda method that converts the unmapped types and calls the C++ method.
23 | 
24 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
25 |   m.def("unpack", &unpack, "unpack");
26 | }
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cpp/unpack.cpp: --------------------------------------------------------------------------------
1 | // Copyright 2024 The HuggingFace Team. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include "unpack.h"
16 | #include <torch/extension.h>
17 | 
18 | 
19 | static torch::Tensor unpack_4bit(torch::Tensor &t) {
20 |     return torch::cat({
21 |         (t & 0x0F),
22 |         (t & 0xF0).__rshift__(4)
23 |     },
24 |     0);
25 | }
26 | 
27 | static torch::Tensor unpack_2bit(torch::Tensor &t) {
28 |     return torch::cat({
29 |         (t & 0x03),
30 |         (t & 0x0C).__rshift__(2),
31 |         (t & 0x30).__rshift__(4),
32 |         (t & 0xC0).__rshift__(6)
33 |     },
34 |     0);
35 | }
36 | 
37 | torch::Tensor unpack(torch::Tensor &t, int bits) {
38 |     TORCH_CHECK(t.scalar_type() == torch::kUInt8, "Unsupported data type: ", t.scalar_type());
39 |     switch(bits) {
40 |       case 4:
41 |         return unpack_4bit(t);
42 |       case 2:
43 |         return unpack_2bit(t);
44 |       default:
45 |         throw std::invalid_argument("Can only unpack 2-bit or 4-bit tensors.");
46 |     }
47 | }
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cpp/unpack.h: --------------------------------------------------------------------------------
1 | // Copyright 2024 The HuggingFace Team. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <torch/extension.h>
16 | 
17 | torch::Tensor unpack(torch::Tensor &t, int bits);
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/README.md: --------------------------------------------------------------------------------
1 | # Quanto generic CUDA extension
2 | 
3 | Kernels in this extension can use both the C++ and CUDA syntax.
4 | 
5 | They can use any pytorch operation defined under `aten::` or `c10::`.
6 | 
7 | To add a new implementation for an operation defined in the library (see `library/README.md`):
8 | 
9 | - add the corresponding `.cpp` or `.cu` file to the list of sources in `__init__.py`,
10 | - add a binding to `pybind_module.cpp`,
11 | - provide an implementation calling the binding in `__init__.py`.
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/awq/v2/gemm_cuda.h: --------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 | 
3 | torch::Tensor awq_v2_gemm_f16i4(torch::Tensor _in_feats, torch::Tensor _kernel, torch::Tensor _scales, torch::Tensor _zeros);
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/awq/v2/gemv_cuda.h: --------------------------------------------------------------------------------
1 | #pragma once
2 | #include <torch/extension.h>
3 | 
4 | torch::Tensor awq_v2_gemv_f16i4(
5 |     torch::Tensor _in_feats,
6 |     torch::Tensor _kernel,
7 |     torch::Tensor _scaling_factors,
8 |     torch::Tensor _zeros,
9 |     int m,
10 |     int n,
11 |     int k,
12 |     int group_size);
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/marlin/COPYRIGHT: --------------------------------------------------------------------------------
1 | These kernels were vendored from VLLM. The Marlin kernels were developed
2 | by Elias Frantar and extended by Neural Magic.
3 | 
4 | ---
5 | 
6 | Copyright (C) Marlin.2024 Elias Frantar
7 | Modified by Neural Magic
8 | Copyright 2024 The vLLM team.
9 | 
10 | Licensed under the Apache License, Version 2.0 (the "License");
11 | you may not use this file except in compliance with the License.
12 | You may obtain a copy of the License at
13 | 
14 |     http://www.apache.org/licenses/LICENSE-2.0
15 | 
16 | Unless required by applicable law or agreed to in writing, software
17 | distributed under the License is distributed on an "AS IS" BASIS,
18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19 | See the License for the specific language governing permissions and
20 | limitations under the License.
21 | 
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/marlin/fp8_marlin.cuh: --------------------------------------------------------------------------------
1 | // #pragma once
2 | #include <torch/extension.h>
3 | #include <cuda_runtime.h>
4 | 
5 | 
6 | // #ifndef _fp8_marlin_cuh
7 | // #define _fp8_marlin_cuh
8 | 
9 | // #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
10 | //   assert(0);
11 | // #else
12 | torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
13 |                               torch::Tensor& b_scales, torch::Tensor& workspace,
14 |                               int64_t num_bits, int64_t size_m, int64_t size_n,
15 |                               int64_t size_k);
16 | // #endif
17 | 
18 | // #endif
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/marlin/gptq_marlin.cuh: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <torch/extension.h>
4 | 
5 | #include <ATen/cuda/CUDAContext.h>
6 | #include <c10/cuda/CUDAGuard.h>
7 | #include <cuda.h>
8 | #include <cuda_fp16.h>
9 | #include <cuda_runtime.h>
10 | #include <iostream>
11 | 
12 | namespace gptq_marlin {
13 | 
14 | // 8 warps are a good choice since every SM has 4 schedulers and having more
15 | // than 1 warp per schedule allows some more latency hiding. At the same time,
16 | // we want relatively few warps to have many registers per warp and small tiles.
17 | static constexpr int default_threads = 256;
18 | 
19 | static constexpr int pipe_stages =
20 |     4;  // 4 pipeline stages fit into shared memory
21 | 
22 | static constexpr int min_thread_n = 64;
23 | static constexpr int min_thread_k = 64;
24 | 
25 | static constexpr int tile_size = 16;
26 | static constexpr int max_par = 16;
27 | 
28 | template <typename T, int n>
29 | struct Vec {
30 |   T elems[n];
31 |   __device__ T& operator[](int i) { return elems[i]; }
32 | };
33 | 
34 | using I4 = Vec<int, 4>;
35 | 
36 | constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }
37 | 
38 | #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
39 | // No support for async
40 | #else
41 | 
42 | __device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr,
43 |                                       bool pred = true) {
44 |   const int BYTES = 16;
45 |   uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
46 |   asm volatile(
47 |       "{\n"
48 |       "   .reg .pred p;\n"
49 |       "   setp.ne.b32 p, %0, 0;\n"
50 |       "   @p cp.async.cg.shared.global [%1], [%2], %3;\n"
51 |       "}\n" ::"r"((int)pred),
52 |       "r"(smem), "l"(glob_ptr), "n"(BYTES));
53 | }
54 | 
55 | __device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
56 |   const int BYTES = 16;
57 |   uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
58 |   asm volatile(
59 |       "{\n"
60 |       "   cp.async.cg.shared.global [%0], [%1], %2;\n"
61 |       "}\n" ::"r"(smem),
62 |       "l"(glob_ptr), "n"(BYTES));
63 | }
64 | 
65 | __device__ inline void cp_async_fence() {
66 |   asm volatile("cp.async.commit_group;\n" ::);
67 | }
68 | 
69 | template <int n>
70 | __device__ inline void cp_async_wait() {
71 |   asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
72 | }
73 | 
74 | #endif
75 | 
76 | }  // namespace gptq_marlin
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/marlin/gptq_marlin_dtypes.cuh: --------------------------------------------------------------------------------
1 | 
2 | #ifndef _data_types_cuh
3 | #define _data_types_cuh
4 | #include "gptq_marlin.cuh"
5 | #include <cuda_fp16.h>
6 | #include <cuda_bf16.h>
7 | 
8 | namespace gptq_marlin {
9 | 
10 | template <typename scalar_t>
11 | class ScalarType {};
12 | 
13 | template <>
14 | class ScalarType<half> {
15 |  public:
16 |   using scalar_t = half;
17 |   using scalar_t2 = half2;
18 | 
19 |   // Matrix fragments for tensor core instructions; their precise layout is
20 |   // documented here:
21 |   // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type
22 |   using FragA = Vec<half2, 4>;
23 |   using FragB = Vec<half2, 2>;
24 |   using FragC = Vec<float, 4>;
25 |   using FragS = Vec<half2, 1>;
26 | 
27 |   static __device__ float inline num2float(const half x) {
28 |     return __half2float(x);
29 |   }
30 | 
31 |   static __device__ half2 inline num2num2(const half x) {
32 |     return __half2half2(x);
33 |   }
34 | 
35 |   static __device__ half2 inline nums2num2(const half x1, const half x2) {
36 |     return __halves2half2(x1, x2);
37 |   }
38 | 
39 |   static __host__ __device__ half inline float2num(const float x) {
40 |     return __float2half(x);
41 |   }
42 | };
43 | 
44 | template <>
45 | class ScalarType<nv_bfloat16> {
46 |  public:
47 |   using scalar_t = nv_bfloat16;
48 |   using scalar_t2 = nv_bfloat162;
49 | 
50 |   using FragA = Vec<nv_bfloat162, 4>;
51 |   using FragB = Vec<nv_bfloat162, 2>;
52 |   using FragC = Vec<float, 4>;
53 |   using FragS = Vec<nv_bfloat162, 1>;
54 | 
55 | #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
56 |   static __device__ float inline num2float(const nv_bfloat16 x) {
57 |     return __bfloat162float(x);
58 |   }
59 | 
60 |   static __device__ nv_bfloat162 inline num2num2(const nv_bfloat16 x) {
61 |     return __bfloat162bfloat162(x);
62 |   }
63 | 
64 |   static __device__ nv_bfloat162 inline nums2num2(const nv_bfloat16 x1,
65 |                                                   const nv_bfloat16 x2) {
66 |     return __halves2bfloat162(x1, x2);
67 |   }
68 | 
69 |   static __host__ __device__ nv_bfloat16 inline float2num(const float x) {
70 |     return __float2bfloat16(x);
71 |   }
72 | #endif
73 | };
74 | 
75 | }  // namespace gptq_marlin
76 | 
77 | #endif
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/marlin/gptq_marlin_repack.cuh: --------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 | #include <cuda.h>
3 | #include <cuda_runtime.h>
4 | 
5 | #ifndef _gptq_marlin_repack_cuh
6 | #define _gptq_marlin_repack_cuh
7 | 
8 | torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
9 |                                  int64_t size_k, int64_t size_n,
10 |                                  int64_t num_bits);
11 | 
12 | #endif
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/marlin/marlin_cuda.cpp: --------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (C) Marlin.2024 Elias Frantar (elias.frantar@ist.ac.at)
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | #include "marlin_cuda.h"
17 | 
18 | #include <torch/all.h>
19 | #include <torch/python.h>
20 | #include <ATen/cuda/CUDAContext.h>
21 | #include <cuda_runtime.h>
22 | 
23 | #include "marlin_cuda_kernel.cuh"
24 | 
25 | const int ERR_PROB_SHAPE = 1;
26 | const int ERR_KERN_SHAPE = 2;
27 | 
28 | void mul(
29 |   const torch::Tensor& A,
30 |   const torch::Tensor& B,
31 |   torch::Tensor& C,
32 |   const torch::Tensor& s,
33 |   const torch::Tensor& sz, // ADDED: add scaled zero point
34 |   torch::Tensor& workspace,
35 |   int thread_k,
36 |   int thread_n,
37 |   int sms,
38 |   int max_par
39 | ) {
40 |   int prob_m = A.size(0);
41 |   int prob_n = C.size(1);
42 |   int prob_k = A.size(1);
43 |   int groupsize = (s.size(0) == 1) ? -1 : prob_k / s.size(0);
44 |   if (groupsize != -1 && groupsize * s.size(0) != prob_k)
45 |     AT_ERROR("k=", prob_k, " not compatible with ", s.size(0), " groups.");
46 |   if (workspace.numel() < prob_n / 128 * max_par)
47 |     AT_ERROR("workspace must be of size at least ", prob_n / 128 * max_par, ".");
48 |   int dev = A.get_device();
49 |   int err = marlin_cuda(
50 |     A.data_ptr(),
51 |     B.data_ptr(),
52 |     C.data_ptr(),
53 |     s.data_ptr(),
54 |     sz.data_ptr(), // ADDED: add scaled zero point
55 |     prob_m, prob_n, prob_k,
56 |     workspace.data_ptr(),
57 |     groupsize,
58 |     dev,
59 |     at::cuda::getCurrentCUDAStream(dev),
60 |     thread_k,
61 |     thread_n,
62 |     sms,
63 |     max_par
64 |   );
65 |   if (err == ERR_PROB_SHAPE) {
66 |     AT_ERROR(
67 |       "Problem (m=", prob_m, ", n=", prob_n, ", k=", prob_k, ")",
68 |       " not compatible with thread_k=", thread_k, ", thread_n=", thread_n, "."
69 |     );
70 |   } else if (err == ERR_KERN_SHAPE) {
71 |     AT_ERROR(
72 |       "No kernel implementation for thread_k=", thread_k, ", thread_n=", thread_n, ", groupsize=", groupsize, "."
73 |     );
74 |   }
75 | }
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/marlin/marlin_cuda.h: --------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (C) Marlin.2024 Elias Frantar (elias.frantar@ist.ac.at)
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | #include <torch/extension.h>
17 | 
18 | void mul(
19 |   const torch::Tensor& A,
20 |   const torch::Tensor& B,
21 |   torch::Tensor& C,
22 |   const torch::Tensor& s,
23 |   const torch::Tensor& sz,
24 |   torch::Tensor& workspace,
25 |   int thread_k = -1,
26 |   int thread_n = -1,
27 |   int sms = -1,
28 |   int max_par = 8
29 | );
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/marlin/marlin_cuda_kernel.cuh: --------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (C) Marlin.2024 Elias Frantar (elias.frantar@ist.ac.at)
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | #include <cuda_runtime.h>
17 | 
18 | int marlin_cuda(
19 |   const void* A,
20 |   const void* B,
21 |   void* C,
22 |   void* s,
23 |   void* sz, // ADDED: add scaled zero point
24 |   int prob_m,
25 |   int prob_n,
26 |   int prob_k,
27 |   void* workspace,
28 |   int groupsize = -1,
29 |   int dev = 0,
30 |   cudaStream_t stream = 0,
31 |   int thread_k = -1,
32 |   int thread_n = -1,
33 |   int sms = -1,
34 |   int max_par = 16
35 | );
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/pybind_module.cpp: --------------------------------------------------------------------------------
1 | // Copyright 2024 The HuggingFace Team. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <torch/extension.h>
16 | #include "awq/v2/gemm_cuda.h"
17 | #include "awq/v2/gemv_cuda.h"
18 | #include "unpack.h"
19 | #include "marlin/fp8_marlin.cuh"
20 | #include "marlin/gptq_marlin_repack.cuh"
21 | #include "marlin/marlin_cuda.h"
22 | 
23 | // !IMPORTANT! Some python objects such as dtype, device, are not mapped to C++ types,
24 | // and need to be explicitly converted using dedicated helpers before calling a C++ method.
25 | // As a consequence, when an operation takes such an object as parameter, instead
26 | // of creating a binding directly to the C++ method, you must create a binding to a
27 | // lambda method that converts the unmapped types and calls the C++ method.
28 | // See the binding of quantize_symmetric for instance.
29 | 
30 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
31 |   m.def("awq_v2_gemm_f16i4", &awq_v2_gemm_f16i4, "awq_v2_gemm_f16i4");
32 |   m.def("awq_v2_gemv_f16i4", &awq_v2_gemv_f16i4, "awq_v2_gemv_f16i4");
33 |   m.def("gptq_marlin_repack", &gptq_marlin_repack, "gptq_marlin_repack");
34 |   m.def("fp8_marlin_gemm", &fp8_marlin_gemm, "fp8_marlin_gemm");
35 |   m.def("marlin_gemm_f16i4", &mul, "marlin_gemm_f16i4");
36 |   m.def("unpack", &unpack, "unpack");
37 | }
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/unpack.cu: --------------------------------------------------------------------------------
1 | // Copyright 2024 The HuggingFace Team. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <torch/extension.h>
16 | #include <cuda.h>
17 | #include <cuda_runtime.h>
18 | #include <c10/cuda/CUDAException.h>
19 | 
20 | inline unsigned int cdiv(unsigned int a, unsigned int b) { return (a + b - 1) / b;}
21 | #define BLOCK_SIZE 256
22 | 
23 | using namespace at;
24 | 
25 | 
26 | static torch::Tensor allocate_output(const torch::Tensor& input, int bits) {
27 |     int n_packed = 8 / bits;
28 |     auto output_shape = input.sizes().vec();
29 |     output_shape[0] = output_shape[0] * n_packed;
30 |     return torch::empty(output_shape, input.options());
31 | }
32 | 
33 | __global__ void unpack_4bit_kernel(unsigned char* input, unsigned char* output, int n) {
34 |     int i = blockIdx.x*blockDim.x + threadIdx.x;
35 |     if(i>=n) return;
36 | 
37 |     output[i] = (input[i] & 0x0F);
38 |     output[i + n] = (input[i] & 0xF0) >> 4;
39 | }
40 | 
41 | static torch::Tensor unpack_4bit(const torch::Tensor& input){
42 | 
43 |     auto output = allocate_output(input, 4);
44 | 
45 |     const auto numel = input.numel();
46 |     int blocks = cdiv(numel, BLOCK_SIZE);
47 |     unpack_4bit_kernel<<<blocks, BLOCK_SIZE>>>(
48 |         input.data_ptr<unsigned char>(),
49 |         output.data_ptr<unsigned char>(),
50 |         numel
51 |     );
52 | 
53 |     C10_CUDA_KERNEL_LAUNCH_CHECK();
54 | 
55 |     return output;
56 | }
57 | 
58 | __global__ void unpack_2bit_kernel(unsigned char* input, unsigned char* output, int n) {
59 |     int i = blockIdx.x*blockDim.x + threadIdx.x;
60 |     if(i>=n) return;
61 | 
62 |     output[i] = (input[i] & 0x03);
63 |     output[i + n] = (input[i] & 0x0C) >> 2;
64 |     output[i + n*2] = (input[i] & 0x30) >> 4;
65 |     output[i + n*3] = (input[i] & 0xC0) >> 6;
66 | }
67 | 
68 | static torch::Tensor unpack_2bit(const torch::Tensor& input){
69 | 
70 |     auto output = allocate_output(input, 2);
71 | 
72 |     const auto numel = input.numel();
73 |     int blocks = cdiv(numel, BLOCK_SIZE);
74 |     unpack_2bit_kernel<<<blocks, BLOCK_SIZE>>>(
75 |         input.data_ptr<unsigned char>(),
76 |         output.data_ptr<unsigned char>(),
77 |         numel
78 |     );
79 | 
80 |     C10_CUDA_KERNEL_LAUNCH_CHECK();
81 | 
82 |     return output;
83 | }
84 | 
85 | torch::Tensor unpack(torch::Tensor &t, int bits) {
86 |     TORCH_CHECK(t.scalar_type() == torch::kUInt8, "Unsupported data type: ", t.scalar_type());
87 |     TORCH_CHECK(t.device().is_cuda(), "t must be a CUDA tensor.");
88 |     TORCH_CHECK(t.is_contiguous(), "t must be contiguous.");
89 |     switch(bits) {
90 |       case 4:
91 |         return unpack_4bit(t);
92 |       case 2:
93 |         return unpack_2bit(t);
94 |       default:
95 |         throw std::invalid_argument("Can only unpack 2-bit or 4-bit tensors.");
96 |     }
97 | }
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/unpack.h: --------------------------------------------------------------------------------
1 | // Copyright 2024 The HuggingFace Team. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <torch/extension.h>
16 | 
17 | torch::Tensor unpack(torch::Tensor &t, int bits);
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/extension.py: --------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import warnings
4 | from typing import List
5 | 
6 | import torch
7 | from torch.utils.cpp_extension import load
8 | 
9 | 
10 | __all__ = ["is_extension_available", "get_extension"]
11 | 
12 | 
13 | class Extension(object):
14 |     def __init__(
15 |         self,
16 |         name: str,
17 |         root_dir: str,
18 |         sources: List[str],
19 |         extra_cflags: List[str] = None,
20 |         extra_cuda_cflags: List[str] = None,
21 |     ):
22 |         self.name = name
23 |         self.sources = [f"{root_dir}/{source}" for source in sources]
24 |         self.extra_cflags = extra_cflags
25 |         self.extra_cuda_cflags = extra_cuda_cflags
26 |         self.build_directory = os.path.join(root_dir, "build")
27 |         self._lib = None
28 | 
29 |     @property
30 |     def lib(self):
31 |         if self._lib is None:
32 |             # We only load the extension when the lib is required
33 |             version_file = os.path.join(self.build_directory, "pytorch_version.txt")
34 |             if os.path.exists(version_file):
35 |                 # The extension has already been built: check the torch version for which it was built
36 |                 with open(version_file, "r") as f:
37 |                     pytorch_build_version = f.read().rstrip()
38 |                 if pytorch_build_version != torch.__version__:
39 |                     shutil.rmtree(self.build_directory)
40 |                     warnings.warn(
41 |                         f"{self.name} was compiled with pytorch {pytorch_build_version}, but {torch.__version__} is installed: it will be recompiled."
42 |                     )
43 |             os.makedirs(self.build_directory, exist_ok=True)
44 |             self._lib = load(
45 |                 name=self.name,
46 |                 sources=self.sources,
47 |                 extra_cflags=self.extra_cflags,
48 |                 extra_cuda_cflags=self.extra_cuda_cflags,
49 |                 build_directory=self.build_directory,
50 |             )
51 |             if not os.path.exists(version_file):
52 |                 with open(version_file, "w") as f:
53 |                     f.write(torch.__version__)
54 |         return self._lib
55 | 
56 | 
57 | _extensions = {}
58 | 
59 | 
60 | def register_extension(extension: Extension):
61 |     assert extension.name not in _extensions
62 |     _extensions[extension.name] = extension
63 | 
64 | 
65 | def get_extension(extension_type: str):
66 |     """Get an extension
67 | 
68 |     Args:
69 |         extension_type (`str`):
70 |             The extension type.
71 |     Returns:
72 |         The corresponding extension.
73 |     """
74 |     return _extensions[extension_type]
75 | 
76 | 
77 | def is_extension_available(extension_type: str):
78 |     """Check if an extension is available
79 | 
80 |     Args:
81 |         extension_type (`str`):
82 |             The extension type.
83 |     Returns:
84 |         True if the extension is available.
85 |     """
86 |     return extension_type in _extensions
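87 | 
88 | 
89 | # Example (sketch): querying the registry once the device packages have been
90 | # imported (each device package calls register_extension() at import time):
91 | #
92 | #     if is_extension_available("quanto_cpp"):
93 | #         lib = get_extension("quanto_cpp").lib  # first access triggers the JIT build
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/hip/__init__.py: --------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.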
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import os
16 | 
17 | import torch
18 | 
19 | from ..extension import Extension, register_extension
20 | 
21 | 
22 | __all__ = []
23 | 
24 | 
25 | ext = Extension(
26 |     "quanto_hip",
27 |     root_dir=os.path.dirname(__file__),
28 |     sources=["unpack.cu", "pybind_module.cpp"],
29 |     extra_cflags=["-std=c++17"],
30 | )
31 | register_extension(ext)
32 | 
33 | 
34 | # ROCm devices expose the "cuda" device type in pytorch
35 | @torch.library.impl("quanto::unpack", ["CUDA"])
36 | def unpack_hip(t: torch.Tensor, bits: int):
37 |     return ext.lib.unpack(t, bits)
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/hip/pybind_module.cpp: --------------------------------------------------------------------------------
1 | // Copyright 2024 The HuggingFace Team. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <torch/extension.h>
16 | #include "unpack.h"
17 | 
18 | 
19 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
20 |   m.def("unpack", &unpack, "unpack");
21 | }
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/hip/unpack.cu: --------------------------------------------------------------------------------
1 | // Copyright 2024 The HuggingFace Team. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <torch/extension.h>
16 | #include <cuda.h>
17 | #include <cuda_runtime.h>
18 | #include <c10/cuda/CUDAException.h>
19 | 
20 | inline unsigned int cdiv(unsigned int a, unsigned int b) { return (a + b - 1) / b;}
21 | #define BLOCK_SIZE 256
22 | 
23 | using namespace at;
24 | 
25 | 
26 | static torch::Tensor allocate_output(const torch::Tensor& input, int bits) {
27 |     int n_packed = 8 / bits;
28 |     auto output_shape = input.sizes().vec();
29 |     output_shape[0] = output_shape[0] * n_packed;
30 |     return torch::empty(output_shape, input.options());
31 | }
32 | 
33 | __global__ void unpack_4bit_kernel(unsigned char* input, unsigned char* output, int n) {
34 |     int i = blockIdx.x*blockDim.x + threadIdx.x;
35 |     if(i>=n) return;
36 | 
37 |     output[i] = (input[i] & 0x0F);
38 |     output[i + n] = (input[i] & 0xF0) >> 4;
39 | }
40 | 
41 | static torch::Tensor unpack_4bit(const torch::Tensor& input){
42 | 
43 |     auto output = allocate_output(input, 4);
44 | 
45 |     const auto numel = input.numel();
46 |     int blocks = cdiv(numel, BLOCK_SIZE);
47 |     unpack_4bit_kernel<<<blocks, BLOCK_SIZE>>>(
48 |         input.data_ptr<unsigned char>(),
49 |         output.data_ptr<unsigned char>(),
50 |         numel
51 |     );
52 | 
53 |     C10_CUDA_KERNEL_LAUNCH_CHECK();
54 | 
55 |     return output;
56 | }
57 | 
58 | __global__ void unpack_2bit_kernel(unsigned char* input, unsigned char* output, int n) {
59 |     int i = blockIdx.x*blockDim.x + threadIdx.x;
60 |     if(i>=n) return;
61 | 
62 |     output[i] = (input[i] & 0x03);
63 |     output[i + n] = (input[i] & 0x0C) >> 2;
64 |     output[i + n*2] = (input[i] & 0x30) >> 4;
65 |     output[i + n*3] = (input[i] & 0xC0) >> 6;
66 | }
67 | 
68 | static torch::Tensor unpack_2bit(const torch::Tensor& input){
69 | 
70 |     auto output = allocate_output(input, 2);
71 | 
72 |     const auto numel = input.numel();
73 |     int blocks = cdiv(numel, BLOCK_SIZE);
74 |     unpack_2bit_kernel<<<blocks, BLOCK_SIZE>>>(
75 |         input.data_ptr<unsigned char>(),
76 |         output.data_ptr<unsigned char>(),
77 |         numel
78 |     );
79 | 
80 |     C10_CUDA_KERNEL_LAUNCH_CHECK();
81 | 
82 |     return output;
83 | }
84 | 
85 | torch::Tensor unpack(torch::Tensor &t, int bits) {
86 |     TORCH_CHECK(t.scalar_type() == torch::kUInt8, "Unsupported data type: ", t.scalar_type());
87 |     TORCH_CHECK(t.device().is_cuda(), "t must be a CUDA tensor.");
88 |     TORCH_CHECK(t.is_contiguous(), "t must be contiguous.");
89 |     switch(bits) {
90 |       case 4:
91 |         return unpack_4bit(t);
92 |       case 2:
93 |         return unpack_2bit(t);
94 |       default:
95 |         throw std::invalid_argument("Can only unpack 2-bit or 4-bit tensors.");
96 |     }
97 | }
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/hip/unpack.h: --------------------------------------------------------------------------------
1 | // Copyright 2024 The HuggingFace Team. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <torch/extension.h>
16 | 
17 | torch::Tensor unpack(torch::Tensor &t, int bits);
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/mps/README.md: --------------------------------------------------------------------------------
1 | # Quanto Metal Performance Shaders extension
2 | 
3 | To add a new implementation for an operation defined in the library (see `library/README.md`):
4 | 
5 | - add the corresponding `.mm` file to the list of sources in `__init__.py`,
6 | - add a binding to `pybind_module.cpp`,
7 | - provide an implementation calling the binding in `__init__.py`.
8 | 
9 | Note: torch JIT extensions for MPS require the xcode command-line tools.
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/mps/__init__.py: --------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import os
16 | 
17 | import torch
18 | 
19 | from ..extension import Extension, register_extension
20 | 
21 | 
22 | __all__ = []
23 | 
24 | 
25 | ext = Extension(
26 |     "quanto_mps",
27 |     root_dir=os.path.dirname(__file__),
28 |     sources=["unpack.mm", "pybind_module.cpp"],
29 |     extra_cflags=["-std=c++17"],
30 | )
31 | register_extension(ext)
32 | 
33 | 
34 | @torch.library.impl("quanto::unpack", "MPS")
35 | def unpack_mps(t: torch.Tensor, bits: int):
36 |     return ext.lib.unpack(t, bits)
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/mps/pybind_module.cpp: --------------------------------------------------------------------------------
1 | // Copyright 2024 The HuggingFace Team. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <torch/extension.h>
16 | #include "unpack.h"
17 | 
18 | 
19 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
20 |   m.def("unpack", &unpack, "unpack");
21 | }
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/mps/unpack.h: --------------------------------------------------------------------------------
1 | // Copyright 2024 The HuggingFace Team. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <torch/extension.h>
16 | 
17 | torch::Tensor unpack(const torch::Tensor &input, int bits);
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/xpu/__init__.py: --------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | # Copyright 2024 Intel Corporation. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import os
17 | 
18 | import torch
19 | 
20 | from ..extension import Extension, register_extension
21 | 
22 | 
23 | __all__ = []
24 | 
25 | 
26 | module_path = os.path.dirname(__file__)
27 | sources = [
28 |     "unpack.sycl",
29 |     "pybind_module.cpp",
30 | ]
31 | ext = Extension(
32 |     "quanto_xpu",
33 |     root_dir=module_path,
34 |     sources=sources,
35 | )
36 | register_extension(ext)
37 | 
38 | 
39 | @torch.library.impl("quanto::unpack", "XPU")
40 | def unpack_xpu(t: torch.Tensor, bits: int):
41 |     return ext.lib.unpack(t, bits)
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/xpu/pybind_module.cpp: --------------------------------------------------------------------------------
1 | // Copyright 2024 The HuggingFace Team. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <torch/extension.h>
16 | #include "unpack.h"
17 | 
18 | // !IMPORTANT! Some python objects such as dtype, device, are not mapped to C++ types,
19 | // and need to be explicitly converted using dedicated helpers before calling a C++ method.
20 | // As a consequence, when an operation takes such an object as parameter, instead
21 | // of creating a binding directly to the C++ method, you must create a binding to a
22 | // lambda method that converts the unmapped types and calls the C++ method.
23 | // See the binding of quantize_symmetric for instance.
24 | 
25 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
26 |   m.def("unpack", &unpack, "unpack");
27 | }
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/xpu/unpack.h: --------------------------------------------------------------------------------
1 | // Copyright 2024 The HuggingFace Team. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <torch/extension.h>
16 | 
17 | torch::Tensor unpack(torch::Tensor &t, int bits);
-------------------------------------------------------------------------------- /optimum/quanto/library/quantize.py: --------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from typing import Union
16 | 
17 | import torch
18 | 
19 | from ..tensor import dtype_info, group
20 | 
21 | 
22 | torch.library.define(
23 |     "quanto::quantize_symmetric", "(Tensor base, ScalarType dtype, int? axis, Tensor scale) -> Tensor"
24 | )
25 | 
26 | 
27 | @torch.library.impl("quanto::quantize_symmetric", "default")
28 | def quantize_symmetric(
29 |     base: torch.Tensor, dtype: torch.dtype, axis: Union[int, None], scale: torch.Tensor
30 | ) -> torch.Tensor:
31 |     # Sanity checks
32 |     if axis is None:
33 |         if scale.ndim > 0:
34 |             raise ValueError("Scale must be a scalar when quantizing per-tensor")
35 |     else:
36 |         if base.ndim == 1:
37 |             raise ValueError("1D Tensors cannot be quantized per-axis")
38 |         if axis == base.ndim - 1:
39 |             # Align on the general convention to index the last dimension
40 |             axis = -1
41 |         if axis not in (0, -1):
42 |             raise ValueError("Quantization is only supported along the first or last axis.")
43 |         if base.shape[axis] == 1:
44 |             raise ValueError(f"Cannot quantize Tensor of shape {base.shape} along axis {axis} of size 1")
45 |         if torch.squeeze(scale).ndim > 1:
46 |             raise ValueError("Quantizing along multiple axes is not supported")
47 |         if scale.ndim != base.ndim:
48 |             raise ValueError(
49 |                 "When quantizing per-axis, the scale must be broadcastable to the base (Tip: try to add missing dims of length one)."
50 |             )
51 |     data = base / scale
52 |     if not dtype.is_floating_point:
53 |         data = torch.round(data)
54 |     info = dtype_info(dtype)
55 |     return torch.clamp(data, min=info.min, max=info.max).to(dtype)
56 | 
57 | 
58 | torch.library.define(
59 |     "quanto::quantize_affine",
60 |     "(Tensor base, int bits, int axis, int? group_size, Tensor scale, Tensor shift) -> Tensor",
61 | )
62 | 
63 | 
64 | @torch.library.impl("quanto::quantize_affine", "default")
65 | def quantize_affine(
66 |     base: torch.Tensor, bits: int, axis: int, group_size: Union[int, None], scale: torch.Tensor, shift: torch.Tensor
67 | ) -> torch.Tensor:
68 |     if axis not in (0, -1):
69 |         raise ValueError("axis parameter must be 0 (first axis) or -1 (last axis)")
70 |     if group_size is not None:
71 |         base = group(base, axis=axis, group_size=group_size)
72 |     if shift.dtype.is_floating_point:
73 |         data = torch.round((base + shift) / scale)
74 |     else:
75 |         # Shift is an integer representing zero (i.e. zero-point)
76 |         data = torch.round(base / scale) + shift
77 | 
78 |     return torch.clamp(data, min=0, max=2**bits - 1).to(torch.uint8)
-------------------------------------------------------------------------------- /optimum/quanto/library/unpack.py: --------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import torch
16 | 
17 | 
18 | torch.library.define("quanto::unpack", "(Tensor self, int bits) -> Tensor")
19 | 
20 | 
21 | @torch.library.impl("quanto::unpack", "default")
22 | def unpack(packed: torch.Tensor, bits: int) -> torch.Tensor:
23 |     """
24 |     Unpack int4 / int2 weights (packed into a uint8 tensor) into a torch.uint8 tensor
25 | 
26 |     What does unpacking mean? Assume we have packed four 2-bit values into 8 bits
27 |     (because torch does not have native support for 2-bit datatypes):
28 | 
29 |     > 1110 0100
30 | 
31 |     Unpacking them means retrieving the original four 2-bit values:
32 | 
33 |     > 0000 0011 | 0000 0010 | 0000 0001 | 0000 0000
34 | 
35 |     Args:
36 |         packed (`torch.Tensor`):
37 |             The packed tensor in `torch.uint8` precision
38 |         bits (`int`):
39 |             The number of bits per encoded value. Can be 2 or 4.
40 |     """
41 |     unpacked = []
42 |     values_per_item = 8 // bits
43 | 
44 |     def rshift(t: torch.Tensor, bits: int):
45 |         if t.device.type == "mps":
46 |             # rshift is not supported on MPS device
47 |             return t // (2**bits)
48 |         return t >> bits
49 | 
50 |     # Unpack each set of values independently
51 |     for i in range(values_per_item):
52 |         mask = 2 ** (bits * (i + 1)) - 1
53 |         unpacked.append(rshift(packed & mask, bits * i))
54 |     # Return the concatenated unpacked tensors
55 |     return torch.cat(unpacked).to(torch.uint8)
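56 | 
57 | 
58 | # Example (sketch): round-trip for the 2-bit case illustrated in the docstring:
59 | #
60 | #     packed = torch.tensor([0b11100100], dtype=torch.uint8)
61 | #     torch.ops.quanto.unpack(packed, 2)  # -> tensor([0, 1, 2, 3], dtype=torch.uint8)
-------------------------------------------------------------------------------- /optimum/quanto/models/__init__.py: --------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.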
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import importlib
16 | 
17 | 
18 | def is_transformers_available() -> bool:
19 |     return importlib.util.find_spec("transformers") is not None
20 | 
21 | 
22 | def is_diffusers_available() -> bool:
23 |     return importlib.util.find_spec("diffusers") is not None
24 | 
25 | 
26 | if is_transformers_available():
27 |     from .transformers_models import *
28 | 
29 | 
30 | if is_diffusers_available():
31 |     from .diffusers_models import *
-------------------------------------------------------------------------------- /optimum/quanto/models/shared_dict.py: --------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import os
16 | from collections.abc import Mapping
17 | from typing import Any, Dict
18 | 
19 | from safetensors import safe_open
20 | 
21 | 
22 | class ShardedStateDict(Mapping):
23 |     """A pytorch state_dict stored in multiple safetensors files
24 | 
25 |     This class implements the `collections.abc.Mapping` interface.
26 |     It can be passed to `torch.nn.Module.load_state_dict()` to recursively
27 |     load the module tensors.
28 |     """
29 | 
30 |     def __init__(self, base_dir: str, tensor_index: Dict[str, str]):
31 |         self._base_dir = base_dir
32 |         self._index = tensor_index
33 |         self._handles = {}
34 | 
35 |     def __iter__(self):
36 |         yield from self._index
37 | 
38 |     def __len__(self):
39 |         return self._index.__len__()
40 | 
41 |     def __getitem__(self, key: Any) -> Any:
42 |         filename = self._index.__getitem__(key)
43 |         if filename not in self._handles:
44 |             f = safe_open(os.path.join(self._base_dir, filename), framework="pytorch")
45 |             self._handles[filename] = f
46 |         f = self._handles[filename]
47 |         return f.get_tensor(key)
48 | 
49 |     def __contains__(self, key: object) -> bool:
50 |         return self._index.__contains__(key)
51 | 
52 |     def keys(self):
53 |         return self._index.keys()
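54 | 
55 | 
56 | # Example (sketch): the tensor_index mapping mirrors the "weight_map" section
57 | # of a sharded checkpoint index file (e.g. model.safetensors.index.json); the
58 | # names below are hypothetical:
59 | #
60 | #     tensor_index = {"linear.weight": "model-00001-of-00002.safetensors"}
61 | #     state_dict = ShardedStateDict("/path/to/checkpoint", tensor_index)
62 | #     module.load_state_dict(state_dict)
-------------------------------------------------------------------------------- /optimum/quanto/nn/__init__.py: --------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.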
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from .qconv2d import *
16 | from .qlayernorm import *
17 | from .qlinear import *
18 | from .qmodule import *
-------------------------------------------------------------------------------- /optimum/quanto/nn/qconv2d.py: --------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from typing import Optional
16 | 
17 | import torch
18 | 
19 | from ..tensor import Optimizer, qtype
20 | from .qmodule import QModuleMixin, register_qmodule
21 | 
22 | 
23 | __all__ = ["QConv2d"]
24 | 
25 | 
26 | @register_qmodule(torch.nn.Conv2d)
27 | class QConv2d(QModuleMixin, torch.nn.Conv2d):
28 |     @classmethod
29 |     def qcreate(
30 |         cls,
31 |         module,
32 |         weights: qtype,
33 |         activations: Optional[qtype] = None,
34 |         optimizer: Optional[Optimizer] = None,
35 |         device: Optional[torch.device] = None,
36 |     ):
37 |         return cls(
38 |             in_channels=module.in_channels,
39 |             out_channels=module.out_channels,
40 |             kernel_size=module.kernel_size,
41 |             stride=module.stride,
42 |             padding=module.padding,
43 |             dilation=module.dilation,
44 |             groups=module.groups,
45 |             bias=module.bias is not None,
46 |             padding_mode=module.padding_mode,
47 |             dtype=module.weight.dtype,
48 |             device=device,
49 |             weights=weights,
50 |             activations=activations,
51 |             optimizer=optimizer,
52 |         )
53 | 
54 |     def forward(self, input: torch.Tensor) -> torch.Tensor:
55 |         return self._conv_forward(input, self.qweight, self.bias)
-------------------------------------------------------------------------------- /optimum/quanto/nn/qlayernorm.py: --------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 15 | from typing import Optional 16 | 17 | import torch 18 | 19 | from ..tensor import Optimizer, qtype 20 | from .qmodule import QModuleMixin, register_qmodule 21 | 22 | 23 | __all__ = ["QLayerNorm"] 24 | 25 | 26 | @register_qmodule(torch.nn.LayerNorm) 27 | class QLayerNorm(QModuleMixin, torch.nn.LayerNorm): 28 | @classmethod 29 | def qcreate( 30 | cls, 31 | module, 32 | weights: Optional[qtype] = None, 33 | activations: Optional[qtype] = None, 34 | optimizer: Optional[Optimizer] = None, 35 | device: Optional[torch.device] = None, 36 | ): 37 | if activations is None: 38 | return None 39 | dtype = None if module.weight is None else module.weight.dtype 40 | return cls( 41 | module.normalized_shape, 42 | module.eps, 43 | module.elementwise_affine, 44 | module.bias is not None, 45 | dtype=dtype, 46 | device=device, 47 | weights=None, # We never quantize QLayerNorm weights 48 | activations=activations, 49 | optimizer=None, # We never quantize QLayerNorm weights 50 | ) 51 | 52 | def forward(self, input: torch.Tensor) -> torch.Tensor: 53 | return torch.nn.functional.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps) 54 | -------------------------------------------------------------------------------- /optimum/quanto/nn/qlinear.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Optional 16 | 17 | import torch 18 | 19 | from ..tensor import Optimizer, qtype 20 | from .qmodule import QModuleMixin, register_qmodule 21 | 22 | 23 | __all__ = ["QLinear"] 24 | 25 | 26 | @register_qmodule(torch.nn.Linear) 27 | class QLinear(QModuleMixin, torch.nn.Linear): 28 | @classmethod 29 | def qcreate( 30 | cls, 31 | module, 32 | weights: qtype, 33 | activations: Optional[qtype] = None, 34 | optimizer: Optional[Optimizer] = None, 35 | device: Optional[torch.device] = None, 36 | ): 37 | return cls( 38 | module.in_features, 39 | module.out_features, 40 | module.bias is not None, 41 | dtype=module.weight.dtype, 42 | device=device, 43 | weights=weights, 44 | activations=activations, 45 | optimizer=optimizer, 46 | quantize_input=True, 47 | ) 48 | 49 | def forward(self, input: torch.Tensor) -> torch.Tensor: 50 | return torch.nn.functional.linear(input, self.qweight, bias=self.bias) 51 | -------------------------------------------------------------------------------- /optimum/quanto/subpackage/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .commands import * 16 | -------------------------------------------------------------------------------- /optimum/quanto/subpackage/commands/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .base import * 16 | -------------------------------------------------------------------------------- /optimum/quanto/subpackage/commands/base.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from optimum.commands import BaseOptimumCLICommand, CommandInfo 16 | from optimum.commands.optimum_cli import optimum_cli_subcommand 17 | 18 | from .quantize import QuantizeCommand 19 | 20 | 21 | __all__ = ["QuantoCommand"] 22 | 23 | 24 | @optimum_cli_subcommand() 25 | class QuantoCommand(BaseOptimumCLICommand): 26 | COMMAND = CommandInfo(name="quanto", help="Hugging Face models quantization tools") 27 | SUBCOMMANDS = ( 28 | CommandInfo( 29 | name="quantize", 30 | help="Quantize Hugging Face models.", 31 | subcommand_class=QuantizeCommand, 32 | ), 33 | ) 34 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .activations import * 16 | from .core import * 17 | from .grouped import * 18 | from .optimizers import * 19 | from .qbits import * 20 | from .qbytes import * 21 | from .qtensor import * 22 | from .qtype import * 23 | from .weights import * 24 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/activations/__init__.py: -------------------------------------------------------------------------------- 1 | from .qbytes import * 2 | from .quantization import * 3 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/activations/quantization.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | 17 | from ..qtype import qtype 18 | from .qbytes import ActivationQBytesTensor 19 | 20 | 21 | __all__ = ["quantize_activation"] 22 | 23 | 24 | def quantize_activation(t: torch.Tensor, qtype: qtype, scale: torch.Tensor): 25 | """Quantize an activation Tensor. 26 | 27 | Activations are always quantized per-tensor with a scalar scale. 28 | 29 | Args: 30 | t (`torch.Tensor`): the Tensor to quantize 31 | qtype (`quanto.qtype`): The target quantization type 32 | scale (`torch.Tensor`): The scalar quantization scale 33 | 34 | Returns: 35 | A quantized Tensor. 36 | """ 37 | if scale.numel() != 1: 38 | raise ValueError("Parameter scale must be a scalar because activations can only be quantized per-tensor") 39 | return ActivationQBytesTensor.quantize(t, qtype, scale) 40 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/core.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
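# Reading aid for the helpers below (illustrative values):
# - dtype_info(torch.int8) returns torch.iinfo(torch.int8), i.e. min=-128, max=127.
# - axis_to_dim(t, axis) returns the reduction dims complementary to `axis`:
#   for a rank-3 tensor, axis=0 gives [1, 2] and axis=-1 gives [0, 1].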
14 | 15 | 16 | import torch 17 | 18 | 19 | __all__ = ["axis_to_dim", "dtype_info"] 20 | 21 | 22 | def dtype_info(dtype): 23 | info = torch.finfo if dtype.is_floating_point else torch.iinfo 24 | return info(dtype) 25 | 26 | 27 | def axis_to_dim(t, axis): 28 | dim = list(range(t.ndim)) 29 | if axis == -1: 30 | dim = dim[:-1] 31 | else: 32 | dim.remove(axis) 33 | return dim 34 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/function.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | 17 | 18 | __all__ = ["QuantizedLinearFunction"] 19 | 20 | 21 | class QuantizedLinearFunction(torch.autograd.Function): 22 | """Quantized linear function. 23 | 24 | This is a quantized implementation of torch.nn.functional.linear. 25 | 26 | It explicitly defines the backward pass instead of letting pytorch 27 | build it by combining the gradients of the underlying quantized operations. 28 | 29 | This has two main benefits: 30 | 31 | - it saves computations, 32 | - it allows the use of operations that do not have a registered backward method, 33 | such as quanto custom operations. 34 | 35 | The drawback is that the extra tensors involved in the quantization graph, such as 36 | the scales and shift, cannot be trained. 37 | This is however consistent with the quanto quantizers backward pass, which returns 38 | a zero gradient for these tensors.
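    Example (a minimal sketch with plain float tensors; shapes are illustrative only):

        >>> import torch
        >>> A = torch.randn(2, 8, requires_grad=True)  # activations
        >>> W = torch.randn(4, 8, requires_grad=True)  # (out_features, in_features)
        >>> out = QuantizedLinearFunction.apply(A, W)  # equivalent to A @ W.t()
        >>> out.sum().backward()  # uses the explicit backward defined by this Function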
39 | """ 40 | 41 | @staticmethod 42 | def forward(ctx, input, other, bias=None): 43 | ctx.save_for_backward(input, other) 44 | output = torch.matmul(input, other.t()) 45 | if bias is not None: 46 | output = output + bias 47 | return output 48 | 49 | @staticmethod 50 | def backward(ctx, gO): 51 | input_gO = other_gO = bias_gO = None 52 | input, other = ctx.saved_tensors 53 | out_features, in_features = other.shape 54 | if ctx.needs_input_grad[0]: 55 | # grad(A@B.t()) = gO => grad(A) = gO@(B.t().t()) = gO@B 56 | input_gO = torch.matmul(gO, other) 57 | if ctx.needs_input_grad[1]: 58 | # grad(B@A.t()) = gO.t() => grad(B) = gO.t()@(A.t().t()) = gO.t()@A 59 | other_gO = torch.matmul(gO.view(-1, out_features).t(), input.view(-1, in_features)) 60 | if ctx.needs_input_grad[2]: 61 | # Bias gradient is the sum on all dimensions but the last one 62 | dim = tuple(range(gO.ndim - 1)) 63 | bias_gO = gO.sum(dim) 64 | return input_gO, other_gO, bias_gO 65 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/grouped.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import List 3 | 4 | import torch 5 | 6 | 7 | __all__ = ["group", "ungroup", "grouped_shape"] 8 | 9 | 10 | def grouped_shape(shape: List, axis: int, group_size: int) -> List: 11 | if axis not in (0, -1): 12 | raise ValueError("Axis must be 0 or -1 for group-wise quantization") 13 | n_groups = math.prod(shape) // group_size 14 | return (n_groups, group_size) if axis == 0 else (group_size, n_groups) 15 | 16 | 17 | def group(base: torch.Tensor, axis: int, group_size: int): 18 | if axis not in (0, -1): 19 | raise ValueError("Axis must be 0 or -1 for group-wise quantization") 20 | # In standard per-axis quantization, we have one scale per axis dim 21 | axis_dim = base.shape[axis] 22 | # This scale is evaluated over axis_numel items for each feature along axis 23 | axis_numel = base.numel() // axis_dim 24 | if group_size > axis_numel or axis_numel % group_size != 0: 25 | raise ValueError(f"Group size ({group_size}) must be a divisor of ({axis_numel})") 26 | # Group-wise quantization further splits axis_numel into multiple groups per axis 27 | axis_groups = axis_numel // group_size 28 | if axis == 0: 29 | # Easy-peasy: we simply need to reshape to (axis_dim * axis_groups, group_size) 30 | return base.reshape([-1, group_size]) 31 | # More difficult: reshape to (group_size, axis_dim * axis_groups) 32 | # First, split by groups, preserving the axis dimension 33 | grouped = base.reshape((axis_groups, group_size, axis_dim)) 34 | # Permute to (group_size, axis_dim, axis_groups) 35 | grouped = grouped.permute(1, 2, 0) 36 | return grouped.reshape(group_size, axis_dim * axis_groups) 37 | 38 | 39 | def ungroup(grouped: torch.Tensor, axis: int, orig_shape: torch.Size): 40 | if grouped.shape == orig_shape: 41 | return grouped 42 | if axis == 0: 43 | # No transposition required, just reshape 44 | return grouped.reshape(orig_shape) 45 | group_size = grouped.shape[0] if axis == -1 else grouped.shape[-1] 46 | axis_dim = orig_shape[axis] 47 | axis_groups = grouped.numel() // axis_dim // group_size 48 | ungrouped = grouped.reshape(group_size, axis_dim, axis_groups) 49 | # Permute to (axis_groups, group_size, axis_dim) 50 | ungrouped = ungrouped.permute(2, 0, 1) 51 | return ungrouped.reshape(orig_shape) 52 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/optimizers/__init__.py:
-------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .absmax_optimizer import * 16 | from .affine_optimizer import * 17 | from .hqq_optimizer import * 18 | from .max_optimizer import * 19 | from .optimizer import * 20 | from .symmetric_optimizer import * 21 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/optimizers/absmax_optimizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Optional, Tuple, Union 16 | 17 | import torch 18 | 19 | from ..qtype import qtype 20 | from .symmetric_optimizer import SymmetricOptimizer 21 | 22 | 23 | __all__ = ["AbsmaxOptimizer"] 24 | 25 | 26 | class AbsmaxOptimizer(SymmetricOptimizer): 27 | def optimize( 28 | self, base: torch.Tensor, qtype: qtype, axis: Optional[int] = None 29 | ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: 30 | base = torch.abs(base) 31 | if axis is None: 32 | rmax = torch.max(base) 33 | else: 34 | dim = list(range(1, base.ndim)) if (axis == 0) else list(range(0, base.ndim - 1)) 35 | rmax = torch.amax(base, dim=dim, keepdim=True) 36 | return rmax / qtype.qmax 37 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/optimizers/affine_optimizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | from typing import Optional, Tuple 16 | 17 | import torch 18 | 19 | from ..grouped import group 20 | from ..qtype import qtype 21 | from .optimizer import Optimizer 22 | 23 | 24 | __all__ = ["AffineOptimizer"] 25 | 26 | 27 | class AffineOptimizer(Optimizer): 28 | def __call__( 29 | self, 30 | base: torch.Tensor, 31 | qtype: qtype, 32 | axis: int, 33 | group_size: Optional[int] = None, 34 | zeropoint: bool = False, 35 | ) -> Tuple[torch.Tensor, torch.Tensor]: 36 | """ 37 | Args: 38 | base (`torch.Tensor`): the weight Tensor to quantize 39 | qtype (`quanto.qtype`): The target quantization type 40 | axis (`int`): The quantization axis (0 or -1) 41 | group_size (`Optional[int]`): The quantization group size 42 | zeropoint (`bool`): Allow an exact representation of zero. If True, the shifts are stored as 43 | integers instead of floats, which results in a slightly smaller model, but might also reduce 44 | the model performance. Defaults to False. 45 | Returns: 46 | A tuple of scale, shift Tensor. 47 | """ 48 | if axis not in [0, -1]: 49 | raise ValueError("axis parameter must be 0 (first axis) or -1 (last axis)") 50 | if group_size is not None: 51 | base = group(base, axis, group_size) 52 | if axis is not None and base.shape[axis] == 1: 53 | axis = None 54 | scale, shift = self.optimize(base, qtype, axis) 55 | assert scale.dtype == base.dtype 56 | assert shift.dtype == base.dtype 57 | if zeropoint: 58 | # Round shift to make sure zero can be represented exactly using 'shift' as quantized value 59 | shift = torch.clamp(torch.round(shift / scale), 0, 2**qtype.bits - 1).to(torch.uint8) 60 | return scale, shift 61 | 62 | def optimize(self, base: torch.Tensor, qtype: qtype, axis: int) -> Tuple[torch.Tensor, torch.Tensor]: 63 | raise NotImplementedError 64 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/optimizers/hqq_optimizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
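# Reading aid (illustrative values): for lp_norm == 1, shrink_lp_op below is the
# classic soft-thresholding operator, e.g. shrink_lp_op(torch.tensor([1.5]), beta=2.0, lp_norm=1.0)
# returns sign(1.5) * relu(1.5 - 1/2.0) = tensor([1.]).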
14 | 15 | from typing import Optional, Tuple, Union 16 | 17 | import torch 18 | 19 | from ..qtype import qtype 20 | from ..weights import quantize_weight 21 | from .max_optimizer import MaxOptimizer 22 | 23 | 24 | __all__ = ["HqqOptimizer"] 25 | 26 | 27 | # Shrinking operator 28 | def shrink_lp_op(x: torch.Tensor, beta: float, lp_norm: float) -> torch.Tensor: 29 | if lp_norm == 1: 30 | return torch.sign(x) * torch.nn.functional.relu(torch.abs(x) - 1.0 / beta) 31 | else: 32 | return torch.sign(x) * torch.nn.functional.relu( 33 | torch.abs(x) - (1.0 / beta) * torch.pow(torch.abs(x), lp_norm - 1) 34 | ) 35 | 36 | 37 | class HqqOptimizer(MaxOptimizer): 38 | """Implementation of the HQQ algorithm 39 | 40 | This is an implementation of the algorithm described in "Half-Quadratic Quantization of Large Machine Learning Models", 41 | by Hicham Badri and Appu Shaji (https://mobiusml.github.io/hqq_blog/). 42 | This is an adaptation of the original implementation at https://github.com/mobiusml/hqq. 43 | 44 | """ 45 | 46 | def __init__( 47 | self, 48 | lp_norm: Optional[float] = 0.7, 49 | beta: Optional[float] = 1e1, 50 | kappa: Optional[float] = 1.01, 51 | iters: Optional[int] = 20, 52 | verbose: Optional[bool] = False, 53 | ) -> None: 54 | self.lp_norm = lp_norm 55 | self.beta = beta 56 | self.kappa = kappa 57 | self.iters = iters 58 | self.verbose = verbose 59 | 60 | def optimize( 61 | self, base: torch.Tensor, qtype: qtype, axis: int 62 | ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: 63 | scale, shift = super().optimize(base, qtype, axis) 64 | best_error = None 65 | beta = self.beta 66 | base_q = quantize_weight(base, qtype=qtype, axis=axis, scale=scale, shift=shift) 67 | for i in range(self.iters): 68 | error = base - base_q 69 | if best_error is None: 70 | best_error = float(torch.abs(error).mean()) 71 | if self.verbose: 72 | print(f"Start error: {best_error:.6f}") 73 | e = shrink_lp_op(error, beta, self.lp_norm) 74 | mean_axis = 0 if axis == -1 else -1 75 | hqq_shift = torch.mean(base_q._data * scale - (base - e), axis=mean_axis, keepdim=True) 76 | base_q = quantize_weight(base, qtype=qtype, axis=axis, scale=scale, shift=hqq_shift) 77 | mean_error = float(torch.abs(base - base_q).mean()) 78 | if self.verbose: 79 | print(f"HQQ error at it #{i}: {mean_error:.6f}") 80 | if mean_error < best_error: 81 | best_error = mean_error 82 | shift = hqq_shift 83 | beta *= self.kappa 84 | else: 85 | break 86 | 87 | return scale, shift 88 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/optimizers/max_optimizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
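# Worked example for the affine scale/shift computed below (illustrative numbers):
# a row with rmin=-1.0 and rmax=3.0 quantized to 8 bits (qmin=-128, qmax=127) gives
# scale = (3.0 - (-1.0)) / 255 = 4/255 and shift = -rmin = 1.0.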
14 | 15 | from typing import Tuple, Union 16 | 17 | import torch 18 | 19 | from ..qtype import qtype 20 | from .affine_optimizer import AffineOptimizer 21 | 22 | 23 | __all__ = ["MaxOptimizer"] 24 | 25 | 26 | class MaxOptimizer(AffineOptimizer): 27 | def optimize( 28 | self, base: torch.Tensor, qtype: qtype, axis: int 29 | ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: 30 | dim = list(range(1, base.ndim)) if (axis == 0) else list(range(0, base.ndim - 1)) 31 | rmin = torch.amin(base, dim=dim, keepdim=True) 32 | rmax = torch.amax(base, dim=dim, keepdim=True) 33 | qmin = -(2 ** (qtype.bits - 1)) 34 | qmax = 2 ** (qtype.bits - 1) - 1 35 | scale = (rmax - rmin) / (qmax - qmin) 36 | shift = -rmin 37 | return scale, shift 38 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/optimizers/optimizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from abc import ABC 16 | from typing import Optional, Tuple, Union 17 | 18 | import torch 19 | 20 | 21 | __all__ = ["Optimizer"] 22 | 23 | 24 | class Optimizer(ABC): 25 | def __call__( 26 | self, base: torch.Tensor, bits: int, axis: int, group_size: Optional[int] = None 27 | ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: 28 | raise NotImplementedError 29 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/optimizers/symmetric_optimizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
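# A SymmetricOptimizer returns a scale only (no shift), since quantized values are
# assumed symmetric around zero. For instance, AbsmaxOptimizer above evaluates
# scale = amax(abs(base)) / qtype.qmax, e.g. 2.0 / 127 for a qint8 tensor whose
# largest absolute value is 2.0.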
14 | 15 | from typing import Optional 16 | 17 | import torch 18 | 19 | from ..qtype import qtype 20 | from .optimizer import Optimizer 21 | 22 | 23 | __all__ = ["SymmetricOptimizer"] 24 | 25 | 26 | class SymmetricOptimizer(Optimizer): 27 | def __call__(self, base: torch.Tensor, qtype: qtype, axis: Optional[int] = None) -> torch.Tensor: 28 | if axis not in [None, 0, -1]: 29 | raise ValueError("axis parameter must be None, 0 (first axis) or -1 (last axis)") 30 | if axis is not None and base.shape[axis] == 1: 31 | axis = None 32 | scale = self.optimize(base, qtype, axis) 33 | assert scale.dtype == base.dtype 34 | 35 | return scale 36 | 37 | def optimize(self, base: torch.Tensor, qtype: qtype, axis: Optional[int] = None) -> torch.Tensor: 38 | raise NotImplementedError 39 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/qbits.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import torch 17 | from torch.autograd import Function 18 | 19 | from .grouped import ungroup 20 | from .packed import PackedTensor 21 | from .qtensor import QTensor 22 | 23 | 24 | __all__ = ["QBitsTensor"] 25 | 26 | 27 | class QBitsDequantizer(Function): 28 | @staticmethod 29 | def forward(ctx, t): 30 | if isinstance(t._data, PackedTensor): 31 | data = t._data.unpack() 32 | else: 33 | data = t._data 34 | shift = t._shift 35 | if not shift.dtype.is_floating_point: 36 | # Remove shift before multiplying by the scale 37 | data = data.to(torch.int8) - shift.to(torch.int8) 38 | if t.qtype.is_floating_point: 39 | # Upcast explicitly to the scale dtype 40 | dqt = t._scale * data.to(t._scale.dtype) 41 | else: 42 | dqt = t._scale * data 43 | if shift.dtype.is_floating_point: 44 | # Remove scaled shift 45 | dqt -= shift 46 | if t.axis is None: 47 | return dqt 48 | # Restore the original shape (if needed) 49 | return ungroup(dqt, axis=t.axis, orig_shape=t.shape) 50 | 51 | @staticmethod 52 | def backward(ctx, gO): 53 | return gO 54 | 55 | 56 | class QBitsTensor(QTensor): 57 | def __init__(self, qtype, axis, group_size, size, stride, data, scale, shift, requires_grad=False): 58 | super().__init__(qtype, axis) 59 | self._data = data 60 | self._scale = scale 61 | self._shift = shift 62 | self._group_size = group_size 63 | 64 | def __repr__(self): 65 | return f"{type(self).__name__}({self._data}, scale={self._scale}, shift={self._shift}, dtype={self.dtype})" 66 | 67 | def dequantize(self): 68 | return QBitsDequantizer.apply(self) 69 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/qbytes.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from torch.autograd import Function 16 | 17 | from .qtensor import QTensor 18 | 19 | 20 | __all__ = ["QBytesTensor"] 21 | 22 | 23 | class QBytesDequantizer(Function): 24 | @staticmethod 25 | def forward(ctx, t): 26 | if t.qtype.is_floating_point: 27 | # Upcast explicitly to the scale dtype 28 | dqt = t._scale * t._data.to(t._scale.dtype) 29 | else: 30 | dqt = t._scale * t._data 31 | return dqt 32 | 33 | @staticmethod 34 | def backward(ctx, gO): 35 | # For autograd, dequantization is a no-op 36 | return gO 37 | 38 | 39 | class QBytesTensor(QTensor): 40 | def __init__(self, qtype, axis, size, stride, data, scale, requires_grad=False): 41 | super().__init__(qtype, axis) 42 | self._data = data 43 | self._scale = scale 44 | 45 | def __repr__(self): 46 | return f"{type(self).__name__}({self._data}, scale={self._scale}, dtype={self.dtype})" 47 | 48 | def dequantize(self): 49 | """Differentiable dequantization function""" 50 | return QBytesDequantizer.apply(self) 51 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/qtype.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
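# For instance, qint(4) below yields bits=4, qmin=-(2**3) = -8 and qmax=2**3 - 1 = 7,
# with values stored in a torch.int8 container.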
14 | 15 | from dataclasses import dataclass 16 | 17 | import torch 18 | 19 | 20 | @dataclass 21 | class qtype: 22 | """A quantized type class mimicking torch dtype""" 23 | 24 | name: str 25 | is_floating_point: bool 26 | bits: int 27 | # This defines the storage dtype 28 | dtype: torch.dtype 29 | qmin: float 30 | qmax: float 31 | 32 | def __str__(self): 33 | return f"quanto.{self.name}" 34 | 35 | def __hash__(self): 36 | return hash(str(self)) 37 | 38 | 39 | # Integer qtypes 40 | 41 | 42 | def qint(bits): 43 | qmin = -(2 ** (bits - 1)) 44 | qmax = 2 ** (bits - 1) - 1 45 | return qtype(f"qint{bits}", is_floating_point=False, bits=bits, dtype=torch.int8, qmin=qmin, qmax=qmax) 46 | 47 | 48 | qint2 = qint(2) 49 | qint4 = qint(4) 50 | qint8 = qint(8) 51 | 52 | # Float qtypes 53 | 54 | 55 | def qfloat(dtype: torch.dtype): 56 | finfo = torch.finfo(dtype) 57 | qmin = finfo.min 58 | qmax = finfo.max 59 | return qtype(f"q{finfo.dtype}", is_floating_point=True, bits=8, dtype=dtype, qmin=qmin, qmax=qmax) 60 | 61 | 62 | qfloat8_e4m3fn = qfloat(torch.float8_e4m3fn) 63 | qfloat8_e4m3fnuz = qfloat(torch.float8_e4m3fnuz) 64 | qfloat8_e5m2 = qfloat(torch.float8_e5m2) 65 | 66 | # Alias the float8 representation with the best support and inference efficiency 67 | qfloat8 = qfloat8_e4m3fn 68 | 69 | # Convenience dict to get a dtype from its name 70 | qtypes = {name: q for (name, q) in locals().items() if isinstance(q, qtype)} 71 | 72 | __all__ = ["qtype", "qtypes"] + [str(name) for name in qtypes.keys()] 73 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/weights/__init__.py: -------------------------------------------------------------------------------- 1 | from .qbits import * 2 | from .qbytes import * 3 | from .quantization import * 4 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/weights/awq/__init__.py: -------------------------------------------------------------------------------- 1 | from .packed import * 2 | from .qbits import * 3 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/weights/marlin/__init__.py: -------------------------------------------------------------------------------- 1 | from .fp8 import * 2 | from .int4 import * 3 | from .permutations import * 4 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/weights/marlin/fp8/__init__.py: -------------------------------------------------------------------------------- 1 | from .packed import * 2 | from .qbits import * 3 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/weights/marlin/int4/__init__.py: -------------------------------------------------------------------------------- 1 | from .packed import * 2 | from .qbits import * 3 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/weights/marlin/permutations.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import functools 16 | from typing import List, Tuple 17 | 18 | import torch 19 | 20 | from ..reordering import reorder, reverse 21 | 22 | 23 | __all__ = ["marlin_permute"] 24 | 25 | 26 | # https://github.com/IST-DASLab/marlin/blob/2f6d7c10e124b3c5fa29ff8d77d568bd7af3274c/marlin/__init__.py#L40C1-L68C54 27 | @functools.cache 28 | def _get_perms() -> Tuple[List[int], List[int]]: 29 | perm = [] 30 | for i in range(8): 31 | perm.extend([i + 8 * j for j in range(8)]) 32 | perm_single = [] 33 | for i in range(4): 34 | perm_single.extend([2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) 35 | return perm, perm_single 36 | 37 | 38 | @functools.cache 39 | def _get_inverted_perms() -> Tuple[List[int], List[int]]: 40 | perm, perm_single = _get_perms() 41 | return reverse(perm), reverse(perm_single) 42 | 43 | 44 | def marlin_permute(t: torch.Tensor, reverse=False): 45 | perm, perm_single = _get_inverted_perms() if reverse else _get_perms() 46 | out_features = t.shape[1] 47 | if t.shape[0] == 1: 48 | reordered = reorder(t, perm_single) 49 | else: 50 | reordered = reorder(t, perm) 51 | return reordered.reshape((-1, out_features)).contiguous() 52 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/weights/packing.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | 17 | 18 | def unpack_int32_to_uint8(packed: torch.Tensor, bits: int): 19 | """Unpack a packed int32 tensor to a larger uint8 tensor 20 | 21 | Args: 22 | packed (`torch.Tensor`): 23 | The packed integer tensor 24 | bits (`int`): 25 | The number of bits of each packed value. 26 | 27 | Returns: 28 | An unpacked uint8 `torch.Tensor` expanded along the last dimension.
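    Example (a minimal sketch, assuming 4-bit packed values):

        >>> packed = torch.tensor([[0x76543210]], dtype=torch.int32)
        >>> unpack_int32_to_uint8(packed, bits=4)
        tensor([[0, 1, 2, 3, 4, 5, 6, 7]], dtype=torch.uint8)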
29 | """ 30 | total_bits = 32 31 | shifts = torch.arange(0, total_bits, bits, device=packed.device) 32 | 33 | # Unpack column-wise 34 | unpacked = torch.bitwise_right_shift(packed[:, :, None], shifts[None, None, :]).to( 35 | torch.int8 # smallest dtype available 36 | ) 37 | unpacked = unpacked.reshape(unpacked.shape[0], -1) 38 | 39 | # Convert to unsigned 40 | unpacked = torch.bitwise_and(unpacked, (2**bits) - 1) 41 | 42 | return unpacked.to(torch.uint8) 43 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/weights/quantization.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Optional 16 | 17 | import torch 18 | 19 | from ..qtype import qtype 20 | from .qbits import WeightQBitsTensor 21 | from .qbytes import WeightQBytesTensor 22 | 23 | 24 | __all__ = ["quantize_weight"] 25 | 26 | 27 | def quantize_weight( 28 | t: torch.Tensor, 29 | qtype: qtype, 30 | axis: int, 31 | scale: torch.Tensor, 32 | shift: Optional[torch.Tensor] = None, 33 | group_size: Optional[int] = None, 34 | activation_qtype: Optional[qtype] = None, 35 | optimized: Optional[bool] = True, 36 | ): 37 | """Quantize a weight Tensor. 38 | 39 | Weights are always quantized per-axis. 40 | 41 | Args: 42 | t (`torch.Tensor`): the weight Tensor to quantize 43 | qtype (`quanto.qtype`): The target quantization type 44 | axis (`int`): The quantization axis (0 or -1) 45 | scale (`torch.Tensor`): the quantization scale 46 | shift (`Optional[torch.Tensor]`): optional shift to apply 47 | group_size (`Optional[int]`): The quantization group size 48 | activation_qtype (`Optional[qtype]`, defaults to `None`): 49 | Which quantization type is being used for the activations. The function `quantize_weight` 50 | initializes `torch.Tensor` subclasses that may depend on the activation dtype. 51 | `None` corresponds to no quantization. 52 | optimized (`Optional[bool]`, defaults to True): 53 | If True, the quantization algorithm will select the most efficient kernel 54 | for the weights and format the resulting Tensor accordingly. 55 | If False, a kernel-agnostic Tensor will be returned (but it can be optimized later 56 | explicitly by calling QTensor.optimize() or implicitly by moving it to a specific device). 57 | Returns: 58 | A quantized Tensor.
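    Example (a minimal sketch, assuming symmetric 8-bit quantization along the
    first axis with an absmax scale; names and shapes are illustrative):

        >>> import torch
        >>> from optimum.quanto import qint8
        >>> w = torch.randn(64, 64)
        >>> scale = torch.amax(w.abs(), dim=1, keepdim=True) / qint8.qmax
        >>> qw = quantize_weight(w, qtype=qint8, axis=0, scale=scale)
        >>> qw.dequantize().shape
        torch.Size([64, 64])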
59 | """ 60 | if axis not in (0, -1): 61 | raise ValueError("axis parameter must be 0 (first axis) or -1 (last axis)") 62 | if qtype.bits == 8: 63 | if shift is not None: 64 | raise ValueError("shift cannot be specified for 8-bit qtypes") 65 | if group_size is not None: 66 | raise ValueError("group_size cannot be specified for 8-bit qtypes.") 67 | if axis is not None and t.shape[axis] == 1: 68 | # Quantizing along an axis of dimension 1 means quantizing per-tensor 69 | axis = None 70 | return WeightQBytesTensor.quantize(t, qtype, axis, scale, activation_qtype, optimized) 71 | if shift is None: 72 | raise ValueError("shift must be specified for qtypes lower than 8-bit") 73 | return WeightQBitsTensor.quantize(t, qtype, axis, group_size, scale, shift, optimized) 74 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/weights/reordering.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import List, Union 16 | 17 | import torch 18 | 19 | 20 | __all__ = ["reorder", "reverse"] 21 | 22 | 23 | def reorder(t: torch.Tensor, permutation: Union[torch.Tensor, List[int]]): 24 | """Reorder a Tensor using a permutation 25 | 26 | Args: 27 | t (`torch.Tensor`): the Tensor to reorder 28 | permutation (`Union[torch.Tensor, List[int]]`): the permutation to apply 29 | 30 | Returns: 31 | The reordered torch.Tensor 32 | """ 33 | block_size = permutation.numel() if isinstance(permutation, torch.Tensor) else len(permutation) 34 | reordered = t.reshape((-1, block_size))[:, permutation].reshape(t.shape) 35 | return reordered.contiguous() 36 | 37 | 38 | def reverse(permutation: Union[torch.Tensor, List[int]]): 39 | """Reverse a permutation 40 | 41 | The reversed permutation can be used to revert a reordered Tensor to its original 42 | ordering. 43 | 44 | Args: 45 | permutation (`Union[torch.Tensor, List[int]]`): the permutation to reverse 46 | 47 | Returns: 48 | The reversed permutation 49 | """ 50 | block_size = permutation.numel() if isinstance(permutation, torch.Tensor) else len(permutation) 51 | reversed = torch.empty((block_size,), dtype=torch.int64) 52 | reversed[permutation] = torch.arange(block_size) 53 | return reversed 54 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/weights/tinygemm/__init__.py: -------------------------------------------------------------------------------- 1 | from .packed import * 2 | from .qbits import * 3 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = 'optimum-quanto' 3 | description = 'A pytorch quantization backend for optimum.' 
4 | classifiers = [ 5 | 'Development Status :: 2 - Pre-Alpha', 6 | 'License :: OSI Approved :: Apache Software License', 7 | 'Intended Audience :: Developers', 8 | 'Intended Audience :: Education', 9 | 'Intended Audience :: Science/Research', 10 | 'Operating System :: OS Independent', 11 | 'Programming Language :: Python :: 3.9', 12 | 'Programming Language :: Python :: 3.10', 13 | 'Programming Language :: Python :: 3.11', 14 | 'Topic :: Scientific/Engineering :: Artificial Intelligence' 15 | ] 16 | keywords = ['torch', 'quantization'] 17 | requires-python = '>=3.9.0' 18 | authors = [{ name = 'David Corvoysier' }] 19 | maintainers = [ 20 | {name = "HuggingFace Inc. Special Ops Team", email="hardware@huggingface.co"}, 21 | ] 22 | dependencies = ['torch>=2.6.0', 'ninja', 'numpy', 'safetensors', 'huggingface_hub'] 23 | license = { text = 'Apache-2.0' } 24 | readme = 'README.md' 25 | dynamic = ['version'] 26 | 27 | [project.urls] 28 | homepage = 'https://github.com/huggingface/optimum-quanto' 29 | 30 | [project.optional-dependencies] 31 | dev = ['pytest', 'ruff'] 32 | examples = [ 33 | 'torchvision', 34 | 'transformers', 35 | 'diffusers', 36 | 'datasets', 37 | 'accelerate', 38 | 'sentencepiece', 39 | 'scipy' 40 | ] 41 | 42 | [tool.setuptools.packages.find] 43 | where = ["."] 44 | include = ["optimum*"] 45 | 46 | [tool.setuptools.dynamic] 47 | version = {attr = 'optimum.quanto.__version__'} 48 | 49 | [build-system] 50 | requires = ['setuptools>65.5.1', 'setuptools_scm'] 51 | build-backend = 'setuptools.build_meta' 52 | 53 | [tool.ruff] 54 | # Configuration for Ruff 55 | line-length = 119 # Same line-length as Black had 56 | 57 | # Linting rules: 58 | # Never enforce `E501` (line length violations) and other specific rules. 59 | lint.ignore = ['C901', 'E501', 'E741'] 60 | lint.select = ['C', 'E', 'F', 'I', 'W'] 61 | 62 | # Ignore import violations in all `__init__.py` files. 63 | [tool.ruff.lint.per-file-ignores] 64 | '__init__.py' = ['E402', 'F401', 'F403', 'F811'] 65 | 66 | # isort configuration (to sort imports) 67 | [tool.ruff.lint.isort] 68 | lines-after-imports = 2 69 | known-first-party = ['optimum.quanto'] 70 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NIGHTLY=${1:-0} 4 | VENV=".venv" 5 | if [ ! -d "${VENV}" ]; then 6 | python3 -m venv ${VENV} 7 | fi 8 | . ${VENV}/bin/activate 9 | if [ "$NIGHTLY" -eq "0" ]; then 10 | pip install --upgrade torch torchvision torchaudio 11 | else 12 | pip install --upgrade --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118 13 | fi 14 | # Build tools 15 | pip install ruff pytest build 16 | # For examples 17 | pip install accelerate transformers datasets 18 | -------------------------------------------------------------------------------- /tests/cli/cli_helpers.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import importlib 16 | 17 | import pytest 18 | 19 | 20 | requires_optimum_cli = pytest.mark.skipif( 21 | importlib.util.find_spec("optimum.commands") is None, reason="optimum-cli is required" 22 | ) 23 | -------------------------------------------------------------------------------- /tests/cli/test_quantize_cli.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import subprocess 16 | from tempfile import TemporaryDirectory 17 | 18 | import pytest 19 | from cli_helpers import requires_optimum_cli 20 | 21 | from optimum.quanto import quantization_map 22 | 23 | 24 | @requires_optimum_cli 25 | @pytest.mark.parametrize("weights", ["int4", "int8"]) 26 | def test_export_decoder_cli(weights): 27 | from optimum.quanto import QuantizedModelForCausalLM 28 | 29 | model_id = "facebook/opt-125m" 30 | with TemporaryDirectory() as tempdir: 31 | subprocess.run( 32 | [ 33 | "optimum-cli", 34 | "quanto", 35 | "quantize", 36 | "--model", 37 | model_id, 38 | "--weights", 39 | f"{weights}", 40 | tempdir, 41 | ], 42 | shell=False, 43 | check=True, 44 | ) 45 | # Verify we can reload the quantized model 46 | qmodel = QuantizedModelForCausalLM.from_pretrained(tempdir) 47 | qmap = quantization_map(qmodel) 48 | for layer_qconfig in qmap.values(): 49 | assert layer_qconfig["weights"] == f"q{weights}" 50 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
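# The `skip_device` marker registered below lets a test opt out of a device fixture
# value, e.g.:
#
#   @pytest.mark.skip_device("mps")
#   def test_something(device):
#       ...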
14 | 15 | import pytest 16 | import torch 17 | 18 | 19 | devices = ["cpu"] 20 | if torch.cuda.is_available(): 21 | devices += ["cuda"] 22 | elif torch.backends.mps.is_available(): 23 | devices += ["mps"] 24 | elif torch.xpu.is_available(): 25 | devices += ["xpu"] 26 | 27 | 28 | @pytest.fixture(scope="module", params=devices) 29 | def device(request): 30 | return torch.device(request.param) 31 | 32 | 33 | def pytest_configure(config): 34 | # register additional markers 35 | config.addinivalue_line("markers", "skip_device(type): mark test to be skipped for the specified device type") 36 | 37 | 38 | def pytest_runtest_call(item): 39 | fixture_name = "device" 40 | if fixture_name in item.fixturenames: 41 | # TODO: should be able to recover the fixture id instead of the actual value 42 | fixture_arg = item.funcargs[fixture_name].type 43 | skip_marks = {mark.args[0] for mark in item.iter_markers(name=f"skip_{fixture_name}")} 44 | if fixture_arg in skip_marks: 45 | pytest.skip(f"Test skipped for {fixture_name} {fixture_arg}") 46 | -------------------------------------------------------------------------------- /tests/library/test_extensions.py: -------------------------------------------------------------------------------- 1 | import platform 2 | 3 | import pytest 4 | import torch 5 | from packaging import version 6 | 7 | from optimum.quanto.library.extensions import get_extension, is_extension_available 8 | 9 | 10 | def _is_xpu_available(): 11 | # SYCL extension support is added in torch>=2.7 on Linux 12 | if platform.system() != "Linux": 13 | return False 14 | if version.parse(torch.__version__).release < version.parse("2.7").release: 15 | return False 16 | return torch.xpu.is_available() 17 | 18 | 19 | extension_names = ["quanto_cpp"] 20 | if torch.cuda.is_available(): 21 | if torch.version.cuda: 22 | extension_names.append("quanto_cuda") 23 | if torch.version.hip: 24 | extension_names.append("quanto_hip") 25 | if torch.backends.mps.is_available(): 26 | extension_names.append("quanto_mps") 27 | if _is_xpu_available(): 28 | extension_names.append("quanto_xpu") 29 | 30 | 31 | @pytest.mark.parametrize("extension_name", extension_names) 32 | def test_extension_available(extension_name): 33 | assert is_extension_available(extension_name) 34 | 35 | 36 | @pytest.mark.parametrize("extension_name", extension_names) 37 | def test_extension_compilation(extension_name): 38 | extension = get_extension(extension_name) 39 | assert extension.lib is not None 40 | -------------------------------------------------------------------------------- /tests/library/test_unpack.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | import pytest 17 | import torch 18 | 19 | from optimum.quanto.tensor.packed import pack_weights 20 | 21 | 22 | @pytest.mark.parametrize("bits", [2, 4], ids=["int2", "int4"]) 23 | @pytest.mark.parametrize("shape", [(12,), (32, 32)], ids=["vector", "matrix"]) 24 | def test_unpack(bits, shape, device): 25 | qmax = 2**bits 26 | a = torch.randint(0, qmax, shape, dtype=torch.uint8).to(device) 27 | packed_a = pack_weights(a, bits) 28 | unpacked_a = torch.ops.quanto.unpack(packed_a, bits) 29 | assert unpacked_a.dtype == torch.uint8 30 | assert torch.equal(unpacked_a, a) 31 | -------------------------------------------------------------------------------- /tests/models/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from huggingface_hub.constants import _staging_mode 3 | 4 | 5 | @pytest.fixture 6 | def staging(): 7 | """A pytest fixture only available in huggingface_hub staging mode 8 | 9 | If the huggingface_hub is not operating in staging mode, tests using 10 | that fixture are automatically skipped. 11 | 12 | Returns: 13 | a Dict containing a valid staging user and token. 14 | """ 15 | if not _staging_mode: 16 | pytest.skip("requires huggingface_hub staging mode") 17 | return { 18 | "user": "__DUMMY_TRANSFORMERS_USER__", 19 | # Not critical, only usable on the sandboxed CI instance. 20 | "token": "hf_94wBhPGp6KrrTH3KDchhKpRxZwd6dmHWLL", 21 | } 22 | 23 | 24 | @pytest.fixture(autouse=True) 25 | def skip_if_staging(request): 26 | if _staging_mode: 27 | if "staging" not in request.fixturenames: 28 | pytest.skip("requires huggingface_hub standard mode") 29 | -------------------------------------------------------------------------------- /tests/nn/test_qmodule.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import pytest 16 | import torch 17 | 18 | from optimum.quanto import QTensor, qint8, qtypes 19 | from optimum.quanto.nn import QLinear 20 | 21 | 22 | @pytest.mark.parametrize("in_features", [8, 16]) 23 | @pytest.mark.parametrize("out_features", [32, 64]) 24 | @pytest.mark.parametrize("use_bias", [True, False], ids=["bias", "no-bias"]) 25 | @pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=["fp32", "fp16"]) 26 | def test_qmodule_freeze(in_features, out_features, use_bias, dtype): 27 | qlinear = QLinear(in_features, out_features, bias=use_bias, weights=qint8).to(dtype) 28 | assert not qlinear.frozen 29 | assert not isinstance(qlinear.weight, QTensor) 30 | assert qlinear.weight.dtype == dtype 31 | if use_bias: 32 | assert not isinstance(qlinear.bias, QTensor) 33 | assert qlinear.bias.dtype == dtype 34 | qweight = qlinear.qweight 35 | assert isinstance(qweight, QTensor) 36 | assert qweight.dtype == dtype 37 | assert qweight.qtype == qint8 38 | qlinear.freeze() 39 | assert qlinear.frozen 40 | assert isinstance(qlinear.weight, QTensor) 41 | assert qlinear.weight.dtype == dtype 42 | assert qlinear.weight.qtype == qint8 43 | if use_bias: 44 | assert not isinstance(qlinear.bias, QTensor) 45 | assert qlinear.bias.dtype == dtype 46 | 47 | 48 | @pytest.mark.parametrize("weights", ["qint2", "qint4", "qint8", "qfloat8"]) 49 | @pytest.mark.parametrize("activations", [None, "qint8", "qfloat8"]) 50 | def test_qmodule_qtype_as_string(weights, activations): 51 | qlinear = QLinear(16, 64, weights=weights, activations=activations) 52 | assert qlinear.weight_qtype == qtypes[weights] 53 | assert qlinear.activation_qtype == (None if activations is None else qtypes[activations]) 54 | -------------------------------------------------------------------------------- /tests/tensor/activations/test_activations_compile.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import pytest 16 | import torch 17 | from helpers import random_tensor 18 | 19 | from optimum.quanto import ActivationQBytesTensor, absmax_scale, qint8, quantize_activation 20 | 21 | 22 | def compile_for_device(f, device): 23 | # Remove any side-effects from previous compilation 24 | torch.compiler.reset() 25 | # Inductor relies on Triton for inference, which does not support MPS 26 | backend = "aot_eager" if device == torch.device("mps") else "inductor" 27 | return torch.compile(f, backend=backend) 28 | 29 | 30 | @pytest.mark.skip("Disabled as it is not working (yet?)") 31 | @pytest.mark.parametrize("input_shape", [(2, 10), (10, 32, 32)]) 32 | @pytest.mark.parametrize("qtype", [qint8], ids=["qint8"]) 33 | @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=["fp32", "fp16", "bf16"]) 34 | def test_compile_quantize_tensor(input_shape, qtype, dtype, device): 35 | if device == torch.device("mps") and dtype == torch.bfloat16: 36 | pytest.skip("BFloat16 is not supported on MPS") 37 | a = random_tensor(input_shape, dtype=dtype).to(device) 38 | 39 | def f(x, qtype): 40 | scale = absmax_scale(x) 41 | return quantize_activation(x, qtype=qtype, scale=scale) 42 | 43 | compiled_f = compile_for_device(f, device) 44 | qa = compiled_f(a, qtype) 45 | assert isinstance(qa, ActivationQBytesTensor) 46 | assert qa.qtype == qtype 47 | assert qa._scale.dtype == dtype 48 | assert qa.axis is None 49 | 50 | 51 | def test_compile_qtensor_to(device): 52 | input_shape = (10, 32, 32) 53 | a = random_tensor(input_shape).to(device) 54 | 55 | def f(x, dtype): 56 | return x.to(dtype) 57 | 58 | compiled_f = compile_for_device(f, device) 59 | 60 | scale = absmax_scale(a) 61 | qa = quantize_activation(a, qtype=qint8, scale=scale) 62 | cqa = compiled_f(qa, torch.float16) 63 | assert isinstance(cqa, ActivationQBytesTensor) 64 | assert cqa.qtype == qint8 65 | assert cqa._scale.dtype == torch.float16 66 | -------------------------------------------------------------------------------- /tests/tensor/activations/test_activations_quantize.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
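# The symmetric scheme these tests check, in scalar form (a sketch, not the
# library kernel): with an absmax scale, q = round(x / scale) clamped to the
# integer range, and dequantization is q * scale, so the per-element error is
# at most about half a scale step.

import torch

x = torch.tensor([0.5, -1.0, 0.25])
scale = x.abs().max() / 127  # absmax scale for qint8
q = torch.clamp(torch.round(x / scale), -128, 127).to(torch.int8)
assert torch.allclose(q.float() * scale, x, atol=scale.item())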
14 | 15 | import pytest 16 | import torch 17 | from helpers import assert_similar, device_eq, random_tensor 18 | 19 | from optimum.quanto import ( 20 | ActivationQBytesTensor, 21 | absmax_scale, 22 | qfloat8, 23 | qfloat8_e4m3fn, 24 | qfloat8_e4m3fnuz, 25 | qfloat8_e5m2, 26 | qint8, 27 | ) 28 | 29 | 30 | @pytest.mark.parametrize("input_shape", [(32, 32), (32, 10, 32)]) 31 | @pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=["fp16", "fp32"]) 32 | @pytest.mark.parametrize("qtype", [qint8], ids=["qint8"]) 33 | def test_symmetric_quantize_int(input_shape, dtype, qtype, device): 34 | a = random_tensor(input_shape, dtype=dtype).to(device) 35 | scale = absmax_scale(a, qtype=qtype, axis=None) 36 | qa = ActivationQBytesTensor.quantize(a, qtype, scale) 37 | assert isinstance(qa, ActivationQBytesTensor) 38 | assert qa.dtype == dtype 39 | assert qa.qtype == qtype 40 | assert device_eq(qa.device, device) 41 | assert_similar(a, qa) 42 | 43 | 44 | @pytest.mark.skip_device("mps") 45 | @pytest.mark.skip_device("xpu") 46 | @pytest.mark.parametrize("input_shape", [(32, 32), (32, 10, 32)]) 47 | @pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=["fp16", "fp32"]) 48 | @pytest.mark.parametrize( 49 | "qtype", 50 | [qfloat8, qfloat8_e4m3fn, qfloat8_e4m3fnuz, qfloat8_e5m2], 51 | ids=["qfloat8", "qfloat8_e4m3fn", "qfloat8_e4m3fnuz", "qfloat8_e5m2"], 52 | ) 53 | def test_symmetric_quantize_float8(input_shape, dtype, qtype, device): 54 | a = random_tensor(input_shape, dtype=dtype).to(device) 55 | scale = absmax_scale(a, qtype=qtype, axis=None) 56 | qa = ActivationQBytesTensor.quantize(a, qtype, scale) 57 | assert isinstance(qa, ActivationQBytesTensor) 58 | assert qa.dtype == dtype 59 | assert qa.qtype == qtype 60 | assert device_eq(qa.device, device) 61 | assert_similar(a, qa, atol=5e-3) 62 | -------------------------------------------------------------------------------- /tests/tensor/ops/test_mm_dispatch.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
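# Why the tests below can compare a quantized matmul against the product of
# dequantized inputs: with per-tensor symmetric scales, the scales factor out
# of the accumulation, i.e. (qa * sa) @ (qb * sb) == (qa @ qb) * sa * sb.
# Scalar sketch assuming int32 accumulation:

import torch

qa = torch.randint(-128, 128, (4, 8), dtype=torch.int8)
qb = torch.randint(-128, 128, (8, 3), dtype=torch.int8)
sa, sb = 0.02, 0.05
ref = (qa.float() * sa) @ (qb.float() * sb)
out = (qa.int() @ qb.int()).float() * (sa * sb)
assert torch.allclose(ref, out, atol=1e-4)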
14 | 15 | import pytest 16 | import torch 17 | from helpers import assert_similar, random_qactivation, random_qweight 18 | 19 | from optimum.quanto import qint8 20 | 21 | 22 | @pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=["fp32", "fp16"]) 23 | @pytest.mark.parametrize("in_features", [5, 16, 24]) 24 | @pytest.mark.parametrize("hidden", [5, 16, 24]) 25 | @pytest.mark.parametrize("out_features", [5, 16, 24]) 26 | def test_qactivation_qweight_matmul(dtype, in_features, hidden, out_features, device): 27 | qa = random_qactivation((in_features, hidden), qint8, dtype=dtype).to(device) 28 | qb = random_qweight((hidden, out_features), qint8, dtype=dtype, axis=-1).to(device) 29 | qmatmul = torch.matmul(qa, qb) 30 | # The outputs should be almost identical if we use the dequantized inputs 31 | matmul = torch.matmul(qa.dequantize(), qb.dequantize()) 32 | assert_similar(matmul, qmatmul) 33 | 34 | 35 | @pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=["fp32", "fp16"]) 36 | @pytest.mark.parametrize("batch_size", [1, 10]) 37 | @pytest.mark.parametrize("a_shape, b_shape", [[(16, 32), (32, 24)], [(5, 10), (10, 6)]]) 38 | def test_qactivation_qactivation_bmm(dtype, batch_size, a_shape, b_shape, device): 39 | qa = random_qactivation((batch_size,) + a_shape, qint8, dtype=dtype).to(device) 40 | qb = random_qactivation((batch_size,) + b_shape, qint8, dtype=dtype).to(device) 41 | qbmm = torch.bmm(qa, qb) 42 | # The outputs should be almost identical if we use the dequantized inputs 43 | bmm = torch.bmm(qa.dequantize(), qb.dequantize()) 44 | assert_similar(bmm, qbmm) 45 | -------------------------------------------------------------------------------- /tests/tensor/optimizers/test_hqq_optimizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
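# A sketch of the criterion compared below, not the HQQ algorithm itself: both
# optimizers return an affine (scale, shift) pair, and the test checks that
# HQQ's pair reconstructs the weights with a lower mean absolute error. A
# generic affine round-trip for intuition (quanto's exact shift convention may
# differ); the range defaults mimic qint4 with unsigned storage.

import torch

def affine_roundtrip_error(w, scale, zeropoint, qmin=0, qmax=15):
    q = torch.clamp(torch.round(w / scale) + zeropoint, qmin, qmax)
    return ((q - zeropoint) * scale - w).abs().mean()

w = torch.randn(64)
err = affine_roundtrip_error(w, w.abs().max() / 7.5, zeropoint=8)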
14 | 15 | import pytest 16 | import torch 17 | from helpers import random_tensor 18 | 19 | from optimum.quanto import ( 20 | HqqOptimizer, 21 | MaxOptimizer, 22 | WeightQBitsTensor, 23 | qint2, 24 | qint4, 25 | ) 26 | 27 | 28 | def compare_quantized_tensor(a, qtype, axis, group_size, scale, shift): 29 | qa = WeightQBitsTensor.quantize(a, qtype, axis, group_size, scale, shift) 30 | # Evaluate mean absolute error 31 | mean_error = torch.mean(torch.abs(a - qa)) 32 | # Also evaluate cosine similarity 33 | sim = torch.nn.functional.cosine_similarity(a.flatten(), qa.flatten(), dim=0) 34 | return mean_error, sim 35 | 36 | 37 | @pytest.mark.parametrize("input_shape", [(1024, 1024)]) 38 | @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16], ids=["bf16", "fp16"]) 39 | @pytest.mark.parametrize("qtype", [qint2, qint4], ids=["qint2", "qint4"]) 40 | @pytest.mark.parametrize("axis", [0, -1], ids=["first-axis", "last-axis"]) 41 | @pytest.mark.parametrize("group_size", [32, 64, 128]) 42 | def test_hqq_optimizer(input_shape, dtype, qtype, axis, group_size, device): 43 | a = random_tensor(input_shape, dtype=dtype).to(device) 44 | max_scale, max_shift = MaxOptimizer()(a, qtype=qtype, axis=axis, group_size=group_size) 45 | max_mean_error, max_sim = compare_quantized_tensor(a, qtype, axis, group_size, max_scale, max_shift) 46 | hqq_scale, hqq_shift = HqqOptimizer()(a, qtype=qtype, axis=axis, group_size=group_size) 47 | hqq_mean_error, hqq_sim = compare_quantized_tensor(a, qtype, axis, group_size, hqq_scale, hqq_shift) 48 | # HQQ optimizes the mean error, so it should be lower 49 | assert hqq_mean_error <= max_mean_error 50 | # FIXME: HQQ cosine similarity should also be closer to 1 51 | -------------------------------------------------------------------------------- /tests/tensor/test_absmax.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
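# What absmax_scale computes for the axes tested below (a sketch assuming
# qint8, whose largest representable value is 127): per-tensor uses the global
# absolute maximum, while per-axis keeps one scale per slice, with dimensions
# kept so the scale broadcasts against the input.

import torch

a = torch.randn(4, 8)
per_tensor = a.abs().max() / 127                       # scalar, ndim == 0
first_axis = a.abs().amax(dim=-1, keepdim=True) / 127  # (4, 1): one scale per row of a 2-D tensor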
14 | 15 | import pytest 16 | import torch 17 | from helpers import random_tensor 18 | 19 | from optimum.quanto import absmax_scale, qfloat8, qint8 20 | 21 | 22 | @pytest.mark.parametrize("input_shape", [(10,), (1, 10), (2, 10), (10, 32, 32)]) 23 | @pytest.mark.parametrize("qtype", [qint8, qfloat8], ids=["qint8", "qfloat8"]) 24 | @pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=["fp16", "fp32"]) 25 | @pytest.mark.parametrize("axis", [None, 0, -1], ids=["per-tensor", "first-axis", "last-axis"]) 26 | def test_absmax_scale(input_shape, axis, dtype, qtype, device): 27 | if device.type == "mps" and qtype.is_floating_point: 28 | pytest.skip("Float8 types are not supported on MPS devices") 29 | a = random_tensor(input_shape, dtype=dtype).to(device) 30 | scale = absmax_scale(a, qtype, axis) 31 | assert scale.dtype == dtype 32 | if axis is None: 33 | assert scale.ndim == 0 34 | else: 35 | assert scale.ndim == a.ndim 36 | sscale = torch.squeeze(scale) 37 | if a.ndim == 1 or a.shape[axis] == 1: 38 | # Quantization is actually per-tensor as the axis dim is 1 39 | assert sscale.ndim == 0 40 | else: 41 | assert sscale.ndim == 1 42 | -------------------------------------------------------------------------------- /tests/tensor/test_packed_tensor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
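# The storage saving exercised below: 4-bit packing stores two values per
# uint8 byte and 2-bit packing stores four, so the packed buffer holds
# numel // (8 // bits) bytes. A sketch, assuming the packed storage is exposed
# as `_data` (as the serialization test below relies on) and that these even
# shapes need no padding.

import torch
from optimum.quanto.tensor.packed import PackedTensor

t = torch.randint(0, 16, (32, 32), dtype=torch.uint8)
packed = PackedTensor.pack(t, bits=4)
assert packed._data.numel() == t.numel() // 2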
14 | 15 | import io 16 | 17 | import pytest 18 | import torch 19 | from helpers import device_eq 20 | 21 | from optimum.quanto.tensor.packed import PackedTensor 22 | 23 | 24 | @pytest.mark.parametrize("shape", [(10,), (12,), (10, 10), (12, 10), (32, 32)]) 25 | @pytest.mark.parametrize("bits", [2, 4], ids=["int2", "int4"]) 26 | def test_pack_tensor(shape, bits, device): 27 | """This test verifies that an integer tensor in the correct range is preserved.""" 28 | qmax = 2**bits 29 | t = torch.randint(0, qmax, shape, dtype=torch.uint8).to(device) 30 | packed = PackedTensor.pack(t, bits=bits) 31 | 32 | assert isinstance(packed, PackedTensor) 33 | assert packed.dtype == torch.uint8 34 | assert device_eq(packed.device, device) 35 | assert torch.equal(t, packed.unpack()) 36 | 37 | 38 | @pytest.mark.parametrize("bits", [2, 4], ids=["int2", "int4"]) 39 | def test_packed_tensor_serialization(bits, device): 40 | qmax = 2**bits 41 | shape = (10, 32) 42 | t = torch.randint(0, qmax, shape, dtype=torch.uint8).to(device) 43 | packed = PackedTensor.pack(t, bits=bits) 44 | b = io.BytesIO() 45 | torch.save(packed, b) 46 | b.seek(0) 47 | packed_reloaded = torch.load(b, weights_only=False) 48 | assert isinstance(packed_reloaded, PackedTensor) 49 | assert packed_reloaded.shape == packed.shape 50 | assert packed_reloaded.dtype == packed.dtype 51 | assert packed_reloaded.bits == packed.bits 52 | assert torch.equal(packed_reloaded._data, packed._data) 53 | assert torch.equal(t, packed_reloaded.unpack()) 54 | -------------------------------------------------------------------------------- /tests/tensor/weights/optimized/test_awq_packed_tensor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
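# Illustration only: AWQ kernels store eight 4-bit values per int32 word in a
# kernel-specific interleaved order (controlled by the `reorder` flag below);
# the exact permutation lives in the CUDA/XPU extension. Generic,
# non-interleaved nibble packing for intuition:

import torch

t = torch.randint(0, 16, (4, 8), dtype=torch.int32)
packed = torch.zeros(4, dtype=torch.int32)
for i in range(8):
    packed |= t[:, i] << (4 * i)  # one int32 word per row of eight nibbles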
14 | 15 | 16 | import numpy as np 17 | import pytest 18 | import torch 19 | from helpers import device_eq 20 | 21 | from optimum.quanto.tensor.weights.awq import AWQPackedTensor, AWQPacking 22 | 23 | 24 | @pytest.mark.skip_device("cpu") 25 | @pytest.mark.skip_device("mps") 26 | @pytest.mark.parametrize("in_features", [128, 256, 512, 1024]) 27 | @pytest.mark.parametrize("out_features", [128, 256, 512, 1024]) 28 | @pytest.mark.parametrize("random", [True, False]) 29 | @pytest.mark.parametrize("packing, reorder", [(AWQPacking.V1, True), (AWQPacking.V1, False), (AWQPacking.V2, False)]) 30 | def test_pack_awq_tensor(in_features, out_features, random, packing, reorder, device): 31 | bits = 4 32 | qmax = 2**bits 33 | shape = (out_features, in_features) 34 | if random: 35 | t = torch.randint(0, qmax, shape, dtype=torch.uint8).to(device) 36 | else: 37 | numel = np.prod(shape) 38 | t = torch.tensor(range(numel), dtype=torch.int32) 39 | t = (t % qmax).reshape(shape).to(torch.uint8).to(device) 40 | packed = AWQPackedTensor.pack(t, packing=packing, reorder=reorder) 41 | assert isinstance(packed, AWQPackedTensor) 42 | assert packed._packing == packing 43 | assert packed._reorder == reorder 44 | assert device_eq(packed.device, device) 45 | assert torch.equal(t, packed.unpack()) 46 | 47 | 48 | @pytest.mark.skip_device("cpu") 49 | @pytest.mark.skip_device("mps") 50 | @pytest.mark.parametrize("packing, reorder", [(AWQPacking.V1, True), (AWQPacking.V2, False)]) 51 | def test_move_awq_tensor(packing, reorder, device): 52 | shape = (256, 256) 53 | bits = 4 54 | qmax = 2**bits 55 | numel = np.prod(shape) 56 | t = torch.tensor(range(numel), dtype=torch.int32) 57 | t = (t % qmax).reshape(shape).to(torch.uint8).to(device) 58 | packed = AWQPackedTensor.pack(t, packing=packing, reorder=reorder) 59 | assert packed._packing == packing 60 | assert packed._reorder == reorder 61 | moved = packed.to(device) 62 | assert isinstance(moved, AWQPackedTensor) 63 | assert moved._packing == packing 64 | assert moved._reorder == reorder 65 | # AWQ packed tensors are unpacked when moved out of CUDA or XPU devices 66 | moved = packed.to("cpu") 67 | assert type(moved) is torch.Tensor 68 | -------------------------------------------------------------------------------- /tests/tensor/weights/optimized/test_marlin_fp8_packed_tensor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
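# Why get_fp8_tensor below zeroes the values 127 and 255: in float8_e4m3fn
# those bit patterns (0x7F and 0xFF) encode NaN, and NaN != NaN would break
# the torch.equal round-trip checks. A quick verification:

import torch

nan_bits = torch.tensor([0x7F, 0xFF], dtype=torch.uint8)
assert nan_bits.view(torch.float8_e4m3fn).float().isnan().all()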
14 | 15 | 16 | import numpy as np 17 | import pytest 18 | import torch 19 | from helpers import device_eq 20 | 21 | from optimum.quanto.library.extensions import is_extension_available 22 | from optimum.quanto.tensor.weights.marlin.fp8 import MarlinF8PackedTensor 23 | 24 | 25 | def get_fp8_tensor(shape, device, random=False): 26 | # We will initialize float8 from a uint8 tensor 27 | qmax = 2**8 28 | if random: 29 | t = torch.randint(0, qmax, shape, dtype=torch.uint8).to(device) 30 | else: 31 | numel = np.prod(shape) 32 | t = torch.tensor(range(numel), dtype=torch.int32) 33 | t = (t % qmax).reshape(shape).to(torch.uint8).to(device) 34 | # Remove values that would be interpreted as nans in float8. 35 | t[t == 127] = 0 36 | t[t == 255] = 0 37 | return t.view(torch.float8_e4m3fn).to(device) 38 | 39 | 40 | @pytest.mark.skipif(not is_extension_available("quanto_cuda"), reason="CUDA extension is not available") 41 | @pytest.mark.parametrize("in_features", [128, 256, 512, 1024]) 42 | @pytest.mark.parametrize("out_features", [128, 256, 512, 1024]) 43 | @pytest.mark.parametrize("random", [True, False]) 44 | def test_pack_marlin_fp8_tensor(in_features, out_features, random): 45 | shape = (out_features, in_features) 46 | device = torch.device("cuda") 47 | t = get_fp8_tensor(shape, device, random) 48 | packed = MarlinF8PackedTensor.pack(t) 49 | assert isinstance(packed, MarlinF8PackedTensor) 50 | assert device_eq(packed.device, device) 51 | assert torch.equal(t, packed.unpack()) 52 | 53 | 54 | @pytest.mark.skipif(not is_extension_available("quanto_cuda"), reason="CUDA extension is not available") 55 | def test_move_marlin_fp8_tensor(): 56 | shape = (256, 256) 57 | device = torch.device("cuda") 58 | t = get_fp8_tensor(shape, device) 59 | packed = MarlinF8PackedTensor.pack(t) 60 | moved = packed.to("cuda") 61 | assert isinstance(moved, MarlinF8PackedTensor) 62 | # Marlin FP8 tensors are unpacked when moved out of CUDA device 63 | moved = packed.to("cpu") 64 | assert type(moved) is torch.Tensor 65 | assert torch.equal(t, moved.to("cuda")) 66 | -------------------------------------------------------------------------------- /tests/tensor/weights/optimized/test_marlin_int4_packed_tensor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | import numpy as np 17 | import pytest 18 | import torch 19 | from helpers import device_eq 20 | 21 | from optimum.quanto.tensor.weights.marlin.int4 import MarlinInt4PackedTensor 22 | 23 | 24 | def get_uint4_tensor(shape, device, random=False): 25 | qmax = 2**4 26 | if random: 27 | t = torch.randint(0, qmax, shape, dtype=torch.uint8).to(device) 28 | else: 29 | numel = np.prod(shape) 30 | t = torch.tensor(range(numel), dtype=torch.int32) 31 | t = (t % qmax).reshape(shape).to(torch.uint8).to(device) 32 | return t 33 | 34 | 35 | @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") 36 | @pytest.mark.parametrize("in_features", [128, 256, 512, 1024]) 37 | @pytest.mark.parametrize("out_features", [128, 256, 512, 1024]) 38 | @pytest.mark.parametrize("random", [True, False]) 39 | def test_pack_marlin_int4_tensor(in_features, out_features, random): 40 | shape = (out_features, in_features) 41 | device = torch.device("cuda") 42 | t = get_uint4_tensor(shape, device, random) 43 | packed = MarlinInt4PackedTensor.pack(t) 44 | assert isinstance(packed, MarlinInt4PackedTensor) 45 | assert device_eq(packed.device, device) 46 | assert torch.equal(t, packed.unpack()) 47 | 48 | 49 | @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") 50 | def test_move_marlin_int4_packed_tensor(): 51 | shape = (256, 256) 52 | device = torch.device("cuda") 53 | t = get_uint4_tensor(shape, device) 54 | packed = MarlinInt4PackedTensor.pack(t) 55 | moved = packed.to("cuda") 56 | assert isinstance(moved, MarlinInt4PackedTensor) 57 | # Marlin int4 tensors are unpacked when moved out of CUDA device 58 | moved = packed.to("cpu") 59 | assert type(moved) is torch.Tensor 60 | assert torch.equal(t, moved.to("cuda")) 61 | -------------------------------------------------------------------------------- /tests/tensor/weights/optimized/test_marlin_qbytes_tensor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import pytest 16 | import torch 17 | 18 | from optimum.quanto import qfloat8_e4m3fn 19 | from optimum.quanto.library.extensions import is_extension_available 20 | from optimum.quanto.tensor.weights.marlin import MarlinF8QBytesTensor 21 | 22 | 23 | @pytest.mark.skipif( 24 | not is_extension_available("quanto_cuda") or torch.cuda.get_device_capability()[0] < 8, 25 | reason="CUDA >= sm80 not available", 26 | ) 27 | @pytest.mark.parametrize("in_features", [16, 32, 48, 64]) 28 | @pytest.mark.parametrize("out_features", [64, 128, 192, 256]) 29 | def test_pack_unpack(in_features: int, out_features: int): 30 | data = torch.randint(0, 256, size=(out_features, in_features), dtype=torch.uint8, device="cuda") 31 | 32 | # Remove nans. 
33 | data[data == 127] = 0 34 | data[data == 255] = 0 35 | 36 | data = data.view(torch.float8_e4m3fn) 37 | 38 | qtype = qfloat8_e4m3fn 39 | axis = 0 40 | size = data.shape 41 | stride = data.stride() 42 | scale = torch.rand((out_features, 1), dtype=torch.float16, device="cuda") 43 | marlin_tensor = MarlinF8QBytesTensor(qtype, axis, size, stride, data, scale) 44 | 45 | data_dequantized = marlin_tensor.dequantize() 46 | 47 | assert torch.all((data.to(torch.float16) * scale - data_dequantized).abs() < 1e-4) 48 | -------------------------------------------------------------------------------- /tests/tensor/weights/optimized/test_tinygemm_packed_tensor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import numpy as np 17 | import pytest 18 | import torch 19 | from helpers import device_eq 20 | from packaging import version 21 | 22 | from optimum.quanto.tensor.weights.tinygemm import TinyGemmPackedTensor 23 | 24 | 25 | @pytest.mark.skip_device("mps") # Only available with pytorch 2.4 26 | @pytest.mark.parametrize("in_features", [128, 256, 512, 1024]) 27 | @pytest.mark.parametrize("out_features", [128, 256, 512, 1024]) 28 | @pytest.mark.parametrize("random", [True, False]) 29 | def test_pack_tinygemm_tensor(in_features, out_features, random, device): 30 | if device.type == "cuda": 31 | if torch.version.hip: 32 | pytest.skip(reason="TinyGemm is not supported on ROCm devices") 33 | if version.parse(torch.version.cuda).release < (12, 1): 34 | pytest.skip(reason="CUDA runtime must be at least 12.1") 35 | if torch.cuda.get_device_capability()[0] < 8: 36 | pytest.skip(reason="CUDA device >= sm80 not available") 37 | bits = 4 38 | qmax = 2**bits 39 | shape = (out_features, in_features) 40 | if random: 41 | t = torch.randint(0, qmax, shape, dtype=torch.uint8).to(device) 42 | else: 43 | numel = np.prod(shape) 44 | t = torch.tensor(range(numel), dtype=torch.int32) 45 | t = (t % qmax).reshape(shape).to(torch.uint8).to(device) 46 | packed = TinyGemmPackedTensor.pack(t) 47 | assert isinstance(packed, TinyGemmPackedTensor) 48 | assert device_eq(packed.device, device) 49 | assert torch.equal(t, packed.unpack()) 50 | 51 | 52 | @pytest.mark.skip_device("mps") # Only available with pytorch 2.4 53 | def test_move_tinygemm_packed_tensor(device): 54 | if device.type == "cuda": 55 | if torch.version.hip: 56 | pytest.skip(reason="TinyGemm is not supported on ROCm devices") 57 | if version.parse(torch.version.cuda).release < (12, 1): 58 | pytest.skip(reason="CUDA runtime must be at least 12.1") 59 | if torch.cuda.get_device_capability()[0] < 8: 60 | pytest.skip(reason="CUDA device >= sm80 not available") 61 | shape = (256, 256) 62 | bits = 4 63 | qmax = 2**bits 64 | numel = np.prod(shape) 65 | t = torch.tensor(range(numel), dtype=torch.int32) 66 | t = (t % qmax).reshape(shape).to(torch.uint8) 67 | packed = 
TinyGemmPackedTensor.pack(t) 68 | moved = packed.to(device) 69 | assert torch.equal(t.to(device), moved.unpack()) 70 | -------------------------------------------------------------------------------- /tests/tensor/weights/test_weight_qbits_tensor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import io 16 | 17 | import pytest 18 | import torch 19 | from helpers import random_qweight, random_tensor 20 | 21 | from optimum.quanto import MaxOptimizer, WeightQBitsTensor, qint2, qint4, quantize_weight 22 | 23 | 24 | @pytest.mark.parametrize("qtype", [qint2, qint4], ids=["int2", "int4"]) 25 | @pytest.mark.parametrize("axis", [0, -1], ids=["first-axis", "last-axis"]) 26 | def test_weight_qbits_tensor_serialization(qtype, axis): 27 | qa = random_qweight((5, 5), qtype=qtype, axis=axis) 28 | b = io.BytesIO() 29 | torch.save(qa, b) 30 | b.seek(0) 31 | qa_reloaded = torch.load(b, weights_only=False) 32 | assert isinstance(qa_reloaded, WeightQBitsTensor) 33 | assert qa_reloaded.qtype == qa.qtype 34 | assert qa_reloaded.dtype == qa.dtype 35 | assert torch.equal(qa_reloaded._data, qa._data) 36 | assert torch.equal(qa_reloaded._scale, qa._scale) 37 | assert torch.equal(qa_reloaded._shift, qa._shift) 38 | 39 | 40 | @pytest.mark.parametrize("qtype", [qint2, qint4], ids=["int2", "int4"]) 41 | @pytest.mark.parametrize("axis", [0, -1], ids=["first-axis", "last-axis"]) 42 | @pytest.mark.parametrize("group_size", [None, 16], ids=["channel-wise", "group-wise"]) 43 | def test_weight_qbits_tensor_requires_grad(qtype, axis, group_size, device): 44 | weight = random_tensor((32, 32), dtype=torch.float32).to(device) 45 | weight.requires_grad = True 46 | scale, shift = MaxOptimizer()(weight, qtype=qtype, axis=axis, group_size=group_size) 47 | qweight = quantize_weight(weight, qtype=qtype, axis=axis, scale=scale, shift=shift, group_size=group_size) 48 | assert qweight.requires_grad is True 49 | 50 | 51 | @pytest.mark.parametrize("qtype", [qint2, qint4], ids=["int2", "int4"]) 52 | @pytest.mark.parametrize("axis", [0, -1], ids=["first-axis", "last-axis"]) 53 | @pytest.mark.parametrize("group_size", [None, 16], ids=["channel-wise", "group-wise"]) 54 | def test_weight_qbits_tensor_backward(qtype, axis, group_size, device): 55 | weight = random_tensor((32, 32), dtype=torch.float32).to(device) 56 | weight.requires_grad = True 57 | scale, shift = MaxOptimizer()(weight, qtype=qtype, axis=axis, group_size=group_size) 58 | qweight = quantize_weight(weight, qtype=qtype, axis=axis, scale=scale, shift=shift, group_size=group_size) 59 | gradient = torch.randn((32, 32)).to(device) 60 | # Backpropagate gradient to the inner float weights 61 | qweight.dequantize().backward(gradient) 62 | assert torch.equal(weight.grad, gradient) 63 | -------------------------------------------------------------------------------- 
/tests/tensor/weights/test_weight_qbits_tensor_instantiate.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import pytest 17 | import torch 18 | 19 | from optimum.quanto import qint2, qint4 20 | from optimum.quanto.tensor.weights import WeightQBitsTensor 21 | 22 | 23 | def random_data_scale_shift(input_shape, dtype, qtype, axis, group_size): 24 | out_features, in_features = input_shape 25 | n_groups = in_features * out_features // group_size 26 | data_shape = (n_groups, group_size) if axis == 0 else (group_size, n_groups) 27 | scale_shape = (n_groups, 1) if axis == 0 else (1, n_groups) 28 | min_value = -(2 ** (qtype.bits - 1)) 29 | max_value = 2 ** (qtype.bits - 1) - 1 30 | data = torch.randint(max_value - min_value + 1, data_shape, dtype=torch.uint8) 31 | scale = torch.full(scale_shape, 1.0 / -min_value, dtype=dtype) 32 | shift = torch.ones(scale_shape, dtype=dtype) 33 | return data, scale, shift 34 | 35 | 36 | @pytest.mark.parametrize("input_shape, group_size", [[(32, 32), 16], [(1024, 1024), 128]]) 37 | @pytest.mark.parametrize("axis", [0, -1], ids=["first-axis", "last-axis"]) 38 | @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32], ids=["bf16", "fp16", "fp32"]) 39 | @pytest.mark.parametrize("qtype", [qint2, qint4], ids=["qint2", "qint4"]) 40 | def test_weight_qbits_tensor_instantiate(input_shape, dtype, qtype, axis, group_size, device): 41 | data, scale, shift = random_data_scale_shift(input_shape, dtype, qtype, axis, group_size) 42 | input_stride = torch.ones(input_shape).stride() 43 | qa = WeightQBitsTensor(qtype, axis, group_size, input_shape, input_stride, data, scale=scale, shift=shift).to( 44 | device 45 | ) 46 | assert torch.max(torch.abs(qa.dequantize())) <= 1 47 | assert qa.dtype == dtype 48 | assert qa.qtype == qtype 49 | assert qa.shape == input_shape 50 | 51 | 52 | @pytest.mark.parametrize("input_shape, group_size", [[(32, 32), 16], [(1024, 1024), 128]]) 53 | @pytest.mark.parametrize("axis", [0, -1], ids=["first-axis", "last-axis"]) 54 | @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32], ids=["bf16", "fp16", "fp32"]) 55 | @pytest.mark.parametrize("qtype", [qint2, qint4], ids=["qint2", "qint4"]) 56 | def test_weight_qbits_tensor_equal(input_shape, dtype, qtype, axis, group_size, device): 57 | data, scale, shift = random_data_scale_shift(input_shape, dtype, qtype, axis, group_size) 58 | qa = WeightQBitsTensor(qtype, axis, group_size, data.size(), data.stride(), data, scale=scale, shift=shift).to( 59 | device 60 | ) 61 | qb = WeightQBitsTensor( 62 | qtype, axis, group_size, data.size(), data.stride(), data.clone(), scale=scale.clone(), shift=shift.clone() 63 | ).to(device) 64 | assert qa.equal(qb) 65 | -------------------------------------------------------------------------------- 
/tests/tensor/weights/test_weight_qbits_tensor_quantize.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import pytest 16 | import torch 17 | from helpers import assert_similar, device_eq, random_tensor 18 | 19 | from optimum.quanto import ( 20 | MaxOptimizer, 21 | qint2, 22 | qint4, 23 | ) 24 | from optimum.quanto.tensor.weights import WeightQBitsTensor 25 | 26 | 27 | @pytest.mark.parametrize("input_shape", [(32, 32), (32, 10, 32)]) 28 | @pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=["fp16", "fp32"]) 29 | @pytest.mark.parametrize("qtype", [qint2, qint4], ids=["qint2", "qint4"]) 30 | @pytest.mark.parametrize("axis", [0, -1], ids=["first-axis", "last-axis"]) 31 | @pytest.mark.parametrize("group_size", [None, 8], ids=["channel-wise", "group-wise"]) 32 | @pytest.mark.parametrize("shift_mode", ["zeropoint", "float"]) 33 | def test_weight_qbits_tensor_quantize(input_shape, dtype, qtype, axis, group_size, shift_mode, device): 34 | a = random_tensor(input_shape, dtype=dtype).to(device) 35 | scale, shift = MaxOptimizer()(a, qtype=qtype, axis=axis, group_size=group_size) 36 | if shift_mode == "zeropoint": 37 | shift = torch.round(shift / scale).to(torch.int8) 38 | qa = WeightQBitsTensor.quantize(a, qtype, axis, group_size, scale, shift) 39 | assert isinstance(qa, WeightQBitsTensor) 40 | assert qa.dtype == dtype 41 | assert qa.qtype == qtype 42 | assert device_eq(qa.device, device) 43 | atol = { 44 | qint4: { 45 | "zeropoint": 4e-3, 46 | "float": 3e-3, 47 | }, 48 | qint2: { 49 | "zeropoint": 6e-2, 50 | "float": 5e-2, 51 | }, 52 | }[qtype][shift_mode] 53 | assert_similar(a, qa, atol=atol) 54 | 55 | 56 | @pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=["fp16", "fp32"]) 57 | @pytest.mark.parametrize("qtype", [qint2, qint4], ids=["qint2", "qint4"]) 58 | def test_weight_qbits_tensor_quantize_integer_tensor(dtype, qtype, device): 59 | """This test verifies that an integer tensor in the correct range is preserved.""" 60 | bits = qtype.bits 61 | qmin = -(2 ** (bits - 1)) 62 | qmax = 2 ** (bits - 1) - 1 63 | a = torch.tensor(range(qmin, qmax + 1), dtype=dtype).to(device) 64 | scale, shift = MaxOptimizer()(a, qtype=qtype, axis=0, group_size=None) 65 | zeropoint = torch.round(shift / scale) 66 | qa = WeightQBitsTensor.quantize(a, qtype, 0, None, scale, zeropoint) 67 | 68 | assert qa._data.dtype == torch.uint8 69 | assert isinstance(qa, WeightQBitsTensor) 70 | assert qa.dtype == dtype 71 | assert qa.qtype == qtype 72 | assert device_eq(qa.device, device) 73 | assert torch.equal(a, qa.dequantize()) 74 | -------------------------------------------------------------------------------- /tests/tensor/weights/test_weight_qbytes_tensor_backward.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. 
All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import torch 17 | from helpers import random_tensor 18 | 19 | from optimum.quanto import AbsmaxOptimizer, qint8, quantize_weight 20 | 21 | 22 | def test_weight_qbytes_tensor_requires_grad(device): 23 | w = random_tensor((10, 10), dtype=torch.float32).to(device) 24 | w.requires_grad = True 25 | scale = AbsmaxOptimizer()(w, qtype=qint8, axis=0) 26 | qw = quantize_weight(w, qtype=qint8, axis=0, scale=scale) 27 | assert qw.requires_grad is True 28 | 29 | 30 | def test_weight_qbytes_tensor_backward(device): 31 | w = random_tensor((10, 10), dtype=torch.float32).to(device) 32 | w.requires_grad = True 33 | scale = AbsmaxOptimizer()(w, qtype=qint8, axis=0) 34 | qw = quantize_weight(w, qtype=qint8, axis=0, scale=scale) 35 | gradient = torch.randn((10, 10)).to(device) 36 | # Backpropagate gradient to the inner float weights 37 | qw.dequantize().backward(gradient) 38 | assert torch.equal(w.grad, gradient) 39 | 40 | 41 | def test_weight_qbytes_tensor_chained_backward(device): 42 | a = random_tensor((10, 10), dtype=torch.float32).to(device) 43 | a.requires_grad = True 44 | scale = AbsmaxOptimizer()(a, qtype=qint8, axis=0) 45 | qa = quantize_weight(a, qtype=qint8, axis=0, scale=scale) 46 | b = random_tensor((10, 10), dtype=torch.float32).to(device) 47 | b.requires_grad = True 48 | scale = AbsmaxOptimizer()(b, qtype=qint8, axis=0) 49 | qb = quantize_weight(b, qtype=qint8, axis=0, scale=scale) 50 | # Evaluate the product 51 | prod = qa * qb 52 | # Backpropagate 53 | gradient = torch.randn((10, 10)).to(device) 54 | prod.backward(gradient) 55 | assert torch.allclose(a.grad, qb.dequantize() * gradient) 56 | assert torch.allclose(b.grad, qa.dequantize() * gradient) 57 | -------------------------------------------------------------------------------- /tests/tensor/weights/test_weight_qbytes_tensor_dispatch.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from helpers import random_qweight, random_tensor 4 | 5 | from optimum.quanto import AbsmaxOptimizer, WeightQBytesTensor, qint8, quantize_weight 6 | 7 | 8 | def test_weight_qbytes_tensor_to_device(device): 9 | qa = random_qweight((32, 32), qtype=qint8, dtype=torch.float) 10 | qa = qa.to(device) 11 | assert isinstance(qa, WeightQBytesTensor) 12 | assert qa.device.type == device.type 13 | assert qa._data.device.type == device.type 14 | assert qa._scale.device.type == device.type 15 | 16 | 17 | @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32], ids=["bf16", "fp16", "fp32"]) 18 | @pytest.mark.parametrize("qtype", [qint8]) 19 | @pytest.mark.parametrize("axis", [0, -1], ids=["first-axis", "last-axis"]) 20 | def test_weight_qbytes_tensor_equal(dtype, qtype, axis, device): 21 | a = random_tensor((32, 32), dtype=dtype, device=device) 22 | scale = AbsmaxOptimizer()(a, qtype=qtype, axis=axis) 23 | qa1 = quantize_weight(a, qtype=qtype, axis=axis, 
scale=scale) 24 | qa2 = quantize_weight(a, qtype=qtype, axis=axis, scale=scale) 25 | assert torch.equal(qa1, qa2) 26 | 27 | 28 | @pytest.mark.parametrize("axis", [0, -1], ids=["first-axis", "last-axis"]) 29 | @pytest.mark.parametrize("qtype", [qint8]) 30 | def test_weight_qbytes_tensor_transpose_contiguous(axis, qtype, device): 31 | input_shape = (16, 32) 32 | qa = random_qweight(input_shape, axis=axis, qtype=qtype, dtype=torch.float32).to(device) 33 | assert qa.is_contiguous() 34 | tqa = qa.t() 35 | assert isinstance(tqa, WeightQBytesTensor) 36 | assert not tqa.is_contiguous() 37 | tqa = tqa.contiguous() 38 | assert tqa.is_contiguous() 39 | 40 | 41 | @pytest.mark.parametrize("axis", [0, -1], ids=["first-axis", "last-axis"]) 42 | @pytest.mark.parametrize("qtype", [qint8]) 43 | def test_weight_qbytes_tensor_transposed_stride(axis, qtype, device): 44 | input_shape = (16, 32) 45 | a = random_tensor(input_shape, dtype=torch.float32).to(device) 46 | scale = AbsmaxOptimizer()(a, qtype=qtype, axis=axis) 47 | qa = quantize_weight(a, qtype=qtype, axis=axis, scale=scale) 48 | assert qa.stride() == a.stride() 49 | ta = a.t() 50 | tqa = qa.t() 51 | assert isinstance(tqa, WeightQBytesTensor) 52 | assert tqa.stride() == ta.stride() 53 | -------------------------------------------------------------------------------- /tests/tensor/weights/test_weight_qbytes_tensor_instantiate.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
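# The invariant test_qbytestensor_instantiate checks below, shown with plain
# tensors: when |data| is bounded by the qtype's maximum and scale is its
# reciprocal, dequantization (data * scale) stays within [-1, 1].

import torch

data = torch.randint(-127, 128, (10,), dtype=torch.int8)
scale = torch.tensor(1.0 / 127)
assert (data.float() * scale).abs().max() <= 1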
14 | 15 | 16 | import pytest 17 | import torch 18 | 19 | from optimum.quanto import WeightQBytesTensor, qfloat8, qint8 20 | 21 | 22 | def random_data_scale(input_shape, dtype, qtype): 23 | if qtype.is_floating_point: 24 | min_value = torch.finfo(qtype.dtype).min 25 | max_value = torch.finfo(qtype.dtype).max 26 | data = (torch.rand(input_shape) * (max_value - min_value) + min_value).to(qtype.dtype) 27 | else: 28 | max_value = torch.iinfo(qtype.dtype).max 29 | data = torch.randint(-max_value, max_value, input_shape, dtype=qtype.dtype) 30 | scale = torch.tensor(1.0 / max_value, dtype=dtype) 31 | return data, scale 32 | 33 | 34 | @pytest.mark.parametrize("input_shape", [(10,), (1, 10), (10, 32, 32)]) 35 | @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32], ids=["bf16", "fp16", "fp32"]) 36 | @pytest.mark.parametrize("qtype", [qint8, qfloat8], ids=["qint8", "qfloat8"]) 37 | def test_qbytestensor_instantiate(input_shape, dtype, qtype, device): 38 | if qtype.is_floating_point and device.type == "mps": 39 | pytest.skip("float8 types are not supported on MPS device") 40 | data, scale = random_data_scale(input_shape, dtype, qtype) 41 | qa = WeightQBytesTensor(qtype, None, data.size(), data.stride(), data, scale=scale, activation_qtype=None).to( 42 | device 43 | ) 44 | assert torch.max(torch.abs(qa.dequantize())) <= 1 45 | assert qa.dtype == dtype 46 | assert qa.qtype == qtype 47 | assert qa.shape == input_shape 48 | 49 | 50 | @pytest.mark.parametrize("input_shape", [(10,), (1, 10), (10, 32, 32)]) 51 | @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32], ids=["bf16", "fp16", "fp32"]) 52 | @pytest.mark.parametrize("qtype", [qint8], ids=["qint8"]) 53 | def test_qbytestensor_equal(input_shape, dtype, qtype, device): 54 | data, scale = random_data_scale(input_shape, dtype, qtype) 55 | qa = WeightQBytesTensor(qtype, None, data.size(), data.stride(), data, scale=scale, activation_qtype=None).to( 56 | device 57 | ) 58 | qb = WeightQBytesTensor( 59 | qtype, None, data.size(), data.stride(), data.clone(), scale=scale, activation_qtype=None 60 | ).to(device) 61 | assert qa.equal(qb) 62 | -------------------------------------------------------------------------------- /tests/tensor/weights/test_weight_qbytes_tensor_quantize.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
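# For the float8 variants below, absmax scaling divides by the dtype's largest
# finite value rather than an integer qmax (448 for e4m3fn, 57344 for e5m2).
# A per-tensor sketch, not the library kernel:

import torch

x = torch.randn(32, 32)
scale = x.abs().max() / torch.finfo(torch.float8_e4m3fn).max
qx = (x / scale).to(torch.float8_e4m3fn)  # quantized storage
x_hat = qx.float() * scale                # dequantized values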
14 | 15 | import pytest 16 | import torch 17 | from helpers import assert_similar, device_eq, random_qweight, random_tensor 18 | 19 | from optimum.quanto import ( 20 | WeightQBytesTensor, 21 | absmax_scale, 22 | qfloat8, 23 | qfloat8_e4m3fn, 24 | qfloat8_e4m3fnuz, 25 | qfloat8_e5m2, 26 | qint8, 27 | ) 28 | 29 | 30 | @pytest.mark.parametrize("input_shape", [(32, 32), (32, 10, 32)]) 31 | @pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=["fp16", "fp32"]) 32 | @pytest.mark.parametrize("qtype", [qint8], ids=["qint8"]) 33 | @pytest.mark.parametrize( 34 | "axis", 35 | [None, 0, -1], 36 | ids=["per-tensor", "first-axis", "last-axis"], 37 | ) 38 | def test_symmetric_quantize_int(input_shape, dtype, qtype, axis, device): 39 | a = random_tensor(input_shape, dtype=dtype).to(device) 40 | scale = absmax_scale(a, qtype=qtype, axis=axis) 41 | qa = WeightQBytesTensor.quantize(a, qtype, axis, scale) 42 | assert isinstance(qa, WeightQBytesTensor) 43 | assert qa.dtype == dtype 44 | assert qa.qtype == qtype 45 | assert device_eq(qa.device, device) 46 | assert_similar(a, qa) 47 | 48 | 49 | @pytest.mark.skip_device("mps") 50 | @pytest.mark.skip_device("xpu") 51 | @pytest.mark.parametrize("input_shape", [(32, 32), (32, 10, 32)]) 52 | @pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=["fp16", "fp32"]) 53 | @pytest.mark.parametrize( 54 | "qtype", 55 | [qfloat8, qfloat8_e4m3fn, qfloat8_e4m3fnuz, qfloat8_e5m2], 56 | ids=["qfloat8", "qfloat8_e4m3fn", "qfloat8_e4m3fnuz", "qfloat8_e5m2"], 57 | ) 58 | @pytest.mark.parametrize( 59 | "axis", 60 | [None, 0, -1], 61 | ids=["per-tensor", "first-axis", "last-axis"], 62 | ) 63 | def test_symmetric_quantize_float8(input_shape, dtype, qtype, axis, device): 64 | a = random_tensor(input_shape, dtype=dtype).to(device) 65 | scale = absmax_scale(a, qtype=qtype, axis=axis) 66 | qa = WeightQBytesTensor.quantize(a, qtype, axis, scale) 67 | assert isinstance(qa, WeightQBytesTensor) 68 | assert qa.dtype == dtype 69 | assert qa.qtype == qtype 70 | assert device_eq(qa.device, device) 71 | assert_similar(a, qa, atol=5e-3) 72 | 73 | 74 | @pytest.mark.parametrize("axis", [0, -1], ids=["first-axis", "last-axis"]) 75 | def test_quantize_weight_axis_dim_1(axis, device): 76 | input_shape = (1, 32) if axis == 0 else (32, 1) 77 | qa = random_qweight(input_shape, dtype=torch.float32, qtype=qint8, axis=axis, device=device) 78 | # Quantizing along an axis of dimension 1 actually means per-tensor 79 | assert qa.axis is None 80 | -------------------------------------------------------------------------------- /tests/tensor/weights/test_weight_qbytes_tensor_serialization.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import io 16 | 17 | import pytest 18 | import torch 19 | from helpers import random_qweight 20 | 21 | from optimum.quanto import qfloat8, qint8 22 | 23 | 24 | @pytest.mark.parametrize("input_shape", [(10, 10), (10, 32, 32)]) 25 | @pytest.mark.parametrize("qtype", [qint8, qfloat8], ids=["qint8", "qfloat8"]) 26 | @pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=["fp16", "fp32"]) 27 | @pytest.mark.parametrize("axis", [0, -1], ids=["first-axis", "last-axis"]) 28 | def test_weights_qbytes_tensor_serialization(input_shape, qtype, dtype, axis): 29 | qinputs = random_qweight(input_shape, dtype=dtype, qtype=qtype, axis=axis) 30 | b = io.BytesIO() 31 | torch.save(qinputs, b) 32 | b.seek(0) 33 | qinputs_reloaded = torch.load(b, weights_only=False) 34 | assert qinputs_reloaded.qtype == qtype 35 | assert torch.equal(qinputs_reloaded._scale, qinputs._scale) 36 | if qtype.is_floating_point: 37 | # Equality is not supported for float8 38 | assert torch.equal(qinputs_reloaded._data.to(torch.float32), qinputs._data.to(torch.float32)) 39 | else: 40 | assert torch.equal(qinputs_reloaded._data, qinputs._data) 41 | # We cannot test dtype directly as it is not correctly set by torch.load 42 | assert qinputs_reloaded._scale.dtype == dtype 43 | assert qinputs_reloaded.axis == qinputs.axis 44 | -------------------------------------------------------------------------------- /tests/tensor/weights/weight_helpers.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | from helpers import assert_similar, random_tensor 17 | 18 | 19 | def check_weight_qtensor_linear(qweight, batch_size, tokens, use_bias, rel_max_err=0.0): 20 | dtype = qweight.dtype 21 | device = qweight.device 22 | out_features, in_features = qweight.shape 23 | inputs = torch.rand((batch_size, tokens, in_features), dtype=dtype, device=device) 24 | bias = random_tensor((out_features,), dtype=dtype, device=device) if use_bias else None 25 | qout = torch.nn.functional.linear(inputs, qweight, bias) 26 | out = torch.nn.functional.linear(inputs, qweight.dequantize(), bias) 27 | # Verify global alignment 28 | assert_similar(out, qout) 29 | # Also look for outliers 30 | max_val = out.abs().max() 31 | max_err = (out - qout).abs().max() 32 | measured_rel_err = max_err / max_val 33 | # These values were evaluated empirically without any optimized kernels; callers can relax them through rel_max_err. 34 | rtol = {"cpu": 1e-2, "cuda": 2e-2, "mps": 1e-2, "xpu": 2e-2}[device.type] 35 | assert measured_rel_err < max(rtol, rel_max_err), ( 36 | f"Maximum error {max_err:.2f} is too high for input of max value {max_val:.2f} ({measured_rel_err * 100:.2f} %)" 37 | ) 38 | --------------------------------------------------------------------------------