├── .github ├── CODEOWNERS ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── check-commits.yml │ ├── linux-cpu-tests.yml │ ├── linux-cuda-tests.yml │ ├── linux-examples.yml │ ├── python-quality.yml │ ├── security.yml │ └── stale.yml ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── bench ├── generation │ ├── README.md │ ├── charts │ │ ├── google-gemma-2b_bf16_Accuracy.png │ │ ├── google-gemma-2b_bf16_Latency__ms_.png │ │ ├── google-gemma-2b_bf16_Perplexity.png │ │ ├── meta-llama-Meta-Llama-3.1-8B_bf16_Accuracy.png │ │ ├── meta-llama-Meta-Llama-3.1-8B_bf16_Latency__ms_.png │ │ ├── meta-llama-Meta-Llama-3.1-8B_bf16_Perplexity.png │ │ ├── mistralai-Mistral-7B-Instruct-v0.3_bf16_Accuracy.png │ │ ├── mistralai-Mistral-7B-Instruct-v0.3_bf16_Latency__ms_.png │ │ └── mistralai-Mistral-7B-Instruct-v0.3_bf16_Perplexity.png │ ├── evaluate_configurations.py │ ├── evaluate_many_models.sh │ ├── evaluate_model.py │ ├── gen_barchart.py │ ├── metrics │ │ ├── __init__.py │ │ ├── latency.py │ │ ├── perplexity.py │ │ └── prediction.py │ └── setup │ │ ├── __init__.py │ │ ├── awq.py │ │ ├── bnb.py │ │ ├── hqq.py │ │ └── quanto.py ├── kernels │ ├── benchmark.py │ ├── benchmark_marlin_fp8.py │ └── benchmark_w4a16.py └── torch_kernels │ ├── README.md │ ├── test_int_mm.py │ ├── test_int_mm_inductor.py │ ├── test_weight_int4pack_mm.py │ └── test_weight_int8pack_mm.py ├── examples ├── nlp │ ├── text-classification │ │ └── sst2 │ │ │ └── quantize_sst2_model.py │ └── text-generation │ │ └── quantize_causal_lm_model.py ├── speech │ └── speech_recognition │ │ ├── quantize_asr_model.py │ │ └── requirements.txt └── vision │ ├── StableDiffusion │ ├── README.md │ ├── quantize_StableDiffusion.py │ └── requirements.txt │ ├── image-classification │ ├── mnist │ │ └── quantize_mnist_model.py │ └── pets │ │ └── quantize_vit_model.py │ ├── object-detection │ └── quantize_owl_model.py │ └── text-to-image │ └── quantize_pixart_sigma.py ├── external ├── awq │ ├── conftest.py │ ├── pack_intweight.py │ ├── packing_utils.py │ ├── test_awq_kernels.py │ ├── test_awq_packing.py │ └── test_awq_quantize.py └── smoothquant │ ├── README.md │ └── smoothquant.py ├── optimum └── quanto │ ├── __init__.py │ ├── calibrate.py │ ├── library │ ├── README.md │ ├── __init__.py │ ├── extensions │ │ ├── README.md │ │ ├── __init__.py │ │ ├── cpp │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── pybind_module.cpp │ │ │ ├── unpack.cpp │ │ │ └── unpack.h │ │ ├── cuda │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── awq │ │ │ │ ├── dequantize.cuh │ │ │ │ └── v2 │ │ │ │ │ ├── gemm_cuda.cu │ │ │ │ │ ├── gemm_cuda.h │ │ │ │ │ ├── gemv_cuda.cu │ │ │ │ │ ├── gemv_cuda.h │ │ │ │ │ └── semaphore.h │ │ │ ├── marlin │ │ │ │ ├── COPYRIGHT │ │ │ │ ├── fp8_marlin.cu │ │ │ │ ├── fp8_marlin.cuh │ │ │ │ ├── gptq_marlin.cuh │ │ │ │ ├── gptq_marlin_dtypes.cuh │ │ │ │ ├── gptq_marlin_repack.cu │ │ │ │ ├── gptq_marlin_repack.cuh │ │ │ │ ├── marlin_cuda.cpp │ │ │ │ ├── marlin_cuda.h │ │ │ │ ├── marlin_cuda_kernel.cu │ │ │ │ └── marlin_cuda_kernel.cuh │ │ │ ├── pybind_module.cpp │ │ │ ├── unpack.cu │ │ │ └── unpack.h │ │ ├── extension.py │ │ ├── hip │ │ │ ├── __init__.py │ │ │ ├── pybind_module.cpp │ │ │ ├── unpack.cu │ │ │ └── unpack.h │ │ ├── mps │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── pybind_module.cpp │ │ │ ├── unpack.h │ │ │ └── unpack.mm │ │ └── xpu │ │ │ ├── __init__.py │ │ │ ├── pybind_module.cpp │ │ │ ├── unpack.h │ │ │ └── unpack.sycl │ ├── qbytes_mm.py │ ├── quantize.py │ └── unpack.py │ ├── models │ ├── __init__.py │ ├── 
diffusers_models.py │ ├── shared_dict.py │ └── transformers_models.py │ ├── nn │ ├── __init__.py │ ├── qconv2d.py │ ├── qlayernorm.py │ ├── qlinear.py │ └── qmodule.py │ ├── quantize.py │ ├── subpackage │ ├── __init__.py │ └── commands │ │ ├── __init__.py │ │ ├── base.py │ │ └── quantize.py │ └── tensor │ ├── __init__.py │ ├── activations │ ├── __init__.py │ ├── qbytes.py │ ├── qbytes_ops.py │ └── quantization.py │ ├── core.py │ ├── function.py │ ├── grouped.py │ ├── optimizers │ ├── __init__.py │ ├── absmax_optimizer.py │ ├── affine_optimizer.py │ ├── hqq_optimizer.py │ ├── max_optimizer.py │ ├── optimizer.py │ └── symmetric_optimizer.py │ ├── packed.py │ ├── qbits.py │ ├── qbytes.py │ ├── qtensor.py │ ├── qtype.py │ └── weights │ ├── __init__.py │ ├── awq │ ├── __init__.py │ ├── packed.py │ └── qbits.py │ ├── marlin │ ├── __init__.py │ ├── fp8 │ │ ├── __init__.py │ │ ├── packed.py │ │ └── qbits.py │ ├── int4 │ │ ├── __init__.py │ │ ├── packed.py │ │ └── qbits.py │ └── permutations.py │ ├── packing.py │ ├── qbits.py │ ├── qbytes.py │ ├── quantization.py │ ├── reordering.py │ └── tinygemm │ ├── __init__.py │ ├── packed.py │ └── qbits.py ├── pyproject.toml ├── setup.sh └── tests ├── cli ├── cli_helpers.py └── test_quantize_cli.py ├── conftest.py ├── helpers.py ├── library ├── test_extensions.py ├── test_mm.py ├── test_quantize.py └── test_unpack.py ├── models ├── conftest.py ├── test_quantized_model_for_causal_lm.py └── test_quantized_model_for_pixart.py ├── nn ├── test_calibrate.py ├── test_qattention.py ├── test_qconv2d.py ├── test_qlayernorm.py ├── test_qlinear.py └── test_qmodule.py ├── quantize ├── test_quantize_mlp.py ├── test_quantize_patterns.py └── test_requantize.py └── tensor ├── activations ├── test_activations_compile.py ├── test_activations_dispatch.py └── test_activations_quantize.py ├── ops ├── test_linear_dispatch.py └── test_mm_dispatch.py ├── optimizers └── test_hqq_optimizer.py ├── test_absmax.py ├── test_packed_tensor.py └── weights ├── optimized ├── test_awq_packed_tensor.py ├── test_awq_weight_qbits_tensor.py ├── test_marlin_fp8_packed_tensor.py ├── test_marlin_int4_packed_tensor.py ├── test_marlin_int4_weight_qbits_tensor.py ├── test_marlin_qbytes_tensor.py ├── test_tinygemm_packed_tensor.py └── test_tinygemm_weight_qbits_tensor.py ├── test_weight_qbits_tensor.py ├── test_weight_qbits_tensor_dispatch.py ├── test_weight_qbits_tensor_instantiate.py ├── test_weight_qbits_tensor_quantize.py ├── test_weight_qbytes_tensor_backward.py ├── test_weight_qbytes_tensor_dispatch.py ├── test_weight_qbytes_tensor_instantiate.py ├── test_weight_qbytes_tensor_quantize.py ├── test_weight_qbytes_tensor_serialization.py └── weight_helpers.py /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @dacorvo @sunmarc 2 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | # What does this PR do? 2 | 3 | 12 | 13 | 14 | 15 | Fixes # (issue) 16 | 17 | 18 | ## Before submitting 19 | - [ ] Did you read the [contributor guideline](https://github.com/huggingface/optimum-quanto/blob/main/CONTRIBUTING.md#create-a-pull-request), Pull Request section? 20 | 21 | - [ ] Was this discussed/approved via a GitHub issue or the [forum](https://discuss.huggingface.co/)? Please add a link 22 | to it if that's the case. 23 | - [ ] Did you run all tests locally and make sure they pass?
24 | - [ ] Did you write any new necessary tests? 25 | 26 | 27 | ## Who can review? 28 | 29 | Anyone in the community is free to review the PR once the tests have passed. Feel free to tag 30 | members/contributors who may be interested in your PR. 31 | -------------------------------------------------------------------------------- /.github/workflows/check-commits.yml: -------------------------------------------------------------------------------- 1 | name: Check Commits 2 | 3 | on: [workflow_call] 4 | 5 | jobs: 6 | build: 7 | name: Check commits 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v3 11 | 12 | - uses: huggingface/action-check-commits@v1.0.0 13 | with: 14 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 15 | max-commits: "10" 16 | min-words: "3" 17 | forbidden-words: "fixup" 18 | -------------------------------------------------------------------------------- /.github/workflows/linux-cpu-tests.yml: -------------------------------------------------------------------------------- 1 | name: Linux CPU tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths: 8 | - "optimum/quanto/**" 9 | - "tests/**" 10 | - "pyproject.toml" 11 | pull_request: 12 | types: [assigned, opened, synchronize, reopened] 13 | paths: 14 | - "optimum/quanto/**" 15 | - "tests/**" 16 | - "pyproject.toml" 17 | 18 | jobs: 19 | check-commits: 20 | uses: ./.github/workflows/check-commits.yml 21 | python-quality: 22 | uses: ./.github/workflows/python-quality.yml 23 | test-ubuntu-cpu: 24 | needs: [check-commits, python-quality] 25 | runs-on: ubuntu-latest 26 | strategy: 27 | fail-fast: false 28 | matrix: 29 | python-version: ["3.9", "3.11"] 30 | 31 | steps: 32 | - uses: actions/checkout@v2 33 | - name: Set up Python ${{ matrix.python-version }} 34 | uses: actions/setup-python@v2 35 | with: 36 | python-version: ${{ matrix.python-version }} 37 | 38 | - name: Build and install quanto 39 | run: | 40 | pip install --upgrade pip 41 | pip install -e .[dev] 42 | 43 | - name: Run base tests 44 | run: | 45 | python -m pytest tests --ignore=tests/models --ignore=tests/cli 46 | 47 | - name: Run models tests 48 | run: | 49 | pip install accelerate transformers diffusers 50 | python -m pytest tests/models 51 | 52 | 53 | - name: Run CLI tests 54 | run: | 55 | pip install optimum 56 | python -m pytest tests/cli 57 | 58 | run_staging_tests: 59 | needs: [check-commits, python-quality] 60 | runs-on: ubuntu-latest 61 | strategy: 62 | fail-fast: false 63 | matrix: 64 | python-version: ["3.9", "3.11"] 65 | 66 | steps: 67 | - uses: actions/checkout@v2 68 | - name: Set up Python ${{ matrix.python-version }} 69 | uses: actions/setup-python@v2 70 | with: 71 | python-version: ${{ matrix.python-version }} 72 | 73 | - name: Build and install quanto 74 | run: | 75 | pip install --upgrade pip 76 | pip install -e .[dev] 77 | 78 | - name: Run models hub tests 79 | run: | 80 | pip install accelerate transformers diffusers 81 | HUGGINGFACE_CO_STAGING=true python -m pytest tests/models -k "hub" 82 | -------------------------------------------------------------------------------- /.github/workflows/linux-cuda-tests.yml: -------------------------------------------------------------------------------- 1 | name: Linux CUDA tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths: 8 | - "optimum/quanto/**" 9 | - "tests/**" 10 | - "pyproject.toml" 11 | pull_request: 12 | types: [assigned, opened, synchronize, reopened] 13 | paths: 14 | - "optimum/quanto/**" 15 | - "tests/**" 16 | - "pyproject.toml" 17 | 18 | jobs: 19 | 
check-commits: 20 | uses: ./.github/workflows/check-commits.yml 21 | python-quality: 22 | uses: ./.github/workflows/python-quality.yml 23 | test-ubuntu-cuda: 24 | needs: [check-commits, python-quality] 25 | runs-on: 26 | group: aws-g5-4xlarge-plus 27 | strategy: 28 | fail-fast: false 29 | matrix: 30 | cuda-version: ["11.8", "12.4", "12.6"] 31 | container: 32 | image: pytorch/pytorch:2.6.0-cuda${{ matrix.cuda-version }}-cudnn9-devel 33 | options: --gpus 0 34 | 35 | steps: 36 | - uses: actions/checkout@v2 37 | - name: Check CUDA installation 38 | run: | 39 | nvcc -V 40 | 41 | - name: Build and install quanto 42 | run: | 43 | pip install --upgrade pip 44 | pip install -e .[dev] 45 | 46 | - name: Run base tests 47 | run: | 48 | python -m pytest tests --ignore=tests/models --ignore=tests/cli 49 | 50 | - name: Run models tests 51 | run: | 52 | pip install accelerate transformers diffusers 53 | python -m pytest tests/models 54 | 55 | - name: Run CLI tests 56 | run: | 57 | pip install optimum 58 | python -m pytest tests/cli 59 | -------------------------------------------------------------------------------- /.github/workflows/linux-examples.yml: -------------------------------------------------------------------------------- 1 | name: Linux examples (CPU, CUDA) 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths: 8 | - "optimum/quanto/**" 9 | - "examples/**" 10 | - "pyproject.toml" 11 | pull_request: 12 | types: [assigned, opened, synchronize, reopened] 13 | paths: 14 | - "optimum/quanto/**" 15 | - "examples/**" 16 | - "pyproject.toml" 17 | 18 | jobs: 19 | check-commits: 20 | uses: ./.github/workflows/check-commits.yml 21 | python-quality: 22 | uses: ./.github/workflows/python-quality.yml 23 | run-examples: 24 | needs: [check-commits, python-quality] 25 | runs-on: 26 | group: aws-g5-4xlarge-plus 27 | strategy: 28 | fail-fast: false 29 | matrix: 30 | device: ["cpu", "cuda"] 31 | container: 32 | image: pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel 33 | options: --gpus 0 34 | 35 | steps: 36 | - uses: actions/checkout@v2 37 | - name: Check CUDA installation 38 | run: | 39 | nvcc -V 40 | 41 | - name: Build and install packages 42 | run: | 43 | pip install --upgrade pip 44 | pip install -e .[examples] 45 | 46 | # Run examples 47 | - name: Run MNIST classification example 48 | run: | 49 | for w in int4 int8 float8; do \ 50 | for a in none int8 float8; do \ 51 | python examples/vision/image-classification/mnist/quantize_mnist_model.py \ 52 | --weights $w --activations $a --device ${{ matrix.device }}; \ 53 | done; \ 54 | done 55 | - name: Run OWL detection example 56 | run: | 57 | for w in int4 int8 float8; do \ 58 | python examples/vision/object-detection/quantize_owl_model.py \ 59 | --image http://images.cocodataset.org/val2017/000000039769.jpg \ 60 | --texts "a photo of a cat" "a remote" \ 61 | --weights $w --device ${{ matrix.device }}; \ 62 | done 63 | - name: Run text-classification example 64 | run: | 65 | for w in int4 int8; do \ 66 | for a in none int8; do \ 67 | python examples/nlp/text-classification/sst2/quantize_sst2_model.py \ 68 | --weights $w --activations $a --device ${{ matrix.device }}; \ 69 | done; \ 70 | done 71 | - name: Run text-to-image example 72 | if: ${{ matrix.device == 'cuda'}} 73 | run: | 74 | for w in int4 int8 fp8; do \ 75 | python examples/vision/text-to-image/quantize_pixart_sigma.py \ 76 | --qtype $w --device ${{ matrix.device }}; \ 77 | done 78 | -------------------------------------------------------------------------------- /.github/workflows/python-quality.yml: 
-------------------------------------------------------------------------------- 1 | name: Python code quality 2 | 3 | on: [workflow_call] 4 | 5 | jobs: 6 | check_code_quality: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - uses: actions/checkout@v2 11 | - name: Set up Python 12 | uses: actions/setup-python@v2 13 | with: 14 | python-version: 3.9 15 | - name: Install dependencies 16 | run: | 17 | pip install --upgrade pip 18 | pip install .[dev] 19 | - run: ruff format bench examples optimum tests --diff 20 | - run: ruff check --show-fixes bench examples optimum tests 21 | -------------------------------------------------------------------------------- /.github/workflows/security.yml: -------------------------------------------------------------------------------- 1 | name: Security Checks 2 | 3 | on: 4 | push: 5 | 6 | permissions: 7 | contents: read 8 | 9 | jobs: 10 | secrets: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - shell: bash 14 | run: | 15 | if [ "${{ github.event_name }}" == "push" ]; then 16 | echo "depth=$(($(jq length <<< '${{ toJson(github.event.commits) }}') + 2))" >> $GITHUB_ENV 17 | echo "branch=${{ github.ref_name }}" >> $GITHUB_ENV 18 | fi 19 | if [ "${{ github.event_name }}" == "pull_request" ]; then 20 | echo "depth=$((${{ github.event.pull_request.commits }}+2))" >> $GITHUB_ENV 21 | echo "branch=${{ github.event.pull_request.head.ref }}" >> $GITHUB_ENV 22 | fi 23 | - name: Checkout code 24 | uses: actions/checkout@v4 25 | with: 26 | ref: ${{env.branch}} 27 | fetch-depth: ${{env.depth}} 28 | - name: Scan for secrets 29 | uses: trufflesecurity/trufflehog@main 30 | -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | name: 'Close stale issues and PRs' 2 | on: 3 | schedule: 4 | - cron: '30 1 * * *' 5 | workflow_dispatch: 6 | 7 | permissions: 8 | issues: write 9 | pull-requests: write 10 | 11 | jobs: 12 | stale: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/stale@v9 16 | with: 17 | stale-issue-message: 'This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days.' 18 | stale-pr-message: 'This PR is stale because it has been open 15 days with no activity. Remove stale label or comment or this will be closed in 5 days.' 19 | close-issue-message: 'This issue was closed because it has been stalled for 5 days with no activity.' 20 | close-pr-message: 'This PR was closed because it has been stalled for 5 days with no activity.' 
21 | days-before-issue-stale: 30 22 | days-before-pr-stale: 15 23 | days-before-issue-close: 5 24 | days-before-pr-close: 5 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .pytest_cache 3 | *.egg-info 4 | dist 5 | .venv 6 | build/ -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: check test style 2 | 3 | check_dirs := optimum tests bench examples 4 | 5 | check: 6 | ruff check --show-fixes ${check_dirs} 7 | ruff format ${check_dirs} --diff 8 | 9 | style: 10 | ruff check ${check_dirs} --fix 11 | ruff format ${check_dirs} 12 | 13 | test: 14 | python -m pytest -sv tests 15 | -------------------------------------------------------------------------------- /bench/generation/charts/google-gemma-2b_bf16_Accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/optimum-quanto/157f419c2c5c72fe5dadd302255380c0644e5e78/bench/generation/charts/google-gemma-2b_bf16_Accuracy.png -------------------------------------------------------------------------------- /bench/generation/charts/google-gemma-2b_bf16_Latency__ms_.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/optimum-quanto/157f419c2c5c72fe5dadd302255380c0644e5e78/bench/generation/charts/google-gemma-2b_bf16_Latency__ms_.png -------------------------------------------------------------------------------- /bench/generation/charts/google-gemma-2b_bf16_Perplexity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/optimum-quanto/157f419c2c5c72fe5dadd302255380c0644e5e78/bench/generation/charts/google-gemma-2b_bf16_Perplexity.png -------------------------------------------------------------------------------- /bench/generation/charts/meta-llama-Meta-Llama-3.1-8B_bf16_Accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/optimum-quanto/157f419c2c5c72fe5dadd302255380c0644e5e78/bench/generation/charts/meta-llama-Meta-Llama-3.1-8B_bf16_Accuracy.png -------------------------------------------------------------------------------- /bench/generation/charts/meta-llama-Meta-Llama-3.1-8B_bf16_Latency__ms_.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/optimum-quanto/157f419c2c5c72fe5dadd302255380c0644e5e78/bench/generation/charts/meta-llama-Meta-Llama-3.1-8B_bf16_Latency__ms_.png -------------------------------------------------------------------------------- /bench/generation/charts/meta-llama-Meta-Llama-3.1-8B_bf16_Perplexity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/optimum-quanto/157f419c2c5c72fe5dadd302255380c0644e5e78/bench/generation/charts/meta-llama-Meta-Llama-3.1-8B_bf16_Perplexity.png -------------------------------------------------------------------------------- /bench/generation/charts/mistralai-Mistral-7B-Instruct-v0.3_bf16_Accuracy.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/huggingface/optimum-quanto/157f419c2c5c72fe5dadd302255380c0644e5e78/bench/generation/charts/mistralai-Mistral-7B-Instruct-v0.3_bf16_Accuracy.png -------------------------------------------------------------------------------- /bench/generation/charts/mistralai-Mistral-7B-Instruct-v0.3_bf16_Latency__ms_.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/optimum-quanto/157f419c2c5c72fe5dadd302255380c0644e5e78/bench/generation/charts/mistralai-Mistral-7B-Instruct-v0.3_bf16_Latency__ms_.png -------------------------------------------------------------------------------- /bench/generation/charts/mistralai-Mistral-7B-Instruct-v0.3_bf16_Perplexity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/optimum-quanto/157f419c2c5c72fe5dadd302255380c0644e5e78/bench/generation/charts/mistralai-Mistral-7B-Instruct-v0.3_bf16_Perplexity.png -------------------------------------------------------------------------------- /bench/generation/evaluate_many_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Absolute path to this script, e.g. /home/user/bin/foo.sh 3 | SCRIPT=$(readlink -f "$0") 4 | # Absolute path this script is in, thus /home/user/bin 5 | SCRIPT_PATH=$(dirname "$SCRIPT") 6 | 7 | models=( 8 | google/gemma-2b 9 | meta-llama/Meta-Llama-3.1-8B 10 | mistralai/Mistral-7B-Instruct-v0.3 11 | ) 12 | 13 | for m in ${models[@]}; do 14 | python ${SCRIPT_PATH}/evaluate_configurations.py --model $m --metric prediction --png --json --batch_size 16 15 | python ${SCRIPT_PATH}/evaluate_configurations.py --model $m --metric perplexity --png --json --batch_size 16 16 | python ${SCRIPT_PATH}/evaluate_configurations.py --model $m --metric latency --png --json --batch_size 16 17 | done 18 | -------------------------------------------------------------------------------- /bench/generation/gen_barchart.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
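# (Added note, inferred from gen_barchart() and main() below; the exact key format is
# an assumption based on how `results` is indexed.) The benchmark JSON passed to this
# script is expected to map a model id to per-configuration metric values keyed as
# f"W{weights}A{activations}", e.g.:
#   {"meta-llama/Meta-Llama-3.1-8B": {"Wf16Af16": 0.71, "Wi4Af16": 0.69, "Wi8Af8": 0.70}}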
14 | 15 | import argparse 16 | import json 17 | 18 | import matplotlib.pyplot as plt 19 | import numpy as np 20 | import torch 21 | 22 | 23 | def save_bar_chart(title, labels, ylabel, series, save_path): 24 | x = np.arange(len(labels)) # the label locations 25 | width = 0.15 # the width of the bars 26 | multiplier = 0 27 | 28 | fig, ax = plt.subplots(layout="constrained") 29 | fig.set_figwidth(10) 30 | 31 | max_value = 0 32 | 33 | for attribute, measurement in series.items(): 34 | max_value = max(max_value, max(measurement)) 35 | offset = width * multiplier 36 | rects = ax.bar(x + offset, measurement, width, label=attribute) 37 | ax.bar_label(rects, padding=5) 38 | multiplier += 1 39 | 40 | # Add some text for labels, title and custom x-axis tick labels, etc. 41 | ax.set_ylabel(ylabel) 42 | ax.set_title(title) 43 | ax.set_xticks(x + width, labels) 44 | ax.legend(loc="upper left", ncols=4) 45 | ax.set_ylim(0, max_value * 1.2) 46 | 47 | plt.savefig(save_path) 48 | 49 | 50 | def gen_barchart(model_id, title, label, results, dtype=torch.float16):  # default added: main() does not pass a dtype 51 | dtype_str = "f16" if dtype is torch.float16 else "bf16" 52 | activations = (dtype_str, "f8") 53 | weights = ("i4", "i8", "f8") 54 | series = {} 55 | reference = round(results[f"W{dtype_str}A{dtype_str}"], 2) 56 | series[f"Weights {dtype_str}"] = [ 57 | reference, 58 | ] * len(activations) 59 | for w in weights: 60 | name = f"Weights {w}" 61 | series[name] = [] 62 | for a in activations: 63 | result = results[f"W{w}A{a}"] 64 | series[name].append(round(result, 2)) 65 | model_name = model_id.replace("/", "-") 66 | metric_name = label.replace(" ", "_").replace("(", "_").replace(")", "_") 67 | save_bar_chart( 68 | title=title, 69 | labels=[f"Activations {a}" for a in activations], 70 | series=series, 71 | ylabel=label, 72 | save_path=f"{model_name}_{dtype_str}_{metric_name}.png", 73 | ) 74 | 75 | 76 | def main(): 77 | parser = argparse.ArgumentParser() 78 | parser.add_argument("benchmark", type=str, help="A benchmark result file (.json).") 79 | parser.add_argument("--title", type=str, required=True, help="The graph title.") 80 | parser.add_argument("--label", type=str, required=True, help="The graph vertical label.") 81 | args = parser.parse_args() 82 | with open(args.benchmark) as f: 83 | benchmark = json.load(f) 84 | for model_id, results in benchmark.items(): 85 | gen_barchart(model_id, args.title, args.label, results) 86 | 87 | 88 | if __name__ == "__main__": 89 | main() 90 | -------------------------------------------------------------------------------- /bench/generation/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | -------------------------------------------------------------------------------- /bench/generation/metrics/prediction.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import time 16 | 17 | import torch 18 | from datasets import load_dataset 19 | 20 | 21 | @torch.no_grad() 22 | def prediction_accuracy(model, tokenizer, batch_size, samples=None): 23 | test_dataset = load_dataset("lambada", split=["test"])[0] 24 | model.eval() 25 | # The task is to predict the last token of the input. 26 | total, hit = 0, 0 27 | start = time.time() 28 | for batch in test_dataset.iter(batch_size=batch_size): 29 | inputs = tokenizer(batch["text"], return_tensors="pt", padding=True) 30 | input_ids = inputs.input_ids.to(model.device) 31 | attention_mask = inputs.attention_mask.to(model.device) 32 | labels = input_ids[:, -1] 33 | # Pass only the first tokens 34 | outputs = model(input_ids[:, :-1], attention_mask=attention_mask[:, :-1]) 35 | preds = outputs.logits[:, -1, :].argmax(dim=-1) 36 | total += labels.size(0) 37 | hit += (preds == labels).sum().item() 38 | if samples is not None and total >= samples: 39 | break 40 | end = time.time() 41 | acc = hit / total 42 | print(f"{total} sequences evaluated in {end - start:.2f} s. accuracy = {acc:.2f}") 43 | return acc 44 | -------------------------------------------------------------------------------- /bench/generation/setup/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /bench/generation/setup/bnb.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig 17 | 18 | 19 | def setup( 20 | model_id: str, 21 | weights: str, 22 | activations: str, 23 | device: torch.device, 24 | ): 25 | if activations != "none": 26 | raise ValueError("Activation quantization is not supported by BitsAndBytes") 27 | if weights == "int4": 28 | quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="fp4") 29 | elif weights == "int8": 30 | quantization_config = BitsAndBytesConfig(load_in_8bit=True) 31 | else: 32 | raise ValueError("BitsAndBytes only supports int4 and int8 weights.") 33 | dtype = torch.float32 if device.type == "cpu" else torch.float16 34 | tokenizer = AutoTokenizer.from_pretrained(model_id) 35 | tokenizer.pad_token_id = tokenizer.eos_token_id 36 | tokenizer.padding_side = "left" 37 | quantization_config.bnb_4bit_compute_dtype = dtype 38 | model = AutoModelForCausalLM.from_pretrained( 39 | model_id, torch_dtype=dtype, low_cpu_mem_usage=True, quantization_config=quantization_config 40 | ) 41 | 42 | return model, tokenizer 43 | -------------------------------------------------------------------------------- /bench/generation/setup/hqq.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import torch 16 | from hqq.core.quantize import BaseQuantizeConfig 17 | from hqq.engine.hf import HQQModelForCausalLM 18 | from transformers import AutoTokenizer 19 | 20 | 21 | def setup(model_id: str, weights: str, activations: str, device: torch.device, group_size: int = 64): 22 | if activations != "none": 23 | raise ValueError("Activation quantization is not supported by HQQ") 24 | if weights == "int4": 25 | quant_config = BaseQuantizeConfig(nbits=4, group_size=group_size) 26 | elif weights == "int8": 27 | quant_config = BaseQuantizeConfig(nbits=8, group_size=group_size) 28 | else: 29 | raise ValueError("HQQ only supports int4 and int8 weights.") 30 | # Load model 31 | model = HQQModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16) 32 | # Quantize 33 | model.quantize_model(quant_config=quant_config, compute_dtype=torch.float16, device=device) 34 | tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) 35 | tokenizer.pad_token_id = tokenizer.eos_token_id 36 | tokenizer.padding_side = "left" 37 | return model, tokenizer 38 | -------------------------------------------------------------------------------- /bench/generation/setup/quanto.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
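# (Added usage sketch; the model id is a hypothetical example, not part of the benchmark.)
# The evaluation scripts are expected to call setup() along these lines:
#   model, tokenizer = setup(
#       "facebook/opt-350m", weights="int4", activations="none",
#       batch_size=16, device=torch.device("cuda"), dtype=torch.float16,
#   )
# with weights/activations in {"none", "int4", "int8", "float8"} (see keyword_to_qtype below).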
14 | 15 | import time 16 | 17 | import torch 18 | from datasets import load_dataset 19 | from transformers import AutoModelForCausalLM, AutoTokenizer 20 | 21 | from optimum.quanto import Calibration, freeze, qfloat8, qint4, qint8, quantize 22 | 23 | 24 | @torch.no_grad() 25 | def calibrate(model, tokenizer, batch_size, batches): 26 | samples = batch_size * batches 27 | cal_dataset = load_dataset("lambada", split=["validation"])[0] 28 | model.eval() 29 | total = 0 30 | for batch in cal_dataset.iter(batch_size=batch_size): 31 | inputs = tokenizer(batch["text"], return_tensors="pt", padding=True) 32 | input_ids = inputs.input_ids.to(model.device) 33 | attention_mask = inputs.attention_mask.to(model.device) 34 | model(input_ids, attention_mask=attention_mask) 35 | total += input_ids.size(0) 36 | if total >= samples: 37 | break 38 | 39 | 40 | def setup( 41 | model_id: str, 42 | weights: str, 43 | activations: str, 44 | batch_size: int, 45 | device: torch.device, 46 | dtype: torch.dtype, 47 | ): 48 | weights = keyword_to_qtype(weights) 49 | activations = keyword_to_qtype(activations) 50 | tokenizer = AutoTokenizer.from_pretrained(model_id) 51 | tokenizer.pad_token_id = tokenizer.eos_token_id 52 | tokenizer.padding_side = "left" 53 | model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype, low_cpu_mem_usage=True).to(device) 54 | if weights is not None or activations is not None: 55 | print("Quantizing") 56 | start = time.time() 57 | quantization_root = model 58 | if hasattr(model, "model"): 59 | quantization_root = model.model 60 | quantize(quantization_root, weights=weights, activations=activations) 61 | if activations is not None: 62 | print("Calibrating") 63 | with Calibration(): 64 | calibrate(model, tokenizer, batch_size, batches=4) 65 | print("Freezing") 66 | freeze(model) 67 | print(f"Finished: {time.time() - start:.2f}") 68 | return model, tokenizer 69 | 70 | 71 | def keyword_to_qtype(k): 72 | return { 73 | "none": None, 74 | "int4": qint4, 75 | "int8": qint8, 76 | "float8": qfloat8, 77 | }[k] 78 | -------------------------------------------------------------------------------- /bench/torch_kernels/README.md: -------------------------------------------------------------------------------- 1 | This contains a few scripts to test pytorch kernels that are relevant for quantization. 2 | -------------------------------------------------------------------------------- /bench/torch_kernels/test_int_mm.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import argparse 16 | import timeit 17 | 18 | import torch 19 | 20 | 21 | def main(): 22 | parser = argparse.ArgumentParser(description="Torch integer matmul benchmark") 23 | parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)") 24 | parser.add_argument("--device", type=str, default=None, help="The device to use for the test.") 25 | parser.add_argument("--it", type=int, default=100, help="Number of iterations for average") 26 | args = parser.parse_args() 27 | 28 | torch.manual_seed(args.seed) 29 | 30 | if args.device is None: 31 | if torch.cuda.is_available(): 32 | device = torch.device("cuda") 33 | elif torch.backends.mps.is_available(): 34 | device = torch.device("mps") 35 | elif torch.xpu.is_available(): 36 | device = torch.device("xpu") 37 | else: 38 | device = torch.device("cpu") 39 | else: 40 | device = torch.device(args.device) 41 | 42 | def avg_time(f, it): 43 | return timeit.Timer(f).timeit(it) / it 44 | 45 | # Restrictions for accelerated integer matmul: 46 | # - input matrices must be 2D 47 | # - the collapsing dimension must be a multiple of 8 48 | A = torch.randint(1, 10, [2400, 3200]).type(torch.int8).to(device) 49 | B = torch.randint(1, 10, [3200, 4800]).type(torch.int8).to(device) 50 | 51 | print(f"Evaluating integer matmul on {device.type}:") 52 | # Warmup (slow) 53 | torch._int_mm(A, B) 54 | # Average on several calls 55 | t = avg_time(lambda: torch._int_mm(A, B), args.it) * 1000 56 | print(f"Average inference on {args.it} iterations: {t:.4f} ms") 57 | 58 | # Convert inputs to float 59 | 60 | def to_float(x): 61 | if x.device.type == "cpu": 62 | # matrix multiplication is not supported for float16 on CPU 63 | return x.to(torch.float32) 64 | return x.to(torch.float16) 65 | 66 | A = to_float(A) 67 | B = to_float(B) 68 | print(f"Evaluating {A.dtype} matmul on {device.type}:") 69 | 70 | # Warmup (slow) 71 | torch.matmul(A, B) 72 | # Average on several calls 73 | t = avg_time(lambda: torch.matmul(A, B), args.it) * 1000 74 | print(f"Average inference on {args.it} iterations: {t:.4f} ms") 75 | 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /bench/torch_kernels/test_int_mm_inductor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | import timeit 16 | 17 | import torch 18 | 19 | 20 | def mm(a, b): 21 | return torch._int_mm(a, b) 22 | 23 | 24 | A = torch.randint(1, 10, [2400, 2400]).type(torch.int8).cuda() 25 | B = torch.randint(1, 10, [2400, 2400]).type(torch.int8).cuda() 26 | it = 100 27 | 28 | # Warmup (slow) 29 | mm(A, B) 30 | # Get a reference 31 | print(timeit.Timer(lambda: mm(A, B)).timeit(it) / it) 32 | 33 | cmm = torch.compile(mm, backend="inductor") 34 | # First invocation will trigger the actual compilation 35 | cmm(A, B) 36 | # Now compare execution time 37 | print(timeit.Timer(lambda: cmm(A, B)).timeit(it) / it) 38 | -------------------------------------------------------------------------------- /bench/torch_kernels/test_weight_int8pack_mm.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import timeit 17 | 18 | import torch 19 | 20 | 21 | def main(): 22 | parser = argparse.ArgumentParser(description="Torch quantized int8 weight matmul benchmark") 23 | parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)") 24 | parser.add_argument("--device", type=str, default=None, help="The device to use for the test.") 25 | parser.add_argument("--it", type=int, default=10, help="Number of iterations for average") 26 | args = parser.parse_args() 27 | 28 | torch.manual_seed(args.seed) 29 | 30 | if args.device is None: 31 | if torch.cuda.is_available(): 32 | device = torch.device("cuda") 33 | elif torch.backends.mps.is_available(): 34 | device = torch.device("mps") 35 | elif torch.xpu.is_available(): 36 | device = torch.device("xpu") 37 | else: 38 | device = torch.device("cpu") 39 | else: 40 | device = torch.device(args.device) 41 | 42 | def avg_time(f, it): 43 | return timeit.Timer(f).timeit(it) / it 44 | 45 | A = torch.rand([2400, 3200], dtype=torch.bfloat16, device=device) 46 | B = torch.randint(-128, 127, [4800, 3200], dtype=torch.int8, device=device) 47 | B_scale = torch.rand([4800], dtype=torch.bfloat16, device=device) 48 | 49 | print(f"Evaluating quantized int8 matmul on {device.type}:") 50 | # Warmup (slow) 51 | torch._weight_int8pack_mm(A, B, B_scale) 52 | # Average on several calls 53 | t = avg_time(lambda: torch._weight_int8pack_mm(A, B, B_scale), args.it) * 1000 54 | print(f"Average inference on {args.it} iterations: {t:.4f} ms") 55 | 56 | # Convert weights to float 57 | 58 | B = B.to(torch.bfloat16).t() 59 | print(f"Evaluating {A.dtype} matmul on {device.type}:") 60 | 61 | # Warmup (slow) 62 | torch.matmul(A, B) * B_scale 63 | # Average on several calls 64 | t = avg_time(lambda: torch.matmul(A, B) * B_scale, args.it) * 1000 65 | print(f"Average inference on {args.it} iterations: {t:.4f} ms") 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- 
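The benchmark above times `torch._weight_int8pack_mm` against a dequantize-then-matmul reference. A minimal correctness sketch of that equivalence is below; it is an addition, assuming the private op keeps its current `(input, weight_int8, scales)` signature and per-output-channel scales (private ATen ops can change between PyTorch releases):

```python
import torch

# Input [M, K] in bfloat16, weights [N, K] in int8, one scale per output channel [N].
A = torch.rand([8, 64], dtype=torch.bfloat16)
B = torch.randint(-128, 127, [16, 64], dtype=torch.int8)
scales = torch.rand([16], dtype=torch.bfloat16)

# Quantized matmul: implicitly dequantizes B with its per-channel scales.
quantized = torch._weight_int8pack_mm(A, B, scales)

# Reference: dequantize the weights explicitly, then run a regular matmul.
reference = torch.matmul(A, B.to(torch.bfloat16).t()) * scales

# Loose tolerances: the two paths accumulate in different precisions.
print(torch.allclose(quantized, reference, atol=1e-1, rtol=1e-2))
```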
/examples/speech/speech_recognition/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | evaluate 3 | librosa 4 | soundfile 5 | jiwer 6 | -------------------------------------------------------------------------------- /examples/vision/StableDiffusion/README.md: -------------------------------------------------------------------------------- 1 | # Quantize Stable Diffusion examples 2 | 3 | ## Running locally with PyTorch 4 | 5 | ### Installing the dependencies 6 | 7 | Before running the scripts, make sure to install the library's dependencies: 8 | 9 | **Important** 10 | 11 | To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment: 12 | ```bash 13 | git clone https://github.com/huggingface/quanto 14 | cd quanto 15 | pip install -e . 16 | ``` 17 | 18 | Then cd into the `examples/vision/StableDiffusion` folder and run 19 | ```bash 20 | pip install -r requirements.txt 21 | ``` 22 | 23 | **Now, we can launch the image generation script:** 24 | 25 | ```bash 26 | python quantize_StableDiffusion.py --batch_size=1 --torch_dtype="fp32" 27 | ``` 28 | 29 | The flags in the command above configure the generation run: 30 | 31 | * `batch_size` The number of images generated in one inference iteration. 32 | 33 | * `torch_dtype` {fp32,fp16,bf16} 34 | * `unet_qtype` {fp8,int8,int4,none} 35 | 36 | Our experiments were conducted on a single 24GB A10 GPU. 37 | ```bash 38 | fp16-fp16 39 | 40 | batch_size: 1, torch_dtype: fp16, unet_dtype: none -> 3.307 seconds. Memory: 3.192GB. 41 | ``` 42 | 43 | ```bash 44 | bf16-int8 45 | 46 | batch_size: 1, torch_dtype: bf16, unet_dtype: int8 -> 3.918 seconds. Memory: 2.644GB. 47 | ``` 48 | 49 | ```bash 50 | fp16-int8 51 | 52 | batch_size: 1, torch_dtype: fp16, unet_dtype: int8 -> 3.920 seconds. Memory: 2.634GB.
53 | ``` 54 | 55 | Both int8 configurations produce high-quality images with fast generation. -------------------------------------------------------------------------------- /examples/vision/StableDiffusion/requirements.txt: -------------------------------------------------------------------------------- 1 | quanto 2 | diffusers 3 | torch 4 | transformers 5 | accelerate 6 | wandb -------------------------------------------------------------------------------- /examples/vision/text-to-image/quantize_pixart_sigma.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gc 3 | 4 | import torch 5 | from diffusers import DiffusionPipeline 6 | 7 | from optimum.quanto import freeze, qfloat8, qint4, qint8, quantize 8 | 9 | 10 | NUM_INFERENCE_STEPS = 50 11 | 12 | TORCH_DTYPES = {"fp16": torch.float16, "bf16": torch.bfloat16} 13 | QTYPES = { 14 | "fp8": qfloat8, 15 | "int8": qint8, 16 | "int4": qint4, 17 | "none": None, 18 | } 19 | 20 | 21 | def load_pipeline(model_id, torch_dtype, qtype=None, device="cpu"): 22 | pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch_dtype, use_safetensors=True).to(device) 23 | 24 | if qtype: 25 | quantize(pipe.transformer, weights=qtype) 26 | freeze(pipe.transformer) 27 | quantize(pipe.text_encoder, weights=qtype) 28 | freeze(pipe.text_encoder) 29 | 30 | pipe.set_progress_bar_config(disable=True) 31 | return pipe 32 | 33 | 34 | def get_device_memory(device): 35 | gc.collect() 36 | if device.type == "cuda": 37 | torch.cuda.empty_cache() 38 | return torch.cuda.memory_allocated() 39 | elif device.type == "mps": 40 | torch.mps.empty_cache() 41 | return torch.mps.current_allocated_memory() 42 | elif device.type == "xpu": 43 | torch.xpu.empty_cache() 44 | return torch.xpu.memory_allocated() 45 | return None 46 | 47 | 48 | if __name__ == "__main__": 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument("--model_id", type=str, default="PixArt-alpha/PixArt-Sigma-XL-2-1024-MS") 51 | parser.add_argument("--prompt", type=str, default="ghibli style, a fantasy landscape with castles") 52 | parser.add_argument("--torch_dtype", type=str, default="fp16", choices=list(TORCH_DTYPES.keys())) 53 | parser.add_argument("--qtype", type=str, default=None, choices=list(QTYPES.keys())) 54 | parser.add_argument("--device", type=str, default=None, help="The device to use for generation.") 55 | args = parser.parse_args() 56 | 57 | if args.device is None: 58 | if torch.cuda.is_available(): 59 | device = torch.device("cuda") 60 | elif torch.backends.mps.is_available(): 61 | device = torch.device("mps") 62 | elif torch.xpu.is_available(): 63 | device = torch.device("xpu") 64 | else: 65 | device = torch.device("cpu") 66 | else: 67 | device = torch.device(args.device) 68 | 69 | pipeline = load_pipeline( 70 | args.model_id, TORCH_DTYPES[args.torch_dtype], QTYPES[args.qtype] if args.qtype else None, device 71 | ) 72 | 73 | print(f"torch_dtype: {args.torch_dtype}, qtype: {args.qtype}.") 74 | memory = get_device_memory(device) 75 | if memory is not None: 76 | memory_gb = memory / 2**30 77 | print(f"{device.type} device memory: {memory_gb:.2f} GB.") 78 | 79 | if args.qtype == "int4" and device.type == "cuda":  # device.type is always lower-case 80 | raise ValueError("This example does not work (yet) for int4 on CUDA") 81 | 82 | img_name = f"pixart-sigma-dtype@{args.torch_dtype}-qtype@{args.qtype}.png" 83 | image = pipeline( 84 | prompt=args.prompt, 85 | num_inference_steps=NUM_INFERENCE_STEPS, 86 | num_images_per_prompt=1, 87 | generator=torch.manual_seed(0), 88 |
).images[0] 89 | image.save(img_name) 90 | -------------------------------------------------------------------------------- /external/awq/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import pytest 16 | import torch 17 | 18 | 19 | devices = ["cpu"] 20 | if torch.cuda.is_available(): 21 | devices += ["cuda"] 22 | elif torch.backends.mps.is_available(): 23 | devices += ["mps"] 24 | 25 | 26 | @pytest.fixture(scope="module", params=devices) 27 | def device(request): 28 | return torch.device(request.param) 29 | 30 | 31 | def pytest_configure(config): 32 | # register additional markers 33 | config.addinivalue_line("markers", "skip_device(type): mark test to be skipped for the specified device type") 34 | 35 | 36 | def pytest_runtest_call(item): 37 | fixture_name = "device" 38 | if fixture_name in item.fixturenames: 39 | # TODO: should be able to recover the fixture id instead of the actual value 40 | fixture_arg = item.funcargs[fixture_name].type 41 | skip_marks = {mark.args[0] for mark in item.iter_markers(name=f"skip_{fixture_name}")} 42 | if fixture_arg in skip_marks: 43 | pytest.skip(f"Test skipped for {fixture_name} {fixture_arg}") 44 | -------------------------------------------------------------------------------- /external/awq/pack_intweight.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2023 MIT HAN Lab 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
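# (Added summary; inferred from the code below, not from upstream AWQ documentation.)
# pack_intweight() packs unpacked 4-bit weights of shape [N, K] (one uint value per
# weight) into an int16 tensor of shape [N // interleave, K] laid out for the AWQ
# GEMM/GEMV kernels:
#   1. inside every 32-column block, a (4, 4, 2) view is transposed so threads read
#      contiguous values,
#   2. each group of 8 weights is reordered [0, 2, 4, 6, 1, 3, 5, 7] for fast
#      dequantization,
#   3. every `interleave` output rows (4 in practice) are interleaved along K in
#      chunks of `kstride` (64 in practice), and
#   4. four 4-bit values are packed into each 16-bit word.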
22 | import torch 23 | 24 | 25 | def pack_intweight(unpacked_qweight, interleave, kstride): 26 | # unpacked_qweight: [N, K] 27 | N = unpacked_qweight.shape[0] 28 | K = unpacked_qweight.shape[1] 29 | 30 | Packed_Kernel = unpacked_qweight.cpu().numpy().reshape(N, K // 32, 32) 31 | # np.arange(32).reshape(4, 4, 2).transpose(1, 0, 2) => [0, 1, 8, 9, 16, 17, 24, 25, ...] 32 | Packed_Kernel = Packed_Kernel.reshape(N, K // 32, 4, 4, 2).transpose(0, 1, 3, 2, 4) 33 | Packed_Kernel = Packed_Kernel.reshape(N, K // 32, 32) 34 | 35 | # reorder each 8 weights for fast dequantization 36 | # [0, 1, 2, 3, 4, 5, 6, 7] => [0, 2, 4, 6, 1, 3, 5, 7] 37 | Packed_Kernel = Packed_Kernel.reshape(N, K // 32, 4, 8) 38 | Packed_Kernel = Packed_Kernel.reshape(N, K // 32, 4, 4, 2).transpose(0, 1, 2, 4, 3) 39 | Packed_Kernel = Packed_Kernel.reshape(N, K) 40 | 41 | # interleaving every four rows 42 | Packed_Kernel = Packed_Kernel.reshape( 43 | N // interleave, interleave, K // kstride, kstride 44 | ) 45 | # N // 4, K // 64, 4, 64 46 | Packed_Kernel = Packed_Kernel.transpose(0, 2, 1, 3) 47 | Packed_Kernel = Packed_Kernel.reshape( 48 | N // interleave, K // kstride, kstride, interleave 49 | ) 50 | # Packing -> (N // 4, K // 64, 64) 51 | Packed_Kernel = ( 52 | Packed_Kernel[..., 0] 53 | | (Packed_Kernel[..., 1] << 4) 54 | | (Packed_Kernel[..., 2] << 8) 55 | | (Packed_Kernel[..., 3] << 12) 56 | ) 57 | # reshape to (N // 4, K), FP16 format 58 | Packed_Kernel = Packed_Kernel.reshape(N // interleave, K) 59 | qweight = ( 60 | torch.tensor(Packed_Kernel.astype("int16")) 61 | .to(unpacked_qweight.device) 62 | .contiguous() 63 | ) 64 | return qweight 65 | -------------------------------------------------------------------------------- /external/awq/test_awq_quantize.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from optimum.quanto import AffineQuantizer, MaxOptimizer, qint4, ungroup 5 | 6 | 7 | def awq_quantize(base, scales, zeros, group_size): 8 | _, in_features = base.shape 9 | scale_zeros = scales * zeros 10 | intweight = [] 11 | # From https://github.com/casper-hansen/AutoAWQ/blob/main/awq/modules/linear/gemv_fast.py#L165 12 | for idx in range(in_features): 13 | intweight.append( 14 | torch.round( 15 | (base[:, idx] + scale_zeros[:, idx // group_size]) 16 | / scales[:, idx // group_size] 17 | ).to(torch.uint8)[:, None] 18 | ) 19 | intweight = torch.cat(intweight, dim=1) 20 | return intweight 21 | 22 | 23 | @pytest.mark.parametrize("in_features, out_features", [(256, 512), (1024, 1024)]) 24 | def test_awq_quantize(in_features, out_features): 25 | """Verify that AWQ quantization is equivalent to quanto affine quantization 26 | """ 27 | shape = (out_features, in_features) 28 | base = torch.rand(shape, dtype=torch.float16) 29 | group_size = 128 30 | 31 | # Quantize using quanto 32 | scale, zeropoint = MaxOptimizer()(base, bits=4, axis=0, group_size=128) 33 | quanto_base = AffineQuantizer.apply(base, qint4, 0, group_size, scale, zeropoint) 34 | # Extract quantized data, unpack and ungroup to recover original shape 35 | quanto_data = ungroup(quanto_base._data.unpack(), axis=0, orig_shape=shape) 36 | 37 | # Reshape scale and zeropoint as expected by awq 38 | awq_shape = (out_features, in_features // group_size) 39 | scale = scale.reshape(awq_shape) 40 | zeropoint = zeropoint.reshape(awq_shape) 41 | 42 | # Compare with awq quantization 43 | awq_data = awq_quantize(base, scale, zeropoint, group_size) 44 | # FIX: AWQ does not clamp values before packing 45 | 
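# (Added note) torch.round() in awq_quantize() can produce 2**4 (= 16) for values at
# the very top of the quantization range, because the zero-point is folded in before
# rounding. quanto clamps to the valid 4-bit range [0, 15] before packing, while the
# AWQ reference code does not, so clamp here to compare like with like.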
qmax = 2 ** 4 - 1 46 | awq_data = torch.clamp(awq_data, 0, qmax) 47 | 48 | mismatches = quanto_data != awq_data 49 | n = torch.sum(mismatches).numpy() 50 | rate = n / base.numel() 51 | print(f"Mismatches: {n}/{base.numel()} ({100 * rate:.6f} %)") 52 | # Extract mismatches 53 | display = 10 54 | quanto_values = torch.masked_select(quanto_data, mismatches)[:display] 55 | awq_values = torch.masked_select(awq_data, mismatches)[:display] 56 | print(f"First {display} mismatches") 57 | print(list(quanto_values.numpy())) 58 | print(list(awq_values.numpy())) 59 | # Due to a slightly different order of operations (zero is multiplied by scale before subtracting it), 60 | # there are some mismatches 61 | assert rate < 5e-4 62 | -------------------------------------------------------------------------------- /external/smoothquant/README.md: -------------------------------------------------------------------------------- 1 | # SmoothQuant original conversion script 2 | 3 | This converts an OPT or Bloom [🤗 transformers](https://github.com/huggingface/transformers) model to a "smoothed" version, as described in 4 | [SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models](https://arxiv.org/abs/2211.10438). 5 | 6 | ```bash 7 | $ python smoothquant.py --model facebook/opt-1.3b --save-path smoothed-models/facebook/opt-1.3b 8 | ``` 9 | 10 | Note: due to hard-coded assumptions on model architecture in the script, this only works for OPT models that apply the layer_norm 11 | before the attention (`do_layer_norm_before=true` in `config.json`). This means all models but `facebook/opt-350m`. 12 | -------------------------------------------------------------------------------- /optimum/quanto/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | __version__ = "0.2.7dev" 16 | 17 | from .calibrate import * 18 | from .library import * 19 | from .models import * 20 | from .nn import * 21 | from .quantize import * 22 | from .tensor import * 23 | -------------------------------------------------------------------------------- /optimum/quanto/library/README.md: -------------------------------------------------------------------------------- 1 | # Quanto operations library 2 | 3 | This contains the `quanto::` operations, available in python under `torch.ops.quanto`. 4 | 5 | To add a new operation: 6 | 7 | - add a definition for the operation in `library/ops.py`, 8 | - provide a default implementation using pytorch operators only under `library/python`, 9 | - provide optimized kernels for all devices under `library/extensions`. 10 | -------------------------------------------------------------------------------- /optimum/quanto/library/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from .extensions import *
16 | from .qbytes_mm import *
17 | from .quantize import *
18 | from .unpack import *
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/README.md: --------------------------------------------------------------------------------
1 | # Quanto library extensions
2 | 
3 | This folder contains the device-specific `quanto::` operations.
4 | 
5 | Implementations can be provided as part of:
6 | 
7 | - the generic C++ pytorch extension under `cpp`,
8 | - the CUDA extension under `cuda`,
9 | - the HIP (ROCm) extension under `hip`,
10 | - the Metal Performance Shader extension under `mps`,
11 | - the XPU SYCL extension under `xpu`.
12 | 
13 | To provide a device-specific implementation of an operation that already has a default implementation (such as unpack), use the following syntax:
14 | 
15 | ```python
16 | @torch.library.impl("quanto::unpack", ["CPU", "CUDA"])
17 | def unpack(packed: torch.Tensor, bits: int) -> torch.Tensor:
18 |     return ext.unpack(packed, bits)
19 | ```
20 | 
21 | To declare a new device-specific operation, you need to add it to the library:
22 | 
23 | ```python
24 | torch.library.define(
25 |     "quanto::gemm_f16i4",
26 |     "(Tensor input,"
27 |     " Tensor other,"
28 |     " Tensor other_scale,"
29 |     " Tensor other_shift,"
30 |     " int group_size)"
31 |     " -> Tensor",
32 | )
33 | ```
34 | 
35 | Then you can provide its implementation:
36 | 
37 | ```python
38 | @torch.library.impl("quanto::gemm_f16i4", ["CUDA"])
39 | def gemm_f16i4(
40 |     input: torch.Tensor,
41 |     other: torch.Tensor,
42 |     scales: torch.Tensor,
43 |     shift: torch.Tensor,
44 |     group_size: int,
45 | ) -> torch.Tensor:
46 |     ...
47 | ```
48 | 
49 | Please refer to each extension folder for examples.
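50 | 
51 | Once registered, calls are dispatched automatically based on the device of the input tensors. As a minimal sketch of how a caller sees this (assuming `optimum.quanto` has been imported, so that the operations are registered):
52 | 
53 | ```python
54 | import torch
55 | 
56 | packed = torch.randint(256, (16, 16), dtype=torch.uint8)
57 | # Dispatched to the CPU kernel (or the default implementation)
58 | unpacked = torch.ops.quanto.unpack(packed, 4)
59 | if torch.cuda.is_available():
60 |     # Same call, dispatched to the CUDA kernel
61 |     unpacked_cuda = torch.ops.quanto.unpack(packed.to("cuda"), 4)
62 | ```
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/__init__.py: --------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.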
14 | 
15 | import platform
16 | 
17 | import torch
18 | from packaging import version
19 | 
20 | from .cpp import *
21 | from .extension import *
22 | 
23 | 
24 | if torch.cuda.is_available() and platform.system() == "Linux":
25 |     if torch.version.cuda:
26 |         from .cuda import *
27 |     elif torch.version.hip:
28 |         from .hip import *
29 | 
30 | if torch.backends.mps.is_available():
31 |     from .mps import *
32 | 
33 | 
34 | def _is_xpu_available():
35 |     # SYCL extension support was added in torch>=2.7 on Linux
36 |     if platform.system() != "Linux":
37 |         return False
38 |     if version.parse(torch.__version__).release < version.parse("2.7").release:
39 |         return False
40 |     return torch.xpu.is_available()
41 | 
42 | 
43 | if _is_xpu_available():
44 |     from .xpu import *
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cpp/README.md: --------------------------------------------------------------------------------
1 | # Quanto generic C++ extension
2 | 
3 | Kernels in this extension must be written in plain C++.
4 | 
5 | They can use any pytorch operation defined under `aten::` or `c10::`.
6 | 
7 | To add a new implementation for an operation defined in the library (see `library/README.md`):
8 | 
9 | - add the corresponding `.cpp` file to the list of sources in `__init__.py`,
10 | - add a binding to `pybind_module.cpp`,
11 | - provide an implementation calling the binding in `__init__.py`.
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cpp/__init__.py: --------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import os
16 | 
17 | import torch
18 | 
19 | from ..extension import Extension, register_extension
20 | 
21 | 
22 | __all__ = []
23 | 
24 | 
25 | ext = Extension(
26 |     "quanto_cpp",
27 |     root_dir=os.path.dirname(__file__),
28 |     sources=["unpack.cpp", "pybind_module.cpp"],
29 |     extra_cflags=["-O3"],
30 | )
31 | register_extension(ext)
32 | 
33 | 
34 | @torch.library.impl("quanto::unpack", ["CPU"])
35 | def unpack_cpp(t: torch.Tensor, bits: int):
36 |     return ext.lib.unpack(t, bits)
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cpp/pybind_module.cpp: --------------------------------------------------------------------------------
1 | // Copyright 2024 The HuggingFace Team. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <torch/extension.h>
16 | #include "unpack.h"
17 | 
18 | // !IMPORTANT! Some python objects such as dtype, device, are not mapped to C++ types,
19 | // and need to be explicitly converted using dedicated helpers before calling a C++ method.
20 | // As a consequence, when an operation takes such an object as parameter, instead
21 | // of creating a binding directly to the C++ method, you must create a binding to a
22 | // lambda method that converts the unmapped types and calls the C++ method.
23 | 
24 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
25 |   m.def("unpack", &unpack, "unpack");
26 | }
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cpp/unpack.cpp: --------------------------------------------------------------------------------
1 | // Copyright 2024 The HuggingFace Team. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include "unpack.h"
16 | #include <torch/extension.h>
17 | 
18 | 
19 | static torch::Tensor unpack_4bit(torch::Tensor &t) {
20 |     return torch::cat({
21 |         (t & 0x0F),
22 |         (t & 0xF0).__rshift__(4)
23 |     },
24 |     0);
25 | }
26 | 
27 | static torch::Tensor unpack_2bit(torch::Tensor &t) {
28 |     return torch::cat({
29 |         (t & 0x03),
30 |         (t & 0x0C).__rshift__(2),
31 |         (t & 0x30).__rshift__(4),
32 |         (t & 0xC0).__rshift__(6)
33 |     },
34 |     0);
35 | }
36 | 
37 | torch::Tensor unpack(torch::Tensor &t, int bits) {
38 |     TORCH_CHECK(t.scalar_type() == torch::kUInt8, "Unsupported data type: ", t.scalar_type());
39 |     switch(bits) {
40 |       case 4:
41 |         return unpack_4bit(t);
42 |       case 2:
43 |         return unpack_2bit(t);
44 |       default:
45 |         throw std::invalid_argument("Can only unpack 2-bit or 4-bit tensors.");
46 |     }
47 | }
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cpp/unpack.h: --------------------------------------------------------------------------------
1 | // Copyright 2024 The HuggingFace Team. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <torch/extension.h>
16 | 
17 | torch::Tensor unpack(torch::Tensor &t, int bits);
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/README.md: --------------------------------------------------------------------------------
1 | # Quanto generic CUDA extension
2 | 
3 | Kernels in this extension can use both the C++ and CUDA syntax.
4 | 
5 | They can use any pytorch operation defined under `aten::` or `c10::`.
6 | 
7 | To add a new implementation for an operation defined in the library (see `library/README.md`):
8 | 
9 | - add the corresponding `.cpp` or `.cu` file to the list of sources in `__init__.py`,
10 | - add a binding to `pybind_module.cpp`,
11 | - provide an implementation calling the binding in `__init__.py`.
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/awq/v2/gemm_cuda.h: --------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 | 
3 | torch::Tensor awq_v2_gemm_f16i4(torch::Tensor _in_feats, torch::Tensor _kernel, torch::Tensor _scales, torch::Tensor _zeros);
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/awq/v2/gemv_cuda.h: --------------------------------------------------------------------------------
1 | #pragma once
2 | #include <torch/extension.h>
3 | 
4 | torch::Tensor awq_v2_gemv_f16i4(
5 |     torch::Tensor _in_feats,
6 |     torch::Tensor _kernel,
7 |     torch::Tensor _scaling_factors,
8 |     torch::Tensor _zeros,
9 |     int m,
10 |     int n,
11 |     int k,
12 |     int group_size);
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/marlin/COPYRIGHT: --------------------------------------------------------------------------------
1 | These kernels were vendored from VLLM. The Marlin kernels were developed
2 | by Elias Frantar and extended by Neural Magic.
3 | 
4 | ---
5 | 
6 | Copyright (C) Marlin.2024 Elias Frantar
7 | Modified by Neural Magic
8 | Copyright 2024 The vLLM team.
9 | 
10 | Licensed under the Apache License, Version 2.0 (the "License");
11 | you may not use this file except in compliance with the License.
12 | You may obtain a copy of the License at
13 | 
14 |     http://www.apache.org/licenses/LICENSE-2.0
15 | 
16 | Unless required by applicable law or agreed to in writing, software
17 | distributed under the License is distributed on an "AS IS" BASIS,
18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19 | See the License for the specific language governing permissions and
20 | limitations under the License.
21 | 
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/marlin/fp8_marlin.cuh: --------------------------------------------------------------------------------
1 | // #pragma once
2 | #include <torch/extension.h>
3 | #include <cuda_runtime.h>
4 | 
5 | 
6 | // #ifndef _fp8_marlin_cuh
7 | // #define _fp8_marlin_cuh
8 | 
9 | // #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
10 | //   assert(0);
11 | // #else
12 | torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
13 |                               torch::Tensor& b_scales, torch::Tensor& workspace,
14 |                               int64_t num_bits, int64_t size_m, int64_t size_n,
15 |                               int64_t size_k);
16 | // #endif
17 | 
18 | // #endif
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/marlin/gptq_marlin.cuh: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <torch/extension.h>
4 | 
5 | #include <ATen/cuda/CUDAContext.h>
6 | #include <c10/cuda/CUDAGuard.h>
7 | #include <cuda.h>
8 | #include <cuda_fp16.h>
9 | #include <cuda_runtime.h>
10 | #include <iostream>
11 | 
12 | namespace gptq_marlin {
13 | 
14 | // 8 warps are a good choice since every SM has 4 schedulers and having more
15 | // than 1 warp per schedule allows some more latency hiding. At the same time,
16 | // we want relatively few warps to have many registers per warp and small tiles.
17 | static constexpr int default_threads = 256;
18 | 
19 | static constexpr int pipe_stages =
20 |     4;  // 4 pipeline stages fit into shared memory
21 | 
22 | static constexpr int min_thread_n = 64;
23 | static constexpr int min_thread_k = 64;
24 | 
25 | static constexpr int tile_size = 16;
26 | static constexpr int max_par = 16;
27 | 
28 | template <typename T, int n>
29 | struct Vec {
30 |   T elems[n];
31 |   __device__ T& operator[](int i) { return elems[i]; }
32 | };
33 | 
34 | using I4 = Vec<int, 4>;
35 | 
36 | constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }
37 | 
38 | #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
39 | // No support for async
40 | #else
41 | 
42 | __device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr,
43 |                                       bool pred = true) {
44 |   const int BYTES = 16;
45 |   uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
46 |   asm volatile(
47 |       "{\n"
48 |       "   .reg .pred p;\n"
49 |       "   setp.ne.b32 p, %0, 0;\n"
50 |       "   @p cp.async.cg.shared.global [%1], [%2], %3;\n"
51 |       "}\n" ::"r"((int)pred),
52 |       "r"(smem), "l"(glob_ptr), "n"(BYTES));
53 | }
54 | 
55 | __device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
56 |   const int BYTES = 16;
57 |   uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
58 |   asm volatile(
59 |       "{\n"
60 |       "   cp.async.cg.shared.global [%0], [%1], %2;\n"
61 |       "}\n" ::"r"(smem),
62 |       "l"(glob_ptr), "n"(BYTES));
63 | }
64 | 
65 | __device__ inline void cp_async_fence() {
66 |   asm volatile("cp.async.commit_group;\n" ::);
67 | }
68 | 
69 | template <int n>
70 | __device__ inline void cp_async_wait() {
71 |   asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
72 | }
73 | 
74 | #endif
75 | 
76 | }  // namespace gptq_marlin
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/marlin/gptq_marlin_dtypes.cuh: --------------------------------------------------------------------------------
1 | 
2 | #ifndef _data_types_cuh
3 | #define _data_types_cuh
4 | #include "gptq_marlin.cuh"
5 | #include <cuda_fp16.h>
6 | #include <cuda_bf16.h>
7 | 
8 | namespace gptq_marlin {
9 | 
10 | template <typename scalar_t>
11 | class ScalarType {};
12 | 
13 | template <>
14 | class ScalarType<half> {
15 |  public:
16 |   using scalar_t = half;
17 |   using scalar_t2 = half2;
18 | 
19 |   // Matrix fragments for tensor core instructions; their precise layout is
20 |   // documented here:
21 |   // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type
22 |   using FragA = Vec<half2, 4>;
23 |   using FragB = Vec<half2, 2>;
24 |   using FragC = Vec<float, 4>;
25 |   using FragS = Vec<half2, 1>;
26 | 
27 |   static __device__ float inline num2float(const half x) {
28 |     return __half2float(x);
29 |   }
30 | 
31 |   static __device__ half2 inline num2num2(const half x) {
32 |     return __half2half2(x);
33 |   }
34 | 
35 |   static __device__ half2 inline nums2num2(const half x1, const half x2) {
36 |     return __halves2half2(x1, x2);
37 |   }
38 | 
39 |   static __host__ __device__ half inline float2num(const float x) {
40 |     return __float2half(x);
41 |   }
42 | };
43 | 
44 | template <>
45 | class ScalarType<nv_bfloat16> {
46 |  public:
47 |   using scalar_t = nv_bfloat16;
48 |   using scalar_t2 = nv_bfloat162;
49 | 
50 |   using FragA = Vec<nv_bfloat162, 4>;
51 |   using FragB = Vec<nv_bfloat162, 2>;
52 |   using FragC = Vec<float, 4>;
53 |   using FragS = Vec<nv_bfloat162, 1>;
54 | 
55 | #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
56 |   static __device__ float inline num2float(const nv_bfloat16 x) {
57 |     return __bfloat162float(x);
58 |   }
59 | 
60 |   static __device__ nv_bfloat162 inline num2num2(const nv_bfloat16 x) {
61 |     return __bfloat162bfloat162(x);
62 |   }
63 | 
64 |   static __device__ nv_bfloat162 inline nums2num2(const nv_bfloat16 x1,
65 |                                                   const nv_bfloat16 x2) {
66 |     return __halves2bfloat162(x1, x2);
67 |   }
68 | 
69 |   static __host__ __device__ nv_bfloat16 inline float2num(const float x) {
70 |     return __float2bfloat16(x);
71 |   }
72 | #endif
73 | };
74 | 
75 | }  // namespace gptq_marlin
76 | 
77 | #endif
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/marlin/gptq_marlin_repack.cuh: --------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 | #include <cuda.h>
3 | #include <cuda_runtime.h>
4 | 
5 | #ifndef _gptq_marlin_repack_cuh
6 | #define _gptq_marlin_repack_cuh
7 | 
8 | torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
9 |                                  int64_t size_k, int64_t size_n,
10 |                                  int64_t num_bits);
11 | 
12 | #endif
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/marlin/marlin_cuda.cpp: --------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (C) Marlin.2024 Elias Frantar (elias.frantar@ist.ac.at)
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | #include "marlin_cuda.h"
17 | 
18 | #include <torch/all.h>
19 | #include <torch/python.h>
20 | #include <ATen/cuda/CUDAContext.h>
21 | #include <cuda_runtime.h>
22 | 
23 | #include "marlin_cuda_kernel.cuh"
24 | 
25 | const int ERR_PROB_SHAPE = 1;
26 | const int ERR_KERN_SHAPE = 2;
27 | 
28 | void mul(
29 |   const torch::Tensor& A,
30 |   const torch::Tensor& B,
31 |   torch::Tensor& C,
32 |   const torch::Tensor& s,
33 |   const torch::Tensor& sz, // ADDED: add scaled zero point
34 |   torch::Tensor& workspace,
35 |   int thread_k,
36 |   int thread_n,
37 |   int sms,
38 |   int max_par
39 | ) {
40 |   int prob_m = A.size(0);
41 |   int prob_n = C.size(1);
42 |   int prob_k = A.size(1);
43 |   int groupsize = (s.size(0) == 1) ? -1 : prob_k / s.size(0);
44 |   if (groupsize != -1 && groupsize * s.size(0) != prob_k)
45 |     AT_ERROR("k=", prob_k, " not compatible with ", s.size(0), " groups.");
46 |   if (workspace.numel() < prob_n / 128 * max_par)
47 |     AT_ERROR("workspace must be of size at least ", prob_n / 128 * max_par, ".");
48 |   int dev = A.get_device();
49 |   int err = marlin_cuda(
50 |     A.data_ptr(),
51 |     B.data_ptr(),
52 |     C.data_ptr(),
53 |     s.data_ptr(),
54 |     sz.data_ptr(), // ADDED: add scaled zero point
55 |     prob_m, prob_n, prob_k,
56 |     workspace.data_ptr(),
57 |     groupsize,
58 |     dev,
59 |     at::cuda::getCurrentCUDAStream(dev),
60 |     thread_k,
61 |     thread_n,
62 |     sms,
63 |     max_par
64 |   );
65 |   if (err == ERR_PROB_SHAPE) {
66 |     AT_ERROR(
67 |       "Problem (m=", prob_m, ", n=", prob_n, ", k=", prob_k, ")",
68 |       " not compatible with thread_k=", thread_k, ", thread_n=", thread_n, "."
69 |     );
70 |   } else if (err == ERR_KERN_SHAPE) {
71 |     AT_ERROR(
72 |       "No kernel implementation for thread_k=", thread_k, ", thread_n=", thread_n, ", groupsize=", groupsize, "."
73 |     );
74 |   }
75 | }
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/marlin/marlin_cuda.h: --------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (C) Marlin.2024 Elias Frantar (elias.frantar@ist.ac.at)
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | #include <torch/extension.h>
17 | 
18 | void mul(
19 |   const torch::Tensor& A,
20 |   const torch::Tensor& B,
21 |   torch::Tensor& C,
22 |   const torch::Tensor& s,
23 |   const torch::Tensor& sz,
24 |   torch::Tensor& workspace,
25 |   int thread_k = -1,
26 |   int thread_n = -1,
27 |   int sms = -1,
28 |   int max_par = 8
29 | );
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/marlin/marlin_cuda_kernel.cuh: --------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (C) Marlin.2024 Elias Frantar (elias.frantar@ist.ac.at)
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | #include <cuda_runtime.h>
17 | 
18 | int marlin_cuda(
19 |   const void* A,
20 |   const void* B,
21 |   void* C,
22 |   void* s,
23 |   void* sz, // ADDED: add scaled zero point
24 |   int prob_m,
25 |   int prob_n,
26 |   int prob_k,
27 |   void* workspace,
28 |   int groupsize = -1,
29 |   int dev = 0,
30 |   cudaStream_t stream = 0,
31 |   int thread_k = -1,
32 |   int thread_n = -1,
33 |   int sms = -1,
34 |   int max_par = 16
35 | );
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/pybind_module.cpp: --------------------------------------------------------------------------------
1 | // Copyright 2024 The HuggingFace Team. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <torch/extension.h>
16 | #include "awq/v2/gemm_cuda.h"
17 | #include "awq/v2/gemv_cuda.h"
18 | #include "unpack.h"
19 | #include "marlin/fp8_marlin.cuh"
20 | #include "marlin/gptq_marlin_repack.cuh"
21 | #include "marlin/marlin_cuda.h"
22 | 
23 | // !IMPORTANT! Some python objects such as dtype, device, are not mapped to C++ types,
24 | // and need to be explicitly converted using dedicated helpers before calling a C++ method.
25 | // As a consequence, when an operation takes such an object as parameter, instead
26 | // of creating a binding directly to the C++ method, you must create a binding to a
27 | // lambda method that converts the unmapped types and calls the C++ method.
28 | // See the binding of quantize_symmetric for instance.
29 | 
30 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
31 |   m.def("awq_v2_gemm_f16i4", &awq_v2_gemm_f16i4, "awq_v2_gemm_f16i4");
32 |   m.def("awq_v2_gemv_f16i4", &awq_v2_gemv_f16i4, "awq_v2_gemv_f16i4");
33 |   m.def("gptq_marlin_repack", &gptq_marlin_repack, "gptq_marlin_repack");
34 |   m.def("fp8_marlin_gemm", &fp8_marlin_gemm, "fp8_marlin_gemm");
35 |   m.def("marlin_gemm_f16i4", &mul, "marlin_gemm_f16i4");
36 |   m.def("unpack", &unpack, "unpack");
37 | }
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/unpack.cu: --------------------------------------------------------------------------------
1 | // Copyright 2024 The HuggingFace Team. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <torch/extension.h>
16 | #include <cuda.h>
17 | #include <cuda_runtime.h>
18 | #include <c10/cuda/CUDAException.h>
19 | 
20 | inline unsigned int cdiv(unsigned int a, unsigned int b) { return (a + b - 1) / b;}
21 | #define BLOCK_SIZE 256
22 | 
23 | using namespace at;
24 | 
25 | 
26 | static torch::Tensor allocate_output(const torch::Tensor& input, int bits) {
27 |     int n_packed = 8 / bits;
28 |     auto output_shape = input.sizes().vec();
29 |     output_shape[0] = output_shape[0] * n_packed;
30 |     return torch::empty(output_shape, input.options());
31 | }
32 | 
33 | __global__ void unpack_4bit_kernel(unsigned char* input, unsigned char* output, int n) {
34 |     int i = blockIdx.x*blockDim.x + threadIdx.x;
35 |     if(i>=n) return;
36 | 
37 |     output[i] = (input[i] & 0x0F);
38 |     output[i + n] = (input[i] & 0xF0) >> 4;
39 | }
40 | 
41 | static torch::Tensor unpack_4bit(const torch::Tensor& input){
42 | 
43 |     auto output = allocate_output(input, 4);
44 | 
45 |     const auto numel = input.numel();
46 |     int blocks = cdiv(numel, BLOCK_SIZE);
47 |     unpack_4bit_kernel<<<blocks, BLOCK_SIZE>>>(
48 |         input.data_ptr<unsigned char>(),
49 |         output.data_ptr<unsigned char>(),
50 |         numel
51 |     );
52 | 
53 |     C10_CUDA_KERNEL_LAUNCH_CHECK();
54 | 
55 |     return output;
56 | }
57 | 
58 | __global__ void unpack_2bit_kernel(unsigned char* input, unsigned char* output, int n) {
59 |     int i = blockIdx.x*blockDim.x + threadIdx.x;
60 |     if(i>=n) return;
61 | 
62 |     output[i] = (input[i] & 0x03);
63 |     output[i + n] = (input[i] & 0x0C) >> 2;
64 |     output[i + n*2] = (input[i] & 0x30) >> 4;
65 |     output[i + n*3] = (input[i] & 0xC0) >> 6;
66 | }
67 | 
68 | static torch::Tensor unpack_2bit(const torch::Tensor& input){
69 | 
70 |     auto output = allocate_output(input, 2);
71 | 
72 |     const auto numel = input.numel();
73 |     int blocks = cdiv(numel, BLOCK_SIZE);
74 |     unpack_2bit_kernel<<<blocks, BLOCK_SIZE>>>(
75 |         input.data_ptr<unsigned char>(),
76 |         output.data_ptr<unsigned char>(),
77 |         numel
78 |     );
79 | 
80 |     C10_CUDA_KERNEL_LAUNCH_CHECK();
81 | 
82 |     return output;
83 | }
84 | 
85 | torch::Tensor unpack(torch::Tensor &t, int bits) {
86 |     TORCH_CHECK(t.scalar_type() == torch::kUInt8, "Unsupported data type: ", t.scalar_type());
87 |     TORCH_CHECK(t.device().is_cuda(), "t must be a CUDA tensor.");
88 |     TORCH_CHECK(t.is_contiguous(), "t must be contiguous.");
89 |     switch(bits) {
90 |       case 4:
91 |         return unpack_4bit(t);
92 |       case 2:
93 |         return unpack_2bit(t);
94 |       default:
95 |         throw std::invalid_argument("Can only unpack 2-bit or 4-bit tensors.");
96 |     }
97 | }
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/cuda/unpack.h: --------------------------------------------------------------------------------
1 | // Copyright 2024 The HuggingFace Team. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <torch/extension.h>
16 | 
17 | torch::Tensor unpack(torch::Tensor &t, int bits);
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/extension.py: --------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import warnings
4 | from typing import List
5 | 
6 | import torch
7 | from torch.utils.cpp_extension import load
8 | 
9 | 
10 | __all__ = ["is_extension_available", "get_extension"]
11 | 
12 | 
13 | class Extension(object):
14 |     def __init__(
15 |         self,
16 |         name: str,
17 |         root_dir: str,
18 |         sources: List[str],
19 |         extra_cflags: List[str] = None,
20 |         extra_cuda_cflags: List[str] = None,
21 |     ):
22 |         self.name = name
23 |         self.sources = [f"{root_dir}/{source}" for source in sources]
24 |         self.extra_cflags = extra_cflags
25 |         self.extra_cuda_cflags = extra_cuda_cflags
26 |         self.build_directory = os.path.join(root_dir, "build")
27 |         self._lib = None
28 | 
29 |     @property
30 |     def lib(self):
31 |         if self._lib is None:
32 |             # We only load the extension when the lib is required
33 |             version_file = os.path.join(self.build_directory, "pytorch_version.txt")
34 |             if os.path.exists(version_file):
35 |                 # The extension has already been built: check the torch version for which it was built
36 |                 with open(version_file, "r") as f:
37 |                     pytorch_build_version = f.read().rstrip()
38 |                 if pytorch_build_version != torch.__version__:
39 |                     shutil.rmtree(self.build_directory)
40 |                     warnings.warn(
41 |                         f"{self.name} was compiled with pytorch {pytorch_build_version}, but {torch.__version__} is installed: it will be recompiled."
42 |                     )
43 |             os.makedirs(self.build_directory, exist_ok=True)
44 |             self._lib = load(
45 |                 name=self.name,
46 |                 sources=self.sources,
47 |                 extra_cflags=self.extra_cflags,
48 |                 extra_cuda_cflags=self.extra_cuda_cflags,
49 |                 build_directory=self.build_directory,
50 |             )
51 |             if not os.path.exists(version_file):
52 |                 with open(version_file, "w") as f:
53 |                     f.write(torch.__version__)
54 |         return self._lib
55 | 
56 | 
57 | _extensions = {}
58 | 
59 | 
60 | def register_extension(extension: Extension):
61 |     assert extension.name not in _extensions
62 |     _extensions[extension.name] = extension
63 | 
64 | 
65 | def get_extension(extension_type: str):
66 |     """Get an extension
67 | 
68 |     Args:
69 |         extension_type (`str`):
70 |             The extension type.
71 |     Returns:
72 |         The corresponding extension.
73 |     """
74 |     return _extensions[extension_type]
75 | 
76 | 
77 | def is_extension_available(extension_type: str):
78 |     """Check if an extension is available
79 | 
80 |     Args:
81 |         extension_type (`str`):
82 |             The extension type.
83 |     Returns:
84 |         True if the extension is available.
85 |     """
86 |     return extension_type in _extensions
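87 | 
88 | 
89 | # Example (sketch): querying the registry once the device packages have been
90 | # imported (each device package calls register_extension() at import time):
91 | #
92 | #     if is_extension_available("quanto_cpp"):
93 | #         lib = get_extension("quanto_cpp").lib  # first access triggers the JIT build
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/hip/__init__.py: --------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.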
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import os
16 | 
17 | import torch
18 | 
19 | from ..extension import Extension, register_extension
20 | 
21 | 
22 | __all__ = []
23 | 
24 | 
25 | ext = Extension(
26 |     "quanto_hip",
27 |     root_dir=os.path.dirname(__file__),
28 |     sources=["unpack.cu", "pybind_module.cpp"],
29 |     extra_cflags=["-std=c++17"],
30 | )
31 | register_extension(ext)
32 | 
33 | 
34 | # ROCm devices expose the "cuda" device type in pytorch
35 | @torch.library.impl("quanto::unpack", ["CUDA"])
36 | def unpack_hip(t: torch.Tensor, bits: int):
37 |     return ext.lib.unpack(t, bits)
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/hip/pybind_module.cpp: --------------------------------------------------------------------------------
1 | // Copyright 2024 The HuggingFace Team. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <torch/extension.h>
16 | #include "unpack.h"
17 | 
18 | 
19 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
20 |   m.def("unpack", &unpack, "unpack");
21 | }
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/hip/unpack.cu: --------------------------------------------------------------------------------
1 | // Copyright 2024 The HuggingFace Team. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <torch/extension.h>
16 | #include <cuda.h>
17 | #include <cuda_runtime.h>
18 | #include <c10/cuda/CUDAException.h>
19 | 
20 | inline unsigned int cdiv(unsigned int a, unsigned int b) { return (a + b - 1) / b;}
21 | #define BLOCK_SIZE 256
22 | 
23 | using namespace at;
24 | 
25 | 
26 | static torch::Tensor allocate_output(const torch::Tensor& input, int bits) {
27 |     int n_packed = 8 / bits;
28 |     auto output_shape = input.sizes().vec();
29 |     output_shape[0] = output_shape[0] * n_packed;
30 |     return torch::empty(output_shape, input.options());
31 | }
32 | 
33 | __global__ void unpack_4bit_kernel(unsigned char* input, unsigned char* output, int n) {
34 |     int i = blockIdx.x*blockDim.x + threadIdx.x;
35 |     if(i>=n) return;
36 | 
37 |     output[i] = (input[i] & 0x0F);
38 |     output[i + n] = (input[i] & 0xF0) >> 4;
39 | }
40 | 
41 | static torch::Tensor unpack_4bit(const torch::Tensor& input){
42 | 
43 |     auto output = allocate_output(input, 4);
44 | 
45 |     const auto numel = input.numel();
46 |     int blocks = cdiv(numel, BLOCK_SIZE);
47 |     unpack_4bit_kernel<<<blocks, BLOCK_SIZE>>>(
48 |         input.data_ptr<unsigned char>(),
49 |         output.data_ptr<unsigned char>(),
50 |         numel
51 |     );
52 | 
53 |     C10_CUDA_KERNEL_LAUNCH_CHECK();
54 | 
55 |     return output;
56 | }
57 | 
58 | __global__ void unpack_2bit_kernel(unsigned char* input, unsigned char* output, int n) {
59 |     int i = blockIdx.x*blockDim.x + threadIdx.x;
60 |     if(i>=n) return;
61 | 
62 |     output[i] = (input[i] & 0x03);
63 |     output[i + n] = (input[i] & 0x0C) >> 2;
64 |     output[i + n*2] = (input[i] & 0x30) >> 4;
65 |     output[i + n*3] = (input[i] & 0xC0) >> 6;
66 | }
67 | 
68 | static torch::Tensor unpack_2bit(const torch::Tensor& input){
69 | 
70 |     auto output = allocate_output(input, 2);
71 | 
72 |     const auto numel = input.numel();
73 |     int blocks = cdiv(numel, BLOCK_SIZE);
74 |     unpack_2bit_kernel<<<blocks, BLOCK_SIZE>>>(
75 |         input.data_ptr<unsigned char>(),
76 |         output.data_ptr<unsigned char>(),
77 |         numel
78 |     );
79 | 
80 |     C10_CUDA_KERNEL_LAUNCH_CHECK();
81 | 
82 |     return output;
83 | }
84 | 
85 | torch::Tensor unpack(torch::Tensor &t, int bits) {
86 |     TORCH_CHECK(t.scalar_type() == torch::kUInt8, "Unsupported data type: ", t.scalar_type());
87 |     TORCH_CHECK(t.device().is_cuda(), "t must be a CUDA tensor.");
88 |     TORCH_CHECK(t.is_contiguous(), "t must be contiguous.");
89 |     switch(bits) {
90 |       case 4:
91 |         return unpack_4bit(t);
92 |       case 2:
93 |         return unpack_2bit(t);
94 |       default:
95 |         throw std::invalid_argument("Can only unpack 2-bit or 4-bit tensors.");
96 |     }
97 | }
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/hip/unpack.h: --------------------------------------------------------------------------------
1 | // Copyright 2024 The HuggingFace Team. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <torch/extension.h>
16 | 
17 | torch::Tensor unpack(torch::Tensor &t, int bits);
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/mps/README.md: --------------------------------------------------------------------------------
1 | # Quanto Metal Performance Shaders extension
2 | 
3 | To add a new implementation for an operation defined in the library (see `library/README.md`):
4 | 
5 | - add the corresponding `.mm` file to the list of sources in `__init__.py`,
6 | - add a binding to `pybind_module.cpp`,
7 | - provide an implementation calling the binding in `__init__.py`.
8 | 
9 | Note: torch JIT extensions for MPS require the xcode command-line tools.
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/mps/__init__.py: --------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import os
16 | 
17 | import torch
18 | 
19 | from ..extension import Extension, register_extension
20 | 
21 | 
22 | __all__ = []
23 | 
24 | 
25 | ext = Extension(
26 |     "quanto_mps",
27 |     root_dir=os.path.dirname(__file__),
28 |     sources=["unpack.mm", "pybind_module.cpp"],
29 |     extra_cflags=["-std=c++17"],
30 | )
31 | register_extension(ext)
32 | 
33 | 
34 | @torch.library.impl("quanto::unpack", "MPS")
35 | def unpack_mps(t: torch.Tensor, bits: int):
36 |     return ext.lib.unpack(t, bits)
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/mps/pybind_module.cpp: --------------------------------------------------------------------------------
1 | // Copyright 2024 The HuggingFace Team. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <torch/extension.h>
16 | #include "unpack.h"
17 | 
18 | 
19 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
20 |   m.def("unpack", &unpack, "unpack");
21 | }
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/mps/unpack.h: --------------------------------------------------------------------------------
1 | // Copyright 2024 The HuggingFace Team. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <torch/extension.h>
16 | 
17 | torch::Tensor unpack(const torch::Tensor &input, int bits);
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/xpu/__init__.py: --------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | # Copyright 2024 Intel Corporation. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import os
17 | 
18 | import torch
19 | 
20 | from ..extension import Extension, register_extension
21 | 
22 | 
23 | __all__ = []
24 | 
25 | 
26 | module_path = os.path.dirname(__file__)
27 | sources = [
28 |     "unpack.sycl",
29 |     "pybind_module.cpp",
30 | ]
31 | ext = Extension(
32 |     "quanto_xpu",
33 |     root_dir=module_path,
34 |     sources=sources,
35 | )
36 | register_extension(ext)
37 | 
38 | 
39 | @torch.library.impl("quanto::unpack", "XPU")
40 | def unpack_xpu(t: torch.Tensor, bits: int):
41 |     return ext.lib.unpack(t, bits)
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/xpu/pybind_module.cpp: --------------------------------------------------------------------------------
1 | // Copyright 2024 The HuggingFace Team. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <torch/extension.h>
16 | #include "unpack.h"
17 | 
18 | // !IMPORTANT! Some python objects such as dtype, device, are not mapped to C++ types,
19 | // and need to be explicitly converted using dedicated helpers before calling a C++ method.
20 | // As a consequence, when an operation takes such an object as parameter, instead
21 | // of creating a binding directly to the C++ method, you must create a binding to a
22 | // lambda method that converts the unmapped types and calls the C++ method.
23 | // See the binding of quantize_symmetric for instance.
24 | 
25 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
26 |   m.def("unpack", &unpack, "unpack");
27 | }
-------------------------------------------------------------------------------- /optimum/quanto/library/extensions/xpu/unpack.h: --------------------------------------------------------------------------------
1 | // Copyright 2024 The HuggingFace Team. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <torch/extension.h>
16 | 
17 | torch::Tensor unpack(torch::Tensor &t, int bits);
-------------------------------------------------------------------------------- /optimum/quanto/library/quantize.py: --------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from typing import Union
16 | 
17 | import torch
18 | 
19 | from ..tensor import dtype_info, group
20 | 
21 | 
22 | torch.library.define(
23 |     "quanto::quantize_symmetric", "(Tensor base, ScalarType dtype, int? axis, Tensor scale) -> Tensor"
24 | )
25 | 
26 | 
27 | @torch.library.impl("quanto::quantize_symmetric", "default")
28 | def quantize_symmetric(
29 |     base: torch.Tensor, dtype: torch.dtype, axis: Union[int, None], scale: torch.Tensor
30 | ) -> torch.Tensor:
31 |     # Sanity checks
32 |     if axis is None:
33 |         if scale.ndim > 0:
34 |             raise ValueError("Scale must be a scalar when quantizing per-tensor")
35 |     else:
36 |         if base.ndim == 1:
37 |             raise ValueError("1D Tensors cannot be quantized per-axis")
38 |         if axis == base.ndim - 1:
39 |             # Align on the general convention to index the last dimension
40 |             axis = -1
41 |         if axis not in (0, -1):
42 |             raise ValueError("Quantization is only supported along the first or last axis.")
43 |         if base.shape[axis] == 1:
44 |             raise ValueError(f"Cannot quantize Tensor of shape {base.shape} along axis {axis} of size 1")
45 |         if torch.squeeze(scale).ndim > 1:
46 |             raise ValueError("Quantizing along multiple axes is not supported")
47 |         if scale.ndim != base.ndim:
48 |             raise ValueError(
49 |                 "When quantizing per-axis, the scale must be broadcastable to the base (Tip: try to add missing dims of length one)."
50 |             )
51 |     data = base / scale
52 |     if not dtype.is_floating_point:
53 |         data = torch.round(data)
54 |     info = dtype_info(dtype)
55 |     return torch.clamp(data, min=info.min, max=info.max).to(dtype)
56 | 
57 | 
58 | torch.library.define(
59 |     "quanto::quantize_affine",
60 |     "(Tensor base, int bits, int axis, int? group_size, Tensor scale, Tensor shift) -> Tensor",
61 | )
62 | 
63 | 
64 | @torch.library.impl("quanto::quantize_affine", "default")
65 | def quantize_affine(
66 |     base: torch.Tensor, bits: int, axis: int, group_size: Union[int, None], scale: torch.Tensor, shift: torch.Tensor
67 | ) -> torch.Tensor:
68 |     if axis not in (0, -1):
69 |         raise ValueError("axis parameter must be 0 (first axis) or -1 (last axis)")
70 |     if group_size is not None:
71 |         base = group(base, axis=axis, group_size=group_size)
72 |     if shift.dtype.is_floating_point:
73 |         data = torch.round((base + shift) / scale)
74 |     else:
75 |         # Shift is an integer representing zero (i.e. zero-point)
76 |         data = torch.round(base / scale) + shift
77 | 
78 |     return torch.clamp(data, min=0, max=2**bits - 1).to(torch.uint8)
-------------------------------------------------------------------------------- /optimum/quanto/library/unpack.py: --------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import torch
16 | 
17 | 
18 | torch.library.define("quanto::unpack", "(Tensor self, int bits) -> Tensor")
19 | 
20 | 
21 | @torch.library.impl("quanto::unpack", "default")
22 | def unpack(packed: torch.Tensor, bits: int) -> torch.Tensor:
23 |     """
24 |     Unpack int4 / int2 weights (packed into a uint8 tensor) into a torch.uint8 tensor
25 | 
26 |     What does unpacking mean? Assume we have packed four 2-bit values into 8 bits
27 |     (because torch does not have native support for 2-bit datatypes):
28 | 
29 |     > 1110 0100
30 | 
31 |     Unpacking them means retrieving the original four 2-bit values:
32 | 
33 |     > 0000 0011 | 0000 0010 | 0000 0001 | 0000 0000
34 | 
35 |     Args:
36 |         packed (`torch.Tensor`):
37 |             The packed tensor in `torch.uint8` precision
38 |         bits (`int`):
39 |             The number of bits per encoded value. Can be 2 or 4.
40 |     """
41 |     unpacked = []
42 |     values_per_item = 8 // bits
43 | 
44 |     def rshift(t: torch.Tensor, bits: int):
45 |         if t.device.type == "mps":
46 |             # rshift is not supported on MPS device
47 |             return t // (2**bits)
48 |         return t >> bits
49 | 
50 |     # Unpack each set of values independently
51 |     for i in range(values_per_item):
52 |         mask = 2 ** (bits * (i + 1)) - 1
53 |         unpacked.append(rshift(packed & mask, bits * i))
54 |     # Return the concatenated unpacked tensors
55 |     return torch.cat(unpacked).to(torch.uint8)
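56 | 
57 | 
58 | # Example (sketch): round-trip for the 2-bit case illustrated in the docstring:
59 | #
60 | #     packed = torch.tensor([0b11100100], dtype=torch.uint8)
61 | #     torch.ops.quanto.unpack(packed, 2)  # -> tensor([0, 1, 2, 3], dtype=torch.uint8)
-------------------------------------------------------------------------------- /optimum/quanto/models/__init__.py: --------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.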
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import importlib
16 | 
17 | 
18 | def is_transformers_available() -> bool:
19 |     return importlib.util.find_spec("transformers") is not None
20 | 
21 | 
22 | def is_diffusers_available() -> bool:
23 |     return importlib.util.find_spec("diffusers") is not None
24 | 
25 | 
26 | if is_transformers_available():
27 |     from .transformers_models import *
28 | 
29 | 
30 | if is_diffusers_available():
31 |     from .diffusers_models import *
-------------------------------------------------------------------------------- /optimum/quanto/models/shared_dict.py: --------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import os
16 | from collections.abc import Mapping
17 | from typing import Any, Dict
18 | 
19 | from safetensors import safe_open
20 | 
21 | 
22 | class ShardedStateDict(Mapping):
23 |     """A pytorch state_dict stored in multiple safetensors files
24 | 
25 |     This class implements the `collections.abc.Mapping` interface.
26 |     It can be passed to `torch.nn.Module.load_state_dict()` to recursively
27 |     load the module tensors.
28 |     """
29 | 
30 |     def __init__(self, base_dir: str, tensor_index: Dict[str, str]):
31 |         self._base_dir = base_dir
32 |         self._index = tensor_index
33 |         self._handles = {}
34 | 
35 |     def __iter__(self):
36 |         yield from self._index
37 | 
38 |     def __len__(self):
39 |         return self._index.__len__()
40 | 
41 |     def __getitem__(self, key: Any) -> Any:
42 |         filename = self._index.__getitem__(key)
43 |         if filename not in self._handles:
44 |             f = safe_open(os.path.join(self._base_dir, filename), framework="pytorch")
45 |             self._handles[filename] = f
46 |         f = self._handles[filename]
47 |         return f.get_tensor(key)
48 | 
49 |     def __contains__(self, key: object) -> bool:
50 |         return self._index.__contains__(key)
51 | 
52 |     def keys(self):
53 |         return self._index.keys()
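54 | 
55 | 
56 | # Example (sketch): the tensor_index mapping mirrors the "weight_map" section
57 | # of a sharded checkpoint index file (e.g. model.safetensors.index.json); the
58 | # names below are hypothetical:
59 | #
60 | #     tensor_index = {"linear.weight": "model-00001-of-00002.safetensors"}
61 | #     state_dict = ShardedStateDict("/path/to/checkpoint", tensor_index)
62 | #     module.load_state_dict(state_dict)
-------------------------------------------------------------------------------- /optimum/quanto/nn/__init__.py: --------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.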
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from .qconv2d import *
16 | from .qlayernorm import *
17 | from .qlinear import *
18 | from .qmodule import *
-------------------------------------------------------------------------------- /optimum/quanto/nn/qconv2d.py: --------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from typing import Optional
16 | 
17 | import torch
18 | 
19 | from ..tensor import Optimizer, qtype
20 | from .qmodule import QModuleMixin, register_qmodule
21 | 
22 | 
23 | __all__ = ["QConv2d"]
24 | 
25 | 
26 | @register_qmodule(torch.nn.Conv2d)
27 | class QConv2d(QModuleMixin, torch.nn.Conv2d):
28 |     @classmethod
29 |     def qcreate(
30 |         cls,
31 |         module,
32 |         weights: qtype,
33 |         activations: Optional[qtype] = None,
34 |         optimizer: Optional[Optimizer] = None,
35 |         device: Optional[torch.device] = None,
36 |     ):
37 |         return cls(
38 |             in_channels=module.in_channels,
39 |             out_channels=module.out_channels,
40 |             kernel_size=module.kernel_size,
41 |             stride=module.stride,
42 |             padding=module.padding,
43 |             dilation=module.dilation,
44 |             groups=module.groups,
45 |             bias=module.bias is not None,
46 |             padding_mode=module.padding_mode,
47 |             dtype=module.weight.dtype,
48 |             device=device,
49 |             weights=weights,
50 |             activations=activations,
51 |             optimizer=optimizer,
52 |         )
53 | 
54 |     def forward(self, input: torch.Tensor) -> torch.Tensor:
55 |         return self._conv_forward(input, self.qweight, self.bias)
-------------------------------------------------------------------------------- /optimum/quanto/nn/qlayernorm.py: --------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 15 | from typing import Optional 16 | 17 | import torch 18 | 19 | from ..tensor import Optimizer, qtype 20 | from .qmodule import QModuleMixin, register_qmodule 21 | 22 | 23 | __all__ = ["QLayerNorm"] 24 | 25 | 26 | @register_qmodule(torch.nn.LayerNorm) 27 | class QLayerNorm(QModuleMixin, torch.nn.LayerNorm): 28 | @classmethod 29 | def qcreate( 30 | cls, 31 | module, 32 | weights: Optional[qtype] = None, 33 | activations: Optional[qtype] = None, 34 | optimizer: Optional[Optimizer] = None, 35 | device: Optional[torch.device] = None, 36 | ): 37 | if activations is None: 38 | return None 39 | dtype = None if module.weight is None else module.weight.dtype 40 | return cls( 41 | module.normalized_shape, 42 | module.eps, 43 | module.elementwise_affine, 44 | module.bias is not None, 45 | dtype=dtype, 46 | device=device, 47 | weights=None, # We never quantize QLayerNorm weights 48 | activations=activations, 49 | optimizer=None, # We never quantize QLayerNorm weights 50 | ) 51 | 52 | def forward(self, input: torch.Tensor) -> torch.Tensor: 53 | return torch.nn.functional.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps) 54 | -------------------------------------------------------------------------------- /optimum/quanto/nn/qlinear.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Optional 16 | 17 | import torch 18 | 19 | from ..tensor import Optimizer, qtype 20 | from .qmodule import QModuleMixin, register_qmodule 21 | 22 | 23 | __all__ = ["QLinear"] 24 | 25 | 26 | @register_qmodule(torch.nn.Linear) 27 | class QLinear(QModuleMixin, torch.nn.Linear): 28 | @classmethod 29 | def qcreate( 30 | cls, 31 | module, 32 | weights: qtype, 33 | activations: Optional[qtype] = None, 34 | optimizer: Optional[Optimizer] = None, 35 | device: Optional[torch.device] = None, 36 | ): 37 | return cls( 38 | module.in_features, 39 | module.out_features, 40 | module.bias is not None, 41 | dtype=module.weight.dtype, 42 | device=device, 43 | weights=weights, 44 | activations=activations, 45 | optimizer=optimizer, 46 | quantize_input=True, 47 | ) 48 | 49 | def forward(self, input: torch.Tensor) -> torch.Tensor: 50 | return torch.nn.functional.linear(input, self.qweight, bias=self.bias) 51 | -------------------------------------------------------------------------------- /optimum/quanto/subpackage/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .commands import * 16 | -------------------------------------------------------------------------------- /optimum/quanto/subpackage/commands/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .base import * 16 | -------------------------------------------------------------------------------- /optimum/quanto/subpackage/commands/base.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from optimum.commands import BaseOptimumCLICommand, CommandInfo 16 | from optimum.commands.optimum_cli import optimum_cli_subcommand 17 | 18 | from .quantize import QuantizeCommand 19 | 20 | 21 | __all__ = ["QuantoCommand"] 22 | 23 | 24 | @optimum_cli_subcommand() 25 | class QuantoCommand(BaseOptimumCLICommand): 26 | COMMAND = CommandInfo(name="quanto", help="Hugging Face models quantization tools") 27 | SUBCOMMANDS = ( 28 | CommandInfo( 29 | name="quantize", 30 | help="Quantize Hugging Face models.", 31 | subcommand_class=QuantizeCommand, 32 | ), 33 | ) 34 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .activations import * 16 | from .core import * 17 | from .grouped import * 18 | from .optimizers import * 19 | from .qbits import * 20 | from .qbytes import * 21 | from .qtensor import * 22 | from .qtype import * 23 | from .weights import * 24 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/activations/__init__.py: -------------------------------------------------------------------------------- 1 | from .qbytes import * 2 | from .quantization import * 3 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/activations/quantization.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | 17 | from ..qtype import qtype 18 | from .qbytes import ActivationQBytesTensor 19 | 20 | 21 | __all__ = ["quantize_activation"] 22 | 23 | 24 | def quantize_activation(t: torch.Tensor, qtype: qtype, scale: torch.Tensor): 25 | """Quantize an activation Tensor. 26 | 27 | Activations are always quantized per-tensor with a scalar scale. 28 | 29 | Args: 30 | t (`torch.Tensor`): the Tensor to quantize 31 | qtype (`quanto.qtype`): The target quantization type 32 | scale (`torch.Tensor`): The scalar quantization scale 33 | 34 | Returns: 35 | A quantized Tensor. 36 | """ 37 | if scale.numel() != 1: 38 | raise ValueError("Parameter scale must be a scalar because activations can only be quantized per-tensor") 39 | return ActivationQBytesTensor.quantize(t, qtype, scale) 40 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/core.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
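# Reading aid for the helpers below (illustrative values):
# - dtype_info(torch.int8) returns torch.iinfo(torch.int8), i.e. min=-128, max=127.
# - axis_to_dim(t, axis) returns the reduction dims complementary to `axis`:
#   for a rank-3 tensor, axis=0 gives [1, 2] and axis=-1 gives [0, 1].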
14 | 15 | 16 | import torch 17 | 18 | 19 | __all__ = ["axis_to_dim", "dtype_info"] 20 | 21 | 22 | def dtype_info(dtype): 23 | info = torch.finfo if dtype.is_floating_point else torch.iinfo 24 | return info(dtype) 25 | 26 | 27 | def axis_to_dim(t, axis): 28 | dim = list(range(t.ndim)) 29 | if axis == -1: 30 | dim = dim[:-1] 31 | else: 32 | dim.remove(axis) 33 | return dim 34 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/function.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | 17 | 18 | __all__ = ["QuantizedLinearFunction"] 19 | 20 | 21 | class QuantizedLinearFunction(torch.autograd.Function): 22 | """Quantized linear function. 23 | 24 | This is a quantized implementation of torch.nn.functional.linear. 25 | 26 | It explicitly defines the backward pass instead of letting pytorch 27 | build it by combining the gradients of the underlying quantized operations. 28 | 29 | This has two main benefits: 30 | 31 | - it saves computations, 32 | - it allows the use of operations that do not have a registered backward method, 33 | such as quanto custom operations. 34 | 35 | The drawback is that the extra tensors involved in the quantization graph, such as 36 | the scales and shift, cannot be trained. 37 | This is however consistent with the quanto quantizers backward pass, which returns 38 | a zero gradient for these tensors.
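    Example (a minimal sketch with plain float tensors; shapes are illustrative only):

        >>> import torch
        >>> A = torch.randn(2, 8, requires_grad=True)  # activations
        >>> W = torch.randn(4, 8, requires_grad=True)  # (out_features, in_features)
        >>> out = QuantizedLinearFunction.apply(A, W)  # equivalent to A @ W.t()
        >>> out.sum().backward()  # uses the explicit backward defined by this Function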
39 | """ 40 | 41 | @staticmethod 42 | def forward(ctx, input, other, bias=None): 43 | ctx.save_for_backward(input, other) 44 | output = torch.matmul(input, other.t()) 45 | if bias is not None: 46 | output = output + bias 47 | return output 48 | 49 | @staticmethod 50 | def backward(ctx, gO): 51 | input_gO = other_gO = bias_gO = None 52 | input, other = ctx.saved_tensors 53 | out_features, in_features = other.shape 54 | if ctx.needs_input_grad[0]: 55 | # grad(A@B.t()) = gO => grad(A) = gO@(B.t().t()) = gO@B 56 | input_gO = torch.matmul(gO, other) 57 | if ctx.needs_input_grad[1]: 58 | # grad(B@A.t()) = gO.t() => grad(B) = gO.t()@(A.t().t()) = gO.t()@A 59 | other_gO = torch.matmul(gO.view(-1, out_features).t(), input.view(-1, in_features)) 60 | if ctx.needs_input_grad[2]: 61 | # Bias gradient is the sum on all dimensions but the last one 62 | dim = tuple(range(gO.ndim - 1)) 63 | bias_gO = gO.sum(dim) 64 | return input_gO, other_gO, bias_gO 65 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/grouped.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import List 3 | 4 | import torch 5 | 6 | 7 | __all__ = ["group", "ungroup", "grouped_shape"] 8 | 9 | 10 | def grouped_shape(shape: List, axis: int, group_size: int) -> List: 11 | if axis not in (0, -1): 12 | raise ValueError("Axis must be 0 or -1 for group-wise quantization") 13 | n_groups = math.prod(shape) // group_size 14 | return (n_groups, group_size) if axis == 0 else (group_size, n_groups) 15 | 16 | 17 | def group(base: torch.Tensor, axis: int, group_size: int): 18 | if axis not in (0, -1): 19 | raise ValueError("Axis must be 0 or -1 for group-wise quantization") 20 | # In standard per-axis quantization, we have one scale per axis dim 21 | axis_dim = base.shape[axis] 22 | # This scale is evaluated over axis_numel items for each feature along axis 23 | axis_numel = base.numel() // axis_dim 24 | if group_size > axis_numel or axis_numel % group_size != 0: 25 | raise ValueError(f"Group size ({group_size}) must be a divisor of ({axis_numel})") 26 | # Group-wise quantization further splits axis_numel into multiple groups per axis 27 | axis_groups = axis_numel // group_size 28 | if axis == 0: 29 | # Easy-peasy: we simply need to reshape to (axis_dim * axis_groups, group_size) 30 | return base.reshape([-1, group_size]) 31 | # More difficult: reshape to (group_size, axis_dim * axis_groups) 32 | # First, split by groups, preserving the axis dimension 33 | grouped = base.reshape((axis_groups, group_size, axis_dim)) 34 | # Permute to (group_size, axis_dim, axis_groups) 35 | grouped = grouped.permute(1, 2, 0) 36 | return grouped.reshape(group_size, axis_dim * axis_groups) 37 | 38 | 39 | def ungroup(grouped: torch.Tensor, axis: int, orig_shape: torch.Size): 40 | if grouped.shape == orig_shape: 41 | return grouped 42 | if axis == 0: 43 | # No transposition required, just reshape 44 | return grouped.reshape(orig_shape) 45 | group_size = grouped.shape[0] if axis == -1 else grouped.shape[-1] 46 | axis_dim = orig_shape[axis] 47 | axis_groups = grouped.numel() // axis_dim // group_size 48 | ungrouped = grouped.reshape(group_size, axis_dim, axis_groups) 49 | # Permute to (axis_groups, group_size, axis_dim) 50 | ungrouped = ungrouped.permute(2, 0, 1) 51 | return ungrouped.reshape(orig_shape) 52 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/optimizers/__init__.py:
-------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .absmax_optimizer import * 16 | from .affine_optimizer import * 17 | from .hqq_optimizer import * 18 | from .max_optimizer import * 19 | from .optimizer import * 20 | from .symmetric_optimizer import * 21 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/optimizers/absmax_optimizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Optional, Tuple, Union 16 | 17 | import torch 18 | 19 | from ..qtype import qtype 20 | from .symmetric_optimizer import SymmetricOptimizer 21 | 22 | 23 | __all__ = ["AbsmaxOptimizer"] 24 | 25 | 26 | class AbsmaxOptimizer(SymmetricOptimizer): 27 | def optimize( 28 | self, base: torch.Tensor, qtype: qtype, axis: Optional[int] = None 29 | ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: 30 | base = torch.abs(base) 31 | if axis is None: 32 | rmax = torch.max(base) 33 | else: 34 | dim = list(range(1, base.ndim)) if (axis == 0) else list(range(0, base.ndim - 1)) 35 | rmax = torch.amax(base, dim=dim, keepdim=True) 36 | return rmax / qtype.qmax 37 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/optimizers/affine_optimizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | from typing import Optional, Tuple 16 | 17 | import torch 18 | 19 | from ..grouped import group 20 | from ..qtype import qtype 21 | from .optimizer import Optimizer 22 | 23 | 24 | __all__ = ["AffineOptimizer"] 25 | 26 | 27 | class AffineOptimizer(Optimizer): 28 | def __call__( 29 | self, 30 | base: torch.Tensor, 31 | qtype: qtype, 32 | axis: int, 33 | group_size: Optional[int] = None, 34 | zeropoint: bool = False, 35 | ) -> Tuple[torch.Tensor, torch.Tensor]: 36 | """ 37 | Args: 38 | base (`torch.Tensor`): the weight Tensor to quantize 39 | qtype (`quanto.qtype`): The target quantization type 40 | axis (`int`): The quantization axis (0 or -1) 41 | group_size (`Optional[int]`): The quantization group size 42 | zeropoint (`bool`): Allow an exact representation of zero. If True, the shifts are stored as 43 | integers instead of floats, which results in a slightly smaller model, but might also reduce 44 | the model performance. Defaults to False. 45 | Returns: 46 | A tuple of scale, shift Tensor. 47 | """ 48 | if axis not in [0, -1]: 49 | raise ValueError("axis parameter must be 0 (first axis) or -1 (last axis)") 50 | if group_size is not None: 51 | base = group(base, axis, group_size) 52 | if axis is not None and base.shape[axis] == 1: 53 | axis = None 54 | scale, shift = self.optimize(base, qtype, axis) 55 | assert scale.dtype == base.dtype 56 | assert shift.dtype == base.dtype 57 | if zeropoint: 58 | # Round shift to make sure zero can be represented exactly using 'shift' as quantized value 59 | shift = torch.clamp(torch.round(shift / scale), 0, 2**qtype.bits - 1).to(torch.uint8) 60 | return scale, shift 61 | 62 | def optimize(self, base: torch.Tensor, qtype: qtype, axis: int) -> Tuple[torch.Tensor, torch.Tensor]: 63 | raise NotImplementedError 64 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/optimizers/hqq_optimizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
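# Reading aid (illustrative values): for lp_norm == 1, shrink_lp_op below is the
# classic soft-thresholding operator, e.g. shrink_lp_op(torch.tensor([1.5]), beta=2.0, lp_norm=1.0)
# returns sign(1.5) * relu(1.5 - 1/2.0) = tensor([1.]).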
14 | 15 | from typing import Optional, Tuple, Union 16 | 17 | import torch 18 | 19 | from ..qtype import qtype 20 | from ..weights import quantize_weight 21 | from .max_optimizer import MaxOptimizer 22 | 23 | 24 | __all__ = ["HqqOptimizer"] 25 | 26 | 27 | # Shrinking operator 28 | def shrink_lp_op(x: torch.Tensor, beta: float, lp_norm: float) -> torch.Tensor: 29 | if lp_norm == 1: 30 | return torch.sign(x) * torch.nn.functional.relu(torch.abs(x) - 1.0 / beta) 31 | else: 32 | return torch.sign(x) * torch.nn.functional.relu( 33 | torch.abs(x) - (1.0 / beta) * torch.pow(torch.abs(x), lp_norm - 1) 34 | ) 35 | 36 | 37 | class HqqOptimizer(MaxOptimizer): 38 | """Implementation of the HQQ algorithm 39 | 40 | This is an implementation of the algorithm described in "Half-Quadratic Quantization of Large Machine Learning Models", 41 | by Hicham Badri and Appu Shaji (https://mobiusml.github.io/hqq_blog/). 42 | This is an adaptation of the original implementation at https://github.com/mobiusml/hqq. 43 | 44 | """ 45 | 46 | def __init__( 47 | self, 48 | lp_norm: Optional[float] = 0.7, 49 | beta: Optional[float] = 1e1, 50 | kappa: Optional[float] = 1.01, 51 | iters: Optional[int] = 20, 52 | verbose: Optional[bool] = False, 53 | ) -> None: 54 | self.lp_norm = lp_norm 55 | self.beta = beta 56 | self.kappa = kappa 57 | self.iters = iters 58 | self.verbose = verbose 59 | 60 | def optimize( 61 | self, base: torch.Tensor, qtype: qtype, axis: int 62 | ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: 63 | scale, shift = super().optimize(base, qtype, axis) 64 | best_error = None 65 | beta = self.beta 66 | base_q = quantize_weight(base, qtype=qtype, axis=axis, scale=scale, shift=shift) 67 | for i in range(self.iters): 68 | error = base - base_q 69 | if best_error is None: 70 | best_error = float(torch.abs(error).mean()) 71 | if self.verbose: 72 | print(f"Start error: {best_error:.6f}") 73 | e = shrink_lp_op(error, beta, self.lp_norm) 74 | mean_axis = 0 if axis == -1 else -1 75 | hqq_shift = torch.mean(base_q._data * scale - (base - e), axis=mean_axis, keepdim=True) 76 | base_q = quantize_weight(base, qtype=qtype, axis=axis, scale=scale, shift=hqq_shift) 77 | mean_error = float(torch.abs(base - base_q).mean()) 78 | if self.verbose: 79 | print(f"HQQ error at it #{i}: {mean_error:.6f}") 80 | if mean_error < best_error: 81 | best_error = mean_error 82 | shift = hqq_shift 83 | beta *= self.kappa 84 | else: 85 | break 86 | 87 | return scale, shift 88 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/optimizers/max_optimizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
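# Worked example for the affine scale/shift computed below (illustrative numbers):
# a row with rmin=-1.0 and rmax=3.0 quantized to 8 bits (qmin=-128, qmax=127) gives
# scale = (3.0 - (-1.0)) / 255 = 4/255 and shift = -rmin = 1.0.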
14 | 15 | from typing import Tuple, Union 16 | 17 | import torch 18 | 19 | from ..qtype import qtype 20 | from .affine_optimizer import AffineOptimizer 21 | 22 | 23 | __all__ = ["MaxOptimizer"] 24 | 25 | 26 | class MaxOptimizer(AffineOptimizer): 27 | def optimize( 28 | self, base: torch.Tensor, qtype: qtype, axis: int 29 | ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: 30 | dim = list(range(1, base.ndim)) if (axis == 0) else list(range(0, base.ndim - 1)) 31 | rmin = torch.amin(base, dim=dim, keepdim=True) 32 | rmax = torch.amax(base, dim=dim, keepdim=True) 33 | qmin = -(2 ** (qtype.bits - 1)) 34 | qmax = 2 ** (qtype.bits - 1) - 1 35 | scale = (rmax - rmin) / (qmax - qmin) 36 | shift = -rmin 37 | return scale, shift 38 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/optimizers/optimizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from abc import ABC 16 | from typing import Optional, Tuple, Union 17 | 18 | import torch 19 | 20 | 21 | __all__ = ["Optimizer"] 22 | 23 | 24 | class Optimizer(ABC): 25 | def __call__( 26 | self, base: torch.Tensor, bits: int, axis: int, group_size: Optional[int] = None 27 | ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: 28 | raise NotImplementedError 29 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/optimizers/symmetric_optimizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
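# A SymmetricOptimizer returns a scale only (no shift), since quantized values are
# assumed symmetric around zero. For instance, AbsmaxOptimizer above evaluates
# scale = amax(abs(base)) / qtype.qmax, e.g. 2.0 / 127 for a qint8 tensor whose
# largest absolute value is 2.0.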
14 | 15 | from typing import Optional 16 | 17 | import torch 18 | 19 | from ..qtype import qtype 20 | from .optimizer import Optimizer 21 | 22 | 23 | __all__ = ["SymmetricOptimizer"] 24 | 25 | 26 | class SymmetricOptimizer(Optimizer): 27 | def __call__(self, base: torch.Tensor, qtype: qtype, axis: Optional[int] = None) -> torch.Tensor: 28 | if axis not in [None, 0, -1]: 29 | raise ValueError("axis parameter must be None, 0 (first axis) or -1 (last axis)") 30 | if axis is not None and base.shape[axis] == 1: 31 | axis = None 32 | scale = self.optimize(base, qtype, axis) 33 | assert scale.dtype == base.dtype 34 | 35 | return scale 36 | 37 | def optimize(self, base: torch.Tensor, qtype: qtype, axis: Optional[int] = None) -> torch.Tensor: 38 | raise NotImplementedError 39 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/qbits.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import torch 17 | from torch.autograd import Function 18 | 19 | from .grouped import ungroup 20 | from .packed import PackedTensor 21 | from .qtensor import QTensor 22 | 23 | 24 | __all__ = ["QBitsTensor"] 25 | 26 | 27 | class QBitsDequantizer(Function): 28 | @staticmethod 29 | def forward(ctx, t): 30 | if isinstance(t._data, PackedTensor): 31 | data = t._data.unpack() 32 | else: 33 | data = t._data 34 | shift = t._shift 35 | if not shift.dtype.is_floating_point: 36 | # Remove shift before multiplying by the scale 37 | data = data.to(torch.int8) - shift.to(torch.int8) 38 | if t.qtype.is_floating_point: 39 | # Upcast explicitly to the scale dtype 40 | dqt = t._scale * data.to(t._scale.dtype) 41 | else: 42 | dqt = t._scale * data 43 | if shift.dtype.is_floating_point: 44 | # Remove scaled shift 45 | dqt -= shift 46 | if t.axis is None: 47 | return dqt 48 | # Restore the original shape (if needed) 49 | return ungroup(dqt, axis=t.axis, orig_shape=t.shape) 50 | 51 | @staticmethod 52 | def backward(ctx, gO): 53 | return gO 54 | 55 | 56 | class QBitsTensor(QTensor): 57 | def __init__(self, qtype, axis, group_size, size, stride, data, scale, shift, requires_grad=False): 58 | super().__init__(qtype, axis) 59 | self._data = data 60 | self._scale = scale 61 | self._shift = shift 62 | self._group_size = group_size 63 | 64 | def __repr__(self): 65 | return f"{type(self).__name__}({self._data}, scale={self._scale}, shift={self._shift}, dtype={self.dtype})" 66 | 67 | def dequantize(self): 68 | return QBitsDequantizer.apply(self) 69 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/qbytes.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from torch.autograd import Function 16 | 17 | from .qtensor import QTensor 18 | 19 | 20 | __all__ = ["QBytesTensor"] 21 | 22 | 23 | class QBytesDequantizer(Function): 24 | @staticmethod 25 | def forward(ctx, t): 26 | if t.qtype.is_floating_point: 27 | # Upcast explicitly to the scale dtype 28 | dqt = t._scale * t._data.to(t._scale.dtype) 29 | else: 30 | dqt = t._scale * t._data 31 | return dqt 32 | 33 | @staticmethod 34 | def backward(ctx, gO): 35 | # For autograd, dequantization is a no-op 36 | return gO 37 | 38 | 39 | class QBytesTensor(QTensor): 40 | def __init__(self, qtype, axis, size, stride, data, scale, requires_grad=False): 41 | super().__init__(qtype, axis) 42 | self._data = data 43 | self._scale = scale 44 | 45 | def __repr__(self): 46 | return f"{type(self).__name__}({self._data}, scale={self._scale}, dtype={self.dtype})" 47 | 48 | def dequantize(self): 49 | """Differentiable dequantization function""" 50 | return QBytesDequantizer.apply(self) 51 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/qtype.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
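# For instance, qint(4) below yields bits=4, qmin=-(2**3) = -8 and qmax=2**3 - 1 = 7,
# with values stored in a torch.int8 container.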
14 | 15 | from dataclasses import dataclass 16 | 17 | import torch 18 | 19 | 20 | @dataclass 21 | class qtype: 22 | """A quantized type class mimicking torch dtype""" 23 | 24 | name: str 25 | is_floating_point: bool 26 | bits: int 27 | # This defines the storage dtype 28 | dtype: torch.dtype 29 | qmin: float 30 | qmax: float 31 | 32 | def __str__(self): 33 | return f"quanto.{self.name}" 34 | 35 | def __hash__(self): 36 | return hash(str(self)) 37 | 38 | 39 | # Integer qtypes 40 | 41 | 42 | def qint(bits): 43 | qmin = -(2 ** (bits - 1)) 44 | qmax = 2 ** (bits - 1) - 1 45 | return qtype(f"qint{bits}", is_floating_point=False, bits=bits, dtype=torch.int8, qmin=qmin, qmax=qmax) 46 | 47 | 48 | qint2 = qint(2) 49 | qint4 = qint(4) 50 | qint8 = qint(8) 51 | 52 | # Float qtypes 53 | 54 | 55 | def qfloat(dtype: torch.dtype): 56 | finfo = torch.finfo(dtype) 57 | qmin = finfo.min 58 | qmax = finfo.max 59 | return qtype(f"q{finfo.dtype}", is_floating_point=True, bits=8, dtype=dtype, qmin=qmin, qmax=qmax) 60 | 61 | 62 | qfloat8_e4m3fn = qfloat(torch.float8_e4m3fn) 63 | qfloat8_e4m3fnuz = qfloat(torch.float8_e4m3fnuz) 64 | qfloat8_e5m2 = qfloat(torch.float8_e5m2) 65 | 66 | # Alias the float8 representation with the best support and inference efficiency 67 | qfloat8 = qfloat8_e4m3fn 68 | 69 | # Convenience dict to get a dtype from its name 70 | qtypes = {name: q for (name, q) in locals().items() if isinstance(q, qtype)} 71 | 72 | __all__ = ["qtype", "qtypes"] + [str(name) for name in qtypes.keys()] 73 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/weights/__init__.py: -------------------------------------------------------------------------------- 1 | from .qbits import * 2 | from .qbytes import * 3 | from .quantization import * 4 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/weights/awq/__init__.py: -------------------------------------------------------------------------------- 1 | from .packed import * 2 | from .qbits import * 3 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/weights/marlin/__init__.py: -------------------------------------------------------------------------------- 1 | from .fp8 import * 2 | from .int4 import * 3 | from .permutations import * 4 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/weights/marlin/fp8/__init__.py: -------------------------------------------------------------------------------- 1 | from .packed import * 2 | from .qbits import * 3 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/weights/marlin/int4/__init__.py: -------------------------------------------------------------------------------- 1 | from .packed import * 2 | from .qbits import * 3 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/weights/marlin/permutations.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import functools 16 | from typing import List, Tuple 17 | 18 | import torch 19 | 20 | from ..reordering import reorder, reverse 21 | 22 | 23 | __all__ = ["marlin_permute"] 24 | 25 | 26 | # https://github.com/IST-DASLab/marlin/blob/2f6d7c10e124b3c5fa29ff8d77d568bd7af3274c/marlin/__init__.py#L40C1-L68C54 27 | @functools.cache 28 | def _get_perms() -> Tuple[List[int], List[int]]: 29 | perm = [] 30 | for i in range(8): 31 | perm.extend([i + 8 * j for j in range(8)]) 32 | perm_single = [] 33 | for i in range(4): 34 | perm_single.extend([2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) 35 | return perm, perm_single 36 | 37 | 38 | @functools.cache 39 | def _get_inverted_perms() -> Tuple[List[int], List[int]]: 40 | perm, perm_single = _get_perms() 41 | return reverse(perm), reverse(perm_single) 42 | 43 | 44 | def marlin_permute(t: torch.Tensor, reverse=False): 45 | perm, perm_single = _get_inverted_perms() if reverse else _get_perms() 46 | out_features = t.shape[1] 47 | if t.shape[0] == 1: 48 | reordered = reorder(t, perm_single) 49 | else: 50 | reordered = reorder(t, perm) 51 | return reordered.reshape((-1, out_features)).contiguous() 52 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/weights/packing.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | 17 | 18 | def unpack_int32_to_uint8(packed: torch.Tensor, bits: int): 19 | """Unpack a packed int32 tensor to a larger uint8 tensor 20 | 21 | Args: 22 | packed (`torch.Tensor`): 23 | The packed integer tensor 24 | bits (`int`): 25 | The number of bits of each packed value. 26 | 27 | Returns: 28 | An unpacked uint8 `torch.Tensor` expanded along the last dimension.
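    Example (a minimal sketch, assuming 4-bit packed values):

        >>> packed = torch.tensor([[0x76543210]], dtype=torch.int32)
        >>> unpack_int32_to_uint8(packed, bits=4)
        tensor([[0, 1, 2, 3, 4, 5, 6, 7]], dtype=torch.uint8)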
29 | """ 30 | total_bits = 32 31 | shifts = torch.arange(0, total_bits, bits, device=packed.device) 32 | 33 | # Unpack column-wise 34 | unpacked = torch.bitwise_right_shift(packed[:, :, None], shifts[None, None, :]).to( 35 | torch.int8 # smallest dtype available 36 | ) 37 | unpacked = unpacked.reshape(unpacked.shape[0], -1) 38 | 39 | # Convert to unsigned 40 | unpacked = torch.bitwise_and(unpacked, (2**bits) - 1) 41 | 42 | return unpacked.to(torch.uint8) 43 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/weights/quantization.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Optional 16 | 17 | import torch 18 | 19 | from ..qtype import qtype 20 | from .qbits import WeightQBitsTensor 21 | from .qbytes import WeightQBytesTensor 22 | 23 | 24 | __all__ = ["quantize_weight"] 25 | 26 | 27 | def quantize_weight( 28 | t: torch.Tensor, 29 | qtype: qtype, 30 | axis: int, 31 | scale: torch.Tensor, 32 | shift: Optional[torch.Tensor] = None, 33 | group_size: Optional[int] = None, 34 | activation_qtype: Optional[qtype] = None, 35 | optimized: Optional[bool] = True, 36 | ): 37 | """Quantize a weight Tensor. 38 | 39 | Weights are always quantized per-axis. 40 | 41 | Args: 42 | t (`torch.Tensor`): the weight Tensor to quantize 43 | qtype (`quanto.qtype`): The target quantization type 44 | axis (`int`): The quantization axis (0 or -1) 45 | scale (`torch.Tensor`): the quantization scale 46 | shift (`Optional[torch.Tensor]`): optional shift to apply 47 | group_size (`Optional[int]`): The quantization group size 48 | activation_qtype (`Optional[qtype]`, defaults to `None`): 49 | Which quantization type is being used for the activations. The function `quantize_weight` 50 | initializes `torch.Tensor` subclasses that may depend on the activation dtype. 51 | `None` corresponds to no quantization. 52 | optimized (`Optional[bool]`, defaults to True): 53 | If True, the quantization algorithm will select the most efficient kernel 54 | for the weights and format the resulting Tensor accordingly. 55 | If False, a kernel-agnostic Tensor will be returned (but it can be optimized later 56 | explicitly by calling QTensor.optimize() or implicitly by moving it to a specific device). 57 | Returns: 58 | A quantized Tensor.
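    Example (a minimal sketch, assuming symmetric 8-bit quantization along the
    first axis with an absmax scale; names and shapes are illustrative):

        >>> import torch
        >>> from optimum.quanto import qint8
        >>> w = torch.randn(64, 64)
        >>> scale = torch.amax(w.abs(), dim=1, keepdim=True) / qint8.qmax
        >>> qw = quantize_weight(w, qtype=qint8, axis=0, scale=scale)
        >>> qw.dequantize().shape
        torch.Size([64, 64])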
59 | """ 60 | if axis not in (0, -1): 61 | raise ValueError("axis parameter must be 0 (first axis) or -1 (last axis)") 62 | if qtype.bits == 8: 63 | if shift is not None: 64 | raise ValueError("shift cannot be specified for 8-bit qtypes") 65 | if group_size is not None: 66 | raise ValueError("group_size cannot be specified for 8-bit qtypes.") 67 | if axis is not None and t.shape[axis] == 1: 68 | # Quantizing along an axis of dimension 1 means quantizing per-tensor 69 | axis = None 70 | return WeightQBytesTensor.quantize(t, qtype, axis, scale, activation_qtype, optimized) 71 | if shift is None: 72 | raise ValueError("shift must be specified for qtypes lower than 8-bit") 73 | return WeightQBitsTensor.quantize(t, qtype, axis, group_size, scale, shift, optimized) 74 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/weights/reordering.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import List, Union 16 | 17 | import torch 18 | 19 | 20 | __all__ = ["reorder", "reverse"] 21 | 22 | 23 | def reorder(t: torch.Tensor, permutation: Union[torch.Tensor, List[int]]): 24 | """Reorder a Tensor using a permutation 25 | 26 | Args: 27 | t (`torch.Tensor`): the Tensor to reorder 28 | permutation (`Union[torch.Tensor, List[int]]`): the permutation to apply 29 | 30 | Returns: 31 | The reordered torch.Tensor 32 | """ 33 | block_size = permutation.numel() if isinstance(permutation, torch.Tensor) else len(permutation) 34 | reordered = t.reshape((-1, block_size))[:, permutation].reshape(t.shape) 35 | return reordered.contiguous() 36 | 37 | 38 | def reverse(permutation: Union[torch.Tensor, List[int]]): 39 | """Reverse a permutation 40 | 41 | The reversed permutation can be used to revert a reordered Tensor to its original 42 | ordering. 43 | 44 | Args: 45 | permutation (`Union[torch.Tensor, List[int]]`): the permutation to reverse 46 | 47 | Returns: 48 | The reversed permutation 49 | """ 50 | block_size = permutation.numel() if isinstance(permutation, torch.Tensor) else len(permutation) 51 | reversed = torch.empty((block_size,), dtype=torch.int64) 52 | reversed[permutation] = torch.arange(block_size) 53 | return reversed 54 | -------------------------------------------------------------------------------- /optimum/quanto/tensor/weights/tinygemm/__init__.py: -------------------------------------------------------------------------------- 1 | from .packed import * 2 | from .qbits import * 3 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = 'optimum-quanto' 3 | description = 'A pytorch quantization backend for optimum.' 
4 | classifiers = [ 5 | 'Development Status :: 2 - Pre-Alpha', 6 | 'License :: OSI Approved :: Apache Software License', 7 | 'Intended Audience :: Developers', 8 | 'Intended Audience :: Education', 9 | 'Intended Audience :: Science/Research', 10 | 'Operating System :: OS Independent', 11 | 'Programming Language :: Python :: 3.9', 12 | 'Programming Language :: Python :: 3.10', 13 | 'Programming Language :: Python :: 3.11', 14 | 'Topic :: Scientific/Engineering :: Artificial Intelligence' 15 | ] 16 | keywords = ['torch', 'quantization'] 17 | requires-python = '>=3.9.0' 18 | authors = [{ name = 'David Corvoysier' }] 19 | maintainers = [ 20 | {name = "HuggingFace Inc. Special Ops Team", email="hardware@huggingface.co"}, 21 | ] 22 | dependencies = ['torch>=2.6.0', 'ninja', 'numpy', 'safetensors', 'huggingface_hub'] 23 | license = { text = 'Apache-2.0' } 24 | readme = 'README.md' 25 | dynamic = ['version'] 26 | 27 | [project.urls] 28 | homepage = 'https://github.com/huggingface/optimum-quanto' 29 | 30 | [project.optional-dependencies] 31 | dev = ['pytest', 'ruff'] 32 | examples = [ 33 | 'torchvision', 34 | 'transformers', 35 | 'diffusers', 36 | 'datasets', 37 | 'accelerate', 38 | 'sentencepiece', 39 | 'scipy' 40 | ] 41 | 42 | [tool.setuptools.packages.find] 43 | where = ["."] 44 | include = ["optimum*"] 45 | 46 | [tool.setuptools.dynamic] 47 | version = {attr = 'optimum.quanto.__version__'} 48 | 49 | [build-system] 50 | requires = ['setuptools>65.5.1', 'setuptools_scm'] 51 | build-backend = 'setuptools.build_meta' 52 | 53 | [tool.ruff] 54 | # Configuration for Ruff 55 | line-length = 119 # Same line-length as Black had 56 | 57 | # Linting rules: 58 | # Never enforce `E501` (line length violations) and other specific rules. 59 | lint.ignore = ['C901', 'E501', 'E741'] 60 | lint.select = ['C', 'E', 'F', 'I', 'W'] 61 | 62 | # Ignore import violations in all `__init__.py` files. 63 | [tool.ruff.lint.per-file-ignores] 64 | '__init__.py' = ['E402', 'F401', 'F403', 'F811'] 65 | 66 | # isort configuration (to sort imports) 67 | [tool.ruff.lint.isort] 68 | lines-after-imports = 2 69 | known-first-party = ['optimum.quanto'] 70 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NIGHTLY=${1:-0} 4 | VENV=".venv" 5 | if [ ! -d "${VENV}" ]; then 6 | python3 -m venv ${VENV} 7 | fi 8 | . ${VENV}/bin/activate 9 | if [ "$NIGHTLY" -eq "0" ]; then 10 | pip install --upgrade torch torchvision torchaudio 11 | else 12 | pip install --upgrade --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118 13 | fi 14 | # Build tools 15 | pip install ruff pytest build 16 | # For examples 17 | pip install accelerate transformers datasets 18 | -------------------------------------------------------------------------------- /tests/cli/cli_helpers.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import importlib 16 | 17 | import pytest 18 | 19 | 20 | requires_optimum_cli = pytest.mark.skipif( 21 | importlib.util.find_spec("optimum.commands") is None, reason="optimum-cli is required" 22 | ) 23 | -------------------------------------------------------------------------------- /tests/cli/test_quantize_cli.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import subprocess 16 | from tempfile import TemporaryDirectory 17 | 18 | import pytest 19 | from cli_helpers import requires_optimum_cli 20 | 21 | from optimum.quanto import quantization_map 22 | 23 | 24 | @requires_optimum_cli 25 | @pytest.mark.parametrize("weights", ["int4", "int8"]) 26 | def test_export_decoder_cli(weights): 27 | from optimum.quanto import QuantizedModelForCausalLM 28 | 29 | model_id = "facebook/opt-125m" 30 | with TemporaryDirectory() as tempdir: 31 | subprocess.run( 32 | [ 33 | "optimum-cli", 34 | "quanto", 35 | "quantize", 36 | "--model", 37 | model_id, 38 | "--weights", 39 | f"{weights}", 40 | tempdir, 41 | ], 42 | shell=False, 43 | check=True, 44 | ) 45 | # Verify we can reload the quantized model 46 | qmodel = QuantizedModelForCausalLM.from_pretrained(tempdir) 47 | qmap = quantization_map(qmodel) 48 | for layer_qconfig in qmap.values(): 49 | assert layer_qconfig["weights"] == f"q{weights}" 50 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
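# The `skip_device` marker registered below lets a test opt out of a device fixture
# value, e.g.:
#
#   @pytest.mark.skip_device("mps")
#   def test_something(device):
#       ...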
14 | 15 | import pytest 16 | import torch 17 | 18 | 19 | devices = ["cpu"] 20 | if torch.cuda.is_available(): 21 | devices += ["cuda"] 22 | elif torch.backends.mps.is_available(): 23 | devices += ["mps"] 24 | elif torch.xpu.is_available(): 25 | devices += ["xpu"] 26 | 27 | 28 | @pytest.fixture(scope="module", params=devices) 29 | def device(request): 30 | return torch.device(request.param) 31 | 32 | 33 | def pytest_configure(config): 34 | # register additional markers 35 | config.addinivalue_line("markers", "skip_device(type): mark test to be skipped for the specified device type") 36 | 37 | 38 | def pytest_runtest_call(item): 39 | fixture_name = "device" 40 | if fixture_name in item.fixturenames: 41 | # TODO: should be able to recover the fixture id instead of the actual value 42 | fixture_arg = item.funcargs[fixture_name].type 43 | skip_marks = {mark.args[0] for mark in item.iter_markers(name=f"skip_{fixture_name}")} 44 | if fixture_arg in skip_marks: 45 | pytest.skip(f"Test skipped for {fixture_name} {fixture_arg}") 46 | -------------------------------------------------------------------------------- /tests/library/test_extensions.py: -------------------------------------------------------------------------------- 1 | import platform 2 | 3 | import pytest 4 | import torch 5 | from packaging import version 6 | 7 | from optimum.quanto.library.extensions import get_extension, is_extension_available 8 | 9 | 10 | def _is_xpu_available(): 11 | # SYCL extension support is added in torch>=2.7 on Linux 12 | if platform.system() != "Linux": 13 | return False 14 | if version.parse(torch.__version__).release < version.parse("2.7").release: 15 | return False 16 | return torch.xpu.is_available() 17 | 18 | 19 | extension_names = ["quanto_cpp"] 20 | if torch.cuda.is_available(): 21 | if torch.version.cuda: 22 | extension_names.append("quanto_cuda") 23 | if torch.version.hip: 24 | extension_names.append("quanto_hip") 25 | if torch.backends.mps.is_available(): 26 | extension_names.append("quanto_mps") 27 | if _is_xpu_available(): 28 | extension_names.append("quanto_xpu") 29 | 30 | 31 | @pytest.mark.parametrize("extension_name", extension_names) 32 | def test_extension_available(extension_name): 33 | assert is_extension_available(extension_name) 34 | 35 | 36 | @pytest.mark.parametrize("extension_name", extension_names) 37 | def test_extension_compilation(extension_name): 38 | extension = get_extension(extension_name) 39 | assert extension.lib is not None 40 | -------------------------------------------------------------------------------- /tests/library/test_unpack.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | import pytest 17 | import torch 18 | 19 | from optimum.quanto.tensor.packed import pack_weights 20 | 21 | 22 | @pytest.mark.parametrize("bits", [2, 4], ids=["int2", "int4"]) 23 | @pytest.mark.parametrize("shape", [(12,), (32, 32)], ids=["vector", "matrix"]) 24 | def test_unpack(bits, shape, device): 25 | qmax = 2**bits 26 | a = torch.randint(0, qmax, shape, dtype=torch.uint8).to(device) 27 | packed_a = pack_weights(a, bits) 28 | unpacked_a = torch.ops.quanto.unpack(packed_a, bits) 29 | assert unpacked_a.dtype == torch.uint8 30 | assert torch.equal(unpacked_a, a) 31 | -------------------------------------------------------------------------------- /tests/models/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from huggingface_hub.constants import _staging_mode 3 | 4 | 5 | @pytest.fixture 6 | def staging(): 7 | """A pytest fixture only available in huggingface_hub staging mode 8 | 9 | If the huggingface_hub is not operating in staging mode, tests using 10 | that fixture are automatically skipped. 11 | 12 | Returns: 13 | a Dict containing a valid staging user and token. 14 | """ 15 | if not _staging_mode: 16 | pytest.skip("requires huggingface_hub staging mode") 17 | return { 18 | "user": "__DUMMY_TRANSFORMERS_USER__", 19 | # Not critical, only usable on the sandboxed CI instance. 20 | "token": "hf_94wBhPGp6KrrTH3KDchhKpRxZwd6dmHWLL", 21 | } 22 | 23 | 24 | @pytest.fixture(autouse=True) 25 | def skip_if_staging(request): 26 | if _staging_mode: 27 | if "staging" not in request.fixturenames: 28 | pytest.skip("requires huggingface_hub standard mode") 29 | -------------------------------------------------------------------------------- /tests/nn/test_qmodule.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import pytest 16 | import torch 17 | 18 | from optimum.quanto import QTensor, qint8, qtypes 19 | from optimum.quanto.nn import QLinear 20 | 21 | 22 | @pytest.mark.parametrize("in_features", [8, 16]) 23 | @pytest.mark.parametrize("out_features", [32, 64]) 24 | @pytest.mark.parametrize("use_bias", [True, False], ids=["bias", "no-bias"]) 25 | @pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=["fp32", "fp16"]) 26 | def test_qmodule_freeze(in_features, out_features, use_bias, dtype): 27 | qlinear = QLinear(in_features, out_features, bias=use_bias, weights=qint8).to(dtype) 28 | assert not qlinear.frozen 29 | assert not isinstance(qlinear.weight, QTensor) 30 | assert qlinear.weight.dtype == dtype 31 | if use_bias: 32 | assert not isinstance(qlinear.bias, QTensor) 33 | assert qlinear.bias.dtype == dtype 34 | qweight = qlinear.qweight 35 | assert isinstance(qweight, QTensor) 36 | assert qweight.dtype == dtype 37 | assert qweight.qtype == qint8 38 | qlinear.freeze() 39 | assert qlinear.frozen 40 | assert isinstance(qlinear.weight, QTensor) 41 | assert qlinear.weight.dtype == dtype 42 | assert qlinear.weight.qtype == qint8 43 | if use_bias: 44 | assert not isinstance(qlinear.bias, QTensor) 45 | assert qlinear.bias.dtype == dtype 46 | 47 | 48 | @pytest.mark.parametrize("weights", ["qint2", "qint4", "qint8", "qfloat8"]) 49 | @pytest.mark.parametrize("activations", [None, "qint8", "qfloat8"]) 50 | def test_qmodule_qtype_as_string(weights, activations): 51 | qlinear = QLinear(16, 64, weights=weights, activations=activations) 52 | assert qlinear.weight_qtype == qtypes[weights] 53 | assert qlinear.activation_qtype == (None if activations is None else qtypes[activations]) 54 | -------------------------------------------------------------------------------- /tests/tensor/activations/test_activations_compile.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import pytest 16 | import torch 17 | from helpers import random_tensor 18 | 19 | from optimum.quanto import ActivationQBytesTensor, absmax_scale, qint8, quantize_activation 20 | 21 | 22 | def compile_for_device(f, device): 23 | # Remove any side-effects from previous compilation 24 | torch.compiler.reset() 25 | # Inductor relies on Triton for inference, which does not support MPS 26 | backend = "aot_eager" if device == torch.device("mps") else "inductor" 27 | return torch.compile(f, backend=backend) 28 | 29 | 30 | @pytest.mark.skip("Disabled as it is not working (yet?)") 31 | @pytest.mark.parametrize("input_shape", [(2, 10), (10, 32, 32)]) 32 | @pytest.mark.parametrize("qtype", [qint8], ids=["qint8"]) 33 | @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=["fp32", "fp16", "bf16"]) 34 | def test_compile_quantize_tensor(input_shape, qtype, dtype, device): 35 | if device == torch.device("mps") and dtype == torch.bfloat16: 36 | pytest.skip("BFloat16 is not supported on MPS") 37 | a = random_tensor(input_shape, dtype=dtype).to(device) 38 | 39 | def f(x, qtype): 40 | scale = absmax_scale(x) 41 | return quantize_activation(x, qtype=qtype, scale=scale) 42 | 43 | compiled_f = compile_for_device(f, device) 44 | qa = compiled_f(a, qtype) 45 | assert isinstance(qa, ActivationQBytesTensor) 46 | assert qa.qtype == qtype 47 | assert qa._scale.dtype == dtype 48 | assert qa.axis is None 49 | 50 | 51 | def test_compile_qtensor_to(device): 52 | input_shape = (10, 32, 32) 53 | a = random_tensor(input_shape).to(device) 54 | 55 | def f(x, dtype): 56 | return x.to(dtype) 57 | 58 | compiled_f = compile_for_device(f, device) 59 | 60 | scale = absmax_scale(a) 61 | qa = quantize_activation(a, qtype=qint8, scale=scale) 62 | cqa = compiled_f(qa, torch.float16) 63 | assert isinstance(cqa, ActivationQBytesTensor) 64 | assert cqa.qtype == qint8 65 | assert cqa._scale.dtype == torch.float16 66 | -------------------------------------------------------------------------------- /tests/tensor/activations/test_activations_quantize.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
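# The symmetric scheme these tests check, in scalar form (a sketch, not the
# library kernel): with an absmax scale, q = round(x / scale) clamped to the
# integer range, and dequantization is q * scale, so the per-element error is
# at most about half a scale step.

import torch

x = torch.tensor([0.5, -1.0, 0.25])
scale = x.abs().max() / 127  # absmax scale for qint8
q = torch.clamp(torch.round(x / scale), -128, 127).to(torch.int8)
assert torch.allclose(q.float() * scale, x, atol=scale.item())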
14 | 15 | import pytest 16 | import torch 17 | from helpers import assert_similar, device_eq, random_tensor 18 | 19 | from optimum.quanto import ( 20 | ActivationQBytesTensor, 21 | absmax_scale, 22 | qfloat8, 23 | qfloat8_e4m3fn, 24 | qfloat8_e4m3fnuz, 25 | qfloat8_e5m2, 26 | qint8, 27 | ) 28 | 29 | 30 | @pytest.mark.parametrize("input_shape", [(32, 32), (32, 10, 32)]) 31 | @pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=["fp16", "fp32"]) 32 | @pytest.mark.parametrize("qtype", [qint8], ids=["qint8"]) 33 | def test_symmetric_quantize_int(input_shape, dtype, qtype, device): 34 | a = random_tensor(input_shape, dtype=dtype).to(device) 35 | scale = absmax_scale(a, qtype=qtype, axis=None) 36 | qa = ActivationQBytesTensor.quantize(a, qtype, scale) 37 | assert isinstance(qa, ActivationQBytesTensor) 38 | assert qa.dtype == dtype 39 | assert qa.qtype == qtype 40 | assert device_eq(qa.device, device) 41 | assert_similar(a, qa) 42 | 43 | 44 | @pytest.mark.skip_device("mps") 45 | @pytest.mark.skip_device("xpu") 46 | @pytest.mark.parametrize("input_shape", [(32, 32), (32, 10, 32)]) 47 | @pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=["fp16", "fp32"]) 48 | @pytest.mark.parametrize( 49 | "qtype", 50 | [qfloat8, qfloat8_e4m3fn, qfloat8_e4m3fnuz, qfloat8_e5m2], 51 | ids=["qfloat8", "qfloat8_e4m3fn", "qfloat8_e4m3fnuz", "qfloat8_e5m2"], 52 | ) 53 | def test_symmetric_quantize_float8(input_shape, dtype, qtype, device): 54 | a = random_tensor(input_shape, dtype=dtype).to(device) 55 | scale = absmax_scale(a, qtype=qtype, axis=None) 56 | qa = ActivationQBytesTensor.quantize(a, qtype, scale) 57 | assert isinstance(qa, ActivationQBytesTensor) 58 | assert qa.dtype == dtype 59 | assert qa.qtype == qtype 60 | assert device_eq(qa.device, device) 61 | assert_similar(a, qa, atol=5e-3) 62 | -------------------------------------------------------------------------------- /tests/tensor/ops/test_mm_dispatch.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
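# Why the tests below can compare a quantized matmul against the product of
# dequantized inputs: with per-tensor symmetric scales, the scales factor out
# of the accumulation, i.e. (qa * sa) @ (qb * sb) == (qa @ qb) * sa * sb.
# Scalar sketch assuming int32 accumulation:

import torch

qa = torch.randint(-128, 128, (4, 8), dtype=torch.int8)
qb = torch.randint(-128, 128, (8, 3), dtype=torch.int8)
sa, sb = 0.02, 0.05
ref = (qa.float() * sa) @ (qb.float() * sb)
out = (qa.int() @ qb.int()).float() * (sa * sb)
assert torch.allclose(ref, out, atol=1e-4)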
14 | 15 | import pytest 16 | import torch 17 | from helpers import assert_similar, random_qactivation, random_qweight 18 | 19 | from optimum.quanto import qint8 20 | 21 | 22 | @pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=["fp32", "fp16"]) 23 | @pytest.mark.parametrize("in_features", [5, 16, 24]) 24 | @pytest.mark.parametrize("hidden", [5, 16, 24]) 25 | @pytest.mark.parametrize("out_features", [5, 16, 24]) 26 | def test_qactivation_qweight_matmul(dtype, in_features, hidden, out_features, device): 27 | qa = random_qactivation((in_features, hidden), qint8, dtype=dtype).to(device) 28 | qb = random_qweight((hidden, out_features), qint8, dtype=dtype, axis=-1).to(device) 29 | qmatmul = torch.matmul(qa, qb) 30 | # The outputs should be almost identical if we use the dequantized inputs 31 | matmul = torch.matmul(qa.dequantize(), qb.dequantize()) 32 | assert_similar(matmul, qmatmul) 33 | 34 | 35 | @pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=["fp32", "fp16"]) 36 | @pytest.mark.parametrize("batch_size", [1, 10]) 37 | @pytest.mark.parametrize("a_shape, b_shape", [[(16, 32), (32, 24)], [(5, 10), (10, 6)]]) 38 | def test_qactivation_qactivation_bmm(dtype, batch_size, a_shape, b_shape, device): 39 | qa = random_qactivation((batch_size,) + a_shape, qint8, dtype=dtype).to(device) 40 | qb = random_qactivation((batch_size,) + b_shape, qint8, dtype=dtype).to(device) 41 | qbmm = torch.bmm(qa, qb) 42 | # The outputs should be almost identical if we use the dequantized inputs 43 | bmm = torch.bmm(qa.dequantize(), qb.dequantize()) 44 | assert_similar(bmm, qbmm) 45 | -------------------------------------------------------------------------------- /tests/tensor/optimizers/test_hqq_optimizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
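# A sketch of the criterion compared below, not the HQQ algorithm itself: both
# optimizers return an affine (scale, shift) pair, and the test checks that
# HQQ's pair reconstructs the weights with a lower mean absolute error. A
# generic affine round-trip for intuition (quanto's exact shift convention may
# differ); the range defaults mimic qint4 with unsigned storage.

import torch

def affine_roundtrip_error(w, scale, zeropoint, qmin=0, qmax=15):
    q = torch.clamp(torch.round(w / scale) + zeropoint, qmin, qmax)
    return ((q - zeropoint) * scale - w).abs().mean()

w = torch.randn(64)
err = affine_roundtrip_error(w, w.abs().max() / 7.5, zeropoint=8)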
14 | 15 | import pytest 16 | import torch 17 | from helpers import random_tensor 18 | 19 | from optimum.quanto import ( 20 | HqqOptimizer, 21 | MaxOptimizer, 22 | WeightQBitsTensor, 23 | qint2, 24 | qint4, 25 | ) 26 | 27 | 28 | def compare_quantized_tensor(a, qtype, axis, group_size, scale, shift): 29 | qa = WeightQBitsTensor.quantize(a, qtype, axis, group_size, scale, shift) 30 | # Evaluate mean absolute error 31 | mean_error = torch.mean(torch.abs(a - qa)) 32 | # Also evaluate cosine similarity 33 | sim = torch.nn.functional.cosine_similarity(a.flatten(), qa.flatten(), dim=0) 34 | return mean_error, sim 35 | 36 | 37 | @pytest.mark.parametrize("input_shape", [(1024, 1024)]) 38 | @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16], ids=["bf16", "fp16"]) 39 | @pytest.mark.parametrize("qtype", [qint2, qint4], ids=["qint2", "qint4"]) 40 | @pytest.mark.parametrize("axis", [0, -1], ids=["first-axis", "last-axis"]) 41 | @pytest.mark.parametrize("group_size", [32, 64, 128]) 42 | def test_hqq_optimizer(input_shape, dtype, qtype, axis, group_size, device): 43 | a = random_tensor(input_shape, dtype=dtype).to(device) 44 | max_scale, max_shift = MaxOptimizer()(a, qtype=qtype, axis=axis, group_size=group_size) 45 | max_mean_error, max_sim = compare_quantized_tensor(a, qtype, axis, group_size, max_scale, max_shift) 46 | hqq_scale, hqq_shift = HqqOptimizer()(a, qtype=qtype, axis=axis, group_size=group_size) 47 | hqq_mean_error, hqq_sim = compare_quantized_tensor(a, qtype, axis, group_size, hqq_scale, hqq_shift) 48 | # HQQ optimizes the mean error, so it should be lower 49 | assert hqq_mean_error <= max_mean_error 50 | # FIXME: HQQ cosine similarity should also be closer to 1 51 | -------------------------------------------------------------------------------- /tests/tensor/test_absmax.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
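# What absmax_scale computes for the axes tested below (a sketch assuming
# qint8, whose largest representable value is 127): per-tensor uses the global
# absolute maximum, while per-axis keeps one scale per slice, with dimensions
# kept so the scale broadcasts against the input.

import torch

a = torch.randn(4, 8)
per_tensor = a.abs().max() / 127                       # scalar, ndim == 0
first_axis = a.abs().amax(dim=-1, keepdim=True) / 127  # (4, 1): one scale per row of a 2-D tensor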
14 | 15 | import pytest 16 | import torch 17 | from helpers import random_tensor 18 | 19 | from optimum.quanto import absmax_scale, qfloat8, qint8 20 | 21 | 22 | @pytest.mark.parametrize("input_shape", [(10,), (1, 10), (2, 10), (10, 32, 32)]) 23 | @pytest.mark.parametrize("qtype", [qint8, qfloat8], ids=["qint8", "qfloat8"]) 24 | @pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=["fp16", "fp32"]) 25 | @pytest.mark.parametrize("axis", [None, 0, -1], ids=["per-tensor", "first-axis", "last-axis"]) 26 | def test_absmax_scale(input_shape, axis, dtype, qtype, device): 27 | if device.type == "mps" and qtype.is_floating_point: 28 | pytest.skip("Float8 types are not supported on MPS devices") 29 | a = random_tensor(input_shape, dtype=dtype).to(device) 30 | scale = absmax_scale(a, qtype, axis) 31 | assert scale.dtype == dtype 32 | if axis is None: 33 | assert scale.ndim == 0 34 | else: 35 | assert scale.ndim == a.ndim 36 | sscale = torch.squeeze(scale) 37 | if a.ndim == 1 or a.shape[axis] == 1: 38 | # Quantization is actually per-tensor as the axis dim is 1 39 | assert sscale.ndim == 0 40 | else: 41 | assert sscale.ndim == 1 42 | -------------------------------------------------------------------------------- /tests/tensor/test_packed_tensor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
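# The storage saving exercised below: 4-bit packing stores two values per
# uint8 byte and 2-bit packing stores four, so the packed buffer holds
# numel // (8 // bits) bytes. A sketch, assuming the packed storage is exposed
# as `_data` (as the serialization test below relies on) and that these even
# shapes need no padding.

import torch
from optimum.quanto.tensor.packed import PackedTensor

t = torch.randint(0, 16, (32, 32), dtype=torch.uint8)
packed = PackedTensor.pack(t, bits=4)
assert packed._data.numel() == t.numel() // 2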
14 | 15 | import io 16 | 17 | import pytest 18 | import torch 19 | from helpers import device_eq 20 | 21 | from optimum.quanto.tensor.packed import PackedTensor 22 | 23 | 24 | @pytest.mark.parametrize("shape", [(10,), (12,), (10, 10), (12, 10), (32, 32)]) 25 | @pytest.mark.parametrize("bits", [2, 4], ids=["int2", "int4"]) 26 | def test_pack_tensor(shape, bits, device): 27 | """This test verifies that an integer tensor in the correct range is preserved.""" 28 | qmax = 2**bits 29 | t = torch.randint(0, qmax, shape, dtype=torch.uint8).to(device) 30 | packed = PackedTensor.pack(t, bits=bits) 31 | 32 | assert isinstance(packed, PackedTensor) 33 | assert packed.dtype == torch.uint8 34 | assert device_eq(packed.device, device) 35 | assert torch.equal(t, packed.unpack()) 36 | 37 | 38 | @pytest.mark.parametrize("bits", [2, 4], ids=["int2", "int4"]) 39 | def test_packed_tensor_serialization(bits, device): 40 | qmax = 2**bits 41 | shape = (10, 32) 42 | t = torch.randint(0, qmax, shape, dtype=torch.uint8).to(device) 43 | packed = PackedTensor.pack(t, bits=bits) 44 | b = io.BytesIO() 45 | torch.save(packed, b) 46 | b.seek(0) 47 | packed_reloaded = torch.load(b, weights_only=False) 48 | assert isinstance(packed_reloaded, PackedTensor) 49 | assert packed_reloaded.shape == packed.shape 50 | assert packed_reloaded.dtype == packed.dtype 51 | assert packed_reloaded.bits == packed.bits 52 | assert torch.equal(packed_reloaded._data, packed._data) 53 | assert torch.equal(t, packed_reloaded.unpack()) 54 | -------------------------------------------------------------------------------- /tests/tensor/weights/optimized/test_awq_packed_tensor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
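# Illustration only: AWQ kernels store eight 4-bit values per int32 word in a
# kernel-specific interleaved order (controlled by the `reorder` flag below);
# the exact permutation lives in the CUDA/XPU extension. Generic,
# non-interleaved nibble packing for intuition:

import torch

t = torch.randint(0, 16, (4, 8), dtype=torch.int32)
packed = torch.zeros(4, dtype=torch.int32)
for i in range(8):
    packed |= t[:, i] << (4 * i)  # one int32 word per row of eight nibbles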
14 | 15 | 16 | import numpy as np 17 | import pytest 18 | import torch 19 | from helpers import device_eq 20 | 21 | from optimum.quanto.tensor.weights.awq import AWQPackedTensor, AWQPacking 22 | 23 | 24 | @pytest.mark.skip_device("cpu") 25 | @pytest.mark.skip_device("mps") 26 | @pytest.mark.parametrize("in_features", [128, 256, 512, 1024]) 27 | @pytest.mark.parametrize("out_features", [128, 256, 512, 1024]) 28 | @pytest.mark.parametrize("random", [True, False]) 29 | @pytest.mark.parametrize("packing, reorder", [(AWQPacking.V1, True), (AWQPacking.V1, False), (AWQPacking.V2, False)]) 30 | def test_pack_awq_tensor(in_features, out_features, random, packing, reorder, device): 31 | bits = 4 32 | qmax = 2**bits 33 | shape = (out_features, in_features) 34 | if random: 35 | t = torch.randint(0, qmax, shape, dtype=torch.uint8).to(device) 36 | else: 37 | numel = np.prod(shape) 38 | t = torch.tensor(range(numel), dtype=torch.int32) 39 | t = (t % qmax).reshape(shape).to(torch.uint8).to(device) 40 | packed = AWQPackedTensor.pack(t, packing=packing, reorder=reorder) 41 | assert isinstance(packed, AWQPackedTensor) 42 | assert packed._packing == packing 43 | assert packed._reorder == reorder 44 | assert device_eq(packed.device, device) 45 | assert torch.equal(t, packed.unpack()) 46 | 47 | 48 | @pytest.mark.skip_device("cpu") 49 | @pytest.mark.skip_device("mps") 50 | @pytest.mark.parametrize("packing, reorder", [(AWQPacking.V1, True), (AWQPacking.V2, False)]) 51 | def test_move_awq_tensor(packing, reorder, device): 52 | shape = (256, 256) 53 | bits = 4 54 | qmax = 2**bits 55 | numel = np.prod(shape) 56 | t = torch.tensor(range(numel), dtype=torch.int32) 57 | t = (t % qmax).reshape(shape).to(torch.uint8).to(device) 58 | packed = AWQPackedTensor.pack(t, packing=packing, reorder=reorder) 59 | assert packed._packing == packing 60 | assert packed._reorder == reorder 61 | moved = packed.to(device) 62 | assert isinstance(moved, AWQPackedTensor) 63 | assert moved._packing == packing 64 | assert moved._reorder == reorder 65 | # AWQ packed tensors are unpacked when moved out of CUDA or XPU devices 66 | moved = packed.to("cpu") 67 | assert type(moved) is torch.Tensor 68 | -------------------------------------------------------------------------------- /tests/tensor/weights/optimized/test_marlin_fp8_packed_tensor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
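# Why get_fp8_tensor below zeroes the values 127 and 255: in float8_e4m3fn
# those bit patterns (0x7F and 0xFF) encode NaN, and NaN != NaN would break
# the torch.equal round-trip checks. A quick verification:

import torch

nan_bits = torch.tensor([0x7F, 0xFF], dtype=torch.uint8)
assert nan_bits.view(torch.float8_e4m3fn).float().isnan().all()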
14 | 15 | 16 | import numpy as np 17 | import pytest 18 | import torch 19 | from helpers import device_eq 20 | 21 | from optimum.quanto.library.extensions import is_extension_available 22 | from optimum.quanto.tensor.weights.marlin.fp8 import MarlinF8PackedTensor 23 | 24 | 25 | def get_fp8_tensor(shape, device, random=False): 26 | # We will initialize float8 from a uint8 tensor 27 | qmax = 2**8 28 | if random: 29 | t = torch.randint(0, qmax, shape, dtype=torch.uint8).to(device) 30 | else: 31 | numel = np.prod(shape) 32 | t = torch.tensor(range(numel), dtype=torch.int32) 33 | t = (t % qmax).reshape(shape).to(torch.uint8).to(device) 34 | # Remove values that would be interpreted as nans in float8. 35 | t[t == 127] = 0 36 | t[t == 255] = 0 37 | return t.view(torch.float8_e4m3fn).to(device) 38 | 39 | 40 | @pytest.mark.skipif(not is_extension_available("quanto_cuda"), reason="CUDA extension is not available") 41 | @pytest.mark.parametrize("in_features", [128, 256, 512, 1024]) 42 | @pytest.mark.parametrize("out_features", [128, 256, 512, 1024]) 43 | @pytest.mark.parametrize("random", [True, False]) 44 | def test_pack_marlin_fp8_tensor(in_features, out_features, random): 45 | shape = (out_features, in_features) 46 | device = torch.device("cuda") 47 | t = get_fp8_tensor(shape, device, random) 48 | packed = MarlinF8PackedTensor.pack(t) 49 | assert isinstance(packed, MarlinF8PackedTensor) 50 | assert device_eq(packed.device, device) 51 | assert torch.equal(t, packed.unpack()) 52 | 53 | 54 | @pytest.mark.skipif(not is_extension_available("quanto_cuda"), reason="CUDA extension is not available") 55 | def test_move_marlin_fp8_tensor(): 56 | shape = (256, 256) 57 | device = torch.device("cuda") 58 | t = get_fp8_tensor(shape, device) 59 | packed = MarlinF8PackedTensor.pack(t) 60 | moved = packed.to("cuda") 61 | assert isinstance(moved, MarlinF8PackedTensor) 62 | # Marlin FP8 tensors are unpacked when moved out of CUDA device 63 | moved = packed.to("cpu") 64 | assert type(moved) is torch.Tensor 65 | assert torch.equal(t, moved.to("cuda")) 66 | -------------------------------------------------------------------------------- /tests/tensor/weights/optimized/test_marlin_int4_packed_tensor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | import numpy as np 17 | import pytest 18 | import torch 19 | from helpers import device_eq 20 | 21 | from optimum.quanto.tensor.weights.marlin.int4 import MarlinInt4PackedTensor 22 | 23 | 24 | def get_uint4_tensor(shape, device, random=False): 25 | qmax = 2**4 26 | if random: 27 | t = torch.randint(0, qmax, shape, dtype=torch.uint8).to(device) 28 | else: 29 | numel = np.prod(shape) 30 | t = torch.tensor(range(numel), dtype=torch.int32) 31 | t = (t % qmax).reshape(shape).to(torch.uint8).to(device) 32 | return t 33 | 34 | 35 | @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") 36 | @pytest.mark.parametrize("in_features", [128, 256, 512, 1024]) 37 | @pytest.mark.parametrize("out_features", [128, 256, 512, 1024]) 38 | @pytest.mark.parametrize("random", [True, False]) 39 | def test_pack_marlin_int4_tensor(in_features, out_features, random): 40 | shape = (out_features, in_features) 41 | device = torch.device("cuda") 42 | t = get_uint4_tensor(shape, device, random) 43 | packed = MarlinInt4PackedTensor.pack(t) 44 | assert isinstance(packed, MarlinInt4PackedTensor) 45 | assert device_eq(packed.device, device) 46 | assert torch.equal(t, packed.unpack()) 47 | 48 | 49 | @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") 50 | def test_move_marlin_int4_packed_tensor(): 51 | shape = (256, 256) 52 | device = torch.device("cuda") 53 | t = get_uint4_tensor(shape, device) 54 | packed = MarlinInt4PackedTensor.pack(t) 55 | moved = packed.to("cuda") 56 | assert isinstance(moved, MarlinInt4PackedTensor) 57 | # Marlin int4 tensors are unpacked when moved out of CUDA device 58 | moved = packed.to("cpu") 59 | assert type(moved) is torch.Tensor 60 | assert torch.equal(t, moved.to("cuda")) 61 | -------------------------------------------------------------------------------- /tests/tensor/weights/optimized/test_marlin_qbytes_tensor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import pytest 16 | import torch 17 | 18 | from optimum.quanto import qfloat8_e4m3fn 19 | from optimum.quanto.library.extensions import is_extension_available 20 | from optimum.quanto.tensor.weights.marlin import MarlinF8QBytesTensor 21 | 22 | 23 | @pytest.mark.skipif( 24 | not is_extension_available("quanto_cuda") or torch.cuda.get_device_capability()[0] < 8, 25 | reason="CUDA >= sm80 not available", 26 | ) 27 | @pytest.mark.parametrize("in_features", [16, 32, 48, 64]) 28 | @pytest.mark.parametrize("out_features", [64, 128, 192, 256]) 29 | def test_pack_unpack(in_features: int, out_features: int): 30 | data = torch.randint(0, 256, size=(out_features, in_features), dtype=torch.uint8, device="cuda") 31 | 32 | # Remove nans. 
33 | data[data == 127] = 0 34 | data[data == 255] = 0 35 | 36 | data = data.view(torch.float8_e4m3fn) 37 | 38 | qtype = qfloat8_e4m3fn 39 | axis = 0 40 | size = data.shape 41 | stride = data.stride() 42 | scale = torch.rand((out_features, 1), dtype=torch.float16, device="cuda") 43 | marlin_tensor = MarlinF8QBytesTensor(qtype, axis, size, stride, data, scale) 44 | 45 | data_dequantized = marlin_tensor.dequantize() 46 | 47 | assert torch.all((data.to(torch.float16) * scale - data_dequantized).abs() < 1e-4) 48 | -------------------------------------------------------------------------------- /tests/tensor/weights/optimized/test_tinygemm_packed_tensor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import numpy as np 17 | import pytest 18 | import torch 19 | from helpers import device_eq 20 | from packaging import version 21 | 22 | from optimum.quanto.tensor.weights.tinygemm import TinyGemmPackedTensor 23 | 24 | 25 | @pytest.mark.skip_device("mps") # Only available with pytorch 2.4 26 | @pytest.mark.parametrize("in_features", [128, 256, 512, 1024]) 27 | @pytest.mark.parametrize("out_features", [128, 256, 512, 1024]) 28 | @pytest.mark.parametrize("random", [True, False]) 29 | def test_pack_tinygemm_tensor(in_features, out_features, random, device): 30 | if device.type == "cuda": 31 | if torch.version.hip: 32 | pytest.skip(reason="TinyGemm is not supported on ROCm devices") 33 | if version.parse(torch.version.cuda).release < (12, 1): 34 | pytest.skip(reason="CUDA runtime must be at least 12.1") 35 | if torch.cuda.get_device_capability()[0] < 8: 36 | pytest.skip(reason="CUDA device >= sm80 not available") 37 | bits = 4 38 | qmax = 2**bits 39 | shape = (out_features, in_features) 40 | if random: 41 | t = torch.randint(0, qmax, shape, dtype=torch.uint8).to(device) 42 | else: 43 | numel = np.prod(shape) 44 | t = torch.tensor(range(numel), dtype=torch.int32) 45 | t = (t % qmax).reshape(shape).to(torch.uint8).to(device) 46 | packed = TinyGemmPackedTensor.pack(t) 47 | assert isinstance(packed, TinyGemmPackedTensor) 48 | assert device_eq(packed.device, device) 49 | assert torch.equal(t, packed.unpack()) 50 | 51 | 52 | @pytest.mark.skip_device("mps") # Only available with pytorch 2.4 53 | def test_move_tinygemm_packed_tensor(device): 54 | if device.type == "cuda": 55 | if torch.version.hip: 56 | pytest.skip(reason="TinyGemm is not supported on ROCm devices") 57 | if version.parse(torch.version.cuda).release < (12, 1): 58 | pytest.skip(reason="CUDA runtime must be at least 12.1") 59 | if torch.cuda.get_device_capability()[0] < 8: 60 | pytest.skip(reason="CUDA device >= sm80 not available") 61 | shape = (256, 256) 62 | bits = 4 63 | qmax = 2**bits 64 | numel = np.prod(shape) 65 | t = torch.tensor(range(numel), dtype=torch.int32) 66 | t = (t % qmax).reshape(shape).to(torch.uint8) 67 | packed = 
TinyGemmPackedTensor.pack(t) 68 | moved = packed.to(device) 69 | assert torch.equal(t.to(device), moved.unpack()) 70 | -------------------------------------------------------------------------------- /tests/tensor/weights/test_weight_qbits_tensor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import io 16 | 17 | import pytest 18 | import torch 19 | from helpers import random_qweight, random_tensor 20 | 21 | from optimum.quanto import MaxOptimizer, WeightQBitsTensor, qint2, qint4, quantize_weight 22 | 23 | 24 | @pytest.mark.parametrize("qtype", [qint2, qint4], ids=["int2", "int4"]) 25 | @pytest.mark.parametrize("axis", [0, -1], ids=["first-axis", "last-axis"]) 26 | def test_weight_qbits_tensor_serialization(qtype, axis): 27 | qa = random_qweight((5, 5), qtype=qtype, axis=axis) 28 | b = io.BytesIO() 29 | torch.save(qa, b) 30 | b.seek(0) 31 | qa_reloaded = torch.load(b, weights_only=False) 32 | assert isinstance(qa_reloaded, WeightQBitsTensor) 33 | assert qa_reloaded.qtype == qa.qtype 34 | assert qa_reloaded.dtype == qa.dtype 35 | assert torch.equal(qa_reloaded._data, qa._data) 36 | assert torch.equal(qa_reloaded._scale, qa._scale) 37 | assert torch.equal(qa_reloaded._shift, qa._shift) 38 | 39 | 40 | @pytest.mark.parametrize("qtype", [qint2, qint4], ids=["int2", "int4"]) 41 | @pytest.mark.parametrize("axis", [0, -1], ids=["first-axis", "last-axis"]) 42 | @pytest.mark.parametrize("group_size", [None, 16], ids=["channel-wise", "group-wise"]) 43 | def test_weight_qbits_tensor_requires_grad(qtype, axis, group_size, device): 44 | weight = random_tensor((32, 32), dtype=torch.float32).to(device) 45 | weight.requires_grad = True 46 | scale, shift = MaxOptimizer()(weight, qtype=qtype, axis=axis, group_size=group_size) 47 | qweight = quantize_weight(weight, qtype=qtype, axis=axis, scale=scale, shift=shift, group_size=group_size) 48 | assert qweight.requires_grad is True 49 | 50 | 51 | @pytest.mark.parametrize("qtype", [qint2, qint4], ids=["int2", "int4"]) 52 | @pytest.mark.parametrize("axis", [0, -1], ids=["first-axis", "last-axis"]) 53 | @pytest.mark.parametrize("group_size", [None, 16], ids=["channel-wise", "group-wise"]) 54 | def test_weight_qbits_tensor_backward(qtype, axis, group_size, device): 55 | weight = random_tensor((32, 32), dtype=torch.float32).to(device) 56 | weight.requires_grad = True 57 | scale, shift = MaxOptimizer()(weight, qtype=qtype, axis=axis, group_size=group_size) 58 | qweight = quantize_weight(weight, qtype=qtype, axis=axis, scale=scale, shift=shift, group_size=group_size) 59 | gradient = torch.randn((32, 32)).to(device) 60 | # Backpropagate gradient to the inner float weights 61 | qweight.dequantize().backward(gradient) 62 | assert torch.equal(weight.grad, gradient) 63 | -------------------------------------------------------------------------------- 
/tests/tensor/weights/test_weight_qbits_tensor_instantiate.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import pytest 17 | import torch 18 | 19 | from optimum.quanto import qint2, qint4 20 | from optimum.quanto.tensor.weights import WeightQBitsTensor 21 | 22 | 23 | def random_data_scale_shift(input_shape, dtype, qtype, axis, group_size): 24 | out_features, in_features = input_shape 25 | n_groups = in_features * out_features // group_size 26 | data_shape = (n_groups, group_size) if axis == 0 else (group_size, n_groups) 27 | scale_shape = (n_groups, 1) if axis == 0 else (1, n_groups) 28 | min_value = -(2 ** (qtype.bits - 1)) 29 | max_value = 2 ** (qtype.bits - 1) - 1 30 | data = torch.randint(max_value - min_value + 1, data_shape, dtype=torch.uint8) 31 | scale = torch.full(scale_shape, 1.0 / -min_value, dtype=dtype) 32 | shift = torch.ones(scale_shape, dtype=dtype) 33 | return data, scale, shift 34 | 35 | 36 | @pytest.mark.parametrize("input_shape, group_size", [[(32, 32), 16], [(1024, 1024), 128]]) 37 | @pytest.mark.parametrize("axis", [0, -1], ids=["first-axis", "last-axis"]) 38 | @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32], ids=["bf16", "fp16", "fp32"]) 39 | @pytest.mark.parametrize("qtype", [qint2, qint4], ids=["qint2", "qint4"]) 40 | def test_weight_qbits_tensor_instantiate(input_shape, dtype, qtype, axis, group_size, device): 41 | data, scale, shift = random_data_scale_shift(input_shape, dtype, qtype, axis, group_size) 42 | input_stride = torch.ones(input_shape).stride() 43 | qa = WeightQBitsTensor(qtype, axis, group_size, input_shape, input_stride, data, scale=scale, shift=shift).to( 44 | device 45 | ) 46 | assert torch.max(torch.abs(qa.dequantize())) <= 1 47 | assert qa.dtype == dtype 48 | assert qa.qtype == qtype 49 | assert qa.shape == input_shape 50 | 51 | 52 | @pytest.mark.parametrize("input_shape, group_size", [[(32, 32), 16], [(1024, 1024), 128]]) 53 | @pytest.mark.parametrize("axis", [0, -1], ids=["first-axis", "last-axis"]) 54 | @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32], ids=["bf16", "fp16", "fp32"]) 55 | @pytest.mark.parametrize("qtype", [qint2, qint4], ids=["qint2", "qint4"]) 56 | def test_weight_qbits_tensor_equal(input_shape, dtype, qtype, axis, group_size, device): 57 | data, scale, shift = random_data_scale_shift(input_shape, dtype, qtype, axis, group_size) 58 | qa = WeightQBitsTensor(qtype, axis, group_size, data.size(), data.stride(), data, scale=scale, shift=shift).to( 59 | device 60 | ) 61 | qb = WeightQBitsTensor( 62 | qtype, axis, group_size, data.size(), data.stride(), data.clone(), scale=scale.clone(), shift=shift.clone() 63 | ).to(device) 64 | assert qa.equal(qb) 65 | -------------------------------------------------------------------------------- 
/tests/tensor/weights/test_weight_qbits_tensor_quantize.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import pytest 16 | import torch 17 | from helpers import assert_similar, device_eq, random_tensor 18 | 19 | from optimum.quanto import ( 20 | MaxOptimizer, 21 | qint2, 22 | qint4, 23 | ) 24 | from optimum.quanto.tensor.weights import WeightQBitsTensor 25 | 26 | 27 | @pytest.mark.parametrize("input_shape", [(32, 32), (32, 10, 32)]) 28 | @pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=["fp16", "fp32"]) 29 | @pytest.mark.parametrize("qtype", [qint2, qint4], ids=["qint2", "qint4"]) 30 | @pytest.mark.parametrize("axis", [0, -1], ids=["first-axis", "last-axis"]) 31 | @pytest.mark.parametrize("group_size", [None, 8], ids=["channel-wise", "group-wise"]) 32 | @pytest.mark.parametrize("shift_mode", ["zeropoint", "float"]) 33 | def test_weight_qbits_tensor_quantize(input_shape, dtype, qtype, axis, group_size, shift_mode, device): 34 | a = random_tensor(input_shape, dtype=dtype).to(device) 35 | scale, shift = MaxOptimizer()(a, qtype=qtype, axis=axis, group_size=group_size) 36 | if shift_mode == "zeropoint": 37 | shift = torch.round(shift / scale).to(torch.int8) 38 | qa = WeightQBitsTensor.quantize(a, qtype, axis, group_size, scale, shift) 39 | assert isinstance(qa, WeightQBitsTensor) 40 | assert qa.dtype == dtype 41 | assert qa.qtype == qtype 42 | assert device_eq(qa.device, device) 43 | atol = { 44 | qint4: { 45 | "zeropoint": 4e-3, 46 | "float": 3e-3, 47 | }, 48 | qint2: { 49 | "zeropoint": 6e-2, 50 | "float": 5e-2, 51 | }, 52 | }[qtype][shift_mode] 53 | assert_similar(a, qa, atol=atol) 54 | 55 | 56 | @pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=["fp16", "fp32"]) 57 | @pytest.mark.parametrize("qtype", [qint2, qint4], ids=["qint2", "qint4"]) 58 | def test_weight_qbits_tensor_quantize_integer_tensor(dtype, qtype, device): 59 | """This test verifies that an integer tensor in the correct range is preserved.""" 60 | bits = qtype.bits 61 | qmin = -(2 ** (bits - 1)) 62 | qmax = 2 ** (bits - 1) - 1 63 | a = torch.tensor(range(qmin, qmax + 1), dtype=dtype).to(device) 64 | scale, shift = MaxOptimizer()(a, qtype=qtype, axis=0, group_size=None) 65 | zeropoint = torch.round(shift / scale) 66 | qa = WeightQBitsTensor.quantize(a, qtype, 0, None, scale, zeropoint) 67 | 68 | assert qa._data.dtype == torch.uint8 69 | assert isinstance(qa, WeightQBitsTensor) 70 | assert qa.dtype == dtype 71 | assert qa.qtype == qtype 72 | assert device_eq(qa.device, device) 73 | assert torch.equal(a, qa.dequantize()) 74 | -------------------------------------------------------------------------------- /tests/tensor/weights/test_weight_qbytes_tensor_backward.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. 
All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import torch 17 | from helpers import random_tensor 18 | 19 | from optimum.quanto import AbsmaxOptimizer, qint8, quantize_weight 20 | 21 | 22 | def test_weight_qbytes_tensor_requires_grad(device): 23 | w = random_tensor((10, 10), dtype=torch.float32).to(device) 24 | w.requires_grad = True 25 | scale = AbsmaxOptimizer()(w, qtype=qint8, axis=0) 26 | qw = quantize_weight(w, qtype=qint8, axis=0, scale=scale) 27 | assert qw.requires_grad is True 28 | 29 | 30 | def test_weight_qbytes_tensor_backward(device): 31 | w = random_tensor((10, 10), dtype=torch.float32).to(device) 32 | w.requires_grad = True 33 | scale = AbsmaxOptimizer()(w, qtype=qint8, axis=0) 34 | qw = quantize_weight(w, qtype=qint8, axis=0, scale=scale) 35 | gradient = torch.randn((10, 10)).to(device) 36 | # Backpropagate gradient to the inner float weights 37 | qw.dequantize().backward(gradient) 38 | assert torch.equal(w.grad, gradient) 39 | 40 | 41 | def test_weight_qbytes_tensor_chained_backward(device): 42 | a = random_tensor((10, 10), dtype=torch.float32).to(device) 43 | a.requires_grad = True 44 | scale = AbsmaxOptimizer()(a, qtype=qint8, axis=0) 45 | qa = quantize_weight(a, qtype=qint8, axis=0, scale=scale) 46 | b = random_tensor((10, 10), dtype=torch.float32).to(device) 47 | b.requires_grad = True 48 | scale = AbsmaxOptimizer()(b, qtype=qint8, axis=0) 49 | qb = quantize_weight(b, qtype=qint8, axis=0, scale=scale) 50 | # Evaluate the product 51 | prod = qa * qb 52 | # Backpropagate 53 | gradient = torch.randn((10, 10)).to(device) 54 | prod.backward(gradient) 55 | assert torch.allclose(a.grad, qb.dequantize() * gradient) 56 | assert torch.allclose(b.grad, qa.dequantize() * gradient) 57 | -------------------------------------------------------------------------------- /tests/tensor/weights/test_weight_qbytes_tensor_dispatch.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from helpers import random_qweight, random_tensor 4 | 5 | from optimum.quanto import AbsmaxOptimizer, WeightQBytesTensor, qint8, quantize_weight 6 | 7 | 8 | def test_weight_qbytes_tensor_to_device(device): 9 | qa = random_qweight((32, 32), qtype=qint8, dtype=torch.float) 10 | qa = qa.to(device) 11 | assert isinstance(qa, WeightQBytesTensor) 12 | assert qa.device.type == device.type 13 | assert qa._data.device.type == device.type 14 | assert qa._scale.device.type == device.type 15 | 16 | 17 | @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32], ids=["bf16", "fp16", "fp32"]) 18 | @pytest.mark.parametrize("qtype", [qint8]) 19 | @pytest.mark.parametrize("axis", [0, -1], ids=["first-axis", "last-axis"]) 20 | def test_weight_qbytes_tensor_equal(dtype, qtype, axis, device): 21 | a = random_tensor((32, 32), dtype=dtype, device=device) 22 | scale = AbsmaxOptimizer()(a, qtype=qtype, axis=axis) 23 | qa1 = quantize_weight(a, qtype=qtype, axis=axis, 
scale=scale) 24 | qa2 = quantize_weight(a, qtype=qtype, axis=axis, scale=scale) 25 | assert torch.equal(qa1, qa2) 26 | 27 | 28 | @pytest.mark.parametrize("axis", [0, -1], ids=["first-axis", "last-axis"]) 29 | @pytest.mark.parametrize("qtype", [qint8]) 30 | def test_weight_qbytes_tensor_transpose_contiguous(axis, qtype, device): 31 | input_shape = (16, 32) 32 | qa = random_qweight(input_shape, axis=axis, qtype=qtype, dtype=torch.float32).to(device) 33 | assert qa.is_contiguous() 34 | tqa = qa.t() 35 | assert isinstance(tqa, WeightQBytesTensor) 36 | assert not tqa.is_contiguous() 37 | tqa = tqa.contiguous() 38 | assert tqa.is_contiguous() 39 | 40 | 41 | @pytest.mark.parametrize("axis", [0, -1], ids=["first-axis", "last-axis"]) 42 | @pytest.mark.parametrize("qtype", [qint8]) 43 | def test_weight_qbytes_tensor_transposed_stride(axis, qtype, device): 44 | input_shape = (16, 32) 45 | a = random_tensor(input_shape, dtype=torch.float32).to(device) 46 | scale = AbsmaxOptimizer()(a, qtype=qtype, axis=axis) 47 | qa = quantize_weight(a, qtype=qtype, axis=axis, scale=scale) 48 | assert qa.stride() == a.stride() 49 | ta = a.t() 50 | tqa = qa.t() 51 | assert isinstance(tqa, WeightQBytesTensor) 52 | assert tqa.stride() == ta.stride() 53 | -------------------------------------------------------------------------------- /tests/tensor/weights/test_weight_qbytes_tensor_instantiate.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
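# The invariant test_qbytestensor_instantiate checks below, shown with plain
# tensors: when |data| is bounded by the qtype's maximum and scale is its
# reciprocal, dequantization (data * scale) stays within [-1, 1].

import torch

data = torch.randint(-127, 128, (10,), dtype=torch.int8)
scale = torch.tensor(1.0 / 127)
assert (data.float() * scale).abs().max() <= 1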
14 | 15 | 16 | import pytest 17 | import torch 18 | 19 | from optimum.quanto import WeightQBytesTensor, qfloat8, qint8 20 | 21 | 22 | def random_data_scale(input_shape, dtype, qtype): 23 | if qtype.is_floating_point: 24 | min_value = torch.finfo(qtype.dtype).min 25 | max_value = torch.finfo(qtype.dtype).max 26 | data = (torch.rand(input_shape) * (max_value - min_value) + min_value).to(qtype.dtype) 27 | else: 28 | max_value = torch.iinfo(qtype.dtype).max 29 | data = torch.randint(-max_value, max_value, input_shape, dtype=qtype.dtype) 30 | scale = torch.tensor(1.0 / max_value, dtype=dtype) 31 | return data, scale 32 | 33 | 34 | @pytest.mark.parametrize("input_shape", [(10,), (1, 10), (10, 32, 32)]) 35 | @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32], ids=["bf16", "fp16", "fp32"]) 36 | @pytest.mark.parametrize("qtype", [qint8, qfloat8], ids=["qint8", "qfloat8"]) 37 | def test_qbytestensor_instantiate(input_shape, dtype, qtype, device): 38 | if qtype.is_floating_point and device.type == "mps": 39 | pytest.skip("float8 types are not supported on MPS device") 40 | data, scale = random_data_scale(input_shape, dtype, qtype) 41 | qa = WeightQBytesTensor(qtype, None, data.size(), data.stride(), data, scale=scale, activation_qtype=None).to( 42 | device 43 | ) 44 | assert torch.max(torch.abs(qa.dequantize())) <= 1 45 | assert qa.dtype == dtype 46 | assert qa.qtype == qtype 47 | assert qa.shape == input_shape 48 | 49 | 50 | @pytest.mark.parametrize("input_shape", [(10,), (1, 10), (10, 32, 32)]) 51 | @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32], ids=["bf16", "fp16", "fp32"]) 52 | @pytest.mark.parametrize("qtype", [qint8], ids=["qint8"]) 53 | def test_qbytestensor_equal(input_shape, dtype, qtype, device): 54 | data, scale = random_data_scale(input_shape, dtype, qtype) 55 | qa = WeightQBytesTensor(qtype, None, data.size(), data.stride(), data, scale=scale, activation_qtype=None).to( 56 | device 57 | ) 58 | qb = WeightQBytesTensor( 59 | qtype, None, data.size(), data.stride(), data.clone(), scale=scale, activation_qtype=None 60 | ).to(device) 61 | assert qa.equal(qb) 62 | -------------------------------------------------------------------------------- /tests/tensor/weights/test_weight_qbytes_tensor_quantize.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
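# For the float8 variants below, absmax scaling divides by the dtype's largest
# finite value rather than an integer qmax (448 for e4m3fn, 57344 for e5m2).
# A per-tensor sketch, not the library kernel:

import torch

x = torch.randn(32, 32)
scale = x.abs().max() / torch.finfo(torch.float8_e4m3fn).max
qx = (x / scale).to(torch.float8_e4m3fn)  # quantized storage
x_hat = qx.float() * scale                # dequantized values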
14 | 15 | import pytest 16 | import torch 17 | from helpers import assert_similar, device_eq, random_qweight, random_tensor 18 | 19 | from optimum.quanto import ( 20 | WeightQBytesTensor, 21 | absmax_scale, 22 | qfloat8, 23 | qfloat8_e4m3fn, 24 | qfloat8_e4m3fnuz, 25 | qfloat8_e5m2, 26 | qint8, 27 | ) 28 | 29 | 30 | @pytest.mark.parametrize("input_shape", [(32, 32), (32, 10, 32)]) 31 | @pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=["fp16", "fp32"]) 32 | @pytest.mark.parametrize("qtype", [qint8], ids=["qint8"]) 33 | @pytest.mark.parametrize( 34 | "axis", 35 | [None, 0, -1], 36 | ids=["per-tensor", "first-axis", "last-axis"], 37 | ) 38 | def test_symmetric_quantize_int(input_shape, dtype, qtype, axis, device): 39 | a = random_tensor(input_shape, dtype=dtype).to(device) 40 | scale = absmax_scale(a, qtype=qtype, axis=axis) 41 | qa = WeightQBytesTensor.quantize(a, qtype, axis, scale) 42 | assert isinstance(qa, WeightQBytesTensor) 43 | assert qa.dtype == dtype 44 | assert qa.qtype == qtype 45 | assert device_eq(qa.device, device) 46 | assert_similar(a, qa) 47 | 48 | 49 | @pytest.mark.skip_device("mps") 50 | @pytest.mark.skip_device("xpu") 51 | @pytest.mark.parametrize("input_shape", [(32, 32), (32, 10, 32)]) 52 | @pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=["fp16", "fp32"]) 53 | @pytest.mark.parametrize( 54 | "qtype", 55 | [qfloat8, qfloat8_e4m3fn, qfloat8_e4m3fnuz, qfloat8_e5m2], 56 | ids=["qfloat8", "qfloat8_e4m3fn", "qfloat8_e4m3fnuz", "qfloat8_e5m2"], 57 | ) 58 | @pytest.mark.parametrize( 59 | "axis", 60 | [None, 0, -1], 61 | ids=["per-tensor", "first-axis", "last-axis"], 62 | ) 63 | def test_symmetric_quantize_float8(input_shape, dtype, qtype, axis, device): 64 | a = random_tensor(input_shape, dtype=dtype).to(device) 65 | scale = absmax_scale(a, qtype=qtype, axis=axis) 66 | qa = WeightQBytesTensor.quantize(a, qtype, axis, scale) 67 | assert isinstance(qa, WeightQBytesTensor) 68 | assert qa.dtype == dtype 69 | assert qa.qtype == qtype 70 | assert device_eq(qa.device, device) 71 | assert_similar(a, qa, atol=5e-3) 72 | 73 | 74 | @pytest.mark.parametrize("axis", [0, -1], ids=["first-axis", "last-axis"]) 75 | def test_quantize_weight_axis_dim_1(axis, device): 76 | input_shape = (1, 32) if axis == 0 else (32, 1) 77 | qa = random_qweight(input_shape, dtype=torch.float32, qtype=qint8, axis=axis, device=device) 78 | # Quantizing along an axis of dimension 1 actually means per-tensor 79 | assert qa.axis is None 80 | -------------------------------------------------------------------------------- /tests/tensor/weights/test_weight_qbytes_tensor_serialization.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import io 16 | 17 | import pytest 18 | import torch 19 | from helpers import random_qweight 20 | 21 | from optimum.quanto import qfloat8, qint8 22 | 23 | 24 | @pytest.mark.parametrize("input_shape", [(10, 10), (10, 32, 32)]) 25 | @pytest.mark.parametrize("qtype", [qint8, qfloat8], ids=["qint8", "qfloat8"]) 26 | @pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=["fp16", "fp32"]) 27 | @pytest.mark.parametrize("axis", [0, -1], ids=["first-axis", "last-axis"]) 28 | def test_weights_qbytes_tensor_serialization(input_shape, qtype, dtype, axis): 29 | qinputs = random_qweight(input_shape, dtype=dtype, qtype=qtype, axis=axis) 30 | b = io.BytesIO() 31 | torch.save(qinputs, b) 32 | b.seek(0) 33 | qinputs_reloaded = torch.load(b, weights_only=False) 34 | assert qinputs_reloaded.qtype == qtype 35 | assert torch.equal(qinputs_reloaded._scale, qinputs._scale) 36 | if qtype.is_floating_point: 37 | # Equality is not supported for float8 38 | assert torch.equal(qinputs_reloaded._data.to(torch.float32), qinputs._data.to(torch.float32)) 39 | else: 40 | assert torch.equal(qinputs_reloaded._data, qinputs._data) 41 | # We cannot test dtype directly as it is not correctly set by torch.load 42 | assert qinputs_reloaded._scale.dtype == dtype 43 | assert qinputs_reloaded.axis == qinputs.axis 44 | -------------------------------------------------------------------------------- /tests/tensor/weights/weight_helpers.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | from helpers import assert_similar, random_tensor 17 | 18 | 19 | def check_weight_qtensor_linear(qweight, batch_size, tokens, use_bias, rel_max_err=0.0): 20 | dtype = qweight.dtype 21 | device = qweight.device 22 | out_features, in_features = qweight.shape 23 | inputs = torch.rand((batch_size, tokens, in_features), dtype=dtype, device=device) 24 | bias = random_tensor((out_features,), dtype=dtype, device=device) if use_bias else None 25 | qout = torch.nn.functional.linear(inputs, qweight, bias) 26 | out = torch.nn.functional.linear(inputs, qweight.dequantize(), bias) 27 | # Verify global alignment 28 | assert_similar(out, qout) 29 | # Also look for outliers 30 | max_val = out.abs().max() 31 | max_err = (out - qout).abs().max() 32 | measured_rel_err = max_err / max_val 33 | # These values were evaluated empirically without any optimized kernels; callers can relax them through rel_max_err. 34 | rtol = {"cpu": 1e-2, "cuda": 2e-2, "mps": 1e-2, "xpu": 2e-2}[device.type] 35 | assert measured_rel_err < max(rtol, rel_max_err), ( 36 | f"Maximum error {max_err:.2f} is too high for input of max value {max_val:.2f} ({measured_rel_err * 100:.2f} %)" 37 | ) 38 | --------------------------------------------------------------------------------