├── test
│   ├── __init__.py
│   ├── test_hpu
│   │   ├── requirements.txt
│   │   ├── conftest.py
│   │   ├── _test_helpers.py
│   │   └── test_auto_round.py
│   ├── test_cuda
│   │   ├── requirements_diffusion.txt
│   │   ├── test_llmc_integration.py
│   │   ├── requirements.txt
│   │   ├── requirements_vlm.txt
│   │   ├── test_multiple_card_calib.py
│   │   ├── _test_helpers.py
│   │   ├── test_calib_dataset.py
│   │   ├── test_conv1d.py
│   │   ├── test_vllm.py
│   │   ├── test_diffusion.py
│   │   ├── test_mxfp_and_nvfp_quant.py
│   │   ├── test_alg_ext.py
│   │   └── test_packing.py
│   └── test_cpu
│       ├── requirements.txt
│       ├── test_script.py
│       ├── test_utils.py
│       ├── _test_helpers.py
│       ├── test_alg_ext.py
│       ├── test_conv1d.py
│       ├── test_autoopt.py
│       ├── test_model_scope.py
│       ├── test_logger.py
│       ├── test_auto_scheme.py
│       ├── test_load_awq_gptq.py
│       ├── test_mxfp_save_load.py
│       ├── test_woq_linear.py
│       ├── test_cli_usage.py
│       ├── test_gpt_oss.py
│       ├── test_autoround_acc.py
│       └── test_llmc_integration.py
├── .azure-pipelines
│   ├── scripts
│   │   ├── codeScan
│   │   │   ├── codespell
│   │   │   │   └── autoround_dict.txt
│   │   │   ├── bandit
│   │   │   │   └── bandit.sh
│   │   │   └── pylint
│   │   │       └── pylint.sh
│   │   ├── ut
│   │   │   ├── .coverage
│   │   │   ├── collect_log.sh
│   │   │   ├── run_ut_hpu.sh
│   │   │   └── run_ut.sh
│   │   └── change_color.sh
│   ├── license_template.txt
│   ├── code-scan.yml
│   ├── docker
│   │   ├── DockerfileCodeScan.devel
│   │   └── Dockerfile.devel
│   ├── unit-test-hpu.yml
│   ├── template
│   │   ├── code-scan-template.yml
│   │   └── ut-template.yml
│   ├── compatibility-test.yml
│   └── unit-test.yml
├── requirements-cpu.txt
├── auto_round
│   ├── alg_ext.pyd
│   ├── alg_ext.abi3.so
│   ├── auto_scheme
│   │   ├── default_alg.pyd
│   │   ├── default_alg.abi3.so
│   │   ├── __init__.py
│   │   └── register.py
│   ├── compressors
│   │   ├── mllm
│   │   │   ├── templates
│   │   │   │   ├── llava.json
│   │   │   │   ├── phi3_v.json
│   │   │   │   ├── cogvlm2.json
│   │   │   │   └── default.json
│   │   │   ├── __init__.py
│   │   │   └── utils.py
│   │   ├── diffusion
│   │   │   ├── __init__.py
│   │   │   └── README.md
│   │   └── __init__.py
│   ├── eval
│   │   ├── __init__.py
│   │   └── evaluation.py
│   ├── modelling
│   │   ├── __init__.py
│   │   └── llama4.py
│   ├── experimental
│   │   ├── __init__.py
│   │   ├── qmodules
│   │   │   ├── __init__.py
│   │   │   └── base.py
│   │   └── utils.py
│   ├── export
│   │   ├── export_to_autogptq
│   │   │   └── __init__.py
│   │   ├── export_to_gguf
│   │   │   └── __init__.py
│   │   ├── export_to_awq
│   │   │   └── __init__.py
│   │   ├── export_to_llmcompressor
│   │   │   ├── __init__.py
│   │   │   └── utils.py
│   │   ├── export_to_autoround
│   │   │   ├── __init__.py
│   │   │   └── utils.py
│   │   ├── export_to_itrex
│   │   │   └── __init__.py
│   │   ├── register.py
│   │   └── __init__.py
│   ├── inference
│   │   ├── __init__.py
│   │   └── utils.py
│   ├── utils
│   │   └── __init__.py
│   ├── version.py
│   ├── data_type
│   │   ├── __init__.py
│   │   ├── register.py
│   │   └── w4fp8.py
│   ├── __init__.py
│   └── envs.py
├── docs
│   ├── imgs
│   │   ├── AutoRound.png
│   │   ├── full_range_sym.png
│   │   ├── autoround_overview.png
│   │   └── norm_bias_overview.png
│   ├── full_range_sym.md
│   ├── mxnv_acc.md
│   ├── alg_202508.md
│   ├── gguf_alg_ext_acc.md
│   ├── publication_list.md
│   ├── tuning_norm_bias.md
│   ├── opt_rtn.md
│   └── auto_scheme_acc.md
├── MANIFEST.in
├── requirements-lib.txt
├── .gitignore
├── requirements.txt
├── auto_round_extension
│   ├── vllm_ext
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── kv_cache.py
│   │   ├── quant_impl.py
│   │   ├── sitecustomize.py
│   │   ├── envs_ext.py
│   │   ├── tests
│   │   │   ├── test_models.py
│   │   │   └── test_fp8kv.py
│   │   ├── mxfp8_qdq_utils.py
│   │   ├── quant_method_moe.py
│   │   └── auto_round_ext.py
│   ├── __init__.py
│   ├── cuda
│   │   └── __init__.py
│   ├── hpu
│   │   └── __init__.py
│   ├── torch
│   │   └── __init__.py
│   ├── triton
│   │   ├── __init__.py
│   │   ├── triton_utils
│   │   │   ├── __init__.py
│   │   │   └── mixin.py
│   │   └── triton_utils_zp
│   │       ├── __init__.py
│   │       └── mixin.py
│   ├── ark
│   │   └── __init__.py
│   └── ipex
│       └── __init__.py
├── SECURITY.md
├── setup.cfg
├── .pre-commit-config.yaml
├── .github
│   └── workflows
│       ├── manual-binary-build-publish.yml
│       └── compatibility-test.yml
├── CONTRIBUTING.md
└── pyproject.toml

/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/test_hpu/requirements.txt:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/.azure-pipelines/scripts/codeScan/codespell/autoround_dict.txt:
--------------------------------------------------------------------------------
1 | endianess
--------------------------------------------------------------------------------
/requirements-cpu.txt:
--------------------------------------------------------------------------------
1 | numba
2 | tbb
3 | intel-extension-for-pytorch
--------------------------------------------------------------------------------
/test/test_cuda/requirements_diffusion.txt:
--------------------------------------------------------------------------------
1 | diffusers
2 | image-reward
3 | clip
--------------------------------------------------------------------------------
/test/test_cuda/test_llmc_integration.py:
--------------------------------------------------------------------------------
1 | ../test_cpu/test_llmc_integration.py
--------------------------------------------------------------------------------
/auto_round/alg_ext.pyd:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/auto-round/main/auto_round/alg_ext.pyd
--------------------------------------------------------------------------------
/docs/imgs/AutoRound.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/auto-round/main/docs/imgs/AutoRound.png
--------------------------------------------------------------------------------
/auto_round/alg_ext.abi3.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/auto-round/main/auto_round/alg_ext.abi3.so
--------------------------------------------------------------------------------
/docs/imgs/full_range_sym.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/auto-round/main/docs/imgs/full_range_sym.png
--------------------------------------------------------------------------------
/docs/imgs/autoround_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/auto-round/main/docs/imgs/autoround_overview.png
--------------------------------------------------------------------------------
/docs/imgs/norm_bias_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/auto-round/main/docs/imgs/norm_bias_overview.png
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
2 | include requirements-cpu.txt
3 | include requirements-lib.txt
4 | exclude test/*
--------------------------------------------------------------------------------
/auto_round/auto_scheme/default_alg.pyd:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/auto-round/main/auto_round/auto_scheme/default_alg.pyd -------------------------------------------------------------------------------- /auto_round/auto_scheme/default_alg.abi3.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/auto-round/main/auto_round/auto_scheme/default_alg.abi3.so -------------------------------------------------------------------------------- /requirements-lib.txt: -------------------------------------------------------------------------------- 1 | accelerate>=1.10.0 2 | datasets 3 | py-cpuinfo 4 | sentencepiece 5 | numpy 6 | tqdm 7 | packaging 8 | pillow 9 | transformers 10 | threadpoolctl 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vs 2 | .vscode 3 | __pycache__ 4 | *.egg-info/ 5 | build/* 6 | .eggs/ 7 | dist/ 8 | .cache/ 9 | .clangd 10 | CMakeUserPresets.json 11 | tmp_autoround/ 12 | ut_log_dir/ 13 | -------------------------------------------------------------------------------- /auto_round/compressors/mllm/templates/llava.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "llava", 3 | "replace_tokens": null, 4 | "processor": "llava", 5 | "extra_encode" : false, 6 | "default_dataset": "NeelNanda/pile-10k" 7 | } -------------------------------------------------------------------------------- /test/test_cpu/requirements.txt: -------------------------------------------------------------------------------- 1 | addict 2 | modelscope 3 | gguf 4 | sentencepiece 5 | torchvision 6 | parameterized 7 | pillow 8 | numba 9 | llmcompressor @ git+https://github.com/vllm-project/llm-compressor.git@main 10 | lm_eval -------------------------------------------------------------------------------- /auto_round/compressors/mllm/templates/phi3_v.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "phi3_v", 3 | "replace_tokens": ["", "<|image_1|>"], 4 | "processor": "hf", 5 | "extra_encode" : false, 6 | "default_dataset": "NeelNanda/pile-10k" 7 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 1.5.1=4.38 -------------------------------------------------------------------------------- /auto_round_extension/vllm_ext/README.md: -------------------------------------------------------------------------------- 1 | - Build and Install vLLM 2 | 3 | ``` 4 | git clone --branch fused-moe-ar https://github.com/yiliu30/vllm-fork.git 5 | VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv 6 | ``` 7 | 8 | 9 | - Enable vLLM-Ext at Runtime 10 | ```bash 11 | VLLM_ENABLE_AR_EXT=1 vllm serve ... 
12 | ``` -------------------------------------------------------------------------------- /auto_round/compressors/mllm/templates/cogvlm2.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "cogvlm2", 3 | "format_user": "Question: {{content}} ", 4 | "format_assistant": "Answer: {{content}}\n", 5 | "replace_tokens": ["\n", ""], 6 | "processor": "cogvlm2", 7 | "extra_encode" : true, 8 | "default_dataset": "NeelNanda/pile-10k" 9 | } -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation. 3 | 4 | ## Reporting a Vulnerability 5 | Please report any security vulnerabilities in this project utilizing the guidelines [here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). 6 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/ut/.coverage: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | 4 | [paths] 5 | source = 6 | auto_round/ 7 | /auto-round/auto_round/ 8 | */site-packages/auto_round/ 9 | */dist-packages/auto_round/ 10 | 11 | [report] 12 | include = 13 | */auto_round/** 14 | */auto_round_extension/** 15 | exclude_lines = 16 | pragma: no cover 17 | raise NotImplementedError 18 | raise TypeError 19 | except ImportError: 20 | except Exception as e: -------------------------------------------------------------------------------- /auto_round/compressors/mllm/templates/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "default", 3 | "format_user": "{{content}}", 4 | "format_assistant": "{{content}}", 5 | "format_system": "{{content}}", 6 | "format_function": "", 7 | "format_observation": "", 8 | "format_separator": "\n", 9 | "default_system": "You are a helpful assistant.", 10 | "replace_tokens": null, 11 | "extra_encode" : false, 12 | "default_dataset": "NeelNanda/pile-10k", 13 | "processor": "hf" 14 | } -------------------------------------------------------------------------------- /test/test_cuda/requirements.txt: -------------------------------------------------------------------------------- 1 | # autoawq 2 | # pip install -v git+https://github.com/casper-hansen/AutoAWQ.git --no-build-isolation 3 | auto-gptq 4 | einops 5 | # gptqmodel>=2.0 6 | # pip install -v git+https://github.com/ModelCloud/GPTQModel.git@v2.2.0 --no-build-isolation 7 | intel-extension-for-pytorch 8 | lm-eval>=0.4.9.1 9 | optimum 10 | pandas 11 | parameterized 12 | pillow 13 | torchvision 14 | numba 15 | vllm>=0.8.5.post1 16 | llmcompressor @ git+https://github.com/vllm-project/llm-compressor.git@main -------------------------------------------------------------------------------- /test/test_cpu/test_script.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import unittest 4 | 5 | sys.path.insert(0, "../..") 6 | 7 | 8 | class TestScript(unittest.TestCase): 9 | def test_default(self): 10 | os.system( 11 | """ 12 | cd ../.. 
&& 13 | python -m auto_round 14 | --iters 2 15 | --deployment_device fake 16 | --output_dir ./tmp_script_test""" 17 | ) 18 | 19 | 20 | if __name__ == "__main__": 21 | unittest.main() 22 | -------------------------------------------------------------------------------- /test/test_cuda/requirements_vlm.txt: -------------------------------------------------------------------------------- 1 | # git+https://github.com/haotian-liu/LLaVA.git@v1.2.2 --no-deps 2 | # pip install git+https://github.com/deepseek-ai/DeepSeek-VL2.git --no-deps 3 | # pip install -v git+https://github.com/casper-hansen/AutoAWQ.git@v0.2.0 --no-build-isolation 4 | # pip install flash-attn==2.7.4.post1 --no-build-isolation 5 | bitsandbytes 6 | einops 7 | flash-attn 8 | intel-extension-for-transformers 9 | lm-eval>=0.4.2,<0.5 10 | optimum 11 | pandas 12 | protobuf 13 | pillow 14 | tiktoken 15 | torchvision 16 | triton 17 | xformers 18 | timm 19 | -------------------------------------------------------------------------------- /.azure-pipelines/license_template.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024 Intel Corporation 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /auto_round/eval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /auto_round/modelling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /auto_round_extension/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /auto_round/experimental/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /auto_round_extension/cuda/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /auto_round_extension/hpu/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /auto_round_extension/torch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /auto_round_extension/triton/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /auto_round/export/export_to_autogptq/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /auto_round/export/export_to_gguf/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /auto_round_extension/triton/triton_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /auto_round_extension/triton/triton_utils_zp/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /auto_round/export/export_to_awq/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .export import save_quantized_as_autoawq 16 | -------------------------------------------------------------------------------- /auto_round/export/export_to_llmcompressor/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | from .config import check_compressed_tensors_supported 15 | -------------------------------------------------------------------------------- /auto_round/export/export_to_autoround/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .export import save_quantized_as_autoround, AutoRoundFormat 16 | -------------------------------------------------------------------------------- /auto_round/inference/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from auto_round.inference.convert_model import convert_hf_model, infer_target_device, post_init 15 | -------------------------------------------------------------------------------- /auto_round/export/export_to_itrex/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .export import save_quantized_as_itrex, pack_model 15 | from .config import QuantConfig 16 | -------------------------------------------------------------------------------- /auto_round/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from auto_round.utils.device import * 16 | from auto_round.utils.common import * 17 | from auto_round.utils.model import * 18 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | license_files = 3 | LICENSE 4 | third-party-programs.txt 5 | 6 | [options.entry_points] 7 | console_scripts = 8 | auto_round = auto_round.__main__:run 9 | auto-round = auto_round.__main__:run 10 | auto_round_eval = auto_round.__main__:run_eval 11 | auto-round-eval = auto_round.__main__:run_eval 12 | auto_round_mllm = auto_round.__main__:run_mllm 13 | auto-round-mllm = auto_round.__main__:run_mllm 14 | auto-round-fast = auto_round.__main__:run_fast 15 | auto_round_fast = auto_round.__main__:run_fast 16 | auto-round-best = auto_round.__main__:run_best 17 | auto_round_best = auto_round.__main__:run_best 18 | auto-round-light = auto_round.__main__:run_light 19 | auto_round_light = auto_round.__main__:run_light 20 | 21 | -------------------------------------------------------------------------------- /auto_round/version.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Intel® auto-round: An open-source Python library 15 | supporting popular model weight only compression based on signround.""" 16 | 17 | __version__ = "0.9.3" 18 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/ut/collect_log.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | uv pip install coverage 4 | export COVERAGE_RCFILE=${BUILD_SOURCESDIRECTORY}/.azure-pipelines/scripts/ut/.coverage 5 | coverage_log="${BUILD_SOURCESDIRECTORY}/log_dir/coverage_log" 6 | cd "${BUILD_SOURCESDIRECTORY}/log_dir" 7 | 8 | echo "collect coverage for PR branch" 9 | mkdir -p coverage_PR 10 | cp ut-*/.coverage.* ./coverage_PR/ 11 | cd coverage_PR 12 | coverage combine --keep --rcfile=${COVERAGE_RCFILE} 13 | 14 | cp .coverage "${BUILD_SOURCESDIRECTORY}" 15 | cd "${BUILD_SOURCESDIRECTORY}" 16 | coverage report -m --rcfile=${COVERAGE_RCFILE} | tee ${coverage_log} 17 | coverage html -d log_dir/coverage_PR/htmlcov --rcfile=${COVERAGE_RCFILE} 18 | coverage xml -o log_dir/coverage_PR/coverage.xml --rcfile=${COVERAGE_RCFILE} 19 | ls -l log_dir/coverage_PR/htmlcov 20 | -------------------------------------------------------------------------------- /docs/full_range_sym.md: -------------------------------------------------------------------------------- 1 | W2G32 nsamples 512,iter 200, average accuracy of 10 tasks 2 | 3 | | Models | gptq_sym | asym | full_range_sym | 4 | |----------------------------|----------|------------|----------------| 5 | | Meta-Llama-3.1-8B-Instruct | 0.4500 | 0.52802 | **0.5381** | 6 | | Qwen2-7B | 0.5229 | **0.5559** | 0.5486 | 7 | 8 | W4G128 nsamples 128,iter 200, average accuracy of 10 tasks 9 | 10 | | Models | asym | full_range_sym | 11 | |----------------------------|------------|----------------| 12 | | Meta-Llama-3.1-8B-Instruct | 0.6342 | **0.6370** | 13 | | Qwen2-7B | 0.6143 | **0.6167** | 14 | | Mistral-7B-Instruct-v0.2 | 0.6606 | **0.6635** | 15 | | Phi-3-mini-4k-instruct | **0.6475** | 0.6432 | 16 | -------------------------------------------------------------------------------- /auto_round_extension/ark/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from auto_round_extension.ark.qlinear import QuantLinear, QuantLinearGPTQ, QuantLinearAWQ 16 | 17 | qlinear_classes = (QuantLinear, QuantLinearGPTQ) 18 | 19 | awq_classes = (QuantLinearAWQ,) 20 | -------------------------------------------------------------------------------- /auto_round/compressors/diffusion/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from auto_round.compressors.diffusion.dataset import get_diffusion_dataloader 16 | from auto_round.compressors.diffusion.compressor import DiffusionCompressor 17 | from auto_round.compressors.diffusion.eval import diffusion_eval 18 | -------------------------------------------------------------------------------- /auto_round/experimental/qmodules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from auto_round.experimental.qmodules.mx import MXFP4QuantLinear, MXFP8QuantLinear 16 | from auto_round.experimental.qmodules.nvfp4 import NVFP4QuantLinear 17 | from auto_round.experimental.qmodules.fp8_static import WeightFP8ActFP8StaticQuantLinear 18 | -------------------------------------------------------------------------------- /auto_round_extension/ipex/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from auto_round_extension.ipex.qlinear_ipex_awq import QuantLinear as IpexAWQQuantLinear 16 | from auto_round_extension.ipex.qlinear_ipex_gptq import ( 17 | QuantLinear as IpexGPTQQuantLinear, 18 | ) 19 | 20 | ipex_qlinear_classes = (IpexAWQQuantLinear, IpexGPTQQuantLinear) 21 | -------------------------------------------------------------------------------- /test/test_hpu/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Mapping 3 | 4 | import pytest 5 | 6 | 7 | def pytest_addoption(parser): 8 | parser.addoption( 9 | "--mode", 10 | action="store", 11 | default="lazy", 12 | help="{compile|lazy}, default lazy. 
Choose mode to run tests", 13 | ) 14 | 15 | 16 | backup_env = pytest.StashKey[Mapping]() 17 | 18 | 19 | def pytest_configure(config): 20 | pytest.mode = config.getoption("--mode") 21 | assert pytest.mode.lower() in ["lazy", "compile"] 22 | 23 | config.stash[backup_env] = os.environ 24 | 25 | if pytest.mode == "lazy": 26 | os.environ["PT_HPU_LAZY_MODE"] = "1" 27 | elif pytest.mode == "compile": 28 | os.environ["PT_HPU_LAZY_MODE"] = "0" 29 | os.environ["PT_ENABLE_INT64_SUPPORT"] = "1" 30 | 31 | 32 | def pytest_unconfigure(config): 33 | os.environ.clear() 34 | os.environ.update(config.stash[backup_env]) 35 | -------------------------------------------------------------------------------- /auto_round/compressors/mllm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from auto_round.compressors.mllm.dataset import get_mllm_dataloader 16 | from auto_round.compressors.mllm.template import Template, get_template, TEMPLATES 17 | from auto_round.compressors.mllm.compressor import MLLMCompressor 18 | from auto_round.compressors.mllm.eval import mllm_eval, lmms_eval 19 | -------------------------------------------------------------------------------- /auto_round/data_type/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import auto_round.data_type.int 16 | import auto_round.data_type.mxfp 17 | import auto_round.data_type.fp8 18 | from auto_round.data_type.register import QUANT_FUNC_WITH_DTYPE 19 | import auto_round.data_type.w4fp8 20 | from auto_round.data_type.utils import get_quant_func, update_fused_layer_global_scales 21 | import auto_round.data_type.nvfp 22 | import auto_round.data_type.gguf 23 | -------------------------------------------------------------------------------- /auto_round/compressors/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from auto_round.compressors.adam import AdamCompressor 16 | from auto_round.compressors.base import BaseCompressor 17 | from auto_round.compressors.base import LLMCompressor 18 | from auto_round.compressors.mllm.compressor import MLLMCompressor 19 | from auto_round.compressors.diffusion.compressor import DiffusionCompressor 20 | from auto_round.compressors.config import ( 21 | DiffusionExtraConfig, 22 | ExtraConfig, 23 | MLLMExtraConfig, 24 | SchemeExtraConfig, 25 | TuningExtraConfig, 26 | ) 27 | -------------------------------------------------------------------------------- /docs/mxnv_acc.md: -------------------------------------------------------------------------------- 1 | Average accuracy of hellaswag,lambada_openai,mmlu,piqa,winogrande. 2 | 3 | We evaluated using a fake model since we currently have no access to devices for running the real models. However, we have verified that in most cases the fake model closely matches the real model. 4 | 5 | | mxfp4 g32 | llama3.1-8B-Instruct | Qwen2-7.5-Instruct | Phi4 | Qwen3-32B | 6 | |:-------------------|:----------------------:|:--------------------:|:---------:|:-----------:| 7 | | RTN | 0.6212 | 0.6550 | 0.7167 | 0.6901 | 8 | | AutoRound | 0.6686 | 0.6758 | 0.7247 | 0.7211 | 9 | | AutoRound+alg_ext | 0.6732 | 0.6809 | 0.7225 | 0.7201 | 10 | 11 | | nvfp4 g16 | llama3.1-8B-Instruct | Qwen2-7.5-Instruct | Phi4 | Qwen3-32B | 12 | |:-------------------|:----------------------:|:--------------------:|:---------:|:-----------:| 13 | | RTN | 0.6876 | 0.6906 | 0.7296 | 0.7164 | 14 | | AutoRound | 0.6918 | 0.6973 | 0.7306 | 0.7306 | 15 | | AutoRound+alg_ext | 0.6965 | 0.6989 | 0.7318 | 0.7295 | 16 | -------------------------------------------------------------------------------- /test/test_cpu/test_utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from unittest.mock import patch 3 | 4 | sys.path.insert(0, "../..") 5 | import auto_round.utils.device as auto_round_utils 6 | 7 | 8 | class TestPackingWithNumba: 9 | 10 | @patch.object(auto_round_utils, "_is_tbb_installed", lambda: False) 11 | def test_tbb_not_installed(self): 12 | assert auto_round_utils.is_tbb_available() is False, "`is_tbb_available` should return False." 13 | assert auto_round_utils.can_pack_with_numba() is False, "`can_pack_with_numba` should return False." 14 | 15 | @patch.object(auto_round_utils, "_is_tbb_installed", lambda: True) 16 | @patch.object(auto_round_utils, "_is_tbb_configured", lambda: False) 17 | def test_tbb_installed_but_not_configured_right(self): 18 | assert auto_round_utils.is_tbb_available() is False, "`is_tbb_available` should return False." 19 | assert auto_round_utils.can_pack_with_numba() is False, "`can_pack_with_numba` should return False." 20 | 21 | @patch.object(auto_round_utils, "is_numba_available", lambda: False) 22 | def test_numba_not_installed(self): 23 | assert auto_round_utils.can_pack_with_numba() is False, "`can_pack_with_numba` should return False." 
24 | -------------------------------------------------------------------------------- /auto_round/auto_scheme/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from auto_round.logger import logger 17 | 18 | from auto_round.auto_scheme.gen_auto_scheme import AutoScheme 19 | 20 | 21 | def __getattr__(name): 22 | if name == "AUTO_SCHEME_METHODS": 23 | try: 24 | import auto_round.auto_scheme.default_alg 25 | except ImportError: 26 | logger.warning("AutoScheme is currently supported only on Linux.") 27 | 28 | from auto_round.auto_scheme.register import AUTO_SCHEME_METHODS 29 | 30 | return AUTO_SCHEME_METHODS 31 | 32 | raise AttributeError(f"auto-scheme has no attribute '{name}'") 33 | -------------------------------------------------------------------------------- /auto_round_extension/vllm_ext/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # ==---------------------------------------------------------------------------== 16 | # Apply the extension 17 | # ==---------------------------------------------------------------------------== 18 | 19 | 20 | def apply(): 21 | import auto_round_extension.vllm_ext.auto_round_ext 22 | import auto_round_extension.vllm_ext.envs_ext 23 | 24 | print("*****************************************************************************") 25 | print("* !!! 
VLLM_ENABLE_AR_EXT is set to 1, applying auto_round_vllm_extension *") 26 | print("*****************************************************************************") 27 | -------------------------------------------------------------------------------- /.azure-pipelines/code-scan.yml: -------------------------------------------------------------------------------- 1 | trigger: none 2 | 3 | pr: 4 | autoCancel: true 5 | drafts: false 6 | branches: 7 | include: 8 | - main 9 | paths: 10 | include: 11 | - auto_round 12 | - auto_round_extension 13 | - setup.py 14 | - requirements.txt 15 | - requirements-cpu.txt 16 | - requirements-lib.txt 17 | - .azure-pipelines/code-scan.yml 18 | - .azure-pipelines/scripts/codeScan 19 | 20 | pool: 21 | vmImage: "ubuntu-latest" 22 | 23 | variables: 24 | CODE_SCAN_LOG_PATH: ".azure-pipelines/scripts/codeScan/scanLog" 25 | 26 | stages: 27 | 28 | - stage: BanditCodeScan 29 | displayName: Bandit Code Scan 30 | dependsOn: [] 31 | jobs: 32 | - job: Bandit 33 | displayName: Bandit 34 | steps: 35 | - template: template/code-scan-template.yml 36 | parameters: 37 | codeScanFileName: "bandit" 38 | uploadPath: "bandit.log" 39 | 40 | - stage: PylintCodeScan 41 | displayName: Pylint Code Scan 42 | dependsOn: [] 43 | jobs: 44 | - job: Pylint 45 | displayName: Pylint 46 | steps: 47 | - template: template/code-scan-template.yml 48 | parameters: 49 | codeScanFileName: "pylint" 50 | uploadPath: "pylint.json" 51 | -------------------------------------------------------------------------------- /auto_round/auto_scheme/register.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | AUTO_SCHEME_METHODS = {} 16 | 17 | 18 | def register_scheme_methods(names): 19 | """Class decorator to register a mixed precision algorithm to the registry. 20 | 21 | Decorator function used before a Pattern subclass. 22 | 23 | Args: 24 | names: A string. Define the export type. 25 | 26 | Returns: 27 | cls: The class of register. 28 | """ 29 | 30 | def register(alg): 31 | if isinstance(names, (tuple, list)): 32 | for name in names: 33 | AUTO_SCHEME_METHODS[name] = alg 34 | else: 35 | AUTO_SCHEME_METHODS[names] = alg 36 | 37 | return alg 38 | 39 | return register 40 | -------------------------------------------------------------------------------- /auto_round/data_type/register.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | QUANT_FUNC_WITH_DTYPE = {} 17 | 18 | 19 | def register_dtype(names): 20 | """Class decorator to register a EXPORT subclass to the registry. 21 | 22 | Decorator function used before a Pattern subclass. 23 | 24 | Args: 25 | names: A string. Define the export type. 26 | 27 | Returns: 28 | cls: The class of register. 29 | """ 30 | 31 | def register(dtype): 32 | if isinstance(names, (tuple, list)): 33 | for name in names: 34 | QUANT_FUNC_WITH_DTYPE[name] = dtype 35 | else: 36 | QUANT_FUNC_WITH_DTYPE[names] = dtype 37 | 38 | return dtype 39 | 40 | return register 41 | -------------------------------------------------------------------------------- /test/test_cuda/test_multiple_card_calib.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import shutil 4 | import sys 5 | import unittest 6 | 7 | sys.path.insert(0, "../..") 8 | 9 | from auto_round.testing_utils import multi_card 10 | 11 | 12 | def get_accuracy(data): 13 | match = re.search(r"\|acc\s+\|[↑↓]\s+\|\s+([\d.]+)\|", data) 14 | 15 | if match: 16 | accuracy = float(match.group(1)) 17 | return accuracy 18 | else: 19 | return 0.0 20 | 21 | 22 | class TestAutoRound(unittest.TestCase): 23 | @classmethod 24 | def setUpClass(self): 25 | self.save_dir = "./saved" 26 | self.tasks = "lambada_openai" 27 | 28 | @classmethod 29 | def tearDownClass(self): 30 | shutil.rmtree("./saved", ignore_errors=True) 31 | shutil.rmtree("runs", ignore_errors=True) 32 | 33 | @multi_card 34 | def test_multiple_card_calib(self): 35 | python_path = sys.executable 36 | 37 | ##test llm script 38 | res = os.system( 39 | f"cd ../.. 
&& {python_path} -m auto_round --model /models/Meta-Llama-3.1-8B-Instruct --devices '0,1' --quant_lm_head --iters 1 --nsamples 1 --output_dir None" 40 | ) 41 | if res > 0 or res == -1: 42 | assert False, "cmd line test fail, please have a check" 43 | 44 | 45 | if __name__ == "__main__": 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /test/test_cpu/_test_helpers.py: -------------------------------------------------------------------------------- 1 | def model_infer(model, tokenizer, apply_chat_template=False): 2 | prompts = [ 3 | "Hello,my name is", 4 | # "The president of the United States is", 5 | # "The capital of France is", 6 | # "The future of AI is", 7 | ] 8 | if apply_chat_template: 9 | texts = [] 10 | for prompt in prompts: 11 | messages = [{"role": "user", "content": prompt}] 12 | text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) 13 | texts.append(text) 14 | prompts = texts 15 | 16 | inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) 17 | 18 | outputs = model.generate( 19 | input_ids=inputs["input_ids"].to(model.device), 20 | attention_mask=inputs["attention_mask"].to(model.device), 21 | do_sample=False, ## change this to follow official usage 22 | max_new_tokens=5, 23 | ) 24 | generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] 25 | 26 | decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 27 | 28 | for i, prompt in enumerate(prompts): 29 | print(f"Prompt: {prompt}") 30 | print(f"Generated: {decoded_outputs[i]}") 31 | print("-" * 50) 32 | return decoded_outputs[0] 33 | -------------------------------------------------------------------------------- /test/test_cuda/_test_helpers.py: -------------------------------------------------------------------------------- 1 | def model_infer(model, tokenizer, apply_chat_template=False): 2 | prompts = [ 3 | "Hello,my name is", 4 | # "The president of the United States is", 5 | # "The capital of France is", 6 | # "The future of AI is", 7 | ] 8 | if apply_chat_template: 9 | texts = [] 10 | for prompt in prompts: 11 | messages = [{"role": "user", "content": prompt}] 12 | text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) 13 | texts.append(text) 14 | prompts = texts 15 | 16 | inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) 17 | 18 | outputs = model.generate( 19 | input_ids=inputs["input_ids"].to(model.device), 20 | attention_mask=inputs["attention_mask"].to(model.device), 21 | do_sample=False, ## change this to follow official usage 22 | max_new_tokens=5, 23 | ) 24 | generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] 25 | 26 | decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 27 | 28 | for i, prompt in enumerate(prompts): 29 | print(f"Prompt: {prompt}") 30 | print(f"Generated: {decoded_outputs[i]}") 31 | print("-" * 50) 32 | return decoded_outputs[0] 33 | -------------------------------------------------------------------------------- /auto_round_extension/vllm_ext/kv_cache.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from typing import TYPE_CHECKING, Any, Literal, Optional, cast 17 | 18 | import torch 19 | from vllm.logger import init_logger 20 | from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod 21 | 22 | logger = init_logger(__name__) 23 | 24 | 25 | class AutoRoundKVCacheMethod(BaseKVCacheMethod): 26 | """ 27 | Supports loading kv-cache scaling factors from compressed-tensors 28 | checkpoints. 29 | """ 30 | 31 | def __init__(self, quant_config): 32 | self.validate_kv_cache_scheme(quant_config) 33 | super().__init__(quant_config) 34 | 35 | @staticmethod 36 | def validate_kv_cache_scheme(quant_config): 37 | # FIXME: parse from quant_config 38 | return True 39 | -------------------------------------------------------------------------------- /auto_round/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from auto_round.autoround import AutoRound 15 | 16 | # support for old api 17 | from auto_round.autoround import AutoRoundLLM, AutoRoundMLLM, AutoRoundAdam, AutoRoundDiffusion 18 | from auto_round.schemes import QuantizationScheme 19 | from auto_round.auto_scheme import AutoScheme 20 | from auto_round.utils import LazyImport 21 | 22 | 23 | def __getattr__(name): 24 | if name == "AutoHfQuantizer": 25 | from auto_round.inference.auto_quantizer import AutoHfQuantizer 26 | 27 | return AutoHfQuantizer 28 | if name == "AutoRoundConfig": 29 | from auto_round.inference.auto_quantizer import AutoRoundConfig 30 | 31 | return AutoRoundConfig 32 | 33 | raise AttributeError(f"auto-round has no attribute '{name}'") 34 | 35 | 36 | from .version import __version__ 37 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/codeScan/bandit/bandit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for var in "$@" 4 | do 5 | case $var in 6 | --scan_module=*) 7 | scan_module=$(echo $var |cut -f2 -d=) 8 | ;; 9 | esac 10 | done 11 | 12 | source /auto-round/.azure-pipelines/scripts/change_color.sh 13 | RESET="echo -en \\E[0m \\n" # close color 14 | 15 | log_dir="/auto-round/.azure-pipelines/scripts/codeScan/scanLog" 16 | mkdir -p $log_dir 17 | 18 | python -m bandit -r -lll -iii "/auto-round/${scan_module}" >$log_dir/bandit.log 19 | exit_code=$? 
20 | 21 | $BOLD_YELLOW && echo " ----------------- Current bandit cmd start --------------------------" && $RESET 22 | echo "python -m bandit -r -lll -iii /auto-round/${scan_module} > $log_dir/bandit.log" 23 | $BOLD_YELLOW && echo " ----------------- Current bandit cmd end --------------------------" && $RESET 24 | 25 | $BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------" 26 | cat $log_dir/bandit.log 27 | $BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET 28 | 29 | if [ ${exit_code} -ne 0 ]; then 30 | $BOLD_RED && echo "Error!! Please Click on the artifact button to download and view Bandit error details." && $RESET 31 | exit 1 32 | fi 33 | $BOLD_PURPLE && echo "Congratulations, Bandit check passed!" && $LIGHT_PURPLE && echo " You can click on the artifact button to see the log details." && $RESET 34 | exit 0 35 | -------------------------------------------------------------------------------- /.azure-pipelines/docker/DockerfileCodeScan.devel: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2024 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | ARG UBUNTU_VER=24.04 17 | FROM ubuntu:${UBUNTU_VER} as devel 18 | 19 | # See http://bugs.python.org/issue19846 20 | ENV LANG C.UTF-8 21 | 22 | RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \ 23 | aspell \ 24 | aspell-en \ 25 | python3 \ 26 | python3-pip \ 27 | autoconf \ 28 | build-essential \ 29 | wget 30 | 31 | RUN ln -sf $(which python3) /usr/bin/python 32 | 33 | ARG USER_ID=1000 34 | ARG GROUP_ID=1000 35 | 36 | RUN groupadd -g ${GROUP_ID} hostgroup && \ 37 | useradd -m -u ${USER_ID} -g ${GROUP_ID} hostuser 38 | 39 | USER hostuser 40 | 41 | ENV PATH="/home/hostuser/.local/bin:$PATH" 42 | RUN pip config set global.break-system-packages true 43 | RUN python -m pip install --no-cache-dir pylint bandit 44 | 45 | WORKDIR / 46 | -------------------------------------------------------------------------------- /auto_round_extension/vllm_ext/quant_impl.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from abc import ABC, abstractmethod 16 | from typing import Optional 17 | 18 | import torch 19 | 20 | 21 | class AutoRoundQuantImpl(ABC): 22 | @classmethod 23 | @abstractmethod 24 | def get_min_capability(cls) -> int: 25 | """ 26 | Get minimum device capability. 27 | """ 28 | raise NotImplementedError 29 | 30 | @abstractmethod 31 | def create_weights(self, *args, **kwargs): 32 | raise NotImplementedError 33 | 34 | @abstractmethod 35 | def apply_weights( 36 | self, 37 | layer: torch.nn.Module, 38 | x: torch.Tensor, 39 | bias: Optional[torch.Tensor], 40 | ): 41 | raise NotImplementedError 42 | 43 | @abstractmethod 44 | def process_weights_after_loading(self, layer: torch.nn.Module): 45 | raise NotImplementedError 46 | -------------------------------------------------------------------------------- /.azure-pipelines/unit-test-hpu.yml: -------------------------------------------------------------------------------- 1 | trigger: none 2 | 3 | pr: 4 | autoCancel: true 5 | drafts: false 6 | branches: 7 | include: 8 | - main 9 | paths: 10 | include: 11 | - auto_round 12 | - auto_round_extension 13 | - test/test*hpu*' 14 | - setup.py 15 | - requirements-lib.txt 16 | - .azure-pipelines/scripts/ut 17 | - .azure-pipelines/template/docker-template.yml 18 | - .azure-pipelines/template/ut-template.yml 19 | exclude: 20 | - auto_round/export/export_to_autogptq 21 | - auto_round/export/export_to_awq 22 | - "*.md" 23 | - "**/*.md" 24 | 25 | pool: GAUDI 26 | 27 | variables: 28 | IMAGE_NAME: "auto-round" 29 | IMAGE_TAG: "py312" 30 | UPLOAD_PATH: $(Build.SourcesDirectory)/log_dir 31 | DOWNLOAD_PATH: $(Build.SourcesDirectory)/log_dir 32 | ARTIFACT_NAME: "UT_coverage_report" 33 | REPO: $(Build.Repository.Uri) 34 | 35 | stages: 36 | - stage: Unit_test 37 | displayName: Unit Test 38 | dependsOn: [] 39 | jobs: 40 | - job: 41 | displayName: Unit Test 42 | steps: 43 | - template: template/ut-template.yml 44 | parameters: 45 | imageSource: "pull" 46 | dockerConfigName: "commonDockerConfig" 47 | utScriptFileName: "run_ut_hpu" 48 | uploadPath: $(UPLOAD_PATH) 49 | utArtifact: "ut" 50 | 51 | - task: PublishCodeCoverageResults@2 52 | inputs: 53 | summaryFileLocation: $(UPLOAD_PATH)/coverage.xml 54 | -------------------------------------------------------------------------------- /.azure-pipelines/template/code-scan-template.yml: -------------------------------------------------------------------------------- 1 | parameters: 2 | - name: codeScanFileName 3 | type: string 4 | - name: uploadPath 5 | type: string 6 | 7 | - name: codeScanContainerName 8 | type: string 9 | default: "codeScan" 10 | - name: scanModule 11 | type: string 12 | default: "auto_round" 13 | 14 | steps: 15 | - template: docker-template.yml 16 | parameters: 17 | dockerConfigName: "commonDockerConfig" 18 | repoName: "code-scan" 19 | repoTag: "1.0" 20 | dockerFileName: "DockerfileCodeScan" 21 | containerName: ${{ parameters.codeScanContainerName }} 22 | 23 | - script: | 24 | docker exec ${{ parameters.codeScanContainerName }} bash -c "bash /auto-round/.azure-pipelines/scripts/codeScan/${{ parameters.codeScanFileName }}/${{ parameters.codeScanFileName }}.sh \ 25 | --scan_module=${{ parameters.scanModule }}" 26 | displayName: "${{ parameters.codeScanFileName }} Check" 27 | 28 | - task: PublishPipelineArtifact@1 29 | condition: succeededOrFailed() 30 | inputs: 31 | targetPath: .azure-pipelines/scripts/codeScan/scanLog/${{ parameters.uploadPath }} 32 | artifact: ${{ parameters.codeScanFileName }} 33 | publishLocation: "pipeline" 34 | displayName: 
"PublishPipelineArtifact" 35 | 36 | - task: Bash@3 37 | condition: always() 38 | inputs: 39 | targetType: "inline" 40 | script: | 41 | docker exec ${{ parameters.codeScanContainerName }} bash -c "rm -fr /auto-round/* && rm -fr /auto-round/.* || true" 42 | displayName: "Docker clean up" 43 | -------------------------------------------------------------------------------- /test/test_cpu/test_alg_ext.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | from parameterized import parameterized 7 | 8 | sys.path.insert(0, "../..") 9 | 10 | from auto_round import AutoRound 11 | 12 | 13 | class TestAlgExt(unittest.TestCase): 14 | def test_alg_ext(self): 15 | model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" 16 | ar = AutoRound(model_name, scheme="W2A16", iters=1, nsamples=1, enable_alg_ext=True) 17 | ar.quantize() 18 | 19 | model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" 20 | ar = AutoRound(model_name, scheme="gguf:q4_k_s", iters=1, nsamples=1, enable_alg_ext=True) 21 | ar.quantize() 22 | 23 | from auto_round.auto_scheme import AutoScheme 24 | 25 | scheme = AutoScheme(options=["mxfp4", "mxfp8"], avg_bits=5.5, ignore_scale_zp_bits=True) 26 | model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" 27 | ar = AutoRound(model_name, scheme=scheme, iters=1, nsamples=1, enable_alg_ext=True, enable_torch_compile=True) 28 | ar.quantize() 29 | 30 | def test_alg_ext_import(self): 31 | from auto_round.alg_ext import wrapper_autoround 32 | 33 | def test_all_support_dtype(self): 34 | model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" 35 | for scheme in ["MXFP4", "NVFP4", "W2A16G64"]: 36 | ar = AutoRound( 37 | model_name, scheme=scheme, iters=1, nsamples=1, enable_alg_ext=True, enable_torch_compile=True 38 | ) 39 | ar.quantize() 40 | -------------------------------------------------------------------------------- /test/test_hpu/_test_helpers.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def is_pytest_mode_compile(): 5 | return pytest.mode == "compile" 6 | 7 | 8 | def is_pytest_mode_lazy(): 9 | return pytest.mode == "lazy" 10 | 11 | 12 | def model_infer(model, tokenizer, apply_chat_template=False): 13 | prompts = [ 14 | "Hello,my name is", 15 | # "The president of the United States is", 16 | # "The capital of France is", 17 | # "The future of AI is", 18 | ] 19 | if apply_chat_template: 20 | texts = [] 21 | for prompt in prompts: 22 | messages = [{"role": "user", "content": prompt}] 23 | text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) 24 | texts.append(text) 25 | prompts = texts 26 | 27 | inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) 28 | 29 | outputs = model.generate( 30 | input_ids=inputs["input_ids"].to(model.device), 31 | attention_mask=inputs["attention_mask"].to(model.device), 32 | do_sample=False, ## change this to follow official usage 33 | max_new_tokens=5, 34 | ) 35 | generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] 36 | 37 | decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 38 | 39 | for i, prompt in enumerate(prompts): 40 | print(f"Prompt: {prompt}") 41 | print(f"Generated: {decoded_outputs[i]}") 42 | print("-" * 50) 43 | return decoded_outputs[0] 44 | 
-------------------------------------------------------------------------------- /.azure-pipelines/docker/Dockerfile.devel: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ARG UBUNTU_VER=24.04 16 | FROM ubuntu:${UBUNTU_VER} 17 | 18 | # See http://bugs.python.org/issue19846 19 | ENV LANG C.UTF-8 20 | 21 | RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \ 22 | build-essential \ 23 | ca-certificates \ 24 | git \ 25 | libomp-dev \ 26 | numactl \ 27 | time \ 28 | wget \ 29 | bc \ 30 | jq \ 31 | vim 32 | 33 | COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv 34 | 35 | ARG USER_ID=1000 36 | ARG GROUP_ID=1000 37 | 38 | RUN groupadd -g ${GROUP_ID} hostgroup && \ 39 | useradd -m -u ${USER_ID} -g ${GROUP_ID} hostuser 40 | 41 | USER hostuser 42 | 43 | ENV PATH="/home/hostuser/.venv/bin:$PATH" 44 | ENV VIRTUAL_ENV="/home/hostuser/.venv" 45 | ENV UV_NO_PROGRESS=1 \ 46 | UV_COMPILE_BYTECODE=1 \ 47 | UV_LINK_MODE=copy 48 | 49 | RUN uv venv --python=3.12 /home/hostuser/.venv 50 | RUN which python && python --version 51 | 52 | WORKDIR /home/hostuser 53 | -------------------------------------------------------------------------------- /auto_round/export/export_to_autoround/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import fields 16 | from typing import List 17 | 18 | from auto_round.schemes import QuantizationScheme 19 | 20 | 21 | def check_neq_config(config: dict, **expected) -> List[str]: 22 | """ 23 | Compare a config dict against expected values. 24 | Ensures all required keys are present in both config and expected. 25 | 26 | Returns: 27 | List[str]: [keys] for mismatched values. 28 | """ 29 | scheme_keys = [f.name for f in fields(QuantizationScheme)] 30 | # 1. Check missing from expected 31 | missing_expected = [k for k in scheme_keys if k not in expected] 32 | if missing_expected: 33 | raise ValueError(f"Missing expected values for keys: {missing_expected}") 34 | 35 | # # 2. 
Check missing from layer config 36 | # missing_config = [k for k in scheme_keys if k not in config] # None 37 | # if missing_config: 38 | # raise ValueError(f"Missing config values for keys: {missing_config}") 39 | 40 | # 3. Collect mismatches 41 | return [key for key in scheme_keys if config.get(key) not in (expected[key], None)] 42 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/ut/run_ut_hpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -xe 3 | 4 | # install requirements 5 | echo "set up UT env..." 6 | export TQDM_MININTERVAL=60 7 | pip install pytest-cov pytest-html 8 | pip list 9 | 10 | cd /auto-round/test/test_hpu || exit 1 11 | find . -type f -exec sed -i '/sys\.path\.insert(0, "\.\.")/d' {} + 12 | 13 | export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH 14 | export FORCE_BF16=1 15 | export COVERAGE_RCFILE=/auto-round/.azure-pipelines/scripts/ut/.coverage 16 | auto_round_path=$(python -c 'import auto_round; print(auto_round.__path__[0])') 17 | 18 | LOG_DIR=/auto-round/log_dir 19 | mkdir -p ${LOG_DIR} 20 | ut_log_name=${LOG_DIR}/ut.log 21 | 22 | find . -name "test*.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_lazy.sh 23 | find . -name "test*.py" | sed "s,\.\/,python -m pytest --mode compile --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_compile.sh 24 | 25 | cat run_lazy.sh 26 | bash run_lazy.sh 2>&1 | tee ${ut_log_name} 27 | 28 | cat run_compile.sh 29 | bash run_compile.sh 2>&1 | tee ${ut_log_name} 30 | 31 | cp report.html ${LOG_DIR}/ 32 | cp coverage.xml ${LOG_DIR}/ 33 | 34 | if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then 35 | echo "##[error]Find errors in pytest case, please check the output..." 36 | exit 1 37 | fi 38 | 39 | # if ut pass, collect the coverage file into artifacts 40 | cp .coverage ${LOG_DIR}/.coverage 41 | 42 | echo "UT finished successfully! " -------------------------------------------------------------------------------- /auto_round/export/register.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | EXPORT_FORMAT = {} 17 | 18 | 19 | def register_format(name): 20 | """Class decorator to register a EXPORT subclass to the registry. 21 | 22 | Decorator function used before a Pattern subclass. 23 | 24 | Args: 25 | cls (class): The subclass of register. 26 | name: A string. Define the export type. 27 | 28 | Returns: 29 | cls: The class of register. 
30 | """ 31 | 32 | def register(format): 33 | EXPORT_FORMAT[name] = format 34 | return format 35 | 36 | return register 37 | 38 | 39 | PACKING_LAYER_WITH_FORMAT = {} 40 | 41 | 42 | def register_layer_packing(name): 43 | """Class decorator to register a EXPORT subclass to the registry. 44 | 45 | Decorator function used before a Pattern subclass. 46 | 47 | Args: 48 | cls (class): The subclass of register. 49 | name: A string. Define the export type. 50 | 51 | Returns: 52 | cls: The class of register. 53 | """ 54 | 55 | def register(format): 56 | PACKING_LAYER_WITH_FORMAT[name] = format 57 | return format 58 | 59 | return register 60 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/codeScan/pylint/pylint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for var in "$@" 4 | do 5 | case $var in 6 | --scan_module=*) 7 | scan_module=$(echo $var |cut -f2 -d=) 8 | ;; 9 | esac 10 | done 11 | 12 | source /auto-round/.azure-pipelines/scripts/change_color.sh 13 | RESET="echo -en \\E[0m \\n" # close color 14 | 15 | log_dir="/auto-round/.azure-pipelines/scripts/codeScan/scanLog" 16 | mkdir -p $log_dir 17 | 18 | pip install torch --index-url https://download.pytorch.org/whl/cpu 19 | pip install -r /auto-round/requirements.txt 20 | pip install -r /auto-round/requirements-cpu.txt 21 | 22 | echo "[DEBUG] list pipdeptree..." 23 | pip install pipdeptree 24 | pipdeptree 25 | 26 | python -m pylint -f json --disable=R,C,W,E0606,E1129 --enable=line-too-long --max-line-length=120 --extension-pkg-whitelist=numpy --ignored-classes=TensorProto,NodeProto \ 27 | --ignored-modules=tensorflow,keras,torch,torch.quantization,torch.tensor,torchvision,fairseq,mxnet,onnx,onnxruntime,intel_extension_for_pytorch,intel_extension_for_tensorflow,torchinfo,horovod,transformers,deepspeed,deepspeed.module_inject \ 28 | /auto-round/${scan_module} > $log_dir/pylint.json 29 | 30 | exit_code=$? 31 | 32 | $BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------" && $RESET 33 | cat $log_dir/pylint.json 34 | $BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET 35 | 36 | if [ ${exit_code} -ne 0 ]; then 37 | $BOLD_RED && echo "Error!! Please Click on the artifact button to download and view Pylint error details." && $RESET 38 | exit 1 39 | fi 40 | $BOLD_PURPLE && echo "Congratulations, Pylint check passed!" && $LIGHT_PURPLE && echo " You can click on the artifact button to see the log details." 
&& $RESET 41 | exit 0 42 | -------------------------------------------------------------------------------- /test/test_cuda/test_calib_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | import json 8 | 9 | import torch 10 | from transformers import AutoModelForCausalLM, AutoTokenizer 11 | 12 | from auto_round import AutoRound 13 | 14 | 15 | class TestLocalCalibDataset(unittest.TestCase): 16 | @classmethod 17 | def setUpClass(self): 18 | json_data = [{"text": "awefdsfsddfd"}, {"text": "fdfdfsdfdfdfd"}, {"text": "dfdsfsdfdfdfdf"}] 19 | os.makedirs("./saved", exist_ok=True) 20 | self.json_file = "./saved/tmp.json" 21 | with open(self.json_file, "w") as json_file: 22 | json.dump(json_data, json_file, indent=4) 23 | 24 | jsonl_data = [{"text": "哈哈,開心點"}, {"text": "hello world"}] 25 | os.makedirs("./saved", exist_ok=True) 26 | self.jsonl_file = "./saved/tmp.jsonl" 27 | with open(self.jsonl_file, "w") as jsonl_file: 28 | for item in jsonl_data: 29 | json.dump(item, jsonl_file, ensure_ascii=False) 30 | jsonl_file.write("\n") 31 | 32 | model_name = "facebook/opt-125m" 33 | self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) 34 | self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 35 | 36 | def test_combine_dataset(self): 37 | dataset = "NeelNanda/pile-10k" + ",BAAI/CCI3-HQ" + ",madao33/new-title-chinese" 38 | bits, group_size, sym = 4, 128, True 39 | autoround = AutoRound( 40 | self.model, self.tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=128, dataset=dataset 41 | ) 42 | autoround.quantize() 43 | 44 | 45 | if __name__ == "__main__": 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /test/test_cpu/test_conv1d.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | import torch 8 | from _test_helpers import model_infer 9 | from transformers import AutoModelForCausalLM, AutoTokenizer 10 | 11 | from auto_round import AutoRound 12 | 13 | 14 | class LLMDataLoader: 15 | def __init__(self): 16 | self.batch_size = 1 17 | 18 | def __iter__(self): 19 | for i in range(2): 20 | yield torch.ones([1, 10], dtype=torch.long) 21 | 22 | 23 | class TestQuantizationConv1d(unittest.TestCase): 24 | @classmethod 25 | def setUpClass(self): 26 | self.model_name = "/tf_dataset/auto_round/models/MBZUAI/LaMini-GPT-124M" 27 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) 28 | self.llm_dataloader = LLMDataLoader() 29 | 30 | @classmethod 31 | def tearDownClass(self): 32 | shutil.rmtree("./saved", ignore_errors=True) 33 | shutil.rmtree("runs", ignore_errors=True) 34 | 35 | def test_quant(self): 36 | self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) 37 | bits, group_size, sym = 4, 128, True 38 | autoround = AutoRound( 39 | self.model, 40 | self.tokenizer, 41 | bits=bits, 42 | group_size=group_size, 43 | sym=sym, 44 | iters=2, 45 | seqlen=2, 46 | dataset=self.llm_dataloader, 47 | ) 48 | 49 | autoround.quantize() 50 | autoround.save_quantized("./saved") 51 | 52 | model = AutoModelForCausalLM.from_pretrained("./saved", device_map="cpu", trust_remote_code=True) 53 | model_infer(model, 
self.tokenizer) 54 | 55 | 56 | if __name__ == "__main__": 57 | unittest.main() 58 | -------------------------------------------------------------------------------- /auto_round_extension/vllm_ext/sitecustomize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | 17 | VLLM_ENABLE_AR_EXT = os.environ.get("VLLM_ENABLE_AR_EXT", "") in [ 18 | "1", 19 | "true", 20 | "True", 21 | ] 22 | 23 | if VLLM_ENABLE_AR_EXT: 24 | print("*****************************************************************************") 25 | print(f"* !!! VLLM_ENABLE_AR_EXT is set to {VLLM_ENABLE_AR_EXT}, applying auto_round_vllm_extension *") 26 | print("*****************************************************************************") 27 | 28 | import vllm.model_executor.layers.quantization.auto_round as auto_round_module 29 | 30 | from auto_round_extension.vllm_ext.auto_round_ext import AutoRoundExtensionConfig 31 | 32 | auto_round_module.AutoRoundConfig = AutoRoundExtensionConfig 33 | from auto_round_extension.vllm_ext.envs_ext import extra_environment_variables 34 | 35 | 36 | else: 37 | print("*****************************************************************************") 38 | print( 39 | f"* Sitecustomize is loaded, but VLLM_ENABLE_AR_EXT is set to {VLLM_ENABLE_AR_EXT}, skipping auto_round_vllm_extension *" 40 | ) 41 | print("*****************************************************************************") 42 | -------------------------------------------------------------------------------- /test/test_cpu/test_autoopt.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | import torch 8 | import transformers 9 | from transformers import AutoModelForCausalLM, AutoTokenizer 10 | 11 | from auto_round import AutoRoundAdam 12 | 13 | 14 | class LLMDataLoader: 15 | def __init__(self): 16 | self.batch_size = 1 17 | 18 | def __iter__(self): 19 | for i in range(2): 20 | yield torch.ones([1, 10], dtype=torch.long) 21 | 22 | 23 | class TestAutoRound(unittest.TestCase): 24 | @classmethod 25 | def setUpClass(self): 26 | model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" 27 | self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) 28 | self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 29 | self.llm_dataloader = LLMDataLoader() 30 | 31 | @classmethod 32 | def tearDownClass(self): 33 | shutil.rmtree("./saved", ignore_errors=True) 34 | shutil.rmtree("runs", ignore_errors=True) 35 | 36 | def test_Adam(self): 37 | bits, group_size, sym = 4, 128, False 38 | from auto_round.utils import get_block_names 39 | 40 | llm_block_names = get_block_names(self.model, quant_vision=True) 41 | bits, group_size, sym, batch_size = 4, 128, False, 20 42 | 
adamround = AutoRoundAdam( 43 | self.model, 44 | self.tokenizer, 45 | bits=bits, 46 | group_size=group_size, 47 | sym=sym, 48 | iters=2, 49 | seqlen=2, 50 | batch_size=batch_size, 51 | dataset=self.llm_dataloader, 52 | to_quant_block_names=llm_block_names, 53 | ) 54 | adamround.quantize() 55 | 56 | 57 | if __name__ == "__main__": 58 | unittest.main() 59 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | ci: 2 | autofix_prs: true 3 | autoupdate_schedule: quarterly 4 | 5 | repos: 6 | - repo: https://github.com/pre-commit/pre-commit-hooks 7 | rev: v6.0.0 8 | hooks: 9 | - id: check-json 10 | - id: check-yaml 11 | - id: debug-statements 12 | - id: mixed-line-ending 13 | args: [--fix=lf] 14 | 15 | - repo: https://github.com/Lucas-C/pre-commit-hooks 16 | rev: v1.5.5 17 | hooks: 18 | - id: insert-license 19 | files: | 20 | (?x)^( 21 | auto_round/.*(py|yaml|yml|sh)| 22 | auto_round_extension/.*(py|yaml|yml|sh) 23 | )$ 24 | args: 25 | [ 26 | --license-filepath=.azure-pipelines/license_template.txt, 27 | --use-current-year, 28 | --detect-license-in-X-top-lines=40, 29 | --skip-license-insertion-comment=Copyright, 30 | ] 31 | 32 | - repo: https://github.com/psf/black-pre-commit-mirror 33 | rev: 25.9.0 34 | hooks: 35 | - id: black 36 | files: (.*\.py)$ 37 | 38 | - repo: https://github.com/asottile/blacken-docs 39 | rev: 1.20.0 40 | hooks: 41 | - id: blacken-docs 42 | args: [--line-length=120, --skip-errors] 43 | additional_dependencies: 44 | - black==25.9.0 45 | 46 | - repo: https://github.com/codespell-project/codespell 47 | rev: v2.4.1 48 | hooks: 49 | - id: codespell 50 | args: [-w] 51 | additional_dependencies: 52 | - tomli 53 | 54 | - repo: https://github.com/crate-ci/typos 55 | rev: v1.38.1 56 | hooks: 57 | - id: typos 58 | 59 | - repo: https://github.com/pycqa/isort 60 | rev: 6.1.0 61 | hooks: 62 | - id: isort 63 | 64 | - repo: https://github.com/astral-sh/ruff-pre-commit 65 | rev: v0.14.0 66 | hooks: 67 | - id: ruff 68 | args: [--fix, --exit-non-zero-on-fix, --no-cache] 69 | -------------------------------------------------------------------------------- /auto_round/experimental/qmodules/base.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from abc import ABC, abstractmethod 16 | from typing import Optional, Union 17 | 18 | import torch 19 | 20 | __all__ = ["QModuleBase"] 21 | 22 | 23 | class QModuleBase(torch.nn.Module): 24 | """ 25 | Base class used to describe the weight creation and forward pass 26 | of different quantization schemes supported by Auto-Round. 
27 | The design is inspired by vLLM's CompressedTensorsScheme: 28 | https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py 29 | 30 | """ 31 | 32 | def __init__(self): 33 | super().__init__() 34 | 35 | @classmethod 36 | @abstractmethod 37 | def from_original(cls, config, original_layer: torch.nn.Module): 38 | raise NotImplementedError 39 | 40 | @classmethod 41 | @abstractmethod 42 | def get_min_capability(cls) -> int: 43 | """ 44 | Get minimum device capability. 45 | """ 46 | raise NotImplementedError 47 | 48 | @abstractmethod 49 | def process_weights_after_loading(self, layer: torch.nn.Module): 50 | """ 51 | Called after weight loading is complete for any cleanup that 52 | needs to occur. 53 | """ 54 | raise NotImplementedError 55 | -------------------------------------------------------------------------------- /.github/workflows/manual-binary-build-publish.yml: -------------------------------------------------------------------------------- 1 | name: AutoRound binary build and publish 2 | permissions: 3 | contents: read 4 | 5 | on: 6 | workflow_dispatch: 7 | inputs: 8 | branch: 9 | default: 'v0.9.3' 10 | description: 'Tag to build the binary' 11 | required: true 12 | type: string 13 | publish: 14 | default: false 15 | description: 'Publish the binary to PyPi' 16 | required: false 17 | type: boolean 18 | 19 | jobs: 20 | binary-build-and-publish: 21 | runs-on: ubuntu-latest 22 | strategy: 23 | matrix: 24 | option: ["full", "lib", "hpu"] 25 | fail-fast: true 26 | steps: 27 | - name: Checkout out Repo 28 | uses: actions/checkout@v4 29 | with: 30 | ref: ${{ inputs.branch }} 31 | 32 | - name: Set up Python 33 | uses: actions/setup-python@v2 34 | with: 35 | python-version: '3.12' 36 | 37 | - name: Install dependencies 38 | run: | 39 | python -m pip install --upgrade pip setuptools wheel 40 | 41 | - name: Build the binary 42 | run: | 43 | if [ "${{ matrix.option }}" == "full" ]; then 44 | echo "Building auto-round binary..." 45 | python setup.py sdist bdist_wheel 46 | else 47 | echo "Building auto-round-${{ matrix.option }} binary..." 
48 | python setup.py sdist bdist_wheel ${{ matrix.option }} 49 | fi 50 | 51 | - name: Publish the binary 52 | if: ${{ fromJSON(inputs.publish) }} 53 | env: 54 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 55 | working-directory: ./${{ matrix.repo }} 56 | run: | 57 | python -m pip install --upgrade twine 58 | # python -m twine upload dist/* 59 | 60 | - uses: actions/upload-artifact@v4.3.4 61 | with: 62 | name: dist-${{ matrix.option }} 63 | path: dist 64 | -------------------------------------------------------------------------------- /test/test_cuda/test_conv1d.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | import torch 8 | from _test_helpers import model_infer 9 | from transformers import AutoModelForCausalLM, AutoTokenizer 10 | 11 | from auto_round import AutoRound 12 | from auto_round.testing_utils import require_gptqmodel 13 | 14 | 15 | class LLMDataLoader: 16 | def __init__(self): 17 | self.batch_size = 1 18 | 19 | def __iter__(self): 20 | for i in range(2): 21 | yield torch.ones([1, 10], dtype=torch.long) 22 | 23 | 24 | class TestQuantizationConv1d(unittest.TestCase): 25 | @classmethod 26 | def setUpClass(self): 27 | self.model_name = "MBZUAI/LaMini-GPT-124M" 28 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) 29 | self.llm_dataloader = LLMDataLoader() 30 | 31 | @classmethod 32 | def tearDownClass(self): 33 | shutil.rmtree("./saved", ignore_errors=True) 34 | shutil.rmtree("runs", ignore_errors=True) 35 | 36 | @require_gptqmodel 37 | def test_quant(self): 38 | self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) 39 | bits, group_size, sym = 4, 128, True 40 | from auto_round import AutoRoundConfig 41 | 42 | autoround = AutoRound( 43 | self.model, 44 | self.tokenizer, 45 | bits=bits, 46 | group_size=group_size, 47 | sym=sym, 48 | iters=2, 49 | seqlen=2, 50 | dataset=self.llm_dataloader, 51 | ) 52 | 53 | autoround.quantize() 54 | autoround.save_quantized("./saved") 55 | 56 | model = AutoModelForCausalLM.from_pretrained("./saved", device_map="cuda", trust_remote_code=True) 57 | model_infer(model, self.tokenizer) 58 | 59 | 60 | if __name__ == "__main__": 61 | unittest.main() 62 | -------------------------------------------------------------------------------- /auto_round_extension/vllm_ext/envs_ext.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os 16 | from typing import Any, Callable 17 | 18 | from vllm.logger import init_logger 19 | 20 | logger = init_logger(__name__) 21 | 22 | # Define extra environment variables 23 | extra_environment_variables: dict[str, Callable[[], Any]] = { 24 | "VLLM_MXFP4_PRE_UNPACK_WEIGHTS": lambda: os.getenv("VLLM_MXFP4_PRE_UNPACK_WEIGHTS", "0") in ("1", "true", "True"), 25 | "VLLM_MXFP4_PRE_UNPACK_TO_FP8": lambda: os.getenv("VLLM_MXFP4_PRE_UNPACK_TO_FP8", "1") in ("1", "true", "True"), 26 | "VLLM_ENABLE_STATIC_MOE": lambda: os.getenv("VLLM_ENABLE_STATIC_MOE", "0") in ("1", "true", "True"), 27 | "VLLM_AR_MXFP4_MODULAR_MOE": lambda: os.getenv("VLLM_AR_MXFP4_MODULAR_MOE", "1") in ("1", "true", "True"), 28 | "VLLM_AR_POST_PROCESS_GPTOSS": lambda: os.getenv("VLLM_AR_POST_PROCESS_GPTOSS", "0") in ("1", "true", "True"), 29 | } 30 | # Add the extra environment variables to vllm.envs 31 | import vllm.envs as envs 32 | from vllm.envs import environment_variables 33 | 34 | # Merge the environment variables 35 | all_environment_variables = {**environment_variables, **extra_environment_variables} 36 | 37 | 38 | for name, value_fn in extra_environment_variables.items(): 39 | setattr(envs, name, value_fn()) 40 | 41 | logger.warning_once(f"Added extra environment variables: {list(extra_environment_variables.keys())}") 42 | -------------------------------------------------------------------------------- /auto_round_extension/triton/triton_utils/mixin.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | # MIT License 17 | # 18 | # Copyright (c) 2023 潘其威(William) 19 | # 20 | # Permission is hereby granted, free of charge, to any person obtaining a copy 21 | # of this software and associated documentation files (the "Software"), to deal 22 | # in the Software without restriction, including without limitation the rights 23 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 24 | # copies of the Software, and to permit persons to whom the Software is 25 | # furnished to do so, subject to the following conditions: 26 | # 27 | # The above copyright notice and this permission notice shall be included in all 28 | # copies or substantial portions of the Software. 29 | # 30 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 31 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 32 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 33 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 34 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 35 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 36 | # SOFTWARE. 
37 | class TritonModuleMixin: 38 | @classmethod 39 | def warmup(cls, model, transpose=False, seqlen=2048): 40 | pass 41 | -------------------------------------------------------------------------------- /auto_round_extension/triton/triton_utils_zp/mixin.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | # MIT License 17 | # 18 | # Copyright (c) 2023 潘其威(William) 19 | # 20 | # Permission is hereby granted, free of charge, to any person obtaining a copy 21 | # of this software and associated documentation files (the "Software"), to deal 22 | # in the Software without restriction, including without limitation the rights 23 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 24 | # copies of the Software, and to permit persons to whom the Software is 25 | # furnished to do so, subject to the following conditions: 26 | # 27 | # The above copyright notice and this permission notice shall be included in all 28 | # copies or substantial portions of the Software. 29 | # 30 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 31 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 32 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 33 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 34 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 35 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 36 | # SOFTWARE. 
37 | class TritonModuleMixin: 38 | @classmethod 39 | def warmup(cls, model, transpose=False, seqlen=2048): 40 | pass 41 | -------------------------------------------------------------------------------- /test/test_cpu/test_model_scope.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import os 3 | import shutil 4 | import sys 5 | import unittest 6 | 7 | sys.path.insert(0, "../..") 8 | 9 | import torch 10 | 11 | from auto_round import AutoRound 12 | 13 | 14 | class LLMDataLoader: 15 | def __init__(self): 16 | self.batch_size = 1 17 | 18 | def __iter__(self): 19 | for i in range(3): 20 | yield torch.ones([1, 10], dtype=torch.long) 21 | 22 | 23 | class TestModelScope(unittest.TestCase): 24 | @classmethod 25 | def setUpClass(self): 26 | self.saved_path = "./saved" 27 | self.dataset = LLMDataLoader() 28 | 29 | self.source_path, self.cache_path = "/tf_dataset/auto_round/modelscope", "/home/hostuser/.cache/modelscope" 30 | if os.path.exists(self.source_path): 31 | if not os.path.exists("/home/hostuser/.cache"): 32 | os.makedirs("/home/hostuser/.cache") 33 | shutil.copytree(self.source_path, self.cache_path, dirs_exist_ok=True) 34 | 35 | @classmethod 36 | def tearDownClass(self): 37 | shutil.rmtree("./saved", ignore_errors=True) 38 | shutil.rmtree("runs", ignore_errors=True) 39 | if os.path.exists(self.cache_path): 40 | shutil.rmtree(self.cache_path, ignore_errors=True) 41 | 42 | return super().tearDownClass() 43 | 44 | def test_llm(self): 45 | model_name = "Qwen/Qwen2.5-0.5B-Instruct" 46 | autoround = AutoRound( 47 | model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=self.dataset 48 | ) 49 | autoround.quantize_and_save() 50 | 51 | def test_mllm(self): 52 | model_name = "Qwen/Qwen2-VL-2B-Instruct" 53 | autoround = AutoRound( 54 | model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=self.dataset, batch_size=2 55 | ) 56 | autoround.quantize_and_save(self.saved_path) 57 | 58 | 59 | if __name__ == "__main__": 60 | unittest.main() 61 | -------------------------------------------------------------------------------- /auto_round/compressors/mllm/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os 16 | 17 | import requests 18 | 19 | from auto_round.utils import LazyImport 20 | 21 | PIL = LazyImport("PIL") 22 | Image = LazyImport("PIL.Image") 23 | 24 | VISUAL_KEYS = [ 25 | "thinker", 26 | "visual", 27 | "audio", 28 | "talker", 29 | "token2wav", 30 | "multi_modal_projector", 31 | "vision_tower", 32 | "multimodal_projector", 33 | "vision_model", 34 | "model.connector", 35 | ] 36 | 37 | 38 | def _extract_data_dir(dir_path: str): 39 | if os.path.isdir(dir_path): 40 | return dir_path 41 | elif "=" in dir_path: 42 | result = {} 43 | dir_path = dir_path.split(",") 44 | for _path in dir_path: 45 | k, v = _path.split("=") 46 | if k in ["image", "video", "audio"]: 47 | result[k] = v 48 | return result 49 | else: 50 | raise TypeError("incorrect input of extra_data_dir, please use auto_round --help for more details.") 51 | 52 | 53 | def fetch_image(path_or_url): 54 | if os.path.isfile(path_or_url): 55 | image_obj = Image.open(path_or_url) 56 | elif path_or_url.startswith("http://") or path_or_url.startswith("https://"): 57 | image_obj = Image.open(requests.get(path_or_url, stream=True).raw) 58 | else: 59 | raise TypeError(f"{path_or_url} neither a path or url.") 60 | 61 | return image_obj 62 | -------------------------------------------------------------------------------- /docs/alg_202508.md: -------------------------------------------------------------------------------- 1 | If you are evaluating LLaMA models with recent versions of Transformers, please 2 | remove `@use_kernel_forward_from_hub("RMSNorm")` in [modeling_llama.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L52C1-L52C40) and enable `add_bos_token`(this is set as default in AutoRound) in lm-eval to stabilize the accuracy. These adjustments affect the quantized model but not the BF16 model for the tasks evaluated in the AutoRoundv2 paper. 3 | 4 | All other settings follow the default configurations of AutoRound and lm-eval. 5 | 6 | | Qwen3-8B W2G64 | Avg. | arc_challenge | hellaswag | gsm8k | lambada_openai | mmlu | mmlupro | truthfulqa_mc1 | winogrande | 7 | |:------------------------------|:------:|:-------------:|:---------:|:------:|:--------------:|:------:|:-------:|:--------------:|:----------:| 8 | | AutoRound | 0.4373 | 0.4019 | 0.4437 | 0.4215 | 0.4826 | 0.5474 | 0.2630 | 0.3072 | 0.6314 | 9 | | AutoRound+alg_ext | 0.4787 | 0.4275 | 0.4516 | 0.5944 | 0.5181 | 0.5773 | 0.2807 | 0.3305 | 0.6496 | 10 | | AutoRoundBest+alg_ext lr 2e-3 | 0.4937 | 0.4505 | 0.474 | 0.5906 | 0.5556 | 0.6028 | 0.3127 | 0.3109 | 0.6527 | 11 | 12 | | Llama3.1-8B-Instruct W2G64 | Avg. 
| arc_challenge | hellaswag | gsm8k | lambada_openai | mmlu | mmlupro | truthfulqa_mc1 | winogrande | 13 | |:------------------------------|:------:|:-------------:|:---------:|:------:|:--------------:|:------:|:-------:|:--------------:|:----------:| 14 | | AutoRound | 0.3820 | 0.3635 | 0.4562 | 0.1622 | 0.5069 | 0.4411 | 0.1661 | 0.3207 | 0.6393 | 15 | | AutoRound+alg_ext | 0.4166 | 0.3712 | 0.4729 | 0.2039 | 0.5946 | 0.4981 | 0.2163 | 0.3011 | 0.6748 | 16 | | AutoRoundBest+alg_ext lr 2e-3 | 0.4539 | 0.4138 | 0.4999 | 0.3071 | 0.6233 | 0.5279 | 0.2364 | 0.3231 | 0.6993 | 17 | -------------------------------------------------------------------------------- /auto_round_extension/vllm_ext/tests/test_models.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import pytest 16 | from vllm.platforms import current_platform 17 | 18 | MODELS = [ 19 | # "/data5/yliu7/HF_HOME/unsloth-gpt-oss-20b-BF16-ar-MXFP4/" 20 | # "/data5/yliu7/HF_HOME/Qwen2.5-0.5B-Instruct-test-FP8_STATIC-fp8kv/" 21 | # "/data6/yiliu4/Qwen3-15B-A2B-Base-MXFP4", 22 | # "/data6/yiliu4/Llama-3.2-1B-Instruct-MXFP4-fp8attention", 23 | # "/data6/yiliu4/Llama-3.2-1B-Instruct-MXFP8" 24 | "/storage/yiliu7/ar_vllm_ext/quantized_model_qwen_mxfp4", 25 | "/storage/yiliu7/ar_vllm_ext/quantized_model_qwen_mxfp8", 26 | ] 27 | 28 | 29 | @pytest.fixture(autouse=True) 30 | def set_vllm_ar_env(monkeypatch): 31 | monkeypatch.setenv("VLLM_AR_MXFP4_MODULAR_MOE", "1") 32 | monkeypatch.setenv("VLLM_MXFP4_PRE_UNPACK_TO_FP8", "1") 33 | monkeypatch.setenv("VLLM_MXFP4_PRE_UNPACK_WEIGHTS", "0") 34 | monkeypatch.setenv("VLLM_ENABLE_STATIC_MOE", "0") 35 | monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "0") 36 | monkeypatch.setenv("VLLM_ENABLE_AR_EXT", "1") 37 | 38 | 39 | @pytest.mark.skipif( 40 | not current_platform.is_cuda(), 41 | reason="only supports CUDA backend.", 42 | ) 43 | @pytest.mark.parametrize("model", MODELS) 44 | def test_auto_round(vllm_runner, model): 45 | with vllm_runner(model, enforce_eager=True) as llm: 46 | output = llm.generate_greedy(["The capital of France is"], max_tokens=8) 47 | assert output 48 | print(f"output is: {output[0][1]}") 49 | -------------------------------------------------------------------------------- /auto_round/export/export_to_llmcompressor/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Dict, List 16 | 17 | from auto_round.utils import matches_any_regex, to_standard_regex 18 | 19 | 20 | def generate_ignore_regex_list(regex_config: Dict[str, Dict], layer_config: Dict[str, Dict]) -> List[str]: 21 | """ 22 | Generate ignore regex list for llm_compressor based on regex_config and layer_config. 23 | 24 | Rules: 25 | 1. Any pattern in regex_config with bits > 8 (i.e. effectively left unquantized) is ignored. 26 | 2. Any layer in layer_config with bits > 8 is also ignored, by its full layer name. 27 | 3. regex_config keys are normalized to llm_compressor's 're:...' style; layer_config keys are kept as-is. 28 | 29 | Args: 30 | regex_config (Dict[str, Dict]): dynamic quantization config 31 | layer_config (Dict[str, Dict]): layer-wise quantization config 32 | 33 | Returns: 34 | List[str]: List of regex patterns and layer names to ignore during quantization. 35 | """ 36 | prefix = "re:" 37 | ignore_regex: List[str] = [] 38 | 39 | # Step 1: Add regex_config keys with bits > 8 40 | for key, cfg in regex_config.items(): 41 | bits = cfg.get("bits") 42 | if bits > 8: 43 | ignore_regex.append(prefix + to_standard_regex(key)) 44 | 45 | # Step 2: Add full layer names from layer_config with bits > 8 46 | for key, cfg in layer_config.items(): 47 | bits = cfg.get("bits") 48 | if bits > 8: 49 | ignore_regex.append(key) 50 | 51 | return ignore_regex 52 | -------------------------------------------------------------------------------- /docs/gguf_alg_ext_acc.md: -------------------------------------------------------------------------------- 1 | We use **lm-eval** for evaluation. For LLaMA, we enabled `add_bos_token` and 2 | removed `@use_kernel_forward_from_hub("RMSNorm")` 3 | in [modeling_llama.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L52C1-L52C40) 4 | to stabilize accuracy during evaluation. All other settings follow the default configurations of AutoRound and lm-eval. 
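For reference, a minimal sketch of how `add_bos_token` can be enabled with lm-eval's hf backend is shown below; the model path, tasks, and batch size are placeholders rather than the exact settings used for these results, and flag details may vary across lm-eval versions:

```bash
# Illustrative only: evaluate a local checkpoint with the BOS token prepended.
lm_eval --model hf \
  --model_args pretrained=/path/to/quantized-llama-3.1-8b,add_bos_token=True \
  --tasks lambada_openai,hellaswag \
  --batch_size 16
```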
5 | 6 | *Average accuracy across `lambada_openai`, `hellaswag`, `piqa`, `winogrande`, `truthfulqa_mc1`, `openbookqa`, `boolq`, `arc_easy`, `arc_challenge` and `mmlu`.* 7 | 8 | |method|scheme|Llama-3.1-8B|Qwen2.5-7B-Instruct|Qwen3-8b|Qwen3-30B-A3B-Instruct-2507| 9 | |:-----|:-----|:-----------|:------------------|:-------|:--------------------------| 10 | |**BF16** | - |0.6295(100%)|0.6571(100%) |0.6322(100%)|0.6746(100%) | 11 | | **Optimized RTN** | q2_k_s | 0.5535(87.92%)| 0.6266(95.35%)|0.5901(93.35%)|0.6386(94.66%)| 12 | | **AutoRound+alg_ext** |q2_k_s|0.5740(91.18%)|0.6349(96.62%)|0.5962(94.31%)|0.6460(95.77%)| 13 | | **Optimized RTN** | q3_k_s | 0.6040(95.95%)|0.6382(97.12%)|0.6128(96.94%)|0.6598(97.82%)| 14 | | **AutoRound+alg_ext** |q3_k_s|0.6081(96.59%)|0.6503(98.97%)|0.6252(98.89%)|0.6622(98.17%)| 15 | | **Optimized RTN** | q3_k_m |0.6083(96.63%) |0.6418(97.68%)|0.6194(97.97%)|| 16 | | **AutoRound+alg_ext** |q3_k_m|0.6127(97.33%)|0.6533(99.42%)|0.6197(98.02%)|| 17 | | **Optimized RTN** | q4_k_s | 0.6228(98.94%)|0.6560(99.83%)|0.6303(99.70%)|0.6762(100.24%)| 18 | | **AutoRound+alg_ext** |q4_k_s|0.6239(99.11%)|0.6605(100.51%)|0.6320(99.98%)|0.6777(100.46%)| 19 | | **Optimized RTN** | q4_k_m |0.6252(99.32%) |0.6558(99.80%)|0.6296(99.59%)|| 20 | | **AutoRound+alg_ext** |q4_k_m|0.6257(99.40%)|0.6575(100.06%)|0.6340(100.29%)|| 21 | 22 | **Time cost** 23 | |model |Optimized RTN |AutoRound+alg_ext| 24 | |:--------------------------|:-------------|:----------------| 25 | |Llama-3.1-8B |1m25s |29m43s | 26 | |Qwen2.5-7B-Instruct |1m20s |35m35s | 27 | |Qwen3-8b |1m29s |47m58s | 28 | |Qwen3-30B-A3B-Instruct-2507|25m12s |12h47m39s | -------------------------------------------------------------------------------- /test/test_cpu/test_logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from io import StringIO 4 | 5 | from auto_round.logger import TRACE_LEVEL, AutoRoundFormatter, logger 6 | 7 | 8 | def test_logger(monkeypatch): 9 | # Mock the AR_LOG_LEVEL environment variable 10 | monkeypatch.setenv("AR_LOG_LEVEL", "TRACE") 11 | 12 | # Create a StringIO to capture log output 13 | log_output = StringIO() 14 | stream_handler = logging.StreamHandler(log_output) 15 | stream_handler.setFormatter(AutoRoundFormatter()) 16 | 17 | # Add the handler to the logger 18 | logger.addHandler(stream_handler) 19 | logger.setLevel(logging.getLevelName(os.getenv("AR_LOG_LEVEL", "INFO"))) 20 | 21 | # Log messages at different levels 22 | logger.trace("This is a TRACE message.") 23 | logger.debug("This is a DEBUG message.") 24 | logger.info("This is an INFO message.") 25 | logger.warning("This is a WARNING message.") 26 | logger.error("This is an ERROR message.") 27 | logger.critical("This is a CRITICAL message.") 28 | 29 | # Test warning_once functionality 30 | logger.warning_once("This is a WARNING_ONCE message.") 31 | logger.warning_once("This is a WARNING_ONCE message.") # Should not log again 32 | logger.warning_once("This is another unique WARNING_ONCE message.") # Should log 33 | 34 | # Remove the handler after the test 35 | logger.removeHandler(stream_handler) 36 | 37 | # Get the log output 38 | log_output.seek(0) 39 | logs = log_output.read() 40 | 41 | # Assertions for log levels 42 | assert "TRACE" in logs 43 | assert "This is a TRACE message." in logs 44 | assert "DEBUG" in logs 45 | assert "This is a DEBUG message." in logs 46 | assert "INFO" in logs 47 | assert "This is an INFO message." 
in logs 48 | assert "WARNING" in logs 49 | assert "This is a WARNING message." in logs 50 | assert "ERROR" in logs 51 | assert "This is an ERROR message." in logs 52 | assert "CRITICAL" in logs 53 | assert "This is a CRITICAL message." in logs 54 | 55 | # Assertions for warning_once 56 | assert logs.count("This is a WARNING_ONCE message.") == 1 57 | assert "This is another unique WARNING_ONCE message." in logs 58 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/change_color.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------- general approach start---------------- 4 | 5 | # 1. import this file: 6 | # source path/change_color.sh 7 | # 2. use COLOR/BG: 8 | # $VARIABLE_NAME && out_put_content && $RESET 9 | # 3. COLOR + BG: 10 | # $COLOR/BG_VARIABLE_NAME && $BG/COLOR_VARIABLE_NAME && out_put_content && $RESET 11 | # 4. custom 12 | # abbreviation(change number) 13 | # txt number range (30, 37) 14 | # bg number range (40, 47) 15 | # special effects number range (1, 7) 16 | # echo -en \\E[number1 + ; + number2 + ; + number3 + m" 17 | # e.g - BG_GRAY+LIGHT_RED = "echo -en \\E[47;31m" 18 | 19 | # -------------- general approach end----------------== 20 | 21 | 22 | # general setting 23 | # ------------- light_color start---------------- 24 | # black 25 | LIGHT_BLACK="echo -en \\E[30m" 26 | # red 27 | LIGHT_RED="echo -en \\E[31m" 28 | # green 29 | LIGHT_GREEN="echo -en \\E[32m" 30 | # yellow 31 | LIGHT_YELLOW="echo -en \\E[33m" 32 | # blue 33 | LIGHT_BLUE="echo -en \\E[34m" 34 | # purple 35 | LIGHT_PURPLE="echo -en \\E[35m" 36 | # cyan 37 | LIGHT_CYAN="echo -en \\E[36m" 38 | # gray 39 | LIGHT_GRAY="echo -en \\E[37m" 40 | # ------------- light_color end---------------- 41 | 42 | # ------------- bold_color start---------------- 43 | # black 44 | BOLD_BLACK="echo -en \\E[1;30m" 45 | # red 46 | BOLD_RED="echo -en \\E[1;31m" 47 | # green 48 | BOLD_GREEN="echo -en \\E[1;32m" 49 | # yellow 50 | BOLD_YELLOW="echo -en \\E[1;33m" 51 | # blue 52 | BOLD_BLUE="echo -en \\E[1;34m" 53 | # purple 54 | BOLD_PURPLE="echo -en \\E[1;35m" 55 | # cyan 56 | BOLD_CYAN="echo -en \\E[1;36m" 57 | # gray 58 | BOLD_GRAY="echo -en \\E[1;37m" 59 | # ------------- bold_color end---------------- 60 | 61 | # ------------- background_color start---------------- 62 | # black 63 | BG_BLACK="echo -en \\E[40m" 64 | # red 65 | BG_RED="echo -en \\E[41m" 66 | # green 67 | BG_GREEN="echo -en \\E[42m" 68 | # yellow 69 | BG_YELLOW="echo -en \\E[43m" 70 | # blue 71 | BG_BLUE="echo -en \\E[44m" 72 | # purple 73 | BG_PURPLE="echo -en \\E[45m" 74 | # cyan 75 | BG_CYAN="echo -en \\E[46m" 76 | # gray 77 | BG_GRAY="echo -en \\E[47m" 78 | # ------------- background_color end---------------- 79 | 80 | # close 81 | RESET="echo -en \\E[0m" 82 | -------------------------------------------------------------------------------- /docs/publication_list.md: -------------------------------------------------------------------------------- 1 | Full Publications/Events 2 | ========== 3 | 4 | ## 2025 5 | 6 | * Blog in Intel: [Advancing Low-Bit Quantization for LLMs: AutoRound x LLM Compressor](https://community.intel.com/t5/Blogs/Products-and-Solutions/HPC/Advancing-Low-Bit-Quantization-for-LLMs-AutoRound-x-LLM/post/1729336) (Dec 2025) 7 | 8 | * Blog in WeChat: [AutoRound x LLM Compressor:让低比特量化 LLM 更准、更好推理](https://mp.weixin.qq.com/s/l5WA-1_4ipffQN6GOH2Iqg) (Dec 2025) 9 | 10 | * Blog in vLLM: [Advancing Low‑Bit Quantization for LLMs: 
AutoRound x LLM Compressor](https://blog.vllm.ai/2025/12/09/intel-autoround-llmc.html) (Dec 2025) 11 | 12 | * Blog in RedHat: [Advancing Low‑Bit Quantization for LLMs: AutoRound x LLM Compressor](https://developers.redhat.com/articles/2025/12/09/advancing-low-bit-quantization-llms-autoround-x-llm-compressor) (Dec 2025) 13 | 14 | * arXiv: [SignRoundV2: Closing the Performance Gap in Extremely Low-Bit Post-Training Quantization for LLMs](https://arxiv.org/abs/2512.04746) (Dec 2025) 15 | 16 | * Blog in Intel: [AutoRound Meets SGLang: Enabling Quantized Model Inference with AutoRound](https://community.intel.com/t5/Blogs/Tech-Innovation/Artificial-Intelligence-AI/AutoRound-Meets-SGLang-Enabling-Quantized-Model-Inference-with/post/1727196) (Nov 2025) 17 | 18 | * Blog in LMSYS: [AutoRound Meets SGLang: Enabling Quantized Model Inference with AutoRound](https://lmsys.org/blog/2025-11-13-AutoRound/) (Nov 2025) 19 | 20 | * Blog in Medium: [Accelerating vLLM and SGLang Deployment using AutoRound](https://medium.com/@NeuralCompressor/accelerating-vllm-and-sglang-deployment-using-autoround-45fdc0b2683e) (Oct 2025) 21 | 22 | * Blog in HuggingFace: [What is AutoRound?](https://huggingface.co/blog/autoround) (Apr 2025) 23 | 24 | ## 2024 25 | 26 | * EMNLP: [Optimize Weight Rounding via Signed Gradient Descent for the Quantization of LLM](https://aclanthology.org/2024.findings-emnlp.662/) (Oct 2024) 27 | 28 | ## 2023 29 | 30 | * arXiv: [TEQ: Trainable Equivalent Transformation for Quantization of LLMs](https://arxiv.org/abs/2310.10944) (Oct 2023) 31 | 32 | * Blog in Medium: [Effective Post-Training Quantization for Large Language Models](https://medium.com/intel-analytics-software/effective-post-training-quantization-for-large-language-models-with-enhanced-smoothquant-approach-93e9d104fb98) (Apr 2023) 33 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/ut/run_ut.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -xe 3 | 4 | test_part=$1 5 | 6 | # install requirements 7 | echo "##[group]set up UT env..." 8 | export TQDM_MININTERVAL=60 9 | uv pip install pytest-cov pytest-html 10 | uv pip install -r /auto-round/test/test_cpu/requirements.txt \ 11 | --extra-index-url https://download.pytorch.org/whl/cpu 12 | 13 | # install latest gguf for ut test 14 | cd ~ || exit 1 15 | git clone -b master --quiet --single-branch https://github.com/ggml-org/llama.cpp.git && cd llama.cpp/gguf-py && uv pip install . sentencepiece 16 | 17 | cd /auto-round && uv pip install . 18 | 19 | echo "##[endgroup]" 20 | uv pip list 21 | 22 | cd /auto-round/test/test_cpu || exit 1 23 | find . -type f -exec sed -i '/sys\.path\.insert(0, "\.\.")/d' {} + 24 | 25 | export LD_LIBRARY_PATH=${HOME}/.venv/lib/:$LD_LIBRARY_PATH 26 | export FORCE_BF16=1 27 | export COVERAGE_RCFILE=/auto-round/.azure-pipelines/scripts/ut/.coverage 28 | auto_round_path=$(python -c 'import auto_round; print(auto_round.__path__[0])') 29 | 30 | LOG_DIR=/auto-round/log_dir 31 | mkdir -p ${LOG_DIR} 32 | ut_log_name=${LOG_DIR}/ut.log 33 | 34 | # Split test files into 5 parts 35 | find .
-name "test*.py" | sort > all_tests.txt 36 | total_lines=$(wc -l < all_tests.txt) 37 | NUM_CHUNKS=5 38 | q=$(( total_lines / NUM_CHUNKS )) 39 | r=$(( total_lines % NUM_CHUNKS )) 40 | if [ "$test_part" -le "$r" ]; then 41 | chunk_size=$(( q + 1 )) 42 | start_line=$(( (test_part - 1) * chunk_size + 1 )) 43 | else 44 | chunk_size=$q 45 | start_line=$(( r * (q + 1) + (test_part - r - 1) * q + 1 )) 46 | fi 47 | end_line=$(( start_line + chunk_size - 1 )) 48 | selected_files=$(sed -n "${start_line},${end_line}p" all_tests.txt) 49 | printf '%s\n' "${selected_files}" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run.sh 50 | cat run.sh 51 | bash run.sh 2>&1 | tee "${ut_log_name}" 52 | 53 | if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then 54 | echo "##[error]Find errors in pytest case, please check the output..." 55 | exit 1 56 | fi 57 | 58 | # if ut pass, collect the coverage file into artifacts 59 | cp .coverage "${LOG_DIR}/.coverage.part${test_part}" 60 | 61 | echo "UT finished successfully! " 62 | -------------------------------------------------------------------------------- /auto_round/inference/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from auto_round.utils import SUPPORTED_LAYER_TYPES 17 | 18 | 19 | def _expand_regex_config(regex_config, base_config, layer_names, model): 20 | """ 21 | Expand regex-based layer configs to full layer names. 
22 | 23 | Args: 24 | regex_config (dict): regex-based config (dynamic_config or part of extra_config) 25 | base_config (dict): extra_config to write into 26 | layer_names (list): known quantization layer names 27 | model (nn.Module): target model 28 | 29 | Returns: 30 | dict: expanded base_config 31 | """ 32 | if not regex_config: 33 | return base_config 34 | 35 | # Collect all supported layer names in model 36 | all_supported_layer_names = [n for n, m in model.named_modules() if isinstance(m, SUPPORTED_LAYER_TYPES)] 37 | 38 | # Identify which keys are regex patterns (not exact layer names) 39 | regex_keys = [k for k in regex_config.keys() if k not in all_supported_layer_names] 40 | 41 | for regex_key in regex_keys: 42 | try: 43 | pattern = re.compile(regex_key) 44 | except re.error: 45 | # invalid regex, skip silently 46 | continue 47 | 48 | # Prefer matches within layer_names first 49 | matched_layers = [ln for ln in layer_names if re.search(pattern, ln)] 50 | if not matched_layers: 51 | matched_layers = [ln for ln in all_supported_layer_names if re.search(pattern, ln)] 52 | 53 | if matched_layers: 54 | cfg = regex_config[regex_key] 55 | if cfg == {}: 56 | continue 57 | for ln in matched_layers: 58 | # do not overwrite explicit layer config 59 | if ln not in base_config: 60 | base_config[ln] = cfg 61 | 62 | return base_config 63 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ### License 4 | 5 | is licensed under the terms in [LICENSE]. By contributing to the project, you agree to the license and copyright terms therein and release your contribution under these terms. 6 | 7 | ### Sign your work 8 | 9 | Please use the sign-off line at the end of the patch. Your signature certifies that you wrote the patch or otherwise have the right to pass it on as an open-source patch. The rules are pretty simple: if you can certify 10 | the below (from [developercertificate.org](http://developercertificate.org/)): 11 | 12 | ``` 13 | Developer Certificate of Origin 14 | Version 1.1 15 | 16 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 17 | 660 York Street, Suite 102, 18 | San Francisco, CA 94110 USA 19 | 20 | Everyone is permitted to copy and distribute verbatim copies of this 21 | license document, but changing it is not allowed. 22 | 23 | Developer's Certificate of Origin 1.1 24 | 25 | By making a contribution to this project, I certify that: 26 | 27 | (a) The contribution was created in whole or in part by me and I 28 | have the right to submit it under the open source license 29 | indicated in the file; or 30 | 31 | (b) The contribution is based upon previous work that, to the best 32 | of my knowledge, is covered under an appropriate open source 33 | license and I have the right under that license to submit that 34 | work with modifications, whether created in whole or in part 35 | by me, under the same open source license (unless I am 36 | permitted to submit under a different license), as indicated 37 | in the file; or 38 | 39 | (c) The contribution was provided directly to me by some other 40 | person who certified (a), (b) or (c) and I have not modified 41 | it. 
42 | 43 | (d) I understand and agree that this project and the contribution 44 | are public and that a record of the contribution (including all 45 | personal information I submit with it, including my sign-off) is 46 | maintained indefinitely and may be redistributed consistent with 47 | this project or the open source license(s) involved. 48 | ``` 49 | 50 | Then you just add a line to every git commit message: 51 | 52 | Signed-off-by: Joe Smith 53 | 54 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 55 | 56 | If you set your `user.name` and `user.email` git configs, you can sign your 57 | commit automatically with `git commit -s`. 58 | -------------------------------------------------------------------------------- /test/test_cpu/test_auto_scheme.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import sys 3 | import unittest 4 | 5 | sys.path.insert(0, "../..") 6 | from auto_round import AutoRound, AutoRoundConfig, AutoScheme 7 | 8 | 9 | class TestAutoScheme(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(self): 12 | self.save_dir = "./saved" 13 | self.tasks = "lambada_openai" 14 | 15 | @classmethod 16 | def tearDownClass(self): 17 | shutil.rmtree("./saved", ignore_errors=True) 18 | shutil.rmtree("runs", ignore_errors=True) 19 | 20 | def test_auto_scheme_export(self): 21 | model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" 22 | scheme = AutoScheme(avg_bits=2, options=("W2A16"), nsamples=1, ignore_scale_zp_bits=True) 23 | ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1) 24 | ar.quantize_and_save(self.save_dir) 25 | shutil.rmtree(self.save_dir, ignore_errors=True) 26 | 27 | scheme = AutoScheme(avg_bits=4, options=("mxfp4"), nsamples=1, ignore_scale_zp_bits=True) 28 | ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1) 29 | ar.quantize_and_save(self.save_dir) 30 | shutil.rmtree(self.save_dir, ignore_errors=True) 31 | 32 | def test_layer_config(self): 33 | from auto_round.auto_scheme.utils import compute_avg_bits_for_model 34 | from auto_round.utils import get_module 35 | 36 | target_bits = 3.0 37 | model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" 38 | scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "BF16")) 39 | user_layer_config = {"model.decoder.layers.10.fc1": {"bits": 8, "group_size": 32, "sym": False}} 40 | ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) 41 | model, layer_config = ar.quantize() 42 | self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["bits"], 8) 43 | self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["sym"], False) 44 | self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["group_size"], 32) 45 | layer = get_module(model, "model.decoder.layers.10.fc1") 46 | self.assertEqual(layer.bits, 8) 47 | self.assertEqual(layer.sym, False) 48 | self.assertEqual(layer.group_size, 32) 49 | avg_bits, _ = compute_avg_bits_for_model(model) 50 | print(avg_bits) 51 | assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 52 | 53 | 54 | if __name__ == "__main__": 55 | unittest.main() 56 | -------------------------------------------------------------------------------- /.azure-pipelines/compatibility-test.yml: -------------------------------------------------------------------------------- 1 | trigger: none 2 | 3 | pr: 4 | autoCancel: true 5 | drafts: false 6 | branches: 7 | include: 8 | - main 9 | paths: 10 | include: 11 | - auto_round 12 | - 
auto_round_extension 13 | - setup.py 14 | - setup.cfg 15 | - requirements.txt 16 | - requirements-cpu.txt 17 | - .azure-pipelines/compatibility-test.yml 18 | exclude: 19 | - "*.md" 20 | - "**/*.md" 21 | 22 | stages: 23 | - stage: 24 | displayName: Compatibility Test 25 | dependsOn: [] 26 | jobs: 27 | - job: 28 | timeoutInMinutes: 20 29 | strategy: 30 | matrix: 31 | Python310_Linux: 32 | python_version: '3.10' 33 | vmImage: 'ubuntu-latest' 34 | Python311_Linux: 35 | python_version: '3.11' 36 | vmImage: 'ubuntu-latest' 37 | Python312_Linux: 38 | python_version: '3.12' 39 | vmImage: 'ubuntu-latest' 40 | Python313_Linux: 41 | python_version: '3.13' 42 | vmImage: 'ubuntu-latest' 43 | Python314_Linux: 44 | python_version: '3.14' 45 | vmImage: 'ubuntu-latest' 46 | 47 | Python310_Windows: 48 | python_version: '3.10' 49 | vmImage: 'windows-latest' 50 | Python311_Windows: 51 | python_version: '3.11' 52 | vmImage: 'windows-latest' 53 | Python312_Windows: 54 | python_version: '3.12' 55 | vmImage: 'windows-latest' 56 | Python313_Windows: 57 | python_version: '3.13' 58 | vmImage: 'windows-latest' 59 | Python314_Windows: 60 | python_version: '3.14' 61 | vmImage: 'windows-latest' 62 | 63 | pool: 64 | vmImage: $(vmImage) 65 | 66 | steps: 67 | - task: UsePythonVersion@0 68 | inputs: 69 | versionSpec: '$(python_version)' 70 | displayName: 'Use Python $(python_version)' 71 | 72 | - script: | 73 | python -m pip install --upgrade pip uv 74 | uv pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu 75 | uv pip install . 76 | pip list 77 | env: 78 | PYTHONUNBUFFERED: '1' 79 | UV_NO_PROGRESS: '1' 80 | UV_SYSTEM_PYTHON: '1' 81 | displayName: 'Install dependencies' 82 | 83 | - script: | 84 | python -c "import auto_round" 85 | displayName: 'Run compatibility test' 86 | -------------------------------------------------------------------------------- /.github/workflows/compatibility-test.yml: -------------------------------------------------------------------------------- 1 | name: Compatibility Test on ARM64 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | types: [opened, reopened, ready_for_review, synchronize] 7 | paths: 8 | - "auto_round/**" 9 | - "auto_round_extension/**" 10 | - "setup.py" 11 | - "setup.cfg" 12 | - "requirements.txt" 13 | - "requirements-cpu.txt" 14 | - ".github/workflows/compatibility-test.yml" 15 | - "!**/*.md" 16 | workflow_dispatch: 17 | 18 | concurrency: 19 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 20 | cancel-in-progress: true 21 | 22 | jobs: 23 | compatibility-test-arm: 24 | name: ${{ matrix.os-name }} on ARM64 25 | runs-on: ${{ matrix.os }} 26 | timeout-minutes: 60 27 | strategy: 28 | fail-fast: false 29 | matrix: 30 | include: 31 | - os: macos-latest 32 | os-name: macOS 33 | shell: bash 34 | - os: ubuntu-24.04-arm 35 | os-name: Linux 36 | shell: bash 37 | 38 | defaults: 39 | run: 40 | shell: ${{ matrix.shell }} 41 | 42 | steps: 43 | - name: Checkout code 44 | uses: actions/checkout@v6 45 | 46 | - name: Set up Python 47 | uses: actions/setup-python@v6 48 | with: 49 | python-version: "3.12" 50 | 51 | - name: Verify ARM architecture 52 | run: | 53 | python -c "import platform; print(f'Architecture: {platform.machine()}')" 54 | 55 | - name: Install dependencies 56 | run: | 57 | python -m pip install --upgrade pip uv setuptools 58 | uv pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu 59 | uv pip install datasets --upgrade 60 | uv pip install . 
61 | pip list 62 | env: 63 | PYTHONUNBUFFERED: "1" 64 | UV_NO_PROGRESS: "1" 65 | UV_SYSTEM_PYTHON: "1" 66 | TQDM_MININTERVAL: "60" 67 | 68 | - name: Run compatibility test 69 | run: | 70 | set -xe 71 | python -c "import auto_round" 72 | echo "============================================================================" 73 | auto-round --model_name Qwen/Qwen3-0.6B --bits 4 --iters 0 --nsamples 8 74 | echo "============================================================================" 75 | auto-round --model_name Qwen/Qwen3-0.6B --bits 4 --iters 1 --nsamples 8 76 | env: 77 | TQDM_MININTERVAL: "60" 78 | PYTHONUNBUFFERED: "1" 79 | -------------------------------------------------------------------------------- /test/test_cpu/test_load_awq_gptq.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import sys 3 | import unittest 4 | 5 | sys.path.insert(0, "../..") 6 | 7 | from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer 8 | 9 | 10 | class TestAutoRound(unittest.TestCase): 11 | def model_infer(self, model, tokenizer): 12 | prompts = [ 13 | "Hello,my name is", 14 | # "The president of the United States is", 15 | # "The capital of France is", 16 | # "The future of AI is", 17 | ] 18 | 19 | inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) 20 | 21 | outputs = model.generate( 22 | input_ids=inputs["input_ids"].to(model.device), 23 | attention_mask=inputs["attention_mask"].to(model.device), 24 | do_sample=False, ## change this to follow official usage 25 | max_new_tokens=5, 26 | ) 27 | generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] 28 | 29 | decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 30 | 31 | for i, prompt in enumerate(prompts): 32 | print(f"Prompt: {prompt}") 33 | print(f"Generated: {decoded_outputs[i]}") 34 | print("-" * 50) 35 | 36 | @classmethod 37 | def tearDownClass(self): 38 | shutil.rmtree("runs", ignore_errors=True) 39 | 40 | def test_load_gptq_no_dummy_gidx_model(self): 41 | model_name = "/tf_dataset/auto_round/models/ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1" 42 | quantization_config = AutoRoundConfig() 43 | with self.assertRaises(NotImplementedError) as cm: 44 | model = AutoModelForCausalLM.from_pretrained( 45 | model_name, 46 | torch_dtype="auto", 47 | trust_remote_code=True, 48 | device_map="cpu", 49 | quantization_config=quantization_config, 50 | ) 51 | 52 | def test_load_awq(self): 53 | model_name = "/tf_dataset/auto_round/models/casperhansen/opt-125m-awq" 54 | quantization_config = AutoRoundConfig() 55 | model = AutoModelForCausalLM.from_pretrained( 56 | model_name, 57 | torch_dtype="auto", 58 | trust_remote_code=True, 59 | device_map="cpu", 60 | quantization_config=quantization_config, 61 | ) 62 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 63 | self.model_infer(model, tokenizer) 64 | -------------------------------------------------------------------------------- /.azure-pipelines/template/ut-template.yml: -------------------------------------------------------------------------------- 1 | parameters: 2 | - name: dockerConfigName 3 | type: string 4 | default: "commonDockerConfig" 5 | - name: repo 6 | type: string 7 | default: "https://github.com/intel/auto-round" 8 | - name: utScriptFileName 9 | type: string 10 | - name: uploadPath 11 | type: string 12 | - name: utArtifact 13 | type: string 14 | - name: utTestMode 15 | 
type: string 16 | default: "coverage" 17 | - name: utContainerName 18 | type: string 19 | default: "AutoRoundUnitTest" 20 | - name: imageSource 21 | type: string 22 | default: "build" 23 | 24 | steps: 25 | - template: docker-template.yml 26 | parameters: 27 | dockerConfigName: ${{ parameters.dockerConfigName }} 28 | repoName: "auto-round" 29 | repoTag: "py312" 30 | dockerFileName: "Dockerfile" 31 | containerName: ${{ parameters.utContainerName }} 32 | repo: ${{ parameters.repo }} 33 | imageSource: ${{ parameters.imageSource }} 34 | 35 | - ${{ if eq(parameters.imageSource, 'build') }}: 36 | - script: | 37 | docker exec ${{ parameters.utContainerName }} bash -c "cd /auto-round \ 38 | && uv pip install torch==2.8.0 torchvision --index-url https://download.pytorch.org/whl/cpu \ 39 | && uv pip install intel-extension-for-pytorch==2.8.0 \ 40 | && uv pip install -r requirements.txt \ 41 | && uv pip install -r requirements-cpu.txt \ 42 | && uv pip list" 43 | displayName: "Env Setup" 44 | 45 | - ${{ if eq(parameters.imageSource, 'pull') }}: 46 | - script: | 47 | docker exec ${{ parameters.utContainerName }} bash -c "cd /auto-round \ 48 | && python setup.py bdist_wheel lib \ 49 | && pip install dist/*.whl \ 50 | && pip list" 51 | displayName: "HPU Env Setup" 52 | 53 | - script: | 54 | docker exec ${{ parameters.utContainerName }} bash -c "cd /auto-round/.azure-pipelines/scripts \ 55 | && bash ut/${{ parameters.utScriptFileName }}.sh ${{ parameters.utTestMode }}" 56 | displayName: "Run UT" 57 | 58 | - task: PublishPipelineArtifact@1 59 | condition: succeeded() 60 | inputs: 61 | targetPath: ${{ parameters.uploadPath }} 62 | artifact: ${{ parameters.utArtifact }}_coverage 63 | publishLocation: "pipeline" 64 | 65 | - task: Bash@3 66 | condition: always() 67 | inputs: 68 | targetType: "inline" 69 | script: | 70 | docker stop ${{ parameters.utContainerName }} 71 | docker rm -vf ${{ parameters.utContainerName }} || true 72 | displayName: "Docker clean up" 73 | -------------------------------------------------------------------------------- /test/test_cuda/test_vllm.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | """Test model set-up and inference for quantized HF models supported 3 | on the AutoRound. 4 | 5 | Validating the configuration and printing results for manual checking. 6 | 7 | Run `pytest test/test_cuda/test_vllm.py`. 8 | """ 9 | 10 | import os 11 | import shutil 12 | import subprocess 13 | 14 | import pytest 15 | from vllm import LLM, SamplingParams 16 | from vllm.platforms import current_platform 17 | 18 | MODELS = [ 19 | "OPEA/Qwen2.5-0.5B-Instruct-int4-sym-inc", ##auto_round:auto_gptq 20 | "Intel/Qwen2-0.5B-Instruct-int4-sym-AutoRound", ##auto_round:auto_awq 21 | ] 22 | 23 | 24 | @pytest.mark.skipif( 25 | not current_platform.is_cpu() and not current_platform.is_xpu() and not current_platform.is_cuda(), 26 | reason="only supports CPU/XPU/CUDA backend.", 27 | ) 28 | @pytest.mark.parametrize("model", MODELS) 29 | def test_auto_round(model): 30 | # Sample prompts. 31 | prompts = [ 32 | "The capital of France is", 33 | "The future of AI is", 34 | ] 35 | # Create a sampling params object. 36 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 37 | # Create an LLM. 38 | QUANTIZATION = "auto-round" 39 | llm = LLM(model=model, quantization=QUANTIZATION, trust_remote_code=True, tensor_parallel_size=1) 40 | # Generate texts from the prompts. 
41 | # The output is a list of RequestOutput objects 42 | # that contain the prompt, generated text, and other information. 43 | outputs = llm.generate(prompts, sampling_params) 44 | # Print the outputs. 45 | for output in outputs: 46 | prompt = output.prompt 47 | generated_text = output.outputs[0].text 48 | if "France" in prompt: 49 | assert "Paris" in generated_text 50 | 51 | 52 | @pytest.mark.parametrize("model", MODELS) 53 | def test_vllm_lm_eval(model): 54 | if shutil.which("auto-round") is None: 55 | pytest.skip("auto-round CLI not available") 56 | 57 | env = os.environ.copy() 58 | env["VLLM_SKIP_WARMUP"] = "true" 59 | env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" 60 | 61 | cmd = [ 62 | "auto-round", 63 | "--model", 64 | model, 65 | "--eval", 66 | "--tasks", 67 | "lambada_openai", 68 | "--eval_bs", 69 | "8", 70 | "--eval_backend", 71 | "vllm", 72 | "--limit", 73 | "10", 74 | ] 75 | 76 | proc = subprocess.run(cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) 77 | assert proc.returncode == 0, f"auto-round failed (rc={proc.returncode}):\n{proc.stdout}" 78 | -------------------------------------------------------------------------------- /test/test_hpu/test_auto_round.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from _test_helpers import is_pytest_mode_compile, is_pytest_mode_lazy 4 | 5 | from auto_round.utils import is_hpex_available 6 | 7 | 8 | def run_opt_125m_on_hpu(): 9 | from transformers import AutoModelForCausalLM, AutoTokenizer 10 | 11 | from auto_round import AutoRound 12 | 13 | model_name = "facebook/opt-125m" 14 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) 15 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 16 | 17 | bits, group_size, sym = 4, 128, False 18 | autoround = AutoRound( 19 | model, 20 | tokenizer, 21 | bits=bits, 22 | group_size=group_size, 23 | sym=sym, 24 | iters=2, 25 | seqlen=2, 26 | ) 27 | q_model, qconfig = autoround.quantize() 28 | assert q_model is not None, "Expected q_model to be not None" 29 | 30 | 31 | @pytest.mark.skipif(not is_hpex_available(), reason="HPU is not supported") 32 | @pytest.mark.skipif(not is_pytest_mode_lazy(), reason="Only for lazy mode") 33 | def test_opt_125m_lazy_mode(): 34 | run_opt_125m_on_hpu() 35 | 36 | 37 | @pytest.mark.skipif(not is_hpex_available(), reason="HPU is not supported") 38 | @pytest.mark.skipif(not is_pytest_mode_compile(), reason="Only for compile mode") 39 | def test_opt_125m_compile_mode(): 40 | torch._dynamo.reset() 41 | run_opt_125m_on_hpu() 42 | 43 | 44 | def test_import(): 45 | from auto_round import AutoRound 46 | from auto_round.export.export_to_itrex.export import WeightOnlyLinear, save_quantized_as_itrex 47 | 48 | 49 | @pytest.mark.parametrize( 50 | "data_type", 51 | ["fp8_to_int_sym"], 52 | ) 53 | def test_w4a8(data_type): 54 | from transformers import AutoModelForCausalLM, AutoTokenizer 55 | 56 | from auto_round import AutoRound 57 | 58 | model_name = "facebook/opt-125m" 59 | model = AutoModelForCausalLM.from_pretrained( 60 | model_name, 61 | torch_dtype="auto", 62 | attn_implementation="eager", 63 | trust_remote_code=True, 64 | ) 65 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 66 | 67 | autoround = AutoRound( 68 | model, 69 | tokenizer, 70 | bits=4, 71 | group_size=128, 72 | iters=2, 73 | seqlen=2, 74 | data_type=data_type, 75 | act_data_type="fp8_sym", 76 | act_bits=8, 77 | 
nsamples=1, 78 | act_dynamic=False, 79 | ) 80 | q_model, qconfig = autoround.quantize() 81 | assert q_model is not None, "Expected q_model to be not None" 82 | -------------------------------------------------------------------------------- /auto_round_extension/vllm_ext/tests/test_fp8kv.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import pytest 15 | import torch 16 | from vllm.platforms import current_platform 17 | 18 | 19 | def cuda_capability_at_least(major, minor): 20 | device_capability = torch.cuda.get_device_capability() 21 | return device_capability[0] >= major or (device_capability[0] == major and device_capability[1] >= minor) 22 | 23 | 24 | MODELS = ["/home/yiliu7/workspace/auto-round/examples/Qwen2.5-0.5B-Instruct-ar-MXFP4-fp8"] 25 | 26 | 27 | @pytest.fixture(autouse=True) 28 | def set_vllm_ar_env(monkeypatch): 29 | monkeypatch.setenv("VLLM_AR_MXFP4_MODULAR_MOE", "1") 30 | monkeypatch.setenv("VLLM_MXFP4_PRE_UNPACK_TO_FP8", "1") 31 | monkeypatch.setenv("VLLM_MXFP4_PRE_UNPACK_WEIGHTS", "0") 32 | monkeypatch.setenv("VLLM_ENABLE_STATIC_MOE", "0") 33 | monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "0") 34 | monkeypatch.setenv("VLLM_ENABLE_AR_EXT", "1") 35 | monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") 36 | monkeypatch.setenv("VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION", "1") 37 | monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER") 38 | 39 | 40 | @pytest.mark.skipif( 41 | not current_platform.is_cuda(), 42 | reason="only supports CUDA backend.", 43 | ) 44 | @pytest.mark.skipif( 45 | not cuda_capability_at_least(10, 0), reason="FP8 KV cache only supported on CUDA with compute capability >= 10.0" 46 | ) 47 | @pytest.mark.parametrize("model", MODELS) 48 | def test_auto_fp8_kv(vllm_runner, model): 49 | with vllm_runner( 50 | model, 51 | # enforce_eager=True, 52 | kv_cache_dtype="fp8", 53 | gpu_memory_utilization=0.1, 54 | ) as llm: 55 | output = llm.generate_greedy(["The capital of France is"], max_tokens=8) 56 | assert ( 57 | llm.llm.llm_engine.engine_core.engine_core.model_executor.driver_worker.worker.model_runner.kv_cache_dtype 58 | == torch.uint8 59 | ), f"Expected kv_cache_dtype to be torch.uint8, but got {llm.llm.llm_engine.engine_core.engine_core.model_executor.driver_worker.worker.model_runner.kv_cache_dtype}" 60 | assert output 61 | print(f"output is: {output[0][1]}") 62 | -------------------------------------------------------------------------------- /test/test_cuda/test_diffusion.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import os 3 | import re 4 | import shutil 5 | import sys 6 | import unittest 7 | 8 | import requests 9 | 10 | sys.path.insert(0, "../..") 11 | 12 | from diffusers import AutoPipelineForText2Image 13 | from PIL import Image 14 | 15 | from auto_round import AutoRoundDiffusion 16 | from auto_round.testing_utils import 
require_gptqmodel, require_optimum, require_vlm_env 17 | 18 | 19 | class TestAutoRound(unittest.TestCase): 20 | @classmethod 21 | def setUpClass(self): 22 | self.model_name = "/dataset/FLUX.1-dev" 23 | 24 | @classmethod 25 | def tearDownClass(self): 26 | shutil.rmtree("runs", ignore_errors=True) 27 | 28 | @require_optimum 29 | def test_diffusion_tune(self): 30 | ## load the model 31 | pipe = AutoPipelineForText2Image.from_pretrained(self.model_name).to("cuda") 32 | model = pipe.transformer 33 | 34 | layer_config = {} 35 | # skip some layers since it takes much time 36 | for n, m in model.named_modules(): 37 | if m.__class__.__name__ != "Linear": 38 | continue 39 | match = re.search(r"blocks\.(\d+)", n) 40 | if match and int(match.group(1)) > 0: 41 | layer_config[n] = {"bits": 16, "act_bits": 16} 42 | 43 | ## quantize the model 44 | autoround = AutoRoundDiffusion( 45 | pipe, 46 | tokenizer=None, 47 | scheme="MXFP4", 48 | iters=1, 49 | nsamples=1, 50 | num_inference_steps=2, 51 | layer_config=layer_config, 52 | dataset="/dataset/captions_source.tsv", 53 | ) 54 | # skip model saving since it takes much time 55 | autoround.quantize() 56 | 57 | def test_diffusion_rtn(self): 58 | ## load the model 59 | pipe = AutoPipelineForText2Image.from_pretrained(self.model_name) 60 | 61 | ## quantize the model 62 | autoround = AutoRoundDiffusion( 63 | pipe, 64 | tokenizer=None, 65 | scheme="MXFP4", 66 | iters=0, 67 | num_inference_steps=2, 68 | dataset="/dataset/captions_source.tsv", 69 | ) 70 | # skip model saving since it takes much time 71 | autoround.quantize() 72 | 73 | def test_diffusion_model_checker(self): 74 | from auto_round.utils import is_diffusion_model 75 | 76 | self.assertTrue(is_diffusion_model("/dataset/FLUX.1-dev")) 77 | self.assertTrue(is_diffusion_model("/models/stable-diffusion-2-1")) 78 | self.assertTrue(is_diffusion_model("/models/stable-diffusion-xl-base-1.0")) 79 | self.assertFalse(is_diffusion_model("/models/Qwen3-8B")) 80 | 81 | 82 | if __name__ == "__main__": 83 | unittest.main() 84 | -------------------------------------------------------------------------------- /auto_round_extension/vllm_ext/mxfp8_qdq_utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import torch 4 | 5 | __all__ = ["get_fp_scale", "dequant_mx_fp8", "quant_mx_fp8"] 6 | 7 | 8 | # def get_fp_scale(scale_e8m0): 9 | # # https://github.com/pytorch/ao/blob/994a4ba6c869854fcaa6ca7e118fcbd75e6c28cc/torchao/prototype/mx_formats/mx_tensor.py#L337 10 | # assert scale_e8m0.dtype == torch.uint8, f"Expected uint8, got {scale_e8m0.dtype}" 11 | # E8M0_EXPONENT_BIAS = 127 12 | # scale_e8m0 = scale_e8m0.view(torch.uint8) 13 | # s_offset = scale_e8m0.to(torch.int16) - E8M0_EXPONENT_BIAS 14 | # # TODO(later): it would be nice if there was a way to do the 2^x operation 15 | # # in PyTorch without creating a tensor of twos 16 | # two = torch.full(s_offset.size(), 2.0, device=scale_e8m0.device) 17 | # # pow(two, s_offset) can be out of range of floating point formats. 18 | # # TODO(later): handle this for float16 if we decide to support float16 19 | # # scales. 
20 | # s_fp = torch.pow(two, s_offset) 21 | 22 | # return s_fp 23 | 24 | 25 | def get_fp_scale(scale_e8m0): 26 | # https://github.com/pytorch/ao/blob/994a4ba6c869854fcaa6ca7e118fcbd75e6c28cc/torchao/prototype/mx_formats/mx_tensor.py#L337 27 | E8M0_EXPONENT_BIAS = 127 28 | 29 | scale_e8m0 = scale_e8m0.view(torch.uint8) 30 | s_offset = scale_e8m0.to(torch.int16) - E8M0_EXPONENT_BIAS 31 | # TODO(later): it would be nice if there was a way to do the 2^x operation 32 | # in PyTorch without creating a tensor of twos 33 | # two = torch.full(s_offset.size(), 2.0, device=scale_e8m0.device) 34 | # pow(two, s_offset) can be out of range of floating point formats. 35 | # TODO(later): handle this for float16 if we decide to support float16 36 | # scales. 37 | # s_fp = torch.pow(two, s_offset) 38 | # !!!!NOTE Critical: fixed the OoM issue when using HPU graph 39 | s_fp = torch.pow(2.0, s_offset.to(torch.float)) 40 | 41 | return s_fp 42 | 43 | 44 | def dequant_mx_fp8(weight_fp8, scale_e8m0, block_size, target_dtype): 45 | scale_float = get_fp_scale(scale_e8m0) 46 | weight_bf16 = weight_fp8.to(torch.bfloat16) 47 | weight_original_shape = weight_bf16.shape 48 | weight_bf16 = weight_bf16.reshape(-1, block_size) 49 | scale_float = scale_float.reshape(-1, 1) 50 | dequant_weight = weight_bf16 * scale_float 51 | dequant_weight = dequant_weight.reshape(weight_original_shape) 52 | return dequant_weight.to(target_dtype) 53 | 54 | 55 | def quant_mx_fp8(tensor): 56 | from auto_round_extension.vllm_ext.utils import to_mx_fp8e4m3 57 | 58 | scale_e8m0_biased, data_lp = to_mx_fp8e4m3( 59 | data_hp=tensor, 60 | elem_dtype=torch.float8_e4m3fn, 61 | block_size=32, 62 | ) 63 | return scale_e8m0_biased, data_lp 64 | -------------------------------------------------------------------------------- /auto_round_extension/vllm_ext/quant_method_moe.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from typing import Optional 16 | 17 | import torch 18 | from vllm.logger import init_logger 19 | from vllm.model_executor.layers.fused_moe import ( 20 | FusedMoEConfig, 21 | FusedMoEMethodBase, 22 | ) 23 | from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig 24 | from vllm.model_executor.layers.quantization.auto_round import AutoRoundConfig 25 | 26 | from auto_round.schemes import QuantizationScheme 27 | from auto_round_extension.vllm_ext.utils import _is_mxfp4_w4a4, _is_mxfp8_w8a8, get_scheme, need_quantize 28 | 29 | logger = init_logger(__name__) 30 | 31 | 32 | QMOE_METHODS_DISPATCH_TABLE = {} 33 | 34 | 35 | class AutoRoundMoEMethod(FusedMoEMethodBase): 36 | def __init__(self, moe: FusedMoEConfig): 37 | super().__init__(moe) 38 | 39 | @staticmethod 40 | def get_moe_method( 41 | quant_config: AutoRoundConfig, 42 | layer: torch.nn.Module, 43 | prefix: str, 44 | ) -> "AutoRoundMoEMethod": 45 | 46 | def get_impl(scheme: QuantizationScheme): 47 | if not need_quantize(scheme.bits): 48 | from vllm.model_executor.layers.fused_moe.layer import ( 49 | UnquantizedFusedMoEMethod, 50 | ) 51 | 52 | return UnquantizedFusedMoEMethod(layer.moe_config) 53 | 54 | elif _is_mxfp4_w4a4(scheme): 55 | from auto_round_extension.vllm_ext.moe_impl_mxfp4 import AutoRoundMoEMethodMXFp4Impl 56 | 57 | return AutoRoundMoEMethodMXFp4Impl(quant_config, layer.moe_config) 58 | 59 | elif _is_mxfp8_w8a8(scheme): 60 | from auto_round_extension.vllm_ext.moe_impl_mxfp8 import AutoRoundMoEMethodMXFp8Impl 61 | 62 | return AutoRoundMoEMethodMXFp8Impl(quant_config, layer.moe_config) 63 | 64 | raise ValueError(f"Unsupported FusedMoe scheme: {scheme}") 65 | 66 | layer_scheme = get_scheme(quant_config, prefix) 67 | impl = get_impl(layer_scheme) 68 | layer._prefix = prefix 69 | logger.debug("Apply %s to %s", impl.__class__.__name__, prefix) 70 | return impl 71 | 72 | def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]: 73 | return self.impl.get_fused_moe_quant_config(layer) 74 | -------------------------------------------------------------------------------- /test/test_cuda/test_mxfp_and_nvfp_quant.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import tempfile 3 | 4 | import pytest 5 | import torch 6 | from transformers import AutoModelForCausalLM, AutoTokenizer 7 | 8 | from auto_round import AutoRound 9 | from auto_round import schemes as ar_schemes 10 | from auto_round.experimental import qmodules as ar_qmodules 11 | from auto_round.export.export_to_autoround import AutoRoundFormat 12 | from auto_round.export.export_to_autoround import qlinear_fp as ar_qlinear_fp 13 | from auto_round.testing_utils import has_module 14 | 15 | testing_schemes = [AutoRoundFormat.MXFP8.value, AutoRoundFormat.MXFP4.value, AutoRoundFormat.NVFP4.value] 16 | QMODULE_MAPPING = { 17 | AutoRoundFormat.MXFP8.value: ar_qmodules.MXFP8QuantLinear, 18 | AutoRoundFormat.MXFP4.value: ar_qmodules.MXFP4QuantLinear, 19 | AutoRoundFormat.NVFP4.value: ar_qmodules.NVFP4QuantLinear, 20 | } 21 | 22 | 23 | @pytest.mark.parametrize("scheme", testing_schemes) 24 | @torch.inference_mode() 25 | def test_e2e_quant_and_infer(scheme): 26 | # Use a temporary directory for saving the quantized model 27 | with tempfile.TemporaryDirectory() as temp_dir: 28 | model_name = "Qwen/Qwen2.5-0.5B-Instruct" 29 | 30 | # Load the tokenizer and model 31 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 32 | model = 
AutoModelForCausalLM.from_pretrained( 33 | model_name, 34 | device_map="cpu", 35 | torch_dtype="auto", 36 | trust_remote_code=True, 37 | ) 38 | 39 | # Initialize AutoRound for quantization 40 | autoround = AutoRound( 41 | model, 42 | tokenizer, 43 | scheme=scheme, 44 | iters=0, 45 | nsamples=2, 46 | ) 47 | 48 | # Quantize and save the model to the temporary directory 49 | quantized_model_path = f"{temp_dir}/tmp_autoround_{scheme}" 50 | autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path) 51 | 52 | # Perform inference with the quantized model 53 | model = AutoModelForCausalLM.from_pretrained( 54 | quantized_model_path, 55 | torch_dtype="auto", 56 | ) 57 | model.eval() 58 | assert has_module(model, QMODULE_MAPPING[scheme]), f"Expected {QMODULE_MAPPING[scheme].__name__} in the model." 59 | 60 | tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) 61 | prompt = "Ai is " 62 | 63 | # Tokenize the input prompt 64 | encode = tokenizer.encode(prompt, return_tensors="pt") 65 | 66 | # Generate output tokens 67 | output_tokens = model.generate( 68 | encode, 69 | max_length=30, 70 | ) 71 | output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) 72 | 73 | # Print and validate the output 74 | print(f"Prompt: {prompt}") 75 | print(f"Output: {output}") 76 | assert output is not None, "Output should not be None" 77 | -------------------------------------------------------------------------------- /test/test_cuda/test_alg_ext.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import sys 3 | import unittest 4 | 5 | sys.path.insert(0, "../..") 6 | 7 | import torch 8 | from transformers import AutoModelForCausalLM, AutoTokenizer 9 | 10 | from auto_round import AutoRound, AutoRoundConfig 11 | from auto_round.eval.evaluation import simple_evaluate_user_model 12 | 13 | 14 | class TestAlgExt(unittest.TestCase): 15 | 16 | @classmethod 17 | def setUpClass(self): 18 | self.model_name = "/models/opt-125m" 19 | self.save_folder = "./saved" 20 | 21 | @classmethod 22 | def tearDownClass(self): 23 | shutil.rmtree(self.save_folder, ignore_errors=True) 24 | shutil.rmtree("runs", ignore_errors=True) 25 | 26 | def test_2bits(self): 27 | model_name = "/models/opt-125m" 28 | ar = AutoRound(model=model_name, bits=2, group_size=64, enable_alg_ext=True) 29 | ar.quantize_and_save(self.save_folder) 30 | model = AutoModelForCausalLM.from_pretrained( 31 | self.save_folder, 32 | device_map="auto", 33 | ) 34 | 35 | tokenizer = AutoTokenizer.from_pretrained(self.save_folder) 36 | result = simple_evaluate_user_model(model, tokenizer, batch_size=64, tasks="lambada_openai") 37 | print(result["results"]["lambada_openai"]["acc,none"]) 38 | # wo alg ext 0.2078, with 0.2371 39 | self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.22) 40 | shutil.rmtree(self.save_folder, ignore_errors=True) 41 | 42 | def test_cli(self): 43 | import os 44 | 45 | model_name = "/models/opt-125m" 46 | python_path = sys.executable 47 | 48 | res = os.system( 49 | f"cd ../.. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {model_name} --iters 1 --device auto --enable_alg_ext --avg_bits 2 --options=W2A16,W4A16 --ignore_scale_zp_bits" 50 | ) 51 | if res > 0 or res == -1: 52 | assert False, "cmd line test fail, please have a check" 53 | 54 | res = os.system( 55 | f"cd ../.. 
&& CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {model_name} --iters 1 --device auto --enable_alg_ext --avg_bits 5.5 --options=mxfp4,mxfp8 --ignore_scale_zp_bits --enable_torch_compile" 56 | ) 57 | if res > 0 or res == -1: 58 | assert False, "cmd line test fail, please have a check" 59 | 60 | def test_all_support_dtype(self): 61 | from auto_round.auto_scheme import AutoScheme 62 | 63 | model_name = "/models/Qwen3-0.6B" 64 | for scheme in ["MXFP4", "NVFP4", "W2A16G64", "gguf:q2_k_s,gguf:q4_k_s"]: 65 | avg_bits = 2 if scheme == "W2A16G64" else 4 66 | scheme = AutoScheme(options=scheme, avg_bits=avg_bits, ignore_scale_zp_bits=True) 67 | ar = AutoRound( 68 | model_name, scheme=scheme, iters=1, nsamples=1, enable_alg_ext=True, enable_torch_compile=True 69 | ) 70 | ar.quantize() 71 | 72 | 73 | if __name__ == "__main__": 74 | unittest.main() 75 | -------------------------------------------------------------------------------- /test/test_cpu/test_mxfp_save_load.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import tempfile 3 | 4 | import pytest 5 | import torch 6 | from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer 7 | from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM 8 | 9 | from auto_round import AutoRound 10 | from auto_round import schemes as ar_schemes 11 | from auto_round.experimental import qmodules as ar_qmodules 12 | from auto_round.export.export_to_autoround import AutoRoundFormat 13 | from auto_round.export.export_to_autoround import qlinear_fp as ar_qlinear_fp 14 | from auto_round.inference.backend import MX_TENSOR_DATA_TYPES 15 | from auto_round.testing_utils import has_module 16 | 17 | testing_scheme_name_lst = [ 18 | AutoRoundFormat.MXFP8.value, 19 | AutoRoundFormat.MXFP4.value, 20 | ] 21 | QMODULE_MAPPING = { 22 | AutoRoundFormat.MXFP8.value: ar_qmodules.MXFP8QuantLinear, 23 | AutoRoundFormat.MXFP4.value: ar_qmodules.MXFP4QuantLinear, 24 | } 25 | SCHEMES_MAPPING = { 26 | AutoRoundFormat.MXFP8.value: ar_schemes.MXFP8, 27 | AutoRoundFormat.MXFP4.value: ar_schemes.MXFP4, 28 | } 29 | 30 | 31 | @pytest.mark.parametrize("scheme_name", testing_scheme_name_lst) 32 | @pytest.mark.parametrize("weight_data_type", MX_TENSOR_DATA_TYPES) 33 | @pytest.mark.parametrize("act_data_type", MX_TENSOR_DATA_TYPES) 34 | @torch.inference_mode() 35 | def test_e2e_quant_and_load(scheme_name, weight_data_type, act_data_type): 36 | # Use a temporary directory for saving the quantized model 37 | with tempfile.TemporaryDirectory() as temp_dir: 38 | model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" 39 | config = AutoConfig.from_pretrained(model_name) 40 | config.num_hidden_layers = 2 # Use a smaller model for testing 41 | # Fix configuration validation issues 42 | config.layer_types = config.layer_types[: config.num_hidden_layers] 43 | 44 | # Load the tokenizer and model 45 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 46 | model = Qwen2ForCausalLM(config) 47 | scheme = SCHEMES_MAPPING[scheme_name] 48 | scheme.data_type = weight_data_type 49 | scheme.act_data_type = act_data_type 50 | # Initialize AutoRound for quantization 51 | autoround = AutoRound( 52 | model, 53 | tokenizer, 54 | scheme=scheme, 55 | iters=0, 56 | nsamples=2, 57 | ) 58 | 59 | # Quantize and save the model to the temporary directory 60 | quantized_model_path = f"{temp_dir}/tmp_autoround" 61 | autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path) 62 
| 63 | # Perform inference with the quantized model 64 | model = AutoModelForCausalLM.from_pretrained( 65 | quantized_model_path, 66 | torch_dtype="auto", 67 | ) 68 | model.eval() 69 | assert has_module( 70 | model, QMODULE_MAPPING[scheme_name] 71 | ), f"Expected {QMODULE_MAPPING[scheme_name].__name__} in the model." 72 | -------------------------------------------------------------------------------- /auto_round/eval/evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | from typing import Optional, Union 17 | 18 | from lm_eval import simple_evaluate as lm_simple_evaluate # pylint: disable=E0401 19 | 20 | from auto_round.logger import logger 21 | 22 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 23 | 24 | from lm_eval.models.huggingface import HFLM # pylint: disable=E0401 25 | 26 | 27 | def simple_evaluate_user_model( 28 | user_model, 29 | tokenizer, 30 | batch_size: Optional[int] = 1, 31 | limit: Optional[Union[int, float]] = None, 32 | max_batch_size: Optional[int] = 64, 33 | eval_model_dtype="auto", 34 | add_bos_token: bool = False, 35 | mllm: bool = False, 36 | **kwargs 37 | ): 38 | if mllm: 39 | from lm_eval.models.hf_vlms import HFMultimodalLM # pylint: disable=E0401 40 | 41 | if batch_size is None or batch_size == "auto": 42 | logger.warning("hf-multimodal models does not support auto currently, reset eval_bs to 16") 43 | batch_size = 16 44 | hflm = HFMultimodalLM( 45 | pretrained=user_model, 46 | tokenizer=tokenizer, 47 | batch_size=batch_size, 48 | max_batch_size=max_batch_size, 49 | dtype=eval_model_dtype, 50 | add_bos_token=add_bos_token, 51 | ) 52 | else: 53 | hflm = HFLM( 54 | pretrained=user_model, 55 | tokenizer=tokenizer, 56 | batch_size=batch_size, 57 | max_batch_size=max_batch_size, 58 | dtype=eval_model_dtype, 59 | add_bos_token=add_bos_token, 60 | ) 61 | return lm_simple_evaluate( 62 | model=hflm, model_args=None, batch_size=batch_size, max_batch_size=max_batch_size, limit=limit, **kwargs 63 | ) 64 | 65 | 66 | def simple_evaluate( 67 | model, 68 | model_args: Optional[Union[str, dict]] = None, 69 | batch_size: Optional[int] = None, 70 | limit: Optional[Union[int, float]] = None, 71 | max_batch_size: Optional[int] = None, 72 | device: Optional[str] = None, 73 | **kwargs 74 | ): 75 | try: 76 | from transformers import AutoRoundConfig 77 | except: 78 | from auto_round.inference.auto_quantizer import AutoHfQuantizer 79 | 80 | return lm_simple_evaluate( 81 | model=model, 82 | model_args=model_args, 83 | batch_size=batch_size, 84 | limit=limit, 85 | max_batch_size=max_batch_size, 86 | device=device, 87 | **kwargs 88 | ) 89 | -------------------------------------------------------------------------------- /auto_round/experimental/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed 
under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | 17 | from auto_round.utils import logger 18 | 19 | 20 | def per_tensor_fp8_qdq( 21 | tensor: torch.Tensor, tensor_max: None | torch.Tensor = None 22 | ) -> tuple[torch.Tensor, torch.Tensor]: 23 | from auto_round.data_type.fp8 import quant_fp8_sym 24 | 25 | qdq_tensor, scale, _ = quant_fp8_sym(tensor, max_scale=1.0, tensor_max=tensor_max, group_size=0, v=0) 26 | return qdq_tensor, scale 27 | 28 | 29 | # @torch.compiler.disable 30 | def update_parameter_data(module: torch.nn.Module, new_val: torch.Tensor, name: str): 31 | """ 32 | Update the data of a parameter in a module. 33 | If the parameter does not exist, it will be created. 34 | """ 35 | if hasattr(module, name): 36 | param = getattr(module, name) 37 | if isinstance(param, torch.nn.Parameter): 38 | param.data.copy_(new_val) 39 | else: 40 | module.register_parameter(name, torch.nn.Parameter(new_val)) 41 | else: 42 | logger.warning_once( 43 | "Parameter %s not found in module %s, creating new parameter." 44 | % (name, module.__class__.__name__ + str(getattr(module, "layer_idx", ""))) 45 | ) 46 | module.register_parameter(name, torch.nn.Parameter(new_val)) 47 | 48 | 49 | def normalize_static_kv_dtype(static_kv_dtype: str | torch.dtype) -> torch.dtype: 50 | valid_dtype_name_lst = ["float16", "bfloat16", "fp8", "float32", "float"] 51 | valid_torch_dtype = { 52 | "float16": torch.float16, 53 | "bfloat16": torch.bfloat16, 54 | "fp8": torch.float8_e4m3fn, 55 | "float8_e4m3fn": torch.float8_e4m3fn, 56 | "float32": torch.float32, 57 | "float": torch.float32, # Alias for float32 58 | } 59 | if static_kv_dtype in valid_dtype_name_lst: 60 | new_dtype = valid_torch_dtype[static_kv_dtype] 61 | elif static_kv_dtype in valid_torch_dtype.values(): 62 | new_dtype = static_kv_dtype 63 | else: 64 | raise ValueError( 65 | f"Invalid static kv dtype: {static_kv_dtype}. " 66 | # f"Valid options are: {', '.join(valid_dtype_name_lst + list(valid_torch_dtype.values()))}." 67 | ) 68 | return new_dtype 69 | 70 | 71 | def is_attention_module(module: torch.nn.Module): 72 | # FIXME: Handle this better. 73 | return "attention" in module.__class__.__name__.lower() and ( 74 | hasattr(module, "k_proj") or hasattr(module, "v_proj") or hasattr(module, "qkv_proj") 75 | ) 76 | -------------------------------------------------------------------------------- /auto_round/envs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Note: the design of this module is inspired by vLLM's envs.py 15 | # For detailed usage and configuration guide, see: docs/environments.md 16 | 17 | import os 18 | from typing import TYPE_CHECKING, Any, Callable, Optional 19 | 20 | if TYPE_CHECKING: 21 | AR_LOG_LEVEL: str = "INFO" 22 | AR_USE_MODELSCOPE: bool = "False" 23 | 24 | environment_variables: dict[str, Callable[[], Any]] = { 25 | # this is used for configuring the default logging level 26 | "AR_LOG_LEVEL": lambda: os.getenv("AR_LOG_LEVEL", "INFO").upper(), 27 | "AR_ENABLE_COMPILE_PACKING": lambda: os.getenv("AR_ENABLE_COMPILE_PACKING", "0").lower() in ("1", "true", "yes"), 28 | "AR_USE_MODELSCOPE": lambda: os.getenv("AR_USE_MODELSCOPE", "False").lower() in ["1", "true"], 29 | "AR_WORK_SPACE": lambda: os.getenv("AR_WORK_SPACE", "ar_work_space").lower(), 30 | } 31 | 32 | 33 | def __getattr__(name: str): 34 | # lazy evaluation of environment variables 35 | if name in environment_variables: 36 | return environment_variables[name]() 37 | raise AttributeError(f"module {__name__!r} has no attribute {name!r}") 38 | 39 | 40 | def __dir__(): 41 | return list(environment_variables.keys()) 42 | 43 | 44 | def is_set(name: str): 45 | """Check if an environment variable is explicitly set.""" 46 | if name in environment_variables: 47 | return name in os.environ 48 | raise AttributeError(f"module {__name__!r} has no attribute {name!r}") 49 | 50 | 51 | def set_config(**kwargs): 52 | """ 53 | Set configuration values for environment variables. 54 | 55 | Args: 56 | **kwargs: Keyword arguments where keys are environment variable names 57 | and values are the desired values to set. 58 | 59 | Example: 60 | set_config(AR_LOG_LEVEL="DEBUG", AR_USE_MODELSCOPE=True) 61 | """ 62 | for key, value in kwargs.items(): 63 | if key in environment_variables: 64 | # Convert value to appropriate string format 65 | if key == "AR_USE_MODELSCOPE": 66 | # Handle boolean values for AR_USE_MODELSCOPE 67 | str_value = "true" if value in [True, "True", "true", "1", 1] else "false" 68 | else: 69 | # For other variables, convert to string 70 | str_value = str(value) 71 | 72 | # Set the environment variable 73 | os.environ[key] = str_value 74 | else: 75 | raise AttributeError(f"module {__name__!r} has no attribute {key!r}") 76 | -------------------------------------------------------------------------------- /docs/tuning_norm_bias.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | ## Fast tuning of LayerNorm and Linear bias via fake quantization without rounding 4 | 5 | **Personal view by Wenhua; discussion welcome** 6 | 7 | **Work in Progress** 8 |
9 | Recent studies have found that tuning LayerNorm and bias through optimizers such as Adam can lead to better results, especially for low-bit quantization such as 2-bit. However, I personally do not favor using Adam for this purpose, for the reasons detailed in the next section, and I introduce an alternative approach in the last section. 10 | 11 | ### Why not use Adam 12 | 13 | #### Reason 1: Hard to tune the learning rate and steps 14 | 15 | Since Adam adaptively tunes the step size based on the gradient and its square, the learning rate often needs adjustment for different models, different quantization bit widths, or both, as observed in most papers. I hypothesize that this tuning requirement arises because most papers report results for only a limited range of model families, while many new models continually emerge. Despite my experience in this domain, I still find it challenging to tune the learning rate beyond using grid search. I believe many users encounter the same issue. 16 | 17 | #### Reason 2: Prone to overfitting 18 | 19 | Since Adam adapts the step size at each iteration, it is difficult to control how much the parameters change, which in some scenarios leads to significant deviations from the original model's weights. However, we only use hundreds or thousands of samples to fine-tune a low-bit model, whereas the original model is trained on a large corpus and specialized datasets (e.g., instruction datasets). Consequently, even if the low-bit tuned model performs well on some language-modeling tasks, it may lose other capabilities as the deviation increases. 20 | 21 | 22 | 23 | ### Our approach 24 | 25 | **An overview of our method** 26 |
27 | 28 | ![](../docs/imgs/norm_bias_overview.png) 29 | 30 |
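Concretely, the overview above boils down to the toy PyTorch sketch below. It is written under simplifying assumptions (per-tensor symmetric scale, zero-point omitted, a crude RTN fake-quantizer standing in for the actually quantized block) and is an illustration only, not the project's implementation; the formal description follows below.

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
d = 64
x = torch.randn(128, d)                       # toy calibration activations
ln_w = 1.0 + 0.1 * torch.randn(d)             # frozen original LayerNorm weight
lin_w = torch.randn(d, d) / d**0.5            # the Linear layer that follows in the block


def rtn_fake_quant(t: torch.Tensor, bits: int = 2) -> torch.Tensor:
    """Crude round-to-nearest fake quantization, standing in for the quantized block."""
    m = 2 ** (bits - 1) - 1
    s = t.abs().max() / m
    return torch.round(t / s).clamp(-m - 1, m) * s


teacher = F.linear(F.layer_norm(x, (d,), weight=ln_w), lin_w)  # full-precision block output
q_lin_w = rtn_fake_quant(lin_w)                                # the part that actually gets quantized

m = 2**3 - 1                                  # 4-bit grid for the LayerNorm weight (W4, per-tensor)
s = ln_w.abs().max() / m                      # scale predefined from W and the bit setting
v = torch.zeros(d, requires_grad=True)        # trainable offset, kept inside [-0.5, 0.5]

for _ in range(200):
    w_tuned = s * torch.clamp(ln_w / s + v, -m - 1, m)  # W' = s * clip(W/s + v, N, M), no rounding
    out = F.linear(F.layer_norm(x, (d,), weight=w_tuned), q_lin_w)
    loss = (out - teacher).pow(2).mean()      # block-wise output reconstruction error
    loss.backward()
    with torch.no_grad():                     # SignSGD step, then clamp v back into range
        v -= 5e-3 * v.grad.sign()
        v.clamp_(-0.5, 0.5)
    v.grad = None
```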
31 | 32 | 33 | We limit the tuned parameters in a quantization space, expressed as: 34 | $$ 35 | W' = s*clip(W/s+zp,N,M) 36 | $$ 37 | where 𝑠 is the quantization scale, predefined by 𝑊 and hyperparameters such as bits. 38 | 39 | To tune the W', following Signround, we add a trainable parameter V in the range [-0.5, 0.5], which can be easily tuned by SignSGD. 40 | 41 | $$ 42 | W' = s*clip(W/s+zp+v,N,M) 43 | $$ 44 | 45 | 46 | An important note: We remove the rounding to reduce unnecessary rounding loss, as the final weights of LayerNorm and bias are typically kept at 16-bit precision in most cases. 47 | 48 | 49 | 50 | **Result at W2G32** 51 | 52 | the tuning of layer normalization and Linear bias are fake quantized at W4G-1. 53 | 54 | Average accuracies of HellaSwag, WinoGrand, PIQA and LAMBADA, higher is better. 55 | 56 | | | OPT125m | OPT1.3B | OPT2.7B | OPT6.7B | LLaMAV2-7b | LLaMAV3-8B-Instruct | 57 | | --------- | ---------- | ---------- | ---------- | ---------- | ---------- | ------------------- | 58 | | SignRound | 0.3978 | 0.5094 | 0.5267 | 0.3681 | 0.6267 | 0.5890 | 59 | | Ours | **0.4077** | **0.5151** | **0.5596** | **0.3887** | **0.6315** | **0.5949** | 60 | 61 | -------------------------------------------------------------------------------- /test/test_cpu/test_woq_linear.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import pytest 4 | import torch 5 | 6 | sys.path.insert(0, "../..") 7 | from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear 8 | 9 | 10 | class TestWeightOnlyLinear: 11 | @pytest.mark.parametrize( 12 | "bits, compression_dtype", 13 | [ 14 | (8, torch.int16), 15 | (8, torch.int32), 16 | (8, torch.int64), 17 | (4, torch.int8), 18 | (4, torch.int16), 19 | (4, torch.int32), 20 | (4, torch.int64), 21 | (2, torch.int8), 22 | (2, torch.int16), 23 | (2, torch.int32), 24 | (2, torch.int64), 25 | ], 26 | ) 27 | def test_pack_with_numba(self, bits, compression_dtype): 28 | m = torch.nn.Linear(1024, 512) 29 | dtype = "int" 30 | weight = m.weight.detach() 31 | group_size = 32 32 | origin_shape = weight.shape 33 | from auto_round.data_type.int import quant_tensor_sym 34 | 35 | origin_shape = weight.shape 36 | weight = weight.reshape(-1, group_size) 37 | qdq, scale, zp = quant_tensor_sym(weight, -1) 38 | if isinstance(zp, int | float): 39 | zp = torch.full_like(scale, zp) 40 | int_weight = qdq.div(scale).add(zp).clamp(0, 2 ** (bits) - 1).to(torch.int32).reshape(origin_shape) 41 | scale = scale.reshape(origin_shape[0], -1) 42 | if isinstance(zp, torch.Tensor): 43 | zp = zp.reshape(origin_shape[0], -1).to(torch.int32).clamp(0, 2 ** (bits) - 1) 44 | module_with_legacy_pack = WeightOnlyLinear( 45 | in_features=m.in_features, 46 | out_features=m.out_features, 47 | dtype=dtype, 48 | bits=bits, 49 | groupsize=32, 50 | zp=zp is not None, 51 | bias=m.bias is not None, 52 | use_optimum_format=False, 53 | compression_dtype=compression_dtype, 54 | use_legacy_pack=True, 55 | ) 56 | module_with_legacy_pack.pack( 57 | int_weight.clone(), scale.clone(), zp.clone() if isinstance(zp, torch.Tensor) else zp, m.bias 58 | ) 59 | module_with_new_pack = WeightOnlyLinear( 60 | in_features=m.in_features, 61 | out_features=m.out_features, 62 | dtype=dtype, 63 | bits=bits, 64 | groupsize=32, 65 | zp=zp is not None, 66 | bias=m.bias is not None, 67 | use_optimum_format=False, 68 | compression_dtype=compression_dtype, 69 | use_legacy_pack=False, 70 | ) 71 | module_with_new_pack.pack( 72 | int_weight.clone(), scale.clone(), 
zp.clone() if isinstance(zp, torch.Tensor) else zp, m.bias 73 | ) 74 | 75 | assert torch.equal(module_with_new_pack.qweight, module_with_legacy_pack.qweight) 76 | 77 | assert torch.equal(module_with_new_pack.qzeros, module_with_legacy_pack.qzeros) 78 | assert torch.equal(module_with_new_pack.scales, module_with_legacy_pack.scales) 79 | unpacked_int_weight = module_with_new_pack.unpack_tensor(module_with_legacy_pack.qweight) 80 | assert torch.equal(unpacked_int_weight, int_weight) 81 | -------------------------------------------------------------------------------- /test/test_cpu/test_cli_usage.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | 8 | 9 | class TestAutoRoundCmd(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(self): 12 | pass 13 | 14 | @classmethod 15 | def tearDownClass(self): 16 | shutil.rmtree("./saved", ignore_errors=True) 17 | shutil.rmtree("runs", ignore_errors=True) 18 | shutil.rmtree("../../saved", ignore_errors=True) 19 | shutil.rmtree("../../tmp_autoround", ignore_errors=True) 20 | 21 | def test_auto_round_cmd(self): 22 | python_path = sys.executable 23 | 24 | # Test llm script 25 | res = os.system(f"cd ../.. && {python_path} -m auto_round -h") 26 | if res > 0 or res == -1: 27 | assert False, "cmd line test fail, please have a check" 28 | 29 | res = os.system( 30 | f"cd ../.. && {python_path} -m auto_round --model '/tf_dataset/auto_round/models/facebook/opt-125m' --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa" 31 | ) 32 | if res > 0 or res == -1: 33 | assert False, "cmd line test fail, please have a check" 34 | 35 | res = os.system( 36 | f"cd ../.. && {python_path} -m auto_round --model '/tf_dataset/auto_round/models/facebook/opt-125m' --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32" 37 | ) 38 | if res > 0 or res == -1: 39 | assert False, "cmd line test fail, please have a check" 40 | 41 | res = os.system( 42 | f"cd ../.. && {python_path} -c 'from auto_round.__main__ import run_light; run_light()' --seqlen 8 --iter 2 --nsamples 8 --output_dir ./saved --tasks lambada_openai" 43 | ) 44 | if res > 0 or res == -1: 45 | assert False, "cmd line test fail, please have a check" 46 | 47 | # test mllm script 48 | 49 | # test auto_round_mllm --eval help 50 | res = os.system(f"cd ../.. && {python_path} -m auto_round --eval -h") 51 | if res > 0 or res == -1: 52 | assert False, "cmd line test fail, please have a check" 53 | 54 | # test auto_round_mllm --lmms help 55 | res = os.system(f"cd ../.. && {python_path} -m auto_round --eval --lmms -h") 56 | if res > 0 or res == -1: 57 | assert False, "cmd line test fail, please have a check" 58 | 59 | res = os.system( 60 | f"cd ../.. && {python_path} -m auto_round --mllm --model /tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct --iter 2 --nsamples 2 --seqlen 32 --format auto_round --output_dir ./saved" 61 | ) 62 | if res > 0 or res == -1: 63 | assert False, "cmd line test fail, please have a check" 64 | 65 | res = os.system( 66 | f"cd ../.. 
&& {python_path} -m auto_round --mllm --iter 2 --nsamples 2 --model /tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct --seqlen 32 --format auto_round" 67 | " --quant_nontext_module --output_dir ./saved " 68 | ) 69 | if res > 0 or res == -1: 70 | assert False, "cmd line test fail, please have a check" 71 | 72 | 73 | if __name__ == "__main__": 74 | unittest.main() 75 | -------------------------------------------------------------------------------- /test/test_cuda/test_packing.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from auto_round.export.export_to_autoround.qlinear_fp import FLOAT_TO_E2M1, pack_fp4_to_uint8 5 | 6 | 7 | # Random sampling from FLOAT_TO_E2M1 8 | def _create_random_e2m1_tensor(shape): 9 | """Create a tensor of the given shape with random values from FLOAT_TO_E2M1.""" 10 | # Create a tensor of indices randomly selected from 0 to len(FLOAT_TO_E2M1)-1 11 | indices = torch.randint(0, len(FLOAT_TO_E2M1), shape) 12 | 13 | # Map the indices to their corresponding values 14 | e2m1_tensor = torch.tensor(FLOAT_TO_E2M1, dtype=torch.float32)[indices] 15 | return e2m1_tensor 16 | 17 | 18 | def pack_fp4_to_uint8_old(x: torch.Tensor) -> torch.Tensor: 19 | """ 20 | Packs a tensor with values in the fp4 range into uint8. 21 | As there are 16 valid fp4 values, two fp4 values can be 22 | packed into one uint8. Each fp4 value is mapped to its 23 | particular index (e.g. 0.5 is mapped to index 1, 6.0 is mapped 24 | to index 7) which is then represented using 4 bits. Consecutive 25 | pairs of 4 bits are then packed into an uint8. 26 | 27 | :param x: tensor to pack 28 | returns: a packed tensor in uint8 29 | """ 30 | 31 | m, n = x.shape 32 | device = x.device 33 | 34 | # Create lookup table for FP4 values to indices 35 | # Map the absolute values to 0-7 indices 36 | kE2M1 = torch.tensor(FLOAT_TO_E2M1, device=device, dtype=x.dtype) 37 | 38 | # Find closest valid FP4 value index for each element 39 | abs_x = torch.abs(x) 40 | abs_indices = torch.zeros_like(abs_x, dtype=torch.long) 41 | for i, val in enumerate(kE2M1): # TODO any optimize? 
42 | abs_indices = torch.where(torch.isclose(abs_x, val), i, abs_indices) 43 | 44 | # Apply sign bit (bit 3) to get final 4-bit representation 45 | indices = abs_indices + (torch.signbit(x) << 3).to(torch.long) 46 | 47 | # Reshape to prepare for packing pairs of values 48 | indices = indices.reshape(-1) 49 | 50 | # Handle odd length by padding if necessary 51 | if indices.numel() % 2 != 0: 52 | indices = torch.cat([indices, torch.zeros(1, dtype=torch.long, device=device)]) 53 | 54 | # Reshape to pair consecutive elements 55 | indices = indices.reshape(-1, 2) 56 | 57 | # Pack pairs of 4-bit values into 8-bit values 58 | packed = (indices[:, 0] | (indices[:, 1] << 4)).to(torch.uint8) 59 | 60 | return packed.reshape(m, n // 2) 61 | 62 | 63 | qwen_weight_shapes = [ 64 | torch.Size([2048, 768]), 65 | torch.Size([768, 2048]), 66 | torch.Size([128, 2048]), 67 | torch.Size([512, 2048]), 68 | torch.Size([4096, 2048]), 69 | torch.Size([151936, 2048]), 70 | torch.Size([2048, 4096]), 71 | ] 72 | 73 | 74 | @pytest.mark.parametrize("shape", qwen_weight_shapes) 75 | def test_packing_fp4(shape): 76 | with torch.device("cuda"): 77 | M, N = shape 78 | random_tensor = _create_random_e2m1_tensor((M, N)) 79 | # Pack the tensor using the packing function 80 | packed_tensor = pack_fp4_to_uint8(random_tensor) 81 | packed_tensor_old = pack_fp4_to_uint8_old(random_tensor) 82 | # check equal 83 | assert torch.equal(packed_tensor, packed_tensor_old), "Packed tensors are not equal" 84 | -------------------------------------------------------------------------------- /test/test_cpu/test_gpt_oss.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from transformers import AutoConfig, AutoTokenizer 3 | from transformers.models.gpt_oss.modeling_gpt_oss import GptOssForCausalLM 4 | 5 | from auto_round import AutoRound 6 | 7 | 8 | @pytest.fixture 9 | def setup_gpt_oss(): 10 | """Fixture to set up the GPT-OSS model and tokenizer.""" 11 | model_name = "/tf_dataset/auto_round/models/unsloth/gpt-oss-20b-BF16" 12 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 13 | config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) 14 | config.num_hidden_layers = 1 # Reduce layers for testing 15 | model = GptOssForCausalLM(config) 16 | output_dir = "/tmp/test_quantized_gpt_oss" 17 | return model, tokenizer, output_dir, config 18 | 19 | 20 | def quantize_model(model, tokenizer, output_dir, scheme, iters=0): 21 | """Helper function to quantize the model with the given scheme.""" 22 | autoround = AutoRound( 23 | model, 24 | tokenizer, 25 | scheme=scheme, 26 | nsamples=2, 27 | iters=iters, 28 | fp_layers="self_attn,router,lm_head,mlp.gate", 29 | ) 30 | quantized_model, save_folder = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) 31 | return quantized_model 32 | 33 | 34 | def count_modules_by_type(model, target_module_name_or_class): 35 | """Helper function to count modules of a specific type in the model.""" 36 | cnt = 0 37 | for name, module in model.named_modules(): 38 | if isinstance(target_module_name_or_class, str): 39 | if target_module_name_or_class == module.__class__.__name__: 40 | cnt += 1 41 | else: 42 | if isinstance(module, target_module_name_or_class): 43 | cnt += 1 44 | return cnt 45 | 46 | 47 | @pytest.mark.parametrize("scheme", ["MXFP4", "MXFP8"]) 48 | def test_quantization(setup_gpt_oss, scheme): 49 | """Test quantization with the scheme.""" 50 | model, tokenizer, output_dir, config = setup_gpt_oss 51 
| quantized_model = quantize_model(model, tokenizer, output_dir, scheme) 52 | 53 | # Ensure the quantized model is not None 54 | assert quantized_model is not None, "Quantized model should not be None." 55 | from auto_round.export.export_to_autoround.qlinear_fp import QuantLinear 56 | from auto_round.modelling.gpt_oss import GPTOssSingleExpert 57 | 58 | single_expert_cnt = count_modules_by_type(quantized_model, GPTOssSingleExpert) 59 | quant_linear_cnt = count_modules_by_type(quantized_model, QuantLinear) 60 | assert ( 61 | single_expert_cnt == config.num_local_experts 62 | ), f"Expected {config.num_local_experts} GPTOssSingleExpert modules, found {single_expert_cnt}." 63 | assert ( 64 | quant_linear_cnt == config.num_hidden_layers * 3 * config.num_local_experts 65 | ), f"Expected {config.num_hidden_layers * 3 * config.num_local_experts} QuantLinear modules, found {quant_linear_cnt}." 66 | 67 | print(f"[{scheme}] Total {GPTOssSingleExpert.__name__} modules: {single_expert_cnt}") 68 | print(f"[{scheme}] Total {QuantLinear.__name__} modules: {quant_linear_cnt}") 69 | # clean the output directory after test 70 | import shutil 71 | 72 | shutil.rmtree(output_dir, ignore_errors=True) 73 | -------------------------------------------------------------------------------- /docs/opt_rtn.md: -------------------------------------------------------------------------------- 1 | ### 🧮 Evaluation Results (LM-Eval) 2 | For 2/3bit, we strongly recommend not using iter=0 except for GGUF:Q2_K_S which has a different quantization algorithm. 3 | 4 | 4BIT=W4A16 5 | 3BIT=W3A16 6 | 2BIT=W2A16G64 7 | 8 | RTN mode 9 | 10 | ~~~bash 11 | auto-round --model xxx --disable_opt_rtn --iters 0 12 | ~~~ 13 | 14 | OPT RTN mode 15 | 16 | ~~~bash 17 | auto-round --model xxx --iters 0 18 | ~~~ 19 | 20 | 21 | 22 | | Model | RNT/OPT | AVG | HellaSwag | LAMBADA | MMLU | PIQA | WinoGrande | 23 | |--------------------------------|----------|---------|-----------|---------|--------|--------|------------| 24 | | **Meta-Llama-3.1-8B-Instruct** | RTN-4BIT | 0.69328 | 0.5896 | 0.7013 | 0.6538 | 0.7987 | 0.7230 | 25 | | | OPT-4BIT | 0.69560 | 0.5882 | 0.7074 | 0.6631 | 0.7916 | 0.7277 | 26 | | | RTN-3BIT | 0.64562 | 0.5410 | 0.6695 | 0.5449 | 0.7742 | 0.6985 | 27 | | | OPT-3BIT | 0.65970 | 0.5490 | 0.6893 | 0.5711 | 0.7677 | 0.7214 | 28 | | | RTN-2BIT | 0.33008 | 0.2918 | 0.0474 | 0.2321 | 0.5740 | 0.5051 | 29 | | | OPT-2BIT | 0.38908 | 0.3241 | 0.1560 | 0.2822 | 0.6235 | 0.5596 | 30 | | **Qwen2.5-7B-Instruct** | RTN-4BIT | 0.69560 | 0.6114 | 0.6713 | 0.7011 | 0.7878 | 0.7064 | 31 | | | OPT-4BIT | 0.70034 | 0.6143 | 0.6945 | 0.7115 | 0.7845 | 0.6969 | 32 | | | RTN-3BIT | 0.64144 | 0.5585 | 0.6092 | 0.6455 | 0.7476 | 0.6464 | 33 | | | OPT-3BIT | 0.66764 | 0.5756 | 0.7013 | 0.6597 | 0.7481 | 0.6535 | 34 | | | RTN-2BIT | 0.31856 | 0.2804 | 0.0351 | 0.2379 | 0.5256 | 0.5138 | 35 | | | OPT-2BIT | 0.45146 | 0.3645 | 0.2992 | 0.4043 | 0.6415 | 0.5478 | 36 | | **Qwen3-8B** | RTN-4BIT | 0.66240 | 0.5619 | 0.6150 | 0.7077 | 0.7573 | 0.6701 | 37 | | | OPT-4BIT | 0.66992 | 0.5619 | 0.6346 | 0.7102 | 0.7633 | 0.6796 | 38 | | | RTN-3BIT | 0.57322 | 0.4992 | 0.4260 | 0.6002 | 0.7361 | 0.6046 | 39 | | | OPT-3BIT | 0.63698 | 0.5226 | 0.5814 | 0.6718 | 0.7437 | 0.6654 | 40 | | | RTN-2BIT | 0.31150 | 0.2679 | 0.0041 | 0.2536 | 0.5283 | 0.5036 | 41 | | | OPT-2BIT | 0.44254 | 0.3749 | 0.2005 | 0.4202 | 0.6670 | 0.5501 | 42 | | **Qwen3-14B** | RTN-4BIT | 0.70448 | 0.5999 | 0.6511 | 0.7565 | 0.7998 | 0.7151 | 43 | | | OPT-4BIT | 0.70798 | 0.6031 | 0.6627 | 0.7534 
| 0.8009 | 0.7198 | 44 | | | RTN-3BIT | 0.65876 | 0.5746 | 0.5467 | 0.7065 | 0.7628 | 0.7032 | 45 | | | OPT-3BIT | 0.68610 | 0.5683 | 0.6633 | 0.7258 | 0.7699 | 0.7032 | 46 | | | RTN-2BIT | 0.39398 | 0.3764 | 0.0607 | 0.3836 | 0.6480 | 0.5012 | 47 | | | OPT-2BIT | 0.50080 | 0.4554 | 0.2451 | 0.4899 | 0.7138 | 0.5998 | -------------------------------------------------------------------------------- /test/test_cpu/test_autoround_acc.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | from auto_round.eval.evaluation import simple_evaluate 7 | 8 | sys.path.insert(0, "../..") 9 | from math import isclose 10 | 11 | import torch 12 | import transformers 13 | from transformers import AutoModelForCausalLM, AutoTokenizer 14 | 15 | from auto_round import AutoRound # pylint: disable=E0401 16 | 17 | 18 | class LLMDataLoader: 19 | def __init__(self): 20 | self.batch_size = 1 21 | 22 | def __iter__(self): 23 | for i in range(2): 24 | yield torch.ones([1, 10], dtype=torch.long) 25 | 26 | 27 | class TestAutoRound(unittest.TestCase): 28 | @classmethod 29 | def setUpClass(self): 30 | self.llm_dataloader = LLMDataLoader() 31 | self.save_dir = "./saved" 32 | 33 | @classmethod 34 | def tearDownClass(self): 35 | shutil.rmtree(self.save_dir, ignore_errors=True) 36 | shutil.rmtree("runs", ignore_errors=True) 37 | 38 | def test_default_acc(self): 39 | model_name = "/tf_dataset/auto_round/models/hf-internal-testing/tiny-random-GPTJForCausalLM" 40 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True) 41 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 42 | bits, group_size, sym = 4, 128, True 43 | inp = torch.ones([1, 10], dtype=torch.long) 44 | autoround = AutoRound( 45 | model, 46 | tokenizer, 47 | bits=bits, 48 | device="cpu", 49 | group_size=group_size, 50 | sym=sym, 51 | iters=2, 52 | seqlen=10, 53 | dataset=self.llm_dataloader, 54 | ) 55 | autoround.quantize() 56 | out0 = model(inp) 57 | print(f"out0 = {float(out0[0][0][0][0])}") 58 | 59 | model_tmp = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True) 60 | autoround_1 = AutoRound( 61 | model_tmp, 62 | tokenizer, 63 | bits=bits, 64 | group_size=group_size, 65 | sym=sym, 66 | device="cpu", 67 | iters=2, 68 | seqlen=10, 69 | dataset=self.llm_dataloader, 70 | ) 71 | autoround_1.quantize() 72 | out1 = model_tmp(inp) 73 | 74 | assert out0[0].equal(out1[0]) 75 | self.assertTrue(isclose(float(out0[0][0][0][0]), -0.021002087742090225, rel_tol=5e-04)) 76 | 77 | def test_3bits_asym_autoround(self): 78 | model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" 79 | 80 | bits, sym = 3, False 81 | autoround = AutoRound(model_name, bits=bits, sym=sym, iters=0) 82 | autoround.quantize_and_save(self.save_dir, format="auto_round", inplace=False) 83 | model_args = f"pretrained={self.save_dir}" 84 | # res = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto", limit=10) 85 | 86 | # accuracy = res["results"]["lambada_openai"]["acc,none"] 87 | # print(f"accuracy = {accuracy}") 88 | # assert accuracy > 0.15 89 | shutil.rmtree(self.save_dir, ignore_errors=True) 90 | 91 | 92 | if __name__ == "__main__": 93 | unittest.main() 94 | -------------------------------------------------------------------------------- /docs/auto_scheme_acc.md: 
-------------------------------------------------------------------------------- 1 | We use **lm-eval** for evaluation. For LLaMA, we enabled `add_bos_token` and 2 | `removed @use_kernel_forward_from_hub("RMSNorm")` 3 | in [modeling_llama.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L52C1-L52C40) 4 | to stabilize accuracy during evaluation. All other settings follow the default configurations of AutoRound and lm-eval. 5 | 6 | We ignore the scale and zp bits in the tables below. The accuracy may change a little as we modified a little of the 7 | implementation. We will rerun all the experiments. 8 | 9 | For mxfp experiment, we use fake model while for weight only model we use real model. **No tuning is applied unless explicit stated.** 10 | 11 | *Average accuracy across `lambada_openai`, `hellaswag`, `piqa`, `winogrande`, and `mmlu`.* 12 | 13 | ### Table 1 MXFP4/8 mixed accuracy. 14 | 15 | | Average bits | Llama3.1-8B-I | Qwen2.5-7B-I | Qwen3-8B | Qwen3-32B | 16 | |:------------------|:----------------:|:----------------:|:----------------:|:----------------:| 17 | | **BF16** | 0.7076 (100%) | 0.7075 (100%) | 0.6764 (100%) | 0.7321 (100%) | 18 | | **Pure 4-bit** | 0.6626 (93.6%) | 0.6550 (92.6%) | 0.6316 (93.4%) | 0.6901 (94.3%) | 19 | | **Ours 4.5-bit** | 0.6808 (96.2%) | 0.6776 (95.8%) | 0.6550 (96.8%) | 0.7176 (98.0%) | 20 | | **Ours 5-bit** | 0.6857 (96.9%) | 0.6823 (96.4%) | 0.6594 (97.5%) | 0.7201 (98.3%) | 21 | | **Ours 6-bit** | 0.6975 (98.6%) | 0.6970 (98.5%) | 0.6716 (99.3%) | 0.7303 (99.8%) | 22 | 23 | We compare the proposed method against naive layer-wise bit allocation strategies, such as assigning higher 24 | precision to the network’s head((near lm-head) or tailad(close to embedding)) layers, to demonstrate its relative 25 | performance advantages. 26 | 27 | ### Table 2 Comparison with other recipes at an average of 5 bits of mxfp datatype 28 | 29 | | Avg. bits = 5 | Llama3.1-8B-I | Qwen2.5-7B-I | Qwen3-8B | 30 | |:------------------|:----------------:|:----------------:|:----------------:| 31 | | **Tail layers 8-bit** | 0.6671 (94.3%) | 0.6616 (93.5%) | 0.6410 (94.8%) | 32 | | **Head layers 8-bit** | 0.6657 (94.1%) | 0.6686 (94.5%) | 0.6356 (94.0%) | 33 | | **Ours** | **0.6857 (96.9%)** | **0.6823 (96.4%)** | **0.6594 (97.5%)** | 34 | 35 | ### Table 3 Comparison with other recipes at an average of 4.5 bits of mxfp datatype 36 | 37 | | Avg. bits = 4.5 | Llama3.1-8B-I | Qwen2.5-7B-I | Qwen3-8B | 38 | |:------------------|:----------------:|:----------------:|:----------------:| 39 | | **Tail layers 8-bit** | 0.6614 (93.5%) | 0.6535 (92.4%) | 0.6373 (94.2%) | 40 | | **Head layers 8-bit** | 0.6568 (92.8%) | 0.6642 (93.9%) | 0.6305 (93.2%) | 41 | | **Ours** | **0.6808 (96.2%)** | **0.6776 (95.5%)** | **0.6550 (95.8%)** | 42 | 43 | 44 | ### Table4 Comparison with other recipes at an average of 3 bits of W2G128 and W4G128 45 | 46 | | Avg. 
bits = 4.5 | Llama3.1-8B-I | Qwen2.5-7B-I | Qwen3-8B | 47 | |:------------------|:----------------:|:----------------:|:----------------:| 48 | | **Tail layers 4-bit** | 0.6058 | 0.3798 | 0.4536 | 49 | | **Head layers 4-bit** | 0.3198 | 0.3270 | 0.3196 | 50 | | **Ours** | 0.6148 | 0.4058 | 0.4862 | 51 | -------------------------------------------------------------------------------- /.azure-pipelines/unit-test.yml: -------------------------------------------------------------------------------- 1 | trigger: none 2 | 3 | pr: 4 | autoCancel: true 5 | drafts: false 6 | branches: 7 | include: 8 | - main 9 | paths: 10 | include: 11 | - auto_round 12 | - auto_round_extension 13 | - test 14 | - setup.py 15 | - requirements.txt 16 | - requirements-cpu.txt 17 | - .azure-pipelines/scripts/ut 18 | - .azure-pipelines/unit-test.yml 19 | - .azure-pipelines/template/ut-template.yml 20 | - .azure-pipelines/template/docker-template.yml 21 | exclude: 22 | - test/test*hpu* 23 | - "*.md" 24 | - "**/*.md" 25 | 26 | pool: ICX-16C 27 | 28 | variables: 29 | IMAGE_NAME: "auto-round" 30 | IMAGE_TAG: "py312" 31 | UPLOAD_PATH: $(Build.SourcesDirectory)/log_dir 32 | DOWNLOAD_PATH: $(Build.SourcesDirectory)/log_dir 33 | ARTIFACT_NAME: "UT_coverage_report" 34 | REPO: $(Build.Repository.Uri) 35 | 36 | stages: 37 | - stage: Unit_test 38 | displayName: Unit Test 39 | dependsOn: [] 40 | jobs: 41 | - job: 42 | timeoutInMinutes: 120 43 | strategy: 44 | matrix: 45 | part1: 46 | PART: 1 47 | part2: 48 | PART: 2 49 | part3: 50 | PART: 3 51 | part4: 52 | PART: 4 53 | part5: 54 | PART: 5 55 | steps: 56 | - template: template/ut-template.yml 57 | parameters: 58 | dockerConfigName: "commonDockerConfig" 59 | utScriptFileName: "run_ut" 60 | uploadPath: $(UPLOAD_PATH) 61 | utArtifact: "ut-$(PART)" 62 | utTestMode: $(PART) 63 | 64 | - stage: Coverage 65 | displayName: "Collect Coverage" 66 | pool: 67 | vmImage: "ubuntu-latest" 68 | dependsOn: [Unit_test] 69 | jobs: 70 | - job: CollectDatafiles 71 | steps: 72 | - task: DownloadPipelineArtifact@2 73 | inputs: 74 | artifact: 75 | patterns: '*_coverage/.coverage.*' 76 | path: $(DOWNLOAD_PATH) 77 | 78 | - task: UsePythonVersion@0 79 | inputs: 80 | versionSpec: '3.12' 81 | displayName: 'Use Python 3.12' 82 | 83 | - script: | 84 | cd ${BUILD_SOURCESDIRECTORY} 85 | pip install -U pip setuptools uv 86 | uv pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu 87 | uv pip install . 88 | pip list 89 | cd ${BUILD_SOURCESDIRECTORY}/.azure-pipelines/scripts 90 | bash ut/collect_log.sh 91 | env: 92 | PYTHONUNBUFFERED: '1' 93 | UV_NO_PROGRESS: '1' 94 | UV_SYSTEM_PYTHON: '1' 95 | displayName: "Collect UT Coverage" 96 | 97 | - task: PublishPipelineArtifact@1 98 | condition: succeededOrFailed() 99 | inputs: 100 | targetPath: $(UPLOAD_PATH)/coverage_PR 101 | artifact: $(ARTIFACT_NAME) 102 | publishLocation: "pipeline" 103 | 104 | - task: PublishCodeCoverageResults@2 105 | inputs: 106 | summaryFileLocation: $(Build.SourcesDirectory)/log_dir/coverage_PR/coverage.xml 107 | -------------------------------------------------------------------------------- /auto_round/data_type/w4fp8.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | 17 | from auto_round.data_type.register import register_dtype 18 | from auto_round.data_type.utils import float8_e4m3fn_ste, get_gaudi_fp8_ste_func 19 | 20 | 21 | @register_dtype("fp8_to_int_sym") 22 | def progressive_quant_fp8_int4( 23 | tensor, bits=4, group_size=-1, v=0, min_scale=1.0, max_scale=1.0, q_scale_thresh=1e-5, **kwargs 24 | ): 25 | """Two-stage quantization: quantize tensor to fp8 by per tensor, then quantize fp8 to w4g128 26 | 27 | This method first quantizes the input tensor into float8 format and then performs 28 | a secondary quantization to int4 with grouping. 29 | 30 | Args: 31 | tensor (torch.Tensor): Input tensor to quantize. 32 | bits (int, optional): Bit precision for secondary quantization. Defaults to 4. 33 | group_size (int, optional): Group size for int4 quantization. Defaults to -1 (no grouping). 34 | v (float, optional): Optional parameter for variance tuning. Defaults to 0. 35 | min_scale (float, optional): Minimum scaling factor for int4 quantization. Defaults to 1.0. 36 | max_scale (float, optional): Maximum scaling factor for int4 quantization. Defaults to 1.0. 37 | q_scale_thresh (float, optional): Threshold for scaling. Defaults to 1e-5. 38 | **kwargs: Additional arguments for compatibility. 39 | 40 | Returns: 41 | tuple: 42 | - Quantized and dequantized tensor (torch.Tensor). 43 | - Combined scaling factor (torch.Tensor). 44 | - Placeholder for zp (None). 45 | """ 46 | 47 | info = torch.finfo(torch.float8_e4m3fn) 48 | tensor_max = torch.max(torch.abs(tensor)).to(torch.float32) 49 | scale = tensor_max.to(torch.float32) / info.max 50 | min_scaling_factor = 1.0 / (info.max * 512.0) ##copy from vllm 51 | bf16_to_fp8_scale = torch.clip(scale, min=min_scaling_factor) 52 | fp8_res = tensor / bf16_to_fp8_scale 53 | fp8_res = torch.clip(fp8_res, info.min, info.max) 54 | fp8_res = float8_e4m3fn_ste(fp8_res) 55 | 56 | ##convert to bf16 57 | fp8_res_using_16bit = fp8_res.to(tensor.dtype) 58 | ##convert to int4 59 | from auto_round.data_type.int import quant_tensor_sym 60 | 61 | qdq_int4_tensor, scale_fp8_to_int4, zp_fp8_to_int4 = quant_tensor_sym( 62 | fp8_res_using_16bit, 63 | bits=bits, 64 | group_size=group_size, 65 | v=v, 66 | min_scale=min_scale, 67 | max_scale=max_scale, 68 | scale_dtype=torch.bfloat16, 69 | q_scale_thresh=q_scale_thresh, 70 | ) 71 | qdq_tensor = qdq_int4_tensor * bf16_to_fp8_scale 72 | 73 | bf16_to_int4_scale = scale_fp8_to_int4 * bf16_to_fp8_scale 74 | return qdq_tensor, {"scale": bf16_to_int4_scale, "bf16_to_fp8_scale": bf16_to_fp8_scale}, zp_fp8_to_int4 75 | -------------------------------------------------------------------------------- /auto_round/compressors/diffusion/README.md: -------------------------------------------------------------------------------- 1 | # AutoRound for Diffusion Models (Experimental) 2 | 3 | This feature is experimental and may be subject to changes, including potential bug fixes, API modifications, or adjustments to default parameters. 4 | 5 | ## Quantization 6 | 7 | Quantization for diffusion models is limited: 8 | 9 | 1. 
Only transformer module of diffusion models will be quantized.. 10 | 2. Loading quantized model is not supported yet, so please use `fake` format for quantization. 11 | 3. Calibration dataset only supports `coco2014` and user customized `.tsv` file. 12 | 13 | 14 | ### API Usage (CPU/GPU) Recommended 15 | 16 | 17 | ```python 18 | import torch 19 | from auto_round import AutoRound 20 | from diffusers import AutoPipelineForText2Image 21 | 22 | # Load the model 23 | model_name = "black-forest-labs/FLUX.1-dev" 24 | pipe = AutoPipelineForText2Image.from_pretrained(model_name, torch_dtype=torch.bfloat16) 25 | 26 | # Quantize the model 27 | autoround = AutoRound( 28 | pipe, 29 | scheme="MXFP8", 30 | dataset="coco2014", 31 | num_inference_steps=10, 32 | guidance_scale=7.5, 33 | generator_seed=None, 34 | batch_size=1, 35 | ) 36 | autoround.quantize() 37 | 38 | # Save the quantized model 39 | output_dir = "./tmp_autoround" 40 | # Currently loading the quantized diffusion model is not supported, so use fake format 41 | autoround.save_quantized(output_dir, format="fake", inplace=True) 42 | ``` 43 | 44 | - `dataset`: the dataset for quantization training. Currently only support coco2014 and user customized .tsv file. 45 | 46 | - `num_inference_steps`: The reference number of denoising steps. 47 | 48 | - `guidance_scale`: Control how much the image generation process follows the text prompt. The more it is, the more closely it follows the prompt. 49 | 50 | - `generator_seed`: A seed that controls the initial noise from which an image is generated. 51 | 52 | for more hyperparameters introduction, please refer [Homepage Detailed Hyperparameters](../../README.md#api-usage-gaudi2cpugpu) 53 | 54 | ### CLI Usage 55 | 56 | A user guide detailing the full list of supported arguments is provided by calling ```auto-round -h``` on the 57 | terminal. 58 | 59 | ```bash 60 | auto-round \ 61 | --model black-forest-labs/FLUX.1-dev \ 62 | --scheme MXFP8 \ 63 | --format fake \ 64 | --batch_size 1 \ 65 | --output_dir ./tmp_autoround 66 | ``` 67 | 68 | ### Diffusion Support Matrix 69 | 70 | For diffusion models, currently we only validate quantizaion on the FLUX.1-dev, which involves quantizing the transformer component of the pipeline. 71 | 72 | | Model | calibration dataset | 73 | |--------------|--------------| 74 | | black-forest-labs/FLUX.1-dev | COCO2014 | 75 | 76 | 77 | 78 |
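Since loading a saved quantized diffusion model is not supported yet, a quick way to sanity-check the result is to keep using the in-memory pipeline right after `autoround.quantize()`. The sketch below continues the API usage example above and assumes the in-place fake quantization leaves `pipe` directly runnable:

```python
# `pipe` is the pipeline quantized in place by the API usage example above (fake/qdq weights).
prompt = "a photo of an astronaut riding a horse on the moon"  # any test prompt
image = pipe(prompt, num_inference_steps=20, guidance_scale=7.5).images[0]
image.save("flux_mxfp8_sample.png")
```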
79 | Calibration Dataset 80 | 81 | For diffusion models, we used [**coco2014**]("https://github.com/mlcommons/inference/raw/refs/heads/master/text_to_image/coco2014/captions/captions_source.tsv") calibration dataset as our default. 82 | 83 | If users want to use their own dataset, please build the dataset file in ".tsv" format following below structure and use it through argument --dataset (tsv file): 84 | ``` 85 | id caption 86 | 0 YOUR_PROMPT 87 | 1 YOUR_PROMPT 88 | ... ... 89 | ``` 90 | - `id`: The id used to map generated images and prompts. 91 | - `caption`: The text prompt used to generate the images. 92 | 93 | 94 |
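As a concrete illustration, such a `.tsv` file can be produced with a few lines of Python (the file name and prompts below are placeholders):

```python
import csv

prompts = [
    "a photo of an astronaut riding a horse",
    "a bowl of fruit on a wooden table",
]

with open("my_calib.tsv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["id", "caption"])  # header row matching the structure shown above
    for i, caption in enumerate(prompts):
        writer.writerow([i, caption])
```

The resulting `my_calib.tsv` can then be passed through the `--dataset` argument (or `dataset="my_calib.tsv"` in the API).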
95 | -------------------------------------------------------------------------------- /auto_round/export/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from auto_round.export.register import EXPORT_FORMAT, PACKING_LAYER_WITH_FORMAT, register_format, register_layer_packing 16 | 17 | 18 | @register_format("auto_gptq") 19 | def _save_quantized_as_autogptq(*args, **kwargs): 20 | from auto_round.export.export_to_autogptq.export import save_quantized_as_autogptq 21 | 22 | return save_quantized_as_autogptq(*args, **kwargs) 23 | 24 | 25 | @register_format("itrex") 26 | def _save_quantized_as_itrex(*args, **kwargs): 27 | from auto_round.export.export_to_itrex.export import save_quantized_as_itrex 28 | 29 | return save_quantized_as_itrex(*args, **kwargs) 30 | 31 | 32 | @register_format("itrex_xpu") 33 | def _save_quantized_as_itrex_xpu(*args, **kwargs): 34 | from auto_round.export.export_to_itrex.export import save_quantized_as_itrex_xpu 35 | 36 | return save_quantized_as_itrex_xpu(*args, **kwargs) 37 | 38 | 39 | @register_format("auto_round") 40 | def _save_quantized_as_autoround(*args, **kwargs): 41 | from auto_round.export.export_to_autoround.export import save_quantized_as_autoround 42 | 43 | return save_quantized_as_autoround(*args, **kwargs) 44 | 45 | 46 | @register_format("auto_awq") 47 | def _save_quantized_as_autoawq(*args, **kwargs): 48 | from auto_round.export.export_to_awq.export import save_quantized_as_autoawq 49 | 50 | return save_quantized_as_autoawq(*args, **kwargs) 51 | 52 | 53 | @register_format("gguf") 54 | def _save_quantized_as_gguf(*args, **kwargs): 55 | from auto_round.export.export_to_gguf.export import save_quantized_as_gguf 56 | 57 | return save_quantized_as_gguf(*args, **kwargs) 58 | 59 | 60 | @register_layer_packing("auto_round") 61 | def _packing_layer_with_autoround(*args, **kwargs): 62 | from auto_round.export.export_to_autoround.export import pack_layer 63 | 64 | return pack_layer(*args, **kwargs) 65 | 66 | 67 | @register_layer_packing("auto_gptq") 68 | def _packing_layer_with_autogptq(*args, **kwargs): 69 | from auto_round.export.export_to_autogptq.export import pack_layer 70 | 71 | return pack_layer(*args, **kwargs) 72 | 73 | 74 | @register_layer_packing("auto_awq") 75 | def _packing_layer_with_autoawq(*args, **kwargs): 76 | from auto_round.export.export_to_awq.export import pack_layer 77 | 78 | return pack_layer(*args, **kwargs) 79 | 80 | 81 | @register_format("llm_compressor") 82 | def _save_quantized_as_llmcompressor(*args, **kwargs): 83 | from auto_round.export.export_to_llmcompressor.export import save_quantized_as_llmcompressor 84 | 85 | return save_quantized_as_llmcompressor(*args, **kwargs) 86 | 87 | 88 | @register_layer_packing("llm_compressor") 89 | def _packing_layer_with_llmcompressor(*args, **kwargs): 90 | from auto_round.export.export_to_llmcompressor.export import 
pack_layer 91 | 92 | return pack_layer(*args, **kwargs) 93 | -------------------------------------------------------------------------------- /test/test_cpu/test_llmc_integration.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme 4 | from llmcompressor import oneshot 5 | from llmcompressor.modifiers.autoround import AutoRoundModifier 6 | from transformers import AutoModelForCausalLM, AutoTokenizer 7 | 8 | from auto_round.calib_dataset import get_dataset 9 | 10 | recipe_str = """ 11 | quant_stage: 12 | quant_modifiers: 13 | AutoRoundModifier: 14 | ignore: ["lm_head"] 15 | iters: 1 16 | config_groups: 17 | group_0: 18 | targets: 19 | - "Linear" 20 | input_activations: null 21 | output_activations: null 22 | weights: 23 | num_bits: 4 24 | type: "int" 25 | symmetric: true 26 | strategy: group 27 | group_size: 128 28 | """ 29 | 30 | recipe_modifier_full = AutoRoundModifier( 31 | ignore=["lm_head"], 32 | iters=1, 33 | config_groups={ 34 | "group_0": QuantizationScheme( 35 | targets=["Linear"], 36 | weights=QuantizationArgs(num_bits=4, strategy="group", group_size=128), 37 | ) 38 | }, 39 | ) 40 | 41 | 42 | @pytest.mark.parametrize( 43 | "recipe", 44 | [ 45 | recipe_str, 46 | recipe_modifier_full, 47 | ], 48 | ) 49 | def test_oneshot_application(recipe, tmp_path): 50 | output = tmp_path / "oneshot_output" 51 | model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" 52 | tokenizer = AutoTokenizer.from_pretrained(model) 53 | dataset = get_dataset( 54 | tokenizer=tokenizer, 55 | seqlen=16, 56 | nsamples=2, 57 | ) 58 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 59 | 60 | oneshot( 61 | model=model, 62 | dataset=dataset, 63 | output_dir=output, 64 | recipe=recipe, 65 | ) 66 | model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map=device) 67 | 68 | # Check that the model is quantized 69 | # for compression_config - decompress() will attach a quantization_config 70 | # to the model as we decompress right away 71 | # for quantization_config - we have CompressedLinear which will only 72 | # decompress on the forward pass and does not call decompress(). 
Results 73 | # in a slightly different parameter tree to access the quant config 74 | quantization_config = model_loaded.config.quantization_config.quantization_config 75 | assert quantization_config is not None 76 | 77 | # check config is set properly 78 | assert "lm_head" in quantization_config.ignore 79 | assert len(quantization_config.config_groups) == 1 80 | quant_scheme = quantization_config.config_groups["group_0"] 81 | assert isinstance(quant_scheme, QuantizationScheme) 82 | 83 | weight_args = quantization_config.config_groups["group_0"].weights 84 | assert isinstance(weight_args, QuantizationArgs) 85 | assert weight_args.num_bits == 4 86 | 87 | # Check a specific layer is quantized 88 | targeted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj 89 | assert hasattr(targeted_linear_layer, "quantization_scheme") 90 | 91 | # Check lm-head is not quantized 92 | not_targeted = model_loaded.lm_head 93 | assert not hasattr(not_targeted, "quantization_scheme") 94 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.codespell] 2 | skip = 'pyproject.toml,.azure-pipelines/scripts/codeScan/codespell/autoround_dict.txt' 3 | ignore-words = ".azure-pipelines/scripts/codeScan/codespell/autoround_dict.txt" 4 | 5 | [tool.isort] 6 | profile = "black" 7 | line_length = 120 8 | known_first_party = ["auto_round", "auto_round_extension"] 9 | extend_skip_glob = ["**/__init__.py"] 10 | 11 | [tool.black] 12 | line-length = 120 13 | 14 | [tool.typos] 15 | [tool.typos.files] 16 | extend-exclude = [ 17 | ".azure-pipelines/scripts/codeScan/codespell/autoround_dict.txt" 18 | ] 19 | [tool.typos.default.extend-words] 20 | ue = "ue" 21 | endianess = "endianess" 22 | 23 | [tool.ruff] 24 | # Exclude a variety of commonly ignored directories. 25 | exclude = [ 26 | ".bzr", 27 | ".direnv", 28 | ".eggs", 29 | ".git", 30 | ".git-rewrite", 31 | ".hg", 32 | ".ipynb_checkpoints", 33 | ".mypy_cache", 34 | ".nox", 35 | ".pants.d", 36 | ".pyenv", 37 | ".pytest_cache", 38 | ".pytype", 39 | ".ruff_cache", 40 | ".svn", 41 | ".tox", 42 | ".venv", 43 | ".vscode", 44 | "__pypackages__", 45 | "_build", 46 | "buck-out", 47 | "build", 48 | "dist", 49 | "node_modules", 50 | "site-packages", 51 | "venv", 52 | ] 53 | 54 | # Same as Black. 55 | line-length = 120 56 | indent-width = 4 57 | 58 | # Assume Python 3.10 59 | target-version = "py310" 60 | 61 | [tool.ruff.lint] 62 | # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. 63 | # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or 64 | # McCabe complexity (`C901`) by default. 65 | select = ["E4", "E7", "E9", "F", "NPY", "FURB"] 66 | ignore = [ 67 | "E402", # Module level import not at top of file 68 | "E501", # Line too long (121 > 120 characters) 69 | "E721", # Do not compare types, use isinstance() 70 | "E722", # Do not use bare except 71 | "E731", # Do not assign a lambda expression, use a def 72 | "E741", # Do not use variables named ‘l’, ‘O’, or ‘I’ 73 | "F401", # {name} imported but unused 74 | "F403", # from {name} import * used; unable to detect undefined names 75 | "F841", # Local variable is assigned to but never used{name} 76 | ] 77 | 78 | # Allow fix for all enabled rules (when `--fix`) is provided. 79 | fixable = ["ALL"] 80 | unfixable = [] 81 | 82 | # Allow unused variables when underscore-prefixed. 
83 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 84 | 85 | [tool.ruff.format] 86 | # Like Black, use double quotes for strings. 87 | quote-style = "double" 88 | 89 | # Like Black, indent with spaces, rather than tabs. 90 | indent-style = "space" 91 | 92 | # Like Black, respect magic trailing commas. 93 | skip-magic-trailing-comma = false 94 | 95 | # Like Black, automatically detect the appropriate line ending. 96 | line-ending = "auto" 97 | 98 | # Enable auto-formatting of code examples in docstrings. Markdown, 99 | # reStructuredText code/literal blocks and doctests are all supported. 100 | # 101 | # This is currently disabled by default, but it is planned for this 102 | # to be opt-out in the future. 103 | docstring-code-format = false 104 | 105 | # Set the line length limit used when formatting code snippets in 106 | # docstrings. 107 | # 108 | # This only has an effect when the `docstring-code-format` setting is 109 | # enabled. 110 | docstring-code-line-length = "dynamic" 111 | -------------------------------------------------------------------------------- /auto_round_extension/vllm_ext/auto_round_ext.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from typing import Any 16 | 17 | import torch 18 | from vllm.logger import init_logger 19 | from vllm.model_executor.layers.fused_moe import FusedMoE 20 | from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod 21 | from vllm.model_executor.layers.quantization.auto_round import AutoRoundConfig as _BaseAutoRoundConfig 22 | 23 | from auto_round.schemes import QuantizationScheme 24 | from auto_round_extension.vllm_ext.quant_method_linear import AutoRoundQuantLinearMethod 25 | from auto_round_extension.vllm_ext.quant_method_moe import AutoRoundMoEMethod 26 | 27 | logger = init_logger(__name__) 28 | 29 | 30 | class AutoRoundExtensionConfig(_BaseAutoRoundConfig): 31 | SUPPORTED_DTYPES = _BaseAutoRoundConfig.SUPPORTED_DTYPES.union({"mx_fp"}) 32 | SUPPORTED_FORMATS = _BaseAutoRoundConfig.SUPPORTED_FORMATS.union({"auto_round:llm_compressor"}) 33 | 34 | def get_quant_method(self, layer: torch.nn.Module, prefix: str): 35 | # FIXME: (yi) make it compatible with `AutoRoundConfig` 36 | from vllm.attention.layer import Attention 37 | 38 | if isinstance(layer, Attention): 39 | from auto_round_extension.vllm_ext.kv_cache import AutoRoundKVCacheMethod 40 | 41 | return AutoRoundKVCacheMethod(self) 42 | if isinstance(layer, FusedMoE): 43 | quant_method = AutoRoundMoEMethod.get_moe_method(self, layer, prefix) 44 | return quant_method 45 | elif isinstance(layer, LinearBase): 46 | return AutoRoundQuantLinearMethod.get_method(self, layer, prefix) 47 | else: 48 | return None 49 | 50 | @staticmethod 51 | def _parse_quant_scheme(config: dict): 52 | quant_scheme_attrs = QuantizationScheme.get_attributes() 53 | filter_config = {key: value for key, value in config.items() if key in quant_scheme_attrs} 54 | quant_scheme = QuantizationScheme.from_dict(filter_config) 55 | return quant_scheme 56 | 57 | @classmethod 58 | def from_config(cls, config: dict[str, Any]) -> _BaseAutoRoundConfig: 59 | ar_config = super().from_config(config) 60 | # TODO: (yi) refine below implementation 61 | quant_scheme = AutoRoundExtensionConfig._parse_quant_scheme(config) 62 | layer_schemes = {} 63 | layer_schemes = {} # ensure dict 64 | extra_config = getattr(ar_config, "extra_config", None) 65 | if extra_config is not None: 66 | for layer_name, layer_config in extra_config.items(): 67 | layer_schemes[layer_name] = AutoRoundExtensionConfig._parse_quant_scheme(layer_config) 68 | ar_config.quant_scheme = quant_scheme 69 | ar_config.layer_schemes = layer_schemes 70 | return ar_config 71 | 72 | 73 | # Patch vLLM’s AutoRoundConfig at import time 74 | import vllm.model_executor.layers.quantization.auto_round as _auto_round_module 75 | 76 | _auto_round_module.AutoRoundConfig = AutoRoundExtensionConfig 77 | -------------------------------------------------------------------------------- /auto_round/modelling/llama4.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Note: adapted from # https://github.com/vllm-project/llm-compressor/blob/main/src/llmcompressor/modeling/llama4.py 15 | 16 | __all__ = ["get_replacement_info"] 17 | 18 | 19 | import torch 20 | from transformers.modeling_utils import no_init_weights 21 | from transformers.models.llama4.modeling_llama4 import Llama4TextMLP 22 | 23 | from auto_round.utils import unsupported_meta_device 24 | 25 | 26 | class SequentialLlama4TextExperts(torch.nn.ModuleList): 27 | def __init__(self, config, original): 28 | self.num_experts = original.gate_up_proj.shape[0] 29 | with no_init_weights(): 30 | super().__init__([Llama4TextMLP(config) for _ in range(self.num_experts)]) 31 | 32 | if not unsupported_meta_device(original): 33 | intermediate_size = original.down_proj.shape[1] 34 | 35 | for i in range(self.num_experts): 36 | gate_up = original.gate_up_proj[i] 37 | down = original.down_proj[i] 38 | gate_proj = gate_up[:, :intermediate_size] 39 | up_proj = gate_up[:, intermediate_size:] 40 | 41 | self[i].gate_proj.weight.data.copy_(gate_proj.t()) 42 | self[i].up_proj.weight.data.copy_(up_proj.t()) 43 | self[i].down_proj.weight.data.copy_(down.t()) 44 | 45 | 46 | class SequentialLlama4TextMoe(torch.nn.Module): 47 | def __init__(self, config, original): 48 | super().__init__() 49 | self.top_k = config.num_experts_per_tok 50 | self.hidden_dim = config.hidden_size 51 | self.num_experts = config.num_local_experts 52 | self.experts = SequentialLlama4TextExperts(config, original.experts) 53 | self.router = original.router 54 | self.shared_expert = original.shared_expert 55 | 56 | def forward(self, hidden_states: torch.Tensor): 57 | hidden_states = hidden_states.reshape(-1, self.hidden_dim) 58 | router_logits = self.router(hidden_states) 59 | if isinstance(router_logits, tuple): 60 | router_scores, router_logits = router_logits 61 | router_scores = router_scores.t() 62 | else: 63 | # transformers < 4.54.0 only returns router_logits 64 | router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=1) 65 | 66 | router_scores = ( 67 | torch.full_like(router_logits, float("-inf")) 68 | .scatter_(1, router_indices, router_top_value) 69 | .transpose(0, 1) 70 | ) 71 | router_scores = torch.sigmoid(router_scores.float()).to(hidden_states.dtype) 72 | 73 | out = self.shared_expert(hidden_states) 74 | for i in range(self.num_experts): 75 | out += self.experts[i](hidden_states) * router_scores[i].reshape(-1, 1) 76 | 77 | return out, router_logits 78 | 79 | 80 | def get_replacement_info(config): 81 | return SequentialLlama4TextMoe, config.get_text_config(), "Llama4TextMoe" 82 | --------------------------------------------------------------------------------
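A usage sketch for `get_replacement_info` defined in `llama4.py` above (a hypothetical traversal written for illustration, not the library's own replacement routine):

```python
import torch

from auto_round.modelling.llama4 import get_replacement_info


def replace_llama4_moe(model: torch.nn.Module, config) -> None:
    """Swap every Llama4TextMoe module in `model` for the sequential version defined above."""
    replacement_cls, text_config, target_name = get_replacement_info(config)
    # Collect matching module names first so the tree is not mutated while iterating.
    targets = [name for name, m in model.named_modules() if m.__class__.__name__ == target_name]
    for name in targets:
        original = model.get_submodule(name)
        parent_name, _, child_name = name.rpartition(".")
        parent = model.get_submodule(parent_name) if parent_name else model
        setattr(parent, child_name, replacement_cls(text_config, original))
```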