├── test
│   ├── __init__.py
│   ├── test_hpu
│   │   ├── requirements.txt
│   │   ├── conftest.py
│   │   ├── _test_helpers.py
│   │   └── test_auto_round.py
│   ├── test_cuda
│   │   ├── requirements_diffusion.txt
│   │   ├── test_llmc_integration.py
│   │   ├── requirements.txt
│   │   ├── requirements_vlm.txt
│   │   ├── test_multiple_card_calib.py
│   │   ├── _test_helpers.py
│   │   ├── test_calib_dataset.py
│   │   ├── test_conv1d.py
│   │   ├── test_vllm.py
│   │   ├── test_diffusion.py
│   │   ├── test_mxfp_and_nvfp_quant.py
│   │   ├── test_alg_ext.py
│   │   └── test_packing.py
│   └── test_cpu
│       ├── requirements.txt
│       ├── test_script.py
│       ├── test_utils.py
│       ├── _test_helpers.py
│       ├── test_alg_ext.py
│       ├── test_conv1d.py
│       ├── test_autoopt.py
│       ├── test_model_scope.py
│       ├── test_logger.py
│       ├── test_auto_scheme.py
│       ├── test_load_awq_gptq.py
│       ├── test_mxfp_save_load.py
│       ├── test_woq_linear.py
│       ├── test_cli_usage.py
│       ├── test_gpt_oss.py
│       ├── test_autoround_acc.py
│       └── test_llmc_integration.py
├── .azure-pipelines
│   ├── scripts
│   │   ├── codeScan
│   │   │   ├── codespell
│   │   │   │   └── autoround_dict.txt
│   │   │   ├── bandit
│   │   │   │   └── bandit.sh
│   │   │   └── pylint
│   │   │       └── pylint.sh
│   │   ├── ut
│   │   │   ├── .coverage
│   │   │   ├── collect_log.sh
│   │   │   ├── run_ut_hpu.sh
│   │   │   └── run_ut.sh
│   │   └── change_color.sh
│   ├── license_template.txt
│   ├── code-scan.yml
│   ├── docker
│   │   ├── DockerfileCodeScan.devel
│   │   └── Dockerfile.devel
│   ├── unit-test-hpu.yml
│   ├── template
│   │   ├── code-scan-template.yml
│   │   └── ut-template.yml
│   ├── compatibility-test.yml
│   └── unit-test.yml
├── requirements-cpu.txt
├── auto_round
│   ├── alg_ext.pyd
│   ├── alg_ext.abi3.so
│   ├── auto_scheme
│   │   ├── default_alg.pyd
│   │   ├── default_alg.abi3.so
│   │   ├── __init__.py
│   │   └── register.py
│   ├── compressors
│   │   ├── mllm
│   │   │   ├── templates
│   │   │   │   ├── llava.json
│   │   │   │   ├── phi3_v.json
│   │   │   │   ├── cogvlm2.json
│   │   │   │   └── default.json
│   │   │   ├── __init__.py
│   │   │   └── utils.py
│   │   ├── diffusion
│   │   │   ├── __init__.py
│   │   │   └── README.md
│   │   └── __init__.py
│   ├── eval
│   │   ├── __init__.py
│   │   └── evaluation.py
│   ├── modelling
│   │   ├── __init__.py
│   │   └── llama4.py
│   ├── experimental
│   │   ├── __init__.py
│   │   ├── qmodules
│   │   │   ├── __init__.py
│   │   │   └── base.py
│   │   └── utils.py
│   ├── export
│   │   ├── export_to_autogptq
│   │   │   └── __init__.py
│   │   ├── export_to_gguf
│   │   │   └── __init__.py
│   │   ├── export_to_awq
│   │   │   └── __init__.py
│   │   ├── export_to_llmcompressor
│   │   │   ├── __init__.py
│   │   │   └── utils.py
│   │   ├── export_to_autoround
│   │   │   ├── __init__.py
│   │   │   └── utils.py
│   │   ├── export_to_itrex
│   │   │   └── __init__.py
│   │   ├── register.py
│   │   └── __init__.py
│   ├── inference
│   │   ├── __init__.py
│   │   └── utils.py
│   ├── utils
│   │   └── __init__.py
│   ├── version.py
│   ├── data_type
│   │   ├── __init__.py
│   │   ├── register.py
│   │   └── w4fp8.py
│   ├── __init__.py
│   └── envs.py
├── docs
│   ├── imgs
│   │   ├── AutoRound.png
│   │   ├── full_range_sym.png
│   │   ├── autoround_overview.png
│   │   └── norm_bias_overview.png
│   ├── full_range_sym.md
│   ├── mxnv_acc.md
│   ├── alg_202508.md
│   ├── gguf_alg_ext_acc.md
│   ├── publication_list.md
│   ├── tuning_norm_bias.md
│   ├── opt_rtn.md
│   └── auto_scheme_acc.md
├── MANIFEST.in
├── requirements-lib.txt
├── .gitignore
├── requirements.txt
├── auto_round_extension
│   ├── vllm_ext
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── kv_cache.py
│   │   ├── quant_impl.py
│   │   ├── sitecustomize.py
│   │   ├── envs_ext.py
│   │   ├── tests
│   │   │   ├── test_models.py
│   │   │   └── test_fp8kv.py
│   │   ├── mxfp8_qdq_utils.py
│   │   ├── quant_method_moe.py
│   │   └── auto_round_ext.py
│   ├── __init__.py
│   ├── cuda
│   │   └── __init__.py
│   ├── hpu
│   │   └── __init__.py
│   ├── torch
│   │   └── __init__.py
│   ├── triton
│   │   ├── __init__.py
│   │   ├── triton_utils
│   │   │   ├── __init__.py
│   │   │   └── mixin.py
│   │   └── triton_utils_zp
│   │       ├── __init__.py
│   │       └── mixin.py
│   ├── ark
│   │   └── __init__.py
│   └── ipex
│       └── __init__.py
├── SECURITY.md
├── setup.cfg
├── .pre-commit-config.yaml
├── .github
│   └── workflows
│       ├── manual-binary-build-publish.yml
│       └── compatibility-test.yml
├── CONTRIBUTING.md
└── pyproject.toml

/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/test_hpu/requirements.txt:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/.azure-pipelines/scripts/codeScan/codespell/autoround_dict.txt:
--------------------------------------------------------------------------------
1 | endianess
--------------------------------------------------------------------------------
/requirements-cpu.txt:
--------------------------------------------------------------------------------
1 | numba
2 | tbb
3 | intel-extension-for-pytorch
--------------------------------------------------------------------------------
/test/test_cuda/requirements_diffusion.txt:
--------------------------------------------------------------------------------
1 | diffusers
2 | image-reward
3 | clip
--------------------------------------------------------------------------------
/test/test_cuda/test_llmc_integration.py:
--------------------------------------------------------------------------------
1 | ../test_cpu/test_llmc_integration.py
--------------------------------------------------------------------------------
/auto_round/alg_ext.pyd:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/auto-round/main/auto_round/alg_ext.pyd
--------------------------------------------------------------------------------
/docs/imgs/AutoRound.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/auto-round/main/docs/imgs/AutoRound.png
--------------------------------------------------------------------------------
/auto_round/alg_ext.abi3.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/auto-round/main/auto_round/alg_ext.abi3.so
--------------------------------------------------------------------------------
/docs/imgs/full_range_sym.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/auto-round/main/docs/imgs/full_range_sym.png
--------------------------------------------------------------------------------
/docs/imgs/autoround_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/auto-round/main/docs/imgs/autoround_overview.png
--------------------------------------------------------------------------------
/docs/imgs/norm_bias_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/auto-round/main/docs/imgs/norm_bias_overview.png
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
2 | include requirements-cpu.txt
3 | include requirements-lib.txt
4 | exclude test/*
--------------------------------------------------------------------------------
/auto_round/auto_scheme/default_alg.pyd:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/auto-round/main/auto_round/auto_scheme/default_alg.pyd -------------------------------------------------------------------------------- /auto_round/auto_scheme/default_alg.abi3.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/auto-round/main/auto_round/auto_scheme/default_alg.abi3.so -------------------------------------------------------------------------------- /requirements-lib.txt: -------------------------------------------------------------------------------- 1 | accelerate>=1.10.0 2 | datasets 3 | py-cpuinfo 4 | sentencepiece 5 | numpy 6 | tqdm 7 | packaging 8 | pillow 9 | transformers 10 | threadpoolctl 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vs 2 | .vscode 3 | __pycache__ 4 | *.egg-info/ 5 | build/* 6 | .eggs/ 7 | dist/ 8 | .cache/ 9 | .clangd 10 | CMakeUserPresets.json 11 | tmp_autoround/ 12 | ut_log_dir/ 13 | -------------------------------------------------------------------------------- /auto_round/compressors/mllm/templates/llava.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "llava", 3 | "replace_tokens": null, 4 | "processor": "llava", 5 | "extra_encode" : false, 6 | "default_dataset": "NeelNanda/pile-10k" 7 | } -------------------------------------------------------------------------------- /test/test_cpu/requirements.txt: -------------------------------------------------------------------------------- 1 | addict 2 | modelscope 3 | gguf 4 | sentencepiece 5 | torchvision 6 | parameterized 7 | pillow 8 | numba 9 | llmcompressor @ git+https://github.com/vllm-project/llm-compressor.git@main 10 | lm_eval -------------------------------------------------------------------------------- /auto_round/compressors/mllm/templates/phi3_v.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "phi3_v", 3 | "replace_tokens": ["", "<|image_1|>"], 4 | "processor": "hf", 5 | "extra_encode" : false, 6 | "default_dataset": "NeelNanda/pile-10k" 7 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 1.5.1=4.38 -------------------------------------------------------------------------------- /auto_round_extension/vllm_ext/README.md: -------------------------------------------------------------------------------- 1 | - Build and Install vLLM 2 | 3 | ``` 4 | git clone --branch fused-moe-ar https://github.com/yiliu30/vllm-fork.git 5 | VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv 6 | ``` 7 | 8 | 9 | - Enable vLLM-Ext at Runtime 10 | ```bash 11 | VLLM_ENABLE_AR_EXT=1 vllm serve ... 
12 | ``` -------------------------------------------------------------------------------- /auto_round/compressors/mllm/templates/cogvlm2.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "cogvlm2", 3 | "format_user": "Question: {{content}} ", 4 | "format_assistant": "Answer: {{content}}\n", 5 | "replace_tokens": ["\n", ""], 6 | "processor": "cogvlm2", 7 | "extra_encode" : true, 8 | "default_dataset": "NeelNanda/pile-10k" 9 | } -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation. 3 | 4 | ## Reporting a Vulnerability 5 | Please report any security vulnerabilities in this project utilizing the guidelines [here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). 6 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/ut/.coverage: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | 4 | [paths] 5 | source = 6 | auto_round/ 7 | /auto-round/auto_round/ 8 | */site-packages/auto_round/ 9 | */dist-packages/auto_round/ 10 | 11 | [report] 12 | include = 13 | */auto_round/** 14 | */auto_round_extension/** 15 | exclude_lines = 16 | pragma: no cover 17 | raise NotImplementedError 18 | raise TypeError 19 | except ImportError: 20 | except Exception as e: -------------------------------------------------------------------------------- /auto_round/compressors/mllm/templates/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "default", 3 | "format_user": "{{content}}", 4 | "format_assistant": "{{content}}", 5 | "format_system": "{{content}}", 6 | "format_function": "", 7 | "format_observation": "", 8 | "format_separator": "\n", 9 | "default_system": "You are a helpful assistant.", 10 | "replace_tokens": null, 11 | "extra_encode" : false, 12 | "default_dataset": "NeelNanda/pile-10k", 13 | "processor": "hf" 14 | } -------------------------------------------------------------------------------- /test/test_cuda/requirements.txt: -------------------------------------------------------------------------------- 1 | # autoawq 2 | # pip install -v git+https://github.com/casper-hansen/AutoAWQ.git --no-build-isolation 3 | auto-gptq 4 | einops 5 | # gptqmodel>=2.0 6 | # pip install -v git+https://github.com/ModelCloud/GPTQModel.git@v2.2.0 --no-build-isolation 7 | intel-extension-for-pytorch 8 | lm-eval>=0.4.9.1 9 | optimum 10 | pandas 11 | parameterized 12 | pillow 13 | torchvision 14 | numba 15 | vllm>=0.8.5.post1 16 | llmcompressor @ git+https://github.com/vllm-project/llm-compressor.git@main -------------------------------------------------------------------------------- /test/test_cpu/test_script.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import unittest 4 | 5 | sys.path.insert(0, "../..") 6 | 7 | 8 | class TestScript(unittest.TestCase): 9 | def test_default(self): 10 | os.system( 11 | """ 12 | cd ../.. 
&& 13 | python -m auto_round 14 | --iters 2 15 | --deployment_device fake 16 | --output_dir ./tmp_script_test""" 17 | ) 18 | 19 | 20 | if __name__ == "__main__": 21 | unittest.main() 22 | -------------------------------------------------------------------------------- /test/test_cuda/requirements_vlm.txt: -------------------------------------------------------------------------------- 1 | # git+https://github.com/haotian-liu/LLaVA.git@v1.2.2 --no-deps 2 | # pip install git+https://github.com/deepseek-ai/DeepSeek-VL2.git --no-deps 3 | # pip install -v git+https://github.com/casper-hansen/AutoAWQ.git@v0.2.0 --no-build-isolation 4 | # pip install flash-attn==2.7.4.post1 --no-build-isolation 5 | bitsandbytes 6 | einops 7 | flash-attn 8 | intel-extension-for-transformers 9 | lm-eval>=0.4.2,<0.5 10 | optimum 11 | pandas 12 | protobuf 13 | pillow 14 | tiktoken 15 | torchvision 16 | triton 17 | xformers 18 | timm 19 | -------------------------------------------------------------------------------- /.azure-pipelines/license_template.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024 Intel Corporation 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /auto_round/eval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /auto_round/modelling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /auto_round_extension/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /auto_round/experimental/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /auto_round_extension/cuda/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /auto_round_extension/hpu/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /auto_round_extension/torch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /auto_round_extension/triton/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /auto_round/export/export_to_autogptq/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /auto_round/export/export_to_gguf/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /auto_round_extension/triton/triton_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /auto_round_extension/triton/triton_utils_zp/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /auto_round/export/export_to_awq/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .export import save_quantized_as_autoawq 16 | -------------------------------------------------------------------------------- /auto_round/export/export_to_llmcompressor/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | from .config import check_compressed_tensors_supported 15 | -------------------------------------------------------------------------------- /auto_round/export/export_to_autoround/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .export import save_quantized_as_autoround, AutoRoundFormat 16 | -------------------------------------------------------------------------------- /auto_round/inference/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from auto_round.inference.convert_model import convert_hf_model, infer_target_device, post_init 15 | -------------------------------------------------------------------------------- /auto_round/export/export_to_itrex/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .export import save_quantized_as_itrex, pack_model 15 | from .config import QuantConfig 16 | -------------------------------------------------------------------------------- /auto_round/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from auto_round.utils.device import * 16 | from auto_round.utils.common import * 17 | from auto_round.utils.model import * 18 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | license_files = 3 | LICENSE 4 | third-party-programs.txt 5 | 6 | [options.entry_points] 7 | console_scripts = 8 | auto_round = auto_round.__main__:run 9 | auto-round = auto_round.__main__:run 10 | auto_round_eval = auto_round.__main__:run_eval 11 | auto-round-eval = auto_round.__main__:run_eval 12 | auto_round_mllm = auto_round.__main__:run_mllm 13 | auto-round-mllm = auto_round.__main__:run_mllm 14 | auto-round-fast = auto_round.__main__:run_fast 15 | auto_round_fast = auto_round.__main__:run_fast 16 | auto-round-best = auto_round.__main__:run_best 17 | auto_round_best = auto_round.__main__:run_best 18 | auto-round-light = auto_round.__main__:run_light 19 | auto_round_light = auto_round.__main__:run_light 20 | 21 | -------------------------------------------------------------------------------- /auto_round/version.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Intel® auto-round: An open-source Python library 15 | supporting popular model weight only compression based on signround.""" 16 | 17 | __version__ = "0.9.3" 18 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/ut/collect_log.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | uv pip install coverage 4 | export COVERAGE_RCFILE=${BUILD_SOURCESDIRECTORY}/.azure-pipelines/scripts/ut/.coverage 5 | coverage_log="${BUILD_SOURCESDIRECTORY}/log_dir/coverage_log" 6 | cd "${BUILD_SOURCESDIRECTORY}/log_dir" 7 | 8 | echo "collect coverage for PR branch" 9 | mkdir -p coverage_PR 10 | cp ut-*/.coverage.* ./coverage_PR/ 11 | cd coverage_PR 12 | coverage combine --keep --rcfile=${COVERAGE_RCFILE} 13 | 14 | cp .coverage "${BUILD_SOURCESDIRECTORY}" 15 | cd "${BUILD_SOURCESDIRECTORY}" 16 | coverage report -m --rcfile=${COVERAGE_RCFILE} | tee ${coverage_log} 17 | coverage html -d log_dir/coverage_PR/htmlcov --rcfile=${COVERAGE_RCFILE} 18 | coverage xml -o log_dir/coverage_PR/coverage.xml --rcfile=${COVERAGE_RCFILE} 19 | ls -l log_dir/coverage_PR/htmlcov 20 | -------------------------------------------------------------------------------- /docs/full_range_sym.md: -------------------------------------------------------------------------------- 1 | W2G32 nsamples 512,iter 200, average accuracy of 10 tasks 2 | 3 | | Models | gptq_sym | asym | full_range_sym | 4 | |----------------------------|----------|------------|----------------| 5 | | Meta-Llama-3.1-8B-Instruct | 0.4500 | 0.52802 | **0.5381** | 6 | | Qwen2-7B | 0.5229 | **0.5559** | 0.5486 | 7 | 8 | W4G128 nsamples 128,iter 200, average accuracy of 10 tasks 9 | 10 | | Models | asym | full_range_sym | 11 | |----------------------------|------------|----------------| 12 | | Meta-Llama-3.1-8B-Instruct | 0.6342 | **0.6370** | 13 | | Qwen2-7B | 0.6143 | **0.6167** | 14 | | Mistral-7B-Instruct-v0.2 | 0.6606 | **0.6635** | 15 | | Phi-3-mini-4k-instruct | **0.6475** | 0.6432 | 16 | -------------------------------------------------------------------------------- /auto_round_extension/ark/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from auto_round_extension.ark.qlinear import QuantLinear, QuantLinearGPTQ, QuantLinearAWQ 16 | 17 | qlinear_classes = (QuantLinear, QuantLinearGPTQ) 18 | 19 | awq_classes = (QuantLinearAWQ,) 20 | -------------------------------------------------------------------------------- /auto_round/compressors/diffusion/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from auto_round.compressors.diffusion.dataset import get_diffusion_dataloader 16 | from auto_round.compressors.diffusion.compressor import DiffusionCompressor 17 | from auto_round.compressors.diffusion.eval import diffusion_eval 18 | -------------------------------------------------------------------------------- /auto_round/experimental/qmodules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from auto_round.experimental.qmodules.mx import MXFP4QuantLinear, MXFP8QuantLinear 16 | from auto_round.experimental.qmodules.nvfp4 import NVFP4QuantLinear 17 | from auto_round.experimental.qmodules.fp8_static import WeightFP8ActFP8StaticQuantLinear 18 | -------------------------------------------------------------------------------- /auto_round_extension/ipex/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from auto_round_extension.ipex.qlinear_ipex_awq import QuantLinear as IpexAWQQuantLinear 16 | from auto_round_extension.ipex.qlinear_ipex_gptq import ( 17 | QuantLinear as IpexGPTQQuantLinear, 18 | ) 19 | 20 | ipex_qlinear_classes = (IpexAWQQuantLinear, IpexGPTQQuantLinear) 21 | -------------------------------------------------------------------------------- /test/test_hpu/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Mapping 3 | 4 | import pytest 5 | 6 | 7 | def pytest_addoption(parser): 8 | parser.addoption( 9 | "--mode", 10 | action="store", 11 | default="lazy", 12 | help="{compile|lazy}, default lazy. 
Choose mode to run tests", 13 | ) 14 | 15 | 16 | backup_env = pytest.StashKey[Mapping]() 17 | 18 | 19 | def pytest_configure(config): 20 | pytest.mode = config.getoption("--mode") 21 | assert pytest.mode.lower() in ["lazy", "compile"] 22 | 23 | config.stash[backup_env] = os.environ 24 | 25 | if pytest.mode == "lazy": 26 | os.environ["PT_HPU_LAZY_MODE"] = "1" 27 | elif pytest.mode == "compile": 28 | os.environ["PT_HPU_LAZY_MODE"] = "0" 29 | os.environ["PT_ENABLE_INT64_SUPPORT"] = "1" 30 | 31 | 32 | def pytest_unconfigure(config): 33 | os.environ.clear() 34 | os.environ.update(config.stash[backup_env]) 35 | -------------------------------------------------------------------------------- /auto_round/compressors/mllm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from auto_round.compressors.mllm.dataset import get_mllm_dataloader 16 | from auto_round.compressors.mllm.template import Template, get_template, TEMPLATES 17 | from auto_round.compressors.mllm.compressor import MLLMCompressor 18 | from auto_round.compressors.mllm.eval import mllm_eval, lmms_eval 19 | -------------------------------------------------------------------------------- /auto_round/data_type/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import auto_round.data_type.int 16 | import auto_round.data_type.mxfp 17 | import auto_round.data_type.fp8 18 | from auto_round.data_type.register import QUANT_FUNC_WITH_DTYPE 19 | import auto_round.data_type.w4fp8 20 | from auto_round.data_type.utils import get_quant_func, update_fused_layer_global_scales 21 | import auto_round.data_type.nvfp 22 | import auto_round.data_type.gguf 23 | -------------------------------------------------------------------------------- /auto_round/compressors/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from auto_round.compressors.adam import AdamCompressor 16 | from auto_round.compressors.base import BaseCompressor 17 | from auto_round.compressors.base import LLMCompressor 18 | from auto_round.compressors.mllm.compressor import MLLMCompressor 19 | from auto_round.compressors.diffusion.compressor import DiffusionCompressor 20 | from auto_round.compressors.config import ( 21 | DiffusionExtraConfig, 22 | ExtraConfig, 23 | MLLMExtraConfig, 24 | SchemeExtraConfig, 25 | TuningExtraConfig, 26 | ) 27 | -------------------------------------------------------------------------------- /docs/mxnv_acc.md: -------------------------------------------------------------------------------- 1 | Average accuracy of hellaswag,lambada_openai,mmlu,piqa,winogrande. 2 | 3 | We evaluated using a fake model since we currently have no access to devices for running the real models. However, we have verified that in most cases the fake model closely matches the real model. 4 | 5 | | mxfp4 g32 | llama3.1-8B-Instruct | Qwen2-7.5-Instruct | Phi4 | Qwen3-32B | 6 | |:-------------------|:----------------------:|:--------------------:|:---------:|:-----------:| 7 | | RTN | 0.6212 | 0.6550 | 0.7167 | 0.6901 | 8 | | AutoRound | 0.6686 | 0.6758 | 0.7247 | 0.7211 | 9 | | AutoRound+alg_ext | 0.6732 | 0.6809 | 0.7225 | 0.7201 | 10 | 11 | | nvfp4 g16 | llama3.1-8B-Instruct | Qwen2-7.5-Instruct | Phi4 | Qwen3-32B | 12 | |:-------------------|:----------------------:|:--------------------:|:---------:|:-----------:| 13 | | RTN | 0.6876 | 0.6906 | 0.7296 | 0.7164 | 14 | | AutoRound | 0.6918 | 0.6973 | 0.7306 | 0.7306 | 15 | | AutoRound+alg_ext | 0.6965 | 0.6989 | 0.7318 | 0.7295 | 16 | -------------------------------------------------------------------------------- /test/test_cpu/test_utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from unittest.mock import patch 3 | 4 | sys.path.insert(0, "../..") 5 | import auto_round.utils.device as auto_round_utils 6 | 7 | 8 | class TestPackingWithNumba: 9 | 10 | @patch.object(auto_round_utils, "_is_tbb_installed", lambda: False) 11 | def test_tbb_not_installed(self): 12 | assert auto_round_utils.is_tbb_available() is False, "`is_tbb_available` should return False." 13 | assert auto_round_utils.can_pack_with_numba() is False, "`can_pack_with_numba` should return False." 14 | 15 | @patch.object(auto_round_utils, "_is_tbb_installed", lambda: True) 16 | @patch.object(auto_round_utils, "_is_tbb_configured", lambda: False) 17 | def test_tbb_installed_but_not_configured_right(self): 18 | assert auto_round_utils.is_tbb_available() is False, "`is_tbb_available` should return False." 19 | assert auto_round_utils.can_pack_with_numba() is False, "`can_pack_with_numba` should return False." 20 | 21 | @patch.object(auto_round_utils, "is_numba_available", lambda: False) 22 | def test_numba_not_installed(self): 23 | assert auto_round_utils.can_pack_with_numba() is False, "`can_pack_with_numba` should return False." 
24 | -------------------------------------------------------------------------------- /auto_round/auto_scheme/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from auto_round.logger import logger 17 | 18 | from auto_round.auto_scheme.gen_auto_scheme import AutoScheme 19 | 20 | 21 | def __getattr__(name): 22 | if name == "AUTO_SCHEME_METHODS": 23 | try: 24 | import auto_round.auto_scheme.default_alg 25 | except ImportError: 26 | logger.warning("AutoScheme is currently supported only on Linux.") 27 | 28 | from auto_round.auto_scheme.register import AUTO_SCHEME_METHODS 29 | 30 | return AUTO_SCHEME_METHODS 31 | 32 | raise AttributeError(f"auto-scheme has no attribute '{name}'") 33 | -------------------------------------------------------------------------------- /auto_round_extension/vllm_ext/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # ==---------------------------------------------------------------------------== 16 | # Apply the extension 17 | # ==---------------------------------------------------------------------------== 18 | 19 | 20 | def apply(): 21 | import auto_round_extension.vllm_ext.auto_round_ext 22 | import auto_round_extension.vllm_ext.envs_ext 23 | 24 | print("*****************************************************************************") 25 | print("* !!! 
VLLM_ENABLE_AR_EXT is set to 1, applying auto_round_vllm_extension *") 26 | print("*****************************************************************************") 27 | -------------------------------------------------------------------------------- /.azure-pipelines/code-scan.yml: -------------------------------------------------------------------------------- 1 | trigger: none 2 | 3 | pr: 4 | autoCancel: true 5 | drafts: false 6 | branches: 7 | include: 8 | - main 9 | paths: 10 | include: 11 | - auto_round 12 | - auto_round_extension 13 | - setup.py 14 | - requirements.txt 15 | - requirements-cpu.txt 16 | - requirements-lib.txt 17 | - .azure-pipelines/code-scan.yml 18 | - .azure-pipelines/scripts/codeScan 19 | 20 | pool: 21 | vmImage: "ubuntu-latest" 22 | 23 | variables: 24 | CODE_SCAN_LOG_PATH: ".azure-pipelines/scripts/codeScan/scanLog" 25 | 26 | stages: 27 | 28 | - stage: BanditCodeScan 29 | displayName: Bandit Code Scan 30 | dependsOn: [] 31 | jobs: 32 | - job: Bandit 33 | displayName: Bandit 34 | steps: 35 | - template: template/code-scan-template.yml 36 | parameters: 37 | codeScanFileName: "bandit" 38 | uploadPath: "bandit.log" 39 | 40 | - stage: PylintCodeScan 41 | displayName: Pylint Code Scan 42 | dependsOn: [] 43 | jobs: 44 | - job: Pylint 45 | displayName: Pylint 46 | steps: 47 | - template: template/code-scan-template.yml 48 | parameters: 49 | codeScanFileName: "pylint" 50 | uploadPath: "pylint.json" 51 | -------------------------------------------------------------------------------- /auto_round/auto_scheme/register.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | AUTO_SCHEME_METHODS = {} 16 | 17 | 18 | def register_scheme_methods(names): 19 | """Class decorator to register a mixed precision algorithm to the registry. 20 | 21 | Decorator function used before a Pattern subclass. 22 | 23 | Args: 24 | names: A string. Define the export type. 25 | 26 | Returns: 27 | cls: The class of register. 28 | """ 29 | 30 | def register(alg): 31 | if isinstance(names, (tuple, list)): 32 | for name in names: 33 | AUTO_SCHEME_METHODS[name] = alg 34 | else: 35 | AUTO_SCHEME_METHODS[names] = alg 36 | 37 | return alg 38 | 39 | return register 40 | -------------------------------------------------------------------------------- /auto_round/data_type/register.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | QUANT_FUNC_WITH_DTYPE = {} 17 | 18 | 19 | def register_dtype(names): 20 | """Class decorator to register a EXPORT subclass to the registry. 21 | 22 | Decorator function used before a Pattern subclass. 23 | 24 | Args: 25 | names: A string. Define the export type. 26 | 27 | Returns: 28 | cls: The class of register. 29 | """ 30 | 31 | def register(dtype): 32 | if isinstance(names, (tuple, list)): 33 | for name in names: 34 | QUANT_FUNC_WITH_DTYPE[name] = dtype 35 | else: 36 | QUANT_FUNC_WITH_DTYPE[names] = dtype 37 | 38 | return dtype 39 | 40 | return register 41 | -------------------------------------------------------------------------------- /test/test_cuda/test_multiple_card_calib.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import shutil 4 | import sys 5 | import unittest 6 | 7 | sys.path.insert(0, "../..") 8 | 9 | from auto_round.testing_utils import multi_card 10 | 11 | 12 | def get_accuracy(data): 13 | match = re.search(r"\|acc\s+\|[↑↓]\s+\|\s+([\d.]+)\|", data) 14 | 15 | if match: 16 | accuracy = float(match.group(1)) 17 | return accuracy 18 | else: 19 | return 0.0 20 | 21 | 22 | class TestAutoRound(unittest.TestCase): 23 | @classmethod 24 | def setUpClass(self): 25 | self.save_dir = "./saved" 26 | self.tasks = "lambada_openai" 27 | 28 | @classmethod 29 | def tearDownClass(self): 30 | shutil.rmtree("./saved", ignore_errors=True) 31 | shutil.rmtree("runs", ignore_errors=True) 32 | 33 | @multi_card 34 | def test_multiple_card_calib(self): 35 | python_path = sys.executable 36 | 37 | ##test llm script 38 | res = os.system( 39 | f"cd ../.. 
&& {python_path} -m auto_round --model /models/Meta-Llama-3.1-8B-Instruct --devices '0,1' --quant_lm_head --iters 1 --nsamples 1 --output_dir None" 40 | ) 41 | if res > 0 or res == -1: 42 | assert False, "cmd line test fail, please have a check" 43 | 44 | 45 | if __name__ == "__main__": 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /test/test_cpu/_test_helpers.py: -------------------------------------------------------------------------------- 1 | def model_infer(model, tokenizer, apply_chat_template=False): 2 | prompts = [ 3 | "Hello,my name is", 4 | # "The president of the United States is", 5 | # "The capital of France is", 6 | # "The future of AI is", 7 | ] 8 | if apply_chat_template: 9 | texts = [] 10 | for prompt in prompts: 11 | messages = [{"role": "user", "content": prompt}] 12 | text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) 13 | texts.append(text) 14 | prompts = texts 15 | 16 | inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) 17 | 18 | outputs = model.generate( 19 | input_ids=inputs["input_ids"].to(model.device), 20 | attention_mask=inputs["attention_mask"].to(model.device), 21 | do_sample=False, ## change this to follow official usage 22 | max_new_tokens=5, 23 | ) 24 | generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] 25 | 26 | decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 27 | 28 | for i, prompt in enumerate(prompts): 29 | print(f"Prompt: {prompt}") 30 | print(f"Generated: {decoded_outputs[i]}") 31 | print("-" * 50) 32 | return decoded_outputs[0] 33 | -------------------------------------------------------------------------------- /test/test_cuda/_test_helpers.py: -------------------------------------------------------------------------------- 1 | def model_infer(model, tokenizer, apply_chat_template=False): 2 | prompts = [ 3 | "Hello,my name is", 4 | # "The president of the United States is", 5 | # "The capital of France is", 6 | # "The future of AI is", 7 | ] 8 | if apply_chat_template: 9 | texts = [] 10 | for prompt in prompts: 11 | messages = [{"role": "user", "content": prompt}] 12 | text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) 13 | texts.append(text) 14 | prompts = texts 15 | 16 | inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) 17 | 18 | outputs = model.generate( 19 | input_ids=inputs["input_ids"].to(model.device), 20 | attention_mask=inputs["attention_mask"].to(model.device), 21 | do_sample=False, ## change this to follow official usage 22 | max_new_tokens=5, 23 | ) 24 | generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] 25 | 26 | decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 27 | 28 | for i, prompt in enumerate(prompts): 29 | print(f"Prompt: {prompt}") 30 | print(f"Generated: {decoded_outputs[i]}") 31 | print("-" * 50) 32 | return decoded_outputs[0] 33 | -------------------------------------------------------------------------------- /auto_round_extension/vllm_ext/kv_cache.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from typing import TYPE_CHECKING, Any, Literal, Optional, cast 17 | 18 | import torch 19 | from vllm.logger import init_logger 20 | from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod 21 | 22 | logger = init_logger(__name__) 23 | 24 | 25 | class AutoRoundKVCacheMethod(BaseKVCacheMethod): 26 | """ 27 | Supports loading kv-cache scaling factors from compressed-tensors 28 | checkpoints. 29 | """ 30 | 31 | def __init__(self, quant_config): 32 | self.validate_kv_cache_scheme(quant_config) 33 | super().__init__(quant_config) 34 | 35 | @staticmethod 36 | def validate_kv_cache_scheme(quant_config): 37 | # FIXME: parse from quant_config 38 | return True 39 | -------------------------------------------------------------------------------- /auto_round/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from auto_round.autoround import AutoRound 15 | 16 | # support for old api 17 | from auto_round.autoround import AutoRoundLLM, AutoRoundMLLM, AutoRoundAdam, AutoRoundDiffusion 18 | from auto_round.schemes import QuantizationScheme 19 | from auto_round.auto_scheme import AutoScheme 20 | from auto_round.utils import LazyImport 21 | 22 | 23 | def __getattr__(name): 24 | if name == "AutoHfQuantizer": 25 | from auto_round.inference.auto_quantizer import AutoHfQuantizer 26 | 27 | return AutoHfQuantizer 28 | if name == "AutoRoundConfig": 29 | from auto_round.inference.auto_quantizer import AutoRoundConfig 30 | 31 | return AutoRoundConfig 32 | 33 | raise AttributeError(f"auto-round has no attribute '{name}'") 34 | 35 | 36 | from .version import __version__ 37 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/codeScan/bandit/bandit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for var in "$@" 4 | do 5 | case $var in 6 | --scan_module=*) 7 | scan_module=$(echo $var |cut -f2 -d=) 8 | ;; 9 | esac 10 | done 11 | 12 | source /auto-round/.azure-pipelines/scripts/change_color.sh 13 | RESET="echo -en \\E[0m \\n" # close color 14 | 15 | log_dir="/auto-round/.azure-pipelines/scripts/codeScan/scanLog" 16 | mkdir -p $log_dir 17 | 18 | python -m bandit -r -lll -iii "/auto-round/${scan_module}" >$log_dir/bandit.log 19 | exit_code=$? 
20 | 21 | $BOLD_YELLOW && echo " ----------------- Current bandit cmd start --------------------------" && $RESET 22 | echo "python -m bandit -r -lll -iii /auto-round/${scan_module} > $log_dir/bandit.log" 23 | $BOLD_YELLOW && echo " ----------------- Current bandit cmd end --------------------------" && $RESET 24 | 25 | $BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------" 26 | cat $log_dir/bandit.log 27 | $BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET 28 | 29 | if [ ${exit_code} -ne 0 ]; then 30 | $BOLD_RED && echo "Error!! Please Click on the artifact button to download and view Bandit error details." && $RESET 31 | exit 1 32 | fi 33 | $BOLD_PURPLE && echo "Congratulations, Bandit check passed!" && $LIGHT_PURPLE && echo " You can click on the artifact button to see the log details." && $RESET 34 | exit 0 35 | -------------------------------------------------------------------------------- /.azure-pipelines/docker/DockerfileCodeScan.devel: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2024 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | ARG UBUNTU_VER=24.04 17 | FROM ubuntu:${UBUNTU_VER} as devel 18 | 19 | # See http://bugs.python.org/issue19846 20 | ENV LANG C.UTF-8 21 | 22 | RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \ 23 | aspell \ 24 | aspell-en \ 25 | python3 \ 26 | python3-pip \ 27 | autoconf \ 28 | build-essential \ 29 | wget 30 | 31 | RUN ln -sf $(which python3) /usr/bin/python 32 | 33 | ARG USER_ID=1000 34 | ARG GROUP_ID=1000 35 | 36 | RUN groupadd -g ${GROUP_ID} hostgroup && \ 37 | useradd -m -u ${USER_ID} -g ${GROUP_ID} hostuser 38 | 39 | USER hostuser 40 | 41 | ENV PATH="/home/hostuser/.local/bin:$PATH" 42 | RUN pip config set global.break-system-packages true 43 | RUN python -m pip install --no-cache-dir pylint bandit 44 | 45 | WORKDIR / 46 | -------------------------------------------------------------------------------- /auto_round_extension/vllm_ext/quant_impl.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from abc import ABC, abstractmethod 16 | from typing import Optional 17 | 18 | import torch 19 | 20 | 21 | class AutoRoundQuantImpl(ABC): 22 | @classmethod 23 | @abstractmethod 24 | def get_min_capability(cls) -> int: 25 | """ 26 | Get minimum device capability. 27 | """ 28 | raise NotImplementedError 29 | 30 | @abstractmethod 31 | def create_weights(self, *args, **kwargs): 32 | raise NotImplementedError 33 | 34 | @abstractmethod 35 | def apply_weights( 36 | self, 37 | layer: torch.nn.Module, 38 | x: torch.Tensor, 39 | bias: Optional[torch.Tensor], 40 | ): 41 | raise NotImplementedError 42 | 43 | @abstractmethod 44 | def process_weights_after_loading(self, layer: torch.nn.Module): 45 | raise NotImplementedError 46 | -------------------------------------------------------------------------------- /.azure-pipelines/unit-test-hpu.yml: -------------------------------------------------------------------------------- 1 | trigger: none 2 | 3 | pr: 4 | autoCancel: true 5 | drafts: false 6 | branches: 7 | include: 8 | - main 9 | paths: 10 | include: 11 | - auto_round 12 | - auto_round_extension 13 | - test/test*hpu*' 14 | - setup.py 15 | - requirements-lib.txt 16 | - .azure-pipelines/scripts/ut 17 | - .azure-pipelines/template/docker-template.yml 18 | - .azure-pipelines/template/ut-template.yml 19 | exclude: 20 | - auto_round/export/export_to_autogptq 21 | - auto_round/export/export_to_awq 22 | - "*.md" 23 | - "**/*.md" 24 | 25 | pool: GAUDI 26 | 27 | variables: 28 | IMAGE_NAME: "auto-round" 29 | IMAGE_TAG: "py312" 30 | UPLOAD_PATH: $(Build.SourcesDirectory)/log_dir 31 | DOWNLOAD_PATH: $(Build.SourcesDirectory)/log_dir 32 | ARTIFACT_NAME: "UT_coverage_report" 33 | REPO: $(Build.Repository.Uri) 34 | 35 | stages: 36 | - stage: Unit_test 37 | displayName: Unit Test 38 | dependsOn: [] 39 | jobs: 40 | - job: 41 | displayName: Unit Test 42 | steps: 43 | - template: template/ut-template.yml 44 | parameters: 45 | imageSource: "pull" 46 | dockerConfigName: "commonDockerConfig" 47 | utScriptFileName: "run_ut_hpu" 48 | uploadPath: $(UPLOAD_PATH) 49 | utArtifact: "ut" 50 | 51 | - task: PublishCodeCoverageResults@2 52 | inputs: 53 | summaryFileLocation: $(UPLOAD_PATH)/coverage.xml 54 | -------------------------------------------------------------------------------- /.azure-pipelines/template/code-scan-template.yml: -------------------------------------------------------------------------------- 1 | parameters: 2 | - name: codeScanFileName 3 | type: string 4 | - name: uploadPath 5 | type: string 6 | 7 | - name: codeScanContainerName 8 | type: string 9 | default: "codeScan" 10 | - name: scanModule 11 | type: string 12 | default: "auto_round" 13 | 14 | steps: 15 | - template: docker-template.yml 16 | parameters: 17 | dockerConfigName: "commonDockerConfig" 18 | repoName: "code-scan" 19 | repoTag: "1.0" 20 | dockerFileName: "DockerfileCodeScan" 21 | containerName: ${{ parameters.codeScanContainerName }} 22 | 23 | - script: | 24 | docker exec ${{ parameters.codeScanContainerName }} bash -c "bash /auto-round/.azure-pipelines/scripts/codeScan/${{ parameters.codeScanFileName }}/${{ parameters.codeScanFileName }}.sh \ 25 | --scan_module=${{ parameters.scanModule }}" 26 | displayName: "${{ parameters.codeScanFileName }} Check" 27 | 28 | - task: PublishPipelineArtifact@1 29 | condition: succeededOrFailed() 30 | inputs: 31 | targetPath: .azure-pipelines/scripts/codeScan/scanLog/${{ parameters.uploadPath }} 32 | artifact: ${{ parameters.codeScanFileName }} 33 | publishLocation: "pipeline" 34 | displayName: 
"PublishPipelineArtifact" 35 | 36 | - task: Bash@3 37 | condition: always() 38 | inputs: 39 | targetType: "inline" 40 | script: | 41 | docker exec ${{ parameters.codeScanContainerName }} bash -c "rm -fr /auto-round/* && rm -fr /auto-round/.* || true" 42 | displayName: "Docker clean up" 43 | -------------------------------------------------------------------------------- /test/test_cpu/test_alg_ext.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | from parameterized import parameterized 7 | 8 | sys.path.insert(0, "../..") 9 | 10 | from auto_round import AutoRound 11 | 12 | 13 | class TestAlgExt(unittest.TestCase): 14 | def test_alg_ext(self): 15 | model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" 16 | ar = AutoRound(model_name, scheme="W2A16", iters=1, nsamples=1, enable_alg_ext=True) 17 | ar.quantize() 18 | 19 | model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" 20 | ar = AutoRound(model_name, scheme="gguf:q4_k_s", iters=1, nsamples=1, enable_alg_ext=True) 21 | ar.quantize() 22 | 23 | from auto_round.auto_scheme import AutoScheme 24 | 25 | scheme = AutoScheme(options=["mxfp4", "mxfp8"], avg_bits=5.5, ignore_scale_zp_bits=True) 26 | model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" 27 | ar = AutoRound(model_name, scheme=scheme, iters=1, nsamples=1, enable_alg_ext=True, enable_torch_compile=True) 28 | ar.quantize() 29 | 30 | def test_alg_ext_import(self): 31 | from auto_round.alg_ext import wrapper_autoround 32 | 33 | def test_all_support_dtype(self): 34 | model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" 35 | for scheme in ["MXFP4", "NVFP4", "W2A16G64"]: 36 | ar = AutoRound( 37 | model_name, scheme=scheme, iters=1, nsamples=1, enable_alg_ext=True, enable_torch_compile=True 38 | ) 39 | ar.quantize() 40 | -------------------------------------------------------------------------------- /test/test_hpu/_test_helpers.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def is_pytest_mode_compile(): 5 | return pytest.mode == "compile" 6 | 7 | 8 | def is_pytest_mode_lazy(): 9 | return pytest.mode == "lazy" 10 | 11 | 12 | def model_infer(model, tokenizer, apply_chat_template=False): 13 | prompts = [ 14 | "Hello,my name is", 15 | # "The president of the United States is", 16 | # "The capital of France is", 17 | # "The future of AI is", 18 | ] 19 | if apply_chat_template: 20 | texts = [] 21 | for prompt in prompts: 22 | messages = [{"role": "user", "content": prompt}] 23 | text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) 24 | texts.append(text) 25 | prompts = texts 26 | 27 | inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) 28 | 29 | outputs = model.generate( 30 | input_ids=inputs["input_ids"].to(model.device), 31 | attention_mask=inputs["attention_mask"].to(model.device), 32 | do_sample=False, ## change this to follow official usage 33 | max_new_tokens=5, 34 | ) 35 | generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] 36 | 37 | decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 38 | 39 | for i, prompt in enumerate(prompts): 40 | print(f"Prompt: {prompt}") 41 | print(f"Generated: {decoded_outputs[i]}") 42 | print("-" * 50) 43 | return decoded_outputs[0] 44 | 
-------------------------------------------------------------------------------- /.azure-pipelines/docker/Dockerfile.devel: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ARG UBUNTU_VER=24.04 16 | FROM ubuntu:${UBUNTU_VER} 17 | 18 | # See http://bugs.python.org/issue19846 19 | ENV LANG C.UTF-8 20 | 21 | RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \ 22 | build-essential \ 23 | ca-certificates \ 24 | git \ 25 | libomp-dev \ 26 | numactl \ 27 | time \ 28 | wget \ 29 | bc \ 30 | jq \ 31 | vim 32 | 33 | COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv 34 | 35 | ARG USER_ID=1000 36 | ARG GROUP_ID=1000 37 | 38 | RUN groupadd -g ${GROUP_ID} hostgroup && \ 39 | useradd -m -u ${USER_ID} -g ${GROUP_ID} hostuser 40 | 41 | USER hostuser 42 | 43 | ENV PATH="/home/hostuser/.venv/bin:$PATH" 44 | ENV VIRTUAL_ENV="/home/hostuser/.venv" 45 | ENV UV_NO_PROGRESS=1 \ 46 | UV_COMPILE_BYTECODE=1 \ 47 | UV_LINK_MODE=copy 48 | 49 | RUN uv venv --python=3.12 /home/hostuser/.venv 50 | RUN which python && python --version 51 | 52 | WORKDIR /home/hostuser 53 | -------------------------------------------------------------------------------- /auto_round/export/export_to_autoround/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import fields 16 | from typing import List 17 | 18 | from auto_round.schemes import QuantizationScheme 19 | 20 | 21 | def check_neq_config(config: dict, **expected) -> List[str]: 22 | """ 23 | Compare a config dict against expected values. 24 | Ensures all required keys are present in both config and expected. 25 | 26 | Returns: 27 | List[str]: [keys] for mismatched values. 28 | """ 29 | scheme_keys = [f.name for f in fields(QuantizationScheme)] 30 | # 1. Check missing from expected 31 | missing_expected = [k for k in scheme_keys if k not in expected] 32 | if missing_expected: 33 | raise ValueError(f"Missing expected values for keys: {missing_expected}") 34 | 35 | # # 2. 
Check missing from layer config 36 | # missing_config = [k for k in scheme_keys if k not in config] # None 37 | # if missing_config: 38 | # raise ValueError(f"Missing config values for keys: {missing_config}") 39 | 40 | # 3. Collect mismatches 41 | return [key for key in scheme_keys if config.get(key) not in (expected[key], None)] 42 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/ut/run_ut_hpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -xe 3 | 4 | # install requirements 5 | echo "set up UT env..." 6 | export TQDM_MININTERVAL=60 7 | pip install pytest-cov pytest-html 8 | pip list 9 | 10 | cd /auto-round/test/test_hpu || exit 1 11 | find . -type f -exec sed -i '/sys\.path\.insert(0, "\.\.")/d' {} + 12 | 13 | export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH 14 | export FORCE_BF16=1 15 | export COVERAGE_RCFILE=/auto-round/.azure-pipelines/scripts/ut/.coverage 16 | auto_round_path=$(python -c 'import auto_round; print(auto_round.__path__[0])') 17 | 18 | LOG_DIR=/auto-round/log_dir 19 | mkdir -p ${LOG_DIR} 20 | ut_log_name=${LOG_DIR}/ut.log 21 | 22 | find . -name "test*.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_lazy.sh 23 | find . -name "test*.py" | sed "s,\.\/,python -m pytest --mode compile --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_compile.sh 24 | 25 | cat run_lazy.sh 26 | bash run_lazy.sh 2>&1 | tee ${ut_log_name} 27 | 28 | cat run_compile.sh 29 | bash run_compile.sh 2>&1 | tee ${ut_log_name} 30 | 31 | cp report.html ${LOG_DIR}/ 32 | cp coverage.xml ${LOG_DIR}/ 33 | 34 | if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then 35 | echo "##[error]Find errors in pytest case, please check the output..." 36 | exit 1 37 | fi 38 | 39 | # if ut pass, collect the coverage file into artifacts 40 | cp .coverage ${LOG_DIR}/.coverage 41 | 42 | echo "UT finished successfully! " -------------------------------------------------------------------------------- /auto_round/export/register.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | EXPORT_FORMAT = {} 17 | 18 | 19 | def register_format(name): 20 | """Class decorator to register a EXPORT subclass to the registry. 21 | 22 | Decorator function used before a Pattern subclass. 23 | 24 | Args: 25 | cls (class): The subclass of register. 26 | name: A string. Define the export type. 27 | 28 | Returns: 29 | cls: The class of register. 
30 | """ 31 | 32 | def register(format): 33 | EXPORT_FORMAT[name] = format 34 | return format 35 | 36 | return register 37 | 38 | 39 | PACKING_LAYER_WITH_FORMAT = {} 40 | 41 | 42 | def register_layer_packing(name): 43 | """Class decorator to register a EXPORT subclass to the registry. 44 | 45 | Decorator function used before a Pattern subclass. 46 | 47 | Args: 48 | cls (class): The subclass of register. 49 | name: A string. Define the export type. 50 | 51 | Returns: 52 | cls: The class of register. 53 | """ 54 | 55 | def register(format): 56 | PACKING_LAYER_WITH_FORMAT[name] = format 57 | return format 58 | 59 | return register 60 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/codeScan/pylint/pylint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for var in "$@" 4 | do 5 | case $var in 6 | --scan_module=*) 7 | scan_module=$(echo $var |cut -f2 -d=) 8 | ;; 9 | esac 10 | done 11 | 12 | source /auto-round/.azure-pipelines/scripts/change_color.sh 13 | RESET="echo -en \\E[0m \\n" # close color 14 | 15 | log_dir="/auto-round/.azure-pipelines/scripts/codeScan/scanLog" 16 | mkdir -p $log_dir 17 | 18 | pip install torch --index-url https://download.pytorch.org/whl/cpu 19 | pip install -r /auto-round/requirements.txt 20 | pip install -r /auto-round/requirements-cpu.txt 21 | 22 | echo "[DEBUG] list pipdeptree..." 23 | pip install pipdeptree 24 | pipdeptree 25 | 26 | python -m pylint -f json --disable=R,C,W,E0606,E1129 --enable=line-too-long --max-line-length=120 --extension-pkg-whitelist=numpy --ignored-classes=TensorProto,NodeProto \ 27 | --ignored-modules=tensorflow,keras,torch,torch.quantization,torch.tensor,torchvision,fairseq,mxnet,onnx,onnxruntime,intel_extension_for_pytorch,intel_extension_for_tensorflow,torchinfo,horovod,transformers,deepspeed,deepspeed.module_inject \ 28 | /auto-round/${scan_module} > $log_dir/pylint.json 29 | 30 | exit_code=$? 31 | 32 | $BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------" && $RESET 33 | cat $log_dir/pylint.json 34 | $BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET 35 | 36 | if [ ${exit_code} -ne 0 ]; then 37 | $BOLD_RED && echo "Error!! Please Click on the artifact button to download and view Pylint error details." && $RESET 38 | exit 1 39 | fi 40 | $BOLD_PURPLE && echo "Congratulations, Pylint check passed!" && $LIGHT_PURPLE && echo " You can click on the artifact button to see the log details." 
&& $RESET 41 | exit 0 42 | -------------------------------------------------------------------------------- /test/test_cuda/test_calib_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | import json 8 | 9 | import torch 10 | from transformers import AutoModelForCausalLM, AutoTokenizer 11 | 12 | from auto_round import AutoRound 13 | 14 | 15 | class TestLocalCalibDataset(unittest.TestCase): 16 | @classmethod 17 | def setUpClass(self): 18 | json_data = [{"text": "awefdsfsddfd"}, {"text": "fdfdfsdfdfdfd"}, {"text": "dfdsfsdfdfdfdf"}] 19 | os.makedirs("./saved", exist_ok=True) 20 | self.json_file = "./saved/tmp.json" 21 | with open(self.json_file, "w") as json_file: 22 | json.dump(json_data, json_file, indent=4) 23 | 24 | jsonl_data = [{"text": "哈哈,開心點"}, {"text": "hello world"}] 25 | os.makedirs("./saved", exist_ok=True) 26 | self.jsonl_file = "./saved/tmp.jsonl" 27 | with open(self.jsonl_file, "w") as jsonl_file: 28 | for item in jsonl_data: 29 | json.dump(item, jsonl_file, ensure_ascii=False) 30 | jsonl_file.write("\n") 31 | 32 | model_name = "facebook/opt-125m" 33 | self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) 34 | self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 35 | 36 | def test_combine_dataset(self): 37 | dataset = "NeelNanda/pile-10k" + ",BAAI/CCI3-HQ" + ",madao33/new-title-chinese" 38 | bits, group_size, sym = 4, 128, True 39 | autoround = AutoRound( 40 | self.model, self.tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=128, dataset=dataset 41 | ) 42 | autoround.quantize() 43 | 44 | 45 | if __name__ == "__main__": 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /test/test_cpu/test_conv1d.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | import torch 8 | from _test_helpers import model_infer 9 | from transformers import AutoModelForCausalLM, AutoTokenizer 10 | 11 | from auto_round import AutoRound 12 | 13 | 14 | class LLMDataLoader: 15 | def __init__(self): 16 | self.batch_size = 1 17 | 18 | def __iter__(self): 19 | for i in range(2): 20 | yield torch.ones([1, 10], dtype=torch.long) 21 | 22 | 23 | class TestQuantizationConv1d(unittest.TestCase): 24 | @classmethod 25 | def setUpClass(self): 26 | self.model_name = "/tf_dataset/auto_round/models/MBZUAI/LaMini-GPT-124M" 27 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) 28 | self.llm_dataloader = LLMDataLoader() 29 | 30 | @classmethod 31 | def tearDownClass(self): 32 | shutil.rmtree("./saved", ignore_errors=True) 33 | shutil.rmtree("runs", ignore_errors=True) 34 | 35 | def test_quant(self): 36 | self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) 37 | bits, group_size, sym = 4, 128, True 38 | autoround = AutoRound( 39 | self.model, 40 | self.tokenizer, 41 | bits=bits, 42 | group_size=group_size, 43 | sym=sym, 44 | iters=2, 45 | seqlen=2, 46 | dataset=self.llm_dataloader, 47 | ) 48 | 49 | autoround.quantize() 50 | autoround.save_quantized("./saved") 51 | 52 | model = AutoModelForCausalLM.from_pretrained("./saved", device_map="cpu", trust_remote_code=True) 53 | model_infer(model, 
self.tokenizer) 54 | 55 | 56 | if __name__ == "__main__": 57 | unittest.main() 58 | -------------------------------------------------------------------------------- /auto_round_extension/vllm_ext/sitecustomize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | 17 | VLLM_ENABLE_AR_EXT = os.environ.get("VLLM_ENABLE_AR_EXT", "") in [ 18 | "1", 19 | "true", 20 | "True", 21 | ] 22 | 23 | if VLLM_ENABLE_AR_EXT: 24 | print("*****************************************************************************") 25 | print(f"* !!! VLLM_ENABLE_AR_EXT is set to {VLLM_ENABLE_AR_EXT}, applying auto_round_vllm_extension *") 26 | print("*****************************************************************************") 27 | 28 | import vllm.model_executor.layers.quantization.auto_round as auto_round_module 29 | 30 | from auto_round_extension.vllm_ext.auto_round_ext import AutoRoundExtensionConfig 31 | 32 | auto_round_module.AutoRoundConfig = AutoRoundExtensionConfig 33 | from auto_round_extension.vllm_ext.envs_ext import extra_environment_variables 34 | 35 | 36 | else: 37 | print("*****************************************************************************") 38 | print( 39 | f"* Sitecustomize is loaded, but VLLM_ENABLE_AR_EXT is set to {VLLM_ENABLE_AR_EXT}, skipping auto_round_vllm_extension *" 40 | ) 41 | print("*****************************************************************************") 42 | -------------------------------------------------------------------------------- /test/test_cpu/test_autoopt.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | import torch 8 | import transformers 9 | from transformers import AutoModelForCausalLM, AutoTokenizer 10 | 11 | from auto_round import AutoRoundAdam 12 | 13 | 14 | class LLMDataLoader: 15 | def __init__(self): 16 | self.batch_size = 1 17 | 18 | def __iter__(self): 19 | for i in range(2): 20 | yield torch.ones([1, 10], dtype=torch.long) 21 | 22 | 23 | class TestAutoRound(unittest.TestCase): 24 | @classmethod 25 | def setUpClass(self): 26 | model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" 27 | self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) 28 | self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 29 | self.llm_dataloader = LLMDataLoader() 30 | 31 | @classmethod 32 | def tearDownClass(self): 33 | shutil.rmtree("./saved", ignore_errors=True) 34 | shutil.rmtree("runs", ignore_errors=True) 35 | 36 | def test_Adam(self): 37 | bits, group_size, sym = 4, 128, False 38 | from auto_round.utils import get_block_names 39 | 40 | llm_block_names = get_block_names(self.model, quant_vision=True) 41 | bits, group_size, sym, batch_size = 4, 128, False, 20 42 | 
adamround = AutoRoundAdam( 43 | self.model, 44 | self.tokenizer, 45 | bits=bits, 46 | group_size=group_size, 47 | sym=sym, 48 | iters=2, 49 | seqlen=2, 50 | batch_size=batch_size, 51 | dataset=self.llm_dataloader, 52 | to_quant_block_names=llm_block_names, 53 | ) 54 | adamround.quantize() 55 | 56 | 57 | if __name__ == "__main__": 58 | unittest.main() 59 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | ci: 2 | autofix_prs: true 3 | autoupdate_schedule: quarterly 4 | 5 | repos: 6 | - repo: https://github.com/pre-commit/pre-commit-hooks 7 | rev: v6.0.0 8 | hooks: 9 | - id: check-json 10 | - id: check-yaml 11 | - id: debug-statements 12 | - id: mixed-line-ending 13 | args: [--fix=lf] 14 | 15 | - repo: https://github.com/Lucas-C/pre-commit-hooks 16 | rev: v1.5.5 17 | hooks: 18 | - id: insert-license 19 | files: | 20 | (?x)^( 21 | auto_round/.*(py|yaml|yml|sh)| 22 | auto_round_extension/.*(py|yaml|yml|sh) 23 | )$ 24 | args: 25 | [ 26 | --license-filepath=.azure-pipelines/license_template.txt, 27 | --use-current-year, 28 | --detect-license-in-X-top-lines=40, 29 | --skip-license-insertion-comment=Copyright, 30 | ] 31 | 32 | - repo: https://github.com/psf/black-pre-commit-mirror 33 | rev: 25.9.0 34 | hooks: 35 | - id: black 36 | files: (.*\.py)$ 37 | 38 | - repo: https://github.com/asottile/blacken-docs 39 | rev: 1.20.0 40 | hooks: 41 | - id: blacken-docs 42 | args: [--line-length=120, --skip-errors] 43 | additional_dependencies: 44 | - black==25.9.0 45 | 46 | - repo: https://github.com/codespell-project/codespell 47 | rev: v2.4.1 48 | hooks: 49 | - id: codespell 50 | args: [-w] 51 | additional_dependencies: 52 | - tomli 53 | 54 | - repo: https://github.com/crate-ci/typos 55 | rev: v1.38.1 56 | hooks: 57 | - id: typos 58 | 59 | - repo: https://github.com/pycqa/isort 60 | rev: 6.1.0 61 | hooks: 62 | - id: isort 63 | 64 | - repo: https://github.com/astral-sh/ruff-pre-commit 65 | rev: v0.14.0 66 | hooks: 67 | - id: ruff 68 | args: [--fix, --exit-non-zero-on-fix, --no-cache] 69 | -------------------------------------------------------------------------------- /auto_round/experimental/qmodules/base.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from abc import ABC, abstractmethod 16 | from typing import Optional, Union 17 | 18 | import torch 19 | 20 | __all__ = ["QModuleBase"] 21 | 22 | 23 | class QModuleBase(torch.nn.Module): 24 | """ 25 | Base class used to describe the weight creation and forward pass 26 | of different quantization schemes supported by Auto-Round. 
27 | The design is inspired by vLLM's CompressedTensorsScheme: 28 | https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py 29 | 30 | """ 31 | 32 | def __init__(self): 33 | super().__init__() 34 | 35 | @classmethod 36 | @abstractmethod 37 | def from_original(cls, config, original_layer: torch.nn.Module): 38 | raise NotImplementedError 39 | 40 | @classmethod 41 | @abstractmethod 42 | def get_min_capability(cls) -> int: 43 | """ 44 | Get minimum device capability. 45 | """ 46 | raise NotImplementedError 47 | 48 | @abstractmethod 49 | def process_weights_after_loading(self, layer: torch.nn.Module): 50 | """ 51 | Called after weight loading is complete for any cleanup that 52 | needs to occur. 53 | """ 54 | raise NotImplementedError 55 | -------------------------------------------------------------------------------- /.github/workflows/manual-binary-build-publish.yml: -------------------------------------------------------------------------------- 1 | name: AutoRound binary build and publish 2 | permissions: 3 | contents: read 4 | 5 | on: 6 | workflow_dispatch: 7 | inputs: 8 | branch: 9 | default: 'v0.9.3' 10 | description: 'Tag to build the binary' 11 | required: true 12 | type: string 13 | publish: 14 | default: false 15 | description: 'Publish the binary to PyPi' 16 | required: false 17 | type: boolean 18 | 19 | jobs: 20 | binary-build-and-publish: 21 | runs-on: ubuntu-latest 22 | strategy: 23 | matrix: 24 | option: ["full", "lib", "hpu"] 25 | fail-fast: true 26 | steps: 27 | - name: Checkout out Repo 28 | uses: actions/checkout@v4 29 | with: 30 | ref: ${{ inputs.branch }} 31 | 32 | - name: Set up Python 33 | uses: actions/setup-python@v2 34 | with: 35 | python-version: '3.12' 36 | 37 | - name: Install dependencies 38 | run: | 39 | python -m pip install --upgrade pip setuptools wheel 40 | 41 | - name: Build the binary 42 | run: | 43 | if [ "${{ matrix.option }}" == "full" ]; then 44 | echo "Building auto-round binary..." 45 | python setup.py sdist bdist_wheel 46 | else 47 | echo "Building auto-round-${{ matrix.option }} binary..." 
48 | python setup.py sdist bdist_wheel ${{ matrix.option }} 49 | fi 50 | 51 | - name: Publish the binary 52 | if: ${{ fromJSON(inputs.publish) }} 53 | env: 54 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 55 | working-directory: ./${{ matrix.repo }} 56 | run: | 57 | python -m pip install --upgrade twine 58 | # python -m twine upload dist/* 59 | 60 | - uses: actions/upload-artifact@v4.3.4 61 | with: 62 | name: dist-${{ matrix.option }} 63 | path: dist 64 | -------------------------------------------------------------------------------- /test/test_cuda/test_conv1d.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | import torch 8 | from _test_helpers import model_infer 9 | from transformers import AutoModelForCausalLM, AutoTokenizer 10 | 11 | from auto_round import AutoRound 12 | from auto_round.testing_utils import require_gptqmodel 13 | 14 | 15 | class LLMDataLoader: 16 | def __init__(self): 17 | self.batch_size = 1 18 | 19 | def __iter__(self): 20 | for i in range(2): 21 | yield torch.ones([1, 10], dtype=torch.long) 22 | 23 | 24 | class TestQuantizationConv1d(unittest.TestCase): 25 | @classmethod 26 | def setUpClass(self): 27 | self.model_name = "MBZUAI/LaMini-GPT-124M" 28 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) 29 | self.llm_dataloader = LLMDataLoader() 30 | 31 | @classmethod 32 | def tearDownClass(self): 33 | shutil.rmtree("./saved", ignore_errors=True) 34 | shutil.rmtree("runs", ignore_errors=True) 35 | 36 | @require_gptqmodel 37 | def test_quant(self): 38 | self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) 39 | bits, group_size, sym = 4, 128, True 40 | from auto_round import AutoRoundConfig 41 | 42 | autoround = AutoRound( 43 | self.model, 44 | self.tokenizer, 45 | bits=bits, 46 | group_size=group_size, 47 | sym=sym, 48 | iters=2, 49 | seqlen=2, 50 | dataset=self.llm_dataloader, 51 | ) 52 | 53 | autoround.quantize() 54 | autoround.save_quantized("./saved") 55 | 56 | model = AutoModelForCausalLM.from_pretrained("./saved", device_map="cuda", trust_remote_code=True) 57 | model_infer(model, self.tokenizer) 58 | 59 | 60 | if __name__ == "__main__": 61 | unittest.main() 62 | -------------------------------------------------------------------------------- /auto_round_extension/vllm_ext/envs_ext.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os 16 | from typing import Any, Callable 17 | 18 | from vllm.logger import init_logger 19 | 20 | logger = init_logger(__name__) 21 | 22 | # Define extra environment variables 23 | extra_environment_variables: dict[str, Callable[[], Any]] = { 24 | "VLLM_MXFP4_PRE_UNPACK_WEIGHTS": lambda: os.getenv("VLLM_MXFP4_PRE_UNPACK_WEIGHTS", "0") in ("1", "true", "True"), 25 | "VLLM_MXFP4_PRE_UNPACK_TO_FP8": lambda: os.getenv("VLLM_MXFP4_PRE_UNPACK_TO_FP8", "1") in ("1", "true", "True"), 26 | "VLLM_ENABLE_STATIC_MOE": lambda: os.getenv("VLLM_ENABLE_STATIC_MOE", "0") in ("1", "true", "True"), 27 | "VLLM_AR_MXFP4_MODULAR_MOE": lambda: os.getenv("VLLM_AR_MXFP4_MODULAR_MOE", "1") in ("1", "true", "True"), 28 | "VLLM_AR_POST_PROCESS_GPTOSS": lambda: os.getenv("VLLM_AR_POST_PROCESS_GPTOSS", "0") in ("1", "true", "True"), 29 | } 30 | # Add the extra environment variables to vllm.envs 31 | import vllm.envs as envs 32 | from vllm.envs import environment_variables 33 | 34 | # Merge the environment variables 35 | all_environment_variables = {**environment_variables, **extra_environment_variables} 36 | 37 | 38 | for name, value_fn in extra_environment_variables.items(): 39 | setattr(envs, name, value_fn()) 40 | 41 | logger.warning_once(f"Added extra environment variables: {list(extra_environment_variables.keys())}") 42 | -------------------------------------------------------------------------------- /auto_round_extension/triton/triton_utils/mixin.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | # MIT License 17 | # 18 | # Copyright (c) 2023 潘其威(William) 19 | # 20 | # Permission is hereby granted, free of charge, to any person obtaining a copy 21 | # of this software and associated documentation files (the "Software"), to deal 22 | # in the Software without restriction, including without limitation the rights 23 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 24 | # copies of the Software, and to permit persons to whom the Software is 25 | # furnished to do so, subject to the following conditions: 26 | # 27 | # The above copyright notice and this permission notice shall be included in all 28 | # copies or substantial portions of the Software. 29 | # 30 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 31 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 32 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 33 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 34 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 35 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 36 | # SOFTWARE. 
37 | class TritonModuleMixin: 38 | @classmethod 39 | def warmup(cls, model, transpose=False, seqlen=2048): 40 | pass 41 | -------------------------------------------------------------------------------- /auto_round_extension/triton/triton_utils_zp/mixin.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | # MIT License 17 | # 18 | # Copyright (c) 2023 潘其威(William) 19 | # 20 | # Permission is hereby granted, free of charge, to any person obtaining a copy 21 | # of this software and associated documentation files (the "Software"), to deal 22 | # in the Software without restriction, including without limitation the rights 23 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 24 | # copies of the Software, and to permit persons to whom the Software is 25 | # furnished to do so, subject to the following conditions: 26 | # 27 | # The above copyright notice and this permission notice shall be included in all 28 | # copies or substantial portions of the Software. 29 | # 30 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 31 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 32 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 33 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 34 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 35 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 36 | # SOFTWARE. 
37 | class TritonModuleMixin: 38 | @classmethod 39 | def warmup(cls, model, transpose=False, seqlen=2048): 40 | pass 41 | -------------------------------------------------------------------------------- /test/test_cpu/test_model_scope.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import os 3 | import shutil 4 | import sys 5 | import unittest 6 | 7 | sys.path.insert(0, "../..") 8 | 9 | import torch 10 | 11 | from auto_round import AutoRound 12 | 13 | 14 | class LLMDataLoader: 15 | def __init__(self): 16 | self.batch_size = 1 17 | 18 | def __iter__(self): 19 | for i in range(3): 20 | yield torch.ones([1, 10], dtype=torch.long) 21 | 22 | 23 | class TestModelScope(unittest.TestCase): 24 | @classmethod 25 | def setUpClass(self): 26 | self.saved_path = "./saved" 27 | self.dataset = LLMDataLoader() 28 | 29 | self.source_path, self.cache_path = "/tf_dataset/auto_round/modelscope", "/home/hostuser/.cache/modelscope" 30 | if os.path.exists(self.source_path): 31 | if not os.path.exists("/home/hostuser/.cache"): 32 | os.makedirs("/home/hostuser/.cache") 33 | shutil.copytree(self.source_path, self.cache_path, dirs_exist_ok=True) 34 | 35 | @classmethod 36 | def tearDownClass(self): 37 | shutil.rmtree("./saved", ignore_errors=True) 38 | shutil.rmtree("runs", ignore_errors=True) 39 | if os.path.exists(self.cache_path): 40 | shutil.rmtree(self.cache_path, ignore_errors=True) 41 | 42 | return super().tearDownClass() 43 | 44 | def test_llm(self): 45 | model_name = "Qwen/Qwen2.5-0.5B-Instruct" 46 | autoround = AutoRound( 47 | model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=self.dataset 48 | ) 49 | autoround.quantize_and_save() 50 | 51 | def test_mllm(self): 52 | model_name = "Qwen/Qwen2-VL-2B-Instruct" 53 | autoround = AutoRound( 54 | model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=self.dataset, batch_size=2 55 | ) 56 | autoround.quantize_and_save(self.saved_path) 57 | 58 | 59 | if __name__ == "__main__": 60 | unittest.main() 61 | -------------------------------------------------------------------------------- /auto_round/compressors/mllm/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os 16 | 17 | import requests 18 | 19 | from auto_round.utils import LazyImport 20 | 21 | PIL = LazyImport("PIL") 22 | Image = LazyImport("PIL.Image") 23 | 24 | VISUAL_KEYS = [ 25 | "thinker", 26 | "visual", 27 | "audio", 28 | "talker", 29 | "token2wav", 30 | "multi_modal_projector", 31 | "vision_tower", 32 | "multimodal_projector", 33 | "vision_model", 34 | "model.connector", 35 | ] 36 | 37 | 38 | def _extract_data_dir(dir_path: str): 39 | if os.path.isdir(dir_path): 40 | return dir_path 41 | elif "=" in dir_path: 42 | result = {} 43 | dir_path = dir_path.split(",") 44 | for _path in dir_path: 45 | k, v = _path.split("=") 46 | if k in ["image", "video", "audio"]: 47 | result[k] = v 48 | return result 49 | else: 50 | raise TypeError("incorrect input of extra_data_dir, please use auto_round --help for more details.") 51 | 52 | 53 | def fetch_image(path_or_url): 54 | if os.path.isfile(path_or_url): 55 | image_obj = Image.open(path_or_url) 56 | elif path_or_url.startswith("http://") or path_or_url.startswith("https://"): 57 | image_obj = Image.open(requests.get(path_or_url, stream=True).raw) 58 | else: 59 | raise TypeError(f"{path_or_url} neither a path or url.") 60 | 61 | return image_obj 62 | -------------------------------------------------------------------------------- /docs/alg_202508.md: -------------------------------------------------------------------------------- 1 | If you are evaluating LLaMA models with recent versions of Transformers, please 2 | remove `@use_kernel_forward_from_hub("RMSNorm")` in [modeling_llama.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L52C1-L52C40) and enable `add_bos_token`(this is set as default in AutoRound) in lm-eval to stabilize the accuracy. These adjustments affect the quantized model but not the BF16 model for the tasks evaluated in the AutoRoundv2 paper. 3 | 4 | All other settings follow the default configurations of AutoRound and lm-eval. 5 | 6 | | Qwen3-8B W2G64 | Avg. | arc_challenge | hellaswag | gsm8k | lambada_openai | mmlu | mmlupro | truthfulqa_mc1 | winogrande | 7 | |:------------------------------|:------:|:-------------:|:---------:|:------:|:--------------:|:------:|:-------:|:--------------:|:----------:| 8 | | AutoRound | 0.4373 | 0.4019 | 0.4437 | 0.4215 | 0.4826 | 0.5474 | 0.2630 | 0.3072 | 0.6314 | 9 | | AutoRound+alg_ext | 0.4787 | 0.4275 | 0.4516 | 0.5944 | 0.5181 | 0.5773 | 0.2807 | 0.3305 | 0.6496 | 10 | | AutoRoundBest+alg_ext lr 2e-3 | 0.4937 | 0.4505 | 0.474 | 0.5906 | 0.5556 | 0.6028 | 0.3127 | 0.3109 | 0.6527 | 11 | 12 | | Llama3.1-8B-Instruct W2G64 | Avg. 
| arc_challenge | hellaswag | gsm8k | lambada_openai | mmlu | mmlupro | truthfulqa_mc1 | winogrande | 13 | |:------------------------------|:------:|:-------------:|:---------:|:------:|:--------------:|:------:|:-------:|:--------------:|:----------:| 14 | | AutoRound | 0.3820 | 0.3635 | 0.4562 | 0.1622 | 0.5069 | 0.4411 | 0.1661 | 0.3207 | 0.6393 | 15 | | AutoRound+alg_ext | 0.4166 | 0.3712 | 0.4729 | 0.2039 | 0.5946 | 0.4981 | 0.2163 | 0.3011 | 0.6748 | 16 | | AutoRoundBest+alg_ext lr 2e-3 | 0.4539 | 0.4138 | 0.4999 | 0.3071 | 0.6233 | 0.5279 | 0.2364 | 0.3231 | 0.6993 | 17 | -------------------------------------------------------------------------------- /auto_round_extension/vllm_ext/tests/test_models.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import pytest 16 | from vllm.platforms import current_platform 17 | 18 | MODELS = [ 19 | # "/data5/yliu7/HF_HOME/unsloth-gpt-oss-20b-BF16-ar-MXFP4/" 20 | # "/data5/yliu7/HF_HOME/Qwen2.5-0.5B-Instruct-test-FP8_STATIC-fp8kv/" 21 | # "/data6/yiliu4/Qwen3-15B-A2B-Base-MXFP4", 22 | # "/data6/yiliu4/Llama-3.2-1B-Instruct-MXFP4-fp8attention", 23 | # "/data6/yiliu4/Llama-3.2-1B-Instruct-MXFP8" 24 | "/storage/yiliu7/ar_vllm_ext/quantized_model_qwen_mxfp4", 25 | "/storage/yiliu7/ar_vllm_ext/quantized_model_qwen_mxfp8", 26 | ] 27 | 28 | 29 | @pytest.fixture(autouse=True) 30 | def set_vllm_ar_env(monkeypatch): 31 | monkeypatch.setenv("VLLM_AR_MXFP4_MODULAR_MOE", "1") 32 | monkeypatch.setenv("VLLM_MXFP4_PRE_UNPACK_TO_FP8", "1") 33 | monkeypatch.setenv("VLLM_MXFP4_PRE_UNPACK_WEIGHTS", "0") 34 | monkeypatch.setenv("VLLM_ENABLE_STATIC_MOE", "0") 35 | monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "0") 36 | monkeypatch.setenv("VLLM_ENABLE_AR_EXT", "1") 37 | 38 | 39 | @pytest.mark.skipif( 40 | not current_platform.is_cuda(), 41 | reason="only supports CUDA backend.", 42 | ) 43 | @pytest.mark.parametrize("model", MODELS) 44 | def test_auto_round(vllm_runner, model): 45 | with vllm_runner(model, enforce_eager=True) as llm: 46 | output = llm.generate_greedy(["The capital of France is"], max_tokens=8) 47 | assert output 48 | print(f"output is: {output[0][1]}") 49 | -------------------------------------------------------------------------------- /auto_round/export/export_to_llmcompressor/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Dict, List 16 | 17 | from auto_round.utils import matches_any_regex, to_standard_regex 18 | 19 | 20 | def generate_ignore_regex_list(regex_config: Dict[str, Dict], layer_config: Dict[str, Dict]) -> List[str]: 21 | """ 22 | Generate ignore regex list for llm_compressor based on regex_config and layer_config. 23 | 24 | Rules: 25 | 1. Any pattern in regex_config with bits > 8 (i.e. effectively left unquantized) is ignored. 26 | 2. Any layer in layer_config with bits > 8 is also ignored, by its full layer name. 27 | 3. regex_config keys are normalized to llm_compressor's 're:...' style; layer_config keys are kept as-is. 28 | 29 | Args: 30 | regex_config (Dict[str, Dict]): dynamic quantization config 31 | layer_config (Dict[str, Dict]): layer-wise quantization config 32 | 33 | Returns: 34 | List[str]: List of regex patterns and layer names to ignore during quantization. 35 | """ 36 | prefix = "re:" 37 | ignore_regex: List[str] = [] 38 | 39 | # Step 1: Add regex_config keys with bits > 8 40 | for key, cfg in regex_config.items(): 41 | bits = cfg.get("bits") 42 | if bits > 8: 43 | ignore_regex.append(prefix + to_standard_regex(key)) 44 | 45 | # Step 2: Add full layer names from layer_config with bits > 8 46 | for key, cfg in layer_config.items(): 47 | bits = cfg.get("bits") 48 | if bits > 8: 49 | ignore_regex.append(key) 50 | 51 | return ignore_regex 52 | -------------------------------------------------------------------------------- /docs/gguf_alg_ext_acc.md: -------------------------------------------------------------------------------- 1 | We use **lm-eval** for evaluation. For LLaMA, we enabled `add_bos_token` and 2 | removed `@use_kernel_forward_from_hub("RMSNorm")` 3 | in [modeling_llama.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L52C1-L52C40) 4 | to stabilize accuracy during evaluation. All other settings follow the default configurations of AutoRound and lm-eval. 
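For reference, a minimal sketch of how `add_bos_token` can be enabled with lm-eval's hf backend is shown below; the model path, tasks, and batch size are placeholders rather than the exact settings used for these results, and flag details may vary across lm-eval versions:

```bash
# Illustrative only: evaluate a local checkpoint with the BOS token prepended.
lm_eval --model hf \
  --model_args pretrained=/path/to/quantized-llama-3.1-8b,add_bos_token=True \
  --tasks lambada_openai,hellaswag \
  --batch_size 16
```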
5 | 6 | *Average accuracy across `lambada_openai`, `hellaswag`, `piqa`, `winogrande`, `truthfulqa_mc1`, `openbookqa`, `boolq`, `arc_easy`, `arc_challenge` and `mmlu`.* 7 | 8 | |method|scheme|Llama-3.1-8B|Qwen2.5-7B-Instruct|Qwen3-8b|Qwen3-30B-A3B-Instruct-2507| 9 | |:-----|:-----|:-----------|:------------------|:-------|:--------------------------| 10 | |**BF16** | - |0.6295(100%)|0.6571(100%) |0.6322(100%)|0.6746(100%) | 11 | | **Optimized RTN** | q2_k_s | 0.5535(87.92%)| 0.6266(95.35%)|0.5901(93.35%)|0.6386(94.66%)| 12 | | **AutoRound+alg_ext** |q2_k_s|0.5740(91.18%)|0.6349(96.62%)|0.5962(94.31%)|0.6460(95.77%)| 13 | | **Optimized RTN** | q3_k_s | 0.6040(95.95%)|0.6382(97.12%)|0.6128(96.94%)|0.6598(97.82%)| 14 | | **AutoRound+alg_ext** |q3_k_s|0.6081(96.59%)|0.6503(98.97%)|0.6252(98.89%)|0.6622(98.17%)| 15 | | **Optimized RTN** | q3_k_m |0.6083(96.63%) |0.6418(97.68%)|0.6194(97.97%)|| 16 | | **AutoRound+alg_ext** |q3_k_m|0.6127(97.33%)|0.6533(99.42%)|0.6197(98.02%)|| 17 | | **Optimized RTN** | q4_k_s | 0.6228(98.94%)|0.6560(99.83%)|0.6303(99.70%)|0.6762(100.24%)| 18 | | **AutoRound+alg_ext** |q4_k_s|0.6239(99.11%)|0.6605(100.51%)|0.6320(99.98%)|0.6777(100.46%)| 19 | | **Optimized RTN** | q4_k_m |0.6252(99.32%) |0.6558(99.80%)|0.6296(99.59%)|| 20 | | **AutoRound+alg_ext** |q4_k_m|0.6257(99.40%)|0.6575(100.06%)|0.6340(100.29%)|| 21 | 22 | **Time cost** 23 | |model |Optimized RTN |AutoRound+alg_ext| 24 | |:--------------------------|:-------------|:----------------| 25 | |Llama-3.1-8B |1m25s |29m43s | 26 | |Qwen2.5-7B-Instruct |1m20s |35m35s | 27 | |Qwen3-8b |1m29s |47m58s | 28 | |Qwen3-30B-A3B-Instruct-2507|25m12s |12h47m39s | -------------------------------------------------------------------------------- /test/test_cpu/test_logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from io import StringIO 4 | 5 | from auto_round.logger import TRACE_LEVEL, AutoRoundFormatter, logger 6 | 7 | 8 | def test_logger(monkeypatch): 9 | # Mock the AR_LOG_LEVEL environment variable 10 | monkeypatch.setenv("AR_LOG_LEVEL", "TRACE") 11 | 12 | # Create a StringIO to capture log output 13 | log_output = StringIO() 14 | stream_handler = logging.StreamHandler(log_output) 15 | stream_handler.setFormatter(AutoRoundFormatter()) 16 | 17 | # Add the handler to the logger 18 | logger.addHandler(stream_handler) 19 | logger.setLevel(logging.getLevelName(os.getenv("AR_LOG_LEVEL", "INFO"))) 20 | 21 | # Log messages at different levels 22 | logger.trace("This is a TRACE message.") 23 | logger.debug("This is a DEBUG message.") 24 | logger.info("This is an INFO message.") 25 | logger.warning("This is a WARNING message.") 26 | logger.error("This is an ERROR message.") 27 | logger.critical("This is a CRITICAL message.") 28 | 29 | # Test warning_once functionality 30 | logger.warning_once("This is a WARNING_ONCE message.") 31 | logger.warning_once("This is a WARNING_ONCE message.") # Should not log again 32 | logger.warning_once("This is another unique WARNING_ONCE message.") # Should log 33 | 34 | # Remove the handler after the test 35 | logger.removeHandler(stream_handler) 36 | 37 | # Get the log output 38 | log_output.seek(0) 39 | logs = log_output.read() 40 | 41 | # Assertions for log levels 42 | assert "TRACE" in logs 43 | assert "This is a TRACE message." in logs 44 | assert "DEBUG" in logs 45 | assert "This is a DEBUG message." in logs 46 | assert "INFO" in logs 47 | assert "This is an INFO message." 
in logs 48 | assert "WARNING" in logs 49 | assert "This is a WARNING message." in logs 50 | assert "ERROR" in logs 51 | assert "This is an ERROR message." in logs 52 | assert "CRITICAL" in logs 53 | assert "This is a CRITICAL message." in logs 54 | 55 | # Assertions for warning_once 56 | assert logs.count("This is a WARNING_ONCE message.") == 1 57 | assert "This is another unique WARNING_ONCE message." in logs 58 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/change_color.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------- general approach start---------------- 4 | 5 | # 1. import this file: 6 | # source path/change_color.sh 7 | # 2. use COLOR/BG: 8 | # $VARIABLE_NAME && out_put_content && $RESET 9 | # 3. COLOR + BG: 10 | # $COLOR/BG_VARIABLE_NAME && $BG/COLOR_VARIABLE_NAME && out_put_content && $RESET 11 | # 4. custom 12 | # abbreviation(change number) 13 | # txt number range (30, 37) 14 | # bg number range (40, 47) 15 | # special effects number range (1, 7) 16 | # echo -en \\E[number1 + ; + number2 + ; + number3 + m" 17 | # e.g - BG_GRAY+LIGHT_RED = "echo -en \\E[47;31m" 18 | 19 | # -------------- general approach end----------------== 20 | 21 | 22 | # general setting 23 | # ------------- light_color start---------------- 24 | # black 25 | LIGHT_BLACK="echo -en \\E[30m" 26 | # red 27 | LIGHT_RED="echo -en \\E[31m" 28 | # green 29 | LIGHT_GREEN="echo -en \\E[32m" 30 | # yellow 31 | LIGHT_YELLOW="echo -en \\E[33m" 32 | # blue 33 | LIGHT_BLUE="echo -en \\E[34m" 34 | # purple 35 | LIGHT_PURPLE="echo -en \\E[35m" 36 | # cyan 37 | LIGHT_CYAN="echo -en \\E[36m" 38 | # gray 39 | LIGHT_GRAY="echo -en \\E[37m" 40 | # ------------- light_color end---------------- 41 | 42 | # ------------- bold_color start---------------- 43 | # black 44 | BOLD_BLACK="echo -en \\E[1;30m" 45 | # red 46 | BOLD_RED="echo -en \\E[1;31m" 47 | # green 48 | BOLD_GREEN="echo -en \\E[1;32m" 49 | # yellow 50 | BOLD_YELLOW="echo -en \\E[1;33m" 51 | # blue 52 | BOLD_BLUE="echo -en \\E[1;34m" 53 | # purple 54 | BOLD_PURPLE="echo -en \\E[1;35m" 55 | # cyan 56 | BOLD_CYAN="echo -en \\E[1;36m" 57 | # gray 58 | BOLD_GRAY="echo -en \\E[1;37m" 59 | # ------------- bold_color end---------------- 60 | 61 | # ------------- background_color start---------------- 62 | # black 63 | BG_BLACK="echo -en \\E[40m" 64 | # red 65 | BG_RED="echo -en \\E[41m" 66 | # green 67 | BG_GREEN="echo -en \\E[42m" 68 | # yellow 69 | BG_YELLOW="echo -en \\E[43m" 70 | # blue 71 | BG_BLUE="echo -en \\E[44m" 72 | # purple 73 | BG_PURPLE="echo -en \\E[45m" 74 | # cyan 75 | BG_CYAN="echo -en \\E[46m" 76 | # gray 77 | BG_GRAY="echo -en \\E[47m" 78 | # ------------- background_color end---------------- 79 | 80 | # close 81 | RESET="echo -en \\E[0m" 82 | -------------------------------------------------------------------------------- /docs/publication_list.md: -------------------------------------------------------------------------------- 1 | Full Publications/Events 2 | ========== 3 | 4 | ## 2025 5 | 6 | * Blog in Intel: [Advancing Low-Bit Quantization for LLMs: AutoRound x LLM Compressor](https://community.intel.com/t5/Blogs/Products-and-Solutions/HPC/Advancing-Low-Bit-Quantization-for-LLMs-AutoRound-x-LLM/post/1729336) (Dec 2025) 7 | 8 | * Blog in WeChat: [AutoRound x LLM Compressor:让低比特量化 LLM 更准、更好推理](https://mp.weixin.qq.com/s/l5WA-1_4ipffQN6GOH2Iqg) (Dec 2025) 9 | 10 | * Blog in vLLM: [Advancing Low‑Bit Quantization for LLMs: 
AutoRound x LLM Compressor](https://blog.vllm.ai/2025/12/09/intel-autoround-llmc.html) (Dec 2025) 11 | 12 | * Blog in RedHat: [Advancing Low‑Bit Quantization for LLMs: AutoRound x LLM Compressor](https://developers.redhat.com/articles/2025/12/09/advancing-low-bit-quantization-llms-autoround-x-llm-compressor) (Dec 2025) 13 | 14 | * arXiv: [SignRoundV2: Closing the Performance Gap in Extremely Low-Bit Post-Training Quantization for LLMs](https://arxiv.org/abs/2512.04746) (Dec 2025) 15 | 16 | * Blog in Intel: [AutoRound Meets SGLang: Enabling Quantized Model Inference with AutoRound](https://community.intel.com/t5/Blogs/Tech-Innovation/Artificial-Intelligence-AI/AutoRound-Meets-SGLang-Enabling-Quantized-Model-Inference-with/post/1727196) (Nov 2025) 17 | 18 | * Blog in LMSYS: [AutoRound Meets SGLang: Enabling Quantized Model Inference with AutoRound](https://lmsys.org/blog/2025-11-13-AutoRound/) (Nov 2025) 19 | 20 | * Blog in Medium: [Accelerating vLLM and SGLang Deployment using AutoRound](https://medium.com/@NeuralCompressor/accelerating-vllm-and-sglang-deployment-using-autoround-45fdc0b2683e) (Oct 2025) 21 | 22 | * Blog in HuggingFace: [What is AutoRound?](https://huggingface.co/blog/autoround) (Apr 2025) 23 | 24 | ## 2024 25 | 26 | * EMNLP: [Optimize Weight Rounding via Signed Gradient Descent for the Quantization of LLM](https://aclanthology.org/2024.findings-emnlp.662/) (Oct 2024) 27 | 28 | ## 2023 29 | 30 | * arXiv: [TEQ: Trainable Equivalent Transformation for Quantization of LLMs](https://arxiv.org/abs/2310.10944) (Oct 2023) 31 | 32 | * Blog in Medium: [Effective Post-Training Quantization for Large Language Models](https://medium.com/intel-analytics-software/effective-post-training-quantization-for-large-language-models-with-enhanced-smoothquant-approach-93e9d104fb98) (Apr 2023) 33 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/ut/run_ut.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -xe 3 | 4 | test_part=$1 5 | 6 | # install requirements 7 | echo "##[group]set up UT env..." 8 | export TQDM_MININTERVAL=60 9 | uv pip install pytest-cov pytest-html 10 | uv pip install -r /auto-round/test/test_cpu/requirements.txt \ 11 | --extra-index-url https://download.pytorch.org/whl/cpu 12 | 13 | # install latest gguf for ut test 14 | cd ~ || exit 1 15 | git clone -b master --quiet --single-branch https://github.com/ggml-org/llama.cpp.git && cd llama.cpp/gguf-py && uv pip install . sentencepiece 16 | 17 | cd /auto-round && uv pip install . 18 | 19 | echo "##[endgroup]" 20 | uv pip list 21 | 22 | cd /auto-round/test/test_cpu || exit 1 23 | find . -type f -exec sed -i '/sys\.path\.insert(0, "\.\.")/d' {} + 24 | 25 | export LD_LIBRARY_PATH=${HOME}/.venv/lib/:$LD_LIBRARY_PATH 26 | export FORCE_BF16=1 27 | export COVERAGE_RCFILE=/auto-round/.azure-pipelines/scripts/ut/.coverage 28 | auto_round_path=$(python -c 'import auto_round; print(auto_round.__path__[0])') 29 | 30 | LOG_DIR=/auto-round/log_dir 31 | mkdir -p ${LOG_DIR} 32 | ut_log_name=${LOG_DIR}/ut.log 33 | 34 | # Split test files into 5 parts 35 | find .
-name "test*.py" | sort > all_tests.txt 36 | total_lines=$(wc -l < all_tests.txt) 37 | NUM_CHUNKS=5 38 | q=$(( total_lines / NUM_CHUNKS )) 39 | r=$(( total_lines % NUM_CHUNKS )) 40 | if [ "$test_part" -le "$r" ]; then 41 | chunk_size=$(( q + 1 )) 42 | start_line=$(( (test_part - 1) * chunk_size + 1 )) 43 | else 44 | chunk_size=$q 45 | start_line=$(( r * (q + 1) + (test_part - r - 1) * q + 1 )) 46 | fi 47 | end_line=$(( start_line + chunk_size - 1 )) 48 | selected_files=$(sed -n "${start_line},${end_line}p" all_tests.txt) 49 | printf '%s\n' "${selected_files}" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run.sh 50 | cat run.sh 51 | bash run.sh 2>&1 | tee "${ut_log_name}" 52 | 53 | if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then 54 | echo "##[error]Find errors in pytest case, please check the output..." 55 | exit 1 56 | fi 57 | 58 | # if ut pass, collect the coverage file into artifacts 59 | cp .coverage "${LOG_DIR}/.coverage.part${test_part}" 60 | 61 | echo "UT finished successfully! " 62 | -------------------------------------------------------------------------------- /auto_round/inference/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from auto_round.utils import SUPPORTED_LAYER_TYPES 17 | 18 | 19 | def _expand_regex_config(regex_config, base_config, layer_names, model): 20 | """ 21 | Expand regex-based layer configs to full layer names. 
22 | 23 | Args: 24 | regex_config (dict): regex-based config (dynamic_config or part of extra_config) 25 | base_config (dict): extra_config to write into 26 | layer_names (list): known quantization layer names 27 | model (nn.Module): target model 28 | 29 | Returns: 30 | dict: expanded base_config 31 | """ 32 | if not regex_config: 33 | return base_config 34 | 35 | # Collect all supported layer names in model 36 | all_supported_layer_names = [n for n, m in model.named_modules() if isinstance(m, SUPPORTED_LAYER_TYPES)] 37 | 38 | # Identify which keys are regex patterns (not exact layer names) 39 | regex_keys = [k for k in regex_config.keys() if k not in all_supported_layer_names] 40 | 41 | for regex_key in regex_keys: 42 | try: 43 | pattern = re.compile(regex_key) 44 | except re.error: 45 | # invalid regex, skip silently 46 | continue 47 | 48 | # Prefer matches within layer_names first 49 | matched_layers = [ln for ln in layer_names if re.search(pattern, ln)] 50 | if not matched_layers: 51 | matched_layers = [ln for ln in all_supported_layer_names if re.search(pattern, ln)] 52 | 53 | if matched_layers: 54 | cfg = regex_config[regex_key] 55 | if cfg == {}: 56 | continue 57 | for ln in matched_layers: 58 | # do not overwrite explicit layer config 59 | if ln not in base_config: 60 | base_config[ln] = cfg 61 | 62 | return base_config 63 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ### License 4 | 5 | is licensed under the terms in [LICENSE]. By contributing to the project, you agree to the license and copyright terms therein and release your contribution under these terms. 6 | 7 | ### Sign your work 8 | 9 | Please use the sign-off line at the end of the patch. Your signature certifies that you wrote the patch or otherwise have the right to pass it on as an open-source patch. The rules are pretty simple: if you can certify 10 | the below (from [developercertificate.org](http://developercertificate.org/)): 11 | 12 | ``` 13 | Developer Certificate of Origin 14 | Version 1.1 15 | 16 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 17 | 660 York Street, Suite 102, 18 | San Francisco, CA 94110 USA 19 | 20 | Everyone is permitted to copy and distribute verbatim copies of this 21 | license document, but changing it is not allowed. 22 | 23 | Developer's Certificate of Origin 1.1 24 | 25 | By making a contribution to this project, I certify that: 26 | 27 | (a) The contribution was created in whole or in part by me and I 28 | have the right to submit it under the open source license 29 | indicated in the file; or 30 | 31 | (b) The contribution is based upon previous work that, to the best 32 | of my knowledge, is covered under an appropriate open source 33 | license and I have the right under that license to submit that 34 | work with modifications, whether created in whole or in part 35 | by me, under the same open source license (unless I am 36 | permitted to submit under a different license), as indicated 37 | in the file; or 38 | 39 | (c) The contribution was provided directly to me by some other 40 | person who certified (a), (b) or (c) and I have not modified 41 | it. 
42 | 43 | (d) I understand and agree that this project and the contribution 44 | are public and that a record of the contribution (including all 45 | personal information I submit with it, including my sign-off) is 46 | maintained indefinitely and may be redistributed consistent with 47 | this project or the open source license(s) involved. 48 | ``` 49 | 50 | Then you just add a line to every git commit message: 51 | 52 | Signed-off-by: Joe Smith 53 | 54 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 55 | 56 | If you set your `user.name` and `user.email` git configs, you can sign your 57 | commit automatically with `git commit -s`. 58 | -------------------------------------------------------------------------------- /test/test_cpu/test_auto_scheme.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import sys 3 | import unittest 4 | 5 | sys.path.insert(0, "../..") 6 | from auto_round import AutoRound, AutoRoundConfig, AutoScheme 7 | 8 | 9 | class TestAutoScheme(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(self): 12 | self.save_dir = "./saved" 13 | self.tasks = "lambada_openai" 14 | 15 | @classmethod 16 | def tearDownClass(self): 17 | shutil.rmtree("./saved", ignore_errors=True) 18 | shutil.rmtree("runs", ignore_errors=True) 19 | 20 | def test_auto_scheme_export(self): 21 | model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" 22 | scheme = AutoScheme(avg_bits=2, options=("W2A16"), nsamples=1, ignore_scale_zp_bits=True) 23 | ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1) 24 | ar.quantize_and_save(self.save_dir) 25 | shutil.rmtree(self.save_dir, ignore_errors=True) 26 | 27 | scheme = AutoScheme(avg_bits=4, options=("mxfp4"), nsamples=1, ignore_scale_zp_bits=True) 28 | ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1) 29 | ar.quantize_and_save(self.save_dir) 30 | shutil.rmtree(self.save_dir, ignore_errors=True) 31 | 32 | def test_layer_config(self): 33 | from auto_round.auto_scheme.utils import compute_avg_bits_for_model 34 | from auto_round.utils import get_module 35 | 36 | target_bits = 3.0 37 | model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" 38 | scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "BF16")) 39 | user_layer_config = {"model.decoder.layers.10.fc1": {"bits": 8, "group_size": 32, "sym": False}} 40 | ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) 41 | model, layer_config = ar.quantize() 42 | self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["bits"], 8) 43 | self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["sym"], False) 44 | self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["group_size"], 32) 45 | layer = get_module(model, "model.decoder.layers.10.fc1") 46 | self.assertEqual(layer.bits, 8) 47 | self.assertEqual(layer.sym, False) 48 | self.assertEqual(layer.group_size, 32) 49 | avg_bits, _ = compute_avg_bits_for_model(model) 50 | print(avg_bits) 51 | assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 52 | 53 | 54 | if __name__ == "__main__": 55 | unittest.main() 56 | -------------------------------------------------------------------------------- /.azure-pipelines/compatibility-test.yml: -------------------------------------------------------------------------------- 1 | trigger: none 2 | 3 | pr: 4 | autoCancel: true 5 | drafts: false 6 | branches: 7 | include: 8 | - main 9 | paths: 10 | include: 11 | - auto_round 12 | - 
auto_round_extension 13 | - setup.py 14 | - setup.cfg 15 | - requirements.txt 16 | - requirements-cpu.txt 17 | - .azure-pipelines/compatibility-test.yml 18 | exclude: 19 | - "*.md" 20 | - "**/*.md" 21 | 22 | stages: 23 | - stage: 24 | displayName: Compatibility Test 25 | dependsOn: [] 26 | jobs: 27 | - job: 28 | timeoutInMinutes: 20 29 | strategy: 30 | matrix: 31 | Python310_Linux: 32 | python_version: '3.10' 33 | vmImage: 'ubuntu-latest' 34 | Python311_Linux: 35 | python_version: '3.11' 36 | vmImage: 'ubuntu-latest' 37 | Python312_Linux: 38 | python_version: '3.12' 39 | vmImage: 'ubuntu-latest' 40 | Python313_Linux: 41 | python_version: '3.13' 42 | vmImage: 'ubuntu-latest' 43 | Python314_Linux: 44 | python_version: '3.14' 45 | vmImage: 'ubuntu-latest' 46 | 47 | Python310_Windows: 48 | python_version: '3.10' 49 | vmImage: 'windows-latest' 50 | Python311_Windows: 51 | python_version: '3.11' 52 | vmImage: 'windows-latest' 53 | Python312_Windows: 54 | python_version: '3.12' 55 | vmImage: 'windows-latest' 56 | Python313_Windows: 57 | python_version: '3.13' 58 | vmImage: 'windows-latest' 59 | Python314_Windows: 60 | python_version: '3.14' 61 | vmImage: 'windows-latest' 62 | 63 | pool: 64 | vmImage: $(vmImage) 65 | 66 | steps: 67 | - task: UsePythonVersion@0 68 | inputs: 69 | versionSpec: '$(python_version)' 70 | displayName: 'Use Python $(python_version)' 71 | 72 | - script: | 73 | python -m pip install --upgrade pip uv 74 | uv pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu 75 | uv pip install . 76 | pip list 77 | env: 78 | PYTHONUNBUFFERED: '1' 79 | UV_NO_PROGRESS: '1' 80 | UV_SYSTEM_PYTHON: '1' 81 | displayName: 'Install dependencies' 82 | 83 | - script: | 84 | python -c "import auto_round" 85 | displayName: 'Run compatibility test' 86 | -------------------------------------------------------------------------------- /.github/workflows/compatibility-test.yml: -------------------------------------------------------------------------------- 1 | name: Compatibility Test on ARM64 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | types: [opened, reopened, ready_for_review, synchronize] 7 | paths: 8 | - "auto_round/**" 9 | - "auto_round_extension/**" 10 | - "setup.py" 11 | - "setup.cfg" 12 | - "requirements.txt" 13 | - "requirements-cpu.txt" 14 | - ".github/workflows/compatibility-test.yml" 15 | - "!**/*.md" 16 | workflow_dispatch: 17 | 18 | concurrency: 19 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 20 | cancel-in-progress: true 21 | 22 | jobs: 23 | compatibility-test-arm: 24 | name: ${{ matrix.os-name }} on ARM64 25 | runs-on: ${{ matrix.os }} 26 | timeout-minutes: 60 27 | strategy: 28 | fail-fast: false 29 | matrix: 30 | include: 31 | - os: macos-latest 32 | os-name: macOS 33 | shell: bash 34 | - os: ubuntu-24.04-arm 35 | os-name: Linux 36 | shell: bash 37 | 38 | defaults: 39 | run: 40 | shell: ${{ matrix.shell }} 41 | 42 | steps: 43 | - name: Checkout code 44 | uses: actions/checkout@v6 45 | 46 | - name: Set up Python 47 | uses: actions/setup-python@v6 48 | with: 49 | python-version: "3.12" 50 | 51 | - name: Verify ARM architecture 52 | run: | 53 | python -c "import platform; print(f'Architecture: {platform.machine()}')" 54 | 55 | - name: Install dependencies 56 | run: | 57 | python -m pip install --upgrade pip uv setuptools 58 | uv pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu 59 | uv pip install datasets --upgrade 60 | uv pip install . 
61 | pip list 62 | env: 63 | PYTHONUNBUFFERED: "1" 64 | UV_NO_PROGRESS: "1" 65 | UV_SYSTEM_PYTHON: "1" 66 | TQDM_MININTERVAL: "60" 67 | 68 | - name: Run compatibility test 69 | run: | 70 | set -xe 71 | python -c "import auto_round" 72 | echo "============================================================================" 73 | auto-round --model_name Qwen/Qwen3-0.6B --bits 4 --iters 0 --nsamples 8 74 | echo "============================================================================" 75 | auto-round --model_name Qwen/Qwen3-0.6B --bits 4 --iters 1 --nsamples 8 76 | env: 77 | TQDM_MININTERVAL: "60" 78 | PYTHONUNBUFFERED: "1" 79 | -------------------------------------------------------------------------------- /test/test_cpu/test_load_awq_gptq.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import sys 3 | import unittest 4 | 5 | sys.path.insert(0, "../..") 6 | 7 | from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer 8 | 9 | 10 | class TestAutoRound(unittest.TestCase): 11 | def model_infer(self, model, tokenizer): 12 | prompts = [ 13 | "Hello,my name is", 14 | # "The president of the United States is", 15 | # "The capital of France is", 16 | # "The future of AI is", 17 | ] 18 | 19 | inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) 20 | 21 | outputs = model.generate( 22 | input_ids=inputs["input_ids"].to(model.device), 23 | attention_mask=inputs["attention_mask"].to(model.device), 24 | do_sample=False, ## change this to follow official usage 25 | max_new_tokens=5, 26 | ) 27 | generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] 28 | 29 | decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 30 | 31 | for i, prompt in enumerate(prompts): 32 | print(f"Prompt: {prompt}") 33 | print(f"Generated: {decoded_outputs[i]}") 34 | print("-" * 50) 35 | 36 | @classmethod 37 | def tearDownClass(self): 38 | shutil.rmtree("runs", ignore_errors=True) 39 | 40 | def test_load_gptq_no_dummy_gidx_model(self): 41 | model_name = "/tf_dataset/auto_round/models/ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1" 42 | quantization_config = AutoRoundConfig() 43 | with self.assertRaises(NotImplementedError) as cm: 44 | model = AutoModelForCausalLM.from_pretrained( 45 | model_name, 46 | torch_dtype="auto", 47 | trust_remote_code=True, 48 | device_map="cpu", 49 | quantization_config=quantization_config, 50 | ) 51 | 52 | def test_load_awq(self): 53 | model_name = "/tf_dataset/auto_round/models/casperhansen/opt-125m-awq" 54 | quantization_config = AutoRoundConfig() 55 | model = AutoModelForCausalLM.from_pretrained( 56 | model_name, 57 | torch_dtype="auto", 58 | trust_remote_code=True, 59 | device_map="cpu", 60 | quantization_config=quantization_config, 61 | ) 62 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 63 | self.model_infer(model, tokenizer) 64 | -------------------------------------------------------------------------------- /.azure-pipelines/template/ut-template.yml: -------------------------------------------------------------------------------- 1 | parameters: 2 | - name: dockerConfigName 3 | type: string 4 | default: "commonDockerConfig" 5 | - name: repo 6 | type: string 7 | default: "https://github.com/intel/auto-round" 8 | - name: utScriptFileName 9 | type: string 10 | - name: uploadPath 11 | type: string 12 | - name: utArtifact 13 | type: string 14 | - name: utTestMode 15 | 
type: string 16 | default: "coverage" 17 | - name: utContainerName 18 | type: string 19 | default: "AutoRoundUnitTest" 20 | - name: imageSource 21 | type: string 22 | default: "build" 23 | 24 | steps: 25 | - template: docker-template.yml 26 | parameters: 27 | dockerConfigName: ${{ parameters.dockerConfigName }} 28 | repoName: "auto-round" 29 | repoTag: "py312" 30 | dockerFileName: "Dockerfile" 31 | containerName: ${{ parameters.utContainerName }} 32 | repo: ${{ parameters.repo }} 33 | imageSource: ${{ parameters.imageSource }} 34 | 35 | - ${{ if eq(parameters.imageSource, 'build') }}: 36 | - script: | 37 | docker exec ${{ parameters.utContainerName }} bash -c "cd /auto-round \ 38 | && uv pip install torch==2.8.0 torchvision --index-url https://download.pytorch.org/whl/cpu \ 39 | && uv pip install intel-extension-for-pytorch==2.8.0 \ 40 | && uv pip install -r requirements.txt \ 41 | && uv pip install -r requirements-cpu.txt \ 42 | && uv pip list" 43 | displayName: "Env Setup" 44 | 45 | - ${{ if eq(parameters.imageSource, 'pull') }}: 46 | - script: | 47 | docker exec ${{ parameters.utContainerName }} bash -c "cd /auto-round \ 48 | && python setup.py bdist_wheel lib \ 49 | && pip install dist/*.whl \ 50 | && pip list" 51 | displayName: "HPU Env Setup" 52 | 53 | - script: | 54 | docker exec ${{ parameters.utContainerName }} bash -c "cd /auto-round/.azure-pipelines/scripts \ 55 | && bash ut/${{ parameters.utScriptFileName }}.sh ${{ parameters.utTestMode }}" 56 | displayName: "Run UT" 57 | 58 | - task: PublishPipelineArtifact@1 59 | condition: succeeded() 60 | inputs: 61 | targetPath: ${{ parameters.uploadPath }} 62 | artifact: ${{ parameters.utArtifact }}_coverage 63 | publishLocation: "pipeline" 64 | 65 | - task: Bash@3 66 | condition: always() 67 | inputs: 68 | targetType: "inline" 69 | script: | 70 | docker stop ${{ parameters.utContainerName }} 71 | docker rm -vf ${{ parameters.utContainerName }} || true 72 | displayName: "Docker clean up" 73 | -------------------------------------------------------------------------------- /test/test_cuda/test_vllm.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | """Test model set-up and inference for quantized HF models supported 3 | on the AutoRound. 4 | 5 | Validating the configuration and printing results for manual checking. 6 | 7 | Run `pytest test/test_cuda/test_vllm.py`. 8 | """ 9 | 10 | import os 11 | import shutil 12 | import subprocess 13 | 14 | import pytest 15 | from vllm import LLM, SamplingParams 16 | from vllm.platforms import current_platform 17 | 18 | MODELS = [ 19 | "OPEA/Qwen2.5-0.5B-Instruct-int4-sym-inc", ##auto_round:auto_gptq 20 | "Intel/Qwen2-0.5B-Instruct-int4-sym-AutoRound", ##auto_round:auto_awq 21 | ] 22 | 23 | 24 | @pytest.mark.skipif( 25 | not current_platform.is_cpu() and not current_platform.is_xpu() and not current_platform.is_cuda(), 26 | reason="only supports CPU/XPU/CUDA backend.", 27 | ) 28 | @pytest.mark.parametrize("model", MODELS) 29 | def test_auto_round(model): 30 | # Sample prompts. 31 | prompts = [ 32 | "The capital of France is", 33 | "The future of AI is", 34 | ] 35 | # Create a sampling params object. 36 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 37 | # Create an LLM. 38 | QUANTIZATION = "auto-round" 39 | llm = LLM(model=model, quantization=QUANTIZATION, trust_remote_code=True, tensor_parallel_size=1) 40 | # Generate texts from the prompts. 
41 | # The output is a list of RequestOutput objects 42 | # that contain the prompt, generated text, and other information. 43 | outputs = llm.generate(prompts, sampling_params) 44 | # Print the outputs. 45 | for output in outputs: 46 | prompt = output.prompt 47 | generated_text = output.outputs[0].text 48 | if "France" in prompt: 49 | assert "Paris" in generated_text 50 | 51 | 52 | @pytest.mark.parametrize("model", MODELS) 53 | def test_vllm_lm_eval(model): 54 | if shutil.which("auto-round") is None: 55 | pytest.skip("auto-round CLI not available") 56 | 57 | env = os.environ.copy() 58 | env["VLLM_SKIP_WARMUP"] = "true" 59 | env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" 60 | 61 | cmd = [ 62 | "auto-round", 63 | "--model", 64 | model, 65 | "--eval", 66 | "--tasks", 67 | "lambada_openai", 68 | "--eval_bs", 69 | "8", 70 | "--eval_backend", 71 | "vllm", 72 | "--limit", 73 | "10", 74 | ] 75 | 76 | proc = subprocess.run(cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) 77 | assert proc.returncode == 0, f"auto-round failed (rc={proc.returncode}):\n{proc.stdout}" 78 | -------------------------------------------------------------------------------- /test/test_hpu/test_auto_round.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from _test_helpers import is_pytest_mode_compile, is_pytest_mode_lazy 4 | 5 | from auto_round.utils import is_hpex_available 6 | 7 | 8 | def run_opt_125m_on_hpu(): 9 | from transformers import AutoModelForCausalLM, AutoTokenizer 10 | 11 | from auto_round import AutoRound 12 | 13 | model_name = "facebook/opt-125m" 14 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) 15 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 16 | 17 | bits, group_size, sym = 4, 128, False 18 | autoround = AutoRound( 19 | model, 20 | tokenizer, 21 | bits=bits, 22 | group_size=group_size, 23 | sym=sym, 24 | iters=2, 25 | seqlen=2, 26 | ) 27 | q_model, qconfig = autoround.quantize() 28 | assert q_model is not None, "Expected q_model to be not None" 29 | 30 | 31 | @pytest.mark.skipif(not is_hpex_available(), reason="HPU is not supported") 32 | @pytest.mark.skipif(not is_pytest_mode_lazy(), reason="Only for lazy mode") 33 | def test_opt_125m_lazy_mode(): 34 | run_opt_125m_on_hpu() 35 | 36 | 37 | @pytest.mark.skipif(not is_hpex_available(), reason="HPU is not supported") 38 | @pytest.mark.skipif(not is_pytest_mode_compile(), reason="Only for compile mode") 39 | def test_opt_125m_compile_mode(): 40 | torch._dynamo.reset() 41 | run_opt_125m_on_hpu() 42 | 43 | 44 | def test_import(): 45 | from auto_round import AutoRound 46 | from auto_round.export.export_to_itrex.export import WeightOnlyLinear, save_quantized_as_itrex 47 | 48 | 49 | @pytest.mark.parametrize( 50 | "data_type", 51 | ["fp8_to_int_sym"], 52 | ) 53 | def test_w4a8(data_type): 54 | from transformers import AutoModelForCausalLM, AutoTokenizer 55 | 56 | from auto_round import AutoRound 57 | 58 | model_name = "facebook/opt-125m" 59 | model = AutoModelForCausalLM.from_pretrained( 60 | model_name, 61 | torch_dtype="auto", 62 | attn_implementation="eager", 63 | trust_remote_code=True, 64 | ) 65 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 66 | 67 | autoround = AutoRound( 68 | model, 69 | tokenizer, 70 | bits=4, 71 | group_size=128, 72 | iters=2, 73 | seqlen=2, 74 | data_type=data_type, 75 | act_data_type="fp8_sym", 76 | act_bits=8, 77 | 
nsamples=1, 78 | act_dynamic=False, 79 | ) 80 | q_model, qconfig = autoround.quantize() 81 | assert q_model is not None, "Expected q_model to be not None" 82 | -------------------------------------------------------------------------------- /auto_round_extension/vllm_ext/tests/test_fp8kv.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import pytest 15 | import torch 16 | from vllm.platforms import current_platform 17 | 18 | 19 | def cuda_capability_at_least(major, minor): 20 | device_capability = torch.cuda.get_device_capability() 21 | return device_capability[0] >= major or (device_capability[0] == major and device_capability[1] >= minor) 22 | 23 | 24 | MODELS = ["/home/yiliu7/workspace/auto-round/examples/Qwen2.5-0.5B-Instruct-ar-MXFP4-fp8"] 25 | 26 | 27 | @pytest.fixture(autouse=True) 28 | def set_vllm_ar_env(monkeypatch): 29 | monkeypatch.setenv("VLLM_AR_MXFP4_MODULAR_MOE", "1") 30 | monkeypatch.setenv("VLLM_MXFP4_PRE_UNPACK_TO_FP8", "1") 31 | monkeypatch.setenv("VLLM_MXFP4_PRE_UNPACK_WEIGHTS", "0") 32 | monkeypatch.setenv("VLLM_ENABLE_STATIC_MOE", "0") 33 | monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "0") 34 | monkeypatch.setenv("VLLM_ENABLE_AR_EXT", "1") 35 | monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") 36 | monkeypatch.setenv("VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION", "1") 37 | monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER") 38 | 39 | 40 | @pytest.mark.skipif( 41 | not current_platform.is_cuda(), 42 | reason="only supports CUDA backend.", 43 | ) 44 | @pytest.mark.skipif( 45 | not cuda_capability_at_least(10, 0), reason="FP8 KV cache only supported on CUDA with compute capability >= 10.0" 46 | ) 47 | @pytest.mark.parametrize("model", MODELS) 48 | def test_auto_fp8_kv(vllm_runner, model): 49 | with vllm_runner( 50 | model, 51 | # enforce_eager=True, 52 | kv_cache_dtype="fp8", 53 | gpu_memory_utilization=0.1, 54 | ) as llm: 55 | output = llm.generate_greedy(["The capital of France is"], max_tokens=8) 56 | assert ( 57 | llm.llm.llm_engine.engine_core.engine_core.model_executor.driver_worker.worker.model_runner.kv_cache_dtype 58 | == torch.uint8 59 | ), f"Expected kv_cache_dtype to be torch.uint8, but got {llm.llm.llm_engine.engine_core.engine_core.model_executor.driver_worker.worker.model_runner.kv_cache_dtype}" 60 | assert output 61 | print(f"output is: {output[0][1]}") 62 | -------------------------------------------------------------------------------- /test/test_cuda/test_diffusion.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import os 3 | import re 4 | import shutil 5 | import sys 6 | import unittest 7 | 8 | import requests 9 | 10 | sys.path.insert(0, "../..") 11 | 12 | from diffusers import AutoPipelineForText2Image 13 | from PIL import Image 14 | 15 | from auto_round import AutoRoundDiffusion 16 | from auto_round.testing_utils import 
require_gptqmodel, require_optimum, require_vlm_env 17 | 18 | 19 | class TestAutoRound(unittest.TestCase): 20 | @classmethod 21 | def setUpClass(self): 22 | self.model_name = "/dataset/FLUX.1-dev" 23 | 24 | @classmethod 25 | def tearDownClass(self): 26 | shutil.rmtree("runs", ignore_errors=True) 27 | 28 | @require_optimum 29 | def test_diffusion_tune(self): 30 | ## load the model 31 | pipe = AutoPipelineForText2Image.from_pretrained(self.model_name).to("cuda") 32 | model = pipe.transformer 33 | 34 | layer_config = {} 35 | # skip some layers since it takes much time 36 | for n, m in model.named_modules(): 37 | if m.__class__.__name__ != "Linear": 38 | continue 39 | match = re.search(r"blocks\.(\d+)", n) 40 | if match and int(match.group(1)) > 0: 41 | layer_config[n] = {"bits": 16, "act_bits": 16} 42 | 43 | ## quantize the model 44 | autoround = AutoRoundDiffusion( 45 | pipe, 46 | tokenizer=None, 47 | scheme="MXFP4", 48 | iters=1, 49 | nsamples=1, 50 | num_inference_steps=2, 51 | layer_config=layer_config, 52 | dataset="/dataset/captions_source.tsv", 53 | ) 54 | # skip model saving since it takes much time 55 | autoround.quantize() 56 | 57 | def test_diffusion_rtn(self): 58 | ## load the model 59 | pipe = AutoPipelineForText2Image.from_pretrained(self.model_name) 60 | 61 | ## quantize the model 62 | autoround = AutoRoundDiffusion( 63 | pipe, 64 | tokenizer=None, 65 | scheme="MXFP4", 66 | iters=0, 67 | num_inference_steps=2, 68 | dataset="/dataset/captions_source.tsv", 69 | ) 70 | # skip model saving since it takes much time 71 | autoround.quantize() 72 | 73 | def test_diffusion_model_checker(self): 74 | from auto_round.utils import is_diffusion_model 75 | 76 | self.assertTrue(is_diffusion_model("/dataset/FLUX.1-dev")) 77 | self.assertTrue(is_diffusion_model("/models/stable-diffusion-2-1")) 78 | self.assertTrue(is_diffusion_model("/models/stable-diffusion-xl-base-1.0")) 79 | self.assertFalse(is_diffusion_model("/models/Qwen3-8B")) 80 | 81 | 82 | if __name__ == "__main__": 83 | unittest.main() 84 | -------------------------------------------------------------------------------- /auto_round_extension/vllm_ext/mxfp8_qdq_utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 3 | import torch 4 | 5 | __all__ = ["get_fp_scale", "dequant_mx_fp8", "quant_mx_fp8"] 6 | 7 | 8 | # def get_fp_scale(scale_e8m0): 9 | # # https://github.com/pytorch/ao/blob/994a4ba6c869854fcaa6ca7e118fcbd75e6c28cc/torchao/prototype/mx_formats/mx_tensor.py#L337 10 | # assert scale_e8m0.dtype == torch.uint8, f"Expected uint8, got {scale_e8m0.dtype}" 11 | # E8M0_EXPONENT_BIAS = 127 12 | # scale_e8m0 = scale_e8m0.view(torch.uint8) 13 | # s_offset = scale_e8m0.to(torch.int16) - E8M0_EXPONENT_BIAS 14 | # # TODO(later): it would be nice if there was a way to do the 2^x operation 15 | # # in PyTorch without creating a tensor of twos 16 | # two = torch.full(s_offset.size(), 2.0, device=scale_e8m0.device) 17 | # # pow(two, s_offset) can be out of range of floating point formats. 18 | # # TODO(later): handle this for float16 if we decide to support float16 19 | # # scales. 
20 | # s_fp = torch.pow(two, s_offset) 21 | 22 | # return s_fp 23 | 24 | 25 | def get_fp_scale(scale_e8m0): 26 | # https://github.com/pytorch/ao/blob/994a4ba6c869854fcaa6ca7e118fcbd75e6c28cc/torchao/prototype/mx_formats/mx_tensor.py#L337 27 | E8M0_EXPONENT_BIAS = 127 28 | 29 | scale_e8m0 = scale_e8m0.view(torch.uint8) 30 | s_offset = scale_e8m0.to(torch.int16) - E8M0_EXPONENT_BIAS 31 | # TODO(later): it would be nice if there was a way to do the 2^x operation 32 | # in PyTorch without creating a tensor of twos 33 | # two = torch.full(s_offset.size(), 2.0, device=scale_e8m0.device) 34 | # pow(two, s_offset) can be out of range of floating point formats. 35 | # TODO(later): handle this for float16 if we decide to support float16 36 | # scales. 37 | # s_fp = torch.pow(two, s_offset) 38 | # !!!!NOTE Critical: fixed the OoM issue when using HPU graph 39 | s_fp = torch.pow(2.0, s_offset.to(torch.float)) 40 | 41 | return s_fp 42 | 43 | 44 | def dequant_mx_fp8(weight_fp8, scale_e8m0, block_size, target_dtype): 45 | scale_float = get_fp_scale(scale_e8m0) 46 | weight_bf16 = weight_fp8.to(torch.bfloat16) 47 | weight_original_shape = weight_bf16.shape 48 | weight_bf16 = weight_bf16.reshape(-1, block_size) 49 | scale_float = scale_float.reshape(-1, 1) 50 | dequant_weight = weight_bf16 * scale_float 51 | dequant_weight = dequant_weight.reshape(weight_original_shape) 52 | return dequant_weight.to(target_dtype) 53 | 54 | 55 | def quant_mx_fp8(tensor): 56 | from auto_round_extension.vllm_ext.utils import to_mx_fp8e4m3 57 | 58 | scale_e8m0_biased, data_lp = to_mx_fp8e4m3( 59 | data_hp=tensor, 60 | elem_dtype=torch.float8_e4m3fn, 61 | block_size=32, 62 | ) 63 | return scale_e8m0_biased, data_lp 64 | -------------------------------------------------------------------------------- /auto_round_extension/vllm_ext/quant_method_moe.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from typing import Optional 16 | 17 | import torch 18 | from vllm.logger import init_logger 19 | from vllm.model_executor.layers.fused_moe import ( 20 | FusedMoEConfig, 21 | FusedMoEMethodBase, 22 | ) 23 | from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig 24 | from vllm.model_executor.layers.quantization.auto_round import AutoRoundConfig 25 | 26 | from auto_round.schemes import QuantizationScheme 27 | from auto_round_extension.vllm_ext.utils import _is_mxfp4_w4a4, _is_mxfp8_w8a8, get_scheme, need_quantize 28 | 29 | logger = init_logger(__name__) 30 | 31 | 32 | QMOE_METHODS_DISPATCH_TABLE = {} 33 | 34 | 35 | class AutoRoundMoEMethod(FusedMoEMethodBase): 36 | def __init__(self, moe: FusedMoEConfig): 37 | super().__init__(moe) 38 | 39 | @staticmethod 40 | def get_moe_method( 41 | quant_config: AutoRoundConfig, 42 | layer: torch.nn.Module, 43 | prefix: str, 44 | ) -> "AutoRoundMoEMethod": 45 | 46 | def get_impl(scheme: QuantizationScheme): 47 | if not need_quantize(scheme.bits): 48 | from vllm.model_executor.layers.fused_moe.layer import ( 49 | UnquantizedFusedMoEMethod, 50 | ) 51 | 52 | return UnquantizedFusedMoEMethod(layer.moe_config) 53 | 54 | elif _is_mxfp4_w4a4(scheme): 55 | from auto_round_extension.vllm_ext.moe_impl_mxfp4 import AutoRoundMoEMethodMXFp4Impl 56 | 57 | return AutoRoundMoEMethodMXFp4Impl(quant_config, layer.moe_config) 58 | 59 | elif _is_mxfp8_w8a8(scheme): 60 | from auto_round_extension.vllm_ext.moe_impl_mxfp8 import AutoRoundMoEMethodMXFp8Impl 61 | 62 | return AutoRoundMoEMethodMXFp8Impl(quant_config, layer.moe_config) 63 | 64 | raise ValueError(f"Unsupported FusedMoe scheme: {scheme}") 65 | 66 | layer_scheme = get_scheme(quant_config, prefix) 67 | impl = get_impl(layer_scheme) 68 | layer._prefix = prefix 69 | logger.debug("Apply %s to %s", impl.__class__.__name__, prefix) 70 | return impl 71 | 72 | def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]: 73 | return self.impl.get_fused_moe_quant_config(layer) 74 | -------------------------------------------------------------------------------- /test/test_cuda/test_mxfp_and_nvfp_quant.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import tempfile 3 | 4 | import pytest 5 | import torch 6 | from transformers import AutoModelForCausalLM, AutoTokenizer 7 | 8 | from auto_round import AutoRound 9 | from auto_round import schemes as ar_schemes 10 | from auto_round.experimental import qmodules as ar_qmodules 11 | from auto_round.export.export_to_autoround import AutoRoundFormat 12 | from auto_round.export.export_to_autoround import qlinear_fp as ar_qlinear_fp 13 | from auto_round.testing_utils import has_module 14 | 15 | testing_schemes = [AutoRoundFormat.MXFP8.value, AutoRoundFormat.MXFP4.value, AutoRoundFormat.NVFP4.value] 16 | QMODULE_MAPPING = { 17 | AutoRoundFormat.MXFP8.value: ar_qmodules.MXFP8QuantLinear, 18 | AutoRoundFormat.MXFP4.value: ar_qmodules.MXFP4QuantLinear, 19 | AutoRoundFormat.NVFP4.value: ar_qmodules.NVFP4QuantLinear, 20 | } 21 | 22 | 23 | @pytest.mark.parametrize("scheme", testing_schemes) 24 | @torch.inference_mode() 25 | def test_e2e_quant_and_infer(scheme): 26 | # Use a temporary directory for saving the quantized model 27 | with tempfile.TemporaryDirectory() as temp_dir: 28 | model_name = "Qwen/Qwen2.5-0.5B-Instruct" 29 | 30 | # Load the tokenizer and model 31 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 32 | model = 
AutoModelForCausalLM.from_pretrained( 33 | model_name, 34 | device_map="cpu", 35 | torch_dtype="auto", 36 | trust_remote_code=True, 37 | ) 38 | 39 | # Initialize AutoRound for quantization 40 | autoround = AutoRound( 41 | model, 42 | tokenizer, 43 | scheme=scheme, 44 | iters=0, 45 | nsamples=2, 46 | ) 47 | 48 | # Quantize and save the model to the temporary directory 49 | quantized_model_path = f"{temp_dir}/tmp_autoround_{scheme}" 50 | autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path) 51 | 52 | # Perform inference with the quantized model 53 | model = AutoModelForCausalLM.from_pretrained( 54 | quantized_model_path, 55 | torch_dtype="auto", 56 | ) 57 | model.eval() 58 | assert has_module(model, QMODULE_MAPPING[scheme]), f"Expected {QMODULE_MAPPING[scheme].__name__} in the model." 59 | 60 | tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) 61 | prompt = "Ai is " 62 | 63 | # Tokenize the input prompt 64 | encode = tokenizer.encode(prompt, return_tensors="pt") 65 | 66 | # Generate output tokens 67 | output_tokens = model.generate( 68 | encode, 69 | max_length=30, 70 | ) 71 | output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) 72 | 73 | # Print and validate the output 74 | print(f"Prompt: {prompt}") 75 | print(f"Output: {output}") 76 | assert output is not None, "Output should not be None" 77 | -------------------------------------------------------------------------------- /test/test_cuda/test_alg_ext.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import sys 3 | import unittest 4 | 5 | sys.path.insert(0, "../..") 6 | 7 | import torch 8 | from transformers import AutoModelForCausalLM, AutoTokenizer 9 | 10 | from auto_round import AutoRound, AutoRoundConfig 11 | from auto_round.eval.evaluation import simple_evaluate_user_model 12 | 13 | 14 | class TestAlgExt(unittest.TestCase): 15 | 16 | @classmethod 17 | def setUpClass(self): 18 | self.model_name = "/models/opt-125m" 19 | self.save_folder = "./saved" 20 | 21 | @classmethod 22 | def tearDownClass(self): 23 | shutil.rmtree(self.save_folder, ignore_errors=True) 24 | shutil.rmtree("runs", ignore_errors=True) 25 | 26 | def test_2bits(self): 27 | model_name = "/models/opt-125m" 28 | ar = AutoRound(model=model_name, bits=2, group_size=64, enable_alg_ext=True) 29 | ar.quantize_and_save(self.save_folder) 30 | model = AutoModelForCausalLM.from_pretrained( 31 | self.save_folder, 32 | device_map="auto", 33 | ) 34 | 35 | tokenizer = AutoTokenizer.from_pretrained(self.save_folder) 36 | result = simple_evaluate_user_model(model, tokenizer, batch_size=64, tasks="lambada_openai") 37 | print(result["results"]["lambada_openai"]["acc,none"]) 38 | # wo alg ext 0.2078, with 0.2371 39 | self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.22) 40 | shutil.rmtree(self.save_folder, ignore_errors=True) 41 | 42 | def test_cli(self): 43 | import os 44 | 45 | model_name = "/models/opt-125m" 46 | python_path = sys.executable 47 | 48 | res = os.system( 49 | f"cd ../.. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {model_name} --iters 1 --device auto --enable_alg_ext --avg_bits 2 --options=W2A16,W4A16 --ignore_scale_zp_bits" 50 | ) 51 | if res > 0 or res == -1: 52 | assert False, "cmd line test fail, please have a check" 53 | 54 | res = os.system( 55 | f"cd ../.. 
&& CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {model_name} --iters 1 --device auto --enable_alg_ext --avg_bits 5.5 --options=mxfp4,mxfp8 --ignore_scale_zp_bits --enable_torch_compile" 56 | ) 57 | if res > 0 or res == -1: 58 | assert False, "cmd line test fail, please have a check" 59 | 60 | def test_all_support_dtype(self): 61 | from auto_round.auto_scheme import AutoScheme 62 | 63 | model_name = "/models/Qwen3-0.6B" 64 | for scheme in ["MXFP4", "NVFP4", "W2A16G64", "gguf:q2_k_s,gguf:q4_k_s"]: 65 | avg_bits = 2 if scheme == "W2A16G64" else 4 66 | scheme = AutoScheme(options=scheme, avg_bits=avg_bits, ignore_scale_zp_bits=True) 67 | ar = AutoRound( 68 | model_name, scheme=scheme, iters=1, nsamples=1, enable_alg_ext=True, enable_torch_compile=True 69 | ) 70 | ar.quantize() 71 | 72 | 73 | if __name__ == "__main__": 74 | unittest.main() 75 | -------------------------------------------------------------------------------- /test/test_cpu/test_mxfp_save_load.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import tempfile 3 | 4 | import pytest 5 | import torch 6 | from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer 7 | from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM 8 | 9 | from auto_round import AutoRound 10 | from auto_round import schemes as ar_schemes 11 | from auto_round.experimental import qmodules as ar_qmodules 12 | from auto_round.export.export_to_autoround import AutoRoundFormat 13 | from auto_round.export.export_to_autoround import qlinear_fp as ar_qlinear_fp 14 | from auto_round.inference.backend import MX_TENSOR_DATA_TYPES 15 | from auto_round.testing_utils import has_module 16 | 17 | testing_scheme_name_lst = [ 18 | AutoRoundFormat.MXFP8.value, 19 | AutoRoundFormat.MXFP4.value, 20 | ] 21 | QMODULE_MAPPING = { 22 | AutoRoundFormat.MXFP8.value: ar_qmodules.MXFP8QuantLinear, 23 | AutoRoundFormat.MXFP4.value: ar_qmodules.MXFP4QuantLinear, 24 | } 25 | SCHEMES_MAPPING = { 26 | AutoRoundFormat.MXFP8.value: ar_schemes.MXFP8, 27 | AutoRoundFormat.MXFP4.value: ar_schemes.MXFP4, 28 | } 29 | 30 | 31 | @pytest.mark.parametrize("scheme_name", testing_scheme_name_lst) 32 | @pytest.mark.parametrize("weight_data_type", MX_TENSOR_DATA_TYPES) 33 | @pytest.mark.parametrize("act_data_type", MX_TENSOR_DATA_TYPES) 34 | @torch.inference_mode() 35 | def test_e2e_quant_and_load(scheme_name, weight_data_type, act_data_type): 36 | # Use a temporary directory for saving the quantized model 37 | with tempfile.TemporaryDirectory() as temp_dir: 38 | model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" 39 | config = AutoConfig.from_pretrained(model_name) 40 | config.num_hidden_layers = 2 # Use a smaller model for testing 41 | # Fix configuration validation issues 42 | config.layer_types = config.layer_types[: config.num_hidden_layers] 43 | 44 | # Load the tokenizer and model 45 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 46 | model = Qwen2ForCausalLM(config) 47 | scheme = SCHEMES_MAPPING[scheme_name] 48 | scheme.data_type = weight_data_type 49 | scheme.act_data_type = act_data_type 50 | # Initialize AutoRound for quantization 51 | autoround = AutoRound( 52 | model, 53 | tokenizer, 54 | scheme=scheme, 55 | iters=0, 56 | nsamples=2, 57 | ) 58 | 59 | # Quantize and save the model to the temporary directory 60 | quantized_model_path = f"{temp_dir}/tmp_autoround" 61 | autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path) 62 
| 63 | # Perform inference with the quantized model 64 | model = AutoModelForCausalLM.from_pretrained( 65 | quantized_model_path, 66 | torch_dtype="auto", 67 | ) 68 | model.eval() 69 | assert has_module( 70 | model, QMODULE_MAPPING[scheme_name] 71 | ), f"Expected {QMODULE_MAPPING[scheme_name].__name__} in the model." 72 | -------------------------------------------------------------------------------- /auto_round/eval/evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | from typing import Optional, Union 17 | 18 | from lm_eval import simple_evaluate as lm_simple_evaluate # pylint: disable=E0401 19 | 20 | from auto_round.logger import logger 21 | 22 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 23 | 24 | from lm_eval.models.huggingface import HFLM # pylint: disable=E0401 25 | 26 | 27 | def simple_evaluate_user_model( 28 | user_model, 29 | tokenizer, 30 | batch_size: Optional[int] = 1, 31 | limit: Optional[Union[int, float]] = None, 32 | max_batch_size: Optional[int] = 64, 33 | eval_model_dtype="auto", 34 | add_bos_token: bool = False, 35 | mllm: bool = False, 36 | **kwargs 37 | ): 38 | if mllm: 39 | from lm_eval.models.hf_vlms import HFMultimodalLM # pylint: disable=E0401 40 | 41 | if batch_size is None or batch_size == "auto": 42 | logger.warning("hf-multimodal models does not support auto currently, reset eval_bs to 16") 43 | batch_size = 16 44 | hflm = HFMultimodalLM( 45 | pretrained=user_model, 46 | tokenizer=tokenizer, 47 | batch_size=batch_size, 48 | max_batch_size=max_batch_size, 49 | dtype=eval_model_dtype, 50 | add_bos_token=add_bos_token, 51 | ) 52 | else: 53 | hflm = HFLM( 54 | pretrained=user_model, 55 | tokenizer=tokenizer, 56 | batch_size=batch_size, 57 | max_batch_size=max_batch_size, 58 | dtype=eval_model_dtype, 59 | add_bos_token=add_bos_token, 60 | ) 61 | return lm_simple_evaluate( 62 | model=hflm, model_args=None, batch_size=batch_size, max_batch_size=max_batch_size, limit=limit, **kwargs 63 | ) 64 | 65 | 66 | def simple_evaluate( 67 | model, 68 | model_args: Optional[Union[str, dict]] = None, 69 | batch_size: Optional[int] = None, 70 | limit: Optional[Union[int, float]] = None, 71 | max_batch_size: Optional[int] = None, 72 | device: Optional[str] = None, 73 | **kwargs 74 | ): 75 | try: 76 | from transformers import AutoRoundConfig 77 | except: 78 | from auto_round.inference.auto_quantizer import AutoHfQuantizer 79 | 80 | return lm_simple_evaluate( 81 | model=model, 82 | model_args=model_args, 83 | batch_size=batch_size, 84 | limit=limit, 85 | max_batch_size=max_batch_size, 86 | device=device, 87 | **kwargs 88 | ) 89 | -------------------------------------------------------------------------------- /auto_round/experimental/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed 
under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | 17 | from auto_round.utils import logger 18 | 19 | 20 | def per_tensor_fp8_qdq( 21 | tensor: torch.Tensor, tensor_max: None | torch.Tensor = None 22 | ) -> tuple[torch.Tensor, torch.Tensor]: 23 | from auto_round.data_type.fp8 import quant_fp8_sym 24 | 25 | qdq_tensor, scale, _ = quant_fp8_sym(tensor, max_scale=1.0, tensor_max=tensor_max, group_size=0, v=0) 26 | return qdq_tensor, scale 27 | 28 | 29 | # @torch.compiler.disable 30 | def update_parameter_data(module: torch.nn.Module, new_val: torch.Tensor, name: str): 31 | """ 32 | Update the data of a parameter in a module. 33 | If the parameter does not exist, it will be created. 34 | """ 35 | if hasattr(module, name): 36 | param = getattr(module, name) 37 | if isinstance(param, torch.nn.Parameter): 38 | param.data.copy_(new_val) 39 | else: 40 | module.register_parameter(name, torch.nn.Parameter(new_val)) 41 | else: 42 | logger.warning_once( 43 | "Parameter %s not found in module %s, creating new parameter." 44 | % (name, module.__class__.__name__ + str(getattr(module, "layer_idx", ""))) 45 | ) 46 | module.register_parameter(name, torch.nn.Parameter(new_val)) 47 | 48 | 49 | def normalize_static_kv_dtype(static_kv_dtype: str | torch.dtype) -> torch.dtype: 50 | valid_dtype_name_lst = ["float16", "bfloat16", "fp8", "float32", "float"] 51 | valid_torch_dtype = { 52 | "float16": torch.float16, 53 | "bfloat16": torch.bfloat16, 54 | "fp8": torch.float8_e4m3fn, 55 | "float8_e4m3fn": torch.float8_e4m3fn, 56 | "float32": torch.float32, 57 | "float": torch.float32, # Alias for float32 58 | } 59 | if static_kv_dtype in valid_dtype_name_lst: 60 | new_dtype = valid_torch_dtype[static_kv_dtype] 61 | elif static_kv_dtype in valid_torch_dtype.values(): 62 | new_dtype = static_kv_dtype 63 | else: 64 | raise ValueError( 65 | f"Invalid static kv dtype: {static_kv_dtype}. " 66 | # f"Valid options are: {', '.join(valid_dtype_name_lst + list(valid_torch_dtype.values()))}." 67 | ) 68 | return new_dtype 69 | 70 | 71 | def is_attention_module(module: torch.nn.Module): 72 | # FIXME: Handle this better. 73 | return "attention" in module.__class__.__name__.lower() and ( 74 | hasattr(module, "k_proj") or hasattr(module, "v_proj") or hasattr(module, "qkv_proj") 75 | ) 76 | -------------------------------------------------------------------------------- /auto_round/envs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Note: the design of this module is inspired by vLLM's envs.py 15 | # For detailed usage and configuration guide, see: docs/environments.md 16 | 17 | import os 18 | from typing import TYPE_CHECKING, Any, Callable, Optional 19 | 20 | if TYPE_CHECKING: 21 | AR_LOG_LEVEL: str = "INFO" 22 | AR_USE_MODELSCOPE: bool = "False" 23 | 24 | environment_variables: dict[str, Callable[[], Any]] = { 25 | # this is used for configuring the default logging level 26 | "AR_LOG_LEVEL": lambda: os.getenv("AR_LOG_LEVEL", "INFO").upper(), 27 | "AR_ENABLE_COMPILE_PACKING": lambda: os.getenv("AR_ENABLE_COMPILE_PACKING", "0").lower() in ("1", "true", "yes"), 28 | "AR_USE_MODELSCOPE": lambda: os.getenv("AR_USE_MODELSCOPE", "False").lower() in ["1", "true"], 29 | "AR_WORK_SPACE": lambda: os.getenv("AR_WORK_SPACE", "ar_work_space").lower(), 30 | } 31 | 32 | 33 | def __getattr__(name: str): 34 | # lazy evaluation of environment variables 35 | if name in environment_variables: 36 | return environment_variables[name]() 37 | raise AttributeError(f"module {__name__!r} has no attribute {name!r}") 38 | 39 | 40 | def __dir__(): 41 | return list(environment_variables.keys()) 42 | 43 | 44 | def is_set(name: str): 45 | """Check if an environment variable is explicitly set.""" 46 | if name in environment_variables: 47 | return name in os.environ 48 | raise AttributeError(f"module {__name__!r} has no attribute {name!r}") 49 | 50 | 51 | def set_config(**kwargs): 52 | """ 53 | Set configuration values for environment variables. 54 | 55 | Args: 56 | **kwargs: Keyword arguments where keys are environment variable names 57 | and values are the desired values to set. 58 | 59 | Example: 60 | set_config(AR_LOG_LEVEL="DEBUG", AR_USE_MODELSCOPE=True) 61 | """ 62 | for key, value in kwargs.items(): 63 | if key in environment_variables: 64 | # Convert value to appropriate string format 65 | if key == "AR_USE_MODELSCOPE": 66 | # Handle boolean values for AR_USE_MODELSCOPE 67 | str_value = "true" if value in [True, "True", "true", "1", 1] else "false" 68 | else: 69 | # For other variables, convert to string 70 | str_value = str(value) 71 | 72 | # Set the environment variable 73 | os.environ[key] = str_value 74 | else: 75 | raise AttributeError(f"module {__name__!r} has no attribute {key!r}") 76 | -------------------------------------------------------------------------------- /docs/tuning_norm_bias.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | ## Fast tuning of LayerNorm and Linear bias via fake quantization without rounding 4 | 5 | **Personal view by Wenhua; discussion welcome** 6 | 7 | **Work in Progress** 8 |
9 | Recent studies have found that tuning LayerNorm and bias through optimizers such as Adam can lead to better results, especially for low-bit quantization such as 2-bit. However, I personally do not favor using Adam for this purpose, for the reasons detailed in the next section, and I introduce an alternative approach in the last section. 10 | 11 | ### Why not use Adam 12 | 13 | #### Reason 1: Hard to tune the learning rate and steps 14 | 15 | Since Adam adaptively tunes the step size based on the gradient and its square, the learning rate often needs adjustment for different models, different quantization bit widths, or both, as observed in most papers. I hypothesize that this tuning requirement arises because most papers report results for only a limited range of model families, while many new models continually emerge. Despite my experience in this domain, I still find it challenging to tune the learning rate beyond using grid search. I believe many users encounter the same issue. 16 | 17 | #### Reason 2: Prone to overfitting 18 | 19 | Since Adam adapts the step size at each iteration, it is difficult to control how much the parameters change, which in some scenarios leads to significant deviations from the original model's weights. However, we only use hundreds or thousands of samples to fine-tune a low-bit model, whereas the original model is trained on a large corpus and specialized datasets (e.g., instruction datasets). Consequently, even if the low-bit tuned model performs well on some language-modeling tasks, it may lose other capabilities as the deviation increases. 20 | 21 | 22 | 23 | ### Our approach 24 | 25 | **An overview of our method** 26 |
27 | 28 | ![](../docs/imgs/norm_bias_overview.png) 29 | 30 |
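Concretely, the overview above boils down to the toy PyTorch sketch below. It is written under simplifying assumptions (per-tensor symmetric scale, zero-point omitted, a crude RTN fake-quantizer standing in for the actually quantized block) and is an illustration only, not the project's implementation; the formal description follows below.

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
d = 64
x = torch.randn(128, d)                       # toy calibration activations
ln_w = 1.0 + 0.1 * torch.randn(d)             # frozen original LayerNorm weight
lin_w = torch.randn(d, d) / d**0.5            # the Linear layer that follows in the block


def rtn_fake_quant(t: torch.Tensor, bits: int = 2) -> torch.Tensor:
    """Crude round-to-nearest fake quantization, standing in for the quantized block."""
    m = 2 ** (bits - 1) - 1
    s = t.abs().max() / m
    return torch.round(t / s).clamp(-m - 1, m) * s


teacher = F.linear(F.layer_norm(x, (d,), weight=ln_w), lin_w)  # full-precision block output
q_lin_w = rtn_fake_quant(lin_w)                                # the part that actually gets quantized

m = 2**3 - 1                                  # 4-bit grid for the LayerNorm weight (W4, per-tensor)
s = ln_w.abs().max() / m                      # scale predefined from W and the bit setting
v = torch.zeros(d, requires_grad=True)        # trainable offset, kept inside [-0.5, 0.5]

for _ in range(200):
    w_tuned = s * torch.clamp(ln_w / s + v, -m - 1, m)  # W' = s * clip(W/s + v, N, M), no rounding
    out = F.linear(F.layer_norm(x, (d,), weight=w_tuned), q_lin_w)
    loss = (out - teacher).pow(2).mean()      # block-wise output reconstruction error
    loss.backward()
    with torch.no_grad():                     # SignSGD step, then clamp v back into range
        v -= 5e-3 * v.grad.sign()
        v.clamp_(-0.5, 0.5)
    v.grad = None
```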
31 | 32 | 33 | We limit the tuned parameters in a quantization space, expressed as: 34 | $$ 35 | W' = s*clip(W/s+zp,N,M) 36 | $$ 37 | where 𝑠 is the quantization scale, predefined by 𝑊 and hyperparameters such as bits. 38 | 39 | To tune the W', following Signround, we add a trainable parameter V in the range [-0.5, 0.5], which can be easily tuned by SignSGD. 40 | 41 | $$ 42 | W' = s*clip(W/s+zp+v,N,M) 43 | $$ 44 | 45 | 46 | An important note: We remove the rounding to reduce unnecessary rounding loss, as the final weights of LayerNorm and bias are typically kept at 16-bit precision in most cases. 47 | 48 | 49 | 50 | **Result at W2G32** 51 | 52 | the tuning of layer normalization and Linear bias are fake quantized at W4G-1. 53 | 54 | Average accuracies of HellaSwag, WinoGrand, PIQA and LAMBADA, higher is better. 55 | 56 | | | OPT125m | OPT1.3B | OPT2.7B | OPT6.7B | LLaMAV2-7b | LLaMAV3-8B-Instruct | 57 | | --------- | ---------- | ---------- | ---------- | ---------- | ---------- | ------------------- | 58 | | SignRound | 0.3978 | 0.5094 | 0.5267 | 0.3681 | 0.6267 | 0.5890 | 59 | | Ours | **0.4077** | **0.5151** | **0.5596** | **0.3887** | **0.6315** | **0.5949** | 60 | 61 | -------------------------------------------------------------------------------- /test/test_cpu/test_woq_linear.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import pytest 4 | import torch 5 | 6 | sys.path.insert(0, "../..") 7 | from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear 8 | 9 | 10 | class TestWeightOnlyLinear: 11 | @pytest.mark.parametrize( 12 | "bits, compression_dtype", 13 | [ 14 | (8, torch.int16), 15 | (8, torch.int32), 16 | (8, torch.int64), 17 | (4, torch.int8), 18 | (4, torch.int16), 19 | (4, torch.int32), 20 | (4, torch.int64), 21 | (2, torch.int8), 22 | (2, torch.int16), 23 | (2, torch.int32), 24 | (2, torch.int64), 25 | ], 26 | ) 27 | def test_pack_with_numba(self, bits, compression_dtype): 28 | m = torch.nn.Linear(1024, 512) 29 | dtype = "int" 30 | weight = m.weight.detach() 31 | group_size = 32 32 | origin_shape = weight.shape 33 | from auto_round.data_type.int import quant_tensor_sym 34 | 35 | origin_shape = weight.shape 36 | weight = weight.reshape(-1, group_size) 37 | qdq, scale, zp = quant_tensor_sym(weight, -1) 38 | if isinstance(zp, int | float): 39 | zp = torch.full_like(scale, zp) 40 | int_weight = qdq.div(scale).add(zp).clamp(0, 2 ** (bits) - 1).to(torch.int32).reshape(origin_shape) 41 | scale = scale.reshape(origin_shape[0], -1) 42 | if isinstance(zp, torch.Tensor): 43 | zp = zp.reshape(origin_shape[0], -1).to(torch.int32).clamp(0, 2 ** (bits) - 1) 44 | module_with_legacy_pack = WeightOnlyLinear( 45 | in_features=m.in_features, 46 | out_features=m.out_features, 47 | dtype=dtype, 48 | bits=bits, 49 | groupsize=32, 50 | zp=zp is not None, 51 | bias=m.bias is not None, 52 | use_optimum_format=False, 53 | compression_dtype=compression_dtype, 54 | use_legacy_pack=True, 55 | ) 56 | module_with_legacy_pack.pack( 57 | int_weight.clone(), scale.clone(), zp.clone() if isinstance(zp, torch.Tensor) else zp, m.bias 58 | ) 59 | module_with_new_pack = WeightOnlyLinear( 60 | in_features=m.in_features, 61 | out_features=m.out_features, 62 | dtype=dtype, 63 | bits=bits, 64 | groupsize=32, 65 | zp=zp is not None, 66 | bias=m.bias is not None, 67 | use_optimum_format=False, 68 | compression_dtype=compression_dtype, 69 | use_legacy_pack=False, 70 | ) 71 | module_with_new_pack.pack( 72 | int_weight.clone(), scale.clone(), 
zp.clone() if isinstance(zp, torch.Tensor) else zp, m.bias 73 | ) 74 | 75 | assert torch.equal(module_with_new_pack.qweight, module_with_legacy_pack.qweight) 76 | 77 | assert torch.equal(module_with_new_pack.qzeros, module_with_legacy_pack.qzeros) 78 | assert torch.equal(module_with_new_pack.scales, module_with_legacy_pack.scales) 79 | unpacked_int_weight = module_with_new_pack.unpack_tensor(module_with_legacy_pack.qweight) 80 | assert torch.equal(unpacked_int_weight, int_weight) 81 | -------------------------------------------------------------------------------- /test/test_cpu/test_cli_usage.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | 8 | 9 | class TestAutoRoundCmd(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(self): 12 | pass 13 | 14 | @classmethod 15 | def tearDownClass(self): 16 | shutil.rmtree("./saved", ignore_errors=True) 17 | shutil.rmtree("runs", ignore_errors=True) 18 | shutil.rmtree("../../saved", ignore_errors=True) 19 | shutil.rmtree("../../tmp_autoround", ignore_errors=True) 20 | 21 | def test_auto_round_cmd(self): 22 | python_path = sys.executable 23 | 24 | # Test llm script 25 | res = os.system(f"cd ../.. && {python_path} -m auto_round -h") 26 | if res > 0 or res == -1: 27 | assert False, "cmd line test fail, please have a check" 28 | 29 | res = os.system( 30 | f"cd ../.. && {python_path} -m auto_round --model '/tf_dataset/auto_round/models/facebook/opt-125m' --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa" 31 | ) 32 | if res > 0 or res == -1: 33 | assert False, "cmd line test fail, please have a check" 34 | 35 | res = os.system( 36 | f"cd ../.. && {python_path} -m auto_round --model '/tf_dataset/auto_round/models/facebook/opt-125m' --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32" 37 | ) 38 | if res > 0 or res == -1: 39 | assert False, "cmd line test fail, please have a check" 40 | 41 | res = os.system( 42 | f"cd ../.. && {python_path} -c 'from auto_round.__main__ import run_light; run_light()' --seqlen 8 --iter 2 --nsamples 8 --output_dir ./saved --tasks lambada_openai" 43 | ) 44 | if res > 0 or res == -1: 45 | assert False, "cmd line test fail, please have a check" 46 | 47 | # test mllm script 48 | 49 | # test auto_round_mllm --eval help 50 | res = os.system(f"cd ../.. && {python_path} -m auto_round --eval -h") 51 | if res > 0 or res == -1: 52 | assert False, "cmd line test fail, please have a check" 53 | 54 | # test auto_round_mllm --lmms help 55 | res = os.system(f"cd ../.. && {python_path} -m auto_round --eval --lmms -h") 56 | if res > 0 or res == -1: 57 | assert False, "cmd line test fail, please have a check" 58 | 59 | res = os.system( 60 | f"cd ../.. && {python_path} -m auto_round --mllm --model /tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct --iter 2 --nsamples 2 --seqlen 32 --format auto_round --output_dir ./saved" 61 | ) 62 | if res > 0 or res == -1: 63 | assert False, "cmd line test fail, please have a check" 64 | 65 | res = os.system( 66 | f"cd ../.. 
&& {python_path} -m auto_round --mllm --iter 2 --nsamples 2 --model /tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct --seqlen 32 --format auto_round" 67 | " --quant_nontext_module --output_dir ./saved " 68 | ) 69 | if res > 0 or res == -1: 70 | assert False, "cmd line test fail, please have a check" 71 | 72 | 73 | if __name__ == "__main__": 74 | unittest.main() 75 | -------------------------------------------------------------------------------- /test/test_cuda/test_packing.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from auto_round.export.export_to_autoround.qlinear_fp import FLOAT_TO_E2M1, pack_fp4_to_uint8 5 | 6 | 7 | # Random sampling from FLOAT_TO_E2M1 8 | def _create_random_e2m1_tensor(shape): 9 | """Create a tensor of the given shape with random values from FLOAT_TO_E2M1.""" 10 | # Create a tensor of indices randomly selected from 0 to len(FLOAT_TO_E2M1)-1 11 | indices = torch.randint(0, len(FLOAT_TO_E2M1), shape) 12 | 13 | # Map the indices to their corresponding values 14 | e2m1_tensor = torch.tensor(FLOAT_TO_E2M1, dtype=torch.float32)[indices] 15 | return e2m1_tensor 16 | 17 | 18 | def pack_fp4_to_uint8_old(x: torch.Tensor) -> torch.Tensor: 19 | """ 20 | Packs a tensor with values in the fp4 range into uint8. 21 | As there are 16 valid fp4 values, two fp4 values can be 22 | packed into one uint8. Each fp4 value is mapped to its 23 | particular index (e.g. 0.5 is mapped to index 1, 6.0 is mapped 24 | to index 7) which is then represented using 4 bits. Consecutive 25 | pairs of 4 bits are then packed into an uint8. 26 | 27 | :param x: tensor to pack 28 | returns: a packed tensor in uint8 29 | """ 30 | 31 | m, n = x.shape 32 | device = x.device 33 | 34 | # Create lookup table for FP4 values to indices 35 | # Map the absolute values to 0-7 indices 36 | kE2M1 = torch.tensor(FLOAT_TO_E2M1, device=device, dtype=x.dtype) 37 | 38 | # Find closest valid FP4 value index for each element 39 | abs_x = torch.abs(x) 40 | abs_indices = torch.zeros_like(abs_x, dtype=torch.long) 41 | for i, val in enumerate(kE2M1): # TODO any optimize? 
42 | abs_indices = torch.where(torch.isclose(abs_x, val), i, abs_indices) 43 | 44 | # Apply sign bit (bit 3) to get final 4-bit representation 45 | indices = abs_indices + (torch.signbit(x) << 3).to(torch.long) 46 | 47 | # Reshape to prepare for packing pairs of values 48 | indices = indices.reshape(-1) 49 | 50 | # Handle odd length by padding if necessary 51 | if indices.numel() % 2 != 0: 52 | indices = torch.cat([indices, torch.zeros(1, dtype=torch.long, device=device)]) 53 | 54 | # Reshape to pair consecutive elements 55 | indices = indices.reshape(-1, 2) 56 | 57 | # Pack pairs of 4-bit values into 8-bit values 58 | packed = (indices[:, 0] | (indices[:, 1] << 4)).to(torch.uint8) 59 | 60 | return packed.reshape(m, n // 2) 61 | 62 | 63 | qwen_weight_shapes = [ 64 | torch.Size([2048, 768]), 65 | torch.Size([768, 2048]), 66 | torch.Size([128, 2048]), 67 | torch.Size([512, 2048]), 68 | torch.Size([4096, 2048]), 69 | torch.Size([151936, 2048]), 70 | torch.Size([2048, 4096]), 71 | ] 72 | 73 | 74 | @pytest.mark.parametrize("shape", qwen_weight_shapes) 75 | def test_packing_fp4(shape): 76 | with torch.device("cuda"): 77 | M, N = shape 78 | random_tensor = _create_random_e2m1_tensor((M, N)) 79 | # Pack the tensor using the packing function 80 | packed_tensor = pack_fp4_to_uint8(random_tensor) 81 | packed_tensor_old = pack_fp4_to_uint8_old(random_tensor) 82 | # check equal 83 | assert torch.equal(packed_tensor, packed_tensor_old), "Packed tensors are not equal" 84 | -------------------------------------------------------------------------------- /test/test_cpu/test_gpt_oss.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from transformers import AutoConfig, AutoTokenizer 3 | from transformers.models.gpt_oss.modeling_gpt_oss import GptOssForCausalLM 4 | 5 | from auto_round import AutoRound 6 | 7 | 8 | @pytest.fixture 9 | def setup_gpt_oss(): 10 | """Fixture to set up the GPT-OSS model and tokenizer.""" 11 | model_name = "/tf_dataset/auto_round/models/unsloth/gpt-oss-20b-BF16" 12 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 13 | config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) 14 | config.num_hidden_layers = 1 # Reduce layers for testing 15 | model = GptOssForCausalLM(config) 16 | output_dir = "/tmp/test_quantized_gpt_oss" 17 | return model, tokenizer, output_dir, config 18 | 19 | 20 | def quantize_model(model, tokenizer, output_dir, scheme, iters=0): 21 | """Helper function to quantize the model with the given scheme.""" 22 | autoround = AutoRound( 23 | model, 24 | tokenizer, 25 | scheme=scheme, 26 | nsamples=2, 27 | iters=iters, 28 | fp_layers="self_attn,router,lm_head,mlp.gate", 29 | ) 30 | quantized_model, save_folder = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) 31 | return quantized_model 32 | 33 | 34 | def count_modules_by_type(model, target_module_name_or_class): 35 | """Helper function to count modules of a specific type in the model.""" 36 | cnt = 0 37 | for name, module in model.named_modules(): 38 | if isinstance(target_module_name_or_class, str): 39 | if target_module_name_or_class == module.__class__.__name__: 40 | cnt += 1 41 | else: 42 | if isinstance(module, target_module_name_or_class): 43 | cnt += 1 44 | return cnt 45 | 46 | 47 | @pytest.mark.parametrize("scheme", ["MXFP4", "MXFP8"]) 48 | def test_quantization(setup_gpt_oss, scheme): 49 | """Test quantization with the scheme.""" 50 | model, tokenizer, output_dir, config = setup_gpt_oss 51 
| quantized_model = quantize_model(model, tokenizer, output_dir, scheme) 52 | 53 | # Ensure the quantized model is not None 54 | assert quantized_model is not None, "Quantized model should not be None." 55 | from auto_round.export.export_to_autoround.qlinear_fp import QuantLinear 56 | from auto_round.modelling.gpt_oss import GPTOssSingleExpert 57 | 58 | single_expert_cnt = count_modules_by_type(quantized_model, GPTOssSingleExpert) 59 | quant_linear_cnt = count_modules_by_type(quantized_model, QuantLinear) 60 | assert ( 61 | single_expert_cnt == config.num_local_experts 62 | ), f"Expected {config.num_local_experts} GPTOssSingleExpert modules, found {single_expert_cnt}." 63 | assert ( 64 | quant_linear_cnt == config.num_hidden_layers * 3 * config.num_local_experts 65 | ), f"Expected {config.num_hidden_layers * 3 * config.num_local_experts} QuantLinear modules, found {quant_linear_cnt}." 66 | 67 | print(f"[{scheme}] Total {GPTOssSingleExpert.__name__} modules: {single_expert_cnt}") 68 | print(f"[{scheme}] Total {QuantLinear.__name__} modules: {quant_linear_cnt}") 69 | # clean the output directory after test 70 | import shutil 71 | 72 | shutil.rmtree(output_dir, ignore_errors=True) 73 | -------------------------------------------------------------------------------- /docs/opt_rtn.md: -------------------------------------------------------------------------------- 1 | ### 🧮 Evaluation Results (LM-Eval) 2 | For 2/3bit, we strongly recommend not using iter=0 except for GGUF:Q2_K_S which has a different quantization algorithm. 3 | 4 | 4BIT=W4A16 5 | 3BIT=W3A16 6 | 2BIT=W2A16G64 7 | 8 | RTN mode 9 | 10 | ~~~bash 11 | auto-round --model xxx --disable_opt_rtn --iters 0 12 | ~~~ 13 | 14 | OPT RTN mode 15 | 16 | ~~~bash 17 | auto-round --model xxx --iters 0 18 | ~~~ 19 | 20 | 21 | 22 | | Model | RNT/OPT | AVG | HellaSwag | LAMBADA | MMLU | PIQA | WinoGrande | 23 | |--------------------------------|----------|---------|-----------|---------|--------|--------|------------| 24 | | **Meta-Llama-3.1-8B-Instruct** | RTN-4BIT | 0.69328 | 0.5896 | 0.7013 | 0.6538 | 0.7987 | 0.7230 | 25 | | | OPT-4BIT | 0.69560 | 0.5882 | 0.7074 | 0.6631 | 0.7916 | 0.7277 | 26 | | | RTN-3BIT | 0.64562 | 0.5410 | 0.6695 | 0.5449 | 0.7742 | 0.6985 | 27 | | | OPT-3BIT | 0.65970 | 0.5490 | 0.6893 | 0.5711 | 0.7677 | 0.7214 | 28 | | | RTN-2BIT | 0.33008 | 0.2918 | 0.0474 | 0.2321 | 0.5740 | 0.5051 | 29 | | | OPT-2BIT | 0.38908 | 0.3241 | 0.1560 | 0.2822 | 0.6235 | 0.5596 | 30 | | **Qwen2.5-7B-Instruct** | RTN-4BIT | 0.69560 | 0.6114 | 0.6713 | 0.7011 | 0.7878 | 0.7064 | 31 | | | OPT-4BIT | 0.70034 | 0.6143 | 0.6945 | 0.7115 | 0.7845 | 0.6969 | 32 | | | RTN-3BIT | 0.64144 | 0.5585 | 0.6092 | 0.6455 | 0.7476 | 0.6464 | 33 | | | OPT-3BIT | 0.66764 | 0.5756 | 0.7013 | 0.6597 | 0.7481 | 0.6535 | 34 | | | RTN-2BIT | 0.31856 | 0.2804 | 0.0351 | 0.2379 | 0.5256 | 0.5138 | 35 | | | OPT-2BIT | 0.45146 | 0.3645 | 0.2992 | 0.4043 | 0.6415 | 0.5478 | 36 | | **Qwen3-8B** | RTN-4BIT | 0.66240 | 0.5619 | 0.6150 | 0.7077 | 0.7573 | 0.6701 | 37 | | | OPT-4BIT | 0.66992 | 0.5619 | 0.6346 | 0.7102 | 0.7633 | 0.6796 | 38 | | | RTN-3BIT | 0.57322 | 0.4992 | 0.4260 | 0.6002 | 0.7361 | 0.6046 | 39 | | | OPT-3BIT | 0.63698 | 0.5226 | 0.5814 | 0.6718 | 0.7437 | 0.6654 | 40 | | | RTN-2BIT | 0.31150 | 0.2679 | 0.0041 | 0.2536 | 0.5283 | 0.5036 | 41 | | | OPT-2BIT | 0.44254 | 0.3749 | 0.2005 | 0.4202 | 0.6670 | 0.5501 | 42 | | **Qwen3-14B** | RTN-4BIT | 0.70448 | 0.5999 | 0.6511 | 0.7565 | 0.7998 | 0.7151 | 43 | | | OPT-4BIT | 0.70798 | 0.6031 | 0.6627 | 0.7534 
| 0.8009 | 0.7198 | 44 | | | RTN-3BIT | 0.65876 | 0.5746 | 0.5467 | 0.7065 | 0.7628 | 0.7032 | 45 | | | OPT-3BIT | 0.68610 | 0.5683 | 0.6633 | 0.7258 | 0.7699 | 0.7032 | 46 | | | RTN-2BIT | 0.39398 | 0.3764 | 0.0607 | 0.3836 | 0.6480 | 0.5012 | 47 | | | OPT-2BIT | 0.50080 | 0.4554 | 0.2451 | 0.4899 | 0.7138 | 0.5998 | -------------------------------------------------------------------------------- /test/test_cpu/test_autoround_acc.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | from auto_round.eval.evaluation import simple_evaluate 7 | 8 | sys.path.insert(0, "../..") 9 | from math import isclose 10 | 11 | import torch 12 | import transformers 13 | from transformers import AutoModelForCausalLM, AutoTokenizer 14 | 15 | from auto_round import AutoRound # pylint: disable=E0401 16 | 17 | 18 | class LLMDataLoader: 19 | def __init__(self): 20 | self.batch_size = 1 21 | 22 | def __iter__(self): 23 | for i in range(2): 24 | yield torch.ones([1, 10], dtype=torch.long) 25 | 26 | 27 | class TestAutoRound(unittest.TestCase): 28 | @classmethod 29 | def setUpClass(self): 30 | self.llm_dataloader = LLMDataLoader() 31 | self.save_dir = "./saved" 32 | 33 | @classmethod 34 | def tearDownClass(self): 35 | shutil.rmtree(self.save_dir, ignore_errors=True) 36 | shutil.rmtree("runs", ignore_errors=True) 37 | 38 | def test_default_acc(self): 39 | model_name = "/tf_dataset/auto_round/models/hf-internal-testing/tiny-random-GPTJForCausalLM" 40 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True) 41 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 42 | bits, group_size, sym = 4, 128, True 43 | inp = torch.ones([1, 10], dtype=torch.long) 44 | autoround = AutoRound( 45 | model, 46 | tokenizer, 47 | bits=bits, 48 | device="cpu", 49 | group_size=group_size, 50 | sym=sym, 51 | iters=2, 52 | seqlen=10, 53 | dataset=self.llm_dataloader, 54 | ) 55 | autoround.quantize() 56 | out0 = model(inp) 57 | print(f"out0 = {float(out0[0][0][0][0])}") 58 | 59 | model_tmp = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True) 60 | autoround_1 = AutoRound( 61 | model_tmp, 62 | tokenizer, 63 | bits=bits, 64 | group_size=group_size, 65 | sym=sym, 66 | device="cpu", 67 | iters=2, 68 | seqlen=10, 69 | dataset=self.llm_dataloader, 70 | ) 71 | autoround_1.quantize() 72 | out1 = model_tmp(inp) 73 | 74 | assert out0[0].equal(out1[0]) 75 | self.assertTrue(isclose(float(out0[0][0][0][0]), -0.021002087742090225, rel_tol=5e-04)) 76 | 77 | def test_3bits_asym_autoround(self): 78 | model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" 79 | 80 | bits, sym = 3, False 81 | autoround = AutoRound(model_name, bits=bits, sym=sym, iters=0) 82 | autoround.quantize_and_save(self.save_dir, format="auto_round", inplace=False) 83 | model_args = f"pretrained={self.save_dir}" 84 | # res = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto", limit=10) 85 | 86 | # accuracy = res["results"]["lambada_openai"]["acc,none"] 87 | # print(f"accuracy = {accuracy}") 88 | # assert accuracy > 0.15 89 | shutil.rmtree(self.save_dir, ignore_errors=True) 90 | 91 | 92 | if __name__ == "__main__": 93 | unittest.main() 94 | -------------------------------------------------------------------------------- /docs/auto_scheme_acc.md: 
-------------------------------------------------------------------------------- 1 | We use **lm-eval** for evaluation. For LLaMA, we enabled `add_bos_token` and 2 | `removed @use_kernel_forward_from_hub("RMSNorm")` 3 | in [modeling_llama.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L52C1-L52C40) 4 | to stabilize accuracy during evaluation. All other settings follow the default configurations of AutoRound and lm-eval. 5 | 6 | We ignore the scale and zp bits in the tables below. The accuracy may change a little as we modified a little of the 7 | implementation. We will rerun all the experiments. 8 | 9 | For mxfp experiment, we use fake model while for weight only model we use real model. **No tuning is applied unless explicit stated.** 10 | 11 | *Average accuracy across `lambada_openai`, `hellaswag`, `piqa`, `winogrande`, and `mmlu`.* 12 | 13 | ### Table 1 MXFP4/8 mixed accuracy. 14 | 15 | | Average bits | Llama3.1-8B-I | Qwen2.5-7B-I | Qwen3-8B | Qwen3-32B | 16 | |:------------------|:----------------:|:----------------:|:----------------:|:----------------:| 17 | | **BF16** | 0.7076 (100%) | 0.7075 (100%) | 0.6764 (100%) | 0.7321 (100%) | 18 | | **Pure 4-bit** | 0.6626 (93.6%) | 0.6550 (92.6%) | 0.6316 (93.4%) | 0.6901 (94.3%) | 19 | | **Ours 4.5-bit** | 0.6808 (96.2%) | 0.6776 (95.8%) | 0.6550 (96.8%) | 0.7176 (98.0%) | 20 | | **Ours 5-bit** | 0.6857 (96.9%) | 0.6823 (96.4%) | 0.6594 (97.5%) | 0.7201 (98.3%) | 21 | | **Ours 6-bit** | 0.6975 (98.6%) | 0.6970 (98.5%) | 0.6716 (99.3%) | 0.7303 (99.8%) | 22 | 23 | We compare the proposed method against naive layer-wise bit allocation strategies, such as assigning higher 24 | precision to the network’s head((near lm-head) or tailad(close to embedding)) layers, to demonstrate its relative 25 | performance advantages. 26 | 27 | ### Table 2 Comparison with other recipes at an average of 5 bits of mxfp datatype 28 | 29 | | Avg. bits = 5 | Llama3.1-8B-I | Qwen2.5-7B-I | Qwen3-8B | 30 | |:------------------|:----------------:|:----------------:|:----------------:| 31 | | **Tail layers 8-bit** | 0.6671 (94.3%) | 0.6616 (93.5%) | 0.6410 (94.8%) | 32 | | **Head layers 8-bit** | 0.6657 (94.1%) | 0.6686 (94.5%) | 0.6356 (94.0%) | 33 | | **Ours** | **0.6857 (96.9%)** | **0.6823 (96.4%)** | **0.6594 (97.5%)** | 34 | 35 | ### Table 3 Comparison with other recipes at an average of 4.5 bits of mxfp datatype 36 | 37 | | Avg. bits = 4.5 | Llama3.1-8B-I | Qwen2.5-7B-I | Qwen3-8B | 38 | |:------------------|:----------------:|:----------------:|:----------------:| 39 | | **Tail layers 8-bit** | 0.6614 (93.5%) | 0.6535 (92.4%) | 0.6373 (94.2%) | 40 | | **Head layers 8-bit** | 0.6568 (92.8%) | 0.6642 (93.9%) | 0.6305 (93.2%) | 41 | | **Ours** | **0.6808 (96.2%)** | **0.6776 (95.5%)** | **0.6550 (95.8%)** | 42 | 43 | 44 | ### Table4 Comparison with other recipes at an average of 3 bits of W2G128 and W4G128 45 | 46 | | Avg. 
bits = 4.5 | Llama3.1-8B-I | Qwen2.5-7B-I | Qwen3-8B | 47 | |:------------------|:----------------:|:----------------:|:----------------:| 48 | | **Tail layers 4-bit** | 0.6058 | 0.3798 | 0.4536 | 49 | | **Head layers 4-bit** | 0.3198 | 0.3270 | 0.3196 | 50 | | **Ours** | 0.6148 | 0.4058 | 0.4862 | 51 | -------------------------------------------------------------------------------- /.azure-pipelines/unit-test.yml: -------------------------------------------------------------------------------- 1 | trigger: none 2 | 3 | pr: 4 | autoCancel: true 5 | drafts: false 6 | branches: 7 | include: 8 | - main 9 | paths: 10 | include: 11 | - auto_round 12 | - auto_round_extension 13 | - test 14 | - setup.py 15 | - requirements.txt 16 | - requirements-cpu.txt 17 | - .azure-pipelines/scripts/ut 18 | - .azure-pipelines/unit-test.yml 19 | - .azure-pipelines/template/ut-template.yml 20 | - .azure-pipelines/template/docker-template.yml 21 | exclude: 22 | - test/test*hpu* 23 | - "*.md" 24 | - "**/*.md" 25 | 26 | pool: ICX-16C 27 | 28 | variables: 29 | IMAGE_NAME: "auto-round" 30 | IMAGE_TAG: "py312" 31 | UPLOAD_PATH: $(Build.SourcesDirectory)/log_dir 32 | DOWNLOAD_PATH: $(Build.SourcesDirectory)/log_dir 33 | ARTIFACT_NAME: "UT_coverage_report" 34 | REPO: $(Build.Repository.Uri) 35 | 36 | stages: 37 | - stage: Unit_test 38 | displayName: Unit Test 39 | dependsOn: [] 40 | jobs: 41 | - job: 42 | timeoutInMinutes: 120 43 | strategy: 44 | matrix: 45 | part1: 46 | PART: 1 47 | part2: 48 | PART: 2 49 | part3: 50 | PART: 3 51 | part4: 52 | PART: 4 53 | part5: 54 | PART: 5 55 | steps: 56 | - template: template/ut-template.yml 57 | parameters: 58 | dockerConfigName: "commonDockerConfig" 59 | utScriptFileName: "run_ut" 60 | uploadPath: $(UPLOAD_PATH) 61 | utArtifact: "ut-$(PART)" 62 | utTestMode: $(PART) 63 | 64 | - stage: Coverage 65 | displayName: "Collect Coverage" 66 | pool: 67 | vmImage: "ubuntu-latest" 68 | dependsOn: [Unit_test] 69 | jobs: 70 | - job: CollectDatafiles 71 | steps: 72 | - task: DownloadPipelineArtifact@2 73 | inputs: 74 | artifact: 75 | patterns: '*_coverage/.coverage.*' 76 | path: $(DOWNLOAD_PATH) 77 | 78 | - task: UsePythonVersion@0 79 | inputs: 80 | versionSpec: '3.12' 81 | displayName: 'Use Python 3.12' 82 | 83 | - script: | 84 | cd ${BUILD_SOURCESDIRECTORY} 85 | pip install -U pip setuptools uv 86 | uv pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu 87 | uv pip install . 88 | pip list 89 | cd ${BUILD_SOURCESDIRECTORY}/.azure-pipelines/scripts 90 | bash ut/collect_log.sh 91 | env: 92 | PYTHONUNBUFFERED: '1' 93 | UV_NO_PROGRESS: '1' 94 | UV_SYSTEM_PYTHON: '1' 95 | displayName: "Collect UT Coverage" 96 | 97 | - task: PublishPipelineArtifact@1 98 | condition: succeededOrFailed() 99 | inputs: 100 | targetPath: $(UPLOAD_PATH)/coverage_PR 101 | artifact: $(ARTIFACT_NAME) 102 | publishLocation: "pipeline" 103 | 104 | - task: PublishCodeCoverageResults@2 105 | inputs: 106 | summaryFileLocation: $(Build.SourcesDirectory)/log_dir/coverage_PR/coverage.xml 107 | -------------------------------------------------------------------------------- /auto_round/data_type/w4fp8.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | 17 | from auto_round.data_type.register import register_dtype 18 | from auto_round.data_type.utils import float8_e4m3fn_ste, get_gaudi_fp8_ste_func 19 | 20 | 21 | @register_dtype("fp8_to_int_sym") 22 | def progressive_quant_fp8_int4( 23 | tensor, bits=4, group_size=-1, v=0, min_scale=1.0, max_scale=1.0, q_scale_thresh=1e-5, **kwargs 24 | ): 25 | """Two-stage quantization: quantize tensor to fp8 by per tensor, then quantize fp8 to w4g128 26 | 27 | This method first quantizes the input tensor into float8 format and then performs 28 | a secondary quantization to int4 with grouping. 29 | 30 | Args: 31 | tensor (torch.Tensor): Input tensor to quantize. 32 | bits (int, optional): Bit precision for secondary quantization. Defaults to 4. 33 | group_size (int, optional): Group size for int4 quantization. Defaults to -1 (no grouping). 34 | v (float, optional): Optional parameter for variance tuning. Defaults to 0. 35 | min_scale (float, optional): Minimum scaling factor for int4 quantization. Defaults to 1.0. 36 | max_scale (float, optional): Maximum scaling factor for int4 quantization. Defaults to 1.0. 37 | q_scale_thresh (float, optional): Threshold for scaling. Defaults to 1e-5. 38 | **kwargs: Additional arguments for compatibility. 39 | 40 | Returns: 41 | tuple: 42 | - Quantized and dequantized tensor (torch.Tensor). 43 | - Combined scaling factor (torch.Tensor). 44 | - Placeholder for zp (None). 45 | """ 46 | 47 | info = torch.finfo(torch.float8_e4m3fn) 48 | tensor_max = torch.max(torch.abs(tensor)).to(torch.float32) 49 | scale = tensor_max.to(torch.float32) / info.max 50 | min_scaling_factor = 1.0 / (info.max * 512.0) ##copy from vllm 51 | bf16_to_fp8_scale = torch.clip(scale, min=min_scaling_factor) 52 | fp8_res = tensor / bf16_to_fp8_scale 53 | fp8_res = torch.clip(fp8_res, info.min, info.max) 54 | fp8_res = float8_e4m3fn_ste(fp8_res) 55 | 56 | ##convert to bf16 57 | fp8_res_using_16bit = fp8_res.to(tensor.dtype) 58 | ##convert to int4 59 | from auto_round.data_type.int import quant_tensor_sym 60 | 61 | qdq_int4_tensor, scale_fp8_to_int4, zp_fp8_to_int4 = quant_tensor_sym( 62 | fp8_res_using_16bit, 63 | bits=bits, 64 | group_size=group_size, 65 | v=v, 66 | min_scale=min_scale, 67 | max_scale=max_scale, 68 | scale_dtype=torch.bfloat16, 69 | q_scale_thresh=q_scale_thresh, 70 | ) 71 | qdq_tensor = qdq_int4_tensor * bf16_to_fp8_scale 72 | 73 | bf16_to_int4_scale = scale_fp8_to_int4 * bf16_to_fp8_scale 74 | return qdq_tensor, {"scale": bf16_to_int4_scale, "bf16_to_fp8_scale": bf16_to_fp8_scale}, zp_fp8_to_int4 75 | -------------------------------------------------------------------------------- /auto_round/compressors/diffusion/README.md: -------------------------------------------------------------------------------- 1 | # AutoRound for Diffusion Models (Experimental) 2 | 3 | This feature is experimental and may be subject to changes, including potential bug fixes, API modifications, or adjustments to default parameters. 4 | 5 | ## Quantization 6 | 7 | Quantization for diffusion models is limited: 8 | 9 | 1. 
Only transformer module of diffusion models will be quantized.. 10 | 2. Loading quantized model is not supported yet, so please use `fake` format for quantization. 11 | 3. Calibration dataset only supports `coco2014` and user customized `.tsv` file. 12 | 13 | 14 | ### API Usage (CPU/GPU) Recommended 15 | 16 | 17 | ```python 18 | import torch 19 | from auto_round import AutoRound 20 | from diffusers import AutoPipelineForText2Image 21 | 22 | # Load the model 23 | model_name = "black-forest-labs/FLUX.1-dev" 24 | pipe = AutoPipelineForText2Image.from_pretrained(model_name, torch_dtype=torch.bfloat16) 25 | 26 | # Quantize the model 27 | autoround = AutoRound( 28 | pipe, 29 | scheme="MXFP8", 30 | dataset="coco2014", 31 | num_inference_steps=10, 32 | guidance_scale=7.5, 33 | generator_seed=None, 34 | batch_size=1, 35 | ) 36 | autoround.quantize() 37 | 38 | # Save the quantized model 39 | output_dir = "./tmp_autoround" 40 | # Currently loading the quantized diffusion model is not supported, so use fake format 41 | autoround.save_quantized(output_dir, format="fake", inplace=True) 42 | ``` 43 | 44 | - `dataset`: the dataset for quantization training. Currently only support coco2014 and user customized .tsv file. 45 | 46 | - `num_inference_steps`: The reference number of denoising steps. 47 | 48 | - `guidance_scale`: Control how much the image generation process follows the text prompt. The more it is, the more closely it follows the prompt. 49 | 50 | - `generator_seed`: A seed that controls the initial noise from which an image is generated. 51 | 52 | for more hyperparameters introduction, please refer [Homepage Detailed Hyperparameters](../../README.md#api-usage-gaudi2cpugpu) 53 | 54 | ### CLI Usage 55 | 56 | A user guide detailing the full list of supported arguments is provided by calling ```auto-round -h``` on the 57 | terminal. 58 | 59 | ```bash 60 | auto-round \ 61 | --model black-forest-labs/FLUX.1-dev \ 62 | --scheme MXFP8 \ 63 | --format fake \ 64 | --batch_size 1 \ 65 | --output_dir ./tmp_autoround 66 | ``` 67 | 68 | ### Diffusion Support Matrix 69 | 70 | For diffusion models, currently we only validate quantizaion on the FLUX.1-dev, which involves quantizing the transformer component of the pipeline. 71 | 72 | | Model | calibration dataset | 73 | |--------------|--------------| 74 | | black-forest-labs/FLUX.1-dev | COCO2014 | 75 | 76 | 77 | 78 |
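Since loading a saved quantized diffusion model is not supported yet, a quick way to sanity-check the result is to keep using the in-memory pipeline right after `autoround.quantize()`. The sketch below continues the API usage example above and assumes the in-place fake quantization leaves `pipe` directly runnable:

```python
# `pipe` is the pipeline quantized in place by the API usage example above (fake/qdq weights).
prompt = "a photo of an astronaut riding a horse on the moon"  # any test prompt
image = pipe(prompt, num_inference_steps=20, guidance_scale=7.5).images[0]
image.save("flux_mxfp8_sample.png")
```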
79 | Calibration Dataset 80 | 81 | For diffusion models, we used [**coco2014**]("https://github.com/mlcommons/inference/raw/refs/heads/master/text_to_image/coco2014/captions/captions_source.tsv") calibration dataset as our default. 82 | 83 | If users want to use their own dataset, please build the dataset file in ".tsv" format following below structure and use it through argument --dataset (tsv file): 84 | ``` 85 | id caption 86 | 0 YOUR_PROMPT 87 | 1 YOUR_PROMPT 88 | ... ... 89 | ``` 90 | - `id`: The id used to map generated images and prompts. 91 | - `caption`: The text prompt used to generate the images. 92 | 93 | 94 |
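As a concrete illustration, such a `.tsv` file can be produced with a few lines of Python (the file name and prompts below are placeholders):

```python
import csv

prompts = [
    "a photo of an astronaut riding a horse",
    "a bowl of fruit on a wooden table",
]

with open("my_calib.tsv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["id", "caption"])  # header row matching the structure shown above
    for i, caption in enumerate(prompts):
        writer.writerow([i, caption])
```

The resulting `my_calib.tsv` can then be passed through the `--dataset` argument (or `dataset="my_calib.tsv"` in the API).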
95 | -------------------------------------------------------------------------------- /auto_round/export/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from auto_round.export.register import EXPORT_FORMAT, PACKING_LAYER_WITH_FORMAT, register_format, register_layer_packing 16 | 17 | 18 | @register_format("auto_gptq") 19 | def _save_quantized_as_autogptq(*args, **kwargs): 20 | from auto_round.export.export_to_autogptq.export import save_quantized_as_autogptq 21 | 22 | return save_quantized_as_autogptq(*args, **kwargs) 23 | 24 | 25 | @register_format("itrex") 26 | def _save_quantized_as_itrex(*args, **kwargs): 27 | from auto_round.export.export_to_itrex.export import save_quantized_as_itrex 28 | 29 | return save_quantized_as_itrex(*args, **kwargs) 30 | 31 | 32 | @register_format("itrex_xpu") 33 | def _save_quantized_as_itrex_xpu(*args, **kwargs): 34 | from auto_round.export.export_to_itrex.export import save_quantized_as_itrex_xpu 35 | 36 | return save_quantized_as_itrex_xpu(*args, **kwargs) 37 | 38 | 39 | @register_format("auto_round") 40 | def _save_quantized_as_autoround(*args, **kwargs): 41 | from auto_round.export.export_to_autoround.export import save_quantized_as_autoround 42 | 43 | return save_quantized_as_autoround(*args, **kwargs) 44 | 45 | 46 | @register_format("auto_awq") 47 | def _save_quantized_as_autoawq(*args, **kwargs): 48 | from auto_round.export.export_to_awq.export import save_quantized_as_autoawq 49 | 50 | return save_quantized_as_autoawq(*args, **kwargs) 51 | 52 | 53 | @register_format("gguf") 54 | def _save_quantized_as_gguf(*args, **kwargs): 55 | from auto_round.export.export_to_gguf.export import save_quantized_as_gguf 56 | 57 | return save_quantized_as_gguf(*args, **kwargs) 58 | 59 | 60 | @register_layer_packing("auto_round") 61 | def _packing_layer_with_autoround(*args, **kwargs): 62 | from auto_round.export.export_to_autoround.export import pack_layer 63 | 64 | return pack_layer(*args, **kwargs) 65 | 66 | 67 | @register_layer_packing("auto_gptq") 68 | def _packing_layer_with_autogptq(*args, **kwargs): 69 | from auto_round.export.export_to_autogptq.export import pack_layer 70 | 71 | return pack_layer(*args, **kwargs) 72 | 73 | 74 | @register_layer_packing("auto_awq") 75 | def _packing_layer_with_autoawq(*args, **kwargs): 76 | from auto_round.export.export_to_awq.export import pack_layer 77 | 78 | return pack_layer(*args, **kwargs) 79 | 80 | 81 | @register_format("llm_compressor") 82 | def _save_quantized_as_llmcompressor(*args, **kwargs): 83 | from auto_round.export.export_to_llmcompressor.export import save_quantized_as_llmcompressor 84 | 85 | return save_quantized_as_llmcompressor(*args, **kwargs) 86 | 87 | 88 | @register_layer_packing("llm_compressor") 89 | def _packing_layer_with_llmcompressor(*args, **kwargs): 90 | from auto_round.export.export_to_llmcompressor.export import 
pack_layer 91 | 92 | return pack_layer(*args, **kwargs) 93 | -------------------------------------------------------------------------------- /test/test_cpu/test_llmc_integration.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme 4 | from llmcompressor import oneshot 5 | from llmcompressor.modifiers.autoround import AutoRoundModifier 6 | from transformers import AutoModelForCausalLM, AutoTokenizer 7 | 8 | from auto_round.calib_dataset import get_dataset 9 | 10 | recipe_str = """ 11 | quant_stage: 12 | quant_modifiers: 13 | AutoRoundModifier: 14 | ignore: ["lm_head"] 15 | iters: 1 16 | config_groups: 17 | group_0: 18 | targets: 19 | - "Linear" 20 | input_activations: null 21 | output_activations: null 22 | weights: 23 | num_bits: 4 24 | type: "int" 25 | symmetric: true 26 | strategy: group 27 | group_size: 128 28 | """ 29 | 30 | recipe_modifier_full = AutoRoundModifier( 31 | ignore=["lm_head"], 32 | iters=1, 33 | config_groups={ 34 | "group_0": QuantizationScheme( 35 | targets=["Linear"], 36 | weights=QuantizationArgs(num_bits=4, strategy="group", group_size=128), 37 | ) 38 | }, 39 | ) 40 | 41 | 42 | @pytest.mark.parametrize( 43 | "recipe", 44 | [ 45 | recipe_str, 46 | recipe_modifier_full, 47 | ], 48 | ) 49 | def test_oneshot_application(recipe, tmp_path): 50 | output = tmp_path / "oneshot_output" 51 | model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" 52 | tokenizer = AutoTokenizer.from_pretrained(model) 53 | dataset = get_dataset( 54 | tokenizer=tokenizer, 55 | seqlen=16, 56 | nsamples=2, 57 | ) 58 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 59 | 60 | oneshot( 61 | model=model, 62 | dataset=dataset, 63 | output_dir=output, 64 | recipe=recipe, 65 | ) 66 | model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map=device) 67 | 68 | # Check that the model is quantized 69 | # for compression_config - decompress() will attach a quantization_config 70 | # to the model as we decompress right away 71 | # for quantization_config - we have CompressedLinear which will only 72 | # decompress on the forward pass and does not call decompress(). 
Results 73 | # in a slightly different parameter tree to access the quant config 74 | quantization_config = model_loaded.config.quantization_config.quantization_config 75 | assert quantization_config is not None 76 | 77 | # check config is set properly 78 | assert "lm_head" in quantization_config.ignore 79 | assert len(quantization_config.config_groups) == 1 80 | quant_scheme = quantization_config.config_groups["group_0"] 81 | assert isinstance(quant_scheme, QuantizationScheme) 82 | 83 | weight_args = quantization_config.config_groups["group_0"].weights 84 | assert isinstance(weight_args, QuantizationArgs) 85 | assert weight_args.num_bits == 4 86 | 87 | # Check a specific layer is quantized 88 | targeted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj 89 | assert hasattr(targeted_linear_layer, "quantization_scheme") 90 | 91 | # Check lm-head is not quantized 92 | not_targeted = model_loaded.lm_head 93 | assert not hasattr(not_targeted, "quantization_scheme") 94 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.codespell] 2 | skip = 'pyproject.toml,.azure-pipelines/scripts/codeScan/codespell/autoround_dict.txt' 3 | ignore-words = ".azure-pipelines/scripts/codeScan/codespell/autoround_dict.txt" 4 | 5 | [tool.isort] 6 | profile = "black" 7 | line_length = 120 8 | known_first_party = ["auto_round", "auto_round_extension"] 9 | extend_skip_glob = ["**/__init__.py"] 10 | 11 | [tool.black] 12 | line-length = 120 13 | 14 | [tool.typos] 15 | [tool.typos.files] 16 | extend-exclude = [ 17 | ".azure-pipelines/scripts/codeScan/codespell/autoround_dict.txt" 18 | ] 19 | [tool.typos.default.extend-words] 20 | ue = "ue" 21 | endianess = "endianess" 22 | 23 | [tool.ruff] 24 | # Exclude a variety of commonly ignored directories. 25 | exclude = [ 26 | ".bzr", 27 | ".direnv", 28 | ".eggs", 29 | ".git", 30 | ".git-rewrite", 31 | ".hg", 32 | ".ipynb_checkpoints", 33 | ".mypy_cache", 34 | ".nox", 35 | ".pants.d", 36 | ".pyenv", 37 | ".pytest_cache", 38 | ".pytype", 39 | ".ruff_cache", 40 | ".svn", 41 | ".tox", 42 | ".venv", 43 | ".vscode", 44 | "__pypackages__", 45 | "_build", 46 | "buck-out", 47 | "build", 48 | "dist", 49 | "node_modules", 50 | "site-packages", 51 | "venv", 52 | ] 53 | 54 | # Same as Black. 55 | line-length = 120 56 | indent-width = 4 57 | 58 | # Assume Python 3.10 59 | target-version = "py310" 60 | 61 | [tool.ruff.lint] 62 | # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. 63 | # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or 64 | # McCabe complexity (`C901`) by default. 65 | select = ["E4", "E7", "E9", "F", "NPY", "FURB"] 66 | ignore = [ 67 | "E402", # Module level import not at top of file 68 | "E501", # Line too long (121 > 120 characters) 69 | "E721", # Do not compare types, use isinstance() 70 | "E722", # Do not use bare except 71 | "E731", # Do not assign a lambda expression, use a def 72 | "E741", # Do not use variables named ‘l’, ‘O’, or ‘I’ 73 | "F401", # {name} imported but unused 74 | "F403", # from {name} import * used; unable to detect undefined names 75 | "F841", # Local variable is assigned to but never used{name} 76 | ] 77 | 78 | # Allow fix for all enabled rules (when `--fix`) is provided. 79 | fixable = ["ALL"] 80 | unfixable = [] 81 | 82 | # Allow unused variables when underscore-prefixed. 
83 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 84 | 85 | [tool.ruff.format] 86 | # Like Black, use double quotes for strings. 87 | quote-style = "double" 88 | 89 | # Like Black, indent with spaces, rather than tabs. 90 | indent-style = "space" 91 | 92 | # Like Black, respect magic trailing commas. 93 | skip-magic-trailing-comma = false 94 | 95 | # Like Black, automatically detect the appropriate line ending. 96 | line-ending = "auto" 97 | 98 | # Enable auto-formatting of code examples in docstrings. Markdown, 99 | # reStructuredText code/literal blocks and doctests are all supported. 100 | # 101 | # This is currently disabled by default, but it is planned for this 102 | # to be opt-out in the future. 103 | docstring-code-format = false 104 | 105 | # Set the line length limit used when formatting code snippets in 106 | # docstrings. 107 | # 108 | # This only has an effect when the `docstring-code-format` setting is 109 | # enabled. 110 | docstring-code-line-length = "dynamic" 111 | -------------------------------------------------------------------------------- /auto_round_extension/vllm_ext/auto_round_ext.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from typing import Any 16 | 17 | import torch 18 | from vllm.logger import init_logger 19 | from vllm.model_executor.layers.fused_moe import FusedMoE 20 | from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod 21 | from vllm.model_executor.layers.quantization.auto_round import AutoRoundConfig as _BaseAutoRoundConfig 22 | 23 | from auto_round.schemes import QuantizationScheme 24 | from auto_round_extension.vllm_ext.quant_method_linear import AutoRoundQuantLinearMethod 25 | from auto_round_extension.vllm_ext.quant_method_moe import AutoRoundMoEMethod 26 | 27 | logger = init_logger(__name__) 28 | 29 | 30 | class AutoRoundExtensionConfig(_BaseAutoRoundConfig): 31 | SUPPORTED_DTYPES = _BaseAutoRoundConfig.SUPPORTED_DTYPES.union({"mx_fp"}) 32 | SUPPORTED_FORMATS = _BaseAutoRoundConfig.SUPPORTED_FORMATS.union({"auto_round:llm_compressor"}) 33 | 34 | def get_quant_method(self, layer: torch.nn.Module, prefix: str): 35 | # FIXME: (yi) make it compatible with `AutoRoundConfig` 36 | from vllm.attention.layer import Attention 37 | 38 | if isinstance(layer, Attention): 39 | from auto_round_extension.vllm_ext.kv_cache import AutoRoundKVCacheMethod 40 | 41 | return AutoRoundKVCacheMethod(self) 42 | if isinstance(layer, FusedMoE): 43 | quant_method = AutoRoundMoEMethod.get_moe_method(self, layer, prefix) 44 | return quant_method 45 | elif isinstance(layer, LinearBase): 46 | return AutoRoundQuantLinearMethod.get_method(self, layer, prefix) 47 | else: 48 | return None 49 | 50 | @staticmethod 51 | def _parse_quant_scheme(config: dict): 52 | quant_scheme_attrs = QuantizationScheme.get_attributes() 53 | filter_config = {key: value for key, value in config.items() if key in quant_scheme_attrs} 54 | quant_scheme = QuantizationScheme.from_dict(filter_config) 55 | return quant_scheme 56 | 57 | @classmethod 58 | def from_config(cls, config: dict[str, Any]) -> _BaseAutoRoundConfig: 59 | ar_config = super().from_config(config) 60 | # TODO: (yi) refine below implementation 61 | quant_scheme = AutoRoundExtensionConfig._parse_quant_scheme(config) 62 | layer_schemes = {} 63 | layer_schemes = {} # ensure dict 64 | extra_config = getattr(ar_config, "extra_config", None) 65 | if extra_config is not None: 66 | for layer_name, layer_config in extra_config.items(): 67 | layer_schemes[layer_name] = AutoRoundExtensionConfig._parse_quant_scheme(layer_config) 68 | ar_config.quant_scheme = quant_scheme 69 | ar_config.layer_schemes = layer_schemes 70 | return ar_config 71 | 72 | 73 | # Patch vLLM’s AutoRoundConfig at import time 74 | import vllm.model_executor.layers.quantization.auto_round as _auto_round_module 75 | 76 | _auto_round_module.AutoRoundConfig = AutoRoundExtensionConfig 77 | -------------------------------------------------------------------------------- /auto_round/modelling/llama4.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Note: adapted from # https://github.com/vllm-project/llm-compressor/blob/main/src/llmcompressor/modeling/llama4.py 15 | 16 | __all__ = ["get_replacement_info"] 17 | 18 | 19 | import torch 20 | from transformers.modeling_utils import no_init_weights 21 | from transformers.models.llama4.modeling_llama4 import Llama4TextMLP 22 | 23 | from auto_round.utils import unsupported_meta_device 24 | 25 | 26 | class SequentialLlama4TextExperts(torch.nn.ModuleList): 27 | def __init__(self, config, original): 28 | self.num_experts = original.gate_up_proj.shape[0] 29 | with no_init_weights(): 30 | super().__init__([Llama4TextMLP(config) for _ in range(self.num_experts)]) 31 | 32 | if not unsupported_meta_device(original): 33 | intermediate_size = original.down_proj.shape[1] 34 | 35 | for i in range(self.num_experts): 36 | gate_up = original.gate_up_proj[i] 37 | down = original.down_proj[i] 38 | gate_proj = gate_up[:, :intermediate_size] 39 | up_proj = gate_up[:, intermediate_size:] 40 | 41 | self[i].gate_proj.weight.data.copy_(gate_proj.t()) 42 | self[i].up_proj.weight.data.copy_(up_proj.t()) 43 | self[i].down_proj.weight.data.copy_(down.t()) 44 | 45 | 46 | class SequentialLlama4TextMoe(torch.nn.Module): 47 | def __init__(self, config, original): 48 | super().__init__() 49 | self.top_k = config.num_experts_per_tok 50 | self.hidden_dim = config.hidden_size 51 | self.num_experts = config.num_local_experts 52 | self.experts = SequentialLlama4TextExperts(config, original.experts) 53 | self.router = original.router 54 | self.shared_expert = original.shared_expert 55 | 56 | def forward(self, hidden_states: torch.Tensor): 57 | hidden_states = hidden_states.reshape(-1, self.hidden_dim) 58 | router_logits = self.router(hidden_states) 59 | if isinstance(router_logits, tuple): 60 | router_scores, router_logits = router_logits 61 | router_scores = router_scores.t() 62 | else: 63 | # transformers < 4.54.0 only returns router_logits 64 | router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=1) 65 | 66 | router_scores = ( 67 | torch.full_like(router_logits, float("-inf")) 68 | .scatter_(1, router_indices, router_top_value) 69 | .transpose(0, 1) 70 | ) 71 | router_scores = torch.sigmoid(router_scores.float()).to(hidden_states.dtype) 72 | 73 | out = self.shared_expert(hidden_states) 74 | for i in range(self.num_experts): 75 | out += self.experts[i](hidden_states) * router_scores[i].reshape(-1, 1) 76 | 77 | return out, router_logits 78 | 79 | 80 | def get_replacement_info(config): 81 | return SequentialLlama4TextMoe, config.get_text_config(), "Llama4TextMoe" 82 | --------------------------------------------------------------------------------
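A usage sketch for `get_replacement_info` defined in `llama4.py` above (a hypothetical traversal written for illustration, not the library's own replacement routine):

```python
import torch

from auto_round.modelling.llama4 import get_replacement_info


def replace_llama4_moe(model: torch.nn.Module, config) -> None:
    """Swap every Llama4TextMoe module in `model` for the sequential version defined above."""
    replacement_cls, text_config, target_name = get_replacement_info(config)
    # Collect matching module names first so the tree is not mutated while iterating.
    targets = [name for name, m in model.named_modules() if m.__class__.__name__ == target_name]
    for name in targets:
        original = model.get_submodule(name)
        parent_name, _, child_name = name.rpartition(".")
        parent = model.get_submodule(parent_name) if parent_name else model
        setattr(parent, child_name, replacement_cls(text_config, original))
```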