├── .azure-pipelines ├── code-scan.yml ├── docker │ ├── Dockerfile.devel │ └── DockerfileCodeScan.devel ├── license_template.txt ├── scripts │ ├── change_color.sh │ ├── codeScan │ │ ├── bandit │ │ │ └── bandit.sh │ │ ├── codespell │ │ │ └── autoround_dict.txt │ │ └── pylint │ │ │ └── pylint.sh │ └── ut │ │ ├── .coverage │ │ ├── collect_log.sh │ │ ├── compare_coverage.sh │ │ ├── run_ut.sh │ │ └── run_ut_hpu.sh ├── template │ ├── code-scan-template.yml │ ├── docker-template.yml │ └── ut-template.yml ├── unit-test-hpu.yml └── unit-test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── SECURITY.md ├── auto_round ├── __init__.py ├── __main__.py ├── autoround.py ├── calib_dataset.py ├── data_type │ ├── __init__.py │ ├── fp8.py │ ├── int.py │ ├── mxfp.py │ ├── nvfp.py │ ├── register.py │ ├── utils.py │ └── w4fp8.py ├── eval │ ├── __init__.py │ └── evaluation.py ├── export │ ├── __init__.py │ ├── export_to_autogptq │ │ ├── __init__.py │ │ ├── export.py │ │ └── qlinear_triton.py │ ├── export_to_autoround │ │ ├── __init__.py │ │ ├── export.py │ │ └── qlinear_triton_act.py │ ├── export_to_awq │ │ ├── __init__.py │ │ ├── export.py │ │ └── utils.py │ ├── export_to_gguf │ │ ├── __init__.py │ │ ├── config.py │ │ ├── convert.py │ │ ├── export.py │ │ ├── quant_cpu.py │ │ ├── quant_gpu.py │ │ └── utils.py │ ├── export_to_itrex │ │ ├── __init__.py │ │ ├── bit_packer.py │ │ ├── config.py │ │ ├── export.py │ │ └── model_wrapper.py │ └── register.py ├── inference │ ├── __init__.py │ ├── auto_quantizer.py │ ├── backend.py │ └── convert_model.py ├── low_cpu_mem │ ├── __init__.py │ ├── load.py │ ├── modified_pickle.py │ └── utils.py ├── mllm │ ├── README.md │ ├── __init__.py │ ├── autoround_mllm.py │ ├── eval.py │ ├── mllm_dataset.py │ ├── processor.py │ ├── template.py │ ├── templates │ │ ├── cogvlm2.json │ │ ├── default.json │ │ ├── llava.json │ │ └── phi3_v.json │ └── utils.py ├── script │ ├── __init__.py │ ├── llm.py │ └── mllm.py ├── sign_sgd.py ├── special_model_handler.py ├── testing_utils.py ├── utils.py ├── version.py └── wrapper.py ├── auto_round_extension ├── __init__.py ├── cuda │ ├── __init__.py │ └── gptqmodel_marlin.py ├── hpu │ ├── __init__.py │ ├── qlinear_hpu.py │ └── qlinear_hpu_gptq.py ├── ipex │ ├── __init__.py │ ├── qlinear_ipex_awq.py │ └── qlinear_ipex_gptq.py ├── qbits │ ├── __init__.py │ ├── qbits_awq.py │ ├── qlinear_qbits.py │ └── qlinear_qbits_gptq.py ├── torch │ ├── __init__.py │ ├── qlinear_torch.py │ └── qlinear_torch_zp.py └── triton │ ├── __init__.py │ ├── qlinear_tritonv2.py │ ├── qlinear_tritonv2_zp.py │ ├── triton_utils │ ├── __init__.py │ ├── custom_autotune.py │ ├── dequant.py │ ├── kernels.py │ └── mixin.py │ └── triton_utils_zp │ ├── __init__.py │ ├── custom_autotune.py │ ├── dequant.py │ ├── kernels.py │ └── mixin.py ├── docs ├── DeepSeek-R1-0528-int2-mixed-sym-inc.md ├── DeepSeek-R1-0528-int4-asym-awq-inc.md ├── DeepSeek-R1-0528-int4-sym-gptq-inc.md ├── Llama-2-7b-chat-hf-asym-recipe.md ├── Llama-3.2-11B-Vision-Instruct-sym.md ├── Meta-Llama-3-8B-Instruct-asym-recipe.md ├── Mistral-7B-Instruct-v0.2-asym-recipe.md ├── Mistral-7B-v0.1-asym-recipe.md ├── Mixtral-8x7B-Instruct-v0.1-asym-recipe.md ├── Mixtral-8x7B-v0.1-asym-acc.md ├── Phi-3.5-vision-instruct-sym.md ├── Qwen1.5-7B-Chat-acc.md ├── Qwen2-VL-7B-Instruct-sym.md ├── Qwen2.5-14B-Instruct-sym.md ├── Qwen2.5-32B-Instruct-sym.md ├── Qwen2.5-72B-Instruct-sym.md ├── Qwen2.5-7B-Instruct-sym.md ├── Qwen3-14B-sym-recipe.md ├── 
Qwen3-8B-sym-recipe.md ├── Yi-6B-Chat-asym-recipe.md ├── acc.md ├── baichuan2-7b-cha-asym-recipe.md ├── bloom-3B-asym-recipe.md ├── cogvlm2-llama3-chat-19B-sym.md ├── falcon-7b-asym-recipe.md ├── full_range_sym.md ├── gemma-2b-asym-recipe.md ├── gemma-7b-asym-recipe.md ├── gemma-7b-it-asym-recipe.md ├── gguf_accuracy.md ├── glm-4-9b-chat-recipe.md ├── gpt-j-6B-asym-recipe.md ├── imgs │ ├── autoround_overview.png │ ├── full_range_sym.png │ └── norm_bias_overview.png ├── llava-v1.5-7b-sym.md ├── neural-chat-7b-v3-1-asym-recipe.md ├── neural-chat-7b-v3-3-asym-recipe.md ├── opt-2.7b-asym-recipe.md ├── phi-2-old-sym-recipe.md ├── step_by_step.md ├── tips_and_tricks.md └── tuning_norm_bias.md ├── pyproject.toml ├── requirements-cpu.txt ├── requirements-lib.txt ├── requirements.txt ├── setup.cfg ├── setup.py ├── test ├── test_cpu │ ├── _test_helpers.py │ ├── conftest.py │ ├── requirements.txt │ ├── test_act_quantization.py │ ├── test_auto_round_hpu_only.py │ ├── test_autoopt.py │ ├── test_autoround.py │ ├── test_autoround_acc.py │ ├── test_autoround_export_to_itrex.py │ ├── test_basic_usage.py │ ├── test_block_names.py │ ├── test_calib_dataset.py │ ├── test_conv1d.py │ ├── test_export.py │ ├── test_generation.py │ ├── test_gguf_format.py │ ├── test_hpu.py │ ├── test_load_awq_gptq.py │ ├── test_low_cpu_mem.py │ ├── test_mllm.py │ ├── test_script.py │ ├── test_utils.py │ └── test_woq_linear.py ├── test_cuda │ ├── _test_helpers.py │ ├── requirements.txt │ ├── requirements_vlm.txt │ ├── test_2_3bits.py │ ├── test_auto_round_format.py │ ├── test_calib_dataset.py │ ├── test_conv1d.py │ ├── test_exllamav2_backend.py │ ├── test_export.py │ ├── test_get_block_name.py │ ├── test_gguf.py │ ├── test_main_func.py │ ├── test_marlin_backend.py │ ├── test_multiple_card.py │ ├── test_multiple_card_calib.py │ ├── test_qbits.py │ ├── test_support_vlms.py │ ├── test_triton_backend.py │ └── test_vlms.py └── test_xpu │ └── test_autoround.py └── third-party-programs.txt /.azure-pipelines/code-scan.yml: -------------------------------------------------------------------------------- 1 | trigger: none 2 | 3 | pr: 4 | autoCancel: true 5 | drafts: false 6 | branches: 7 | include: 8 | - main 9 | paths: 10 | include: 11 | - auto_round 12 | - setup.py 13 | - requirements.txt 14 | - .azure-pipelines/code-scan.yml 15 | - .azure-pipelines/scripts/codeScan 16 | 17 | pool: 18 | vmImage: "ubuntu-latest" 19 | 20 | variables: 21 | CODE_SCAN_LOG_PATH: ".azure-pipelines/scripts/codeScan/scanLog" 22 | 23 | stages: 24 | 25 | - stage: BanditCodeScan 26 | displayName: Bandit Code Scan 27 | dependsOn: [] 28 | jobs: 29 | - job: Bandit 30 | displayName: Bandit 31 | steps: 32 | - template: template/code-scan-template.yml 33 | parameters: 34 | codeScanFileName: "bandit" 35 | uploadPath: "bandit.log" 36 | 37 | - stage: PylintCodeScan 38 | displayName: Pylint Code Scan 39 | dependsOn: [] 40 | jobs: 41 | - job: Pylint 42 | displayName: Pylint 43 | steps: 44 | - template: template/code-scan-template.yml 45 | parameters: 46 | codeScanFileName: "pylint" 47 | uploadPath: "pylint.json" 48 | -------------------------------------------------------------------------------- /.azure-pipelines/docker/Dockerfile.devel: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ARG UBUNTU_VER=22.04 16 | FROM ubuntu:${UBUNTU_VER} as devel 17 | 18 | # See http://bugs.python.org/issue19846 19 | ENV LANG C.UTF-8 20 | 21 | RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \ 22 | python3 \ 23 | python3-pip \ 24 | python3-dev \ 25 | python3-distutils \ 26 | autoconf \ 27 | build-essential \ 28 | git \ 29 | libgl1-mesa-glx \ 30 | libglib2.0-0 \ 31 | numactl \ 32 | time \ 33 | wget \ 34 | bc \ 35 | jq \ 36 | vim 37 | 38 | RUN ln -sf $(which python3) /usr/bin/python 39 | 40 | ARG USER_ID=1000 41 | ARG GROUP_ID=1000 42 | 43 | RUN groupadd -g ${GROUP_ID} hostgroup && \ 44 | useradd -m -u ${USER_ID} -g ${GROUP_ID} hostuser 45 | 46 | USER hostuser 47 | 48 | RUN python -m pip install --no-cache-dir --upgrade pip 49 | RUN python -m pip install --no-cache-dir setuptools 50 | 51 | RUN pip list 52 | 53 | WORKDIR / 54 | 55 | -------------------------------------------------------------------------------- /.azure-pipelines/docker/DockerfileCodeScan.devel: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2024 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | ARG UBUNTU_VER=22.04 17 | FROM ubuntu:${UBUNTU_VER} as devel 18 | 19 | # See http://bugs.python.org/issue19846 20 | ENV LANG C.UTF-8 21 | 22 | RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \ 23 | aspell \ 24 | aspell-en \ 25 | python3 \ 26 | python3-pip \ 27 | python3-dev \ 28 | python3-distutils \ 29 | autoconf \ 30 | build-essential \ 31 | wget 32 | 33 | RUN ln -sf $(which python3) /usr/bin/python 34 | 35 | ARG USER_ID=1000 36 | ARG GROUP_ID=1000 37 | 38 | RUN groupadd -g ${GROUP_ID} hostgroup && \ 39 | useradd -m -u ${USER_ID} -g ${GROUP_ID} hostuser 40 | 41 | USER hostuser 42 | 43 | RUN python -m pip install --no-cache-dir pylint==2.12.1\ 44 | bandit 45 | 46 | WORKDIR / 47 | -------------------------------------------------------------------------------- /.azure-pipelines/license_template.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024 Intel Corporation 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/change_color.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------- general approach start---------------- 4 | 5 | # 1. import this file: 6 | # source path/change_color.sh 7 | # 2. use COLOR/BG: 8 | # $VARIABLE_NAME && out_put_content && $RESET 9 | # 3. COLOR + BG: 10 | # $COLOR/BG_VARIABLE_NAME && $BG/COLOR_VARIABLE_NAME && out_put_content && $RESET 11 | # 4. custom 12 | # abbreviation(change number) 13 | # txt number range (30, 37) 14 | # bg number range (40, 47) 15 | # special effects number range (1, 7) 16 | # echo -en \\E[number1 + ; + number2 + ; + number3 + m" 17 | # e.g - BG_GRAY+LIGHT_RED = "echo -en \\E[47;31m" 18 | 19 | # -------------- general approach end----------------== 20 | 21 | 22 | # general setting 23 | # ------------- light_color start---------------- 24 | # black 25 | LIGHT_BLACK="echo -en \\E[30m" 26 | # red 27 | LIGHT_RED="echo -en \\E[31m" 28 | # green 29 | LIGHT_GREEN="echo -en \\E[32m" 30 | # yellow 31 | LIGHT_YELLOW="echo -en \\E[33m" 32 | # blue 33 | LIGHT_BLUE="echo -en \\E[34m" 34 | # purple 35 | LIGHT_PURPLE="echo -en \\E[35m" 36 | # cyan 37 | LIGHT_CYAN="echo -en \\E[36m" 38 | # gray 39 | LIGHT_GRAY="echo -en \\E[37m" 40 | # ------------- light_color end---------------- 41 | 42 | # ------------- bold_color start---------------- 43 | # black 44 | BOLD_BLACK="echo -en \\E[1;30m" 45 | # red 46 | BOLD_RED="echo -en \\E[1;31m" 47 | # green 48 | BOLD_GREEN="echo -en \\E[1;32m" 49 | # yellow 50 | BOLD_YELLOW="echo -en \\E[1;33m" 51 | # blue 52 | BOLD_BLUE="echo -en \\E[1;34m" 53 | # purple 54 | BOLD_PURPLE="echo -en \\E[1;35m" 55 | # cyan 56 | BOLD_CYAN="echo -en \\E[1;36m" 57 | # gray 58 | BOLD_GRAY="echo -en \\E[1;37m" 59 | # ------------- bold_color end---------------- 60 | 61 | # ------------- background_color start---------------- 62 | # black 63 | BG_BLACK="echo -en \\E[40m" 64 | # red 65 | BG_RED="echo -en \\E[41m" 66 | # green 67 | BG_GREEN="echo -en \\E[42m" 68 | # yellow 69 | BG_YELLOW="echo -en \\E[43m" 70 | # blue 71 | BG_BLUE="echo -en \\E[44m" 72 | # purple 73 | BG_PURPLE="echo -en \\E[45m" 74 | # cyan 75 | BG_CYAN="echo -en \\E[46m" 76 | # gray 77 | BG_GRAY="echo -en \\E[47m" 78 | # ------------- background_color end---------------- 79 | 80 | # close 81 | RESET="echo -en \\E[0m" 82 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/codeScan/bandit/bandit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for var in "$@" 4 | do 5 | case $var in 6 | --scan_module=*) 7 | scan_module=$(echo $var |cut -f2 -d=) 8 | ;; 9 | esac 10 | done 11 | 12 | source /auto-round/.azure-pipelines/scripts/change_color.sh 13 | RESET="echo -en \\E[0m \\n" # close color 14 | 15 | log_dir="/auto-round/.azure-pipelines/scripts/codeScan/scanLog" 16 | mkdir -p $log_dir 17 | 18 | python -m bandit -r -lll -iii "/auto-round/${scan_module}" >$log_dir/bandit.log 19 | 
exit_code=$? 20 | 21 | $BOLD_YELLOW && echo " ----------------- Current bandit cmd start --------------------------" && $RESET 22 | echo "python -m bandit -r -lll -iii /auto-round/${scan_module} > $log_dir/bandit.log" 23 | $BOLD_YELLOW && echo " ----------------- Current bandit cmd end --------------------------" && $RESET 24 | 25 | $BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------" 26 | cat $log_dir/bandit.log 27 | $BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET 28 | 29 | if [ ${exit_code} -ne 0 ]; then 30 | $BOLD_RED && echo "Error!! Please Click on the artifact button to download and view Bandit error details." && $RESET 31 | exit 1 32 | fi 33 | $BOLD_PURPLE && echo "Congratulations, Bandit check passed!" && $LIGHT_PURPLE && echo " You can click on the artifact button to see the log details." && $RESET 34 | exit 0 35 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/codeScan/codespell/autoround_dict.txt: -------------------------------------------------------------------------------- 1 | endianess -------------------------------------------------------------------------------- /.azure-pipelines/scripts/codeScan/pylint/pylint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for var in "$@" 4 | do 5 | case $var in 6 | --scan_module=*) 7 | scan_module=$(echo $var |cut -f2 -d=) 8 | ;; 9 | esac 10 | done 11 | 12 | source /auto-round/.azure-pipelines/scripts/change_color.sh 13 | RESET="echo -en \\E[0m \\n" # close color 14 | 15 | log_dir="/auto-round/.azure-pipelines/scripts/codeScan/scanLog" 16 | mkdir -p $log_dir 17 | 18 | pip install torch --index-url https://download.pytorch.org/whl/cpu 19 | pip install -r /auto-round/requirements.txt 20 | pip install -r /auto-round/requirements-cpu.txt 21 | 22 | echo "[DEBUG] list pipdeptree..." 23 | pip install pipdeptree 24 | pipdeptree 25 | 26 | python -m pylint -f json --disable=R,C,W,E1129 --enable=line-too-long --max-line-length=120 --extension-pkg-whitelist=numpy --ignored-classes=TensorProto,NodeProto \ 27 | --ignored-modules=tensorflow,keras,torch,torch.quantization,torch.tensor,torchvision,fairseq,mxnet,onnx,onnxruntime,intel_extension_for_pytorch,intel_extension_for_tensorflow,torchinfo,horovod,transformers \ 28 | /auto-round/${scan_module} > $log_dir/pylint.json 29 | 30 | exit_code=$? 31 | 32 | $BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------" && $RESET 33 | cat $log_dir/pylint.json 34 | $BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET 35 | 36 | if [ ${exit_code} -ne 0 ]; then 37 | $BOLD_RED && echo "Error!! Please Click on the artifact button to download and view Pylint error details." && $RESET 38 | exit 1 39 | fi 40 | $BOLD_PURPLE && echo "Congratulations, Pylint check passed!" && $LIGHT_PURPLE && echo " You can click on the artifact button to see the log details." 
&& $RESET 41 | exit 0 42 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/ut/.coverage: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | 4 | [report] 5 | include = 6 | */auto_round/** 7 | */auto_round_extension/** 8 | exclude_lines = 9 | pragma: no cover 10 | raise NotImplementedError 11 | raise TypeError 12 | except ImportError: 13 | except Exception as e: -------------------------------------------------------------------------------- /.azure-pipelines/scripts/ut/run_ut.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -xe 3 | 4 | # install requirements 5 | echo "##[group]set up UT env..." 6 | export TQDM_MININTERVAL=60 7 | pip install pytest-cov pytest-html 8 | pip install -r /auto-round/test/test_cpu/requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu 9 | pip list 10 | # install latest gguf for ut test 11 | git clone https://github.com/ggml-org/llama.cpp.git && cd llama.cpp/gguf-py && pip install . 12 | echo "##[endgroup]" 13 | pip list 14 | 15 | cd /auto-round/test/test_cpu || exit 1 16 | find . -type f -exec sed -i '/sys\.path\.insert(0, "\.\.")/d' {} + 17 | 18 | export LD_LIBRARY_PATH=${HOME}/.local/lib/:$LD_LIBRARY_PATH 19 | export FORCE_BF16=1 20 | export COVERAGE_RCFILE=/auto-round/.azure-pipelines/scripts/ut/.coverage 21 | auto_round_path=$(python -c 'import auto_round; print(auto_round.__path__[0])') 22 | 23 | LOG_DIR=/auto-round/log_dir 24 | mkdir -p ${LOG_DIR} 25 | ut_log_name=${LOG_DIR}/ut.log 26 | 27 | find . -name "test*.py" ! -name "*hpu_only*.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run.sh 28 | cat run.sh 29 | bash run.sh 2>&1 | tee ${ut_log_name} 30 | 31 | cp report.html ${LOG_DIR}/ 32 | cp coverage.xml ${LOG_DIR}/ 33 | 34 | if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then 35 | echo "##[error]Find errors in pytest case, please check the output..." 36 | exit 1 37 | fi 38 | 39 | # if ut pass, collect the coverage file into artifacts 40 | cp .coverage ${LOG_DIR}/.coverage 41 | 42 | echo "UT finished successfully! " 43 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/ut/run_ut_hpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -xe 3 | 4 | # install requirements 5 | echo "set up UT env..." 6 | pip install pytest-cov pytest-html 7 | pip list 8 | 9 | cd /auto-round/test/test_cpu || exit 1 10 | find . -type f -exec sed -i '/sys\.path\.insert(0, "\.\.")/d' {} + 11 | 12 | export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH 13 | export FORCE_BF16=1 14 | export COVERAGE_RCFILE=/auto-round/.azure-pipelines/scripts/ut/.coverage 15 | auto_round_path=$(python -c 'import auto_round; print(auto_round.__path__[0])') 16 | 17 | LOG_DIR=/auto-round/log_dir 18 | mkdir -p ${LOG_DIR} 19 | ut_log_name=${LOG_DIR}/ut.log 20 | 21 | find . -name "test*hpu_only.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_lazy.sh 22 | find . 
-name "test*hpu_only.py" | sed "s,\.\/,python -m pytest --mode compile --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_compile.sh 23 | 24 | cat run_lazy.sh 25 | bash run_lazy.sh 2>&1 | tee ${ut_log_name} 26 | 27 | cat run_compile.sh 28 | bash run_compile.sh 2>&1 | tee ${ut_log_name} 29 | 30 | cp report.html ${LOG_DIR}/ 31 | cp coverage.xml ${LOG_DIR}/ 32 | 33 | if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then 34 | echo "##[error]Find errors in pytest case, please check the output..." 35 | exit 1 36 | fi 37 | 38 | # if ut pass, collect the coverage file into artifacts 39 | cp .coverage ${LOG_DIR}/.coverage 40 | 41 | echo "UT finished successfully! " -------------------------------------------------------------------------------- /.azure-pipelines/template/code-scan-template.yml: -------------------------------------------------------------------------------- 1 | parameters: 2 | - name: codeScanFileName 3 | type: string 4 | - name: uploadPath 5 | type: string 6 | 7 | - name: codeScanContainerName 8 | type: string 9 | default: "codeScan" 10 | - name: scanModule 11 | type: string 12 | default: "auto_round" 13 | 14 | steps: 15 | - template: docker-template.yml 16 | parameters: 17 | dockerConfigName: "commonDockerConfig" 18 | repoName: "code-scan" 19 | repoTag: "1.0" 20 | dockerFileName: "DockerfileCodeScan" 21 | containerName: ${{ parameters.codeScanContainerName }} 22 | 23 | - script: | 24 | docker exec ${{ parameters.codeScanContainerName }} bash -c "bash /auto-round/.azure-pipelines/scripts/codeScan/${{ parameters.codeScanFileName }}/${{ parameters.codeScanFileName }}.sh \ 25 | --scan_module=${{ parameters.scanModule }}" 26 | displayName: "${{ parameters.codeScanFileName }} Check" 27 | 28 | - task: PublishPipelineArtifact@1 29 | condition: succeededOrFailed() 30 | inputs: 31 | targetPath: .azure-pipelines/scripts/codeScan/scanLog/${{ parameters.uploadPath }} 32 | artifact: ${{ parameters.codeScanFileName }} 33 | publishLocation: "pipeline" 34 | displayName: "PublishPipelineArtifact" 35 | 36 | - task: Bash@3 37 | condition: always() 38 | inputs: 39 | targetType: "inline" 40 | script: | 41 | docker exec ${{ parameters.codeScanContainerName }} bash -c "rm -fr /auto-round/* && rm -fr /auto-round/.* || true" 42 | displayName: "Docker clean up" 43 | -------------------------------------------------------------------------------- /.azure-pipelines/template/docker-template.yml: -------------------------------------------------------------------------------- 1 | parameters: 2 | - name: dockerConfigName 3 | type: string 4 | default: "commonDockerConfig" 5 | - name: repoName 6 | type: string 7 | default: "auto-round" 8 | - name: repoTag 9 | type: string 10 | default: "py310" 11 | - name: dockerFileName 12 | type: string 13 | default: "Dockerfile" 14 | - name: containerName 15 | type: string 16 | - name: repo 17 | type: string 18 | default: "https://github.com/intel/auto-round" 19 | - name: imageSource 20 | type: string 21 | default: "build" 22 | 23 | steps: 24 | - task: Bash@3 25 | inputs: 26 | targetType: "inline" 27 | script: | 28 | docker ps -a 29 | if [[ $(docker ps -a | grep -i '${{ parameters.containerName }}'$) ]]; then 30 | docker start ${{ parameters.containerName }} 31 | echo "remove left files through container ..." 
32 | docker exec ${{ parameters.containerName }} bash -c "ls -a /auto-round && rm -fr /auto-round/* && rm -fr /auto-round/.* && ls -a /auto-round || true" 33 | fi 34 | displayName: "Docker clean up" 35 | 36 | - ${{ if eq(parameters.dockerConfigName, 'commonDockerConfig') }}: 37 | - script: | 38 | rm -fr ${BUILD_SOURCESDIRECTORY} || sudo rm -fr ${BUILD_SOURCESDIRECTORY} || true 39 | displayName: "Clean workspace" 40 | 41 | - checkout: self 42 | clean: true 43 | displayName: "Checkout out Repo" 44 | 45 | - ${{ if eq(parameters.dockerConfigName, 'gitCloneDockerConfig') }}: 46 | - script: | 47 | rm -fr ${BUILD_SOURCESDIRECTORY} || sudo rm -fr ${BUILD_SOURCESDIRECTORY} || true 48 | mkdir ${BUILD_SOURCESDIRECTORY} 49 | chmod 777 ${BUILD_SOURCESDIRECTORY} 50 | displayName: "Clean workspace" 51 | 52 | - checkout: none 53 | 54 | - script: | 55 | git clone ${{ parameters.repo }} ${BUILD_SOURCESDIRECTORY} 56 | git config --global --add safe.directory ${BUILD_SOURCESDIRECTORY} 57 | cd ${BUILD_SOURCESDIRECTORY} 58 | git checkout main 59 | displayName: "Checkout out main" 60 | 61 | - ${{ if eq(parameters.imageSource, 'build') }}: 62 | - script: | 63 | docker image prune -a -f 64 | if [[ ! $(docker images | grep -i ${{ parameters.repoName }}:${{ parameters.repoTag }}) ]]; then 65 | docker build --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) \ 66 | -f ${BUILD_SOURCESDIRECTORY}/.azure-pipelines/docker/${{parameters.dockerFileName}}.devel -t ${{ parameters.repoName }}:${{ parameters.repoTag }} . 67 | fi 68 | docker images | grep -i ${{ parameters.repoName }} 69 | if [[ $? -ne 0 ]]; then 70 | echo "NO Such Repo" 71 | exit 1 72 | fi 73 | displayName: "Build develop docker image" 74 | 75 | - ${{ if eq(parameters.imageSource, 'pull') }}: 76 | - script: | 77 | docker pull vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest 78 | displayName: "Pull habana docker image" 79 | 80 | - script: | 81 | docker stop ${{ parameters.containerName }} 82 | docker rm -vf ${{ parameters.containerName }} || true 83 | env | sort 84 | displayName: "Clean docker container" 85 | 86 | - ${{ if ne(parameters.containerName, '') }}: 87 | - task: Bash@3 88 | inputs: 89 | targetType: "inline" 90 | script: | 91 | if [[ "${{ parameters.imageSource }}" == "build" ]]; then 92 | docker run -dit --disable-content-trust --privileged --name=${{ parameters.containerName }} --shm-size="2g" \ 93 | -v ${BUILD_SOURCESDIRECTORY}:/auto-round -v /tf_dataset:/tf_dataset -v /tf_dataset2:/tf_dataset2 \ 94 | ${{ parameters.repoName }}:${{ parameters.repoTag }} 95 | else 96 | docker run -dit --disable-content-trust --privileged --name=${{ parameters.containerName }} --shm-size="2g" \ 97 | --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host \ 98 | -v ${BUILD_SOURCESDIRECTORY}:/auto-round vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest 99 | docker exec ${{ parameters.containerName }} bash -c "ln -sf \$(which python3) /usr/bin/python" 100 | fi 101 | echo "Show the container list after docker run ... 
" 102 | docker ps -a 103 | displayName: "Docker run - ${{ parameters.containerName }} Container" 104 | -------------------------------------------------------------------------------- /.azure-pipelines/template/ut-template.yml: -------------------------------------------------------------------------------- 1 | parameters: 2 | - name: dockerConfigName 3 | type: string 4 | default: "commonDockerConfig" 5 | - name: repo 6 | type: string 7 | default: "https://github.com/intel/auto-round" 8 | - name: utScriptFileName 9 | type: string 10 | - name: uploadPath 11 | type: string 12 | - name: utArtifact 13 | type: string 14 | - name: utTestMode 15 | type: string 16 | default: "coverage" 17 | - name: utContainerName 18 | type: string 19 | default: "AutoRoundUnitTest" 20 | - name: imageSource 21 | type: string 22 | default: "build" 23 | 24 | steps: 25 | - template: docker-template.yml 26 | parameters: 27 | dockerConfigName: ${{ parameters.dockerConfigName }} 28 | repoName: "auto-round" 29 | repoTag: "py310" 30 | dockerFileName: "Dockerfile" 31 | containerName: ${{ parameters.utContainerName }} 32 | repo: ${{ parameters.repo }} 33 | imageSource: ${{ parameters.imageSource }} 34 | 35 | - ${{ if eq(parameters.imageSource, 'build') }}: 36 | - script: | 37 | docker exec ${{ parameters.utContainerName }} bash -c "cd /auto-round \ 38 | && pip install torch==2.7.0 --index-url https://download.pytorch.org/whl/cpu \ 39 | && pip install intel-extension-for-pytorch==2.7.0 \ 40 | && pip install .[cpu] \ 41 | && pip list" 42 | displayName: "Env Setup" 43 | 44 | - ${{ if eq(parameters.imageSource, 'pull') }}: 45 | - script: | 46 | docker exec ${{ parameters.utContainerName }} bash -c "cd /auto-round \ 47 | && python setup.py bdist_wheel lib \ 48 | && pip install dist/*.whl \ 49 | && pip list" 50 | displayName: "HPU Env Setup" 51 | 52 | - script: | 53 | docker exec ${{ parameters.utContainerName }} bash -c "cd /auto-round/.azure-pipelines/scripts \ 54 | && bash ut/${{ parameters.utScriptFileName }}.sh ${{ parameters.utTestMode }}" 55 | displayName: "Run UT" 56 | 57 | - task: PublishPipelineArtifact@1 58 | condition: succeededOrFailed() 59 | inputs: 60 | targetPath: ${{ parameters.uploadPath }} 61 | artifact: ${{ parameters.utArtifact }}_coverage 62 | publishLocation: "pipeline" 63 | 64 | - task: UseDotNet@2 65 | displayName: 'Use .NET Core sdk 7.0.x' 66 | inputs: 67 | version: 7.0.x 68 | 69 | - task: PublishCodeCoverageResults@2 70 | inputs: 71 | summaryFileLocation: ${{ parameters.uploadPath }}/coverage.xml 72 | 73 | - task: Bash@3 74 | condition: always() 75 | inputs: 76 | targetType: "inline" 77 | script: | 78 | docker exec ${{ parameters.utContainerName }} bash -c "rm -rf /auto-round/* && rm -rf /auto-round/.* || true" 79 | docker stop ${{ parameters.utContainerName }} 80 | docker rm -vf ${{ parameters.utContainerName }} || true 81 | displayName: "Docker clean up" 82 | -------------------------------------------------------------------------------- /.azure-pipelines/unit-test-hpu.yml: -------------------------------------------------------------------------------- 1 | trigger: none 2 | 3 | pr: 4 | autoCancel: true 5 | drafts: false 6 | branches: 7 | include: 8 | - main 9 | paths: 10 | include: 11 | - auto_round 12 | - test/test*hpu*' 13 | - setup.py 14 | - requirements-hpu.txt 15 | - .azure-pipelines/scripts/ut 16 | - .azure-pipelines/template/docker-template.yml 17 | - .azure-pipelines/template/ut-template.yml 18 | exclude: 19 | - auto_round/export/export_to_autogptq 20 | - auto_round/export/export_to_awq 21 | 
- "*.md" 22 | - "**/*.md" 23 | 24 | pool: GAUDI 25 | 26 | variables: 27 | IMAGE_NAME: "auto-round" 28 | IMAGE_TAG: "py310" 29 | UPLOAD_PATH: $(Build.SourcesDirectory)/log_dir 30 | DOWNLOAD_PATH: $(Build.SourcesDirectory)/log_dir 31 | ARTIFACT_NAME: "UT_coverage_report" 32 | REPO: $(Build.Repository.Uri) 33 | 34 | stages: 35 | - stage: Unit_test 36 | displayName: Unit Test 37 | dependsOn: [] 38 | jobs: 39 | - job: 40 | displayName: Unit Test 41 | steps: 42 | - template: template/ut-template.yml 43 | parameters: 44 | imageSource: "pull" 45 | dockerConfigName: "commonDockerConfig" 46 | utScriptFileName: "run_ut_hpu" 47 | uploadPath: $(UPLOAD_PATH) 48 | utArtifact: "ut" 49 | 50 | # - stage: Unit_test_baseline 51 | # displayName: Unit Test Baseline 52 | # dependsOn: [] 53 | # jobs: 54 | # - job: 55 | # displayName: Unit Test 56 | # steps: 57 | # - template: template/ut-template.yml 58 | # parameters: 59 | # imageSource: "pull" 60 | # dockerConfigName: "gitCloneDockerConfig" 61 | # utScriptFileName: "run_ut_hpu" 62 | # uploadPath: $(UPLOAD_PATH) 63 | # utArtifact: "ut_baseline" 64 | # repo: $(REPO) 65 | -------------------------------------------------------------------------------- /.azure-pipelines/unit-test.yml: -------------------------------------------------------------------------------- 1 | trigger: none 2 | 3 | pr: 4 | autoCancel: true 5 | drafts: false 6 | branches: 7 | include: 8 | - main 9 | paths: 10 | include: 11 | - auto_round 12 | - auto_round_extension 13 | - test 14 | - setup.py 15 | - requirements.txt 16 | - .azure-pipelines/scripts/ut 17 | - .azure-pipelines/unit-test.yml 18 | - .azure-pipelines/template/ut-template.yml 19 | - .azure-pipelines/template/docker-template.yml 20 | exclude: 21 | - test/test*hpu* 22 | - "*.md" 23 | - "**/*.md" 24 | 25 | pool: ICX-16C 26 | 27 | variables: 28 | IMAGE_NAME: "auto-round" 29 | IMAGE_TAG: "py310" 30 | UPLOAD_PATH: $(Build.SourcesDirectory)/log_dir 31 | DOWNLOAD_PATH: $(Build.SourcesDirectory)/log_dir 32 | ARTIFACT_NAME: "UT_coverage_report" 33 | REPO: $(Build.Repository.Uri) 34 | 35 | stages: 36 | - stage: Unit_test 37 | displayName: Unit Test 38 | dependsOn: [] 39 | jobs: 40 | - job: 41 | displayName: Unit Test 42 | timeoutInMinutes: 240 43 | steps: 44 | - template: template/ut-template.yml 45 | parameters: 46 | dockerConfigName: "commonDockerConfig" 47 | utScriptFileName: "run_ut" 48 | uploadPath: $(UPLOAD_PATH) 49 | utArtifact: "ut" 50 | 51 | # - stage: Unit_test_baseline 52 | # displayName: Unit Test Baseline 53 | # dependsOn: [] 54 | # jobs: 55 | # - job: 56 | # displayName: Unit Test 57 | # timeoutInMinutes: 120 58 | # steps: 59 | # - template: template/ut-template.yml 60 | # parameters: 61 | # dockerConfigName: "gitCloneDockerConfig" 62 | # utScriptFileName: "run_ut" 63 | # uploadPath: $(UPLOAD_PATH) 64 | # utArtifact: "ut_baseline" 65 | # repo: $(REPO) 66 | 67 | # - stage: Coverage 68 | # displayName: "Coverage Compare" 69 | # pool: 70 | # vmImage: "ubuntu-latest" 71 | # dependsOn: [Unit_test, Unit_test_baseline] 72 | # jobs: 73 | # - job: CollectDatafiles 74 | # steps: 75 | # - script: | 76 | # if [[ ! $(docker images | grep -i ${IMAGE_NAME}:${IMAGE_TAG}) ]]; then 77 | # docker build -f ${BUILD_SOURCESDIRECTORY}/.azure-pipelines/docker/Dockerfile.devel -t ${IMAGE_NAME}:${IMAGE_TAG} . 78 | # fi 79 | # docker images | grep -i ${IMAGE_NAME} 80 | # if [[ $? 
-ne 0 ]]; then 81 | # echo "NO Such Repo" 82 | # exit 1 83 | # fi 84 | # displayName: "Build develop docker image" 85 | 86 | # - task: DownloadPipelineArtifact@2 87 | # inputs: 88 | # artifact: 89 | # patterns: "*_coverage/.coverage" 90 | # path: $(DOWNLOAD_PATH) 91 | 92 | # - script: | 93 | # echo "--- create container ---" 94 | # docker run -d -it --name="collectLogs" -v ${BUILD_SOURCESDIRECTORY}:/auto-round ${IMAGE_NAME}:${IMAGE_TAG} /bin/bash 95 | # echo "--- docker ps ---" 96 | # docker ps 97 | # echo "--- collect logs ---" 98 | # docker exec collectLogs bash -c "cd /auto-round \ 99 | # && pip install -r requirements.txt \ 100 | # && pip install -vvv --no-build-isolation -e .[cpu] \ 101 | # && pip list" 102 | # docker exec collectLogs /bin/bash +x -c "cd /auto-round/.azure-pipelines/scripts \ 103 | # && bash ut/collect_log.sh" 104 | # displayName: "Collect UT Coverage" 105 | 106 | # - task: PublishPipelineArtifact@1 107 | # condition: succeededOrFailed() 108 | # inputs: 109 | # targetPath: $(UPLOAD_PATH) 110 | # artifact: $(ARTIFACT_NAME) 111 | # publishLocation: "pipeline" 112 | 113 | # - task: Bash@3 114 | # condition: always() 115 | # inputs: 116 | # targetType: "inline" 117 | # script: | 118 | # docker exec collectLogs bash -c "rm -fr /auto-round/* && rm -fr /auto-round/.* || true" 119 | # displayName: "Docker clean up" 120 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vs 2 | .vscode 3 | __pycache__ 4 | *.egg-info/ 5 | build/* 6 | .eggs/ 7 | dist/ 8 | .cache/ 9 | .clangd 10 | CMakeUserPresets.json 11 | tmp_autoround/ 12 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | ci: 2 | autofix_prs: true 3 | autoupdate_schedule: quarterly 4 | 5 | repos: 6 | - repo: https://github.com/Lucas-C/pre-commit-hooks 7 | rev: v1.5.5 8 | hooks: 9 | - id: insert-license 10 | files: | 11 | (?x)^( 12 | auto_round/.*(py|yaml|yml|sh) 13 | )$ 14 | args: 15 | [ 16 | --license-filepath=.azure-pipelines/license_template.txt, 17 | --use-current-year, 18 | --detect-license-in-X-top-lines=40, 19 | --skip-license-insertion-comment=Copyright, 20 | ] 21 | 22 | - repo: https://github.com/codespell-project/codespell 23 | rev: v2.4.1 24 | hooks: 25 | - id: codespell 26 | args: [-w] 27 | additional_dependencies: 28 | - tomli 29 | exclude: | 30 | (?x)^( 31 | examples/.*(txt|patch) 32 | )$ 33 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ### License 4 | 5 | is licensed under the terms in [LICENSE]. By contributing to the project, you agree to the license and copyright terms therein and release your contribution under these terms. 6 | 7 | ### Sign your work 8 | 9 | Please use the sign-off line at the end of the patch. Your signature certifies that you wrote the patch or otherwise have the right to pass it on as an open-source patch. The rules are pretty simple: if you can certify 10 | the below (from [developercertificate.org](http://developercertificate.org/)): 11 | 12 | ``` 13 | Developer Certificate of Origin 14 | Version 1.1 15 | 16 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 
17 | 660 York Street, Suite 102, 18 | San Francisco, CA 94110 USA 19 | 20 | Everyone is permitted to copy and distribute verbatim copies of this 21 | license document, but changing it is not allowed. 22 | 23 | Developer's Certificate of Origin 1.1 24 | 25 | By making a contribution to this project, I certify that: 26 | 27 | (a) The contribution was created in whole or in part by me and I 28 | have the right to submit it under the open source license 29 | indicated in the file; or 30 | 31 | (b) The contribution is based upon previous work that, to the best 32 | of my knowledge, is covered under an appropriate open source 33 | license and I have the right under that license to submit that 34 | work with modifications, whether created in whole or in part 35 | by me, under the same open source license (unless I am 36 | permitted to submit under a different license), as indicated 37 | in the file; or 38 | 39 | (c) The contribution was provided directly to me by some other 40 | person who certified (a), (b) or (c) and I have not modified 41 | it. 42 | 43 | (d) I understand and agree that this project and the contribution 44 | are public and that a record of the contribution (including all 45 | personal information I submit with it, including my sign-off) is 46 | maintained indefinitely and may be redistributed consistent with 47 | this project or the open source license(s) involved. 48 | ``` 49 | 50 | Then you just add a line to every git commit message: 51 | 52 | Signed-off-by: Joe Smith 53 | 54 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 55 | 56 | If you set your `user.name` and `user.email` git configs, you can sign your 57 | commit automatically with `git commit -s`. 58 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | include requirements-cpu.txt 3 | include requirements-lib.txt 4 | exclude test/* 5 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation. 3 | 4 | ## Reporting a Vulnerability 5 | Please report any security vulnerabilities in this project utilizing the guidelines [here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). 6 | -------------------------------------------------------------------------------- /auto_round/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | from .autoround import AutoRound, AutoRoundAdam, AutoRoundOPT 15 | from .mllm import AutoRoundMLLM 16 | from auto_round.utils import LazyImport 17 | 18 | def __getattr__(name): 19 | if name == 'AutoHfQuantizer': 20 | from auto_round.inference.auto_quantizer import AutoHfQuantizer 21 | return AutoHfQuantizer 22 | if name == 'AutoRoundConfig': 23 | from auto_round.inference.auto_quantizer import AutoRoundConfig 24 | return AutoRoundConfig 25 | 26 | raise AttributeError(f"auto-round has no attribute '{name}'") 27 | 28 | from .version import __version__ 29 | -------------------------------------------------------------------------------- /auto_round/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import sys 15 | 16 | def run_eval(): 17 | from auto_round.script.llm import setup_eval_parser 18 | args = setup_eval_parser() 19 | if args.eval_task_by_task: 20 | from auto_round.script.llm import eval_task_by_task 21 | eval_task_by_task( 22 | model=args.model, 23 | device=args.device, 24 | tasks=args.tasks, 25 | batch_size=args.eval_bs, 26 | trust_remote_code=not args.disable_trust_remote_code, 27 | eval_model_dtype=args.eval_model_dtype 28 | ) 29 | else: 30 | from auto_round.script.llm import eval 31 | eval(args) 32 | 33 | 34 | def run(): 35 | if "--eval" in sys.argv: 36 | sys.argv.remove("--eval") 37 | run_eval() 38 | else: 39 | from auto_round.script.llm import setup_parser, tune 40 | args = setup_parser() 41 | tune(args) 42 | 43 | def run_best(): 44 | from auto_round.script.llm import setup_best_parser, tune 45 | args = setup_best_parser() 46 | tune(args) 47 | 48 | def run_light(): 49 | from auto_round.script.llm import setup_light_parser, tune 50 | args = setup_light_parser() 51 | tune(args) 52 | 53 | def run_fast(): 54 | from auto_round.script.llm import setup_fast_parser, tune 55 | args = setup_fast_parser() 56 | tune(args) 57 | 58 | 59 | def run_mllm(): 60 | if "--eval" in sys.argv: 61 | from auto_round.script.mllm import setup_lmeval_parser, eval 62 | sys.argv.remove("--eval") 63 | args = setup_lmeval_parser() 64 | eval(args) 65 | elif "--lmms" in sys.argv: 66 | sys.argv.remove("--lmms") 67 | run_lmms() 68 | else: 69 | from auto_round.script.mllm import setup_parser, tune 70 | args = setup_parser() 71 | tune(args) 72 | 73 | def run_lmms(): 74 | # from auto_round.script.lmms_eval import setup_lmms_args, eval 75 | from auto_round.script.mllm import setup_lmms_parser, lmms_eval 76 | args = setup_lmms_parser() 77 | lmms_eval(args) 78 | 79 | def switch(): 80 | if "--mllm" in sys.argv: 81 | sys.argv.remove("--mllm") 82 | run_mllm() 83 | else: 84 | run() 85 | 86 | if __name__ == '__main__': 87 | switch() 88 | 89 | -------------------------------------------------------------------------------- /auto_round/data_type/__init__.py: -------------------------------------------------------------------------------- 1 | # 
Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import auto_round.data_type.int 16 | import auto_round.data_type.mxfp 17 | import auto_round.data_type.fp8 18 | from auto_round.data_type.register import QUANT_FUNC_WITH_DTYPE 19 | import auto_round.data_type.w4fp8 20 | from auto_round.data_type.utils import get_quant_func 21 | import auto_round.data_type.nvfp 22 | 23 | -------------------------------------------------------------------------------- /auto_round/data_type/register.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | QUANT_FUNC_WITH_DTYPE = {} 17 | 18 | 19 | def register_dtype(name): 20 | """Class decorator to register a EXPORT subclass to the registry. 21 | 22 | Decorator function used before a Pattern subclass. 23 | 24 | Args: 25 | cls (class): The subclass of register. 26 | name: A string. Define the export type. 27 | 28 | Returns: 29 | cls: The class of register. 30 | """ 31 | 32 | def register(dtype): 33 | QUANT_FUNC_WITH_DTYPE[name] = dtype 34 | return dtype 35 | 36 | return register 37 | -------------------------------------------------------------------------------- /auto_round/eval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. -------------------------------------------------------------------------------- /auto_round/eval/evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Optional, Union 16 | 17 | from lm_eval import simple_evaluate as lm_simple_evaluate 18 | import os 19 | 20 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 21 | 22 | from lm_eval.models.huggingface import HFLM 23 | 24 | 25 | def simple_evaluate_user_model( 26 | user_model, 27 | tokenizer, 28 | batch_size: Optional[int] = 1, 29 | max_batch_size: Optional[int] = 64, 30 | eval_model_dtype = "auto", 31 | **kwargs 32 | ): 33 | hflm = HFLM( 34 | pretrained=user_model, 35 | tokenizer=tokenizer, 36 | batch_size=batch_size, 37 | max_batch_size=max_batch_size, 38 | dtype=eval_model_dtype) 39 | return lm_simple_evaluate( 40 | model=hflm, model_args=None, batch_size=batch_size, max_batch_size=max_batch_size, **kwargs) 41 | 42 | 43 | def simple_evaluate( 44 | model, 45 | model_args: Optional[Union[str, dict]] = None, 46 | batch_size: Optional[int] = None, 47 | max_batch_size: Optional[int] = None, 48 | device: Optional[str] = None, 49 | **kwargs): 50 | try: 51 | from auto_round import AutoRoundConfig 52 | except: 53 | from auto_round.inference.auto_quantizer import AutoHfQuantizer 54 | 55 | return lm_simple_evaluate( 56 | model=model, 57 | model_args=model_args, 58 | batch_size=batch_size, 59 | max_batch_size=max_batch_size, 60 | device=device, 61 | **kwargs) 62 | -------------------------------------------------------------------------------- /auto_round/export/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from auto_round.export.register import EXPORT_FORMAT, PACKING_LAYER_WITH_FORMAT, register_format, register_layer_packing 16 | 17 | 18 | @register_format("auto_gptq") 19 | def _save_quantized_as_autogptq(*args, **kwargs): 20 | from auto_round.export.export_to_autogptq.export import save_quantized_as_autogptq 21 | 22 | return save_quantized_as_autogptq(*args, **kwargs) 23 | 24 | 25 | @register_format("itrex") 26 | def _save_quantized_as_itrex(*args, **kwargs): 27 | from auto_round.export.export_to_itrex.export import save_quantized_as_itrex 28 | 29 | return save_quantized_as_itrex(*args, **kwargs) 30 | 31 | 32 | @register_format("itrex_xpu") 33 | def _save_quantized_as_itrex_xpu(*args, **kwargs): 34 | from auto_round.export.export_to_itrex.export import save_quantized_as_itrex_xpu 35 | 36 | return save_quantized_as_itrex_xpu(*args, **kwargs) 37 | 38 | 39 | @register_format("auto_round") 40 | def _save_quantized_as_autoround(*args, **kwargs): 41 | from auto_round.export.export_to_autoround.export import save_quantized_as_autoround 42 | 43 | return save_quantized_as_autoround(*args, **kwargs) 44 | 45 | 46 | @register_format("auto_awq") 47 | def _save_quantized_as_autoawq(*args, **kwargs): 48 | from auto_round.export.export_to_awq.export import save_quantized_as_autoawq 49 | 50 | return save_quantized_as_autoawq(*args, **kwargs) 51 | 52 | @register_format("gguf") 53 | def _save_quantized_as_gguf(*args, **kwargs): 54 | from auto_round.export.export_to_gguf.export import save_quantized_as_gguf 55 | return save_quantized_as_gguf(*args, **kwargs) 56 | 57 | 58 | @register_layer_packing("auto_round") 59 | def _packing_layer_with_autoround(*args, **kwargs): 60 | from auto_round.export.export_to_autoround.export import pack_layer 61 | 62 | return pack_layer(*args, **kwargs) 63 | 64 | 65 | @register_layer_packing("auto_gptq") 66 | def _packing_layer_with_autogptq(*args, **kwargs): 67 | from auto_round.export.export_to_autogptq.export import pack_layer 68 | 69 | return pack_layer(*args, **kwargs) 70 | 71 | 72 | @register_layer_packing("auto_awq") 73 | def _packing_layer_with_autoawq(*args, **kwargs): 74 | from auto_round.export.export_to_awq.export import pack_layer 75 | 76 | return pack_layer(*args, **kwargs) 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /auto_round/export/export_to_autogptq/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /auto_round/export/export_to_autogptq/qlinear_triton.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import math 16 | 17 | import numpy as np 18 | import torch 19 | import torch.nn as nn 20 | import transformers 21 | 22 | class TritonModuleMixin: 23 | @classmethod 24 | def warmup(cls, model, transpose=False, seqlen=2048): 25 | pass 26 | 27 | 28 | class QuantLinear(nn.Module, TritonModuleMixin): 29 | QUANT_TYPE = "triton" 30 | 31 | def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=False, **kwargs): 32 | super().__init__() 33 | if bits not in [2, 4, 8]: 34 | raise NotImplementedError("Only 2,4,8 bits are supported.") 35 | if infeatures % 32 != 0 or outfeatures % 32 != 0: 36 | raise NotImplementedError("in_feature and out_feature must be divisible by 32.") 37 | self.infeatures = infeatures 38 | self.outfeatures = outfeatures 39 | self.bits = bits 40 | self.group_size = group_size if group_size != -1 else infeatures 41 | self.maxq = 2 ** self.bits - 1 42 | 43 | self.register_buffer( 44 | "qweight", 45 | torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32), 46 | ) 47 | self.register_buffer( 48 | "qzeros", 49 | torch.zeros( 50 | ( 51 | math.ceil(infeatures / self.group_size), 52 | outfeatures // 32 * self.bits, 53 | ), 54 | dtype=torch.int32, 55 | ), 56 | ) 57 | self.register_buffer( 58 | "scales", 59 | torch.zeros( 60 | (math.ceil(infeatures / self.group_size), outfeatures), 61 | dtype=torch.float16, 62 | ), 63 | ) 64 | self.register_buffer( 65 | "g_idx", 66 | torch.tensor([i // self.group_size for i in range(infeatures)], dtype=torch.int32), 67 | ) 68 | 69 | if bias: 70 | self.register_buffer("bias", torch.zeros((outfeatures), dtype=torch.float16)) 71 | else: 72 | self.bias = None 73 | 74 | self.trainable = trainable 75 | 76 | def post_init(self): 77 | pass 78 | 79 | def pack(self, linear, scales, zeros, g_idx=None): 80 | scales_t = scales.t().contiguous() 81 | self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx 82 | if linear.bias is not None: 83 | self.bias = linear.bias.clone().half() 84 | self.scales = scales_t.clone().half() 85 | device = "cpu" 86 | if torch.cuda.is_available(): 87 | device = "cuda:0" 88 | elif torch.xpu.is_available(): 89 | device = "xpu:0" 90 | 91 | W = linear.weight.data.to(device).clone() 92 | if isinstance(linear, nn.Conv2d): 93 | W = W.flatten(1) 94 | if isinstance(linear, transformers.pytorch_utils.Conv1D): 95 | W = W.t() 96 | 97 | repeat_scales = scales.to(device).repeat_interleave(self.group_size, 1) 98 | if isinstance(zeros, torch.Tensor): 99 | repeat_zeros = zeros.to(device).repeat_interleave(self.group_size, 1) 100 | intweight = torch.round(W.to(device) / repeat_scales[:, :W.shape[1]] + repeat_zeros[:, :W.shape[1]]).to( 101 | torch.int32) 102 | else: 103 | repeat_zeros = zeros 104 | intweight = torch.round(W.to(device) / repeat_scales[:, :W.shape[1]] + repeat_zeros).to( 105 | torch.int32) 106 | 107 | del repeat_scales 108 | intweight = intweight.reshape(-1, intweight.shape[1] // 32 * self.bits, 32 // self.bits) 109 | order_map = torch.arange(0, 32 // self.bits, device=device) * self.bits 110 | intweight = intweight << order_map 111 | intweight = 
torch.sum(intweight, dim=-1) 112 | 113 | intweight = intweight.t().contiguous().to(torch.int32) 114 | self.qweight = intweight.to("cpu") 115 | 116 | if isinstance(zeros, torch.Tensor): 117 | zeros = zeros.t().contiguous() 118 | zeros -= 1 119 | zeros = zeros.numpy().astype(np.uint32) 120 | qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32) 121 | i = 0 122 | col = 0 123 | while col < qzeros.shape[1]: 124 | for j in range(i, i + (32 // self.bits)): 125 | qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i)) 126 | i += 32 // self.bits 127 | col += 1 128 | 129 | qzeros = qzeros.astype(np.int32) 130 | self.qzeros = torch.from_numpy(qzeros) 131 | else: 132 | zeros -= 1 133 | shape = scales_t.shape 134 | value = 0 135 | for j in range(0, (32 // self.bits)): 136 | value |= zeros << (self.bits * j) 137 | qzeros = np.ones((shape[0], shape[1] // 32 * self.bits), dtype=np.uint32) * value 138 | qzeros = qzeros.astype(np.int32) 139 | self.qzeros = torch.from_numpy(qzeros) 140 | 141 | 142 | __all__ = ["QuantLinear"] -------------------------------------------------------------------------------- /auto_round/export/export_to_autoround/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .export import save_quantized_as_autoround 16 | 17 | -------------------------------------------------------------------------------- /auto_round/export/export_to_awq/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .export import save_quantized_as_autoawq 16 | 17 | 18 | -------------------------------------------------------------------------------- /auto_round/export/export_to_gguf/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /auto_round/export/export_to_gguf/config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | GGUF_CONFIG = {} 16 | 17 | GGUF_CONFIG["gguf:q4_0"] = {"bits": 4, "act_bits": 16, "group_size": 32, "asym": False, "sym": True, "data_type": "int"} 18 | 19 | GGUF_CONFIG["gguf:q4_1"] = { 20 | "bits": 4, 21 | "act_bits": 16, 22 | "group_size": 32, 23 | "asym": True, 24 | "sym": False, 25 | "data_type": "int_asym_float_zp" 26 | } 27 | 28 | GGUF_CONFIG["gguf:q5_0"] = {"bits": 5, "act_bits": 16, "group_size": 32, "asym": False, "sym": True, "data_type": "int"} 29 | 30 | GGUF_CONFIG["gguf:q5_1"] = { 31 | "bits": 5, 32 | "act_bits": 16, 33 | "group_size": 32, 34 | "asym": True, 35 | "sym": False, 36 | "data_type": "int_asym_float_zp" 37 | } 38 | 39 | GGUF_CONFIG["gguf:q8_0"] = {"bits": 8, "act_bits": 16, "group_size": 32, "asym": False, "sym": True, "data_type": "int"} 40 | 41 | 42 | GGUF_CONFIG["gguf:q2_k_s"] = { 43 | "bits": 2, 44 | "act_bits": 16, 45 | "super_group_size": 16, 46 | "super_bits": 4, 47 | "group_size": 16, 48 | "asym": True, 49 | "sym": False, 50 | "data_type": "int_asym_dq" 51 | } 52 | 53 | GGUF_CONFIG["gguf:q3_k_s"] = { 54 | "bits": 3, 55 | "act_bits": 16, 56 | "super_group_size": 16, 57 | "super_bits": 6, 58 | "group_size": 16, 59 | "asym": False, 60 | "sym": True, 61 | "data_type": "int_sym_dq" 62 | } 63 | 64 | GGUF_CONFIG["gguf:q4_k_s"] = { 65 | "bits": 4, 66 | "act_bits": 16, 67 | "super_group_size": 8, 68 | "super_bits": 6, 69 | "group_size": 32, 70 | "asym": True, 71 | "sym": False, 72 | "data_type": "int_asym_dq" 73 | } 74 | 75 | GGUF_CONFIG["gguf:q5_k_s"] = { 76 | "bits": 5, 77 | "act_bits": 16, 78 | "super_group_size": 8, 79 | "super_bits": 6, 80 | "group_size": 32, 81 | "asym": True, 82 | "sym": False, 83 | "data_type": "int_asym_dq" 84 | } 85 | 86 | GGUF_CONFIG["gguf:q6_k"] = GGUF_CONFIG["gguf:q6_k_s"] = { 87 | "bits": 6, 88 | "act_bits": 16, 89 | "super_group_size": 16, 90 | "super_bits": 8, 91 | "group_size": 16, 92 | "asym": False, 93 | "sym": True, 94 | "data_type": "int_sym_dq" 95 | } 96 | -------------------------------------------------------------------------------- /auto_round/export/export_to_gguf/export.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, 
Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import sys 17 | import shutil 18 | import torch 19 | from .convert import Model 20 | from auto_round.utils import logger, LazyImport 21 | from pathlib import Path 22 | import time 23 | 24 | gguf = LazyImport("gguf") 25 | 26 | FTYPE_MAP: dict[str, gguf.LlamaFileType] = { 27 | "f32": gguf.LlamaFileType.ALL_F32, 28 | "f16": gguf.LlamaFileType.MOSTLY_F16, 29 | "bf16": gguf.LlamaFileType.MOSTLY_BF16, 30 | "q4_0": gguf.LlamaFileType.MOSTLY_Q4_0, 31 | "q4_1": gguf.LlamaFileType.MOSTLY_Q4_1, 32 | "q5_0": gguf.LlamaFileType.MOSTLY_Q5_0, 33 | "q5_1": gguf.LlamaFileType.MOSTLY_Q5_1, 34 | "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, 35 | "q2_k_s": gguf.LlamaFileType.MOSTLY_Q2_K_S, 36 | "q3_k_s": gguf.LlamaFileType.MOSTLY_Q3_K_S, 37 | "q4_k_s": gguf.LlamaFileType.MOSTLY_Q4_K_S, 38 | "q5_k_s": gguf.LlamaFileType.MOSTLY_Q5_K_S, 39 | "q6_k": gguf.LlamaFileType.MOSTLY_Q6_K, 40 | "q6_k_s": gguf.LlamaFileType.MOSTLY_Q6_K, 41 | "auto": gguf.LlamaFileType.GUESSED, 42 | } 43 | 44 | def save_quantized_as_gguf(output_dir, backend="gguf:q4_0", **kwargs): 45 | """Export the model to gguf format.""" 46 | if output_dir is not None and os.path.exists(output_dir): 47 | logger.warning(f"{output_dir} already exists, this may cause model conflict") 48 | 49 | st = time.time() 50 | 51 | model = kwargs["model"] 52 | tokenizer = kwargs.get("tokenizer", None) 53 | config = model.config 54 | 55 | tmp_work_dir = Path(os.path.join(output_dir, 'tmp_dir')) 56 | if tokenizer is not None: 57 | tokenizer.save_pretrained(tmp_work_dir) 58 | config.save_pretrained(tmp_work_dir) 59 | 60 | with torch.inference_mode(): 61 | hparams = Model.load_hparams(tmp_work_dir) 62 | model_architecture = hparams["architectures"][0] 63 | try: 64 | model_class = Model.from_model_architecture(model_architecture) 65 | except NotImplementedError: 66 | logger.error(f"Model {model_architecture} is not supported") 67 | sys.exit(1) 68 | model_class = Model.from_model_architecture(model_architecture) 69 | model_name = model.name_or_path.split('/') 70 | if len(model_name[-1]) == 0: 71 | model_name = model_name[-2] 72 | else: 73 | model_name = model_name[-1] 74 | 75 | output_type = backend.split(":")[-1] 76 | if output_type.lower() not in FTYPE_MAP: 77 | raise TypeError(f"{output_type} type is not supported") 78 | output_type = FTYPE_MAP.get(output_type.lower()) 79 | 80 | 81 | model_instance = model_class( 82 | model, 83 | dir_model=tmp_work_dir, 84 | ftype=output_type, 85 | fname_out=Path(output_dir), 86 | is_big_endian=False, 87 | model_name=model_name, 88 | split_max_tensors=False, 89 | split_max_size=0, 90 | dry_run=False, 91 | small_first_shard=False) 92 | model_instance.write() 93 | rt = time.time() - st 94 | logger.info(f"Model successfully exported to {model_instance.fname_out}, running time={rt}") 95 | 96 | shutil.rmtree(tmp_work_dir, ignore_errors=True) 97 | 98 | return model 99 | 100 | -------------------------------------------------------------------------------- 
/auto_round/export/export_to_gguf/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | QK_K = 256 16 | K_SCALE_SIZE = 12 17 | GGML_QUANT_SIZES = { 18 | "bf16": (1, 2), 19 | "q4_0": (32, 2 + 16), 20 | "q4_1": (32, 2 + 2 + 16), 21 | "q5_0": (32, 2 + 4 + 16), 22 | "q5_1": (32, 2 + 2 + 4 + 16), 23 | "q8_0": (32, 2 + 32), 24 | "q2_k": (256, 2 + 2 + QK_K//16 + QK_K//4), 25 | "q3_k": (256, 2 + QK_K // 4 + QK_K // 8 + 12), 26 | "q4_k": (256, 2 + 2 + QK_K//2 + 12), 27 | "q5_k": (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12), 28 | "q6_k": (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16), 29 | "q8_k": (256, 4 + QK_K + QK_K // 8) 30 | } -------------------------------------------------------------------------------- /auto_round/export/export_to_itrex/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .export import save_quantized_as_itrex, pack_model 15 | from .config import QuantConfig 16 | -------------------------------------------------------------------------------- /auto_round/export/register.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | EXPORT_FORMAT = {} 17 | 18 | 19 | def register_format(name): 20 | """Class decorator to register a EXPORT subclass to the registry. 21 | 22 | Decorator function used before a Pattern subclass. 23 | 24 | Args: 25 | cls (class): The subclass of register. 26 | name: A string. Define the export type. 27 | 28 | Returns: 29 | cls: The class of register. 
30 | """ 31 | 32 | def register(format): 33 | EXPORT_FORMAT[name] = format 34 | return format 35 | 36 | return register 37 | 38 | 39 | 40 | PACKING_LAYER_WITH_FORMAT = {} 41 | 42 | def register_layer_packing(name): 43 | """Class decorator to register a EXPORT subclass to the registry. 44 | 45 | Decorator function used before a Pattern subclass. 46 | 47 | Args: 48 | cls (class): The subclass of register. 49 | name: A string. Define the export type. 50 | 51 | Returns: 52 | cls: The class of register. 53 | """ 54 | 55 | def register(format): 56 | PACKING_LAYER_WITH_FORMAT[name] = format 57 | return format 58 | 59 | return register 60 | -------------------------------------------------------------------------------- /auto_round/inference/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from auto_round.inference.convert_model import convert_hf_model, infer_target_device, post_init 15 | 16 | -------------------------------------------------------------------------------- /auto_round/low_cpu_mem/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (c) 2023 Intel Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """Torch layer-wise quantization module.""" 18 | from .utils import * 19 | -------------------------------------------------------------------------------- /auto_round/mllm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from .mllm_dataset import get_mllm_dataloader 16 | from .template import Template, get_template, TEMPLATES 17 | from .autoround_mllm import AutoRoundMLLM 18 | from ..utils import LazyImport 19 | from .eval import mllm_eval, lmms_eval -------------------------------------------------------------------------------- /auto_round/mllm/templates/cogvlm2.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "cogvlm2", 3 | "format_user": "Question: {{content}} ", 4 | "format_assistant": "Answer: {{content}}\n", 5 | "replace_tokens": ["\n", ""], 6 | "processor": "cogvlm2", 7 | "extra_encode" : true, 8 | "default_dataset": "NeelNanda/pile-10k" 9 | } -------------------------------------------------------------------------------- /auto_round/mllm/templates/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "default", 3 | "format_user": "{{content}}", 4 | "format_assistant": "{{content}}", 5 | "format_system": "{{content}}", 6 | "format_function": "", 7 | "format_observation": "", 8 | "format_separator": "\n", 9 | "default_system": "You are a helpful assistant.", 10 | "replace_tokens": null, 11 | "extra_encode" : false, 12 | "default_dataset": "NeelNanda/pile-10k", 13 | "processor": "hf" 14 | } -------------------------------------------------------------------------------- /auto_round/mllm/templates/llava.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "llava", 3 | "replace_tokens": null, 4 | "processor": "llava", 5 | "extra_encode" : false, 6 | "default_dataset": "NeelNanda/pile-10k" 7 | } -------------------------------------------------------------------------------- /auto_round/mllm/templates/phi3_v.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "phi3_v", 3 | "replace_tokens": ["", "<|image_1|>"], 4 | "processor": "hf", 5 | "extra_encode" : false, 6 | "default_dataset": "NeelNanda/pile-10k" 7 | } -------------------------------------------------------------------------------- /auto_round/mllm/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os 16 | import requests 17 | 18 | from ..utils import LazyImport 19 | 20 | PIL = LazyImport("PIL") 21 | from PIL import Image # pylint: disable=E0401 22 | 23 | 24 | def _extract_data_dir(dir_path: str): 25 | if os.path.isdir(dir_path): 26 | return dir_path 27 | elif "=" in dir_path: 28 | result = {} 29 | dir_path = dir_path.split(",") 30 | for _path in dir_path: 31 | k, v = _path.split('=') 32 | if k in ['image', 'video', 'audio']: 33 | result[k] = v 34 | return result 35 | else: 36 | raise TypeError("incorrect input of extra_data_dir, please use auto_round --help for more details.") 37 | 38 | 39 | def fetch_image(path_or_url): 40 | if os.path.isfile(path_or_url): 41 | image_obj = Image.open(path_or_url) 42 | elif path_or_url.startswith("http://") or path_or_url.startswith("https://"): 43 | image_obj = Image.open(requests.get(path_or_url, stream=True).raw) 44 | else: 45 | raise TypeError(f"{path_or_url} neither a path or url.") 46 | 47 | return image_obj 48 | -------------------------------------------------------------------------------- /auto_round/script/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. -------------------------------------------------------------------------------- /auto_round/special_model_handler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | mllms_with_limited_bs = ("llava", "qwen2_vl", "phi3_v", "mllama") # Limitations on batch_size 17 | 18 | SUPPORT_ONLY_TEXT_MODELS = [ 19 | "phi3_v", 20 | "cogvlm2", 21 | "llava", 22 | "qwen2_vl", 23 | "deepseek_vl_v2", 24 | "chatglm", 25 | "idefics3" 26 | ] 27 | 28 | SPECIAL_SHARED_CACHE_KEYS = { 29 | "Gemma3ForConditionalGeneration": ("position_embeddings_global", "position_embeddings_local")} 30 | SPECIAL_SHARED_CACHE_KEYS["MiniMaxText01ForCausalLM"] = ("slope_rate",) 31 | 32 | 33 | def _handle_special_model(model): 34 | if model.config.model_type == "deepseek_vl_v2": 35 | from functools import partial 36 | model.forward = partial(_deepseek_vl2_forward, model) 37 | return model 38 | 39 | 40 | def _get_deepseek_vl2_multimodal_block(model, quant_vision=False): 41 | model.forward = model.language.forward 42 | block_names = [] 43 | if quant_vision: 44 | block_names.append([f"vision.blocks.{i}" for i in range(len(model.vision.blocks))]) 45 | block_names.append([f"projector.layers.{i}" for i in range(len(model.projector.layers))]) 46 | block_names.append([f"language.model.layers.{i}" for i in range(len(model.language.model.layers))]) 47 | return block_names 48 | 49 | 50 | SPECIAL_MULTIMODAL_BLOCK = { 51 | "deepseek_vl_v2": _get_deepseek_vl2_multimodal_block 52 | } 53 | 54 | 55 | def _deepseek_vl2_forward( 56 | model, 57 | input_ids=None, 58 | 59 | position_ids=None, 60 | attention_mask=None, 61 | past_key_values=None, 62 | inputs_embeds=None, 63 | 64 | images=None, 65 | images_seq_mask=None, 66 | images_spatial_crop=None, 67 | 68 | labels=None, 69 | use_cache=None, 70 | output_attentions=None, 71 | output_hidden_states=None, 72 | return_dict=None, 73 | cache_position=None, 74 | **kwargs 75 | ): 76 | inputs_embeds = model.prepare_inputs_embeds( 77 | input_ids=input_ids, 78 | images=images, 79 | images_seq_mask=images_seq_mask, 80 | images_spatial_crop=images_spatial_crop, 81 | ) 82 | return model.language( 83 | input_ids=None, 84 | attention_mask=attention_mask, 85 | position_ids=position_ids, 86 | past_key_values=past_key_values, 87 | inputs_embeds=inputs_embeds, 88 | labels=labels, 89 | use_cache=use_cache, 90 | output_attentions=output_attentions, 91 | output_hidden_states=output_hidden_states, 92 | return_dict=return_dict, 93 | cache_position=cache_position) 94 | 95 | 96 | def check_mllm_model_batch(model, batch_size, gradient_accumulate_steps=1): 97 | """ 98 | Checks model configuration to determine if it's necessary to limit bs to avoid potential input shape mismatches. 99 | """ 100 | for key in mllms_with_limited_bs: 101 | if hasattr(model, "config") and key in model.config.model_type and batch_size != 1: 102 | accumulate_steps = batch_size * gradient_accumulate_steps 103 | print("To avoid the tensor concat mismatch problem, modified parameters to " \ 104 | f"batch_size=1. As an alternative, set the gradient_accumulate_steps={accumulate_steps}") 105 | return 1, accumulate_steps 106 | return batch_size, gradient_accumulate_steps 107 | -------------------------------------------------------------------------------- /auto_round/version.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Intel® auto-round: An open-source Python library 15 | supporting popular model weight only compression based on signround.""" 16 | 17 | __version__ = "0.5.1" 18 | -------------------------------------------------------------------------------- /auto_round_extension/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/auto-round/dc6d389545d841a2a29feac7ea89ff5d51b0e5a5/auto_round_extension/__init__.py -------------------------------------------------------------------------------- /auto_round_extension/cuda/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/auto-round/dc6d389545d841a2a29feac7ea89ff5d51b0e5a5/auto_round_extension/cuda/__init__.py -------------------------------------------------------------------------------- /auto_round_extension/hpu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/auto-round/dc6d389545d841a2a29feac7ea89ff5d51b0e5a5/auto_round_extension/hpu/__init__.py -------------------------------------------------------------------------------- /auto_round_extension/ipex/__init__.py: -------------------------------------------------------------------------------- 1 | from auto_round_extension.ipex.qlinear_ipex_awq import QuantLinear as IpexAWQQuantLinear 2 | from auto_round_extension.ipex.qlinear_ipex_gptq import ( 3 | QuantLinear as IpexGPTQQuantLinear, 4 | ) 5 | 6 | ipex_qlinear_classes = (IpexAWQQuantLinear, IpexGPTQQuantLinear) 7 | -------------------------------------------------------------------------------- /auto_round_extension/ipex/qlinear_ipex_awq.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class QuantLinear(nn.Module): 5 | QUANT_TYPE = "ipex_awq" 6 | def __init__(self, w_bit, group_size, in_features, out_features, bias, dev): 7 | super().__init__() 8 | assert w_bit == 4, "Only 4 bit are supported for now." 
9 | self.compute_dtype = torch.float16 if torch.xpu.is_available() else torch.bfloat16 10 | self.in_features = in_features 11 | self.out_features = out_features 12 | self.w_bit = w_bit 13 | self.group_size = group_size if group_size != -1 else in_features 14 | self.scale_dtype = torch.float32 15 | 16 | # quick sanity check (make sure alignment) 17 | assert self.in_features % self.group_size == 0 18 | assert out_features % (32 // self.w_bit) == 0 19 | self.pack_num = 32 // self.w_bit 20 | 21 | self.register_buffer( 22 | "qzeros", 23 | torch.zeros( 24 | (in_features // self.group_size, out_features // self.pack_num), 25 | dtype=torch.int32, 26 | device=dev, 27 | ), 28 | ) 29 | self.register_buffer( 30 | "scales", 31 | torch.zeros( 32 | (in_features // self.group_size, out_features), 33 | dtype=self.compute_dtype, 34 | device=dev, 35 | )) 36 | if bias: 37 | self.register_buffer( 38 | "bias", 39 | torch.zeros((out_features), dtype=self.compute_dtype, device=dev), 40 | ) 41 | else: 42 | self.register_buffer( 43 | "bias", 44 | None, 45 | ) 46 | qweight = torch.zeros((in_features, out_features // self.pack_num), dtype=torch.int32, device=dev) 47 | self.register_buffer("qweight", qweight) 48 | 49 | def post_init(self): 50 | assert self.qweight.device.type == "cpu" or self.qweight.device.type == "xpu" 51 | import intel_extension_for_pytorch as ipex 52 | 53 | self.ipex_linear = ipex.llm.quantization.IPEXWeightOnlyQuantizedLinear.from_weight(self.qweight, 54 | self.scales, 55 | self.qzeros, \ 56 | self.in_features, 57 | self.out_features, 58 | None, 59 | self.bias, \ 60 | self.group_size, 61 | None, 62 | 1, 63 | 0 64 | ) 65 | 66 | @classmethod 67 | def from_linear(cls, linear, w_bit, group_size, init_only=False, scales=None): 68 | awq_linear = cls( 69 | w_bit, 70 | group_size, 71 | linear.in_features, 72 | linear.out_features, 73 | linear.bias is not None, 74 | linear.weight.device, 75 | ) 76 | if init_only: # just prepare for loading sd 77 | return awq_linear 78 | 79 | raise NotImplementedError("Only inference is supported for IPEX kernels") 80 | 81 | @torch.no_grad() 82 | def forward(self, x): 83 | 84 | outputs = self.ipex_linear(x) 85 | 86 | return outputs 87 | 88 | def extra_repr(self) -> str: 89 | return ("in_features={}, out_features={}, bias={}, w_bit={}, group_size={}".format( 90 | self.in_features, 91 | self.out_features, 92 | self.bias is not None, 93 | self.w_bit, 94 | self.group_size, 95 | )) 96 | 97 | -------------------------------------------------------------------------------- /auto_round_extension/qbits/__init__.py: -------------------------------------------------------------------------------- 1 | from auto_round_extension.qbits.qlinear_qbits import QuantLinear as QBitsQuantLinear 2 | from auto_round_extension.qbits.qlinear_qbits_gptq import ( 3 | QuantLinear as QBitsGPTQQuantLinear, 4 | ) 5 | from auto_round_extension.qbits.qbits_awq import QuantLinear as QBitsAWQQuantLinear 6 | 7 | qbits_qlinear_classes = (QBitsQuantLinear, QBitsGPTQQuantLinear) 8 | 9 | qbits_awq_classes = (QBitsAWQQuantLinear,) 10 | -------------------------------------------------------------------------------- /auto_round_extension/torch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/auto-round/dc6d389545d841a2a29feac7ea89ff5d51b0e5a5/auto_round_extension/torch/__init__.py -------------------------------------------------------------------------------- /auto_round_extension/triton/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/auto-round/dc6d389545d841a2a29feac7ea89ff5d51b0e5a5/auto_round_extension/triton/__init__.py -------------------------------------------------------------------------------- /auto_round_extension/triton/triton_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /auto_round_extension/triton/triton_utils/mixin.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # MIT License 16 | # 17 | # Copyright (c) 2023 潘其威(William) 18 | # 19 | # Permission is hereby granted, free of charge, to any person obtaining a copy 20 | # of this software and associated documentation files (the "Software"), to deal 21 | # in the Software without restriction, including without limitation the rights 22 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 23 | # copies of the Software, and to permit persons to whom the Software is 24 | # furnished to do so, subject to the following conditions: 25 | # 26 | # The above copyright notice and this permission notice shall be included in all 27 | # copies or substantial portions of the Software. 28 | # 29 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 30 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 31 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 32 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 33 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 34 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 35 | # SOFTWARE. 
36 | class TritonModuleMixin: 37 | @classmethod 38 | def warmup(cls, model, transpose=False, seqlen=2048): 39 | pass 40 | -------------------------------------------------------------------------------- /auto_round_extension/triton/triton_utils_zp/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /auto_round_extension/triton/triton_utils_zp/mixin.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # MIT License 16 | # 17 | # Copyright (c) 2023 潘其威(William) 18 | # 19 | # Permission is hereby granted, free of charge, to any person obtaining a copy 20 | # of this software and associated documentation files (the "Software"), to deal 21 | # in the Software without restriction, including without limitation the rights 22 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 23 | # copies of the Software, and to permit persons to whom the Software is 24 | # furnished to do so, subject to the following conditions: 25 | # 26 | # The above copyright notice and this permission notice shall be included in all 27 | # copies or substantial portions of the Software. 28 | # 29 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 30 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 31 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 32 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 33 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 34 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 35 | # SOFTWARE. 36 | class TritonModuleMixin: 37 | @classmethod 38 | def warmup(cls, model, transpose=False, seqlen=2048): 39 | pass 40 | -------------------------------------------------------------------------------- /docs/Llama-2-7b-chat-hf-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | 3 | A sample command to generate an INT4 model. 
4 | ```bash 5 | auto-round \ 6 | --model meta-llama/Llama-2-7b-chat-hf \ 7 | --device 0 \ 8 | --group_size 128 \ 9 | --bits 4 \ 10 | --iters 1000 \ 11 | --nsamples 512 \ 12 | --asym \ 13 | --format 'auto_gptq,auto_round' \ 14 | --output_dir "./tmp_autoround" 15 | ``` 16 | 17 | 18 | Due to licensing restrictions, we are unable to release the model. 19 | 20 | Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, and the git id 96d185fa6232a5ab685ba7c43e45d1dbb3bb906d. 21 | 22 | Since we encountered an issue evaluating this model with lm-eval, we opted to evaluate the qdq model instead. In our assessment, we found that its accuracy closely matches that of the real quantized model in most cases except for some small models like opt-125m. 23 | 24 | 25 | | Metric | FP16 | int4 qdq | 26 | | -------------- | ------ | -------- | 27 | | Avg. | 0.5901 | 0.5897 | 28 | | mmlu | 0.4640 | 0.4545 | 29 | | lambada_openai | 0.7105 | 0.7037 | 30 | | hellaswag | 0.5780 | 0.5706 | 31 | | winogrande | 0.6638 | 0.6614 | 32 | | piqa | 0.7639 | 0.7633 | 33 | | truthfulqa_mc1 | 0.3023 | 0.3035 | 34 | | openbookqa | 0.3340 | 0.3260 | 35 | | boolq | 0.7976 | 0.8064 | 36 | | rte | 0.6968 | 0.7292 | 37 | | arc_easy | 0.7382 | 0.7336 | 38 | | arc_challenge | 0.4420 | 0.4352 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /docs/Llama-3.2-11B-Vision-Instruct-sym.md: -------------------------------------------------------------------------------- 1 | 2 | ## Model Details 3 | 4 | This model is an int4 model with group_size 128 and symmetric quantization of [meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct). Load the model with revision="f036ca" to use AutoGPTQ format. 
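For quick reference, the snippet below condenses the loading choice described above: keep the default revision for the AutoRound format, or pass `revision="f036ca"` to load the AutoGPTQ-format branch. It is a minimal sketch that reuses the `Intel/Llama-3.2-11B-Vision-Instruct-inc-private` checkpoint path from the full example later in this card.

```python
from auto_round import AutoRoundConfig  # required when loading the AutoRound format
from transformers import AutoProcessor, MllamaForConditionalGeneration

quantized_model_path = "Intel/Llama-3.2-11B-Vision-Instruct-inc-private"

# Default branch: AutoRound format
model = MllamaForConditionalGeneration.from_pretrained(
    quantized_model_path,
    torch_dtype="auto",
    device_map="auto",
    # revision="f036ca",  # uncomment to load the AutoGPTQ-format branch instead
)
processor = AutoProcessor.from_pretrained(quantized_model_path)
```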
5 | 6 | ## How To Use 7 | 8 | ### Requirements 9 | Please use Transformers version 4.45.0 or later 10 | AutoRound version >= 0.4.1 11 | 12 | ### INT4 Inference 13 | ```python 14 | from auto_round import AutoRoundConfig ## must import for auto-round format 15 | import requests 16 | import torch 17 | from PIL import Image 18 | from transformers import MllamaForConditionalGeneration, AutoProcessor 19 | 20 | quantized_model_path="Intel/Llama-3.2-11B-Vision-Instruct-inc-private" 21 | 22 | model = MllamaForConditionalGeneration.from_pretrained( 23 | quantized_model_path, 24 | torch_dtype="auto", 25 | device_map="auto", 26 | ##revision="f036ca" ##AutoGPTQ format 27 | ) 28 | processor = AutoProcessor.from_pretrained(quantized_model_path) 29 | image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg" 30 | messages = [ 31 | {"role": "user", "content": [ 32 | {"type": "image"}, 33 | {"type": "text", "text": "Please write a haiku for this one, it would be: "} 34 | ]} 35 | ] 36 | 37 | # Preparation for inference 38 | image = Image.open(requests.get(image_url, stream=True).raw) 39 | input_text = processor.apply_chat_template(messages, add_generation_prompt=True) 40 | inputs = processor( 41 | image, 42 | input_text, 43 | add_special_tokens=False, 44 | return_tensors="pt" 45 | ).to(model.device) 46 | 47 | output = model.generate(**inputs, max_new_tokens=50) 48 | print(processor.decode(output[0])) 49 | 50 | ##INT4: 51 | ## Here is a haiku for the rabbit: 52 | 53 | ## Whiskers twitching bright 54 | ## Ears perked up, alert and keen 55 | ## Spring's gentle delight<|eot_id|> 56 | 57 | 58 | ##BF16: 59 | ## Here is a haiku for the rabbit: 60 | 61 | ## Whiskers twitching fast 62 | ## In a coat of blue and brown 63 | ## Hoppy little soul<|eot_id|> 64 | 65 | image_url = "http://images.cocodataset.org/train2017/000000411975.jpg" 66 | messages = [ 67 | {"role": "user", "content": [ 68 | {"type": "image"}, 69 | {"type": "text", "text": "How many people are on the baseball field in the picture?"} 70 | ]} 71 | ] 72 | ##INT4: There are five people on the baseball field in the picture. 73 | ## 74 | 75 | ##BF16: There are five people on the baseball field in the picture. 76 | ## 77 | 78 | image_url = "https://intelcorp.scene7.com/is/image/intelcorp/processor-overview-framed-badge:1920-1080?wid=480&hei=270" 79 | messages = [ 80 | {"role": "user", "content": [ 81 | {"type": "image"}, 82 | {"type": "text", "text": "Which company does this picture represent?"} 83 | ]} 84 | ] 85 | ##INT4: This picture represents Intel. 86 | ## 87 | 88 | ##BF16: This image represents Intel, a multinational semiconductor corporation headquartered in Santa Clara, California. 89 | ## 90 | 91 | ``` 92 | 93 | ## Evaluation the model 94 | pip3 install git+https://github.com/open-compass/VLMEvalKit.git@7de2dcb. The evaluation process may encounter errors that require changing model backend or evaluation code. Detailed instructions will be provided in a future update. 
95 | ```bash 96 | auto-round-mllm --eval --model Intel/Llama-3.2-11B-Vision-Instruct-inc-private --tasks MMBench_DEV_EN_V11,ScienceQA_VAL,TextVQA_VAL,POPE --output_dir "./eval_result" 97 | ``` 98 | |Metric |16bits|Pile Calib INT4 |Llava Calib INT4| 99 | |:-------------------|:------|:------|:------| 100 | |avg |66.05 |67.81 |66.02 | 101 | |MMBench_DEV_EN_V11 |52.86 |53.48 |52.17 | 102 | |ScienceQA_VAL |68.86 |70.39 |69.15 | 103 | |TextVQA_VAL |54.49 |59.62 |55.07 | 104 | |POPE |88.00 |87.76 |87.71 | 105 | 106 | ### Generate the model 107 | Here is the sample command to reproduce the model. 108 | ```bash 109 | pip install auto-round 110 | auto-round-mllm \ 111 | --model meta-llama/Llama-3.2-11B-Vision-Instruct \ 112 | --device 0 \ 113 | --group_size 128 \ 114 | --bits 4 \ 115 | --iters 1000 \ 116 | --nsample 512 \ 117 | --seqlen 512 \ 118 | --format 'auto_gptq,auto_round' \ 119 | --output_dir "./tmp_autoround" 120 | ``` 121 | 122 | ## Ethical Considerations and Limitations 123 | 124 | The model can produce factually incorrect output, and should not be relied on to produce factually accurate information. Because of the limitations of the pretrained model and the finetuning datasets, it is possible that this model could generate lewd, biased or otherwise offensive outputs. 125 | 126 | Therefore, before deploying any applications of the model, developers should perform safety testing. 127 | 128 | ## Caveats and Recommendations 129 | 130 | Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. 131 | 132 | Here are a couple of useful links to learn more about Intel's AI software: 133 | 134 | - Intel Neural Compressor [link](https://github.com/intel/neural-compressor) 135 | 136 | ## Disclaimer 137 | 138 | The license on this model does not constitute legal advice. We are not responsible for the actions of third parties who use this model. Please consult an attorney before using this model for commercial purposes. 139 | 140 | ## Cite 141 | 142 | @article{cheng2023optimize, title={Optimize weight rounding via signed gradient descent for the quantization of llms}, author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao and Liu, Yi}, journal={arXiv preprint arXiv:2309.05516}, year={2023} } 143 | 144 | [arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round) 145 | -------------------------------------------------------------------------------- /docs/Meta-Llama-3-8B-Instruct-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | 3 | A sample command to generate an INT4 model. 
4 | ```bash 5 | auto-round \ 6 | --model meta-llama/Meta-Llama-3-8B-Instruct \ 7 | --device 0 \ 8 | --group_size 128 \ 9 | --bits 4 \ 10 | --iters 1000 \ 11 | --nsamples 512 \ 12 | --asym \ 13 | --format 'auto_gptq,auto_round' \ 14 | --output_dir "./tmp_autoround" 15 | ``` 16 | 17 | quant lm-head 18 | ```bash 19 | auto-round \ 20 | --model meta-llama/Meta-Llama-3-8B-Instruct \ 21 | --device 0 \ 22 | --group_size 128 \ 23 | --bits 4 \ 24 | --iters 1000 \ 25 | --nsamples 512 \ 26 | --asym \ 27 | --quant_lm_head \ 28 | --format 'auto_gptq,auto_round' \ 29 | --output_dir "./tmp_autoround" 30 | ``` 31 | lm-eval 0.4.2 is used 32 | 33 | | Metric | **BF16** | w4g128 w/o lm-head | w4g128 with lm-head | 34 | | ---------------- | :------- |--------------------|-----------------------------| 35 | | Avg. | 0.6352 | 0.6312 | 0.6303 | 36 | | mmlu | 0.6386 | 0.6306 | 0.6243 | 37 | | winogrande | 0.7143 | 0.7238 | 0.7261 | 38 | | truthfulqa_mc1 | 0.3623 | 0.3537 | 0.3574 | 39 | | rte | 0.6751 | 0.6859 | 0.6715 | 40 | | piqa | 0.7867 | 0.7797 | 0.7775 | 41 | | openbookqa | 0.3400 | 0.3300 | 0.3340 | 42 | | lambada_openai | 0.7182 | 0.7200 | 0.7118 | 43 | | hellaswag | 0.5769 | 0.5699 | 0.5686 | 44 | | boolq | 0.8297 | 0.8309 | 0.8266 | 45 | | arc_easy | 0.8152 | 0.8089 | 0.8123 | 46 | | arc_challenge | 0.5299 | 0.5102 | 0.5111 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /docs/Mistral-7B-Instruct-v0.2-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | 3 | A sample command to generate an INT4 model. 4 | ```bash 5 | auto-round \ 6 | --model mistralai/Mistral-7B-Instruct-v0.2 \ 7 | --device 0 \ 8 | --group_size 128 \ 9 | --bits 4 \ 10 | --iters 1000 \ 11 | --nsamples 512 \ 12 | --asym \ 13 | --format 'auto_gptq,auto_round' \ 14 | --output_dir "./tmp_autoround" 15 | ``` 16 | 17 | | Metric | BF16 | INT4 | 18 | | -------------- | ------ | ------ | 19 | | Avg. | 0.6647 | 0.6621 | 20 | | mmlu | 0.5906 | 0.5872 | 21 | | lambada_openai | 0.7141 | 0.7141 | 22 | | hellaswag | 0.6602 | 0.6557 | 23 | | winogrande | 0.7395 | 0.7364 | 24 | | piqa | 0.8052 | 0.8047 | 25 | | truthfulqa_mc1 | 0.5251 | 0.5153 | 26 | | openbookqa | 0.3600 | 0.3420 | 27 | | boolq | 0.8535 | 0.8541 | 28 | | rte | 0.7040 | 0.7148 | 29 | | arc_easy | 0.8161 | 0.8165 | 30 | | arc_challenge | 0.5435 | 0.5435 | 31 | 32 | -------------------------------------------------------------------------------- /docs/Mistral-7B-v0.1-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | 3 | 4 | A sample command to generate an INT4 model. 
5 | ```bash 6 | auto-round \ 7 | --model mistralai/Mistral-7B-v0.1 \ 8 | --device 0 \ 9 | --group_size 128 \ 10 | --bits 4 \ 11 | --iters 1000 \ 12 | --nsamples 512 \ 13 | --asym \ 14 | --format 'auto_gptq,auto_round' \ 15 | --output_dir "./tmp_autoround" 16 | ``` 17 | 18 | quant_lm_head 19 | 20 | ```bash 21 | auto-round \ 22 | --model mistralai/Mistral-7B-v0.1 \ 23 | --device 0 \ 24 | --group_size 128 \ 25 | --bits 4 \ 26 | --iters 1000 \ 27 | --nsamples 512 \ 28 | --asym \ 29 | --quant_lm_head \ 30 | --format 'auto_gptq,auto_round' \ 31 | --output_dir "./tmp_autoround" 32 | ``` 33 | 34 | lm-eval 0.4.2 is used 35 | 36 | | Metric | BF16 | [INT4-lmhead](https://huggingface.co/Intel/Mistral-7B-v0.1-int4-inc-lmhead) | [INT4](https://huggingface.co/Intel/Mistral-7B-v0.1-int4-inc) | 37 | | -------------- | ------ |-----------------| ------------------------------------------------------------ | 38 | | Avg. | 0.6260 | 0.6228 | 0.6218 | 39 | | mmlu | 0.5868 | 0.5760 | 0.5772 | 40 | | lambada_openai | 0.7555 | 0.7539 | 0.7543 | 41 | | hellaswag | 0.6125 | 0.6055 | 0.6072 | 42 | | winogrande | 0.7395 | 0.7380 | 0.7388 | 43 | | piqa | 0.8069 | 0.8009 | 0.8030 | 44 | | truthfulqa_mc1 | 0.2803 | 0.2876 | 0.2864 | 45 | | openbookqa | 0.3280 | 0.3300 | 0.3260 | 46 | | boolq | 0.8379 | 0.8291 | 0.8281 | 47 | | arc_easy | 0.8089 | 0.8043 | 0.8035 | 48 | | arc_challenge | 0.5034 | 0.5026 | 0.4932 | 49 | -------------------------------------------------------------------------------- /docs/Mixtral-8x7B-Instruct-v0.1-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | 3 | A sample command to generate an INT4 model. 4 | ```bash 5 | auto-round \ 6 | --model mistralai/Mixtral-8x7B-Instruct-v0.1 \ 7 | --device 0 \ 8 | --group_size 128 \ 9 | --bits 4 \ 10 | --iters 1000 \ 11 | --nsamples 512 \ 12 | --asym \ 13 | --format 'auto_gptq,auto_round' \ 14 | --output_dir "./tmp_autoround" 15 | ``` 16 | 17 | Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, and the git id f3b7917091afba325af3980a35d8a6dcba03dc3f is used 18 | 19 | | Metric | BF16 | INT4 | 20 | | -------------- |--------| ------ | 21 | | Avg. | 0.7000 | 0.6977 | 22 | | mmlu | 0.6885 | 0.6824 | 23 | | lambada_openai | 0.7718 | 0.7790 | 24 | | hellaswag | 0.6767 | 0.6745 | 25 | | winogrande | 0.7687 | 0.7719 | 26 | | piqa | 0.8351 | 0.8335 | 27 | | truthfulqa_mc1 | 0.4969 | 0.4884 | 28 | | openbookqa | 0.3680 | 0.3720 | 29 | | boolq | 0.8850 | 0.8783 | 30 | | rte | 0.7184 | 0.7004 | 31 | | arc_easy | 0.8699 | 0.8712 | 32 | | arc_challenge | 0.6220 | 0.6229 | 33 | 34 | -------------------------------------------------------------------------------- /docs/Mixtral-8x7B-v0.1-asym-acc.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | 3 | A sample command to generate an INT4 model. 
4 | ```bash 5 | auto-round \ 6 | --model mistralai/Mixtral-8x7B-v0.1 \ 7 | --device 0 \ 8 | --group_size 128 \ 9 | --bits 4 \ 10 | --iters 1000 \ 11 | --nsamples 512 \ 12 | --asym \ 13 | --format 'auto_gptq,auto_round' \ 14 | --output_dir "./tmp_autoround" 15 | ``` 16 | 17 | 18 | Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, we used the git id f3b7917091afba325af3980a35d8a6dcba03dc3f 19 | 20 | Download the model from hf(coming soon) or follow examples/language-modeling/scripts/Mixtral-8x7B-v0.1.sh to generate the model 21 | 22 | ~~~bash 23 | lm_eval --model hf --model_args pretrained="Intel/Mixtral-8x7B-v0.1-int4-inc",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 32 24 | ~~~ 25 | 26 | | Metric | BF16 | INT4 | 27 | | -------------- |--------| ------ | 28 | | Avg. | 0.6698 | 0.6633 | 29 | | mmlu | 0.6802 | 0.6693 | 30 | | lambada_openai | 0.7827 | 0.7825 | 31 | | hellaswag | 0.6490 | 0.6459 | 32 | | winogrande | 0.7648 | 0.7514 | 33 | | piqa | 0.8248 | 0.8210 | 34 | | truthfulqa_mc1 | 0.3427 | 0.3219 | 35 | | openbookqa | 0.3540 | 0.3560 | 36 | | boolq | 0.8523 | 0.8474 | 37 | | rte | 0.7076 | 0.6931 | 38 | | arc_easy | 0.8430 | 0.8430 | 39 | | arc_challenge | 0.5666 | 0.5648 | 40 | -------------------------------------------------------------------------------- /docs/Qwen1.5-7B-Chat-acc.md: -------------------------------------------------------------------------------- 1 | Due to licensing restrictions, we are unable to release the model. Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, and the git id 96d185fa6232a5ab685ba7c43e45d1dbb3bb906d. 2 | 3 | We used the following command for evaluation. 4 | For reference, the results of official AWQ-INT4 and GPTQ-INT4 release are listed. 5 | 6 | ~~~bash 7 | lm_eval --model hf --model_args pretrained="./",autogptq=True,gptq_use_triton=True,trust_remote_code=True --device cuda:0 --tasks ceval-valid,cmmlu,mmlu,gsm8k --batch_size 16 --num_fewshot 0 8 | ~~~ 9 | 10 | | Metric | BF16 | [Qwen/Qwen1.5-7B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-AWQ) | [Qwen/Qwen1.5-7B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GPTQ-Int4) | INT4 sym recipe | INT4 asym recipe | 11 | | -------------- | ------ |-----------|--------------------------------|-----------------|------------------| 12 | | Avg. | 0.6231 | 0.6152 | 0.6070 | 0.6205 | 0.6186 | 13 | | ceval | 0.6887 | 0.6820 | 0.6679 | 0.6761 | 0.6820 | 14 | | cmmlu | 0.6959 | 0.6862 | 0.6831 | 0.6870 | 0.6884 | 15 | | mmlu | 0.6020 | 0.5944 | 0.5902 | 0.5974 | 0.5946 | 16 | | gsm8k | 0.5057 | 0.4981 | 0.4867 | 0.5216 | 0.5095 | 17 | -------------------------------------------------------------------------------- /docs/Yi-6B-Chat-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | ```bash 3 | auto-round \ 4 | --model 01-ai/Yi-6B-Chat \ 5 | --device 0 \ 6 | --group_size 128 \ 7 | --bits 4 \ 8 | --iters 1000 \ 9 | --nsamples 512 \ 10 | --asym \ 11 | --minmax_lr 2e-3 \ 12 | --format 'auto_gptq,auto_round' \ 13 | --output_dir "./tmp_autoround" 14 | ``` 15 | 16 | 17 | Due to licensing restrictions, we are unable to release the model. 
Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, and the git id 96d185fa6232a5ab685ba7c43e45d1dbb3bb906d. 18 | 19 | We used the following command for evaluation. 20 | For reference, the results of official AWQ-INT4 release are listed. 21 | 22 | ~~~bash 23 | lm_eval --model hf --model_args pretrained="./",autogptq=True,gptq_use_triton=True,trust_remote_code=True --device cuda:0 --tasks ceval-valid,cmmlu,mmlu,gsm8k --batch_size 16 --num_fewshot 0 24 | ~~~ 25 | 26 | | Metric | BF16 |[01-ai/Yi-6B-Chat-4bits](https://huggingface.co/01-ai/Yi-6B-Chat-4bits)| INT4 | 27 | |--------|--------|----------------------|--------| 28 | | Avg. | 0.6043 | 0.5867 | 0.5939 | 29 | | mmlu | 0.6163 | 0.6133 | 0.6119 | 30 | | cmmlu | 0.7431 | 0.7312 | 0.7314 | 31 | | ceval | 0.7355 | 0.7155 | 0.7281 | 32 | | gsm8k | 0.3222 | 0.2866 | 0.3040 | 33 | -------------------------------------------------------------------------------- /docs/baichuan2-7b-cha-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | ```bash 3 | auto-round \ 4 | --model baichuan-inc/Baichuan2-7B-Chat \ 5 | --device 0 \ 6 | --group_size 128 \ 7 | --bits 4 \ 8 | --iters 1000 \ 9 | --nsamples 512 \ 10 | --asym \ 11 | --minmax_lr 2e-3 \ 12 | --format 'auto_gptq,auto_round' \ 13 | --output_dir "./tmp_autoround" 14 | ``` 15 | 16 | 17 | 18 | Due to licensing restrictions, we are unable to release the model. Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, and the git id 96d185fa6232a5ab685ba7c43e45d1dbb3bb906d. 19 | 20 | We used the following command for evaluation. 21 | 22 | ~~~bash 23 | lm_eval --model hf --model_args pretrained="./",autogptq=True,gptq_use_triton=True,trust_remote_code=True --device cuda:0 --tasks ceval-valid,cmmlu,mmlu,gsm8k --batch_size 16 --num_fewshot 0 24 | ~~~ 25 | 26 | | Metric | BF16 | INT4 | 27 | |--------|--------|--------| 28 | | Avg. | 0.4504 | 0.4470 | 29 | | mmlu | 0.5096 | 0.5053 | 30 | | cmmlu | 0.5486 | 0.5426 | 31 | | ceval | 0.5394 | 0.5223 | 32 | | gsm8k | 0.2039 | 0.2176 | 33 | -------------------------------------------------------------------------------- /docs/bloom-3B-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | ```bash 3 | auto-round \ 4 | --model bigscience/bloom-3b \ 5 | --device 0 \ 6 | --group_size 128 \ 7 | --bits 4 \ 8 | --iters 1000 \ 9 | --nsamples 512 \ 10 | --asym \ 11 | --format 'auto_gptq,auto_round' \ 12 | --output_dir "./tmp_autoround" 13 | ``` 14 | 15 | 16 | Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, we used the git id 96d185fa6232a5ab685ba7c43e45d1dbb3bb906d 17 | ##pip install auto-gptq[triton] 18 | ##pip install triton==2.2.0 19 | ```bash 20 | lm_eval --model hf --model_args pretrained="./",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 32 21 | ``` 22 | 23 | 24 | 25 | | Metric | FP16 | INT4 | 26 | | -------------- | ------ | ------ | 27 | | Avg. 
| 0.4532 | 0.4514 | 28 | | mmlu | 0.2592 | 0.2537 | 29 | | lambada_openai | 0.5176 | 0.5135 | 30 | | hellaswag | 0.4136 | 0.4093 | 31 | | winogrande | 0.5864 | 0.5856 | 32 | | piqa | 0.7062 | 0.7095 | 33 | | truthfulqa_mc1 | 0.2326 | 0.2264 | 34 | | openbookqa | 0.2160 | 0.2140 | 35 | | boolq | 0.6156 | 0.6199 | 36 | | rte | 0.5632 | 0.5632 | 37 | | arc_easy | 0.5947 | 0.5888 | 38 | | arc_challenge | 0.2799 | 0.2816 | 39 | -------------------------------------------------------------------------------- /docs/falcon-7b-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | ```bash 3 | auto-round \ 4 | --model tiiuae/falcon-7b \ 5 | --device 0 \ 6 | --group_size 64 \ 7 | --bits 4 \ 8 | --iters 1000 \ 9 | --nsamples 512 \ 10 | --asym \ 11 | --format 'auto_gptq,auto_round' \ 12 | --output_dir "./tmp_autoround" 13 | ``` 14 | 15 | 16 | We generate the model with group_size 64 as there is an issue when evaluating with group_size 128. 17 | Evaluate the model 18 | pip3 install lm-eval==0.4.2 19 | 20 | ```bash 21 | lm_eval --model hf --model_args pretrained="Intel/falcon-7b-int4-inc",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,arc_easy,arc_challenge,mmlu --batch_size 16 22 | ``` 23 | 24 | | Metric | BF16 | int4 | 25 | | -------------- | ------ | ------ | 26 | | Avg. | 0.5462 | 0.5454 | 27 | | mmlu | 0.2546 | 0.2562 | 28 | | lambada_openai | 0.7450 | 0.7485 | 29 | | hellaswag | 0.5773 | 0.5719 | 30 | | winogrande | 0.6740 | 0.6835 | 31 | | piqa | 0.7943 | 0.7905 | 32 | | truthfulqa_mc1 | 0.2228 | 0.2166 | 33 | | openbookqa | 0.3080 | 0.3100 | 34 | | boolq | 0.7361 | 0.7431 | 35 | | arc_easy | 0.7475 | 0.7424 | 36 | | arc_challenge | 0.4027 | 0.3908 | 37 | 38 | -------------------------------------------------------------------------------- /docs/full_range_sym.md: -------------------------------------------------------------------------------- 1 | W2G32 nsamples 512,iter 200, average accuracy of 10 tasks 2 | 3 | | Models | gptq_sym | asym | full_range_sym | 4 | |----------------------------|----------|------------|----------------| 5 | | Meta-Llama-3.1-8B-Instruct | 0.4500 | 0.52802 | **0.5381** | 6 | | Qwen2-7B | 0.5229 | **0.5559** | 0.5486 | 7 | 8 | W4G128 nsamples 128,iter 200, average accuracy of 10 tasks 9 | 10 | | Models | asym | full_range_sym | 11 | |----------------------------|------------|----------------| 12 | | Meta-Llama-3.1-8B-Instruct | 0.6342 | **0.6370** | 13 | | Qwen2-7B | 0.6143 | **0.6167** | 14 | | Mistral-7B-Instruct-v0.2 | 0.6606 | **0.6635** | 15 | | Phi-3-mini-4k-instruct | **0.6475** | 0.6432 | 16 | -------------------------------------------------------------------------------- /docs/gemma-2b-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 
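For readers who prefer the Python API over the CLI, the following is a rough equivalent of the recipe command below. It is a sketch based on the `AutoRound` class as exercised in this repository's tests; exact argument names and defaults may differ across versions, so treat it as illustrative rather than authoritative.

```python
# Hypothetical Python-API counterpart of the CLI recipe below; argument names
# follow the AutoRound usage in this repo's tests and may vary by version.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound

model_name = "google/gemma-2b"
# --model_dtype "float16": load in FP16 for tuning and evaluation
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

autoround = AutoRound(
    model,
    tokenizer,
    bits=4,          # --bits 4
    group_size=128,  # --group_size 128
    sym=False,       # --asym (this page documents the old asymmetric recipe)
    iters=400,       # --iters 400
    nsamples=512,    # --nsamples 512
)
autoround.quantize()
autoround.save_quantized("./tmp_autoround")  # export options/formats may vary by version
```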
2 | ```bash 3 | auto-round \ 4 | --model google/gemma-2b \ 5 | --device 0 \ 6 | --group_size 128 \ 7 | --bits 4 \ 8 | --iters 400 \ 9 | --model_dtype "float16" \ 10 | --nsamples 512 \ 11 | --asym \ 12 | --format 'auto_gptq,auto_round' \ 13 | --output_dir "./tmp_autoround" 14 | ``` 15 | 16 | Evaluate the model 17 | 18 | Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, lm-eval 0.4.2 is used 19 | 20 | pip install auto-gptq 21 | 22 | 23 | Please note that there is a discrepancy between the baseline result and the official data, which is a known issue within the official model card community. 24 | Given that the Gemma model family exhibits inconsistent results between FP16 and BF16 on lm-eval, we recommend converting to FP16 for both tuning and evaluation. 25 | 26 | ```bash 27 | lm_eval --model hf --model_args pretrained="Intel/gemma-2b-int4-inc",autogptq=True,gptq_use_triton=True,dtype=float16 --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 16 28 | ``` 29 | 30 | 31 | 32 | | Metric | BF16 | FP16 | AutoRound v0.1 | AutoRound v0.2 | 33 | | -------------- | ---- | ------ |----------------|----------------| 34 | | Avg.| 0.5263 | 0.5277 | 0.5235 | 0.5248 | 35 | | mmlu | 0.3287 | 0.3287 | 0.3297 | 0.3309 | 36 | | lambada_openai | 0.6344 | 0.6375 | 0.6307 | 0.6379 | 37 | | hellaswag | 0.5273 | 0.5281 | 0.5159 | 0.5184 | 38 | | winogrande | 0.6504 | 0.6488 | 0.6543 | 0.6575 | 39 | | piqa | 0.7671 | 0.7720 | 0.7612 | 0.7606 | 40 | | truthfulqa_mc1 | 0.2203 | 0.2203 | 0.2203 | 0.2191 | 41 | | openbookqa | 0.2980 | 0.3020 | 0.3000 | 0.3060 | 42 | | boolq | 0.6927 | 0.6936 | 0.6939 | 0.6966 | 43 | | arc_easy | 0.7420 | 0.7403 | 0.7353 | 0.7357 | 44 | | arc_challenge | 0.4019 | 0.4061 | 0.3933 | 0.3857 | 45 | -------------------------------------------------------------------------------- /docs/gemma-7b-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | 3 | A sample command to generate an INT4 model. 4 | ```bash 5 | auto-round \ 6 | --model google/gemma-7b \ 7 | --device 0 \ 8 | --group_size 128 \ 9 | --bits 4 \ 10 | --minmax_lr 2e-3 \ 11 | --model_dtype "float16" \ 12 | --iters 1000 \ 13 | --nsamples 512 \ 14 | --asym \ 15 | --format 'auto_gptq,auto_round' \ 16 | --output_dir "./tmp_autoround" 17 | ``` 18 | 19 | 20 | pip install lm-eval==0.4.2 21 | pip install auto-gptq 22 | 23 | Please note that there is a discrepancy between the baseline result and the official data, which is a known issue within the official model card community. 24 | 25 | Given that the Gemma model family exhibits inconsistent results between FP16 and BF16 on lm-eval, we recommend converting to FP16 for both tuning and evaluation. 26 | ```bash 27 | lm_eval --model hf --model_args pretrained="Intel/gemma-7b-int4-inc",autogptq=True,gptq_use_triton=True,dtype=float16 --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 32 28 | ``` 29 | | Metric | BF16 | FP16 | AutoRound v0.1 | AutoRound V0.2 | 30 | | -------------- | ---- | ------ |----------------|----------------| 31 | | Avg. 
| 0.6208 | 0.6302 | 0.6242 | 0.6254 | 32 | | mmlu | 0.6126 | 0.6189 | 0.6085 | 0.6147 | 33 | | lambada_openai | 0.6707 | 0.7308 | 0.7165 | 0.7270 | 34 | | hellaswag | 0.6039 | 0.6063 | 0.6017 | 0.6017 | 35 | | winogrande | 0.7356 | 0.7506 | 0.7482 | 0.7490 | 36 | | piqa | 0.8014 | 0.8025 | 0.7976 | 0.7982 | 37 | | truthfulqa_mc1 | 0.3121 | 0.3121 | 0.3060 | 0.2840 | 38 | | openbookqa | 0.3300 | 0.3220 | 0.3340 | 0.3240 | 39 | | boolq | 0.8254 | 0.8324 | 0.8300 | 0.8407 | 40 | | rte | 0.6643 | 0.6859 | 0.6787 | 0.6968 | 41 | | arc_easy | 0.8068 | 0.8262 | 0.8089 | 0.8194 | 42 | | arc_challenge | 0.5043 | 0.5000 | 0.4915 | 0.4949 | 43 | -------------------------------------------------------------------------------- /docs/gemma-7b-it-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | 3 | A sample command to generate an INT4 model. 4 | ```bash 5 | auto-round \ 6 | --model google/gemma-7b-it \ 7 | --device 0 \ 8 | --group_size 128 \ 9 | --bits 4 \ 10 | --minmax_lr 2e-3 \ 11 | --model_dtype "float16" \ 12 | --iters 1000 \ 13 | --nsamples 512 \ 14 | --asym \ 15 | --format 'auto_gptq,auto_round' \ 16 | --output_dir "./tmp_autoround" 17 | ``` 18 | 19 | Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, and the git id 96d185fa6232a5ab685ba7c43e45d1dbb3bb906d, Install the latest [AutoGPTQ](https://github.com/AutoGPTQ/AutoGPTQ) from source first 20 | 21 | Please note that there is a discrepancy between the baseline result and the official data, which is a known issue within the official model card community. 22 | 23 | ```bash 24 | lm_eval --model hf --model_args pretrained="Intel/gemma-7b-it-int4-inc",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 32 25 | ``` 26 | 27 | | Metric | BF16 | int4 | 28 | | -------------- |--------| ------ | 29 | | Avg. | 0.6022 | 0.6017 | 30 | | mmlu | 0.5029 | 0.4993 | 31 | | lambada_openai | 0.6035 | 0.6286 | 32 | | hellaswag | 0.5620 | 0.5564 | 33 | | winogrande | 0.6796 | 0.6788 | 34 | | piqa | 0.7709 | 0.7731 | 35 | | truthfulqa_mc1 | 0.3048 | 0.3035 | 36 | | openbookqa | 0.3740 | 0.3700 | 37 | | boolq | 0.8138 | 0.8144 | 38 | | rte | 0.7870 | 0.7870 | 39 | | arc_easy | 0.7525 | 0.7508 | 40 | | arc_challenge | 0.4727 | 0.4573 | 41 | -------------------------------------------------------------------------------- /docs/gguf_accuracy.md: -------------------------------------------------------------------------------- 1 | 1 We evaluate all models using the `fake` format, as lm-eval reports inaccurate accuracy for real GGUF format 2 | 3 | 4 | lm-eval 0.48 5 | 6 | ```bash 7 | lm-eval --model hf --model_args pretrained="./" --tasks mmlu,leaderboard_ifeval,leaderboard_mmlu_pro,gsm8k 8 | --batch_size 16 9 | ``` 10 | 11 | 2 `lm-head` and `embedding` layers are not quantized in any of the following models. 12 | 13 | | Q4_K_S | Avg. 
| mmlu | mmlu_pro | if_eval | gsm8k | 14 | |---------------------------|------------|--------|----------|----------|--------| 15 | | Qwen2.5-7B-GGUF | 0.6366 | 0.7097 | 0.4385 | 0.61115 | 0.7870 | 16 | | Qwen2.5-7B-AutoRound | **0.6529** | 0.7137 | 0.4471 | 0.6373 | 0.8135 | 17 | | Llama-3.1-8B-GGUF | 0.5589 | 0.6609 | 0.3610 | 0.4949 | 0.7187 | 18 | | Llama-3.1-8B-AutoRound | **0.5666** | 0.6627 | 0.3648 | 0.49965 | 0.7392 | 19 | | Falcon3-7B-GGUF | 0.5179 | 0.6649 | 0.3607 | 0.3251 | 0.7210 | 20 | | Falcon3-7B-AutoRound | **0.5261** | 0.6706 | 0.3841 | 0.31445 | 0.7354 | 21 | | phi-4-GGUF | **0.5623** | 0.7648 | 0.5292 | 0.0590 | 0.8961 | 22 | | phi-4-AutoRound | 0.5588 | 0.7673 | 0.5239 | 0.05175 | 0.8923 | 23 | 24 | | Q3_K_S | Avg. | mmlu | mmlu_pro | if_eval | gsm8k | 25 | |---------------------------|------------|--------|----------|----------|--------| 26 | | Qwen2.5-7B-GGUF | 0.5939 | 0.6936 | 0.4062 | 0.57675 | 0.6990 | 27 | | Qwen2.5-7B-AutoRound | **0.6103** | 0.7002 | 0.4171 | 0.6194 | 0.7043 | 28 | | Llama-3.1-8B-GGUF | 0.4903 | 0.6050 | 0.3260 | 0.44265 | 0.5876 | 29 | | Llama-3.1-8B-AutoRound | **0.5511** | 0.6548 | 0.3533 | 0.4913 | 0.7051 | 30 | | Falcon3-7B-GGUF | 0.4905 | 0.6434 | 0.3439 | 0.2871 | 0.6876 | 31 | | Falcon3-7B-AutoRound | **0.5296** | 0.6520 | 0.3679 | 0.30745 | 0.7911 | 32 | | phi-4-GGUF | **0.5527** | 0.7590 | 0.5072 | 0.0802 | 0.8643 | 33 | | phi-4-AutoRound | 0.5523 | 0.7657 | 0.5124 | 0.0587 | 0.8726 | 34 | 35 | | Q2_K_S | Avg. | mmlu | mmlu_pro | if_eval | gsm8k | 36 | |---------------------------|------------|--------|----------|----------|--------| 37 | | Qwen2.5-7B-GGUF | 0.3942 | 0.5750 | 0.2701 | 0.4071 | 0.3245 | 38 | | Qwen2.5-7B-AutoRound | **0.5133** | 0.6384 | 0.3383 | 0.4714 | 0.6050 | 39 | | Falcon3-7B-GGUF | 0.1936 | 0.3491 | 0.1521 | 0.21615 | 0.0569 | 40 | | Falcon3-7B-AutoRound | **0.3817** | 0.5607 | 0.2625 | 0.28955 | 0.4139 | 41 | | phi-4-GGUF | 0.4438 | 0.6715 | 0.3807 | 0.0802 | 0.6429 | 42 | | phi-4-AutoRound | **0.5113** | 0.7107 | 0.4383 | 0.08675 | 0.8097 | 43 | 44 | -------------------------------------------------------------------------------- /docs/gpt-j-6B-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | 3 | A sample command to generate an INT4 model. 4 | ```bash 5 | auto-round \ 6 | --model EleutherAI/gpt-j-6b \ 7 | --device 0 \ 8 | --group_size 128 \ 9 | --bits 4 \ 10 | --iters 1000 \ 11 | --nsamples 512 \ 12 | --asym \ 13 | --format 'auto_gptq,auto_round' \ 14 | --output_dir "./tmp_autoround" 15 | ``` 16 | 17 | 18 | Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, we used the git id 96d185fa6232a5ab685ba7c43e45d1dbb3bb906d 19 | ##pip install auto-gptq[triton] 20 | ##pip install triton==2.2.0 21 | ```bash 22 | lm_eval --model hf --model_args pretrained="./",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 32 23 | ``` 24 | 25 | 26 | 27 | | Metric | FP16 | INT4 | 28 | | -------------- | ------ | ------ | 29 | | Avg. 
| 0.5039 | 0.5034 | 30 | | mmlu | 0.2694 | 0.2793 | 31 | | lambada_openai | 0.6831 | 0.6790 | 32 | | hellaswag | 0.4953 | 0.4902 | 33 | | winogrande | 0.6409 | 0.6401 | 34 | | piqa | 0.7541 | 0.7465 | 35 | | truthfulqa_mc1 | 0.2020 | 0.2179 | 36 | | openbookqa | 0.2900 | 0.2900 | 37 | | boolq | 0.6544 | 0.6554 | 38 | | rte | 0.5451 | 0.5271 | 39 | | arc_easy | 0.6692 | 0.6734 | 40 | | arc_challenge | 0.3396 | 0.3387 | 41 | -------------------------------------------------------------------------------- /docs/imgs/autoround_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/auto-round/dc6d389545d841a2a29feac7ea89ff5d51b0e5a5/docs/imgs/autoround_overview.png -------------------------------------------------------------------------------- /docs/imgs/full_range_sym.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/auto-round/dc6d389545d841a2a29feac7ea89ff5d51b0e5a5/docs/imgs/full_range_sym.png -------------------------------------------------------------------------------- /docs/imgs/norm_bias_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/auto-round/dc6d389545d841a2a29feac7ea89ff5d51b0e5a5/docs/imgs/norm_bias_overview.png -------------------------------------------------------------------------------- /docs/llava-v1.5-7b-sym.md: -------------------------------------------------------------------------------- 1 | 2 | ## Model Details 3 | 4 | This model is an int4 model with group_size 128 and symmetric quantization of [liuhaotian/llava-v1.5-7b](https://huggingface.co/liuhaotian/llava-v1.5-7b). Load the model with revision="8ab8ff" to use AutoGPTQ format. 5 | 6 | ## How To Use 7 | 8 | ### Requirements 9 | 10 | 1. Clone this repository and navigate to LLaVA folder 11 | ```shell 12 | git clone https://github.com/haotian-liu/LLaVA.git 13 | cd LLaVA 14 | ``` 15 | 16 | 2. Refine LLaVA repo 17 | ``` 18 | vi llava/model/language_model/llava_llama.py 19 | # add 'cache_position = None,' to line 71. 20 | ``` 21 | 3. Install Package 22 | ``` 23 | pip install --upgrade pip # enable PEP 660 support 24 | pip install -e . 
25 | ``` 26 | 27 | ### INT4 Inference 28 | ```python 29 | from auto_round import AutoRoundConfig ## must import for auto-round format 30 | import requests 31 | import torch 32 | from PIL import Image 33 | from llava.model.builder import load_pretrained_model 34 | from llava.train.train import preprocess, preprocess_multimodal, DataCollatorForSupervisedDataset 35 | class DataArgs: 36 | is_multimodal = True 37 | mm_use_im_start_end = False 38 | 39 | quantized_model_path="Intel/llava-v1.5-7b-inc-private" 40 | 41 | tokenizer, model, image_processor, _ = load_pretrained_model( 42 | quantized_model_path, 43 | model_base=None, 44 | model_name=quantized_model_path, 45 | torch_dtype="auto", 46 | device_map="auto", 47 | ##revision="8ab8ff" ##AutoGPTQ format 48 | ) 49 | image_url = "http://images.cocodataset.org/train2017/000000116003.jpg" 50 | messages = [{"from": "human", "value": "What is the tennis player doing in the image?\n"}] 51 | 52 | # Preparation for inference 53 | image = Image.open(requests.get(image_url, stream=True).raw).convert('RGB') 54 | image_input = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0].to(model.device) 55 | input_data = preprocess_multimodal([messages], DataArgs()) 56 | inputs = preprocess(input_data, tokenizer, has_image=(image_input is not None)) 57 | 58 | output = model.generate(inputs['input_ids'].to(model.device), images=image_input.unsqueeze(0).half(), max_new_tokens=50) 59 | print(tokenizer.batch_decode(output)) 60 | 61 | ##INT4: The tennis player is celebrating a victory, raising his arms in the air, and holding his tennis racket. 62 | 63 | ##BF16: The tennis player is celebrating a victory, raising his arms in the air, and holding a tennis racket. 64 | 65 | image_url = "http://images.cocodataset.org/train2017/000000411975.jpg" 66 | messages = [{"from": "human", "value": "How many people are on the baseball field in the picture?\n"}] 67 | 68 | ##INT4: There are three people on the baseball field in the picture. 69 | 70 | ##BF16: There are three people on the baseball field in the picture. 71 | 72 | 73 | image_url = "http://images.cocodataset.org/train2017/000000093025.jpg" 74 | messages = [{"from": "human", "value": "How many people and animals are there in the image?\n"}] 75 | 76 | ##INT4: There are two people and one animal in the image. 77 | 78 | ##BF16: There are two people and one animal in the image. 79 | 80 | ``` 81 | 82 | ## Evaluation the model 83 | pip3 install lmms_eval. The evaluation process may encounter errors that require changing model backend or evaluation code. Detailed instructions will be provided in a future update 84 | ```bash 85 | auto-round-mllm --lmms --model Intel/llava-v1.5-7b-inc-private --tasks pope,textvqa_val,scienceqa,mmbench_en --output_dir "./eval_result" --device cuda:0 86 | ``` 87 | |Metric |16bits|Pile Calib INT4 | Llava Calib INT4 | 88 | |:-------------------|:------|:------|:--------------| 89 | |avg |65.40 |65.91 | 65.79 | 90 | |MMBench_DEV_EN_V11 |64.09 |64.43 |64.43 | 91 | |ScienceQA_VAL |64.87 |67.20 |66.80 | 92 | |TextVQA_VAL |45.56 |45.71 |45.81 | 93 | |POPE |87.09 |86.31 |86.12 | 94 | 95 | ### Generate the model 96 | Here is the sample command to reproduce the model. 
97 | ```bash 98 | pip install auto-round 99 | auto-round-mllm \ 100 | --model liuhaotian/llava-v1.5-7b \ 101 | --device 0 \ 102 | --group_size 128 \ 103 | --bits 4 \ 104 | --iters 1000 \ 105 | --nsample 512 \ 106 | --seqlen 2048 \ 107 | --format 'auto_gptq,auto_round' \ 108 | --output_dir "./tmp_autoround" 109 | ``` 110 | 111 | ## Ethical Considerations and Limitations 112 | 113 | The model can produce factually incorrect output, and should not be relied on to produce factually accurate information. Because of the limitations of the pretrained model and the finetuning datasets, it is possible that this model could generate lewd, biased or otherwise offensive outputs. 114 | 115 | Therefore, before deploying any applications of the model, developers should perform safety testing. 116 | 117 | ## Caveats and Recommendations 118 | 119 | Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. 120 | 121 | Here are a couple of useful links to learn more about Intel's AI software: 122 | 123 | - Intel Neural Compressor [link](https://github.com/intel/neural-compressor) 124 | 125 | ## Disclaimer 126 | 127 | The license on this model does not constitute legal advice. We are not responsible for the actions of third parties who use this model. Please consult an attorney before using this model for commercial purposes. 128 | 129 | ## Cite 130 | 131 | @article{cheng2023optimize, title={Optimize weight rounding via signed gradient descent for the quantization of llms}, author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao and Liu, Yi}, journal={arXiv preprint arXiv:2309.05516}, year={2023} } 132 | 133 | [arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round) 134 | -------------------------------------------------------------------------------- /docs/neural-chat-7b-v3-1-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | 3 | A sample command to generate an INT4 model. 4 | ```bash 5 | auto-round \ 6 | --model intel/neural-chat-7b-v3-1 \ 7 | --device 0 \ 8 | --group_size 128 \ 9 | --bits 4 \ 10 | --iters 1000 \ 11 | --nsamples 512 \ 12 | --minmax_lr 2e-3 \ 13 | --asym \ 14 | --format 'auto_gptq,auto_round' \ 15 | --output_dir "./tmp_autoround" 16 | ``` 17 | 18 | 19 | Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, we used the git id f3b7917091afba325af3980a35d8a6dcba03dc3f 20 | 21 | ~~~bash 22 | lm_eval --model hf --model_args pretrained="Intel/neural-chat-v3-1-int4-inc",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 128 23 | ~~~ 24 | 25 | | Metric | FP16 | INT4 | 26 | | -------------- | ------ | ------ | 27 | | Avg. 
| 0.6769 | 0.6721 | 28 | | mmlu | 0.5919 | 0.5862 | 29 | | lambada_openai | 0.7394 | 0.7337 | 30 | | hellaswag | 0.6323 | 0.6272 | 31 | | winogrande | 0.7687 | 0.7577 | 32 | | piqa | 0.8161 | 0.8150 | 33 | | truthfulqa_mc1 | 0.4431 | 0.4394 | 34 | | openbookqa | 0.3760 | 0.3700 | 35 | | boolq | 0.8783 | 0.8743 | 36 | | rte | 0.7690 | 0.7726 | 37 | | arc_easy | 0.8413 | 0.8384 | 38 | | arc_challenge | 0.5896 | 0.5785 | 39 | -------------------------------------------------------------------------------- /docs/neural-chat-7b-v3-3-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | 3 | A sample command to generate an INT4 model. 4 | ```bash 5 | auto-round \ 6 | --model intel/neural-chat-7b-v3-3 \ 7 | --device 0 \ 8 | --group_size 128 \ 9 | --bits 4 \ 10 | --iters 1000 \ 11 | --nsamples 512 \ 12 | --minmax_lr 2e-3 \ 13 | --asym \ 14 | --format 'auto_gptq,auto_round' \ 15 | --output_dir "./tmp_autoround" 16 | ``` 17 | 18 | 19 | 20 | Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, we used the git id f3b7917091afba325af3980a35d8a6dcba03dc3f 21 | 22 | ~~~bash 23 | lm_eval --model hf --model_args pretrained="Intel/neural-chat-v3-3-int4-inc",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 128 24 | ~~~ 25 | 26 | | Metric | FP16 | INT4 | 27 | | -------------- | ------ | ------ | 28 | | Avg. | 0.6778 | 0.6748 | 29 | | mmlu | 0.5993 | 0.5926 | 30 | | lambada_openai | 0.7303 | 0.7370 | 31 | | hellaswag | 0.6639 | 0.6559 | 32 | | winogrande | 0.7632 | 0.7735 | 33 | | piqa | 0.8101 | 0.8074 | 34 | | truthfulqa_mc1 | 0.4737 | 0.4737 | 35 | | openbookqa | 0.3880 | 0.3680 | 36 | | boolq | 0.8694 | 0.8694 | 37 | | rte | 0.7581 | 0.7509 | 38 | | arc_easy | 0.8266 | 0.8249 | 39 | | arc_challenge | 0.5734 | 0.5691 | 40 | -------------------------------------------------------------------------------- /docs/opt-2.7b-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | 3 | A sample command to generate an INT4 model. 4 | ```bash 5 | auto-round \ 6 | --model facebook/opt-2.7b \ 7 | --device 0 \ 8 | --group_size 128 \ 9 | --bits 4 \ 10 | --iters 1000 \ 11 | --nsamples 512 \ 12 | --minmax_lr 2e-3 \ 13 | --asym \ 14 | --format 'auto_gptq,auto_round' \ 15 | --output_dir "./tmp_autoround" 16 | ``` 17 | 18 | 19 | Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, we used the git id 96d185fa6232a5ab685ba7c43e45d1dbb3bb906d 20 | ##pip install auto-gptq[triton] 21 | ##pip install triton==2.2.0 22 | ```bash 23 | lm_eval --model hf --model_args pretrained="./",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 32 24 | ``` 25 | 26 | 27 | 28 | | Metric | FP16 | INT4 | 29 | | -------------- | ------ | ------ | 30 | | Avg. 
| 0.4722 | 0.4757 | 31 | | mmlu | 0.2568 | 0.2636 | 32 | | lambada_openai | 0.6359 | 0.6487 | 33 | | hellaswag | 0.4585 | 0.4519 | 34 | | winogrande | 0.6077 | 0.5967 | 35 | | piqa | 0.7367 | 0.7410 | 36 | | truthfulqa_mc1 | 0.2240 | 0.2338 | 37 | | openbookqa | 0.2500 | 0.2380 | 38 | | boolq | 0.6046 | 0.6505 | 39 | | rte | 0.5451 | 0.5379 | 40 | | arc_easy | 0.6077 | 0.6035 | 41 | | arc_challenge | 0.2679 | 0.2671 | 42 | -------------------------------------------------------------------------------- /docs/phi-2-old-sym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated; we recommend using the latest full-range symmetric quantization.** 2 | 3 | A sample command to generate an INT4 model. 4 | ```bash 5 | auto-round \ 6 | --model microsoft/phi-2 \ 7 | --device 0 \ 8 | --group_size 128 \ 9 | --bits 4 \ 10 | --iters 1000 \ 11 | --nsamples 512 \ 12 | --format 'auto_gptq,auto_round' \ 13 | --output_dir "./tmp_autoround" 14 | ``` 15 | 16 | 17 | pip install lm-eval==0.4.2 18 | 19 | Due to the significant accuracy drop with the asymmetric kernel for this model, we opted to use symmetric quantization. 20 | 21 | ```bash 22 | lm_eval --model hf --model_args pretrained="Intel/phi-2-int4-inc" --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,arc_easy,arc_challenge,mmlu --batch_size 16 23 | ``` 24 | 25 | | Metric | FP16 | INT4 | 26 | | -------------- | ------ | -------- | 27 | | Avg. | 0.6155 | 0.6163 | 28 | | mmlu | 0.5448 | 0.5417 | 29 | | lambada_openai | 0.6268 | 0.6225 | 30 | | hellaswag | 0.5585 | 0.5498 | 31 | | winogrande | 0.7530 | 0.7545 | 32 | | piqa | 0.7867 | 0.7824 | 33 | | truthfulqa_mc1 | 0.3133 | 0.3060 | 34 | | openbookqa | 0.4000 | 0.4100 | 35 | | boolq | 0.8339 | 0.8327 | 36 | | rte | 0.6245 | 0.6643 | 37 | | arc_easy | 0.7997 | 0.7955 | 38 | | arc_challenge | 0.5290 | 0.5196 | 39 | -------------------------------------------------------------------------------- /docs/tuning_norm_bias.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | ## Fast tuning of LayerNorm and Linear bias via fake quantization without rounding 4 | 5 | **Personal view by Wenhua; discussion and feedback are welcome** 6 | 7 | **Work in Progress** 8 |
9 | Recent studies have found that tuning LayerNorm and bias through an optimizer such as Adam can lead to better results, especially for low-bit quantization such as 2-bit. However, I personally do not favor using Adam for this purpose, as explained in the next section, and I introduce an alternative approach, described in the last section. 10 | 11 | ### Why not use Adam 12 | 13 | #### Reason 1: the learning rate and number of steps are hard to tune 14 | 15 | Since Adam adaptively tunes the step size based on the gradient and its square, the learning rate often needs adjustment for different models, different quantization bit widths, or both, as observed in most papers. I hypothesize that this tuning requirement arises because most papers report results for only a limited range of model families, while many new models continually emerge. Despite my experience in this domain, I still find it challenging to tune the learning rate beyond grid search, and I believe many users encounter the same issue. 16 | 17 | #### Reason 2: prone to overfitting 18 | 19 | Since Adam adapts the step size at each iteration, it is difficult to control how much the parameters change, which in some scenarios leads to significant deviations from the original model's weights. Meanwhile, we only use hundreds or thousands of samples to fine-tune a low-bit model, whereas the original model is trained on a large corpus and specialized datasets (e.g., instruction datasets). Consequently, even if the low-bit tuned model performs well on some language-modeling tasks, it may lose other capabilities as the deviation grows. 20 | 21 | 22 | 23 | ### Our way 24 | 25 | **An overview of our method** 26 |
27 | 28 | ![](../docs/imgs/norm_bias_overview.png) 29 | 30 |
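As a concrete reading aid for the formulas that follow, here is a minimal PyTorch sketch of the fake quantization step with the trainable offset v. It assumes per-tensor asymmetric quantization (group_size = -1, as used in the experiments below) and subtracts the zero point back at dequantization, a detail the shorthand formulas below elide; it is an illustration of the idea, not the exact implementation.

```python
import torch

def fake_quant_no_round(w: torch.Tensor, v: torch.Tensor, bits: int = 4) -> torch.Tensor:
    """Fake-quantize a LayerNorm weight or Linear bias without rounding (sketch)."""
    n_max = 2 ** bits - 1
    s = (w.max() - w.min()).clamp(min=1e-5) / n_max  # quantization scale from the weight range
    zp = -w.min() / s                                # zero point
    # No rounding: the tuned weights stay in 16-bit precision, so we only clip
    # into the quantization grid using the trainable offset v and map back.
    q = torch.clamp(w / s + zp + v, 0, n_max)
    return s * (q - zp)

# v is initialized to zeros, kept in [-0.5, 0.5], and updated with SignSGD, e.g.:
#   with torch.no_grad():
#       v -= lr * v.grad.sign()
#       v.clamp_(-0.5, 0.5)
```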
31 | 32 | 33 | We limit the tuned parameters in a quantization space, expressed as: 34 | $$ 35 | W' = s*clip(W/s+zp,N,M) 36 | $$ 37 | where 𝑠 is the quantization scale, predefined by 𝑊 and hyperparameters such as bits. 38 | 39 | To tune the W', following Signround, we add a trainable parameter V in the range [-0.5, 0.5], which can be easily tuned by SignSGD. 40 | 41 | $$ 42 | W' = s*clip(W/s+zp+v,N,M) 43 | $$ 44 | 45 | 46 | An important note: We remove the rounding to reduce unnecessary rounding loss, as the final weights of LayerNorm and bias are typically kept at 16-bit precision in most cases. 47 | 48 | 49 | 50 | **Result at W2G32** 51 | 52 | the tuning of layer normalization and Linear bias are fake quantized at W4G-1. 53 | 54 | Average accuracies of HellaSwag, WinoGrand, PIQA and LAMBADA, higher is better. 55 | 56 | | | OPT125m | OPT1.3B | OPT2.7B | OPT6.7B | LLaMAV2-7b | LLaMAV3-8B-Instruct | 57 | | --------- | ---------- | ---------- | ---------- | ---------- | ---------- | ------------------- | 58 | | SignRound | 0.3978 | 0.5094 | 0.5267 | 0.3681 | 0.6267 | 0.5890 | 59 | | Ours | **0.4077** | **0.5151** | **0.5596** | **0.3887** | **0.6315** | **0.5949** | 60 | 61 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.codespell] 2 | ignore-words = ".azure-pipelines/scripts/codeScan/codespell/autoround_dict.txt" -------------------------------------------------------------------------------- /requirements-cpu.txt: -------------------------------------------------------------------------------- 1 | intel-extension-for-pytorch 2 | intel-extension-for-transformers 3 | -------------------------------------------------------------------------------- /requirements-lib.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | datasets 3 | py-cpuinfo 4 | sentencepiece 5 | numpy < 2.0 6 | tqdm 7 | packaging 8 | pillow 9 | numba 10 | tbb 11 | transformers 12 | threadpoolctl 13 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | datasets 3 | py-cpuinfo 4 | sentencepiece 5 | numpy < 2.0 6 | tqdm 7 | packaging 8 | pillow 9 | numba 10 | tbb 11 | torch 12 | transformers>=4.38 13 | threadpoolctl 14 | lm-eval>=0.4.2,<0.5 -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | license_files = 3 | LICENSE 4 | third-party-programs.txt 5 | 6 | [options.entry_points] 7 | console_scripts = 8 | auto_round = auto_round.__main__:run 9 | auto-round = auto_round.__main__:run 10 | auto_round_eval = auto_round.__main__:run_eval 11 | auto-round-eval = auto_round.__main__:run_eval 12 | auto_round_mllm = auto_round.__main__:run_mllm 13 | auto-round-mllm = auto_round.__main__:run_mllm 14 | auto-round-fast = auto_round.__main__:run_fast 15 | auto_round_fast = auto_round.__main__:run_fast 16 | auto-round-best = auto_round.__main__:run_best 17 | auto_round_best = auto_round.__main__:run_best 18 | auto-round-light = auto_round.__main__:run_light 19 | auto_round_light = auto_round.__main__:run_light 20 | 21 | -------------------------------------------------------------------------------- /setup.py: 
-------------------------------------------------------------------------------- 1 | import re 2 | from io import open 3 | import os 4 | from setuptools import find_packages, setup 5 | import sys 6 | from functools import lru_cache 7 | 8 | os.environ["CC"] = "g++" 9 | os.environ["CXX"] = "g++" 10 | try: 11 | filepath = "./auto_round/version.py" 12 | with open(filepath) as version_file: 13 | (__version__,) = re.findall('__version__ = "(.*)"', version_file.read()) 14 | except Exception as error: 15 | assert False, "Error: Could not open '%s' due %s\n" % (filepath, error) 16 | 17 | version = __version__ 18 | 19 | # All BUILD_* flags are initially set to `False`` and 20 | # will be updated to `True` if the corresponding environment check passes. 21 | PYPI_RELEASE = os.environ.get("PYPI_RELEASE", None) 22 | BUILD_HPU_ONLY = os.environ.get("BUILD_HPU_ONLY", "0") == "1" 23 | 24 | 25 | @lru_cache(None) 26 | def is_habana_framework_installed(): 27 | """Check if Habana framework is installed. 28 | Only check for the habana_frameworks package without importing it to avoid 29 | initializing lazy-mode-related components. 30 | """ 31 | from importlib.util import find_spec 32 | 33 | package_spec = find_spec("habana_frameworks") 34 | return package_spec is not None 35 | 36 | 37 | @lru_cache(None) 38 | def is_hpu_available(): 39 | try: 40 | import habana_frameworks.torch.core as htcore # pylint: disable=E0401 41 | return True 42 | except ImportError: 43 | return False 44 | 45 | 46 | if is_hpu_available() or is_habana_framework_installed(): 47 | # When HPU is available, we build HPU only by default 48 | BUILD_HPU_ONLY = True 49 | 50 | 51 | def is_cpu_env(): 52 | try: 53 | import torch 54 | except Exception as e: 55 | print( 56 | f"Building extension requires PyTorch being installed, please install PyTorch first: {e}.\n NOTE: This issue may be raised due to pip build isolation system (ignoring local packages). Please use `--no-build-isolation` when installing with pip, and refer to https://github.com/intel/auto-round for more details.") 57 | sys.exit(1) 58 | if torch.cuda.is_available(): 59 | return False 60 | try: 61 | import habana_frameworks.torch.core as htcore 62 | return False 63 | except: 64 | return True 65 | 66 | 67 | def fetch_requirements(path): 68 | requirements = [] 69 | with open(path, "r") as fd: 70 | requirements = [r.strip() for r in fd.readlines()] 71 | return requirements 72 | 73 | 74 | PKG_INSTALL_CFG = { 75 | "include_packages": find_packages( 76 | include=[ 77 | "auto_round", 78 | "auto_round.*", 79 | "auto_round_extension", 80 | "auto_round_extension.*", 81 | ], 82 | ), 83 | "install_requires": fetch_requirements("requirements.txt"), 84 | "extras_require": { 85 | "cpu": fetch_requirements("requirements-cpu.txt"), 86 | }, 87 | } 88 | 89 | ############################################################################### 90 | # Configuration for auto_round_lib 91 | # From pip: 92 | # pip install auto-round-lib 93 | # From source: 94 | # python setup.py lib install 95 | ############################################################################### 96 | 97 | 98 | LIB_REQUIREMENTS_FILE = "requirements-lib.txt" 99 | LIB_INSTALL_CFG = { 100 | "include_packages": find_packages( 101 | include=[ 102 | "auto_round", 103 | "auto_round.*", 104 | "auto_round_extension", 105 | "auto_round_extension.*", 106 | ], 107 | ), 108 | "install_requires": fetch_requirements(LIB_REQUIREMENTS_FILE), 109 | } 110 | 111 | if __name__ == "__main__": 112 | # There are two ways to install hpu-only package: 113 | # 1. 
python setup.py lib install 114 | # 2. Within the gaudi docker where the HPU is available, we install the auto_round_lib by default. 115 | is_user_requesting_library_build = "lib" in sys.argv 116 | if is_user_requesting_library_build: 117 | sys.argv.remove("lib") 118 | should_build_library = is_user_requesting_library_build or BUILD_HPU_ONLY 119 | 120 | if should_build_library: 121 | package_name = "auto_round_lib" 122 | INSTALL_CFG = LIB_INSTALL_CFG 123 | else: 124 | package_name = "auto_round" 125 | INSTALL_CFG = PKG_INSTALL_CFG 126 | 127 | include_packages = INSTALL_CFG.get("include_packages", {}) 128 | install_requires = INSTALL_CFG.get("install_requires", []) 129 | extras_require = INSTALL_CFG.get("extras_require", {}) 130 | 131 | setup( 132 | name=package_name, 133 | author="Intel AIPT Team", 134 | version=version, 135 | author_email="wenhua.cheng@intel.com, weiwei1.zhang@intel.com, heng.guo@intel.com", 136 | description="Repository of AutoRound: Advanced Weight-Only Quantization Algorithm for LLMs", 137 | long_description=open("README.md", "r", encoding="utf-8").read(), 138 | long_description_content_type="text/markdown", 139 | keywords="quantization,auto-around,LLM,SignRound", 140 | license="Apache 2.0", 141 | url="https://github.com/intel/auto-round", 142 | packages=include_packages, 143 | install_requires=install_requires, 144 | extras_require=extras_require, 145 | python_requires=">=3.7.0", 146 | classifiers=[ 147 | "Intended Audience :: Science/Research", 148 | "Programming Language :: Python :: 3", 149 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 150 | "License :: OSI Approved :: Apache Software License", 151 | ], 152 | include_package_data=True, 153 | package_data={"": ["mllm/templates/*.json"]}, 154 | ) 155 | -------------------------------------------------------------------------------- /test/test_cpu/_test_helpers.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def is_pytest_mode_compile(): 5 | return pytest.mode == "compile" 6 | 7 | 8 | def is_pytest_mode_lazy(): 9 | return pytest.mode == "lazy" 10 | 11 | 12 | def model_infer(model, tokenizer, apply_chat_template=False): 13 | prompts = [ 14 | "Hello,my name is", 15 | # "The president of the United States is", 16 | # "The capital of France is", 17 | # "The future of AI is", 18 | ] 19 | if apply_chat_template: 20 | texts = [] 21 | for prompt in prompts: 22 | messages = [ 23 | {"role": "user", "content": prompt} 24 | ] 25 | text = tokenizer.apply_chat_template( 26 | messages, 27 | tokenize=False, 28 | add_generation_prompt=True 29 | ) 30 | texts.append(text) 31 | prompts = texts 32 | 33 | inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) 34 | 35 | outputs = model.generate( 36 | input_ids=inputs["input_ids"].to(model.device), 37 | attention_mask=inputs["attention_mask"].to(model.device), 38 | do_sample=False, ## change this to follow official usage 39 | max_new_tokens=5 40 | ) 41 | generated_ids = [ 42 | output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs["input_ids"], outputs) 43 | ] 44 | 45 | decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 46 | 47 | for i, prompt in enumerate(prompts): 48 | print(f"Prompt: {prompt}") 49 | print(f"Generated: {decoded_outputs[i]}") 50 | print("-" * 50) 51 | return decoded_outputs[0] 52 | -------------------------------------------------------------------------------- /test/test_cpu/conftest.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | from typing import Mapping 3 | 4 | import pytest 5 | 6 | 7 | def pytest_addoption(parser): 8 | parser.addoption( 9 | "--mode", 10 | action="store", 11 | default="lazy", 12 | help="{compile|lazy}, default lazy. Choose mode to run tests", 13 | ) 14 | 15 | 16 | backup_env = pytest.StashKey[Mapping]() 17 | 18 | 19 | def pytest_configure(config): 20 | pytest.mode = config.getoption("--mode") 21 | assert pytest.mode.lower() in ["lazy", "compile"] 22 | 23 | config.stash[backup_env] = os.environ 24 | 25 | if pytest.mode == "lazy": 26 | os.environ["PT_HPU_LAZY_MODE"] = "1" 27 | elif pytest.mode == "compile": 28 | os.environ["PT_HPU_LAZY_MODE"] = "0" 29 | os.environ["PT_ENABLE_INT64_SUPPORT"] = "1" 30 | 31 | 32 | def pytest_unconfigure(config): 33 | os.environ.clear() 34 | os.environ.update(config.stash[backup_env]) 35 | -------------------------------------------------------------------------------- /test/test_cpu/requirements.txt: -------------------------------------------------------------------------------- 1 | addict 2 | modelscope 3 | gguf 4 | torchvision 5 | -------------------------------------------------------------------------------- /test/test_cpu/test_act_quantization.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | import torch 8 | from transformers import AutoModelForCausalLM, AutoTokenizer 9 | 10 | from auto_round import AutoRound 11 | 12 | 13 | class LLMDataLoader: 14 | def __init__(self): 15 | self.batch_size = 1 16 | 17 | def __iter__(self): 18 | for i in range(3): 19 | yield torch.ones([1, 10], dtype=torch.long) 20 | 21 | 22 | class TestAutoRoundAct(unittest.TestCase): 23 | @classmethod 24 | def setUpClass(self): 25 | model_name = "facebook/opt-125m" 26 | self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) 27 | self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 28 | self.llm_dataloader = LLMDataLoader() 29 | 30 | @classmethod 31 | def tearDownClass(self): 32 | shutil.rmtree("./saved", ignore_errors=True) 33 | shutil.rmtree("runs", ignore_errors=True) 34 | 35 | def test_mx_fp4(self): 36 | model_name = "facebook/opt-125m" 37 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) 38 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 39 | bits, group_size, sym = 4, 128, True 40 | autoround = AutoRound( 41 | model, 42 | tokenizer, 43 | bits=bits, 44 | group_size=group_size, 45 | sym=sym, 46 | iters=2, 47 | seqlen=2, 48 | dataset=self.llm_dataloader, 49 | act_bits=4, 50 | data_type="mx_fp" 51 | ) 52 | autoround.quantize() 53 | 54 | def test_wint4fp8_dynamic(self): 55 | model_name = "facebook/opt-125m" 56 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) 57 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 58 | bits, group_size = 4, 128 59 | autoround = AutoRound( 60 | model, 61 | tokenizer, 62 | bits=bits, 63 | group_size=group_size, 64 | iters=2, 65 | seqlen=2, 66 | dataset=self.llm_dataloader, 67 | act_bits=8, 68 | data_type="fp8_to_int_sym", 69 | act_data_type="fp8_dynamic_per_token" 70 | ) 71 | autoround.quantize() 72 | 73 | def test_wint4fp8_static(self): 74 | bits, group_size, sym = 4, 128, True 75 
| autoround = AutoRound( 76 | self.model, 77 | self.tokenizer, 78 | bits=bits, 79 | group_size=group_size, 80 | sym=sym, 81 | iters=2, 82 | seqlen=2, 83 | dataset=self.llm_dataloader, 84 | act_bits=8, 85 | data_type="fp8_to_int_sym", 86 | act_dynamic=False, 87 | act_data_type="fp8" 88 | ) 89 | autoround.quantize() 90 | -------------------------------------------------------------------------------- /test/test_cpu/test_auto_round_hpu_only.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from auto_round.utils import is_hpu_supported 4 | 5 | from _test_helpers import is_pytest_mode_compile, is_pytest_mode_lazy 6 | 7 | 8 | def run_opt_125m_on_hpu(): 9 | from auto_round import AutoRound 10 | from transformers import AutoModelForCausalLM, AutoTokenizer 11 | 12 | model_name = "facebook/opt-125m" 13 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) 14 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 15 | 16 | bits, group_size, sym = 4, 128, False 17 | autoround = AutoRound( 18 | model, 19 | tokenizer, 20 | bits=bits, 21 | group_size=group_size, 22 | sym=sym, 23 | iters=2, 24 | seqlen=2, 25 | ) 26 | q_model, qconfig = autoround.quantize() 27 | assert q_model is not None, f"Expected q_model to be not None" 28 | 29 | 30 | @pytest.mark.skipif(not is_hpu_supported(), reason="HPU is not supported") 31 | @pytest.mark.skipif(not is_pytest_mode_lazy(), reason="Only for lazy mode") 32 | def test_opt_125m_lazy_mode(): 33 | run_opt_125m_on_hpu() 34 | 35 | 36 | @pytest.mark.skipif(not is_hpu_supported(), reason="HPU is not supported") 37 | @pytest.mark.skipif(not is_pytest_mode_compile(), reason="Only for compile mode") 38 | def test_opt_125m_compile_mode(): 39 | torch._dynamo.reset() 40 | run_opt_125m_on_hpu() 41 | 42 | 43 | def test_import(): 44 | from auto_round import AutoRound 45 | from auto_round.export.export_to_itrex.export import ( 46 | WeightOnlyLinear, save_quantized_as_itrex) 47 | 48 | 49 | @pytest.mark.parametrize( 50 | "data_type", 51 | ["fp8_to_int_sym"], 52 | ) 53 | def test_w4a8(data_type): 54 | from auto_round import AutoRound 55 | from transformers import AutoModelForCausalLM, AutoTokenizer 56 | 57 | model_name = "facebook/opt-125m" 58 | model = AutoModelForCausalLM.from_pretrained( 59 | model_name, 60 | torch_dtype="auto", 61 | attn_implementation="eager", 62 | trust_remote_code=True, 63 | ) 64 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 65 | 66 | autoround = AutoRound( 67 | model, 68 | tokenizer, 69 | bits=4, 70 | group_size=128, 71 | iters=2, 72 | seqlen=2, 73 | data_type=data_type, 74 | act_data_type="fp8_sym", 75 | act_bits=8, 76 | act_dynamic=False, 77 | ) 78 | q_model, qconfig = autoround.quantize() 79 | assert q_model is not None, f"Expected q_model to be not None" 80 | -------------------------------------------------------------------------------- /test/test_cpu/test_autoopt.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | import torch 8 | import transformers 9 | from transformers import AutoModelForCausalLM, AutoTokenizer 10 | 11 | from auto_round import AutoRoundOPT, AutoRoundAdam 12 | 13 | 14 | class LLMDataLoader: 15 | def __init__(self): 16 | self.batch_size = 1 17 | 18 | def __iter__(self): 19 | for i in range(2): 20 | yield torch.ones([1, 
10], dtype=torch.long) 21 | 22 | 23 | class TestAutoRound(unittest.TestCase): 24 | @classmethod 25 | def setUpClass(self): 26 | model_name = "facebook/opt-125m" 27 | self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) 28 | self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 29 | self.llm_dataloader = LLMDataLoader() 30 | 31 | @classmethod 32 | def tearDownClass(self): 33 | shutil.rmtree("./saved", ignore_errors=True) 34 | shutil.rmtree("runs", ignore_errors=True) 35 | 36 | def test_default(self): 37 | bits, group_size, sym = 4, 128, False 38 | autoround = AutoRoundOPT( 39 | self.model, 40 | self.tokenizer, 41 | bits=bits, 42 | group_size=group_size, 43 | sym=sym, 44 | iters=2, 45 | seqlen=10, 46 | dataset=self.llm_dataloader, 47 | to_quant_block_names=None 48 | ) 49 | autoround.quantize() 50 | 51 | 52 | def test_Adam(self): 53 | bits, group_size, sym = 4, 128, False 54 | from auto_round.utils import get_block_names 55 | llm_block_names = get_block_names(self.model, quant_vision=True) 56 | bits, group_size, sym, batch_size = 4, 128, False, 20 57 | adamround = AutoRoundAdam( 58 | self.model, 59 | self.tokenizer, 60 | bits=bits, 61 | group_size=group_size, 62 | sym=sym, 63 | iters=2, 64 | seqlen=2, 65 | batch_size=batch_size, 66 | dataset=self.llm_dataloader, 67 | to_quant_block_names=llm_block_names 68 | ) 69 | adamround.quantize() 70 | 71 | 72 | if __name__ == "__main__": 73 | unittest.main() 74 | 75 | 76 | -------------------------------------------------------------------------------- /test/test_cpu/test_autoround_acc.py: -------------------------------------------------------------------------------- 1 | 2 | import copy 3 | import shutil 4 | import sys 5 | import unittest 6 | sys.path.insert(0, "../..") 7 | import torch 8 | import transformers 9 | from math import isclose 10 | from transformers import AutoModelForCausalLM, AutoTokenizer 11 | from auto_round import AutoRound # pylint: disable=E0401 12 | from auto_round.export.export_to_itrex.export import pack_model # pylint: disable=E0401 13 | 14 | class LLMDataLoader: 15 | def __init__(self): 16 | self.batch_size = 1 17 | 18 | def __iter__(self): 19 | for i in range(2): 20 | yield torch.ones([1, 10], dtype=torch.long) 21 | 22 | 23 | class TestAutoRound(unittest.TestCase): 24 | @classmethod 25 | def setUpClass(self): 26 | self.model_name = "hf-internal-testing/tiny-random-GPTJForCausalLM" 27 | self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype=torch.float32, trust_remote_code=True) 28 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) 29 | self.llm_dataloader = LLMDataLoader() 30 | 31 | @classmethod 32 | def tearDownClass(self): 33 | shutil.rmtree("./saved", ignore_errors=True) 34 | shutil.rmtree("runs", ignore_errors=True) 35 | 36 | def test_default_acc(self): 37 | bits, group_size, sym = 4, 128, True 38 | inp = torch.ones([1, 10], dtype=torch.long) 39 | autoround = AutoRound( 40 | self.model, 41 | self.tokenizer, 42 | bits=bits, 43 | device="cpu", 44 | group_size=group_size, 45 | sym=sym, 46 | iters=2, 47 | seqlen=10, 48 | dataset=self.llm_dataloader 49 | ) 50 | autoround.quantize() 51 | out0 = self.model(inp) 52 | print(f"out0 = {float(out0[0][0][0][0])}") 53 | 54 | model_tmp = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype=torch.float32, trust_remote_code=True) 55 | autoround_1 = AutoRound( 56 | model_tmp, 57 | self.tokenizer, 58 | bits=bits, 59 | 
group_size=group_size, 60 | sym=sym, 61 | device="cpu", 62 | iters=2, 63 | seqlen=10, 64 | dataset=self.llm_dataloader 65 | ) 66 | autoround_1.quantize() 67 | out1 = model_tmp(inp) 68 | 69 | assert out0[0].equal(out1[0]) 70 | self.assertTrue(isclose(float(out0[0][0][0][0]), -0.021002087742090225, rel_tol=5e-04)) 71 | 72 | 73 | if __name__ == "__main__": 74 | unittest.main() 75 | 76 | -------------------------------------------------------------------------------- /test/test_cpu/test_autoround_export_to_itrex.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | import torch 8 | import transformers 9 | from transformers import AutoModelForCausalLM, AutoTokenizer 10 | 11 | from auto_round import AutoRound 12 | 13 | 14 | class SimpleDataLoader: 15 | def __init__(self): 16 | self.batch_size = 1 17 | 18 | def __iter__(self): 19 | for i in range(2): 20 | yield torch.randn([1, 30]) 21 | 22 | 23 | class LLMDataLoader: 24 | def __init__(self): 25 | self.batch_size = 1 26 | 27 | def __iter__(self): 28 | for i in range(2): 29 | yield torch.ones([1, 10], dtype=torch.long) 30 | 31 | 32 | class TestAutoroundExport(unittest.TestCase): 33 | approach = "weight_only" 34 | 35 | @classmethod 36 | def setUpClass(self): 37 | self.gptj = transformers.AutoModelForCausalLM.from_pretrained( 38 | "hf-internal-testing/tiny-random-GPTJForCausalLM", 39 | torchscript=True, 40 | ) 41 | self.tokenizer = transformers.AutoTokenizer.from_pretrained( 42 | "hf-internal-testing/tiny-random-GPTJForCausalLM", trust_remote_code=True 43 | ) 44 | self.gptj_no_jit = transformers.AutoModelForCausalLM.from_pretrained( 45 | "hf-internal-testing/tiny-random-GPTJForCausalLM", 46 | ) 47 | self.llm_dataloader = LLMDataLoader() 48 | self.lm_input = torch.ones([1, 10], dtype=torch.long) 49 | 50 | @classmethod 51 | def tearDownClass(self): 52 | shutil.rmtree("./saved", ignore_errors=True) 53 | shutil.rmtree("runs", ignore_errors=True) 54 | 55 | def test_autoround_int_quant(self): 56 | model = copy.deepcopy(self.gptj) 57 | out1 = model(self.lm_input) 58 | round = AutoRound 59 | optq_1 = round(model, self.tokenizer, nsamples=20, amp=False, seqlen=10, iters=10, enable_torch_compile=False) 60 | q_model, layer_config1 = optq_1.quantize() ##compile model 61 | from auto_round.export.export_to_itrex import pack_model 62 | 63 | compressed_model = pack_model(model=q_model, layer_config=layer_config1) 64 | out2 = model(self.lm_input) 65 | out3 = q_model(self.lm_input) 66 | out4 = compressed_model(self.lm_input) 67 | self.assertTrue(torch.all(torch.isclose(out1[0], out2[0], atol=1e-1))) 68 | self.assertFalse(torch.all(out1[0] == out2[0])) 69 | self.assertTrue(torch.all(out2[0] == out3[0])) 70 | self.assertTrue(torch.all(torch.isclose(out3[0], out4[0], atol=1e-3))) 71 | self.assertTrue("transformer.h.0.attn.k_proj.qzeros" in compressed_model.state_dict().keys()) 72 | 73 | model = copy.deepcopy(self.gptj) 74 | out6 = model(self.lm_input) 75 | optq_2 = round(model, self.tokenizer, device="cpu", nsamples=20, seqlen=10) 76 | q_model, layer_config2 = optq_2.quantize() 77 | compressed_model = pack_model(model=q_model, layer_config=layer_config2, inplace=False) 78 | compressed_model = compressed_model.to(torch.float32) 79 | out4 = q_model(self.lm_input) 80 | out5 = compressed_model(self.lm_input) 81 | self.assertTrue(torch.all(out1[0] == out6[0])) 82 | self.assertTrue(torch.all(torch.isclose(out4[0], out5[0], atol=5e-3))) 83 
| 84 | def test_config(self): 85 | from auto_round.export.export_to_itrex import QuantConfig 86 | 87 | config = QuantConfig.from_pretrained("TheBloke/Llama-2-7B-Chat-GPTQ") 88 | config.save_pretrained("quantization_config_dir") 89 | loaded_config = QuantConfig.from_pretrained("quantization_config_dir") 90 | self.assertEqual(config.group_size, loaded_config.group_size) 91 | self.assertEqual(config.desc_act, loaded_config.desc_act) 92 | self.assertEqual(config.bits, loaded_config.bits) 93 | self.assertEqual(config.sym, loaded_config.sym) 94 | 95 | def test_xpu_export(self): 96 | model = copy.deepcopy(self.gptj) 97 | out1 = model(self.lm_input) 98 | round = AutoRound 99 | optq_1 = round(model, self.tokenizer, nsamples=20, amp=False, seqlen=10, iters=10, enable_torch_compile=False) 100 | q_model, layer_config1 = optq_1.quantize() 101 | from auto_round.export.export_to_itrex import pack_model 102 | 103 | compressed_model_xpu = pack_model(model=q_model, layer_config=layer_config1, device="xpu", inplace=False) 104 | compressed_model_cpu = pack_model(model=q_model, layer_config=layer_config1, inplace=False) 105 | out2 = model(self.lm_input) 106 | out3 = q_model(self.lm_input) 107 | out4 = compressed_model_xpu(self.lm_input) 108 | out5 = compressed_model_cpu(self.lm_input) 109 | self.assertTrue(torch.all(torch.isclose(out1[0], out2[0], atol=1e-1))) 110 | self.assertFalse(torch.all(out1[0] == out2[0])) 111 | self.assertTrue(torch.all(out2[0] == out3[0])) 112 | self.assertTrue(torch.all(torch.isclose(out3[0], out4[0], atol=1e-3))) 113 | self.assertTrue(torch.all(torch.isclose(out4[0], out5[0], atol=1e-5))) 114 | 115 | 116 | if __name__ == "__main__": 117 | unittest.main() 118 | 119 | -------------------------------------------------------------------------------- /test/test_cpu/test_basic_usage.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, '../..') 7 | 8 | 9 | class TestAutoRoundCmd(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(self): 12 | pass 13 | 14 | @classmethod 15 | def tearDownClass(self): 16 | shutil.rmtree("./saved", ignore_errors=True) 17 | shutil.rmtree("runs", ignore_errors=True) 18 | 19 | def test_auto_round_cmd(self): 20 | python_path = sys.executable 21 | 22 | ##test llm script 23 | # res = os.system( 24 | # f"cd ../.. && {python_path} -m auto_round -h") 25 | # if res > 0 or res == -1: 26 | # assert False, "cmd line test fail, please have a check" 27 | # 28 | res = os.system( 29 | f"cd ../.. && {python_path} -m auto_round --model 'facebook/opt-125m' --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa") 30 | if res > 0 or res == -1: 31 | assert False, "cmd line test fail, please have a check" 32 | 33 | res = os.system( 34 | f"cd ../.. && {python_path} -m auto_round --model 'facebook/opt-125m' --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32" 35 | ) 36 | if res > 0 or res == -1: 37 | assert False, "cmd line test fail, please have a check" 38 | 39 | res = os.system( 40 | f"cd ../.. && {python_path} -c 'from auto_round.__main__ import run_light; run_light()' --seqlen 8 --iter 2 --nsamples 8 --output_dir ./saved --tasks lambada_openai") 41 | if res > 0 or res == -1: 42 | assert False, "cmd line test fail, please have a check" 43 | 44 | # test mllm script 45 | # test auto_round_mllm help 46 | res = os.system( 47 | f"cd ../.. 
&& {python_path} -m auto_round --mllm -h") 48 | if res > 0 or res == -1: 49 | assert False, "cmd line test fail, please have a check" 50 | 51 | # test auto_round_mllm --eval help 52 | res = os.system( 53 | f"cd ../.. && {python_path} -m auto_round --mllm --eval -h") 54 | if res > 0 or res == -1: 55 | assert False, "cmd line test fail, please have a check" 56 | 57 | # test auto_round_mllm --lmms help 58 | res = os.system( 59 | f"cd ../.. && {python_path} -m auto_round --mllm --lmms -h") 60 | if res > 0 or res == -1: 61 | assert False, "cmd line test fail, please have a check" 62 | 63 | res = os.system( 64 | f"cd ../.. && {python_path} -m auto_round --mllm --iter 2 --nsamples 10 --seqlen 32 --format auto_round --output_dir ./saved") 65 | if res > 0 or res == -1: 66 | assert False, "cmd line test fail, please have a check" 67 | 68 | res = os.system( 69 | f"cd ../.. && {python_path} -m auto_round --mllm --iter 2 --nsamples 10 --seqlen 256 --format auto_round" 70 | " --quant_nontext_module --output_dir ./saved ") 71 | if res > 0 or res == -1: 72 | assert False, "cmd line test fail, please have a check" 73 | 74 | 75 | if __name__ == "__main__": 76 | unittest.main() 77 | -------------------------------------------------------------------------------- /test/test_cpu/test_calib_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | import json 8 | 9 | import torch 10 | from transformers import AutoModelForCausalLM, AutoTokenizer 11 | 12 | from auto_round import AutoRound 13 | 14 | 15 | class LLMDataLoader: 16 | def __init__(self): 17 | self.batch_size = 1 18 | 19 | def __iter__(self): 20 | for i in range(2): 21 | yield torch.ones([1, 10], dtype=torch.long) 22 | 23 | 24 | class TestLocalCalibDataset(unittest.TestCase): 25 | @classmethod 26 | def setUpClass(self): 27 | json_data = [{"text": "awefdsfsddfd"}, {"text": "fdfdfsdfdfdfd"}, {"text": "dfdsfsdfdfdfdf"}] 28 | os.makedirs("./saved", exist_ok=True) 29 | self.json_file = "./saved/tmp.json" 30 | with open(self.json_file, "w") as json_file: 31 | json.dump(json_data, json_file, indent=4) 32 | 33 | jsonl_data = [{"text": "哈哈,開心點"}, {"text": "hello world"}] 34 | os.makedirs("./saved", exist_ok=True) 35 | self.jsonl_file = "./saved/tmp.jsonl" 36 | with open(self.jsonl_file, "w") as jsonl_file: 37 | for item in jsonl_data: 38 | json.dump(item, jsonl_file, ensure_ascii=False) 39 | jsonl_file.write('\n') 40 | 41 | model_name = "facebook/opt-125m" 42 | self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) 43 | self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 44 | 45 | def test_json(self): 46 | bits, group_size, sym = 4, 128, True 47 | autoround = AutoRound( 48 | self.model, 49 | self.tokenizer, 50 | bits=bits, 51 | group_size=group_size, 52 | sym=sym, 53 | iters=2, 54 | seqlen=5, 55 | dataset=self.json_file, 56 | ) 57 | autoround.quantize() 58 | 59 | def test_jsonl(self): 60 | bits, group_size, sym = 4, 128, True 61 | autoround = AutoRound( 62 | self.model, 63 | self.tokenizer, 64 | bits=bits, 65 | group_size=group_size, 66 | sym=sym, 67 | iters=2, 68 | seqlen=4, 69 | dataset=self.jsonl_file, 70 | ) 71 | autoround.quantize() 72 | 73 | def test_apply_chat_template(self): 74 | model_name = "Qwen/Qwen2.5-0.5B-Instruct" 75 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) 76 | 
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 77 | dataset = "NeelNanda/pile-10k:apply_chat_template:system_prompt=''" 78 | bits, group_size, sym = 4, 128, True 79 | autoround = AutoRound( 80 | model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=128, dataset=dataset 81 | ) 82 | autoround.quantize() 83 | 84 | def test_combine_dataset(self): 85 | dataset = "NeelNanda/pile-10k" + "," + "madao33/new-title-chinese" + "," + "mbpp" 86 | bits, group_size, sym = 4, 128, True 87 | autoround = AutoRound( 88 | self.model, self.tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=128, dataset=dataset 89 | ) 90 | autoround.quantize() 91 | 92 | def test_combine_dataset2(self): 93 | dataset = "NeelNanda/pile-10k:num=256,mbpp:num=256" 94 | bits, group_size, sym = 4, 128, True 95 | autoround = AutoRound( 96 | self.model, self.tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=128, dataset=dataset 97 | ) 98 | autoround.quantize() 99 | 100 | # def test_pile_val_backup_dataset(self): 101 | # dataset = "swift/pile-val-backup" 102 | # bits, group_size, sym = 4, 128, True 103 | # autoround = AutoRound( 104 | # self.model, self.tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=128, dataset=dataset 105 | # ) 106 | # autoround.quantize() 107 | 108 | @classmethod 109 | def tearDownClass(self): 110 | shutil.rmtree("./saved", ignore_errors=True) 111 | shutil.rmtree("runs", ignore_errors=True) 112 | 113 | 114 | if __name__ == "__main__": 115 | unittest.main() 116 | 117 | 118 | -------------------------------------------------------------------------------- /test/test_cpu/test_conv1d.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | import torch 8 | from transformers import AutoModelForCausalLM, AutoTokenizer 9 | 10 | from auto_round import AutoRound 11 | from _test_helpers import model_infer 12 | class LLMDataLoader: 13 | def __init__(self): 14 | self.batch_size = 1 15 | 16 | def __iter__(self): 17 | for i in range(2): 18 | yield torch.ones([1, 10], dtype=torch.long) 19 | 20 | 21 | class TestQuantizationConv1d(unittest.TestCase): 22 | @classmethod 23 | def setUpClass(self): 24 | self.model_name = "MBZUAI/LaMini-GPT-124M" 25 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) 26 | self.llm_dataloader = LLMDataLoader() 27 | 28 | @classmethod 29 | def tearDownClass(self): 30 | shutil.rmtree("./saved", ignore_errors=True) 31 | shutil.rmtree("runs", ignore_errors=True) 32 | 33 | 34 | def test_quant(self): 35 | self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) 36 | bits, group_size, sym = 4, 128, True 37 | autoround = AutoRound( 38 | self.model, 39 | self.tokenizer, 40 | bits=bits, 41 | group_size=group_size, 42 | sym=sym, 43 | iters=2, 44 | seqlen=2, 45 | dataset=self.llm_dataloader, 46 | 47 | ) 48 | 49 | autoround.quantize() 50 | autoround.save_quantized("./saved") 51 | 52 | model = AutoModelForCausalLM.from_pretrained("./saved", device_map="cpu", trust_remote_code=True) 53 | model_infer(model, self.tokenizer) 54 | 55 | 56 | 57 | if __name__ == "__main__": 58 | unittest.main() 59 | -------------------------------------------------------------------------------- /test/test_cpu/test_generation.py: -------------------------------------------------------------------------------- 
1 | import copy 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | import torch 8 | from transformers import AutoModelForCausalLM, AutoTokenizer, AutoRoundConfig 9 | 10 | from auto_round import AutoRound 11 | 12 | 13 | class LLMDataLoader: 14 | def __init__(self): 15 | self.batch_size = 1 16 | 17 | def __iter__(self): 18 | for i in range(2): 19 | yield torch.ones([1, 10], dtype=torch.long) 20 | 21 | 22 | class TestAutoRoundFormatGeneration(unittest.TestCase): 23 | @classmethod 24 | def setUpClass(self): 25 | self.model_name = "facebook/opt-125m" 26 | self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) 27 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) 28 | self.llm_dataloader = LLMDataLoader() 29 | self.save_folder = "./saved" 30 | 31 | @classmethod 32 | def tearDownClass(self): 33 | shutil.rmtree(self.save_folder, ignore_errors=True) 34 | shutil.rmtree("runs", ignore_errors=True) 35 | 36 | def test_4bits_sym(self): 37 | bits = 4 38 | group_size = 128 39 | sym = True 40 | autoround = AutoRound( 41 | self.model, 42 | self.tokenizer, 43 | bits=bits, 44 | group_size=group_size, 45 | sym=sym, 46 | iters=1, 47 | seqlen=2, 48 | dataset=self.llm_dataloader, 49 | ) 50 | quantized_model_path = self.save_folder 51 | 52 | autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round", inplace=False) 53 | 54 | quantization_config = AutoRoundConfig( 55 | backend="ipex" 56 | ) 57 | model = AutoModelForCausalLM.from_pretrained(quantized_model_path, 58 | device_map="cpu", quantization_config=quantization_config) 59 | tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) 60 | text = "My name is " 61 | inputs = tokenizer(text, return_tensors="pt").to(model.device) 62 | res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) 63 | print(res) 64 | assert ("!!!" not in res) 65 | 66 | model = AutoModelForCausalLM.from_pretrained(quantized_model_path, 67 | device_map="cpu", quantization_config=quantization_config, 68 | torch_dtype=torch.float16) 69 | tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) 70 | text = "There is a girl who likes adventure," 71 | inputs = tokenizer(text, return_tensors="pt").to(model.device) 72 | res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) 73 | print(res) 74 | assert ("!!!" not in res) 75 | 76 | def test_autoround_sym(self): 77 | for bits in [4]: 78 | model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) 79 | tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) 80 | bits, group_size, sym = bits, 128, True 81 | autoround = AutoRound( 82 | model, 83 | tokenizer, 84 | bits=bits, 85 | group_size=group_size, 86 | sym=sym, 87 | iters=2, 88 | seqlen=2, 89 | dataset=self.llm_dataloader, 90 | ) 91 | quantized_model_path = "./saved" 92 | 93 | autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") 94 | 95 | model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", 96 | trust_remote_code=True) 97 | tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) 98 | text = "There is a girl who likes adventure," 99 | inputs = tokenizer(text, return_tensors="pt").to(model.device) 100 | res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) 101 | print(res) 102 | assert ("!!!" 
not in res) 103 | shutil.rmtree(self.save_folder, ignore_errors=True) 104 | 105 | -------------------------------------------------------------------------------- /test/test_cpu/test_hpu.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import sys 3 | import unittest 4 | 5 | sys.path.insert(0, "../..") 6 | import torch 7 | from transformers import AutoModelForCausalLM, AutoTokenizer 8 | 9 | from auto_round import AutoRound 10 | 11 | 12 | 13 | class LLMDataLoader: 14 | def __init__(self): 15 | self.batch_size = 1 16 | 17 | def __iter__(self): 18 | for i in range(2): 19 | yield torch.ones([1, 10], dtype=torch.long) 20 | 21 | def is_hpu_supported(): 22 | try: 23 | import habana_frameworks.torch.core as htcore # pylint: disable=E0401 24 | except ImportError as e: 25 | return False 26 | return True 27 | 28 | 29 | class TestAutoRound(unittest.TestCase): 30 | @classmethod 31 | def setUpClass(self): 32 | model_name = "facebook/opt-125m" 33 | self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) 34 | self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 35 | self.llm_dataloader = LLMDataLoader() 36 | 37 | @classmethod 38 | def tearDownClass(self): 39 | shutil.rmtree("./saved", ignore_errors=True) 40 | shutil.rmtree("runs", ignore_errors=True) 41 | 42 | def test_autogptq_format_hpu_inference(self): 43 | if not is_hpu_supported(): 44 | return 45 | try: 46 | import auto_gptq 47 | except: 48 | return 49 | bits, group_size, sym = 4, 128, False 50 | autoround = AutoRound( 51 | self.model, 52 | self.tokenizer, 53 | bits=bits, 54 | group_size=group_size, 55 | sym=sym, 56 | iters=2, 57 | seqlen=2, 58 | dataset=self.llm_dataloader, 59 | ) 60 | autoround.quantize() 61 | quantized_model_path = "./saved" 62 | 63 | autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_gptq") 64 | model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", \ 65 | trust_remote_code=True).to('hpu').to(torch.float32) 66 | tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) 67 | text = "There is a girl who likes adventure," 68 | inputs = tokenizer(text, return_tensors="pt").to(model.device) 69 | print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) 70 | shutil.rmtree("./saved", ignore_errors=True) 71 | 72 | 73 | def test_autoround_format_hpu_inference(self): 74 | if not is_hpu_supported(): 75 | return 76 | bits, group_size, sym = 4, 128, False 77 | autoround = AutoRound( 78 | self.model, 79 | self.tokenizer, 80 | bits=bits, 81 | group_size=group_size, 82 | sym=sym, 83 | iters=2, 84 | seqlen=2, 85 | dataset=self.llm_dataloader, 86 | ) 87 | autoround.quantize() 88 | quantized_model_path = "./saved" 89 | 90 | autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") 91 | 92 | model = AutoModelForCausalLM.from_pretrained(quantized_model_path, \ 93 | device_map="auto").to('hpu').to(torch.float32) 94 | tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) 95 | text = "There is a girl who likes adventure," 96 | inputs = tokenizer(text, return_tensors="pt").to(model.device) 97 | print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) 98 | shutil.rmtree("./saved", ignore_errors=True) 99 | -------------------------------------------------------------------------------- /test/test_cpu/test_load_awq_gptq.py: 
-------------------------------------------------------------------------------- 1 | import shutil 2 | import sys 3 | import unittest 4 | 5 | sys.path.insert(0, "../..") 6 | 7 | from transformers import AutoModelForCausalLM, AutoTokenizer 8 | 9 | from transformers import AutoRoundConfig 10 | 11 | 12 | class TestAutoRound(unittest.TestCase): 13 | def model_infer(self, model, tokenizer): 14 | prompts = [ 15 | "Hello,my name is", 16 | # "The president of the United States is", 17 | # "The capital of France is", 18 | # "The future of AI is", 19 | ] 20 | 21 | inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) 22 | 23 | outputs = model.generate( 24 | input_ids=inputs["input_ids"].to(model.device), 25 | attention_mask=inputs["attention_mask"].to(model.device), 26 | do_sample=False, ## change this to follow official usage 27 | max_new_tokens=5 28 | ) 29 | generated_ids = [ 30 | output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs["input_ids"], outputs) 31 | ] 32 | 33 | decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 34 | 35 | for i, prompt in enumerate(prompts): 36 | print(f"Prompt: {prompt}") 37 | print(f"Generated: {decoded_outputs[i]}") 38 | print("-" * 50) 39 | 40 | @classmethod 41 | def tearDownClass(self): 42 | shutil.rmtree("runs", ignore_errors=True) 43 | 44 | def test_load_gptq_no_dummy_gidx_model(self): 45 | model_name = "ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1" 46 | quantization_config = AutoRoundConfig() 47 | with self.assertRaises(NotImplementedError) as cm: 48 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True, 49 | device_map="cpu", 50 | quantization_config=quantization_config) 51 | 52 | def test_load_awq(self): 53 | model_name = "casperhansen/opt-125m-awq" 54 | quantization_config = AutoRoundConfig() 55 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True, 56 | device_map="cpu", 57 | quantization_config=quantization_config) 58 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 59 | self.model_infer(model, tokenizer) 60 | -------------------------------------------------------------------------------- /test/test_cpu/test_low_cpu_mem.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import sys 3 | import os 4 | import unittest 5 | sys.path.insert(0, "../..") 6 | 7 | import torch 8 | from transformers import AutoModelForCausalLM, AutoTokenizer 9 | from auto_round.low_cpu_mem.utils import ( 10 | load_model_with_hooks, 11 | load_empty_model, 12 | get_layers_before_block, 13 | layer_wise_load, 14 | layer_wise_save, 15 | ) 16 | 17 | from auto_round import AutoRound 18 | 19 | 20 | class LLMDataLoader: 21 | def __init__(self): 22 | self.batch_size = 1 23 | 24 | def __iter__(self): 25 | for i in range(2): 26 | yield torch.ones([1, 10], dtype=torch.long) 27 | 28 | class TestLowCPUMem(unittest.TestCase): 29 | @classmethod 30 | def setUpClass(self): 31 | self.model_name = "facebook/opt-125m" 32 | self.saved_path = './test_tmp_saved' 33 | self.ori_model = AutoModelForCausalLM.from_pretrained(self.model_name, trust_remote_code=True) 34 | self.model = load_model_with_hooks(self.model_name, AutoModelForCausalLM, saved_path=self.saved_path, device='cpu') 35 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) 36 | self.llm_dataloader = LLMDataLoader() 37 | 38 | @classmethod 39 | def 
tearDownClass(self): 40 | shutil.rmtree(self.saved_path, ignore_errors=True) 41 | 42 | def test_default(self): 43 | self.assertEqual(self.model.device.type, 'meta') 44 | 45 | layers = get_layers_before_block(self.model) 46 | self.assertEqual(layers[0][0], 'model.decoder.embed_tokens') 47 | 48 | # test get_weight bias 49 | self.assertTrue(torch.equal( 50 | self.model.model.decoder.layers[0].self_attn.k_proj.get_weight(), 51 | self.ori_model.model.decoder.layers[0].self_attn.k_proj.weight, 52 | )) 53 | self.assertTrue(torch.equal( 54 | self.model.model.decoder.layers[0].self_attn.k_proj.get_bias(), 55 | self.ori_model.model.decoder.layers[0].self_attn.k_proj.bias, 56 | )) 57 | 58 | # test hooks 59 | text = ["Hello, my dog is cute"] 60 | input = self.tokenizer(text) 61 | for key in input: 62 | input[key] = torch.tensor(input[key]) 63 | ori_output = self.ori_model(**input) 64 | output = self.model(**input) 65 | self.assertTrue(torch.equal(ori_output[0], output[0])) 66 | 67 | # test save and load 68 | layer_wise_save(self.model, self.saved_path) 69 | state_dict = layer_wise_load(self.saved_path) 70 | self.assertTrue(torch.equal( 71 | state_dict['lm_head.weight'], 72 | self.ori_model.lm_head.weight 73 | )) 74 | 75 | # test layer-wise auto_round 76 | bits, group_size, sym = 4, 128, False 77 | autoround = AutoRound( 78 | self.model, 79 | self.tokenizer, 80 | device='cpu', 81 | bits=bits, 82 | group_size=group_size, 83 | sym=sym, 84 | iters=2, 85 | seqlen=2, 86 | dataset=self.llm_dataloader, 87 | enable_torch_compile=False 88 | ) 89 | autoround.quantize() 90 | 91 | # test block-wise auto_round 92 | self.model = load_empty_model(self.model_name, AutoModelForCausalLM, saved_path=self.saved_path, device='cpu') 93 | bits, group_size, sym = 4, 128, False 94 | autoround = AutoRound( 95 | self.model, 96 | self.tokenizer, 97 | device='cpu', 98 | bits=bits, 99 | group_size=group_size, 100 | sym=sym, 101 | iters=2, 102 | seqlen=2, 103 | dataset=self.llm_dataloader, 104 | low_cpu_mem_usage=True 105 | ) 106 | autoround.quantize() 107 | 108 | if __name__ == "__main__": 109 | unittest.main() -------------------------------------------------------------------------------- /test/test_cpu/test_script.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import unittest 4 | 5 | sys.path.insert(0, "../..") 6 | 7 | 8 | class TestScript(unittest.TestCase): 9 | def test_default(self): 10 | os.system(''' 11 | cd ../.. && 12 | python -m auto_round \ 13 | --iters 2 \ 14 | --deployment_device fake \ 15 | --output_dir ./tmp_script_test''') 16 | 17 | if __name__ == "__main__": 18 | unittest.main() -------------------------------------------------------------------------------- /test/test_cpu/test_utils.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | import sys 3 | sys.path.insert(0, "../..") 4 | import auto_round.utils as auto_round_utils 5 | 6 | class TestPackingWithNumba: 7 | 8 | @patch.object(auto_round_utils, "_is_tbb_installed", lambda: False) 9 | def test_tbb_not_installed(self): 10 | assert auto_round_utils.is_tbb_available() is False, "`is_tbb_available` should return False." 11 | assert auto_round_utils.can_pack_with_numba() is False, "`can_pack_with_numba` should return False."
12 | 13 | @patch.object(auto_round_utils, "_is_tbb_installed", lambda: True) 14 | @patch.object(auto_round_utils, "_is_tbb_configured", lambda: False) 15 | def test_tbb_installed_but_not_configured_right(self): 16 | assert auto_round_utils.is_tbb_available() is False, "`is_tbb_available` should return False." 17 | assert auto_round_utils.can_pack_with_numba() is False, "`can_pack_with_numba` should return False." 18 | 19 | @patch.object(auto_round_utils, "is_numba_available", lambda: False) 20 | def test_numba_not_installed(self): 21 | assert auto_round_utils.can_pack_with_numba() is False, "`can_pack_with_numba` should return False." 22 | -------------------------------------------------------------------------------- /test/test_cpu/test_woq_linear.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import sys 4 | sys.path.insert(0, "../..") 5 | from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear 6 | 7 | 8 | class TestWeightOnlyLinear: 9 | @pytest.mark.parametrize( 10 | "bits, compression_dtype", 11 | [ 12 | (8, torch.int16), 13 | (8, torch.int32), 14 | (8, torch.int64), 15 | (4, torch.int8), 16 | (4, torch.int16), 17 | (4, torch.int32), 18 | (4, torch.int64), 19 | (2, torch.int8), 20 | (2, torch.int16), 21 | (2, torch.int32), 22 | (2, torch.int64), 23 | ], 24 | ) 25 | def test_pack_with_numba(self, bits, compression_dtype): 26 | m = torch.nn.Linear(1024, 512) 27 | dtype = "int" 28 | weight = m.weight.detach() 29 | group_size = 32 30 | origin_shape = weight.shape 31 | from auto_round.data_type.int import quant_tensor_sym 32 | origin_shape = weight.shape 33 | weight = weight.reshape(-1, group_size) 34 | qdq, scale, zp = quant_tensor_sym( weight, -1 35 | ) 36 | int_weight = ( 37 | qdq.div(scale) 38 | .add(zp) 39 | .clamp(0, 2 ** (bits) - 1) 40 | .to(torch.int32) 41 | .reshape(origin_shape) 42 | ) 43 | scale = scale.reshape(origin_shape[0], -1) 44 | zp = zp.reshape(origin_shape[0], -1).to(torch.int32).clamp(0, 2 ** (bits) - 1) 45 | module_with_legacy_pack = WeightOnlyLinear( 46 | in_features=m.in_features, 47 | out_features=m.out_features, 48 | dtype=dtype, 49 | bits=bits, 50 | groupsize=32, 51 | zp=zp is not None, 52 | bias=m.bias is not None, 53 | use_optimum_format=False, 54 | compression_dtype=compression_dtype, 55 | use_legacy_pack=True, 56 | ) 57 | module_with_legacy_pack.pack( 58 | int_weight.clone(), scale.clone(), zp.clone(), m.bias 59 | ) 60 | module_with_new_pack = WeightOnlyLinear( 61 | in_features=m.in_features, 62 | out_features=m.out_features, 63 | dtype=dtype, 64 | bits=bits, 65 | groupsize=32, 66 | zp=zp is not None, 67 | bias=m.bias is not None, 68 | use_optimum_format=False, 69 | compression_dtype=compression_dtype, 70 | use_legacy_pack=False, 71 | ) 72 | module_with_new_pack.pack(int_weight.clone(), scale.clone(), zp.clone(), m.bias) 73 | 74 | assert torch.equal( 75 | module_with_new_pack.qweight, module_with_legacy_pack.qweight 76 | ) 77 | 78 | assert torch.equal(module_with_new_pack.qzeros, module_with_legacy_pack.qzeros) 79 | assert torch.equal(module_with_new_pack.scales, module_with_legacy_pack.scales) 80 | unpacked_int_weight = module_with_new_pack.unpack_tensor( 81 | module_with_legacy_pack.qweight 82 | ) 83 | assert torch.equal(unpacked_int_weight, int_weight) 84 | -------------------------------------------------------------------------------- /test/test_cuda/_test_helpers.py: -------------------------------------------------------------------------------- 1 | 2 | def 
model_infer(model, tokenizer, apply_chat_template=False): 3 | prompts = [ 4 | "Hello,my name is", 5 | # "The president of the United States is", 6 | # "The capital of France is", 7 | # "The future of AI is", 8 | ] 9 | if apply_chat_template: 10 | texts = [] 11 | for prompt in prompts: 12 | messages = [ 13 | {"role": "user", "content": prompt} 14 | ] 15 | text = tokenizer.apply_chat_template( 16 | messages, 17 | tokenize=False, 18 | add_generation_prompt=True 19 | ) 20 | texts.append(text) 21 | prompts = texts 22 | 23 | inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) 24 | 25 | outputs = model.generate( 26 | input_ids=inputs["input_ids"].to(model.device), 27 | attention_mask=inputs["attention_mask"].to(model.device), 28 | do_sample=False, ## change this to follow official usage 29 | max_new_tokens=5 30 | ) 31 | generated_ids = [ 32 | output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs["input_ids"], outputs) 33 | ] 34 | 35 | decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 36 | 37 | for i, prompt in enumerate(prompts): 38 | print(f"Prompt: {prompt}") 39 | print(f"Generated: {decoded_outputs[i]}") 40 | print("-" * 50) 41 | return decoded_outputs[0] 42 | -------------------------------------------------------------------------------- /test/test_cuda/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | # autoawq 3 | # pip install -v git+https://github.com/casper-hansen/AutoAWQ.git --no-build-isolation 4 | auto-gptq 5 | datasets 6 | einops 7 | # gptqmodel>=2.0 8 | # pip install -v git+https://github.com/ModelCloud/GPTQModel.git@v2.2.0 --no-build-isolation 9 | intel-extension-for-pytorch 10 | lm-eval>=0.4.2,<0.5 11 | numpy < 2.0 12 | optimum 13 | pandas 14 | pillow 15 | py-cpuinfo 16 | torch 17 | torchvision 18 | tqdm 19 | transformers==4.51.3 20 | -------------------------------------------------------------------------------- /test/test_cuda/requirements_vlm.txt: -------------------------------------------------------------------------------- 1 | # git+https://github.com/haotian-liu/LLaVA.git@v1.2.2 2 | # pip install git+https://github.com/deepseek-ai/DeepSeek-VL2.git 3 | # pip install -v git+https://github.com/casper-hansen/AutoAWQ.git@v0.2.0 --no-build-isolation 4 | accelerate 5 | # autoawq 6 | bitsandbytes==0.44.0 7 | datasets 8 | einops 9 | flash-attn==2.5.8 10 | intel-extension-for-transformers 11 | lm-eval>=0.4.2,<0.5 12 | numpy < 2.0 13 | optimum 14 | pandas 15 | protobuf==3.20.2 16 | pillow 17 | py-cpuinfo 18 | torch==2.3.0 19 | torchvision 20 | triton==2.3.0 21 | tqdm 22 | transformers==4.45.0 23 | xformers 24 | -------------------------------------------------------------------------------- /test/test_cuda/test_2_3bits.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import shutil 3 | import sys 4 | import unittest 5 | import re 6 | 7 | sys.path.insert(0, "../..") 8 | import torch 9 | import transformers 10 | from transformers import AutoModelForCausalLM, AutoTokenizer 11 | 12 | from auto_round import AutoRound 13 | from auto_round.eval.evaluation import simple_evaluate 14 | from lm_eval.utils import make_table # pylint: disable=E0401 15 | from auto_round.testing_utils import require_autogptq, require_greater_than_050 16 | 17 | 18 | def get_accuracy(data): 19 | match = re.search(r'\|acc\s+\|[↑↓]\s+\|\s+([\d.]+)\|', data) 20 | 21 | if match: 22 | accuracy = float(match.group(1)) 23 | 
return accuracy 24 | else: 25 | return 0.0 26 | 27 | 28 | class TestAutoRound(unittest.TestCase): 29 | @classmethod 30 | def setUpClass(self): 31 | self.save_dir = "./saved" 32 | self.tasks = "lambada_openai" 33 | 34 | @classmethod 35 | def tearDownClass(self): 36 | shutil.rmtree("./saved", ignore_errors=True) 37 | shutil.rmtree("runs", ignore_errors=True) 38 | 39 | @require_autogptq 40 | def test_3bits_autoround(self): 41 | model_name = "/models/opt-125m" 42 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") 43 | tokenizer = AutoTokenizer.from_pretrained(model_name) 44 | autoround = AutoRound(model, tokenizer, bits=3) 45 | autoround.quantize() 46 | 47 | # autoround.save_quantized(self.save_dir, format="auto_gptq", inplace=False) 48 | autoround.save_quantized(self.save_dir, format="auto_round", inplace=False) 49 | model_args = f"pretrained={self.save_dir}" 50 | res = simple_evaluate(model="hf", model_args=model_args, 51 | # tasks="arc_easy", 52 | tasks=self.tasks, 53 | batch_size="auto") 54 | 55 | ## 0.2529 56 | accuracy = res['results']['lambada_openai']['acc,none'] 57 | assert accuracy > 0.3 58 | shutil.rmtree("./saved", ignore_errors=True) 59 | 60 | def test_3bits_asym_autoround(self): 61 | model_name = "/models/opt-125m" 62 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") 63 | tokenizer = AutoTokenizer.from_pretrained(model_name) 64 | bits, sym = 3, False 65 | autoround = AutoRound(model, tokenizer, bits=bits, sym=sym) 66 | autoround.quantize() 67 | autoround.save_quantized(self.save_dir, format="auto_round", inplace=False) 68 | model_args = f"pretrained={self.save_dir}" 69 | res = simple_evaluate(model="hf", model_args=model_args, 70 | # tasks="arc_easy", 71 | tasks=self.tasks, 72 | batch_size="auto") 73 | 74 | ## 0.3423 75 | accuracy = res['results']['lambada_openai']['acc,none'] 76 | assert accuracy > 0.32 77 | shutil.rmtree("./saved", ignore_errors=True) 78 | 79 | @require_greater_than_050 80 | def test_norm_bias_tuning(self): 81 | model_name = "/models/opt-125m" 82 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") 83 | tokenizer = AutoTokenizer.from_pretrained(model_name) 84 | autoround = AutoRound(model, tokenizer, bits=2, group_size=64, enable_norm_bias_tuning=True) 85 | autoround.quantize() 86 | 87 | ##test auto_round format 88 | autoround.save_quantized(self.save_dir, format="auto_round", inplace=False) 89 | model_args = f"pretrained={self.save_dir}" 90 | res = simple_evaluate(model="hf", model_args=model_args, 91 | tasks=self.tasks, 92 | batch_size="auto") 93 | res = make_table(res) ##0.2212 0.1844 94 | accuracy = get_accuracy(res) 95 | assert accuracy > 0.18 96 | shutil.rmtree("./saved", ignore_errors=True) 97 | 98 | @require_greater_than_050 99 | def test_2bits_autoround(self): 100 | model_name = "/models/opt-125m" 101 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") 102 | tokenizer = AutoTokenizer.from_pretrained(model_name) 103 | autoround = AutoRound(model, tokenizer, bits=2, group_size=64) 104 | autoround.quantize() 105 | 106 | ##test auto_round format 107 | autoround.save_quantized(self.save_dir, format="auto_round", inplace=False) 108 | model_args = f"pretrained={self.save_dir}" 109 | res = simple_evaluate(model="hf", model_args=model_args, 110 | tasks=self.tasks, 111 | batch_size="auto") 112 | res = make_table(res) ##0.1985 113 | accuracy = 
get_accuracy(res) 114 | assert accuracy > 0.18 115 | shutil.rmtree("./saved", ignore_errors=True) 116 | 117 | 118 | autoround.save_quantized(self.save_dir, format="auto_gptq", inplace=False) 119 | model_args = f"pretrained={self.save_dir}" 120 | res = simple_evaluate(model="hf", model_args=model_args, 121 | tasks=self.tasks, 122 | batch_size="auto") 123 | res = make_table(res) ##0.1985 124 | accuracy = get_accuracy(res) 125 | assert accuracy > 0.18 126 | shutil.rmtree("./saved", ignore_errors=True) 127 | 128 | if __name__ == "__main__": 129 | unittest.main() 130 | -------------------------------------------------------------------------------- /test/test_cuda/test_calib_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | import json 8 | 9 | import torch 10 | from transformers import AutoModelForCausalLM, AutoTokenizer 11 | 12 | from auto_round import AutoRound 13 | 14 | 15 | class TestLocalCalibDataset(unittest.TestCase): 16 | @classmethod 17 | def setUpClass(self): 18 | json_data = [{"text": "awefdsfsddfd"}, {"text": "fdfdfsdfdfdfd"}, {"text": "dfdsfsdfdfdfdf"}] 19 | os.makedirs("./saved", exist_ok=True) 20 | self.json_file = "./saved/tmp.json" 21 | with open(self.json_file, "w") as json_file: 22 | json.dump(json_data, json_file, indent=4) 23 | 24 | jsonl_data = [{"text": "哈哈,開心點"}, {"text": "hello world"}] 25 | os.makedirs("./saved", exist_ok=True) 26 | self.jsonl_file = "./saved/tmp.jsonl" 27 | with open(self.jsonl_file, "w") as jsonl_file: 28 | for item in jsonl_data: 29 | json.dump(item, jsonl_file, ensure_ascii=False) 30 | jsonl_file.write('\n') 31 | 32 | model_name = "facebook/opt-125m" 33 | self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) 34 | self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 35 | 36 | def test_combine_dataset(self): 37 | dataset = ( 38 | "NeelNanda/pile-10k" + ",codeparrot/github-code-clean" + ",BAAI/CCI3-HQ" + ",madao33/new-title-chinese") 39 | bits, group_size, sym = 4, 128, True 40 | autoround = AutoRound( 41 | self.model, self.tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=128, dataset=dataset 42 | ) 43 | autoround.quantize() 44 | 45 | 46 | if __name__ == "__main__": 47 | unittest.main() 48 | -------------------------------------------------------------------------------- /test/test_cuda/test_conv1d.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | import torch 8 | from transformers import AutoModelForCausalLM, AutoTokenizer 9 | 10 | from auto_round import AutoRound 11 | from auto_round.testing_utils import require_gptqmodel 12 | from _test_helpers import model_infer 13 | class LLMDataLoader: 14 | def __init__(self): 15 | self.batch_size = 1 16 | 17 | def __iter__(self): 18 | for i in range(2): 19 | yield torch.ones([1, 10], dtype=torch.long) 20 | 21 | 22 | class TestQuantizationConv1d(unittest.TestCase): 23 | @classmethod 24 | def setUpClass(self): 25 | self.model_name = "MBZUAI/LaMini-GPT-124M" 26 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) 27 | self.llm_dataloader = LLMDataLoader() 28 | 29 | @classmethod 30 | def tearDownClass(self): 31 | shutil.rmtree("./saved", ignore_errors=True) 32 | shutil.rmtree("runs", 
ignore_errors=True) 33 | 34 | @require_gptqmodel 35 | def test_quant(self): 36 | self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) 37 | bits, group_size, sym = 4, 128, True 38 | from auto_round import AutoRoundConfig 39 | autoround = AutoRound( 40 | self.model, 41 | self.tokenizer, 42 | bits=bits, 43 | group_size=group_size, 44 | sym=sym, 45 | iters=2, 46 | seqlen=2, 47 | dataset=self.llm_dataloader, 48 | 49 | ) 50 | 51 | autoround.quantize() 52 | autoround.save_quantized("./saved") 53 | 54 | model = AutoModelForCausalLM.from_pretrained("./saved", device_map="cuda", trust_remote_code=True) 55 | model_infer(model, self.tokenizer) 56 | 57 | 58 | 59 | if __name__ == "__main__": 60 | unittest.main() 61 | -------------------------------------------------------------------------------- /test/test_cuda/test_multiple_card_calib.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | import shutil 5 | import unittest 6 | 7 | sys.path.insert(0, "../..") 8 | 9 | from auto_round.testing_utils import multi_card 10 | 11 | def get_accuracy(data): 12 | match = re.search(r'\|acc\s+\|[↑↓]\s+\|\s+([\d.]+)\|', data) 13 | 14 | if match: 15 | accuracy = float(match.group(1)) 16 | return accuracy 17 | else: 18 | return 0.0 19 | 20 | 21 | class TestAutoRound(unittest.TestCase): 22 | @classmethod 23 | def setUpClass(self): 24 | self.save_dir = "./saved" 25 | self.tasks = "lambada_openai" 26 | 27 | @classmethod 28 | def tearDownClass(self): 29 | shutil.rmtree("./saved", ignore_errors=True) 30 | shutil.rmtree("runs", ignore_errors=True) 31 | 32 | @multi_card 33 | def test_multiple_card_calib(self): 34 | python_path = sys.executable 35 | 36 | ##test llm script 37 | res = os.system( 38 | f"cd ../.. 
&& {python_path} -m auto_round --model /models/Meta-Llama-3.1-8B-Instruct --devices '0,1' --quant_lm_head --disable_eval --iters 1 --nsamples 1 --output_dir None") 39 | if res > 0 or res == -1: 40 | assert False, "cmd line test fail, please have a check" 41 | 42 | 43 | if __name__ == "__main__": 44 | unittest.main() 45 | 46 | 47 | -------------------------------------------------------------------------------- /test/test_xpu/test_autoround.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | import torch 8 | import transformers 9 | from transformers import AutoModelForCausalLM, AutoTokenizer 10 | 11 | from auto_round import AutoRoundConfig 12 | from auto_round import AutoRound 13 | 14 | class LLMDataLoader: 15 | def __init__(self): 16 | self.batch_size = 1 17 | 18 | def __iter__(self): 19 | for i in range(3): 20 | yield torch.ones([1, 10], dtype=torch.long) 21 | 22 | 23 | class TestAutoRoundXPU(unittest.TestCase): 24 | @classmethod 25 | def setUpClass(self): 26 | 27 | self.llm_dataloader = LLMDataLoader() 28 | 29 | @classmethod 30 | def tearDownClass(self): 31 | shutil.rmtree("./saved", ignore_errors=True) 32 | shutil.rmtree("runs", ignore_errors=True) 33 | pass 34 | 35 | def test_gptq_format(self): 36 | model_name = "facebook/opt-125m" 37 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True, 38 | device_map="auto") 39 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 40 | bits, group_size, sym = 4, 128, True 41 | 42 | autoround = AutoRound( 43 | model, 44 | tokenizer, 45 | bits=bits, 46 | group_size=group_size, 47 | sym=sym, 48 | iters=2, 49 | seqlen=2, 50 | dataset=self.llm_dataloader, 51 | ) 52 | quantized_model_path = "./saved" 53 | autoround.quantize_and_save(output_dir=quantized_model_path) 54 | 55 | from auto_round import AutoRoundConfig 56 | quantization_config = AutoRoundConfig( 57 | backend="auto" 58 | ) 59 | 60 | model = AutoModelForCausalLM.from_pretrained(quantized_model_path, 61 | device_map="auto", quantization_config=quantization_config) 62 | tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) 63 | text = "There is a girl who likes adventure," 64 | inputs = tokenizer(text, return_tensors="pt").to(model.device) 65 | res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) 66 | print(res) 67 | assert ("!!!" 
not in res) 68 | 69 | 70 | 71 | def test_awq_format(self): 72 | model_name = "facebook/opt-125m" 73 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True, 74 | device_map="xpu") 75 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 76 | bits, group_size, sym = 4, 128, True 77 | autoround = AutoRound( 78 | model, 79 | tokenizer, 80 | bits=bits, 81 | group_size=group_size, 82 | sym=sym, 83 | iters=2, 84 | seqlen=2, 85 | dataset=self.llm_dataloader, 86 | ) 87 | quantized_model_path = "./saved" 88 | autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq") 89 | 90 | quantized_model_path = "./saved" 91 | from auto_round import AutoRoundConfig 92 | quantization_config = AutoRoundConfig( 93 | backend="auto" 94 | ) 95 | 96 | model = AutoModelForCausalLM.from_pretrained(quantized_model_path, 97 | device_map="auto", quantization_config=quantization_config) 98 | tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) 99 | text = "There is a girl who likes adventure," 100 | inputs = tokenizer(text, return_tensors="pt").to(model.device) 101 | res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) 102 | print(res) 103 | assert ("!!!" not in res) 104 | 105 | if __name__ == "__main__": 106 | unittest.main() -------------------------------------------------------------------------------- /third-party-programs.txt: -------------------------------------------------------------------------------- 1 | Third Party Programs File 2 | 3 | This file contains the list of third party software ("third party programs") 4 | contained in the Intel software and their required notices and/or license 5 | terms. This third party software, even if included with the distribution of 6 | the Intel software, may be governed by separate license terms, including 7 | without limitation, third party license terms, other Intel software license 8 | terms, and open source software license terms. These separate license terms 9 | govern your use of the third party programs as set forth in in the 10 | "THIRD-PARTY-PROGRAMS" file. 11 | 12 | Third party programs and their corresponding required notices and/or license terms are listed 13 | below. 14 | =============================================================================== 15 | 1. Pytorch 16 | 17 | From PyTorch: 18 | 19 | Copyright (c) 2016- Facebook, Inc (Adam Paszke) 20 | Copyright (c) 2014- Facebook, Inc (Soumith Chintala) 21 | Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) 22 | Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) 23 | Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) 24 | Copyright (c) 2011-2013 NYU (Clement Farabet) 25 | Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) 26 | Copyright (c) 2006 Idiap Research Institute (Samy Bengio) 27 | Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) 28 | 29 | From Caffe2: 30 | 31 | Copyright (c) 2016-present, Facebook Inc. All rights reserved. 32 | 33 | All contributions by Facebook: 34 | Copyright (c) 2016 Facebook Inc. 35 | 36 | All contributions by Google: 37 | Copyright (c) 2015 Google Inc. 38 | All rights reserved. 39 | 40 | All contributions by Yangqing Jia: 41 | Copyright (c) 2015 Yangqing Jia 42 | All rights reserved. 
43 | 44 | All contributions by Kakao Brain: 45 | Copyright 2019-2020 Kakao Brain 46 | 47 | All contributions by Cruise LLC: 48 | Copyright (c) 2022 Cruise LLC. 49 | All rights reserved. 50 | 51 | All contributions from Caffe: 52 | Copyright(c) 2013, 2014, 2015, the respective contributors 53 | All rights reserved. 54 | 55 | All other contributions: 56 | Copyright(c) 2015, 2016 the respective contributors 57 | All rights reserved. 58 | 59 | Caffe2 uses a copyright model similar to Caffe: each contributor holds 60 | copyright over their contributions to Caffe2. The project versioning records 61 | all such contribution and copyright details. If a contributor wants to further 62 | mark their specific copyright on a particular contribution, they should 63 | indicate their copyright solely in the commit message of the change when it is 64 | committed. 65 | 66 | All rights reserved. 67 | 68 | Redistribution and use in source and binary forms, with or without 69 | modification, are permitted provided that the following conditions are met: 70 | 71 | 1. Redistributions of source code must retain the above copyright 72 | notice, this list of conditions and the following disclaimer. 73 | 74 | 2. Redistributions in binary form must reproduce the above copyright 75 | notice, this list of conditions and the following disclaimer in the 76 | documentation and/or other materials provided with the distribution. 77 | 78 | 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America 79 | and IDIAP Research Institute nor the names of its contributors may be 80 | used to endorse or promote products derived from this software without 81 | specific prior written permission. 82 | 83 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 84 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 85 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 86 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 87 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 88 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 89 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 90 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 91 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 92 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 93 | POSSIBILITY OF SUCH DAMAGE. 94 | 95 | 96 | =============================================================================== 97 | 2. lm-evaluation-harness 98 | Copyright (c) 2020 EleutherAI 99 | 100 | AutoGPTQ 101 | Copyright (c) 2023 潘其威(William) 102 | 103 | MIT License 104 | 105 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 106 | 107 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
108 | 109 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 110 | 111 | 112 | =============================================================================== --------------------------------------------------------------------------------