├── .azure-pipelines ├── code-scan.yml ├── docker │ ├── Dockerfile.devel │ └── DockerfileCodeScan.devel ├── license_template.txt ├── scripts │ ├── change_color.sh │ ├── codeScan │ │ ├── bandit │ │ │ └── bandit.sh │ │ ├── codespell │ │ │ └── autoround_dict.txt │ │ └── pylint │ │ │ └── pylint.sh │ └── ut │ │ ├── .coverage │ │ ├── collect_log.sh │ │ ├── compare_coverage.sh │ │ ├── run_ut.sh │ │ └── run_ut_hpu.sh ├── template │ ├── code-scan-template.yml │ ├── docker-template.yml │ └── ut-template.yml ├── unit-test-hpu.yml └── unit-test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── SECURITY.md ├── auto_round ├── __init__.py ├── __main__.py ├── autoround.py ├── calib_dataset.py ├── data_type │ ├── __init__.py │ ├── fp8.py │ ├── int.py │ ├── mxfp.py │ ├── nvfp.py │ ├── register.py │ ├── utils.py │ └── w4fp8.py ├── eval │ ├── __init__.py │ └── evaluation.py ├── export │ ├── __init__.py │ ├── export_to_autogptq │ │ ├── __init__.py │ │ ├── export.py │ │ └── qlinear_triton.py │ ├── export_to_autoround │ │ ├── __init__.py │ │ ├── export.py │ │ └── qlinear_triton_act.py │ ├── export_to_awq │ │ ├── __init__.py │ │ ├── export.py │ │ └── utils.py │ ├── export_to_gguf │ │ ├── __init__.py │ │ ├── config.py │ │ ├── convert.py │ │ ├── export.py │ │ ├── quant_cpu.py │ │ ├── quant_gpu.py │ │ └── utils.py │ ├── export_to_itrex │ │ ├── __init__.py │ │ ├── bit_packer.py │ │ ├── config.py │ │ ├── export.py │ │ └── model_wrapper.py │ └── register.py ├── inference │ ├── __init__.py │ ├── auto_quantizer.py │ ├── backend.py │ └── convert_model.py ├── low_cpu_mem │ ├── __init__.py │ ├── load.py │ ├── modified_pickle.py │ └── utils.py ├── mllm │ ├── README.md │ ├── __init__.py │ ├── autoround_mllm.py │ ├── eval.py │ ├── mllm_dataset.py │ ├── processor.py │ ├── template.py │ ├── templates │ │ ├── cogvlm2.json │ │ ├── default.json │ │ ├── llava.json │ │ └── phi3_v.json │ └── utils.py ├── script │ ├── __init__.py │ ├── llm.py │ └── mllm.py ├── sign_sgd.py ├── special_model_handler.py ├── testing_utils.py ├── utils.py ├── version.py └── wrapper.py ├── auto_round_extension ├── __init__.py ├── cuda │ ├── __init__.py │ └── gptqmodel_marlin.py ├── hpu │ ├── __init__.py │ ├── qlinear_hpu.py │ └── qlinear_hpu_gptq.py ├── ipex │ ├── __init__.py │ ├── qlinear_ipex_awq.py │ └── qlinear_ipex_gptq.py ├── qbits │ ├── __init__.py │ ├── qbits_awq.py │ ├── qlinear_qbits.py │ └── qlinear_qbits_gptq.py ├── torch │ ├── __init__.py │ ├── qlinear_torch.py │ └── qlinear_torch_zp.py └── triton │ ├── __init__.py │ ├── qlinear_tritonv2.py │ ├── qlinear_tritonv2_zp.py │ ├── triton_utils │ ├── __init__.py │ ├── custom_autotune.py │ ├── dequant.py │ ├── kernels.py │ └── mixin.py │ └── triton_utils_zp │ ├── __init__.py │ ├── custom_autotune.py │ ├── dequant.py │ ├── kernels.py │ └── mixin.py ├── docs ├── DeepSeek-R1-0528-int2-mixed-sym-inc.md ├── DeepSeek-R1-0528-int4-asym-awq-inc.md ├── DeepSeek-R1-0528-int4-sym-gptq-inc.md ├── Llama-2-7b-chat-hf-asym-recipe.md ├── Llama-3.2-11B-Vision-Instruct-sym.md ├── Meta-Llama-3-8B-Instruct-asym-recipe.md ├── Mistral-7B-Instruct-v0.2-asym-recipe.md ├── Mistral-7B-v0.1-asym-recipe.md ├── Mixtral-8x7B-Instruct-v0.1-asym-recipe.md ├── Mixtral-8x7B-v0.1-asym-acc.md ├── Phi-3.5-vision-instruct-sym.md ├── Qwen1.5-7B-Chat-acc.md ├── Qwen2-VL-7B-Instruct-sym.md ├── Qwen2.5-14B-Instruct-sym.md ├── Qwen2.5-32B-Instruct-sym.md ├── Qwen2.5-72B-Instruct-sym.md ├── Qwen2.5-7B-Instruct-sym.md ├── Qwen3-14B-sym-recipe.md ├── 
Qwen3-8B-sym-recipe.md ├── Yi-6B-Chat-asym-recipe.md ├── acc.md ├── baichuan2-7b-cha-asym-recipe.md ├── bloom-3B-asym-recipe.md ├── cogvlm2-llama3-chat-19B-sym.md ├── falcon-7b-asym-recipe.md ├── full_range_sym.md ├── gemma-2b-asym-recipe.md ├── gemma-7b-asym-recipe.md ├── gemma-7b-it-asym-recipe.md ├── gguf_accuracy.md ├── glm-4-9b-chat-recipe.md ├── gpt-j-6B-asym-recipe.md ├── imgs │ ├── autoround_overview.png │ ├── full_range_sym.png │ └── norm_bias_overview.png ├── llava-v1.5-7b-sym.md ├── neural-chat-7b-v3-1-asym-recipe.md ├── neural-chat-7b-v3-3-asym-recipe.md ├── opt-2.7b-asym-recipe.md ├── phi-2-old-sym-recipe.md ├── step_by_step.md ├── tips_and_tricks.md └── tuning_norm_bias.md ├── pyproject.toml ├── requirements-cpu.txt ├── requirements-lib.txt ├── requirements.txt ├── setup.cfg ├── setup.py ├── test ├── test_cpu │ ├── _test_helpers.py │ ├── conftest.py │ ├── requirements.txt │ ├── test_act_quantization.py │ ├── test_auto_round_hpu_only.py │ ├── test_autoopt.py │ ├── test_autoround.py │ ├── test_autoround_acc.py │ ├── test_autoround_export_to_itrex.py │ ├── test_basic_usage.py │ ├── test_block_names.py │ ├── test_calib_dataset.py │ ├── test_conv1d.py │ ├── test_export.py │ ├── test_generation.py │ ├── test_gguf_format.py │ ├── test_hpu.py │ ├── test_load_awq_gptq.py │ ├── test_low_cpu_mem.py │ ├── test_mllm.py │ ├── test_script.py │ ├── test_utils.py │ └── test_woq_linear.py ├── test_cuda │ ├── _test_helpers.py │ ├── requirements.txt │ ├── requirements_vlm.txt │ ├── test_2_3bits.py │ ├── test_auto_round_format.py │ ├── test_calib_dataset.py │ ├── test_conv1d.py │ ├── test_exllamav2_backend.py │ ├── test_export.py │ ├── test_get_block_name.py │ ├── test_gguf.py │ ├── test_main_func.py │ ├── test_marlin_backend.py │ ├── test_multiple_card.py │ ├── test_multiple_card_calib.py │ ├── test_qbits.py │ ├── test_support_vlms.py │ ├── test_triton_backend.py │ └── test_vlms.py └── test_xpu │ └── test_autoround.py └── third-party-programs.txt /.azure-pipelines/code-scan.yml: -------------------------------------------------------------------------------- 1 | trigger: none 2 | 3 | pr: 4 | autoCancel: true 5 | drafts: false 6 | branches: 7 | include: 8 | - main 9 | paths: 10 | include: 11 | - auto_round 12 | - setup.py 13 | - requirements.txt 14 | - .azure-pipelines/code-scan.yml 15 | - .azure-pipelines/scripts/codeScan 16 | 17 | pool: 18 | vmImage: "ubuntu-latest" 19 | 20 | variables: 21 | CODE_SCAN_LOG_PATH: ".azure-pipelines/scripts/codeScan/scanLog" 22 | 23 | stages: 24 | 25 | - stage: BanditCodeScan 26 | displayName: Bandit Code Scan 27 | dependsOn: [] 28 | jobs: 29 | - job: Bandit 30 | displayName: Bandit 31 | steps: 32 | - template: template/code-scan-template.yml 33 | parameters: 34 | codeScanFileName: "bandit" 35 | uploadPath: "bandit.log" 36 | 37 | - stage: PylintCodeScan 38 | displayName: Pylint Code Scan 39 | dependsOn: [] 40 | jobs: 41 | - job: Pylint 42 | displayName: Pylint 43 | steps: 44 | - template: template/code-scan-template.yml 45 | parameters: 46 | codeScanFileName: "pylint" 47 | uploadPath: "pylint.json" 48 | -------------------------------------------------------------------------------- /.azure-pipelines/docker/Dockerfile.devel: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ARG UBUNTU_VER=22.04 16 | FROM ubuntu:${UBUNTU_VER} as devel 17 | 18 | # See http://bugs.python.org/issue19846 19 | ENV LANG C.UTF-8 20 | 21 | RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \ 22 | python3 \ 23 | python3-pip \ 24 | python3-dev \ 25 | python3-distutils \ 26 | autoconf \ 27 | build-essential \ 28 | git \ 29 | libgl1-mesa-glx \ 30 | libglib2.0-0 \ 31 | numactl \ 32 | time \ 33 | wget \ 34 | bc \ 35 | jq \ 36 | vim 37 | 38 | RUN ln -sf $(which python3) /usr/bin/python 39 | 40 | ARG USER_ID=1000 41 | ARG GROUP_ID=1000 42 | 43 | RUN groupadd -g ${GROUP_ID} hostgroup && \ 44 | useradd -m -u ${USER_ID} -g ${GROUP_ID} hostuser 45 | 46 | USER hostuser 47 | 48 | RUN python -m pip install --no-cache-dir --upgrade pip 49 | RUN python -m pip install --no-cache-dir setuptools 50 | 51 | RUN pip list 52 | 53 | WORKDIR / 54 | 55 | -------------------------------------------------------------------------------- /.azure-pipelines/docker/DockerfileCodeScan.devel: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2024 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | ARG UBUNTU_VER=22.04 17 | FROM ubuntu:${UBUNTU_VER} as devel 18 | 19 | # See http://bugs.python.org/issue19846 20 | ENV LANG C.UTF-8 21 | 22 | RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \ 23 | aspell \ 24 | aspell-en \ 25 | python3 \ 26 | python3-pip \ 27 | python3-dev \ 28 | python3-distutils \ 29 | autoconf \ 30 | build-essential \ 31 | wget 32 | 33 | RUN ln -sf $(which python3) /usr/bin/python 34 | 35 | ARG USER_ID=1000 36 | ARG GROUP_ID=1000 37 | 38 | RUN groupadd -g ${GROUP_ID} hostgroup && \ 39 | useradd -m -u ${USER_ID} -g ${GROUP_ID} hostuser 40 | 41 | USER hostuser 42 | 43 | RUN python -m pip install --no-cache-dir pylint==2.12.1\ 44 | bandit 45 | 46 | WORKDIR / 47 | -------------------------------------------------------------------------------- /.azure-pipelines/license_template.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024 Intel Corporation 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/change_color.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------- general approach start---------------- 4 | 5 | # 1. import this file: 6 | # source path/change_color.sh 7 | # 2. use COLOR/BG: 8 | # $VARIABLE_NAME && out_put_content && $RESET 9 | # 3. COLOR + BG: 10 | # $COLOR/BG_VARIABLE_NAME && $BG/COLOR_VARIABLE_NAME && out_put_content && $RESET 11 | # 4. custom 12 | # abbreviation(change number) 13 | # txt number range (30, 37) 14 | # bg number range (40, 47) 15 | # special effects number range (1, 7) 16 | # echo -en \\E[number1 + ; + number2 + ; + number3 + m" 17 | # e.g - BG_GRAY+LIGHT_RED = "echo -en \\E[47;31m" 18 | 19 | # -------------- general approach end----------------== 20 | 21 | 22 | # general setting 23 | # ------------- light_color start---------------- 24 | # black 25 | LIGHT_BLACK="echo -en \\E[30m" 26 | # red 27 | LIGHT_RED="echo -en \\E[31m" 28 | # green 29 | LIGHT_GREEN="echo -en \\E[32m" 30 | # yellow 31 | LIGHT_YELLOW="echo -en \\E[33m" 32 | # blue 33 | LIGHT_BLUE="echo -en \\E[34m" 34 | # purple 35 | LIGHT_PURPLE="echo -en \\E[35m" 36 | # cyan 37 | LIGHT_CYAN="echo -en \\E[36m" 38 | # gray 39 | LIGHT_GRAY="echo -en \\E[37m" 40 | # ------------- light_color end---------------- 41 | 42 | # ------------- bold_color start---------------- 43 | # black 44 | BOLD_BLACK="echo -en \\E[1;30m" 45 | # red 46 | BOLD_RED="echo -en \\E[1;31m" 47 | # green 48 | BOLD_GREEN="echo -en \\E[1;32m" 49 | # yellow 50 | BOLD_YELLOW="echo -en \\E[1;33m" 51 | # blue 52 | BOLD_BLUE="echo -en \\E[1;34m" 53 | # purple 54 | BOLD_PURPLE="echo -en \\E[1;35m" 55 | # cyan 56 | BOLD_CYAN="echo -en \\E[1;36m" 57 | # gray 58 | BOLD_GRAY="echo -en \\E[1;37m" 59 | # ------------- bold_color end---------------- 60 | 61 | # ------------- background_color start---------------- 62 | # black 63 | BG_BLACK="echo -en \\E[40m" 64 | # red 65 | BG_RED="echo -en \\E[41m" 66 | # green 67 | BG_GREEN="echo -en \\E[42m" 68 | # yellow 69 | BG_YELLOW="echo -en \\E[43m" 70 | # blue 71 | BG_BLUE="echo -en \\E[44m" 72 | # purple 73 | BG_PURPLE="echo -en \\E[45m" 74 | # cyan 75 | BG_CYAN="echo -en \\E[46m" 76 | # gray 77 | BG_GRAY="echo -en \\E[47m" 78 | # ------------- background_color end---------------- 79 | 80 | # close 81 | RESET="echo -en \\E[0m" 82 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/codeScan/bandit/bandit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for var in "$@" 4 | do 5 | case $var in 6 | --scan_module=*) 7 | scan_module=$(echo $var |cut -f2 -d=) 8 | ;; 9 | esac 10 | done 11 | 12 | source /auto-round/.azure-pipelines/scripts/change_color.sh 13 | RESET="echo -en \\E[0m \\n" # close color 14 | 15 | log_dir="/auto-round/.azure-pipelines/scripts/codeScan/scanLog" 16 | mkdir -p $log_dir 17 | 18 | python -m bandit -r -lll -iii "/auto-round/${scan_module}" >$log_dir/bandit.log 19 | 
exit_code=$? 20 | 21 | $BOLD_YELLOW && echo " ----------------- Current bandit cmd start --------------------------" && $RESET 22 | echo "python -m bandit -r -lll -iii /auto-round/${scan_module} > $log_dir/bandit.log" 23 | $BOLD_YELLOW && echo " ----------------- Current bandit cmd end --------------------------" && $RESET 24 | 25 | $BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------" 26 | cat $log_dir/bandit.log 27 | $BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET 28 | 29 | if [ ${exit_code} -ne 0 ]; then 30 | $BOLD_RED && echo "Error!! Please Click on the artifact button to download and view Bandit error details." && $RESET 31 | exit 1 32 | fi 33 | $BOLD_PURPLE && echo "Congratulations, Bandit check passed!" && $LIGHT_PURPLE && echo " You can click on the artifact button to see the log details." && $RESET 34 | exit 0 35 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/codeScan/codespell/autoround_dict.txt: -------------------------------------------------------------------------------- 1 | endianess -------------------------------------------------------------------------------- /.azure-pipelines/scripts/codeScan/pylint/pylint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for var in "$@" 4 | do 5 | case $var in 6 | --scan_module=*) 7 | scan_module=$(echo $var |cut -f2 -d=) 8 | ;; 9 | esac 10 | done 11 | 12 | source /auto-round/.azure-pipelines/scripts/change_color.sh 13 | RESET="echo -en \\E[0m \\n" # close color 14 | 15 | log_dir="/auto-round/.azure-pipelines/scripts/codeScan/scanLog" 16 | mkdir -p $log_dir 17 | 18 | pip install torch --index-url https://download.pytorch.org/whl/cpu 19 | pip install -r /auto-round/requirements.txt 20 | pip install -r /auto-round/requirements-cpu.txt 21 | 22 | echo "[DEBUG] list pipdeptree..." 23 | pip install pipdeptree 24 | pipdeptree 25 | 26 | python -m pylint -f json --disable=R,C,W,E1129 --enable=line-too-long --max-line-length=120 --extension-pkg-whitelist=numpy --ignored-classes=TensorProto,NodeProto \ 27 | --ignored-modules=tensorflow,keras,torch,torch.quantization,torch.tensor,torchvision,fairseq,mxnet,onnx,onnxruntime,intel_extension_for_pytorch,intel_extension_for_tensorflow,torchinfo,horovod,transformers \ 28 | /auto-round/${scan_module} > $log_dir/pylint.json 29 | 30 | exit_code=$? 31 | 32 | $BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------" && $RESET 33 | cat $log_dir/pylint.json 34 | $BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET 35 | 36 | if [ ${exit_code} -ne 0 ]; then 37 | $BOLD_RED && echo "Error!! Please Click on the artifact button to download and view Pylint error details." && $RESET 38 | exit 1 39 | fi 40 | $BOLD_PURPLE && echo "Congratulations, Pylint check passed!" && $LIGHT_PURPLE && echo " You can click on the artifact button to see the log details." 
&& $RESET 41 | exit 0 42 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/ut/.coverage: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | 4 | [report] 5 | include = 6 | */auto_round/** 7 | */auto_round_extension/** 8 | exclude_lines = 9 | pragma: no cover 10 | raise NotImplementedError 11 | raise TypeError 12 | except ImportError: 13 | except Exception as e: -------------------------------------------------------------------------------- /.azure-pipelines/scripts/ut/run_ut.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -xe 3 | 4 | # install requirements 5 | echo "##[group]set up UT env..." 6 | export TQDM_MININTERVAL=60 7 | pip install pytest-cov pytest-html 8 | pip install -r /auto-round/test/test_cpu/requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu 9 | pip list 10 | # install latest gguf for ut test 11 | git clone https://github.com/ggml-org/llama.cpp.git && cd llama.cpp/gguf-py && pip install . 12 | echo "##[endgroup]" 13 | pip list 14 | 15 | cd /auto-round/test/test_cpu || exit 1 16 | find . -type f -exec sed -i '/sys\.path\.insert(0, "\.\.")/d' {} + 17 | 18 | export LD_LIBRARY_PATH=${HOME}/.local/lib/:$LD_LIBRARY_PATH 19 | export FORCE_BF16=1 20 | export COVERAGE_RCFILE=/auto-round/.azure-pipelines/scripts/ut/.coverage 21 | auto_round_path=$(python -c 'import auto_round; print(auto_round.__path__[0])') 22 | 23 | LOG_DIR=/auto-round/log_dir 24 | mkdir -p ${LOG_DIR} 25 | ut_log_name=${LOG_DIR}/ut.log 26 | 27 | find . -name "test*.py" ! -name "*hpu_only*.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run.sh 28 | cat run.sh 29 | bash run.sh 2>&1 | tee ${ut_log_name} 30 | 31 | cp report.html ${LOG_DIR}/ 32 | cp coverage.xml ${LOG_DIR}/ 33 | 34 | if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then 35 | echo "##[error]Find errors in pytest case, please check the output..." 36 | exit 1 37 | fi 38 | 39 | # if ut pass, collect the coverage file into artifacts 40 | cp .coverage ${LOG_DIR}/.coverage 41 | 42 | echo "UT finished successfully! " 43 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/ut/run_ut_hpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -xe 3 | 4 | # install requirements 5 | echo "set up UT env..." 6 | pip install pytest-cov pytest-html 7 | pip list 8 | 9 | cd /auto-round/test/test_cpu || exit 1 10 | find . -type f -exec sed -i '/sys\.path\.insert(0, "\.\.")/d' {} + 11 | 12 | export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH 13 | export FORCE_BF16=1 14 | export COVERAGE_RCFILE=/auto-round/.azure-pipelines/scripts/ut/.coverage 15 | auto_round_path=$(python -c 'import auto_round; print(auto_round.__path__[0])') 16 | 17 | LOG_DIR=/auto-round/log_dir 18 | mkdir -p ${LOG_DIR} 19 | ut_log_name=${LOG_DIR}/ut.log 20 | 21 | find . -name "test*hpu_only.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_lazy.sh 22 | find . 
-name "test*hpu_only.py" | sed "s,\.\/,python -m pytest --mode compile --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_compile.sh 23 | 24 | cat run_lazy.sh 25 | bash run_lazy.sh 2>&1 | tee ${ut_log_name} 26 | 27 | cat run_compile.sh 28 | bash run_compile.sh 2>&1 | tee ${ut_log_name} 29 | 30 | cp report.html ${LOG_DIR}/ 31 | cp coverage.xml ${LOG_DIR}/ 32 | 33 | if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then 34 | echo "##[error]Find errors in pytest case, please check the output..." 35 | exit 1 36 | fi 37 | 38 | # if ut pass, collect the coverage file into artifacts 39 | cp .coverage ${LOG_DIR}/.coverage 40 | 41 | echo "UT finished successfully! " -------------------------------------------------------------------------------- /.azure-pipelines/template/code-scan-template.yml: -------------------------------------------------------------------------------- 1 | parameters: 2 | - name: codeScanFileName 3 | type: string 4 | - name: uploadPath 5 | type: string 6 | 7 | - name: codeScanContainerName 8 | type: string 9 | default: "codeScan" 10 | - name: scanModule 11 | type: string 12 | default: "auto_round" 13 | 14 | steps: 15 | - template: docker-template.yml 16 | parameters: 17 | dockerConfigName: "commonDockerConfig" 18 | repoName: "code-scan" 19 | repoTag: "1.0" 20 | dockerFileName: "DockerfileCodeScan" 21 | containerName: ${{ parameters.codeScanContainerName }} 22 | 23 | - script: | 24 | docker exec ${{ parameters.codeScanContainerName }} bash -c "bash /auto-round/.azure-pipelines/scripts/codeScan/${{ parameters.codeScanFileName }}/${{ parameters.codeScanFileName }}.sh \ 25 | --scan_module=${{ parameters.scanModule }}" 26 | displayName: "${{ parameters.codeScanFileName }} Check" 27 | 28 | - task: PublishPipelineArtifact@1 29 | condition: succeededOrFailed() 30 | inputs: 31 | targetPath: .azure-pipelines/scripts/codeScan/scanLog/${{ parameters.uploadPath }} 32 | artifact: ${{ parameters.codeScanFileName }} 33 | publishLocation: "pipeline" 34 | displayName: "PublishPipelineArtifact" 35 | 36 | - task: Bash@3 37 | condition: always() 38 | inputs: 39 | targetType: "inline" 40 | script: | 41 | docker exec ${{ parameters.codeScanContainerName }} bash -c "rm -fr /auto-round/* && rm -fr /auto-round/.* || true" 42 | displayName: "Docker clean up" 43 | -------------------------------------------------------------------------------- /.azure-pipelines/template/docker-template.yml: -------------------------------------------------------------------------------- 1 | parameters: 2 | - name: dockerConfigName 3 | type: string 4 | default: "commonDockerConfig" 5 | - name: repoName 6 | type: string 7 | default: "auto-round" 8 | - name: repoTag 9 | type: string 10 | default: "py310" 11 | - name: dockerFileName 12 | type: string 13 | default: "Dockerfile" 14 | - name: containerName 15 | type: string 16 | - name: repo 17 | type: string 18 | default: "https://github.com/intel/auto-round" 19 | - name: imageSource 20 | type: string 21 | default: "build" 22 | 23 | steps: 24 | - task: Bash@3 25 | inputs: 26 | targetType: "inline" 27 | script: | 28 | docker ps -a 29 | if [[ $(docker ps -a | grep -i '${{ parameters.containerName }}'$) ]]; then 30 | docker start ${{ parameters.containerName }} 31 | echo "remove left files through container ..." 
32 | docker exec ${{ parameters.containerName }} bash -c "ls -a /auto-round && rm -fr /auto-round/* && rm -fr /auto-round/.* && ls -a /auto-round || true" 33 | fi 34 | displayName: "Docker clean up" 35 | 36 | - ${{ if eq(parameters.dockerConfigName, 'commonDockerConfig') }}: 37 | - script: | 38 | rm -fr ${BUILD_SOURCESDIRECTORY} || sudo rm -fr ${BUILD_SOURCESDIRECTORY} || true 39 | displayName: "Clean workspace" 40 | 41 | - checkout: self 42 | clean: true 43 | displayName: "Checkout out Repo" 44 | 45 | - ${{ if eq(parameters.dockerConfigName, 'gitCloneDockerConfig') }}: 46 | - script: | 47 | rm -fr ${BUILD_SOURCESDIRECTORY} || sudo rm -fr ${BUILD_SOURCESDIRECTORY} || true 48 | mkdir ${BUILD_SOURCESDIRECTORY} 49 | chmod 777 ${BUILD_SOURCESDIRECTORY} 50 | displayName: "Clean workspace" 51 | 52 | - checkout: none 53 | 54 | - script: | 55 | git clone ${{ parameters.repo }} ${BUILD_SOURCESDIRECTORY} 56 | git config --global --add safe.directory ${BUILD_SOURCESDIRECTORY} 57 | cd ${BUILD_SOURCESDIRECTORY} 58 | git checkout main 59 | displayName: "Checkout out main" 60 | 61 | - ${{ if eq(parameters.imageSource, 'build') }}: 62 | - script: | 63 | docker image prune -a -f 64 | if [[ ! $(docker images | grep -i ${{ parameters.repoName }}:${{ parameters.repoTag }}) ]]; then 65 | docker build --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) \ 66 | -f ${BUILD_SOURCESDIRECTORY}/.azure-pipelines/docker/${{parameters.dockerFileName}}.devel -t ${{ parameters.repoName }}:${{ parameters.repoTag }} . 67 | fi 68 | docker images | grep -i ${{ parameters.repoName }} 69 | if [[ $? -ne 0 ]]; then 70 | echo "NO Such Repo" 71 | exit 1 72 | fi 73 | displayName: "Build develop docker image" 74 | 75 | - ${{ if eq(parameters.imageSource, 'pull') }}: 76 | - script: | 77 | docker pull vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest 78 | displayName: "Pull habana docker image" 79 | 80 | - script: | 81 | docker stop ${{ parameters.containerName }} 82 | docker rm -vf ${{ parameters.containerName }} || true 83 | env | sort 84 | displayName: "Clean docker container" 85 | 86 | - ${{ if ne(parameters.containerName, '') }}: 87 | - task: Bash@3 88 | inputs: 89 | targetType: "inline" 90 | script: | 91 | if [[ "${{ parameters.imageSource }}" == "build" ]]; then 92 | docker run -dit --disable-content-trust --privileged --name=${{ parameters.containerName }} --shm-size="2g" \ 93 | -v ${BUILD_SOURCESDIRECTORY}:/auto-round -v /tf_dataset:/tf_dataset -v /tf_dataset2:/tf_dataset2 \ 94 | ${{ parameters.repoName }}:${{ parameters.repoTag }} 95 | else 96 | docker run -dit --disable-content-trust --privileged --name=${{ parameters.containerName }} --shm-size="2g" \ 97 | --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host \ 98 | -v ${BUILD_SOURCESDIRECTORY}:/auto-round vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest 99 | docker exec ${{ parameters.containerName }} bash -c "ln -sf \$(which python3) /usr/bin/python" 100 | fi 101 | echo "Show the container list after docker run ... 
" 102 | docker ps -a 103 | displayName: "Docker run - ${{ parameters.containerName }} Container" 104 | -------------------------------------------------------------------------------- /.azure-pipelines/template/ut-template.yml: -------------------------------------------------------------------------------- 1 | parameters: 2 | - name: dockerConfigName 3 | type: string 4 | default: "commonDockerConfig" 5 | - name: repo 6 | type: string 7 | default: "https://github.com/intel/auto-round" 8 | - name: utScriptFileName 9 | type: string 10 | - name: uploadPath 11 | type: string 12 | - name: utArtifact 13 | type: string 14 | - name: utTestMode 15 | type: string 16 | default: "coverage" 17 | - name: utContainerName 18 | type: string 19 | default: "AutoRoundUnitTest" 20 | - name: imageSource 21 | type: string 22 | default: "build" 23 | 24 | steps: 25 | - template: docker-template.yml 26 | parameters: 27 | dockerConfigName: ${{ parameters.dockerConfigName }} 28 | repoName: "auto-round" 29 | repoTag: "py310" 30 | dockerFileName: "Dockerfile" 31 | containerName: ${{ parameters.utContainerName }} 32 | repo: ${{ parameters.repo }} 33 | imageSource: ${{ parameters.imageSource }} 34 | 35 | - ${{ if eq(parameters.imageSource, 'build') }}: 36 | - script: | 37 | docker exec ${{ parameters.utContainerName }} bash -c "cd /auto-round \ 38 | && pip install torch==2.7.0 --index-url https://download.pytorch.org/whl/cpu \ 39 | && pip install intel-extension-for-pytorch==2.7.0 \ 40 | && pip install .[cpu] \ 41 | && pip list" 42 | displayName: "Env Setup" 43 | 44 | - ${{ if eq(parameters.imageSource, 'pull') }}: 45 | - script: | 46 | docker exec ${{ parameters.utContainerName }} bash -c "cd /auto-round \ 47 | && python setup.py bdist_wheel lib \ 48 | && pip install dist/*.whl \ 49 | && pip list" 50 | displayName: "HPU Env Setup" 51 | 52 | - script: | 53 | docker exec ${{ parameters.utContainerName }} bash -c "cd /auto-round/.azure-pipelines/scripts \ 54 | && bash ut/${{ parameters.utScriptFileName }}.sh ${{ parameters.utTestMode }}" 55 | displayName: "Run UT" 56 | 57 | - task: PublishPipelineArtifact@1 58 | condition: succeededOrFailed() 59 | inputs: 60 | targetPath: ${{ parameters.uploadPath }} 61 | artifact: ${{ parameters.utArtifact }}_coverage 62 | publishLocation: "pipeline" 63 | 64 | - task: UseDotNet@2 65 | displayName: 'Use .NET Core sdk 7.0.x' 66 | inputs: 67 | version: 7.0.x 68 | 69 | - task: PublishCodeCoverageResults@2 70 | inputs: 71 | summaryFileLocation: ${{ parameters.uploadPath }}/coverage.xml 72 | 73 | - task: Bash@3 74 | condition: always() 75 | inputs: 76 | targetType: "inline" 77 | script: | 78 | docker exec ${{ parameters.utContainerName }} bash -c "rm -rf /auto-round/* && rm -rf /auto-round/.* || true" 79 | docker stop ${{ parameters.utContainerName }} 80 | docker rm -vf ${{ parameters.utContainerName }} || true 81 | displayName: "Docker clean up" 82 | -------------------------------------------------------------------------------- /.azure-pipelines/unit-test-hpu.yml: -------------------------------------------------------------------------------- 1 | trigger: none 2 | 3 | pr: 4 | autoCancel: true 5 | drafts: false 6 | branches: 7 | include: 8 | - main 9 | paths: 10 | include: 11 | - auto_round 12 | - test/test*hpu*' 13 | - setup.py 14 | - requirements-hpu.txt 15 | - .azure-pipelines/scripts/ut 16 | - .azure-pipelines/template/docker-template.yml 17 | - .azure-pipelines/template/ut-template.yml 18 | exclude: 19 | - auto_round/export/export_to_autogptq 20 | - auto_round/export/export_to_awq 21 | 
- "*.md" 22 | - "**/*.md" 23 | 24 | pool: GAUDI 25 | 26 | variables: 27 | IMAGE_NAME: "auto-round" 28 | IMAGE_TAG: "py310" 29 | UPLOAD_PATH: $(Build.SourcesDirectory)/log_dir 30 | DOWNLOAD_PATH: $(Build.SourcesDirectory)/log_dir 31 | ARTIFACT_NAME: "UT_coverage_report" 32 | REPO: $(Build.Repository.Uri) 33 | 34 | stages: 35 | - stage: Unit_test 36 | displayName: Unit Test 37 | dependsOn: [] 38 | jobs: 39 | - job: 40 | displayName: Unit Test 41 | steps: 42 | - template: template/ut-template.yml 43 | parameters: 44 | imageSource: "pull" 45 | dockerConfigName: "commonDockerConfig" 46 | utScriptFileName: "run_ut_hpu" 47 | uploadPath: $(UPLOAD_PATH) 48 | utArtifact: "ut" 49 | 50 | # - stage: Unit_test_baseline 51 | # displayName: Unit Test Baseline 52 | # dependsOn: [] 53 | # jobs: 54 | # - job: 55 | # displayName: Unit Test 56 | # steps: 57 | # - template: template/ut-template.yml 58 | # parameters: 59 | # imageSource: "pull" 60 | # dockerConfigName: "gitCloneDockerConfig" 61 | # utScriptFileName: "run_ut_hpu" 62 | # uploadPath: $(UPLOAD_PATH) 63 | # utArtifact: "ut_baseline" 64 | # repo: $(REPO) 65 | -------------------------------------------------------------------------------- /.azure-pipelines/unit-test.yml: -------------------------------------------------------------------------------- 1 | trigger: none 2 | 3 | pr: 4 | autoCancel: true 5 | drafts: false 6 | branches: 7 | include: 8 | - main 9 | paths: 10 | include: 11 | - auto_round 12 | - auto_round_extension 13 | - test 14 | - setup.py 15 | - requirements.txt 16 | - .azure-pipelines/scripts/ut 17 | - .azure-pipelines/unit-test.yml 18 | - .azure-pipelines/template/ut-template.yml 19 | - .azure-pipelines/template/docker-template.yml 20 | exclude: 21 | - test/test*hpu* 22 | - "*.md" 23 | - "**/*.md" 24 | 25 | pool: ICX-16C 26 | 27 | variables: 28 | IMAGE_NAME: "auto-round" 29 | IMAGE_TAG: "py310" 30 | UPLOAD_PATH: $(Build.SourcesDirectory)/log_dir 31 | DOWNLOAD_PATH: $(Build.SourcesDirectory)/log_dir 32 | ARTIFACT_NAME: "UT_coverage_report" 33 | REPO: $(Build.Repository.Uri) 34 | 35 | stages: 36 | - stage: Unit_test 37 | displayName: Unit Test 38 | dependsOn: [] 39 | jobs: 40 | - job: 41 | displayName: Unit Test 42 | timeoutInMinutes: 240 43 | steps: 44 | - template: template/ut-template.yml 45 | parameters: 46 | dockerConfigName: "commonDockerConfig" 47 | utScriptFileName: "run_ut" 48 | uploadPath: $(UPLOAD_PATH) 49 | utArtifact: "ut" 50 | 51 | # - stage: Unit_test_baseline 52 | # displayName: Unit Test Baseline 53 | # dependsOn: [] 54 | # jobs: 55 | # - job: 56 | # displayName: Unit Test 57 | # timeoutInMinutes: 120 58 | # steps: 59 | # - template: template/ut-template.yml 60 | # parameters: 61 | # dockerConfigName: "gitCloneDockerConfig" 62 | # utScriptFileName: "run_ut" 63 | # uploadPath: $(UPLOAD_PATH) 64 | # utArtifact: "ut_baseline" 65 | # repo: $(REPO) 66 | 67 | # - stage: Coverage 68 | # displayName: "Coverage Compare" 69 | # pool: 70 | # vmImage: "ubuntu-latest" 71 | # dependsOn: [Unit_test, Unit_test_baseline] 72 | # jobs: 73 | # - job: CollectDatafiles 74 | # steps: 75 | # - script: | 76 | # if [[ ! $(docker images | grep -i ${IMAGE_NAME}:${IMAGE_TAG}) ]]; then 77 | # docker build -f ${BUILD_SOURCESDIRECTORY}/.azure-pipelines/docker/Dockerfile.devel -t ${IMAGE_NAME}:${IMAGE_TAG} . 78 | # fi 79 | # docker images | grep -i ${IMAGE_NAME} 80 | # if [[ $? 
-ne 0 ]]; then 81 | # echo "NO Such Repo" 82 | # exit 1 83 | # fi 84 | # displayName: "Build develop docker image" 85 | 86 | # - task: DownloadPipelineArtifact@2 87 | # inputs: 88 | # artifact: 89 | # patterns: "*_coverage/.coverage" 90 | # path: $(DOWNLOAD_PATH) 91 | 92 | # - script: | 93 | # echo "--- create container ---" 94 | # docker run -d -it --name="collectLogs" -v ${BUILD_SOURCESDIRECTORY}:/auto-round ${IMAGE_NAME}:${IMAGE_TAG} /bin/bash 95 | # echo "--- docker ps ---" 96 | # docker ps 97 | # echo "--- collect logs ---" 98 | # docker exec collectLogs bash -c "cd /auto-round \ 99 | # && pip install -r requirements.txt \ 100 | # && pip install -vvv --no-build-isolation -e .[cpu] \ 101 | # && pip list" 102 | # docker exec collectLogs /bin/bash +x -c "cd /auto-round/.azure-pipelines/scripts \ 103 | # && bash ut/collect_log.sh" 104 | # displayName: "Collect UT Coverage" 105 | 106 | # - task: PublishPipelineArtifact@1 107 | # condition: succeededOrFailed() 108 | # inputs: 109 | # targetPath: $(UPLOAD_PATH) 110 | # artifact: $(ARTIFACT_NAME) 111 | # publishLocation: "pipeline" 112 | 113 | # - task: Bash@3 114 | # condition: always() 115 | # inputs: 116 | # targetType: "inline" 117 | # script: | 118 | # docker exec collectLogs bash -c "rm -fr /auto-round/* && rm -fr /auto-round/.* || true" 119 | # displayName: "Docker clean up" 120 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vs 2 | .vscode 3 | __pycache__ 4 | *.egg-info/ 5 | build/* 6 | .eggs/ 7 | dist/ 8 | .cache/ 9 | .clangd 10 | CMakeUserPresets.json 11 | tmp_autoround/ 12 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | ci: 2 | autofix_prs: true 3 | autoupdate_schedule: quarterly 4 | 5 | repos: 6 | - repo: https://github.com/Lucas-C/pre-commit-hooks 7 | rev: v1.5.5 8 | hooks: 9 | - id: insert-license 10 | files: | 11 | (?x)^( 12 | auto_round/.*(py|yaml|yml|sh) 13 | )$ 14 | args: 15 | [ 16 | --license-filepath=.azure-pipelines/license_template.txt, 17 | --use-current-year, 18 | --detect-license-in-X-top-lines=40, 19 | --skip-license-insertion-comment=Copyright, 20 | ] 21 | 22 | - repo: https://github.com/codespell-project/codespell 23 | rev: v2.4.1 24 | hooks: 25 | - id: codespell 26 | args: [-w] 27 | additional_dependencies: 28 | - tomli 29 | exclude: | 30 | (?x)^( 31 | examples/.*(txt|patch) 32 | )$ 33 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ### License 4 | 5 | is licensed under the terms in [LICENSE]. By contributing to the project, you agree to the license and copyright terms therein and release your contribution under these terms. 6 | 7 | ### Sign your work 8 | 9 | Please use the sign-off line at the end of the patch. Your signature certifies that you wrote the patch or otherwise have the right to pass it on as an open-source patch. The rules are pretty simple: if you can certify 10 | the below (from [developercertificate.org](http://developercertificate.org/)): 11 | 12 | ``` 13 | Developer Certificate of Origin 14 | Version 1.1 15 | 16 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 
17 | 660 York Street, Suite 102, 18 | San Francisco, CA 94110 USA 19 | 20 | Everyone is permitted to copy and distribute verbatim copies of this 21 | license document, but changing it is not allowed. 22 | 23 | Developer's Certificate of Origin 1.1 24 | 25 | By making a contribution to this project, I certify that: 26 | 27 | (a) The contribution was created in whole or in part by me and I 28 | have the right to submit it under the open source license 29 | indicated in the file; or 30 | 31 | (b) The contribution is based upon previous work that, to the best 32 | of my knowledge, is covered under an appropriate open source 33 | license and I have the right under that license to submit that 34 | work with modifications, whether created in whole or in part 35 | by me, under the same open source license (unless I am 36 | permitted to submit under a different license), as indicated 37 | in the file; or 38 | 39 | (c) The contribution was provided directly to me by some other 40 | person who certified (a), (b) or (c) and I have not modified 41 | it. 42 | 43 | (d) I understand and agree that this project and the contribution 44 | are public and that a record of the contribution (including all 45 | personal information I submit with it, including my sign-off) is 46 | maintained indefinitely and may be redistributed consistent with 47 | this project or the open source license(s) involved. 48 | ``` 49 | 50 | Then you just add a line to every git commit message: 51 | 52 | Signed-off-by: Joe Smith 53 | 54 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 55 | 56 | If you set your `user.name` and `user.email` git configs, you can sign your 57 | commit automatically with `git commit -s`. 58 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | include requirements-cpu.txt 3 | include requirements-lib.txt 4 | exclude test/* 5 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation. 3 | 4 | ## Reporting a Vulnerability 5 | Please report any security vulnerabilities in this project utilizing the guidelines [here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). 6 | -------------------------------------------------------------------------------- /auto_round/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | from .autoround import AutoRound, AutoRoundAdam, AutoRoundOPT 15 | from .mllm import AutoRoundMLLM 16 | from auto_round.utils import LazyImport 17 | 18 | def __getattr__(name): 19 | if name == 'AutoHfQuantizer': 20 | from auto_round.inference.auto_quantizer import AutoHfQuantizer 21 | return AutoHfQuantizer 22 | if name == 'AutoRoundConfig': 23 | from auto_round.inference.auto_quantizer import AutoRoundConfig 24 | return AutoRoundConfig 25 | 26 | raise AttributeError(f"auto-round has no attribute '{name}'") 27 | 28 | from .version import __version__ 29 | -------------------------------------------------------------------------------- /auto_round/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import sys 15 | 16 | def run_eval(): 17 | from auto_round.script.llm import setup_eval_parser 18 | args = setup_eval_parser() 19 | if args.eval_task_by_task: 20 | from auto_round.script.llm import eval_task_by_task 21 | eval_task_by_task( 22 | model=args.model, 23 | device=args.device, 24 | tasks=args.tasks, 25 | batch_size=args.eval_bs, 26 | trust_remote_code=not args.disable_trust_remote_code, 27 | eval_model_dtype=args.eval_model_dtype 28 | ) 29 | else: 30 | from auto_round.script.llm import eval 31 | eval(args) 32 | 33 | 34 | def run(): 35 | if "--eval" in sys.argv: 36 | sys.argv.remove("--eval") 37 | run_eval() 38 | else: 39 | from auto_round.script.llm import setup_parser, tune 40 | args = setup_parser() 41 | tune(args) 42 | 43 | def run_best(): 44 | from auto_round.script.llm import setup_best_parser, tune 45 | args = setup_best_parser() 46 | tune(args) 47 | 48 | def run_light(): 49 | from auto_round.script.llm import setup_light_parser, tune 50 | args = setup_light_parser() 51 | tune(args) 52 | 53 | def run_fast(): 54 | from auto_round.script.llm import setup_fast_parser, tune 55 | args = setup_fast_parser() 56 | tune(args) 57 | 58 | 59 | def run_mllm(): 60 | if "--eval" in sys.argv: 61 | from auto_round.script.mllm import setup_lmeval_parser, eval 62 | sys.argv.remove("--eval") 63 | args = setup_lmeval_parser() 64 | eval(args) 65 | elif "--lmms" in sys.argv: 66 | sys.argv.remove("--lmms") 67 | run_lmms() 68 | else: 69 | from auto_round.script.mllm import setup_parser, tune 70 | args = setup_parser() 71 | tune(args) 72 | 73 | def run_lmms(): 74 | # from auto_round.script.lmms_eval import setup_lmms_args, eval 75 | from auto_round.script.mllm import setup_lmms_parser, lmms_eval 76 | args = setup_lmms_parser() 77 | lmms_eval(args) 78 | 79 | def switch(): 80 | if "--mllm" in sys.argv: 81 | sys.argv.remove("--mllm") 82 | run_mllm() 83 | else: 84 | run() 85 | 86 | if __name__ == '__main__': 87 | switch() 88 | 89 | -------------------------------------------------------------------------------- /auto_round/data_type/__init__.py: -------------------------------------------------------------------------------- 1 | # 
Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import auto_round.data_type.int 16 | import auto_round.data_type.mxfp 17 | import auto_round.data_type.fp8 18 | from auto_round.data_type.register import QUANT_FUNC_WITH_DTYPE 19 | import auto_round.data_type.w4fp8 20 | from auto_round.data_type.utils import get_quant_func 21 | import auto_round.data_type.nvfp 22 | 23 | -------------------------------------------------------------------------------- /auto_round/data_type/register.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | QUANT_FUNC_WITH_DTYPE = {} 17 | 18 | 19 | def register_dtype(name): 20 | """Class decorator to register a EXPORT subclass to the registry. 21 | 22 | Decorator function used before a Pattern subclass. 23 | 24 | Args: 25 | cls (class): The subclass of register. 26 | name: A string. Define the export type. 27 | 28 | Returns: 29 | cls: The class of register. 30 | """ 31 | 32 | def register(dtype): 33 | QUANT_FUNC_WITH_DTYPE[name] = dtype 34 | return dtype 35 | 36 | return register 37 | -------------------------------------------------------------------------------- /auto_round/eval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. -------------------------------------------------------------------------------- /auto_round/eval/evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Optional, Union 16 | 17 | from lm_eval import simple_evaluate as lm_simple_evaluate 18 | import os 19 | 20 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 21 | 22 | from lm_eval.models.huggingface import HFLM 23 | 24 | 25 | def simple_evaluate_user_model( 26 | user_model, 27 | tokenizer, 28 | batch_size: Optional[int] = 1, 29 | max_batch_size: Optional[int] = 64, 30 | eval_model_dtype = "auto", 31 | **kwargs 32 | ): 33 | hflm = HFLM( 34 | pretrained=user_model, 35 | tokenizer=tokenizer, 36 | batch_size=batch_size, 37 | max_batch_size=max_batch_size, 38 | dtype=eval_model_dtype) 39 | return lm_simple_evaluate( 40 | model=hflm, model_args=None, batch_size=batch_size, max_batch_size=max_batch_size, **kwargs) 41 | 42 | 43 | def simple_evaluate( 44 | model, 45 | model_args: Optional[Union[str, dict]] = None, 46 | batch_size: Optional[int] = None, 47 | max_batch_size: Optional[int] = None, 48 | device: Optional[str] = None, 49 | **kwargs): 50 | try: 51 | from auto_round import AutoRoundConfig 52 | except: 53 | from auto_round.inference.auto_quantizer import AutoHfQuantizer 54 | 55 | return lm_simple_evaluate( 56 | model=model, 57 | model_args=model_args, 58 | batch_size=batch_size, 59 | max_batch_size=max_batch_size, 60 | device=device, 61 | **kwargs) 62 | -------------------------------------------------------------------------------- /auto_round/export/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from auto_round.export.register import EXPORT_FORMAT, PACKING_LAYER_WITH_FORMAT, register_format, register_layer_packing 16 | 17 | 18 | @register_format("auto_gptq") 19 | def _save_quantized_as_autogptq(*args, **kwargs): 20 | from auto_round.export.export_to_autogptq.export import save_quantized_as_autogptq 21 | 22 | return save_quantized_as_autogptq(*args, **kwargs) 23 | 24 | 25 | @register_format("itrex") 26 | def _save_quantized_as_itrex(*args, **kwargs): 27 | from auto_round.export.export_to_itrex.export import save_quantized_as_itrex 28 | 29 | return save_quantized_as_itrex(*args, **kwargs) 30 | 31 | 32 | @register_format("itrex_xpu") 33 | def _save_quantized_as_itrex_xpu(*args, **kwargs): 34 | from auto_round.export.export_to_itrex.export import save_quantized_as_itrex_xpu 35 | 36 | return save_quantized_as_itrex_xpu(*args, **kwargs) 37 | 38 | 39 | @register_format("auto_round") 40 | def _save_quantized_as_autoround(*args, **kwargs): 41 | from auto_round.export.export_to_autoround.export import save_quantized_as_autoround 42 | 43 | return save_quantized_as_autoround(*args, **kwargs) 44 | 45 | 46 | @register_format("auto_awq") 47 | def _save_quantized_as_autoawq(*args, **kwargs): 48 | from auto_round.export.export_to_awq.export import save_quantized_as_autoawq 49 | 50 | return save_quantized_as_autoawq(*args, **kwargs) 51 | 52 | @register_format("gguf") 53 | def _save_quantized_as_gguf(*args, **kwargs): 54 | from auto_round.export.export_to_gguf.export import save_quantized_as_gguf 55 | return save_quantized_as_gguf(*args, **kwargs) 56 | 57 | 58 | @register_layer_packing("auto_round") 59 | def _packing_layer_with_autoround(*args, **kwargs): 60 | from auto_round.export.export_to_autoround.export import pack_layer 61 | 62 | return pack_layer(*args, **kwargs) 63 | 64 | 65 | @register_layer_packing("auto_gptq") 66 | def _packing_layer_with_autogptq(*args, **kwargs): 67 | from auto_round.export.export_to_autogptq.export import pack_layer 68 | 69 | return pack_layer(*args, **kwargs) 70 | 71 | 72 | @register_layer_packing("auto_awq") 73 | def _packing_layer_with_autoawq(*args, **kwargs): 74 | from auto_round.export.export_to_awq.export import pack_layer 75 | 76 | return pack_layer(*args, **kwargs) 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /auto_round/export/export_to_autogptq/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /auto_round/export/export_to_autogptq/qlinear_triton.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import math 16 | 17 | import numpy as np 18 | import torch 19 | import torch.nn as nn 20 | import transformers 21 | 22 | class TritonModuleMixin: 23 | @classmethod 24 | def warmup(cls, model, transpose=False, seqlen=2048): 25 | pass 26 | 27 | 28 | class QuantLinear(nn.Module, TritonModuleMixin): 29 | QUANT_TYPE = "triton" 30 | 31 | def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=False, **kwargs): 32 | super().__init__() 33 | if bits not in [2, 4, 8]: 34 | raise NotImplementedError("Only 2,4,8 bits are supported.") 35 | if infeatures % 32 != 0 or outfeatures % 32 != 0: 36 | raise NotImplementedError("in_feature and out_feature must be divisible by 32.") 37 | self.infeatures = infeatures 38 | self.outfeatures = outfeatures 39 | self.bits = bits 40 | self.group_size = group_size if group_size != -1 else infeatures 41 | self.maxq = 2 ** self.bits - 1 42 | 43 | self.register_buffer( 44 | "qweight", 45 | torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32), 46 | ) 47 | self.register_buffer( 48 | "qzeros", 49 | torch.zeros( 50 | ( 51 | math.ceil(infeatures / self.group_size), 52 | outfeatures // 32 * self.bits, 53 | ), 54 | dtype=torch.int32, 55 | ), 56 | ) 57 | self.register_buffer( 58 | "scales", 59 | torch.zeros( 60 | (math.ceil(infeatures / self.group_size), outfeatures), 61 | dtype=torch.float16, 62 | ), 63 | ) 64 | self.register_buffer( 65 | "g_idx", 66 | torch.tensor([i // self.group_size for i in range(infeatures)], dtype=torch.int32), 67 | ) 68 | 69 | if bias: 70 | self.register_buffer("bias", torch.zeros((outfeatures), dtype=torch.float16)) 71 | else: 72 | self.bias = None 73 | 74 | self.trainable = trainable 75 | 76 | def post_init(self): 77 | pass 78 | 79 | def pack(self, linear, scales, zeros, g_idx=None): 80 | scales_t = scales.t().contiguous() 81 | self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx 82 | if linear.bias is not None: 83 | self.bias = linear.bias.clone().half() 84 | self.scales = scales_t.clone().half() 85 | device = "cpu" 86 | if torch.cuda.is_available(): 87 | device = "cuda:0" 88 | elif torch.xpu.is_available(): 89 | device = "xpu:0" 90 | 91 | W = linear.weight.data.to(device).clone() 92 | if isinstance(linear, nn.Conv2d): 93 | W = W.flatten(1) 94 | if isinstance(linear, transformers.pytorch_utils.Conv1D): 95 | W = W.t() 96 | 97 | repeat_scales = scales.to(device).repeat_interleave(self.group_size, 1) 98 | if isinstance(zeros, torch.Tensor): 99 | repeat_zeros = zeros.to(device).repeat_interleave(self.group_size, 1) 100 | intweight = torch.round(W.to(device) / repeat_scales[:, :W.shape[1]] + repeat_zeros[:, :W.shape[1]]).to( 101 | torch.int32) 102 | else: 103 | repeat_zeros = zeros 104 | intweight = torch.round(W.to(device) / repeat_scales[:, :W.shape[1]] + repeat_zeros).to( 105 | torch.int32) 106 | 107 | del repeat_scales 108 | intweight = intweight.reshape(-1, intweight.shape[1] // 32 * self.bits, 32 // self.bits) 109 | order_map = torch.arange(0, 32 // self.bits, device=device) * self.bits 110 | intweight = intweight << order_map 111 | intweight = 
torch.sum(intweight, dim=-1) 112 | 113 | intweight = intweight.t().contiguous().to(torch.int32) 114 | self.qweight = intweight.to("cpu") 115 | 116 | if isinstance(zeros, torch.Tensor): 117 | zeros = zeros.t().contiguous() 118 | zeros -= 1 119 | zeros = zeros.numpy().astype(np.uint32) 120 | qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32) 121 | i = 0 122 | col = 0 123 | while col < qzeros.shape[1]: 124 | for j in range(i, i + (32 // self.bits)): 125 | qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i)) 126 | i += 32 // self.bits 127 | col += 1 128 | 129 | qzeros = qzeros.astype(np.int32) 130 | self.qzeros = torch.from_numpy(qzeros) 131 | else: 132 | zeros -= 1 133 | shape = scales_t.shape 134 | value = 0 135 | for j in range(0, (32 // self.bits)): 136 | value |= zeros << (self.bits * j) 137 | qzeros = np.ones((shape[0], shape[1] // 32 * self.bits), dtype=np.uint32) * value 138 | qzeros = qzeros.astype(np.int32) 139 | self.qzeros = torch.from_numpy(qzeros) 140 | 141 | 142 | __all__ = ["QuantLinear"] -------------------------------------------------------------------------------- /auto_round/export/export_to_autoround/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .export import save_quantized_as_autoround 16 | 17 | -------------------------------------------------------------------------------- /auto_round/export/export_to_awq/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .export import save_quantized_as_autoawq 16 | 17 | 18 | -------------------------------------------------------------------------------- /auto_round/export/export_to_gguf/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /auto_round/export/export_to_gguf/config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | GGUF_CONFIG = {} 16 | 17 | GGUF_CONFIG["gguf:q4_0"] = {"bits": 4, "act_bits": 16, "group_size": 32, "asym": False, "sym": True, "data_type": "int"} 18 | 19 | GGUF_CONFIG["gguf:q4_1"] = { 20 | "bits": 4, 21 | "act_bits": 16, 22 | "group_size": 32, 23 | "asym": True, 24 | "sym": False, 25 | "data_type": "int_asym_float_zp" 26 | } 27 | 28 | GGUF_CONFIG["gguf:q5_0"] = {"bits": 5, "act_bits": 16, "group_size": 32, "asym": False, "sym": True, "data_type": "int"} 29 | 30 | GGUF_CONFIG["gguf:q5_1"] = { 31 | "bits": 5, 32 | "act_bits": 16, 33 | "group_size": 32, 34 | "asym": True, 35 | "sym": False, 36 | "data_type": "int_asym_float_zp" 37 | } 38 | 39 | GGUF_CONFIG["gguf:q8_0"] = {"bits": 8, "act_bits": 16, "group_size": 32, "asym": False, "sym": True, "data_type": "int"} 40 | 41 | 42 | GGUF_CONFIG["gguf:q2_k_s"] = { 43 | "bits": 2, 44 | "act_bits": 16, 45 | "super_group_size": 16, 46 | "super_bits": 4, 47 | "group_size": 16, 48 | "asym": True, 49 | "sym": False, 50 | "data_type": "int_asym_dq" 51 | } 52 | 53 | GGUF_CONFIG["gguf:q3_k_s"] = { 54 | "bits": 3, 55 | "act_bits": 16, 56 | "super_group_size": 16, 57 | "super_bits": 6, 58 | "group_size": 16, 59 | "asym": False, 60 | "sym": True, 61 | "data_type": "int_sym_dq" 62 | } 63 | 64 | GGUF_CONFIG["gguf:q4_k_s"] = { 65 | "bits": 4, 66 | "act_bits": 16, 67 | "super_group_size": 8, 68 | "super_bits": 6, 69 | "group_size": 32, 70 | "asym": True, 71 | "sym": False, 72 | "data_type": "int_asym_dq" 73 | } 74 | 75 | GGUF_CONFIG["gguf:q5_k_s"] = { 76 | "bits": 5, 77 | "act_bits": 16, 78 | "super_group_size": 8, 79 | "super_bits": 6, 80 | "group_size": 32, 81 | "asym": True, 82 | "sym": False, 83 | "data_type": "int_asym_dq" 84 | } 85 | 86 | GGUF_CONFIG["gguf:q6_k"] = GGUF_CONFIG["gguf:q6_k_s"] = { 87 | "bits": 6, 88 | "act_bits": 16, 89 | "super_group_size": 16, 90 | "super_bits": 8, 91 | "group_size": 16, 92 | "asym": False, 93 | "sym": True, 94 | "data_type": "int_sym_dq" 95 | } 96 | -------------------------------------------------------------------------------- /auto_round/export/export_to_gguf/export.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, 
Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import sys 17 | import shutil 18 | import torch 19 | from .convert import Model 20 | from auto_round.utils import logger, LazyImport 21 | from pathlib import Path 22 | import time 23 | 24 | gguf = LazyImport("gguf") 25 | 26 | FTYPE_MAP: dict[str, gguf.LlamaFileType] = { 27 | "f32": gguf.LlamaFileType.ALL_F32, 28 | "f16": gguf.LlamaFileType.MOSTLY_F16, 29 | "bf16": gguf.LlamaFileType.MOSTLY_BF16, 30 | "q4_0": gguf.LlamaFileType.MOSTLY_Q4_0, 31 | "q4_1": gguf.LlamaFileType.MOSTLY_Q4_1, 32 | "q5_0": gguf.LlamaFileType.MOSTLY_Q5_0, 33 | "q5_1": gguf.LlamaFileType.MOSTLY_Q5_1, 34 | "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, 35 | "q2_k_s": gguf.LlamaFileType.MOSTLY_Q2_K_S, 36 | "q3_k_s": gguf.LlamaFileType.MOSTLY_Q3_K_S, 37 | "q4_k_s": gguf.LlamaFileType.MOSTLY_Q4_K_S, 38 | "q5_k_s": gguf.LlamaFileType.MOSTLY_Q5_K_S, 39 | "q6_k": gguf.LlamaFileType.MOSTLY_Q6_K, 40 | "q6_k_s": gguf.LlamaFileType.MOSTLY_Q6_K, 41 | "auto": gguf.LlamaFileType.GUESSED, 42 | } 43 | 44 | def save_quantized_as_gguf(output_dir, backend="gguf:q4_0", **kwargs): 45 | """Export the model to gguf format.""" 46 | if output_dir is not None and os.path.exists(output_dir): 47 | logger.warning(f"{output_dir} already exists, this may cause model conflict") 48 | 49 | st = time.time() 50 | 51 | model = kwargs["model"] 52 | tokenizer = kwargs.get("tokenizer", None) 53 | config = model.config 54 | 55 | tmp_work_dir = Path(os.path.join(output_dir, 'tmp_dir')) 56 | if tokenizer is not None: 57 | tokenizer.save_pretrained(tmp_work_dir) 58 | config.save_pretrained(tmp_work_dir) 59 | 60 | with torch.inference_mode(): 61 | hparams = Model.load_hparams(tmp_work_dir) 62 | model_architecture = hparams["architectures"][0] 63 | try: 64 | model_class = Model.from_model_architecture(model_architecture) 65 | except NotImplementedError: 66 | logger.error(f"Model {model_architecture} is not supported") 67 | sys.exit(1) 68 | model_class = Model.from_model_architecture(model_architecture) 69 | model_name = model.name_or_path.split('/') 70 | if len(model_name[-1]) == 0: 71 | model_name = model_name[-2] 72 | else: 73 | model_name = model_name[-1] 74 | 75 | output_type = backend.split(":")[-1] 76 | if output_type.lower() not in FTYPE_MAP: 77 | raise TypeError(f"{output_type} type is not supported") 78 | output_type = FTYPE_MAP.get(output_type.lower()) 79 | 80 | 81 | model_instance = model_class( 82 | model, 83 | dir_model=tmp_work_dir, 84 | ftype=output_type, 85 | fname_out=Path(output_dir), 86 | is_big_endian=False, 87 | model_name=model_name, 88 | split_max_tensors=False, 89 | split_max_size=0, 90 | dry_run=False, 91 | small_first_shard=False) 92 | model_instance.write() 93 | rt = time.time() - st 94 | logger.info(f"Model successfully exported to {model_instance.fname_out}, running time={rt}") 95 | 96 | shutil.rmtree(tmp_work_dir, ignore_errors=True) 97 | 98 | return model 99 | 100 | -------------------------------------------------------------------------------- 
/auto_round/export/export_to_gguf/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | QK_K = 256 16 | K_SCALE_SIZE = 12 17 | GGML_QUANT_SIZES = { 18 | "bf16": (1, 2), 19 | "q4_0": (32, 2 + 16), 20 | "q4_1": (32, 2 + 2 + 16), 21 | "q5_0": (32, 2 + 4 + 16), 22 | "q5_1": (32, 2 + 2 + 4 + 16), 23 | "q8_0": (32, 2 + 32), 24 | "q2_k": (256, 2 + 2 + QK_K//16 + QK_K//4), 25 | "q3_k": (256, 2 + QK_K // 4 + QK_K // 8 + 12), 26 | "q4_k": (256, 2 + 2 + QK_K//2 + 12), 27 | "q5_k": (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12), 28 | "q6_k": (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16), 29 | "q8_k": (256, 4 + QK_K + QK_K // 8) 30 | } -------------------------------------------------------------------------------- /auto_round/export/export_to_itrex/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .export import save_quantized_as_itrex, pack_model 15 | from .config import QuantConfig 16 | -------------------------------------------------------------------------------- /auto_round/export/register.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | EXPORT_FORMAT = {} 17 | 18 | 19 | def register_format(name): 20 | """Class decorator to register a EXPORT subclass to the registry. 21 | 22 | Decorator function used before a Pattern subclass. 23 | 24 | Args: 25 | cls (class): The subclass of register. 26 | name: A string. Define the export type. 27 | 28 | Returns: 29 | cls: The class of register. 
30 | """ 31 | 32 | def register(format): 33 | EXPORT_FORMAT[name] = format 34 | return format 35 | 36 | return register 37 | 38 | 39 | 40 | PACKING_LAYER_WITH_FORMAT = {} 41 | 42 | def register_layer_packing(name): 43 | """Class decorator to register a EXPORT subclass to the registry. 44 | 45 | Decorator function used before a Pattern subclass. 46 | 47 | Args: 48 | cls (class): The subclass of register. 49 | name: A string. Define the export type. 50 | 51 | Returns: 52 | cls: The class of register. 53 | """ 54 | 55 | def register(format): 56 | PACKING_LAYER_WITH_FORMAT[name] = format 57 | return format 58 | 59 | return register 60 | -------------------------------------------------------------------------------- /auto_round/inference/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from auto_round.inference.convert_model import convert_hf_model, infer_target_device, post_init 15 | 16 | -------------------------------------------------------------------------------- /auto_round/low_cpu_mem/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (c) 2023 Intel Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """Torch layer-wise quantization module.""" 18 | from .utils import * 19 | -------------------------------------------------------------------------------- /auto_round/mllm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from .mllm_dataset import get_mllm_dataloader 16 | from .template import Template, get_template, TEMPLATES 17 | from .autoround_mllm import AutoRoundMLLM 18 | from ..utils import LazyImport 19 | from .eval import mllm_eval, lmms_eval -------------------------------------------------------------------------------- /auto_round/mllm/templates/cogvlm2.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "cogvlm2", 3 | "format_user": "Question: {{content}} ", 4 | "format_assistant": "Answer: {{content}}\n", 5 | "replace_tokens": ["\n", ""], 6 | "processor": "cogvlm2", 7 | "extra_encode" : true, 8 | "default_dataset": "NeelNanda/pile-10k" 9 | } -------------------------------------------------------------------------------- /auto_round/mllm/templates/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "default", 3 | "format_user": "{{content}}", 4 | "format_assistant": "{{content}}", 5 | "format_system": "{{content}}", 6 | "format_function": "", 7 | "format_observation": "", 8 | "format_separator": "\n", 9 | "default_system": "You are a helpful assistant.", 10 | "replace_tokens": null, 11 | "extra_encode" : false, 12 | "default_dataset": "NeelNanda/pile-10k", 13 | "processor": "hf" 14 | } -------------------------------------------------------------------------------- /auto_round/mllm/templates/llava.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "llava", 3 | "replace_tokens": null, 4 | "processor": "llava", 5 | "extra_encode" : false, 6 | "default_dataset": "NeelNanda/pile-10k" 7 | } -------------------------------------------------------------------------------- /auto_round/mllm/templates/phi3_v.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "phi3_v", 3 | "replace_tokens": ["", "<|image_1|>"], 4 | "processor": "hf", 5 | "extra_encode" : false, 6 | "default_dataset": "NeelNanda/pile-10k" 7 | } -------------------------------------------------------------------------------- /auto_round/mllm/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os 16 | import requests 17 | 18 | from ..utils import LazyImport 19 | 20 | PIL = LazyImport("PIL") 21 | from PIL import Image # pylint: disable=E0401 22 | 23 | 24 | def _extract_data_dir(dir_path: str): 25 | if os.path.isdir(dir_path): 26 | return dir_path 27 | elif "=" in dir_path: 28 | result = {} 29 | dir_path = dir_path.split(",") 30 | for _path in dir_path: 31 | k, v = _path.split('=') 32 | if k in ['image', 'video', 'audio']: 33 | result[k] = v 34 | return result 35 | else: 36 | raise TypeError("incorrect input of extra_data_dir, please use auto_round --help for more details.") 37 | 38 | 39 | def fetch_image(path_or_url): 40 | if os.path.isfile(path_or_url): 41 | image_obj = Image.open(path_or_url) 42 | elif path_or_url.startswith("http://") or path_or_url.startswith("https://"): 43 | image_obj = Image.open(requests.get(path_or_url, stream=True).raw) 44 | else: 45 | raise TypeError(f"{path_or_url} neither a path or url.") 46 | 47 | return image_obj 48 | -------------------------------------------------------------------------------- /auto_round/script/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. -------------------------------------------------------------------------------- /auto_round/special_model_handler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | mllms_with_limited_bs = ("llava", "qwen2_vl", "phi3_v", "mllama") # Limitations on batch_size 17 | 18 | SUPPORT_ONLY_TEXT_MODELS = [ 19 | "phi3_v", 20 | "cogvlm2", 21 | "llava", 22 | "qwen2_vl", 23 | "deepseek_vl_v2", 24 | "chatglm", 25 | "idefics3" 26 | ] 27 | 28 | SPECIAL_SHARED_CACHE_KEYS = { 29 | "Gemma3ForConditionalGeneration": ("position_embeddings_global", "position_embeddings_local")} 30 | SPECIAL_SHARED_CACHE_KEYS["MiniMaxText01ForCausalLM"] = ("slope_rate",) 31 | 32 | 33 | def _handle_special_model(model): 34 | if model.config.model_type == "deepseek_vl_v2": 35 | from functools import partial 36 | model.forward = partial(_deepseek_vl2_forward, model) 37 | return model 38 | 39 | 40 | def _get_deepseek_vl2_multimodal_block(model, quant_vision=False): 41 | model.forward = model.language.forward 42 | block_names = [] 43 | if quant_vision: 44 | block_names.append([f"vision.blocks.{i}" for i in range(len(model.vision.blocks))]) 45 | block_names.append([f"projector.layers.{i}" for i in range(len(model.projector.layers))]) 46 | block_names.append([f"language.model.layers.{i}" for i in range(len(model.language.model.layers))]) 47 | return block_names 48 | 49 | 50 | SPECIAL_MULTIMODAL_BLOCK = { 51 | "deepseek_vl_v2": _get_deepseek_vl2_multimodal_block 52 | } 53 | 54 | 55 | def _deepseek_vl2_forward( 56 | model, 57 | input_ids=None, 58 | 59 | position_ids=None, 60 | attention_mask=None, 61 | past_key_values=None, 62 | inputs_embeds=None, 63 | 64 | images=None, 65 | images_seq_mask=None, 66 | images_spatial_crop=None, 67 | 68 | labels=None, 69 | use_cache=None, 70 | output_attentions=None, 71 | output_hidden_states=None, 72 | return_dict=None, 73 | cache_position=None, 74 | **kwargs 75 | ): 76 | inputs_embeds = model.prepare_inputs_embeds( 77 | input_ids=input_ids, 78 | images=images, 79 | images_seq_mask=images_seq_mask, 80 | images_spatial_crop=images_spatial_crop, 81 | ) 82 | return model.language( 83 | input_ids=None, 84 | attention_mask=attention_mask, 85 | position_ids=position_ids, 86 | past_key_values=past_key_values, 87 | inputs_embeds=inputs_embeds, 88 | labels=labels, 89 | use_cache=use_cache, 90 | output_attentions=output_attentions, 91 | output_hidden_states=output_hidden_states, 92 | return_dict=return_dict, 93 | cache_position=cache_position) 94 | 95 | 96 | def check_mllm_model_batch(model, batch_size, gradient_accumulate_steps=1): 97 | """ 98 | Checks model configuration to determine if it's necessary to limit bs to avoid potential input shape mismatches. 99 | """ 100 | for key in mllms_with_limited_bs: 101 | if hasattr(model, "config") and key in model.config.model_type and batch_size != 1: 102 | accumulate_steps = batch_size * gradient_accumulate_steps 103 | print("To avoid the tensor concat mismatch problem, modified parameters to " \ 104 | f"batch_size=1. As an alternative, set the gradient_accumulate_steps={accumulate_steps}") 105 | return 1, accumulate_steps 106 | return batch_size, gradient_accumulate_steps 107 | -------------------------------------------------------------------------------- /auto_round/version.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Intel® auto-round: An open-source Python library 15 | supporting popular model weight only compression based on signround.""" 16 | 17 | __version__ = "0.5.1" 18 | -------------------------------------------------------------------------------- /auto_round_extension/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/auto-round/dc6d389545d841a2a29feac7ea89ff5d51b0e5a5/auto_round_extension/__init__.py -------------------------------------------------------------------------------- /auto_round_extension/cuda/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/auto-round/dc6d389545d841a2a29feac7ea89ff5d51b0e5a5/auto_round_extension/cuda/__init__.py -------------------------------------------------------------------------------- /auto_round_extension/hpu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/auto-round/dc6d389545d841a2a29feac7ea89ff5d51b0e5a5/auto_round_extension/hpu/__init__.py -------------------------------------------------------------------------------- /auto_round_extension/ipex/__init__.py: -------------------------------------------------------------------------------- 1 | from auto_round_extension.ipex.qlinear_ipex_awq import QuantLinear as IpexAWQQuantLinear 2 | from auto_round_extension.ipex.qlinear_ipex_gptq import ( 3 | QuantLinear as IpexGPTQQuantLinear, 4 | ) 5 | 6 | ipex_qlinear_classes = (IpexAWQQuantLinear, IpexGPTQQuantLinear) 7 | -------------------------------------------------------------------------------- /auto_round_extension/ipex/qlinear_ipex_awq.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class QuantLinear(nn.Module): 5 | QUANT_TYPE = "ipex_awq" 6 | def __init__(self, w_bit, group_size, in_features, out_features, bias, dev): 7 | super().__init__() 8 | assert w_bit == 4, "Only 4 bit are supported for now." 
9 | self.compute_dtype = torch.float16 if torch.xpu.is_available() else torch.bfloat16 10 | self.in_features = in_features 11 | self.out_features = out_features 12 | self.w_bit = w_bit 13 | self.group_size = group_size if group_size != -1 else in_features 14 | self.scale_dtype = torch.float32 15 | 16 | # quick sanity check (make sure alignment) 17 | assert self.in_features % self.group_size == 0 18 | assert out_features % (32 // self.w_bit) == 0 19 | self.pack_num = 32 // self.w_bit 20 | 21 | self.register_buffer( 22 | "qzeros", 23 | torch.zeros( 24 | (in_features // self.group_size, out_features // self.pack_num), 25 | dtype=torch.int32, 26 | device=dev, 27 | ), 28 | ) 29 | self.register_buffer( 30 | "scales", 31 | torch.zeros( 32 | (in_features // self.group_size, out_features), 33 | dtype=self.compute_dtype, 34 | device=dev, 35 | )) 36 | if bias: 37 | self.register_buffer( 38 | "bias", 39 | torch.zeros((out_features), dtype=self.compute_dtype, device=dev), 40 | ) 41 | else: 42 | self.register_buffer( 43 | "bias", 44 | None, 45 | ) 46 | qweight = torch.zeros((in_features, out_features // self.pack_num), dtype=torch.int32, device=dev) 47 | self.register_buffer("qweight", qweight) 48 | 49 | def post_init(self): 50 | assert self.qweight.device.type == "cpu" or self.qweight.device.type == "xpu" 51 | import intel_extension_for_pytorch as ipex 52 | 53 | self.ipex_linear = ipex.llm.quantization.IPEXWeightOnlyQuantizedLinear.from_weight(self.qweight, 54 | self.scales, 55 | self.qzeros, \ 56 | self.in_features, 57 | self.out_features, 58 | None, 59 | self.bias, \ 60 | self.group_size, 61 | None, 62 | 1, 63 | 0 64 | ) 65 | 66 | @classmethod 67 | def from_linear(cls, linear, w_bit, group_size, init_only=False, scales=None): 68 | awq_linear = cls( 69 | w_bit, 70 | group_size, 71 | linear.in_features, 72 | linear.out_features, 73 | linear.bias is not None, 74 | linear.weight.device, 75 | ) 76 | if init_only: # just prepare for loading sd 77 | return awq_linear 78 | 79 | raise NotImplementedError("Only inference is supported for IPEX kernels") 80 | 81 | @torch.no_grad() 82 | def forward(self, x): 83 | 84 | outputs = self.ipex_linear(x) 85 | 86 | return outputs 87 | 88 | def extra_repr(self) -> str: 89 | return ("in_features={}, out_features={}, bias={}, w_bit={}, group_size={}".format( 90 | self.in_features, 91 | self.out_features, 92 | self.bias is not None, 93 | self.w_bit, 94 | self.group_size, 95 | )) 96 | 97 | -------------------------------------------------------------------------------- /auto_round_extension/qbits/__init__.py: -------------------------------------------------------------------------------- 1 | from auto_round_extension.qbits.qlinear_qbits import QuantLinear as QBitsQuantLinear 2 | from auto_round_extension.qbits.qlinear_qbits_gptq import ( 3 | QuantLinear as QBitsGPTQQuantLinear, 4 | ) 5 | from auto_round_extension.qbits.qbits_awq import QuantLinear as QBitsAWQQuantLinear 6 | 7 | qbits_qlinear_classes = (QBitsQuantLinear, QBitsGPTQQuantLinear) 8 | 9 | qbits_awq_classes = (QBitsAWQQuantLinear,) 10 | -------------------------------------------------------------------------------- /auto_round_extension/torch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/auto-round/dc6d389545d841a2a29feac7ea89ff5d51b0e5a5/auto_round_extension/torch/__init__.py -------------------------------------------------------------------------------- /auto_round_extension/triton/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/auto-round/dc6d389545d841a2a29feac7ea89ff5d51b0e5a5/auto_round_extension/triton/__init__.py -------------------------------------------------------------------------------- /auto_round_extension/triton/triton_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /auto_round_extension/triton/triton_utils/mixin.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # MIT License 16 | # 17 | # Copyright (c) 2023 潘其威(William) 18 | # 19 | # Permission is hereby granted, free of charge, to any person obtaining a copy 20 | # of this software and associated documentation files (the "Software"), to deal 21 | # in the Software without restriction, including without limitation the rights 22 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 23 | # copies of the Software, and to permit persons to whom the Software is 24 | # furnished to do so, subject to the following conditions: 25 | # 26 | # The above copyright notice and this permission notice shall be included in all 27 | # copies or substantial portions of the Software. 28 | # 29 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 30 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 31 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 32 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 33 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 34 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 35 | # SOFTWARE. 
36 | class TritonModuleMixin: 37 | @classmethod 38 | def warmup(cls, model, transpose=False, seqlen=2048): 39 | pass 40 | -------------------------------------------------------------------------------- /auto_round_extension/triton/triton_utils_zp/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /auto_round_extension/triton/triton_utils_zp/mixin.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # MIT License 16 | # 17 | # Copyright (c) 2023 潘其威(William) 18 | # 19 | # Permission is hereby granted, free of charge, to any person obtaining a copy 20 | # of this software and associated documentation files (the "Software"), to deal 21 | # in the Software without restriction, including without limitation the rights 22 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 23 | # copies of the Software, and to permit persons to whom the Software is 24 | # furnished to do so, subject to the following conditions: 25 | # 26 | # The above copyright notice and this permission notice shall be included in all 27 | # copies or substantial portions of the Software. 28 | # 29 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 30 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 31 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 32 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 33 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 34 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 35 | # SOFTWARE. 36 | class TritonModuleMixin: 37 | @classmethod 38 | def warmup(cls, model, transpose=False, seqlen=2048): 39 | pass 40 | -------------------------------------------------------------------------------- /docs/Llama-2-7b-chat-hf-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | 3 | A sample command to generate an INT4 model. 
4 | ```bash 5 | auto-round \ 6 | --model meta-llama/Llama-2-7b-chat-hf \ 7 | --device 0 \ 8 | --group_size 128 \ 9 | --bits 4 \ 10 | --iters 1000 \ 11 | --nsamples 512 \ 12 | --asym \ 13 | --format 'auto_gptq,auto_round' \ 14 | --output_dir "./tmp_autoround" 15 | ``` 16 | 17 | 18 | Due to licensing restrictions, we are unable to release the model. 19 | 20 | Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, and the git id 96d185fa6232a5ab685ba7c43e45d1dbb3bb906d. 21 | 22 | Since we encountered an issue evaluating this model with lm-eval, we opted to evaluate the qdq model instead. In our assessment, we found that its accuracy closely matches that of the real quantized model in most cases except for some small models like opt-125m. 23 | 24 | 25 | | Metric | FP16 | int4 qdq | 26 | | -------------- | ------ | -------- | 27 | | Avg. | 0.5901 | 0.5897 | 28 | | mmlu | 0.4640 | 0.4545 | 29 | | lambada_openai | 0.7105 | 0.7037 | 30 | | hellaswag | 0.5780 | 0.5706 | 31 | | winogrande | 0.6638 | 0.6614 | 32 | | piqa | 0.7639 | 0.7633 | 33 | | truthfulqa_mc1 | 0.3023 | 0.3035 | 34 | | openbookqa | 0.3340 | 0.3260 | 35 | | boolq | 0.7976 | 0.8064 | 36 | | rte | 0.6968 | 0.7292 | 37 | | arc_easy | 0.7382 | 0.7336 | 38 | | arc_challenge | 0.4420 | 0.4352 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /docs/Llama-3.2-11B-Vision-Instruct-sym.md: -------------------------------------------------------------------------------- 1 | 2 | ## Model Details 3 | 4 | This model is an int4 model with group_size 128 and symmetric quantization of [meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct). Load the model with revision="f036ca" to use AutoGPTQ format. 
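For quick reference, the snippet below condenses the loading choice described above: keep the default revision for the AutoRound format, or pass `revision="f036ca"` to load the AutoGPTQ-format branch. It is a minimal sketch that reuses the `Intel/Llama-3.2-11B-Vision-Instruct-inc-private` checkpoint path from the full example later in this card.

```python
from auto_round import AutoRoundConfig  # required when loading the AutoRound format
from transformers import AutoProcessor, MllamaForConditionalGeneration

quantized_model_path = "Intel/Llama-3.2-11B-Vision-Instruct-inc-private"

# Default branch: AutoRound format
model = MllamaForConditionalGeneration.from_pretrained(
    quantized_model_path,
    torch_dtype="auto",
    device_map="auto",
    # revision="f036ca",  # uncomment to load the AutoGPTQ-format branch instead
)
processor = AutoProcessor.from_pretrained(quantized_model_path)
```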
5 | 6 | ## How To Use 7 | 8 | ### Requirements 9 | Please use Transformers version 4.45.0 or later 10 | AutoRound version >= 0.4.1 11 | 12 | ### INT4 Inference 13 | ```python 14 | from auto_round import AutoRoundConfig ## must import for auto-round format 15 | import requests 16 | import torch 17 | from PIL import Image 18 | from transformers import MllamaForConditionalGeneration, AutoProcessor 19 | 20 | quantized_model_path="Intel/Llama-3.2-11B-Vision-Instruct-inc-private" 21 | 22 | model = MllamaForConditionalGeneration.from_pretrained( 23 | quantized_model_path, 24 | torch_dtype="auto", 25 | device_map="auto", 26 | ##revision="f036ca" ##AutoGPTQ format 27 | ) 28 | processor = AutoProcessor.from_pretrained(quantized_model_path) 29 | image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg" 30 | messages = [ 31 | {"role": "user", "content": [ 32 | {"type": "image"}, 33 | {"type": "text", "text": "Please write a haiku for this one, it would be: "} 34 | ]} 35 | ] 36 | 37 | # Preparation for inference 38 | image = Image.open(requests.get(image_url, stream=True).raw) 39 | input_text = processor.apply_chat_template(messages, add_generation_prompt=True) 40 | inputs = processor( 41 | image, 42 | input_text, 43 | add_special_tokens=False, 44 | return_tensors="pt" 45 | ).to(model.device) 46 | 47 | output = model.generate(**inputs, max_new_tokens=50) 48 | print(processor.decode(output[0])) 49 | 50 | ##INT4: 51 | ## Here is a haiku for the rabbit: 52 | 53 | ## Whiskers twitching bright 54 | ## Ears perked up, alert and keen 55 | ## Spring's gentle delight<|eot_id|> 56 | 57 | 58 | ##BF16: 59 | ## Here is a haiku for the rabbit: 60 | 61 | ## Whiskers twitching fast 62 | ## In a coat of blue and brown 63 | ## Hoppy little soul<|eot_id|> 64 | 65 | image_url = "http://images.cocodataset.org/train2017/000000411975.jpg" 66 | messages = [ 67 | {"role": "user", "content": [ 68 | {"type": "image"}, 69 | {"type": "text", "text": "How many people are on the baseball field in the picture?"} 70 | ]} 71 | ] 72 | ##INT4: There are five people on the baseball field in the picture. 73 | ## 74 | 75 | ##BF16: There are five people on the baseball field in the picture. 76 | ## 77 | 78 | image_url = "https://intelcorp.scene7.com/is/image/intelcorp/processor-overview-framed-badge:1920-1080?wid=480&hei=270" 79 | messages = [ 80 | {"role": "user", "content": [ 81 | {"type": "image"}, 82 | {"type": "text", "text": "Which company does this picture represent?"} 83 | ]} 84 | ] 85 | ##INT4: This picture represents Intel. 86 | ## 87 | 88 | ##BF16: This image represents Intel, a multinational semiconductor corporation headquartered in Santa Clara, California. 89 | ## 90 | 91 | ``` 92 | 93 | ## Evaluation the model 94 | pip3 install git+https://github.com/open-compass/VLMEvalKit.git@7de2dcb. The evaluation process may encounter errors that require changing model backend or evaluation code. Detailed instructions will be provided in a future update. 
95 | ```bash 96 | auto-round-mllm --eval --model Intel/Llama-3.2-11B-Vision-Instruct-inc-private --tasks MMBench_DEV_EN_V11,ScienceQA_VAL,TextVQA_VAL,POPE --output_dir "./eval_result" 97 | ``` 98 | |Metric |16bits|Pile Calib INT4 |Llava Calib INT4| 99 | |:-------------------|:------|:------|:------| 100 | |avg |66.05 |67.81 |66.02 | 101 | |MMBench_DEV_EN_V11 |52.86 |53.48 |52.17 | 102 | |ScienceQA_VAL |68.86 |70.39 |69.15 | 103 | |TextVQA_VAL |54.49 |59.62 |55.07 | 104 | |POPE |88.00 |87.76 |87.71 | 105 | 106 | ### Generate the model 107 | Here is the sample command to reproduce the model. 108 | ```bash 109 | pip install auto-round 110 | auto-round-mllm \ 111 | --model meta-llama/Llama-3.2-11B-Vision-Instruct \ 112 | --device 0 \ 113 | --group_size 128 \ 114 | --bits 4 \ 115 | --iters 1000 \ 116 | --nsample 512 \ 117 | --seqlen 512 \ 118 | --format 'auto_gptq,auto_round' \ 119 | --output_dir "./tmp_autoround" 120 | ``` 121 | 122 | ## Ethical Considerations and Limitations 123 | 124 | The model can produce factually incorrect output, and should not be relied on to produce factually accurate information. Because of the limitations of the pretrained model and the finetuning datasets, it is possible that this model could generate lewd, biased or otherwise offensive outputs. 125 | 126 | Therefore, before deploying any applications of the model, developers should perform safety testing. 127 | 128 | ## Caveats and Recommendations 129 | 130 | Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. 131 | 132 | Here are a couple of useful links to learn more about Intel's AI software: 133 | 134 | - Intel Neural Compressor [link](https://github.com/intel/neural-compressor) 135 | 136 | ## Disclaimer 137 | 138 | The license on this model does not constitute legal advice. We are not responsible for the actions of third parties who use this model. Please consult an attorney before using this model for commercial purposes. 139 | 140 | ## Cite 141 | 142 | @article{cheng2023optimize, title={Optimize weight rounding via signed gradient descent for the quantization of llms}, author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao and Liu, Yi}, journal={arXiv preprint arXiv:2309.05516}, year={2023} } 143 | 144 | [arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round) 145 | -------------------------------------------------------------------------------- /docs/Meta-Llama-3-8B-Instruct-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | 3 | A sample command to generate an INT4 model. 
4 | ```bash 5 | auto-round \ 6 | --model meta-llama/Meta-Llama-3-8B-Instruct \ 7 | --device 0 \ 8 | --group_size 128 \ 9 | --bits 4 \ 10 | --iters 1000 \ 11 | --nsamples 512 \ 12 | --asym \ 13 | --format 'auto_gptq,auto_round' \ 14 | --output_dir "./tmp_autoround" 15 | ``` 16 | 17 | quant lm-head 18 | ```bash 19 | auto-round \ 20 | --model meta-llama/Meta-Llama-3-8B-Instruct \ 21 | --device 0 \ 22 | --group_size 128 \ 23 | --bits 4 \ 24 | --iters 1000 \ 25 | --nsamples 512 \ 26 | --asym \ 27 | --quant_lm_head \ 28 | --format 'auto_gptq,auto_round' \ 29 | --output_dir "./tmp_autoround" 30 | ``` 31 | lm-eval 0.4.2 is used 32 | 33 | | Metric | **BF16** | w4g128 w/o lm-head | w4g128 with lm-head | 34 | | ---------------- | :------- |--------------------|-----------------------------| 35 | | Avg. | 0.6352 | 0.6312 | 0.6303 | 36 | | mmlu | 0.6386 | 0.6306 | 0.6243 | 37 | | winogrande | 0.7143 | 0.7238 | 0.7261 | 38 | | truthfulqa_mc1 | 0.3623 | 0.3537 | 0.3574 | 39 | | rte | 0.6751 | 0.6859 | 0.6715 | 40 | | piqa | 0.7867 | 0.7797 | 0.7775 | 41 | | openbookqa | 0.3400 | 0.3300 | 0.3340 | 42 | | lambada_openai | 0.7182 | 0.7200 | 0.7118 | 43 | | hellaswag | 0.5769 | 0.5699 | 0.5686 | 44 | | boolq | 0.8297 | 0.8309 | 0.8266 | 45 | | arc_easy | 0.8152 | 0.8089 | 0.8123 | 46 | | arc_challenge | 0.5299 | 0.5102 | 0.5111 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /docs/Mistral-7B-Instruct-v0.2-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | 3 | A sample command to generate an INT4 model. 4 | ```bash 5 | auto-round \ 6 | --model mistralai/Mistral-7B-Instruct-v0.2 \ 7 | --device 0 \ 8 | --group_size 128 \ 9 | --bits 4 \ 10 | --iters 1000 \ 11 | --nsamples 512 \ 12 | --asym \ 13 | --format 'auto_gptq,auto_round' \ 14 | --output_dir "./tmp_autoround" 15 | ``` 16 | 17 | | Metric | BF16 | INT4 | 18 | | -------------- | ------ | ------ | 19 | | Avg. | 0.6647 | 0.6621 | 20 | | mmlu | 0.5906 | 0.5872 | 21 | | lambada_openai | 0.7141 | 0.7141 | 22 | | hellaswag | 0.6602 | 0.6557 | 23 | | winogrande | 0.7395 | 0.7364 | 24 | | piqa | 0.8052 | 0.8047 | 25 | | truthfulqa_mc1 | 0.5251 | 0.5153 | 26 | | openbookqa | 0.3600 | 0.3420 | 27 | | boolq | 0.8535 | 0.8541 | 28 | | rte | 0.7040 | 0.7148 | 29 | | arc_easy | 0.8161 | 0.8165 | 30 | | arc_challenge | 0.5435 | 0.5435 | 31 | 32 | -------------------------------------------------------------------------------- /docs/Mistral-7B-v0.1-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | 3 | 4 | A sample command to generate an INT4 model. 
5 | ```bash 6 | auto-round \ 7 | --model mistralai/Mistral-7B-v0.1 \ 8 | --device 0 \ 9 | --group_size 128 \ 10 | --bits 4 \ 11 | --iters 1000 \ 12 | --nsamples 512 \ 13 | --asym \ 14 | --format 'auto_gptq,auto_round' \ 15 | --output_dir "./tmp_autoround" 16 | ``` 17 | 18 | quant_lm_head 19 | 20 | ```bash 21 | auto-round \ 22 | --model mistralai/Mistral-7B-v0.1 \ 23 | --device 0 \ 24 | --group_size 128 \ 25 | --bits 4 \ 26 | --iters 1000 \ 27 | --nsamples 512 \ 28 | --asym \ 29 | --quant_lm_head \ 30 | --format 'auto_gptq,auto_round' \ 31 | --output_dir "./tmp_autoround" 32 | ``` 33 | 34 | lm-eval 0.4.2 is used 35 | 36 | | Metric | BF16 | [INT4-lmhead](https://huggingface.co/Intel/Mistral-7B-v0.1-int4-inc-lmhead) | [INT4](https://huggingface.co/Intel/Mistral-7B-v0.1-int4-inc) | 37 | | -------------- | ------ |-----------------| ------------------------------------------------------------ | 38 | | Avg. | 0.6260 | 0.6228 | 0.6218 | 39 | | mmlu | 0.5868 | 0.5760 | 0.5772 | 40 | | lambada_openai | 0.7555 | 0.7539 | 0.7543 | 41 | | hellaswag | 0.6125 | 0.6055 | 0.6072 | 42 | | winogrande | 0.7395 | 0.7380 | 0.7388 | 43 | | piqa | 0.8069 | 0.8009 | 0.8030 | 44 | | truthfulqa_mc1 | 0.2803 | 0.2876 | 0.2864 | 45 | | openbookqa | 0.3280 | 0.3300 | 0.3260 | 46 | | boolq | 0.8379 | 0.8291 | 0.8281 | 47 | | arc_easy | 0.8089 | 0.8043 | 0.8035 | 48 | | arc_challenge | 0.5034 | 0.5026 | 0.4932 | 49 | -------------------------------------------------------------------------------- /docs/Mixtral-8x7B-Instruct-v0.1-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | 3 | A sample command to generate an INT4 model. 4 | ```bash 5 | auto-round \ 6 | --model mistralai/Mixtral-8x7B-Instruct-v0.1 \ 7 | --device 0 \ 8 | --group_size 128 \ 9 | --bits 4 \ 10 | --iters 1000 \ 11 | --nsamples 512 \ 12 | --asym \ 13 | --format 'auto_gptq,auto_round' \ 14 | --output_dir "./tmp_autoround" 15 | ``` 16 | 17 | Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, and the git id f3b7917091afba325af3980a35d8a6dcba03dc3f is used 18 | 19 | | Metric | BF16 | INT4 | 20 | | -------------- |--------| ------ | 21 | | Avg. | 0.7000 | 0.6977 | 22 | | mmlu | 0.6885 | 0.6824 | 23 | | lambada_openai | 0.7718 | 0.7790 | 24 | | hellaswag | 0.6767 | 0.6745 | 25 | | winogrande | 0.7687 | 0.7719 | 26 | | piqa | 0.8351 | 0.8335 | 27 | | truthfulqa_mc1 | 0.4969 | 0.4884 | 28 | | openbookqa | 0.3680 | 0.3720 | 29 | | boolq | 0.8850 | 0.8783 | 30 | | rte | 0.7184 | 0.7004 | 31 | | arc_easy | 0.8699 | 0.8712 | 32 | | arc_challenge | 0.6220 | 0.6229 | 33 | 34 | -------------------------------------------------------------------------------- /docs/Mixtral-8x7B-v0.1-asym-acc.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | 3 | A sample command to generate an INT4 model. 
4 | ```bash 5 | auto-round \ 6 | --model mistralai/Mixtral-8x7B-v0.1 \ 7 | --device 0 \ 8 | --group_size 128 \ 9 | --bits 4 \ 10 | --iters 1000 \ 11 | --nsamples 512 \ 12 | --asym \ 13 | --format 'auto_gptq,auto_round' \ 14 | --output_dir "./tmp_autoround" 15 | ``` 16 | 17 | 18 | Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, we used the git id f3b7917091afba325af3980a35d8a6dcba03dc3f 19 | 20 | Download the model from hf(coming soon) or follow examples/language-modeling/scripts/Mixtral-8x7B-v0.1.sh to generate the model 21 | 22 | ~~~bash 23 | lm_eval --model hf --model_args pretrained="Intel/Mixtral-8x7B-v0.1-int4-inc",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 32 24 | ~~~ 25 | 26 | | Metric | BF16 | INT4 | 27 | | -------------- |--------| ------ | 28 | | Avg. | 0.6698 | 0.6633 | 29 | | mmlu | 0.6802 | 0.6693 | 30 | | lambada_openai | 0.7827 | 0.7825 | 31 | | hellaswag | 0.6490 | 0.6459 | 32 | | winogrande | 0.7648 | 0.7514 | 33 | | piqa | 0.8248 | 0.8210 | 34 | | truthfulqa_mc1 | 0.3427 | 0.3219 | 35 | | openbookqa | 0.3540 | 0.3560 | 36 | | boolq | 0.8523 | 0.8474 | 37 | | rte | 0.7076 | 0.6931 | 38 | | arc_easy | 0.8430 | 0.8430 | 39 | | arc_challenge | 0.5666 | 0.5648 | 40 | -------------------------------------------------------------------------------- /docs/Qwen1.5-7B-Chat-acc.md: -------------------------------------------------------------------------------- 1 | Due to licensing restrictions, we are unable to release the model. Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, and the git id 96d185fa6232a5ab685ba7c43e45d1dbb3bb906d. 2 | 3 | We used the following command for evaluation. 4 | For reference, the results of official AWQ-INT4 and GPTQ-INT4 release are listed. 5 | 6 | ~~~bash 7 | lm_eval --model hf --model_args pretrained="./",autogptq=True,gptq_use_triton=True,trust_remote_code=True --device cuda:0 --tasks ceval-valid,cmmlu,mmlu,gsm8k --batch_size 16 --num_fewshot 0 8 | ~~~ 9 | 10 | | Metric | BF16 | [Qwen/Qwen1.5-7B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-AWQ) | [Qwen/Qwen1.5-7B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GPTQ-Int4) | INT4 sym recipe | INT4 asym recipe | 11 | | -------------- | ------ |-----------|--------------------------------|-----------------|------------------| 12 | | Avg. | 0.6231 | 0.6152 | 0.6070 | 0.6205 | 0.6186 | 13 | | ceval | 0.6887 | 0.6820 | 0.6679 | 0.6761 | 0.6820 | 14 | | cmmlu | 0.6959 | 0.6862 | 0.6831 | 0.6870 | 0.6884 | 15 | | mmlu | 0.6020 | 0.5944 | 0.5902 | 0.5974 | 0.5946 | 16 | | gsm8k | 0.5057 | 0.4981 | 0.4867 | 0.5216 | 0.5095 | 17 | -------------------------------------------------------------------------------- /docs/Yi-6B-Chat-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | ```bash 3 | auto-round \ 4 | --model 01-ai/Yi-6B-Chat \ 5 | --device 0 \ 6 | --group_size 128 \ 7 | --bits 4 \ 8 | --iters 1000 \ 9 | --nsamples 512 \ 10 | --asym \ 11 | --minmax_lr 2e-3 \ 12 | --format 'auto_gptq,auto_round' \ 13 | --output_dir "./tmp_autoround" 14 | ``` 15 | 16 | 17 | Due to licensing restrictions, we are unable to release the model. 
Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, and the git id 96d185fa6232a5ab685ba7c43e45d1dbb3bb906d. 18 | 19 | We used the following command for evaluation. 20 | For reference, the results of official AWQ-INT4 release are listed. 21 | 22 | ~~~bash 23 | lm_eval --model hf --model_args pretrained="./",autogptq=True,gptq_use_triton=True,trust_remote_code=True --device cuda:0 --tasks ceval-valid,cmmlu,mmlu,gsm8k --batch_size 16 --num_fewshot 0 24 | ~~~ 25 | 26 | | Metric | BF16 |[01-ai/Yi-6B-Chat-4bits](https://huggingface.co/01-ai/Yi-6B-Chat-4bits)| INT4 | 27 | |--------|--------|----------------------|--------| 28 | | Avg. | 0.6043 | 0.5867 | 0.5939 | 29 | | mmlu | 0.6163 | 0.6133 | 0.6119 | 30 | | cmmlu | 0.7431 | 0.7312 | 0.7314 | 31 | | ceval | 0.7355 | 0.7155 | 0.7281 | 32 | | gsm8k | 0.3222 | 0.2866 | 0.3040 | 33 | -------------------------------------------------------------------------------- /docs/baichuan2-7b-cha-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | ```bash 3 | auto-round \ 4 | --model baichuan-inc/Baichuan2-7B-Chat \ 5 | --device 0 \ 6 | --group_size 128 \ 7 | --bits 4 \ 8 | --iters 1000 \ 9 | --nsamples 512 \ 10 | --asym \ 11 | --minmax_lr 2e-3 \ 12 | --format 'auto_gptq,auto_round' \ 13 | --output_dir "./tmp_autoround" 14 | ``` 15 | 16 | 17 | 18 | Due to licensing restrictions, we are unable to release the model. Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, and the git id 96d185fa6232a5ab685ba7c43e45d1dbb3bb906d. 19 | 20 | We used the following command for evaluation. 21 | 22 | ~~~bash 23 | lm_eval --model hf --model_args pretrained="./",autogptq=True,gptq_use_triton=True,trust_remote_code=True --device cuda:0 --tasks ceval-valid,cmmlu,mmlu,gsm8k --batch_size 16 --num_fewshot 0 24 | ~~~ 25 | 26 | | Metric | BF16 | INT4 | 27 | |--------|--------|--------| 28 | | Avg. | 0.4504 | 0.4470 | 29 | | mmlu | 0.5096 | 0.5053 | 30 | | cmmlu | 0.5486 | 0.5426 | 31 | | ceval | 0.5394 | 0.5223 | 32 | | gsm8k | 0.2039 | 0.2176 | 33 | -------------------------------------------------------------------------------- /docs/bloom-3B-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | ```bash 3 | auto-round \ 4 | --model bigscience/bloom-3b \ 5 | --device 0 \ 6 | --group_size 128 \ 7 | --bits 4 \ 8 | --iters 1000 \ 9 | --nsamples 512 \ 10 | --asym \ 11 | --format 'auto_gptq,auto_round' \ 12 | --output_dir "./tmp_autoround" 13 | ``` 14 | 15 | 16 | Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, we used the git id 96d185fa6232a5ab685ba7c43e45d1dbb3bb906d 17 | ##pip install auto-gptq[triton] 18 | ##pip install triton==2.2.0 19 | ```bash 20 | lm_eval --model hf --model_args pretrained="./",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 32 21 | ``` 22 | 23 | 24 | 25 | | Metric | FP16 | INT4 | 26 | | -------------- | ------ | ------ | 27 | | Avg. 
| 0.4532 | 0.4514 | 28 | | mmlu | 0.2592 | 0.2537 | 29 | | lambada_openai | 0.5176 | 0.5135 | 30 | | hellaswag | 0.4136 | 0.4093 | 31 | | winogrande | 0.5864 | 0.5856 | 32 | | piqa | 0.7062 | 0.7095 | 33 | | truthfulqa_mc1 | 0.2326 | 0.2264 | 34 | | openbookqa | 0.2160 | 0.2140 | 35 | | boolq | 0.6156 | 0.6199 | 36 | | rte | 0.5632 | 0.5632 | 37 | | arc_easy | 0.5947 | 0.5888 | 38 | | arc_challenge | 0.2799 | 0.2816 | 39 | -------------------------------------------------------------------------------- /docs/falcon-7b-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | ```bash 3 | auto-round \ 4 | --model tiiuae/falcon-7b \ 5 | --device 0 \ 6 | --group_size 64 \ 7 | --bits 4 \ 8 | --iters 1000 \ 9 | --nsamples 512 \ 10 | --asym \ 11 | --format 'auto_gptq,auto_round' \ 12 | --output_dir "./tmp_autoround" 13 | ``` 14 | 15 | 16 | We generate the model with group_size 64 as there is an issue when evaluating with group_size 128. 17 | Evaluate the model 18 | pip3 install lm-eval==0.4.2 19 | 20 | ```bash 21 | lm_eval --model hf --model_args pretrained="Intel/falcon-7b-int4-inc",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,arc_easy,arc_challenge,mmlu --batch_size 16 22 | ``` 23 | 24 | | Metric | BF16 | int4 | 25 | | -------------- | ------ | ------ | 26 | | Avg. | 0.5462 | 0.5454 | 27 | | mmlu | 0.2546 | 0.2562 | 28 | | lambada_openai | 0.7450 | 0.7485 | 29 | | hellaswag | 0.5773 | 0.5719 | 30 | | winogrande | 0.6740 | 0.6835 | 31 | | piqa | 0.7943 | 0.7905 | 32 | | truthfulqa_mc1 | 0.2228 | 0.2166 | 33 | | openbookqa | 0.3080 | 0.3100 | 34 | | boolq | 0.7361 | 0.7431 | 35 | | arc_easy | 0.7475 | 0.7424 | 36 | | arc_challenge | 0.4027 | 0.3908 | 37 | 38 | -------------------------------------------------------------------------------- /docs/full_range_sym.md: -------------------------------------------------------------------------------- 1 | W2G32 nsamples 512,iter 200, average accuracy of 10 tasks 2 | 3 | | Models | gptq_sym | asym | full_range_sym | 4 | |----------------------------|----------|------------|----------------| 5 | | Meta-Llama-3.1-8B-Instruct | 0.4500 | 0.52802 | **0.5381** | 6 | | Qwen2-7B | 0.5229 | **0.5559** | 0.5486 | 7 | 8 | W4G128 nsamples 128,iter 200, average accuracy of 10 tasks 9 | 10 | | Models | asym | full_range_sym | 11 | |----------------------------|------------|----------------| 12 | | Meta-Llama-3.1-8B-Instruct | 0.6342 | **0.6370** | 13 | | Qwen2-7B | 0.6143 | **0.6167** | 14 | | Mistral-7B-Instruct-v0.2 | 0.6606 | **0.6635** | 15 | | Phi-3-mini-4k-instruct | **0.6475** | 0.6432 | 16 | -------------------------------------------------------------------------------- /docs/gemma-2b-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 
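For readers who prefer the Python API over the CLI, the following is a rough equivalent of the recipe command below. It is a sketch based on the `AutoRound` class as exercised in this repository's tests; exact argument names and defaults may differ across versions, so treat it as illustrative rather than authoritative.

```python
# Hypothetical Python-API counterpart of the CLI recipe below; argument names
# follow the AutoRound usage in this repo's tests and may vary by version.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound

model_name = "google/gemma-2b"
# --model_dtype "float16": load in FP16 for tuning and evaluation
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

autoround = AutoRound(
    model,
    tokenizer,
    bits=4,          # --bits 4
    group_size=128,  # --group_size 128
    sym=False,       # --asym (this page documents the old asymmetric recipe)
    iters=400,       # --iters 400
    nsamples=512,    # --nsamples 512
)
autoround.quantize()
autoround.save_quantized("./tmp_autoround")  # export options/formats may vary by version
```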
2 | ```bash 3 | auto-round \ 4 | --model google/gemma-2b \ 5 | --device 0 \ 6 | --group_size 128 \ 7 | --bits 4 \ 8 | --iters 400 \ 9 | --model_dtype "float16" \ 10 | --nsamples 512 \ 11 | --asym \ 12 | --format 'auto_gptq,auto_round' \ 13 | --output_dir "./tmp_autoround" 14 | ``` 15 | 16 | Evaluate the model 17 | 18 | Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, lm-eval 0.4.2 is used 19 | 20 | pip install auto-gptq 21 | 22 | 23 | Please note that there is a discrepancy between the baseline result and the official data, which is a known issue within the official model card community. 24 | Given that the Gemma model family exhibits inconsistent results between FP16 and BF16 on lm-eval, we recommend converting to FP16 for both tuning and evaluation. 25 | 26 | ```bash 27 | lm_eval --model hf --model_args pretrained="Intel/gemma-2b-int4-inc",autogptq=True,gptq_use_triton=True,dtype=float16 --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 16 28 | ``` 29 | 30 | 31 | 32 | | Metric | BF16 | FP16 | AutoRound v0.1 | AutoRound v0.2 | 33 | | -------------- | ---- | ------ |----------------|----------------| 34 | | Avg.| 0.5263 | 0.5277 | 0.5235 | 0.5248 | 35 | | mmlu | 0.3287 | 0.3287 | 0.3297 | 0.3309 | 36 | | lambada_openai | 0.6344 | 0.6375 | 0.6307 | 0.6379 | 37 | | hellaswag | 0.5273 | 0.5281 | 0.5159 | 0.5184 | 38 | | winogrande | 0.6504 | 0.6488 | 0.6543 | 0.6575 | 39 | | piqa | 0.7671 | 0.7720 | 0.7612 | 0.7606 | 40 | | truthfulqa_mc1 | 0.2203 | 0.2203 | 0.2203 | 0.2191 | 41 | | openbookqa | 0.2980 | 0.3020 | 0.3000 | 0.3060 | 42 | | boolq | 0.6927 | 0.6936 | 0.6939 | 0.6966 | 43 | | arc_easy | 0.7420 | 0.7403 | 0.7353 | 0.7357 | 44 | | arc_challenge | 0.4019 | 0.4061 | 0.3933 | 0.3857 | 45 | -------------------------------------------------------------------------------- /docs/gemma-7b-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | 3 | A sample command to generate an INT4 model. 4 | ```bash 5 | auto-round \ 6 | --model google/gemma-7b \ 7 | --device 0 \ 8 | --group_size 128 \ 9 | --bits 4 \ 10 | --minmax_lr 2e-3 \ 11 | --model_dtype "float16" \ 12 | --iters 1000 \ 13 | --nsamples 512 \ 14 | --asym \ 15 | --format 'auto_gptq,auto_round' \ 16 | --output_dir "./tmp_autoround" 17 | ``` 18 | 19 | 20 | pip install lm-eval==0.4.2 21 | pip install auto-gptq 22 | 23 | Please note that there is a discrepancy between the baseline result and the official data, which is a known issue within the official model card community. 24 | 25 | Given that the Gemma model family exhibits inconsistent results between FP16 and BF16 on lm-eval, we recommend converting to FP16 for both tuning and evaluation. 26 | ```bash 27 | lm_eval --model hf --model_args pretrained="Intel/gemma-7b-int4-inc",autogptq=True,gptq_use_triton=True,dtype=float16 --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 32 28 | ``` 29 | | Metric | BF16 | FP16 | AutoRound v0.1 | AutoRound V0.2 | 30 | | -------------- | ---- | ------ |----------------|----------------| 31 | | Avg. 
| 0.6208 | 0.6302 | 0.6242 | 0.6254 | 32 | | mmlu | 0.6126 | 0.6189 | 0.6085 | 0.6147 | 33 | | lambada_openai | 0.6707 | 0.7308 | 0.7165 | 0.7270 | 34 | | hellaswag | 0.6039 | 0.6063 | 0.6017 | 0.6017 | 35 | | winogrande | 0.7356 | 0.7506 | 0.7482 | 0.7490 | 36 | | piqa | 0.8014 | 0.8025 | 0.7976 | 0.7982 | 37 | | truthfulqa_mc1 | 0.3121 | 0.3121 | 0.3060 | 0.2840 | 38 | | openbookqa | 0.3300 | 0.3220 | 0.3340 | 0.3240 | 39 | | boolq | 0.8254 | 0.8324 | 0.8300 | 0.8407 | 40 | | rte | 0.6643 | 0.6859 | 0.6787 | 0.6968 | 41 | | arc_easy | 0.8068 | 0.8262 | 0.8089 | 0.8194 | 42 | | arc_challenge | 0.5043 | 0.5000 | 0.4915 | 0.4949 | 43 | -------------------------------------------------------------------------------- /docs/gemma-7b-it-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | 3 | A sample command to generate an INT4 model. 4 | ```bash 5 | auto-round \ 6 | --model google/gemma-7b-it \ 7 | --device 0 \ 8 | --group_size 128 \ 9 | --bits 4 \ 10 | --minmax_lr 2e-3 \ 11 | --model_dtype "float16" \ 12 | --iters 1000 \ 13 | --nsamples 512 \ 14 | --asym \ 15 | --format 'auto_gptq,auto_round' \ 16 | --output_dir "./tmp_autoround" 17 | ``` 18 | 19 | Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, and the git id 96d185fa6232a5ab685ba7c43e45d1dbb3bb906d, Install the latest [AutoGPTQ](https://github.com/AutoGPTQ/AutoGPTQ) from source first 20 | 21 | Please note that there is a discrepancy between the baseline result and the official data, which is a known issue within the official model card community. 22 | 23 | ```bash 24 | lm_eval --model hf --model_args pretrained="Intel/gemma-7b-it-int4-inc",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 32 25 | ``` 26 | 27 | | Metric | BF16 | int4 | 28 | | -------------- |--------| ------ | 29 | | Avg. | 0.6022 | 0.6017 | 30 | | mmlu | 0.5029 | 0.4993 | 31 | | lambada_openai | 0.6035 | 0.6286 | 32 | | hellaswag | 0.5620 | 0.5564 | 33 | | winogrande | 0.6796 | 0.6788 | 34 | | piqa | 0.7709 | 0.7731 | 35 | | truthfulqa_mc1 | 0.3048 | 0.3035 | 36 | | openbookqa | 0.3740 | 0.3700 | 37 | | boolq | 0.8138 | 0.8144 | 38 | | rte | 0.7870 | 0.7870 | 39 | | arc_easy | 0.7525 | 0.7508 | 40 | | arc_challenge | 0.4727 | 0.4573 | 41 | -------------------------------------------------------------------------------- /docs/gguf_accuracy.md: -------------------------------------------------------------------------------- 1 | 1 We evaluate all models using the `fake` format, as lm-eval reports inaccurate accuracy for real GGUF format 2 | 3 | 4 | lm-eval 0.48 5 | 6 | ```bash 7 | lm-eval --model hf --model_args pretrained="./" --tasks mmlu,leaderboard_ifeval,leaderboard_mmlu_pro,gsm8k 8 | --batch_size 16 9 | ``` 10 | 11 | 2 `lm-head` and `embedding` layers are not quantized in any of the following models. 12 | 13 | | Q4_K_S | Avg. 
| mmlu | mmlu_pro | if_eval | gsm8k | 14 | |---------------------------|------------|--------|----------|----------|--------| 15 | | Qwen2.5-7B-GGUF | 0.6366 | 0.7097 | 0.4385 | 0.61115 | 0.7870 | 16 | | Qwen2.5-7B-AutoRound | **0.6529** | 0.7137 | 0.4471 | 0.6373 | 0.8135 | 17 | | Llama-3.1-8B-GGUF | 0.5589 | 0.6609 | 0.3610 | 0.4949 | 0.7187 | 18 | | Llama-3.1-8B-AutoRound | **0.5666** | 0.6627 | 0.3648 | 0.49965 | 0.7392 | 19 | | Falcon3-7B-GGUF | 0.5179 | 0.6649 | 0.3607 | 0.3251 | 0.7210 | 20 | | Falcon3-7B-AutoRound | **0.5261** | 0.6706 | 0.3841 | 0.31445 | 0.7354 | 21 | | phi-4-GGUF | **0.5623** | 0.7648 | 0.5292 | 0.0590 | 0.8961 | 22 | | phi-4-AutoRound | 0.5588 | 0.7673 | 0.5239 | 0.05175 | 0.8923 | 23 | 24 | | Q3_K_S | Avg. | mmlu | mmlu_pro | if_eval | gsm8k | 25 | |---------------------------|------------|--------|----------|----------|--------| 26 | | Qwen2.5-7B-GGUF | 0.5939 | 0.6936 | 0.4062 | 0.57675 | 0.6990 | 27 | | Qwen2.5-7B-AutoRound | **0.6103** | 0.7002 | 0.4171 | 0.6194 | 0.7043 | 28 | | Llama-3.1-8B-GGUF | 0.4903 | 0.6050 | 0.3260 | 0.44265 | 0.5876 | 29 | | Llama-3.1-8B-AutoRound | **0.5511** | 0.6548 | 0.3533 | 0.4913 | 0.7051 | 30 | | Falcon3-7B-GGUF | 0.4905 | 0.6434 | 0.3439 | 0.2871 | 0.6876 | 31 | | Falcon3-7B-AutoRound | **0.5296** | 0.6520 | 0.3679 | 0.30745 | 0.7911 | 32 | | phi-4-GGUF | **0.5527** | 0.7590 | 0.5072 | 0.0802 | 0.8643 | 33 | | phi-4-AutoRound | 0.5523 | 0.7657 | 0.5124 | 0.0587 | 0.8726 | 34 | 35 | | Q2_K_S | Avg. | mmlu | mmlu_pro | if_eval | gsm8k | 36 | |---------------------------|------------|--------|----------|----------|--------| 37 | | Qwen2.5-7B-GGUF | 0.3942 | 0.5750 | 0.2701 | 0.4071 | 0.3245 | 38 | | Qwen2.5-7B-AutoRound | **0.5133** | 0.6384 | 0.3383 | 0.4714 | 0.6050 | 39 | | Falcon3-7B-GGUF | 0.1936 | 0.3491 | 0.1521 | 0.21615 | 0.0569 | 40 | | Falcon3-7B-AutoRound | **0.3817** | 0.5607 | 0.2625 | 0.28955 | 0.4139 | 41 | | phi-4-GGUF | 0.4438 | 0.6715 | 0.3807 | 0.0802 | 0.6429 | 42 | | phi-4-AutoRound | **0.5113** | 0.7107 | 0.4383 | 0.08675 | 0.8097 | 43 | 44 | -------------------------------------------------------------------------------- /docs/gpt-j-6B-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | 3 | A sample command to generate an INT4 model. 4 | ```bash 5 | auto-round \ 6 | --model EleutherAI/gpt-j-6b \ 7 | --device 0 \ 8 | --group_size 128 \ 9 | --bits 4 \ 10 | --iters 1000 \ 11 | --nsamples 512 \ 12 | --asym \ 13 | --format 'auto_gptq,auto_round' \ 14 | --output_dir "./tmp_autoround" 15 | ``` 16 | 17 | 18 | Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, we used the git id 96d185fa6232a5ab685ba7c43e45d1dbb3bb906d 19 | ##pip install auto-gptq[triton] 20 | ##pip install triton==2.2.0 21 | ```bash 22 | lm_eval --model hf --model_args pretrained="./",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 32 23 | ``` 24 | 25 | 26 | 27 | | Metric | FP16 | INT4 | 28 | | -------------- | ------ | ------ | 29 | | Avg. 
| 0.5039 | 0.5034 | 30 | | mmlu | 0.2694 | 0.2793 | 31 | | lambada_openai | 0.6831 | 0.6790 | 32 | | hellaswag | 0.4953 | 0.4902 | 33 | | winogrande | 0.6409 | 0.6401 | 34 | | piqa | 0.7541 | 0.7465 | 35 | | truthfulqa_mc1 | 0.2020 | 0.2179 | 36 | | openbookqa | 0.2900 | 0.2900 | 37 | | boolq | 0.6544 | 0.6554 | 38 | | rte | 0.5451 | 0.5271 | 39 | | arc_easy | 0.6692 | 0.6734 | 40 | | arc_challenge | 0.3396 | 0.3387 | 41 | -------------------------------------------------------------------------------- /docs/imgs/autoround_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/auto-round/dc6d389545d841a2a29feac7ea89ff5d51b0e5a5/docs/imgs/autoround_overview.png -------------------------------------------------------------------------------- /docs/imgs/full_range_sym.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/auto-round/dc6d389545d841a2a29feac7ea89ff5d51b0e5a5/docs/imgs/full_range_sym.png -------------------------------------------------------------------------------- /docs/imgs/norm_bias_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/auto-round/dc6d389545d841a2a29feac7ea89ff5d51b0e5a5/docs/imgs/norm_bias_overview.png -------------------------------------------------------------------------------- /docs/llava-v1.5-7b-sym.md: -------------------------------------------------------------------------------- 1 | 2 | ## Model Details 3 | 4 | This model is an int4 model with group_size 128 and symmetric quantization of [liuhaotian/llava-v1.5-7b](https://huggingface.co/liuhaotian/llava-v1.5-7b). Load the model with revision="8ab8ff" to use AutoGPTQ format. 5 | 6 | ## How To Use 7 | 8 | ### Requirements 9 | 10 | 1. Clone this repository and navigate to LLaVA folder 11 | ```shell 12 | git clone https://github.com/haotian-liu/LLaVA.git 13 | cd LLaVA 14 | ``` 15 | 16 | 2. Refine LLaVA repo 17 | ``` 18 | vi llava/model/language_model/llava_llama.py 19 | # add 'cache_position = None,' to line 71. 20 | ``` 21 | 3. Install Package 22 | ``` 23 | pip install --upgrade pip # enable PEP 660 support 24 | pip install -e . 
25 | ``` 26 | 27 | ### INT4 Inference 28 | ```python 29 | from auto_round import AutoRoundConfig ## must import for auto-round format 30 | import requests 31 | import torch 32 | from PIL import Image 33 | from llava.model.builder import load_pretrained_model 34 | from llava.train.train import preprocess, preprocess_multimodal, DataCollatorForSupervisedDataset 35 | class DataArgs: 36 | is_multimodal = True 37 | mm_use_im_start_end = False 38 | 39 | quantized_model_path="Intel/llava-v1.5-7b-inc-private" 40 | 41 | tokenizer, model, image_processor, _ = load_pretrained_model( 42 | quantized_model_path, 43 | model_base=None, 44 | model_name=quantized_model_path, 45 | torch_dtype="auto", 46 | device_map="auto", 47 | ##revision="8ab8ff" ##AutoGPTQ format 48 | ) 49 | image_url = "http://images.cocodataset.org/train2017/000000116003.jpg" 50 | messages = [{"from": "human", "value": "What is the tennis player doing in the image?\n"}] 51 | 52 | # Preparation for inference 53 | image = Image.open(requests.get(image_url, stream=True).raw).convert('RGB') 54 | image_input = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0].to(model.device) 55 | input_data = preprocess_multimodal([messages], DataArgs()) 56 | inputs = preprocess(input_data, tokenizer, has_image=(image_input is not None)) 57 | 58 | output = model.generate(inputs['input_ids'].to(model.device), images=image_input.unsqueeze(0).half(), max_new_tokens=50) 59 | print(tokenizer.batch_decode(output)) 60 | 61 | ##INT4: The tennis player is celebrating a victory, raising his arms in the air, and holding his tennis racket. 62 | 63 | ##BF16: The tennis player is celebrating a victory, raising his arms in the air, and holding a tennis racket. 64 | 65 | image_url = "http://images.cocodataset.org/train2017/000000411975.jpg" 66 | messages = [{"from": "human", "value": "How many people are on the baseball field in the picture?\n"}] 67 | 68 | ##INT4: There are three people on the baseball field in the picture. 69 | 70 | ##BF16: There are three people on the baseball field in the picture. 71 | 72 | 73 | image_url = "http://images.cocodataset.org/train2017/000000093025.jpg" 74 | messages = [{"from": "human", "value": "How many people and animals are there in the image?\n"}] 75 | 76 | ##INT4: There are two people and one animal in the image. 77 | 78 | ##BF16: There are two people and one animal in the image. 79 | 80 | ``` 81 | 82 | ## Evaluation the model 83 | pip3 install lmms_eval. The evaluation process may encounter errors that require changing model backend or evaluation code. Detailed instructions will be provided in a future update 84 | ```bash 85 | auto-round-mllm --lmms --model Intel/llava-v1.5-7b-inc-private --tasks pope,textvqa_val,scienceqa,mmbench_en --output_dir "./eval_result" --device cuda:0 86 | ``` 87 | |Metric |16bits|Pile Calib INT4 | Llava Calib INT4 | 88 | |:-------------------|:------|:------|:--------------| 89 | |avg |65.40 |65.91 | 65.79 | 90 | |MMBench_DEV_EN_V11 |64.09 |64.43 |64.43 | 91 | |ScienceQA_VAL |64.87 |67.20 |66.80 | 92 | |TextVQA_VAL |45.56 |45.71 |45.81 | 93 | |POPE |87.09 |86.31 |86.12 | 94 | 95 | ### Generate the model 96 | Here is the sample command to reproduce the model. 
97 | ```bash 98 | pip install auto-round 99 | auto-round-mllm \ 100 | --model liuhaotian/llava-v1.5-7b \ 101 | --device 0 \ 102 | --group_size 128 \ 103 | --bits 4 \ 104 | --iters 1000 \ 105 | --nsample 512 \ 106 | --seqlen 2048 \ 107 | --format 'auto_gptq,auto_round' \ 108 | --output_dir "./tmp_autoround" 109 | ``` 110 | 111 | ## Ethical Considerations and Limitations 112 | 113 | The model can produce factually incorrect output, and should not be relied on to produce factually accurate information. Because of the limitations of the pretrained model and the finetuning datasets, it is possible that this model could generate lewd, biased or otherwise offensive outputs. 114 | 115 | Therefore, before deploying any applications of the model, developers should perform safety testing. 116 | 117 | ## Caveats and Recommendations 118 | 119 | Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. 120 | 121 | Here are a couple of useful links to learn more about Intel's AI software: 122 | 123 | - Intel Neural Compressor [link](https://github.com/intel/neural-compressor) 124 | 125 | ## Disclaimer 126 | 127 | The license on this model does not constitute legal advice. We are not responsible for the actions of third parties who use this model. Please consult an attorney before using this model for commercial purposes. 128 | 129 | ## Cite 130 | 131 | @article{cheng2023optimize, title={Optimize weight rounding via signed gradient descent for the quantization of llms}, author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao and Liu, Yi}, journal={arXiv preprint arXiv:2309.05516}, year={2023} } 132 | 133 | [arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round) 134 | -------------------------------------------------------------------------------- /docs/neural-chat-7b-v3-1-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | 3 | A sample command to generate an INT4 model. 4 | ```bash 5 | auto-round \ 6 | --model intel/neural-chat-7b-v3-1 \ 7 | --device 0 \ 8 | --group_size 128 \ 9 | --bits 4 \ 10 | --iters 1000 \ 11 | --nsamples 512 \ 12 | --minmax_lr 2e-3 \ 13 | --asym \ 14 | --format 'auto_gptq,auto_round' \ 15 | --output_dir "./tmp_autoround" 16 | ``` 17 | 18 | 19 | Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, we used the git id f3b7917091afba325af3980a35d8a6dcba03dc3f 20 | 21 | ~~~bash 22 | lm_eval --model hf --model_args pretrained="Intel/neural-chat-v3-1-int4-inc",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 128 23 | ~~~ 24 | 25 | | Metric | FP16 | INT4 | 26 | | -------------- | ------ | ------ | 27 | | Avg. 
| 0.6769 | 0.6721 | 28 | | mmlu | 0.5919 | 0.5862 | 29 | | lambada_openai | 0.7394 | 0.7337 | 30 | | hellaswag | 0.6323 | 0.6272 | 31 | | winogrande | 0.7687 | 0.7577 | 32 | | piqa | 0.8161 | 0.8150 | 33 | | truthfulqa_mc1 | 0.4431 | 0.4394 | 34 | | openbookqa | 0.3760 | 0.3700 | 35 | | boolq | 0.8783 | 0.8743 | 36 | | rte | 0.7690 | 0.7726 | 37 | | arc_easy | 0.8413 | 0.8384 | 38 | | arc_challenge | 0.5896 | 0.5785 | 39 | -------------------------------------------------------------------------------- /docs/neural-chat-7b-v3-3-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | 3 | A sample command to generate an INT4 model. 4 | ```bash 5 | auto-round \ 6 | --model intel/neural-chat-7b-v3-3 \ 7 | --device 0 \ 8 | --group_size 128 \ 9 | --bits 4 \ 10 | --iters 1000 \ 11 | --nsamples 512 \ 12 | --minmax_lr 2e-3 \ 13 | --asym \ 14 | --format 'auto_gptq,auto_round' \ 15 | --output_dir "./tmp_autoround" 16 | ``` 17 | 18 | 19 | 20 | Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, we used the git id f3b7917091afba325af3980a35d8a6dcba03dc3f 21 | 22 | ~~~bash 23 | lm_eval --model hf --model_args pretrained="Intel/neural-chat-v3-3-int4-inc",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 128 24 | ~~~ 25 | 26 | | Metric | FP16 | INT4 | 27 | | -------------- | ------ | ------ | 28 | | Avg. | 0.6778 | 0.6748 | 29 | | mmlu | 0.5993 | 0.5926 | 30 | | lambada_openai | 0.7303 | 0.7370 | 31 | | hellaswag | 0.6639 | 0.6559 | 32 | | winogrande | 0.7632 | 0.7735 | 33 | | piqa | 0.8101 | 0.8074 | 34 | | truthfulqa_mc1 | 0.4737 | 0.4737 | 35 | | openbookqa | 0.3880 | 0.3680 | 36 | | boolq | 0.8694 | 0.8694 | 37 | | rte | 0.7581 | 0.7509 | 38 | | arc_easy | 0.8266 | 0.8249 | 39 | | arc_challenge | 0.5734 | 0.5691 | 40 | -------------------------------------------------------------------------------- /docs/opt-2.7b-asym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated, we recommend using symmetric quantization.** You can remove --asym from the command. 2 | 3 | A sample command to generate an INT4 model. 4 | ```bash 5 | auto-round \ 6 | --model facebook/opt-2.7b \ 7 | --device 0 \ 8 | --group_size 128 \ 9 | --bits 4 \ 10 | --iters 1000 \ 11 | --nsamples 512 \ 12 | --minmax_lr 2e-3 \ 13 | --asym \ 14 | --format 'auto_gptq,auto_round' \ 15 | --output_dir "./tmp_autoround" 16 | ``` 17 | 18 | 19 | Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, we used the git id 96d185fa6232a5ab685ba7c43e45d1dbb3bb906d 20 | ##pip install auto-gptq[triton] 21 | ##pip install triton==2.2.0 22 | ```bash 23 | lm_eval --model hf --model_args pretrained="./",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 32 24 | ``` 25 | 26 | 27 | 28 | | Metric | FP16 | INT4 | 29 | | -------------- | ------ | ------ | 30 | | Avg. 
| 0.4722 | 0.4757 | 31 | | mmlu | 0.2568 | 0.2636 | 32 | | lambada_openai | 0.6359 | 0.6487 | 33 | | hellaswag | 0.4585 | 0.4519 | 34 | | winogrande | 0.6077 | 0.5967 | 35 | | piqa | 0.7367 | 0.7410 | 36 | | truthfulqa_mc1 | 0.2240 | 0.2338 | 37 | | openbookqa | 0.2500 | 0.2380 | 38 | | boolq | 0.6046 | 0.6505 | 39 | | rte | 0.5451 | 0.5379 | 40 | | arc_easy | 0.6077 | 0.6035 | 41 | | arc_challenge | 0.2679 | 0.2671 | 42 | -------------------------------------------------------------------------------- /docs/phi-2-old-sym-recipe.md: -------------------------------------------------------------------------------- 1 | **This recipe is outdated; we recommend using the latest full-range symmetric quantization.** 2 | 3 | A sample command to generate an INT4 model. 4 | ```bash 5 | auto-round \ 6 | --model microsoft/phi-2 \ 7 | --device 0 \ 8 | --group_size 128 \ 9 | --bits 4 \ 10 | --iters 1000 \ 11 | --nsamples 512 \ 12 | --format 'auto_gptq,auto_round' \ 13 | --output_dir "./tmp_autoround" 14 | ``` 15 | 16 | 17 | pip install lm-eval==0.4.2 18 | 19 | Due to the significant accuracy drop with the asymmetric kernel for this model, we opted to use symmetric quantization. 20 | 21 | ```bash 22 | lm_eval --model hf --model_args pretrained="Intel/phi-2-int4-inc" --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,arc_easy,arc_challenge,mmlu --batch_size 16 23 | ``` 24 | 25 | | Metric | FP16 | INT4 | 26 | | -------------- | ------ | -------- | 27 | | Avg. | 0.6155 | 0.6163 | 28 | | mmlu | 0.5448 | 0.5417 | 29 | | lambada_openai | 0.6268 | 0.6225 | 30 | | hellaswag | 0.5585 | 0.5498 | 31 | | winogrande | 0.7530 | 0.7545 | 32 | | piqa | 0.7867 | 0.7824 | 33 | | truthfulqa_mc1 | 0.3133 | 0.3060 | 34 | | openbookqa | 0.4000 | 0.4100 | 35 | | boolq | 0.8339 | 0.8327 | 36 | | rte | 0.6245 | 0.6643 | 37 | | arc_easy | 0.7997 | 0.7955 | 38 | | arc_challenge | 0.5290 | 0.5196 | 39 | -------------------------------------------------------------------------------- /docs/tuning_norm_bias.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | ## Fast tuning of LayerNorm and Linear bias via fake quantization without rounding 4 | 5 | **Personal view by Wenhua; discussion and feedback are welcome** 6 | 7 | **Work in Progress** 8 |
9 | Recent studies have found that tuning LayerNorm and bias through an optimizer such as Adam can lead to better results, especially for low-bit quantization such as 2-bit. However, I personally do not favor using Adam for this purpose, as explained in the next section, and I introduce an alternative approach, described in the last section. 10 | 11 | ### Why not use Adam 12 | 13 | #### Reason 1: the learning rate and number of steps are hard to tune 14 | 15 | Since Adam adaptively tunes the step size based on the gradient and its square, the learning rate often needs adjustment for different models, different quantization bit widths, or both, as observed in most papers. I hypothesize that this tuning requirement arises because most papers report results for only a limited range of model families, while many new models continually emerge. Despite my experience in this domain, I still find it challenging to tune the learning rate beyond grid search, and I believe many users encounter the same issue. 16 | 17 | #### Reason 2: prone to overfitting 18 | 19 | Since Adam adapts the step size at each iteration, it is difficult to control how much the parameters change, which in some scenarios leads to significant deviations from the original model's weights. Meanwhile, we only use hundreds or thousands of samples to fine-tune a low-bit model, whereas the original model is trained on a large corpus and specialized datasets (e.g., instruction datasets). Consequently, even if the low-bit tuned model performs well on some language-modeling tasks, it may lose other capabilities as the deviation grows. 20 | 21 | 22 | 23 | ### Our way 24 | 25 | **An overview of our method** 26 |
27 | 28 | ![](../docs/imgs/norm_bias_overview.png) 29 | 30 |
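As a concrete reading aid for the formulas that follow, here is a minimal PyTorch sketch of the fake quantization step with the trainable offset v. It assumes per-tensor asymmetric quantization (group_size = -1, as used in the experiments below) and subtracts the zero point back at dequantization, a detail the shorthand formulas below elide; it is an illustration of the idea, not the exact implementation.

```python
import torch

def fake_quant_no_round(w: torch.Tensor, v: torch.Tensor, bits: int = 4) -> torch.Tensor:
    """Fake-quantize a LayerNorm weight or Linear bias without rounding (sketch)."""
    n_max = 2 ** bits - 1
    s = (w.max() - w.min()).clamp(min=1e-5) / n_max  # quantization scale from the weight range
    zp = -w.min() / s                                # zero point
    # No rounding: the tuned weights stay in 16-bit precision, so we only clip
    # into the quantization grid using the trainable offset v and map back.
    q = torch.clamp(w / s + zp + v, 0, n_max)
    return s * (q - zp)

# v is initialized to zeros, kept in [-0.5, 0.5], and updated with SignSGD, e.g.:
#   with torch.no_grad():
#       v -= lr * v.grad.sign()
#       v.clamp_(-0.5, 0.5)
```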
31 | 32 | 33 | We limit the tuned parameters in a quantization space, expressed as: 34 | $$ 35 | W' = s*clip(W/s+zp,N,M) 36 | $$ 37 | where 𝑠 is the quantization scale, predefined by 𝑊 and hyperparameters such as bits. 38 | 39 | To tune the W', following Signround, we add a trainable parameter V in the range [-0.5, 0.5], which can be easily tuned by SignSGD. 40 | 41 | $$ 42 | W' = s*clip(W/s+zp+v,N,M) 43 | $$ 44 | 45 | 46 | An important note: We remove the rounding to reduce unnecessary rounding loss, as the final weights of LayerNorm and bias are typically kept at 16-bit precision in most cases. 47 | 48 | 49 | 50 | **Result at W2G32** 51 | 52 | the tuning of layer normalization and Linear bias are fake quantized at W4G-1. 53 | 54 | Average accuracies of HellaSwag, WinoGrand, PIQA and LAMBADA, higher is better. 55 | 56 | | | OPT125m | OPT1.3B | OPT2.7B | OPT6.7B | LLaMAV2-7b | LLaMAV3-8B-Instruct | 57 | | --------- | ---------- | ---------- | ---------- | ---------- | ---------- | ------------------- | 58 | | SignRound | 0.3978 | 0.5094 | 0.5267 | 0.3681 | 0.6267 | 0.5890 | 59 | | Ours | **0.4077** | **0.5151** | **0.5596** | **0.3887** | **0.6315** | **0.5949** | 60 | 61 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.codespell] 2 | ignore-words = ".azure-pipelines/scripts/codeScan/codespell/autoround_dict.txt" -------------------------------------------------------------------------------- /requirements-cpu.txt: -------------------------------------------------------------------------------- 1 | intel-extension-for-pytorch 2 | intel-extension-for-transformers 3 | -------------------------------------------------------------------------------- /requirements-lib.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | datasets 3 | py-cpuinfo 4 | sentencepiece 5 | numpy < 2.0 6 | tqdm 7 | packaging 8 | pillow 9 | numba 10 | tbb 11 | transformers 12 | threadpoolctl 13 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | datasets 3 | py-cpuinfo 4 | sentencepiece 5 | numpy < 2.0 6 | tqdm 7 | packaging 8 | pillow 9 | numba 10 | tbb 11 | torch 12 | transformers>=4.38 13 | threadpoolctl 14 | lm-eval>=0.4.2,<0.5 -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | license_files = 3 | LICENSE 4 | third-party-programs.txt 5 | 6 | [options.entry_points] 7 | console_scripts = 8 | auto_round = auto_round.__main__:run 9 | auto-round = auto_round.__main__:run 10 | auto_round_eval = auto_round.__main__:run_eval 11 | auto-round-eval = auto_round.__main__:run_eval 12 | auto_round_mllm = auto_round.__main__:run_mllm 13 | auto-round-mllm = auto_round.__main__:run_mllm 14 | auto-round-fast = auto_round.__main__:run_fast 15 | auto_round_fast = auto_round.__main__:run_fast 16 | auto-round-best = auto_round.__main__:run_best 17 | auto_round_best = auto_round.__main__:run_best 18 | auto-round-light = auto_round.__main__:run_light 19 | auto_round_light = auto_round.__main__:run_light 20 | 21 | -------------------------------------------------------------------------------- /setup.py: 
-------------------------------------------------------------------------------- 1 | import re 2 | from io import open 3 | import os 4 | from setuptools import find_packages, setup 5 | import sys 6 | from functools import lru_cache 7 | 8 | os.environ["CC"] = "g++" 9 | os.environ["CXX"] = "g++" 10 | try: 11 | filepath = "./auto_round/version.py" 12 | with open(filepath) as version_file: 13 | (__version__,) = re.findall('__version__ = "(.*)"', version_file.read()) 14 | except Exception as error: 15 | assert False, "Error: Could not open '%s' due %s\n" % (filepath, error) 16 | 17 | version = __version__ 18 | 19 | # All BUILD_* flags are initially set to `False`` and 20 | # will be updated to `True` if the corresponding environment check passes. 21 | PYPI_RELEASE = os.environ.get("PYPI_RELEASE", None) 22 | BUILD_HPU_ONLY = os.environ.get("BUILD_HPU_ONLY", "0") == "1" 23 | 24 | 25 | @lru_cache(None) 26 | def is_habana_framework_installed(): 27 | """Check if Habana framework is installed. 28 | Only check for the habana_frameworks package without importing it to avoid 29 | initializing lazy-mode-related components. 30 | """ 31 | from importlib.util import find_spec 32 | 33 | package_spec = find_spec("habana_frameworks") 34 | return package_spec is not None 35 | 36 | 37 | @lru_cache(None) 38 | def is_hpu_available(): 39 | try: 40 | import habana_frameworks.torch.core as htcore # pylint: disable=E0401 41 | return True 42 | except ImportError: 43 | return False 44 | 45 | 46 | if is_hpu_available() or is_habana_framework_installed(): 47 | # When HPU is available, we build HPU only by default 48 | BUILD_HPU_ONLY = True 49 | 50 | 51 | def is_cpu_env(): 52 | try: 53 | import torch 54 | except Exception as e: 55 | print( 56 | f"Building extension requires PyTorch being installed, please install PyTorch first: {e}.\n NOTE: This issue may be raised due to pip build isolation system (ignoring local packages). Please use `--no-build-isolation` when installing with pip, and refer to https://github.com/intel/auto-round for more details.") 57 | sys.exit(1) 58 | if torch.cuda.is_available(): 59 | return False 60 | try: 61 | import habana_frameworks.torch.core as htcore 62 | return False 63 | except: 64 | return True 65 | 66 | 67 | def fetch_requirements(path): 68 | requirements = [] 69 | with open(path, "r") as fd: 70 | requirements = [r.strip() for r in fd.readlines()] 71 | return requirements 72 | 73 | 74 | PKG_INSTALL_CFG = { 75 | "include_packages": find_packages( 76 | include=[ 77 | "auto_round", 78 | "auto_round.*", 79 | "auto_round_extension", 80 | "auto_round_extension.*", 81 | ], 82 | ), 83 | "install_requires": fetch_requirements("requirements.txt"), 84 | "extras_require": { 85 | "cpu": fetch_requirements("requirements-cpu.txt"), 86 | }, 87 | } 88 | 89 | ############################################################################### 90 | # Configuration for auto_round_lib 91 | # From pip: 92 | # pip install auto-round-lib 93 | # From source: 94 | # python setup.py lib install 95 | ############################################################################### 96 | 97 | 98 | LIB_REQUIREMENTS_FILE = "requirements-lib.txt" 99 | LIB_INSTALL_CFG = { 100 | "include_packages": find_packages( 101 | include=[ 102 | "auto_round", 103 | "auto_round.*", 104 | "auto_round_extension", 105 | "auto_round_extension.*", 106 | ], 107 | ), 108 | "install_requires": fetch_requirements(LIB_REQUIREMENTS_FILE), 109 | } 110 | 111 | if __name__ == "__main__": 112 | # There are two ways to install hpu-only package: 113 | # 1. 
python setup.py lib install 114 | # 2. Within the gaudi docker where the HPU is available, we install the auto_round_lib by default. 115 | is_user_requesting_library_build = "lib" in sys.argv 116 | if is_user_requesting_library_build: 117 | sys.argv.remove("lib") 118 | should_build_library = is_user_requesting_library_build or BUILD_HPU_ONLY 119 | 120 | if should_build_library: 121 | package_name = "auto_round_lib" 122 | INSTALL_CFG = LIB_INSTALL_CFG 123 | else: 124 | package_name = "auto_round" 125 | INSTALL_CFG = PKG_INSTALL_CFG 126 | 127 | include_packages = INSTALL_CFG.get("include_packages", {}) 128 | install_requires = INSTALL_CFG.get("install_requires", []) 129 | extras_require = INSTALL_CFG.get("extras_require", {}) 130 | 131 | setup( 132 | name=package_name, 133 | author="Intel AIPT Team", 134 | version=version, 135 | author_email="wenhua.cheng@intel.com, weiwei1.zhang@intel.com, heng.guo@intel.com", 136 | description="Repository of AutoRound: Advanced Weight-Only Quantization Algorithm for LLMs", 137 | long_description=open("README.md", "r", encoding="utf-8").read(), 138 | long_description_content_type="text/markdown", 139 | keywords="quantization,auto-around,LLM,SignRound", 140 | license="Apache 2.0", 141 | url="https://github.com/intel/auto-round", 142 | packages=include_packages, 143 | install_requires=install_requires, 144 | extras_require=extras_require, 145 | python_requires=">=3.7.0", 146 | classifiers=[ 147 | "Intended Audience :: Science/Research", 148 | "Programming Language :: Python :: 3", 149 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 150 | "License :: OSI Approved :: Apache Software License", 151 | ], 152 | include_package_data=True, 153 | package_data={"": ["mllm/templates/*.json"]}, 154 | ) 155 | -------------------------------------------------------------------------------- /test/test_cpu/_test_helpers.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def is_pytest_mode_compile(): 5 | return pytest.mode == "compile" 6 | 7 | 8 | def is_pytest_mode_lazy(): 9 | return pytest.mode == "lazy" 10 | 11 | 12 | def model_infer(model, tokenizer, apply_chat_template=False): 13 | prompts = [ 14 | "Hello,my name is", 15 | # "The president of the United States is", 16 | # "The capital of France is", 17 | # "The future of AI is", 18 | ] 19 | if apply_chat_template: 20 | texts = [] 21 | for prompt in prompts: 22 | messages = [ 23 | {"role": "user", "content": prompt} 24 | ] 25 | text = tokenizer.apply_chat_template( 26 | messages, 27 | tokenize=False, 28 | add_generation_prompt=True 29 | ) 30 | texts.append(text) 31 | prompts = texts 32 | 33 | inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) 34 | 35 | outputs = model.generate( 36 | input_ids=inputs["input_ids"].to(model.device), 37 | attention_mask=inputs["attention_mask"].to(model.device), 38 | do_sample=False, ## change this to follow official usage 39 | max_new_tokens=5 40 | ) 41 | generated_ids = [ 42 | output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs["input_ids"], outputs) 43 | ] 44 | 45 | decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 46 | 47 | for i, prompt in enumerate(prompts): 48 | print(f"Prompt: {prompt}") 49 | print(f"Generated: {decoded_outputs[i]}") 50 | print("-" * 50) 51 | return decoded_outputs[0] 52 | -------------------------------------------------------------------------------- /test/test_cpu/conftest.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | from typing import Mapping 3 | 4 | import pytest 5 | 6 | 7 | def pytest_addoption(parser): 8 | parser.addoption( 9 | "--mode", 10 | action="store", 11 | default="lazy", 12 | help="{compile|lazy}, default lazy. Choose mode to run tests", 13 | ) 14 | 15 | 16 | backup_env = pytest.StashKey[Mapping]() 17 | 18 | 19 | def pytest_configure(config): 20 | pytest.mode = config.getoption("--mode") 21 | assert pytest.mode.lower() in ["lazy", "compile"] 22 | 23 | config.stash[backup_env] = os.environ 24 | 25 | if pytest.mode == "lazy": 26 | os.environ["PT_HPU_LAZY_MODE"] = "1" 27 | elif pytest.mode == "compile": 28 | os.environ["PT_HPU_LAZY_MODE"] = "0" 29 | os.environ["PT_ENABLE_INT64_SUPPORT"] = "1" 30 | 31 | 32 | def pytest_unconfigure(config): 33 | os.environ.clear() 34 | os.environ.update(config.stash[backup_env]) 35 | -------------------------------------------------------------------------------- /test/test_cpu/requirements.txt: -------------------------------------------------------------------------------- 1 | addict 2 | modelscope 3 | gguf 4 | torchvision 5 | -------------------------------------------------------------------------------- /test/test_cpu/test_act_quantization.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | import torch 8 | from transformers import AutoModelForCausalLM, AutoTokenizer 9 | 10 | from auto_round import AutoRound 11 | 12 | 13 | class LLMDataLoader: 14 | def __init__(self): 15 | self.batch_size = 1 16 | 17 | def __iter__(self): 18 | for i in range(3): 19 | yield torch.ones([1, 10], dtype=torch.long) 20 | 21 | 22 | class TestAutoRoundAct(unittest.TestCase): 23 | @classmethod 24 | def setUpClass(self): 25 | model_name = "facebook/opt-125m" 26 | self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) 27 | self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 28 | self.llm_dataloader = LLMDataLoader() 29 | 30 | @classmethod 31 | def tearDownClass(self): 32 | shutil.rmtree("./saved", ignore_errors=True) 33 | shutil.rmtree("runs", ignore_errors=True) 34 | 35 | def test_mx_fp4(self): 36 | model_name = "facebook/opt-125m" 37 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) 38 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 39 | bits, group_size, sym = 4, 128, True 40 | autoround = AutoRound( 41 | model, 42 | tokenizer, 43 | bits=bits, 44 | group_size=group_size, 45 | sym=sym, 46 | iters=2, 47 | seqlen=2, 48 | dataset=self.llm_dataloader, 49 | act_bits=4, 50 | data_type="mx_fp" 51 | ) 52 | autoround.quantize() 53 | 54 | def test_wint4fp8_dynamic(self): 55 | model_name = "facebook/opt-125m" 56 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) 57 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 58 | bits, group_size = 4, 128 59 | autoround = AutoRound( 60 | model, 61 | tokenizer, 62 | bits=bits, 63 | group_size=group_size, 64 | iters=2, 65 | seqlen=2, 66 | dataset=self.llm_dataloader, 67 | act_bits=8, 68 | data_type="fp8_to_int_sym", 69 | act_data_type="fp8_dynamic_per_token" 70 | ) 71 | autoround.quantize() 72 | 73 | def test_wint4fp8_static(self): 74 | bits, group_size, sym = 4, 128, True 75 
| autoround = AutoRound( 76 | self.model, 77 | self.tokenizer, 78 | bits=bits, 79 | group_size=group_size, 80 | sym=sym, 81 | iters=2, 82 | seqlen=2, 83 | dataset=self.llm_dataloader, 84 | act_bits=8, 85 | data_type="fp8_to_int_sym", 86 | act_dynamic=False, 87 | act_data_type="fp8" 88 | ) 89 | autoround.quantize() 90 | -------------------------------------------------------------------------------- /test/test_cpu/test_auto_round_hpu_only.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from auto_round.utils import is_hpu_supported 4 | 5 | from _test_helpers import is_pytest_mode_compile, is_pytest_mode_lazy 6 | 7 | 8 | def run_opt_125m_on_hpu(): 9 | from auto_round import AutoRound 10 | from transformers import AutoModelForCausalLM, AutoTokenizer 11 | 12 | model_name = "facebook/opt-125m" 13 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) 14 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 15 | 16 | bits, group_size, sym = 4, 128, False 17 | autoround = AutoRound( 18 | model, 19 | tokenizer, 20 | bits=bits, 21 | group_size=group_size, 22 | sym=sym, 23 | iters=2, 24 | seqlen=2, 25 | ) 26 | q_model, qconfig = autoround.quantize() 27 | assert q_model is not None, f"Expected q_model to be not None" 28 | 29 | 30 | @pytest.mark.skipif(not is_hpu_supported(), reason="HPU is not supported") 31 | @pytest.mark.skipif(not is_pytest_mode_lazy(), reason="Only for lazy mode") 32 | def test_opt_125m_lazy_mode(): 33 | run_opt_125m_on_hpu() 34 | 35 | 36 | @pytest.mark.skipif(not is_hpu_supported(), reason="HPU is not supported") 37 | @pytest.mark.skipif(not is_pytest_mode_compile(), reason="Only for compile mode") 38 | def test_opt_125m_compile_mode(): 39 | torch._dynamo.reset() 40 | run_opt_125m_on_hpu() 41 | 42 | 43 | def test_import(): 44 | from auto_round import AutoRound 45 | from auto_round.export.export_to_itrex.export import ( 46 | WeightOnlyLinear, save_quantized_as_itrex) 47 | 48 | 49 | @pytest.mark.parametrize( 50 | "data_type", 51 | ["fp8_to_int_sym"], 52 | ) 53 | def test_w4a8(data_type): 54 | from auto_round import AutoRound 55 | from transformers import AutoModelForCausalLM, AutoTokenizer 56 | 57 | model_name = "facebook/opt-125m" 58 | model = AutoModelForCausalLM.from_pretrained( 59 | model_name, 60 | torch_dtype="auto", 61 | attn_implementation="eager", 62 | trust_remote_code=True, 63 | ) 64 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 65 | 66 | autoround = AutoRound( 67 | model, 68 | tokenizer, 69 | bits=4, 70 | group_size=128, 71 | iters=2, 72 | seqlen=2, 73 | data_type=data_type, 74 | act_data_type="fp8_sym", 75 | act_bits=8, 76 | act_dynamic=False, 77 | ) 78 | q_model, qconfig = autoround.quantize() 79 | assert q_model is not None, f"Expected q_model to be not None" 80 | -------------------------------------------------------------------------------- /test/test_cpu/test_autoopt.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | import torch 8 | import transformers 9 | from transformers import AutoModelForCausalLM, AutoTokenizer 10 | 11 | from auto_round import AutoRoundOPT, AutoRoundAdam 12 | 13 | 14 | class LLMDataLoader: 15 | def __init__(self): 16 | self.batch_size = 1 17 | 18 | def __iter__(self): 19 | for i in range(2): 20 | yield torch.ones([1, 
10], dtype=torch.long) 21 | 22 | 23 | class TestAutoRound(unittest.TestCase): 24 | @classmethod 25 | def setUpClass(self): 26 | model_name = "facebook/opt-125m" 27 | self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) 28 | self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 29 | self.llm_dataloader = LLMDataLoader() 30 | 31 | @classmethod 32 | def tearDownClass(self): 33 | shutil.rmtree("./saved", ignore_errors=True) 34 | shutil.rmtree("runs", ignore_errors=True) 35 | 36 | def test_default(self): 37 | bits, group_size, sym = 4, 128, False 38 | autoround = AutoRoundOPT( 39 | self.model, 40 | self.tokenizer, 41 | bits=bits, 42 | group_size=group_size, 43 | sym=sym, 44 | iters=2, 45 | seqlen=10, 46 | dataset=self.llm_dataloader, 47 | to_quant_block_names=None 48 | ) 49 | autoround.quantize() 50 | 51 | 52 | def test_Adam(self): 53 | bits, group_size, sym = 4, 128, False 54 | from auto_round.utils import get_block_names 55 | llm_block_names = get_block_names(self.model, quant_vision=True) 56 | bits, group_size, sym, batch_size = 4, 128, False, 20 57 | adamround = AutoRoundAdam( 58 | self.model, 59 | self.tokenizer, 60 | bits=bits, 61 | group_size=group_size, 62 | sym=sym, 63 | iters=2, 64 | seqlen=2, 65 | batch_size=batch_size, 66 | dataset=self.llm_dataloader, 67 | to_quant_block_names=llm_block_names 68 | ) 69 | adamround.quantize() 70 | 71 | 72 | if __name__ == "__main__": 73 | unittest.main() 74 | 75 | 76 | -------------------------------------------------------------------------------- /test/test_cpu/test_autoround_acc.py: -------------------------------------------------------------------------------- 1 | 2 | import copy 3 | import shutil 4 | import sys 5 | import unittest 6 | sys.path.insert(0, "../..") 7 | import torch 8 | import transformers 9 | from math import isclose 10 | from transformers import AutoModelForCausalLM, AutoTokenizer 11 | from auto_round import AutoRound # pylint: disable=E0401 12 | from auto_round.export.export_to_itrex.export import pack_model # pylint: disable=E0401 13 | 14 | class LLMDataLoader: 15 | def __init__(self): 16 | self.batch_size = 1 17 | 18 | def __iter__(self): 19 | for i in range(2): 20 | yield torch.ones([1, 10], dtype=torch.long) 21 | 22 | 23 | class TestAutoRound(unittest.TestCase): 24 | @classmethod 25 | def setUpClass(self): 26 | self.model_name = "hf-internal-testing/tiny-random-GPTJForCausalLM" 27 | self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype=torch.float32, trust_remote_code=True) 28 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) 29 | self.llm_dataloader = LLMDataLoader() 30 | 31 | @classmethod 32 | def tearDownClass(self): 33 | shutil.rmtree("./saved", ignore_errors=True) 34 | shutil.rmtree("runs", ignore_errors=True) 35 | 36 | def test_default_acc(self): 37 | bits, group_size, sym = 4, 128, True 38 | inp = torch.ones([1, 10], dtype=torch.long) 39 | autoround = AutoRound( 40 | self.model, 41 | self.tokenizer, 42 | bits=bits, 43 | device="cpu", 44 | group_size=group_size, 45 | sym=sym, 46 | iters=2, 47 | seqlen=10, 48 | dataset=self.llm_dataloader 49 | ) 50 | autoround.quantize() 51 | out0 = self.model(inp) 52 | print(f"out0 = {float(out0[0][0][0][0])}") 53 | 54 | model_tmp = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype=torch.float32, trust_remote_code=True) 55 | autoround_1 = AutoRound( 56 | model_tmp, 57 | self.tokenizer, 58 | bits=bits, 59 | 
group_size=group_size, 60 | sym=sym, 61 | device="cpu", 62 | iters=2, 63 | seqlen=10, 64 | dataset=self.llm_dataloader 65 | ) 66 | autoround_1.quantize() 67 | out1 = model_tmp(inp) 68 | 69 | assert out0[0].equal(out1[0]) 70 | self.assertTrue(isclose(float(out0[0][0][0][0]), -0.021002087742090225, rel_tol=5e-04)) 71 | 72 | 73 | if __name__ == "__main__": 74 | unittest.main() 75 | 76 | -------------------------------------------------------------------------------- /test/test_cpu/test_autoround_export_to_itrex.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | import torch 8 | import transformers 9 | from transformers import AutoModelForCausalLM, AutoTokenizer 10 | 11 | from auto_round import AutoRound 12 | 13 | 14 | class SimpleDataLoader: 15 | def __init__(self): 16 | self.batch_size = 1 17 | 18 | def __iter__(self): 19 | for i in range(2): 20 | yield torch.randn([1, 30]) 21 | 22 | 23 | class LLMDataLoader: 24 | def __init__(self): 25 | self.batch_size = 1 26 | 27 | def __iter__(self): 28 | for i in range(2): 29 | yield torch.ones([1, 10], dtype=torch.long) 30 | 31 | 32 | class TestAutoroundExport(unittest.TestCase): 33 | approach = "weight_only" 34 | 35 | @classmethod 36 | def setUpClass(self): 37 | self.gptj = transformers.AutoModelForCausalLM.from_pretrained( 38 | "hf-internal-testing/tiny-random-GPTJForCausalLM", 39 | torchscript=True, 40 | ) 41 | self.tokenizer = transformers.AutoTokenizer.from_pretrained( 42 | "hf-internal-testing/tiny-random-GPTJForCausalLM", trust_remote_code=True 43 | ) 44 | self.gptj_no_jit = transformers.AutoModelForCausalLM.from_pretrained( 45 | "hf-internal-testing/tiny-random-GPTJForCausalLM", 46 | ) 47 | self.llm_dataloader = LLMDataLoader() 48 | self.lm_input = torch.ones([1, 10], dtype=torch.long) 49 | 50 | @classmethod 51 | def tearDownClass(self): 52 | shutil.rmtree("./saved", ignore_errors=True) 53 | shutil.rmtree("runs", ignore_errors=True) 54 | 55 | def test_autoround_int_quant(self): 56 | model = copy.deepcopy(self.gptj) 57 | out1 = model(self.lm_input) 58 | round = AutoRound 59 | optq_1 = round(model, self.tokenizer, nsamples=20, amp=False, seqlen=10, iters=10, enable_torch_compile=False) 60 | q_model, layer_config1 = optq_1.quantize() ##compile model 61 | from auto_round.export.export_to_itrex import pack_model 62 | 63 | compressed_model = pack_model(model=q_model, layer_config=layer_config1) 64 | out2 = model(self.lm_input) 65 | out3 = q_model(self.lm_input) 66 | out4 = compressed_model(self.lm_input) 67 | self.assertTrue(torch.all(torch.isclose(out1[0], out2[0], atol=1e-1))) 68 | self.assertFalse(torch.all(out1[0] == out2[0])) 69 | self.assertTrue(torch.all(out2[0] == out3[0])) 70 | self.assertTrue(torch.all(torch.isclose(out3[0], out4[0], atol=1e-3))) 71 | self.assertTrue("transformer.h.0.attn.k_proj.qzeros" in compressed_model.state_dict().keys()) 72 | 73 | model = copy.deepcopy(self.gptj) 74 | out6 = model(self.lm_input) 75 | optq_2 = round(model, self.tokenizer, device="cpu", nsamples=20, seqlen=10) 76 | q_model, layer_config2 = optq_2.quantize() 77 | compressed_model = pack_model(model=q_model, layer_config=layer_config2, inplace=False) 78 | compressed_model = compressed_model.to(torch.float32) 79 | out4 = q_model(self.lm_input) 80 | out5 = compressed_model(self.lm_input) 81 | self.assertTrue(torch.all(out1[0] == out6[0])) 82 | self.assertTrue(torch.all(torch.isclose(out4[0], out5[0], atol=5e-3))) 83 
| 84 | def test_config(self): 85 | from auto_round.export.export_to_itrex import QuantConfig 86 | 87 | config = QuantConfig.from_pretrained("TheBloke/Llama-2-7B-Chat-GPTQ") 88 | config.save_pretrained("quantization_config_dir") 89 | loaded_config = QuantConfig.from_pretrained("quantization_config_dir") 90 | self.assertEqual(config.group_size, loaded_config.group_size) 91 | self.assertEqual(config.desc_act, loaded_config.desc_act) 92 | self.assertEqual(config.bits, loaded_config.bits) 93 | self.assertEqual(config.sym, loaded_config.sym) 94 | 95 | def test_xpu_export(self): 96 | model = copy.deepcopy(self.gptj) 97 | out1 = model(self.lm_input) 98 | round = AutoRound 99 | optq_1 = round(model, self.tokenizer, nsamples=20, amp=False, seqlen=10, iters=10, enable_torch_compile=False) 100 | q_model, layer_config1 = optq_1.quantize() 101 | from auto_round.export.export_to_itrex import pack_model 102 | 103 | compressed_model_xpu = pack_model(model=q_model, layer_config=layer_config1, device="xpu", inplace=False) 104 | compressed_model_cpu = pack_model(model=q_model, layer_config=layer_config1, inplace=False) 105 | out2 = model(self.lm_input) 106 | out3 = q_model(self.lm_input) 107 | out4 = compressed_model_xpu(self.lm_input) 108 | out5 = compressed_model_cpu(self.lm_input) 109 | self.assertTrue(torch.all(torch.isclose(out1[0], out2[0], atol=1e-1))) 110 | self.assertFalse(torch.all(out1[0] == out2[0])) 111 | self.assertTrue(torch.all(out2[0] == out3[0])) 112 | self.assertTrue(torch.all(torch.isclose(out3[0], out4[0], atol=1e-3))) 113 | self.assertTrue(torch.all(torch.isclose(out4[0], out5[0], atol=1e-5))) 114 | 115 | 116 | if __name__ == "__main__": 117 | unittest.main() 118 | 119 | -------------------------------------------------------------------------------- /test/test_cpu/test_basic_usage.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, '../..') 7 | 8 | 9 | class TestAutoRoundCmd(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(self): 12 | pass 13 | 14 | @classmethod 15 | def tearDownClass(self): 16 | shutil.rmtree("./saved", ignore_errors=True) 17 | shutil.rmtree("runs", ignore_errors=True) 18 | 19 | def test_auto_round_cmd(self): 20 | python_path = sys.executable 21 | 22 | ##test llm script 23 | # res = os.system( 24 | # f"cd ../.. && {python_path} -m auto_round -h") 25 | # if res > 0 or res == -1: 26 | # assert False, "cmd line test fail, please have a check" 27 | # 28 | res = os.system( 29 | f"cd ../.. && {python_path} -m auto_round --model 'facebook/opt-125m' --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa") 30 | if res > 0 or res == -1: 31 | assert False, "cmd line test fail, please have a check" 32 | 33 | res = os.system( 34 | f"cd ../.. && {python_path} -m auto_round --model 'facebook/opt-125m' --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32" 35 | ) 36 | if res > 0 or res == -1: 37 | assert False, "cmd line test fail, please have a check" 38 | 39 | res = os.system( 40 | f"cd ../.. && {python_path} -c 'from auto_round.__main__ import run_light; run_light()' --seqlen 8 --iter 2 --nsamples 8 --output_dir ./saved --tasks lambada_openai") 41 | if res > 0 or res == -1: 42 | assert False, "cmd line test fail, please have a check" 43 | 44 | # test mllm script 45 | # test auto_round_mllm help 46 | res = os.system( 47 | f"cd ../.. 
&& {python_path} -m auto_round --mllm -h") 48 | if res > 0 or res == -1: 49 | assert False, "cmd line test fail, please have a check" 50 | 51 | # test auto_round_mllm --eval help 52 | res = os.system( 53 | f"cd ../.. && {python_path} -m auto_round --mllm --eval -h") 54 | if res > 0 or res == -1: 55 | assert False, "cmd line test fail, please have a check" 56 | 57 | # test auto_round_mllm --lmms help 58 | res = os.system( 59 | f"cd ../.. && {python_path} -m auto_round --mllm --lmms -h") 60 | if res > 0 or res == -1: 61 | assert False, "cmd line test fail, please have a check" 62 | 63 | res = os.system( 64 | f"cd ../.. && {python_path} -m auto_round --mllm --iter 2 --nsamples 10 --seqlen 32 --format auto_round --output_dir ./saved") 65 | if res > 0 or res == -1: 66 | assert False, "cmd line test fail, please have a check" 67 | 68 | res = os.system( 69 | f"cd ../.. && {python_path} -m auto_round --mllm --iter 2 --nsamples 10 --seqlen 256 --format auto_round" 70 | " --quant_nontext_module --output_dir ./saved ") 71 | if res > 0 or res == -1: 72 | assert False, "cmd line test fail, please have a check" 73 | 74 | 75 | if __name__ == "__main__": 76 | unittest.main() 77 | -------------------------------------------------------------------------------- /test/test_cpu/test_calib_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | import json 8 | 9 | import torch 10 | from transformers import AutoModelForCausalLM, AutoTokenizer 11 | 12 | from auto_round import AutoRound 13 | 14 | 15 | class LLMDataLoader: 16 | def __init__(self): 17 | self.batch_size = 1 18 | 19 | def __iter__(self): 20 | for i in range(2): 21 | yield torch.ones([1, 10], dtype=torch.long) 22 | 23 | 24 | class TestLocalCalibDataset(unittest.TestCase): 25 | @classmethod 26 | def setUpClass(self): 27 | json_data = [{"text": "awefdsfsddfd"}, {"text": "fdfdfsdfdfdfd"}, {"text": "dfdsfsdfdfdfdf"}] 28 | os.makedirs("./saved", exist_ok=True) 29 | self.json_file = "./saved/tmp.json" 30 | with open(self.json_file, "w") as json_file: 31 | json.dump(json_data, json_file, indent=4) 32 | 33 | jsonl_data = [{"text": "哈哈,開心點"}, {"text": "hello world"}] 34 | os.makedirs("./saved", exist_ok=True) 35 | self.jsonl_file = "./saved/tmp.jsonl" 36 | with open(self.jsonl_file, "w") as jsonl_file: 37 | for item in jsonl_data: 38 | json.dump(item, jsonl_file, ensure_ascii=False) 39 | jsonl_file.write('\n') 40 | 41 | model_name = "facebook/opt-125m" 42 | self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) 43 | self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 44 | 45 | def test_json(self): 46 | bits, group_size, sym = 4, 128, True 47 | autoround = AutoRound( 48 | self.model, 49 | self.tokenizer, 50 | bits=bits, 51 | group_size=group_size, 52 | sym=sym, 53 | iters=2, 54 | seqlen=5, 55 | dataset=self.json_file, 56 | ) 57 | autoround.quantize() 58 | 59 | def test_jsonl(self): 60 | bits, group_size, sym = 4, 128, True 61 | autoround = AutoRound( 62 | self.model, 63 | self.tokenizer, 64 | bits=bits, 65 | group_size=group_size, 66 | sym=sym, 67 | iters=2, 68 | seqlen=4, 69 | dataset=self.jsonl_file, 70 | ) 71 | autoround.quantize() 72 | 73 | def test_apply_chat_template(self): 74 | model_name = "Qwen/Qwen2.5-0.5B-Instruct" 75 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) 76 | 
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 77 | dataset = "NeelNanda/pile-10k:apply_chat_template:system_prompt=''" 78 | bits, group_size, sym = 4, 128, True 79 | autoround = AutoRound( 80 | model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=128, dataset=dataset 81 | ) 82 | autoround.quantize() 83 | 84 | def test_combine_dataset(self): 85 | dataset = "NeelNanda/pile-10k" + "," + "madao33/new-title-chinese" + "," + "mbpp" 86 | bits, group_size, sym = 4, 128, True 87 | autoround = AutoRound( 88 | self.model, self.tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=128, dataset=dataset 89 | ) 90 | autoround.quantize() 91 | 92 | def test_combine_dataset2(self): 93 | dataset = "NeelNanda/pile-10k:num=256,mbpp:num=256" 94 | bits, group_size, sym = 4, 128, True 95 | autoround = AutoRound( 96 | self.model, self.tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=128, dataset=dataset 97 | ) 98 | autoround.quantize() 99 | 100 | # def test_pile_val_backup_dataset(self): 101 | # dataset = "swift/pile-val-backup" 102 | # bits, group_size, sym = 4, 128, True 103 | # autoround = AutoRound( 104 | # self.model, self.tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=128, dataset=dataset 105 | # ) 106 | # autoround.quantize() 107 | 108 | @classmethod 109 | def tearDownClass(self): 110 | shutil.rmtree("./saved", ignore_errors=True) 111 | shutil.rmtree("runs", ignore_errors=True) 112 | 113 | 114 | if __name__ == "__main__": 115 | unittest.main() 116 | 117 | 118 | -------------------------------------------------------------------------------- /test/test_cpu/test_conv1d.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | import torch 8 | from transformers import AutoModelForCausalLM, AutoTokenizer 9 | 10 | from auto_round import AutoRound 11 | from _test_helpers import model_infer 12 | class LLMDataLoader: 13 | def __init__(self): 14 | self.batch_size = 1 15 | 16 | def __iter__(self): 17 | for i in range(2): 18 | yield torch.ones([1, 10], dtype=torch.long) 19 | 20 | 21 | class TestQuantizationConv1d(unittest.TestCase): 22 | @classmethod 23 | def setUpClass(self): 24 | self.model_name = "MBZUAI/LaMini-GPT-124M" 25 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) 26 | self.llm_dataloader = LLMDataLoader() 27 | 28 | @classmethod 29 | def tearDownClass(self): 30 | shutil.rmtree("./saved", ignore_errors=True) 31 | shutil.rmtree("runs", ignore_errors=True) 32 | 33 | 34 | def test_quant(self): 35 | self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) 36 | bits, group_size, sym = 4, 128, True 37 | autoround = AutoRound( 38 | self.model, 39 | self.tokenizer, 40 | bits=bits, 41 | group_size=group_size, 42 | sym=sym, 43 | iters=2, 44 | seqlen=2, 45 | dataset=self.llm_dataloader, 46 | 47 | ) 48 | 49 | autoround.quantize() 50 | autoround.save_quantized("./saved") 51 | 52 | model = AutoModelForCausalLM.from_pretrained("./saved", device_map="cpu", trust_remote_code=True) 53 | model_infer(model, self.tokenizer) 54 | 55 | 56 | 57 | if __name__ == "__main__": 58 | unittest.main() 59 | -------------------------------------------------------------------------------- /test/test_cpu/test_generation.py: -------------------------------------------------------------------------------- 
1 | import copy 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | import torch 8 | from transformers import AutoModelForCausalLM, AutoTokenizer, AutoRoundConfig 9 | 10 | from auto_round import AutoRound 11 | 12 | 13 | class LLMDataLoader: 14 | def __init__(self): 15 | self.batch_size = 1 16 | 17 | def __iter__(self): 18 | for i in range(2): 19 | yield torch.ones([1, 10], dtype=torch.long) 20 | 21 | 22 | class TestAutoRoundFormatGeneration(unittest.TestCase): 23 | @classmethod 24 | def setUpClass(self): 25 | self.model_name = "facebook/opt-125m" 26 | self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) 27 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) 28 | self.llm_dataloader = LLMDataLoader() 29 | self.save_folder = "./saved" 30 | 31 | @classmethod 32 | def tearDownClass(self): 33 | shutil.rmtree(self.save_folder, ignore_errors=True) 34 | shutil.rmtree("runs", ignore_errors=True) 35 | 36 | def test_4bits_sym(self): 37 | bits = 4 38 | group_size = 128 39 | sym = True 40 | autoround = AutoRound( 41 | self.model, 42 | self.tokenizer, 43 | bits=bits, 44 | group_size=group_size, 45 | sym=sym, 46 | iters=1, 47 | seqlen=2, 48 | dataset=self.llm_dataloader, 49 | ) 50 | quantized_model_path = self.save_folder 51 | 52 | autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round", inplace=False) 53 | 54 | quantization_config = AutoRoundConfig( 55 | backend="ipex" 56 | ) 57 | model = AutoModelForCausalLM.from_pretrained(quantized_model_path, 58 | device_map="cpu", quantization_config=quantization_config) 59 | tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) 60 | text = "My name is " 61 | inputs = tokenizer(text, return_tensors="pt").to(model.device) 62 | res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) 63 | print(res) 64 | assert ("!!!" not in res) 65 | 66 | model = AutoModelForCausalLM.from_pretrained(quantized_model_path, 67 | device_map="cpu", quantization_config=quantization_config, 68 | torch_dtype=torch.float16) 69 | tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) 70 | text = "There is a girl who likes adventure," 71 | inputs = tokenizer(text, return_tensors="pt").to(model.device) 72 | res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) 73 | print(res) 74 | assert ("!!!" not in res) 75 | 76 | def test_autoround_sym(self): 77 | for bits in [4]: 78 | model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) 79 | tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) 80 | bits, group_size, sym = bits, 128, True 81 | autoround = AutoRound( 82 | model, 83 | tokenizer, 84 | bits=bits, 85 | group_size=group_size, 86 | sym=sym, 87 | iters=2, 88 | seqlen=2, 89 | dataset=self.llm_dataloader, 90 | ) 91 | quantized_model_path = "./saved" 92 | 93 | autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") 94 | 95 | model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", 96 | trust_remote_code=True) 97 | tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) 98 | text = "There is a girl who likes adventure," 99 | inputs = tokenizer(text, return_tensors="pt").to(model.device) 100 | res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) 101 | print(res) 102 | assert ("!!!" 
not in res) 103 | shutil.rmtree(self.save_folder, ignore_errors=True) 104 | 105 | -------------------------------------------------------------------------------- /test/test_cpu/test_hpu.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import sys 3 | import unittest 4 | 5 | sys.path.insert(0, "../..") 6 | import torch 7 | from transformers import AutoModelForCausalLM, AutoTokenizer 8 | 9 | from auto_round import AutoRound 10 | 11 | 12 | 13 | class LLMDataLoader: 14 | def __init__(self): 15 | self.batch_size = 1 16 | 17 | def __iter__(self): 18 | for i in range(2): 19 | yield torch.ones([1, 10], dtype=torch.long) 20 | 21 | def is_hpu_supported(): 22 | try: 23 | import habana_frameworks.torch.core as htcore # pylint: disable=E0401 24 | except ImportError as e: 25 | return False 26 | return True 27 | 28 | 29 | class TestAutoRound(unittest.TestCase): 30 | @classmethod 31 | def setUpClass(self): 32 | model_name = "facebook/opt-125m" 33 | self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) 34 | self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 35 | self.llm_dataloader = LLMDataLoader() 36 | 37 | @classmethod 38 | def tearDownClass(self): 39 | shutil.rmtree("./saved", ignore_errors=True) 40 | shutil.rmtree("runs", ignore_errors=True) 41 | 42 | def test_autogptq_format_hpu_inference(self): 43 | if not is_hpu_supported(): 44 | return 45 | try: 46 | import auto_gptq 47 | except: 48 | return 49 | bits, group_size, sym = 4, 128, False 50 | autoround = AutoRound( 51 | self.model, 52 | self.tokenizer, 53 | bits=bits, 54 | group_size=group_size, 55 | sym=sym, 56 | iters=2, 57 | seqlen=2, 58 | dataset=self.llm_dataloader, 59 | ) 60 | autoround.quantize() 61 | quantized_model_path = "./saved" 62 | 63 | autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_gptq") 64 | model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", \ 65 | trust_remote_code=True).to('hpu').to(torch.float32) 66 | tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) 67 | text = "There is a girl who likes adventure," 68 | inputs = tokenizer(text, return_tensors="pt").to(model.device) 69 | print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) 70 | shutil.rmtree("./saved", ignore_errors=True) 71 | 72 | 73 | def test_autoround_format_hpu_inference(self): 74 | if not is_hpu_supported(): 75 | return 76 | bits, group_size, sym = 4, 128, False 77 | autoround = AutoRound( 78 | self.model, 79 | self.tokenizer, 80 | bits=bits, 81 | group_size=group_size, 82 | sym=sym, 83 | iters=2, 84 | seqlen=2, 85 | dataset=self.llm_dataloader, 86 | ) 87 | autoround.quantize() 88 | quantized_model_path = "./saved" 89 | 90 | autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") 91 | 92 | model = AutoModelForCausalLM.from_pretrained(quantized_model_path, \ 93 | device_map="auto").to('hpu').to(torch.float32) 94 | tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) 95 | text = "There is a girl who likes adventure," 96 | inputs = tokenizer(text, return_tensors="pt").to(model.device) 97 | print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) 98 | shutil.rmtree("./saved", ignore_errors=True) 99 | -------------------------------------------------------------------------------- /test/test_cpu/test_load_awq_gptq.py: 
-------------------------------------------------------------------------------- 1 | import shutil 2 | import sys 3 | import unittest 4 | 5 | sys.path.insert(0, "../..") 6 | 7 | from transformers import AutoModelForCausalLM, AutoTokenizer 8 | 9 | from transformers import AutoRoundConfig 10 | 11 | 12 | class TestAutoRound(unittest.TestCase): 13 | def model_infer(self, model, tokenizer): 14 | prompts = [ 15 | "Hello,my name is", 16 | # "The president of the United States is", 17 | # "The capital of France is", 18 | # "The future of AI is", 19 | ] 20 | 21 | inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) 22 | 23 | outputs = model.generate( 24 | input_ids=inputs["input_ids"].to(model.device), 25 | attention_mask=inputs["attention_mask"].to(model.device), 26 | do_sample=False, ## change this to follow official usage 27 | max_new_tokens=5 28 | ) 29 | generated_ids = [ 30 | output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs["input_ids"], outputs) 31 | ] 32 | 33 | decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 34 | 35 | for i, prompt in enumerate(prompts): 36 | print(f"Prompt: {prompt}") 37 | print(f"Generated: {decoded_outputs[i]}") 38 | print("-" * 50) 39 | 40 | @classmethod 41 | def tearDownClass(self): 42 | shutil.rmtree("runs", ignore_errors=True) 43 | 44 | def test_load_gptq_no_dummy_gidx_model(self): 45 | model_name = "ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1" 46 | quantization_config = AutoRoundConfig() 47 | with self.assertRaises(NotImplementedError) as cm: 48 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True, 49 | device_map="cpu", 50 | quantization_config=quantization_config) 51 | 52 | def test_load_awq(self): 53 | model_name = "casperhansen/opt-125m-awq" 54 | quantization_config = AutoRoundConfig() 55 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True, 56 | device_map="cpu", 57 | quantization_config=quantization_config) 58 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 59 | self.model_infer(model, tokenizer) 60 | -------------------------------------------------------------------------------- /test/test_cpu/test_low_cpu_mem.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import sys 3 | import os 4 | import unittest 5 | sys.path.insert(0, "../..") 6 | 7 | import torch 8 | from transformers import AutoModelForCausalLM, AutoTokenizer 9 | from auto_round.low_cpu_mem.utils import ( 10 | load_model_with_hooks, 11 | load_empty_model, 12 | get_layers_before_block, 13 | layer_wise_load, 14 | layer_wise_save, 15 | ) 16 | 17 | from auto_round import AutoRound 18 | 19 | 20 | class LLMDataLoader: 21 | def __init__(self): 22 | self.batch_size = 1 23 | 24 | def __iter__(self): 25 | for i in range(2): 26 | yield torch.ones([1, 10], dtype=torch.long) 27 | 28 | class TestLowCPUMem(unittest.TestCase): 29 | @classmethod 30 | def setUpClass(self): 31 | self.model_name = "facebook/opt-125m" 32 | self.saved_path = './test_tmp_saved' 33 | self.ori_model = AutoModelForCausalLM.from_pretrained(self.model_name, trust_remote_code=True) 34 | self.model = load_model_with_hooks(self.model_name, AutoModelForCausalLM, saved_path=self.saved_path, device='cpu') 35 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) 36 | self.llm_dataloader = LLMDataLoader() 37 | 38 | @classmethod 39 | def 
tearDownClass(self): 40 | shutil.rmtree(self.saved_path, ignore_errors=True) 41 | 42 | def test_default(self): 43 | self.assertEqual(self.model.device.type, 'meta') 44 | 45 | layers = get_layers_before_block(self.model) 46 | self.assertEqual(layers[0][0], 'model.decoder.embed_tokens') 47 | 48 | # test get_weight bias 49 | self.assertTrue(torch.equal( 50 | self.model.model.decoder.layers[0].self_attn.k_proj.get_weight(), 51 | self.ori_model.model.decoder.layers[0].self_attn.k_proj.weight, 52 | )) 53 | self.assertTrue(torch.equal( 54 | self.model.model.decoder.layers[0].self_attn.k_proj.get_bias(), 55 | self.ori_model.model.decoder.layers[0].self_attn.k_proj.bias, 56 | )) 57 | 58 | # test hooks 59 | text = ["Hello, my dog is cute"] 60 | input = self.tokenizer(text) 61 | for key in input: 62 | input[key] = torch.tensor(input[key]) 63 | ori_output = self.ori_model(**input) 64 | output = self.model(**input) 65 | self.assertTrue(torch.equal(ori_output[0], output[0])) 66 | 67 | # test save and load 68 | layer_wise_save(self.model, self.saved_path) 69 | state_dict = layer_wise_load(self.saved_path) 70 | self.assertTrue(torch.equal( 71 | state_dict['lm_head.weight'], 72 | self.ori_model.lm_head.weight 73 | )) 74 | 75 | # test layer-wise auto_round 76 | bits, group_size, sym = 4, 128, False 77 | autoround = AutoRound( 78 | self.model, 79 | self.tokenizer, 80 | device='cpu', 81 | bits=bits, 82 | group_size=group_size, 83 | sym=sym, 84 | iters=2, 85 | seqlen=2, 86 | dataset=self.llm_dataloader, 87 | enable_torch_compile=False 88 | ) 89 | autoround.quantize() 90 | 91 | # test block-wise auto_round 92 | self.model = load_empty_model(self.model_name, AutoModelForCausalLM, saved_path=self.saved_path, device='cpu') 93 | bits, group_size, sym = 4, 128, False 94 | autoround = AutoRound( 95 | self.model, 96 | self.tokenizer, 97 | device='cpu', 98 | bits=bits, 99 | group_size=group_size, 100 | sym=sym, 101 | iters=2, 102 | seqlen=2, 103 | dataset=self.llm_dataloader, 104 | low_cpu_mem_usage=True 105 | ) 106 | autoround.quantize() 107 | 108 | if __name__ == "__main__": 109 | unittest.main() -------------------------------------------------------------------------------- /test/test_cpu/test_script.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import unittest 4 | 5 | sys.path.insert(0, "../..") 6 | 7 | 8 | class TestScript(unittest.TestCase): 9 | def test_default(self): 10 | os.system(''' 11 | cd ../.. && 12 | python -m auto_round \ 13 | --iters 2 \ 14 | --deployment_device fake \ 15 | --output_dir ./tmp_script_test''') 16 | 17 | if __name__ == "__main__": 18 | unittest.main() -------------------------------------------------------------------------------- /test/test_cpu/test_utils.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | import sys 3 | sys.path.insert(0, "../..") 4 | import auto_round.utils as auto_round_utils 5 | 6 | class TestPackingWithNumba: 7 | 8 | @patch.object(auto_round_utils, "_is_tbb_installed", lambda: False) 9 | def test_tbb_not_installed(self): 10 | assert auto_round_utils.is_tbb_available() is False, "`is_tbb_available` should return False." 11 | assert auto_round_utils.can_pack_with_numba() is False, "`can_pack_with_numba` should return False."
12 | 13 | @patch.object(auto_round_utils, "_is_tbb_installed", lambda: True) 14 | @patch.object(auto_round_utils, "_is_tbb_configured", lambda: False) 15 | def test_tbb_installed_but_not_configured_right(self): 16 | assert auto_round_utils.is_tbb_available() is False, "`is_tbb_available` should return False." 17 | assert auto_round_utils.can_pack_with_numba() is False, "`can_pack_with_numba` should return False." 18 | 19 | @patch.object(auto_round_utils, "is_numba_available", lambda: False) 20 | def test_numba_not_installed(self): 21 | assert auto_round_utils.can_pack_with_numba() is False, "`can_pack_with_numba` should return False." 22 | -------------------------------------------------------------------------------- /test/test_cpu/test_woq_linear.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import sys 4 | sys.path.insert(0, "../..") 5 | from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear 6 | 7 | 8 | class TestWeightOnlyLinear: 9 | @pytest.mark.parametrize( 10 | "bits, compression_dtype", 11 | [ 12 | (8, torch.int16), 13 | (8, torch.int32), 14 | (8, torch.int64), 15 | (4, torch.int8), 16 | (4, torch.int16), 17 | (4, torch.int32), 18 | (4, torch.int64), 19 | (2, torch.int8), 20 | (2, torch.int16), 21 | (2, torch.int32), 22 | (2, torch.int64), 23 | ], 24 | ) 25 | def test_pack_with_numba(self, bits, compression_dtype): 26 | m = torch.nn.Linear(1024, 512) 27 | dtype = "int" 28 | weight = m.weight.detach() 29 | group_size = 32 30 | origin_shape = weight.shape 31 | from auto_round.data_type.int import quant_tensor_sym 32 | origin_shape = weight.shape 33 | weight = weight.reshape(-1, group_size) 34 | qdq, scale, zp = quant_tensor_sym( weight, -1 35 | ) 36 | int_weight = ( 37 | qdq.div(scale) 38 | .add(zp) 39 | .clamp(0, 2 ** (bits) - 1) 40 | .to(torch.int32) 41 | .reshape(origin_shape) 42 | ) 43 | scale = scale.reshape(origin_shape[0], -1) 44 | zp = zp.reshape(origin_shape[0], -1).to(torch.int32).clamp(0, 2 ** (bits) - 1) 45 | module_with_legacy_pack = WeightOnlyLinear( 46 | in_features=m.in_features, 47 | out_features=m.out_features, 48 | dtype=dtype, 49 | bits=bits, 50 | groupsize=32, 51 | zp=zp is not None, 52 | bias=m.bias is not None, 53 | use_optimum_format=False, 54 | compression_dtype=compression_dtype, 55 | use_legacy_pack=True, 56 | ) 57 | module_with_legacy_pack.pack( 58 | int_weight.clone(), scale.clone(), zp.clone(), m.bias 59 | ) 60 | module_with_new_pack = WeightOnlyLinear( 61 | in_features=m.in_features, 62 | out_features=m.out_features, 63 | dtype=dtype, 64 | bits=bits, 65 | groupsize=32, 66 | zp=zp is not None, 67 | bias=m.bias is not None, 68 | use_optimum_format=False, 69 | compression_dtype=compression_dtype, 70 | use_legacy_pack=False, 71 | ) 72 | module_with_new_pack.pack(int_weight.clone(), scale.clone(), zp.clone(), m.bias) 73 | 74 | assert torch.equal( 75 | module_with_new_pack.qweight, module_with_legacy_pack.qweight 76 | ) 77 | 78 | assert torch.equal(module_with_new_pack.qzeros, module_with_legacy_pack.qzeros) 79 | assert torch.equal(module_with_new_pack.scales, module_with_legacy_pack.scales) 80 | unpacked_int_weight = module_with_new_pack.unpack_tensor( 81 | module_with_legacy_pack.qweight 82 | ) 83 | assert torch.equal(unpacked_int_weight, int_weight) 84 | -------------------------------------------------------------------------------- /test/test_cuda/_test_helpers.py: -------------------------------------------------------------------------------- 1 | 2 | def 
model_infer(model, tokenizer, apply_chat_template=False): 3 | prompts = [ 4 | "Hello,my name is", 5 | # "The president of the United States is", 6 | # "The capital of France is", 7 | # "The future of AI is", 8 | ] 9 | if apply_chat_template: 10 | texts = [] 11 | for prompt in prompts: 12 | messages = [ 13 | {"role": "user", "content": prompt} 14 | ] 15 | text = tokenizer.apply_chat_template( 16 | messages, 17 | tokenize=False, 18 | add_generation_prompt=True 19 | ) 20 | texts.append(text) 21 | prompts = texts 22 | 23 | inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) 24 | 25 | outputs = model.generate( 26 | input_ids=inputs["input_ids"].to(model.device), 27 | attention_mask=inputs["attention_mask"].to(model.device), 28 | do_sample=False, ## change this to follow official usage 29 | max_new_tokens=5 30 | ) 31 | generated_ids = [ 32 | output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs["input_ids"], outputs) 33 | ] 34 | 35 | decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 36 | 37 | for i, prompt in enumerate(prompts): 38 | print(f"Prompt: {prompt}") 39 | print(f"Generated: {decoded_outputs[i]}") 40 | print("-" * 50) 41 | return decoded_outputs[0] 42 | -------------------------------------------------------------------------------- /test/test_cuda/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | # autoawq 3 | # pip install -v git+https://github.com/casper-hansen/AutoAWQ.git --no-build-isolation 4 | auto-gptq 5 | datasets 6 | einops 7 | # gptqmodel>=2.0 8 | # pip install -v git+https://github.com/ModelCloud/GPTQModel.git@v2.2.0 --no-build-isolation 9 | intel-extension-for-pytorch 10 | lm-eval>=0.4.2,<0.5 11 | numpy < 2.0 12 | optimum 13 | pandas 14 | pillow 15 | py-cpuinfo 16 | torch 17 | torchvision 18 | tqdm 19 | transformers==4.51.3 20 | -------------------------------------------------------------------------------- /test/test_cuda/requirements_vlm.txt: -------------------------------------------------------------------------------- 1 | # git+https://github.com/haotian-liu/LLaVA.git@v1.2.2 2 | # pip install git+https://github.com/deepseek-ai/DeepSeek-VL2.git 3 | # pip install -v git+https://github.com/casper-hansen/AutoAWQ.git@v0.2.0 --no-build-isolation 4 | accelerate 5 | # autoawq 6 | bitsandbytes==0.44.0 7 | datasets 8 | einops 9 | flash-attn==2.5.8 10 | intel-extension-for-transformers 11 | lm-eval>=0.4.2,<0.5 12 | numpy < 2.0 13 | optimum 14 | pandas 15 | protobuf==3.20.2 16 | pillow 17 | py-cpuinfo 18 | torch==2.3.0 19 | torchvision 20 | triton==2.3.0 21 | tqdm 22 | transformers==4.45.0 23 | xformers 24 | -------------------------------------------------------------------------------- /test/test_cuda/test_2_3bits.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import shutil 3 | import sys 4 | import unittest 5 | import re 6 | 7 | sys.path.insert(0, "../..") 8 | import torch 9 | import transformers 10 | from transformers import AutoModelForCausalLM, AutoTokenizer 11 | 12 | from auto_round import AutoRound 13 | from auto_round.eval.evaluation import simple_evaluate 14 | from lm_eval.utils import make_table # pylint: disable=E0401 15 | from auto_round.testing_utils import require_autogptq, require_greater_than_050 16 | 17 | 18 | def get_accuracy(data): 19 | match = re.search(r'\|acc\s+\|[↑↓]\s+\|\s+([\d.]+)\|', data) 20 | 21 | if match: 22 | accuracy = float(match.group(1)) 23 | 
return accuracy 24 | else: 25 | return 0.0 26 | 27 | 28 | class TestAutoRound(unittest.TestCase): 29 | @classmethod 30 | def setUpClass(self): 31 | self.save_dir = "./saved" 32 | self.tasks = "lambada_openai" 33 | 34 | @classmethod 35 | def tearDownClass(self): 36 | shutil.rmtree("./saved", ignore_errors=True) 37 | shutil.rmtree("runs", ignore_errors=True) 38 | 39 | @require_autogptq 40 | def test_3bits_autoround(self): 41 | model_name = "/models/opt-125m" 42 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") 43 | tokenizer = AutoTokenizer.from_pretrained(model_name) 44 | autoround = AutoRound(model, tokenizer, bits=3) 45 | autoround.quantize() 46 | 47 | # autoround.save_quantized(self.save_dir, format="auto_gptq", inplace=False) 48 | autoround.save_quantized(self.save_dir, format="auto_round", inplace=False) 49 | model_args = f"pretrained={self.save_dir}" 50 | res = simple_evaluate(model="hf", model_args=model_args, 51 | # tasks="arc_easy", 52 | tasks=self.tasks, 53 | batch_size="auto") 54 | 55 | ## 0.2529 56 | accuracy = res['results']['lambada_openai']['acc,none'] 57 | assert accuracy > 0.3 58 | shutil.rmtree("./saved", ignore_errors=True) 59 | 60 | def test_3bits_asym_autoround(self): 61 | model_name = "/models/opt-125m" 62 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") 63 | tokenizer = AutoTokenizer.from_pretrained(model_name) 64 | bits, sym = 3, False 65 | autoround = AutoRound(model, tokenizer, bits=bits, sym=sym) 66 | autoround.quantize() 67 | autoround.save_quantized(self.save_dir, format="auto_round", inplace=False) 68 | model_args = f"pretrained={self.save_dir}" 69 | res = simple_evaluate(model="hf", model_args=model_args, 70 | # tasks="arc_easy", 71 | tasks=self.tasks, 72 | batch_size="auto") 73 | 74 | ## 0.3423 75 | accuracy = res['results']['lambada_openai']['acc,none'] 76 | assert accuracy > 0.32 77 | shutil.rmtree("./saved", ignore_errors=True) 78 | 79 | @require_greater_than_050 80 | def test_norm_bias_tuning(self): 81 | model_name = "/models/opt-125m" 82 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") 83 | tokenizer = AutoTokenizer.from_pretrained(model_name) 84 | autoround = AutoRound(model, tokenizer, bits=2, group_size=64, enable_norm_bias_tuning=True) 85 | autoround.quantize() 86 | 87 | ##test auto_round format 88 | autoround.save_quantized(self.save_dir, format="auto_round", inplace=False) 89 | model_args = f"pretrained={self.save_dir}" 90 | res = simple_evaluate(model="hf", model_args=model_args, 91 | tasks=self.tasks, 92 | batch_size="auto") 93 | res = make_table(res) ##0.2212 0.1844 94 | accuracy = get_accuracy(res) 95 | assert accuracy > 0.18 96 | shutil.rmtree("./saved", ignore_errors=True) 97 | 98 | @require_greater_than_050 99 | def test_2bits_autoround(self): 100 | model_name = "/models/opt-125m" 101 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") 102 | tokenizer = AutoTokenizer.from_pretrained(model_name) 103 | autoround = AutoRound(model, tokenizer, bits=2, group_size=64) 104 | autoround.quantize() 105 | 106 | ##test auto_round format 107 | autoround.save_quantized(self.save_dir, format="auto_round", inplace=False) 108 | model_args = f"pretrained={self.save_dir}" 109 | res = simple_evaluate(model="hf", model_args=model_args, 110 | tasks=self.tasks, 111 | batch_size="auto") 112 | res = make_table(res) ##0.1985 113 | accuracy = 
get_accuracy(res) 114 | assert accuracy > 0.18 115 | shutil.rmtree("./saved", ignore_errors=True) 116 | 117 | 118 | autoround.save_quantized(self.save_dir, format="auto_gptq", inplace=False) 119 | model_args = f"pretrained={self.save_dir}" 120 | res = simple_evaluate(model="hf", model_args=model_args, 121 | tasks=self.tasks, 122 | batch_size="auto") 123 | res = make_table(res) ##0.1985 124 | accuracy = get_accuracy(res) 125 | assert accuracy > 0.18 126 | shutil.rmtree("./saved", ignore_errors=True) 127 | 128 | if __name__ == "__main__": 129 | unittest.main() 130 | -------------------------------------------------------------------------------- /test/test_cuda/test_calib_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | import json 8 | 9 | import torch 10 | from transformers import AutoModelForCausalLM, AutoTokenizer 11 | 12 | from auto_round import AutoRound 13 | 14 | 15 | class TestLocalCalibDataset(unittest.TestCase): 16 | @classmethod 17 | def setUpClass(self): 18 | json_data = [{"text": "awefdsfsddfd"}, {"text": "fdfdfsdfdfdfd"}, {"text": "dfdsfsdfdfdfdf"}] 19 | os.makedirs("./saved", exist_ok=True) 20 | self.json_file = "./saved/tmp.json" 21 | with open(self.json_file, "w") as json_file: 22 | json.dump(json_data, json_file, indent=4) 23 | 24 | jsonl_data = [{"text": "哈哈,開心點"}, {"text": "hello world"}] 25 | os.makedirs("./saved", exist_ok=True) 26 | self.jsonl_file = "./saved/tmp.jsonl" 27 | with open(self.jsonl_file, "w") as jsonl_file: 28 | for item in jsonl_data: 29 | json.dump(item, jsonl_file, ensure_ascii=False) 30 | jsonl_file.write('\n') 31 | 32 | model_name = "facebook/opt-125m" 33 | self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) 34 | self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 35 | 36 | def test_combine_dataset(self): 37 | dataset = ( 38 | "NeelNanda/pile-10k" + ",codeparrot/github-code-clean" + ",BAAI/CCI3-HQ" + ",madao33/new-title-chinese") 39 | bits, group_size, sym = 4, 128, True 40 | autoround = AutoRound( 41 | self.model, self.tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=128, dataset=dataset 42 | ) 43 | autoround.quantize() 44 | 45 | 46 | if __name__ == "__main__": 47 | unittest.main() 48 | -------------------------------------------------------------------------------- /test/test_cuda/test_conv1d.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | import torch 8 | from transformers import AutoModelForCausalLM, AutoTokenizer 9 | 10 | from auto_round import AutoRound 11 | from auto_round.testing_utils import require_gptqmodel 12 | from _test_helpers import model_infer 13 | class LLMDataLoader: 14 | def __init__(self): 15 | self.batch_size = 1 16 | 17 | def __iter__(self): 18 | for i in range(2): 19 | yield torch.ones([1, 10], dtype=torch.long) 20 | 21 | 22 | class TestQuantizationConv1d(unittest.TestCase): 23 | @classmethod 24 | def setUpClass(self): 25 | self.model_name = "MBZUAI/LaMini-GPT-124M" 26 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) 27 | self.llm_dataloader = LLMDataLoader() 28 | 29 | @classmethod 30 | def tearDownClass(self): 31 | shutil.rmtree("./saved", ignore_errors=True) 32 | shutil.rmtree("runs", 
ignore_errors=True) 33 | 34 | @require_gptqmodel 35 | def test_quant(self): 36 | self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) 37 | bits, group_size, sym = 4, 128, True 38 | from auto_round import AutoRoundConfig 39 | autoround = AutoRound( 40 | self.model, 41 | self.tokenizer, 42 | bits=bits, 43 | group_size=group_size, 44 | sym=sym, 45 | iters=2, 46 | seqlen=2, 47 | dataset=self.llm_dataloader, 48 | 49 | ) 50 | 51 | autoround.quantize() 52 | autoround.save_quantized("./saved") 53 | 54 | model = AutoModelForCausalLM.from_pretrained("./saved", device_map="cuda", trust_remote_code=True) 55 | model_infer(model, self.tokenizer) 56 | 57 | 58 | 59 | if __name__ == "__main__": 60 | unittest.main() 61 | -------------------------------------------------------------------------------- /test/test_cuda/test_multiple_card_calib.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | import shutil 5 | import unittest 6 | 7 | sys.path.insert(0, "../..") 8 | 9 | from auto_round.testing_utils import multi_card 10 | 11 | def get_accuracy(data): 12 | match = re.search(r'\|acc\s+\|[↑↓]\s+\|\s+([\d.]+)\|', data) 13 | 14 | if match: 15 | accuracy = float(match.group(1)) 16 | return accuracy 17 | else: 18 | return 0.0 19 | 20 | 21 | class TestAutoRound(unittest.TestCase): 22 | @classmethod 23 | def setUpClass(self): 24 | self.save_dir = "./saved" 25 | self.tasks = "lambada_openai" 26 | 27 | @classmethod 28 | def tearDownClass(self): 29 | shutil.rmtree("./saved", ignore_errors=True) 30 | shutil.rmtree("runs", ignore_errors=True) 31 | 32 | @multi_card 33 | def test_multiple_card_calib(self): 34 | python_path = sys.executable 35 | 36 | ##test llm script 37 | res = os.system( 38 | f"cd ../.. 
&& {python_path} -m auto_round --model /models/Meta-Llama-3.1-8B-Instruct --devices '0,1' --quant_lm_head --disable_eval --iters 1 --nsamples 1 --output_dir None") 39 | if res > 0 or res == -1: 40 | assert False, "cmd line test fail, please have a check" 41 | 42 | 43 | if __name__ == "__main__": 44 | unittest.main() 45 | 46 | 47 | -------------------------------------------------------------------------------- /test/test_xpu/test_autoround.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import shutil 3 | import sys 4 | import unittest 5 | 6 | sys.path.insert(0, "../..") 7 | import torch 8 | import transformers 9 | from transformers import AutoModelForCausalLM, AutoTokenizer 10 | 11 | from auto_round import AutoRoundConfig 12 | from auto_round import AutoRound 13 | 14 | class LLMDataLoader: 15 | def __init__(self): 16 | self.batch_size = 1 17 | 18 | def __iter__(self): 19 | for i in range(3): 20 | yield torch.ones([1, 10], dtype=torch.long) 21 | 22 | 23 | class TestAutoRoundXPU(unittest.TestCase): 24 | @classmethod 25 | def setUpClass(self): 26 | 27 | self.llm_dataloader = LLMDataLoader() 28 | 29 | @classmethod 30 | def tearDownClass(self): 31 | shutil.rmtree("./saved", ignore_errors=True) 32 | shutil.rmtree("runs", ignore_errors=True) 33 | pass 34 | 35 | def test_gptq_format(self): 36 | model_name = "facebook/opt-125m" 37 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True, 38 | device_map="auto") 39 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 40 | bits, group_size, sym = 4, 128, True 41 | 42 | autoround = AutoRound( 43 | model, 44 | tokenizer, 45 | bits=bits, 46 | group_size=group_size, 47 | sym=sym, 48 | iters=2, 49 | seqlen=2, 50 | dataset=self.llm_dataloader, 51 | ) 52 | quantized_model_path = "./saved" 53 | autoround.quantize_and_save(output_dir=quantized_model_path) 54 | 55 | from auto_round import AutoRoundConfig 56 | quantization_config = AutoRoundConfig( 57 | backend="auto" 58 | ) 59 | 60 | model = AutoModelForCausalLM.from_pretrained(quantized_model_path, 61 | device_map="auto", quantization_config=quantization_config) 62 | tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) 63 | text = "There is a girl who likes adventure," 64 | inputs = tokenizer(text, return_tensors="pt").to(model.device) 65 | res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) 66 | print(res) 67 | assert ("!!!" 
not in res) 68 | 69 | 70 | 71 | def test_awq_format(self): 72 | model_name = "facebook/opt-125m" 73 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True, 74 | device_map="xpu") 75 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 76 | bits, group_size, sym = 4, 128, True 77 | autoround = AutoRound( 78 | model, 79 | tokenizer, 80 | bits=bits, 81 | group_size=group_size, 82 | sym=sym, 83 | iters=2, 84 | seqlen=2, 85 | dataset=self.llm_dataloader, 86 | ) 87 | quantized_model_path = "./saved" 88 | autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq") 89 | 90 | quantized_model_path = "./saved" 91 | from auto_round import AutoRoundConfig 92 | quantization_config = AutoRoundConfig( 93 | backend="auto" 94 | ) 95 | 96 | model = AutoModelForCausalLM.from_pretrained(quantized_model_path, 97 | device_map="auto", quantization_config=quantization_config) 98 | tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) 99 | text = "There is a girl who likes adventure," 100 | inputs = tokenizer(text, return_tensors="pt").to(model.device) 101 | res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) 102 | print(res) 103 | assert ("!!!" not in res) 104 | 105 | if __name__ == "__main__": 106 | unittest.main() -------------------------------------------------------------------------------- /third-party-programs.txt: -------------------------------------------------------------------------------- 1 | Third Party Programs File 2 | 3 | This file contains the list of third party software ("third party programs") 4 | contained in the Intel software and their required notices and/or license 5 | terms. This third party software, even if included with the distribution of 6 | the Intel software, may be governed by separate license terms, including 7 | without limitation, third party license terms, other Intel software license 8 | terms, and open source software license terms. These separate license terms 9 | govern your use of the third party programs as set forth in in the 10 | "THIRD-PARTY-PROGRAMS" file. 11 | 12 | Third party programs and their corresponding required notices and/or license terms are listed 13 | below. 14 | =============================================================================== 15 | 1. Pytorch 16 | 17 | From PyTorch: 18 | 19 | Copyright (c) 2016- Facebook, Inc (Adam Paszke) 20 | Copyright (c) 2014- Facebook, Inc (Soumith Chintala) 21 | Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) 22 | Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) 23 | Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) 24 | Copyright (c) 2011-2013 NYU (Clement Farabet) 25 | Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) 26 | Copyright (c) 2006 Idiap Research Institute (Samy Bengio) 27 | Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) 28 | 29 | From Caffe2: 30 | 31 | Copyright (c) 2016-present, Facebook Inc. All rights reserved. 32 | 33 | All contributions by Facebook: 34 | Copyright (c) 2016 Facebook Inc. 35 | 36 | All contributions by Google: 37 | Copyright (c) 2015 Google Inc. 38 | All rights reserved. 39 | 40 | All contributions by Yangqing Jia: 41 | Copyright (c) 2015 Yangqing Jia 42 | All rights reserved. 
43 | 44 | All contributions by Kakao Brain: 45 | Copyright 2019-2020 Kakao Brain 46 | 47 | All contributions by Cruise LLC: 48 | Copyright (c) 2022 Cruise LLC. 49 | All rights reserved. 50 | 51 | All contributions from Caffe: 52 | Copyright(c) 2013, 2014, 2015, the respective contributors 53 | All rights reserved. 54 | 55 | All other contributions: 56 | Copyright(c) 2015, 2016 the respective contributors 57 | All rights reserved. 58 | 59 | Caffe2 uses a copyright model similar to Caffe: each contributor holds 60 | copyright over their contributions to Caffe2. The project versioning records 61 | all such contribution and copyright details. If a contributor wants to further 62 | mark their specific copyright on a particular contribution, they should 63 | indicate their copyright solely in the commit message of the change when it is 64 | committed. 65 | 66 | All rights reserved. 67 | 68 | Redistribution and use in source and binary forms, with or without 69 | modification, are permitted provided that the following conditions are met: 70 | 71 | 1. Redistributions of source code must retain the above copyright 72 | notice, this list of conditions and the following disclaimer. 73 | 74 | 2. Redistributions in binary form must reproduce the above copyright 75 | notice, this list of conditions and the following disclaimer in the 76 | documentation and/or other materials provided with the distribution. 77 | 78 | 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America 79 | and IDIAP Research Institute nor the names of its contributors may be 80 | used to endorse or promote products derived from this software without 81 | specific prior written permission. 82 | 83 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 84 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 85 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 86 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 87 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 88 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 89 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 90 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 91 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 92 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 93 | POSSIBILITY OF SUCH DAMAGE. 94 | 95 | 96 | =============================================================================== 97 | 2. lm-evaluation-harness 98 | Copyright (c) 2020 EleutherAI 99 | 100 | AutoGPTQ 101 | Copyright (c) 2023 潘其威(William) 102 | 103 | MIT License 104 | 105 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 106 | 107 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
108 | 109 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 110 | 111 | 112 | =============================================================================== --------------------------------------------------------------------------------