├── .azure-pipelines ├── docker │ └── Dockerfile.devel ├── model-test.yml ├── scripts │ ├── change_color.sh │ ├── codeScan │ │ └── codespell │ │ │ └── nc_dict.txt │ ├── install_nc.sh │ ├── models │ │ ├── collect_results.py │ │ ├── env_setup.sh │ │ ├── generate_report.py │ │ ├── run_onnxrt_llm_models_trigger.sh │ │ ├── run_onnxrt_models_trigger.sh │ │ ├── summarize_results.py │ │ └── templates │ │ │ └── model.jinja2 │ └── ut │ │ ├── collect_log.sh │ │ ├── compare_coverage.sh │ │ ├── coverage.ort │ │ └── run_ort.sh ├── template │ ├── docker-template.yml │ ├── model-template.yml │ └── ut-template.yml └── ut-3x-ort.yml ├── .github ├── license_template.txt ├── pull_request_template.md └── workflows │ └── lint.yaml ├── .gitignore ├── .lintrunner.toml ├── LICENSE ├── README.md ├── SECURITY.md ├── docs ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── SECURITY.md ├── autotune.md ├── calibration.md ├── design.md ├── imgs │ ├── architecture.png │ ├── common │ │ ├── code.svg │ │ └── right.svg │ ├── lwq_ort.png │ ├── smoothquant.png │ ├── sq_convert.png │ ├── sq_pc.png │ └── workflow.png ├── installation_guide.md ├── quantization.md ├── quantization_layer_wise.md ├── quantization_weight_only.md └── smooth_quant.md ├── examples ├── .config │ └── model_params_onnxrt.json ├── image_recognition │ └── resnet50 │ │ └── quantization │ │ └── ptq_static │ │ ├── README.md │ │ ├── main.py │ │ ├── prepare_model.py │ │ ├── requirements.txt │ │ ├── run_benchmark.sh │ │ └── run_quant.sh └── nlp │ ├── bert │ └── quantization │ │ ├── ptq_dynamic │ │ ├── README.md │ │ ├── main.py │ │ ├── prepare_data.sh │ │ ├── prepare_model.py │ │ ├── requirements.txt │ │ ├── run_benchmark.sh │ │ └── run_quant.sh │ │ └── ptq_static │ │ ├── README.md │ │ ├── main.py │ │ ├── prepare_data.sh │ │ ├── prepare_model.py │ │ ├── requirements.txt │ │ ├── run_benchmark.sh │ │ └── run_quant.sh │ └── huggingface_model │ ├── text_generation │ └── quantization │ │ └── weight_only │ │ ├── README.md │ │ ├── evaluation │ │ ├── __init__.py │ │ ├── accuracy.py │ │ ├── evaluator.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ └── huggingface.py │ │ └── utils.py │ │ ├── main.py │ │ ├── prepare_model.py │ │ ├── requirements.txt │ │ ├── run_benchmark.sh │ │ └── run_quant.sh │ └── text_to_image │ └── stable_diffusion_v1_5 │ └── quantization │ └── ptq_static │ ├── README.md │ ├── imgs │ ├── fp32.png │ └── int8.png │ ├── main.py │ ├── requirements.txt │ ├── run_benchmark.sh │ └── run_quant.sh ├── onnx_neural_compressor ├── __init__.py ├── algorithms │ ├── __init__.py │ ├── layer_wise │ │ ├── __init__.py │ │ └── core.py │ ├── post_training_quant │ │ ├── __init__.py │ │ ├── calibrate.py │ │ ├── calibrator.py │ │ ├── operators │ │ │ ├── __init__.py │ │ │ ├── activation.py │ │ │ ├── argmax.py │ │ │ ├── attention.py │ │ │ ├── base_op.py │ │ │ ├── binary_op.py │ │ │ ├── concat.py │ │ │ ├── conv.py │ │ │ ├── direct_q8.py │ │ │ ├── embed_layernorm.py │ │ │ ├── gather.py │ │ │ ├── gavgpool.py │ │ │ ├── gemm.py │ │ │ ├── lstm.py │ │ │ ├── matmul.py │ │ │ ├── maxpool.py │ │ │ ├── pad.py │ │ │ ├── pooling.py │ │ │ ├── reduce.py │ │ │ ├── resize.py │ │ │ ├── split.py │ │ │ └── unary_op.py │ │ └── quantizer.py │ ├── smoother │ │ ├── __init__.py │ │ ├── calibrator.py │ │ └── core.py │ ├── utility.py │ └── weight_only │ │ ├── __init__.py │ │ ├── awq.py │ │ ├── gptq.py │ │ └── rtn.py ├── constants.py ├── data_reader.py ├── logger.py ├── onnx_model.py ├── quantization │ ├── __init__.py │ ├── algorithm_entry.py │ ├── config.py │ ├── matmul_4bits_quantizer.py │ ├── 
matmul_nbits_quantizer.py │ ├── quant_utils.py │ ├── quantize.py │ └── tuning.py ├── utility.py └── version.py ├── pyproject.toml ├── requirements-lintrunner.txt ├── requirements.txt ├── setup.py └── test ├── quantization ├── layer_wise │ └── test_layer_wise.py ├── post_training_quant │ ├── test_calibrate.py │ ├── test_operators.py │ ├── test_post_training_quant.py │ └── test_quant_utils.py ├── test_algorithm_utility.py ├── test_autotune.py ├── test_config.py ├── test_smooth_quant.py └── weight_only │ ├── test_awq.py │ ├── test_gptq.py │ └── test_rtn.py ├── requirements.txt └── utils ├── test_general.py ├── test_logger.py ├── test_onnx_model.py ├── test_param.py └── test_utility.py /.azure-pipelines/docker/Dockerfile.devel: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ARG UBUNTU_VER=22.04 16 | FROM ubuntu:${UBUNTU_VER} as devel 17 | 18 | # See http://bugs.python.org/issue19846 19 | ENV LANG C.UTF-8 20 | 21 | RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \ 22 | python3 \ 23 | python3-pip \ 24 | python3-dev \ 25 | python3-distutils \ 26 | autoconf \ 27 | build-essential \ 28 | git \ 29 | libgl1-mesa-glx \ 30 | libglib2.0-0 \ 31 | numactl \ 32 | time \ 33 | wget \ 34 | bc \ 35 | jq \ 36 | vim 37 | 38 | RUN ln -sf $(which python3) /usr/bin/python 39 | 40 | RUN python -m pip --no-cache-dir install --upgrade pip 41 | RUN python -m pip install --no-cache-dir setuptools 42 | 43 | RUN pip list 44 | 45 | WORKDIR / 46 | 47 | -------------------------------------------------------------------------------- /.azure-pipelines/model-test.yml: -------------------------------------------------------------------------------- 1 | trigger: none 2 | 3 | pr: 4 | autoCancel: true 5 | drafts: false 6 | branches: 7 | include: 8 | - main 9 | paths: 10 | include: 11 | - onnx_neural_compressor 12 | - setup.py 13 | - requirements.txt 14 | - .azure-pipelines/scripts/models 15 | - .azure-pipelines/model-test.yml 16 | - .azure-pipelines/template/model-template.yml 17 | exclude: 18 | - test 19 | 20 | variables: 21 | OUT_SCRIPT_PATH: $(Build.SourcesDirectory)/.azure-pipelines/scripts/models 22 | SCRIPT_PATH: /neural_compressor/.azure-pipelines/scripts 23 | 24 | parameters: 25 | - name: algorithms 26 | type: object 27 | default: 28 | - SQ 29 | - WOQ 30 | - name: models 31 | type: object 32 | default: 33 | - bert_base_MRPC 34 | - bert_base_MRPC_dynamic 35 | - resnet50-v1-12_qdq 36 | - resnet50-v1-12 37 | 38 | stages: 39 | # - stage: ONNX_LLM_Models 40 | # displayName: Run ONNX LLM Model 41 | # pool: ICX-16C 42 | # dependsOn: [] 43 | # jobs: 44 | # - ${{ each algorithm in parameters.algorithms }}: 45 | # - job: 46 | # steps: 47 | # - template: template/model-template.yml 48 | # parameters: 49 | # modelName: "facebook/opt-125m" 50 | # algorithm: "${{ algorithm }}" 51 | # script_path: "run_onnxrt_llm_models_trigger.sh" 52 | 
53 | - stage: ONNX_Models 54 | displayName: Run ONNX Model 55 | pool: MODEL_PERF_TEST 56 | dependsOn: [] 57 | jobs: 58 | - ${{ each model in parameters.models }}: 59 | - job: 60 | displayName: ${{ model }} 61 | steps: 62 | - template: template/model-template.yml 63 | parameters: 64 | modelName: "${{ model }}" 65 | algorithm: "Quantize" 66 | script_path: "run_onnxrt_models_trigger.sh" 67 | 68 | - stage: GenerateLogs 69 | displayName: Generate Report 70 | pool: 71 | vmImage: "ubuntu-latest" 72 | dependsOn: [ONNX_Models] 73 | jobs: 74 | - job: GenerateReport 75 | steps: 76 | - script: | 77 | echo ${BUILD_SOURCESDIRECTORY} 78 | rm -fr ${BUILD_SOURCESDIRECTORY} || sudo rm -fr ${BUILD_SOURCESDIRECTORY} || true 79 | echo y | docker system prune 80 | displayName: "Clean workspace" 81 | - checkout: self 82 | clean: true 83 | displayName: "Checkout out Repo" 84 | - task: DownloadPipelineArtifact@2 85 | inputs: 86 | artifact: 87 | patterns: "**/result.json" 88 | path: $(OUT_SCRIPT_PATH) 89 | - task: UsePythonVersion@0 90 | displayName: "Use Python 3.10" 91 | inputs: 92 | versionSpec: "3.10" 93 | - script: | 94 | cd ${OUT_SCRIPT_PATH} 95 | mkdir generated last_generated 96 | python -u summarize_results.py --logs_dir $(OUT_SCRIPT_PATH) --output_dir generated 97 | displayName: "Summarize all results" 98 | - task: DownloadPipelineArtifact@2 99 | continueOnError: true 100 | inputs: 101 | source: "specific" 102 | artifact: "FinalReport" 103 | patterns: "**.json" 104 | path: $(OUT_SCRIPT_PATH)/last_generated 105 | project: $(System.TeamProject) 106 | pipeline: "onc model test" 107 | runVersion: "specific" 108 | runId: $(refer_buildId) 109 | displayName: "Download last logs" 110 | - script: | 111 | echo "------ Generating final report.html ------" 112 | cd ${OUT_SCRIPT_PATH} 113 | pip install jinja2 114 | python generate_report.py --json_path generated/summary.json --last_json_path last_generated/summary.json 115 | displayName: "Generate report" 116 | - task: PublishPipelineArtifact@1 117 | inputs: 118 | targetPath: $(OUT_SCRIPT_PATH)/generated 119 | artifact: FinalReport 120 | publishLocation: "pipeline" 121 | displayName: "Publish report" 122 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/change_color.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------- general approach start---------------- 4 | 5 | # 1. import this file: 6 | # source path/change_color.sh 7 | # 2. use COLOR/BG: 8 | # $VARIABLE_NAME && out_put_content && $RESET 9 | # 3. COLOR + BG: 10 | # $COLOR/BG_VARIABLE_NAME && $BG/COLOR_VARIABLE_NAME && out_put_content && $RESET 11 | # 4. 
custom 12 | # abbreviation(change number) 13 | # txt number range (30, 37) 14 | # bg number range (40, 47) 15 | # special effects number range (1, 7) 16 | # echo -en \\E[number1 + ; + number2 + ; + number3 + m" 17 | # e.g - BG_GRAY+LIGHT_RED = "echo -en \\E[47;31m" 18 | 19 | # -------------- general approach end----------------== 20 | 21 | 22 | # general setting 23 | # ------------- light_color start---------------- 24 | # black 25 | LIGHT_BLACK="echo -en \\E[30m" 26 | # red 27 | LIGHT_RED="echo -en \\E[31m" 28 | # green 29 | LIGHT_GREEN="echo -en \\E[32m" 30 | # yellow 31 | LIGHT_YELLOW="echo -en \\E[33m" 32 | # blue 33 | LIGHT_BLUE="echo -en \\E[34m" 34 | # purple 35 | LIGHT_PURPLE="echo -en \\E[35m" 36 | # cyan 37 | LIGHT_CYAN="echo -en \\E[36m" 38 | # gray 39 | LIGHT_GRAY="echo -en \\E[37m" 40 | # ------------- light_color end---------------- 41 | 42 | # ------------- bold_color start---------------- 43 | # black 44 | BOLD_BLACK="echo -en \\E[1;30m" 45 | # red 46 | BOLD_RED="echo -en \\E[1;31m" 47 | # green 48 | BOLD_GREEN="echo -en \\E[1;32m" 49 | # yellow 50 | BOLD_YELLOW="echo -en \\E[1;33m" 51 | # blue 52 | BOLD_BLUE="echo -en \\E[1;34m" 53 | # purple 54 | BOLD_PURPLE="echo -en \\E[1;35m" 55 | # cyan 56 | BOLD_CYAN="echo -en \\E[1;36m" 57 | # gray 58 | BOLD_GRAY="echo -en \\E[1;37m" 59 | # ------------- bold_color end---------------- 60 | 61 | # ------------- background_color start---------------- 62 | # black 63 | BG_BLACK="echo -en \\E[40m" 64 | # red 65 | BG_RED="echo -en \\E[41m" 66 | # green 67 | BG_GREEN="echo -en \\E[42m" 68 | # yellow 69 | BG_YELLOW="echo -en \\E[43m" 70 | # blue 71 | BG_BLUE="echo -en \\E[44m" 72 | # purple 73 | BG_PURPLE="echo -en \\E[45m" 74 | # cyan 75 | BG_CYAN="echo -en \\E[46m" 76 | # gray 77 | BG_GRAY="echo -en \\E[47m" 78 | # ------------- background_color end---------------- 79 | 80 | # close 81 | RESET="echo -en \\E[0m" 82 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/codeScan/codespell/nc_dict.txt: -------------------------------------------------------------------------------- 1 | datas 2 | nd 3 | ot 4 | ue 5 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/install_nc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo -e "\n Install Neural Compressor ... " 4 | cd /neural-compressor 5 | 6 | python -m pip install --no-cache-dir -r requirements.txt 7 | python setup.py bdist_wheel 8 | pip install dist/onnx_neural_compressor*.whl --force-reinstall 9 | 10 | echo -e "\n pip list after install Neural Compressor ... 
" 11 | pip list 12 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/models/collect_results.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--model", required=True, type=str) 8 | parser.add_argument("--build_id", required=True, type=str) 9 | args = parser.parse_args() 10 | 11 | URL = ( 12 | "https://dev.azure.com/lpot-inc/onnx-neural-compressor/_build/results?buildId=" 13 | + args.build_id 14 | + "&view=artifacts&pathAsName=false&type=publishedArtifacts" 15 | ) 16 | REFER_SUMMARY_PATH = "/neural-compressor/.azure-pipelines/scripts/models/summary.json" 17 | 18 | 19 | def str_to_float(value): 20 | try: 21 | return round(float(value), 4) 22 | except ValueError: 23 | return value 24 | 25 | 26 | def get_refer_data(): 27 | if not os.path.exists(REFER_SUMMARY_PATH): 28 | print(f"The file '{REFER_SUMMARY_PATH}' does not exist.") 29 | return {} 30 | 31 | with open(REFER_SUMMARY_PATH, "r") as file: 32 | refer = json.load(file) 33 | return refer 34 | 35 | 36 | def check_status(performance, accuracy): 37 | refer = get_refer_data() 38 | 39 | refer_accuracy = refer.get(args.model, {}).get("accuracy", {}).get("value", "N/A") 40 | refer_performance = refer.get(args.model, {}).get("performance", {}).get("value", "N/A") 41 | print(f"{accuracy=}\n{refer_accuracy=}\n{performance=}\n{refer_performance=}") 42 | 43 | assert accuracy != "N/A" and performance != "N/A" 44 | if refer_accuracy != "N/A": 45 | assert abs(accuracy - refer_accuracy) <= 0.001 46 | if refer_performance != "N/A": 47 | assert (refer_performance - performance) / refer_performance <= 0.08 48 | 49 | 50 | def main(): 51 | result_dict = { 52 | args.model: { 53 | "performance": {"value": "N/A", "log_path": URL}, 54 | "accuracy": {"value": "N/A", "log_path": URL}, 55 | } 56 | } 57 | 58 | pattern = { 59 | "performance": r"Throughput: ([\d.]+)", 60 | "accuracy": r"Accuracy: ([\d.]+)", 61 | } 62 | 63 | for mode, _ in result_dict[args.model].items(): 64 | log_file = f"/neural-compressor/.azure-pipelines/scripts/models/{args.model}/{mode}.log" 65 | if not os.path.exists(log_file): 66 | print(f"The file '{log_file}' does not exist.") 67 | continue 68 | 69 | with open(log_file, "r") as file: 70 | log_content = file.read() 71 | 72 | match = re.search(pattern[mode], log_content) 73 | 74 | if match: 75 | result_dict[args.model][mode]["value"] = str_to_float(match.group(1)) 76 | 77 | with open(f"/neural-compressor/.azure-pipelines/scripts/models/{args.model}/result.json", "w") as json_file: 78 | json.dump(result_dict, json_file, indent=4) 79 | 80 | check_status(result_dict[args.model]["performance"]["value"], result_dict[args.model]["accuracy"]["value"]) 81 | 82 | 83 | if __name__ == "__main__": 84 | main() 85 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/models/env_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | PATTERN='[-a-zA-Z0-9_]*=' 4 | 5 | for i in "$@"; do 6 | case $i in 7 | --model=*) 8 | model=${i//${PATTERN}/} 9 | ;; 10 | *) 11 | echo "Parameter $i not recognized." 
12 | exit 1 13 | ;; 14 | esac 15 | done 16 | 17 | CONFIG_PATH="/neural-compressor/examples/.config/model_params_onnxrt.json" 18 | model_src_dir=$(jq -r ".\"onnxrt\".\"$model\".\"model_src_dir\"" "$CONFIG_PATH") 19 | 20 | log_dir="/neural-compressor/.azure-pipelines/scripts/models" 21 | 22 | $BOLD_YELLOW && echo "======= creat log_dir =========" && $RESET 23 | if [ -d "${log_dir}/${model}" ]; then 24 | $BOLD_GREEN && echo "${log_dir}/${model} already exists, don't need to mkdir." && $RESET 25 | else 26 | $BOLD_GREEN && echo "no log dir ${log_dir}/${model}, create." && $RESET 27 | cd "${log_dir}" 28 | mkdir "${model}" 29 | fi 30 | 31 | $BOLD_YELLOW && echo "====== install ONC ======" && $RESET 32 | cd /neural-compressor 33 | source .azure-pipelines/scripts/change_color.sh 34 | /bin/bash .azure-pipelines/scripts/install_nc.sh 35 | 36 | $BOLD_YELLOW && echo "====== install requirements ======" && $RESET 37 | cd "/neural-compressor/examples/$model_src_dir" 38 | pip install -r requirements.txt 39 | pip list 40 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/models/generate_report.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | from jinja2 import Environment, FileSystemLoader 6 | 7 | parser = argparse.ArgumentParser(allow_abbrev=False) 8 | parser.add_argument("--json_path", type=str, required=True) 9 | parser.add_argument("--last_json_path", type=str, required=True) 10 | args = parser.parse_args() 11 | 12 | 13 | def get_data(json_path): 14 | """ 15 | { 16 | model: { 17 | "performance": {"value": "N/A"|number, "log_path": string}, 18 | "accuracy": {"value": "N/A"|number, "log_path": string}, 19 | } 20 | } 21 | """ 22 | if os.path.exists(json_path): 23 | with open(json_path, "r") as f: 24 | return json.load(f) 25 | else: 26 | return {} 27 | 28 | 29 | def get_ratio(cur, last): 30 | if cur == "N/A" or last == "N/A": 31 | ratio = "N/A" 32 | else: 33 | ratio = (float(cur) - float(last)) / float(last) * 100 34 | ratio = round(float(ratio), 2) 35 | return ratio 36 | 37 | 38 | def get_accuracy_ratio(current_json, last_accuracy_dict): 39 | compare_result_dict = [] 40 | for model, item in current_json.items(): 41 | current_accuracy = item.get("accuracy", {}).get("value", "N/A") 42 | last_accuracy = last_accuracy_dict.get(model, {}).get("accuracy", {}).get("value", "N/A") 43 | accuracy_ratio = get_ratio(current_accuracy, last_accuracy) 44 | 45 | current_performance = item.get("performance", {}).get("value", "N/A") 46 | last_performance = last_accuracy_dict.get(model, {}).get("performance", {}).get("value", "N/A") 47 | performance_ratio = get_ratio(current_performance, last_performance) 48 | 49 | if accuracy_ratio == "N/A" or performance_ratio == "N/A": 50 | status = "FAILURE" 51 | elif accuracy_ratio != 0: 52 | status = "FAILURE" 53 | elif performance_ratio > 8 or performance_ratio < -8: 54 | status = "FAILURE" 55 | else: 56 | status = "SUCCESS" 57 | 58 | format_ratio = lambda x: f"{x}%" if x != "N/A" else x 59 | 60 | compare_result_dict.append( 61 | { 62 | "model": model, 63 | "current_accuracy": current_accuracy, 64 | "last_accuracy": last_accuracy, 65 | "accuracy_ratio": format_ratio(accuracy_ratio), 66 | "current_performance": current_performance, 67 | "last_performance": last_performance, 68 | "performance_ratio": format_ratio(performance_ratio), 69 | "status": status, 70 | } 71 | ) 72 | return compare_result_dict 73 | 74 | 75 | def 
generate(rendered_template): 76 | with open("generated/report.html", "w") as html_file: 77 | html_file.write(rendered_template) 78 | 79 | 80 | def main(): 81 | path = "{}/templates/".format(os.path.dirname(__file__)) 82 | BUILD_BUILDID = os.getenv("BUILD_BUILDID") 83 | 84 | loader = FileSystemLoader(path) 85 | env = Environment(loader=loader) 86 | template = env.get_template("model.jinja2") 87 | 88 | data = get_data(args.json_path) 89 | last_data = get_data(args.last_json_path) 90 | data = get_accuracy_ratio(data, last_data) 91 | info = { 92 | "url": f"https://dev.azure.com/lpot-inc/onnx-neural-compressor/_build/results?buildId={BUILD_BUILDID}", 93 | "branch": os.getenv("SYSTEM_PULLREQUEST_SOURCEBRANCH"), 94 | "commit": os.getenv("BUILD_SOURCEVERSION"), 95 | "build_number": BUILD_BUILDID, 96 | } 97 | 98 | rendered_template = template.render(data=data, info=info) 99 | generate(rendered_template) 100 | 101 | 102 | if __name__ == "__main__": 103 | main() 104 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/models/run_onnxrt_llm_models_trigger.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | set -xe 4 | PATTERN='[-a-zA-Z0-9_]*=' 5 | 6 | for i in "$@"; do 7 | case $i in 8 | --stage=*) 9 | stage=${i//${PATTERN}/} 10 | ;; 11 | --model=*) 12 | model=${i//${PATTERN}/} 13 | ;; 14 | *) 15 | echo "Parameter $i not recognized." 16 | exit 1 17 | ;; 18 | esac 19 | done 20 | 21 | model_src_dir=/neural-compressor/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only 22 | dataset_location=NeelNanda/pile-10k 23 | input_model=/tf_dataset2/models/huggingface/opt-125m 24 | batch_size=16 25 | 26 | function run_prepare_model() { 27 | python prepare_model.py --input_model="$input_model" --output_model="./model_export" --task=text-generation-with-past 28 | } 29 | 30 | function run_quantize() { 31 | bash run_quant.sh --input_model="./model_export" \ 32 | --output_model="./model_tune" \ 33 | --batch_size="$batch_size" \ 34 | --dataset="$dataset_location" \ 35 | --tokenizer="$model" \ 36 | --algorithm=WOQ_TUNE 37 | } 38 | 39 | function run_accuracy() { 40 | bash run_benchmark.sh --input_model="./model_tune" \ 41 | --batch_size="$batch_size" \ 42 | --mode=accuracy \ 43 | --tokenizer="$model" \ 44 | --tasks=lambada_openai | tee -a accuracy.log 45 | } 46 | 47 | function main() { 48 | cd "$model_src_dir" 49 | if [ "$stage" == "prepare_model" ]; then 50 | run_prepare_model 51 | elif [ "$stage" == "quantize" ]; then 52 | run_quantize 53 | elif [ "$stage" == "accuracy" ]; then 54 | run_accuracy 55 | else 56 | exit 1 57 | fi 58 | } 59 | 60 | main 61 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/models/run_onnxrt_models_trigger.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | set -xe 4 | PATTERN='[-a-zA-Z0-9_]*=' 5 | 6 | for i in "$@"; do 7 | case $i in 8 | --stage=*) 9 | stage=${i//${PATTERN}/} 10 | ;; 11 | --model=*) 12 | model=${i//${PATTERN}/} 13 | ;; 14 | *) 15 | echo "Parameter $i not recognized." 
16 | exit 1 17 | ;; 18 | esac 19 | done 20 | 21 | log_dir="/neural-compressor/.azure-pipelines/scripts/models/$model" 22 | CONFIG_PATH="/neural-compressor/examples/.config/model_params_onnxrt.json" 23 | model_src_dir=$(jq -r ".\"onnxrt\".\"$model\".\"model_src_dir\"" "$CONFIG_PATH") 24 | if [[ "$model" == *"resnet"* ]]; then 25 | dataset_location="/tf_dataset2/datasets/imagenet/ImagenetRaw/ImagenetRaw_small_5000/ILSVRC2012_img_val" 26 | label_path="/tf_dataset2/datasets/imagenet/ImagenetRaw/ImagenetRaw_small_5000/val.txt" 27 | else 28 | dataset_location=$(jq -r ".\"onnxrt\".\"$model\".\"dataset_location\"" "$CONFIG_PATH") 29 | fi 30 | 31 | input_model=$(jq -r ".\"onnxrt\".\"$model\".\"input_model\"" "$CONFIG_PATH") 32 | 33 | function run_prepare_model() { 34 | if [ -f "$input_model" ]; then 35 | echo "model exists" 36 | else 37 | echo "model not found" && exit 1 38 | fi 39 | } 40 | 41 | function run_quantize() { 42 | if [[ "$model" == "bert_base_MRPC" ]]; then 43 | bash run_quant.sh --input_model="$input_model" \ 44 | --dataset_location="$dataset_location" \ 45 | --label_path="$label_path" \ 46 | --output_model="./model_tune" \ 47 | --quant_format="QDQ" | tee -a "$log_dir/tuning.log" 48 | else 49 | bash run_quant.sh --input_model="$input_model" \ 50 | --dataset_location="$dataset_location" \ 51 | --label_path="$label_path" \ 52 | --output_model="./model_tune" | tee -a "$log_dir/tuning.log" 53 | fi 54 | } 55 | 56 | function run_accuracy() { 57 | bash run_benchmark.sh --input_model="./model_tune" \ 58 | --dataset_location="$dataset_location" \ 59 | --label_path="$label_path" \ 60 | --mode="accuracy" \ 61 | --batch_size="16" | tee -a "$log_dir/accuracy.log" 62 | } 63 | 64 | function run_performance() { 65 | bash run_benchmark.sh --input_model="./model_tune" \ 66 | --dataset_location="$dataset_location" \ 67 | --label_path="$label_path" \ 68 | --mode="performance" \ 69 | --intra_op_num_threads="8" \ 70 | --batch_size="1" | tee -a "$log_dir/performance.log" 71 | } 72 | 73 | function main() { 74 | cd "/neural-compressor/examples/$model_src_dir" 75 | if [ "$stage" == "prepare_model" ]; then 76 | run_prepare_model 77 | elif [ "$stage" == "quantize" ]; then 78 | run_quantize 79 | elif [ "$stage" == "accuracy" ]; then 80 | run_accuracy 81 | elif [ "$stage" == "performance" ]; then 82 | run_performance 83 | else 84 | echo "invalid stage: $stage" && exit 1 85 | fi 86 | } 87 | 88 | main 89 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/models/summarize_results.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--logs_dir", required=True, type=str) 7 | parser.add_argument("--output_dir", required=True, type=str) 8 | args = parser.parse_args() 9 | 10 | 11 | def read_json_file(file_path): 12 | with open(file_path, "r") as file: 13 | return json.load(file) 14 | 15 | 16 | def write_json_file(data, file_path): 17 | with open(file_path, "w") as file: 18 | json.dump(data, file, indent=4) 19 | 20 | 21 | def merge_json_files(root_dir, output_file): 22 | merged_data = {} 23 | 24 | for subdir, _, files in os.walk(root_dir): 25 | for file in files: 26 | if file.endswith(".json"): 27 | file_path = os.path.join(subdir, file) 28 | try: 29 | json_data = read_json_file(file_path) 30 | merged_data.update(json_data) 31 | except json.JSONDecodeError: 32 | print(f"Error decoding JSON from file: {file_path}") 33 | 
34 | print(merged_data) 35 | write_json_file(merged_data, f"{output_file}/summary.json") 36 | 37 | 38 | def main(): 39 | merge_json_files(args.logs_dir, args.output_dir) 40 | print(f"All JSON files have been merged into {args.output_dir}") 41 | 42 | 43 | if __name__ == "__main__": 44 | main() 45 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/models/templates/model.jinja2: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | ONNX Neural Compressor - Model test 9 | 55 | 56 | 57 | 58 |
59 |

Model test 60 | [ Job - {{ info['build_number'] }} ] 61 |

62 | 63 |

Summary

64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 75 | 79 | 80 |
Repo | Test Branch | Commit ID
Neural Compressor{{info['branch'] }} 74 | 76 | {{ 77 | info['commit'] }} 78 |
81 | 82 |

Model Test

83 | 84 | 85 | {% for title in ["Platform", "Model", "Accuracy(new|last)", "Ratio(Accuracy)", 86 | "Performance(new|last)", "Ratio(Performance)", "Status"] %} 87 | 88 | {% endfor %} 89 | 90 |
91 | {% for item in data %} 92 |
93 | 94 | 95 | 96 | 97 | 98 | 99 | {% if item.status == 'SUCCESS' %} 100 | 101 | {% else %} 102 | 103 | {% endif %} 104 | 105 | {% endfor %} 106 | 107 |
{{ title }}
ICX{{ item.model }}{{ item.current_accuracy }} | {{ item.last_accuracy }}{{ item.accuracy_ratio }}{{ item.current_performance }} | {{ item.last_performance }}{{ item.performance_ratio }}{{ item.status }}{{ item.status }}
108 |
109 | 110 | 111 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/ut/coverage.ort: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | 4 | [report] 5 | include = 6 | */onnx_neural_compressor/** 7 | exclude_lines = 8 | pragma: no cover 9 | raise NotImplementedError 10 | raise TypeError 11 | if self.device == "gpu": 12 | if device == "gpu": 13 | except ImportError: 14 | except Exception as e: -------------------------------------------------------------------------------- /.azure-pipelines/scripts/ut/run_ort.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python -c "import neural_compressor as nc" 3 | test_case="run ONNXRT" 4 | echo "${test_case}" 5 | 6 | # install requirements 7 | echo "set up UT env..." 8 | pip install -r /neural-compressor/test/requirements.txt 9 | pip install pytest-cov 10 | pip install pytest-html 11 | pip list 12 | 13 | export COVERAGE_RCFILE=/neural-compressor/.azure-pipelines/scripts/ut/coverage.ort 14 | inc_path=$(python -c 'import onnx_neural_compressor; print(onnx_neural_compressor.__path__[0])') 15 | cd /neural-compressor/test || exit 1 16 | 17 | LOG_DIR=/neural-compressor/log_dir 18 | mkdir -p ${LOG_DIR} 19 | ut_log_name=${LOG_DIR}/ut_ort.log 20 | pytest --cov="${inc_path}" -vs --disable-warnings --html=report.html --self-contained-html . 2>&1 | tee -a ${ut_log_name} 21 | 22 | cp report.html ${LOG_DIR}/ 23 | 24 | if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then 25 | echo "Find errors in pytest case, please check the output..." 26 | echo "Please search for '== FAILURES ==' or '== ERRORS =='" 27 | exit 1 28 | fi 29 | 30 | # if ut pass, collect the coverage file into artifacts 31 | cp .coverage ${LOG_DIR}/.coverage 32 | 33 | echo "UT finished successfully! " -------------------------------------------------------------------------------- /.azure-pipelines/template/docker-template.yml: -------------------------------------------------------------------------------- 1 | parameters: 2 | - name: dockerConfigName 3 | type: string 4 | default: "commonDockerConfig" 5 | - name: repoName 6 | type: string 7 | default: "neural-compressor" 8 | - name: repoTag 9 | type: string 10 | default: "py310" 11 | - name: dockerFileName 12 | type: string 13 | default: "Dockerfile" 14 | - name: containerName 15 | type: string 16 | - name: repo 17 | type: string 18 | default: "https://github.com/onnx/neural-compressor" 19 | 20 | steps: 21 | - task: Bash@3 22 | inputs: 23 | targetType: "inline" 24 | script: | 25 | docker ps -a 26 | if [[ $(docker ps -a | grep -i '${{ parameters.containerName }}'$) ]]; then 27 | docker start $(docker ps -aq) 28 | echo "remove left files through container ..." 
29 | docker exec ${{ parameters.containerName }} bash -c "ls -a /neural-compressor && rm -fr /neural-compressor/* && rm -fr /neural-compressor/.* && ls -a /neural-compressor || true" 30 | fi 31 | displayName: "Docker workspace clean up" 32 | 33 | - ${{ if eq(parameters.dockerConfigName, 'commonDockerConfig') }}: 34 | - script: | 35 | rm -fr ${BUILD_SOURCESDIRECTORY} || sudo rm -fr ${BUILD_SOURCESDIRECTORY} || true 36 | echo y | docker system prune 37 | displayName: "Clean workspace" 38 | 39 | - checkout: self 40 | clean: true 41 | displayName: "Checkout out Repo" 42 | 43 | - ${{ if eq(parameters.dockerConfigName, 'gitCloneDockerConfig') }}: 44 | - script: | 45 | rm -fr ${BUILD_SOURCESDIRECTORY} || sudo rm -fr ${BUILD_SOURCESDIRECTORY} || true 46 | mkdir ${BUILD_SOURCESDIRECTORY} 47 | chmod 777 ${BUILD_SOURCESDIRECTORY} 48 | echo y | docker system prune 49 | displayName: "Clean workspace" 50 | 51 | - checkout: none 52 | 53 | - script: | 54 | git clone ${{ parameters.repo }} ${BUILD_SOURCESDIRECTORY} 55 | git config --global --add safe.directory ${BUILD_SOURCESDIRECTORY} 56 | cd ${BUILD_SOURCESDIRECTORY} 57 | git checkout main 58 | displayName: "Checkout out main" 59 | 60 | - script: | 61 | if [[ ! $(docker images | grep -i ${{ parameters.repoName }}:${{ parameters.repoTag }}) ]]; then 62 | docker build -f ${BUILD_SOURCESDIRECTORY}/.azure-pipelines/docker/${{parameters.dockerFileName}}.devel -t ${{ parameters.repoName }}:${{ parameters.repoTag }} . 63 | fi 64 | docker images | grep -i ${{ parameters.repoName }} 65 | if [[ $? -ne 0 ]]; then 66 | echo "NO Such Repo" 67 | exit 1 68 | fi 69 | displayName: "Build develop docker image" 70 | 71 | - script: | 72 | docker stop $(docker ps -aq) 73 | docker rm -vf ${{ parameters.containerName }} || true 74 | env | sort 75 | displayName: "Clean docker container" 76 | 77 | - ${{ if ne(parameters.containerName, '') }}: 78 | - task: Bash@3 79 | inputs: 80 | targetType: "inline" 81 | script: | 82 | docker run -dit --disable-content-trust --privileged --name=${{ parameters.containerName }} --shm-size="2g" \ 83 | -v ${BUILD_SOURCESDIRECTORY}:/neural-compressor -v /tf_dataset:/tf_dataset -v /tf_dataset2:/tf_dataset2 ${{ parameters.repoName }}:${{ parameters.repoTag }} 84 | echo "Show the container list after docker run ... 
" 85 | docker ps -a 86 | displayName: "Docker run - ${{ parameters.containerName }} Container" 87 | -------------------------------------------------------------------------------- /.azure-pipelines/template/model-template.yml: -------------------------------------------------------------------------------- 1 | parameters: 2 | - name: modelName 3 | type: string 4 | - name: modelContainerName 5 | type: string 6 | default: "ONC" 7 | - name: algorithm 8 | type: string 9 | - name: script_path 10 | type: string 11 | default: "run_onnxrt_models_trigger.sh" 12 | 13 | steps: 14 | - template: docker-template.yml 15 | parameters: 16 | dockerConfigName: "commonDockerConfig" 17 | repoName: "neural-compressor" 18 | repoTag: "py310" 19 | dockerFileName: "Dockerfile" 20 | containerName: ${{ parameters.modelContainerName }} 21 | 22 | - script: | 23 | docker exec ${{ parameters.modelContainerName }} bash -c \ 24 | "cd /neural-compressor/.azure-pipelines/scripts/models && bash env_setup.sh --model=${{ parameters.modelName }}" 25 | displayName: Env setup 26 | 27 | - script: | 28 | docker exec ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \ 29 | && bash ${{ parameters.script_path }} --model=${{ parameters.modelName }} --stage='prepare_model'" 30 | displayName: Export Models 31 | 32 | - task: DownloadPipelineArtifact@2 33 | continueOnError: true 34 | inputs: 35 | source: "specific" 36 | artifact: "FinalReport" 37 | patterns: "**.json" 38 | path: $(Build.SourcesDirectory)/.azure-pipelines/scripts/models/ 39 | project: $(System.TeamProject) 40 | pipeline: "onc model test" 41 | runVersion: "specific" 42 | runId: $(refer_buildId) 43 | displayName: "Download refer logs" 44 | 45 | - script: | 46 | docker exec ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \ 47 | && bash ${{ parameters.script_path }} --model=${{ parameters.modelName }} --stage='quantize'" 48 | displayName: Quantize 49 | 50 | - script: | 51 | docker exec ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \ 52 | && bash ${{ parameters.script_path }} --model=${{ parameters.modelName }} --stage='accuracy'" 53 | displayName: Run Accuracy Test 54 | 55 | - ${{ if eq(parameters.algorithm, 'Quantize') }}: 56 | - script: | 57 | docker exec ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \ 58 | && bash ${{ parameters.script_path }} --model=${{ parameters.modelName }} --stage='performance'" 59 | displayName: Run Performance Test 60 | 61 | - task: Bash@3 62 | inputs: 63 | targetType: "inline" 64 | script: | 65 | docker exec ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \ 66 | && python collect_results.py --model=${{ parameters.modelName }} --build_id=$(Build.BuildId)" 67 | displayName: Collect Log & Check Results 68 | 69 | - task: PublishPipelineArtifact@1 70 | inputs: 71 | targetPath: $(Build.SourcesDirectory)/.azure-pipelines/scripts/models/${{ parameters.modelName }}/ 72 | artifact: ${{ parameters.algorithm }}_${{ parameters.modelName }} 73 | publishLocation: "pipeline" 74 | 75 | - task: Bash@3 76 | condition: always() 77 | inputs: 78 | targetType: "inline" 79 | script: | 80 | docker exec ${{ parameters.modelContainerName }} bash -c "rm -fr /neural-compressor/* && rm -fr /neural-compressor/.* || true" 81 | displayName: "Docker Clean Up" 82 | 
-------------------------------------------------------------------------------- /.azure-pipelines/template/ut-template.yml: -------------------------------------------------------------------------------- 1 | parameters: 2 | - name: dockerConfigName 3 | type: string 4 | default: "commonDockerConfig" 5 | - name: repo 6 | type: string 7 | default: "https://github.com/onnx/neural-compressor" 8 | - name: utScriptFileName 9 | type: string 10 | - name: uploadPath 11 | type: string 12 | - name: utArtifact 13 | type: string 14 | - name: utTestMode 15 | type: string 16 | default: "coverage" 17 | - name: utContainerName 18 | type: string 19 | default: "utTest" 20 | 21 | steps: 22 | - template: docker-template.yml 23 | parameters: 24 | dockerConfigName: ${{ parameters.dockerConfigName }} 25 | repoName: "neural-compressor" 26 | repoTag: "py310" 27 | dockerFileName: "Dockerfile" 28 | containerName: ${{ parameters.utContainerName }} 29 | repo: ${{ parameters.repo }} 30 | 31 | - script: | 32 | docker exec ${{ parameters.utContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts \ 33 | && bash install_nc.sh \ 34 | && bash ut/${{ parameters.utScriptFileName }}.sh" 35 | displayName: "Run UT" 36 | 37 | - task: PublishPipelineArtifact@1 38 | condition: succeededOrFailed() 39 | inputs: 40 | targetPath: ${{ parameters.uploadPath }} 41 | artifact: $(System.JobAttempt)_${{ parameters.utArtifact }}_report 42 | publishLocation: "pipeline" 43 | 44 | - ${{ if eq(parameters.utTestMode, 'coverage') }}: 45 | - task: PublishPipelineArtifact@1 46 | inputs: 47 | targetPath: ${{ parameters.uploadPath }} 48 | artifact: ${{ parameters.utArtifact }}_coverage 49 | publishLocation: "pipeline" 50 | 51 | - task: Bash@3 52 | condition: always() 53 | inputs: 54 | targetType: "inline" 55 | script: | 56 | docker exec ${{ parameters.utContainerName }} bash -c "rm -fr /neural-compressor/* && rm -fr /neural-compressor/.* || true" 57 | displayName: "Docker clean up" 58 | -------------------------------------------------------------------------------- /.azure-pipelines/ut-3x-ort.yml: -------------------------------------------------------------------------------- 1 | trigger: none 2 | 3 | pr: 4 | autoCancel: true 5 | drafts: false 6 | branches: 7 | include: 8 | - main 9 | paths: 10 | include: 11 | - onnx_neural_compressor 12 | - test 13 | - setup.py 14 | - requirements.txt 15 | - .azure-pipelines/scripts/ut 16 | - .azure-pipelines/ut-3x-ort.yml 17 | 18 | pool: ICX-16C 19 | 20 | variables: 21 | IMAGE_NAME: "neural-compressor" 22 | IMAGE_TAG: "py310" 23 | UPLOAD_PATH: $(Build.SourcesDirectory)/log_dir 24 | DOWNLOAD_PATH: $(Build.SourcesDirectory)/log_dir 25 | ARTIFACT_NAME: "UT_coverage_report_ort" 26 | REPO: $(Build.Repository.Uri) 27 | 28 | stages: 29 | - stage: ONNXRT 30 | displayName: Unit Test ONNXRT 31 | dependsOn: [] 32 | jobs: 33 | - job: 34 | displayName: Unit Test ONNXRT 35 | steps: 36 | - template: template/ut-template.yml 37 | parameters: 38 | dockerConfigName: "commonDockerConfig" 39 | utScriptFileName: "run_ort" 40 | uploadPath: $(UPLOAD_PATH) 41 | utArtifact: "ut" 42 | 43 | 44 | - stage: ONNXRT_baseline 45 | displayName: Unit Test ONNXRT baseline 46 | dependsOn: [] 47 | jobs: 48 | - job: 49 | displayName: Unit Test ONNXRT baseline 50 | steps: 51 | - template: template/ut-template.yml 52 | parameters: 53 | dockerConfigName: "gitCloneDockerConfig" 54 | utScriptFileName: "run_ort" 55 | uploadPath: $(UPLOAD_PATH) 56 | utArtifact: "ut_baseline" 57 | repo: $(REPO) 58 | 59 | - stage: Coverage 60 | displayName: 
"Coverage Compare" 61 | pool: 62 | vmImage: "ubuntu-latest" 63 | dependsOn: [ONNXRT, ONNXRT_baseline] 64 | jobs: 65 | - job: CollectDatafiles 66 | steps: 67 | - script: | 68 | if [[ ! $(docker images | grep -i ${IMAGE_NAME}:${IMAGE_TAG}) ]]; then 69 | docker build -f ${BUILD_SOURCESDIRECTORY}/.azure-pipelines/docker/Dockerfile.devel -t ${IMAGE_NAME}:${IMAGE_TAG} . 70 | fi 71 | docker images | grep -i ${IMAGE_NAME} 72 | if [[ $? -ne 0 ]]; then 73 | echo "NO Such Repo" 74 | exit 1 75 | fi 76 | displayName: "Build develop docker image" 77 | 78 | - task: DownloadPipelineArtifact@2 79 | inputs: 80 | artifact: 81 | patterns: '*_coverage/.coverage' 82 | path: $(DOWNLOAD_PATH) 83 | 84 | - script: | 85 | echo "--- create container ---" 86 | docker run -d -it --name="collectLogs" -v ${BUILD_SOURCESDIRECTORY}:/neural-compressor ${IMAGE_NAME}:${IMAGE_TAG} /bin/bash 87 | echo "--- docker ps ---" 88 | docker ps 89 | echo "--- collect logs ---" 90 | docker exec collectLogs /bin/bash +x -c "cd /neural-compressor/.azure-pipelines/scripts \ 91 | && bash install_nc.sh \ 92 | && bash ut/collect_log.sh" 93 | displayName: "Collect UT Coverage" 94 | 95 | - task: PublishPipelineArtifact@1 96 | condition: succeededOrFailed() 97 | inputs: 98 | targetPath: $(UPLOAD_PATH) 99 | artifact: $(ARTIFACT_NAME) 100 | publishLocation: "pipeline" 101 | 102 | - task: Bash@3 103 | condition: always() 104 | inputs: 105 | targetType: "inline" 106 | script: | 107 | docker exec collectLogs bash -c "rm -fr /neural-compressor/* && rm -fr /neural-compressor/.* || true" 108 | displayName: "Docker clean up" 109 | -------------------------------------------------------------------------------- /.github/license_template.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024 Intel Corporation 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## Type of Change 2 | 3 | feature or bug fix or documentation or validation or others 4 | API changed or not 5 | 6 | ## Description 7 | 8 | detail description 9 | 10 | ## Expected Behavior & Potential Risk 11 | 12 | the expected behavior that triggered by this PR 13 | 14 | ## How has this PR been tested? 15 | 16 | how to reproduce the test (including hardware information) 17 | 18 | ## Dependency Change? 
19 | 20 | any library dependency introduced or removed 21 | -------------------------------------------------------------------------------- /.github/workflows/lint.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) ONNX Neural Compressor Project Contributors 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | name: Lint 6 | 7 | on: 8 | push: 9 | branches: 10 | - main 11 | pull_request: 12 | merge_group: 13 | 14 | permissions: # set top-level default permissions as security best practice 15 | contents: read 16 | 17 | concurrency: 18 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' }} 19 | cancel-in-progress: true 20 | 21 | jobs: 22 | optional-lint: 23 | name: Optional Lint 24 | runs-on: ubuntu-latest 25 | steps: 26 | - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 27 | - name: misspell # Check spellings as well 28 | uses: reviewdog/action-misspell@5bd7be2fc7ae56a517184f5c4bbcf2fd7afe3927 # v1.17.0 29 | with: 30 | github_token: ${{ secrets.github_token }} 31 | locale: "US" 32 | reporter: github-pr-check 33 | level: info 34 | filter_mode: diff_context 35 | - name: shellcheck # Static check shell scripts 36 | uses: reviewdog/action-shellcheck@72365a51bf6476fe952a117c3ff703eb7775e40a # v1.20.0 37 | with: 38 | github_token: ${{ secrets.github_token }} 39 | reporter: github-pr-check 40 | level: info 41 | filter_mode: diff_context 42 | 43 | enforce-style: 44 | name: Enforce style 45 | runs-on: ubuntu-latest 46 | permissions: 47 | security-events: write 48 | steps: 49 | - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 50 | - name: Setup Python 51 | uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 52 | with: 53 | python-version: "3.12" 54 | - name: Install ONNX Neural Compressor 55 | run: | 56 | pip install . 57 | - name: Install dependencies 58 | run: | 59 | python -m pip install lintrunner lintrunner-adapters 60 | lintrunner init 61 | - name: Run lintrunner on all files 62 | run: | 63 | set +e 64 | if ! 
lintrunner --force-color --all-files --tee-json=lint.json -v; then 65 | echo "" 66 | echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner\`.\e[0m" 67 | echo -e "\e[1m\e[36mSee https://github.com/onnx/neural-compressor/blob/main/.lintrunner.toml for setup instructions.\e[0m" 68 | exit 1 69 | fi 70 | - name: Produce SARIF 71 | if: always() 72 | run: | 73 | python -m lintrunner_adapters to-sarif lint.json lintrunner.sarif 74 | - name: Upload SARIF file 75 | # Use always() to always upload SARIF even if lintrunner returns with error code 76 | # To toggle linter comments in the files page, press `i` on the keyboard 77 | if: always() 78 | continue-on-error: true 79 | uses: github/codeql-action/upload-sarif@cdcdbb579706841c47f7063dda365e292e5cad7a # v2.13.4 80 | with: 81 | # Path to SARIF file relative to the root of the repository 82 | sarif_file: lintrunner.sarif 83 | category: lintrunner 84 | checkout_path: ${{ github.workspace }} 85 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .vscode 3 | .idea 4 | /venv/ 5 | */__pycache__ 6 | .ipynb_checkpoints/ 7 | *.snapshot 8 | *.csv 9 | *.pb 10 | *.ckpt 11 | *.log 12 | *.swp 13 | *.onnx 14 | *.so 15 | *.egg-info/ 16 | .eggs/ 17 | dist/ 18 | tags 19 | build/ 20 | _build 21 | lpot_workspace/ 22 | .torch/ 23 | node_modules 24 | build_tmp 25 | -------------------------------------------------------------------------------- /.lintrunner.toml: -------------------------------------------------------------------------------- 1 | # Configuration for lintrunner https://github.com/suo/lintrunner 2 | # You can install the dependencies and initialize with 3 | # 4 | # ```sh 5 | # pip install lintrunner lintrunner-adapters 6 | # lintrunner init 7 | # ``` 8 | # 9 | # This will install lintrunner on your system and download all the necessary 10 | # dependencies to run linters locally. 11 | # If you want to see what lintrunner init will install, run 12 | # `lintrunner init --dry-run`. 13 | # 14 | # To lint local changes: 15 | # 16 | # ```bash 17 | # lintrunner 18 | # ``` 19 | # 20 | # To lint all files: 21 | # 22 | # ```bash 23 | # lintrunner --all-files 24 | # ``` 25 | # 26 | # To format files: 27 | # 28 | # ```bash 29 | # lintrunner -a 30 | # ``` 31 | # 32 | # To read more about lintrunner, see [wiki](https://github.com/pytorch/pytorch/wiki/lintrunner). 33 | # To update an existing linting rule or create a new one, modify this file or create a 34 | # new adapter following examples in https://github.com/justinchuby/lintrunner-adapters. 
35 | merge_base_with = 'main' 36 | 37 | [[linter]] 38 | code = 'RUFF' 39 | include_patterns = [ 40 | '**/*.py', 41 | '**/*.pyi', 42 | ] 43 | exclude_patterns = [ 44 | '*_pb2*', 45 | '.setuptools-cmake-build/*', 46 | 'docs/**', 47 | ] 48 | command = [ 49 | 'python', 50 | '-m', 51 | 'lintrunner_adapters', 52 | 'run', 53 | 'ruff_linter', 54 | '--config=pyproject.toml', 55 | '@{{PATHSFILE}}' 56 | ] 57 | init_command = [ 58 | 'python', 59 | '-m', 60 | 'lintrunner_adapters', 61 | 'run', 62 | 'pip_init', 63 | '--dry-run={{DRYRUN}}', 64 | '--requirement=requirements-lintrunner.txt', 65 | ] 66 | is_formatter = true 67 | 68 | [[linter]] 69 | code = 'BLACK-ISORT' 70 | include_patterns = [ 71 | '**/*.py', 72 | ] 73 | exclude_patterns = [ 74 | '*_pb2*', 75 | '.setuptools-cmake-build/*', 76 | 'cmake/**', 77 | 'docs/**', 78 | ] 79 | command = [ 80 | 'python', 81 | '-m', 82 | 'lintrunner_adapters', 83 | 'run', 84 | 'black_isort_linter', 85 | '--', 86 | '@{{PATHSFILE}}' 87 | ] 88 | init_command = [ 89 | 'python', 90 | '-m', 91 | 'lintrunner_adapters', 92 | 'run', 93 | 'pip_init', 94 | '--dry-run={{DRYRUN}}', 95 | '--requirement=requirements-lintrunner.txt', 96 | ] 97 | is_formatter = true 98 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | Neural Compressor 4 | =========================== 5 |

An open-source Python library supporting popular model compression techniques for ONNX

6 | 7 | [![python](https://img.shields.io/badge/python-3.8%2B-blue)](https://github.com/onnx/neural-compressor) 8 | [![version](https://img.shields.io/badge/release-1.0-green)](https://github.com/onnx/neural-compressor/releases) 9 | [![license](https://img.shields.io/badge/license-Apache%202-blue)](https://github.com/onnx/neural-compressor/blob/master/LICENSE) 10 | 11 | 12 | --- 13 |
14 | 15 | Neural Compressor aims to provide popular model compression techniques, inherited from [Intel Neural Compressor](https://github.com/intel/neural-compressor) but focused on ONNX model quantization, such as SmoothQuant and weight-only quantization, through [ONNX Runtime](https://onnxruntime.ai/). In particular, the tool provides the following key features, typical examples, and open collaborations: 16 | 17 | * Support a wide range of Intel hardware, such as [Intel Xeon Scalable Processors](https://www.intel.com/content/www/us/en/products/details/processors/xeon/scalable.html) and AI PCs 18 | 19 | * Validate popular LLMs such as [LLama2](./examples/nlp/huggingface_model/text_generation/), [Llama3](./examples/nlp/huggingface_model/text_generation/), and [Qwen2](./examples/nlp/huggingface_model/text_generation/), and broad models such as [BERT-base](./examples/nlp/bert/quantization) and [ResNet50](./examples/image_recognition/resnet50/quantization/ptq_static), from popular model hubs such as [Hugging Face](https://huggingface.co/) and the [ONNX Model Zoo](https://github.com/onnx/models#models), by leveraging automatic [accuracy-driven](./docs/design.md#workflow) quantization strategies 20 | 21 | * Collaborate with software platforms such as [Microsoft Olive](https://github.com/microsoft/Olive) and with the open AI ecosystem, including [Hugging Face](https://huggingface.co/blog/intel), [ONNX](https://github.com/onnx/models#models), and [ONNX Runtime](https://github.com/microsoft/onnxruntime) 22 | 23 | ## Installation 24 | 25 | ### Install from source 26 | ```Shell 27 | git clone https://github.com/onnx/neural-compressor.git 28 | cd neural-compressor 29 | pip install -r requirements.txt 30 | pip install . 31 | ``` 32 | 33 | > **Note**: 34 | > Further installation methods can be found in the [Installation Guide](./docs/installation_guide.md). 35 | 36 | ## Getting Started 37 | 38 | Set up the environment: 39 | ```bash 40 | pip install onnx-neural-compressor "onnxruntime>=1.17.0" onnx 41 | ``` 42 | After successfully installing these packages, try your first quantization program. 43 | > Note: please install from source until the formal PyPI release. 44 | 45 | ### Weight-Only Quantization (LLMs) 46 | The following example code demonstrates weight-only quantization on LLMs; when multiple devices are available, the most efficient device is selected automatically. 
47 | 48 | Run the example: 49 | ```python 50 | from onnx_neural_compressor.quantization import matmul_nbits_quantizer 51 | 52 | algo_config = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig() 53 | quant = matmul_nbits_quantizer.MatMulNBitsQuantizer( 54 | model, 55 | n_bits=4, 56 | block_size=32, 57 | is_symmetric=True, 58 | algo_config=algo_config, 59 | ) 60 | quant.process() 61 | best_model = quant.model 62 | ``` 63 | 64 | ### Static Quantization 65 | 66 | ```python 67 | from onnx_neural_compressor.quantization import quantize, config 68 | from onnx_neural_compressor import data_reader 69 | 70 | 71 | class DataReader(data_reader.CalibrationDataReader): 72 | def __init__(self): 73 | self.encoded_list = [] 74 | # append data into self.encoded_list 75 | 76 | self.iter_next = iter(self.encoded_list) 77 | 78 | def get_next(self): 79 | return next(self.iter_next, None) 80 | 81 | def rewind(self): 82 | self.iter_next = iter(self.encoded_list) 83 | 84 | 85 | data_reader = DataReader() 86 | qconfig = config.StaticQuantConfig(calibration_data_reader=data_reader) 87 | quantize(model, output_model_path, qconfig) 88 | ``` 89 | 90 | ## Documentation 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 |
Overview: Architecture | Workflow | Examples
Feature: Quantization | SmoothQuant | Weight-Only Quantization (INT8/INT4) | Layer-Wise Quantization
121 | 122 | 123 | 124 | ## Additional Content 125 | 126 | * [Contribution Guidelines](./docs/source/CONTRIBUTING.md) 127 | * [Security Policy](SECURITY.md) 128 | 129 | ## Communication 130 | - [GitHub Issues](https://github.com/onnx/neural-compressor/issues): mainly for bug reports, new feature requests, question asking, etc. 131 | - [Email](mailto:inc.maintainers@intel.com): welcome to raise any interesting research ideas on model compression techniques by email for collaborations. 132 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 6 | 7 | # Security Policy 8 | 9 | ## Reporting a Vulnerability 10 | If you think you have found a security vulnerability, please send a report to onnx-security@lists.lfaidata.foundation. Please do not post security vulnerabilities on Slack. 11 | 12 | We don't currently have a PGP key, unfortunately. 13 | 14 | An ONNX committer will send you a response indicating the next steps in handling your report. After the initial reply to your report, the committer will keep you informed of the progress towards a fix and full announcement, and may ask for additional information or guidance. 15 | 16 | Important: Please don't disclose the vulnerability before it has been fixed and announced, to protect our users. 17 | 18 | ## Security announcements 19 | Please subscribe to the [announcements mailing list](https://lists.lfaidata.foundation/g/onnx-announce), where we post notifications and remediation details for security vulnerabilities. 20 | -------------------------------------------------------------------------------- /docs/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | Contributor Covenant Code of Conduct 2 | ======================= 3 | 4 | The ONNX Code Of Conduct is described at https://onnx.ai/codeofconduct.html 5 | -------------------------------------------------------------------------------- /docs/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Contribution Guidelines 2 | ======================= 3 | 1. [Create Pull Request](#create-pull-request) 4 | 2. [Pull Request Checklist](#pull-request-checklist) 5 | 3. [Pull Request Template](#pull-request-template) 6 | 4. [Pull Request Acceptance Criteria](#pull-request-acceptance-criteria) 7 | 5. [Pull Request Status Checks Overview](#pull-request-status-checks-overview) 8 | 6. [Support](#support) 9 | 7. [Contributor Covenant Code of Conduct](#contributor-covenant-code-of-conduct) 10 | 11 | ## Create Pull Request 12 | If you have improvements to Neural Compressor, send your pull requests for 13 | [review](https://github.com/onnx/neural-compressor/pulls). 14 | If you are new to GitHub, view the pull request [How To](https://help.github.com/articles/using-pull-requests/). 15 | ### Step-by-Step guidelines 16 | - Star this repository using the button `Star` in the top right corner. 17 | - Fork this Repository using the button `Fork` in the top right corner. 18 | - Clone your forked repository to your pc. 19 | `git clone "url to your repo"` 20 | - Create a new branch for your modifications. 21 | `git checkout -b new-branch` 22 | - Add your files with `git add -A`, commit `git commit -s -m "This is my commit message"` and push `git push origin new-branch`. 23 | - Create a [pull request](https://github.com/onnx/neural-compressor/pulls). 
24 | 25 | ## Pull Request Checklist 26 | 27 | Before sending your pull request, follow the checklist below: 28 | 29 | - Add unit tests in [Unit Tests](https://github.com/onnx/neural-compressor/tree/main/test) to cover the code you would like to contribute. 30 | - Neural Compressor has adopted the [Developer Certificate of Origin](https://en.wikipedia.org/wiki/Developer_Certificate_of_Origin); you must agree to its terms by signing off each of your commits with `-s`, e.g. `git commit -s -m 'This is my commit message'`. 31 | 32 | ## Pull Request Template 33 | 34 | See the [PR template](/.github/pull_request_template.md). 35 | 36 | ## Pull Request Acceptance Criteria 37 | - At least two approvals from reviewers 38 | 39 | - All detected status checks pass 40 | 41 | - All conversations resolved 42 | 43 | - Third-party dependency licenses compatible 44 | 45 | ## Pull Request Status Checks Overview 46 | Neural Compressor uses [Azure DevOps](https://learn.microsoft.com/en-us/azure/devops/pipelines/?view=azure-devops) for CI testing 47 | and generally uses [Azure Cloud Instances](https://azure.microsoft.com/en-us/pricing/purchase-options/pay-as-you-go) to deploy pipelines, e.g. Standard E16s v5. 48 | | Test Name | Test Scope | Test Pass Criteria | 49 | |-------------------------------|-----------------------------------------------|---------------------------| 50 | | Pre-commit CI | [pre-commit config](../../.pre-commit-config.yaml) | PASS | 51 | | [DCO](https://github.com/apps/dco/) | Use `git commit -s` to sign off | PASS | 52 | | Unit Test | Pytest scripts under [test](/test) | PASS (No failure, No core dump, No segmentation fault, No coverage drop) | 53 | 54 | ## Support 55 | 56 | Submit your questions, feature requests, and bug reports to the 57 | [GitHub issues](https://github.com/onnx/neural-compressor/issues) page. You may also reach out to [Maintainers](mailto:inc.maintainers@intel.com). 58 | 59 | ## Contributor Covenant Code of Conduct 60 | 61 | This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant Code of Conduct](./CODE_OF_CONDUCT.md). 62 | -------------------------------------------------------------------------------- /docs/SECURITY.md: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 | 7 | Security Policy 8 | =============== 9 | 10 | ## Reporting a Vulnerability 11 | If you think you have found a security vulnerability, please send a report to onnx-security@lists.lfaidata.foundation. Please do not post security vulnerabilities on Slack. 12 | 13 | We don't currently have a PGP key, unfortunately. 14 | 15 | An ONNX committer will send you a response indicating the next steps in handling your report. After the initial reply to your report, the committer will keep you informed of the progress towards a fix and full announcement, and may ask for additional information or guidance. 16 | 17 | Important: Please don't disclose the vulnerability before it has been fixed and announced, to protect our users. 18 | 19 | ## Security announcements 20 | Please subscribe to the [announcements mailing list](https://lists.lfaidata.foundation/g/onnx-announce), where we post notifications and remediation details for security vulnerabilities.
21 | -------------------------------------------------------------------------------- /docs/autotune.md: -------------------------------------------------------------------------------- 1 | AutoTune 2 | ======================================== 3 | 4 | 1. [Overview](#overview) 5 | 2. [How it Works](#how-it-works) 6 | 3. [Working with Autotune](#working-with-autotune) 7 | 4. [Get Started](#get-started) 8 | 9 | 10 | ## Overview 11 | 12 | Neural Compressor aims to help users quickly deploy low-precision models by leveraging popular compression techniques, such as post-training quantization and weight-only quantization algorithms. Despite having a variety of these algorithms, finding the appropriate configuration for a model can be difficult and time-consuming. To address this, we built the `autotune` module which identifies the best algorithm configuration for models to achieve optimal performance under the certain accuracy criteria. This module allows users to easily use predefined tuning recipes and customize the tuning space as needed. 13 | 14 | ## How it Works 15 | 16 | The autotune module constructs the tuning space according to the pre-defined tuning set or users' tuning set. It iterates the tuning space and applies the configuration on given float model then records and compares its evaluation result with the baseline. The tuning process stops when meeting the exit policy. 17 | The workflow is as below: 18 | 19 | 20 | Workflow 21 | 22 | 23 | 24 | ## Working with Autotune 25 | 26 | The `autotune` API can be used across all algorithms supported by Neural Compressor. It accepts three primary arguments: `model_input`, `tune_config`, and `eval_fn`. 27 | 28 | The `TuningConfig` class defines the tuning process, including the tuning space, order, and exit policy. 29 | 30 | - Define the tuning space 31 | 32 | User can define the tuning space by setting `config_set` with an algorithm configuration or a set of configurations. 33 | ```python 34 | # Use the default tuning space 35 | config_set = config.get_woq_tuning_config() 36 | 37 | # Customize the tuning space with one algorithm configurations 38 | config_set = config.RTNConfig(weight_sym=False, weight_group_size=[32, 64]) 39 | 40 | # Customize the tuning space with two algorithm configurations 41 | config_set = [ 42 | config.RTNConfig(weight_sym=False, weight_group_size=32), 43 | config.GPTQConfig(weight_group_size=128, weight_sym=False), 44 | ] 45 | ``` 46 | 47 | - Define the tuning order 48 | 49 | The tuning order determines how the process traverses the tuning space and samples configurations. Users can customize it by configuring the `sampler`. Currently, we provide the [`default_sampler`](https://github.com/onnx/neural-compressor/blob/main/onnx_neural_compressor/quantization/tuning.py#L210), which samples configurations sequentially, always in the same order. 50 | 51 | - Define the exit policy 52 | 53 | The exit policy includes two components: accuracy goal (`tolerable_loss`) and the allowed number of trials (`max_trials`). The tuning process will stop when either condition is met. 54 | 55 | ## Get Started 56 | The example below demonstrates how to autotune a ONNX model on four `RTNConfig` configurations. 57 | 58 | ```python 59 | from onnx_neural_compressor.quantization import config, tuning 60 | 61 | 62 | def eval_fn(model) -> float: 63 | return ... 
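# Note: the `eval_fn` above is only a stub. In practice it just needs to return a
# single float score (higher is better) measured on the candidate model. The
# commented sketch below is an illustration only, assuming the candidate arrives as
# an onnx.ModelProto and that a user-provided `eval_dataloader` yields
# (input_feed, label) pairs:
#
# import onnxruntime as ort
#
# def eval_fn(model) -> float:
#     session = ort.InferenceSession(model.SerializeToString(), providers=["CPUExecutionProvider"])
#     correct = total = 0
#     for input_feed, label in eval_dataloader:
#         pred = session.run(None, input_feed)[0].argmax(-1)
#         correct += int(pred == label)
#         total += 1
#     return correct / total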
64 | 65 | 66 | tune_config = tuning.TuningConfig( 67 | config_set=config.RTNConfig( 68 | weight_sym=[False, True], 69 | weight_group_size=[32, 128] 70 | ), 71 | tolerable_loss=0.2, 72 | max_trials=10, 73 | ) 74 | q_model = tuning.autotune(model, tune_config=tune_config, eval_fn=eval_fn) 75 | ``` -------------------------------------------------------------------------------- /docs/calibration.md: -------------------------------------------------------------------------------- 1 | # Calibration Algorithms in Quantization 2 | 3 | 1. [Introduction](#introduction) 4 | 2. [Calibration Algorithms](#calibration-algorithms) 5 | 3. [Support Matrix](#support-matrix) 6 | 7 | ## Introduction 8 | 9 | Quantization proves beneficial in terms of reducing the memory and computational requirements of the model. Uniform quantization transforms the input value $x \in [\beta, \alpha]$ to lie within $[-2^{b-1}, 2^{b-1} - 1]$, where $[\beta, \alpha]$ is the range of real values chosen for quantization and $b$ is the bit-width of the signed integer representation. Calibration is the process of determining $\alpha$ and $\beta$ for model weights and activations. Refer to this [link](./quantization.md#quantization-fundamentals) for more quantization fundamentals. 10 | 11 | ## Calibration Algorithms 12 | 13 | Currently, Neural Compressor supports three popular calibration algorithms: 14 | 15 | - MinMax: This method takes the maximum and minimum of the input values as $\alpha$ and $\beta$ [^1]. It preserves the entire range and is the simplest approach. 16 | 17 | - Entropy: This method minimizes the KL divergence to reduce the information loss between the full-precision and quantized data [^2]. Its primary focus is on preserving essential information. 18 | 19 | - Percentile: This method only considers a specific percentage of values for calculating the range, ignoring the remainder, which may contain outliers [^3]. It enhances resolution by excluding extreme values while still retaining noteworthy data. 20 | 21 | > `kl` is used to represent the Entropy calibration algorithm in Neural Compressor. 22 | 23 | ## Reference 24 | 25 | [^1]: Vanhoucke, Vincent, Andrew Senior, and Mark Z. Mao. "Improving the speed of neural networks on CPUs." (2011). 26 | 27 | [^2]: Szymon Migacz. "Nvidia 8-bit inference with TensorRT." (2017). 28 | 29 | [^3]: McKinstry, Jeffrey L., et al. "Discovering low-precision networks close to full-precision networks for efficient embedded inference." arXiv preprint arXiv:1809.04191 (2018). 30 | -------------------------------------------------------------------------------- /docs/design.md: -------------------------------------------------------------------------------- 1 | Design 2 | ===== 3 | Neural Compressor features an architecture and workflow that help increase performance and speed up deployments across infrastructures.
4 | 5 | ## Architecture 6 | 7 | 8 | Architecture 9 | 10 | 11 | ## Workflow 12 | 13 | 14 | Workflow 15 | 16 | -------------------------------------------------------------------------------- /docs/imgs/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onnx/neural-compressor/3dd43d3d852d43c1133005dbe440cfc18a1d7471/docs/imgs/architecture.png -------------------------------------------------------------------------------- /docs/imgs/common/code.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/imgs/common/right.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/imgs/lwq_ort.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onnx/neural-compressor/3dd43d3d852d43c1133005dbe440cfc18a1d7471/docs/imgs/lwq_ort.png -------------------------------------------------------------------------------- /docs/imgs/smoothquant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onnx/neural-compressor/3dd43d3d852d43c1133005dbe440cfc18a1d7471/docs/imgs/smoothquant.png -------------------------------------------------------------------------------- /docs/imgs/sq_convert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onnx/neural-compressor/3dd43d3d852d43c1133005dbe440cfc18a1d7471/docs/imgs/sq_convert.png -------------------------------------------------------------------------------- /docs/imgs/sq_pc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onnx/neural-compressor/3dd43d3d852d43c1133005dbe440cfc18a1d7471/docs/imgs/sq_pc.png -------------------------------------------------------------------------------- /docs/imgs/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onnx/neural-compressor/3dd43d3d852d43c1133005dbe440cfc18a1d7471/docs/imgs/workflow.png -------------------------------------------------------------------------------- /docs/installation_guide.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | 1. [Installation](#installation) 4 | 5 | 1.1. [Prerequisites](#prerequisites) 6 | 7 | 1.2. [Install from Binary](#install-from-binary) 8 | 9 | 1.3. [Install from Source](#install-from-source) 10 | 11 | 2. [System Requirements](#system-requirements) 12 | 13 | 2.1. [Validated Hardware Environment](#validated-hardware-environment) 14 | 15 | 2.2. [Validated Software Environment](#validated-software-environment) 16 | 17 | ## Installation 18 | ### Prerequisites 19 | You can install Neural Compressor from binary or source. 
20 | 21 | The following prerequisites and requirements must be satisfied for a successful installation: 22 | 23 | - Python version: 3.8 or 3.9 or 3.10 or 3.11 24 | 25 | ### Install from Binary 26 | ```Shell 27 | # install stable basic version from pypi 28 | pip install onnx-neural-compressor 29 | ``` 30 | 31 | ### Install from Source 32 | 33 | ```Shell 34 | git clone https://github.com/onnx/neural-compressor.git 35 | cd neural-compressor 36 | pip install -r requirements.txt 37 | pip install . 38 | ``` 39 | 40 | ## System Requirements 41 | 42 | ### Validated Hardware Environment 43 | #### Neural Compressor supports CPUs based on [Intel 64 architecture or compatible processors](https://en.wikipedia.org/wiki/X86-64): 44 | 45 | * Intel Xeon Scalable processor (formerly Skylake, Cascade Lake, Cooper Lake, Ice Lake, and Sapphire Rapids) 46 | * Intel Xeon CPU Max Series (formerly Sapphire Rapids HBM) 47 | * Intel Core Ultra Processors (Meteor Lake, Lunar Lake) 48 | 49 | ### Validated Software Environment 50 | 51 | * OS version: CentOS 8.4, Ubuntu 22.04 52 | * Python version: 3.10 53 | * ONNX Runtime version: 1.18.1 54 | -------------------------------------------------------------------------------- /docs/quantization_layer_wise.md: -------------------------------------------------------------------------------- 1 | Layer Wise Quantization (LWQ) 2 | ===== 3 | 4 | 1. [Introduction](#introduction) 5 | 6 | 2. [Supported Framework Model Matrix](#supported-framework-model-matrix) 7 | 8 | 3. [Examples](#examples) 9 | 10 | ## Introduction 11 | 12 | Large language models (LLMs) have shown exceptional performance across various tasks, meanwhile, the substantial parameter size poses significant challenges for deployment. Layer-wise quantization(LWQ) can greatly reduce the memory footprint of LLMs, usually 80-90% reduction, which means that users can quantize LLMs even on single node using GPU or CPU. We can quantize the model under memory-constrained devices, therefore making the huge-sized LLM quantization possible. 13 | 14 | 15 | 16 | *Figure 1: The process of layer-wise quantization for ONNX model. The graph of LLM is split into several parts, and each subgraph is quantized in turn.* 17 | 18 | ## Supported Framework Model Matrix 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 |
| Types/Framework | ONNX Runtime |
|---|---|
| W8A8 Post Training Static Quantization | |
| Weight-only Quantization (RTN) | |
| Weight-only Quantization (AWQ) | |
| Weight-only Quantization (GPTQ) | |
48 | 49 | ## Examples 50 | 51 | ```python 52 | from onnx_neural_compressor.quantization import matmul_4bits_quantizer 53 | 54 | algo_config = matmul_4bits_quantizer.RTNWeightOnlyQuantConfig(layer_wise_quant=True) 55 | quant = matmul_4bits_quantizer.MatMul4BitsQuantizer( 56 | model, 57 | algo_config=algo_config, 58 | ) 59 | quant.process() 60 | qmodel = quant.model 61 | ``` 62 | -------------------------------------------------------------------------------- /examples/image_recognition/resnet50/quantization/ptq_static/README.md: -------------------------------------------------------------------------------- 1 | # Step-by-Step 2 | 3 | This example loads an image classification model from the [ONNX Model Zoo](https://github.com/onnx/models) and confirms its accuracy and speed on the [ILSVRC2012 validation ImageNet dataset](http://www.image-net.org/challenges/LSVRC/2012/downloads). You need to download this dataset yourself. 4 | 5 | # Prerequisite 6 | 7 | ## 1. Environment 8 | 9 | ```shell 10 | pip install onnx-neural-compressor 11 | pip install -r requirements.txt 12 | ``` 13 | 14 | 15 | ## 2. Prepare Model 16 | 17 | ```shell 18 | python prepare_model.py --output_model='resnet50-v1-12.onnx' 19 | ``` 20 | 21 | ## 3. Prepare Dataset 22 | 23 | Download the [ILSVRC2012 validation ImageNet dataset](http://www.image-net.org/challenges/LSVRC/2012/downloads). 24 | 25 | Download the label file: 26 | 27 | ```shell 28 | wget http://dl.caffe.berkeleyvision.org/caffe_ilsvrc12.tar.gz 29 | tar -xvzf caffe_ilsvrc12.tar.gz val.txt 30 | ``` 31 | 32 | # Run 33 | 34 | 35 | ## 1. Quantization 36 | 37 | Quantize the model with QLinearOps: 38 | 39 | ```bash 40 | bash run_quant.sh --input_model=path/to/model \ # model path as *.onnx 41 | --dataset_location=/path/to/imagenet \ 42 | --label_path=/path/to/val.txt \ 43 | --output_model=path/to/save 44 | ``` 45 | 46 | Quantize the model with QDQ mode: 47 | 48 | ```bash 49 | bash run_quant.sh --input_model=path/to/model \ # model path as *.onnx 50 | --dataset_location=/path/to/imagenet \ 51 | --label_path=/path/to/val.txt \ 52 | --output_model=path/to/save \ 53 | --quant_format=QDQ 54 | ``` 55 | 56 | ## 2.
Benchmark 57 | 58 | ```bash 59 | bash run_benchmark.sh --input_model=path/to/model \ # model path as *.onnx 60 | --dataset_location=/path/to/imagenet \ 61 | --label_path=/path/to/val.txt \ 62 | --mode=performance # or accuracy 63 | ``` 64 | -------------------------------------------------------------------------------- /examples/image_recognition/resnet50/quantization/ptq_static/prepare_model.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | import urllib.request 5 | 6 | MODEL_URL = "https://github.com/onnx/models/raw/main/validated/vision/classification/resnet/model/resnet50-v1-12.onnx" 7 | MAX_TIMES_RETRY_DOWNLOAD = 5 8 | 9 | 10 | def parse_arguments(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("--input_model", type=str, required=False, default="resnet50-v1-12.onnx") 13 | parser.add_argument("--output_model", type=str, required=True) 14 | return parser.parse_args() 15 | 16 | 17 | def progressbar(cur, total=100): 18 | percent = "{:.2%}".format(cur / total) 19 | sys.stdout.write("\r[%-100s] %s" % ("#" * int(cur), percent)) 20 | sys.stdout.flush() 21 | 22 | 23 | def schedule(blocknum, blocksize, totalsize): 24 | if totalsize == 0: 25 | percent = 0 26 | else: 27 | percent = min(1.0, blocknum * blocksize / totalsize) * 100 28 | progressbar(percent) 29 | 30 | 31 | def download_model(url, model_name, retry_times=5): 32 | if os.path.isfile(model_name): 33 | print(f"{model_name} exists, skip download") 34 | return True 35 | 36 | print("download model...") 37 | retries = 0 38 | while retries < retry_times: 39 | try: 40 | urllib.request.urlretrieve(url, model_name, schedule) 41 | break 42 | except KeyboardInterrupt: 43 | return False 44 | except: 45 | retries += 1 46 | print(f"Download failed{', Retry downloading...' 
if retries < retry_times else '!'}") 47 | return retries < retry_times 48 | 49 | 50 | def prepare_model(input_model, output_model): 51 | # Download model from [ONNX Model Zoo](https://github.com/onnx/models) 52 | download_model(MODEL_URL, output_model, MAX_TIMES_RETRY_DOWNLOAD) 53 | 54 | 55 | if __name__ == "__main__": 56 | args = parse_arguments() 57 | prepare_model(args.input_model, args.output_model) 58 | -------------------------------------------------------------------------------- /examples/image_recognition/resnet50/quantization/ptq_static/requirements.txt: -------------------------------------------------------------------------------- 1 | onnx 2 | onnxruntime 3 | torch 4 | torchvision 5 | onnxruntime-extensions 6 | pillow>=8.2.0 # not directly required, pinned by Snyk to avoid a vulnerability 7 | opencv-python 8 | scikit-learn 9 | -------------------------------------------------------------------------------- /examples/image_recognition/resnet50/quantization/ptq_static/run_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | function main { 5 | init_params "$@" 6 | run_benchmark 7 | 8 | } 9 | 10 | # init params 11 | function init_params { 12 | for var in "$@" 13 | do 14 | case $var in 15 | --input_model=*) 16 | input_model=$(echo "$var" |cut -f2 -d=) 17 | ;; 18 | --dataset_location=*) 19 | dataset_location=$(echo "$var" |cut -f2 -d=) 20 | ;; 21 | --label_path=*) 22 | label_path=$(echo "$var" |cut -f2 -d=) 23 | ;; 24 | --mode=*) 25 | mode=$(echo "$var" |cut -f2 -d=) 26 | ;; 27 | --intra_op_num_threads=*) 28 | intra_op_num_threads=$(echo "$var" |cut -f2 -d=) 29 | ;; 30 | esac 31 | done 32 | 33 | } 34 | 35 | # run_benchmark 36 | function run_benchmark { 37 | 38 | python main.py \ 39 | --model_path "${input_model}" \ 40 | --dataset_location "${dataset_location}" \ 41 | --label_path "${label_path-${dataset_location}/../val.txt}" \ 42 | --mode "${mode}" \ 43 | --batch_size 1 \ 44 | --intra_op_num_threads "${intra_op_num_threads-4}" \ 45 | --benchmark 46 | 47 | } 48 | 49 | main "$@" 50 | -------------------------------------------------------------------------------- /examples/image_recognition/resnet50/quantization/ptq_static/run_quant.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | function main { 5 | init_params "$@" 6 | run_tuning 7 | 8 | } 9 | 10 | # init params 11 | function init_params { 12 | 13 | for var in "$@" 14 | do 15 | case $var in 16 | --input_model=*) 17 | input_model=$(echo "$var" |cut -f2 -d=) 18 | ;; 19 | --output_model=*) 20 | output_model=$(echo "$var" |cut -f2 -d=) 21 | ;; 22 | --dataset_location=*) 23 | dataset_location=$(echo "$var" |cut -f2 -d=) 24 | ;; 25 | --label_path=*) 26 | label_path=$(echo "$var" |cut -f2 -d=) 27 | ;; 28 | --quant_format=*) 29 | quant_format=$(echo "$var" |cut -f2 -d=) 30 | ;; 31 | esac 32 | done 33 | 34 | } 35 | 36 | # run_tuning 37 | function run_tuning { 38 | python main.py \ 39 | --model_path "${input_model}" \ 40 | --dataset_location "${dataset_location}" \ 41 | --label_path "${label_path-${dataset_location}/../val.txt}" \ 42 | --output_model "${output_model}" \ 43 | --quant_format "${quant_format-QOperator}" \ 44 | --tune 45 | } 46 | 47 | main "$@" 48 | -------------------------------------------------------------------------------- /examples/nlp/bert/quantization/ptq_dynamic/README.md: -------------------------------------------------------------------------------- 1 | # Step-by-Step 
2 | 3 | This example loads a BERT model and confirms its accuracy and speed based on [GLUE data](https://gluebenchmark.com/). 4 | 5 | # Prerequisite 6 | 7 | ## 1. Environment 8 | 9 | ```shell 10 | pip install onnx-neural-compressor 11 | pip install -r requirements.txt 12 | ``` 13 | 14 | 15 | ## 2. Prepare Dataset 16 | 17 | Download the GLUE data with the `prepare_data.sh` script. 18 | 19 | ```shell 20 | export GLUE_DIR=path/to/glue_data 21 | export TASK_NAME=MRPC 22 | 23 | bash prepare_data.sh --data_dir=$GLUE_DIR --task_name=$TASK_NAME 24 | ``` 25 | 26 | ## 3. Prepare Model 27 | 28 | ```shell 29 | python prepare_model.py --input_model='MRPC.zip' --output_model='bert.onnx' 30 | ``` 31 | 32 | # Run 33 | 34 | ## 1. Quantization 35 | 36 | Dynamic quantization: 37 | 38 | ```bash 39 | bash run_quant.sh --input_model=path/to/model \ # model path as *.onnx 40 | --output_model=path/to/model_tune \ # model path as *.onnx 41 | --dataset_location=path/to/glue_data 42 | ``` 43 | 44 | ## 2. Benchmark 45 | 46 | ```bash 47 | bash run_benchmark.sh --input_model=path/to/model \ # model path as *.onnx 48 | --dataset_location=path/to/glue_data \ 49 | --batch_size=batch_size \ 50 | --mode=performance # or accuracy 51 | ``` 52 | -------------------------------------------------------------------------------- /examples/nlp/bert/quantization/ptq_dynamic/prepare_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | function main { 5 | init_params "$@" 6 | download_data 7 | 8 | } 9 | 10 | # init params 11 | function init_params { 12 | 13 | for var in "$@" 14 | do 15 | case $var in 16 | --data_dir=*) 17 | data_dir=$(echo "$var" |cut -f2 -d=) 18 | ;; 19 | --task_name=*) 20 | task_name=$(echo "$var" |cut -f2 -d=) 21 | ;; 22 | esac 23 | done 24 | 25 | } 26 | 27 | # download_data 28 | function download_data { 29 | wget https://raw.githubusercontent.com/huggingface/transformers/f98ef14d161d7bcdc9808b5ec399981481411cc1/utils/download_glue_data.py 30 | python download_glue_data.py --data_dir="${data_dir}" --tasks="${task_name}" 31 | } 32 | 33 | main "$@" 34 | 35 | -------------------------------------------------------------------------------- /examples/nlp/bert/quantization/ptq_dynamic/prepare_model.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | import urllib.request 5 | import zipfile 6 | 7 | import torch 8 | import transformers 9 | 10 | # Please refer to [Bert-GLUE_OnnxRuntime_quantization guide] 11 | # (https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/notebooks/bert/Bert-GLUE_OnnxRuntime_quantization.ipynb) 12 | # for detailed model export.
13 | 14 | MODEL_URL = "https://download.pytorch.org/tutorial/MRPC.zip" 15 | MAX_TIMES_RETRY_DOWNLOAD = 5 16 | 17 | 18 | def parse_arguments(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--input_model", type=str, required=False, default="MRPC.zip") 21 | parser.add_argument("--output_model", type=str, required=True) 22 | parser.add_argument("--max_len", type=int, default=128, help="Maximum length of the sentence pairs") 23 | return parser.parse_args() 24 | 25 | 26 | def progressbar(cur, total=100): 27 | percent = "{:.2%}".format(cur / total) 28 | sys.stdout.write("\r[%-100s] %s" % ("#" * int(cur), percent)) 29 | sys.stdout.flush() 30 | 31 | 32 | def schedule(blocknum, blocksize, totalsize): 33 | if totalsize == 0: 34 | percent = 0 35 | else: 36 | percent = min(1.0, blocknum * blocksize / totalsize) * 100 37 | progressbar(percent) 38 | 39 | 40 | def is_zip_file(filename): 41 | try: 42 | with open(filename, "rb") as f: 43 | magic_number = f.read(4) 44 | return magic_number == b"PK\x03\x04" # ZIP file magic number 45 | except OSError: 46 | return False 47 | 48 | 49 | def extrafile(filename, target_folder="."): 50 | with zipfile.ZipFile(filename, "r") as zin: 51 | zin.extractall(target_folder) 52 | 53 | 54 | def download_model(url, model_name, retry_times=5): 55 | if os.path.isdir(model_name): 56 | return model_name 57 | elif os.path.exists(model_name) and is_zip_file(model_name): 58 | print("file downloaded") 59 | extrafile(model_name) 60 | return True 61 | 62 | print("download model...") 63 | retries = 0 64 | while retries < retry_times: 65 | try: 66 | urllib.request.urlretrieve(url, model_name, schedule) 67 | extrafile(model_name) 68 | break 69 | except KeyboardInterrupt: 70 | return False 71 | except: 72 | retries += 1 73 | print(f"Download failed{', Retry downloading...' if retries < retry_times else '!'}") 74 | return retries < retry_times 75 | 76 | 77 | def export_model(model, output_model, max_len=128): 78 | with torch.no_grad(): 79 | inputs = { 80 | "input_ids": torch.ones(1, max_len, dtype=torch.int64), 81 | "attention_mask": torch.ones(1, max_len, dtype=torch.int64), 82 | "token_type_ids": torch.ones(1, max_len, dtype=torch.int64), 83 | } 84 | 85 | symbolic_names = {0: "batch_size", 1: "max_seq_len"} 86 | torch.onnx.export( 87 | model, # model being run 88 | ( 89 | inputs["input_ids"], 90 | inputs["attention_mask"], 91 | inputs["token_type_ids"], 92 | ), # model input (or a tuple for multiple inputs) 93 | output_model, # where to save the model (can be a file or file-like object) 94 | opset_version=14, # the ONNX version to export the model 95 | do_constant_folding=True, # whether to execute constant folding 96 | input_names=["input_ids", "input_mask", "segment_ids"], # the model's input names 97 | output_names=["output"], # the model's output names 98 | dynamic_axes={ 99 | "input_ids": symbolic_names, # variable length axes 100 | "input_mask": symbolic_names, 101 | "segment_ids": symbolic_names, 102 | }, 103 | ) 104 | assert os.path.exists(output_model), f"Export failed! {output_model} doesn't exist!" 
105 | print("ONNX Model exported to {0}".format(output_model)) 106 | 107 | 108 | def prepare_model(input_model, output_model, max_len): 109 | is_download_successful = download_model(MODEL_URL, input_model, MAX_TIMES_RETRY_DOWNLOAD) 110 | if is_download_successful: 111 | folder_name = is_download_successful if isinstance(is_download_successful, str) else "./MRPC" 112 | model = transformers.BertForSequenceClassification.from_pretrained(folder_name) 113 | export_model(model, output_model, max_len) 114 | 115 | 116 | if __name__ == "__main__": 117 | args = parse_arguments() 118 | prepare_model(args.input_model, args.output_model, args.max_len) 119 | -------------------------------------------------------------------------------- /examples/nlp/bert/quantization/ptq_dynamic/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | transformers 3 | accelerate 4 | onnx 5 | onnxruntime 6 | coloredlogs 7 | sympy 8 | onnxruntime-extensions 9 | scikit-learn 10 | -------------------------------------------------------------------------------- /examples/nlp/bert/quantization/ptq_dynamic/run_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | function main { 5 | 6 | init_params "$@" 7 | run_benchmark 8 | 9 | } 10 | 11 | # init params 12 | function init_params { 13 | for var in "$@" 14 | do 15 | case $var in 16 | --input_model=*) 17 | input_model=$(echo "$var" |cut -f2 -d=) 18 | ;; 19 | --mode=*) 20 | mode=$(echo "$var" |cut -f2 -d=) 21 | ;; 22 | --dataset_location=*) 23 | dataset_location=$(echo "$var" |cut -f2 -d=) 24 | ;; 25 | --batch_size=*) 26 | batch_size=$(echo "$var" |cut -f2 -d=) 27 | ;; 28 | --intra_op_num_threads=*) 29 | intra_op_num_threads=$(echo "$var" |cut -f2 -d=) 30 | ;; 31 | esac 32 | done 33 | 34 | } 35 | 36 | # run_benchmark 37 | function run_benchmark { 38 | if [[ ${mode} == "accuracy" ]]; then 39 | dynamic_length=False 40 | elif [[ ${mode} == "performance" ]]; then 41 | dynamic_length=True 42 | else 43 | echo "Error: No such mode: ${mode}" 44 | exit 1 45 | fi 46 | 47 | model_name_or_path="bert-base-uncased" 48 | task_name="mrpc" 49 | 50 | python main.py \ 51 | --model_path "${input_model}" \ 52 | --model_name_or_path "${model_name_or_path}" \ 53 | --data_path "${dataset_location}" \ 54 | --task "${task_name}" \ 55 | --batch_size "${batch_size}" \ 56 | --mode "${mode}" \ 57 | --dynamic_length "${dynamic_length}" \ 58 | --intra_op_num_threads "${intra_op_num_threads-4}" \ 59 | --benchmark 60 | 61 | } 62 | 63 | main "$@" 64 | 65 | -------------------------------------------------------------------------------- /examples/nlp/bert/quantization/ptq_dynamic/run_quant.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | function main { 5 | init_params "$@" 6 | run_tuning 7 | } 8 | 9 | # init params 10 | function init_params { 11 | for var in "$@" 12 | do 13 | case $var in 14 | --input_model=*) 15 | input_model=$(echo "$var" |cut -f2 -d=) 16 | ;; 17 | --output_model=*) 18 | output_model=$(echo "$var" |cut -f2 -d=) 19 | ;; 20 | --dataset_location=*) 21 | dataset_location=$(echo "$var" |cut -f2 -d=) 22 | ;; 23 | esac 24 | done 25 | 26 | } 27 | 28 | # run_tuning 29 | function run_tuning { 30 | model_name_or_path="bert-base-uncased" 31 | batch_size=8 32 | task_name="mrpc" 33 | 34 | python main.py \ 35 | --model_path "${input_model}" \ 36 | --output_model "${output_model}" \ 37 | --model_name_or_path 
"${model_name_or_path}" \ 38 | --data_path "${dataset_location}" \ 39 | --task "${task_name}" \ 40 | --batch_size "${batch_size}" \ 41 | --tune 42 | } 43 | 44 | main "$@" 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/nlp/bert/quantization/ptq_static/README.md: -------------------------------------------------------------------------------- 1 | Step-by-Step 2 | ============ 3 | 4 | This example load a BERT model and confirm its accuracy and speed based on [GLUE data](https://gluebenchmark.com/). 5 | 6 | # Prerequisite 7 | 8 | ## 1. Environment 9 | 10 | ```shell 11 | pip install onnx-neural-compressor 12 | pip install -r requirements.txt 13 | ``` 14 | 15 | ## 2. Prepare Dataset 16 | 17 | download the GLUE data with `prepare_data.sh` script. 18 | ```shell 19 | export GLUE_DIR=path/to/glue_data 20 | export TASK_NAME=MRPC 21 | 22 | bash prepare_data.sh --data_dir=$GLUE_DIR --task_name=$TASK_NAME 23 | ``` 24 | 25 | ## 3. Prepare Model 26 | 27 | ```shell 28 | python prepare_model.py --input_model='MRPC.zip' --output_model='bert.onnx' 29 | ``` 30 | 31 | # Run 32 | 33 | ## 1. Quantization 34 | 35 | Static quantization with QOperator format: 36 | 37 | ```bash 38 | bash run_quant.sh --input_model=path/to/model \ # model path as *.onnx 39 | --output_model=path/to/model_tune \ 40 | --dataset_location=path/to/glue_data \ 41 | --quant_format="QOperator" 42 | ``` 43 | 44 | Static quantization with QDQ format: 45 | 46 | ```bash 47 | bash run_quant.sh --input_model=path/to/model \ # model path as *.onnx 48 | --output_model=path/to/model_tune \ # model path as *.onnx 49 | --dataset_location=path/to/glue_data \ 50 | --quant_format="QDQ" 51 | ``` 52 | 53 | ## 2. Benchmark 54 | 55 | ```bash 56 | bash run_benchmark.sh --input_model=path/to/model \ # model path as *.onnx 57 | --dataset_location=path/to/glue_data \ 58 | --batch_size=batch_size \ 59 | --mode=performance # or accuracy 60 | ``` 61 | -------------------------------------------------------------------------------- /examples/nlp/bert/quantization/ptq_static/prepare_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | function main { 5 | init_params "$@" 6 | download_data 7 | 8 | } 9 | 10 | # init params 11 | function init_params { 12 | 13 | for var in "$@" 14 | do 15 | case $var in 16 | --data_dir=*) 17 | data_dir=$(echo "$var" |cut -f2 -d=) 18 | ;; 19 | --task_name=*) 20 | task_name=$(echo "$var" |cut -f2 -d=) 21 | ;; 22 | esac 23 | done 24 | 25 | } 26 | 27 | # run_tuning 28 | function download_data { 29 | wget https://raw.githubusercontent.com/huggingface/transformers/f98ef14d161d7bcdc9808b5ec399981481411cc1/utils/download_glue_data.py 30 | python download_glue_data.py --data_dir="${data_dir}" --tasks="${task_name}" 31 | } 32 | 33 | main "$@" 34 | 35 | -------------------------------------------------------------------------------- /examples/nlp/bert/quantization/ptq_static/prepare_model.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | import urllib.request 5 | import zipfile 6 | 7 | import torch 8 | import transformers 9 | 10 | # Please refer to [Bert-GLUE_OnnxRuntime_quantization guide] 11 | # (https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/notebooks/bert/Bert-GLUE_OnnxRuntime_quantization.ipynb) 12 | # for detailed model export. 
13 | 14 | MODEL_URL = "https://download.pytorch.org/tutorial/MRPC.zip" 15 | MAX_TIMES_RETRY_DOWNLOAD = 5 16 | 17 | 18 | def parse_arguments(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--input_model", type=str, required=False, default="MRPC.zip") 21 | parser.add_argument("--output_model", type=str, required=True) 22 | parser.add_argument("--max_len", type=int, default=128, help="Maximum length of the sentence pairs") 23 | return parser.parse_args() 24 | 25 | 26 | def progressbar(cur, total=100): 27 | percent = "{:.2%}".format(cur / total) 28 | sys.stdout.write("\r[%-100s] %s" % ("#" * int(cur), percent)) 29 | sys.stdout.flush() 30 | 31 | 32 | def schedule(blocknum, blocksize, totalsize): 33 | if totalsize == 0: 34 | percent = 0 35 | else: 36 | percent = min(1.0, blocknum * blocksize / totalsize) * 100 37 | progressbar(percent) 38 | 39 | 40 | def is_zip_file(filename): 41 | try: 42 | with open(filename, "rb") as f: 43 | magic_number = f.read(4) 44 | return magic_number == b"PK\x03\x04" # ZIP file magic number 45 | except OSError: 46 | return False 47 | 48 | 49 | def extrafile(filename, target_folder="."): 50 | with zipfile.ZipFile(filename, "r") as zin: 51 | zin.extractall(target_folder) 52 | 53 | 54 | def download_model(url, model_name, retry_times=5): 55 | if os.path.isdir(model_name): 56 | return model_name 57 | elif os.path.exists(model_name) and is_zip_file(model_name): 58 | print("file downloaded") 59 | extrafile(model_name) 60 | return True 61 | 62 | print("download model...") 63 | retries = 0 64 | while retries < retry_times: 65 | try: 66 | urllib.request.urlretrieve(url, model_name, schedule) 67 | extrafile(model_name) 68 | break 69 | except KeyboardInterrupt: 70 | return False 71 | except: 72 | retries += 1 73 | print(f"Download failed{', Retry downloading...' if retries < retry_times else '!'}") 74 | return retries < retry_times 75 | 76 | 77 | def export_model(model, output_model, max_len=128): 78 | with torch.no_grad(): 79 | inputs = { 80 | "input_ids": torch.ones(1, max_len, dtype=torch.int64), 81 | "attention_mask": torch.ones(1, max_len, dtype=torch.int64), 82 | "token_type_ids": torch.ones(1, max_len, dtype=torch.int64), 83 | } 84 | 85 | symbolic_names = {0: "batch_size", 1: "max_seq_len"} 86 | torch.onnx.export( 87 | model, # model being run 88 | ( 89 | inputs["input_ids"], 90 | inputs["attention_mask"], 91 | inputs["token_type_ids"], 92 | ), # model input (or a tuple for multiple inputs) 93 | output_model, # where to save the model (can be a file or file-like object) 94 | opset_version=14, # the ONNX version to export the model 95 | do_constant_folding=True, # whether to execute constant folding 96 | input_names=["input_ids", "input_mask", "segment_ids"], # the model's input names 97 | output_names=["output"], # the model's output names 98 | dynamic_axes={ 99 | "input_ids": symbolic_names, # variable length axes 100 | "input_mask": symbolic_names, 101 | "segment_ids": symbolic_names, 102 | }, 103 | ) 104 | assert os.path.exists(output_model), f"Export failed! {output_model} doesn't exist!" 
105 | print("ONNX Model exported to {0}".format(output_model)) 106 | 107 | 108 | def prepare_model(input_model, output_model, max_len): 109 | is_download_successful = download_model(MODEL_URL, input_model, MAX_TIMES_RETRY_DOWNLOAD) 110 | if is_download_successful: 111 | folder_name = is_download_successful if isinstance(is_download_successful, str) else "./MRPC" 112 | model = transformers.BertForSequenceClassification.from_pretrained(folder_name) 113 | export_model(model, output_model, max_len) 114 | 115 | 116 | if __name__ == "__main__": 117 | args = parse_arguments() 118 | prepare_model(args.input_model, args.output_model, args.max_len) 119 | -------------------------------------------------------------------------------- /examples/nlp/bert/quantization/ptq_static/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | transformers 3 | accelerate 4 | onnx 5 | onnxruntime 6 | coloredlogs 7 | sympy 8 | onnxruntime-extensions 9 | scikit-learn 10 | -------------------------------------------------------------------------------- /examples/nlp/bert/quantization/ptq_static/run_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | function main { 5 | 6 | init_params "$@" 7 | run_benchmark 8 | 9 | } 10 | 11 | # init params 12 | function init_params { 13 | for var in "$@" 14 | do 15 | case $var in 16 | --input_model=*) 17 | input_model=$(echo "$var" |cut -f2 -d=) 18 | ;; 19 | --mode=*) 20 | mode=$(echo "$var" |cut -f2 -d=) 21 | ;; 22 | --dataset_location=*) 23 | dataset_location=$(echo "$var" |cut -f2 -d=) 24 | ;; 25 | --batch_size=*) 26 | batch_size=$(echo "$var" |cut -f2 -d=) 27 | ;; 28 | --intra_op_num_threads=*) 29 | intra_op_num_threads=$(echo "$var" |cut -f2 -d=) 30 | ;; 31 | esac 32 | done 33 | 34 | } 35 | 36 | # run_benchmark 37 | function run_benchmark { 38 | if [[ ${mode} == "accuracy" ]]; then 39 | dynamic_length=False 40 | elif [[ ${mode} == "performance" ]]; then 41 | dynamic_length=True 42 | else 43 | echo "Error: No such mode: ${mode}" 44 | exit 1 45 | fi 46 | 47 | model_name_or_path="bert-base-uncased" 48 | task_name="mrpc" 49 | 50 | python main.py \ 51 | --model_path "${input_model}" \ 52 | --model_name_or_path "${model_name_or_path}" \ 53 | --data_path "${dataset_location}" \ 54 | --task "${task_name}" \ 55 | --batch_size "${batch_size}" \ 56 | --mode "${mode}" \ 57 | --intra_op_num_threads "${intra_op_num_threads-4}" \ 58 | --dynamic_length "${dynamic_length}" \ 59 | --benchmark 60 | 61 | } 62 | 63 | main "$@" 64 | 65 | -------------------------------------------------------------------------------- /examples/nlp/bert/quantization/ptq_static/run_quant.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | function main { 5 | init_params "$@" 6 | run_tuning 7 | } 8 | 9 | # init params 10 | function init_params { 11 | for var in "$@" 12 | do 13 | case $var in 14 | --input_model=*) 15 | input_model=$(echo "$var" |cut -f2 -d=) 16 | ;; 17 | --output_model=*) 18 | output_model=$(echo "$var" |cut -f2 -d=) 19 | ;; 20 | --dataset_location=*) 21 | dataset_location=$(echo "$var" |cut -f2 -d=) 22 | ;; 23 | --quant_format=*) 24 | quant_format=$(echo "$var" |cut -f2 -d=) 25 | ;; 26 | esac 27 | done 28 | 29 | } 30 | 31 | # run_tuning 32 | function run_tuning { 33 | model_name_or_path="bert-base-uncased" 34 | batch_size=8 35 | task_name="mrpc" 36 | model_type="bert" 37 | 38 | python main.py \ 39 | 
--model_path "${input_model}" \ 40 | --output_model "${output_model}" \ 41 | --model_name_or_path "${model_name_or_path}" \ 42 | --data_path "${dataset_location}" \ 43 | --task "${task_name}" \ 44 | --batch_size "${batch_size}" \ 45 | --model_type "${model_type}" \ 46 | --quant_format "${quant_format}" \ 47 | --tune 48 | } 49 | 50 | main "$@" 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_generation/quantization/weight_only/README.md: -------------------------------------------------------------------------------- 1 | Step-by-Step 2 | ============ 3 | 4 | This example confirms llama's weight only accuracy on [lambada](https://huggingface.co/datasets/lambada). 5 | 6 | # Prerequisite 7 | 8 | ## 1. Environment 9 | ```shell 10 | pip install onnx-neural-compressor 11 | pip install -r requirements.txt 12 | ``` 13 | > Note: Validated ONNX Runtime [Version](/docs/installation_guide.md#validated-software-environment). 14 | 15 | ## 2. Prepare Model 16 | 17 | Note that this README.md uses meta-llama/Llama-2-7b-hf as an example. We verified weight-only quantization on other models as follows. 18 | 19 | | Model | Num Hidden Layers| Num Attention Heads | Hidden Size | 20 | | --- | --- | --- | --- | 21 | | [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 32 | 32 | 4096 | 22 | | [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 32 | 32 | 4096 | 23 | | [meta-llama/Llama-2-13b-hf](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 40 | 40 | 5120 | 24 | | [meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | 40 | 40 | 5120 | 25 | | [meta-llama/Llama-2-70b-hf](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 80 | 64 | 8192 | 26 | | [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 80 | 64 | 8192 | 27 | | [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) | 32 | 32 | 4096 | 28 | | [microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct) | 32 | 32 | 3072 | 29 | | [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) | 28 | 28 | 3584 | 30 | 31 | Export to ONNX model: 32 | ```bash 33 | python prepare_model.py --input_model="meta-llama/Llama-2-7b-hf" \ 34 | --task=text-generation-with-past \ # or text-generation 35 | ``` 36 | 37 | 38 | # Run 39 | 40 | ## 1. Quantization 41 | 42 | Set `algorithm=WOQ_TUNE` to tune weight-only quantization algorithm or specify algorithm to `RTN` or `GPTQ` or `AWQ`. 43 | 44 | `quant_format=QDQ` works only when: 45 | - onnxruntime >= 1.19.0 46 | - opset version of the model >= 21 47 | - quantized bits is in [4, 8] 48 | 49 | otherwise it will execute QOperator automatically. 50 | 51 | ```bash 52 | bash run_quant.sh --input_model=/path/to/model \ # folder path of onnx model 53 | --output_model=/path/to/model_tune \ # folder path to save onnx model 54 | --batch_size=batch_size # optional \ 55 | --dataset=NeelNanda/pile-10k \ 56 | --tokenizer=meta-llama/Llama-2-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer 57 | --algorithm=WOQ_TUNE # support WOQ_TUNE, RTN, AWQ, GPTQ \ 58 | --quant_format=QDQ # support QOperator and QDQ 59 | ``` 60 | 61 | ## 2. 
Benchmark 62 | 63 | Accuracy: 64 | 65 | ```bash 66 | bash run_benchmark.sh --input_model=path/to/model \ # folder path of onnx model 67 | --batch_size=batch_size \ # optional 68 | --mode=accuracy \ 69 | --tokenizer=meta-llama/Llama-2-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer 70 | --tasks=lambada_openai 71 | ``` 72 | 73 | Performance: 74 | ```bash 75 | numactl -m 0 -C 0-23 bash run_benchmark.sh --input_model=path/to/model \ # folder path of onnx model 76 | --mode=performance \ 77 | --batch_size=batch_size # optional \ 78 | --tokenizer=meta-llama/Llama-2-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer 79 | --intra_op_num_threads=24 80 | 81 | ``` 82 | -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_generation/quantization/weight_only/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Refer from https://github.com/intel/intel-extension-for-transformers/tree/main/intel_extension_for_transformers/transformers/llm/evaluation/lm_eval 16 | 17 | from evaluation.accuracy import cli_evaluate as evaluate 18 | from evaluation.utils import LMEvalParser 19 | -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_generation/quantization/weight_only/evaluation/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | from evaluation.models import huggingface 17 | 18 | # TODO: implement __all__ 19 | 20 | 21 | try: 22 | # enable hf hub transfer if available 23 | import hf_transfer # type: ignore # noqa 24 | import huggingface_hub.constants # type: ignore 25 | 26 | huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True 27 | except ImportError: 28 | pass 29 | -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_generation/quantization/weight_only/evaluation/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | class LMEvalParser: 17 | def __init__( 18 | self, 19 | model="hf", 20 | tasks="lambada_openai", 21 | model_args="", 22 | user_model=None, 23 | tokenizer=None, 24 | num_fewshot=None, 25 | batch_size=1, 26 | max_batch_size=None, 27 | provider=None, 28 | output_path=None, 29 | limit=None, 30 | use_cache=None, 31 | cache_requests=None, 32 | check_integrity=False, 33 | write_out=False, 34 | log_samples=False, 35 | show_config=False, 36 | include_path=None, 37 | gen_kwargs=None, 38 | verbosity="INFO", 39 | wandb_args="", 40 | predict_only=False, 41 | seed=[0, 1234, 1234], 42 | trust_remote_code=False, 43 | ): 44 | self.model = model 45 | self.tasks = tasks 46 | self.model_args = model_args 47 | self.user_model = user_model 48 | self.tokenizer = tokenizer 49 | self.num_fewshot = num_fewshot 50 | self.batch_size = batch_size 51 | self.max_batch_size = max_batch_size 52 | self.provider = provider 53 | self.output_path = output_path 54 | self.limit = limit 55 | self.use_cache = use_cache 56 | self.cache_requests = cache_requests 57 | self.check_integrity = check_integrity 58 | self.write_out = write_out 59 | self.log_samples = log_samples 60 | self.show_config = show_config 61 | self.include_path = include_path 62 | self.gen_kwargs = gen_kwargs 63 | self.verbosity = verbosity 64 | self.wandb_args = wandb_args 65 | self.predict_only = predict_only 66 | self.seed = seed 67 | self.trust_remote_code = trust_remote_code 68 | -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_generation/quantization/weight_only/prepare_model.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import subprocess 4 | 5 | import optimum.version 6 | from packaging import version 7 | 8 | OPTIMUM114_VERSION = version.Version("1.14.0") 9 | 10 | 11 | def parse_arguments(): 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("--input_model", type=str, required=True, default="") 14 | parser.add_argument("--output_model", type=str, required=False, default=None) 15 | parser.add_argument( 16 | "--task", 17 | type=str, 18 | required=False, 19 | default="text-generation-with-past", 20 | choices=["text-generation-with-past", 
"text-generation"], 21 | ) 22 | args = parser.parse_args() 23 | if args.output_model is None: 24 | args.output_model = os.path.basename(args.input_model) + "-onnx" 25 | return args 26 | 27 | 28 | def prepare_model(input_model, output_model, task): 29 | print("\nexport model...") 30 | if version.Version(optimum.version.__version__) < OPTIMUM114_VERSION: 31 | raise ImportError("Please upgrade optimum to >= 1.14.0") 32 | 33 | subprocess.run( 34 | [ 35 | "optimum-cli", 36 | "export", 37 | "onnx", 38 | "--model", 39 | f"{input_model}", 40 | "--task", 41 | task, 42 | f"{output_model}", 43 | "--trust-remote-code", 44 | ], 45 | stdout=subprocess.PIPE, 46 | text=True, 47 | ) 48 | 49 | assert os.path.exists(output_model), f"{output_model} doesn't exist!" 50 | 51 | 52 | if __name__ == "__main__": 53 | args = parse_arguments() 54 | prepare_model(args.input_model, args.output_model, args.task) 55 | -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_generation/quantization/weight_only/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | transformers 3 | onnx 4 | onnxruntime 5 | onnxruntime-extensions 6 | datasets 7 | optimum 8 | lm-eval==0.4.2 9 | -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | function main { 5 | 6 | init_params "$@" 7 | run_benchmark 8 | 9 | } 10 | 11 | # init params 12 | function init_params { 13 | for var in "$@" 14 | do 15 | case $var in 16 | --input_model=*) 17 | input_model=$(echo "$var" |cut -f2 -d=) 18 | ;; 19 | --batch_size=*) 20 | batch_size=$(echo "$var" |cut -f2 -d=) 21 | ;; 22 | --tokenizer=*) 23 | tokenizer=$(echo "$var" |cut -f2 -d=) 24 | ;; 25 | --mode=*) 26 | mode=$(echo "$var" |cut -f2 -d=) 27 | ;; 28 | --intra_op_num_threads=*) 29 | intra_op_num_threads=$(echo "$var" |cut -f2 -d=) 30 | ;; 31 | esac 32 | done 33 | 34 | } 35 | 36 | # run_benchmark 37 | function run_benchmark { 38 | 39 | # Check if the input_model ends with the filename extension ".onnx" 40 | if [[ $input_model =~ \.onnx$ ]]; then 41 | # If the string ends with the filename extension, get the path of the file 42 | input_model=$(dirname "$input_model") 43 | fi 44 | 45 | extra_cmd="" 46 | 47 | if [[ "${tokenizer}" =~ "Phi-3-mini" ]]; then 48 | extra_cmd=$extra_cmd"--trust_remote_code True " 49 | fi 50 | 51 | if [ "${batch_size}" ]; then 52 | extra_cmd=$extra_cmd"--batch_size ${batch_size} " 53 | fi 54 | if [ "${tokenizer}" ]; then 55 | extra_cmd=$extra_cmd"--tokenizer ${tokenizer} " 56 | fi 57 | if [ "${tasks}" ]; then 58 | extra_cmd=$extra_cmd"--tasks ${tasks} " 59 | fi 60 | if [ "${intra_op_num_threads}" ]; then 61 | extra_cmd=$extra_cmd"--intra_op_num_threads ${intra_op_num_threads} " 62 | fi 63 | 64 | extra_cmd=$extra_cmd"--benchmark" 65 | eval "python main.py --model_path ${input_model} --mode ${mode} ${extra_cmd}" 66 | 67 | } 68 | 69 | main "$@" 70 | -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_quant.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | function main { 5 | init_params "$@" 6 | run_tuning 7 | } 8 | 9 | # init params 10 | function init_params { 11 | 
for var in "$@" 12 | do 13 | case $var in 14 | --input_model=*) 15 | input_model=$(echo "$var" |cut -f2 -d=) 16 | ;; 17 | --output_model=*) 18 | output_model=$(echo "$var" |cut -f2 -d=) 19 | ;; 20 | --batch_size=*) 21 | batch_size=$(echo "$var" |cut -f2 -d=) 22 | ;; 23 | --dataset=*) 24 | dataset=$(echo "$var" |cut -f2 -d=) 25 | ;; 26 | --tokenizer=*) 27 | tokenizer=$(echo "$var" |cut -f2 -d=) 28 | ;; 29 | --algorithm=*) 30 | algorithm=$(echo "$var" |cut -f2 -d=) 31 | ;; 32 | --quant_format=*) 33 | quant_format=$(echo "$var" |cut -f2 -d=) 34 | ;; 35 | esac 36 | done 37 | 38 | } 39 | 40 | # run_tuning 41 | function run_tuning { 42 | 43 | # Check if the input_model ends with the filename extension ".onnx" 44 | if [[ $input_model =~ \.onnx$ ]]; then 45 | # If the string ends with the filename extension, get the path of the file 46 | input_model=$(dirname "$input_model") 47 | fi 48 | 49 | # Check if the output_model ends with the filename extension ".onnx" 50 | if [[ $output_model =~ \.onnx$ ]]; then 51 | # If the string ends with the filename extension, get the path of the file 52 | output_model=$(dirname "$output_model") 53 | fi 54 | 55 | # Check if the directory exists 56 | if [ ! -d "$output_model" ]; then 57 | # If the directory doesn't exist, create it 58 | mkdir -p "$output_model" 59 | echo "Created directory $output_model" 60 | fi 61 | 62 | extra_cmd="" 63 | 64 | if [[ "${tokenizer}" =~ "Phi-3-mini" ]]; then 65 | nodes_to_exclude="/model/layers.*/self_attn/qkv_proj/MatMul /model/layers.*/mlp/down_proj/MatMul" 66 | extra_cmd=$extra_cmd"--nodes_to_exclude ${nodes_to_exclude} --trust_remote_code True " 67 | fi 68 | if [[ "${tokenizer}" =~ "Llama-3-8B" ]]; then 69 | nodes_to_exclude="/model/layers.*/mlp/down_proj/MatMul" 70 | extra_cmd=$extra_cmd"--nodes_to_exclude ${nodes_to_exclude} " 71 | fi 72 | if [[ "${tokenizer}" =~ "Qwen2-7B" ]]; then 73 | nodes_to_exclude="/model/layers.*/mlp/down_proj/MatMul /model/layers.*/mlp/up_proj/MatMul" 74 | extra_cmd=$extra_cmd"--nodes_to_exclude ${nodes_to_exclude} " 75 | fi 76 | 77 | if [ "${tokenizer}" ]; then 78 | extra_cmd=$extra_cmd"--tokenizer ${tokenizer} " 79 | fi 80 | if [ "${batch_size}" ]; then 81 | extra_cmd=$extra_cmd"--batch_size ${batch_size} " 82 | fi 83 | if [ "${dataset}" ]; then 84 | extra_cmd=$extra_cmd"--dataset ${dataset} " 85 | fi 86 | if [ "${algorithm}" ]; then 87 | extra_cmd=$extra_cmd"--algorithm ${algorithm} " 88 | fi 89 | if [ "${tasks}" ]; then 90 | extra_cmd=$extra_cmd"--tasks ${tasks} " 91 | fi 92 | if [ "${quant_format}" ]; then 93 | extra_cmd=$extra_cmd"--quant_format ${quant_format} " 94 | fi 95 | 96 | extra_cmd=$extra_cmd"--layer_wise --tune" 97 | eval "python main.py --model_path ${input_model} --output_model ${output_model} ${extra_cmd}" 98 | } 99 | 100 | main "$@" 101 | -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_to_image/stable_diffusion_v1_5/quantization/ptq_static/README.md: -------------------------------------------------------------------------------- 1 | Step-by-Step 2 | ============ 3 | 4 | This example shows how to quantize the unet model of [stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) with SmoothQuant and generate images with the quantized unet. 5 | 6 | # Prerequisite 7 | 8 | ## 1. Environment 9 | ```shell 10 | pip install -r requirements.txt 11 | ``` 12 | > Note: Validated ONNX Runtime [Version](/docs/installation_guide.md#validated-software-environment). 13 | 14 | ## 2. 
Prepare Model 15 | 16 | 17 | ```bash 18 | git clone https://github.com/huggingface/diffusers.git 19 | cd diffusers/scripts 20 | python convert_stable_diffusion_checkpoint_to_onnx.py --model_path runwayml/stable-diffusion-v1-5 --output_path stable-diffusion 21 | ``` 22 | 23 | # Run 24 | 25 | ## 1. Quantization 26 | 27 | ```bash 28 | bash run_quant.sh --input_model=/path/to/stable-diffusion \ # folder path of stable-diffusion 29 | --output_model=/path/to/save/unet_model \ # model path as *.onnx 30 | --alpha=0.7 # optional 31 | ``` 32 | 33 | ## 2. Benchmark 34 | 35 | ```bash 36 | bash run_benchmark.sh --input_model=/path/to/stable-diffusion \ # folder path of stable-diffusion 37 | --quantized_unet_path=/path/to/quantized/unet.onnx \ # optional, run fp32 model if not provided 38 | --prompt="a photo of an astronaut riding a horse on mars" \ # optional 39 | --image_path=image.png # optional 40 | ``` 41 | 42 | Benchmark will print the throughput data and save the generated image. 43 | Our test results with default parameters are (fp32 vs int8): 44 |
45 | | fp32 | int8 |
46 | | :---: | :---: |
47 | | ![fp32](imgs/fp32.png) | ![int8](imgs/int8.png) |
48 | -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_to_image/stable_diffusion_v1_5/quantization/ptq_static/imgs/fp32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onnx/neural-compressor/3dd43d3d852d43c1133005dbe440cfc18a1d7471/examples/nlp/huggingface_model/text_to_image/stable_diffusion_v1_5/quantization/ptq_static/imgs/fp32.png -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_to_image/stable_diffusion_v1_5/quantization/ptq_static/imgs/int8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onnx/neural-compressor/3dd43d3d852d43c1133005dbe440cfc18a1d7471/examples/nlp/huggingface_model/text_to_image/stable_diffusion_v1_5/quantization/ptq_static/imgs/int8.png -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_to_image/stable_diffusion_v1_5/quantization/ptq_static/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | diffusers 3 | onnx 4 | onnxruntime 5 | onnxruntime-extensions 6 | onnx_neural_compressor 7 | transformers==4.42.0 # restricted by model export 8 | -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_to_image/stable_diffusion_v1_5/quantization/ptq_static/run_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | function main { 5 | 6 | init_params "$@" 7 | run_benchmark 8 | 9 | } 10 | 11 | # init params 12 | function init_params { 13 | for var in "$@" 14 | do 15 | case $var in 16 | --input_model=*) 17 | input_model=$(echo "$var" |cut -f2 -d=) 18 | ;; 19 | --quantized_unet_path=*) 20 | quantized_unet_path=$(echo "$var" |cut -f2 -d=) 21 | ;; 22 | --batch_size=*) 23 | batch_size=$(echo "$var" |cut -f2 -d=) 24 | ;; 25 | --prompt=*) 26 | prompt=$(echo "$var" |cut -f2 -d=) 27 | ;; 28 | --image_path=*) 29 | image_path=$(echo "$var" |cut -f2 -d=) 30 | ;; 31 | esac 32 | done 33 | 34 | } 35 | 36 | # run_benchmark 37 | function run_benchmark { 38 | 39 | # Check if the input_model ends with the filename extension ".onnx" 40 | if [[ $input_model =~ \.onnx$ ]]; then 41 | # If the string ends with the filename extension, get the path of the file 42 | input_model=$(dirname "$input_model") 43 | fi 44 | 45 | extra_cmd="" 46 | 47 | if [ "$quantized_unet_path" ]; then 48 | extra_cmd=$extra_cmd"--quantized_unet_path=${quantized_unet_path} " 49 | fi 50 | 51 | if [ "$prompt" ]; then 52 | extra_cmd=$extra_cmd"--prompt=${prompt} " 53 | fi 54 | 55 | if [ "$image_path" ]; then 56 | extra_cmd=$extra_cmd"--image_path=${image_path} " 57 | fi 58 | 59 | if [ "$batch_size" ]; then 60 | extra_cmd=$extra_cmd"--batch_size=${batch_size} " 61 | fi 62 | extra_cmd=$extra_cmd"--benchmark" 63 | eval "python main.py --model_path=${input_model} ${extra_cmd}" 64 | } 65 | 66 | main "$@" 67 | 68 | -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_to_image/stable_diffusion_v1_5/quantization/ptq_static/run_quant.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | function main { 5 | init_params "$@" 6 | run_tuning 7 | } 8 | 9 | # init params 10 | function 
init_params { 11 | for var in "$@" 12 | do 13 | case $var in 14 | --input_model=*) 15 | input_model=$(echo "$var" |cut -f2 -d=) 16 | ;; 17 | --output_model=*) 18 | output_model=$(echo "$var" |cut -f2 -d=) 19 | ;; 20 | --alpha=*) 21 | alpha=$(echo "$var" |cut -f2 -d=) 22 | ;; 23 | esac 24 | done 25 | 26 | } 27 | 28 | # run_tuning 29 | function run_tuning { 30 | 31 | # Check if the input_model ends with the filename extension ".onnx" 32 | if [[ $input_model =~ \.onnx$ ]]; then 33 | # If the string ends with the filename extension, get the path of the file 34 | input_model=$(dirname "$input_model") 35 | fi 36 | 37 | # Check if the directory exists 38 | if [ ! -d "$(dirname "$output_model")" ]; then 39 | # If the directory doesn't exist, create it 40 | mkdir -p "$(dirname "$output_model")" 41 | echo "Created directory $(dirname "$output_model")" 42 | fi 43 | 44 | python main.py \ 45 | --model_path "${input_model}" \ 46 | --output_model "${output_model}" \ 47 | --alpha "${alpha-0.7}" \ 48 | --tune 49 | } 50 | 51 | main "$@" 52 | 53 | -------------------------------------------------------------------------------- /onnx_neural_compressor/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Neural Compressor: An open-source Python library supporting popular model compression techniques for ONNX models.""" 15 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/layer_wise/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Operators for onnx model.""" 15 | 16 | import glob 17 | from os import path 18 | 19 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 20 | 21 | modules = glob.glob(path.join(path.dirname(__file__), "*.py")) 22 | 23 | for f in modules: 24 | if path.isfile(f) and not f.startswith("__") and not f.endswith("__init__.py"): 25 | __import__(path.basename(f)[:-3], globals(), locals(), level=1) 26 | 27 | OPERATORS = base_op.OPERATORS 28 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Activation operator.""" 15 | 16 | import onnx 17 | 18 | from onnx_neural_compressor import constants, utility 19 | from onnx_neural_compressor.algorithms import utility as quant_utils 20 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 21 | 22 | 23 | @base_op.op_registry(op_types="LeakyRelu, Sigmoid", mode=[constants.STATIC_QUANT]) 24 | class ActivationOperator(base_op.Operator): 25 | """Activation operator.""" 26 | 27 | def __init__(self, onnx_quantizer, onnx_node): 28 | """Initialization.""" 29 | super(ActivationOperator, self).__init__(onnx_quantizer, onnx_node) 30 | 31 | def quantize_check(self): 32 | """Check if quantizaion can be done.""" 33 | node = self.node 34 | data_found, _, _, _, _ = self.quantizer._get_quantization_params(node.output[0]) 35 | if not data_found: 36 | return False 37 | return True 38 | 39 | def quantize(self): 40 | """Do quantizaion.""" 41 | node = self.node 42 | super().quantize() 43 | node.name = node.name + "_quant" 44 | 45 | def convert_check(self): 46 | """Check if conversion can be done.""" 47 | node = self.node 48 | 49 | children = self.quantizer.model.get_children(node) 50 | if len(children) == 0 or not node.name.endswith("_quant"): 51 | return False 52 | return True 53 | 54 | def convert(self): 55 | """Convert to QOperator format.""" 56 | node = self.node 57 | 58 | parent = self.quantizer.model.get_parents(node)[0] 59 | child = self.quantizer.model.get_children(node)[0] 60 | 61 | inputs = [] 62 | inputs.extend(parent.input) 63 | inputs.extend(child.input[1:]) 64 | 65 | qlinear_activation_output = child.output[0] 66 | kwargs = {} 67 | for attribute in node.attribute: # pragma: no cover 68 | kwargs.update(quant_utils.attribute_to_kwarg(attribute)) 69 | kwargs["domain"] = quant_utils.ms_domain 70 | 71 | qlinear_activation_node = onnx.helper.make_node( 72 | "QLinear" + node.op_type, inputs, [qlinear_activation_output], node.name, **kwargs 73 | ) 74 | 75 | self.quantizer.new_nodes.append(qlinear_activation_node) 76 | self.quantizer.remove_nodes.extend([parent, child, node]) 77 | 78 | 79 | @base_op.op_registry(op_types="Relu, Clip", mode=[constants.STATIC_QUANT]) 80 | class RemovableActivationOperator(base_op.Operator): 81 | """Removable activation operator.""" 82 | 83 | def __init__(self, onnx_quantizer, onnx_node): 84 | """Initialization.""" 85 | super(RemovableActivationOperator, self).__init__(onnx_quantizer, onnx_node) 86 | 87 | def quantize_check(self): 88 | """Check if quantizaion can be done.""" 89 | node = self.node 90 | if node.input[0] not in self.quantizer.quantized_value_map: 91 | return False 92 | return True 93 | 94 | def quantize(self): 95 | """Do quantization.""" 96 | node = self.node 97 | if node.output[0] in [i.name for i in self.quantizer.model.model.graph.output]: 98 | self.quantizer.dequantize_tensor(node, node.input[0]) 99 | else: 100 | self.quantizer.model.replace_input_of_all_nodes(node.output[0], node.input[0]) 101 | self.quantizer.remove_nodes.append(node) 102 | 103 | 104 | @base_op.op_registry( 105 | op_types="Softmax, BiasGelu, Elu, Exp, FastGelu, Gelu, Softplus, Tanh", mode=[constants.STATIC_QUANT] 106 | ) 107 | class Float16ActivationOperator(base_op.Operator): 108 | """Float16 Activation operator.""" 109 | 110 | def __init__(self, onnx_quantizer, onnx_node): 111 | """Initialization.""" 112 | super(Float16ActivationOperator, self).__init__(onnx_quantizer, onnx_node) 113 | 
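For reference, the `convert()` method above rewrites a `DequantizeLinear -> Sigmoid/LeakyRelu -> QuantizeLinear` chain into a single contrib operator named `QLinear<op_type>` in the `com.microsoft` domain, reusing the scale and zero-point inputs of the removed Q/DQ nodes. A minimal hand-built sketch of the node it emits is shown below; the tensor names are illustrative placeholders, not identifiers from this repository.

```python
# Sketch only: the fused node produced by ActivationOperator.convert() for a
# Sigmoid wrapped in DequantizeLinear/QuantizeLinear. Input order follows the
# code above: the DequantizeLinear inputs (x, x_scale, x_zp) followed by the
# downstream QuantizeLinear inputs[1:] (y_scale, y_zp).
import onnx

qlinear_sigmoid = onnx.helper.make_node(
    "QLinearSigmoid",
    inputs=["x_quantized", "x_scale", "x_zero_point", "y_scale", "y_zero_point"],
    outputs=["y_quantized"],
    name="sigmoid_node_quant",
    domain="com.microsoft",
)
print(qlinear_sigmoid)
```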
-------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/argmax.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ArgMax operator.""" 15 | 16 | from onnx_neural_compressor import constants, utility 17 | from onnx_neural_compressor.algorithms import utility as quant_utils 18 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 19 | 20 | 21 | @base_op.op_registry(op_types="ArgMax", mode=[constants.STATIC_QUANT]) 22 | class ArgMaxOperator(base_op.Operator): 23 | """ArgMax operator.""" 24 | 25 | def __init__(self, onnx_quantizer, onnx_node): 26 | """Initialization.""" 27 | super(ArgMaxOperator, self).__init__(onnx_quantizer, onnx_node) 28 | 29 | def convert_check(self): 30 | """Check if conversion can be done.""" 31 | node = self.node 32 | return True 33 | 34 | def convert(self): 35 | """Convert to quantized format.""" 36 | node = self.node 37 | origin_name = node.input[0].split("_argmax_node")[0] 38 | 39 | if origin_name in self.quantizer.quantized_value_map: 40 | node.name = node.name + "_quant" 41 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Attention operator.""" 15 | 16 | import onnx 17 | 18 | from onnx_neural_compressor import constants, utility 19 | from onnx_neural_compressor.algorithms import utility as quant_utils 20 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 21 | 22 | 23 | @base_op.op_registry(op_types="Attention", mode=[constants.DYNAMIC_QUANT, constants.STATIC_QUANT]) 24 | class AttentionOperator(base_op.Operator): 25 | """Attention operator.""" 26 | 27 | def __init__(self, onnx_quantizer, onnx_node): 28 | """Initialization.""" 29 | super(AttentionOperator, self).__init__(onnx_quantizer, onnx_node) 30 | 31 | def quantize(self): 32 | """Do quantizaion.""" 33 | node = self.node 34 | self.quantizer.quantize_inputs(node, [0, 1]) 35 | node.name = node.name + "_quant" 36 | 37 | def convert(self): 38 | """Convert QDQ mode to QOperator format.""" 39 | node = self.node 40 | parents = self.quantizer.model.get_parents(node) 41 | quantized_name = [] 42 | scale = [] 43 | zp = [] 44 | for parent in parents[:2]: 45 | if parent.op_type == "DynamicQuantizeLinear": 46 | quantized_name.append(parent.output[0]) 47 | scale.append(parent.output[1]) 48 | zp.append(parent.output[2]) 49 | elif parent.op_type == "DequantizeLinear": 50 | quantized_name.append(parent.input[0]) 51 | scale.append(parent.input[1]) 52 | zp.append(parent.input[2]) 53 | self.quantizer.remove_nodes.append(parent) 54 | 55 | inputs = [] 56 | inputs.extend(quantized_name) 57 | inputs.append(node.input[2]) 58 | inputs.extend(scale) 59 | inputs.append(node.input[3] if len(node.input) > 3 else "") 60 | inputs.extend(zp) 61 | if len(node.input) > 4: 62 | inputs.append(node.input[4]) 63 | 64 | kwargs = {} 65 | for attribute in node.attribute: # pragma: no cover 66 | kwargs.update(quant_utils.attribute_to_kwarg(attribute)) 67 | kwargs["domain"] = quant_utils.ms_domain 68 | qattention_node = onnx.helper.make_node("QAttention", inputs, node.output, node.name, **kwargs) 69 | self.quantizer.new_nodes.append(qattention_node) 70 | 71 | self.quantizer.remove_nodes.append(node) 72 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/base_op.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Base Operator.""" 15 | 16 | from onnx_neural_compressor import constants, quantization 17 | 18 | OPERATORS = { 19 | "dynamic_quant": {}, 20 | "static_quant": {}, 21 | } 22 | 23 | 24 | def op_registry(op_types, mode): 25 | """The class decorator used to register all Operator subclasses.""" 26 | 27 | def decorator_op(cls): 28 | assert cls.__name__.endswith( 29 | "Operator" 30 | ), "The name of subclass of Operator should end with 'Operator' substring." 
31 | for item in mode: 32 | if cls.__name__[: -len("Operator")] in OPERATORS[item]: # pragma: no cover 33 | raise ValueError("Cannot have two operators with the same name for {} mode.".format(item)) 34 | break 35 | for single_op_type in [op_type.strip() for op_type in op_types.split(",")]: 36 | for item in mode: 37 | OPERATORS[item][single_op_type] = cls 38 | return cls 39 | 40 | return decorator_op 41 | 42 | 43 | class Operator(object): 44 | """Base Operator.""" 45 | 46 | def __init__(self, onnx_quantizer, onnx_node): 47 | """Initialization.""" 48 | self.quantizer = onnx_quantizer 49 | self.node = onnx_node 50 | node_name = self.node.name.split("_quant")[0] 51 | if node_name in self.quantizer.config: 52 | self.dtype = self.quantizer.config[node_name] 53 | self.disable_qdq_for_node_output = ( 54 | True if onnx_node.op_type in onnx_quantizer.optypes_to_exclude_output_quant else False 55 | ) 56 | self.per_channel = False 57 | self.calibrate_method = 0 # minmax 58 | self.weight_sym = True 59 | self.weight_dtype = None 60 | self.activation_dtype = None 61 | self.activation_sym = False 62 | if node_name in self.quantizer.config: 63 | if self.quantizer.config[node_name] not in self.quantizer.fallback_list: 64 | self.per_channel = self.quantizer.config[node_name]["per_channel"] 65 | self.calibrate_method = self.quantizer.config[node_name]["calibrate_method"] 66 | self.weight_sym = self.quantizer.config[node_name]["weight_sym"] 67 | self.weight_dtype = self.quantizer.config[node_name]["weight_type"] 68 | self.activation_dtype = self.quantizer.config[node_name]["activation_type"] 69 | self.activation_sym = self.quantizer.config[node_name]["activation_sym"] 70 | 71 | def quantize_check(self): 72 | """Check if quantizaion can be done.""" 73 | return True 74 | 75 | def quantize(self): 76 | """Do quantizaion.""" 77 | node = self.node 78 | self.quantizer.quantize_inputs(node) 79 | if not self.disable_qdq_for_node_output or self.quantizer.mode != constants.DYNAMIC_QUANT: 80 | self.quantizer.quantize_outputs(node) 81 | 82 | def convert_check(self): 83 | """Check if conversion can be done.""" 84 | node = self.node 85 | 86 | if not node.name.endswith("_quant"): 87 | return False 88 | return True 89 | 90 | def convert(self): 91 | """Convert to QOperator format.""" 92 | return 93 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/direct_q8.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Direct8Bit Operator.""" 15 | 16 | from onnx_neural_compressor import constants, utility 17 | from onnx_neural_compressor.algorithms import utility as quant_utils 18 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 19 | 20 | 21 | @base_op.op_registry( 22 | op_types="Reshape, Transpose, Squeeze, Unsqueeze, Flatten, Expand, Slice, " 23 | "SpaceToDepth, DepthToSpace, Upsample, Tile, CenterCropPad", 24 | mode=[constants.STATIC_QUANT], 25 | ) 26 | class Direct8BitOperator(base_op.Operator): 27 | """Direct8Bit Operator.""" 28 | 29 | def __init__(self, onnx_quantizer, onnx_node): 30 | """Initialization.""" 31 | super(Direct8BitOperator, self).__init__(onnx_quantizer, onnx_node) 32 | 33 | def quantize_check(self): 34 | """Check if quantizaion can be done.""" 35 | node = self.node 36 | if not self.quantizer.is_valid_quantize_weight(node.input[0]): 37 | return False 38 | return True 39 | 40 | def quantize(self): 41 | """Do quantizaion.""" 42 | node = self.node 43 | self.quantizer.quantize_inputs(self.node, [0], initializer_use_weight_qType=False, direct_int8=True) 44 | if not self.disable_qdq_for_node_output: 45 | self.quantizer.quantize_outputs(self.node, direct_int8=True) 46 | node.name = node.name + "_quant" 47 | 48 | def convert_check(self): 49 | """Check if conversion can be done.""" 50 | node = self.node 51 | parents = self.quantizer.model.get_parents(node) 52 | children = self.quantizer.model.get_children(node) 53 | if (len(children) == 0 and len(parents) == 0) or not node.name.endswith("_quant"): 54 | return False 55 | return True 56 | 57 | def convert(self): 58 | """Convert to QOperator format.""" 59 | node = self.node 60 | parents = self.quantizer.model.get_parents(node) 61 | children = self.quantizer.model.get_children(node) 62 | if any([i.op_type == "DequantizeLinear" for i in parents]) and any( 63 | [i.op_type == "QuantizeLinear" for i in children] 64 | ): 65 | for parent in parents: 66 | if parent.op_type == "DequantizeLinear": 67 | # make sure parent DequantizeLinear of input 0 is not used by other ops 68 | if len(self.quantizer.model.get_children(parent)) == 1 and not self.quantizer.model.is_graph_output( 69 | parents[0].output[0] 70 | ): 71 | self.quantizer.remove_nodes.append(parent) 72 | self.node.input[0] = parent.input[0] 73 | break 74 | for child in children: 75 | if child.op_type == "QuantizeLinear": 76 | self.quantizer.remove_nodes.append(child) 77 | self.quantizer.model.replace_input_of_all_nodes(child.output[0], node.output[0] + "_quantized") 78 | node.output[0] = node.output[0] + "_quantized" 79 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/embed_layernorm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """EmbedLayerNormalization Operator.""" 15 | 16 | import onnx 17 | 18 | from onnx_neural_compressor import constants, utility 19 | from onnx_neural_compressor.algorithms import utility as quant_utils 20 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 21 | 22 | 23 | @base_op.op_registry(op_types="EmbedLayerNormalization", mode=[constants.DYNAMIC_QUANT, constants.STATIC_QUANT]) 24 | class EmbedLayerNormalizationOperator(base_op.Operator): 25 | """EmbedLayerNormalization Operator.""" 26 | 27 | def __init__(self, onnx_quantizer, onnx_node): 28 | """Initialization.""" 29 | super(EmbedLayerNormalizationOperator, self).__init__(onnx_quantizer, onnx_node) 30 | 31 | def quantize(self): 32 | """Do quantizaion.""" 33 | node = self.node 34 | self.quantizer.quantize_inputs(node, [2, 3, 4, 5, 6]) 35 | node.name = node.name + "_quant" 36 | 37 | def convert(self): 38 | """Convert to QOperator format.""" 39 | node = self.node 40 | 41 | parents = [i for i in self.quantizer.model.get_parents(node) if i.op_type == "DequantizeLinear"] 42 | inputs = [] 43 | # 'input_ids' 44 | inputs.extend([node.input[0]]) 45 | # 'segment_ids' 46 | inputs.extend([node.input[1]]) 47 | for parent in parents: 48 | inputs.append(parent.input[0]) 49 | # 'mask' (optional) 50 | if len(node.input) > 7: 51 | inputs.append(node.input[7]) 52 | 53 | for parent in parents: 54 | inputs.append(parent.input[1]) 55 | for parent in parents: 56 | inputs.append(parent.input[2]) 57 | 58 | kwargs = {} 59 | for attribute in node.attribute: # pragma: no cover 60 | kwargs.update(quant_utils.attribute_to_kwarg(attribute)) 61 | kwargs["domain"] = quant_utils.ms_domain 62 | 63 | qembed_layer_norm_node = onnx.helper.make_node( 64 | "QEmbedLayerNormalization", inputs, node.output, node.name, **kwargs 65 | ) 66 | self.quantizer.new_nodes.append(qembed_layer_norm_node) 67 | self.quantizer.remove_nodes.extend(parents) 68 | self.quantizer.remove_nodes.append(node) 69 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/gather.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Gather Operator.""" 15 | 16 | import onnx 17 | 18 | from onnx_neural_compressor import constants, utility 19 | from onnx_neural_compressor.algorithms import utility as quant_utils 20 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 21 | 22 | 23 | @base_op.op_registry( 24 | op_types="Gather, GatherElements, GatherND", mode=[constants.DYNAMIC_QUANT, constants.STATIC_QUANT] 25 | ) 26 | class GatherOperator(base_op.Operator): 27 | """Gather Operator.""" 28 | 29 | def __init__(self, onnx_quantizer, onnx_node): 30 | """Initialization.""" 31 | super(GatherOperator, self).__init__(onnx_quantizer, onnx_node) 32 | 33 | def quantize_check(self): 34 | """Check if quantizaion can be done.""" 35 | node = self.node 36 | if not self.quantizer.is_valid_quantize_weight(node.input[0]): 37 | return False 38 | return True 39 | 40 | def quantize(self): 41 | """Do quantizaion.""" 42 | node = self.node 43 | self.quantizer.quantize_inputs(node, [0], initializer_use_weight_qType=False) 44 | if not self.disable_qdq_for_node_output or self.quantizer.mode != constants.DYNAMIC_QUANT: 45 | self.quantizer.quantize_outputs(node) 46 | node.name = node.name + "_quant" 47 | 48 | def convert_check(self): 49 | """Check if conversion can be done.""" 50 | node = self.node 51 | parents = self.quantizer.model.get_parents(node) 52 | children = self.quantizer.model.get_children(node) 53 | if len(children) == 0 or len(parents) == 0 or not node.name.endswith("_quant"): 54 | return False 55 | 56 | return True 57 | 58 | def convert(self): 59 | """Convert to QOperator format.""" 60 | # DQ-Gather-Q-DQ-op 61 | node = self.node 62 | 63 | parents = self.quantizer.model.get_parents(node) 64 | children = self.quantizer.model.get_children(node) 65 | 66 | if any([i.op_type == "DequantizeLinear" for i in parents]): 67 | 68 | inputs = [] 69 | inputs.append(parents[0].input[0]) 70 | inputs.append(node.input[1]) 71 | 72 | out_scale = 1.0 73 | out_zp = 0 74 | gather_new_output = node.output[0] + "_quantized" # dynamic quant output name 75 | for child in children: 76 | if child.op_type == "QuantizeLinear": 77 | out_scale = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(children[0].input[1])) 78 | out_zp = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(children[0].input[2])) 79 | gather_new_output = children[0].output[0] # static quant output name 80 | self.quantizer.remove_nodes.append(child) 81 | 82 | kwargs = {} 83 | for attribute in node.attribute: # pragma: no cover 84 | kwargs.update(quant_utils.attribute_to_kwarg(attribute)) 85 | 86 | gather_node = onnx.helper.make_node(node.op_type, inputs, [gather_new_output], node.name, **kwargs) 87 | self.quantizer.new_nodes.append(gather_node) 88 | if any([i.op_type != "QuantizeLinear" for i in children]): 89 | dq_inputs = [] 90 | dq_inputs.append(gather_new_output) 91 | dq_inputs.extend(parents[0].input[1:]) 92 | dq_node = onnx.helper.make_node( 93 | "DequantizeLinear", dq_inputs, [node.output[0]], node.name + "_DequantizeLinear" 94 | ) 95 | self.quantizer.new_nodes.append(dq_node) 96 | 97 | # int8 weight will be recalculated for the first time 98 | if ( 99 | any([child.op_type == "QuantizeLinear" for child in children]) 100 | and self.quantizer.model.get_initializer(parents[0].input[0]) is not None 101 | and parents[0].input[0] not in self.quantizer.recalculate_quantized_value 102 | ): 103 | int8_tensor = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[0])) 104 | in_scale = 
onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[1])) 105 | in_zp = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[2])) 106 | new_int8_tensor = (((int8_tensor.astype("float32") - in_zp) * in_scale) / out_scale).round() + out_zp 107 | self.quantizer.model.set_initializer(parents[0].input[0], new_int8_tensor.astype(int8_tensor.dtype)) 108 | self.quantizer.recalculate_quantized_value.append(parents[0].input[0]) 109 | self.quantizer.remove_nodes.extend([node, parents[0]]) 110 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/gavgpool.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """GlobalAveragePool Operator.""" 15 | 16 | import onnx 17 | 18 | from onnx_neural_compressor import constants, utility 19 | from onnx_neural_compressor.algorithms import utility as quant_utils 20 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 21 | 22 | 23 | @base_op.op_registry(op_types="GlobalAveragePool", mode=[constants.STATIC_QUANT]) 24 | class GlobalAveragePoolOperator(base_op.Operator): 25 | """GlobalAveragePool Operator.""" 26 | 27 | def __init__(self, onnx_quantizer, onnx_node): 28 | """Initialization.""" 29 | super(GlobalAveragePoolOperator, self).__init__(onnx_quantizer, onnx_node) 30 | 31 | def convert_check(self): 32 | """Check if conversion can be done.""" 33 | node = self.node 34 | children = self.quantizer.model.get_children(node) 35 | if len(children) == 0: # pragma: no cover 36 | return False 37 | return True 38 | 39 | def convert(self): 40 | """Convert to QOperator format.""" 41 | node = self.node 42 | 43 | parent = self.quantizer.model.get_parents(node)[0] 44 | child = self.quantizer.model.get_children(node)[0] 45 | 46 | kwargs = {} 47 | for attribute in node.attribute: 48 | kwargs.update(quant_utils.attribute_to_kwarg(attribute)) 49 | kwargs["domain"] = quant_utils.ms_domain 50 | kwargs["channels_last"] = 0 51 | 52 | inputs = parent.input 53 | inputs.extend(child.input[1:]) 54 | 55 | qnode = onnx.helper.make_node("QLinear" + node.op_type, inputs, child.output, node.name + "_quant", **kwargs) 56 | self.quantizer.new_nodes += [qnode] 57 | self.quantizer.remove_nodes.append(child) 58 | self.quantizer.remove_nodes.append(parent) 59 | self.quantizer.remove_nodes.append(node) 60 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/gemm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Gemm Operator.""" 15 | 16 | import onnx 17 | 18 | from onnx_neural_compressor import constants, logger 19 | from onnx_neural_compressor.algorithms import utility as quant_utils 20 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 21 | 22 | 23 | @base_op.op_registry(op_types="Gemm", mode=[constants.STATIC_QUANT]) 24 | class GemmOperator(base_op.Operator): 25 | """Gemm Operator.""" 26 | 27 | def __init__(self, onnx_quantizer, onnx_node): 28 | """Initialization.""" 29 | super(GemmOperator, self).__init__(onnx_quantizer, onnx_node) 30 | 31 | def quantize_check(self): 32 | """Check if quantizaion can be done.""" 33 | node = self.node 34 | if len(node.input) == 3 and not quant_utils.find_by_name(node.input[2], self.quantizer.model.initializer()): 35 | 36 | logger.warning( 37 | "Bias of Gemm node '{}' is not constant. " 38 | "Exclude this node can get better performance.".format(node.name) 39 | ) 40 | if self.quantizer.quant_format != "qdq": 41 | return False 42 | return True 43 | 44 | def quantize(self): 45 | """Do quantizaion.""" 46 | node = self.node 47 | self.quantizer.quantize_inputs(node, [0]) 48 | if self.per_channel and quant_utils.find_by_name(node.input[1], self.quantizer.model.initializer()): 49 | self.quantizer.quantize_weights_per_channel( 50 | node, [1], self.weight_dtype, self.weight_sym, 0 if quant_utils.is_B_transposed(node) else 1 51 | ) 52 | else: 53 | self.quantizer.quantize_inputs(node, [1]) 54 | 55 | if len(node.input) == 3 and quant_utils.find_by_name(node.input[2], self.quantizer.model.initializer()): 56 | self.quantizer.quantize_bias_tensor(node) 57 | beta_attribute = [attr for attr in node.attribute if attr.name == "beta"] 58 | if len(beta_attribute): 59 | beta_attribute[0].f = 1.0 60 | 61 | if not self.disable_qdq_for_node_output: 62 | self.quantizer.quantize_outputs(node) 63 | node.name = node.name + "_quant" 64 | 65 | def convert(self): 66 | """Convert to QOperator format.""" 67 | node = self.node 68 | 69 | parents = self.quantizer.model.get_parents(node) 70 | qgemm_inputs = [] 71 | for parent in parents[:-1]: 72 | qgemm_inputs.extend(parent.input) 73 | qgemm_inputs.append(parents[-1].input[0]) 74 | 75 | kwargs = {} 76 | for attribute in node.attribute: 77 | if attribute.name != "beta": 78 | kwargs.update(quant_utils.attribute_to_kwarg(attribute)) 79 | kwargs["domain"] = quant_utils.ms_domain 80 | 81 | qgemm_output = node.output[0] 82 | if not self.disable_qdq_for_node_output: 83 | child = self.quantizer.model.get_children(node)[0] 84 | self.quantizer.remove_nodes.append(child) 85 | qgemm_output = child.output[0] 86 | qgemm_inputs.extend(child.input[1:]) 87 | qgemm_node = onnx.helper.make_node("QGemm", qgemm_inputs, [qgemm_output], node.name, **kwargs) 88 | 89 | self.quantizer.new_nodes.append(qgemm_node) 90 | self.quantizer.remove_nodes.extend(parents) 91 | self.quantizer.remove_nodes.append(node) 92 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/maxpool.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """MaxPool Operator.""" 15 | 16 | from onnx_neural_compressor import constants, utility 17 | from onnx_neural_compressor.algorithms import utility as quant_utils 18 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 19 | 20 | 21 | @base_op.op_registry(op_types="MaxPool", mode=[constants.STATIC_QUANT]) 22 | class MaxPoolOperator(base_op.Operator): 23 | """MaxPool Operator.""" 24 | 25 | def __init__(self, onnx_quantizer, onnx_node): 26 | """Initialization.""" 27 | super(MaxPoolOperator, self).__init__(onnx_quantizer, onnx_node) 28 | 29 | def quantize_check(self): 30 | """Check if quantizaion can be done.""" 31 | node = self.node 32 | # if opset version is less than 12, just no change 33 | if self.quantizer.opset_version < 12: # pragma: no cover 34 | return False 35 | 36 | if not self.quantizer.is_valid_quantize_weight(node.input[0]): # pragma: no cover 37 | return False 38 | 39 | return True 40 | 41 | def quantize(self): 42 | """Do quantizaion.""" 43 | node = self.node 44 | self.quantizer.quantize_inputs(self.node, direct_int8=True) 45 | if not self.disable_qdq_for_node_output: 46 | self.quantizer.quantize_outputs(self.node, direct_int8=True) 47 | node.name = node.name + "_quant" 48 | 49 | def convert_check(self): 50 | """Check if conversion can be done.""" 51 | node = self.node 52 | children = self.quantizer.model.get_children(node) 53 | if len(children) == 0 or not node.name.endswith("_quant"): # pragma: no cover 54 | return False 55 | return True 56 | 57 | def convert(self): 58 | """Convert to QOperator format.""" 59 | node = self.node 60 | parent = self.quantizer.model.get_parents(node)[0] 61 | children = self.quantizer.model.get_children(node) 62 | if parent.op_type != "DequantizeLinear" or all( 63 | [i.op_type != "QuantizeLinear" for i in children] 64 | ): # pragma: no cover 65 | return 66 | node.input[0] = parent.input[0] 67 | node.output[0] = node.output[0].replace("_QuantizeInput", "_quantized") 68 | for child in children: 69 | if child.op_type == "QuantizeLinear": 70 | self.quantizer.remove_nodes.append(child) 71 | for n in self.quantizer.model.get_children(child): 72 | self.quantizer.model.replace_node_input(n, child.output[0], node.output[0]) 73 | 74 | self.quantizer.remove_nodes.append(parent) 75 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/pad.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Pad Operator.""" 15 | 16 | import onnx 17 | 18 | from onnx_neural_compressor import constants, utility 19 | from onnx_neural_compressor.algorithms import utility as quant_utils 20 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 21 | 22 | 23 | @base_op.op_registry(op_types="Pad", mode=[constants.STATIC_QUANT]) 24 | class PadOperator(base_op.Operator): 25 | """Pad Operator.""" 26 | 27 | def __init__(self, onnx_quantizer, onnx_node): 28 | """Initialization.""" 29 | super(PadOperator, self).__init__(onnx_quantizer, onnx_node) 30 | 31 | def quantize_check(self): 32 | """Check if quantizaion can be done.""" 33 | # if opset version is less than 11, just no change 34 | if self.quantizer.opset_version < 11: # pragma: no cover 35 | return False 36 | return True 37 | 38 | def quantize(self): 39 | """Do quantizaion.""" 40 | node = self.node 41 | self.quantizer.quantize_inputs(node, [0]) 42 | if not self.disable_qdq_for_node_output: 43 | self.quantizer.quantize_outputs(node) 44 | node.name = node.name + "_quant" 45 | 46 | def convert_check(self): 47 | """Check if conversion can be done.""" 48 | node = self.node 49 | children = self.quantizer.model.get_children(node) 50 | if len(children) == 0 or not node.name.endswith("_quant"): # pragma: no cover 51 | return False 52 | return True 53 | 54 | def convert(self): 55 | """Convert to QOperator format.""" 56 | node = self.node 57 | 58 | parent = self.quantizer.model.get_parents(node)[0] 59 | child = self.quantizer.model.get_children(node)[0] 60 | 61 | kwargs = {} 62 | for attribute in node.attribute: 63 | kv = quant_utils.attribute_to_kwarg(attribute) 64 | kwargs.update(kv) 65 | 66 | if "mode" not in kwargs or kwargs["mode"] == b"constant": 67 | if len(node.input) > 2: # There is 3rd input 'constant_value' 68 | zp_tensor = self.quantizer.model.get_initializer(parent.input[2]) 69 | scale_tensor = self.quantizer.model.get_initializer(parent.input[1]) 70 | 71 | padding_constant_initializer = self.quantizer.model.get_initializer(node.input[2]) 72 | if padding_constant_initializer is not None: 73 | zp_array = onnx.numpy_helper.to_array(zp_tensor) 74 | zp_value = zp_array.item() if zp_array.ndim == 0 else zp_array[0] 75 | scale_array = onnx.numpy_helper.to_array(scale_tensor) 76 | scale_value = scale_array.item() if scale_array.ndim == 0 else scale_array[0] 77 | padding_constant_array = onnx.numpy_helper.to_array(padding_constant_initializer) 78 | quantized_padding_constant_array = quant_utils.quantize_nparray( 79 | onnx.helper.tensor_dtype_to_np_dtype(self.weight_dtype), 80 | padding_constant_array, 81 | scale_value, 82 | zp_value, 83 | ) 84 | quantized_padding_constant_name = node.input[2] + "_quantized" 85 | quantized_padding_constant_initializer = onnx.numpy_helper.from_array( 86 | quantized_padding_constant_array, quantized_padding_constant_name 87 | ) 88 | # Suppose this padding constant initializer only used by the node 89 | self.quantizer.model.remove_initializer(padding_constant_initializer) 90 | self.quantizer.model.add_initializer(quantized_padding_constant_initializer) 
91 | node.input[2] = quantized_padding_constant_name 92 | else: 93 | self.quantizer.quantize_inputs(node, [2], False) 94 | node.input[2] = node.input[2] + "_DequantizeLinear" 95 | else: 96 | # pad zero_point for original zero 97 | node.input.extend([parent.input[2]]) 98 | 99 | # Create an entry for output quantized value 100 | node.input[0] = parent.input[0] 101 | node.output[0] = child.output[0] 102 | self.quantizer.remove_nodes.extend([parent, child]) 103 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/pooling.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """AveragePool Operator.""" 15 | 16 | import onnx 17 | 18 | from onnx_neural_compressor import constants, utility 19 | from onnx_neural_compressor.algorithms import utility as quant_utils 20 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 21 | 22 | 23 | @base_op.op_registry(op_types="AveragePool", mode=[constants.STATIC_QUANT]) 24 | class PoolOperator(base_op.Operator): 25 | """AveragePool Operator.""" 26 | 27 | def __init__(self, onnx_quantizer, onnx_node): 28 | """Initialization.""" 29 | super(PoolOperator, self).__init__(onnx_quantizer, onnx_node) 30 | 31 | def quantize_check(self): 32 | """Check if quantizaion can be done.""" 33 | node = self.node 34 | if not self.quantizer.is_valid_quantize_weight(node.input[0]): 35 | return False 36 | return True 37 | 38 | def quantize(self): 39 | """Do quantizaion.""" 40 | node = self.node 41 | super().quantize() 42 | node.name = node.name + "_quant" 43 | 44 | def convert_check(self): 45 | """Check if conversion can be done.""" 46 | node = self.node 47 | parents = self.quantizer.model.get_parents(node) 48 | children = self.quantizer.model.get_children(node) 49 | 50 | if len(children) == 0 or len(parents) == 0 or not node.name.endswith("_quant"): 51 | return False 52 | return True 53 | 54 | def convert(self): 55 | """Convert to QOperator format.""" 56 | node = self.node 57 | 58 | parents = self.quantizer.model.get_parents(node) 59 | children = self.quantizer.model.get_children(node) 60 | 61 | if all([i.op_type == "DequantizeLinear" for i in parents]) and any( 62 | [i.op_type == "QuantizeLinear" for i in children] 63 | ): 64 | qlinear_output_name = node.output[0] + "_quantized" 65 | inputs = [] 66 | inputs.extend(parents[0].input) 67 | inputs.extend([i for i in children if i.op_type == "QuantizeLinear"][0].input[1:]) 68 | kwargs = {} 69 | for attribute in node.attribute: 70 | kwargs.update(quant_utils.attribute_to_kwarg(attribute)) 71 | kwargs["domain"] = quant_utils.ms_domain 72 | qnode = onnx.helper.make_node("QLinear" + node.op_type, inputs, [qlinear_output_name], node.name, **kwargs) 73 | 74 | self.quantizer.remove_nodes.extend(parents) 75 | for child in children: 76 | if child.op_type == 
"QuantizeLinear": 77 | self.quantizer.remove_nodes.append(child) 78 | self.quantizer.model.replace_input_of_all_nodes(child.output[0], qnode.output[0]) 79 | 80 | self.quantizer.new_nodes.append(qnode) 81 | self.quantizer.remove_nodes.append(node) 82 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/reduce.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Reduce Operator.""" 15 | 16 | from onnx_neural_compressor import constants, utility 17 | from onnx_neural_compressor.algorithms import utility as quant_utils 18 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 19 | 20 | 21 | @base_op.op_registry( 22 | op_types="ReduceMean, ReduceLogSum, ReduceLogSumExp, " "ReduceL1, ReduceL2, ReduceProd, ReduceSum, ReduceSumSquare", 23 | mode=[constants.STATIC_QUANT], 24 | ) 25 | class ReduceOperator(base_op.Operator): 26 | """Reduce Operator.""" 27 | 28 | def __init__(self, onnx_quantizer, onnx_node): 29 | """Initialization.""" 30 | super(ReduceOperator, self).__init__(onnx_quantizer, onnx_node) 31 | 32 | 33 | @base_op.op_registry(op_types="ReduceMax, ReduceMin", mode=[constants.STATIC_QUANT]) 34 | class ReduceMinMaxOperator(base_op.Operator): 35 | """ReduceMin and ReduceMax Operator.""" 36 | 37 | def __init__(self, onnx_quantizer, onnx_node): 38 | """Initialization.""" 39 | super(ReduceMinMaxOperator, self).__init__(onnx_quantizer, onnx_node) 40 | 41 | def quantize_check(self): 42 | """Check if quantizaion can be done.""" 43 | node = self.node 44 | if not self.quantizer.is_valid_quantize_weight(node.input[0]): 45 | return False 46 | return True 47 | 48 | def quantize(self): 49 | """Do quantizaion.""" 50 | node = self.node 51 | self.quantizer.quantize_inputs(self.node, [0], direct_int8=True) 52 | if not self.disable_qdq_for_node_output: 53 | self.quantizer.quantize_outputs(self.node, direct_int8=True) 54 | node.name = node.name + "_quant" 55 | 56 | def convert_check(self): 57 | """Check if conversion can be done.""" 58 | node = self.node 59 | parents = self.quantizer.model.get_parents(node) 60 | children = self.quantizer.model.get_children(node) 61 | if (len(children) == 0 and len(parents) == 0) or not node.name.endswith("_quant"): 62 | return False 63 | return True 64 | 65 | def convert(self): 66 | """Convert to QOperator format.""" 67 | node = self.node 68 | 69 | parents = self.quantizer.model.get_parents(node) 70 | children = self.quantizer.model.get_children(node) 71 | if any([i.op_type == "DequantizeLinear" for i in parents]) and any( 72 | [i.op_type == "QuantizeLinear" for i in children] 73 | ): 74 | for parent in parents: 75 | if parent.op_type == "DequantizeLinear": 76 | self.node.input[0] = parent.input[0] 77 | self.quantizer.remove_nodes.append(parents[0]) 78 | break 79 | for child in children: 
80 | if child.op_type == "QuantizeLinear": 81 | self.quantizer.remove_nodes.append(child) 82 | self.quantizer.model.replace_input_of_all_nodes(child.output[0], node.output[0] + "_quantized") 83 | node.output[0] = node.output[0] + "_quantized" 84 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/resize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Resize Operator.""" 15 | 16 | from onnx_neural_compressor import constants, utility 17 | from onnx_neural_compressor.algorithms import utility as quant_utils 18 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 19 | 20 | 21 | @base_op.op_registry(op_types="Resize", mode=[constants.STATIC_QUANT]) 22 | class ResizeOperator(base_op.Operator): 23 | """Resize Operator.""" 24 | 25 | def __init__(self, onnx_quantizer, onnx_node): 26 | """Initialization.""" 27 | super(ResizeOperator, self).__init__(onnx_quantizer, onnx_node) 28 | 29 | def quantize_check(self): 30 | """Check if quantizaion can be done.""" 31 | node = self.node 32 | # if version is less than 11, just keep this node 33 | if self.quantizer.opset_version < 11: 34 | return False 35 | if not self.quantizer.is_valid_quantize_weight(node.input[0]): 36 | return False 37 | return True 38 | 39 | def quantize(self): 40 | """Do quantizaion.""" 41 | node = self.node 42 | self.quantizer.quantize_inputs(node, [0], direct_int8=True) 43 | if not self.disable_qdq_for_node_output: 44 | self.quantizer.quantize_outputs(self.node, direct_int8=True) 45 | node.name = node.name + "_quant" 46 | 47 | def convert_check(self): 48 | """Check if conversion can be done.""" 49 | node = self.node 50 | parents = self.quantizer.model.get_parents(node) 51 | children = self.quantizer.model.get_children(node) 52 | if (len(children) == 0 and len(parents) == 0) or not node.name.endswith("_quant"): 53 | return False 54 | return True 55 | 56 | def convert(self): 57 | """Convert to QOperator format.""" 58 | node = self.node 59 | 60 | parents = self.quantizer.model.get_parents(node) 61 | children = self.quantizer.model.get_children(node) 62 | 63 | if any([i.op_type == "DequantizeLinear" for i in parents]) and any( 64 | [i.op_type == "QuantizeLinear" for i in children] 65 | ): 66 | for parent in parents: 67 | if parent.op_type == "DequantizeLinear" and parent.output[0] == node.input[0]: 68 | self.node.input[0] = parent.input[0] 69 | self.quantizer.remove_nodes.append(parent) 70 | break 71 | for child in children: 72 | if child.op_type == "QuantizeLinear": 73 | self.quantizer.remove_nodes.append(child) 74 | self.quantizer.model.replace_input_of_all_nodes(child.output[0], node.output[0] + "_quantized") 75 | node.output[0] = node.output[0] + "_quantized" 76 | 
-------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/split.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Split Operator.""" 15 | 16 | import onnx 17 | 18 | from onnx_neural_compressor import constants, utility 19 | from onnx_neural_compressor.algorithms import utility as quant_utils 20 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 21 | 22 | 23 | @base_op.op_registry(op_types="Split", mode=[constants.STATIC_QUANT]) 24 | class SplitOperator(base_op.Operator): 25 | """Split Operator.""" 26 | 27 | def __init__(self, onnx_quantizer, onnx_node): 28 | """Initialization.""" 29 | super(SplitOperator, self).__init__(onnx_quantizer, onnx_node) 30 | 31 | def quantize_check(self): 32 | """Check if quantization can be done.""" 33 | node = self.node 34 | data_found, _, _, _, _ = self.quantizer._get_quantization_params(node.output[0]) 35 | if not data_found: 36 | return False 37 | if not all([self.quantizer.is_valid_quantize_weight(i) for i in node.input]): 38 | return False 39 | return True 40 | 41 | def quantize(self): 42 | """Do quantization.""" 43 | node = self.node 44 | self.quantizer.quantize_inputs(node, [0]) 45 | if not self.disable_qdq_for_node_output: 46 | self.quantizer.quantize_outputs(self.node, direct_int8=True) 47 | node.name = node.name + "_quant" 48 | 49 | def convert_check(self): 50 | """Check if conversion can be done.""" 51 | node = self.node 52 | parent = self.quantizer.model.get_parents(node)[0] 53 | children = self.quantizer.model.get_children(node) 54 | if ( 55 | parent.op_type != "DequantizeLinear" or len(children) == 0 or not node.name.endswith("_quant") 56 | ): # pragma: no cover 57 | return False 58 | return True 59 | 60 | def convert(self): 61 | """Convert to QOperator format.""" 62 | node = self.node 63 | 64 | parent = self.quantizer.model.get_parents(node)[0] 65 | kwargs = {} 66 | for attribute in node.attribute: # pragma: no cover 67 | kwargs.update(quant_utils.attribute_to_kwarg(attribute)) 68 | 69 | quantized_input_names = [] 70 | quantized_input_names.append(parent.input[0]) 71 | if len(node.input) > 1: # pragma: no cover 72 | quantized_input_names.extend(node.input[1:]) 73 | outputs = [] 74 | input_name_to_nodes = self.quantizer.model.input_name_to_nodes() 75 | for output in node.output: 76 | if output in input_name_to_nodes: 77 | child = input_name_to_nodes[output][0] 78 | if child.op_type == "QuantizeLinear": 79 | self.quantizer.remove_nodes.append(child) 80 | outputs.append(child.output[0]) 81 | else: # pragma: no cover 82 | outputs.append(output) 83 | else: # pragma: no cover 84 | outputs.append(output + "_quantized") 85 | 86 | quantized_node = onnx.helper.make_node(node.op_type, quantized_input_names, outputs, node.name, **kwargs) 87 |
self.quantizer.new_nodes.append(quantized_node) 88 | self.quantizer.remove_nodes.extend([parent, node]) 89 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/unary_op.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Unary operator.""" 15 | 16 | from onnx_neural_compressor import constants, utility 17 | from onnx_neural_compressor.algorithms import utility as quant_utils 18 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 19 | 20 | 21 | @base_op.op_registry(op_types="Exp, Log, Round, Sqrt", mode=[constants.STATIC_QUANT]) 22 | class UnaryOperator(base_op.Operator): 23 | """Unary operator.""" 24 | 25 | def __init__(self, onnx_quantizer, onnx_node): 26 | """Initialization.""" 27 | super(UnaryOperator, self).__init__(onnx_quantizer, onnx_node) 28 | 29 | 30 | @base_op.op_registry(op_types="Abs, Shrink, Sign", mode=[constants.STATIC_QUANT]) 31 | class UnaryDirect8BitOperator(base_op.Operator): 32 | """Unary operator.""" 33 | 34 | def __init__(self, onnx_quantizer, onnx_node): 35 | """Initialization.""" 36 | super(UnaryDirect8BitOperator, self).__init__(onnx_quantizer, onnx_node) 37 | 38 | def quantize_check(self): 39 | """Check if quantization can be done.""" 40 | node = self.node 41 | if not self.quantizer.is_valid_quantize_weight(node.input[0]): 42 | return False 43 | return True 44 | 45 | def quantize(self): 46 | """Do quantization.""" 47 | node = self.node 48 | self.quantizer.quantize_inputs(self.node, [0], direct_int8=True) 49 | if not self.disable_qdq_for_node_output: 50 | self.quantizer.quantize_outputs(self.node, direct_int8=True) 51 | node.name = node.name + "_quant" 52 | 53 | def convert_check(self): 54 | """Check if conversion can be done.""" 55 | node = self.node 56 | parents = self.quantizer.model.get_parents(node) 57 | children = self.quantizer.model.get_children(node) 58 | if (len(children) == 0 and len(parents) == 0) or not node.name.endswith("_quant"): 59 | return False 60 | return True 61 | 62 | def convert(self): 63 | """Convert to QOperator format.""" 64 | node = self.node 65 | 66 | parents = self.quantizer.model.get_parents(node) 67 | children = self.quantizer.model.get_children(node) 68 | if any([i.op_type == "DequantizeLinear" for i in parents]) and any( 69 | [i.op_type == "QuantizeLinear" for i in children] 70 | ): 71 | for parent in parents: 72 | if parent.op_type == "DequantizeLinear": 73 | self.node.input[0] = parent.input[0] 74 | self.quantizer.remove_nodes.append(parents[0]) 75 | break 76 | for child in children: 77 | if child.op_type == "QuantizeLinear": 78 | self.quantizer.remove_nodes.append(child) 79 | self.quantizer.model.replace_input_of_all_nodes(child.output[0], node.output[0] + "_quantized") 80 | node.output[0] = node.output[0] + "_quantized" 81 |
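Editor's note: every operator in this directory is made available to the post-training quantizer through the base_op.op_registry decorator, keyed by op_types and quantization mode. A minimal, hypothetical sketch of registering an additional operator in the same style is shown below; "Softplus" is not registered in the repository and is used purely to illustrate the pattern.

from onnx_neural_compressor import constants
from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op


@base_op.op_registry(op_types="Softplus", mode=[constants.STATIC_QUANT])  # hypothetical op type
class SoftplusOperator(base_op.Operator):
    """Softplus operator (illustrative only, mirrors the classes above)."""

    def __init__(self, onnx_quantizer, onnx_node):
        """Initialization."""
        super().__init__(onnx_quantizer, onnx_node)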
-------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/smoother/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/weight_only/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /onnx_neural_compressor/data_reader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import abc 16 | 17 | 18 | class CalibrationDataReader(metaclass=abc.ABCMeta): 19 | @classmethod 20 | def __subclasshook__(cls, subclass): 21 | return hasattr(subclass, "get_next") and callable(subclass.get_next) or NotImplemented 22 | 23 | @abc.abstractmethod 24 | def get_next(self) -> dict: 25 | """Generate the input data dict for ONNX InferenceSession run.""" 26 | raise NotImplementedError 27 | 28 | def __iter__(self): 29 | return self 30 | 31 | def __next__(self): 32 | result = self.get_next() 33 | if result is None: 34 | raise StopIteration 35 | return result 36 | 37 | @abc.abstractmethod 38 | def rewind(self): 39 | """Regenerate data.""" 40 | raise NotImplementedError 41 | -------------------------------------------------------------------------------- /onnx_neural_compressor/logger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import logging 16 | import os 17 | 18 | 19 | def _pretty_dict(value, indent=0): 20 | """Make the logger dict pretty.""" 21 | prefix = "\n" + " " * (indent + 4) 22 | if isinstance(value, dict): 23 | items = [prefix + repr(key) + ": " + _pretty_dict(value[key], indent + 4) for key in value] 24 | return "{%s}" % (",".join(items) + "\n" + " " * indent) 25 | elif isinstance(value, list): 26 | items = [prefix + _pretty_dict(item, indent + 4) for item in value] 27 | return "[%s]" % (",".join(items) + "\n" + " " * indent) 28 | elif isinstance(value, tuple): 29 | items = [prefix + _pretty_dict(item, indent + 4) for item in value] 30 | return "(%s)" % (",".join(items) + "\n" + " " * indent) 31 | else: 32 | return repr(value) 33 | 34 | 35 | LOGLEVEL = os.environ.get("LOGLEVEL", "INFO").upper() 36 | _logger = logging.getLogger("onnx_neural_compressor") 37 | _logger.handlers.clear() 38 | _logger.setLevel(LOGLEVEL) 39 | formatter = logging.Formatter("%(asctime)s [%(levelname)s][%(filename)s:%(lineno)d] %(message)s", "%Y-%m-%d %H:%M:%S") 40 | streamHandler = logging.StreamHandler() 41 | streamHandler.setFormatter(formatter) 42 | _logger.addHandler(streamHandler) 43 | _logger.propagate = False 44 | 45 | 46 | def log(level, msg, *args, **kwargs): 47 | """Output log with the level as a parameter.""" 48 | kwargs.setdefault("stacklevel", 2) 49 | if isinstance(msg, dict): 50 | for _, line in enumerate(_pretty_dict(msg).split("\n")): 51 | _logger.log(level, line, *args, **kwargs) 52 | else: 53 | _logger.log(level, msg, *args, **kwargs) 54 | 55 | 56 | def debug(msg, *args, **kwargs): 57 | """Output log with the debug level.""" 58 | kwargs.setdefault("stacklevel", 2) 59 | if isinstance(msg, dict): 60 | for _, line in enumerate(_pretty_dict(msg).split("\n")): 61 | _logger.debug(line, *args, **kwargs) 62 | else: 63 | _logger.debug(msg, *args, **kwargs) 64 | 65 | 66 | def error(msg, *args, **kwargs): 67 | """Output log with the error level.""" 68 | kwargs.setdefault("stacklevel", 2) 69 | if isinstance(msg,
dict): 70 | for _, line in enumerate(_pretty_dict(msg).split("\n")): 71 | _logger.error(line, *args, **kwargs) 72 | else: 73 | _logger.error(msg, *args, **kwargs) 74 | 75 | 76 | def fatal(msg, *args, **kwargs): 77 | """Output log with the fatal level.""" 78 | kwargs.setdefault("stacklevel", 2) 79 | if isinstance(msg, dict): 80 | for _, line in enumerate(_pretty_dict(msg).split("\n")): 81 | _logger.fatal(line, *args, **kwargs) 82 | else: 83 | _logger.fatal(msg, *args, **kwargs) 84 | 85 | 86 | def info(msg, *args, **kwargs): 87 | """Output log with the info level.""" 88 | kwargs.setdefault("stacklevel", 2) 89 | if isinstance(msg, dict): 90 | for _, line in enumerate(_pretty_dict(msg).split("\n")): 91 | _logger.info(line, *args, **kwargs) 92 | else: 93 | _logger.info(msg, *args, **kwargs) 94 | 95 | 96 | def warning(msg, *args, **kwargs): 97 | """Output log with the warning level (Alias of the method warn).""" 98 | kwargs.setdefault("stacklevel", 2) 99 | if isinstance(msg, dict): 100 | for _, line in enumerate(_pretty_dict(msg).split("\n")): 101 | _logger.warning(line, *args, **kwargs) 102 | else: 103 | _logger.warning(msg, *args, **kwargs) 104 | -------------------------------------------------------------------------------- /onnx_neural_compressor/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from onnx_neural_compressor.quantization.quant_utils import CalibrationMethod, QuantFormat, QuantType 16 | from onnx_neural_compressor.quantization.quantize import quantize 17 | -------------------------------------------------------------------------------- /onnx_neural_compressor/quantization/matmul_4bits_quantizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from typing import List, Union # isort: skip 16 | 17 | import onnx 18 | import onnxruntime as ort 19 | 20 | from onnx_neural_compressor.quantization import matmul_nbits_quantizer 21 | 22 | RTNWeightOnlyQuantConfig = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig 23 | AWQWeightOnlyQuantConfig = matmul_nbits_quantizer.AWQWeightOnlyQuantConfig 24 | GPTQWeightOnlyQuantConfig = matmul_nbits_quantizer.GPTQWeightOnlyQuantConfig 25 | 26 | 27 | class MatMul4BitsQuantizer(matmul_nbits_quantizer.MatMulNBitsQuantizer): 28 | 29 | def __init__( 30 | self, 31 | model: Union[onnx.ModelProto, str], 32 | block_size: int = 128, 33 | is_symmetric: bool = False, 34 | is_signed: bool = False, 35 | accuracy_level: int = 0, 36 | nodes_to_exclude=None, 37 | algo_config: matmul_nbits_quantizer.WeightOnlyQuantConfig = None, 38 | providers: List[str] = ["CPUExecutionProvider"], 39 | optimization_level: ort.GraphOptimizationLevel = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC, 40 | ): 41 | super().__init__( 42 | model=model, 43 | block_size=block_size, 44 | is_symmetric=is_symmetric, 45 | is_signed=is_signed, 46 | accuracy_level=accuracy_level, 47 | nodes_to_exclude=nodes_to_exclude, 48 | algo_config=algo_config, 49 | n_bits=4, 50 | providers=providers, 51 | optimization_level=optimization_level, 52 | ) 53 | -------------------------------------------------------------------------------- /onnx_neural_compressor/quantization/quant_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 MIT HAN Lab 2 | # This source code is licensed under the MIT license 3 | # 4 | # Copyright (c) 2024 Intel Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import enum 19 | 20 | import onnx 21 | 22 | 23 | class QuantType(enum.Enum): # pragma: no cover 24 | """Represent QuantType value.""" 25 | 26 | QInt8 = 0 27 | QUInt8 = 1 28 | QInt4 = 4 29 | QUInt4 = 5 30 | 31 | @property 32 | def tensor_type(self): 33 | if self == QuantType.QInt8: 34 | return onnx.TensorProto.INT8 35 | if self == QuantType.QUInt8: 36 | return onnx.TensorProto.UINT8 37 | if self == QuantType.QInt4: 38 | return onnx.TensorProto.INT4 39 | if self == QuantType.QUInt4: 40 | return onnx.TensorProto.UINT4 41 | raise ValueError(f"Unexpected value qtype={self!r}.") 42 | 43 | 44 | class QuantFormat(enum.Enum): 45 | QOperator = 0 46 | QDQ = 1 47 | 48 | 49 | class CalibrationMethod(enum.Enum): 50 | MinMax = 0 51 | Entropy = 1 52 | Percentile = 2 53 | Distribution = 3 54 | -------------------------------------------------------------------------------- /onnx_neural_compressor/quantization/quantize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import pathlib 16 | import tempfile 17 | from typing import Union 18 | 19 | import onnx 20 | import onnxruntime as ort 21 | from onnxruntime.quantization.quantize import QuantConfig 22 | 23 | from onnx_neural_compressor.quantization import algorithm_entry as algos 24 | from onnx_neural_compressor.quantization import config 25 | 26 | 27 | # ORT-like user-facing API 28 | def quantize( 29 | model_input: Union[str, pathlib.Path, onnx.ModelProto], 30 | model_output: Union[str, pathlib.Path], 31 | quant_config: config.BaseConfig, 32 | optimization_level: ort.GraphOptimizationLevel = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC, 33 | ): 34 | with tempfile.TemporaryDirectory(prefix="ort.opt.") as tmp_dir: 35 | if optimization_level != ort.GraphOptimizationLevel.ORT_DISABLE_ALL: 36 | sess_options = ort.SessionOptions() 37 | sess_options.graph_optimization_level = optimization_level 38 | sess_options.optimized_model_filepath = pathlib.Path(tmp_dir).joinpath("opt.onnx").as_posix() 39 | sess_options.add_session_config_entry( 40 | "session.optimized_model_external_initializers_file_name", "opt.onnx_data" 41 | ) 42 | sess_options.add_session_config_entry( 43 | "session.optimized_model_external_initializers_min_size_in_bytes", "1024" 44 | ) 45 | session = ort.InferenceSession(model_input, sess_options, providers=["CPUExecutionProvider"]) 46 | del session 47 | model_input = sess_options.optimized_model_filepath 48 | 49 | if isinstance(quant_config, config.StaticQuantConfig): 50 | if quant_config.extra_options.get("SmoothQuant", False): 51 | algos.smooth_quant_entry( 52 | model_input, quant_config, quant_config.calibration_data_reader, model_output=model_output 53 | ) 54 | else: 55 | algos.static_quantize_entry( 56 | model_input, quant_config, quant_config.calibration_data_reader, model_output=model_output 57 | ) 58 | elif isinstance(quant_config, config.DynamicQuantConfig): 59 | algos.dynamic_quantize_entry(model_input, quant_config, model_output=model_output) 60 | else: 61 | raise TypeError( 62 | "Invalid quantization config type, it must be either StaticQuantConfig or DynamicQuantConfig." 63 | ) 64 | -------------------------------------------------------------------------------- /onnx_neural_compressor/version.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | """Neural Compressor: An open-source Python library supporting popular model compression techniques for ONNX models.""" 15 | __version__ = "1.0" 16 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.isort] 2 | profile = "black" 3 | line_length = 120 4 | extend_skip_glob = ["**/__init__.py"] 5 | 6 | [tool.black] 7 | line-length = 120 8 | 9 | [tool.codespell] 10 | skip = '*.po,*.ts,*.js,*.map,*.js.map,*.css.map,.azure-pipelines/scripts/codeScan/codespell/inc_dict.txt' 11 | count = '' 12 | quiet-level = 3 13 | ignore-words = ".azure-pipelines/scripts/codeScan/codespell/nc_dict.txt" 14 | 15 | 16 | [tool.ruff] 17 | # Exclude a variety of commonly ignored directories. 18 | exclude = [ 19 | ".bzr", 20 | ".direnv", 21 | ".eggs", 22 | ".git", 23 | ".git-rewrite", 24 | ".hg", 25 | ".ipynb_checkpoints", 26 | ".mypy_cache", 27 | ".nox", 28 | ".pants.d", 29 | ".pyenv", 30 | ".pytest_cache", 31 | ".pytype", 32 | ".ruff_cache", 33 | ".svn", 34 | ".tox", 35 | ".venv", 36 | ".vscode", 37 | "__pypackages__", 38 | "_build", 39 | "buck-out", 40 | "build", 41 | "dist", 42 | "node_modules", 43 | "site-packages", 44 | "venv", 45 | ] 46 | 47 | # Same as Black. 48 | line-length = 120 49 | indent-width = 4 50 | 51 | # Assume Python 3.8 52 | target-version = "py38" 53 | 54 | [tool.ruff.lint] 55 | # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. 56 | # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or 57 | # McCabe complexity (`C901`) by default. 58 | select = ["E4", "E7", "E9", "F"] 59 | ignore = [ 60 | "E402", # Module level import not at top of file 61 | "E501", # Line too long (121 > 120 characters) 62 | "E721", # Do not compare types, use isinstance() 63 | "E722", # Do not use bare except 64 | "E731", # Do not assign a lambda expression, use a def 65 | "E741", # Do not use variables named ‘l’, ‘O’, or ‘I’ 66 | "F401", # {name} imported but unused 67 | "F403", # from {name} import * used; unable to detect undefined names 68 | "F405", # {name} may be undefined, or defined from star imports 69 | "F841", # Local variable is assigned to but never used{name} 70 | ] 71 | 72 | # Allow fix for all enabled rules (when `--fix`) is provided. 73 | fixable = ["ALL"] 74 | unfixable = [] 75 | 76 | # Allow unused variables when underscore-prefixed. 77 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 78 | 79 | ignore-init-module-imports = true 80 | 81 | [tool.ruff.format] 82 | # Like Black, use double quotes for strings. 83 | quote-style = "double" 84 | 85 | # Like Black, indent with spaces, rather than tabs. 86 | indent-style = "space" 87 | 88 | # Like Black, respect magic trailing commas. 89 | skip-magic-trailing-comma = false 90 | 91 | # Like Black, automatically detect the appropriate line ending. 92 | line-ending = "auto" 93 | 94 | # Enable auto-formatting of code examples in docstrings. Markdown, 95 | # reStructuredText code/literal blocks and doctests are all supported. 96 | # 97 | # This is currently disabled by default, but it is planned for this 98 | # to be opt-out in the future. 99 | docstring-code-format = false 100 | 101 | # Set the line length limit used when formatting code snippets in 102 | # docstrings. 103 | # 104 | # This only has an effect when the `docstring-code-format` setting is 105 | # enabled. 
106 | docstring-code-line-length = "dynamic" 107 | -------------------------------------------------------------------------------- /requirements-lintrunner.txt: -------------------------------------------------------------------------------- 1 | lintrunner_adapters 2 | ruff==0.4.5 3 | black==24.3.0 4 | isort==5.13.2 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # onnxruntime doesn't support numpy>=2.0.0. The restriction will be removed once they fix it. 2 | numpy<2.0.0 3 | onnx 4 | onnxruntime 5 | onnxruntime-extensions 6 | psutil 7 | py-cpuinfo 8 | pydantic 9 | transformers 10 | prettytable 11 | scipy 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import io 2 | import re 3 | import subprocess 4 | 5 | import setuptools 6 | 7 | 8 | def is_commit_on_tag(): 9 | try: 10 | result = subprocess.run( 11 | ["git", "describe", "--exact-match", "--tags"], capture_output=True, text=True, check=True 12 | ) 13 | tag_name = result.stdout.strip() 14 | return tag_name 15 | except subprocess.CalledProcessError: 16 | return False 17 | 18 | 19 | def get_build_version(): 20 | if is_commit_on_tag(): 21 | return __version__ 22 | try: 23 | result = subprocess.run(["git", "describe", "--tags"], capture_output=True, text=True, check=True) 24 | _, distance, commit = result.stdout.strip().split("-") 25 | return f"{__version__}.dev{distance}+{commit}" 26 | except subprocess.CalledProcessError: 27 | return __version__ 28 | 29 | 30 | try: 31 | filepath = "./onnx_neural_compressor/version.py" 32 | with io.open(filepath) as version_file: 33 | (__version__,) = re.findall('__version__ = "(.*)"', version_file.read()) 34 | except Exception as error: 35 | assert False, "Error: Could not open '%s' due %s\n" % (filepath, error) 36 | 37 | if __name__ == "__main__": 38 | 39 | setuptools.setup( 40 | name="onnx_neural_compressor", 41 | author="Intel AIPT Team", 42 | version=get_build_version(), 43 | author_email="tai.huang@intel.com, mengni.wang@intel.com, yuwen.zhou@intel.com, suyue.chen@intel.com", 44 | description="Repository of Neural Compressor ORT", 45 | long_description=io.open("README.md", "r", encoding="utf-8").read(), 46 | long_description_content_type="text/markdown", 47 | keywords="quantization", 48 | license="Apache 2.0", 49 | url="", 50 | packages=setuptools.find_packages(), 51 | include_package_data=True, 52 | install_requires=[ 53 | "onnx", 54 | "onnxruntime", 55 | "onnxruntime-extensions", 56 | "psutil", 57 | "numpy<2.0.0", 58 | "py-cpuinfo", 59 | "pydantic", 60 | "transformers", 61 | ], 62 | python_requires=">=3.8.0", 63 | classifiers=[ 64 | "Intended Audience :: Science/Research", 65 | "Programming Language :: Python :: 3", 66 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 67 | "License :: OSI Approved :: Apache Software License", 68 | ], 69 | ) 70 | -------------------------------------------------------------------------------- /test/quantization/post_training_quant/test_quant_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | import onnx 5 | 6 | from onnx_neural_compressor.algorithms import utility as quant_utils 7 | 8 | 9 | class TestQuantUtility(unittest.TestCase): 10 | 11 | def test_pad_tensor(self): 12 | data = 
np.random.random((100, 32)) 13 | group_size = 32 14 | k_blocks = (100 - 1) // 32 + 1 15 | pad_data = quant_utils.pad_tensor(data, group_size, k_blocks) 16 | self.assertEqual(pad_data.shape, (k_blocks * group_size, 32)) 17 | 18 | def test_quant_dequant_data(self): 19 | data = np.random.random((100, 32)) 20 | qrange = quant_utils.get_qmin_qmax_for_qType( 21 | qType=onnx.TensorProto.UINT8, 22 | reduce_range=False, 23 | sym=True, 24 | ) 25 | self.assertEqual(qrange[0], 0) 26 | self.assertEqual(qrange[1], 255) 27 | 28 | rmin = np.min(np.min(data), 0) 29 | rmax = np.max(np.max(data), 0) 30 | 31 | _, _, zero_point, scale, quantized_data = quant_utils.quantize_data( 32 | data=data, 33 | qType=onnx.TensorProto.UINT8, 34 | sym=True, 35 | ) 36 | 37 | dq_data = quant_utils.dequantize_data( 38 | tensor_value=quantized_data, 39 | scale_value=scale, 40 | zo_value=zero_point, 41 | ) 42 | self.assertLess(np.max(np.abs(dq_data - data)), 0.005) 43 | 44 | _, _, zero_point, scale, quantized_data = quant_utils.quantize_data_per_channel( 45 | data=data, 46 | qType=onnx.TensorProto.UINT8, 47 | sym=True, 48 | axis=1, 49 | ) 50 | 51 | dq_data = quant_utils.dequantize_data( 52 | tensor_value=quantized_data, 53 | scale_value=scale, 54 | zo_value=zero_point, 55 | axis=1, 56 | ) 57 | 58 | self.assertLess(np.max(np.abs(dq_data - data)), 0.005) 59 | 60 | 61 | if __name__ == "__main__": 62 | unittest.main() 63 | -------------------------------------------------------------------------------- /test/quantization/test_algorithm_utility.py: -------------------------------------------------------------------------------- 1 | """Tests for algorithm utility components.""" 2 | 3 | import os 4 | import unittest 5 | 6 | import numpy as np 7 | import onnx 8 | 9 | from onnx_neural_compressor import onnx_model 10 | from onnx_neural_compressor.algorithms import utility as quant_utils 11 | 12 | 13 | def find_onnx_file(folder_path): 14 | # return first .onnx file path in folder_path 15 | for root, dirs, files in os.walk(folder_path): 16 | for file in files: 17 | if file.endswith(".onnx"): 18 | return os.path.join(root, file) 19 | return None 20 | 21 | 22 | class TestUtilityFunctions(unittest.TestCase): 23 | 24 | def test_is_B_transposed(self): 25 | node = onnx.helper.make_node( 26 | "Gemm", 27 | inputs=["a", "b", "c"], 28 | outputs=["y"], 29 | alpha=0.25, 30 | beta=0.35, 31 | transA=1, 32 | transB=1, 33 | ) 34 | self.assertTrue(quant_utils.is_B_transposed(node)) 35 | 36 | node = onnx.helper.make_node( 37 | "Gemm", 38 | inputs=["a", "b", "c"], 39 | outputs=["y"], 40 | alpha=0.25, 41 | beta=0.35, 42 | ) 43 | self.assertFalse(quant_utils.is_B_transposed(node)) 44 | 45 | def test_make_woq_dq_node(self): 46 | node = onnx.helper.make_node("MatMul", ["input", "weight"], "output", name="Matmul") 47 | with self.assertRaises(ValueError): 48 | quant_utils.make_weight_only_dequant_node( 49 | node=node, 50 | weight_shape=(32, 32), 51 | block_size=16, 52 | num_bits=32, 53 | dtype="int", 54 | q_weight=np.random.randint(0, 10, size=(2, 32), dtype=np.uint8), 55 | scale=np.random.random((2, 32)), 56 | zero_point=np.zeros((2, 32)), 57 | ) 58 | 59 | def test_split_shared_bias(self): 60 | input = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [1, 3, 15, 15]) 61 | output = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, [1, 5, 11, 11]) 62 | bias_initializer = onnx.numpy_helper.from_array(np.random.random(5).astype(np.float32), name="bias") 63 | conv1_weight_initializer = onnx.numpy_helper.from_array( 64 | 
np.random.randint(-1, 2, [5, 3, 3, 3]).astype(np.float32), name="conv1_weight" 65 | ) 66 | conv1_node = onnx.helper.make_node("Conv", ["add_out", "conv1_weight", "bias"], ["conv1_output"], name="conv1") 67 | conv2_weight_initializer = onnx.numpy_helper.from_array( 68 | np.random.randint(-1, 2, [5, 5, 3, 3]).astype(np.float32), name="conv2_weight" 69 | ) 70 | conv2_node = onnx.helper.make_node("Conv", ["add_out", "conv2_weight", "bias"], ["conv2_output"], name="conv2") 71 | initializers = [conv1_weight_initializer, conv2_weight_initializer, bias_initializer] 72 | graph = onnx.helper.make_graph([conv1_node, conv2_node], "test", [input], [output], initializer=initializers) 73 | model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) 74 | 75 | update_model = quant_utils.split_shared_bias(onnx_model.ONNXModel(model)) 76 | split = any(["_nc_split_" in i.name for i in update_model.initializer()]) 77 | self.assertTrue(split) 78 | 79 | def test_get_qmin_qmax_for_qType(self): 80 | with self.assertRaises(ValueError): 81 | quant_utils.get_qmin_qmax_for_qType(onnx.TensorProto.INT64) 82 | 83 | qmin, qmax = quant_utils.get_qmin_qmax_for_qType(onnx.TensorProto.INT8, reduce_range=True) 84 | self.assertEqual(qmin, -64) 85 | self.assertEqual(qmax, 64) 86 | 87 | 88 | if __name__ == "__main__": 89 | unittest.main() 90 | -------------------------------------------------------------------------------- /test/requirements.txt: -------------------------------------------------------------------------------- 1 | optimum 2 | pytest 3 | torch < 2.5.0 4 | -------------------------------------------------------------------------------- /test/utils/test_logger.py: -------------------------------------------------------------------------------- 1 | """Tests for logger components.""" 2 | 3 | import unittest 4 | 5 | from onnx_neural_compressor import logger 6 | 7 | log_msg_lst = [ 8 | "call logger log function.", 9 | {"msg": "call logger log function."}, 10 | ["call logger warning function", "done"], 11 | ("call logger warning function", "done"), 12 | # the following log will be prettified 13 | {"msg": "call logger warning function"}, 14 | {"msg": {("bert", "embedding"): {"weight": {"dtype": ["unint8", "int8"]}}}}, 15 | {"msg": {("bert", "embedding"): {"op": ("a", "b")}}}, 16 | # the following log will not be prettified 17 | [{"msg": "call logger warning function"}, {"msg2": "done"}], 18 | ({"msg": "call logger warning function"}, {"msg2": "done"}), 19 | ({"msg": [{"sub_msg": "call logger"}, {"sub_msg2": "call warning function"}]}, {"msg2": "done"}), 20 | ] 21 | 22 | 23 | class TestLogger(unittest.TestCase): 24 | 25 | def test_logger(self): 26 | 27 | for msg in log_msg_lst: 28 | logger.log(0, msg) 29 | logger.log(1, msg) 30 | logger.debug(msg) 31 | logger.error(msg) 32 | logger.fatal(msg) 33 | logger.info(msg) 34 | logger.warning(msg) 35 | 36 | 37 | if __name__ == "__main__": 38 | unittest.main() 39 | -------------------------------------------------------------------------------- /test/utils/test_param.py: -------------------------------------------------------------------------------- 1 | """Tests for tuning param components.""" 2 | 3 | import unittest 4 | from typing import List 5 | 6 | from onnx_neural_compressor.quantization import config 7 | 8 | 9 | class TestTuningParam(unittest.TestCase): 10 | 11 | def test_is_tunable_same_type(self): 12 | # Test when tunable_type has the same type as the default value 13 | param = config.TuningParam("param_name", [1, 2, 3], List[int]) 14 | 
self.assertTrue(param.is_tunable([4, 5, 6])) 15 | self.assertFalse(param.is_tunable(["not_an_int"])) 16 | 17 | def test_is_tunable_recursive(self): 18 | # Test recursive type checking for iterables 19 | param = config.TuningParam("param_name", [[1, 2], [3, 4]], List[List[int]]) 20 | self.assertTrue(param.is_tunable([[5, 6], [7, 8]])) 21 | # TODO: double check if this is the expected behavior 22 | self.assertTrue(param.is_tunable([[5, 6], [7, "8"]])) 23 | self.assertEqual( 24 | str(param), "TuningParam(name=param_name, tunable_type=typing.List[typing.List[int]], options=None)." 25 | ) 26 | 27 | 28 | if __name__ == "__main__": 29 | unittest.main() 30 | -------------------------------------------------------------------------------- /test/utils/test_utility.py: -------------------------------------------------------------------------------- 1 | """Tests for utility components.""" 2 | 3 | import os 4 | import shutil 5 | import unittest 6 | 7 | import onnx 8 | import onnxruntime 9 | import onnxruntime.tools.symbolic_shape_infer as symbolic_shape_infer 10 | import optimum.exporters.onnx 11 | 12 | from onnx_neural_compressor import onnx_model, utility 13 | 14 | 15 | def find_onnx_file(folder_path): 16 | # return first .onnx file path in folder_path 17 | for root, dirs, files in os.walk(folder_path): 18 | for file in files: 19 | if file.endswith(".onnx"): 20 | return os.path.join(root, file) 21 | return None 22 | 23 | 24 | class TestOptions(unittest.TestCase): 25 | 26 | def test_set_random_seed(self): 27 | seed = 12345 28 | utility.set_random_seed(seed) 29 | self.assertEqual(utility.options.random_seed, seed) 30 | 31 | # non int type 32 | seed = "12345" 33 | with self.assertRaises(AssertionError): 34 | utility.set_random_seed(seed) 35 | 36 | 37 | class TestCPUInfo(unittest.TestCase): 38 | 39 | def test_cpu_info(self): 40 | cpu_info = utility.CpuInfo() 41 | assert cpu_info.cores_per_socket > 0, "CPU count should be greater than 0" 42 | assert isinstance(cpu_info.bf16, bool), "bf16 should be a boolean" 43 | assert isinstance(cpu_info.vnni, bool), "avx512 should be a boolean" 44 | 45 | 46 | class TestLazyImport(unittest.TestCase): 47 | 48 | def test_lazy_import(self): 49 | # Test import 50 | pydantic = utility.LazyImport("pydantic") 51 | assert pydantic.__name__ == "pydantic", "pydantic should be imported" 52 | 53 | def test_lazy_import_error(self): 54 | # Test import error 55 | with self.assertRaises(ImportError): 56 | non_existent_module = utility.LazyImport("non_existent_module") 57 | non_existent_module.non_existent_function() 58 | 59 | 60 | class TestSingletonDecorator: 61 | 62 | def test_singleton_decorator(self): 63 | 64 | @utility.singleton 65 | class TestSingleton: 66 | 67 | def __init__(self): 68 | self.value = 0 69 | 70 | instance = TestSingleton() 71 | instance.value = 1 72 | instance2 = TestSingleton() 73 | assert instance2.value == 1, "Singleton should return the same instance" 74 | 75 | 76 | class TestGetVersion(unittest.TestCase): 77 | 78 | def test_get_version(self): 79 | from onnx_neural_compressor import version 80 | 81 | self.assertTrue(isinstance(version.__version__, str)) 82 | 83 | 84 | class TestUtilityFunctions(unittest.TestCase): 85 | 86 | def test_check_value(self): 87 | src = [1, 2, 3] 88 | supported_type = int 89 | supported_value = [1, 2, 3] 90 | result = utility.check_value("name", src, supported_type, supported_value) 91 | self.assertTrue(result) 92 | 93 | src = [1, 2, 3] 94 | supported_type = list 95 | with self.assertRaises(AssertionError) as cm: 96 | 
utility.check_value("name", src, supported_type) 97 | self.assertEqual( 98 | str(cm.exception), 99 | "Type of 'name' items should be but not [, , ]", 100 | ) 101 | 102 | src = 1 103 | supported_type = list 104 | with self.assertRaises(AssertionError) as cm: 105 | utility.check_value("name", src, supported_type) 106 | self.assertEqual(str(cm.exception), "Type of 'name' should be but not ") 107 | 108 | src = "a" 109 | supported_type = str 110 | supported_value = ["b"] 111 | with self.assertRaises(AssertionError) as cm: 112 | utility.check_value("name", src, supported_type, supported_value) 113 | self.assertEqual(str(cm.exception), "'a' is not in supported 'name': ['b']. Skip setting it.") 114 | 115 | src = ["a"] 116 | supported_type = str 117 | supported_value = ["b"] 118 | with self.assertRaises(AssertionError) as cm: 119 | utility.check_value("name", src, supported_type, supported_value) 120 | self.assertEqual(str(cm.exception), "['a'] is not in supported 'name': ['b']. Skip setting it.") 121 | 122 | 123 | if __name__ == "__main__": 124 | unittest.main() 125 | --------------------------------------------------------------------------------