├── .azure-pipelines ├── docker │ └── Dockerfile.devel ├── model-test.yml ├── scripts │ ├── change_color.sh │ ├── codeScan │ │ └── codespell │ │ │ └── nc_dict.txt │ ├── install_nc.sh │ ├── models │ │ ├── collect_results.py │ │ ├── env_setup.sh │ │ ├── generate_report.py │ │ ├── run_onnxrt_llm_models_trigger.sh │ │ ├── run_onnxrt_models_trigger.sh │ │ ├── summarize_results.py │ │ └── templates │ │ │ └── model.jinja2 │ └── ut │ │ ├── collect_log.sh │ │ ├── compare_coverage.sh │ │ ├── coverage.ort │ │ └── run_ort.sh ├── template │ ├── docker-template.yml │ ├── model-template.yml │ └── ut-template.yml └── ut-3x-ort.yml ├── .github ├── license_template.txt ├── pull_request_template.md └── workflows │ └── lint.yaml ├── .gitignore ├── .lintrunner.toml ├── LICENSE ├── README.md ├── SECURITY.md ├── docs ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── SECURITY.md ├── autotune.md ├── calibration.md ├── design.md ├── imgs │ ├── architecture.png │ ├── common │ │ ├── code.svg │ │ └── right.svg │ ├── lwq_ort.png │ ├── smoothquant.png │ ├── sq_convert.png │ ├── sq_pc.png │ └── workflow.png ├── installation_guide.md ├── quantization.md ├── quantization_layer_wise.md ├── quantization_weight_only.md └── smooth_quant.md ├── examples ├── .config │ └── model_params_onnxrt.json ├── image_recognition │ └── resnet50 │ │ └── quantization │ │ └── ptq_static │ │ ├── README.md │ │ ├── main.py │ │ ├── prepare_model.py │ │ ├── requirements.txt │ │ ├── run_benchmark.sh │ │ └── run_quant.sh └── nlp │ ├── bert │ └── quantization │ │ ├── ptq_dynamic │ │ ├── README.md │ │ ├── main.py │ │ ├── prepare_data.sh │ │ ├── prepare_model.py │ │ ├── requirements.txt │ │ ├── run_benchmark.sh │ │ └── run_quant.sh │ │ └── ptq_static │ │ ├── README.md │ │ ├── main.py │ │ ├── prepare_data.sh │ │ ├── prepare_model.py │ │ ├── requirements.txt │ │ ├── run_benchmark.sh │ │ └── run_quant.sh │ └── huggingface_model │ ├── text_generation │ └── quantization │ │ └── weight_only │ │ ├── README.md │ │ ├── evaluation │ │ ├── __init__.py │ │ ├── accuracy.py │ │ ├── evaluator.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ └── huggingface.py │ │ └── utils.py │ │ ├── main.py │ │ ├── prepare_model.py │ │ ├── requirements.txt │ │ ├── run_benchmark.sh │ │ └── run_quant.sh │ └── text_to_image │ └── stable_diffusion_v1_5 │ └── quantization │ └── ptq_static │ ├── README.md │ ├── imgs │ ├── fp32.png │ └── int8.png │ ├── main.py │ ├── requirements.txt │ ├── run_benchmark.sh │ └── run_quant.sh ├── onnx_neural_compressor ├── __init__.py ├── algorithms │ ├── __init__.py │ ├── layer_wise │ │ ├── __init__.py │ │ └── core.py │ ├── post_training_quant │ │ ├── __init__.py │ │ ├── calibrate.py │ │ ├── calibrator.py │ │ ├── operators │ │ │ ├── __init__.py │ │ │ ├── activation.py │ │ │ ├── argmax.py │ │ │ ├── attention.py │ │ │ ├── base_op.py │ │ │ ├── binary_op.py │ │ │ ├── concat.py │ │ │ ├── conv.py │ │ │ ├── direct_q8.py │ │ │ ├── embed_layernorm.py │ │ │ ├── gather.py │ │ │ ├── gavgpool.py │ │ │ ├── gemm.py │ │ │ ├── lstm.py │ │ │ ├── matmul.py │ │ │ ├── maxpool.py │ │ │ ├── pad.py │ │ │ ├── pooling.py │ │ │ ├── reduce.py │ │ │ ├── resize.py │ │ │ ├── split.py │ │ │ └── unary_op.py │ │ └── quantizer.py │ ├── smoother │ │ ├── __init__.py │ │ ├── calibrator.py │ │ └── core.py │ ├── utility.py │ └── weight_only │ │ ├── __init__.py │ │ ├── awq.py │ │ ├── gptq.py │ │ └── rtn.py ├── constants.py ├── data_reader.py ├── logger.py ├── onnx_model.py ├── quantization │ ├── __init__.py │ ├── algorithm_entry.py │ ├── config.py │ ├── matmul_4bits_quantizer.py │ ├── 
matmul_nbits_quantizer.py │ ├── quant_utils.py │ ├── quantize.py │ └── tuning.py ├── utility.py └── version.py ├── pyproject.toml ├── requirements-lintrunner.txt ├── requirements.txt ├── setup.py └── test ├── quantization ├── layer_wise │ └── test_layer_wise.py ├── post_training_quant │ ├── test_calibrate.py │ ├── test_operators.py │ ├── test_post_training_quant.py │ └── test_quant_utils.py ├── test_algorithm_utility.py ├── test_autotune.py ├── test_config.py ├── test_smooth_quant.py └── weight_only │ ├── test_awq.py │ ├── test_gptq.py │ └── test_rtn.py ├── requirements.txt └── utils ├── test_general.py ├── test_logger.py ├── test_onnx_model.py ├── test_param.py └── test_utility.py /.azure-pipelines/docker/Dockerfile.devel: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ARG UBUNTU_VER=22.04 16 | FROM ubuntu:${UBUNTU_VER} as devel 17 | 18 | # See http://bugs.python.org/issue19846 19 | ENV LANG C.UTF-8 20 | 21 | RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \ 22 | python3 \ 23 | python3-pip \ 24 | python3-dev \ 25 | python3-distutils \ 26 | autoconf \ 27 | build-essential \ 28 | git \ 29 | libgl1-mesa-glx \ 30 | libglib2.0-0 \ 31 | numactl \ 32 | time \ 33 | wget \ 34 | bc \ 35 | jq \ 36 | vim 37 | 38 | RUN ln -sf $(which python3) /usr/bin/python 39 | 40 | RUN python -m pip --no-cache-dir install --upgrade pip 41 | RUN python -m pip install --no-cache-dir setuptools 42 | 43 | RUN pip list 44 | 45 | WORKDIR / 46 | 47 | -------------------------------------------------------------------------------- /.azure-pipelines/model-test.yml: -------------------------------------------------------------------------------- 1 | trigger: none 2 | 3 | pr: 4 | autoCancel: true 5 | drafts: false 6 | branches: 7 | include: 8 | - main 9 | paths: 10 | include: 11 | - onnx_neural_compressor 12 | - setup.py 13 | - requirements.txt 14 | - .azure-pipelines/scripts/models 15 | - .azure-pipelines/model-test.yml 16 | - .azure-pipelines/template/model-template.yml 17 | exclude: 18 | - test 19 | 20 | variables: 21 | OUT_SCRIPT_PATH: $(Build.SourcesDirectory)/.azure-pipelines/scripts/models 22 | SCRIPT_PATH: /neural_compressor/.azure-pipelines/scripts 23 | 24 | parameters: 25 | - name: algorithms 26 | type: object 27 | default: 28 | - SQ 29 | - WOQ 30 | - name: models 31 | type: object 32 | default: 33 | - bert_base_MRPC 34 | - bert_base_MRPC_dynamic 35 | - resnet50-v1-12_qdq 36 | - resnet50-v1-12 37 | 38 | stages: 39 | # - stage: ONNX_LLM_Models 40 | # displayName: Run ONNX LLM Model 41 | # pool: ICX-16C 42 | # dependsOn: [] 43 | # jobs: 44 | # - ${{ each algorithm in parameters.algorithms }}: 45 | # - job: 46 | # steps: 47 | # - template: template/model-template.yml 48 | # parameters: 49 | # modelName: "facebook/opt-125m" 50 | # algorithm: "${{ algorithm }}" 51 | # script_path: "run_onnxrt_llm_models_trigger.sh" 52 | 
53 | - stage: ONNX_Models 54 | displayName: Run ONNX Model 55 | pool: MODEL_PERF_TEST 56 | dependsOn: [] 57 | jobs: 58 | - ${{ each model in parameters.models }}: 59 | - job: 60 | displayName: ${{ model }} 61 | steps: 62 | - template: template/model-template.yml 63 | parameters: 64 | modelName: "${{ model }}" 65 | algorithm: "Quantize" 66 | script_path: "run_onnxrt_models_trigger.sh" 67 | 68 | - stage: GenerateLogs 69 | displayName: Generate Report 70 | pool: 71 | vmImage: "ubuntu-latest" 72 | dependsOn: [ONNX_Models] 73 | jobs: 74 | - job: GenerateReport 75 | steps: 76 | - script: | 77 | echo ${BUILD_SOURCESDIRECTORY} 78 | rm -fr ${BUILD_SOURCESDIRECTORY} || sudo rm -fr ${BUILD_SOURCESDIRECTORY} || true 79 | echo y | docker system prune 80 | displayName: "Clean workspace" 81 | - checkout: self 82 | clean: true 83 | displayName: "Checkout out Repo" 84 | - task: DownloadPipelineArtifact@2 85 | inputs: 86 | artifact: 87 | patterns: "**/result.json" 88 | path: $(OUT_SCRIPT_PATH) 89 | - task: UsePythonVersion@0 90 | displayName: "Use Python 3.10" 91 | inputs: 92 | versionSpec: "3.10" 93 | - script: | 94 | cd ${OUT_SCRIPT_PATH} 95 | mkdir generated last_generated 96 | python -u summarize_results.py --logs_dir $(OUT_SCRIPT_PATH) --output_dir generated 97 | displayName: "Summarize all results" 98 | - task: DownloadPipelineArtifact@2 99 | continueOnError: true 100 | inputs: 101 | source: "specific" 102 | artifact: "FinalReport" 103 | patterns: "**.json" 104 | path: $(OUT_SCRIPT_PATH)/last_generated 105 | project: $(System.TeamProject) 106 | pipeline: "onc model test" 107 | runVersion: "specific" 108 | runId: $(refer_buildId) 109 | displayName: "Download last logs" 110 | - script: | 111 | echo "------ Generating final report.html ------" 112 | cd ${OUT_SCRIPT_PATH} 113 | pip install jinja2 114 | python generate_report.py --json_path generated/summary.json --last_json_path last_generated/summary.json 115 | displayName: "Generate report" 116 | - task: PublishPipelineArtifact@1 117 | inputs: 118 | targetPath: $(OUT_SCRIPT_PATH)/generated 119 | artifact: FinalReport 120 | publishLocation: "pipeline" 121 | displayName: "Publish report" 122 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/change_color.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------- general approach start---------------- 4 | 5 | # 1. import this file: 6 | # source path/change_color.sh 7 | # 2. use COLOR/BG: 8 | # $VARIABLE_NAME && out_put_content && $RESET 9 | # 3. COLOR + BG: 10 | # $COLOR/BG_VARIABLE_NAME && $BG/COLOR_VARIABLE_NAME && out_put_content && $RESET 11 | # 4. 
custom 12 | # abbreviation(change number) 13 | # txt number range (30, 37) 14 | # bg number range (40, 47) 15 | # special effects number range (1, 7) 16 | # echo -en \\E[number1 + ; + number2 + ; + number3 + m" 17 | # e.g - BG_GRAY+LIGHT_RED = "echo -en \\E[47;31m" 18 | 19 | # -------------- general approach end----------------== 20 | 21 | 22 | # general setting 23 | # ------------- light_color start---------------- 24 | # black 25 | LIGHT_BLACK="echo -en \\E[30m" 26 | # red 27 | LIGHT_RED="echo -en \\E[31m" 28 | # green 29 | LIGHT_GREEN="echo -en \\E[32m" 30 | # yellow 31 | LIGHT_YELLOW="echo -en \\E[33m" 32 | # blue 33 | LIGHT_BLUE="echo -en \\E[34m" 34 | # purple 35 | LIGHT_PURPLE="echo -en \\E[35m" 36 | # cyan 37 | LIGHT_CYAN="echo -en \\E[36m" 38 | # gray 39 | LIGHT_GRAY="echo -en \\E[37m" 40 | # ------------- light_color end---------------- 41 | 42 | # ------------- bold_color start---------------- 43 | # black 44 | BOLD_BLACK="echo -en \\E[1;30m" 45 | # red 46 | BOLD_RED="echo -en \\E[1;31m" 47 | # green 48 | BOLD_GREEN="echo -en \\E[1;32m" 49 | # yellow 50 | BOLD_YELLOW="echo -en \\E[1;33m" 51 | # blue 52 | BOLD_BLUE="echo -en \\E[1;34m" 53 | # purple 54 | BOLD_PURPLE="echo -en \\E[1;35m" 55 | # cyan 56 | BOLD_CYAN="echo -en \\E[1;36m" 57 | # gray 58 | BOLD_GRAY="echo -en \\E[1;37m" 59 | # ------------- bold_color end---------------- 60 | 61 | # ------------- background_color start---------------- 62 | # black 63 | BG_BLACK="echo -en \\E[40m" 64 | # red 65 | BG_RED="echo -en \\E[41m" 66 | # green 67 | BG_GREEN="echo -en \\E[42m" 68 | # yellow 69 | BG_YELLOW="echo -en \\E[43m" 70 | # blue 71 | BG_BLUE="echo -en \\E[44m" 72 | # purple 73 | BG_PURPLE="echo -en \\E[45m" 74 | # cyan 75 | BG_CYAN="echo -en \\E[46m" 76 | # gray 77 | BG_GRAY="echo -en \\E[47m" 78 | # ------------- background_color end---------------- 79 | 80 | # close 81 | RESET="echo -en \\E[0m" 82 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/codeScan/codespell/nc_dict.txt: -------------------------------------------------------------------------------- 1 | datas 2 | nd 3 | ot 4 | ue 5 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/install_nc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo -e "\n Install Neural Compressor ... " 4 | cd /neural-compressor 5 | 6 | python -m pip install --no-cache-dir -r requirements.txt 7 | python setup.py bdist_wheel 8 | pip install dist/onnx_neural_compressor*.whl --force-reinstall 9 | 10 | echo -e "\n pip list after install Neural Compressor ... 
" 11 | pip list 12 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/models/collect_results.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--model", required=True, type=str) 8 | parser.add_argument("--build_id", required=True, type=str) 9 | args = parser.parse_args() 10 | 11 | URL = ( 12 | "https://dev.azure.com/lpot-inc/onnx-neural-compressor/_build/results?buildId=" 13 | + args.build_id 14 | + "&view=artifacts&pathAsName=false&type=publishedArtifacts" 15 | ) 16 | REFER_SUMMARY_PATH = "/neural-compressor/.azure-pipelines/scripts/models/summary.json" 17 | 18 | 19 | def str_to_float(value): 20 | try: 21 | return round(float(value), 4) 22 | except ValueError: 23 | return value 24 | 25 | 26 | def get_refer_data(): 27 | if not os.path.exists(REFER_SUMMARY_PATH): 28 | print(f"The file '{REFER_SUMMARY_PATH}' does not exist.") 29 | return {} 30 | 31 | with open(REFER_SUMMARY_PATH, "r") as file: 32 | refer = json.load(file) 33 | return refer 34 | 35 | 36 | def check_status(performance, accuracy): 37 | refer = get_refer_data() 38 | 39 | refer_accuracy = refer.get(args.model, {}).get("accuracy", {}).get("value", "N/A") 40 | refer_performance = refer.get(args.model, {}).get("performance", {}).get("value", "N/A") 41 | print(f"{accuracy=}\n{refer_accuracy=}\n{performance=}\n{refer_performance=}") 42 | 43 | assert accuracy != "N/A" and performance != "N/A" 44 | if refer_accuracy != "N/A": 45 | assert abs(accuracy - refer_accuracy) <= 0.001 46 | if refer_performance != "N/A": 47 | assert (refer_performance - performance) / refer_performance <= 0.08 48 | 49 | 50 | def main(): 51 | result_dict = { 52 | args.model: { 53 | "performance": {"value": "N/A", "log_path": URL}, 54 | "accuracy": {"value": "N/A", "log_path": URL}, 55 | } 56 | } 57 | 58 | pattern = { 59 | "performance": r"Throughput: ([\d.]+)", 60 | "accuracy": r"Accuracy: ([\d.]+)", 61 | } 62 | 63 | for mode, _ in result_dict[args.model].items(): 64 | log_file = f"/neural-compressor/.azure-pipelines/scripts/models/{args.model}/{mode}.log" 65 | if not os.path.exists(log_file): 66 | print(f"The file '{log_file}' does not exist.") 67 | continue 68 | 69 | with open(log_file, "r") as file: 70 | log_content = file.read() 71 | 72 | match = re.search(pattern[mode], log_content) 73 | 74 | if match: 75 | result_dict[args.model][mode]["value"] = str_to_float(match.group(1)) 76 | 77 | with open(f"/neural-compressor/.azure-pipelines/scripts/models/{args.model}/result.json", "w") as json_file: 78 | json.dump(result_dict, json_file, indent=4) 79 | 80 | check_status(result_dict[args.model]["performance"]["value"], result_dict[args.model]["accuracy"]["value"]) 81 | 82 | 83 | if __name__ == "__main__": 84 | main() 85 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/models/env_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | PATTERN='[-a-zA-Z0-9_]*=' 4 | 5 | for i in "$@"; do 6 | case $i in 7 | --model=*) 8 | model=${i//${PATTERN}/} 9 | ;; 10 | *) 11 | echo "Parameter $i not recognized." 
12 | exit 1 13 | ;; 14 | esac 15 | done 16 | 17 | CONFIG_PATH="/neural-compressor/examples/.config/model_params_onnxrt.json" 18 | model_src_dir=$(jq -r ".\"onnxrt\".\"$model\".\"model_src_dir\"" "$CONFIG_PATH") 19 | 20 | log_dir="/neural-compressor/.azure-pipelines/scripts/models" 21 | 22 | $BOLD_YELLOW && echo "======= creat log_dir =========" && $RESET 23 | if [ -d "${log_dir}/${model}" ]; then 24 | $BOLD_GREEN && echo "${log_dir}/${model} already exists, don't need to mkdir." && $RESET 25 | else 26 | $BOLD_GREEN && echo "no log dir ${log_dir}/${model}, create." && $RESET 27 | cd "${log_dir}" 28 | mkdir "${model}" 29 | fi 30 | 31 | $BOLD_YELLOW && echo "====== install ONC ======" && $RESET 32 | cd /neural-compressor 33 | source .azure-pipelines/scripts/change_color.sh 34 | /bin/bash .azure-pipelines/scripts/install_nc.sh 35 | 36 | $BOLD_YELLOW && echo "====== install requirements ======" && $RESET 37 | cd "/neural-compressor/examples/$model_src_dir" 38 | pip install -r requirements.txt 39 | pip list 40 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/models/generate_report.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | from jinja2 import Environment, FileSystemLoader 6 | 7 | parser = argparse.ArgumentParser(allow_abbrev=False) 8 | parser.add_argument("--json_path", type=str, required=True) 9 | parser.add_argument("--last_json_path", type=str, required=True) 10 | args = parser.parse_args() 11 | 12 | 13 | def get_data(json_path): 14 | """ 15 | { 16 | model: { 17 | "performance": {"value": "N/A"|number, "log_path": string}, 18 | "accuracy": {"value": "N/A"|number, "log_path": string}, 19 | } 20 | } 21 | """ 22 | if os.path.exists(json_path): 23 | with open(json_path, "r") as f: 24 | return json.load(f) 25 | else: 26 | return {} 27 | 28 | 29 | def get_ratio(cur, last): 30 | if cur == "N/A" or last == "N/A": 31 | ratio = "N/A" 32 | else: 33 | ratio = (float(cur) - float(last)) / float(last) * 100 34 | ratio = round(float(ratio), 2) 35 | return ratio 36 | 37 | 38 | def get_accuracy_ratio(current_json, last_accuracy_dict): 39 | compare_result_dict = [] 40 | for model, item in current_json.items(): 41 | current_accuracy = item.get("accuracy", {}).get("value", "N/A") 42 | last_accuracy = last_accuracy_dict.get(model, {}).get("accuracy", {}).get("value", "N/A") 43 | accuracy_ratio = get_ratio(current_accuracy, last_accuracy) 44 | 45 | current_performance = item.get("performance", {}).get("value", "N/A") 46 | last_performance = last_accuracy_dict.get(model, {}).get("performance", {}).get("value", "N/A") 47 | performance_ratio = get_ratio(current_performance, last_performance) 48 | 49 | if accuracy_ratio == "N/A" or performance_ratio == "N/A": 50 | status = "FAILURE" 51 | elif accuracy_ratio != 0: 52 | status = "FAILURE" 53 | elif performance_ratio > 8 or performance_ratio < -8: 54 | status = "FAILURE" 55 | else: 56 | status = "SUCCESS" 57 | 58 | format_ratio = lambda x: f"{x}%" if x != "N/A" else x 59 | 60 | compare_result_dict.append( 61 | { 62 | "model": model, 63 | "current_accuracy": current_accuracy, 64 | "last_accuracy": last_accuracy, 65 | "accuracy_ratio": format_ratio(accuracy_ratio), 66 | "current_performance": current_performance, 67 | "last_performance": last_performance, 68 | "performance_ratio": format_ratio(performance_ratio), 69 | "status": status, 70 | } 71 | ) 72 | return compare_result_dict 73 | 74 | 75 | def 
generate(rendered_template): 76 | with open("generated/report.html", "w") as html_file: 77 | html_file.write(rendered_template) 78 | 79 | 80 | def main(): 81 | path = "{}/templates/".format(os.path.dirname(__file__)) 82 | BUILD_BUILDID = os.getenv("BUILD_BUILDID") 83 | 84 | loader = FileSystemLoader(path) 85 | env = Environment(loader=loader) 86 | template = env.get_template("model.jinja2") 87 | 88 | data = get_data(args.json_path) 89 | last_data = get_data(args.last_json_path) 90 | data = get_accuracy_ratio(data, last_data) 91 | info = { 92 | "url": f"https://dev.azure.com/lpot-inc/onnx-neural-compressor/_build/results?buildId={BUILD_BUILDID}", 93 | "branch": os.getenv("SYSTEM_PULLREQUEST_SOURCEBRANCH"), 94 | "commit": os.getenv("BUILD_SOURCEVERSION"), 95 | "build_number": BUILD_BUILDID, 96 | } 97 | 98 | rendered_template = template.render(data=data, info=info) 99 | generate(rendered_template) 100 | 101 | 102 | if __name__ == "__main__": 103 | main() 104 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/models/run_onnxrt_llm_models_trigger.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | set -xe 4 | PATTERN='[-a-zA-Z0-9_]*=' 5 | 6 | for i in "$@"; do 7 | case $i in 8 | --stage=*) 9 | stage=${i//${PATTERN}/} 10 | ;; 11 | --model=*) 12 | model=${i//${PATTERN}/} 13 | ;; 14 | *) 15 | echo "Parameter $i not recognized." 16 | exit 1 17 | ;; 18 | esac 19 | done 20 | 21 | model_src_dir=/neural-compressor/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only 22 | dataset_location=NeelNanda/pile-10k 23 | input_model=/tf_dataset2/models/huggingface/opt-125m 24 | batch_size=16 25 | 26 | function run_prepare_model() { 27 | python prepare_model.py --input_model="$input_model" --output_model="./model_export" --task=text-generation-with-past 28 | } 29 | 30 | function run_quantize() { 31 | bash run_quant.sh --input_model="./model_export" \ 32 | --output_model="./model_tune" \ 33 | --batch_size="$batch_size" \ 34 | --dataset="$dataset_location" \ 35 | --tokenizer="$model" \ 36 | --algorithm=WOQ_TUNE 37 | } 38 | 39 | function run_accuracy() { 40 | bash run_benchmark.sh --input_model="./model_tune" \ 41 | --batch_size="$batch_size" \ 42 | --mode=accuracy \ 43 | --tokenizer="$model" \ 44 | --tasks=lambada_openai | tee -a accuracy.log 45 | } 46 | 47 | function main() { 48 | cd "$model_src_dir" 49 | if [ "$stage" == "prepare_model" ]; then 50 | run_prepare_model 51 | elif [ "$stage" == "quantize" ]; then 52 | run_quantize 53 | elif [ "$stage" == "accuracy" ]; then 54 | run_accuracy 55 | else 56 | exit 1 57 | fi 58 | } 59 | 60 | main 61 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/models/run_onnxrt_models_trigger.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | set -xe 4 | PATTERN='[-a-zA-Z0-9_]*=' 5 | 6 | for i in "$@"; do 7 | case $i in 8 | --stage=*) 9 | stage=${i//${PATTERN}/} 10 | ;; 11 | --model=*) 12 | model=${i//${PATTERN}/} 13 | ;; 14 | *) 15 | echo "Parameter $i not recognized." 
16 | exit 1 17 | ;; 18 | esac 19 | done 20 | 21 | log_dir="/neural-compressor/.azure-pipelines/scripts/models/$model" 22 | CONFIG_PATH="/neural-compressor/examples/.config/model_params_onnxrt.json" 23 | model_src_dir=$(jq -r ".\"onnxrt\".\"$model\".\"model_src_dir\"" "$CONFIG_PATH") 24 | if [[ "$model" == *"resnet"* ]]; then 25 | dataset_location="/tf_dataset2/datasets/imagenet/ImagenetRaw/ImagenetRaw_small_5000/ILSVRC2012_img_val" 26 | label_path="/tf_dataset2/datasets/imagenet/ImagenetRaw/ImagenetRaw_small_5000/val.txt" 27 | else 28 | dataset_location=$(jq -r ".\"onnxrt\".\"$model\".\"dataset_location\"" "$CONFIG_PATH") 29 | fi 30 | 31 | input_model=$(jq -r ".\"onnxrt\".\"$model\".\"input_model\"" "$CONFIG_PATH") 32 | 33 | function run_prepare_model() { 34 | if [ -f "$input_model" ]; then 35 | echo "model exists" 36 | else 37 | echo "model not found" && exit 1 38 | fi 39 | } 40 | 41 | function run_quantize() { 42 | if [[ "$model" == "bert_base_MRPC" ]]; then 43 | bash run_quant.sh --input_model="$input_model" \ 44 | --dataset_location="$dataset_location" \ 45 | --label_path="$label_path" \ 46 | --output_model="./model_tune" \ 47 | --quant_format="QDQ" | tee -a "$log_dir/tuning.log" 48 | else 49 | bash run_quant.sh --input_model="$input_model" \ 50 | --dataset_location="$dataset_location" \ 51 | --label_path="$label_path" \ 52 | --output_model="./model_tune" | tee -a "$log_dir/tuning.log" 53 | fi 54 | } 55 | 56 | function run_accuracy() { 57 | bash run_benchmark.sh --input_model="./model_tune" \ 58 | --dataset_location="$dataset_location" \ 59 | --label_path="$label_path" \ 60 | --mode="accuracy" \ 61 | --batch_size="16" | tee -a "$log_dir/accuracy.log" 62 | } 63 | 64 | function run_performance() { 65 | bash run_benchmark.sh --input_model="./model_tune" \ 66 | --dataset_location="$dataset_location" \ 67 | --label_path="$label_path" \ 68 | --mode="performance" \ 69 | --intra_op_num_threads="8" \ 70 | --batch_size="1" | tee -a "$log_dir/performance.log" 71 | } 72 | 73 | function main() { 74 | cd "/neural-compressor/examples/$model_src_dir" 75 | if [ "$stage" == "prepare_model" ]; then 76 | run_prepare_model 77 | elif [ "$stage" == "quantize" ]; then 78 | run_quantize 79 | elif [ "$stage" == "accuracy" ]; then 80 | run_accuracy 81 | elif [ "$stage" == "performance" ]; then 82 | run_performance 83 | else 84 | echo "invalid stage: $stage" && exit 1 85 | fi 86 | } 87 | 88 | main 89 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/models/summarize_results.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--logs_dir", required=True, type=str) 7 | parser.add_argument("--output_dir", required=True, type=str) 8 | args = parser.parse_args() 9 | 10 | 11 | def read_json_file(file_path): 12 | with open(file_path, "r") as file: 13 | return json.load(file) 14 | 15 | 16 | def write_json_file(data, file_path): 17 | with open(file_path, "w") as file: 18 | json.dump(data, file, indent=4) 19 | 20 | 21 | def merge_json_files(root_dir, output_file): 22 | merged_data = {} 23 | 24 | for subdir, _, files in os.walk(root_dir): 25 | for file in files: 26 | if file.endswith(".json"): 27 | file_path = os.path.join(subdir, file) 28 | try: 29 | json_data = read_json_file(file_path) 30 | merged_data.update(json_data) 31 | except json.JSONDecodeError: 32 | print(f"Error decoding JSON from file: {file_path}") 33 | 
34 | print(merged_data) 35 | write_json_file(merged_data, f"{output_file}/summary.json") 36 | 37 | 38 | def main(): 39 | merge_json_files(args.logs_dir, args.output_dir) 40 | print(f"All JSON files have been merged into {args.output_dir}") 41 | 42 | 43 | if __name__ == "__main__": 44 | main() 45 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/models/templates/model.jinja2: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | ONNX Neural Compressor - Model test 9 | 55 | 56 | 57 | 58 |
59 |

Model test 60 | [ Job - {{ info['build_number'] }} ] 61 |

62 | 63 |

Summary

64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 75 | 79 | 80 |
Repo | Test Branch | Commit ID
Neural Compressor{{info['branch'] }} 74 | 76 | {{ 77 | info['commit'] }} 78 |
81 | 82 |

Model Test

83 | 84 | 85 | {% for title in ["Platform", "Model", "Accuracy(new|last)", "Ratio(Accuracy)", 86 | "Performance(new|last)", "Ratio(Performance)", "Status"] %} 87 | 88 | {% endfor %} 89 | 90 |
91 | {% for item in data %} 92 |
93 | 94 | 95 | 96 | 97 | 98 | 99 | {% if item.status == 'SUCCESS' %} 100 | 101 | {% else %} 102 | 103 | {% endif %} 104 | 105 | {% endfor %} 106 | 107 |
{{ title }}
ICX{{ item.model }}{{ item.current_accuracy }} | {{ item.last_accuracy }}{{ item.accuracy_ratio }}{{ item.current_performance }} | {{ item.last_performance }}{{ item.performance_ratio }}{{ item.status }}{{ item.status }}
108 |
109 | 110 | 111 | -------------------------------------------------------------------------------- /.azure-pipelines/scripts/ut/coverage.ort: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | 4 | [report] 5 | include = 6 | */onnx_neural_compressor/** 7 | exclude_lines = 8 | pragma: no cover 9 | raise NotImplementedError 10 | raise TypeError 11 | if self.device == "gpu": 12 | if device == "gpu": 13 | except ImportError: 14 | except Exception as e: -------------------------------------------------------------------------------- /.azure-pipelines/scripts/ut/run_ort.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python -c "import neural_compressor as nc" 3 | test_case="run ONNXRT" 4 | echo "${test_case}" 5 | 6 | # install requirements 7 | echo "set up UT env..." 8 | pip install -r /neural-compressor/test/requirements.txt 9 | pip install pytest-cov 10 | pip install pytest-html 11 | pip list 12 | 13 | export COVERAGE_RCFILE=/neural-compressor/.azure-pipelines/scripts/ut/coverage.ort 14 | inc_path=$(python -c 'import onnx_neural_compressor; print(onnx_neural_compressor.__path__[0])') 15 | cd /neural-compressor/test || exit 1 16 | 17 | LOG_DIR=/neural-compressor/log_dir 18 | mkdir -p ${LOG_DIR} 19 | ut_log_name=${LOG_DIR}/ut_ort.log 20 | pytest --cov="${inc_path}" -vs --disable-warnings --html=report.html --self-contained-html . 2>&1 | tee -a ${ut_log_name} 21 | 22 | cp report.html ${LOG_DIR}/ 23 | 24 | if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then 25 | echo "Find errors in pytest case, please check the output..." 26 | echo "Please search for '== FAILURES ==' or '== ERRORS =='" 27 | exit 1 28 | fi 29 | 30 | # if ut pass, collect the coverage file into artifacts 31 | cp .coverage ${LOG_DIR}/.coverage 32 | 33 | echo "UT finished successfully! " -------------------------------------------------------------------------------- /.azure-pipelines/template/docker-template.yml: -------------------------------------------------------------------------------- 1 | parameters: 2 | - name: dockerConfigName 3 | type: string 4 | default: "commonDockerConfig" 5 | - name: repoName 6 | type: string 7 | default: "neural-compressor" 8 | - name: repoTag 9 | type: string 10 | default: "py310" 11 | - name: dockerFileName 12 | type: string 13 | default: "Dockerfile" 14 | - name: containerName 15 | type: string 16 | - name: repo 17 | type: string 18 | default: "https://github.com/onnx/neural-compressor" 19 | 20 | steps: 21 | - task: Bash@3 22 | inputs: 23 | targetType: "inline" 24 | script: | 25 | docker ps -a 26 | if [[ $(docker ps -a | grep -i '${{ parameters.containerName }}'$) ]]; then 27 | docker start $(docker ps -aq) 28 | echo "remove left files through container ..." 
29 | docker exec ${{ parameters.containerName }} bash -c "ls -a /neural-compressor && rm -fr /neural-compressor/* && rm -fr /neural-compressor/.* && ls -a /neural-compressor || true" 30 | fi 31 | displayName: "Docker workspace clean up" 32 | 33 | - ${{ if eq(parameters.dockerConfigName, 'commonDockerConfig') }}: 34 | - script: | 35 | rm -fr ${BUILD_SOURCESDIRECTORY} || sudo rm -fr ${BUILD_SOURCESDIRECTORY} || true 36 | echo y | docker system prune 37 | displayName: "Clean workspace" 38 | 39 | - checkout: self 40 | clean: true 41 | displayName: "Checkout out Repo" 42 | 43 | - ${{ if eq(parameters.dockerConfigName, 'gitCloneDockerConfig') }}: 44 | - script: | 45 | rm -fr ${BUILD_SOURCESDIRECTORY} || sudo rm -fr ${BUILD_SOURCESDIRECTORY} || true 46 | mkdir ${BUILD_SOURCESDIRECTORY} 47 | chmod 777 ${BUILD_SOURCESDIRECTORY} 48 | echo y | docker system prune 49 | displayName: "Clean workspace" 50 | 51 | - checkout: none 52 | 53 | - script: | 54 | git clone ${{ parameters.repo }} ${BUILD_SOURCESDIRECTORY} 55 | git config --global --add safe.directory ${BUILD_SOURCESDIRECTORY} 56 | cd ${BUILD_SOURCESDIRECTORY} 57 | git checkout main 58 | displayName: "Checkout out main" 59 | 60 | - script: | 61 | if [[ ! $(docker images | grep -i ${{ parameters.repoName }}:${{ parameters.repoTag }}) ]]; then 62 | docker build -f ${BUILD_SOURCESDIRECTORY}/.azure-pipelines/docker/${{parameters.dockerFileName}}.devel -t ${{ parameters.repoName }}:${{ parameters.repoTag }} . 63 | fi 64 | docker images | grep -i ${{ parameters.repoName }} 65 | if [[ $? -ne 0 ]]; then 66 | echo "NO Such Repo" 67 | exit 1 68 | fi 69 | displayName: "Build develop docker image" 70 | 71 | - script: | 72 | docker stop $(docker ps -aq) 73 | docker rm -vf ${{ parameters.containerName }} || true 74 | env | sort 75 | displayName: "Clean docker container" 76 | 77 | - ${{ if ne(parameters.containerName, '') }}: 78 | - task: Bash@3 79 | inputs: 80 | targetType: "inline" 81 | script: | 82 | docker run -dit --disable-content-trust --privileged --name=${{ parameters.containerName }} --shm-size="2g" \ 83 | -v ${BUILD_SOURCESDIRECTORY}:/neural-compressor -v /tf_dataset:/tf_dataset -v /tf_dataset2:/tf_dataset2 ${{ parameters.repoName }}:${{ parameters.repoTag }} 84 | echo "Show the container list after docker run ... 
" 85 | docker ps -a 86 | displayName: "Docker run - ${{ parameters.containerName }} Container" 87 | -------------------------------------------------------------------------------- /.azure-pipelines/template/model-template.yml: -------------------------------------------------------------------------------- 1 | parameters: 2 | - name: modelName 3 | type: string 4 | - name: modelContainerName 5 | type: string 6 | default: "ONC" 7 | - name: algorithm 8 | type: string 9 | - name: script_path 10 | type: string 11 | default: "run_onnxrt_models_trigger.sh" 12 | 13 | steps: 14 | - template: docker-template.yml 15 | parameters: 16 | dockerConfigName: "commonDockerConfig" 17 | repoName: "neural-compressor" 18 | repoTag: "py310" 19 | dockerFileName: "Dockerfile" 20 | containerName: ${{ parameters.modelContainerName }} 21 | 22 | - script: | 23 | docker exec ${{ parameters.modelContainerName }} bash -c \ 24 | "cd /neural-compressor/.azure-pipelines/scripts/models && bash env_setup.sh --model=${{ parameters.modelName }}" 25 | displayName: Env setup 26 | 27 | - script: | 28 | docker exec ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \ 29 | && bash ${{ parameters.script_path }} --model=${{ parameters.modelName }} --stage='prepare_model'" 30 | displayName: Export Models 31 | 32 | - task: DownloadPipelineArtifact@2 33 | continueOnError: true 34 | inputs: 35 | source: "specific" 36 | artifact: "FinalReport" 37 | patterns: "**.json" 38 | path: $(Build.SourcesDirectory)/.azure-pipelines/scripts/models/ 39 | project: $(System.TeamProject) 40 | pipeline: "onc model test" 41 | runVersion: "specific" 42 | runId: $(refer_buildId) 43 | displayName: "Download refer logs" 44 | 45 | - script: | 46 | docker exec ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \ 47 | && bash ${{ parameters.script_path }} --model=${{ parameters.modelName }} --stage='quantize'" 48 | displayName: Quantize 49 | 50 | - script: | 51 | docker exec ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \ 52 | && bash ${{ parameters.script_path }} --model=${{ parameters.modelName }} --stage='accuracy'" 53 | displayName: Run Accuracy Test 54 | 55 | - ${{ if eq(parameters.algorithm, 'Quantize') }}: 56 | - script: | 57 | docker exec ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \ 58 | && bash ${{ parameters.script_path }} --model=${{ parameters.modelName }} --stage='performance'" 59 | displayName: Run Performance Test 60 | 61 | - task: Bash@3 62 | inputs: 63 | targetType: "inline" 64 | script: | 65 | docker exec ${{ parameters.modelContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts/models \ 66 | && python collect_results.py --model=${{ parameters.modelName }} --build_id=$(Build.BuildId)" 67 | displayName: Collect Log & Check Results 68 | 69 | - task: PublishPipelineArtifact@1 70 | inputs: 71 | targetPath: $(Build.SourcesDirectory)/.azure-pipelines/scripts/models/${{ parameters.modelName }}/ 72 | artifact: ${{ parameters.algorithm }}_${{ parameters.modelName }} 73 | publishLocation: "pipeline" 74 | 75 | - task: Bash@3 76 | condition: always() 77 | inputs: 78 | targetType: "inline" 79 | script: | 80 | docker exec ${{ parameters.modelContainerName }} bash -c "rm -fr /neural-compressor/* && rm -fr /neural-compressor/.* || true" 81 | displayName: "Docker Clean Up" 82 | 
-------------------------------------------------------------------------------- /.azure-pipelines/template/ut-template.yml: -------------------------------------------------------------------------------- 1 | parameters: 2 | - name: dockerConfigName 3 | type: string 4 | default: "commonDockerConfig" 5 | - name: repo 6 | type: string 7 | default: "https://github.com/onnx/neural-compressor" 8 | - name: utScriptFileName 9 | type: string 10 | - name: uploadPath 11 | type: string 12 | - name: utArtifact 13 | type: string 14 | - name: utTestMode 15 | type: string 16 | default: "coverage" 17 | - name: utContainerName 18 | type: string 19 | default: "utTest" 20 | 21 | steps: 22 | - template: docker-template.yml 23 | parameters: 24 | dockerConfigName: ${{ parameters.dockerConfigName }} 25 | repoName: "neural-compressor" 26 | repoTag: "py310" 27 | dockerFileName: "Dockerfile" 28 | containerName: ${{ parameters.utContainerName }} 29 | repo: ${{ parameters.repo }} 30 | 31 | - script: | 32 | docker exec ${{ parameters.utContainerName }} bash -c "cd /neural-compressor/.azure-pipelines/scripts \ 33 | && bash install_nc.sh \ 34 | && bash ut/${{ parameters.utScriptFileName }}.sh" 35 | displayName: "Run UT" 36 | 37 | - task: PublishPipelineArtifact@1 38 | condition: succeededOrFailed() 39 | inputs: 40 | targetPath: ${{ parameters.uploadPath }} 41 | artifact: $(System.JobAttempt)_${{ parameters.utArtifact }}_report 42 | publishLocation: "pipeline" 43 | 44 | - ${{ if eq(parameters.utTestMode, 'coverage') }}: 45 | - task: PublishPipelineArtifact@1 46 | inputs: 47 | targetPath: ${{ parameters.uploadPath }} 48 | artifact: ${{ parameters.utArtifact }}_coverage 49 | publishLocation: "pipeline" 50 | 51 | - task: Bash@3 52 | condition: always() 53 | inputs: 54 | targetType: "inline" 55 | script: | 56 | docker exec ${{ parameters.utContainerName }} bash -c "rm -fr /neural-compressor/* && rm -fr /neural-compressor/.* || true" 57 | displayName: "Docker clean up" 58 | -------------------------------------------------------------------------------- /.azure-pipelines/ut-3x-ort.yml: -------------------------------------------------------------------------------- 1 | trigger: none 2 | 3 | pr: 4 | autoCancel: true 5 | drafts: false 6 | branches: 7 | include: 8 | - main 9 | paths: 10 | include: 11 | - onnx_neural_compressor 12 | - test 13 | - setup.py 14 | - requirements.txt 15 | - .azure-pipelines/scripts/ut 16 | - .azure-pipelines/ut-3x-ort.yml 17 | 18 | pool: ICX-16C 19 | 20 | variables: 21 | IMAGE_NAME: "neural-compressor" 22 | IMAGE_TAG: "py310" 23 | UPLOAD_PATH: $(Build.SourcesDirectory)/log_dir 24 | DOWNLOAD_PATH: $(Build.SourcesDirectory)/log_dir 25 | ARTIFACT_NAME: "UT_coverage_report_ort" 26 | REPO: $(Build.Repository.Uri) 27 | 28 | stages: 29 | - stage: ONNXRT 30 | displayName: Unit Test ONNXRT 31 | dependsOn: [] 32 | jobs: 33 | - job: 34 | displayName: Unit Test ONNXRT 35 | steps: 36 | - template: template/ut-template.yml 37 | parameters: 38 | dockerConfigName: "commonDockerConfig" 39 | utScriptFileName: "run_ort" 40 | uploadPath: $(UPLOAD_PATH) 41 | utArtifact: "ut" 42 | 43 | 44 | - stage: ONNXRT_baseline 45 | displayName: Unit Test ONNXRT baseline 46 | dependsOn: [] 47 | jobs: 48 | - job: 49 | displayName: Unit Test ONNXRT baseline 50 | steps: 51 | - template: template/ut-template.yml 52 | parameters: 53 | dockerConfigName: "gitCloneDockerConfig" 54 | utScriptFileName: "run_ort" 55 | uploadPath: $(UPLOAD_PATH) 56 | utArtifact: "ut_baseline" 57 | repo: $(REPO) 58 | 59 | - stage: Coverage 60 | displayName: 
"Coverage Compare" 61 | pool: 62 | vmImage: "ubuntu-latest" 63 | dependsOn: [ONNXRT, ONNXRT_baseline] 64 | jobs: 65 | - job: CollectDatafiles 66 | steps: 67 | - script: | 68 | if [[ ! $(docker images | grep -i ${IMAGE_NAME}:${IMAGE_TAG}) ]]; then 69 | docker build -f ${BUILD_SOURCESDIRECTORY}/.azure-pipelines/docker/Dockerfile.devel -t ${IMAGE_NAME}:${IMAGE_TAG} . 70 | fi 71 | docker images | grep -i ${IMAGE_NAME} 72 | if [[ $? -ne 0 ]]; then 73 | echo "NO Such Repo" 74 | exit 1 75 | fi 76 | displayName: "Build develop docker image" 77 | 78 | - task: DownloadPipelineArtifact@2 79 | inputs: 80 | artifact: 81 | patterns: '*_coverage/.coverage' 82 | path: $(DOWNLOAD_PATH) 83 | 84 | - script: | 85 | echo "--- create container ---" 86 | docker run -d -it --name="collectLogs" -v ${BUILD_SOURCESDIRECTORY}:/neural-compressor ${IMAGE_NAME}:${IMAGE_TAG} /bin/bash 87 | echo "--- docker ps ---" 88 | docker ps 89 | echo "--- collect logs ---" 90 | docker exec collectLogs /bin/bash +x -c "cd /neural-compressor/.azure-pipelines/scripts \ 91 | && bash install_nc.sh \ 92 | && bash ut/collect_log.sh" 93 | displayName: "Collect UT Coverage" 94 | 95 | - task: PublishPipelineArtifact@1 96 | condition: succeededOrFailed() 97 | inputs: 98 | targetPath: $(UPLOAD_PATH) 99 | artifact: $(ARTIFACT_NAME) 100 | publishLocation: "pipeline" 101 | 102 | - task: Bash@3 103 | condition: always() 104 | inputs: 105 | targetType: "inline" 106 | script: | 107 | docker exec collectLogs bash -c "rm -fr /neural-compressor/* && rm -fr /neural-compressor/.* || true" 108 | displayName: "Docker clean up" 109 | -------------------------------------------------------------------------------- /.github/license_template.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024 Intel Corporation 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## Type of Change 2 | 3 | feature or bug fix or documentation or validation or others 4 | API changed or not 5 | 6 | ## Description 7 | 8 | detail description 9 | 10 | ## Expected Behavior & Potential Risk 11 | 12 | the expected behavior that triggered by this PR 13 | 14 | ## How has this PR been tested? 15 | 16 | how to reproduce the test (including hardware information) 17 | 18 | ## Dependency Change? 
19 | 20 | any library dependency introduced or removed 21 | -------------------------------------------------------------------------------- /.github/workflows/lint.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) ONNX Neural Compressor Project Contributors 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | name: Lint 6 | 7 | on: 8 | push: 9 | branches: 10 | - main 11 | pull_request: 12 | merge_group: 13 | 14 | permissions: # set top-level default permissions as security best practice 15 | contents: read 16 | 17 | concurrency: 18 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' }} 19 | cancel-in-progress: true 20 | 21 | jobs: 22 | optional-lint: 23 | name: Optional Lint 24 | runs-on: ubuntu-latest 25 | steps: 26 | - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 27 | - name: misspell # Check spellings as well 28 | uses: reviewdog/action-misspell@5bd7be2fc7ae56a517184f5c4bbcf2fd7afe3927 # v1.17.0 29 | with: 30 | github_token: ${{ secrets.github_token }} 31 | locale: "US" 32 | reporter: github-pr-check 33 | level: info 34 | filter_mode: diff_context 35 | - name: shellcheck # Static check shell scripts 36 | uses: reviewdog/action-shellcheck@72365a51bf6476fe952a117c3ff703eb7775e40a # v1.20.0 37 | with: 38 | github_token: ${{ secrets.github_token }} 39 | reporter: github-pr-check 40 | level: info 41 | filter_mode: diff_context 42 | 43 | enforce-style: 44 | name: Enforce style 45 | runs-on: ubuntu-latest 46 | permissions: 47 | security-events: write 48 | steps: 49 | - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 50 | - name: Setup Python 51 | uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 52 | with: 53 | python-version: "3.12" 54 | - name: Install ONNX Neural Compressor 55 | run: | 56 | pip install . 57 | - name: Install dependencies 58 | run: | 59 | python -m pip install lintrunner lintrunner-adapters 60 | lintrunner init 61 | - name: Run lintrunner on all files 62 | run: | 63 | set +e 64 | if ! 
lintrunner --force-color --all-files --tee-json=lint.json -v; then 65 | echo "" 66 | echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner\`.\e[0m" 67 | echo -e "\e[1m\e[36mSee https://github.com/onnx/neural-compressor/blob/main/.lintrunner.toml for setup instructions.\e[0m" 68 | exit 1 69 | fi 70 | - name: Produce SARIF 71 | if: always() 72 | run: | 73 | python -m lintrunner_adapters to-sarif lint.json lintrunner.sarif 74 | - name: Upload SARIF file 75 | # Use always() to always upload SARIF even if lintrunner returns with error code 76 | # To toggle linter comments in the files page, press `i` on the keyboard 77 | if: always() 78 | continue-on-error: true 79 | uses: github/codeql-action/upload-sarif@cdcdbb579706841c47f7063dda365e292e5cad7a # v2.13.4 80 | with: 81 | # Path to SARIF file relative to the root of the repository 82 | sarif_file: lintrunner.sarif 83 | category: lintrunner 84 | checkout_path: ${{ github.workspace }} 85 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .vscode 3 | .idea 4 | /venv/ 5 | */__pycache__ 6 | .ipynb_checkpoints/ 7 | *.snapshot 8 | *.csv 9 | *.pb 10 | *.ckpt 11 | *.log 12 | *.swp 13 | *.onnx 14 | *.so 15 | *.egg-info/ 16 | .eggs/ 17 | dist/ 18 | tags 19 | build/ 20 | _build 21 | lpot_workspace/ 22 | .torch/ 23 | node_modules 24 | build_tmp 25 | -------------------------------------------------------------------------------- /.lintrunner.toml: -------------------------------------------------------------------------------- 1 | # Configuration for lintrunner https://github.com/suo/lintrunner 2 | # You can install the dependencies and initialize with 3 | # 4 | # ```sh 5 | # pip install lintrunner lintrunner-adapters 6 | # lintrunner init 7 | # ``` 8 | # 9 | # This will install lintrunner on your system and download all the necessary 10 | # dependencies to run linters locally. 11 | # If you want to see what lintrunner init will install, run 12 | # `lintrunner init --dry-run`. 13 | # 14 | # To lint local changes: 15 | # 16 | # ```bash 17 | # lintrunner 18 | # ``` 19 | # 20 | # To lint all files: 21 | # 22 | # ```bash 23 | # lintrunner --all-files 24 | # ``` 25 | # 26 | # To format files: 27 | # 28 | # ```bash 29 | # lintrunner -a 30 | # ``` 31 | # 32 | # To read more about lintrunner, see [wiki](https://github.com/pytorch/pytorch/wiki/lintrunner). 33 | # To update an existing linting rule or create a new one, modify this file or create a 34 | # new adapter following examples in https://github.com/justinchuby/lintrunner-adapters. 
35 | merge_base_with = 'main' 36 | 37 | [[linter]] 38 | code = 'RUFF' 39 | include_patterns = [ 40 | '**/*.py', 41 | '**/*.pyi', 42 | ] 43 | exclude_patterns = [ 44 | '*_pb2*', 45 | '.setuptools-cmake-build/*', 46 | 'docs/**', 47 | ] 48 | command = [ 49 | 'python', 50 | '-m', 51 | 'lintrunner_adapters', 52 | 'run', 53 | 'ruff_linter', 54 | '--config=pyproject.toml', 55 | '@{{PATHSFILE}}' 56 | ] 57 | init_command = [ 58 | 'python', 59 | '-m', 60 | 'lintrunner_adapters', 61 | 'run', 62 | 'pip_init', 63 | '--dry-run={{DRYRUN}}', 64 | '--requirement=requirements-lintrunner.txt', 65 | ] 66 | is_formatter = true 67 | 68 | [[linter]] 69 | code = 'BLACK-ISORT' 70 | include_patterns = [ 71 | '**/*.py', 72 | ] 73 | exclude_patterns = [ 74 | '*_pb2*', 75 | '.setuptools-cmake-build/*', 76 | 'cmake/**', 77 | 'docs/**', 78 | ] 79 | command = [ 80 | 'python', 81 | '-m', 82 | 'lintrunner_adapters', 83 | 'run', 84 | 'black_isort_linter', 85 | '--', 86 | '@{{PATHSFILE}}' 87 | ] 88 | init_command = [ 89 | 'python', 90 | '-m', 91 | 'lintrunner_adapters', 92 | 'run', 93 | 'pip_init', 94 | '--dry-run={{DRYRUN}}', 95 | '--requirement=requirements-lintrunner.txt', 96 | ] 97 | is_formatter = true 98 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | Neural Compressor 4 | =========================== 5 |

An open-source Python library supporting popular model compression techniques for ONNX

6 | 7 | [![python](https://img.shields.io/badge/python-3.8%2B-blue)](https://github.com/onnx/neural-compressor) 8 | [![version](https://img.shields.io/badge/release-1.0-green)](https://github.com/onnx/neural-compressor/releases) 9 | [![license](https://img.shields.io/badge/license-Apache%202-blue)](https://github.com/onnx/neural-compressor/blob/master/LICENSE) 10 | 11 | 12 | --- 13 |
14 | 15 | Neural Compressor aims to provide popular model compression techniques, inherited from [Intel Neural Compressor](https://github.com/intel/neural-compressor) but focused on ONNX model quantization, such as SmoothQuant and weight-only quantization, through [ONNX Runtime](https://onnxruntime.ai/). In particular, the tool provides the following key features, typical examples, and open collaborations: 16 | 17 | * Support a wide range of Intel hardware, such as [Intel Xeon Scalable Processors](https://www.intel.com/content/www/us/en/products/details/processors/xeon/scalable.html) and AI PCs 18 | 19 | * Validate popular LLMs such as [LLama2](./examples/nlp/huggingface_model/text_generation/), [Llama3](./examples/nlp/huggingface_model/text_generation/), and [Qwen2](./examples/nlp/huggingface_model/text_generation/), and broad models such as [BERT-base](./examples/nlp/bert/quantization) and [ResNet50](./examples/image_recognition/resnet50/quantization/ptq_static), from popular model hubs such as [Hugging Face](https://huggingface.co/) and the [ONNX Model Zoo](https://github.com/onnx/models#models), by leveraging automatic [accuracy-driven](./docs/design.md#workflow) quantization strategies 20 | 21 | * Collaborate with software platforms such as [Microsoft Olive](https://github.com/microsoft/Olive) and with the open AI ecosystem, including [Hugging Face](https://huggingface.co/blog/intel), [ONNX](https://github.com/onnx/models#models), and [ONNX Runtime](https://github.com/microsoft/onnxruntime) 22 | 23 | ## Installation 24 | 25 | ### Install from source 26 | ```Shell 27 | git clone https://github.com/onnx/neural-compressor.git 28 | cd neural-compressor 29 | pip install -r requirements.txt 30 | pip install . 31 | ``` 32 | 33 | > **Note**: 34 | > Further installation methods can be found in the [Installation Guide](./docs/installation_guide.md). 35 | 36 | ## Getting Started 37 | 38 | Set up the environment: 39 | ```bash 40 | pip install onnx-neural-compressor "onnxruntime>=1.17.0" onnx 41 | ``` 42 | After successfully installing these packages, try your first quantization program. 43 | > Note: please install from source until the formal PyPI release. 44 | 45 | ### Weight-Only Quantization (LLMs) 46 | The following example code demonstrates weight-only quantization on LLMs; when multiple devices are available, the most efficient device is selected automatically. 
47 | 48 | Run the example: 49 | ```python 50 | from onnx_neural_compressor.quantization import matmul_nbits_quantizer 51 | 52 | algo_config = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig() 53 | quant = matmul_nbits_quantizer.MatMulNBitsQuantizer( 54 | model, 55 | n_bits=4, 56 | block_size=32, 57 | is_symmetric=True, 58 | algo_config=algo_config, 59 | ) 60 | quant.process() 61 | best_model = quant.model 62 | ``` 63 | 64 | ### Static Quantization 65 | 66 | ```python 67 | from onnx_neural_compressor.quantization import quantize, config 68 | from onnx_neural_compressor import data_reader 69 | 70 | 71 | class DataReader(data_reader.CalibrationDataReader): 72 | def __init__(self): 73 | self.encoded_list = [] 74 | # append data into self.encoded_list 75 | 76 | self.iter_next = iter(self.encoded_list) 77 | 78 | def get_next(self): 79 | return next(self.iter_next, None) 80 | 81 | def rewind(self): 82 | self.iter_next = iter(self.encoded_list) 83 | 84 | 85 | data_reader = DataReader() 86 | qconfig = config.StaticQuantConfig(calibration_data_reader=data_reader) 87 | quantize(model, output_model_path, qconfig) 88 | ``` 89 | 90 | ## Documentation 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 |
Overview: Architecture | Workflow | Examples
Feature: Quantization | SmoothQuant | Weight-Only Quantization (INT8/INT4) | Layer-Wise Quantization
121 | 122 | 123 | 124 | ## Additional Content 125 | 126 | * [Contribution Guidelines](./docs/source/CONTRIBUTING.md) 127 | * [Security Policy](SECURITY.md) 128 | 129 | ## Communication 130 | - [GitHub Issues](https://github.com/onnx/neural-compressor/issues): mainly for bug reports, new feature requests, question asking, etc. 131 | - [Email](mailto:inc.maintainers@intel.com): welcome to raise any interesting research ideas on model compression techniques by email for collaborations. 132 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 6 | 7 | # Security Policy 8 | 9 | ## Reporting a Vulnerability 10 | If you think you have found a security vulnerability, please send a report to onnx-security@lists.lfaidata.foundation. Please do not post security vulnerabilities on Slack. 11 | 12 | We don't currently have a PGP key, unfortunately. 13 | 14 | An ONNX committer will send you a response indicating the next steps in handling your report. After the initial reply to your report, the committer will keep you informed of the progress towards a fix and full announcement, and may ask for additional information or guidance. 15 | 16 | Important: Please don't disclose the vulnerability before it has been fixed and announced, to protect our users. 17 | 18 | ## Security announcements 19 | Please subscribe to the [announcements mailing list](https://lists.lfaidata.foundation/g/onnx-announce), where we post notifications and remediation details for security vulnerabilities. 20 | -------------------------------------------------------------------------------- /docs/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | Contributor Covenant Code of Conduct 2 | ======================= 3 | 4 | The ONNX Code Of Conduct is described at https://onnx.ai/codeofconduct.html 5 | -------------------------------------------------------------------------------- /docs/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Contribution Guidelines 2 | ======================= 3 | 1. [Create Pull Request](#create-pull-request) 4 | 2. [Pull Request Checklist](#pull-request-checklist) 5 | 3. [Pull Request Template](#pull-request-template) 6 | 4. [Pull Request Acceptance Criteria](#pull-request-acceptance-criteria) 7 | 5. [Pull Request Status Checks Overview](#pull-request-status-checks-overview) 8 | 6. [Support](#support) 9 | 7. [Contributor Covenant Code of Conduct](#contributor-covenant-code-of-conduct) 10 | 11 | ## Create Pull Request 12 | If you have improvements to Neural Compressor, send your pull requests for 13 | [review](https://github.com/onnx/neural-compressor/pulls). 14 | If you are new to GitHub, view the pull request [How To](https://help.github.com/articles/using-pull-requests/). 15 | ### Step-by-Step guidelines 16 | - Star this repository using the button `Star` in the top right corner. 17 | - Fork this Repository using the button `Fork` in the top right corner. 18 | - Clone your forked repository to your pc. 19 | `git clone "url to your repo"` 20 | - Create a new branch for your modifications. 21 | `git checkout -b new-branch` 22 | - Add your files with `git add -A`, commit `git commit -s -m "This is my commit message"` and push `git push origin new-branch`. 23 | - Create a [pull request](https://github.com/onnx/neural-compressor/pulls). 
24 | 25 | ## Pull Request Checklist 26 | 27 | Before sending your pull request, follow the checklist below: 28 | 29 | - Add unit tests in [Unit Tests](https://github.com/onnx/neural-compressor/tree/main/test) to cover the code you would like to contribute. 30 | - Neural Compressor has adopted the [Developer Certificate of Origin](https://en.wikipedia.org/wiki/Developer_Certificate_of_Origin); you must agree to its terms by signing off each of your commits with `-s`, e.g. `git commit -s -m 'This is my commit message'`. 31 | 32 | ## Pull Request Template 33 | 34 | See the [PR template](/.github/pull_request_template.md). 35 | 36 | ## Pull Request Acceptance Criteria 37 | - At least two approvals from reviewers 38 | 39 | - All detected status checks pass 40 | 41 | - All conversations resolved 42 | 43 | - Third-party dependency licenses compatible 44 | 45 | ## Pull Request Status Checks Overview 46 | Neural Compressor uses [Azure DevOps](https://learn.microsoft.com/en-us/azure/devops/pipelines/?view=azure-devops) for CI testing 47 | and generally uses [Azure Cloud Instances](https://azure.microsoft.com/en-us/pricing/purchase-options/pay-as-you-go) to deploy pipelines, e.g. Standard E16s v5. 48 | | Test Name | Test Scope | Test Pass Criteria | 49 | |-------------------------------|-----------------------------------------------|---------------------------| 50 | | Pre-commit CI | [pre-commit config](../../.pre-commit-config.yaml) | PASS | 51 | | [DCO](https://github.com/apps/dco/) | Use `git commit -s` to sign off | PASS | 52 | | Unit Test | Pytest scripts under [test](/test) | PASS (No failure, No core dump, No segmentation fault, No coverage drop) | 53 | 54 | ## Support 55 | 56 | Submit your questions, feature requests, and bug reports to the 57 | [GitHub issues](https://github.com/onnx/neural-compressor/issues) page. You may also reach out to [Maintainers](mailto:inc.maintainers@intel.com). 58 | 59 | ## Contributor Covenant Code of Conduct 60 | 61 | This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant Code of Conduct](./CODE_OF_CONDUCT.md). 62 | -------------------------------------------------------------------------------- /docs/SECURITY.md: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 | 7 | Security Policy 8 | =============== 9 | 10 | ## Reporting a Vulnerability 11 | If you think you have found a security vulnerability, please send a report to onnx-security@lists.lfaidata.foundation. Please do not post security vulnerabilities on Slack. 12 | 13 | We don't currently have a PGP key, unfortunately. 14 | 15 | An ONNX committer will send you a response indicating the next steps in handling your report. After the initial reply to your report, the committer will keep you informed of the progress towards a fix and full announcement, and may ask for additional information or guidance. 16 | 17 | Important: Please don't disclose the vulnerability before it has been fixed and announced, to protect our users. 18 | 19 | ## Security announcements 20 | Please subscribe to the [announcements mailing list](https://lists.lfaidata.foundation/g/onnx-announce), where we post notifications and remediation details for security vulnerabilities.
21 | -------------------------------------------------------------------------------- /docs/autotune.md: -------------------------------------------------------------------------------- 1 | AutoTune 2 | ======================================== 3 | 4 | 1. [Overview](#overview) 5 | 2. [How it Works](#how-it-works) 6 | 3. [Working with Autotune](#working-with-autotune) 7 | 4. [Get Started](#get-started) 8 | 9 | 10 | ## Overview 11 | 12 | Neural Compressor aims to help users quickly deploy low-precision models by leveraging popular compression techniques, such as post-training quantization and weight-only quantization algorithms. Despite having a variety of these algorithms, finding the appropriate configuration for a model can be difficult and time-consuming. To address this, we built the `autotune` module which identifies the best algorithm configuration for models to achieve optimal performance under the certain accuracy criteria. This module allows users to easily use predefined tuning recipes and customize the tuning space as needed. 13 | 14 | ## How it Works 15 | 16 | The autotune module constructs the tuning space according to the pre-defined tuning set or users' tuning set. It iterates the tuning space and applies the configuration on given float model then records and compares its evaluation result with the baseline. The tuning process stops when meeting the exit policy. 17 | The workflow is as below: 18 | 19 | 20 | Workflow 21 | 22 | 23 | 24 | ## Working with Autotune 25 | 26 | The `autotune` API can be used across all algorithms supported by Neural Compressor. It accepts three primary arguments: `model_input`, `tune_config`, and `eval_fn`. 27 | 28 | The `TuningConfig` class defines the tuning process, including the tuning space, order, and exit policy. 29 | 30 | - Define the tuning space 31 | 32 | User can define the tuning space by setting `config_set` with an algorithm configuration or a set of configurations. 33 | ```python 34 | # Use the default tuning space 35 | config_set = config.get_woq_tuning_config() 36 | 37 | # Customize the tuning space with one algorithm configurations 38 | config_set = config.RTNConfig(weight_sym=False, weight_group_size=[32, 64]) 39 | 40 | # Customize the tuning space with two algorithm configurations 41 | config_set = [ 42 | config.RTNConfig(weight_sym=False, weight_group_size=32), 43 | config.GPTQConfig(weight_group_size=128, weight_sym=False), 44 | ] 45 | ``` 46 | 47 | - Define the tuning order 48 | 49 | The tuning order determines how the process traverses the tuning space and samples configurations. Users can customize it by configuring the `sampler`. Currently, we provide the [`default_sampler`](https://github.com/onnx/neural-compressor/blob/main/onnx_neural_compressor/quantization/tuning.py#L210), which samples configurations sequentially, always in the same order. 50 | 51 | - Define the exit policy 52 | 53 | The exit policy includes two components: accuracy goal (`tolerable_loss`) and the allowed number of trials (`max_trials`). The tuning process will stop when either condition is met. 54 | 55 | ## Get Started 56 | The example below demonstrates how to autotune a ONNX model on four `RTNConfig` configurations. 57 | 58 | ```python 59 | from onnx_neural_compressor.quantization import config, tuning 60 | 61 | 62 | def eval_fn(model) -> float: 63 | return ... 
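# Note: the `eval_fn` above is only a stub. In practice it just needs to return a
# single float score (higher is better) measured on the candidate model. The
# commented sketch below is an illustration only, assuming the candidate arrives as
# an onnx.ModelProto and that a user-provided `eval_dataloader` yields
# (input_feed, label) pairs:
#
# import onnxruntime as ort
#
# def eval_fn(model) -> float:
#     session = ort.InferenceSession(model.SerializeToString(), providers=["CPUExecutionProvider"])
#     correct = total = 0
#     for input_feed, label in eval_dataloader:
#         pred = session.run(None, input_feed)[0].argmax(-1)
#         correct += int(pred == label)
#         total += 1
#     return correct / total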
64 | 65 | 66 | tune_config = tuning.TuningConfig( 67 | config_set=config.RTNConfig( 68 | weight_sym=[False, True], 69 | weight_group_size=[32, 128] 70 | ), 71 | tolerable_loss=0.2, 72 | max_trials=10, 73 | ) 74 | q_model = tuning.autotune(model, tune_config=tune_config, eval_fn=eval_fn) 75 | ``` -------------------------------------------------------------------------------- /docs/calibration.md: -------------------------------------------------------------------------------- 1 | # Calibration Algorithms in Quantization 2 | 3 | 1. [Introduction](#introduction) 4 | 2. [Calibration Algorithms](#calibration-algorithms) 5 | 3. [Support Matrix](#support-matrix) 6 | 7 | ## Introduction 8 | 9 | Quantization proves beneficial in terms of reducing the memory and computational requirements of the model. Uniform quantization transforms the input value $x \in [\beta, \alpha]$ to lie within $[-2^{b-1}, 2^{b-1} - 1]$, where $[\beta, \alpha]$ is the range of real values chosen for quantization and $b$ is the bit-width of the signed integer representation. Calibration is the process of determining $\alpha$ and $\beta$ for model weights and activations. Refer to this [link](./quantization.md#quantization-fundamentals) for more quantization fundamentals. 10 | 11 | ## Calibration Algorithms 12 | 13 | Currently, Neural Compressor supports three popular calibration algorithms: 14 | 15 | - MinMax: This method takes the maximum and minimum of the input values as $\alpha$ and $\beta$ [^1]. It preserves the entire range and is the simplest approach. 16 | 17 | - Entropy: This method minimizes the KL divergence to reduce the information loss between the full-precision and quantized data [^2]. Its primary focus is on preserving essential information. 18 | 19 | - Percentile: This method only considers a specific percentage of values for calculating the range, ignoring the remainder, which may contain outliers [^3]. It enhances resolution by excluding extreme values while still retaining noteworthy data. 20 | 21 | > `kl` is used to represent the Entropy calibration algorithm in Neural Compressor. 22 | 23 | ## Reference 24 | 25 | [^1]: Vanhoucke, Vincent, Andrew Senior, and Mark Z. Mao. "Improving the speed of neural networks on CPUs." (2011). 26 | 27 | [^2]: Szymon Migacz. "Nvidia 8-bit inference with TensorRT." (2017). 28 | 29 | [^3]: McKinstry, Jeffrey L., et al. "Discovering low-precision networks close to full-precision networks for efficient embedded inference." arXiv preprint arXiv:1809.04191 (2018). 30 | -------------------------------------------------------------------------------- /docs/design.md: -------------------------------------------------------------------------------- 1 | Design 2 | ===== 3 | Neural Compressor features an architecture and workflow that help increase performance and speed up deployments across infrastructures.
4 | 5 | ## Architecture 6 | 7 | 8 | Architecture 9 | 10 | 11 | ## Workflow 12 | 13 | 14 | Workflow 15 | 16 | -------------------------------------------------------------------------------- /docs/imgs/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onnx/neural-compressor/3dd43d3d852d43c1133005dbe440cfc18a1d7471/docs/imgs/architecture.png -------------------------------------------------------------------------------- /docs/imgs/common/code.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/imgs/common/right.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/imgs/lwq_ort.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onnx/neural-compressor/3dd43d3d852d43c1133005dbe440cfc18a1d7471/docs/imgs/lwq_ort.png -------------------------------------------------------------------------------- /docs/imgs/smoothquant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onnx/neural-compressor/3dd43d3d852d43c1133005dbe440cfc18a1d7471/docs/imgs/smoothquant.png -------------------------------------------------------------------------------- /docs/imgs/sq_convert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onnx/neural-compressor/3dd43d3d852d43c1133005dbe440cfc18a1d7471/docs/imgs/sq_convert.png -------------------------------------------------------------------------------- /docs/imgs/sq_pc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onnx/neural-compressor/3dd43d3d852d43c1133005dbe440cfc18a1d7471/docs/imgs/sq_pc.png -------------------------------------------------------------------------------- /docs/imgs/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onnx/neural-compressor/3dd43d3d852d43c1133005dbe440cfc18a1d7471/docs/imgs/workflow.png -------------------------------------------------------------------------------- /docs/installation_guide.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | 1. [Installation](#installation) 4 | 5 | 1.1. [Prerequisites](#prerequisites) 6 | 7 | 1.2. [Install from Binary](#install-from-binary) 8 | 9 | 1.3. [Install from Source](#install-from-source) 10 | 11 | 2. [System Requirements](#system-requirements) 12 | 13 | 2.1. [Validated Hardware Environment](#validated-hardware-environment) 14 | 15 | 2.2. [Validated Software Environment](#validated-software-environment) 16 | 17 | ## Installation 18 | ### Prerequisites 19 | You can install Neural Compressor from binary or source. 
20 | 21 | The following prerequisites and requirements must be satisfied for a successful installation: 22 | 23 | - Python version: 3.8 or 3.9 or 3.10 or 3.11 24 | 25 | ### Install from Binary 26 | ```Shell 27 | # install stable basic version from pypi 28 | pip install onnx-neural-compressor 29 | ``` 30 | 31 | ### Install from Source 32 | 33 | ```Shell 34 | git clone https://github.com/onnx/neural-compressor.git 35 | cd neural-compressor 36 | pip install -r requirements.txt 37 | pip install . 38 | ``` 39 | 40 | ## System Requirements 41 | 42 | ### Validated Hardware Environment 43 | #### Neural Compressor supports CPUs based on [Intel 64 architecture or compatible processors](https://en.wikipedia.org/wiki/X86-64): 44 | 45 | * Intel Xeon Scalable processor (formerly Skylake, Cascade Lake, Cooper Lake, Ice Lake, and Sapphire Rapids) 46 | * Intel Xeon CPU Max Series (formerly Sapphire Rapids HBM) 47 | * Intel Core Ultra Processors (Meteor Lake, Lunar Lake) 48 | 49 | ### Validated Software Environment 50 | 51 | * OS version: CentOS 8.4, Ubuntu 22.04 52 | * Python version: 3.10 53 | * ONNX Runtime version: 1.18.1 54 | -------------------------------------------------------------------------------- /docs/quantization_layer_wise.md: -------------------------------------------------------------------------------- 1 | Layer Wise Quantization (LWQ) 2 | ===== 3 | 4 | 1. [Introduction](#introduction) 5 | 6 | 2. [Supported Framework Model Matrix](#supported-framework-model-matrix) 7 | 8 | 3. [Examples](#examples) 9 | 10 | ## Introduction 11 | 12 | Large language models (LLMs) have shown exceptional performance across various tasks, meanwhile, the substantial parameter size poses significant challenges for deployment. Layer-wise quantization(LWQ) can greatly reduce the memory footprint of LLMs, usually 80-90% reduction, which means that users can quantize LLMs even on single node using GPU or CPU. We can quantize the model under memory-constrained devices, therefore making the huge-sized LLM quantization possible. 13 | 14 | 15 | 16 | *Figure 1: The process of layer-wise quantization for ONNX model. The graph of LLM is split into several parts, and each subgraph is quantized in turn.* 17 | 18 | ## Supported Framework Model Matrix 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 |
| Types/Framework | ONNX Runtime |
|---|---|
| W8A8 Post Training Static Quantization | |
| Weight-only Quantization (RTN) | |
| Weight-only Quantization (AWQ) | |
| Weight-only Quantization (GPTQ) | |
48 | 49 | ## Examples 50 | 51 | ```python 52 | from onnx_neural_compressor.quantization import matmul_4bits_quantizer 53 | 54 | algo_config = matmul_4bits_quantizer.RTNWeightOnlyQuantConfig(layer_wise_quant=True) 55 | quant = matmul_4bits_quantizer.MatMul4BitsQuantizer( 56 | model, 57 | algo_config=algo_config, 58 | ) 59 | quant.process() 60 | qmodel = quant.model 61 | ``` 62 | -------------------------------------------------------------------------------- /examples/image_recognition/resnet50/quantization/ptq_static/README.md: -------------------------------------------------------------------------------- 1 | # Step-by-Step 2 | 3 | This example loads an image classification model from the [ONNX Model Zoo](https://github.com/onnx/models) and confirms its accuracy and speed on the [ILSVRC2012 validation ImageNet dataset](http://www.image-net.org/challenges/LSVRC/2012/downloads). You need to download this dataset yourself. 4 | 5 | # Prerequisite 6 | 7 | ## 1. Environment 8 | 9 | ```shell 10 | pip install onnx-neural-compressor 11 | pip install -r requirements.txt 12 | ``` 13 | 14 | 15 | ## 2. Prepare Model 16 | 17 | ```shell 18 | python prepare_model.py --output_model='resnet50-v1-12.onnx' 19 | ``` 20 | 21 | ## 3. Prepare Dataset 22 | 23 | Download the [ILSVRC2012 validation ImageNet dataset](http://www.image-net.org/challenges/LSVRC/2012/downloads). 24 | 25 | Download the label file: 26 | 27 | ```shell 28 | wget http://dl.caffe.berkeleyvision.org/caffe_ilsvrc12.tar.gz 29 | tar -xvzf caffe_ilsvrc12.tar.gz val.txt 30 | ``` 31 | 32 | # Run 33 | 34 | 35 | ## 1. Quantization 36 | 37 | Quantize the model with QLinearOps: 38 | 39 | ```bash 40 | bash run_quant.sh --input_model=path/to/model \ # model path as *.onnx 41 | --dataset_location=/path/to/imagenet \ 42 | --label_path=/path/to/val.txt \ 43 | --output_model=path/to/save 44 | ``` 45 | 46 | Quantize the model with QDQ mode: 47 | 48 | ```bash 49 | bash run_quant.sh --input_model=path/to/model \ # model path as *.onnx 50 | --dataset_location=/path/to/imagenet \ 51 | --label_path=/path/to/val.txt \ 52 | --output_model=path/to/save \ 53 | --quant_format=QDQ 54 | ``` 55 | 56 | ## 2.
Benchmark 57 | 58 | ```bash 59 | bash run_benchmark.sh --input_model=path/to/model \ # model path as *.onnx 60 | --dataset_location=/path/to/imagenet \ 61 | --label_path=/path/to/val.txt \ 62 | --mode=performance # or accuracy 63 | ``` 64 | -------------------------------------------------------------------------------- /examples/image_recognition/resnet50/quantization/ptq_static/prepare_model.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | import urllib.request 5 | 6 | MODEL_URL = "https://github.com/onnx/models/raw/main/validated/vision/classification/resnet/model/resnet50-v1-12.onnx" 7 | MAX_TIMES_RETRY_DOWNLOAD = 5 8 | 9 | 10 | def parse_arguments(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("--input_model", type=str, required=False, default="resnet50-v1-12.onnx") 13 | parser.add_argument("--output_model", type=str, required=True) 14 | return parser.parse_args() 15 | 16 | 17 | def progressbar(cur, total=100): 18 | percent = "{:.2%}".format(cur / total) 19 | sys.stdout.write("\r[%-100s] %s" % ("#" * int(cur), percent)) 20 | sys.stdout.flush() 21 | 22 | 23 | def schedule(blocknum, blocksize, totalsize): 24 | if totalsize == 0: 25 | percent = 0 26 | else: 27 | percent = min(1.0, blocknum * blocksize / totalsize) * 100 28 | progressbar(percent) 29 | 30 | 31 | def download_model(url, model_name, retry_times=5): 32 | if os.path.isfile(model_name): 33 | print(f"{model_name} exists, skip download") 34 | return True 35 | 36 | print("download model...") 37 | retries = 0 38 | while retries < retry_times: 39 | try: 40 | urllib.request.urlretrieve(url, model_name, schedule) 41 | break 42 | except KeyboardInterrupt: 43 | return False 44 | except: 45 | retries += 1 46 | print(f"Download failed{', Retry downloading...' 
if retries < retry_times else '!'}") 47 | return retries < retry_times 48 | 49 | 50 | def prepare_model(input_model, output_model): 51 | # Download model from [ONNX Model Zoo](https://github.com/onnx/models) 52 | download_model(MODEL_URL, output_model, MAX_TIMES_RETRY_DOWNLOAD) 53 | 54 | 55 | if __name__ == "__main__": 56 | args = parse_arguments() 57 | prepare_model(args.input_model, args.output_model) 58 | -------------------------------------------------------------------------------- /examples/image_recognition/resnet50/quantization/ptq_static/requirements.txt: -------------------------------------------------------------------------------- 1 | onnx 2 | onnxruntime 3 | torch 4 | torchvision 5 | onnxruntime-extensions 6 | pillow>=8.2.0 # not directly required, pinned by Snyk to avoid a vulnerability 7 | opencv-python 8 | scikit-learn 9 | -------------------------------------------------------------------------------- /examples/image_recognition/resnet50/quantization/ptq_static/run_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | function main { 5 | init_params "$@" 6 | run_benchmark 7 | 8 | } 9 | 10 | # init params 11 | function init_params { 12 | for var in "$@" 13 | do 14 | case $var in 15 | --input_model=*) 16 | input_model=$(echo "$var" |cut -f2 -d=) 17 | ;; 18 | --dataset_location=*) 19 | dataset_location=$(echo "$var" |cut -f2 -d=) 20 | ;; 21 | --label_path=*) 22 | label_path=$(echo "$var" |cut -f2 -d=) 23 | ;; 24 | --mode=*) 25 | mode=$(echo "$var" |cut -f2 -d=) 26 | ;; 27 | --intra_op_num_threads=*) 28 | intra_op_num_threads=$(echo "$var" |cut -f2 -d=) 29 | ;; 30 | esac 31 | done 32 | 33 | } 34 | 35 | # run_benchmark 36 | function run_benchmark { 37 | 38 | python main.py \ 39 | --model_path "${input_model}" \ 40 | --dataset_location "${dataset_location}" \ 41 | --label_path "${label_path-${dataset_location}/../val.txt}" \ 42 | --mode "${mode}" \ 43 | --batch_size 1 \ 44 | --intra_op_num_threads "${intra_op_num_threads-4}" \ 45 | --benchmark 46 | 47 | } 48 | 49 | main "$@" 50 | -------------------------------------------------------------------------------- /examples/image_recognition/resnet50/quantization/ptq_static/run_quant.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | function main { 5 | init_params "$@" 6 | run_tuning 7 | 8 | } 9 | 10 | # init params 11 | function init_params { 12 | 13 | for var in "$@" 14 | do 15 | case $var in 16 | --input_model=*) 17 | input_model=$(echo "$var" |cut -f2 -d=) 18 | ;; 19 | --output_model=*) 20 | output_model=$(echo "$var" |cut -f2 -d=) 21 | ;; 22 | --dataset_location=*) 23 | dataset_location=$(echo "$var" |cut -f2 -d=) 24 | ;; 25 | --label_path=*) 26 | label_path=$(echo "$var" |cut -f2 -d=) 27 | ;; 28 | --quant_format=*) 29 | quant_format=$(echo "$var" |cut -f2 -d=) 30 | ;; 31 | esac 32 | done 33 | 34 | } 35 | 36 | # run_tuning 37 | function run_tuning { 38 | python main.py \ 39 | --model_path "${input_model}" \ 40 | --dataset_location "${dataset_location}" \ 41 | --label_path "${label_path-${dataset_location}/../val.txt}" \ 42 | --output_model "${output_model}" \ 43 | --quant_format "${quant_format-QOperator}" \ 44 | --tune 45 | } 46 | 47 | main "$@" 48 | -------------------------------------------------------------------------------- /examples/nlp/bert/quantization/ptq_dynamic/README.md: -------------------------------------------------------------------------------- 1 | # Step-by-Step 
2 | 3 | This example loads a BERT model and confirms its accuracy and speed based on [GLUE data](https://gluebenchmark.com/). 4 | 5 | # Prerequisite 6 | 7 | ## 1. Environment 8 | 9 | ```shell 10 | pip install onnx-neural-compressor 11 | pip install -r requirements.txt 12 | ``` 13 | 14 | 15 | ## 2. Prepare Dataset 16 | 17 | Download the GLUE data with the `prepare_data.sh` script. 18 | 19 | ```shell 20 | export GLUE_DIR=path/to/glue_data 21 | export TASK_NAME=MRPC 22 | 23 | bash prepare_data.sh --data_dir=$GLUE_DIR --task_name=$TASK_NAME 24 | ``` 25 | 26 | ## 3. Prepare Model 27 | 28 | ```shell 29 | python prepare_model.py --input_model='MRPC.zip' --output_model='bert.onnx' 30 | ``` 31 | 32 | # Run 33 | 34 | ## 1. Quantization 35 | 36 | Dynamic quantization: 37 | 38 | ```bash 39 | bash run_quant.sh --input_model=path/to/model \ # model path as *.onnx 40 | --output_model=path/to/model_tune \ # model path as *.onnx 41 | --dataset_location=path/to/glue_data 42 | ``` 43 | 44 | ## 2. Benchmark 45 | 46 | ```bash 47 | bash run_benchmark.sh --input_model=path/to/model \ # model path as *.onnx 48 | --dataset_location=path/to/glue_data \ 49 | --batch_size=batch_size \ 50 | --mode=performance # or accuracy 51 | ``` 52 | -------------------------------------------------------------------------------- /examples/nlp/bert/quantization/ptq_dynamic/prepare_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | function main { 5 | init_params "$@" 6 | download_data 7 | 8 | } 9 | 10 | # init params 11 | function init_params { 12 | 13 | for var in "$@" 14 | do 15 | case $var in 16 | --data_dir=*) 17 | data_dir=$(echo "$var" |cut -f2 -d=) 18 | ;; 19 | --task_name=*) 20 | task_name=$(echo "$var" |cut -f2 -d=) 21 | ;; 22 | esac 23 | done 24 | 25 | } 26 | 27 | # download_data 28 | function download_data { 29 | wget https://raw.githubusercontent.com/huggingface/transformers/f98ef14d161d7bcdc9808b5ec399981481411cc1/utils/download_glue_data.py 30 | python download_glue_data.py --data_dir="${data_dir}" --tasks="${task_name}" 31 | } 32 | 33 | main "$@" 34 | 35 | -------------------------------------------------------------------------------- /examples/nlp/bert/quantization/ptq_dynamic/prepare_model.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | import urllib.request 5 | import zipfile 6 | 7 | import torch 8 | import transformers 9 | 10 | # Please refer to [Bert-GLUE_OnnxRuntime_quantization guide] 11 | # (https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/notebooks/bert/Bert-GLUE_OnnxRuntime_quantization.ipynb) 12 | # for detailed model export.
13 | 14 | MODEL_URL = "https://download.pytorch.org/tutorial/MRPC.zip" 15 | MAX_TIMES_RETRY_DOWNLOAD = 5 16 | 17 | 18 | def parse_arguments(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--input_model", type=str, required=False, default="MRPC.zip") 21 | parser.add_argument("--output_model", type=str, required=True) 22 | parser.add_argument("--max_len", type=int, default=128, help="Maximum length of the sentence pairs") 23 | return parser.parse_args() 24 | 25 | 26 | def progressbar(cur, total=100): 27 | percent = "{:.2%}".format(cur / total) 28 | sys.stdout.write("\r[%-100s] %s" % ("#" * int(cur), percent)) 29 | sys.stdout.flush() 30 | 31 | 32 | def schedule(blocknum, blocksize, totalsize): 33 | if totalsize == 0: 34 | percent = 0 35 | else: 36 | percent = min(1.0, blocknum * blocksize / totalsize) * 100 37 | progressbar(percent) 38 | 39 | 40 | def is_zip_file(filename): 41 | try: 42 | with open(filename, "rb") as f: 43 | magic_number = f.read(4) 44 | return magic_number == b"PK\x03\x04" # ZIP file magic number 45 | except OSError: 46 | return False 47 | 48 | 49 | def extrafile(filename, target_folder="."): 50 | with zipfile.ZipFile(filename, "r") as zin: 51 | zin.extractall(target_folder) 52 | 53 | 54 | def download_model(url, model_name, retry_times=5): 55 | if os.path.isdir(model_name): 56 | return model_name 57 | elif os.path.exists(model_name) and is_zip_file(model_name): 58 | print("file downloaded") 59 | extrafile(model_name) 60 | return True 61 | 62 | print("download model...") 63 | retries = 0 64 | while retries < retry_times: 65 | try: 66 | urllib.request.urlretrieve(url, model_name, schedule) 67 | extrafile(model_name) 68 | break 69 | except KeyboardInterrupt: 70 | return False 71 | except: 72 | retries += 1 73 | print(f"Download failed{', Retry downloading...' if retries < retry_times else '!'}") 74 | return retries < retry_times 75 | 76 | 77 | def export_model(model, output_model, max_len=128): 78 | with torch.no_grad(): 79 | inputs = { 80 | "input_ids": torch.ones(1, max_len, dtype=torch.int64), 81 | "attention_mask": torch.ones(1, max_len, dtype=torch.int64), 82 | "token_type_ids": torch.ones(1, max_len, dtype=torch.int64), 83 | } 84 | 85 | symbolic_names = {0: "batch_size", 1: "max_seq_len"} 86 | torch.onnx.export( 87 | model, # model being run 88 | ( 89 | inputs["input_ids"], 90 | inputs["attention_mask"], 91 | inputs["token_type_ids"], 92 | ), # model input (or a tuple for multiple inputs) 93 | output_model, # where to save the model (can be a file or file-like object) 94 | opset_version=14, # the ONNX version to export the model 95 | do_constant_folding=True, # whether to execute constant folding 96 | input_names=["input_ids", "input_mask", "segment_ids"], # the model's input names 97 | output_names=["output"], # the model's output names 98 | dynamic_axes={ 99 | "input_ids": symbolic_names, # variable length axes 100 | "input_mask": symbolic_names, 101 | "segment_ids": symbolic_names, 102 | }, 103 | ) 104 | assert os.path.exists(output_model), f"Export failed! {output_model} doesn't exist!" 
105 | print("ONNX Model exported to {0}".format(output_model)) 106 | 107 | 108 | def prepare_model(input_model, output_model, max_len): 109 | is_download_successful = download_model(MODEL_URL, input_model, MAX_TIMES_RETRY_DOWNLOAD) 110 | if is_download_successful: 111 | folder_name = is_download_successful if isinstance(is_download_successful, str) else "./MRPC" 112 | model = transformers.BertForSequenceClassification.from_pretrained(folder_name) 113 | export_model(model, output_model, max_len) 114 | 115 | 116 | if __name__ == "__main__": 117 | args = parse_arguments() 118 | prepare_model(args.input_model, args.output_model, args.max_len) 119 | -------------------------------------------------------------------------------- /examples/nlp/bert/quantization/ptq_dynamic/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | transformers 3 | accelerate 4 | onnx 5 | onnxruntime 6 | coloredlogs 7 | sympy 8 | onnxruntime-extensions 9 | scikit-learn 10 | -------------------------------------------------------------------------------- /examples/nlp/bert/quantization/ptq_dynamic/run_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | function main { 5 | 6 | init_params "$@" 7 | run_benchmark 8 | 9 | } 10 | 11 | # init params 12 | function init_params { 13 | for var in "$@" 14 | do 15 | case $var in 16 | --input_model=*) 17 | input_model=$(echo "$var" |cut -f2 -d=) 18 | ;; 19 | --mode=*) 20 | mode=$(echo "$var" |cut -f2 -d=) 21 | ;; 22 | --dataset_location=*) 23 | dataset_location=$(echo "$var" |cut -f2 -d=) 24 | ;; 25 | --batch_size=*) 26 | batch_size=$(echo "$var" |cut -f2 -d=) 27 | ;; 28 | --intra_op_num_threads=*) 29 | intra_op_num_threads=$(echo "$var" |cut -f2 -d=) 30 | ;; 31 | esac 32 | done 33 | 34 | } 35 | 36 | # run_benchmark 37 | function run_benchmark { 38 | if [[ ${mode} == "accuracy" ]]; then 39 | dynamic_length=False 40 | elif [[ ${mode} == "performance" ]]; then 41 | dynamic_length=True 42 | else 43 | echo "Error: No such mode: ${mode}" 44 | exit 1 45 | fi 46 | 47 | model_name_or_path="bert-base-uncased" 48 | task_name="mrpc" 49 | 50 | python main.py \ 51 | --model_path "${input_model}" \ 52 | --model_name_or_path "${model_name_or_path}" \ 53 | --data_path "${dataset_location}" \ 54 | --task "${task_name}" \ 55 | --batch_size "${batch_size}" \ 56 | --mode "${mode}" \ 57 | --dynamic_length "${dynamic_length}" \ 58 | --intra_op_num_threads "${intra_op_num_threads-4}" \ 59 | --benchmark 60 | 61 | } 62 | 63 | main "$@" 64 | 65 | -------------------------------------------------------------------------------- /examples/nlp/bert/quantization/ptq_dynamic/run_quant.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | function main { 5 | init_params "$@" 6 | run_tuning 7 | } 8 | 9 | # init params 10 | function init_params { 11 | for var in "$@" 12 | do 13 | case $var in 14 | --input_model=*) 15 | input_model=$(echo "$var" |cut -f2 -d=) 16 | ;; 17 | --output_model=*) 18 | output_model=$(echo "$var" |cut -f2 -d=) 19 | ;; 20 | --dataset_location=*) 21 | dataset_location=$(echo "$var" |cut -f2 -d=) 22 | ;; 23 | esac 24 | done 25 | 26 | } 27 | 28 | # run_tuning 29 | function run_tuning { 30 | model_name_or_path="bert-base-uncased" 31 | batch_size=8 32 | task_name="mrpc" 33 | 34 | python main.py \ 35 | --model_path "${input_model}" \ 36 | --output_model "${output_model}" \ 37 | --model_name_or_path 
"${model_name_or_path}" \ 38 | --data_path "${dataset_location}" \ 39 | --task "${task_name}" \ 40 | --batch_size "${batch_size}" \ 41 | --tune 42 | } 43 | 44 | main "$@" 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/nlp/bert/quantization/ptq_static/README.md: -------------------------------------------------------------------------------- 1 | Step-by-Step 2 | ============ 3 | 4 | This example load a BERT model and confirm its accuracy and speed based on [GLUE data](https://gluebenchmark.com/). 5 | 6 | # Prerequisite 7 | 8 | ## 1. Environment 9 | 10 | ```shell 11 | pip install onnx-neural-compressor 12 | pip install -r requirements.txt 13 | ``` 14 | 15 | ## 2. Prepare Dataset 16 | 17 | download the GLUE data with `prepare_data.sh` script. 18 | ```shell 19 | export GLUE_DIR=path/to/glue_data 20 | export TASK_NAME=MRPC 21 | 22 | bash prepare_data.sh --data_dir=$GLUE_DIR --task_name=$TASK_NAME 23 | ``` 24 | 25 | ## 3. Prepare Model 26 | 27 | ```shell 28 | python prepare_model.py --input_model='MRPC.zip' --output_model='bert.onnx' 29 | ``` 30 | 31 | # Run 32 | 33 | ## 1. Quantization 34 | 35 | Static quantization with QOperator format: 36 | 37 | ```bash 38 | bash run_quant.sh --input_model=path/to/model \ # model path as *.onnx 39 | --output_model=path/to/model_tune \ 40 | --dataset_location=path/to/glue_data \ 41 | --quant_format="QOperator" 42 | ``` 43 | 44 | Static quantization with QDQ format: 45 | 46 | ```bash 47 | bash run_quant.sh --input_model=path/to/model \ # model path as *.onnx 48 | --output_model=path/to/model_tune \ # model path as *.onnx 49 | --dataset_location=path/to/glue_data \ 50 | --quant_format="QDQ" 51 | ``` 52 | 53 | ## 2. Benchmark 54 | 55 | ```bash 56 | bash run_benchmark.sh --input_model=path/to/model \ # model path as *.onnx 57 | --dataset_location=path/to/glue_data \ 58 | --batch_size=batch_size \ 59 | --mode=performance # or accuracy 60 | ``` 61 | -------------------------------------------------------------------------------- /examples/nlp/bert/quantization/ptq_static/prepare_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | function main { 5 | init_params "$@" 6 | download_data 7 | 8 | } 9 | 10 | # init params 11 | function init_params { 12 | 13 | for var in "$@" 14 | do 15 | case $var in 16 | --data_dir=*) 17 | data_dir=$(echo "$var" |cut -f2 -d=) 18 | ;; 19 | --task_name=*) 20 | task_name=$(echo "$var" |cut -f2 -d=) 21 | ;; 22 | esac 23 | done 24 | 25 | } 26 | 27 | # run_tuning 28 | function download_data { 29 | wget https://raw.githubusercontent.com/huggingface/transformers/f98ef14d161d7bcdc9808b5ec399981481411cc1/utils/download_glue_data.py 30 | python download_glue_data.py --data_dir="${data_dir}" --tasks="${task_name}" 31 | } 32 | 33 | main "$@" 34 | 35 | -------------------------------------------------------------------------------- /examples/nlp/bert/quantization/ptq_static/prepare_model.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | import urllib.request 5 | import zipfile 6 | 7 | import torch 8 | import transformers 9 | 10 | # Please refer to [Bert-GLUE_OnnxRuntime_quantization guide] 11 | # (https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/notebooks/bert/Bert-GLUE_OnnxRuntime_quantization.ipynb) 12 | # for detailed model export. 
13 | 14 | MODEL_URL = "https://download.pytorch.org/tutorial/MRPC.zip" 15 | MAX_TIMES_RETRY_DOWNLOAD = 5 16 | 17 | 18 | def parse_arguments(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--input_model", type=str, required=False, default="MRPC.zip") 21 | parser.add_argument("--output_model", type=str, required=True) 22 | parser.add_argument("--max_len", type=int, default=128, help="Maximum length of the sentence pairs") 23 | return parser.parse_args() 24 | 25 | 26 | def progressbar(cur, total=100): 27 | percent = "{:.2%}".format(cur / total) 28 | sys.stdout.write("\r[%-100s] %s" % ("#" * int(cur), percent)) 29 | sys.stdout.flush() 30 | 31 | 32 | def schedule(blocknum, blocksize, totalsize): 33 | if totalsize == 0: 34 | percent = 0 35 | else: 36 | percent = min(1.0, blocknum * blocksize / totalsize) * 100 37 | progressbar(percent) 38 | 39 | 40 | def is_zip_file(filename): 41 | try: 42 | with open(filename, "rb") as f: 43 | magic_number = f.read(4) 44 | return magic_number == b"PK\x03\x04" # ZIP file magic number 45 | except OSError: 46 | return False 47 | 48 | 49 | def extrafile(filename, target_folder="."): 50 | with zipfile.ZipFile(filename, "r") as zin: 51 | zin.extractall(target_folder) 52 | 53 | 54 | def download_model(url, model_name, retry_times=5): 55 | if os.path.isdir(model_name): 56 | return model_name 57 | elif os.path.exists(model_name) and is_zip_file(model_name): 58 | print("file downloaded") 59 | extrafile(model_name) 60 | return True 61 | 62 | print("download model...") 63 | retries = 0 64 | while retries < retry_times: 65 | try: 66 | urllib.request.urlretrieve(url, model_name, schedule) 67 | extrafile(model_name) 68 | break 69 | except KeyboardInterrupt: 70 | return False 71 | except: 72 | retries += 1 73 | print(f"Download failed{', Retry downloading...' if retries < retry_times else '!'}") 74 | return retries < retry_times 75 | 76 | 77 | def export_model(model, output_model, max_len=128): 78 | with torch.no_grad(): 79 | inputs = { 80 | "input_ids": torch.ones(1, max_len, dtype=torch.int64), 81 | "attention_mask": torch.ones(1, max_len, dtype=torch.int64), 82 | "token_type_ids": torch.ones(1, max_len, dtype=torch.int64), 83 | } 84 | 85 | symbolic_names = {0: "batch_size", 1: "max_seq_len"} 86 | torch.onnx.export( 87 | model, # model being run 88 | ( 89 | inputs["input_ids"], 90 | inputs["attention_mask"], 91 | inputs["token_type_ids"], 92 | ), # model input (or a tuple for multiple inputs) 93 | output_model, # where to save the model (can be a file or file-like object) 94 | opset_version=14, # the ONNX version to export the model 95 | do_constant_folding=True, # whether to execute constant folding 96 | input_names=["input_ids", "input_mask", "segment_ids"], # the model's input names 97 | output_names=["output"], # the model's output names 98 | dynamic_axes={ 99 | "input_ids": symbolic_names, # variable length axes 100 | "input_mask": symbolic_names, 101 | "segment_ids": symbolic_names, 102 | }, 103 | ) 104 | assert os.path.exists(output_model), f"Export failed! {output_model} doesn't exist!" 
105 | print("ONNX Model exported to {0}".format(output_model)) 106 | 107 | 108 | def prepare_model(input_model, output_model, max_len): 109 | is_download_successful = download_model(MODEL_URL, input_model, MAX_TIMES_RETRY_DOWNLOAD) 110 | if is_download_successful: 111 | folder_name = is_download_successful if isinstance(is_download_successful, str) else "./MRPC" 112 | model = transformers.BertForSequenceClassification.from_pretrained(folder_name) 113 | export_model(model, output_model, max_len) 114 | 115 | 116 | if __name__ == "__main__": 117 | args = parse_arguments() 118 | prepare_model(args.input_model, args.output_model, args.max_len) 119 | -------------------------------------------------------------------------------- /examples/nlp/bert/quantization/ptq_static/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | transformers 3 | accelerate 4 | onnx 5 | onnxruntime 6 | coloredlogs 7 | sympy 8 | onnxruntime-extensions 9 | scikit-learn 10 | -------------------------------------------------------------------------------- /examples/nlp/bert/quantization/ptq_static/run_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | function main { 5 | 6 | init_params "$@" 7 | run_benchmark 8 | 9 | } 10 | 11 | # init params 12 | function init_params { 13 | for var in "$@" 14 | do 15 | case $var in 16 | --input_model=*) 17 | input_model=$(echo "$var" |cut -f2 -d=) 18 | ;; 19 | --mode=*) 20 | mode=$(echo "$var" |cut -f2 -d=) 21 | ;; 22 | --dataset_location=*) 23 | dataset_location=$(echo "$var" |cut -f2 -d=) 24 | ;; 25 | --batch_size=*) 26 | batch_size=$(echo "$var" |cut -f2 -d=) 27 | ;; 28 | --intra_op_num_threads=*) 29 | intra_op_num_threads=$(echo "$var" |cut -f2 -d=) 30 | ;; 31 | esac 32 | done 33 | 34 | } 35 | 36 | # run_benchmark 37 | function run_benchmark { 38 | if [[ ${mode} == "accuracy" ]]; then 39 | dynamic_length=False 40 | elif [[ ${mode} == "performance" ]]; then 41 | dynamic_length=True 42 | else 43 | echo "Error: No such mode: ${mode}" 44 | exit 1 45 | fi 46 | 47 | model_name_or_path="bert-base-uncased" 48 | task_name="mrpc" 49 | 50 | python main.py \ 51 | --model_path "${input_model}" \ 52 | --model_name_or_path "${model_name_or_path}" \ 53 | --data_path "${dataset_location}" \ 54 | --task "${task_name}" \ 55 | --batch_size "${batch_size}" \ 56 | --mode "${mode}" \ 57 | --intra_op_num_threads "${intra_op_num_threads-4}" \ 58 | --dynamic_length "${dynamic_length}" \ 59 | --benchmark 60 | 61 | } 62 | 63 | main "$@" 64 | 65 | -------------------------------------------------------------------------------- /examples/nlp/bert/quantization/ptq_static/run_quant.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | function main { 5 | init_params "$@" 6 | run_tuning 7 | } 8 | 9 | # init params 10 | function init_params { 11 | for var in "$@" 12 | do 13 | case $var in 14 | --input_model=*) 15 | input_model=$(echo "$var" |cut -f2 -d=) 16 | ;; 17 | --output_model=*) 18 | output_model=$(echo "$var" |cut -f2 -d=) 19 | ;; 20 | --dataset_location=*) 21 | dataset_location=$(echo "$var" |cut -f2 -d=) 22 | ;; 23 | --quant_format=*) 24 | quant_format=$(echo "$var" |cut -f2 -d=) 25 | ;; 26 | esac 27 | done 28 | 29 | } 30 | 31 | # run_tuning 32 | function run_tuning { 33 | model_name_or_path="bert-base-uncased" 34 | batch_size=8 35 | task_name="mrpc" 36 | model_type="bert" 37 | 38 | python main.py \ 39 | 
--model_path "${input_model}" \ 40 | --output_model "${output_model}" \ 41 | --model_name_or_path "${model_name_or_path}" \ 42 | --data_path "${dataset_location}" \ 43 | --task "${task_name}" \ 44 | --batch_size "${batch_size}" \ 45 | --model_type "${model_type}" \ 46 | --quant_format "${quant_format}" \ 47 | --tune 48 | } 49 | 50 | main "$@" 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_generation/quantization/weight_only/README.md: -------------------------------------------------------------------------------- 1 | Step-by-Step 2 | ============ 3 | 4 | This example confirms llama's weight only accuracy on [lambada](https://huggingface.co/datasets/lambada). 5 | 6 | # Prerequisite 7 | 8 | ## 1. Environment 9 | ```shell 10 | pip install onnx-neural-compressor 11 | pip install -r requirements.txt 12 | ``` 13 | > Note: Validated ONNX Runtime [Version](/docs/installation_guide.md#validated-software-environment). 14 | 15 | ## 2. Prepare Model 16 | 17 | Note that this README.md uses meta-llama/Llama-2-7b-hf as an example. We verified weight-only quantization on other models as follows. 18 | 19 | | Model | Num Hidden Layers| Num Attention Heads | Hidden Size | 20 | | --- | --- | --- | --- | 21 | | [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 32 | 32 | 4096 | 22 | | [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 32 | 32 | 4096 | 23 | | [meta-llama/Llama-2-13b-hf](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 40 | 40 | 5120 | 24 | | [meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | 40 | 40 | 5120 | 25 | | [meta-llama/Llama-2-70b-hf](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 80 | 64 | 8192 | 26 | | [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 80 | 64 | 8192 | 27 | | [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) | 32 | 32 | 4096 | 28 | | [microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct) | 32 | 32 | 3072 | 29 | | [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) | 28 | 28 | 3584 | 30 | 31 | Export to ONNX model: 32 | ```bash 33 | python prepare_model.py --input_model="meta-llama/Llama-2-7b-hf" \ 34 | --task=text-generation-with-past \ # or text-generation 35 | ``` 36 | 37 | 38 | # Run 39 | 40 | ## 1. Quantization 41 | 42 | Set `algorithm=WOQ_TUNE` to tune weight-only quantization algorithm or specify algorithm to `RTN` or `GPTQ` or `AWQ`. 43 | 44 | `quant_format=QDQ` works only when: 45 | - onnxruntime >= 1.19.0 46 | - opset version of the model >= 21 47 | - quantized bits is in [4, 8] 48 | 49 | otherwise it will execute QOperator automatically. 50 | 51 | ```bash 52 | bash run_quant.sh --input_model=/path/to/model \ # folder path of onnx model 53 | --output_model=/path/to/model_tune \ # folder path to save onnx model 54 | --batch_size=batch_size # optional \ 55 | --dataset=NeelNanda/pile-10k \ 56 | --tokenizer=meta-llama/Llama-2-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer 57 | --algorithm=WOQ_TUNE # support WOQ_TUNE, RTN, AWQ, GPTQ \ 58 | --quant_format=QDQ # support QOperator and QDQ 59 | ``` 60 | 61 | ## 2. 
Benchmark 62 | 63 | Accuracy: 64 | 65 | ```bash 66 | bash run_benchmark.sh --input_model=path/to/model \ # folder path of onnx model 67 | --batch_size=batch_size \ # optional 68 | --mode=accuracy \ 69 | --tokenizer=meta-llama/Llama-2-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer 70 | --tasks=lambada_openai 71 | ``` 72 | 73 | Performance: 74 | ```bash 75 | numactl -m 0 -C 0-23 bash run_benchmark.sh --input_model=path/to/model \ # folder path of onnx model 76 | --mode=performance \ 77 | --batch_size=batch_size # optional \ 78 | --tokenizer=meta-llama/Llama-2-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer 79 | --intra_op_num_threads=24 80 | 81 | ``` 82 | -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_generation/quantization/weight_only/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Refer from https://github.com/intel/intel-extension-for-transformers/tree/main/intel_extension_for_transformers/transformers/llm/evaluation/lm_eval 16 | 17 | from evaluation.accuracy import cli_evaluate as evaluate 18 | from evaluation.utils import LMEvalParser 19 | -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_generation/quantization/weight_only/evaluation/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | from evaluation.models import huggingface 17 | 18 | # TODO: implement __all__ 19 | 20 | 21 | try: 22 | # enable hf hub transfer if available 23 | import hf_transfer # type: ignore # noqa 24 | import huggingface_hub.constants # type: ignore 25 | 26 | huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True 27 | except ImportError: 28 | pass 29 | -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_generation/quantization/weight_only/evaluation/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | class LMEvalParser: 17 | def __init__( 18 | self, 19 | model="hf", 20 | tasks="lambada_openai", 21 | model_args="", 22 | user_model=None, 23 | tokenizer=None, 24 | num_fewshot=None, 25 | batch_size=1, 26 | max_batch_size=None, 27 | provider=None, 28 | output_path=None, 29 | limit=None, 30 | use_cache=None, 31 | cache_requests=None, 32 | check_integrity=False, 33 | write_out=False, 34 | log_samples=False, 35 | show_config=False, 36 | include_path=None, 37 | gen_kwargs=None, 38 | verbosity="INFO", 39 | wandb_args="", 40 | predict_only=False, 41 | seed=[0, 1234, 1234], 42 | trust_remote_code=False, 43 | ): 44 | self.model = model 45 | self.tasks = tasks 46 | self.model_args = model_args 47 | self.user_model = user_model 48 | self.tokenizer = tokenizer 49 | self.num_fewshot = num_fewshot 50 | self.batch_size = batch_size 51 | self.max_batch_size = max_batch_size 52 | self.provider = provider 53 | self.output_path = output_path 54 | self.limit = limit 55 | self.use_cache = use_cache 56 | self.cache_requests = cache_requests 57 | self.check_integrity = check_integrity 58 | self.write_out = write_out 59 | self.log_samples = log_samples 60 | self.show_config = show_config 61 | self.include_path = include_path 62 | self.gen_kwargs = gen_kwargs 63 | self.verbosity = verbosity 64 | self.wandb_args = wandb_args 65 | self.predict_only = predict_only 66 | self.seed = seed 67 | self.trust_remote_code = trust_remote_code 68 | -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_generation/quantization/weight_only/prepare_model.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import subprocess 4 | 5 | import optimum.version 6 | from packaging import version 7 | 8 | OPTIMUM114_VERSION = version.Version("1.14.0") 9 | 10 | 11 | def parse_arguments(): 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("--input_model", type=str, required=True, default="") 14 | parser.add_argument("--output_model", type=str, required=False, default=None) 15 | parser.add_argument( 16 | "--task", 17 | type=str, 18 | required=False, 19 | default="text-generation-with-past", 20 | choices=["text-generation-with-past", 
"text-generation"], 21 | ) 22 | args = parser.parse_args() 23 | if args.output_model is None: 24 | args.output_model = os.path.basename(args.input_model) + "-onnx" 25 | return args 26 | 27 | 28 | def prepare_model(input_model, output_model, task): 29 | print("\nexport model...") 30 | if version.Version(optimum.version.__version__) < OPTIMUM114_VERSION: 31 | raise ImportError("Please upgrade optimum to >= 1.14.0") 32 | 33 | subprocess.run( 34 | [ 35 | "optimum-cli", 36 | "export", 37 | "onnx", 38 | "--model", 39 | f"{input_model}", 40 | "--task", 41 | task, 42 | f"{output_model}", 43 | "--trust-remote-code", 44 | ], 45 | stdout=subprocess.PIPE, 46 | text=True, 47 | ) 48 | 49 | assert os.path.exists(output_model), f"{output_model} doesn't exist!" 50 | 51 | 52 | if __name__ == "__main__": 53 | args = parse_arguments() 54 | prepare_model(args.input_model, args.output_model, args.task) 55 | -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_generation/quantization/weight_only/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | transformers 3 | onnx 4 | onnxruntime 5 | onnxruntime-extensions 6 | datasets 7 | optimum 8 | lm-eval==0.4.2 9 | -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | function main { 5 | 6 | init_params "$@" 7 | run_benchmark 8 | 9 | } 10 | 11 | # init params 12 | function init_params { 13 | for var in "$@" 14 | do 15 | case $var in 16 | --input_model=*) 17 | input_model=$(echo "$var" |cut -f2 -d=) 18 | ;; 19 | --batch_size=*) 20 | batch_size=$(echo "$var" |cut -f2 -d=) 21 | ;; 22 | --tokenizer=*) 23 | tokenizer=$(echo "$var" |cut -f2 -d=) 24 | ;; 25 | --mode=*) 26 | mode=$(echo "$var" |cut -f2 -d=) 27 | ;; 28 | --intra_op_num_threads=*) 29 | intra_op_num_threads=$(echo "$var" |cut -f2 -d=) 30 | ;; 31 | esac 32 | done 33 | 34 | } 35 | 36 | # run_benchmark 37 | function run_benchmark { 38 | 39 | # Check if the input_model ends with the filename extension ".onnx" 40 | if [[ $input_model =~ \.onnx$ ]]; then 41 | # If the string ends with the filename extension, get the path of the file 42 | input_model=$(dirname "$input_model") 43 | fi 44 | 45 | extra_cmd="" 46 | 47 | if [[ "${tokenizer}" =~ "Phi-3-mini" ]]; then 48 | extra_cmd=$extra_cmd"--trust_remote_code True " 49 | fi 50 | 51 | if [ "${batch_size}" ]; then 52 | extra_cmd=$extra_cmd"--batch_size ${batch_size} " 53 | fi 54 | if [ "${tokenizer}" ]; then 55 | extra_cmd=$extra_cmd"--tokenizer ${tokenizer} " 56 | fi 57 | if [ "${tasks}" ]; then 58 | extra_cmd=$extra_cmd"--tasks ${tasks} " 59 | fi 60 | if [ "${intra_op_num_threads}" ]; then 61 | extra_cmd=$extra_cmd"--intra_op_num_threads ${intra_op_num_threads} " 62 | fi 63 | 64 | extra_cmd=$extra_cmd"--benchmark" 65 | eval "python main.py --model_path ${input_model} --mode ${mode} ${extra_cmd}" 66 | 67 | } 68 | 69 | main "$@" 70 | -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_quant.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | function main { 5 | init_params "$@" 6 | run_tuning 7 | } 8 | 9 | # init params 10 | function init_params { 11 | 
for var in "$@" 12 | do 13 | case $var in 14 | --input_model=*) 15 | input_model=$(echo "$var" |cut -f2 -d=) 16 | ;; 17 | --output_model=*) 18 | output_model=$(echo "$var" |cut -f2 -d=) 19 | ;; 20 | --batch_size=*) 21 | batch_size=$(echo "$var" |cut -f2 -d=) 22 | ;; 23 | --dataset=*) 24 | dataset=$(echo "$var" |cut -f2 -d=) 25 | ;; 26 | --tokenizer=*) 27 | tokenizer=$(echo "$var" |cut -f2 -d=) 28 | ;; 29 | --algorithm=*) 30 | algorithm=$(echo "$var" |cut -f2 -d=) 31 | ;; 32 | --quant_format=*) 33 | quant_format=$(echo "$var" |cut -f2 -d=) 34 | ;; 35 | esac 36 | done 37 | 38 | } 39 | 40 | # run_tuning 41 | function run_tuning { 42 | 43 | # Check if the input_model ends with the filename extension ".onnx" 44 | if [[ $input_model =~ \.onnx$ ]]; then 45 | # If the string ends with the filename extension, get the path of the file 46 | input_model=$(dirname "$input_model") 47 | fi 48 | 49 | # Check if the output_model ends with the filename extension ".onnx" 50 | if [[ $output_model =~ \.onnx$ ]]; then 51 | # If the string ends with the filename extension, get the path of the file 52 | output_model=$(dirname "$output_model") 53 | fi 54 | 55 | # Check if the directory exists 56 | if [ ! -d "$output_model" ]; then 57 | # If the directory doesn't exist, create it 58 | mkdir -p "$output_model" 59 | echo "Created directory $output_model" 60 | fi 61 | 62 | extra_cmd="" 63 | 64 | if [[ "${tokenizer}" =~ "Phi-3-mini" ]]; then 65 | nodes_to_exclude="/model/layers.*/self_attn/qkv_proj/MatMul /model/layers.*/mlp/down_proj/MatMul" 66 | extra_cmd=$extra_cmd"--nodes_to_exclude ${nodes_to_exclude} --trust_remote_code True " 67 | fi 68 | if [[ "${tokenizer}" =~ "Llama-3-8B" ]]; then 69 | nodes_to_exclude="/model/layers.*/mlp/down_proj/MatMul" 70 | extra_cmd=$extra_cmd"--nodes_to_exclude ${nodes_to_exclude} " 71 | fi 72 | if [[ "${tokenizer}" =~ "Qwen2-7B" ]]; then 73 | nodes_to_exclude="/model/layers.*/mlp/down_proj/MatMul /model/layers.*/mlp/up_proj/MatMul" 74 | extra_cmd=$extra_cmd"--nodes_to_exclude ${nodes_to_exclude} " 75 | fi 76 | 77 | if [ "${tokenizer}" ]; then 78 | extra_cmd=$extra_cmd"--tokenizer ${tokenizer} " 79 | fi 80 | if [ "${batch_size}" ]; then 81 | extra_cmd=$extra_cmd"--batch_size ${batch_size} " 82 | fi 83 | if [ "${dataset}" ]; then 84 | extra_cmd=$extra_cmd"--dataset ${dataset} " 85 | fi 86 | if [ "${algorithm}" ]; then 87 | extra_cmd=$extra_cmd"--algorithm ${algorithm} " 88 | fi 89 | if [ "${tasks}" ]; then 90 | extra_cmd=$extra_cmd"--tasks ${tasks} " 91 | fi 92 | if [ "${quant_format}" ]; then 93 | extra_cmd=$extra_cmd"--quant_format ${quant_format} " 94 | fi 95 | 96 | extra_cmd=$extra_cmd"--layer_wise --tune" 97 | eval "python main.py --model_path ${input_model} --output_model ${output_model} ${extra_cmd}" 98 | } 99 | 100 | main "$@" 101 | -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_to_image/stable_diffusion_v1_5/quantization/ptq_static/README.md: -------------------------------------------------------------------------------- 1 | Step-by-Step 2 | ============ 3 | 4 | This example shows how to quantize the unet model of [stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) with SmoothQuant and generate images with the quantized unet. 5 | 6 | # Prerequisite 7 | 8 | ## 1. Environment 9 | ```shell 10 | pip install -r requirements.txt 11 | ``` 12 | > Note: Validated ONNX Runtime [Version](/docs/installation_guide.md#validated-software-environment). 13 | 14 | ## 2. 
Prepare Model 15 | 16 | 17 | ```bash 18 | git clone https://github.com/huggingface/diffusers.git 19 | cd diffusers/scripts 20 | python convert_stable_diffusion_checkpoint_to_onnx.py --model_path runwayml/stable-diffusion-v1-5 --output_path stable-diffusion 21 | ``` 22 | 23 | # Run 24 | 25 | ## 1. Quantization 26 | 27 | ```bash 28 | bash run_quant.sh --input_model=/path/to/stable-diffusion \ # folder path of stable-diffusion 29 | --output_model=/path/to/save/unet_model \ # model path as *.onnx 30 | --alpha=0.7 # optional 31 | ``` 32 | 33 | ## 2. Benchmark 34 | 35 | ```bash 36 | bash run_benchmark.sh --input_model=/path/to/stable-diffusion \ # folder path of stable-diffusion 37 | --quantized_unet_path=/path/to/quantized/unet.onnx \ # optional, run fp32 model if not provided 38 | --prompt="a photo of an astronaut riding a horse on mars" \ # optional 39 | --image_path=image.png # optional 40 | ``` 41 | 42 | Benchmark will print the throughput data and save the generated image. 43 | Our test results with default parameters are (fp32 vs int8): 44 |
45 | | fp32 | int8 |
46 | | :---: | :---: |
47 | | ![fp32](imgs/fp32.png) | ![int8](imgs/int8.png) |
48 | -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_to_image/stable_diffusion_v1_5/quantization/ptq_static/imgs/fp32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onnx/neural-compressor/3dd43d3d852d43c1133005dbe440cfc18a1d7471/examples/nlp/huggingface_model/text_to_image/stable_diffusion_v1_5/quantization/ptq_static/imgs/fp32.png -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_to_image/stable_diffusion_v1_5/quantization/ptq_static/imgs/int8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onnx/neural-compressor/3dd43d3d852d43c1133005dbe440cfc18a1d7471/examples/nlp/huggingface_model/text_to_image/stable_diffusion_v1_5/quantization/ptq_static/imgs/int8.png -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_to_image/stable_diffusion_v1_5/quantization/ptq_static/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | diffusers 3 | onnx 4 | onnxruntime 5 | onnxruntime-extensions 6 | onnx_neural_compressor 7 | transformers==4.42.0 # restricted by model export 8 | -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_to_image/stable_diffusion_v1_5/quantization/ptq_static/run_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | function main { 5 | 6 | init_params "$@" 7 | run_benchmark 8 | 9 | } 10 | 11 | # init params 12 | function init_params { 13 | for var in "$@" 14 | do 15 | case $var in 16 | --input_model=*) 17 | input_model=$(echo "$var" |cut -f2 -d=) 18 | ;; 19 | --quantized_unet_path=*) 20 | quantized_unet_path=$(echo "$var" |cut -f2 -d=) 21 | ;; 22 | --batch_size=*) 23 | batch_size=$(echo "$var" |cut -f2 -d=) 24 | ;; 25 | --prompt=*) 26 | prompt=$(echo "$var" |cut -f2 -d=) 27 | ;; 28 | --image_path=*) 29 | image_path=$(echo "$var" |cut -f2 -d=) 30 | ;; 31 | esac 32 | done 33 | 34 | } 35 | 36 | # run_benchmark 37 | function run_benchmark { 38 | 39 | # Check if the input_model ends with the filename extension ".onnx" 40 | if [[ $input_model =~ \.onnx$ ]]; then 41 | # If the string ends with the filename extension, get the path of the file 42 | input_model=$(dirname "$input_model") 43 | fi 44 | 45 | extra_cmd="" 46 | 47 | if [ "$quantized_unet_path" ]; then 48 | extra_cmd=$extra_cmd"--quantized_unet_path=${quantized_unet_path} " 49 | fi 50 | 51 | if [ "$prompt" ]; then 52 | extra_cmd=$extra_cmd"--prompt=${prompt} " 53 | fi 54 | 55 | if [ "$image_path" ]; then 56 | extra_cmd=$extra_cmd"--image_path=${image_path} " 57 | fi 58 | 59 | if [ "$batch_size" ]; then 60 | extra_cmd=$extra_cmd"--batch_size=${batch_size} " 61 | fi 62 | extra_cmd=$extra_cmd"--benchmark" 63 | eval "python main.py --model_path=${input_model} ${extra_cmd}" 64 | } 65 | 66 | main "$@" 67 | 68 | -------------------------------------------------------------------------------- /examples/nlp/huggingface_model/text_to_image/stable_diffusion_v1_5/quantization/ptq_static/run_quant.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | function main { 5 | init_params "$@" 6 | run_tuning 7 | } 8 | 9 | # init params 10 | function 
init_params { 11 | for var in "$@" 12 | do 13 | case $var in 14 | --input_model=*) 15 | input_model=$(echo "$var" |cut -f2 -d=) 16 | ;; 17 | --output_model=*) 18 | output_model=$(echo "$var" |cut -f2 -d=) 19 | ;; 20 | --alpha=*) 21 | alpha=$(echo "$var" |cut -f2 -d=) 22 | ;; 23 | esac 24 | done 25 | 26 | } 27 | 28 | # run_tuning 29 | function run_tuning { 30 | 31 | # Check if the input_model ends with the filename extension ".onnx" 32 | if [[ $input_model =~ \.onnx$ ]]; then 33 | # If the string ends with the filename extension, get the path of the file 34 | input_model=$(dirname "$input_model") 35 | fi 36 | 37 | # Check if the directory exists 38 | if [ ! -d "$(dirname "$output_model")" ]; then 39 | # If the directory doesn't exist, create it 40 | mkdir -p "$(dirname "$output_model")" 41 | echo "Created directory $(dirname "$output_model")" 42 | fi 43 | 44 | python main.py \ 45 | --model_path "${input_model}" \ 46 | --output_model "${output_model}" \ 47 | --alpha "${alpha-0.7}" \ 48 | --tune 49 | } 50 | 51 | main "$@" 52 | 53 | -------------------------------------------------------------------------------- /onnx_neural_compressor/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Neural Compressor: An open-source Python library supporting popular model compression techniques for ONNX models.""" 15 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/layer_wise/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Operators for onnx model.""" 15 | 16 | import glob 17 | from os import path 18 | 19 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 20 | 21 | modules = glob.glob(path.join(path.dirname(__file__), "*.py")) 22 | 23 | for f in modules: 24 | if path.isfile(f) and not f.startswith("__") and not f.endswith("__init__.py"): 25 | __import__(path.basename(f)[:-3], globals(), locals(), level=1) 26 | 27 | OPERATORS = base_op.OPERATORS 28 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Activation operator.""" 15 | 16 | import onnx 17 | 18 | from onnx_neural_compressor import constants, utility 19 | from onnx_neural_compressor.algorithms import utility as quant_utils 20 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 21 | 22 | 23 | @base_op.op_registry(op_types="LeakyRelu, Sigmoid", mode=[constants.STATIC_QUANT]) 24 | class ActivationOperator(base_op.Operator): 25 | """Activation operator.""" 26 | 27 | def __init__(self, onnx_quantizer, onnx_node): 28 | """Initialization.""" 29 | super(ActivationOperator, self).__init__(onnx_quantizer, onnx_node) 30 | 31 | def quantize_check(self): 32 | """Check if quantizaion can be done.""" 33 | node = self.node 34 | data_found, _, _, _, _ = self.quantizer._get_quantization_params(node.output[0]) 35 | if not data_found: 36 | return False 37 | return True 38 | 39 | def quantize(self): 40 | """Do quantizaion.""" 41 | node = self.node 42 | super().quantize() 43 | node.name = node.name + "_quant" 44 | 45 | def convert_check(self): 46 | """Check if conversion can be done.""" 47 | node = self.node 48 | 49 | children = self.quantizer.model.get_children(node) 50 | if len(children) == 0 or not node.name.endswith("_quant"): 51 | return False 52 | return True 53 | 54 | def convert(self): 55 | """Convert to QOperator format.""" 56 | node = self.node 57 | 58 | parent = self.quantizer.model.get_parents(node)[0] 59 | child = self.quantizer.model.get_children(node)[0] 60 | 61 | inputs = [] 62 | inputs.extend(parent.input) 63 | inputs.extend(child.input[1:]) 64 | 65 | qlinear_activation_output = child.output[0] 66 | kwargs = {} 67 | for attribute in node.attribute: # pragma: no cover 68 | kwargs.update(quant_utils.attribute_to_kwarg(attribute)) 69 | kwargs["domain"] = quant_utils.ms_domain 70 | 71 | qlinear_activation_node = onnx.helper.make_node( 72 | "QLinear" + node.op_type, inputs, [qlinear_activation_output], node.name, **kwargs 73 | ) 74 | 75 | self.quantizer.new_nodes.append(qlinear_activation_node) 76 | self.quantizer.remove_nodes.extend([parent, child, node]) 77 | 78 | 79 | @base_op.op_registry(op_types="Relu, Clip", mode=[constants.STATIC_QUANT]) 80 | class RemovableActivationOperator(base_op.Operator): 81 | """Removable activation operator.""" 82 | 83 | def __init__(self, onnx_quantizer, onnx_node): 84 | """Initialization.""" 85 | super(RemovableActivationOperator, self).__init__(onnx_quantizer, onnx_node) 86 | 87 | def quantize_check(self): 88 | """Check if quantizaion can be done.""" 89 | node = self.node 90 | if node.input[0] not in self.quantizer.quantized_value_map: 91 | return False 92 | return True 93 | 94 | def quantize(self): 95 | """Do quantization.""" 96 | node = self.node 97 | if node.output[0] in [i.name for i in self.quantizer.model.model.graph.output]: 98 | self.quantizer.dequantize_tensor(node, node.input[0]) 99 | else: 100 | self.quantizer.model.replace_input_of_all_nodes(node.output[0], node.input[0]) 101 | self.quantizer.remove_nodes.append(node) 102 | 103 | 104 | @base_op.op_registry( 105 | op_types="Softmax, BiasGelu, Elu, Exp, FastGelu, Gelu, Softplus, Tanh", mode=[constants.STATIC_QUANT] 106 | ) 107 | class Float16ActivationOperator(base_op.Operator): 108 | """Float16 Activation operator.""" 109 | 110 | def __init__(self, onnx_quantizer, onnx_node): 111 | """Initialization.""" 112 | super(Float16ActivationOperator, self).__init__(onnx_quantizer, onnx_node) 113 | 
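For reference, the `convert()` method above rewrites a `DequantizeLinear -> Sigmoid/LeakyRelu -> QuantizeLinear` chain into a single contrib operator named `QLinear<op_type>` in the `com.microsoft` domain, reusing the scale and zero-point inputs of the removed Q/DQ nodes. A minimal hand-built sketch of the node it emits is shown below; the tensor names are illustrative placeholders, not identifiers from this repository.

```python
# Sketch only: the fused node produced by ActivationOperator.convert() for a
# Sigmoid wrapped in DequantizeLinear/QuantizeLinear. Input order follows the
# code above: the DequantizeLinear inputs (x, x_scale, x_zp) followed by the
# downstream QuantizeLinear inputs[1:] (y_scale, y_zp).
import onnx

qlinear_sigmoid = onnx.helper.make_node(
    "QLinearSigmoid",
    inputs=["x_quantized", "x_scale", "x_zero_point", "y_scale", "y_zero_point"],
    outputs=["y_quantized"],
    name="sigmoid_node_quant",
    domain="com.microsoft",
)
print(qlinear_sigmoid)
```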
-------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/argmax.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ArgMax operator.""" 15 | 16 | from onnx_neural_compressor import constants, utility 17 | from onnx_neural_compressor.algorithms import utility as quant_utils 18 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 19 | 20 | 21 | @base_op.op_registry(op_types="ArgMax", mode=[constants.STATIC_QUANT]) 22 | class ArgMaxOperator(base_op.Operator): 23 | """ArgMax operator.""" 24 | 25 | def __init__(self, onnx_quantizer, onnx_node): 26 | """Initialization.""" 27 | super(ArgMaxOperator, self).__init__(onnx_quantizer, onnx_node) 28 | 29 | def convert_check(self): 30 | """Check if conversion can be done.""" 31 | node = self.node 32 | return True 33 | 34 | def convert(self): 35 | """Convert to quantized format.""" 36 | node = self.node 37 | origin_name = node.input[0].split("_argmax_node")[0] 38 | 39 | if origin_name in self.quantizer.quantized_value_map: 40 | node.name = node.name + "_quant" 41 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Attention operator.""" 15 | 16 | import onnx 17 | 18 | from onnx_neural_compressor import constants, utility 19 | from onnx_neural_compressor.algorithms import utility as quant_utils 20 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 21 | 22 | 23 | @base_op.op_registry(op_types="Attention", mode=[constants.DYNAMIC_QUANT, constants.STATIC_QUANT]) 24 | class AttentionOperator(base_op.Operator): 25 | """Attention operator.""" 26 | 27 | def __init__(self, onnx_quantizer, onnx_node): 28 | """Initialization.""" 29 | super(AttentionOperator, self).__init__(onnx_quantizer, onnx_node) 30 | 31 | def quantize(self): 32 | """Do quantizaion.""" 33 | node = self.node 34 | self.quantizer.quantize_inputs(node, [0, 1]) 35 | node.name = node.name + "_quant" 36 | 37 | def convert(self): 38 | """Convert QDQ mode to QOperator format.""" 39 | node = self.node 40 | parents = self.quantizer.model.get_parents(node) 41 | quantized_name = [] 42 | scale = [] 43 | zp = [] 44 | for parent in parents[:2]: 45 | if parent.op_type == "DynamicQuantizeLinear": 46 | quantized_name.append(parent.output[0]) 47 | scale.append(parent.output[1]) 48 | zp.append(parent.output[2]) 49 | elif parent.op_type == "DequantizeLinear": 50 | quantized_name.append(parent.input[0]) 51 | scale.append(parent.input[1]) 52 | zp.append(parent.input[2]) 53 | self.quantizer.remove_nodes.append(parent) 54 | 55 | inputs = [] 56 | inputs.extend(quantized_name) 57 | inputs.append(node.input[2]) 58 | inputs.extend(scale) 59 | inputs.append(node.input[3] if len(node.input) > 3 else "") 60 | inputs.extend(zp) 61 | if len(node.input) > 4: 62 | inputs.append(node.input[4]) 63 | 64 | kwargs = {} 65 | for attribute in node.attribute: # pragma: no cover 66 | kwargs.update(quant_utils.attribute_to_kwarg(attribute)) 67 | kwargs["domain"] = quant_utils.ms_domain 68 | qattention_node = onnx.helper.make_node("QAttention", inputs, node.output, node.name, **kwargs) 69 | self.quantizer.new_nodes.append(qattention_node) 70 | 71 | self.quantizer.remove_nodes.append(node) 72 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/base_op.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Base Operator.""" 15 | 16 | from onnx_neural_compressor import constants, quantization 17 | 18 | OPERATORS = { 19 | "dynamic_quant": {}, 20 | "static_quant": {}, 21 | } 22 | 23 | 24 | def op_registry(op_types, mode): 25 | """The class decorator used to register all Operator subclasses.""" 26 | 27 | def decorator_op(cls): 28 | assert cls.__name__.endswith( 29 | "Operator" 30 | ), "The name of subclass of Operator should end with 'Operator' substring." 
31 | for item in mode: 32 | if cls.__name__[: -len("Operator")] in OPERATORS[item]: # pragma: no cover 33 | raise ValueError("Cannot have two operators with the same name for {} mode.".format(item)) 34 | break 35 | for single_op_type in [op_type.strip() for op_type in op_types.split(",")]: 36 | for item in mode: 37 | OPERATORS[item][single_op_type] = cls 38 | return cls 39 | 40 | return decorator_op 41 | 42 | 43 | class Operator(object): 44 | """Base Operator.""" 45 | 46 | def __init__(self, onnx_quantizer, onnx_node): 47 | """Initialization.""" 48 | self.quantizer = onnx_quantizer 49 | self.node = onnx_node 50 | node_name = self.node.name.split("_quant")[0] 51 | if node_name in self.quantizer.config: 52 | self.dtype = self.quantizer.config[node_name] 53 | self.disable_qdq_for_node_output = ( 54 | True if onnx_node.op_type in onnx_quantizer.optypes_to_exclude_output_quant else False 55 | ) 56 | self.per_channel = False 57 | self.calibrate_method = 0 # minmax 58 | self.weight_sym = True 59 | self.weight_dtype = None 60 | self.activation_dtype = None 61 | self.activation_sym = False 62 | if node_name in self.quantizer.config: 63 | if self.quantizer.config[node_name] not in self.quantizer.fallback_list: 64 | self.per_channel = self.quantizer.config[node_name]["per_channel"] 65 | self.calibrate_method = self.quantizer.config[node_name]["calibrate_method"] 66 | self.weight_sym = self.quantizer.config[node_name]["weight_sym"] 67 | self.weight_dtype = self.quantizer.config[node_name]["weight_type"] 68 | self.activation_dtype = self.quantizer.config[node_name]["activation_type"] 69 | self.activation_sym = self.quantizer.config[node_name]["activation_sym"] 70 | 71 | def quantize_check(self): 72 | """Check if quantizaion can be done.""" 73 | return True 74 | 75 | def quantize(self): 76 | """Do quantizaion.""" 77 | node = self.node 78 | self.quantizer.quantize_inputs(node) 79 | if not self.disable_qdq_for_node_output or self.quantizer.mode != constants.DYNAMIC_QUANT: 80 | self.quantizer.quantize_outputs(node) 81 | 82 | def convert_check(self): 83 | """Check if conversion can be done.""" 84 | node = self.node 85 | 86 | if not node.name.endswith("_quant"): 87 | return False 88 | return True 89 | 90 | def convert(self): 91 | """Convert to QOperator format.""" 92 | return 93 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/direct_q8.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Direct8Bit Operator.""" 15 | 16 | from onnx_neural_compressor import constants, utility 17 | from onnx_neural_compressor.algorithms import utility as quant_utils 18 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 19 | 20 | 21 | @base_op.op_registry( 22 | op_types="Reshape, Transpose, Squeeze, Unsqueeze, Flatten, Expand, Slice, " 23 | "SpaceToDepth, DepthToSpace, Upsample, Tile, CenterCropPad", 24 | mode=[constants.STATIC_QUANT], 25 | ) 26 | class Direct8BitOperator(base_op.Operator): 27 | """Direct8Bit Operator.""" 28 | 29 | def __init__(self, onnx_quantizer, onnx_node): 30 | """Initialization.""" 31 | super(Direct8BitOperator, self).__init__(onnx_quantizer, onnx_node) 32 | 33 | def quantize_check(self): 34 | """Check if quantizaion can be done.""" 35 | node = self.node 36 | if not self.quantizer.is_valid_quantize_weight(node.input[0]): 37 | return False 38 | return True 39 | 40 | def quantize(self): 41 | """Do quantizaion.""" 42 | node = self.node 43 | self.quantizer.quantize_inputs(self.node, [0], initializer_use_weight_qType=False, direct_int8=True) 44 | if not self.disable_qdq_for_node_output: 45 | self.quantizer.quantize_outputs(self.node, direct_int8=True) 46 | node.name = node.name + "_quant" 47 | 48 | def convert_check(self): 49 | """Check if conversion can be done.""" 50 | node = self.node 51 | parents = self.quantizer.model.get_parents(node) 52 | children = self.quantizer.model.get_children(node) 53 | if (len(children) == 0 and len(parents) == 0) or not node.name.endswith("_quant"): 54 | return False 55 | return True 56 | 57 | def convert(self): 58 | """Convert to QOperator format.""" 59 | node = self.node 60 | parents = self.quantizer.model.get_parents(node) 61 | children = self.quantizer.model.get_children(node) 62 | if any([i.op_type == "DequantizeLinear" for i in parents]) and any( 63 | [i.op_type == "QuantizeLinear" for i in children] 64 | ): 65 | for parent in parents: 66 | if parent.op_type == "DequantizeLinear": 67 | # make sure parent DequantizeLinear of input 0 is not used by other ops 68 | if len(self.quantizer.model.get_children(parent)) == 1 and not self.quantizer.model.is_graph_output( 69 | parents[0].output[0] 70 | ): 71 | self.quantizer.remove_nodes.append(parent) 72 | self.node.input[0] = parent.input[0] 73 | break 74 | for child in children: 75 | if child.op_type == "QuantizeLinear": 76 | self.quantizer.remove_nodes.append(child) 77 | self.quantizer.model.replace_input_of_all_nodes(child.output[0], node.output[0] + "_quantized") 78 | node.output[0] = node.output[0] + "_quantized" 79 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/embed_layernorm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """EmbedLayerNormalization Operator.""" 15 | 16 | import onnx 17 | 18 | from onnx_neural_compressor import constants, utility 19 | from onnx_neural_compressor.algorithms import utility as quant_utils 20 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 21 | 22 | 23 | @base_op.op_registry(op_types="EmbedLayerNormalization", mode=[constants.DYNAMIC_QUANT, constants.STATIC_QUANT]) 24 | class EmbedLayerNormalizationOperator(base_op.Operator): 25 | """EmbedLayerNormalization Operator.""" 26 | 27 | def __init__(self, onnx_quantizer, onnx_node): 28 | """Initialization.""" 29 | super(EmbedLayerNormalizationOperator, self).__init__(onnx_quantizer, onnx_node) 30 | 31 | def quantize(self): 32 | """Do quantizaion.""" 33 | node = self.node 34 | self.quantizer.quantize_inputs(node, [2, 3, 4, 5, 6]) 35 | node.name = node.name + "_quant" 36 | 37 | def convert(self): 38 | """Convert to QOperator format.""" 39 | node = self.node 40 | 41 | parents = [i for i in self.quantizer.model.get_parents(node) if i.op_type == "DequantizeLinear"] 42 | inputs = [] 43 | # 'input_ids' 44 | inputs.extend([node.input[0]]) 45 | # 'segment_ids' 46 | inputs.extend([node.input[1]]) 47 | for parent in parents: 48 | inputs.append(parent.input[0]) 49 | # 'mask' (optional) 50 | if len(node.input) > 7: 51 | inputs.append(node.input[7]) 52 | 53 | for parent in parents: 54 | inputs.append(parent.input[1]) 55 | for parent in parents: 56 | inputs.append(parent.input[2]) 57 | 58 | kwargs = {} 59 | for attribute in node.attribute: # pragma: no cover 60 | kwargs.update(quant_utils.attribute_to_kwarg(attribute)) 61 | kwargs["domain"] = quant_utils.ms_domain 62 | 63 | qembed_layer_norm_node = onnx.helper.make_node( 64 | "QEmbedLayerNormalization", inputs, node.output, node.name, **kwargs 65 | ) 66 | self.quantizer.new_nodes.append(qembed_layer_norm_node) 67 | self.quantizer.remove_nodes.extend(parents) 68 | self.quantizer.remove_nodes.append(node) 69 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/gather.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Gather Operator.""" 15 | 16 | import onnx 17 | 18 | from onnx_neural_compressor import constants, utility 19 | from onnx_neural_compressor.algorithms import utility as quant_utils 20 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 21 | 22 | 23 | @base_op.op_registry( 24 | op_types="Gather, GatherElements, GatherND", mode=[constants.DYNAMIC_QUANT, constants.STATIC_QUANT] 25 | ) 26 | class GatherOperator(base_op.Operator): 27 | """Gather Operator.""" 28 | 29 | def __init__(self, onnx_quantizer, onnx_node): 30 | """Initialization.""" 31 | super(GatherOperator, self).__init__(onnx_quantizer, onnx_node) 32 | 33 | def quantize_check(self): 34 | """Check if quantizaion can be done.""" 35 | node = self.node 36 | if not self.quantizer.is_valid_quantize_weight(node.input[0]): 37 | return False 38 | return True 39 | 40 | def quantize(self): 41 | """Do quantizaion.""" 42 | node = self.node 43 | self.quantizer.quantize_inputs(node, [0], initializer_use_weight_qType=False) 44 | if not self.disable_qdq_for_node_output or self.quantizer.mode != constants.DYNAMIC_QUANT: 45 | self.quantizer.quantize_outputs(node) 46 | node.name = node.name + "_quant" 47 | 48 | def convert_check(self): 49 | """Check if conversion can be done.""" 50 | node = self.node 51 | parents = self.quantizer.model.get_parents(node) 52 | children = self.quantizer.model.get_children(node) 53 | if len(children) == 0 or len(parents) == 0 or not node.name.endswith("_quant"): 54 | return False 55 | 56 | return True 57 | 58 | def convert(self): 59 | """Convert to QOperator format.""" 60 | # DQ-Gather-Q-DQ-op 61 | node = self.node 62 | 63 | parents = self.quantizer.model.get_parents(node) 64 | children = self.quantizer.model.get_children(node) 65 | 66 | if any([i.op_type == "DequantizeLinear" for i in parents]): 67 | 68 | inputs = [] 69 | inputs.append(parents[0].input[0]) 70 | inputs.append(node.input[1]) 71 | 72 | out_scale = 1.0 73 | out_zp = 0 74 | gather_new_output = node.output[0] + "_quantized" # dynamic quant output name 75 | for child in children: 76 | if child.op_type == "QuantizeLinear": 77 | out_scale = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(children[0].input[1])) 78 | out_zp = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(children[0].input[2])) 79 | gather_new_output = children[0].output[0] # static quant output name 80 | self.quantizer.remove_nodes.append(child) 81 | 82 | kwargs = {} 83 | for attribute in node.attribute: # pragma: no cover 84 | kwargs.update(quant_utils.attribute_to_kwarg(attribute)) 85 | 86 | gather_node = onnx.helper.make_node(node.op_type, inputs, [gather_new_output], node.name, **kwargs) 87 | self.quantizer.new_nodes.append(gather_node) 88 | if any([i.op_type != "QuantizeLinear" for i in children]): 89 | dq_inputs = [] 90 | dq_inputs.append(gather_new_output) 91 | dq_inputs.extend(parents[0].input[1:]) 92 | dq_node = onnx.helper.make_node( 93 | "DequantizeLinear", dq_inputs, [node.output[0]], node.name + "_DequantizeLinear" 94 | ) 95 | self.quantizer.new_nodes.append(dq_node) 96 | 97 | # int8 weight will be recalculated for the first time 98 | if ( 99 | any([child.op_type == "QuantizeLinear" for child in children]) 100 | and self.quantizer.model.get_initializer(parents[0].input[0]) is not None 101 | and parents[0].input[0] not in self.quantizer.recalculate_quantized_value 102 | ): 103 | int8_tensor = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[0])) 104 | in_scale = 
onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[1])) 105 | in_zp = onnx.numpy_helper.to_array(self.quantizer.model.get_initializer(parents[0].input[2])) 106 | new_int8_tensor = (((int8_tensor.astype("float32") - in_zp) * in_scale) / out_scale).round() + out_zp 107 | self.quantizer.model.set_initializer(parents[0].input[0], new_int8_tensor.astype(int8_tensor.dtype)) 108 | self.quantizer.recalculate_quantized_value.append(parents[0].input[0]) 109 | self.quantizer.remove_nodes.extend([node, parents[0]]) 110 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/gavgpool.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """GlobalAveragePool Operator.""" 15 | 16 | import onnx 17 | 18 | from onnx_neural_compressor import constants, utility 19 | from onnx_neural_compressor.algorithms import utility as quant_utils 20 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 21 | 22 | 23 | @base_op.op_registry(op_types="GlobalAveragePool", mode=[constants.STATIC_QUANT]) 24 | class GlobalAveragePoolOperator(base_op.Operator): 25 | """GlobalAveragePool Operator.""" 26 | 27 | def __init__(self, onnx_quantizer, onnx_node): 28 | """Initialization.""" 29 | super(GlobalAveragePoolOperator, self).__init__(onnx_quantizer, onnx_node) 30 | 31 | def convert_check(self): 32 | """Check if conversion can be done.""" 33 | node = self.node 34 | children = self.quantizer.model.get_children(node) 35 | if len(children) == 0: # pragma: no cover 36 | return False 37 | return True 38 | 39 | def convert(self): 40 | """Convert to QOperator format.""" 41 | node = self.node 42 | 43 | parent = self.quantizer.model.get_parents(node)[0] 44 | child = self.quantizer.model.get_children(node)[0] 45 | 46 | kwargs = {} 47 | for attribute in node.attribute: 48 | kwargs.update(quant_utils.attribute_to_kwarg(attribute)) 49 | kwargs["domain"] = quant_utils.ms_domain 50 | kwargs["channels_last"] = 0 51 | 52 | inputs = parent.input 53 | inputs.extend(child.input[1:]) 54 | 55 | qnode = onnx.helper.make_node("QLinear" + node.op_type, inputs, child.output, node.name + "_quant", **kwargs) 56 | self.quantizer.new_nodes += [qnode] 57 | self.quantizer.remove_nodes.append(child) 58 | self.quantizer.remove_nodes.append(parent) 59 | self.quantizer.remove_nodes.append(node) 60 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/gemm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Gemm Operator.""" 15 | 16 | import onnx 17 | 18 | from onnx_neural_compressor import constants, logger 19 | from onnx_neural_compressor.algorithms import utility as quant_utils 20 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 21 | 22 | 23 | @base_op.op_registry(op_types="Gemm", mode=[constants.STATIC_QUANT]) 24 | class GemmOperator(base_op.Operator): 25 | """Gemm Operator.""" 26 | 27 | def __init__(self, onnx_quantizer, onnx_node): 28 | """Initialization.""" 29 | super(GemmOperator, self).__init__(onnx_quantizer, onnx_node) 30 | 31 | def quantize_check(self): 32 | """Check if quantizaion can be done.""" 33 | node = self.node 34 | if len(node.input) == 3 and not quant_utils.find_by_name(node.input[2], self.quantizer.model.initializer()): 35 | 36 | logger.warning( 37 | "Bias of Gemm node '{}' is not constant. " 38 | "Exclude this node can get better performance.".format(node.name) 39 | ) 40 | if self.quantizer.quant_format != "qdq": 41 | return False 42 | return True 43 | 44 | def quantize(self): 45 | """Do quantizaion.""" 46 | node = self.node 47 | self.quantizer.quantize_inputs(node, [0]) 48 | if self.per_channel and quant_utils.find_by_name(node.input[1], self.quantizer.model.initializer()): 49 | self.quantizer.quantize_weights_per_channel( 50 | node, [1], self.weight_dtype, self.weight_sym, 0 if quant_utils.is_B_transposed(node) else 1 51 | ) 52 | else: 53 | self.quantizer.quantize_inputs(node, [1]) 54 | 55 | if len(node.input) == 3 and quant_utils.find_by_name(node.input[2], self.quantizer.model.initializer()): 56 | self.quantizer.quantize_bias_tensor(node) 57 | beta_attribute = [attr for attr in node.attribute if attr.name == "beta"] 58 | if len(beta_attribute): 59 | beta_attribute[0].f = 1.0 60 | 61 | if not self.disable_qdq_for_node_output: 62 | self.quantizer.quantize_outputs(node) 63 | node.name = node.name + "_quant" 64 | 65 | def convert(self): 66 | """Convert to QOperator format.""" 67 | node = self.node 68 | 69 | parents = self.quantizer.model.get_parents(node) 70 | qgemm_inputs = [] 71 | for parent in parents[:-1]: 72 | qgemm_inputs.extend(parent.input) 73 | qgemm_inputs.append(parents[-1].input[0]) 74 | 75 | kwargs = {} 76 | for attribute in node.attribute: 77 | if attribute.name != "beta": 78 | kwargs.update(quant_utils.attribute_to_kwarg(attribute)) 79 | kwargs["domain"] = quant_utils.ms_domain 80 | 81 | qgemm_output = node.output[0] 82 | if not self.disable_qdq_for_node_output: 83 | child = self.quantizer.model.get_children(node)[0] 84 | self.quantizer.remove_nodes.append(child) 85 | qgemm_output = child.output[0] 86 | qgemm_inputs.extend(child.input[1:]) 87 | qgemm_node = onnx.helper.make_node("QGemm", qgemm_inputs, [qgemm_output], node.name, **kwargs) 88 | 89 | self.quantizer.new_nodes.append(qgemm_node) 90 | self.quantizer.remove_nodes.extend(parents) 91 | self.quantizer.remove_nodes.append(node) 92 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/maxpool.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """MaxPool Operator.""" 15 | 16 | from onnx_neural_compressor import constants, utility 17 | from onnx_neural_compressor.algorithms import utility as quant_utils 18 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 19 | 20 | 21 | @base_op.op_registry(op_types="MaxPool", mode=[constants.STATIC_QUANT]) 22 | class MaxPoolOperator(base_op.Operator): 23 | """MaxPool Operator.""" 24 | 25 | def __init__(self, onnx_quantizer, onnx_node): 26 | """Initialization.""" 27 | super(MaxPoolOperator, self).__init__(onnx_quantizer, onnx_node) 28 | 29 | def quantize_check(self): 30 | """Check if quantizaion can be done.""" 31 | node = self.node 32 | # if opset version is less than 12, just no change 33 | if self.quantizer.opset_version < 12: # pragma: no cover 34 | return False 35 | 36 | if not self.quantizer.is_valid_quantize_weight(node.input[0]): # pragma: no cover 37 | return False 38 | 39 | return True 40 | 41 | def quantize(self): 42 | """Do quantizaion.""" 43 | node = self.node 44 | self.quantizer.quantize_inputs(self.node, direct_int8=True) 45 | if not self.disable_qdq_for_node_output: 46 | self.quantizer.quantize_outputs(self.node, direct_int8=True) 47 | node.name = node.name + "_quant" 48 | 49 | def convert_check(self): 50 | """Check if conversion can be done.""" 51 | node = self.node 52 | children = self.quantizer.model.get_children(node) 53 | if len(children) == 0 or not node.name.endswith("_quant"): # pragma: no cover 54 | return False 55 | return True 56 | 57 | def convert(self): 58 | """Convert to QOperator format.""" 59 | node = self.node 60 | parent = self.quantizer.model.get_parents(node)[0] 61 | children = self.quantizer.model.get_children(node) 62 | if parent.op_type != "DequantizeLinear" or all( 63 | [i.op_type != "QuantizeLinear" for i in children] 64 | ): # pragma: no cover 65 | return 66 | node.input[0] = parent.input[0] 67 | node.output[0] = node.output[0].replace("_QuantizeInput", "_quantized") 68 | for child in children: 69 | if child.op_type == "QuantizeLinear": 70 | self.quantizer.remove_nodes.append(child) 71 | for n in self.quantizer.model.get_children(child): 72 | self.quantizer.model.replace_node_input(n, child.output[0], node.output[0]) 73 | 74 | self.quantizer.remove_nodes.append(parent) 75 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/pad.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Pad Operator.""" 15 | 16 | import onnx 17 | 18 | from onnx_neural_compressor import constants, utility 19 | from onnx_neural_compressor.algorithms import utility as quant_utils 20 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 21 | 22 | 23 | @base_op.op_registry(op_types="Pad", mode=[constants.STATIC_QUANT]) 24 | class PadOperator(base_op.Operator): 25 | """Pad Operator.""" 26 | 27 | def __init__(self, onnx_quantizer, onnx_node): 28 | """Initialization.""" 29 | super(PadOperator, self).__init__(onnx_quantizer, onnx_node) 30 | 31 | def quantize_check(self): 32 | """Check if quantizaion can be done.""" 33 | # if opset version is less than 11, just no change 34 | if self.quantizer.opset_version < 11: # pragma: no cover 35 | return False 36 | return True 37 | 38 | def quantize(self): 39 | """Do quantizaion.""" 40 | node = self.node 41 | self.quantizer.quantize_inputs(node, [0]) 42 | if not self.disable_qdq_for_node_output: 43 | self.quantizer.quantize_outputs(node) 44 | node.name = node.name + "_quant" 45 | 46 | def convert_check(self): 47 | """Check if conversion can be done.""" 48 | node = self.node 49 | children = self.quantizer.model.get_children(node) 50 | if len(children) == 0 or not node.name.endswith("_quant"): # pragma: no cover 51 | return False 52 | return True 53 | 54 | def convert(self): 55 | """Convert to QOperator format.""" 56 | node = self.node 57 | 58 | parent = self.quantizer.model.get_parents(node)[0] 59 | child = self.quantizer.model.get_children(node)[0] 60 | 61 | kwargs = {} 62 | for attribute in node.attribute: 63 | kv = quant_utils.attribute_to_kwarg(attribute) 64 | kwargs.update(kv) 65 | 66 | if "mode" not in kwargs or kwargs["mode"] == b"constant": 67 | if len(node.input) > 2: # There is 3rd input 'constant_value' 68 | zp_tensor = self.quantizer.model.get_initializer(parent.input[2]) 69 | scale_tensor = self.quantizer.model.get_initializer(parent.input[1]) 70 | 71 | padding_constant_initializer = self.quantizer.model.get_initializer(node.input[2]) 72 | if padding_constant_initializer is not None: 73 | zp_array = onnx.numpy_helper.to_array(zp_tensor) 74 | zp_value = zp_array.item() if zp_array.ndim == 0 else zp_array[0] 75 | scale_array = onnx.numpy_helper.to_array(scale_tensor) 76 | scale_value = scale_array.item() if scale_array.ndim == 0 else scale_array[0] 77 | padding_constant_array = onnx.numpy_helper.to_array(padding_constant_initializer) 78 | quantized_padding_constant_array = quant_utils.quantize_nparray( 79 | onnx.helper.tensor_dtype_to_np_dtype(self.weight_dtype), 80 | padding_constant_array, 81 | scale_value, 82 | zp_value, 83 | ) 84 | quantized_padding_constant_name = node.input[2] + "_quantized" 85 | quantized_padding_constant_initializer = onnx.numpy_helper.from_array( 86 | quantized_padding_constant_array, quantized_padding_constant_name 87 | ) 88 | # Suppose this padding constant initializer only used by the node 89 | self.quantizer.model.remove_initializer(padding_constant_initializer) 90 | self.quantizer.model.add_initializer(quantized_padding_constant_initializer) 
91 | node.input[2] = quantized_padding_constant_name 92 | else: 93 | self.quantizer.quantize_inputs(node, [2], False) 94 | node.input[2] = node.input[2] + "_DequantizeLinear" 95 | else: 96 | # pad zero_point for original zero 97 | node.input.extend([parent.input[2]]) 98 | 99 | # Create an entry for output quantized value 100 | node.input[0] = parent.input[0] 101 | node.output[0] = child.output[0] 102 | self.quantizer.remove_nodes.extend([parent, child]) 103 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/pooling.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """AveragePool Operator.""" 15 | 16 | import onnx 17 | 18 | from onnx_neural_compressor import constants, utility 19 | from onnx_neural_compressor.algorithms import utility as quant_utils 20 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 21 | 22 | 23 | @base_op.op_registry(op_types="AveragePool", mode=[constants.STATIC_QUANT]) 24 | class PoolOperator(base_op.Operator): 25 | """AveragePool Operator.""" 26 | 27 | def __init__(self, onnx_quantizer, onnx_node): 28 | """Initialization.""" 29 | super(PoolOperator, self).__init__(onnx_quantizer, onnx_node) 30 | 31 | def quantize_check(self): 32 | """Check if quantizaion can be done.""" 33 | node = self.node 34 | if not self.quantizer.is_valid_quantize_weight(node.input[0]): 35 | return False 36 | return True 37 | 38 | def quantize(self): 39 | """Do quantizaion.""" 40 | node = self.node 41 | super().quantize() 42 | node.name = node.name + "_quant" 43 | 44 | def convert_check(self): 45 | """Check if conversion can be done.""" 46 | node = self.node 47 | parents = self.quantizer.model.get_parents(node) 48 | children = self.quantizer.model.get_children(node) 49 | 50 | if len(children) == 0 or len(parents) == 0 or not node.name.endswith("_quant"): 51 | return False 52 | return True 53 | 54 | def convert(self): 55 | """Convert to QOperator format.""" 56 | node = self.node 57 | 58 | parents = self.quantizer.model.get_parents(node) 59 | children = self.quantizer.model.get_children(node) 60 | 61 | if all([i.op_type == "DequantizeLinear" for i in parents]) and any( 62 | [i.op_type == "QuantizeLinear" for i in children] 63 | ): 64 | qlinear_output_name = node.output[0] + "_quantized" 65 | inputs = [] 66 | inputs.extend(parents[0].input) 67 | inputs.extend([i for i in children if i.op_type == "QuantizeLinear"][0].input[1:]) 68 | kwargs = {} 69 | for attribute in node.attribute: 70 | kwargs.update(quant_utils.attribute_to_kwarg(attribute)) 71 | kwargs["domain"] = quant_utils.ms_domain 72 | qnode = onnx.helper.make_node("QLinear" + node.op_type, inputs, [qlinear_output_name], node.name, **kwargs) 73 | 74 | self.quantizer.remove_nodes.extend(parents) 75 | for child in children: 76 | if child.op_type == 
"QuantizeLinear": 77 | self.quantizer.remove_nodes.append(child) 78 | self.quantizer.model.replace_input_of_all_nodes(child.output[0], qnode.output[0]) 79 | 80 | self.quantizer.new_nodes.append(qnode) 81 | self.quantizer.remove_nodes.append(node) 82 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/reduce.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Reduce Operator.""" 15 | 16 | from onnx_neural_compressor import constants, utility 17 | from onnx_neural_compressor.algorithms import utility as quant_utils 18 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 19 | 20 | 21 | @base_op.op_registry( 22 | op_types="ReduceMean, ReduceLogSum, ReduceLogSumExp, " "ReduceL1, ReduceL2, ReduceProd, ReduceSum, ReduceSumSquare", 23 | mode=[constants.STATIC_QUANT], 24 | ) 25 | class ReduceOperator(base_op.Operator): 26 | """Reduce Operator.""" 27 | 28 | def __init__(self, onnx_quantizer, onnx_node): 29 | """Initialization.""" 30 | super(ReduceOperator, self).__init__(onnx_quantizer, onnx_node) 31 | 32 | 33 | @base_op.op_registry(op_types="ReduceMax, ReduceMin", mode=[constants.STATIC_QUANT]) 34 | class ReduceMinMaxOperator(base_op.Operator): 35 | """ReduceMin and ReduceMax Operator.""" 36 | 37 | def __init__(self, onnx_quantizer, onnx_node): 38 | """Initialization.""" 39 | super(ReduceMinMaxOperator, self).__init__(onnx_quantizer, onnx_node) 40 | 41 | def quantize_check(self): 42 | """Check if quantizaion can be done.""" 43 | node = self.node 44 | if not self.quantizer.is_valid_quantize_weight(node.input[0]): 45 | return False 46 | return True 47 | 48 | def quantize(self): 49 | """Do quantizaion.""" 50 | node = self.node 51 | self.quantizer.quantize_inputs(self.node, [0], direct_int8=True) 52 | if not self.disable_qdq_for_node_output: 53 | self.quantizer.quantize_outputs(self.node, direct_int8=True) 54 | node.name = node.name + "_quant" 55 | 56 | def convert_check(self): 57 | """Check if conversion can be done.""" 58 | node = self.node 59 | parents = self.quantizer.model.get_parents(node) 60 | children = self.quantizer.model.get_children(node) 61 | if (len(children) == 0 and len(parents) == 0) or not node.name.endswith("_quant"): 62 | return False 63 | return True 64 | 65 | def convert(self): 66 | """Convert to QOperator format.""" 67 | node = self.node 68 | 69 | parents = self.quantizer.model.get_parents(node) 70 | children = self.quantizer.model.get_children(node) 71 | if any([i.op_type == "DequantizeLinear" for i in parents]) and any( 72 | [i.op_type == "QuantizeLinear" for i in children] 73 | ): 74 | for parent in parents: 75 | if parent.op_type == "DequantizeLinear": 76 | self.node.input[0] = parent.input[0] 77 | self.quantizer.remove_nodes.append(parents[0]) 78 | break 79 | for child in children: 
80 | if child.op_type == "QuantizeLinear": 81 | self.quantizer.remove_nodes.append(child) 82 | self.quantizer.model.replace_input_of_all_nodes(child.output[0], node.output[0] + "_quantized") 83 | node.output[0] = node.output[0] + "_quantized" 84 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/resize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Resize Operator.""" 15 | 16 | from onnx_neural_compressor import constants, utility 17 | from onnx_neural_compressor.algorithms import utility as quant_utils 18 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 19 | 20 | 21 | @base_op.op_registry(op_types="Resize", mode=[constants.STATIC_QUANT]) 22 | class ResizeOperator(base_op.Operator): 23 | """Resize Operator.""" 24 | 25 | def __init__(self, onnx_quantizer, onnx_node): 26 | """Initialization.""" 27 | super(ResizeOperator, self).__init__(onnx_quantizer, onnx_node) 28 | 29 | def quantize_check(self): 30 | """Check if quantizaion can be done.""" 31 | node = self.node 32 | # if version is less than 11, just keep this node 33 | if self.quantizer.opset_version < 11: 34 | return False 35 | if not self.quantizer.is_valid_quantize_weight(node.input[0]): 36 | return False 37 | return True 38 | 39 | def quantize(self): 40 | """Do quantizaion.""" 41 | node = self.node 42 | self.quantizer.quantize_inputs(node, [0], direct_int8=True) 43 | if not self.disable_qdq_for_node_output: 44 | self.quantizer.quantize_outputs(self.node, direct_int8=True) 45 | node.name = node.name + "_quant" 46 | 47 | def convert_check(self): 48 | """Check if conversion can be done.""" 49 | node = self.node 50 | parents = self.quantizer.model.get_parents(node) 51 | children = self.quantizer.model.get_children(node) 52 | if (len(children) == 0 and len(parents) == 0) or not node.name.endswith("_quant"): 53 | return False 54 | return True 55 | 56 | def convert(self): 57 | """Convert to QOperator format.""" 58 | node = self.node 59 | 60 | parents = self.quantizer.model.get_parents(node) 61 | children = self.quantizer.model.get_children(node) 62 | 63 | if any([i.op_type == "DequantizeLinear" for i in parents]) and any( 64 | [i.op_type == "QuantizeLinear" for i in children] 65 | ): 66 | for parent in parents: 67 | if parent.op_type == "DequantizeLinear" and parent.output[0] == node.input[0]: 68 | self.node.input[0] = parent.input[0] 69 | self.quantizer.remove_nodes.append(parent) 70 | break 71 | for child in children: 72 | if child.op_type == "QuantizeLinear": 73 | self.quantizer.remove_nodes.append(child) 74 | self.quantizer.model.replace_input_of_all_nodes(child.output[0], node.output[0] + "_quantized") 75 | node.output[0] = node.output[0] + "_quantized" 76 | 
-------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/split.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Split Operator.""" 15 | 16 | import onnx 17 | 18 | from onnx_neural_compressor import constants, utility 19 | from onnx_neural_compressor.algorithms import utility as quant_utils 20 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 21 | 22 | 23 | @base_op.op_registry(op_types="Split", mode=[constants.STATIC_QUANT]) 24 | class SplitOperator(base_op.Operator): 25 | """Split Operator.""" 26 | 27 | def __init__(self, onnx_quantizer, onnx_node): 28 | """Initialization.""" 29 | super(SplitOperator, self).__init__(onnx_quantizer, onnx_node) 30 | 31 | def quantize_check(self): 32 | """Check if quantization can be done.""" 33 | node = self.node 34 | data_found, _, _, _, _ = self.quantizer._get_quantization_params(node.output[0]) 35 | if not data_found: 36 | return False 37 | if not all([self.quantizer.is_valid_quantize_weight(i) for i in node.input]): 38 | return False 39 | return True 40 | 41 | def quantize(self): 42 | """Do quantization.""" 43 | node = self.node 44 | self.quantizer.quantize_inputs(node, [0]) 45 | if not self.disable_qdq_for_node_output: 46 | self.quantizer.quantize_outputs(self.node, direct_int8=True) 47 | node.name = node.name + "_quant" 48 | 49 | def convert_check(self): 50 | """Check if conversion can be done.""" 51 | node = self.node 52 | parent = self.quantizer.model.get_parents(node)[0] 53 | children = self.quantizer.model.get_children(node) 54 | if ( 55 | parent.op_type != "DequantizeLinear" or len(children) == 0 or not node.name.endswith("_quant") 56 | ): # pragma: no cover 57 | return False 58 | return True 59 | 60 | def convert(self): 61 | """Convert to QOperator format.""" 62 | node = self.node 63 | 64 | parent = self.quantizer.model.get_parents(node)[0] 65 | kwargs = {} 66 | for attribute in node.attribute: # pragma: no cover 67 | kwargs.update(quant_utils.attribute_to_kwarg(attribute)) 68 | 69 | quantized_input_names = [] 70 | quantized_input_names.append(parent.input[0]) 71 | if len(node.input) > 1: # pragma: no cover 72 | quantized_input_names.extend(node.input[1:]) 73 | outputs = [] 74 | input_name_to_nodes = self.quantizer.model.input_name_to_nodes() 75 | for output in node.output: 76 | if output in input_name_to_nodes: 77 | child = input_name_to_nodes[output][0] 78 | if child.op_type == "QuantizeLinear": 79 | self.quantizer.remove_nodes.append(child) 80 | outputs.append(child.output[0]) 81 | else: # pragma: no cover 82 | outputs.append(output) 83 | else: # pragma: no cover 84 | outputs.append(output + "_quantized") 85 | 86 | quantized_node = onnx.helper.make_node(node.op_type, quantized_input_names, outputs, node.name, **kwargs) 87 |
self.quantizer.new_nodes.append(quantized_node) 88 | self.quantizer.remove_nodes.extend([parent, node]) 89 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/post_training_quant/operators/unary_op.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Unary operator.""" 15 | 16 | from onnx_neural_compressor import constants, utility 17 | from onnx_neural_compressor.algorithms import utility as quant_utils 18 | from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op 19 | 20 | 21 | @base_op.op_registry(op_types="Exp, Log, Round, Sqrt", mode=[constants.STATIC_QUANT]) 22 | class UnaryOperator(base_op.Operator): 23 | """Unary operator.""" 24 | 25 | def __init__(self, onnx_quantizer, onnx_node): 26 | """Initialization.""" 27 | super(UnaryOperator, self).__init__(onnx_quantizer, onnx_node) 28 | 29 | 30 | @base_op.op_registry(op_types="Abs, Shrink, Sign", mode=[constants.STATIC_QUANT]) 31 | class UnaryDirect8BitOperator(base_op.Operator): 32 | """Unary operator.""" 33 | 34 | def __init__(self, onnx_quantizer, onnx_node): 35 | """Initialization.""" 36 | super(UnaryDirect8BitOperator, self).__init__(onnx_quantizer, onnx_node) 37 | 38 | def quantize_check(self): 39 | """Check if quantization can be done.""" 40 | node = self.node 41 | if not self.quantizer.is_valid_quantize_weight(node.input[0]): 42 | return False 43 | return True 44 | 45 | def quantize(self): 46 | """Do quantization.""" 47 | node = self.node 48 | self.quantizer.quantize_inputs(self.node, [0], direct_int8=True) 49 | if not self.disable_qdq_for_node_output: 50 | self.quantizer.quantize_outputs(self.node, direct_int8=True) 51 | node.name = node.name + "_quant" 52 | 53 | def convert_check(self): 54 | """Check if conversion can be done.""" 55 | node = self.node 56 | parents = self.quantizer.model.get_parents(node) 57 | children = self.quantizer.model.get_children(node) 58 | if (len(children) == 0 and len(parents) == 0) or not node.name.endswith("_quant"): 59 | return False 60 | return True 61 | 62 | def convert(self): 63 | """Convert to QOperator format.""" 64 | node = self.node 65 | 66 | parents = self.quantizer.model.get_parents(node) 67 | children = self.quantizer.model.get_children(node) 68 | if any([i.op_type == "DequantizeLinear" for i in parents]) and any( 69 | [i.op_type == "QuantizeLinear" for i in children] 70 | ): 71 | for parent in parents: 72 | if parent.op_type == "DequantizeLinear": 73 | self.node.input[0] = parent.input[0] 74 | self.quantizer.remove_nodes.append(parents[0]) 75 | break 76 | for child in children: 77 | if child.op_type == "QuantizeLinear": 78 | self.quantizer.remove_nodes.append(child) 79 | self.quantizer.model.replace_input_of_all_nodes(child.output[0], node.output[0] + "_quantized") 80 | node.output[0] = node.output[0] + "_quantized" 81 |
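Editor's note: every operator in this directory is made available to the post-training quantizer through the base_op.op_registry decorator, keyed by op_types and quantization mode. A minimal, hypothetical sketch of registering an additional operator in the same style is shown below; "Softplus" is not registered in the repository and is used purely to illustrate the pattern.

from onnx_neural_compressor import constants
from onnx_neural_compressor.algorithms.post_training_quant.operators import base_op


@base_op.op_registry(op_types="Softplus", mode=[constants.STATIC_QUANT])  # hypothetical op type
class SoftplusOperator(base_op.Operator):
    """Softplus operator (illustrative only, mirrors the classes above)."""

    def __init__(self, onnx_quantizer, onnx_node):
        """Initialization."""
        super().__init__(onnx_quantizer, onnx_node)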
-------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/smoother/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /onnx_neural_compressor/algorithms/weight_only/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /onnx_neural_compressor/data_reader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import abc 16 | 17 | 18 | class CalibrationDataReader(metaclass=abc.ABCMeta): 19 | @classmethod 20 | def __subclasshook__(cls, subclass): 21 | return hasattr(subclass, "get_next") and callable(subclass.get_next) or NotImplemented 22 | 23 | @abc.abstractmethod 24 | def get_next(self) -> dict: 25 | """Generate the input data dict for ONNX InferenceSession run.""" 26 | raise NotImplementedError 27 | 28 | def __iter__(self): 29 | return self 30 | 31 | def __next__(self): 32 | result = self.get_next() 33 | if result is None: 34 | raise StopIteration 35 | return result 36 | 37 | @abc.abstractmethod 38 | def rewind(self): 39 | """Regenerate data.""" 40 | raise NotImplementedError 41 | -------------------------------------------------------------------------------- /onnx_neural_compressor/logger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import logging 16 | import os 17 | 18 | 19 | def _pretty_dict(value, indent=0): 20 | """Make the logger dict pretty.""" 21 | prefix = "\n" + " " * (indent + 4) 22 | if isinstance(value, dict): 23 | items = [prefix + repr(key) + ": " + _pretty_dict(value[key], indent + 4) for key in value] 24 | return "{%s}" % (",".join(items) + "\n" + " " * indent) 25 | elif isinstance(value, list): 26 | items = [prefix + _pretty_dict(item, indent + 4) for item in value] 27 | return "[%s]" % (",".join(items) + "\n" + " " * indent) 28 | elif isinstance(value, tuple): 29 | items = [prefix + _pretty_dict(item, indent + 4) for item in value] 30 | return "(%s)" % (",".join(items) + "\n" + " " * indent) 31 | else: 32 | return repr(value) 33 | 34 | 35 | LOGLEVEL = os.environ.get("LOGLEVEL", "INFO").upper() 36 | _logger = logging.getLogger("onnx_neural_compressor") 37 | _logger.handlers.clear() 38 | _logger.setLevel(LOGLEVEL) 39 | formatter = logging.Formatter("%(asctime)s [%(levelname)s][%(filename)s:%(lineno)d] %(message)s", "%Y-%m-%d %H:%M:%S") 40 | streamHandler = logging.StreamHandler() 41 | streamHandler.setFormatter(formatter) 42 | _logger.addHandler(streamHandler) 43 | _logger.propagate = False 44 | 45 | 46 | def log(level, msg, *args, **kwargs): 47 | """Output log with the level as a parameter.""" 48 | kwargs.setdefault("stacklevel", 2) 49 | if isinstance(msg, dict): 50 | for _, line in enumerate(_pretty_dict(msg).split("\n")): 51 | _logger.log(level, line, *args, **kwargs) 52 | else: 53 | _logger.log(level, msg, *args, **kwargs) 54 | 55 | 56 | def debug(msg, *args, **kwargs): 57 | """Output log with the debug level.""" 58 | kwargs.setdefault("stacklevel", 2) 59 | if isinstance(msg, dict): 60 | for _, line in enumerate(_pretty_dict(msg).split("\n")): 61 | _logger.debug(line, *args, **kwargs) 62 | else: 63 | _logger.debug(msg, *args, **kwargs) 64 | 65 | 66 | def error(msg, *args, **kwargs): 67 | """Output log with the error level.""" 68 | kwargs.setdefault("stacklevel", 2) 69 | if isinstance(msg,
dict): 70 | for _, line in enumerate(_pretty_dict(msg).split("\n")): 71 | _logger.error(line, *args, **kwargs) 72 | else: 73 | _logger.error(msg, *args, **kwargs) 74 | 75 | 76 | def fatal(msg, *args, **kwargs): 77 | """Output log with the fatal level.""" 78 | kwargs.setdefault("stacklevel", 2) 79 | if isinstance(msg, dict): 80 | for _, line in enumerate(_pretty_dict(msg).split("\n")): 81 | _logger.fatal(line, *args, **kwargs) 82 | else: 83 | _logger.fatal(msg, *args, **kwargs) 84 | 85 | 86 | def info(msg, *args, **kwargs): 87 | """Output log with the info level.""" 88 | kwargs.setdefault("stacklevel", 2) 89 | if isinstance(msg, dict): 90 | for _, line in enumerate(_pretty_dict(msg).split("\n")): 91 | _logger.info(line, *args, **kwargs) 92 | else: 93 | _logger.info(msg, *args, **kwargs) 94 | 95 | 96 | def warning(msg, *args, **kwargs): 97 | """Output log with the warning level (Alias of the method warn).""" 98 | kwargs.setdefault("stacklevel", 2) 99 | if isinstance(msg, dict): 100 | for _, line in enumerate(_pretty_dict(msg).split("\n")): 101 | _logger.warning(line, *args, **kwargs) 102 | else: 103 | _logger.warning(msg, *args, **kwargs) 104 | -------------------------------------------------------------------------------- /onnx_neural_compressor/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from onnx_neural_compressor.quantization.quant_utils import CalibrationMethod, QuantFormat, QuantType 16 | from onnx_neural_compressor.quantization.quantize import quantize 17 | -------------------------------------------------------------------------------- /onnx_neural_compressor/quantization/matmul_4bits_quantizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from typing import List, Union # isort: skip 16 | 17 | import onnx 18 | import onnxruntime as ort 19 | 20 | from onnx_neural_compressor.quantization import matmul_nbits_quantizer 21 | 22 | RTNWeightOnlyQuantConfig = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig 23 | AWQWeightOnlyQuantConfig = matmul_nbits_quantizer.AWQWeightOnlyQuantConfig 24 | GPTQWeightOnlyQuantConfig = matmul_nbits_quantizer.GPTQWeightOnlyQuantConfig 25 | 26 | 27 | class MatMul4BitsQuantizer(matmul_nbits_quantizer.MatMulNBitsQuantizer): 28 | 29 | def __init__( 30 | self, 31 | model: Union[onnx.ModelProto, str], 32 | block_size: int = 128, 33 | is_symmetric: bool = False, 34 | is_signed: bool = False, 35 | accuracy_level: int = 0, 36 | nodes_to_exclude=None, 37 | algo_config: matmul_nbits_quantizer.WeightOnlyQuantConfig = None, 38 | providers: List[str] = ["CPUExecutionProvider"], 39 | optimization_level: ort.GraphOptimizationLevel = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC, 40 | ): 41 | super().__init__( 42 | model=model, 43 | block_size=block_size, 44 | is_symmetric=is_symmetric, 45 | is_signed=is_signed, 46 | accuracy_level=accuracy_level, 47 | nodes_to_exclude=nodes_to_exclude, 48 | algo_config=algo_config, 49 | n_bits=4, 50 | providers=providers, 51 | optimization_level=optimization_level, 52 | ) 53 | -------------------------------------------------------------------------------- /onnx_neural_compressor/quantization/quant_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 MIT HAN Lab 2 | # This source code is licensed under the MIT license 3 | # 4 | # Copyright (c) 2024 Intel Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import enum 19 | 20 | import onnx 21 | 22 | 23 | class QuantType(enum.Enum): # pragma: no cover 24 | """Represent QuantType value.""" 25 | 26 | QInt8 = 0 27 | QUInt8 = 1 28 | QInt4 = 4 29 | QUInt4 = 5 30 | 31 | @property 32 | def tensor_type(self): 33 | if self == QuantType.QInt8: 34 | return onnx.TensorProto.INT8 35 | if self == QuantType.QUInt8: 36 | return onnx.TensorProto.UINT8 37 | if self == QuantType.QInt4: 38 | return onnx.TensorProto.INT4 39 | if self == QuantType.QUInt4: 40 | return onnx.TensorProto.UINT4 41 | raise ValueError(f"Unexpected value qtype={self!r}.") 42 | 43 | 44 | class QuantFormat(enum.Enum): 45 | QOperator = 0 46 | QDQ = 1 47 | 48 | 49 | class CalibrationMethod(enum.Enum): 50 | MinMax = 0 51 | Entropy = 1 52 | Percentile = 2 53 | Distribution = 3 54 | -------------------------------------------------------------------------------- /onnx_neural_compressor/quantization/quantize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import pathlib 16 | import tempfile 17 | from typing import Union 18 | 19 | import onnx 20 | import onnxruntime as ort 21 | from onnxruntime.quantization.quantize import QuantConfig 22 | 23 | from onnx_neural_compressor.quantization import algorithm_entry as algos 24 | from onnx_neural_compressor.quantization import config 25 | 26 | 27 | # ORT-like user-facing API 28 | def quantize( 29 | model_input: Union[str, pathlib.Path, onnx.ModelProto], 30 | model_output: Union[str, pathlib.Path], 31 | quant_config: config.BaseConfig, 32 | optimization_level: ort.GraphOptimizationLevel = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC, 33 | ): 34 | with tempfile.TemporaryDirectory(prefix="ort.opt.") as tmp_dir: 35 | if optimization_level != ort.GraphOptimizationLevel.ORT_DISABLE_ALL: 36 | sess_options = ort.SessionOptions() 37 | sess_options.graph_optimization_level = optimization_level 38 | sess_options.optimized_model_filepath = pathlib.Path(tmp_dir).joinpath("opt.onnx").as_posix() 39 | sess_options.add_session_config_entry( 40 | "session.optimized_model_external_initializers_file_name", "opt.onnx_data" 41 | ) 42 | sess_options.add_session_config_entry( 43 | "session.optimized_model_external_initializers_min_size_in_bytes", "1024" 44 | ) 45 | session = ort.InferenceSession(model_input, sess_options, providers=["CPUExecutionProvider"]) 46 | del session 47 | model_input = sess_options.optimized_model_filepath 48 | 49 | if isinstance(quant_config, config.StaticQuantConfig): 50 | if quant_config.extra_options.get("SmoothQuant", False): 51 | algos.smooth_quant_entry( 52 | model_input, quant_config, quant_config.calibration_data_reader, model_output=model_output 53 | ) 54 | else: 55 | algos.static_quantize_entry( 56 | model_input, quant_config, quant_config.calibration_data_reader, model_output=model_output 57 | ) 58 | elif isinstance(quant_config, config.DynamicQuantConfig): 59 | algos.dynamic_quantize_entry(model_input, quant_config, model_output=model_output) 60 | else: 61 | raise TypeError( 62 | "Invalid quantization config type, it must be either StaticQuantConfig or DynamicQuantConfig." 63 | ) 64 | -------------------------------------------------------------------------------- /onnx_neural_compressor/version.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | """Neural Compressor: An open-source Python library supporting popular model compression techniques for ONNX models.""" 15 | __version__ = "1.0" 16 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.isort] 2 | profile = "black" 3 | line_length = 120 4 | extend_skip_glob = ["**/__init__.py"] 5 | 6 | [tool.black] 7 | line-length = 120 8 | 9 | [tool.codespell] 10 | skip = '*.po,*.ts,*.js,*.map,*.js.map,*.css.map,.azure-pipelines/scripts/codeScan/codespell/inc_dict.txt' 11 | count = '' 12 | quiet-level = 3 13 | ignore-words = ".azure-pipelines/scripts/codeScan/codespell/nc_dict.txt" 14 | 15 | 16 | [tool.ruff] 17 | # Exclude a variety of commonly ignored directories. 18 | exclude = [ 19 | ".bzr", 20 | ".direnv", 21 | ".eggs", 22 | ".git", 23 | ".git-rewrite", 24 | ".hg", 25 | ".ipynb_checkpoints", 26 | ".mypy_cache", 27 | ".nox", 28 | ".pants.d", 29 | ".pyenv", 30 | ".pytest_cache", 31 | ".pytype", 32 | ".ruff_cache", 33 | ".svn", 34 | ".tox", 35 | ".venv", 36 | ".vscode", 37 | "__pypackages__", 38 | "_build", 39 | "buck-out", 40 | "build", 41 | "dist", 42 | "node_modules", 43 | "site-packages", 44 | "venv", 45 | ] 46 | 47 | # Same as Black. 48 | line-length = 120 49 | indent-width = 4 50 | 51 | # Assume Python 3.8 52 | target-version = "py38" 53 | 54 | [tool.ruff.lint] 55 | # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. 56 | # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or 57 | # McCabe complexity (`C901`) by default. 58 | select = ["E4", "E7", "E9", "F"] 59 | ignore = [ 60 | "E402", # Module level import not at top of file 61 | "E501", # Line too long (121 > 120 characters) 62 | "E721", # Do not compare types, use isinstance() 63 | "E722", # Do not use bare except 64 | "E731", # Do not assign a lambda expression, use a def 65 | "E741", # Do not use variables named ‘l’, ‘O’, or ‘I’ 66 | "F401", # {name} imported but unused 67 | "F403", # from {name} import * used; unable to detect undefined names 68 | "F405", # {name} may be undefined, or defined from star imports 69 | "F841", # Local variable is assigned to but never used{name} 70 | ] 71 | 72 | # Allow fix for all enabled rules (when `--fix`) is provided. 73 | fixable = ["ALL"] 74 | unfixable = [] 75 | 76 | # Allow unused variables when underscore-prefixed. 77 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 78 | 79 | ignore-init-module-imports = true 80 | 81 | [tool.ruff.format] 82 | # Like Black, use double quotes for strings. 83 | quote-style = "double" 84 | 85 | # Like Black, indent with spaces, rather than tabs. 86 | indent-style = "space" 87 | 88 | # Like Black, respect magic trailing commas. 89 | skip-magic-trailing-comma = false 90 | 91 | # Like Black, automatically detect the appropriate line ending. 92 | line-ending = "auto" 93 | 94 | # Enable auto-formatting of code examples in docstrings. Markdown, 95 | # reStructuredText code/literal blocks and doctests are all supported. 96 | # 97 | # This is currently disabled by default, but it is planned for this 98 | # to be opt-out in the future. 99 | docstring-code-format = false 100 | 101 | # Set the line length limit used when formatting code snippets in 102 | # docstrings. 103 | # 104 | # This only has an effect when the `docstring-code-format` setting is 105 | # enabled. 
106 | docstring-code-line-length = "dynamic" 107 | -------------------------------------------------------------------------------- /requirements-lintrunner.txt: -------------------------------------------------------------------------------- 1 | lintrunner_adapters 2 | ruff==0.4.5 3 | black==24.3.0 4 | isort==5.13.2 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # onnxruntime doesn't support numpy>=2.0.0. The restriction will be removed once they fix it. 2 | numpy<2.0.0 3 | onnx 4 | onnxruntime 5 | onnxruntime-extensions 6 | psutil 7 | py-cpuinfo 8 | pydantic 9 | transformers 10 | prettytable 11 | scipy 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import io 2 | import re 3 | import subprocess 4 | 5 | import setuptools 6 | 7 | 8 | def is_commit_on_tag(): 9 | try: 10 | result = subprocess.run( 11 | ["git", "describe", "--exact-match", "--tags"], capture_output=True, text=True, check=True 12 | ) 13 | tag_name = result.stdout.strip() 14 | return tag_name 15 | except subprocess.CalledProcessError: 16 | return False 17 | 18 | 19 | def get_build_version(): 20 | if is_commit_on_tag(): 21 | return __version__ 22 | try: 23 | result = subprocess.run(["git", "describe", "--tags"], capture_output=True, text=True, check=True) 24 | _, distance, commit = result.stdout.strip().split("-") 25 | return f"{__version__}.dev{distance}+{commit}" 26 | except subprocess.CalledProcessError: 27 | return __version__ 28 | 29 | 30 | try: 31 | filepath = "./onnx_neural_compressor/version.py" 32 | with io.open(filepath) as version_file: 33 | (__version__,) = re.findall('__version__ = "(.*)"', version_file.read()) 34 | except Exception as error: 35 | assert False, "Error: Could not open '%s' due %s\n" % (filepath, error) 36 | 37 | if __name__ == "__main__": 38 | 39 | setuptools.setup( 40 | name="onnx_neural_compressor", 41 | author="Intel AIPT Team", 42 | version=get_build_version(), 43 | author_email="tai.huang@intel.com, mengni.wang@intel.com, yuwen.zhou@intel.com, suyue.chen@intel.com", 44 | description="Repository of Neural Compressor ORT", 45 | long_description=io.open("README.md", "r", encoding="utf-8").read(), 46 | long_description_content_type="text/markdown", 47 | keywords="quantization", 48 | license="Apache 2.0", 49 | url="", 50 | packages=setuptools.find_packages(), 51 | include_package_data=True, 52 | install_requires=[ 53 | "onnx", 54 | "onnxruntime", 55 | "onnxruntime-extensions", 56 | "psutil", 57 | "numpy<2.0.0", 58 | "py-cpuinfo", 59 | "pydantic", 60 | "transformers", 61 | ], 62 | python_requires=">=3.8.0", 63 | classifiers=[ 64 | "Intended Audience :: Science/Research", 65 | "Programming Language :: Python :: 3", 66 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 67 | "License :: OSI Approved :: Apache Software License", 68 | ], 69 | ) 70 | -------------------------------------------------------------------------------- /test/quantization/post_training_quant/test_quant_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | import onnx 5 | 6 | from onnx_neural_compressor.algorithms import utility as quant_utils 7 | 8 | 9 | class TestQuantUtility(unittest.TestCase): 10 | 11 | def test_pad_tensor(self): 12 | data = 
np.random.random((100, 32)) 13 | group_size = 32 14 | k_blocks = (100 - 1) // 32 + 1 15 | pad_data = quant_utils.pad_tensor(data, group_size, k_blocks) 16 | self.assertEqual(pad_data.shape, (k_blocks * group_size, 32)) 17 | 18 | def test_quant_dequant_data(self): 19 | data = np.random.random((100, 32)) 20 | qrange = quant_utils.get_qmin_qmax_for_qType( 21 | qType=onnx.TensorProto.UINT8, 22 | reduce_range=False, 23 | sym=True, 24 | ) 25 | self.assertEqual(qrange[0], 0) 26 | self.assertEqual(qrange[1], 255) 27 | 28 | rmin = np.min(np.min(data), 0) 29 | rmax = np.max(np.max(data), 0) 30 | 31 | _, _, zero_point, scale, quantized_data = quant_utils.quantize_data( 32 | data=data, 33 | qType=onnx.TensorProto.UINT8, 34 | sym=True, 35 | ) 36 | 37 | dq_data = quant_utils.dequantize_data( 38 | tensor_value=quantized_data, 39 | scale_value=scale, 40 | zo_value=zero_point, 41 | ) 42 | self.assertLess(np.max(np.abs(dq_data - data)), 0.005) 43 | 44 | _, _, zero_point, scale, quantized_data = quant_utils.quantize_data_per_channel( 45 | data=data, 46 | qType=onnx.TensorProto.UINT8, 47 | sym=True, 48 | axis=1, 49 | ) 50 | 51 | dq_data = quant_utils.dequantize_data( 52 | tensor_value=quantized_data, 53 | scale_value=scale, 54 | zo_value=zero_point, 55 | axis=1, 56 | ) 57 | 58 | self.assertLess(np.max(np.abs(dq_data - data)), 0.005) 59 | 60 | 61 | if __name__ == "__main__": 62 | unittest.main() 63 | -------------------------------------------------------------------------------- /test/quantization/test_algorithm_utility.py: -------------------------------------------------------------------------------- 1 | """Tests for algorithm utility components.""" 2 | 3 | import os 4 | import unittest 5 | 6 | import numpy as np 7 | import onnx 8 | 9 | from onnx_neural_compressor import onnx_model 10 | from onnx_neural_compressor.algorithms import utility as quant_utils 11 | 12 | 13 | def find_onnx_file(folder_path): 14 | # return first .onnx file path in folder_path 15 | for root, dirs, files in os.walk(folder_path): 16 | for file in files: 17 | if file.endswith(".onnx"): 18 | return os.path.join(root, file) 19 | return None 20 | 21 | 22 | class TestUtilityFunctions(unittest.TestCase): 23 | 24 | def test_is_B_transposed(self): 25 | node = onnx.helper.make_node( 26 | "Gemm", 27 | inputs=["a", "b", "c"], 28 | outputs=["y"], 29 | alpha=0.25, 30 | beta=0.35, 31 | transA=1, 32 | transB=1, 33 | ) 34 | self.assertTrue(quant_utils.is_B_transposed(node)) 35 | 36 | node = onnx.helper.make_node( 37 | "Gemm", 38 | inputs=["a", "b", "c"], 39 | outputs=["y"], 40 | alpha=0.25, 41 | beta=0.35, 42 | ) 43 | self.assertFalse(quant_utils.is_B_transposed(node)) 44 | 45 | def test_make_woq_dq_node(self): 46 | node = onnx.helper.make_node("MatMul", ["input", "weight"], "output", name="Matmul") 47 | with self.assertRaises(ValueError): 48 | quant_utils.make_weight_only_dequant_node( 49 | node=node, 50 | weight_shape=(32, 32), 51 | block_size=16, 52 | num_bits=32, 53 | dtype="int", 54 | q_weight=np.random.randint(0, 10, size=(2, 32), dtype=np.uint8), 55 | scale=np.random.random((2, 32)), 56 | zero_point=np.zeros((2, 32)), 57 | ) 58 | 59 | def test_split_shared_bias(self): 60 | input = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [1, 3, 15, 15]) 61 | output = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, [1, 5, 11, 11]) 62 | bias_initializer = onnx.numpy_helper.from_array(np.random.random(5).astype(np.float32), name="bias") 63 | conv1_weight_initializer = onnx.numpy_helper.from_array( 64 | 
np.random.randint(-1, 2, [5, 3, 3, 3]).astype(np.float32), name="conv1_weight" 65 | ) 66 | conv1_node = onnx.helper.make_node("Conv", ["add_out", "conv1_weight", "bias"], ["conv1_output"], name="conv1") 67 | conv2_weight_initializer = onnx.numpy_helper.from_array( 68 | np.random.randint(-1, 2, [5, 5, 3, 3]).astype(np.float32), name="conv2_weight" 69 | ) 70 | conv2_node = onnx.helper.make_node("Conv", ["add_out", "conv2_weight", "bias"], ["conv2_output"], name="conv2") 71 | initializers = [conv1_weight_initializer, conv2_weight_initializer, bias_initializer] 72 | graph = onnx.helper.make_graph([conv1_node, conv2_node], "test", [input], [output], initializer=initializers) 73 | model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) 74 | 75 | update_model = quant_utils.split_shared_bias(onnx_model.ONNXModel(model)) 76 | split = any(["_nc_split_" in i.name for i in update_model.initializer()]) 77 | self.assertTrue(split) 78 | 79 | def test_get_qmin_qmax_for_qType(self): 80 | with self.assertRaises(ValueError): 81 | quant_utils.get_qmin_qmax_for_qType(onnx.TensorProto.INT64) 82 | 83 | qmin, qmax = quant_utils.get_qmin_qmax_for_qType(onnx.TensorProto.INT8, reduce_range=True) 84 | self.assertEqual(qmin, -64) 85 | self.assertEqual(qmax, 64) 86 | 87 | 88 | if __name__ == "__main__": 89 | unittest.main() 90 | -------------------------------------------------------------------------------- /test/requirements.txt: -------------------------------------------------------------------------------- 1 | optimum 2 | pytest 3 | torch < 2.5.0 4 | -------------------------------------------------------------------------------- /test/utils/test_logger.py: -------------------------------------------------------------------------------- 1 | """Tests for logger components.""" 2 | 3 | import unittest 4 | 5 | from onnx_neural_compressor import logger 6 | 7 | log_msg_lst = [ 8 | "call logger log function.", 9 | {"msg": "call logger log function."}, 10 | ["call logger warning function", "done"], 11 | ("call logger warning function", "done"), 12 | # the following log will be prettified 13 | {"msg": "call logger warning function"}, 14 | {"msg": {("bert", "embedding"): {"weight": {"dtype": ["unint8", "int8"]}}}}, 15 | {"msg": {("bert", "embedding"): {"op": ("a", "b")}}}, 16 | # the following log will not be prettified 17 | [{"msg": "call logger warning function"}, {"msg2": "done"}], 18 | ({"msg": "call logger warning function"}, {"msg2": "done"}), 19 | ({"msg": [{"sub_msg": "call logger"}, {"sub_msg2": "call warning function"}]}, {"msg2": "done"}), 20 | ] 21 | 22 | 23 | class TestLogger(unittest.TestCase): 24 | 25 | def test_logger(self): 26 | 27 | for msg in log_msg_lst: 28 | logger.log(0, msg) 29 | logger.log(1, msg) 30 | logger.debug(msg) 31 | logger.error(msg) 32 | logger.fatal(msg) 33 | logger.info(msg) 34 | logger.warning(msg) 35 | 36 | 37 | if __name__ == "__main__": 38 | unittest.main() 39 | -------------------------------------------------------------------------------- /test/utils/test_param.py: -------------------------------------------------------------------------------- 1 | """Tests for tuning param components.""" 2 | 3 | import unittest 4 | from typing import List 5 | 6 | from onnx_neural_compressor.quantization import config 7 | 8 | 9 | class TestTuningParam(unittest.TestCase): 10 | 11 | def test_is_tunable_same_type(self): 12 | # Test when tunable_type has the same type as the default value 13 | param = config.TuningParam("param_name", [1, 2, 3], List[int]) 14 | 
self.assertTrue(param.is_tunable([4, 5, 6])) 15 | self.assertFalse(param.is_tunable(["not_an_int"])) 16 | 17 | def test_is_tunable_recursive(self): 18 | # Test recursive type checking for iterables 19 | param = config.TuningParam("param_name", [[1, 2], [3, 4]], List[List[int]]) 20 | self.assertTrue(param.is_tunable([[5, 6], [7, 8]])) 21 | # TODO: double check if this is the expected behavior 22 | self.assertTrue(param.is_tunable([[5, 6], [7, "8"]])) 23 | self.assertEqual( 24 | str(param), "TuningParam(name=param_name, tunable_type=typing.List[typing.List[int]], options=None)." 25 | ) 26 | 27 | 28 | if __name__ == "__main__": 29 | unittest.main() 30 | -------------------------------------------------------------------------------- /test/utils/test_utility.py: -------------------------------------------------------------------------------- 1 | """Tests for utility components.""" 2 | 3 | import os 4 | import shutil 5 | import unittest 6 | 7 | import onnx 8 | import onnxruntime 9 | import onnxruntime.tools.symbolic_shape_infer as symbolic_shape_infer 10 | import optimum.exporters.onnx 11 | 12 | from onnx_neural_compressor import onnx_model, utility 13 | 14 | 15 | def find_onnx_file(folder_path): 16 | # return first .onnx file path in folder_path 17 | for root, dirs, files in os.walk(folder_path): 18 | for file in files: 19 | if file.endswith(".onnx"): 20 | return os.path.join(root, file) 21 | return None 22 | 23 | 24 | class TestOptions(unittest.TestCase): 25 | 26 | def test_set_random_seed(self): 27 | seed = 12345 28 | utility.set_random_seed(seed) 29 | self.assertEqual(utility.options.random_seed, seed) 30 | 31 | # non int type 32 | seed = "12345" 33 | with self.assertRaises(AssertionError): 34 | utility.set_random_seed(seed) 35 | 36 | 37 | class TestCPUInfo(unittest.TestCase): 38 | 39 | def test_cpu_info(self): 40 | cpu_info = utility.CpuInfo() 41 | assert cpu_info.cores_per_socket > 0, "CPU count should be greater than 0" 42 | assert isinstance(cpu_info.bf16, bool), "bf16 should be a boolean" 43 | assert isinstance(cpu_info.vnni, bool), "avx512 should be a boolean" 44 | 45 | 46 | class TestLazyImport(unittest.TestCase): 47 | 48 | def test_lazy_import(self): 49 | # Test import 50 | pydantic = utility.LazyImport("pydantic") 51 | assert pydantic.__name__ == "pydantic", "pydantic should be imported" 52 | 53 | def test_lazy_import_error(self): 54 | # Test import error 55 | with self.assertRaises(ImportError): 56 | non_existent_module = utility.LazyImport("non_existent_module") 57 | non_existent_module.non_existent_function() 58 | 59 | 60 | class TestSingletonDecorator: 61 | 62 | def test_singleton_decorator(self): 63 | 64 | @utility.singleton 65 | class TestSingleton: 66 | 67 | def __init__(self): 68 | self.value = 0 69 | 70 | instance = TestSingleton() 71 | instance.value = 1 72 | instance2 = TestSingleton() 73 | assert instance2.value == 1, "Singleton should return the same instance" 74 | 75 | 76 | class TestGetVersion(unittest.TestCase): 77 | 78 | def test_get_version(self): 79 | from onnx_neural_compressor import version 80 | 81 | self.assertTrue(isinstance(version.__version__, str)) 82 | 83 | 84 | class TestUtilityFunctions(unittest.TestCase): 85 | 86 | def test_check_value(self): 87 | src = [1, 2, 3] 88 | supported_type = int 89 | supported_value = [1, 2, 3] 90 | result = utility.check_value("name", src, supported_type, supported_value) 91 | self.assertTrue(result) 92 | 93 | src = [1, 2, 3] 94 | supported_type = list 95 | with self.assertRaises(AssertionError) as cm: 96 | 
utility.check_value("name", src, supported_type) 97 | self.assertEqual( 98 | str(cm.exception), 99 | "Type of 'name' items should be but not [, , ]", 100 | ) 101 | 102 | src = 1 103 | supported_type = list 104 | with self.assertRaises(AssertionError) as cm: 105 | utility.check_value("name", src, supported_type) 106 | self.assertEqual(str(cm.exception), "Type of 'name' should be but not ") 107 | 108 | src = "a" 109 | supported_type = str 110 | supported_value = ["b"] 111 | with self.assertRaises(AssertionError) as cm: 112 | utility.check_value("name", src, supported_type, supported_value) 113 | self.assertEqual(str(cm.exception), "'a' is not in supported 'name': ['b']. Skip setting it.") 114 | 115 | src = ["a"] 116 | supported_type = str 117 | supported_value = ["b"] 118 | with self.assertRaises(AssertionError) as cm: 119 | utility.check_value("name", src, supported_type, supported_value) 120 | self.assertEqual(str(cm.exception), "['a'] is not in supported 'name': ['b']. Skip setting it.") 121 | 122 | 123 | if __name__ == "__main__": 124 | unittest.main() 125 | --------------------------------------------------------------------------------