├── pipeline
│   ├── requirements.txt
│   ├── README.md
│   └── pipeline.py
├── requirements.txt
├── assets
│   ├── performance.png
│   ├── operator_fusion.png
│   ├── vit-performance.png
│   ├── vision-transformer-architecture.webp
│   └── sentence-transfomeres-performance.png
├── scripts
│   └── text-classification.py
├── README.md
├── .gitignore
├── vision-transformers.ipynb
├── sentence-transformers.ipynb
└── notebook.ipynb
/pipeline/requirements.txt:
--------------------------------------------------------------------------------
1 | optimum[onnxruntime]==1.2.3
2 | mkl-include
3 | mkl
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | optimum[onnxruntime]==1.2.3
2 | evaluate
3 | sklearn
4 | mkl-include
5 | mkl
6 | optuna
--------------------------------------------------------------------------------
/assets/performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philschmid/optimum-transformers-optimizations/HEAD/assets/performance.png
--------------------------------------------------------------------------------
/assets/operator_fusion.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philschmid/optimum-transformers-optimizations/HEAD/assets/operator_fusion.png
--------------------------------------------------------------------------------
/assets/vit-performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philschmid/optimum-transformers-optimizations/HEAD/assets/vit-performance.png
--------------------------------------------------------------------------------
/assets/vision-transformer-architecture.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philschmid/optimum-transformers-optimizations/HEAD/assets/vision-transformer-architecture.webp
--------------------------------------------------------------------------------
/assets/sentence-transfomeres-performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philschmid/optimum-transformers-optimizations/HEAD/assets/sentence-transfomeres-performance.png
--------------------------------------------------------------------------------
/pipeline/README.md:
--------------------------------------------------------------------------------
1 | # Text Classification Optimum API Template
2 | 
3 | This is a template repository for text classification with Optimum and ONNX Runtime, supporting generic inference with the Hugging Face Hub Inference API. There are two required steps:
4 | 
5 | 1. Specify the requirements by defining a `requirements.txt` file.
6 | 2. Implement the `pipeline.py` `__init__` and `__call__` methods. These methods are called by the Inference API. The `__init__` method should load the Optimum model and tokenizer and build the `text-classification` pipeline needed for inference; it is called only once. The `__call__` method performs the actual inference. Make sure to follow the same input/output specifications defined in the template for the pipeline to work.
7 | 
8 | Additionally, add
9 | ```
10 | library_name: generic
11 | ```
12 | to the model card metadata in the repository `README.md`.
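13 | 
14 | To sanity-check the implementation locally before pushing the repository to the Hub, you can call the pipeline class from `pipeline.py` directly. The snippet below is only a minimal sketch: it assumes you have already exported an ONNX model, its config, and the tokenizer files into a local directory (here called `onnx`, e.g. created with `save_pretrained()`), and the example input and label are placeholders.
15 | 
16 | ```python
17 | # Minimal local smoke test for the template (a sketch, not part of the Inference API itself).
18 | from pipeline import PreTrainedPipeline
19 | 
20 | # "onnx" is assumed to contain model.onnx (or a quantized variant), config.json and the
21 | # tokenizer files, e.g. produced by ORTModelForSequenceClassification.save_pretrained("onnx").
22 | pipe = PreTrainedPipeline(path="onnx")
23 | 
24 | # The Inference API passes the raw request payload to __call__ as `inputs`.
25 | prediction = pipe("I lost my card, can you help me block it?")
26 | print(prediction)  # e.g. [{"label": "...", "score": 0.98}] (label names depend on the model)
27 | ```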
--------------------------------------------------------------------------------
/pipeline/pipeline.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List, Any
2 | from optimum.onnxruntime import ORTModelForSequenceClassification
3 | from transformers import pipeline, AutoTokenizer
4 | 
5 | 
6 | class PreTrainedPipeline():
7 |     def __init__(self, path=""):
8 |         # load the optimized model
9 |         model = ORTModelForSequenceClassification.from_pretrained(path)
10 |         tokenizer = AutoTokenizer.from_pretrained(path)
11 |         # create inference pipeline
12 |         self.pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer)
13 | 
14 | 
15 |     def __call__(self, inputs: Any) -> List[List[Dict[str, Any]]]:
16 |         """
17 |         Args:
18 |             inputs (:obj:`str`):
19 |                 a string containing some text
20 |         Return:
21 |             A :obj:`list`. The object returned should be a list of one list like [[{"label": "LABEL_0", "score": 0.99}]] containing:
22 |                 - "label": A string representing what the label/class is. There can be multiple labels.
23 |                 - "score": A score between 0 and 1 describing how confident the model is for this label/class.
24 |         """
25 |         # run inference with the text-classification pipeline
26 |         return self.pipeline(inputs)
--------------------------------------------------------------------------------
/scripts/text-classification.py:
--------------------------------------------------------------------------------
1 | from optimum.onnxruntime import ORTOptimizer
2 | from optimum.onnxruntime.configuration import OptimizationConfig
3 | from optimum.onnxruntime import ORTQuantizer
4 | from optimum.onnxruntime.configuration import AutoQuantizationConfig
5 | from optimum.onnxruntime import ORTModelForSequenceClassification
6 | from transformers import AutoTokenizer
7 | from pathlib import Path
8 | model_id = "optimum/distilbert-base-uncased-finetuned-banking77"
9 | dataset_id = "banking77"
10 | dynamic_onnx_path = Path("dynamic_onnx")
11 | 
12 | # load vanilla transformers and convert to onnx
13 | model = ORTModelForSequenceClassification.from_pretrained(model_id, from_transformers=True)
14 | tokenizer = AutoTokenizer.from_pretrained(model_id)
15 | 
16 | # save onnx checkpoint and tokenizer
17 | model.save_pretrained(dynamic_onnx_path)
18 | tokenizer.save_pretrained(dynamic_onnx_path)
19 | 
20 | # create ORTOptimizer and define optimization configuration
21 | dynamic_optimizer = ORTOptimizer.from_pretrained(model_id, feature=model.pipeline_task)
22 | dynamic_optimization_config = OptimizationConfig(optimization_level=99)  # enable all optimizations
23 | 
24 | # apply the optimization configuration to the model
25 | dynamic_optimizer.export(
26 |     onnx_model_path=dynamic_onnx_path / "model.onnx",
27 |     onnx_optimized_model_output_path=dynamic_onnx_path / "model-optimized.onnx",
28 |     optimization_config=dynamic_optimization_config,
29 | )
30 | 
31 | # create ORTQuantizer and define quantization configuration
32 | dynamic_quantizer = ORTQuantizer.from_pretrained(model_id, feature=model.pipeline_task)
33 | dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=True)
34 | 
35 | # apply the quantization configuration to the model
36 | dynamic_quantizer.export(
37 |     onnx_model_path=dynamic_onnx_path / "model-optimized.onnx",
38 |     onnx_quantized_model_output_path=dynamic_onnx_path / "model-quantized.onnx",
39 |     quantization_config=dqconfig,
40 | )
41 | 
42 | import os
43 | 
44 | # get model file size
45 | size = os.path.getsize(dynamic_onnx_path / "model.onnx") / (1024 * 1024)
46 | print(f"Vanilla Onnx Model file size: {size:.2f} MB")
47 | size = os.path.getsize(dynamic_onnx_path / "model-quantized.onnx") / (1024 * 1024)
48 | print(f"Quantized Onnx Model file size: {size:.2f} MB")
49 | 
50 | from optimum.onnxruntime import ORTModelForSequenceClassification
51 | from transformers import pipeline, AutoTokenizer
52 | 
53 | model = ORTModelForSequenceClassification.from_pretrained(dynamic_onnx_path, file_name="model-quantized.onnx")
54 | 
55 | dynamic_clx = pipeline("text-classification", model=model, tokenizer=dynamic_quantizer.tokenizer)
56 | 
57 | from evaluate import evaluator
58 | from datasets import load_dataset
59 | 
60 | task_evaluator = evaluator("text-classification")
61 | eval_dataset = load_dataset("banking77", split="test")
62 | 
63 | results = task_evaluator.compute(
64 |     model_or_pipeline=dynamic_clx,
65 |     data=eval_dataset,
66 |     metric="accuracy",
67 |     input_column="text",
68 |     label_column="label",
69 |     label_mapping=model.config.label2id,
70 |     strategy="simple",
71 | )
72 | print(results)
73 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Optimizing Transformers with Optimum
2 | 
3 | ## Examples
4 | 
5 | * [DistilBERT](./notebook.ipynb)
6 | * [Vision Transformers](./vision-transformers.ipynb)
7 | * [Sentence Transformers](./sentence-transformers.ipynb)
8 | 
9 | 
10 | In this session, you will learn how to optimize Hugging Face Transformers models using Optimum. The session will show you how to dynamically quantize and optimize a DistilBERT model using [Hugging Face Optimum](https://huggingface.co/docs/optimum/index) and [ONNX Runtime](https://onnxruntime.ai/). Hugging Face Optimum is an extension of 🤗 Transformers that provides a set of performance optimization tools to train and run models on targeted hardware with maximum efficiency.
11 | 
12 | Note: dynamic quantization is currently only supported for CPUs, so we will not be utilizing GPUs / CUDA in this session.
13 | 
14 | By the end of this session, you will see how quantization and optimization with Hugging Face Optimum can significantly reduce model latency while keeping almost 100% of the full-precision model's accuracy. Furthermore, you'll see how to easily apply some of the advanced quantization and optimization techniques shown here so that your models take a much smaller accuracy hit than they would otherwise.
15 | 
16 | You will learn how to:
17 | 1. Setup Development Environment
18 | 2. Convert a Hugging Face `Transformers` model to ONNX for inference
19 | 3. Apply graph optimization techniques to the ONNX model
20 | 4. Apply dynamic quantization using `ORTQuantizer` from 🤗 Optimum
21 | 5. Test inference with the quantized model
22 | 6. Evaluate the performance and speed
23 | 7. Push the quantized model to the Hub
24 | 8. Load and run inference with a quantized model from the Hub
25 | 
26 | Let's get started!
🚀 27 | 28 | 29 | 30 | --- 31 | 32 | ## Setup 33 | 34 | 35 | ### [Miniconda](https://waylonwalker.com/install-miniconda/#installing-miniconda-on-linux) or [Micromamba](https://labs.epi2me.io/conda-or-mamba-for-production/) setup (conda alternative but smaller) 36 | 37 | Miniconda 38 | ```bash 39 | mkdir -p ~/miniconda3 40 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda3/miniconda.sh 41 | bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3 42 | rm -rf ~/miniconda3/miniconda.sh 43 | ~/miniconda3/bin/conda init bash 44 | ~/miniconda3/bin/conda init zsh 45 | ``` 46 | ### Install python dependencies 47 | 48 | ```bash 49 | pip install -r requirements.txt 50 | ``` 51 | 52 | # Text Classification Optimum API Template 53 | 54 | This is a template repository for Text Classification using Optimum and onnxruntime to support generic inference with Hugging Face Hub generic Inference API. There are two required steps: 55 | 56 | 1. Specify the requirements by defining a `requirements.txt` file. 57 | 2. Implement the `pipeline.py` `__init__` and `__call__` methods. These methods are called by the Inference API. The `__init__` method should load the model and preload the optimum model and tokenizers as well as the `text-classification` pipeline needed for inference. This is only called once. The `__call__` method performs the actual inference. Make sure to follow the same input/output specifications defined in the template for the pipeline to work. 58 | 59 | add 60 | ``` 61 | library_name: generic 62 | ``` 63 | to the readme. 64 | 65 | _note: the `generic` community image currently only support `inputs` as parameter and no parameter._ -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | 162 | onnx/ 163 | dynamic_onnx/ -------------------------------------------------------------------------------- /vision-transformers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Accelerate Vision Transformer (ViT) with Quantization using Optimum\n", 8 | "\n", 9 | "In this session, you will learn how to optimize Vision Transformers models using Optimum. The session will show you how to dynamically quantize and optimize a ViT model using [Hugging Face Optimum](https://huggingface.co/docs/optimum/index) and [ONNX Runtime](https://onnxruntime.ai/). Hugging Face Optimum is an extension of 🤗 Transformers, providing a set of performance optimization tools enabling maximum efficiency to train and run models on targeted hardware.\n", 10 | "\n", 11 | "Note: dynamic quantization is currently only supported for CPUs, so we will not be utilizing GPUs / CUDA in this session.\n", 12 | "\n", 13 | "By the end of this session, you see how quantization and optimization with Hugging Face Optimum can result in significant increase in model latency while keeping almost 100% of the full-precision model. Furthermore, you’ll see how to easily apply some advanced quantization and optimization techniques shown here so that your models take much less of an accuracy hit than they would otherwise. \n", 14 | "\n", 15 | "You will learn how to:\n", 16 | "1. Setup Development Environment\n", 17 | "2. Convert a Hugging Face `Transformers` model to ONNX for inference\n", 18 | "3. Apply dynamic quantization using `ORTQuantizer` from Optimum\n", 19 | "4. Test inference with the quantized model\n", 20 | "5. Evaluate the performance and speed\n", 21 | "\n", 22 | "Let's get started! 🚀\n", 23 | "\n", 24 | "_This tutorial was created and run on an c6i.xlarge AWS EC2 Instance._\n", 25 | "\n", 26 | "---\n", 27 | "\n", 28 | "## Quick intro: Vision Transformer (ViT) by Google Brain\n", 29 | "\n", 30 | "The Vision Transformer (ViT) is basically BERT, but applied to images. It attains excellent results compared to state-of-the-art convolutional networks. In order to provide images to the model, each image is split into a sequence of fixed-size patches (typically of resolution 16x16 or 32x32), which are linearly embedded. One also adds a [CLS] token at the beginning of the sequence in order to classify images. Next, one adds absolute position embeddings and provides this sequence to the Transformer encoder.\n", 31 | "\n", 32 | "![vision-transformer-architecture](./assets/vision-transformer-architecture.webp)\n", 33 | "\n", 34 | "- Paper: https://arxiv.org/abs/2010.11929\n", 35 | "- Official repo (in JAX): https://github.com/google-research/vision_transformer" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## 1. Setup Development Environment\n", 43 | "\n", 44 | "Our first step is to install Optimum, along with Evaluate and some other libraries. 
Running the following cell will install all the required packages for us including Transformers, PyTorch, and ONNX Runtime utilities:" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "!pip install \"optimum[onnxruntime]==1.5.0\" evaluate[evaluator] sklearn mkl-include mkl --upgrade" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "> If you want to run inference on a GPU, you can install 🤗 Optimum with `pip install optimum[onnxruntime-gpu]`.\n" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## 2. Convert a Hugging Face `Transformers` model to ONNX for inference\n", 68 | "\n", 69 | "Before we can start qunatizing we need to convert our vanilla `transformers` model to the `onnx` format. To do this we will use the new [ORTModelForImageClassification](https://huggingface.co/docs/optimum/onnxruntime/modeling_ort#optimum.onnxruntime.ORTModelForImageClassification) class calling the `from_pretrained()` method with the `from_transformers` attribute. The model we are using is the a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the [beans](https://huggingface.co/datasets/beans) dataset ([nateraw/vit-base-beans](https://huggingface.co/nateraw/vit-base-beans)) achieving an accuracy of 96.88%.\n", 70 | "\n" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 6, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "name": "stderr", 80 | "output_type": "stream", 81 | "text": [ 82 | "/home/ubuntu/miniconda3/lib/python3.9/site-packages/transformers/models/vit/modeling_vit.py:172: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", 83 | " if num_channels != self.num_channels:\n", 84 | "/home/ubuntu/miniconda3/lib/python3.9/site-packages/transformers/models/vit/modeling_vit.py:177: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs!\n", 85 | " if height != self.image_size[0] or width != self.image_size[1]:\n" 86 | ] 87 | }, 88 | { 89 | "data": { 90 | "text/plain": [ 91 | "['onnx/preprocessor_config.json']" 92 | ] 93 | }, 94 | "execution_count": 6, 95 | "metadata": {}, 96 | "output_type": "execute_result" 97 | } 98 | ], 99 | "source": [ 100 | "from optimum.onnxruntime import ORTModelForImageClassification\n", 101 | "from transformers import AutoFeatureExtractor\n", 102 | "from pathlib import Path\n", 103 | "\n", 104 | "\n", 105 | "model_id=\"nateraw/vit-base-beans\"\n", 106 | "onnx_path = Path(\"onnx\")\n", 107 | "\n", 108 | "# load vanilla transformers and convert to onnx\n", 109 | "model = ORTModelForImageClassification.from_pretrained(model_id, from_transformers=True)\n", 110 | "preprocessor = AutoFeatureExtractor.from_pretrained(model_id)\n", 111 | "\n", 112 | "# save onnx checkpoint and tokenizer\n", 113 | "model.save_pretrained(onnx_path)\n", 114 | "preprocessor.save_pretrained(onnx_path)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "One neat thing about 🤗 Optimum, is that allows you to run ONNX models with the `pipeline()` function from 🤗 Transformers. This means that you get all the pre- and post-processing features for free, without needing to re-implement them for each model! Here's how you can run inference with our vanilla ONNX model:\n", 122 | "\n", 123 | "`https://datasets-server.huggingface.co/assets/beans/--/default/validation/30/image/image.jpg` \n", 124 | "![test-image](https://datasets-server.huggingface.co/assets/beans/--/default/validation/30/image/image.jpg)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 3, 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "name": "stdout", 134 | "output_type": "stream", 135 | "text": [ 136 | "[{'score': 0.9405876994132996, 'label': 'angular_leaf_spot'}, {'score': 0.03206056356430054, 'label': 'bean_rust'}, {'score': 0.02735181152820587, 'label': 'healthy'}]\n" 137 | ] 138 | } 139 | ], 140 | "source": [ 141 | "from transformers import pipeline\n", 142 | "\n", 143 | "vanilla_clf = pipeline(\"image-classification\", model=model, feature_extractor=preprocessor)\n", 144 | "print(vanilla_clf(\"https://datasets-server.huggingface.co/assets/beans/--/default/validation/30/image/image.jpg\"))" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "If you want to learn more about exporting transformers model check-out [Convert Transformers to ONNX with Hugging Face Optimum](https://www.philschmid.de/convert-transformers-to-onnx) blog post\n", 152 | "\n" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "## 3. 
Apply dynamic quantization using `ORTQuantizer` from Optimum\n", 160 | "\n", 161 | "The `ORTQuantizer` can be used to apply dynamic quantization to decrease the size of the model size and accelerate latency and inference.\n", 162 | "\n", 163 | "_We use the `avx512_vnni` config since the instance is powered by an intel ice-lake CPU supporting avx512._" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 7, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "from optimum.onnxruntime import ORTQuantizer\n", 173 | "from optimum.onnxruntime.configuration import AutoQuantizationConfig\n", 174 | "\n", 175 | "# create ORTQuantizer and define quantization configuration\n", 176 | "dynamic_quantizer = ORTQuantizer.from_pretrained(model)\n", 177 | "dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)\n", 178 | "\n", 179 | "# apply the quantization configuration to the model\n", 180 | "model_quantized_path = dynamic_quantizer.quantize(\n", 181 | " save_dir=onnx_path,\n", 182 | " quantization_config=dqconfig,\n", 183 | ")" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "Lets quickly check the new model size." 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 8, 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "name": "stdout", 200 | "output_type": "stream", 201 | "text": [ 202 | "Model file size: 327.40 MB\n", 203 | "Quantized Model file size: 84.49 MB\n" 204 | ] 205 | } 206 | ], 207 | "source": [ 208 | "import os\n", 209 | "\n", 210 | "# get model file size\n", 211 | "size = os.path.getsize(onnx_path / \"model.onnx\")/(1024*1024)\n", 212 | "quantized_model = os.path.getsize(onnx_path / \"model_quantized.onnx\")/(1024*1024)\n", 213 | "\n", 214 | "print(f\"Model file size: {size:.2f} MB\")\n", 215 | "print(f\"Quantized Model file size: {quantized_model:.2f} MB\")" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "## 4. Test inference with the quantized model\n", 223 | "\n", 224 | "[Optimum](https://huggingface.co/docs/optimum/main/en/pipelines#optimizing-with-ortoptimizer) has built-in support for [transformers pipelines](https://huggingface.co/docs/transformers/main/en/main_classes/pipelines#pipelines). This allows us to leverage the same API that we know from using PyTorch and TensorFlow models.\n", 225 | "Therefore we can load our quantized model with `ORTModelForImageClassification` class and transformers `pipeline`." 
226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 9, 231 | "metadata": {}, 232 | "outputs": [ 233 | { 234 | "name": "stdout", 235 | "output_type": "stream", 236 | "text": [ 237 | "[{'score': 0.9412522912025452, 'label': 'angular_leaf_spot'}, {'score': 0.031623296439647675, 'label': 'bean_rust'}, {'score': 0.027124471962451935, 'label': 'healthy'}]\n" 238 | ] 239 | } 240 | ], 241 | "source": [ 242 | "from optimum.onnxruntime import ORTModelForImageClassification\n", 243 | "from transformers import pipeline, AutoFeatureExtractor\n", 244 | "\n", 245 | "model = ORTModelForImageClassification.from_pretrained(onnx_path, file_name=\"model_quantized.onnx\")\n", 246 | "preprocessor = AutoFeatureExtractor.from_pretrained(onnx_path)\n", 247 | "\n", 248 | "q8_clf = pipeline(\"image-classification\", model=model, feature_extractor=preprocessor)\n", 249 | "print(q8_clf(\"https://datasets-server.huggingface.co/assets/beans/--/default/validation/30/image/image.jpg\"))" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "## 5. Evaluate the performance and speed\n", 257 | "\n", 258 | "To evaluate the model performance and speed are we going to use a the `test` split of the [beans](https://huggingface.co/datasets/beans) dataset containing only 3 classes ('angular_leaf_spot', 'bean_rust', 'healthy') and 128 images. The evaluation was done by using [Huggingface/evaluate](https://huggingface.co/docs/evaluate/index) a library for easily evaluating machine learning models and datasets.\n", 259 | "\n", 260 | "We evaluated the vanilla model outside of this example using the same `evaluator` with the vanilla model achieving an accuraccy of `96.88%` on our dataset. \n" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 10, 266 | "metadata": {}, 267 | "outputs": [ 268 | { 269 | "data": { 270 | "application/vnd.jupyter.widget-view+json": { 271 | "model_id": "b0917986e5ae4f428372289004f62a3c", 272 | "version_major": 2, 273 | "version_minor": 0 274 | }, 275 | "text/plain": [ 276 | "Downloading builder script: 0%| | 0.00/1.43k [00:00 If you want to run inference on a GPU, you can install 🤗 Optimum with `pip install optimum[onnxruntime-gpu]`.\n" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## Convert a Sentence Transformers model to ONNX and create custom Inference Pipeline\n", 66 | "\n", 67 | "Before we can start qunatizing we need to convert our vanilla `sentence-transformers` model to the `onnx` format. To do this we will use the new [ORTModelForFeatureExtraction](https://huggingface.co/docs/optimum/main/en/onnxruntime/modeling_ort#optimum.onnxruntime.ORTModelForFeatureExtraction) class calling the `from_pretrained()` method with the `from_transformers` attribute. The model we are using is the [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) which maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search and was trained on the [1-billion sentence dataset](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2#training-data). 
" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "from optimum.onnxruntime import ORTModelForFeatureExtraction\n", 77 | "from transformers import AutoTokenizer\n", 78 | "from pathlib import Path\n", 79 | "\n", 80 | "\n", 81 | "model_id=\"sentence-transformers/all-MiniLM-L6-v2\"\n", 82 | "onnx_path = Path(\"onnx\")\n", 83 | "\n", 84 | "# load vanilla transformers and convert to onnx\n", 85 | "model = ORTModelForFeatureExtraction.from_pretrained(model_id, from_transformers=True)\n", 86 | "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", 87 | "\n", 88 | "# save onnx checkpoint and tokenizer\n", 89 | "model.save_pretrained(onnx_path)\n", 90 | "tokenizer.save_pretrained(onnx_path)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "When using `sentence-transformers` natively you can run inference by loading your model in the `SentenceTransformer` class and then calling the `.encode()` method. However this only works with the PyTorch based checkpoints, which we no longer have. To run inference using the Optimum `ORTModelForFeatureExtraction` class, we need to write some methods ourselves. Below we create a `SentenceEmbeddingPipeline` based on [\"How to create a custom pipeline?\"](https://huggingface.co/docs/transformers/v4.21.0/en/add_new_pipeline) from the Transformers documentation." 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 4, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "from transformers import Pipeline\n", 107 | "import torch.nn.functional as F\n", 108 | "import torch \n", 109 | "\n", 110 | "# copied from the model card\n", 111 | "def mean_pooling(model_output, attention_mask):\n", 112 | " token_embeddings = model_output[0] #First element of model_output contains all token embeddings\n", 113 | " input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()\n", 114 | " return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)\n", 115 | "\n", 116 | "\n", 117 | "class SentenceEmbeddingPipeline(Pipeline):\n", 118 | " def _sanitize_parameters(self, **kwargs):\n", 119 | " # we don't have any hyperameters to sanitize\n", 120 | " preprocess_kwargs = {}\n", 121 | " return preprocess_kwargs, {}, {}\n", 122 | " \n", 123 | " def preprocess(self, inputs):\n", 124 | " encoded_inputs = self.tokenizer(inputs, padding=True, truncation=True, return_tensors='pt')\n", 125 | " return encoded_inputs\n", 126 | "\n", 127 | " def _forward(self, model_inputs):\n", 128 | " outputs = self.model(**model_inputs)\n", 129 | " return {\"outputs\": outputs, \"attention_mask\": model_inputs[\"attention_mask\"]}\n", 130 | "\n", 131 | " def postprocess(self, model_outputs):\n", 132 | " # Perform pooling\n", 133 | " sentence_embeddings = mean_pooling(model_outputs[\"outputs\"], model_outputs['attention_mask'])\n", 134 | " # Normalize embeddings\n", 135 | " sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)\n", 136 | " return sentence_embeddings\n", 137 | " \n" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "We can now initialize our `SentenceEmbeddingPipeline` using our `ORTModelForFeatureExtraction` model and perform inference." 
145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 5, 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "name": "stdout", 154 | "output_type": "stream", 155 | "text": [ 156 | "tensor([-0.0631, 0.0426, 0.0037, 0.0377, 0.0414])\n" 157 | ] 158 | } 159 | ], 160 | "source": [ 161 | "# init pipeline \n", 162 | "vanilla_emb = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)\n", 163 | "\n", 164 | "# run inference\n", 165 | "pred = vanilla_emb(\"Could you assist me in finding my lost card?\")\n", 166 | "\n", 167 | "# print an excerpt from the sentence embedding\n", 168 | "print(pred[0][:5])" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "If you want to learn more about exporting transformers model check-out [Convert Transformers to ONNX with Hugging Face Optimum](https://www.philschmid.de/convert-transformers-to-onnx) blog post\n", 176 | "\n" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "## 3. Apply graph optimization techniques to the ONNX model\n", 184 | "\n", 185 | "Graph optimizations are essentially graph-level transformations, ranging from small graph simplifications and node eliminations to more complex node fusions and layout optimizations. \n", 186 | "Examples of graph optimizations include:\n", 187 | "* **Constant folding**: evaluate constant expressions at compile time instead of runtime\n", 188 | "* **Redundant node elimination**: remove redundant nodes without changing graph structure\n", 189 | "* **Operator fusion**: merge one node (i.e. operator) into another so they can be executed together\n", 190 | "\n", 191 | "\n", 192 | "![operator fusion](./assets/operator_fusion.png)\n", 193 | "\n", 194 | "If you want to learn more about graph optimization you take a look at the [ONNX Runtime documentation](https://onnxruntime.ai/docs/performance/graph-optimizations.html). We are going to first optimize the model and then dynamically quantize to be able to use transformers specific operators such as QAttention for quantization of attention layers.\n", 195 | "To apply graph optimizations to our ONNX model, we will use the `ORTOptimizer()`. The `ORTOptimizer` makes it with the help of a `OptimizationConfig` easy to optimize. The `OptimizationConfig` is the configuration class handling all the ONNX Runtime optimization parameters. " 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 9, 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "name": "stderr", 205 | "output_type": "stream", 206 | "text": [ 207 | "2022-11-18 14:35:05.036849905 [W:onnxruntime:, inference_session.cc:1458 Initialize] Serializing optimized model with Graph Optimization level greater than ORT_ENABLE_EXTENDED and the NchwcTransformer enabled. 
The generated model may contain hardware specific optimizations, and should only be used in the same environment the model was optimized in.\n" 208 | ] 209 | }, 210 | { 211 | "data": { 212 | "text/plain": [ 213 | "PosixPath('onnx')" 214 | ] 215 | }, 216 | "execution_count": 9, 217 | "metadata": {}, 218 | "output_type": "execute_result" 219 | } 220 | ], 221 | "source": [ 222 | "from optimum.onnxruntime import ORTOptimizer\n", 223 | "from optimum.onnxruntime.configuration import OptimizationConfig\n", 224 | "\n", 225 | "# create ORTOptimizer and define optimization configuration\n", 226 | "optimizer = ORTOptimizer.from_pretrained(model)\n", 227 | "optimization_config = OptimizationConfig(optimization_level=99) # enable all optimizations\n", 228 | "\n", 229 | "# apply the optimization configuration to the model\n", 230 | "optimizer.optimize(\n", 231 | " save_dir=onnx_path,\n", 232 | " optimization_config=optimization_config,\n", 233 | ")" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "To test performance we can use the ORTModelForSequenceClassification class again and provide an additional `file_name` parameter to load our optimized model. _(This also works for models available on the hub)._" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 10, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "name": "stdout", 250 | "output_type": "stream", 251 | "text": [ 252 | "tensor([-0.0631, 0.0426, 0.0037, 0.0377, 0.0414])\n" 253 | ] 254 | } 255 | ], 256 | "source": [ 257 | "from optimum.onnxruntime import ORTModelForFeatureExtraction\n", 258 | "\n", 259 | "# load optimized model\n", 260 | "model = ORTModelForFeatureExtraction.from_pretrained(onnx_path, file_name=\"model_optimized.onnx\")\n", 261 | "\n", 262 | "# create optimized pipeline\n", 263 | "optimized_emb = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)\n", 264 | "pred = optimized_emb(\"Could you assist me in finding my lost card?\")\n", 265 | "print(pred[0][:5])" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "## 4. Apply dynamic quantization using `ORTQuantizer` from Optimum\n", 273 | "\n", 274 | "After we have optimized our model we can accelerate it even more by quantizing it using the `ORTQuantizer`. The `ORTQuantizer` can be used to apply dynamic quantization to decrease the size of the model size and accelerate latency and inference.\n", 275 | "\n", 276 | "_We use the `avx512_vnni` config since the instance is powered by an intel ice-lake CPU supporting avx512._" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 11, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "from optimum.onnxruntime import ORTQuantizer\n", 286 | "from optimum.onnxruntime.configuration import AutoQuantizationConfig\n", 287 | "\n", 288 | "# create ORTQuantizer and define quantization configuration\n", 289 | "dynamic_quantizer = ORTQuantizer.from_pretrained(model)\n", 290 | "dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)\n", 291 | "\n", 292 | "# apply the quantization configuration to the model\n", 293 | "model_quantized_path = dynamic_quantizer.quantize(\n", 294 | " save_dir=onnx_path,\n", 295 | " quantization_config=dqconfig,\n", 296 | ")" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "Lets quickly check the new model size." 
304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 12, 309 | "metadata": {}, 310 | "outputs": [ 311 | { 312 | "name": "stdout", 313 | "output_type": "stream", 314 | "text": [ 315 | "Model file size: 86.66 MB\n", 316 | "Quantized Model file size: 63.49 MB\n" 317 | ] 318 | } 319 | ], 320 | "source": [ 321 | "import os\n", 322 | "\n", 323 | "# get model file size\n", 324 | "size = os.path.getsize(onnx_path / \"model_optimized.onnx\")/(1024*1024)\n", 325 | "quantized_model = os.path.getsize(onnx_path / \"model_optimized_quantized.onnx\")/(1024*1024)\n", 326 | "\n", 327 | "print(f\"Model file size: {size:.2f} MB\")\n", 328 | "print(f\"Quantized Model file size: {quantized_model:.2f} MB\")" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "## 5. Test inference with the quantized model\n", 336 | "\n", 337 | "[Optimum](https://huggingface.co/docs/optimum/main/en/pipelines#optimizing-with-ortoptimizer) has built-in support for [transformers pipelines](https://huggingface.co/docs/transformers/main/en/main_classes/pipelines#pipelines). This allows us to leverage the same API that we know from using PyTorch and TensorFlow models.\n", 338 | "Therefore we can load our quantized model with `ORTModelForSequenceClassification` class and transformers `pipeline`." 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 15, 344 | "metadata": {}, 345 | "outputs": [ 346 | { 347 | "name": "stdout", 348 | "output_type": "stream", 349 | "text": [ 350 | "tensor([-0.0567, 0.0111, -0.0110, 0.0450, 0.0447])\n" 351 | ] 352 | } 353 | ], 354 | "source": [ 355 | "from optimum.onnxruntime import ORTModelForFeatureExtraction\n", 356 | "from transformers import AutoTokenizer\n", 357 | "\n", 358 | "model = ORTModelForFeatureExtraction.from_pretrained(onnx_path, file_name=\"model_optimized_quantized.onnx\")\n", 359 | "tokenizer = AutoTokenizer.from_pretrained(onnx_path)\n", 360 | "\n", 361 | "q8_emb = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)\n", 362 | "\n", 363 | "pred = q8_emb(\"Could you assist me in finding my lost card?\")\n", 364 | "print(pred[0][:5])" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": {}, 370 | "source": [ 371 | "## 6. Evaluate the performance and speed\n", 372 | "\n", 373 | "As the last step, we want to take a detailed look at the performance and accuracy of our model. Applying optimization techniques, like graph optimizations or mixed-precision not only impact performance (latency) those also might have an impact on the accuracy of the model. So accelerating your model comes with a trade-off.\n", 374 | "\n", 375 | "We are going to evaluate our Sentence Transformers model / Sentence Embeddings on the [Semantic Textual Similarity Benchmark](https://huggingface.co/datasets/glue/viewer/stsb/validation) from the [GLUE](https://huggingface.co/datasets/glue) dataset. \n", 376 | "\n", 377 | "The Semantic Textual Similarity Benchmark (Cer et al., 2017) is a collection of sentence pairs drawn from news headlines, video and image captions, and natural language inference data. Each pair is human-annotated with a similarity score from 1 to 5." 
378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "from datasets import load_dataset\n", 387 | "from evaluate import load\n", 388 | "\n", 389 | "eval_dataset = load_dataset(\"glue\",\"stsb\",split=\"validation\")\n", 390 | "metric = load('glue', 'stsb')\n", 391 | "\n", 392 | "# creating a subset for faster evaluation\n", 393 | "# COMMENT IN to run evaluation on a subset of the dataset\n", 394 | "# eval_dataset = eval_dataset.select(range(200))" 395 | ] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "metadata": {}, 400 | "source": [ 401 | "We can now leverage the [map](https://huggingface.co/docs/datasets/v2.1.0/en/process#map) function of [datasets](https://huggingface.co/docs/datasets/index) to iterate over the validation set of `stsb` and run prediction for each data point. Therefore we write a `evaluate` helper method which uses our `SentenceEmbeddingsPipeline` and `sentence-transformers` helper methods." 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "def compute_sentence_similarity(sentence_1, sentence_2,pipeline):\n", 411 | " embedding_1 = pipeline(sentence_1)\n", 412 | " embedding_2 = pipeline(sentence_2)\n", 413 | " # compute cosine similarity between two sentences\n", 414 | " return torch.nn.functional.cosine_similarity(embedding_1, embedding_2, dim=1)\n", 415 | "\n", 416 | "\n", 417 | "def evaluate_stsb(example):\n", 418 | " default = compute_sentence_similarity(example[\"sentence1\"], example[\"sentence2\"], vanilla_emb)\n", 419 | " quantized = compute_sentence_similarity(example[\"sentence1\"], example[\"sentence2\"], q8_emb)\n", 420 | " return {\n", 421 | " 'reference': (example[\"label\"] - 1) / (5 - 1), # rescale to [0,1]\n", 422 | " 'default': float(default),\n", 423 | " 'quantized': float(quantized),\n", 424 | " }\n", 425 | "\n", 426 | "result = eval_dataset.map(evaluate_stsb)" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 18, 432 | "metadata": {}, 433 | "outputs": [ 434 | { 435 | "name": "stdout", 436 | "output_type": "stream", 437 | "text": [ 438 | "vanilla model: pearson=0.8696194683311959%\n", 439 | "quantized model: pearson=0.8663451627961507%\n", 440 | "The quantized model achieves 100.00% accuracy of the fp32 model\n" 441 | ] 442 | } 443 | ], 444 | "source": [ 445 | "default_acc = metric.compute(predictions=result[\"default\"], references=result[\"reference\"])\n", 446 | "quantized = metric.compute(predictions=result[\"quantized\"], references=result[\"reference\"])\n", 447 | "\n", 448 | "print(f\"vanilla model: pearson={default_acc['pearson']}%\")\n", 449 | "print(f\"quantized model: pearson={quantized['pearson']}%\")\n", 450 | "print(f\"The quantized model achieves {round(quantized['pearson']/default_acc['pearson'],2)*100:.2f}% accuracy of the fp32 model\")" 451 | ] 452 | }, 453 | { 454 | "cell_type": "markdown", 455 | "metadata": {}, 456 | "source": [ 457 | "Okay, now let's test the performance (latency) of our quantized model. We are going to use a payload with a sequence length of 128 for the benchmark. 
To keep it simple, we are going to use a python loop and calculate the avg,mean & p95 latency for our vanilla model and for the quantized model.\n", 458 | "\n" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [ 467 | "from time import perf_counter\n", 468 | "import numpy as np \n", 469 | "\n", 470 | "payload=\"Hello, my name is Philipp and I live in Nuremberg, Germany. Currently I am working as a Technical Lead at Hugging Face to democratize artificial intelligence through open source and open science. In the past I designed and implemented cloud-native machine learning architectures for fin-tech and insurance companies. I found my passion for cloud concepts and machine learning 5 years ago. Since then I never stopped learning. Currently, I am focusing myself in the area NLP and how to leverage models like BERT, Roberta, T5, ViT, and GPT2 to generate business value. I cannot wait to see what is next for me\"\n", 471 | "print(f'Payload sequence length: {len(tokenizer(payload)[\"input_ids\"])}')\n", 472 | "\n", 473 | "def measure_latency(pipe):\n", 474 | " latencies = []\n", 475 | " # warm up\n", 476 | " for _ in range(10):\n", 477 | " _ = pipe(payload)\n", 478 | " # Timed run\n", 479 | " for _ in range(300):\n", 480 | " start_time = perf_counter()\n", 481 | " _ = pipe(payload)\n", 482 | " latency = perf_counter() - start_time\n", 483 | " latencies.append(latency)\n", 484 | " # Compute run statistics\n", 485 | " time_avg_ms = 1000 * np.mean(latencies)\n", 486 | " time_std_ms = 1000 * np.std(latencies)\n", 487 | " time_p95_ms = 1000 * np.percentile(latencies,95)\n", 488 | " return f\"P95 latency (ms) - {time_p95_ms}; Average latency (ms) - {time_avg_ms:.2f} +\\- {time_std_ms:.2f};\", time_p95_ms\n", 489 | "\n", 490 | "\n", 491 | "vanilla_model=measure_latency(vanilla_emb)\n", 492 | "quantized_model=measure_latency(q8_emb)\n", 493 | "\n", 494 | "print(f\"Vanilla model: {vanilla_model[0]}\")\n", 495 | "print(f\"Quantized model: {quantized_model[0]}\")\n", 496 | "print(f\"Improvement through quantization: {round(vanilla_model[1]/quantized_model[1],2)}x\")" 497 | ] 498 | }, 499 | { 500 | "cell_type": "markdown", 501 | "metadata": {}, 502 | "source": [ 503 | "We managed to accelerate our model latency from 25.6ms to 12.3ms or 2.09x while keeping 100% of the accuracy on the `stsb` dataset. \n", 504 | "\n", 505 | "![performance](assets/sentence-transfomeres-performance.png)" 506 | ] 507 | }, 508 | { 509 | "cell_type": "markdown", 510 | "metadata": {}, 511 | "source": [ 512 | "## Conclusion\n", 513 | "\n", 514 | "We successfully quantized our vanilla Transformers model with Hugging Face and managed to accelerate our model latency from 25.6ms to 12.3ms or 2.09x while keeping 100% of the accuracy on the `stsb` dataset. \n", 515 | "\n", 516 | "But I have to say that this isn't a plug and play process you can transfer to any Transformers model, task or dataset. 
" 517 | ] 518 | }, 519 | { 520 | "cell_type": "markdown", 521 | "metadata": {}, 522 | "source": [] 523 | } 524 | ], 525 | "metadata": { 526 | "kernelspec": { 527 | "display_name": "Python 3.9.13 ('dev')", 528 | "language": "python", 529 | "name": "python3" 530 | }, 531 | "language_info": { 532 | "codemirror_mode": { 533 | "name": "ipython", 534 | "version": 3 535 | }, 536 | "file_extension": ".py", 537 | "mimetype": "text/x-python", 538 | "name": "python", 539 | "nbconvert_exporter": "python", 540 | "pygments_lexer": "ipython3", 541 | "version": "3.9.13" 542 | }, 543 | "vscode": { 544 | "interpreter": { 545 | "hash": "a40944fb6d302ad2eace17cfbb714ee95a1e6c7ab311709595ca70171602490b" 546 | } 547 | } 548 | }, 549 | "nbformat": 4, 550 | "nbformat_minor": 2 551 | } 552 | -------------------------------------------------------------------------------- /notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Optimizing Transformers with Optimum\n", 8 | "\n", 9 | "In this session, you will learn how to optimize Hugging Face Transformers models using Optimum. The session will show you how to dynamically quantize and optimize a DistilBERT model using [Hugging Face Optimum](https://huggingface.co/docs/optimum/index) and [ONNX Runtime](https://onnxruntime.ai/). Hugging Face Optimum is an extension of 🤗 Transformers, providing a set of performance optimization tools enabling maximum efficiency to train and run models on targeted hardware.\n", 10 | "\n", 11 | "Note: dynamic quantization is currently only supported for CPUs, so we will not be utilizing GPUs / CUDA in this session.\n", 12 | "\n", 13 | "By the end of this session, you see how quantization and optimization with Hugging Face Optimum can result in significant increase in model latency while keeping almost 100% of the full-precision model. Furthermore, you’ll see how to easily apply some advanced quantization and optimization techniques shown here so that your models take much less of an accuracy hit than they would otherwise. \n", 14 | "\n", 15 | "You will learn how to:\n", 16 | "1. Setup Development Environment\n", 17 | "2. Convert a Hugging Face `Transformers` model to ONNX for inference\n", 18 | "3. Apply graph optimization techniques to the ONNX model\n", 19 | "4. Apply dynamic quantization using `ORTQuantizer` from Optimum\n", 20 | "5. Test inference with the quantized model\n", 21 | "6. Evaluate the performance and speed\n", 22 | "7. Push the quantized model to the Hub\n", 23 | "8. Load and run inference with a quantized model from the hub\n", 24 | "\n", 25 | "Let's get started! 🚀\n", 26 | "\n", 27 | "_This tutorial was created and run on an c6i.xlarge AWS EC2 Instance._\n", 28 | "\n", 29 | "---" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "## 1. Setup Development Environment\n", 37 | "\n", 38 | "Our first step is to install Optimum, along with Evaluate and some other libraries. 
Running the following cell will install all the required packages for us including Transformers, PyTorch, and ONNX Runtime utilities:" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "!pip install \"optimum[onnxruntime]==1.5.0\" evaluate[evaluator] sklearn mkl-include mkl --upgrade" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "> If you want to run inference on a GPU, you can install 🤗 Optimum with `pip install optimum[onnxruntime-gpu]`.\n" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## 2. Convert a Hugging Face `Transformers` model to ONNX for inference\n", 62 | "\n", 63 | "Before we can start qunatizing we need to convert our vanilla `transformers` model to the `onnx` format. To do this we will use the new [ORTModelForSequenceClassification](https://huggingface.co/docs/optimum/main/en/onnxruntime/modeling_ort#optimum.onnxruntime.ORTModelForSequenceClassification) class calling the `from_pretrained()` method with the `from_transformers` attribute. The model we are using is the [optimum/distilbert-base-uncased-finetuned-banking77](https://huggingface.co/optimum/distilbert-base-uncased-finetuned-banking77) a fine-tuned DistilBERT model on the Banking77 dataset achieving an Accuracy score of `92.5` and as the feature (task) `text-classification`.\n", 64 | "\n" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 2, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "application/vnd.jupyter.widget-view+json": { 75 | "model_id": "7fcdcba4efb846a8a9d26790f6ee2f58", 76 | "version_major": 2, 77 | "version_minor": 0 78 | }, 79 | "text/plain": [ 80 | "Downloading: 0%| | 0.00/5.81k [00:00