├── pytorch ├── benchmark_tools │ ├── common │ │ ├── __init__.py │ │ └── common_mlperf.py │ ├── inference │ │ ├── caffe │ │ │ ├── __init__.py │ │ │ └── proto │ │ │ │ └── __init__.py │ │ ├── __init__.py │ │ ├── models │ │ │ ├── resnet50_v1 │ │ │ │ └── __model_def │ │ │ ├── resnext101_32x4d │ │ │ │ └── __model_def │ │ │ ├── resnet50 │ │ │ │ └── __model_def │ │ │ └── __init__.py │ │ └── calibration_int8.py │ └── run_caffe2.py ├── imagenet │ └── imagenet │ │ ├── requirements.txt │ │ ├── run_inference_cpu_accuracy.sh │ │ ├── run_inference_cpu_multi_instance_latency.sh │ │ ├── run_inference_cpu_multi_instance.sh │ │ └── README.md ├── dlrm │ ├── dlrm │ │ ├── input │ │ │ ├── trace.log │ │ │ ├── dist_emb_0.log │ │ │ ├── dist_emb_1.log │ │ │ └── dist_emb_2.log │ │ ├── requirements.txt │ │ ├── CODE_OF_CONDUCT.md │ │ ├── cython │ │ │ ├── cython_compile.py │ │ │ └── cython_criteo.py │ │ ├── LICENSE │ │ ├── CONTRIBUTING.md │ │ ├── bench │ │ │ ├── dlrm_s_criteo_kaggle.sh │ │ │ ├── dlrm_s_criteo_terabyte.sh │ │ │ └── dlrm_s_benchmark.sh │ │ ├── test │ │ │ └── dlrm_s_test.sh │ │ └── quorem │ │ │ └── qr_embedding_bag.py │ └── README.md ├── RESNET50V1.md ├── distributed │ └── README.md ├── README.md ├── ResNet50 │ └── README.md └── ResNext101_32x4d │ └── README.md ├── mxnet ├── wide_deep_criteo │ ├── launch_train.sh │ ├── launch_inference.sh │ ├── getdata.sh │ ├── model.py │ ├── data.py │ ├── train.py │ ├── README.md │ ├── inference.py │ ├── wd_gen_qsym_subgraph.py │ ├── wd_gen_qsym_subgraph_update.py │ └── update_model │ │ └── embedding-fuse.json └── blog │ ├── mxnet_v1.5_release │ ├── single-instance-rnn-mxnet-1.5.sh │ ├── single-instance-rnn-mxnet-mkl1.5.sh │ ├── 2instance-rnn-mxnet1.5.sh │ ├── 2instance-rnn-mxnet-mkl1.5.sh │ ├── rnn_benchmark.py │ ├── single-instance-cnn-mxnet-1.5.sh │ └── single-instance-fp32-cnn-mxnet-mkl1.5.sh │ └── medium_vnni │ ├── ec2_benchmark_base.sh │ └── ec2_benchmark_int8.sh ├── README.md └── third-party-programs.txt /pytorch/benchmark_tools/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pytorch/benchmark_tools/inference/caffe/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pytorch/benchmark_tools/inference/caffe/proto/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pytorch/imagenet/imagenet/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision 3 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/input/trace.log: -------------------------------------------------------------------------------- 1 | 1, 2, 3, 4, 5, 3, 4, 1, 1, 6, 3 2 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/requirements.txt: -------------------------------------------------------------------------------- 1 | future 2 | numpy 3 | onnx 4 | pydot 5 | torch 6 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/input/dist_emb_0.log: -------------------------------------------------------------------------------- 1 | 1, 2, 3, 4, 5, 6 2 | 0, 1, 3, 4, 5 3 | 0.55, 0.64, 0.82, 0.91, 1.0 4 | 
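The three `dist_emb_*.log` inputs here each hold three comma-separated rows; the third row is monotonically increasing and ends at 1.0, so it reads like a cumulative distribution over the values in the second row. A minimal sketch of loading such a file and sampling from it under that assumption (the `load_dist_log` helper and the row semantics are assumptions, not documented in the repo):

```python
import numpy as np

def load_dist_log(path):
    """Parse a dist_emb_*.log into one float array per non-empty line."""
    with open(path) as f:
        return [np.array([float(v) for v in line.split(",")])
                for line in f if line.strip()]

rows = load_dist_log("pytorch/dlrm/dlrm/input/dist_emb_0.log")
support, cdf = rows[1], rows[2]  # assumed: values and their cumulative probabilities
samples = support[np.searchsorted(cdf, np.random.rand(8))]  # inverse-CDF sampling
```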
-------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/input/dist_emb_1.log: -------------------------------------------------------------------------------- 1 | 1, 2, 3, 4, 5, 6 2 | 0, 1, 3, 4, 5 3 | 0.55, 0.64, 0.82, 0.91, 1.0 4 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/input/dist_emb_2.log: -------------------------------------------------------------------------------- 1 | 1, 2, 3, 4, 5, 6 2 | 0, 1, 3, 4, 5 3 | 0.55, 0.64, 0.82, 0.91, 1.0 4 | -------------------------------------------------------------------------------- /mxnet/wide_deep_criteo/launch_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 4 | export OMP_NUM_THREADS=56 5 | 6 | python train.py --batch-size=1024 --data-dir=./data 7 | -------------------------------------------------------------------------------- /pytorch/benchmark_tools/inference/__init__.py: -------------------------------------------------------------------------------- 1 | """init file of inference""" 2 | from inference.inference_caffe2 import Run 3 | from inference.inference_caffe2 import PrintNetDef 4 | from inference.calibration_int8 import Calibration 5 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to. 4 | Please read the [full text](https://code.fb.com/codeofconduct/) 5 | so that you can understand what actions will and will not be tolerated. 6 | -------------------------------------------------------------------------------- /mxnet/wide_deep_criteo/launch_inference.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 4 | export OMP_NUM_THREADS=28 5 | 6 | echo 7 | echo "Running inference in benchmark mode..." 8 | numactl --physcpubind=0-27 --membind=0 python inference.py 9 | 10 | echo 11 | echo "Running inference in accuracy mode..."
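# As in benchmark mode above, pin the run to cores 0-27 (assumed to be socket 0)
# and to that socket's local memory; the extra --accuracy True flag selects
# inference.py's accuracy-checking path instead of the pure benchmark run, as
# the echoes above indicate.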
12 | numactl --physcpubind=0-27 --membind=0 python inference.py --accuracy True 13 | 14 | -------------------------------------------------------------------------------- /pytorch/benchmark_tools/inference/models/resnet50_v1/__model_def: -------------------------------------------------------------------------------- 1 | [Model Name] 2 | ResNet50_v1 3 | 4 | [Model Type] 5 | Caffe legacy 6 | 7 | [Output Type] 8 | Possibility 9 | 10 | [Model Description] 11 | Model definition for Caffe2 12 | 13 | [Init Net] 14 | init_net.pb 15 | 16 | [Init Net Int8] 17 | init_net_int8.pb 18 | 19 | [Predict Net] 20 | predict_net.pb 21 | 22 | [Predict Net Int8] 23 | predict_net_int8.pb 24 | 25 | [Onnx Model] 26 | resnet50_onnx.pb 27 | 28 | [Crop Size] 29 | 224 30 | 31 | [Image Mean] 32 | 104 117 123 33 | 34 | [Scale] 35 | 0.0078125 36 | 37 | [Train Proto] 38 | 39 | 40 | [Deploy Proto] 41 | 42 | 43 | -------------------------------------------------------------------------------- /mxnet/wide_deep_criteo/getdata.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | echo 4 | echo "Starting to download the large Criteo dataset; this might take a long time" 5 | echo 6 | 7 | DATA_DIR="./data" 8 | if [[ ! -d "${DATA_DIR}" ]]; then 9 | echo "${DATA_DIR} doesn't exist, will create one"; 10 | mkdir -p data 11 | fi 12 | 13 | #training set 14 | echo "Downloading the training dataset..." 15 | wget -P ./data https://storage.googleapis.com/dataset-uploader/criteo-kaggle/large_version/train.csv; 16 | 17 | #validation set 18 | echo "Downloading the validation dataset..." 19 | wget -P ./data https://storage.googleapis.com/dataset-uploader/criteo-kaggle/large_version/eval.csv; 20 | -------------------------------------------------------------------------------- README.md: -------------------------------------------------------------------------------- 1 | DISCONTINUATION OF PROJECT 2 | 3 | This project will no longer be maintained by Intel. 4 | 5 | Intel has ceased development and contributions including, but not limited to, maintenance, bug fixes, new releases, or updates, to this project. 6 | 7 | Intel no longer accepts patches to this project. 8 | 9 | If you have an ongoing need to use this project, are interested in independently developing it, or would like to maintain patches for the open source software community, please create your own fork of this project. 10 | 11 | Contact: webadmin@linux.intel.com 12 | optimized-models 13 | ================== 14 | 15 | Intel-optimized models for easy reproduction by users.
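The `__model_def` files in this tree (resnet50_v1 above, resnext101_32x4d and resnet50 below) use a simple layout: a `[Section]` header line followed by its value on the next line. `pytorch/benchmark_tools/inference/models/__init__.py`, shown later in this listing, reads each field with `defs.index(header) + 1`. A minimal sketch of that lookup (the `read_field` helper is illustrative, not part of the repo):

```python
def read_field(lines, header, default=None):
    """Return the line right after `[header]`, or `default` if the section is absent."""
    key = "[%s]" % header
    return lines[lines.index(key) + 1] if key in lines else default

with open("inference/models/resnet50_v1/__model_def") as f:
    lines = [line.rstrip("\n") for line in f]

crop_size = int(read_field(lines, "Crop Size"))   # 224
scale = float(read_field(lines, "Scale"))         # 0.0078125
init_int8 = read_field(lines, "Init Net Int8")    # init_net_int8.pb
```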
16 | -------------------------------------------------------------------------------- /pytorch/benchmark_tools/inference/models/resnext101_32x4d/__model_def: -------------------------------------------------------------------------------- 1 | [Model Name] 2 | ResNeXt101_32x4d 3 | 4 | [Model Type] 5 | Normal 6 | 7 | [Output Type] 8 | Possibility 9 | 10 | [Model Description] 11 | Model definition for Caffe2 12 | 13 | [Init Net] 14 | init_net.pb 15 | 16 | [Init Net Int8] 17 | init_onnx_int8.pb 18 | 19 | [Predict Net] 20 | predict_net.pb 21 | 22 | [Predict Net Int8] 23 | predict_onnx_int8.pb 24 | 25 | [Onnx Model] 26 | resnext101_32x4d.onnx 27 | 28 | [Crop Size] 29 | 224 30 | 31 | [Image Mean] 32 | 0.485 0.456 0.406 33 | 34 | [Scale] 35 | 4.3668 4.4643 4.4444 36 | 37 | [Need Normalize] 38 | true 39 | 40 | [Color Format] 41 | RGB 42 | 43 | [Train Proto] 44 | 45 | 46 | [Deploy Proto] 47 | 48 | 49 | -------------------------------------------------------------------------------- /pytorch/benchmark_tools/inference/models/resnet50/__model_def: -------------------------------------------------------------------------------- 1 | [Model Name] 2 | ResNet50 3 | 4 | [Model Type] 5 | Caffe legacy 6 | 7 | [Output Type] 8 | Possibility 9 | 10 | [Model Description] 11 | Model definition for Caffe2 12 | 13 | [Init Net] 14 | init_net.pb 15 | 16 | [Init Net Int8] 17 | init_onnx_int8.pb 18 | 19 | [Predict Net] 20 | predict_net.pb 21 | 22 | [Predict Net Int8] 23 | predict_onnx_int8.pb 24 | 25 | [Onnx Model] 26 | resnet50.onnx 27 | 28 | [Crop Size] 29 | 224 30 | 31 | [Image Mean] 32 | 0.485 0.456 0.406 33 | 34 | [Scale] 35 | 4.36681223 4.46428571 4.44444444 36 | 37 | [Need Normalize] 38 | true 39 | 40 | [Color Format] 41 | RGB 42 | 43 | [Train Proto] 44 | 45 | 46 | [Deploy Proto] 47 | 48 | 49 | [Model Source] 50 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/cython/cython_compile.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | # 6 | # Description: compile .so from python code 7 | 8 | from __future__ import absolute_import, division, print_function, unicode_literals 9 | 10 | from setuptools import setup 11 | from Cython.Build import cythonize 12 | from distutils.extension import Extension 13 | 14 | ext_modules = [ 15 | Extension( 16 | "data_utils_cython", 17 | ["data_utils_cython.pyx"], 18 | extra_compile_args=['-O3'], 19 | extra_link_args=['-O3'], 20 | ) 21 | ] 22 | 23 | setup( 24 | name='data_utils_cython', 25 | ext_modules=cythonize(ext_modules) 26 | ) 27 | -------------------------------------------------------------------------------- /mxnet/blog/mxnet_v1.5_release/single-instance-rnn-mxnet-1.5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "MXNet Model FP32 single-instance LSTM Inference Performance" 4 | echo "Testing FP32 base models" 5 | echo "Installing mxnet 1.5" 6 | pip install mxnet 7 | 8 | export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 9 | export vCPUs=`cat /proc/cpuinfo | grep processor | wc -l` 10 | export OMP_NUM_THREADS=$((vCPUs / 4)) 11 | echo "Test with OMP_NUM_THREADS="$OMP_NUM_THREADS 12 | 13 | echo "-----LSTM FP32 4-layers inference-----" 14 | numactl --cpunodebind=0 --physcpubind=0-$((OMP_NUM_THREADS-1)) --membind=0 python rnn_benchmark.py --cell_type lstm --layer_num 4 15 | echo "-----LSTM FP32 8-layers inference-----" 16 | numactl --cpunodebind=0 --physcpubind=0-$((OMP_NUM_THREADS-1)) --membind=0 python rnn_benchmark.py --cell_type lstm --layer_num 8 17 | 18 | -------------------------------------------------------------------------------- /mxnet/blog/mxnet_v1.5_release/single-instance-rnn-mxnet-mkl1.5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "MXNet Model FP32 single-instance LSTM Inference Performance" 4 | echo "Testing FP32 base models" 5 | echo "Installing mxnet-mkl 1.5" 6 | pip install mxnet-mkl 7 | 8 | export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 9 | export vCPUs=`cat /proc/cpuinfo | grep processor | wc -l` 10 | export OMP_NUM_THREADS=$((vCPUs / 4)) 11 | echo "Test with OMP_NUM_THREADS="$OMP_NUM_THREADS 12 | 13 | echo "-----LSTM FP32 4-layers inference-----" 14 | numactl --cpunodebind=0 --physcpubind=0-$((OMP_NUM_THREADS-1)) --membind=0 python rnn_benchmark.py --cell_type lstm --layer_num 4 15 | echo "-----LSTM FP32 8-layers inference-----" 16 | numactl --cpunodebind=0 --physcpubind=0-$((OMP_NUM_THREADS-1)) --membind=0 python rnn_benchmark.py --cell_type lstm --layer_num 8 17 | 18 | -------------------------------------------------------------------------------- /mxnet/blog/mxnet_v1.5_release/2instance-rnn-mxnet1.5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "MXNet Model FP32 multi-instance LSTM Inference Performance" 4 | echo "Testing FP32 base models" 5 | echo "Installing mxnet1.5" 6 | pip install mxnet 7 | 8 | export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 9 | 10 | echo "-----LSTM FP32 4-layers multi-instance inference-----" 11 | OMP_NUM_THREADS=24 numactl --cpunodebind=0 --physcpubind=0-23 --membind=0 python rnn_benchmark.py --cell_type lstm --layer_num 4 & 12 | OMP_NUM_THREADS=24 numactl --cpunodebind=1 --physcpubind=24-47 --membind=1 python rnn_benchmark.py --cell_type lstm --layer_num 4 13 | 14 | echo "-----LSTM FP32 8-layers multi-instance inference-----" 15 | OMP_NUM_THREADS=24 numactl --cpunodebind=0 
--physcpubind=0-23 --membind=0 python rnn_benchmark.py --cell_type lstm --layer_num 8 & 16 | OMP_NUM_THREADS=24 numactl --cpunodebind=1 --physcpubind=24-47 --membind=1 python rnn_benchmark.py --cell_type lstm --layer_num 8 17 | -------------------------------------------------------------------------------- /mxnet/blog/mxnet_v1.5_release/2instance-rnn-mxnet-mkl1.5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "MXNet Model FP32 multi-instance LSTM Inference Performance" 4 | echo "Testing FP32 base models" 5 | echo "Installing mxnet-mkl 1.5" 6 | pip install mxnet-mkl 7 | 8 | export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 9 | 10 | echo "-----LSTM FP32 4-layers multi-instance inference-----" 11 | OMP_NUM_THREADS=24 numactl --cpunodebind=0 --physcpubind=0-23 --membind=0 python rnn_benchmark.py --cell_type lstm --layer_num 4 & 12 | OMP_NUM_THREADS=24 numactl --cpunodebind=1 --physcpubind=24-47 --membind=1 python rnn_benchmark.py --cell_type lstm --layer_num 4 13 | 14 | echo "-----LSTM FP32 8-layers multi-instance inference-----" 15 | OMP_NUM_THREADS=24 numactl --cpunodebind=0 --physcpubind=0-23 --membind=0 python rnn_benchmark.py --cell_type lstm --layer_num 8 & 16 | OMP_NUM_THREADS=24 numactl --cpunodebind=1 --physcpubind=24-47 --membind=1 python rnn_benchmark.py --cell_type lstm --layer_num 8 17 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Facebook, Inc. and its affiliates. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to DLRM 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `master`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 
14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Facebook's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## Coding Style 30 | * 4 spaces for indentation rather than tabs 31 | * 80 character line length 32 | * in general, please maintain a consistent style with the rest of the code 33 | 34 | ## License 35 | By contributing to DLRM, you agree that your contributions will be licensed 36 | under the LICENSE file in the root directory of this source tree. 37 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/bench/dlrm_s_criteo_kaggle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | #WARNING: must have compiled PyTorch and caffe2 8 | 9 | #check if extra argument is passed to the test 10 | if [[ $# == 1 ]]; then 11 | dlrm_extra_option=$1 12 | else 13 | dlrm_extra_option="" 14 | fi 15 | #echo $dlrm_extra_option 16 | 17 | dlrm_pt_bin="python dlrm_s_pytorch.py" 18 | dlrm_c2_bin="python dlrm_s_caffe2.py" 19 | 20 | echo "run pytorch ..." 21 | # WARNING: the following parameters will be set based on the data set 22 | # --arch-embedding-size=... (sparse feature sizes) 23 | # --arch-mlp-bot=... (the input to the first layer of bottom mlp) 24 | $dlrm_pt_bin --arch-sparse-feature-size=16 --arch-mlp-bot="13-512-256-64-16" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --raw-data-file=./input/train.txt --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time --test-mini-batch-size=16384 --test-num-workers=16 $dlrm_extra_option 2>&1 | tee run_kaggle_pt.log 25 | 26 | echo "run caffe2 ..." 27 | # WARNING: the following parameters will be set based on the data set 28 | # --arch-embedding-size=... (sparse feature sizes) 29 | # --arch-mlp-bot=... (the input to the first layer of bottom mlp) 30 | $dlrm_c2_bin --arch-sparse-feature-size=16 --arch-mlp-bot="13-512-256-64-16" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --raw-data-file=./input/train.txt --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time $dlrm_extra_option 2>&1 | tee run_kaggle_c2.log 31 | 32 | echo "done" 33 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/bench/dlrm_s_criteo_terabyte.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | #WARNING: must have compiled PyTorch and caffe2 8 | 9 | #check if extra argument is passed to the test 10 | if [[ $# == 1 ]]; then 11 | dlrm_extra_option=$1 12 | else 13 | dlrm_extra_option="" 14 | fi 15 | #echo $dlrm_extra_option 16 | 17 | dlrm_pt_bin="python dlrm_s_pytorch.py" 18 | dlrm_c2_bin="python dlrm_s_caffe2.py" 19 | 20 | echo "run pytorch ..." 21 | # WARNING: the following parameters will be set based on the data set 22 | # --arch-embedding-size=... (sparse feature sizes) 23 | # --arch-mlp-bot=... (the input to the first layer of bottom mlp) 24 | $dlrm_pt_bin --arch-sparse-feature-size=64 --arch-mlp-bot="13-512-256-64" --arch-mlp-top="512-512-256-1" --max-ind-range=10000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=2048 --print-freq=1024 --print-time --test-mini-batch-size=16384 --test-num-workers=16 $dlrm_extra_option 2>&1 | tee run_terabyte_pt.log 25 | 26 | echo "run caffe2 ..." 27 | # WARNING: the following parameters will be set based on the data set 28 | # --arch-embedding-size=... (sparse feature sizes) 29 | # --arch-mlp-bot=... (the input to the first layer of bottom mlp) 30 | $dlrm_c2_bin --arch-sparse-feature-size=64 --arch-mlp-bot="13-512-256-64" --arch-mlp-top="512-512-256-1" --max-ind-range=10000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=2048 --print-freq=1024 --print-time $dlrm_extra_option 2>&1 | tee run_terabyte_c2.log 31 | 32 | echo "done" 33 | -------------------------------------------------------------------------------- /pytorch/imagenet/imagenet/run_inference_cpu_accuracy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | ############################################################################### 4 | ### How to run? 
5 | ### 1) install PyTorch (internal build) 6 | ### 2) install torchvision: for benchmarking ResNext101_32x4d, follow these steps: 7 | ### 1) git clone -b v0.5.0 https://github.com/pytorch/vision.git 8 | ### 2) replace original resnet.py with this folder's resnet.py 9 | ### 3) python setup.py install 10 | ### 3) conda install jemalloc 11 | ### 4) export LD_PRELOAD="/YOUR_CONDA_PATH/envs/YOUR_CONDA_ENV/lib/libjemalloc.so 12 | ### /opt/intel/compilers_and_libraries/linux/lib/intel64/libiomp5.so" 13 | ### 5) bash run_inference_cpu_accuracy.sh resnet50/resnext101_32x4d bf16 14 | ### 15 | ############################################################################### 16 | 17 | export DNNL_PRIMITIVE_CACHE_CAPACITY=1024 18 | 19 | ARGS="" 20 | if [ "$1" == "resnet50" ]; then 21 | ARGS="$ARGS resnet50" 22 | echo "### running resnet50 model" 23 | else 24 | ARGS="$ARGS resnext101_32x4d" 25 | echo "### running resnext101_32x4d model" 26 | fi 27 | 28 | data_type=$2 29 | 30 | #echo "$data_type" 31 | 32 | if [ "$2" == "bf16" ]; then 33 | ARGS="$ARGS --bf16" 34 | echo "### running bf16 datatype" 35 | fi 36 | 37 | CORES=`lscpu | grep Core | awk '{print $4}'` 38 | SOCKETS=`lscpu | grep Socket | awk '{print $2}'` 39 | TOTAL_CORES=`expr $CORES \* $SOCKETS` 40 | 41 | KMP_SETTING="KMP_AFFINITY=granularity=fine,compact,1,0" 42 | 43 | BATCH_SIZE=256 44 | 45 | export OMP_NUM_THREADS=$TOTAL_CORES 46 | export $KMP_SETTING 47 | 48 | echo -e "### using OMP_NUM_THREADS=$TOTAL_CORES" 49 | echo -e "### using $KMP_SETTING\n\n" 50 | sleep 3 51 | 52 | if [ "$1" == "resnet50" ]; then 53 | python -u main.py -e -a $ARGS --mkldnn --pretrained -j $TOTAL_CORES $DATA_PATH -b $BATCH_SIZE 54 | else 55 | python -u main.py -e -a $ARGS --mkldnn --pretrained -j $TOTAL_CORES $DATA_PATH -b $BATCH_SIZE --checkpoint-dir checkpoints/resnext101_32x4d/checkpoint.pth.tar 56 | fi 57 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/test/dlrm_s_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | #WARNING: must have compiled PyTorch and caffe2 8 | 9 | #check if extra argument is passed to the test 10 | if [[ $# == 1 ]]; then 11 | dlrm_extra_option=$1 12 | else 13 | dlrm_extra_option="" 14 | fi 15 | #echo $dlrm_extra_option 16 | 17 | dlrm_py="python dlrm_s_pytorch.py" 18 | dlrm_c2="python dlrm_s_caffe2.py" 19 | 20 | echo "Running commands ..."
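# The four runs below push the same tiny configurations (growing mini-batch
# size, data size, and epoch count, all with --debug-mode) through both the
# PyTorch and the Caffe2 implementation; the outputs (ppp* vs. ccc*) are
# diffed pairwise at the end, and a correct run shows no numeric differences.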
21 | #run pytorch 22 | echo $dlrm_py 23 | $dlrm_py --mini-batch-size=1 --data-size=1 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp1 24 | $dlrm_py --mini-batch-size=2 --data-size=4 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp2 25 | $dlrm_py --mini-batch-size=2 --data-size=5 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp3 26 | $dlrm_py --mini-batch-size=2 --data-size=5 --nepochs=3 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp4 27 | 28 | #run caffe2 29 | echo $dlrm_c2 30 | $dlrm_c2 --mini-batch-size=1 --data-size=1 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ccc1 31 | $dlrm_c2 --mini-batch-size=2 --data-size=4 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ccc2 32 | $dlrm_c2 --mini-batch-size=2 --data-size=5 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ccc3 33 | $dlrm_c2 --mini-batch-size=2 --data-size=5 --nepochs=3 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ccc4 34 | 35 | echo "Checking results ..." 36 | #check results 37 | #WARNING: a correct test will have no difference in numeric values 38 | #(but might have some verbal difference, e.g. due to warnings) 39 | #in the output file 40 | echo "diff test1 (no numeric values in the output = SUCCESS)" 41 | diff ccc1 ppp1 42 | echo "diff test2 (no numeric values in the output = SUCCESS)" 43 | diff ccc2 ppp2 44 | echo "diff test3 (no numeric values in the output = SUCCESS)" 45 | diff ccc3 ppp3 46 | echo "diff test4 (no numeric values in the output = SUCCESS)" 47 | diff ccc4 ppp4 48 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/cython/cython_criteo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | # Description: run dataset pre-processing in standalone mode 7 | # WARNING: These steps are required to work with Cython 8 | # 1. Install Cython 9 | # > sudo yum install Cython 10 | # 2. Please copy data_utils.py into data_utils_cython.pyx 11 | # 3. Compile the data_utils_cython.pyx to generate .so 12 | # (it's important to keep the extension .pyx rather than .py 13 | # so that the C/C++ .so, not the .py, is loaded at import time) 14 | # > python cython_compile.py build_ext --inplace 15 | # This should create data_utils_cython.so, which can be loaded below with "import" 16 | # 4. Run standalone dataset preprocessing to generate .npz files 17 | # a. Kaggle 18 | # > python cython_criteo.py --data-set=kaggle --raw-data-file=./input/train.txt 19 | # --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz 20 | # b.
Terabyte 21 | # > python cython_criteo.py --max-ind-range=10000000 [--memory-map] --data-set=terabyte 22 | # --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz 23 | 24 | from __future__ import absolute_import, division, print_function, unicode_literals 25 | 26 | import data_utils_cython as duc 27 | 28 | if __name__ == "__main__": 29 | ### import packages ### 30 | import argparse 31 | 32 | ### parse arguments ### 33 | parser = argparse.ArgumentParser( 34 | description="Preprocess Criteo dataset" 35 | ) 36 | # model related parameters 37 | parser.add_argument("--max-ind-range", type=int, default=-1) 38 | parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1] 39 | parser.add_argument("--data-randomize", type=str, default="total") # or day or none 40 | parser.add_argument("--memory-map", action="store_true", default=False) 41 | parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte 42 | parser.add_argument("--raw-data-file", type=str, default="") 43 | parser.add_argument("--processed-data-file", type=str, default="") 44 | args = parser.parse_args() 45 | 46 | duc.loadDataset( 47 | args.data_set, 48 | args.max_ind_range, 49 | args.data_sub_sample_rate, 50 | args.data_randomize, 51 | "train", 52 | args.raw_data_file, 53 | args.processed_data_file, 54 | args.memory_map 55 | ) 56 | -------------------------------------------------------------------------------- /mxnet/wide_deep_criteo/model.py: -------------------------------------------------------------------------------- 1 | """Wide and Deep Model Definition""" 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 
18 | 19 | import mxnet as mx 20 | 21 | def wide_deep_model(num_linear_features, num_embed_features, num_cont_features, 22 | input_dims, hidden_units): 23 | """wide and deep model definition and generation""" 24 | # wide model 25 | csr_data = mx.symbol.Variable("csr_data", stype='csr') 26 | label = mx.symbol.Variable("softmax_label") 27 | 28 | norm_init = mx.initializer.Normal(sigma=0.01) 29 | # weight with row_sparse storage type to enable sparse gradient updates 30 | weight = mx.symbol.Variable("linear_weight", shape=(num_linear_features, hidden_units[3]), 31 | init=norm_init, stype='row_sparse') 32 | bias = mx.symbol.Variable("linear_bias", shape=(hidden_units[3],)) 33 | dot = mx.symbol.sparse.dot(csr_data, weight) 34 | linear_out = mx.symbol.broadcast_add(dot, bias) 35 | # deep model 36 | dns_data = mx.symbol.Variable("dns_data") 37 | # embedding features 38 | x = mx.symbol.slice(data=dns_data, begin=(0, 0), 39 | end=(None, num_embed_features)) 40 | embeds = mx.symbol.split(data=x, num_outputs=num_embed_features, squeeze_axis=1) 41 | # continuous features 42 | x = mx.symbol.slice(data=dns_data, begin=(0, num_embed_features), 43 | end=(None, num_embed_features + num_cont_features)) 44 | features = [x] 45 | 46 | for i, embed in enumerate(embeds): 47 | embed_weight = mx.symbol.Variable('embed_%d_weight' % i, stype='row_sparse') 48 | features.append(mx.symbol.sparse.Embedding(data=embed, weight=embed_weight, 49 | input_dim=input_dims, output_dim=hidden_units[0], sparse_grad=True)) 50 | 51 | hidden = mx.symbol.concat(*features, dim=1) 52 | hidden = mx.symbol.FullyConnected(data=hidden, num_hidden=hidden_units[1]) 53 | hidden = mx.symbol.Activation(data=hidden, act_type='relu') 54 | hidden = mx.symbol.FullyConnected(data=hidden, num_hidden=hidden_units[2]) 55 | hidden = mx.symbol.Activation(data=hidden, act_type='relu') 56 | deep_out = mx.symbol.FullyConnected(data=hidden, num_hidden=hidden_units[3]) 57 | 58 | out = mx.symbol.SoftmaxOutput(linear_out + deep_out, label, name='model') 59 | return out 60 | -------------------------------------------------------------------------------- /pytorch/imagenet/imagenet/run_inference_cpu_multi_instance_latency.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | ###################################################################### 4 | ### How to run? 5 | ### 1) install PyTorch (internal build) 6 | ### 2) install torchvision: for benchmarking ResNext101_32x4d, follow these steps: 7 | ### 1) git clone -b v0.5.0 https://github.com/pytorch/vision.git 8 | ### 2) replace original resnet.py with this folder's resnet.py 9 | ### 3) python setup.py install 10 | ### 3) conda install jemalloc 11 | ### 4) export LD_PRELOAD="/YOUR_CONDA_PATH/envs/YOUR_CONDA_ENV/lib/libjemalloc.so 12 | ### /opt/intel/compilers_and_libraries/linux/lib/intel64/libiomp5.so" 13 | ### 5) Test CPU latency (14 instances, 4 cores/instance).
Just run: 14 | ### bash run_inference_cpu_multi_instance_latency.sh resnet50/resnext101_32x4d bf16 15 | ### 16 | ###################################################################### 17 | 18 | export DNNL_PRIMITIVE_CACHE_CAPACITY=1024 19 | 20 | ARGS="" 21 | if [[ "$1" == "resnet50" ]] 22 | then 23 | ARGS="$ARGS resnet50" 24 | echo "### running resnet50 model" 25 | else 26 | ARGS="$ARGS resnext101_32x4d" 27 | echo "### running resnext101_32x4d model" 28 | fi 29 | 30 | data_type=$2 31 | 32 | echo "$data_type" 33 | 34 | if [[ "$2" == "bf16" ]] 35 | then 36 | ARGS="$ARGS --bf16" 37 | echo "### running bf16 datatype" 38 | fi 39 | 40 | CORES=`lscpu | grep Core | awk '{print $4}'` 41 | SOCKETS=`lscpu | grep Socket | awk '{print $2}'` 42 | TOTAL_CORES=`expr $CORES \* $SOCKETS` 43 | 44 | # change this number to adjust number of instances 45 | CORES_PER_INSTANCE=4 46 | 47 | KMP_SETTING="KMP_AFFINITY=granularity=fine,compact,1,0" 48 | 49 | BATCH_SIZE=1 50 | 51 | export OMP_NUM_THREADS=$CORES_PER_INSTANCE 52 | export $KMP_SETTING 53 | 54 | echo -e "### using OMP_NUM_THREADS=$CORES_PER_INSTANCE" 55 | echo -e "### using $KMP_SETTING\n\n" 56 | sleep 3 57 | 58 | INSTANCES=`expr $TOTAL_CORES / $CORES_PER_INSTANCE` 59 | LAST_INSTANCE=`expr $INSTANCES - 1` 60 | INSTANCES_PER_SOCKET=`expr $INSTANCES / $SOCKETS` 61 | for i in $(seq 1 $LAST_INSTANCE); do 62 | numa_node_i=`expr $i / $INSTANCES_PER_SOCKET` 63 | start_core_i=`expr $i \* $CORES_PER_INSTANCE` 64 | end_core_i=`expr $start_core_i + $CORES_PER_INSTANCE - 1` 65 | LOG_i=inference_cpu_bs${BATCH_SIZE}_ins${i}.txt 66 | 67 | echo "### running on instance $i, numa node $numa_node_i, core list {$start_core_i, $end_core_i}..." 68 | numactl --physcpubind=$start_core_i-$end_core_i --membind=$numa_node_i python -u main.py -e -a $ARGS \ 69 | --mkldnn --dummy -j $CORES_PER_INSTANCE $DATA_PATH -b $BATCH_SIZE 2>&1 | tee $LOG_i & 70 | done 71 | 72 | 73 | numa_node_0=0 74 | start_core_0=0 75 | end_core_0=`expr $CORES_PER_INSTANCE - 1` 76 | LOG_0=inference_cpu_bs${BATCH_SIZE}_ins0.txt 77 | 78 | echo "### running on instance 0, numa node $numa_node_0, core list {$start_core_0, $end_core_0}...\n\n" 79 | numactl --physcpubind=$start_core_0-$end_core_0 --membind=$numa_node_0 python -u main.py -e -a $ARGS \ 80 | --mkldnn --dummy -j $CORES_PER_INSTANCE $DATA_PATH -b $BATCH_SIZE 2>&1 | tee $LOG_0 81 | 82 | sleep 10 83 | echo -e "\n\n Sum sentences/s together:" 84 | for i in $(seq 0 $LAST_INSTANCE); do 85 | log=inference_cpu_bs${BATCH_SIZE}_ins${i}.txt 86 | tail -n 2 $log 87 | done 88 | -------------------------------------------------------------------------------- /pytorch/imagenet/imagenet/run_inference_cpu_multi_instance.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | ############################################################################### 4 | ### How to run? 5 | ### 1) install PyTorch (internal build) 6 | ### 2) install torchvision: for benchmarking ResNext101_32x4d, follow these steps: 7 | ### 1) git clone -b v0.5.0 https://github.com/pytorch/vision.git 8 | ### 2) replace original resnet.py with this folder's resnet.py 9 | ### 3) python setup.py install 10 | ### 3) conda install jemalloc 11 | ### 4) export LD_PRELOAD="/YOUR_CONDA_PATH/envs/YOUR_CONDA_ENV/lib/libjemalloc.so 12 | ### /opt/intel/compilers_and_libraries/linux/lib/intel64/libiomp5.so" 13 | ### 5) Test CPU throughput (2 instances, 28 cores/instance).
Just run: 14 | ### bash run_inference_cpu_multi_instance.sh resnet50/resnext101_32x4d bf16 15 | ### 16 | ############################################################################### 17 | 18 | export DNNL_PRIMITIVE_CACHE_CAPACITY=1024 19 | 20 | ARGS="" 21 | if [[ "$1" == "resnet50" ]] 22 | then 23 | ARGS="$ARGS resnet50" 24 | echo "### running resnet50 model" 25 | else 26 | ARGS="$ARGS resnext101_32x4d" 27 | echo "### running resnext101_32x4d model" 28 | fi 29 | 30 | data_type=$2 31 | 32 | echo "$data_type" 33 | 34 | if [[ "$2" == "bf16" ]] 35 | then 36 | ARGS="$ARGS --bf16" 37 | echo "### running bf16 datatype" 38 | fi 39 | 40 | if [[ "$3" == "disable-mkldnn" ]] 41 | then 42 | unset LD_PRELOAD 43 | echo "### running non mkldnn model" 44 | else 45 | ARGS="$ARGS --mkldnn" 46 | echo "### running mkldnn backend" 47 | fi 48 | 49 | CORES=`lscpu | grep Core | awk '{print $4}'` 50 | SOCKETS=`lscpu | grep Socket | awk '{print $2}'` 51 | TOTAL_CORES=`expr $CORES \* $SOCKETS` 52 | 53 | # change this number to adjust number of instances 54 | CORES_PER_INSTANCE=$CORES 55 | 56 | KMP_SETTING="KMP_AFFINITY=granularity=fine,compact,1,0" 57 | 58 | BATCH_SIZE=128 59 | 60 | export OMP_NUM_THREADS=$CORES_PER_INSTANCE 61 | export $KMP_SETTING 62 | 63 | echo -e "### using OMP_NUM_THREADS=$CORES_PER_INSTANCE" 64 | echo -e "### using $KMP_SETTING\n\n" 65 | sleep 3 66 | 67 | INSTANCES=`expr $TOTAL_CORES / $CORES_PER_INSTANCE` 68 | LAST_INSTANCE=`expr $INSTANCES - 1` 69 | INSTANCES_PER_SOCKET=`expr $INSTANCES / $SOCKETS` 70 | for i in $(seq 1 $LAST_INSTANCE); do 71 | numa_node_i=`expr $i / $INSTANCES_PER_SOCKET` 72 | start_core_i=`expr $i \* $CORES_PER_INSTANCE` 73 | end_core_i=`expr $start_core_i + $CORES_PER_INSTANCE - 1` 74 | LOG_i=inference_cpu_bs${BATCH_SIZE}_ins${i}.txt 75 | 76 | echo "### running on instance $i, numa node $numa_node_i, core list {$start_core_i, $end_core_i}..."
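# Each background instance below is pinned to its own contiguous range of
# physical cores and memory-bound to the NUMA node computed above; stdout is
# teed to a per-instance log so the per-instance throughput can be summed at
# the end of the script.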
77 | numactl --physcpubind=$start_core_i-$end_core_i --membind=$numa_node_i python -u main.py -e -a $ARGS \ 78 | --dummy -j $CORES_PER_INSTANCE $DATA_PATH -b $BATCH_SIZE 2>&1 | tee $LOG_i & 79 | done 80 | 81 | numa_node_0=0 82 | start_core_0=0 83 | end_core_0=`expr $CORES_PER_INSTANCE - 1` 84 | LOG_0=inference_cpu_bs${BATCH_SIZE}_ins0.txt 85 | 86 | echo "### running on instance 0, numa node $numa_node_0, core list {$start_core_0, $end_core_0}...\n\n" 87 | numactl --physcpubind=$start_core_0-$end_core_0 --membind=$numa_node_0 python -u main.py -e -a $ARGS \ 88 | --dummy -j $CORES_PER_INSTANCE $DATA_PATH -b $BATCH_SIZE 2>&1 | tee $LOG_0 89 | 90 | sleep 10 91 | echo -e "\n\n Sum sentences/s together:" 92 | for i in $(seq 0 $LAST_INSTANCE); do 93 | log=inference_cpu_bs${BATCH_SIZE}_ins${i}.txt 94 | tail -n 2 $log 95 | done 96 | -------------------------------------------------------------------------------- /mxnet/blog/mxnet_v1.5_release/rnn_benchmark.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import time 3 | import logging 4 | import argparse 5 | 6 | rnncell_type = ['rnn', 'lstm', 'gru', 'sru'] 7 | 8 | 9 | parser = argparse.ArgumentParser(description='MxNet RNN benchmark') 10 | parser.add_argument('--gpu', '-p', action='store_true', default=False, help="use the GPU; default is CPU") 11 | parser.add_argument('--cell_type', '-cell', type=str, default='lstm', 12 | help="cell type, can be \"LSTM, GRU, RNN, SRU\", default is LSTM.") 13 | parser.add_argument('--layer_num', '-l', type=int, default=1, help="layer num, default is 1.") 14 | 15 | 16 | warm_up = 20 17 | iter_num = 200 18 | 19 | 20 | def fused_module(input_shape, cell_type, layer_nums=1, ctx=mx.cpu(), layout="TNC"): 21 | 22 | assert cell_type in rnncell_type 23 | 24 | bs = input_shape[0] 25 | seq_len = input_shape[1] 26 | embed_dim = input_shape[2] 27 | hidden_size = input_shape[3] 28 | if layout == 'NTC': 29 | dshape = (bs, seq_len, embed_dim) 30 | elif layout == 'TNC': 31 | logging.warning('layout TNC is used!') 32 | dshape = (seq_len, bs, embed_dim) 33 | data = mx.sym.Variable('data') 34 | label = mx.sym.Variable('softmax_label') 35 | 36 | if cell_type == 'lstm': 37 | lstm_cell = mx.rnn.FusedRNNCell( 38 | hidden_size, num_layers=layer_nums, mode='lstm', get_next_state=False, prefix='l0_') 39 | rnn_sym, _ = lstm_cell.unroll( 40 | seq_len, data, layout=layout, merge_outputs=True) 41 | elif cell_type == 'gru': 42 | gru_cell = mx.rnn.FusedRNNCell(hidden_size, num_layers=layer_nums, mode='gru', prefix='l0_') 43 | rnn_sym, _ = gru_cell.unroll( 44 | seq_len, data, layout=layout, merge_outputs=True) 45 | else: raise NotImplementedError('only lstm and gru cells are implemented in this benchmark') 46 | mod = mx.mod.Module(rnn_sym, label_names=None, context=ctx) 47 | mod.bind(data_shapes=[('data', dshape)], label_shapes=None) 48 | 49 | mod.init_params(initializer=mx.init.Xavier(magnitude=2.)) 50 | mod.init_optimizer(optimizer='sgd') 51 | return mod 52 | 53 | 54 | def rnncell_score_fused(mod): 55 | batch = mx.io.DataBatch(data=[mx.random.uniform(shape=mod.data_shapes[0][1])], label=[]) 56 | tic = time.time() 57 | 58 | mod.forward(batch, is_train=False) 59 | output = mod.get_outputs()[0] 60 | output.wait_to_read() 61 | 62 | fwd = time.time() - tic 63 | return fwd 64 | 65 | 66 | if __name__ == '__main__': 67 | 68 | ''' 69 | cell: unidirection-lstm 70 | hidden_size: 512/1024 71 | BS: 1/32 72 | sentence length/time step: 50/ 73 | layers: 1/4 74 | 75 | ''' 76 | # [bs, sequence length, embedding size, hidden size] 77 | input_shape_list = [[1, 50, 512, 512], [1, 50, 1024, 1024],
[32, 50, 512, 512], [32, 50, 1024, 1024]] 78 | 79 | logging.basicConfig(level=logging.INFO) 80 | args = parser.parse_args() 81 | if args.gpu: 82 | ctx = mx.gpu(0) 83 | else: 84 | ctx = mx.cpu() 85 | 86 | cell = args.cell_type 87 | layer_nums = args.layer_num 88 | 89 | logging.warning('Fused RNN API Inference benchmarking started') 90 | 91 | 92 | for input_shape in input_shape_list: 93 | total_fwd = 0 94 | mod = fused_module(input_shape, cell, layer_nums, ctx) 95 | # mod.save_checkpoint('gnmt', 0) 96 | for i in range(warm_up + iter_num): 97 | fwd = rnncell_score_fused(mod) 98 | if i >= warm_up: 99 | total_fwd += fwd 100 | 101 | total_fwd = total_fwd / iter_num 102 | logging.info(str(input_shape) + ' time cost ' + str(total_fwd) + 's samples/sec = ' + str(input_shape[0]/total_fwd)) 103 | 104 | -------------------------------------------------------------------------------- /pytorch/RESNET50V1.md: -------------------------------------------------------------------------------- 1 | # Guide to Run the PyTorch/Caffe2 ResNet50 v1 Model 2 | 3 | - please use v1.0.5 4 | 5 | ## Download the Caffe ResNet50 v1 model 6 | 7 | ``` 8 | Download the Resnet-50-deploy.prototxt and Resnet-50-model.caffemodel from https://onedrive.live.com/?authkey=%21AAFW2-FVoxeVRck&id=4006CBB8476FF777%2117887&cid=4006CBB8476FF777 9 | The model is provided by https://github.com/KaimingHe/deep-residual-networks 10 | ``` 11 | 12 | 13 | ## Get the PyTorch source from GitHub, prepare mkl2019, and build 14 | 15 | ``` 16 | git clone https://github.com/jgong5/pytorch -b int8_with_more_opts 17 | cd pytorch && git submodule update --init --recursive 18 | ``` 19 | 20 | ``` 21 | Download mkl from https://anaconda.org/anaconda/mkl/files?version=2019.3 and extract it to the mkl2019 folder 22 | Download mkl-include from https://anaconda.org/anaconda/mkl-include/files and extract it to the mkl2019 folder 23 | Copy the system MKL folder mkl/lib/intel64 to mkl2019/lib/ 24 | Copy the system MKL file mkl/lib/libiomp5.so to mkl2019/lib/ 25 | ``` 26 | 27 | ``` 28 | export USE_MKLDNN=ON MKLDNN_USE_CBLAS=ON 29 | export MKLROOT=location/to/mkl2019 30 | python setup.py build 31 | ``` 32 | 33 | ## Convert the Caffe model to a PyTorch/Caffe2 model 34 | 35 | 36 | ``` 37 | export PYTHONPATH=src/to/caffe2/build 38 | cd pytorch/benchmark_tools 39 | python inference/caffe_translator.py Resnet-50-deploy.prototxt Resnet-50-model.caffemodel 40 | 41 | ``` 42 | You will get init_net.pb and predict_net.pb under the folder where you run the command. 43 | 44 | ## Copy the weight file and model file to the tools folder 45 | 46 | ``` 47 | cp init_net.pb inference/models/resnet50_v1/ 48 | 49 | cp predict_net.pb inference/models/resnet50_v1/ 50 | ``` 51 | 52 | ## Prepare the dataset 53 | 54 | ``` 55 | Please download the ImageNet dataset and validation file from the official site 56 | http://image-net.org/download.php 57 | 58 | Note: 59 | - ImageNet does not own the copyright of the images. For researchers and educators who wish to use the images for non-commercial research and/or educational purposes, ImageNet can provide access through their site under certain conditions and terms.
60 | 61 | ``` 62 | 63 | ## Prepare the calibration dataset 64 | 65 | ``` 66 | Copy ILSVRC2012_val_00033000.JPEG through ILSVRC2012_val_00033999.JPEG (1000 images in total) from the downloaded ImageNet dataset folder to the calibration folder 67 | find /path/to/your/dataset -type f | grep -E 'ILSVRC2012_val_00033[0-9]*' | xargs -i cp {} /path/to/your/calibration_dataset 68 | ``` 69 | 70 | ## Run calibration 71 | 72 | ``` 73 | export PYTHONPATH=/the/path/to/your/pytorch/src 74 | export LD_PRELOAD=the/location/of/libiomp5.so #libiomp5.so can be found under your mkl folder 75 | export OMP_NUM_THREADS=28 KMP_AFFINITY=proclist=[0-27],granularity=thread,explicit #28 is an example; set it to the number of cores in one socket of your CPU 76 | ./run_caffe2.py -m $modelname -p calibration_folder -v validation_file -b "batchsize" -r calibration -o . --onnx 77 | 78 | Two files will be generated under the folder; copy them to inference/models/resnet50_v1 79 | cp init_net_int8.pb inference/models/resnet50_v1/init_net_int8.pb 80 | cp predict_net_int8.pb inference/models/resnet50_v1/predict_net_int8.pb 81 | 82 | ``` 83 | 84 | 85 | ## Run the int8 model 86 | 87 | ``` 88 | cd pytorch/benchmark_tools 89 | ./run_numctl.sh 90 | ``` 91 | 92 | 93 | 94 | 95 | ## Parse the result; the output of both the fp32 and int8 models looks like below 96 | 97 | ``` 98 | Images per second: 345.5456113865 99 | Total computing time: 144.6986978054 seconds 100 | Total image processing time: 491.1261794567 seconds 101 | Total model loading time: 4.4210910797 seconds 102 | Total images: 50000 103 | 104 | ``` 105 | Just use 'Images per second' as the throughput 106 | 107 | -------------------------------------------------------------------------------- /mxnet/blog/medium_vnni/ec2_benchmark_base.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "MXNet Model Quantization Performance" 3 | echo "Testing FP32 base models" 4 | echo "Installing mxnet-mkl 1.5.0b20190623" 5 | pip install --pre mxnet-mkl==1.5.0b20190623 6 | echo "Downloading source code from incubator-mxnet repo" 7 | git clone https://github.com/apache/incubator-mxnet 8 | cd incubator-mxnet 9 | git checkout f44f6cfbe752fd8b8036307cecf6a30a30ad8557 10 | 11 | export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 12 | export vCPUs=`cat /proc/cpuinfo | grep processor | wc -l` 13 | export OMP_NUM_THREADS=$((vCPUs / 2)) 14 | echo "Test with OMP_NUM_THREADS="$OMP_NUM_THREADS 15 | 16 | # Reduce remote memory access 17 | export NNVM_EXEC_MATCH_RANGE=1 18 | unset MXNET_SUBGRAPH_BACKEND 19 | 20 | echo "=========test image classification models==========" 21 | cd ./example/quantization 22 | echo "=============resnet50_v1===============" 23 | echo "1. calibrating resnet50_v1 with calib-mode=naive, use 5 batches to do calibration" 24 | python imagenet_gen_qsym_mkldnn.py --model=resnet50_v1 --num-calib-batches=5 --calib-mode=naive 25 | echo "2. testing throughput of fp32 resnet50_v1" 26 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True 27 | echo "3. testing latency of fp32 resnet50_v1" 28 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=1 --num-inference-batches=500 --ctx=cpu --benchmark=True 29 | echo "=============resnet101_v1===============" 30 | echo "1.
calibrating resnet101_v1 with calib-mode=naive, use 5 batches to do calibration" 31 | python imagenet_gen_qsym_mkldnn.py --model=resnet101_v1 --num-calib-batches=5 --calib-mode=naive 32 | echo "2. testing throughput of fp32 resnet101_v1" 33 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True 34 | echo "3. testing latency of fp32 resnet101_v1" 35 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=1 --num-inference-batches=500 --ctx=cpu --benchmark=True 36 | echo "=============mobilenet1.0===============" 37 | echo "1. calibrating mobilenet1.0 with calib-mode=naive, use 5 batches to do calibration" 38 | python imagenet_gen_qsym_mkldnn.py --model=mobilenet1.0 --num-calib-batches=5 --calib-mode=naive 39 | echo "2. testing throughput of fp32 mobilenet1.0" 40 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True 41 | echo "3. testing latency of fp32 mobilenet1.0" 42 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=1 --num-inference-batches=500 --ctx=cpu --benchmark=True 43 | echo "=============inceptionv3===============" 44 | echo "1. calibrating inceptionv3 with calib-mode=naive, use 5 batches to do calibration" 45 | python imagenet_gen_qsym_mkldnn.py --model=inceptionv3 --image-shape=3,299,299 --num-calib-batches=5 --calib-mode=naive 46 | echo "2. testing throughput of fp32 inceptionv3" 47 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True 48 | echo "3. testing latency of fp32 inceptionv3" 49 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=1 --num-inference-batches=500 --ctx=cpu --benchmark=True 50 | 51 | echo "=========test image detection models==========" 52 | echo "==============SSD VGG16================" 53 | echo "1. downloading model" 54 | cd ../ssd 55 | cd model/ && wget http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_vgg16_reduced_300-dd479559.zip 56 | unzip ssd_vgg16_reduced_300-dd479559.zip && mv ssd_vgg16_reduced_300-dd479559.params ssd_vgg16_reduced_300-0000.params && mv ssd_vgg16_reduced_300-symbol-dd479559.json ssd_vgg16_reduced_300-symbol.json 57 | cd ../data && wget http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/ssd-val-fc19a535.zip 58 | unzip ssd-val-fc19a535.zip && mv ssd-val-fc19a535.idx val.idx && mv ssd-val-fc19a535.lst val.lst && mv ssd-val-fc19a535.rec val.rec 59 | cd .. 60 | echo "2. testing throughput of fp32 SSD VGG16" 61 | python benchmark_score.py --batch_size=224 --deploy --prefix=./model/ssd_ 62 | echo "3. testing latency of fp32 SSD VGG16" 63 | python benchmark_score.py --batch_size=1 --deploy --prefix=./model/ssd_ 64 | -------------------------------------------------------------------------------- /mxnet/wide_deep_criteo/data.py: -------------------------------------------------------------------------------- 1 | """Processing data for criteo kaggle dataset""" 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. 
The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | 19 | from csv import DictReader 20 | import os 21 | import numpy as np 22 | import mxnet as mx 23 | 24 | 25 | def get_uci_criteo(data_dir, data_name): 26 | """Get preprocessed data to feed into model""" 27 | data_file = os.path.join(data_dir, data_name) 28 | if (not os.path.exists(data_file)): 29 | print("Dataset " + data_file + " not present") 30 | csr, dns, label = preprocess_uci_criteo(data_name) 31 | return csr, dns, label 32 | 33 | 34 | 35 | # Label - Target variable that indicates if an ad was clicked (1) or not (0). 36 | # I1-I13 - A total of 13 columns of integer features (mostly count features). 37 | # C1-C26 - A total of 26 columns of categorical features. The values of 38 | # these features have been hashed onto 32 bits for anonymization purposes. 39 | CONTINUOUS_COLUMNS = ["I"+str(i) for i in range(1, 14)] # 1-13 inclusive 40 | CATEGORICAL_COLUMNS = ["C"+str(i) for i in range(1, 27)] # 1-26 inclusive 41 | LABEL_COLUMN = ["clicked"] 42 | 43 | TRAIN_DATA_COLUMNS = LABEL_COLUMN + CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS 44 | FEATURE_COLUMNS = CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS 45 | max_dict = {'I1': 1539, 'I2': 22066, 'I3': 65535, 'I4': 561, 'I5': 2655388, 'I6': 233523, 46 | 'I7': 26297, 'I8': 5106, 'I9': 24376, 'I10': 9, 'I11': 181, 'I12': 1807, 'I13': 6879} 47 | min_dict = {'I1': 0, 'I2': -3, 'I3': 0, 'I4': 0, 'I5': 0, 'I6': 0, 'I7': 0, 'I8': 0, 48 | 'I9': 0, 'I10': 0, 'I11': 0, 'I12': 0, 'I13': 0} 49 | 50 | 51 | def preprocess_uci_criteo(data_name): 52 | """Data preprocessing for criteo kaggle dataset""" 53 | hash_bucket_size = 1000 54 | #cont_defaults = [[0] for i in range(1, 14)] 55 | #cate_defaults = [[" "] for i in range(1, 27)] 56 | #label_defaults = [[0]] 57 | #column_headers = TRAIN_DATA_COLUMNS 58 | #record_defaults = label_defaults + cont_defaults + cate_defaults 59 | 60 | label_list = [] 61 | csr_list = [] 62 | dns_list = [] 63 | 64 | #csr_ncols = len(CATEGORICAL_COLUMNS) * hash_bucket_size 65 | dns_ncols = len(CONTINUOUS_COLUMNS) + len(CATEGORICAL_COLUMNS) 66 | with open(data_name) as f: 67 | for row in DictReader(f, fieldnames=TRAIN_DATA_COLUMNS): 68 | label_list.append(row['clicked']) 69 | # Sparse base columns. 70 | for name in CATEGORICAL_COLUMNS: 71 | csr_list.append((hash(row[name]) % hash_bucket_size, 1.0)) 72 | 73 | 74 | dns_row = [0] * dns_ncols 75 | dns_dim = 0 76 | # Embed wide columns into deep columns 77 | for col in CATEGORICAL_COLUMNS: 78 | dns_row[dns_dim] = hash(row[col].strip()) % hash_bucket_size 79 | dns_dim += 1 80 | # Continuous base columns. 81 | scale = 1 #align with Google WnD paper 82 | for col in CONTINUOUS_COLUMNS: 83 | #dns_row[dns_dim] = float(row[col].strip()) 84 | orig_range = float(max_dict[col] - min_dict[col]) 85 | dns_row[dns_dim] = (float(row[col].strip()) - min_dict[col]) * scale / orig_range 86 | dns_dim += 1 87 | # No transformations. 
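            # At this point dns_row holds the 26 hashed categorical ids followed
            # by the 13 min-max-scaled continuous values, and csr_list gained one
            # (hashed_index, 1.0) pair per categorical column, so the CSR indptr
            # built below advances in fixed steps of len(CATEGORICAL_COLUMNS).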
88 | 89 | dns_list.append(dns_row) 90 | data_list = [item[1] for item in csr_list] 91 | indices_list = [item[0] for item in csr_list] 92 | indptr_list = range(0, len(indices_list) + 1, len(CATEGORICAL_COLUMNS)) 93 | csr = mx.nd.sparse.csr_matrix((data_list, indices_list, indptr_list), 94 | shape=(len(label_list), hash_bucket_size * len(CATEGORICAL_COLUMNS))) 95 | dns = np.array(dns_list) 96 | label = np.array(label_list) 97 | return csr, dns, label 98 | -------------------------------------------------------------------------------- /pytorch/benchmark_tools/inference/models/__init__.py: -------------------------------------------------------------------------------- 1 | """parse model def""" 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import logging 9 | 10 | class Models: 11 | """def model class""" 12 | model_def = {} 13 | model_def_file = "__model_def" 14 | 15 | def __init__(self): 16 | if len(self.model_def) == 0: 17 | self._ParseAllModels() 18 | 19 | def _ParseAllModels(self): 20 | path = os.path.dirname(__file__) 21 | dirs = os.listdir(path) 22 | for d in dirs: 23 | dir_path = os.path.join(path, d) 24 | full_path = os.path.join(dir_path, self.model_def_file) 25 | if os.path.isfile(full_path): 26 | with open(full_path, 'r') as mf: 27 | lines = [line.rstrip('\n') for line in mf.readlines()] 28 | self._LoadModelDef(lines, dir_path) 29 | 30 | def _LoadModelDef(self, defs, path): 31 | model_name = defs[defs.index("[Model Name]") + 1].lower() 32 | # model type in lowercase, e.g. "caffe legacy", "normal", "prototext" 33 | model_type = defs[defs.index("[Model Type]") + 1].lower() 34 | # output type in lowercase, e.g. "possibility", "segmentation", "post image" 35 | output_type = defs[defs.index("[Output Type]") + 1].lower() 36 | init_net = os.path.join(path, defs[defs.index("[Init Net]") + 1]) 37 | predict_net = os.path.join(path, defs[defs.index("[Predict Net]") + 1]) 38 | init_net_int8 = None 39 | if "[Init Net Int8]" in defs: 40 | init_net_int8 = os.path.join(path, defs[defs.index("[Init Net Int8]") + 1]) 41 | predict_net_int8 = None 42 | if "[Predict Net Int8]" in defs: 43 | predict_net_int8 = os.path.join(path, defs[defs.index("[Predict Net Int8]") + 1]) 44 | onnx_model = None 45 | if "[Onnx Model]" in defs: 46 | onnx_model = os.path.join(path, defs[defs.index("[Onnx Model]") + 1]) 47 | crop_size = defs[defs.index("[Crop Size]") + 1] 48 | image_mean = defs[defs.index("[Image Mean]") + 1] 49 | scale = 1 50 | if "[Scale]" in defs: 51 | scale = defs[defs.index("[Scale]") +1] 52 | rescale_size = 256 53 | if "[ReScale Size]" in defs: 54 | rescale_size = defs[defs.index("[ReScale Size]") +1] 55 | if len(image_mean) > 0: 56 | image_mean = os.path.join(path, image_mean) 57 | else: 58 | image_mean = None 59 | allow_device_override = True 60 | need_normalize = False 61 | if "[Allow Device Override]" in defs: 62 | allow_device_override = defs[defs.index("[Allow Device Override]") +1].lower() in ('yes', 'true', 't', '1') 63 | if "[Need Normalize]" in defs: 64 | need_normalize = defs[defs.index("[Need Normalize]") +1].lower() in ('yes', 'true', 't', '1') 65 | color_format = None 66 | if "[Color Format]" in defs: 67 | color_format = defs[defs.index("[Color Format]") + 1] 68 | 69 | if model_name in self.model_def: 70 | logging.warning("Already has model: {}. Ignored!" 
.format(model_name)) 72 | else: 73 | self.model_def[model_name] = { 74 | "model_name" : model_name, 75 | "model_type" : model_type, 76 | "output_type" : output_type, 77 | "init_net" : init_net, 78 | "predict_net" : predict_net, 79 | "init_net_int8" : init_net_int8, 80 | "predict_net_int8" : predict_net_int8, 81 | "onnx_model" : onnx_model, 82 | "crop_size" : crop_size, 83 | "image_mean" : image_mean, 84 | "scale" : scale, 85 | "rescale_size" : rescale_size, 86 | "allow_device_override": allow_device_override, 87 | "need_normalize" : need_normalize, 88 | "color_format" : color_format, 89 | } 90 | 91 | 92 | def ShowModels(): 93 | models = Models() 94 | logging.critical("All supported models for inference:\n{}" 95 | .format([str(s) for s in models.model_def])) 96 | 97 | def IsSupported(model): 98 | models = Models() 99 | return (model.lower() in models.model_def) 100 | 101 | def GetModelInfo(model): 102 | models = Models() 103 | return models.model_def[model.lower()] 104 | -------------------------------------------------------------------------------- /pytorch/distributed/README.md: -------------------------------------------------------------------------------- 1 | # Distributed Training with OneCCL in PyTorch 2 | 3 | ## Install anaconda 3.0 and Dependencies 4 | ```bash 5 | wget https://repo.continuum.io/archive/Anaconda3-5.0.0-Linux-x86_64.sh -O anaconda3.sh 6 | chmod +x anaconda3.sh 7 | ./anaconda3.sh -b -p ~/anaconda3 8 | ./anaconda3/bin/conda create -n pytorch-ccl python=3.7 9 | export PATH=~/anaconda3/bin:$PATH 10 | source ./anaconda3/bin/activate pytorch-ccl 11 | conda config --append channels intel 12 | conda install ninja pyyaml setuptools cmake cffi typing 13 | conda install intel-openmp mkl mkl-include numpy -c intel --no-update-deps 14 | ``` 15 | ## Install PyTorch 16 | ```bash 17 | git clone https://github.com/pytorch/pytorch.git && cd pytorch 18 | git submodule sync && git submodule update --init --recursive 19 | python setup.py install 20 | ``` 21 | ## Install oneCCL 22 | ```bash 23 | git clone https://github.com/oneapi-src/oneCCL.git 24 | cd {path-to-oneCCL} 25 | mkdir build && cd build 26 | cmake .. -DCMAKE_INSTALL_PREFIX=~/.local 27 | make -j install 28 | ``` 29 | ## Install torch-ccl 30 | ```bash 31 | git clone https://github.com/intel/torch-ccl.git && cd torch-ccl 32 | source ~/.local/env/setvars.sh 33 | python setup.py install 34 | ``` 35 | ## Demo for using OneCCL in PyTorch 36 | ```python 37 | import os 38 | import torch 39 | import torch.nn as nn 40 | from torch.nn.parallel import DistributedDataParallel as DDP 41 | import torch.distributed as dist 42 | import torch_ccl 43 | 44 | class Model(nn.Module): 45 | def __init__(self): 46 | super(Model, self).__init__() 47 | self.linear = nn.Linear(4, 5) 48 | 49 | def forward(self, input): 50 | return self.linear(input) 51 | 52 | 53 | if __name__ == "__main__": 54 | 55 | os.environ['RANK'] = os.environ.get('PMI_RANK', '-1') # environment values must be strings 56 | os.environ['WORLD_SIZE'] = os.environ.get('PMI_SIZE', '-1') 57 | 58 | # Initialize the process group with ccl backend 59 | dist.init_process_group(backend='ccl') 60 | 61 | model = Model() 62 | if dist.get_world_size() > 1: 63 | model = DDP(model) 64 | 65 | for i in range(3): 66 | input = torch.randn(2, 4) 67 | labels = torch.randn(2, 5) 68 | loss_fn = nn.MSELoss() 69 | optimizer = torch.optim.SGD(model.parameters(), lr=0.001) 70 | optimizer.zero_grad() # clear gradients accumulated in the previous iteration 71 | # forward 72 | res = model(input) 73 | L = loss_fn(res, labels) 74 | 75 | # backward 76 | L.backward() 77 | 78 | # update 79 | optimizer.step() 80 | ```
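The demo reads `PMI_RANK` / `PMI_SIZE`, which MPI launchers such as `mpiexec.hydra` set automatically for each rank. A minimal two-process launch sketch, assuming the demo above is saved as `demo.py` (a hypothetical file name):
```bash
# PMI_RANK / PMI_SIZE are injected by the launcher for each rank.
source ~/.local/env/setvars.sh
mpiexec.hydra -np 2 -ppn 2 python -u demo.py
```
See the next section for production-grade pinning of CCL worker threads and compute threads.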
81 | ## Run Scripts & CPU Affinity 82 | 1. Distributed Training on Single Node 83 | ```bash 84 | source ~/.local/env/setvars.sh 85 | export LD_PRELOAD="${CONDA_PREFIX}/lib/libiomp5.so" 86 | export MASTER_ADDR="127.0.0.1" 87 | export MASTER_PORT="29500" 88 | 89 | # Example: 90 | # Run 2 processes on 2 sockets. (28 cores/socket, 4 cores for CCL, 24 cores for computation) 91 | # 92 | # CCL_WORKER_COUNT is the number of CCL worker threads per process. 93 | # CCL_WORKER_COUNT, CCL_WORKER_AFFINITY and I_MPI_PIN_DOMAIN should be consistent. 94 | 95 | export CCL_WORKER_COUNT=4 96 | export CCL_WORKER_AFFINITY="0,1,2,3,28,29,30,31" 97 | 98 | mpiexec.hydra -np 2 -ppn 2 -l -genv I_MPI_PIN_DOMAIN=[0x0000000FFFFFF0,0xFFFFFF00000000] \ 99 | -genv KMP_BLOCKTIME=1 -genv KMP_AFFINITY=granularity=fine,compact,1,0 \ 100 | -genv OMP_NUM_THREADS=24 python -u ut_memory.py 101 | ``` 102 | 2. Distributed Training on Multiple Nodes 103 | ```bash 104 | source ~/.local/env/setvars.sh 105 | export LD_PRELOAD="${CONDA_PREFIX}/lib/libiomp5.so" 106 | export MASTER_ADDR="10.xxx.xxx.xxx" # IP address of the node from which the MPI command is launched 107 | export MASTER_PORT="29500" 108 | 109 | # Example: 110 | # Run 4 processes on 2 Nodes, 2 sockets/Node (28 cores/socket, 4 cores for CCL, 24 cores for computation) 111 | # 112 | # CCL_WORKER_COUNT is the number of CCL worker threads per process. 113 | # CCL_WORKER_COUNT, CCL_WORKER_AFFINITY and I_MPI_PIN_DOMAIN should be consistent. 114 | # 115 | # `hostfile`: list every node's IP address in this file 116 | 117 | export CCL_WORKER_COUNT=4 118 | export CCL_WORKER_AFFINITY="0,1,2,3,28,29,30,31" 119 | 120 | mpiexec.hydra -f hostfile -np 4 -ppn 2 -l -genv I_MPI_PIN_DOMAIN=[0x0000000FFFFFF0,0xFFFFFF00000000] \ 121 | -genv KMP_BLOCKTIME=1 -genv KMP_AFFINITY=granularity=fine,compact,1,0 \ 122 | -genv OMP_NUM_THREADS=24 python -u ut_memory.py 123 | ``` 124 | -------------------------------------------------------------------------------- /third-party-programs.txt: -------------------------------------------------------------------------------- 1 | optimized-models Third Party Programs File 2 | 3 | This file contains the list of third party software (“third party programs”) contained 4 | in the Intel software and their required notices and/or license terms. This third party 5 | software, even if included with the distribution of the Intel software, may be governed 6 | by separate license terms, including without limitation, third party license terms, other 7 | Intel software license terms, and open source software license terms. These separate license 8 | terms govern your use of the third party programs as set forth in the “third-party-programs.txt” or other similarly named text file. 9 | 10 | Third party programs and their corresponding required notices and/or license terms are listed below. 11 | 12 | ------------------------------------------------------------- 13 | 1. dlrm 14 | MIT License 15 | 16 | Copyright (c) Facebook, Inc. and its affiliates. 17 | 18 | Permission is hereby granted, free of charge, to any person obtaining a copy 19 | of this software and associated documentation files (the "Software"), to deal 20 | in the Software without restriction, including without limitation the rights 21 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 22 | copies of the Software, and to permit persons to whom the Software is 23 | furnished to do so, subject to the following conditions: 24 | 25 | The above copyright notice and this permission notice shall be included in all 26 | copies or substantial portions of the Software.
27 | 28 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 29 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 30 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 31 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 32 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 33 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 34 | SOFTWARE. 35 | 36 | ------------------------------------------------------------- 37 | 2. PyTorch examples 38 | BSD 3-Clause License 39 | 40 | Copyright (c) 2017, 41 | All rights reserved. 42 | 43 | Redistribution and use in source and binary forms, with or without 44 | modification, are permitted provided that the following conditions are met: 45 | 46 | * Redistributions of source code must retain the above copyright notice, this 47 | list of conditions and the following disclaimer. 48 | 49 | * Redistributions in binary form must reproduce the above copyright notice, 50 | this list of conditions and the following disclaimer in the documentation 51 | and/or other materials provided with the distribution. 52 | 53 | * Neither the name of the copyright holder nor the names of its 54 | contributors may be used to endorse or promote products derived from 55 | this software without specific prior written permission. 56 | 57 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 58 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 59 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 60 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 61 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 62 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 63 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 64 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 65 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 66 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 67 | 68 | ------------------------------------------------------------- 69 | 3. torchvision 70 | BSD 3-Clause License 71 | 72 | Copyright (c) Soumith Chintala 2016, 73 | All rights reserved. 74 | 75 | Redistribution and use in source and binary forms, with or without 76 | modification, are permitted provided that the following conditions are met: 77 | 78 | * Redistributions of source code must retain the above copyright notice, this 79 | list of conditions and the following disclaimer. 80 | 81 | * Redistributions in binary form must reproduce the above copyright notice, 82 | this list of conditions and the following disclaimer in the documentation 83 | and/or other materials provided with the distribution. 84 | 85 | * Neither the name of the copyright holder nor the names of its 86 | contributors may be used to endorse or promote products derived from 87 | this software without specific prior written permission. 88 | 89 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 90 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 91 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 92 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 93 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 94 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 95 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 96 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 97 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 98 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 99 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/bench/dlrm_s_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | #check if extra argument is passed to the test 8 | if [[ $# == 1 ]]; then 9 | dlrm_extra_option=$1 10 | else 11 | dlrm_extra_option="" 12 | fi 13 | #echo $dlrm_extra_option 14 | 15 | cpu=1 16 | gpu=1 17 | pt=1 18 | c2=1 19 | 20 | ncores=28 #12 #6 21 | nsockets="0" 22 | 23 | ngpus="1 2 4 8" 24 | 25 | numa_cmd="numactl --physcpubind=0-$((ncores-1)) -m $nsockets" #run on one socket, without HT 26 | dlrm_pt_bin="python dlrm_s_pytorch.py" 27 | dlrm_c2_bin="python dlrm_s_caffe2.py" 28 | 29 | data=random #synthetic 30 | print_freq=100 31 | rand_seed=727 32 | 33 | c2_net="async_scheduling" 34 | 35 | #Model param 36 | mb_size=2048 #1024 #512 #256 37 | nbatches=1000 #500 #100 38 | bot_mlp="512-512-64" 39 | top_mlp="1024-1024-1024-1" 40 | emb_size=64 41 | nindices=100 42 | emb="1000000-1000000-1000000-1000000-1000000-1000000-1000000-1000000" 43 | interaction="dot" 44 | tnworkers=0 45 | tmb_size=16384 46 | 47 | #_args="--mini-batch-size="${mb_size}\ 48 | _args=" --num-batches="${nbatches}\ 49 | " --data-generation="${data}\ 50 | " --arch-mlp-bot="${bot_mlp}\ 51 | " --arch-mlp-top="${top_mlp}\ 52 | " --arch-sparse-feature-size="${emb_size}\ 53 | " --arch-embedding-size="${emb}\ 54 | " --num-indices-per-lookup="${nindices}\ 55 | " --arch-interaction-op="${interaction}\ 56 | " --numpy-rand-seed="${rand_seed}\ 57 | " --print-freq="${print_freq}\ 58 | " --print-time"\ 59 | " --enable-profiling " 60 | 61 | c2_args=" --caffe2-net-type="${c2_net} 62 | 63 | 64 | # CPU Benchmarking 65 | if [ $cpu = 1 ]; then 66 | echo "--------------------------------------------" 67 | echo "CPU Benchmarking - running on $ncores cores" 68 | echo "--------------------------------------------" 69 | if [ $pt = 1 ]; then 70 | outf="model1_CPU_PT_$ncores.log" 71 | outp="dlrm_s_pytorch.prof" 72 | echo "-------------------------------" 73 | echo "Running PT (log file: $outf)" 74 | echo "-------------------------------" 75 | cmd="$numa_cmd $dlrm_pt_bin --mini-batch-size=$mb_size --test-mini-batch-size=$tmb_size --test-num-workers=$tnworkers $_args $dlrm_extra_option > $outf" 76 | echo $cmd 77 | eval $cmd 78 | min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}') 79 | echo "Min time per iteration = $min" 80 | # move profiling file(s) 81 | mv $outp ${outf//".log"/".prof"} 82 | mv ${outp//".prof"/".json"} ${outf//".log"/".json"} 83 | 84 | fi 85 | if [ $c2 = 1 ]; then 86 | outf="model1_CPU_C2_$ncores.log" 87 | outp="dlrm_s_caffe2.prof" 88 | echo "-------------------------------" 89 | echo "Running C2 (log file: $outf)" 90 | echo 
"-------------------------------" 91 | cmd="$numa_cmd $dlrm_c2_bin --mini-batch-size=$mb_size $_args $c2_args $dlrm_extra_option 1> $outf 2> $outp" 92 | echo $cmd 93 | eval $cmd 94 | min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}') 95 | echo "Min time per iteration = $min" 96 | # move profiling file (collected from stderr above) 97 | mv $outp ${outf//".log"/".prof"} 98 | fi 99 | fi 100 | 101 | # GPU Benchmarking 102 | if [ $gpu = 1 ]; then 103 | echo "--------------------------------------------" 104 | echo "GPU Benchmarking - running on $ngpus GPUs" 105 | echo "--------------------------------------------" 106 | for _ng in $ngpus 107 | do 108 | # weak scaling 109 | # _mb_size=$((mb_size*_ng)) 110 | # strong scaling 111 | _mb_size=$((mb_size*1)) 112 | _gpus=$(seq -s, 0 $((_ng-1))) 113 | cuda_arg="CUDA_VISIBLE_DEVICES=$_gpus" 114 | echo "-------------------" 115 | echo "Using GPUS: "$_gpus 116 | echo "-------------------" 117 | if [ $pt = 1 ]; then 118 | outf="model1_GPU_PT_$_ng.log" 119 | outp="dlrm_s_pytorch.prof" 120 | echo "-------------------------------" 121 | echo "Running PT (log file: $outf)" 122 | echo "-------------------------------" 123 | cmd="$cuda_arg $dlrm_pt_bin --mini-batch-size=$_mb_size --test-mini-batch-size=$tmb_size --test-num-workers=$tnworkers $_args --use-gpu $dlrm_extra_option > $outf" 124 | echo $cmd 125 | eval $cmd 126 | min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}') 127 | echo "Min time per iteration = $min" 128 | # move profiling file(s) 129 | mv $outp ${outf//".log"/".prof"} 130 | mv ${outp//".prof"/".json"} ${outf//".log"/".json"} 131 | fi 132 | if [ $c2 = 1 ]; then 133 | outf="model1_GPU_C2_$_ng.log" 134 | outp="dlrm_s_caffe2.prof" 135 | echo "-------------------------------" 136 | echo "Running C2 (log file: $outf)" 137 | echo "-------------------------------" 138 | cmd="$cuda_arg $dlrm_c2_bin --mini-batch-size=$_mb_size $_args $c2_args --use-gpu $dlrm_extra_option 1> $outf 2> $outp" 139 | echo $cmd 140 | eval $cmd 141 | min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}') 142 | echo "Min time per iteration = $min" 143 | # move profiling file (collected from stderr above) 144 | mv $outp ${outf//".log"/".prof"} 145 | fi 146 | done 147 | fi 148 | -------------------------------------------------------------------------------- /pytorch/README.md: -------------------------------------------------------------------------------- 1 | # Guide to run resnet50 fp32 and int8 models. please use release v1.0.0. 2 | 3 | - For resnet50_v1 guide, please see [`RESNET50V1.md`](RESNET50V1.md). 
4 | 5 | ## Download resnet50 pytorch model 6 | 7 | ``` 8 | wget https://download.pytorch.org/models/resnet50-19c8e357.pth 9 | ``` 10 | ## Download the resnext pytorch model (optional) 11 | ``` 12 | wget http://data.lip6.fr/cadene/pretrainedmodels/resnext101_32x4d-29e315fa.pth 13 | ``` 14 | 15 | ## Install legacy pytorch for converting the model from pytorch to onnx 16 | 17 | ``` 18 | pip install torchvision 19 | ``` 20 | 21 | ## Get pytorch source from github 22 | 23 | ``` 24 | git clone https://github.com/pytorch/pytorch.git && cd pytorch 25 | git submodule update --init --recursive 26 | python setup.py build 27 | ``` 28 | 29 | ## Convert the pytorch model to an onnx model 30 | The code below is an example for resnet50: 31 | ``` 32 | import torch 33 | import torchvision.models as models 34 | from torch.autograd import Variable 35 | model = models.resnet50(pretrained=False) 36 | m = torch.load('resnet50-19c8e357.pth') 37 | model.load_state_dict(m) 38 | model.train(False) 39 | x = Variable(torch.randn(1, 3, 224, 224)) 40 | y = model(x) 41 | torch_out = torch.onnx._export(model, 42 | x, 43 | "resnet50.onnx", 44 | export_params=True) 45 | ```
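To sanity-check the exported graph before copying it, you can run ONNX's structural checker; a minimal sketch, assuming the `onnx` Python package is installed:
```
import onnx
# Load the exported graph and verify it is structurally valid ONNX.
model = onnx.load("resnet50.onnx")
onnx.checker.check_model(model)
print("resnet50.onnx passed the ONNX checker")
```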
46 | ## Copy onnx file to tools folder 47 | 48 | ``` 49 | cp resnet50.onnx inference/models/resnet50/ 50 | ``` 51 | 52 | ## Prepare dataset 53 | 54 | ``` 55 | Please download the imagenet dataset and validation file from the official site 56 | http://image-net.org/download.php 57 | 58 | Note: 59 | - ImageNet does not own the copyright of the images. For researchers and educators who wish to use the images for non-commercial research and/or educational purposes, ImageNet can provide access through their site under certain conditions and terms. 60 | 61 | 62 | 63 | ``` 64 | ## Prepare calibration dataset 65 | 66 | ``` 67 | Copy ILSVRC2012_val_00033000.JPEG through ILSVRC2012_val_00033999.JPEG (1000 images in total) from the downloaded imagenet dataset folder to the calibration folder 68 | find /path/to/your/dataset -type f | grep -E 'ILSVRC2012_val_00033[0-9]*' | xargs -i cp {} /path/to/your/calibration_dataset 69 | ``` 70 | 71 | ## Run calibration 72 | 73 | 74 | ``` 75 | export PYTHONPATH=/the/path/to/your/pytorch/src 76 | export LD_PRELOAD=the/location/of/libiomp5.so #libiomp5.so can be found under your mkl folder 77 | export OMP_NUM_THREADS=28 KMP_AFFINITY=proclist=[0-27],granularity=thread,explicit #28 is an example; it is the number of cores in one socket of your cpu 78 | ./run_caffe2.py -m $modelname -p calibration_folder -v validation_file -b "batchsize" -r calibration -o . --onnx 79 | 80 | Two files will be generated under the folder; copy them to inference/models/resnet50: 81 | cp init_net_int8.pb inference/models/resnet50/init_onnx_int8.pb 82 | cp predict_net_int8.pb inference/models/resnet50/predict_onnx_int8.pb 83 | 84 | ``` 85 | 86 | ## Run fp32 model 87 | 88 | ``` 89 | export PYTHONPATH=/the/path/to/your/pytorch/src 90 | export LD_PRELOAD=the/location/of/libiomp5.so #libiomp5.so can be found under your mkl folder 91 | export OMP_NUM_THREADS=28 KMP_AFFINITY=proclist=[0-27],granularity=thread,explicit #28 is an example; it is the number of cores in one socket of your cpu 92 | 93 | ./run_caffe2.py -m $modelname -p imagenet_folder -v validation_file -b "batchsize" -w 5 --onnx 94 | ``` 95 | If you want to run with dummy data, please use the below command 96 | ``` 97 | export PYTHONPATH=/the/path/to/your/pytorch/src 98 | export LD_PRELOAD=the/location/of/libiomp5.so #libiomp5.so can be found under your mkl folder 99 | export OMP_NUM_THREADS=28 KMP_AFFINITY=proclist=[0-27],granularity=thread,explicit #28 is an example; it is the number of cores in one socket of your cpu 100 | 101 | ./run_caffe2.py -m $modelname -b "batchsize" -w 5 -u -i 1000 --onnx 102 | ``` 103 | 104 | ## Run int8 model 105 | 106 | ``` 107 | export PYTHONPATH=/the/path/to/your/pytorch/src 108 | export LD_PRELOAD=the/location/of/libiomp5.so #libiomp5.so can be found under your mkl folder 109 | export OMP_NUM_THREADS=28 KMP_AFFINITY=proclist=[0-27],granularity=thread,explicit #28 is an example; it is the number of cores in one socket of your cpu 110 | 111 | ./run_caffe2.py -m $modelname -p imagenet_folder -v validation_file -b "batchsize" -w 5 -int8 112 | ``` 113 | If you want to run with dummy data, please use the below command 114 | ``` 115 | export PYTHONPATH=/the/path/to/your/pytorch/src 116 | export LD_PRELOAD=the/location/of/libiomp5.so #libiomp5.so can be found under your mkl folder 117 | export OMP_NUM_THREADS=28 KMP_AFFINITY=proclist=[0-27],granularity=thread,explicit #28 is an example; it is the number of cores in one socket of your cpu 118 | 119 | ./run_caffe2.py -m $modelname -b "batchsize" -w 5 -u -i 1000 -int8 120 | ``` 121 | 122 | 123 | 124 | 125 | ## Parse the result: the output of both fp32 and int8 models looks like the following 126 | 127 | ``` 128 | Images per second: 345.5456113865 129 | Total computing time: 144.6986978054 seconds 130 | Total image processing time: 491.1261794567 seconds 131 | Total model loading time: 4.4210910797 seconds 132 | Total images: 50000 133 | Accuracy: 75.36400% 134 | Top5Accuracy: 92.54200% 135 | 136 | ``` 137 | Just use 'Images per second' as the Throughput, 'Accuracy' as the Top1 accuracy and 'Top5Accuracy' as the Top5 Accuracy. 138 | 139 | -------------------------------------------------------------------------------- /pytorch/dlrm/README.md: -------------------------------------------------------------------------------- 1 | # Guide to run DLRM with FP32/BF16 data type 2 | 3 | ## Verified on 4 | 5 | | Item | Value | 6 | | -: | :- | 7 | | OS | Ubuntu 20.04 LTS | 8 | | Compiler | gcc 8.4.0 | 9 | | Memory | DDR4 3200MHz, 192GB/socket | 10 | 11 | ## Prepare your running environment 12 | 13 | 1. Install anaconda 3.0 14 | ``` 15 | wget https://repo.continuum.io/archive/Anaconda3-5.0.0-Linux-x86_64.sh -O anaconda3.sh 16 | chmod +x anaconda3.sh 17 | ./anaconda3.sh -b -p ~/anaconda3 18 | ./anaconda3/bin/conda create -n dlrm python=3.7 19 | ``` 20 | 21 | 2.
Setup anaconda virtual environment for DLRM 22 | ``` 23 | export PATH=~/anaconda3/bin:$PATH 24 | source ./anaconda3/bin/activate dlrm 25 | ``` 26 | 27 | 3. Install dependencies 28 | ``` 29 | # 1. 30 | pip install sklearn onnx tqdm lark-parser 31 | 32 | #2. 33 | conda config --append channels intel 34 | conda install ninja pyyaml setuptools cmake cffi typing 35 | conda install intel-openmp mkl mkl-include numpy -c intel --no-update-deps 36 | 37 | #3. 38 | wget https://github.com/gperftools/gperftools/releases/download/gperftools-2.7.90/gperftools-2.7.90.tar.gz 39 | tar -xzf gperftools-2.7.90.tar.gz 40 | cd gperftools-2.7.90 41 | ./configure --prefix=$HOME/.local 42 | make && make install 43 | ``` 44 | 45 | 4. Clone source code and build 46 | 47 | ``` 48 | # PyTorch 49 | git clone https://github.com/pytorch/pytorch.git && cd pytorch 50 | git checkout tags/v1.5.0-rc3 -b v1.5-rc3 51 | git submodule sync && git submodule update --init --recursive 52 | 53 | # extension 54 | git clone https://github.com/intel/intel-extension-for-pytorch.git && cd intel-extension-for-pytorch 55 | git checkout cpx-y20m06 56 | git submodule update --init --recursive 57 | 58 | # prepare patch to PyTorch 59 | cp {path/to/intel-pytorch-extension}/torch_patches/dlrm_fp32.patch {path/to/pytorch}/ 60 | cp {path/to/intel-pytorch-extension}/torch_patches/dpcpp-v1.5-rc3.patch {path/to/pytorch}/ 61 | 62 | # build PyTorch 63 | cd {path/to/pytorch} 64 | patch -p1 < dpcpp-v1.5-rc3.patch 65 | patch -p1 < dlrm_fp32.patch 66 | python setup.py install 67 | 68 | # build extension 69 | cd {path/to/intel-pytorch-extension} 70 | python setup.py install 71 | 72 | # DLRM 73 | git clone https://github.com/facebookresearch/dlrm.git 74 | cd {path/to/dlrm} 75 | git checkout 4705ea122d3cc693367f54e937db28c9c673d71b 76 | cp {path/to/intel-pytorch-extension}/torch_patches/models/mlperf_dlrm_ipex_OneDNN.diff ./ 77 | patch -p1 < mlperf_dlrm_ipex_OneDNN.diff 78 | ``` 79 | 80 | 5. Download data 81 | ``` 82 | cd /tmp && mkdir input 83 | curl -O http://azuremlsampleexperiments.blob.core.windows.net/criteo/day_{$(seq -s , 0 23)}.gz 84 | # unzip all downloaded files into the `input` folder. 85 | ``` 86 | 87 | 6. Running cmd 88 | ``` 89 | cd {path/to/dlrm} 90 | ################### NOTICE ############################### 91 | # configurable parameters in {bench/run_and_time.sh} according to your machine. 92 | ncores=24 # cores/socket 93 | nsockets=0 # numa 94 | DATASET_PATH=/tmp/input # dataset location for DLRM 95 | ################### NOTICE END ########################### 96 | 97 | # FP32 cmd 98 | ./bench/run_and_time.sh 99 | 100 | # BF16 cmd 101 | ./bench/run_and_time.sh bf16 102 | ``` 103 | 104 | --- 105 | # Guide to run DLRM Facebook Model with INT8 data type 106 | 107 | ## Verified on 108 | 109 | | Item | Value | 110 | | -: | :- | 111 | | OS | Ubuntu 20.04 LTS | 112 | | Compiler | gcc 8.4.0 | 113 | | Memory | DDR4 3200MHz, 192GB/socket | 114 | 115 | 1. Install anaconda 3.0 116 | ``` 117 | wget https://repo.continuum.io/archive/Anaconda3-5.0.0-Linux-x86_64.sh -O anaconda3.sh 118 | chmod +x anaconda3.sh 119 | ./anaconda3.sh -b -p ~/anaconda3 120 | ./anaconda3/bin/conda create -n dlrm python=3.7 121 | ``` 122 | 123 | 2. Setup anaconda virtual environment for DLRM 124 | ``` 125 | export PATH=~/anaconda3/bin:$PATH 126 | source ./anaconda3/bin/activate dlrm 127 | ``` 128 | 129 | 3. Install dependencies 130 | ``` 131 | # 1. 132 | pip install sklearn onnx tqdm 133 | 134 | # 2.
135 | conda config --append channels intel 136 | conda install ninja pyyaml setuptools cmake cffi typing 137 | conda install intel-openmp mkl mkl-include numpy -c intel --no-update-deps 138 | 139 | # 3. 140 | conda install jemalloc 141 | ``` 142 | 143 | 4. Clone source code and build 144 | ``` 145 | # PyTorch 146 | git clone https://github.com/pytorch/pytorch.git 147 | cd pytorch 148 | git checkout tags/v1.5.0 -b v1.5 149 | git submodule sync && git submodule update --init --recursive 150 | 151 | # prepare patch to PyTorch 152 | wget https://github.com/pytorch/pytorch/commit/cf28c6a31a5189a47007fb3907a248b3548ae7fd.patch 153 | 154 | # build PyTorch 155 | git apply cf28c6a31a5189a47007fb3907a248b3548ae7fd.patch 156 | python setup.py install 157 | 158 | # get DLRM model 159 | git clone https://github.com/intel/optimized-models.git 160 | cd optimized-models/pytorch/dlrm/dlrm 161 | ``` 162 | 163 | 5. Set environment 164 | ``` 165 | export LD_PRELOAD=${CONDA_PREFIX}/lib/libjemalloc.so:${CONDA_PREFIX}/lib/libiomp5.so 166 | ``` 167 | 168 | 6. Test command 169 | ``` 170 | # FP32 171 | OMP_NUM_THREADS=1 numactl --physcpubind=0-23 --membind=0 python dlrm_s_pytorch.py --mini-batch-size=16 --num-batches=1000 --data-generation=random --arch-mlp-bot=512-512-64 --arch-mlp-top=1024-1024-1024-1 --arch-sparse-feature-size=64 --arch-embedding-size=1000000-1000000-1000000-1000000-1000000-1000000-1000000-1000000 --num-indices-per-lookup=100 --arch-interaction-op=dot --numpy-rand-seed=727 --print-freq=100 --print-time --inference-only --share-weight --num-instance=24 > model1_CPU_PT_24_fp32_inference.log 172 | 173 | # INT8 174 | OMP_NUM_THREADS=1 numactl --physcpubind=0-23 --membind=0 python dlrm_s_pytorch.py --mini-batch-size=16 --num-batches=1000 --data-generation=random --arch-mlp-bot=512-512-64 --arch-mlp-top=1024-1024-1024-1 --arch-sparse-feature-size=64 --arch-embedding-size=1000000-1000000-1000000-1000000-1000000-1000000-1000000-1000000 --num-indices-per-lookup=100 --arch-interaction-op=dot --numpy-rand-seed=727 --print-freq=100 --print-time --inference-only --share-weight --do-int8-inference --num-instance=24 > model1_CPU_PT_24_int8_inference.log 175 | ``` 176 | -------------------------------------------------------------------------------- /mxnet/wide_deep_criteo/train.py: -------------------------------------------------------------------------------- 1 | """WnD training script""" 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 
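# Example invocation (hypothetical paths; all flags are defined by the argparse
# options below, and train.csv / eval.csv are expected inside --data-dir):
#   python train.py --data-dir ./large_version --batch-size 1024 --optimizer adam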
18 | 19 | import argparse 20 | import os 21 | import pickle 22 | import mxnet as mx 23 | #from mxnet.test_utils import * 24 | from data import get_uci_criteo 25 | from model import wide_deep_model 26 | 27 | parser = argparse.ArgumentParser(description="Run sparse wide and deep classification ", 28 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 29 | parser.add_argument('--num-epoch', type=int, default=1, 30 | help='number of epochs to train') 31 | parser.add_argument('--batch-size', type=int, default=1000, 32 | help='number of examples per batch') 33 | parser.add_argument('--lr', type=float, default=0.001, 34 | help='learning rate') 35 | parser.add_argument('--cuda', action='store_true', default=False, 36 | help='Train on GPU with CUDA') 37 | parser.add_argument('--optimizer', type=str, default='adam', 38 | help='what optimizer to use', 39 | choices=["ftrl", "sgd", "adam"]) 40 | parser.add_argument('--log-interval', type=int, default=100, 41 | help='number of batches to wait before logging training status') 42 | parser.add_argument('--data-dir', type=str, default='large_version', 43 | help='folder for data') 44 | 45 | # Related to feature engineering, please see preprocess in data.py 46 | CRITEO = { 47 | 'train': 'train.csv', 48 | 'test': 'eval.csv', 49 | 'num_linear_features': 26000, 50 | 'num_embed_features': 26, 51 | 'num_cont_features': 13, 52 | 'embed_input_dims': 1000, 53 | 'hidden_units': [32, 1024, 512, 256], 54 | } 55 | def save_object(filename, obj): 56 | with open(filename, 'wb') as output: # Overwrites any existing file. 57 | pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL) 58 | if __name__ == '__main__': 59 | import logging 60 | 61 | head = '%(asctime)-15s %(message)s' 62 | logging.basicConfig(level=logging.INFO, format=head) 63 | 64 | # arg parser 65 | args = parser.parse_args() 66 | logging.info(args) 67 | num_epoch = args.num_epoch 68 | batch_size = args.batch_size 69 | optimizer = args.optimizer 70 | log_interval = args.log_interval 71 | lr = args.lr 72 | ctx = mx.gpu(0) if args.cuda else mx.cpu() 73 | 74 | # dataset 75 | data_dir = os.path.join(os.getcwd(), args.data_dir) 76 | train_data = os.path.join(data_dir, CRITEO['train']) 77 | val_data = os.path.join(data_dir, CRITEO['test']) 78 | train_csr, train_dns, train_label = get_uci_criteo(data_dir, train_data) 79 | val_csr, val_dns, val_label = get_uci_criteo(data_dir, val_data) 80 | 81 | save_object('val_csr.pkl', val_csr) 82 | save_object('val_dns.pkl', val_dns) 83 | save_object('val_label.pkl', val_label) 84 | save_object('train_csr.pkl', train_csr) 85 | save_object('train_dns.pkl', train_dns) 86 | save_object('train_label.pkl', train_label) 87 | 88 | model = wide_deep_model(CRITEO['num_linear_features'], CRITEO['num_embed_features'], 89 | CRITEO['num_cont_features'], CRITEO['embed_input_dims'], 90 | CRITEO['hidden_units']) 91 | 92 | # data iterator 93 | train_data = mx.io.NDArrayIter({'csr_data': train_csr, 'dns_data': train_dns}, 94 | {'softmax_label': train_label}, batch_size, 95 | shuffle=True, last_batch_handle='discard') 96 | eval_data = mx.io.NDArrayIter({'csr_data': val_csr, 'dns_data': val_dns}, 97 | {'softmax_label': val_label}, batch_size, 98 | shuffle=True, last_batch_handle='discard') 99 | 100 | # module 101 | mod = mx.mod.Module(symbol=model, context=ctx, data_names=['csr_data', 'dns_data'], 102 | label_names=['softmax_label']) 103 | mod.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label) 104 | mod.init_params() 105 | optim = mx.optimizer.create(optimizer, 
learning_rate=lr, rescale_grad=1.0 / batch_size) 106 | mod.init_optimizer(optimizer=optim) 107 | # use accuracy as the metric 108 | metric = mx.metric.create(['acc']) 109 | # get the sparse weight parameter 110 | speedometer = mx.callback.Speedometer(batch_size, log_interval) 111 | 112 | logging.info('Training started ...') 113 | 114 | data_iter = iter(train_data) 115 | for epoch in range(num_epoch): 116 | nbatch = 0 117 | metric.reset() 118 | for batch in data_iter: 119 | nbatch += 1 120 | mod.forward_backward(batch) 121 | # update all parameters (including the weight parameter) 122 | mod.update() 123 | # update training metric 124 | mod.update_metric(metric, batch.label) 125 | speedometer_param = mx.model.BatchEndParam(epoch=epoch, nbatch=nbatch, 126 | eval_metric=metric, locals=locals()) 127 | speedometer(speedometer_param) 128 | # evaluate metric on validation dataset 129 | score = mod.score(eval_data, ['acc']) 130 | logging.info('epoch %d, accuracy = %s', epoch, score[0][1]) 131 | 132 | mod.save_checkpoint("checkpoint", epoch, save_optimizer_states=False) 133 | # reset the iterator for next pass of data 134 | data_iter.reset() 135 | 136 | logging.info('Training completed.') 137 | -------------------------------------------------------------------------------- /mxnet/blog/medium_vnni/ec2_benchmark_int8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "MXNet Model Quantization Performance" 3 | echo "Testing INT8 quantized models" 4 | echo "Installing mxnet-mkl 1.5.0b20190623" 5 | pip install --pre mxnet-mkl==1.5.0b20190623 6 | echo "downloading source code from incubator-mxnet repo" 7 | git clone https://github.com/apache/incubator-mxnet 8 | cd incubator-mxnet 9 | git checkout f44f6cfbe752fd8b8036307cecf6a30a30ad8557 10 | 11 | export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 12 | export vCPUs=`cat /proc/cpuinfo | grep processor | wc -l` 13 | export OMP_NUM_THREADS=$((vCPUs / 2)) 14 | echo "Test with OMP_NUM_THREADS="$OMP_NUM_THREADS 15 | 16 | # Reduce remote memory access 17 | export NNVM_EXEC_MATCH_RANGE=1 18 | # USE MKLDNN AS SUBGRAPH BACKEND 19 | export MXNET_SUBGRAPH_BACKEND=MKLDNN 20 | echo "Testing with MXNET_SUBGRAPH_BACKEND="$MXNET_SUBGRAPH_BACKEND 21 | 22 | echo "=========test image classification models==========" 23 | cd ./example/quantization 24 | echo "=============resnet50_v1===============" 25 | echo "1. calibrating resnet50_v1 with calib-mode=naive, use 5 batches to do calibration" 26 | python imagenet_gen_qsym_mkldnn.py --model=resnet50_v1 --num-calib-batches=5 --calib-mode=naive 27 | echo "2. testing throughput of fp32 resnet50_v1" 28 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True 29 | echo "3. testing latency of fp32 resnet50_v1" 30 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=1 --num-inference-batches=500 --ctx=cpu --benchmark=True 31 | echo "4. testing throughput of int8 resnet50_v1" 32 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-quantized-5batches-naive-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True 33 | echo "5. testing latency of int8 resnet50_v1" 34 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-quantized-5batches-naive-symbol.json --batch-size=1 --num-inference-batches=500 --ctx=cpu --benchmark=True 35 | echo "=============resnet101_v1===============" 36 | echo "1. 
calibrating resnet101_v1 with calib-mode=naive, use 5 batches to do calibration" 37 | python imagenet_gen_qsym_mkldnn.py --model=resnet101_v1 --num-calib-batches=5 --calib-mode=naive 38 | echo "2. testing throughput of fp32 resnet101_v1" 39 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True 40 | echo "3. testing latency of fp32 resnet101_v1" 41 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=1 --num-inference-batches=500 --ctx=cpu --benchmark=True 42 | echo "4. testing throughput of int8 resnet101_v1" 43 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-quantized-5batches-naive-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True 44 | echo "5. testing latency of int8 resnet101_v1" 45 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-quantized-5batches-naive-symbol.json --batch-size=1 --num-inference-batches=500 --ctx=cpu --benchmark=True 46 | echo "=============mobilenet1.0===============" 47 | echo "1. calibrating mobilenet1.0 with calib-mode=naive, use 5 batches to do calibration" 48 | python imagenet_gen_qsym_mkldnn.py --model=mobilenet1.0 --num-calib-batches=5 --calib-mode=naive 49 | echo "2. testing throughput of fp32 mobilenet1.0" 50 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True 51 | echo "3. testing latency of fp32 mobilenet1.0" 52 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=1 --num-inference-batches=500 --ctx=cpu --benchmark=True 53 | echo "4. testing throughput of int8 mobilenet1.0" 54 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-quantized-5batches-naive-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True 55 | echo "5. testing latency of int8 mobilenet1.0" 56 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-quantized-5batches-naive-symbol.json --batch-size=1 --num-inference-batches=500 --ctx=cpu --benchmark=True 57 | echo "=============inceptionv3===============" 58 | echo "1. calibrating inceptionv3 with calib-mode=naive, use 5 batches to do calibration" 59 | python imagenet_gen_qsym_mkldnn.py --model=inceptionv3 --image-shape=3,299,299 --num-calib-batches=5 --calib-mode=naive 60 | echo "2. testing throughput of fp32 inceptionv3" 61 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True 62 | echo "3. testing latency of fp32 inceptionv3" 63 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=1 --num-inference-batches=500 --ctx=cpu --benchmark=True 64 | echo "4. testing throughput of int8 inceptionv3" 65 | python imagenet_inference.py --symbol-file=./model/inceptionv3-quantized-5batches-naive-symbol.json --image-shape=3,299,299 --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True 66 | echo "5. testing latency of int8 inceptionv3" 67 | python imagenet_inference.py --symbol-file=./model/inceptionv3-quantized-5batches-naive-symbol.json --image-shape=3,299,299 --batch-size=1 --num-inference-batches=500 --ctx=cpu --benchmark=True 68 | echo "=========test image detection models==========" 69 | echo "==============SSD VGG16================" 70 | echo "1. 
downloading model" 71 | cd ../ssd 72 | cd model/ && wget http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_vgg16_reduced_300-dd479559.zip 73 | unzip ssd_vgg16_reduced_300-dd479559.zip && mv ssd_vgg16_reduced_300-dd479559.params ssd_vgg16_reduced_300-0000.params && mv ssd_vgg16_reduced_300-symbol-dd479559.json ssd_vgg16_reduced_300-symbol.json 74 | cd ../data && wget http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/ssd-val-fc19a535.zip 75 | unzip ssd-val-fc19a535.zip && mv ssd-val-fc19a535.idx val.idx && mv ssd-val-fc19a535.lst val.lst && mv ssd-val-fc19a535.rec val.rec 76 | cd .. 77 | echo "2. quantizing model" 78 | python quantization.py 79 | echo "3. testing throughput of fp32 SSD VGG16" 80 | python benchmark_score.py --batch_size=224 --deploy --prefix=./model/ssd_ 81 | echo "4. testing throughput of int8 SSD VGG16" 82 | python benchmark_score.py --batch_size=224 --deploy --prefix=./model/cqssd_ 83 | echo "5. testing latency of fp32 SSD VGG16" 84 | python benchmark_score.py --batch_size=1 --deploy --prefix=./model/ssd_ 85 | echo "6. testing latency of int8 SSD VGG16" 86 | python benchmark_score.py --batch_size=1 --deploy --prefix=./model/cqssd_ 87 | -------------------------------------------------------------------------------- /mxnet/wide_deep_criteo/README.md: -------------------------------------------------------------------------------- 1 | ## TERMS OF USE: 2 | PLEASE NOTE THAT YOUR USE OF AND ACCESS TO KAGGLE'S SERVICES ARE SUBJECT TO THE TERMS. IF YOU DO NOT AGREE TO ALL OF THEM, YOU MAY NOT USE OR ACCESS THE SERVICES IN ANY MANNER. FOR DETAILS SEE THE LINK: https://www.kaggle.com/terms 3 | 4 | How to get the dataset: 5 | Go to the link: https://storage.googleapis.com/dataset-uploader/criteo-kaggle/large_version to start downloading the criteo large dataset; it might take a long time. 6 | ``` 7 | mkdir large_version 8 | #Downloading the training dataset... 9 | wget -P ./large_version https://storage.googleapis.com/dataset-uploader/criteo-kaggle/large_version/train.csv 10 | #Downloading the validation dataset... 11 | wget -P ./large_version https://storage.googleapis.com/dataset-uploader/criteo-kaggle/large_version/eval.csv 12 | ``` 13 | 14 | # 1. Steps to reproduce performance with OOB MXNet 15 | ``` 16 | git clone --recursive https://github.com/apache/incubator-mxnet.git 17 | cd incubator-mxnet 18 | git submodule update --recursive 19 | make -j USE_MKLDNN=1 USE_BLAS=mkl USE_OPENCV=1 20 | cd python 21 | python setup.py install [--user] 22 | export LD_LIBRARY_PATH=$PWD/lib:$LD_LIBRARY_PATH 23 | export PYTHONPATH=$PWD/python:$PYTHONPATH 24 | ``` 25 | ## Run the wide&deep: 26 | ``` 27 | cd optimized-models/mxnet/wide_deep_criteo/ 28 | python train.py 29 | python wd_gen_qsym_subgraph_update.py 30 | export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 31 | export OMP_NUM_THREADS=24 32 | ``` 33 | ### Performance 34 | ``` 35 | # FP32 36 | numactl --physcpubind=0-23 --membind=0 python inference.py 37 | # Int8 38 | numactl --physcpubind=0-23 --membind=0 python inference.py --symbol-file=WD-quantized-162batches-naive-symbol.json --param-file=WD-quantized-0000.params 39 | ``` 40 | ### Accuracy 41 | ``` 42 | # FP32 43 | numactl --physcpubind=0-23 --membind=0 python inference.py --accuracy 44 | # Int8 45 | numactl --physcpubind=0-23 --membind=0 python inference.py --symbol-file=WD-quantized-162batches-naive-symbol.json --param-file=WD-quantized-0000.params --accuracy 46 | ``` 47 | 48 |
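Since all of the runs above assume MXNet was built with `USE_MKLDNN=1`, it can be worth confirming the installed package actually has MKL-DNN enabled before benchmarking. A minimal check, assuming MXNet >= 1.5 (where `mxnet.runtime.Features` is available):
```
# Prints True if the build has MKL-DNN compiled in.
python -c "from mxnet.runtime import Features; print(Features().is_enabled('MKLDNN'))"
```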
# 2. Steps to reproduce performance with OOB MXNet and optimization patch 49 | ``` 50 | git clone --recursive https://github.com/apache/incubator-mxnet.git 51 | cd incubator-mxnet 52 | git submodule update --recursive 53 | git checkout 5d2a4510c2c226c6921a8a213d04461f68ca7173 54 | git apply --ignore-space-change --ignore-whitespace patch/patch.update 55 | make -j USE_MKLDNN=1 USE_BLAS=mkl USE_OPENCV=1 56 | cd python 57 | python setup.py install [--user] 58 | export LD_LIBRARY_PATH=$PWD/lib:$LD_LIBRARY_PATH 59 | export PYTHONPATH=$PWD/python:$PYTHONPATH 60 | ``` 61 | > Note: The patch.update changes are under review in [PR#14491](https://github.com/apache/incubator-mxnet/pull/14491) and [PR#14492](https://github.com/apache/incubator-mxnet/pull/14492). Once they are merged into master, no patches are needed. 62 | 63 | ## Run the wide&deep: 64 | ``` 65 | cd optimized-models/mxnet/wide_deep_criteo/ 66 | python train.py 67 | python wd_gen_qsym_subgraph_update.py 68 | export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 69 | export OMP_NUM_THREADS=24 70 | ``` 71 | ### Performance 72 | ``` 73 | # FP32 74 | numactl --physcpubind=0-23 --membind=0 python inference.py --symbol-file=./update_model/embedding-fuse.json --param-file=checkpoint-0000.params 75 | # Int8 76 | numactl --physcpubind=0-23 --membind=0 python inference.py --symbol-file=./update_model/embedding_fuse-quantized-1953batches-naive-symbol.json --param-file=WD-quantized-0000.params 77 | ``` 78 | ### Accuracy 79 | ``` 80 | # FP32 81 | numactl --physcpubind=0-23 --membind=0 python inference.py --symbol-file=./update_model/embedding-fuse.json --param-file=checkpoint-0000.params --accuracy 82 | # Int8 83 | numactl --physcpubind=0-23 --membind=0 python inference.py --symbol-file=./update_model/embedding_fuse-quantized-1953batches-naive-symbol.json --param-file=WD-quantized-0000.params --accuracy 84 | ``` 85 | 86 | # 3. Steps to reproduce performance with OOB MXNet and all internal optimization patches [Best so far] 87 | ``` 88 | git clone --recursive https://github.com/apache/incubator-mxnet.git 89 | cd incubator-mxnet 90 | git submodule update --recursive 91 | git checkout f1de8e51999ce3acaa95538d21a91fe43a0286ec 92 | git apply --ignore-space-change --ignore-whitespace patch/patch.diff 93 | cd 3rdparty/mkldnn 94 | git checkout 08bd90cca77683dd5d1c98068cea8b92ed05784d 95 | cd ../..
96 | make -j USE_MKLDNN=1 USE_BLAS=mkl USE_OPENCV=1 97 | cd python 98 | python setup.py install [--user] 99 | export LD_LIBRARY_PATH=$PWD/lib:$LD_LIBRARY_PATH 100 | export PYTHONPATH=$PWD/python:$PYTHONPATH 101 | ``` 102 | ## Run the wide&deep: 103 | ``` 104 | cd optimized-models/mxnet/wide_deep_criteo/ 105 | python train.py 106 | python wd_gen_qsym_subgraph.py 107 | export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 108 | export OMP_NUM_THREADS=24 109 | ``` 110 | ### Performance 111 | ``` 112 | # FP32 113 | numactl --physcpubind=0-23 --membind=0 python inference.py --symbol-file=./model/embedding-fuse.json --param-file=checkpoint-0000.params 114 | # Int8 115 | numactl --physcpubind=0-23 --membind=0 python inference.py --symbol-file=./model/embedding_fuse-quantized-1953batches-naive-symbol.json --param-file=WD-quantized-0000.params 116 | ``` 117 | ### Accuracy 118 | ``` 119 | # FP32 120 | numactl --physcpubind=0-23 --membind=0 python inference.py --symbol-file=./model/embedding-fuse.json --param-file=checkpoint-0000.params --accuracy 121 | # Int8 122 | numactl --physcpubind=0-23 --membind=0 python inference.py --symbol-file=./model/embedding_fuse-quantized-1953batches-naive-symbol.json --param-file=WD-quantized-0000.params --accuracy 123 | ``` 124 | 125 | # FP32 Outputs 126 | ``` 127 | INFO:logger:Performance Mode 128 | INFO:logger:batch size = 1024 for inference 129 | INFO:logger:label_name = softmax_label 130 | INFO:logger:Loading symbol from file dl_framework-optimized-models/mxnet/wide_deep_criteo/embedding-fuse.json 131 | INFO:logger:Loading params from file dl_framework-optimized-models/mxnet/wide_deep_criteo/checkpoint-0000.params 132 | INFO:logger:Running model embedding-fuse.json for inference 133 | INFO:logger:Run [7812] Batchs Speed: xxxxxx.xx samples/sec 134 | ``` 135 | 136 | # Int8 Outputs 137 | ``` 138 | INFO:logger:Performance Mode 139 | INFO:logger:batch size = 1024 for inference 140 | INFO:logger:label_name = softmax_label 141 | INFO:logger:Loading symbol from file dl_framework-optimized-models/mxnet/wide_deep_criteo/embedding_fuse-quantized-1953batches-naive-symbol.json 142 | INFO:logger:Loading params from file dl_framework-optimized-models/mxnet/wide_deep_criteo/WD-quantized-0000.params 143 | INFO:logger:Running model embedding_fuse-quantized-1953batches-naive-symbol.json for inference 144 | INFO:logger:Run [7812] Batchs Speed: xxxxxx.xx samples/sec 145 | ``` 146 | -------------------------------------------------------------------------------- /mxnet/wide_deep_criteo/inference.py: -------------------------------------------------------------------------------- 1 | """inference script to support accuracy and performance benchmark""" 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied.
See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | 19 | import argparse 20 | from datetime import datetime 21 | import logging 22 | import ctypes 23 | import time 24 | import os 25 | import pickle 26 | import mxnet as mx 27 | 28 | from mxnet import nd 29 | from mxnet.base import check_call, _LIB 30 | 31 | 32 | 33 | def load_model(_symbol_file, _param_file, _logger=None): 34 | """load existing symbol model""" 35 | cur_path = os.path.dirname(os.path.realpath(__file__)) 36 | symbol_file_path = os.path.join(cur_path, _symbol_file) 37 | if _logger is not None: 38 | _logger.info('Loading symbol from file %s' % symbol_file_path) 39 | symbol = mx.sym.load(symbol_file_path) 40 | 41 | param_file_path = os.path.join(cur_path, _param_file) 42 | if _logger is not None: 43 | _logger.info('Loading params from file %s' % param_file_path) 44 | save_dict = nd.load(param_file_path) 45 | _arg_params = {} 46 | _aux_params = {} 47 | for k, v in save_dict.items(): 48 | tp, name = k.split(':', 1) 49 | if tp == 'arg': 50 | _arg_params[name] = v 51 | if tp == 'aux': 52 | _aux_params[name] = v 53 | return symbol, _arg_params, _aux_params 54 | 55 | def advance_data_iter(data_iter, n): 56 | """use to warm up data for performance benchmark""" 57 | assert n >= 0 58 | if n == 0: 59 | return data_iter 60 | has_next_batch = True 61 | while has_next_batch: 62 | try: 63 | data_iter.next() 64 | n -= 1 65 | if n == 0: 66 | return data_iter 67 | except StopIteration: 68 | has_next_batch = False 69 | 70 | CRITEO = { 71 | 'train': 'train.csv', 72 | 'test': 'eval.csv', 73 | 'num_linear_features': 26000, 74 | 'num_embed_features': 26, 75 | 'num_cont_features': 13, 76 | 'embed_input_dims': 1000, 77 | 'hidden_units': [32, 1024, 512, 256], 78 | } 79 | def load_object(filename): 80 | with open(filename, 'rb') as input: 81 | return pickle.load(input) 82 | if __name__ == '__main__': 83 | parser = argparse.ArgumentParser(description='Score a model on a dataset') 84 | 85 | parser.add_argument('--symbol-file', type=str, default='checkpoint-symbol.json', help='symbol file path') 86 | parser.add_argument('--param-file', type=str, default='checkpoint-0000.params', help='param file path') 87 | parser.add_argument('--batch-size', type=int, default=1024) 88 | parser.add_argument('--label-name', type=str, default='softmax_label') 89 | parser.add_argument('--accuracy', action='store_true') 90 | parser.add_argument('--shuffle-dataset', action='store_true', default=True, 91 | help='shuffle the calibration dataset') 92 | parser.add_argument('--num-omp-threads', type=int, default=28) 93 | parser.add_argument('--num-batches', type=int, default=100000) 94 | parser.add_argument('--num-warmup', type=int, default=5000) 95 | parser.add_argument('--cuda', action='store_true', help='Inference on GPU with CUDA') 96 | parser.add_argument('--gpu-id', type=int, default=0) 97 | args = parser.parse_args() 98 | 99 | ctx = mx.gpu(args.gpu_id) if args.cuda else mx.cpu() 100 | 101 | logging.basicConfig() 102 | logger = logging.getLogger('logger') 103 | logger.setLevel(logging.INFO) 104 | 105 | if args.accuracy is True: 106 | logger.info('Accuracy Mode') 107 | else: 108 | logger.info('Performance Mode') 109 | 110 | symbol_file = args.symbol_file 111 | param_file = args.param_file 112 | 113 | 114 | batch_size = args.batch_size 115 | logger.info('batch size = %d for inference', batch_size) 116 | label_name = args.label_name 117 | logger.info('label_name = %s', label_name) 118 | 119 | if args.accuracy is 
False: 120 | val_csr = load_object('train_csr.pkl') 121 | val_dns = load_object('train_dns.pkl') 122 | val_label = load_object('train_label.pkl') 123 | else: 124 | val_csr = load_object('val_csr.pkl') 125 | val_dns = load_object('val_dns.pkl') 126 | val_label = load_object('val_label.pkl') 127 | 128 | # creating data iterator 129 | data = mx.io.NDArrayIter({'csr_data': val_csr, 'dns_data': val_dns}, 130 | {'softmax_label': val_label}, batch_size, 131 | shuffle=False, last_batch_handle='discard') 132 | 133 | # loading model 134 | sym, arg_params, aux_params = load_model(symbol_file, param_file, logger) 135 | 136 | 137 | # make sure that fp32 inference works on the same images as calibrated quantized model 138 | 139 | logger.info('Running model %s for inference', symbol_file) 140 | 141 | acc_m = mx.metric.create('acc') 142 | mod = mx.mod.Module(symbol=sym, context=ctx, data_names=['csr_data', 'dns_data'], label_names=[label_name, ]) 143 | mod.bind(for_training=False, 144 | data_shapes=data.provide_data, 145 | label_shapes=data.provide_label) 146 | mod.set_params(arg_params, aux_params) 147 | 148 | check_call(_LIB.MXSetNumOMPThreads(ctypes.c_int(args.num_omp_threads))) 149 | batch_data = [] 150 | nbatch = 0 151 | while nbatch < args.num_batches: 152 | for batch in data: 153 | batch_data.append(batch) 154 | nbatch += 1 155 | if nbatch < args.num_batches: 156 | continue 157 | else: 158 | break 159 | data.hard_reset() 160 | #for data warmup 161 | wi = args.num_warmup 162 | i = 0 163 | for batch in batch_data: 164 | if i < wi: 165 | mod.forward(batch, is_train=False) 166 | i += 1 167 | else: 168 | break 169 | data.hard_reset() 170 | mx.nd.waitall() 171 | #real run 172 | if "DO_WIDE_DEEP_PROFILING" in os.environ: 173 | print("wide_deep profiling start !!!!!!!!!!!!!") 174 | mx.profiler.set_config(profile_symbolic=True, profile_imperative=True, profile_memory=False, profile_api=False) 175 | mx.profiler.set_state('run') 176 | nbatch = 0 177 | tic = time.time() 178 | logger.info('INFERENCING STARTED: %s', datetime.now().strftime("%m/%d/%Y %H:%M:%S.%f")[:-3]) 179 | for batch in batch_data: 180 | nbatch += 1 181 | mod.forward(batch, is_train=False) 182 | if args.accuracy is True: 183 | for output in mod.get_outputs(): 184 | output.wait_to_read() 185 | mod.update_metric(acc_m, batch.label) 186 | else: 187 | mx.nd.waitall() 188 | logger.info('INFERENCING FINISHED: %s', datetime.now().strftime("%m/%d/%Y %H:%M:%S.%f")[:-3]) 189 | speed = nbatch * batch_size / (time.time() - tic) 190 | logger.info("Run [%d] Batchs \tSpeed: %.2f samples/sec", nbatch, speed) 191 | if args.accuracy is True: 192 | logger.info(acc_m.get()) 193 | if "DO_WIDE_DEEP_PROFILING" in os.environ: 194 | print("wide_deep profiling end !") 195 | mx.profiler.set_state('stop') 196 | profiler_info = mx.profiler.dumps() 197 | print(profiler_info) 198 | -------------------------------------------------------------------------------- /mxnet/wide_deep_criteo/wd_gen_qsym_subgraph.py: -------------------------------------------------------------------------------- 1 | """Generate quantized graph based on original fp32 graph""" 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. 
You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | 19 | import argparse 20 | import os 21 | import logging 22 | #import ctypes 23 | import pickle 24 | from mxnet import nd 25 | import mxnet as mx 26 | from mxnet.contrib.quantization import quantize_model 27 | 28 | 29 | 30 | def load_model(symbol_file, param_file, mlogger=None): 31 | """load existing symbol model""" 32 | cur_path = os.path.dirname(os.path.realpath(__file__)) 33 | symbol_file_path = os.path.join(cur_path, symbol_file) 34 | if mlogger is not None: 35 | mlogger.info('Loading symbol from file %s' % symbol_file_path) 36 | symbol = mx.sym.load(symbol_file_path) 37 | 38 | param_file_path = os.path.join(cur_path, param_file) 39 | if mlogger is not None: 40 | mlogger.info('Loading params from file %s' % param_file_path) 41 | save_dict = nd.load(param_file_path) 42 | marg_params = {} 43 | maux_params = {} 44 | for k, v in save_dict.items(): 45 | tp, name = k.split(':', 1) 46 | if tp == 'arg': 47 | marg_params[name] = v 48 | if tp == 'aux': 49 | maux_params[name] = v 50 | return symbol, marg_params, maux_params 51 | 52 | 53 | def save_symbol(fname, symbol, slogger=None): 54 | if slogger is not None: 55 | slogger.info('Saving symbol into file at %s' % fname) 56 | symbol.save(fname) 57 | 58 | 59 | def save_params(fname, parg_params, paux_params, plogger=None): 60 | if plogger is not None: 61 | plogger.info('Saving params into file at %s' % fname) 62 | save_dict = {('arg:%s' % k): v.as_in_context(mx.cpu()) for k, v in parg_params.items()} 63 | save_dict.update({('aux:%s' % k): v.as_in_context(mx.cpu()) for k, v in paux_params.items()}) 64 | mx.nd.save(fname, save_dict) 65 | 66 | def load_object(filename): 67 | with open(filename, 'rb') as input: 68 | return pickle.load(input) 69 | if __name__ == '__main__': 70 | parser = argparse.ArgumentParser(description='Generate a calibrated quantized model from a FP32 model') 71 | parser.add_argument('--ctx', type=str, default='cpu') 72 | 73 | parser.add_argument('--batch-size', type=int, default=10000) 74 | parser.add_argument('--label-name', type=str, default='softmax_label') 75 | parser.add_argument('--calib-dataset', type=str, default='data/adult.data', 76 | help='path of the calibration dataset') 77 | parser.add_argument('--num-calib-batches', type=int, default=162, 78 | help='number of batches for calibration') 79 | parser.add_argument('--exclude-first-conv', action='store_true', default=True, 80 | help='excluding quantizing the first conv layer since the' 81 | ' number of channels is usually not a multiple of 4 in that layer' 82 | ' which does not satisfy the requirement of cuDNN') 83 | parser.add_argument('--calib-mode', type=str, default='naive', 84 | help='calibration mode used for generating calibration table for the quantized symbol; supports' 85 | ' 1. none: no calibration will be used. The thresholds for quantization will be calculated' 86 | ' on the fly. This will result in inference speed slowdown and loss of accuracy' 87 | ' in general.' 88 | ' 2. naive: simply take min and max values of layer outputs as thresholds for' 89 | ' quantization. 
In general, the inference accuracy worsens with more examples used in' 90 | ' calibration. It is recommended to use `entropy` mode as it produces more accurate' 91 | ' inference results.' 92 | ' 3. entropy: calculate KL divergence of the fp32 output and quantized output for optimal' 93 | ' thresholds. This mode is expected to produce the best inference accuracy of all three' 94 | ' kinds of quantized models if the calibration dataset is representative enough of the' 95 | ' inference dataset.') 96 | parser.add_argument('--quantized-dtype', type=str, default='uint8', 97 | choices=['int8', 'uint8'], 98 | help='quantization destination data type for input data') 99 | args = parser.parse_args() 100 | 101 | if args.ctx == 'gpu': 102 | ctx = mx.gpu(0) 103 | elif args.ctx == 'cpu': 104 | ctx = mx.cpu(0) 105 | else: 106 | raise ValueError('ctx %s is not supported in this script' % args.ctx) 107 | 108 | logging.basicConfig() 109 | logger = logging.getLogger('logger') 110 | logger.setLevel(logging.INFO) 111 | 112 | # get batch size 113 | batch_size = args.batch_size 114 | logger.info('batch size = %d for calibration', batch_size) 115 | # get number of batches for calibration 116 | num_calib_batches = args.num_calib_batches 117 | 118 | calib_mode = args.calib_mode 119 | if calib_mode != 'none': 120 | logger.info('number of batches = %d for calibration', num_calib_batches) 121 | 122 | val_csr = load_object('val_csr.pkl') 123 | val_dns = load_object('val_dns.pkl') 124 | val_label = load_object('val_label.pkl') 125 | 126 | # creating data iterator 127 | data = mx.io.NDArrayIter({'csr_data': val_csr, 'dns_data': val_dns}, 128 | {'softmax_label': val_label}, batch_size, 129 | shuffle=True, last_batch_handle='discard') 130 | # loading model 131 | sym, arg_params, aux_params = load_model('checkpoint-symbol.json', 'checkpoint-0000.params', logger) 132 | calib_layer = lambda name: (name.find('fullyconnected') != -1 or name.find('FullyConnected') != -1) 133 | cqsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params, 134 | data_names=['csr_data', 'dns_data'], 135 | label_names=['softmax_label', ], 136 | ctx=ctx, 137 | calib_mode=calib_mode, calib_data=data, 138 | num_calib_examples=num_calib_batches*batch_size, 139 | calib_layer=calib_layer, quantized_dtype=args.quantized_dtype, 140 | calib_quantize_op=True, 141 | logger=logger) 142 | if calib_mode == 'entropy': 143 | suffix = '-quantized-%dbatches-entropy' % num_calib_batches 144 | elif calib_mode == 'naive': 145 | suffix = '-quantized-%dbatches-naive' % num_calib_batches 146 | else: 147 | raise ValueError('unknown calibration mode %s received, only supports `none`, `naive`, and `entropy`' 148 | % calib_mode) 149 | prefix = 'WD' 150 | sym_name = '%s-symbol.json' % (prefix + suffix) 151 | cqsym = cqsym.get_backend_symbol('MKLDNN_POST_FC_QUANTIZE') 152 | cqsym = cqsym.get_backend_symbol('MKLDNN_QFC_POST_RELU_FUSED') 153 | cqsym = cqsym.get_backend_symbol('MKLDNN_PARALLEL_EMBEDDING') 154 | save_symbol(sym_name, cqsym, logger) 155 | param_name = '%s-%04d.params' % (prefix + '-quantized', 0) 156 | save_params(param_name, qarg_params, aux_params, logger) 157 | -------------------------------------------------------------------------------- /mxnet/wide_deep_criteo/wd_gen_qsym_subgraph_update.py: -------------------------------------------------------------------------------- 1 | """Generate quantized graph based on original fp32 graph""" 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more 
contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | 19 | import argparse 20 | import os 21 | import logging 22 | #import ctypes 23 | import pickle 24 | from mxnet import nd 25 | import mxnet as mx 26 | from mxnet.contrib.quantization import quantize_model 27 | 28 | 29 | 30 | def load_model(symbol_file, param_file, mlogger=None): 31 | """load existing symbol model""" 32 | cur_path = os.path.dirname(os.path.realpath(__file__)) 33 | symbol_file_path = os.path.join(cur_path, symbol_file) 34 | if mlogger is not None: 35 | mlogger.info('Loading symbol from file %s' % symbol_file_path) 36 | symbol = mx.sym.load(symbol_file_path) 37 | 38 | param_file_path = os.path.join(cur_path, param_file) 39 | if mlogger is not None: 40 | mlogger.info('Loading params from file %s' % param_file_path) 41 | save_dict = nd.load(param_file_path) 42 | marg_params = {} 43 | maux_params = {} 44 | for k, v in save_dict.items(): 45 | tp, name = k.split(':', 1) 46 | if tp == 'arg': 47 | marg_params[name] = v 48 | if tp == 'aux': 49 | maux_params[name] = v 50 | return symbol, marg_params, maux_params 51 | 52 | 53 | def save_symbol(fname, symbol, slogger=None): 54 | if slogger is not None: 55 | slogger.info('Saving symbol into file at %s' % fname) 56 | symbol.save(fname) 57 | 58 | 59 | def save_params(fname, parg_params, paux_params, plogger=None): 60 | if plogger is not None: 61 | plogger.info('Saving params into file at %s' % fname) 62 | save_dict = {('arg:%s' % k): v.as_in_context(mx.cpu()) for k, v in parg_params.items()} 63 | save_dict.update({('aux:%s' % k): v.as_in_context(mx.cpu()) for k, v in paux_params.items()}) 64 | mx.nd.save(fname, save_dict) 65 | 66 | def load_object(filename): 67 | with open(filename, 'rb') as input: 68 | return pickle.load(input) 69 | if __name__ == '__main__': 70 | parser = argparse.ArgumentParser(description='Generate a calibrated quantized model from a FP32 model') 71 | parser.add_argument('--ctx', type=str, default='cpu') 72 | 73 | parser.add_argument('--batch-size', type=int, default=10000) 74 | parser.add_argument('--label-name', type=str, default='softmax_label') 75 | parser.add_argument('--calib-dataset', type=str, default='data/adult.data', 76 | help='path of the calibration dataset') 77 | parser.add_argument('--num-calib-batches', type=int, default=162, 78 | help='number of batches for calibration') 79 | parser.add_argument('--exclude-first-conv', action='store_true', default=True, 80 | help='excluding quantizing the first conv layer since the' 81 | ' number of channels is usually not a multiple of 4 in that layer' 82 | ' which does not satisfy the requirement of cuDNN') 83 | parser.add_argument('--calib-mode', type=str, default='naive', 84 | help='calibration mode used for generating calibration table for the quantized symbol; supports' 85 | ' 1. 
none: no calibration will be used. The thresholds for quantization will be calculated' 86 | ' on the fly. This will result in inference speed slowdown and loss of accuracy' 87 | ' in general.' 88 | ' 2. naive: simply take min and max values of layer outputs as thresholds for' 89 | ' quantization. In general, the inference accuracy worsens with more examples used in' 90 | ' calibration. It is recommended to use `entropy` mode as it produces more accurate' 91 | ' inference results.' 92 | ' 3. entropy: calculate KL divergence of the fp32 output and quantized output for optimal' 93 | ' thresholds. This mode is expected to produce the best inference accuracy of all three' 94 | ' kinds of quantized models if the calibration dataset is representative enough of the' 95 | ' inference dataset.') 96 | parser.add_argument('--quantized-dtype', type=str, default='uint8', 97 | choices=['int8', 'uint8'], 98 | help='quantization destination data type for input data') 99 | args = parser.parse_args() 100 | 101 | if args.ctx == 'gpu': 102 | ctx = mx.gpu(0) 103 | elif args.ctx == 'cpu': 104 | ctx = mx.cpu(0) 105 | else: 106 | raise ValueError('ctx %s is not supported in this script' % args.ctx) 107 | 108 | logging.basicConfig() 109 | logger = logging.getLogger('logger') 110 | logger.setLevel(logging.INFO) 111 | 112 | # get batch size 113 | batch_size = args.batch_size 114 | logger.info('batch size = %d for calibration', batch_size) 115 | # get number of batches for calibration 116 | num_calib_batches = args.num_calib_batches 117 | 118 | calib_mode = args.calib_mode 119 | if calib_mode != 'none': 120 | logger.info('number of batches = %d for calibration', num_calib_batches) 121 | 122 | val_csr = load_object('val_csr.pkl') 123 | val_dns = load_object('val_dns.pkl') 124 | val_label = load_object('val_label.pkl') 125 | 126 | # creating data iterator 127 | data = mx.io.NDArrayIter({'csr_data': val_csr, 'dns_data': val_dns}, 128 | {'softmax_label': val_label}, batch_size, 129 | shuffle=True, last_batch_handle='discard') 130 | # loading model 131 | sym, arg_params, aux_params = load_model('checkpoint-symbol.json', 'checkpoint-0000.params', logger) 132 | 133 | calib_layer = lambda name: (name.find('fullyconnected') != -1 or \ 134 | name.find('FullyConnected') != -1 or \ 135 | name.find('fully_connected') != -1 or \ 136 | name.find('concat0_output') != -1) 137 | sym = sym.get_backend_symbol('MKLDNN') 138 | excluded_sym_names = ['concat0', '_plus0'] 139 | cqsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params, 140 | data_names=['csr_data', 'dns_data'], 141 | label_names=['softmax_label', ], 142 | ctx=ctx, excluded_sym_names=excluded_sym_names, 143 | calib_mode=calib_mode, calib_data=data, 144 | num_calib_examples=num_calib_batches*batch_size, 145 | calib_layer=calib_layer, quantized_dtype=args.quantized_dtype, 146 | logger=logger) 147 | if calib_mode == 'entropy': 148 | suffix = '-quantized-%dbatches-entropy' % num_calib_batches 149 | elif calib_mode == 'naive': 150 | suffix = '-quantized-%dbatches-naive' % num_calib_batches 151 | else: 152 | raise ValueError('unknown calibration mode %s received, only supports `none`, `naive`, and `entropy`' 153 | % calib_mode) 154 | prefix = 'WD' 155 | sym_name = '%s-symbol.json' % (prefix + suffix) 156 | cqsym = cqsym.get_backend_symbol('MKLDNN_QUANTIZE') 157 | save_symbol(sym_name, cqsym, logger) 158 | param_name = '%s-%04d.params' % (prefix + '-quantized', 0) 159 | save_params(param_name, qarg_params, aux_params, logger) 160 | 
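Both quantization scripts end by saving a calibrated symbol (with the defaults above: `WD-quantized-162batches-naive-symbol.json`) and a params file (`WD-quantized-0000.params`). As a minimal sketch of how those saved artifacts might be scored afterwards, reusing the `load_model` and `load_object` helpers defined in this script, and assuming an MXNet build that provides the MKLDNN subgraph passes plus the `val_*.pkl` files on disk:

```python
import mxnet as mx

# Sketch: score the calibrated int8 model on the pickled validation set.
sym, arg_params, aux_params = load_model('WD-quantized-162batches-naive-symbol.json',
                                         'WD-quantized-0000.params')
data = mx.io.NDArrayIter({'csr_data': load_object('val_csr.pkl'),
                          'dns_data': load_object('val_dns.pkl')},
                         {'softmax_label': load_object('val_label.pkl')},
                         batch_size=10000, shuffle=False, last_batch_handle='discard')
mod = mx.mod.Module(symbol=sym, context=mx.cpu(),
                    data_names=['csr_data', 'dns_data'], label_names=['softmax_label'])
mod.bind(for_training=False, data_shapes=data.provide_data,
         label_shapes=data.provide_label)
mod.set_params(arg_params, aux_params)
acc = mx.metric.create('acc')
mod.score(data, acc)  # iterate the validation iterator and update the metric
print(acc.get())
```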
-------------------------------------------------------------------------------- /pytorch/imagenet/imagenet/README.md: -------------------------------------------------------------------------------- 1 | # ImageNet training in PyTorch 2 | 3 | This implements training of popular model architectures, such as ResNet, AlexNet, and VGG on the ImageNet dataset. 4 | 5 | ## Requirements 6 | 7 | - Install PyTorch ([pytorch.org](http://pytorch.org)) 8 | - `pip install -r requirements.txt` 9 | - Download the ImageNet dataset and move validation images to labeled subfolders 10 | - To do this, you can use the following script: https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh 11 | 12 | ## Training 13 | 14 | To train a model, run `main.py` with the desired model architecture and the path to the ImageNet dataset: 15 | 16 | ```bash 17 | python main.py -a resnet18 [imagenet-folder with train and val folders] 18 | ``` 19 | 20 | The default learning rate schedule starts at 0.1 and decays by a factor of 10 every 30 epochs. This is appropriate for ResNet and models with batch normalization, but too high for AlexNet and VGG. Use 0.01 as the initial learning rate for AlexNet or VGG: 21 | 22 | ```bash 23 | python main.py -a alexnet --lr 0.01 [imagenet-folder with train and val folders] 24 | ``` 25 | 26 | ## Multi-processing Distributed Data Parallel Training ON GPU 27 | 28 | You should always use the NCCL backend for multi-processing distributed training since it currently provides the best distributed training performance. 29 | 30 | ### Single node, multiple GPUs: 31 | 32 | ```bash 33 | python main.py -a resnet50 --dist-url 'tcp://127.0.0.1:FREEPORT' --dist-backend 'nccl' --multiprocessing-distributed --world-size 1 --rank 0 [imagenet-folder with train and val folders] 34 | ``` 35 | 36 | ### Multiple nodes: 37 | 38 | Node 0: 39 | ```bash 40 | python main.py -a resnet50 --dist-url 'tcp://IP_OF_NODE0:FREEPORT' --dist-backend 'nccl' --multiprocessing-distributed --world-size 2 --rank 0 [imagenet-folder with train and val folders] 41 | ``` 42 | 43 | Node 1: 44 | ```bash 45 | python main.py -a resnet50 --dist-url 'tcp://IP_OF_NODE0:FREEPORT' --dist-backend 'nccl' --multiprocessing-distributed --world-size 2 --rank 1 [imagenet-folder with train and val folders] 46 | ``` 47 | 48 | ## Multi-processing Distributed Data Parallel Training ON CPU 49 | 50 | ### One node, 2 instances: 51 | ```bash 52 | python main.py -a resnet18 --dist-url 'tcp://192.168.20.11:22384' --dist-backend 'gloo' --ppn 2 --world-size 1 --rank 0 -b 128 --mkldnn --multiprocessing-distributed /lustre/dataset/imagenet/img/ 53 | ``` 54 | ### One node (with two sockets), 2 instances (adjust the num_threads in the running script): 55 | ```bash 56 | ./run_socket.sh 57 | ``` 58 | ### Two nodes, 2 instances on each: 59 | 60 | Node 1: 61 | ```bash 62 | python main.py -a resnet18 --dist-url 'tcp://192.168.20.11:22384' --dist-backend 'gloo' --ppn 2 --world-size 2 --rank 0 -b 128 --mkldnn --multiprocessing-distributed /lustre/dataset/imagenet/img/ 63 | ``` 64 | 65 | Node 2: 66 | ```bash 67 | python main.py -a resnet18 --dist-url 'tcp://192.168.20.11:22384' --dist-backend 'gloo' --ppn 2 --world-size 2 --rank 1 -b 128 --mkldnn --multiprocessing-distributed /lustre/dataset/imagenet/img/ 68 | ``` 69 | 70 | ## INT8 inference 71 | 72 | The resnet50 and resnext101 models are currently supported. 
73 | Run `main.py` with the desired model architecture and the path to the ImageNet dataset: 74 | 75 | ```bash 76 | python -u main.py -e -j $workers -a resnet50 -b 16 --INT8 "INT8_only" -qs "perChannel" --iter-calib 2500 -w 50 -qe "fbgemm" -i 100 [imagenet-folder with train and val folders] 77 | ``` 78 | 79 | ## Usage 80 | 81 | ``` 82 | usage: main.py [-h] [-a ARCH] [-j N] [--epochs N] [--start-epoch N] [-b N] 83 | [--lr LR] [--momentum M] [--wd W] [-p N] [--resume PATH] [-e] 84 | [--pretrained] [--world-size WORLD_SIZE] [--rank RANK] 85 | [--ppn PPN] [--dist-url DIST_URL] [--dist-backend DIST_BACKEND] 86 | [--seed SEED] [--gpu GPU] [--multiprocessing-distributed] 87 | [--mkldnn] [--no-cuda] [-i N] [--iter-calib N] [-qe QENGINE] 88 | [-w N] [--INT8 INT8] [-t] [-qs QSCHEME] [-r] 89 | DIR 90 | 91 | PyTorch ImageNet Training 92 | 93 | positional arguments: 94 | DIR path to dataset 95 | 96 | optional arguments: 97 | -h, --help show this help message and exit 98 | -a ARCH, --arch ARCH model architecture: alexnet | densenet121 | 99 | densenet161 | densenet169 | densenet201 | googlenet | 100 | inception_v3 | mnasnet0_5 | mnasnet0_75 | mnasnet1_0 | 101 | mnasnet1_3 | mobilenet_v2 | resnet101 | resnet152 | 102 | resnet18 | resnet34 | resnet50 | resnext101_32x8d | 103 | resnext50_32x4d | shufflenet_v2_x0_5 | 104 | shufflenet_v2_x1_0 | shufflenet_v2_x1_5 | 105 | shufflenet_v2_x2_0 | squeezenet1_0 | squeezenet1_1 | 106 | vgg11 | vgg11_bn | vgg13 | vgg13_bn | vgg16 | vgg16_bn 107 | | vgg19 | vgg19_bn | wide_resnet101_2 | 108 | wide_resnet50_2 (default: resnet18) 109 | -j N, --workers N number of data loading workers (default: 4) 110 | --epochs N number of total epochs to run 111 | --start-epoch N manual epoch number (useful on restarts) 112 | -b N, --batch-size N mini-batch size (default: 256), this is the total 113 | batch size of all GPUs on the current node when using 114 | Data Parallel or Distributed Data Parallel 115 | --lr LR, --learning-rate LR 116 | initial learning rate 117 | --momentum M momentum 118 | --wd W, --weight-decay W 119 | weight decay (default: 1e-4) 120 | -p N, --print-freq N print frequency (default: 10) 121 | --resume PATH path to latest checkpoint (default: none) 122 | -e, --evaluate evaluate model on validation set 123 | --pretrained use pre-trained model 124 | --world-size WORLD_SIZE 125 | number of nodes for distributed training 126 | --rank RANK node rank for distributed training 127 | --ppn PPN number of processes on each node of distributed 128 | training 129 | --dist-url DIST_URL url used to set up distributed training 130 | --dist-backend DIST_BACKEND 131 | distributed backend 132 | --seed SEED seed for initializing training. 133 | --gpu GPU GPU id to use. 134 | --multiprocessing-distributed 135 | Use multi-processing distributed training to launch N 136 | processes per node, which has N GPUs. This is the 137 | fastest way to use PyTorch for either single node or 138 | multi node data parallel training 139 | --mkldnn use mkldnn weight cache 140 | --no-cuda disable CUDA 141 | -i N, --iterations N number of total iterations to run 142 | --iter-calib N number of iterations when calibration to run 143 | -qe QENGINE, --qengine QENGINE 144 | Choose qengine to run. "all", "fbgemm" or 145 | "mkldnn".(DEFAULT: all) 146 | -w N, --warmup-iterations N 147 | number of warmup iterations to run 148 | --INT8 INT8 Choose run mode. "no_INT8", "calibration_olny", 149 | "INT8_only", "INT8_and_fp32".(DEFAULT: no_INT8) 150 | -t, --profile Trigger profile on current topology. 
151 | -qs QSCHEME, --qscheme QSCHEME 152 | The scheme of quantizer:"perTensor", "perChannel" 153 | -r, --reduce_range Choose reduce range flag. True or False. 154 | ``` 155 | ## Tips 156 | 157 | To get better performance with the MKLDNN backend, use a better memory allocator such as TCmalloc or Jemalloc. 158 | ### How to use TCmalloc 159 | 1. Install TCmalloc: 160 | ``` 161 | git clone https://github.com/gperftools/gperftools.git 162 | ./autogen.sh 163 | ./configure 164 | make 165 | make check  # optional 166 | make install 167 | make clean 168 | ``` 169 | 2. Use TCmalloc: 170 | `export LD_PRELOAD=/lib/libtcmalloc.so` 171 | 3. Fine-tune: 172 | https://gperftools.github.io/gperftools/tcmalloc.html 173 | 174 | ### How to use Jemalloc 175 | 1. Install Jemalloc: 176 | https://github.com/jemalloc/jemalloc/blob/dev/INSTALL.md 177 | 2. Use Jemalloc: 178 | `export LD_PRELOAD=/lib/libjemalloc.so` 179 | 3. Fine-tune: 180 | https://github.com/jemalloc/jemalloc/blob/dev/TUNING.md 181 | -------------------------------------------------------------------------------- /pytorch/benchmark_tools/common/common_mlperf.py: -------------------------------------------------------------------------------- 1 | """ 2 | dataset related classes and methods 3 | """ 4 | 5 | # pylint: disable=unused-argument,missing-docstring 6 | 7 | import logging 8 | import sys 9 | import time 10 | 11 | import numpy as np 12 | 13 | from PIL import Image 14 | 15 | logging.basicConfig(level=logging.INFO) 16 | log = logging.getLogger("dataset") 17 | 18 | class Item(): 19 | def __init__(self, label, img, idx): 20 | self.label = label 21 | self.img = img 22 | self.idx = idx 23 | self.start = time.time() 24 | 25 | 26 | def usleep(sec): 27 | if sys.platform == 'win32': 28 | # on Windows time.sleep() doesn't work too well 29 | import ctypes 30 | kernel32 = ctypes.windll.kernel32 31 | timer = kernel32.CreateWaitableTimerA(ctypes.c_void_p(), True, ctypes.c_void_p()) 32 | delay = ctypes.c_longlong(int(-1 * (10 * 1000000 * sec))) 33 | kernel32.SetWaitableTimer(timer, ctypes.byref(delay), 0, ctypes.c_void_p(), ctypes.c_void_p(), False) 34 | kernel32.WaitForSingleObject(timer, 0xffffffff) 35 | else: 36 | time.sleep(sec) 37 | 38 | 39 | class Dataset(): 40 | def __init__(self): 41 | self.arrival = None 42 | self.image_list = [] 43 | self.label_list = [] 44 | self.image_list_inmemory = {} 45 | self.last_loaded = -1 46 | 47 | def preprocess(self, use_cache=True): 48 | raise NotImplementedError("Dataset:preprocess") 49 | 50 | def get_item_count(self): 51 | return len(self.image_list) 52 | 53 | def get_list(self): 54 | raise NotImplementedError("Dataset:get_list") 55 | 56 | def load_query_samples(self, sample_list): 57 | self.image_list_inmemory = {} 58 | for sample in sample_list: 59 | self.image_list_inmemory[sample], _ = self.get_item(sample) 60 | self.last_loaded = time.time() 61 | 62 | def unload_query_samples(self, sample_list): 63 | if sample_list: 64 | for sample in sample_list: 65 | if sample in self.image_list_inmemory: 66 | del self.image_list_inmemory[sample] 67 | else: 68 | self.image_list_inmemory = {} 69 | 70 | def get_samples(self, id_list): 71 | data = np.array([self.image_list_inmemory[id] for id in id_list]) 72 | return data, self.label_list[id_list] 73 | 74 | def get_item_loc(self, id): 75 | raise NotImplementedError("Dataset:get_item_loc") 76 | 77 | 78 | # 79 | # Post processing 80 | # 81 | class PostProcessCommon: 82 | def __init__(self, offset=0): 83 | self.offset = offset 84 | self.good = 0 85 | self.total = 0 86 | 87 | def 
__call__(self, results, ids, expected=None, result_dict=None): 88 | processed_results = [] 89 | n = len(results[0]) 90 | for idx in range(0, n): 91 | result = results[0][idx] + self.offset 92 | processed_results.append([result]) 93 | if result == expected[idx]: 94 | self.good += 1 95 | self.total += n 96 | return processed_results 97 | 98 | def add_results(self, results): 99 | pass 100 | 101 | def start(self): 102 | self.good = 0 103 | self.total = 0 104 | 105 | def finalize(self, results, ds=False, output_dir=None): 106 | results["good"] = self.good 107 | results["total"] = self.total 108 | 109 | 110 | class PostProcessArgMax: 111 | def __init__(self, offset=0): 112 | self.offset = offset 113 | self.good = 0 114 | self.total = 0 115 | 116 | def __call__(self, results, ids, expected=None, result_dict=None): 117 | processed_results = [] 118 | results = np.argmax(results[0], axis=1) 119 | n = results.shape[0] 120 | for idx in range(0, n): 121 | result = results[idx] + self.offset 122 | processed_results.append([result]) 123 | if result == expected[idx]: 124 | self.good += 1 125 | self.total += n 126 | return processed_results 127 | 128 | def add_results(self, results): 129 | pass 130 | 131 | def start(self): 132 | self.good = 0 133 | self.total = 0 134 | 135 | def finalize(self, results, ds=False, output_dir=None): 136 | results["good"] = self.good 137 | results["total"] = self.total 138 | 139 | 140 | # 141 | # pre-processing 142 | # 143 | 144 | def center_crop(img, out_height, out_width): 145 | width, height = img.size 146 | left = (width - out_width) / 2 147 | right = (width + out_width) / 2 148 | top = (height - out_height) / 2 149 | bottom = (height + out_height) / 2 150 | img = img.crop((left, top, right, bottom)) 151 | return img 152 | 153 | 154 | def resize_with_aspectratio(img, out_height, out_width, scale=87.5): 155 | width, height = img.size 156 | new_height = int(100. * out_height / scale) 157 | new_width = int(100. 
* out_width / scale) 158 | if height > width: 159 | w = new_width 160 | h = int(new_height * height / width) 161 | else: 162 | h = new_height 163 | w = int(new_width * width / height) 164 | img = img.resize((w, h), Image.BILINEAR) 165 | return img 166 | 167 | 168 | def pre_process_vgg(img, dims=None, need_transpose=False): 169 | if img.mode != 'RGB': 170 | img = img.convert('RGB') 171 | 172 | output_height, output_width, _ = dims 173 | 174 | img = resize_with_aspectratio(img, output_height, output_width) 175 | img = center_crop(img, output_height, output_width) 176 | img = np.asarray(img, dtype='float32') 177 | 178 | # normalize image 179 | means = np.array([123.68, 116.78, 103.94], dtype=np.float32) 180 | img -= means 181 | # transpose if needed 182 | if need_transpose: 183 | img = img.transpose([2, 0, 1]) 184 | return img 185 | 186 | 187 | def pre_process_mobilenet(img, dims=None, need_transpose=False): 188 | if img.mode != 'RGB': 189 | img = img.convert('RGB') 190 | 191 | output_height, output_width, _ = dims 192 | 193 | img = resize_with_aspectratio(img, output_height, output_width) 194 | img = center_crop(img, output_height, output_width) 195 | img = np.asarray(img, dtype='float32') 196 | 197 | img /= 255.0 198 | img -= 0.5 199 | img *= 2 200 | 201 | # transpose if needed 202 | if need_transpose: 203 | img = img.transpose([2, 0, 1]) 204 | return img 205 | 206 | 207 | def pre_process_coco_mobilenet(img, dims=None, need_transpose=False): 208 | if img.mode != 'RGB': 209 | img = img.convert('RGB') 210 | 211 | img_data = np.array(img.getdata()) 212 | img_data = img_data.astype(np.uint8) 213 | (im_width, im_height) = img.size 214 | img = img_data.reshape(im_height, im_width, 3) 215 | # transpose if needed 216 | if need_transpose: 217 | img = img.transpose([2, 0, 1]) 218 | return img 219 | 220 | def pre_process_coco_pt_mobilenet(img, dims=None, need_transpose=False): 221 | if img.mode != 'RGB': 222 | img = img.convert('RGB') 223 | 224 | img_data = np.array(img.getdata()) 225 | img_data = img_data.astype(np.float32) 226 | (im_width, im_height) = img.size 227 | img = img_data.reshape(im_height, im_width, 3) 228 | img -= 127.5 229 | img /= 127.5 230 | # transpose if needed 231 | if need_transpose: 232 | img = img.transpose([2, 0, 1]) 233 | return img 234 | 235 | def pre_process_coco_resnet34(img, dims=None, need_transpose=False): 236 | if img.mode != 'RGB': 237 | img = img.convert('RGB') 238 | 239 | if dims != None: 240 | im_height, im_width, _ = dims 241 | img = img.resize((im_width, im_height), Image.BILINEAR) 242 | mean = np.array([0.485, 0.456, 0.406], dtype=np.float32) 243 | std = np.array([0.229, 0.224, 0.225], dtype=np.float32) 244 | img_data = np.array(img.getdata(), dtype=np.float32) 245 | (im_width, im_height) = img.size 246 | img = img_data.reshape(im_height, im_width, 3) 247 | img = img / 255. 
- mean 248 | img = img / std 249 | if need_transpose: 250 | img = img.transpose([2, 0, 1]) 251 | 252 | return img 253 | 254 | def pre_process_coco_resnet34_tf(img, dims=None, need_transpose=False): 255 | if img.mode != 'RGB': 256 | img = img.convert('RGB') 257 | 258 | if dims != None: 259 | im_height, im_width, _ = dims 260 | img = img.resize((im_width, im_height), Image.BILINEAR) # PIL.Image.BILINEAR 2 261 | mean = np.array([123.68, 116.78, 103.94], dtype=np.float32) 262 | img_data = np.array(img.getdata(), dtype=np.float32) 263 | (im_width, im_height) = img.size 264 | img = img_data.reshape(im_height, im_width, 3) 265 | img = img - mean 266 | if need_transpose: 267 | img = img.transpose([2, 0, 1]) 268 | 269 | return img 270 | -------------------------------------------------------------------------------- /pytorch/benchmark_tools/inference/calibration_int8.py: -------------------------------------------------------------------------------- 1 | """ 2 | module to run calibration 3 | """ 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | from __future__ import unicode_literals 8 | 9 | import sys 10 | import os 11 | import logging 12 | import numpy as np 13 | from caffe2.proto import caffe2_pb2 14 | from caffe2.python import core, workspace 15 | from caffe2.python import transformations as tf 16 | import inference.models as m 17 | from common import common_caffe2 as cc2 18 | 19 | def Calibration(args, extra_args): 20 | """ 21 | function to run calibration 22 | """ 23 | 24 | if not m.IsSupported(args.model): 25 | logging.error("Not supported model: {}".format(args.model)) 26 | m.ShowModels() 27 | return 28 | 29 | images_path = None 30 | if args.images_path: 31 | images_path = os.path.abspath(args.images_path) 32 | elif "CAFFE2_INF_IMG_PATH" in os.environ: 33 | images_path = os.path.abspath(os.environ["CAFFE2_INF_IMG_PATH"]) 34 | 35 | batch_size = 1 36 | if args.batch_size: 37 | batch_size = int(args.batch_size) 38 | if batch_size <= 0: 39 | logging.error("Invalid batch size {}. 
Exit!".format(batch_size)) 40 | return 41 | 42 | iterations = args.iterations if args.iterations else sys.maxsize 43 | logging.warning("Run Caffe2 in inference mode with args:\n{}" 44 | .format(vars(args))) 45 | 46 | model_info = m.GetModelInfo(args.model) 47 | logging.warning("The inference inputs of {0} model:\n{1}" 48 | .format( 49 | args.model, 50 | {str(k): str(v) for k, v in model_info.items()} 51 | )) 52 | 53 | crop_size = int(model_info["crop_size"]) 54 | if args.crop_size: 55 | crop_size = args.crop_size 56 | 57 | need_normalize = False 58 | if model_info["need_normalize"]: 59 | need_normalize = True 60 | 61 | mean = 128 62 | if str(model_info["image_mean"]) != 'None': 63 | mean_tmp = ((model_info["image_mean"]).split('/')[-1]).split(' ') 64 | if need_normalize: 65 | mean = np.zeros([3, crop_size, crop_size], dtype=np.float) 66 | mean[0, :, :] = float(mean_tmp[0]) # 104 67 | mean[1, :, :] = float(mean_tmp[1]) # 117 68 | mean[2, :, :] = float(mean_tmp[2]) # 124 69 | 70 | else: 71 | mean = np.zeros([3, crop_size, crop_size], dtype=np.int32) 72 | mean[0, :, :] = int(mean_tmp[0]) # 104 73 | mean[1, :, :] = int(mean_tmp[1]) # 117 74 | mean[2, :, :] = int(mean_tmp[2]) # 124 75 | 76 | scale = [1] 77 | if str(model_info["scale"]) != '': 78 | scale = (model_info["scale"]).split(' ') 79 | rescale_size = 256 80 | if str(model_info["rescale_size"]) != '': 81 | rescale_size = int(model_info["rescale_size"]) 82 | color_format = "BGR" 83 | if str(model_info["color_format"]) != '': 84 | color_format = model_info["color_format"] 85 | if args.onnx_model: 86 | init_def, predict_def = cc2.OnnxToCaffe2(model_info["onnx_model"]) 87 | else: 88 | with open(model_info["init_net"], 'rb') as i: 89 | if model_info["model_type"] == "prototext" or \ 90 | model_info["init_net"].split('.')[-1] == "pbtxt": 91 | import google.protobuf.text_format as ptxt 92 | init_def = ptxt.Parse(i.read(), caffe2_pb2.NetDef()) 93 | else: 94 | init_def = caffe2_pb2.NetDef() 95 | init_def.ParseFromString(i.read()) 96 | with open(model_info["predict_net"], 'rb') as p: 97 | if model_info["model_type"] == "prototext" or \ 98 | model_info["predict_net"].split('.')[-1] == "pbtxt": 99 | import google.protobuf.text_format as ptxt 100 | predict_def = ptxt.Parse(p.read(), caffe2_pb2.NetDef()) 101 | else: 102 | predict_def = caffe2_pb2.NetDef() 103 | predict_def.ParseFromString(p.read()) 104 | 105 | if model_info["model_type"] == "caffe legacy": 106 | cc2.MergeScaleBiasInBN(predict_def) 107 | cc2.RemoveUselessExternalInput(predict_def) 108 | 109 | dev_map = { 110 | "cpu": caffe2_pb2.CPU, 111 | "gpu": caffe2_pb2.CUDA, 112 | "cuda": caffe2_pb2.CUDA, 113 | "mkldnn": caffe2_pb2.MKLDNN, 114 | "opengl": caffe2_pb2.OPENGL, 115 | "opencl": caffe2_pb2.OPENCL, 116 | "ideep": caffe2_pb2.IDEEP, 117 | } 118 | device_opts = caffe2_pb2.DeviceOption() 119 | if args.device.lower() in dev_map: 120 | device_opts.device_type = dev_map[args.device.lower()] 121 | else: 122 | logging.error("Wrong device {}. Exit!".format(args.device)) 123 | return 124 | 125 | logging.warning("Start running calibration") 126 | 127 | if args.calibration_file: 128 | images, _ = cc2.ImageProc.BatchImagesByName(images_path, args.calibration_file, batch_size, iterations) 129 | else: 130 | images, _ = cc2.ImageProc.BatchImages(images_path, batch_size, iterations) 131 | # for kl_divergence calibration, we use the first 100 images to get 132 | # the min and max values, and the remaining images are used to compute the histogram. 
133 | # if the len(images) <= 100, we extend the images with themselves. 134 | def data_gen(): 135 | images_calib = images 136 | if args.single_iter_calib: 137 | images_calib = [images[args.iter_calib]] 138 | for raw in images_calib: 139 | if model_info["model_type"] == "mlperf legacy vgg": 140 | imgs, oshape = cc2.ImageProc.PreprocessImagesMLPerfVGG(raw) 141 | elif model_info["model_type"] == "mlperf legacy mb": 142 | imgs, oshape = cc2.ImageProc.PreprocessImagesMLPerfMB(raw) 143 | else: 144 | imgs, _ = cc2.ImageProc.PreprocessImages( 145 | raw, crop_size, rescale_size, mean, scale, 1, need_normalize, color_format) 146 | #imgs, _ = cc2.ImageProc.PreprocessImagesByThreading( 147 | # raw, crop_size,rescale_size, mean, scale, 1) 148 | yield imgs 149 | del imgs 150 | 151 | cc2.UpdateDeviceOption(device_opts, init_def) 152 | workspace.RunNetOnce(init_def) 153 | 154 | cc2.UpdateDeviceOption(device_opts, predict_def) 155 | net = core.Net(model_info["model_name"]) 156 | net.Proto().CopyFrom(predict_def) 157 | if args.device.lower() == 'ideep' and not args.noptimize: 158 | logging.warning('Optimizing module {} ....................' 159 | .format(model_info["model_name"])) 160 | tf.optimizeForMKLDNN(net) 161 | predict_def = net.Proto() 162 | if predict_def.op[-1].type == 'Accuracy': 163 | init_label = np.ones((batch_size), dtype=np.int32) 164 | label = net.AddExternalInput('label') 165 | workspace.FeedBlob(label, init_label, device_opts) 166 | for i, op in enumerate(predict_def.op): 167 | if op.type == 'Accuracy': 168 | workspace.FeedBlob(str(predict_def.op[i].output[0]), init_label, device_opts) 169 | 170 | from inference.calibrator import Calibrator, KLCalib, AbsmaxCalib, EMACalib 171 | algorithm = AbsmaxCalib() 172 | kind = os.environ.get('INT8CALIB') 173 | if args.calib_algo: 174 | kind = args.calib_algo 175 | if kind == "absmax": 176 | algorithm = AbsmaxCalib() 177 | elif kind == "moving_average": 178 | ema_alpha = 0.5 179 | algorithm = EMACalib(ema_alpha) 180 | elif kind == "kl_divergence": 181 | kl_iter_num_for_range = 500 182 | while len(images) < 2*kl_iter_num_for_range: 183 | images += images 184 | algorithm = KLCalib(kl_iter_num_for_range) 185 | logging.warning('Use {} calibration method....................'.format(kind)) 186 | 187 | i = 0 188 | length = len(images) 189 | calib = Calibrator(algorithm, device_opts) 190 | for data in data_gen(): 191 | i += 1 192 | workspace.FeedBlob(predict_def.op[0].input[0], data, device_opts) 193 | logging.warning("in progress {}/{}(batch/batch total)".format(i, length)) 194 | calib.RunCalibIter(workspace, predict_def) 195 | 196 | predict_quantized, init_quantized = calib.DepositQuantizedModule(workspace, predict_def) 197 | 198 | cc2.SaveModel(args.output_file + '/init_net_int8.pb', init_quantized, 199 | args.output_file + '/predict_net_int8.pb', predict_quantized) 200 | cc2.SaveModelPtxt(args.output_file + '/predict_net_int8.pbtxt', predict_quantized) 201 | cc2.SaveModelPtxt(args.output_file + '/init_net_int8.pbtxt', init_quantized) 202 | 203 | 204 | if __name__ == '__main__': 205 | logging.critical("Do not run this script independently!") 206 | exit() 207 | -------------------------------------------------------------------------------- /mxnet/wide_deep_criteo/update_model/embedding-fuse.json: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": [ 3 | { 4 | "op": "null", 5 | "name": "csr_data", 6 | "attrs": {"__storage_type__": "2"}, 7 | "inputs": [] 8 | }, 9 | { 10 | "op": "null", 11 | "name": 
"linear_weight", 12 | "attrs": { 13 | "__init__": "[\"normal\", {\"sigma\": 0.01}]", 14 | "__shape__": "(26000, 256)", 15 | "__storage_type__": "1" 16 | }, 17 | "inputs": [] 18 | }, 19 | { 20 | "op": "dot", 21 | "name": "dot0", 22 | "inputs": [[0, 0, 0], [1, 0, 0]] 23 | }, 24 | { 25 | "op": "null", 26 | "name": "linear_bias", 27 | "attrs": {"__shape__": "(256,)"}, 28 | "inputs": [] 29 | }, 30 | { 31 | "op": "broadcast_add", 32 | "name": "broadcast_add0", 33 | "inputs": [[2, 0, 0], [3, 0, 0]] 34 | }, 35 | { 36 | "op": "null", 37 | "name": "dns_data", 38 | "inputs": [] 39 | }, 40 | { 41 | "op": "null", 42 | "name": "embed_0_weight", 43 | "attrs": {"__storage_type__": "1"}, 44 | "inputs": [] 45 | }, 46 | { 47 | "op": "null", 48 | "name": "embed_1_weight", 49 | "attrs": {"__storage_type__": "1"}, 50 | "inputs": [] 51 | }, 52 | { 53 | "op": "null", 54 | "name": "embed_2_weight", 55 | "attrs": {"__storage_type__": "1"}, 56 | "inputs": [] 57 | }, 58 | { 59 | "op": "null", 60 | "name": "embed_3_weight", 61 | "attrs": {"__storage_type__": "1"}, 62 | "inputs": [] 63 | }, 64 | { 65 | "op": "null", 66 | "name": "embed_4_weight", 67 | "attrs": {"__storage_type__": "1"}, 68 | "inputs": [] 69 | }, 70 | { 71 | "op": "null", 72 | "name": "embed_5_weight", 73 | "attrs": {"__storage_type__": "1"}, 74 | "inputs": [] 75 | }, 76 | { 77 | "op": "null", 78 | "name": "embed_6_weight", 79 | "attrs": {"__storage_type__": "1"}, 80 | "inputs": [] 81 | }, 82 | { 83 | "op": "null", 84 | "name": "embed_7_weight", 85 | "attrs": {"__storage_type__": "1"}, 86 | "inputs": [] 87 | }, 88 | { 89 | "op": "null", 90 | "name": "embed_8_weight", 91 | "attrs": {"__storage_type__": "1"}, 92 | "inputs": [] 93 | }, 94 | { 95 | "op": "null", 96 | "name": "embed_9_weight", 97 | "attrs": {"__storage_type__": "1"}, 98 | "inputs": [] 99 | }, 100 | { 101 | "op": "null", 102 | "name": "embed_10_weight", 103 | "attrs": {"__storage_type__": "1"}, 104 | "inputs": [] 105 | }, 106 | { 107 | "op": "null", 108 | "name": "embed_11_weight", 109 | "attrs": {"__storage_type__": "1"}, 110 | "inputs": [] 111 | }, 112 | { 113 | "op": "null", 114 | "name": "embed_12_weight", 115 | "attrs": {"__storage_type__": "1"}, 116 | "inputs": [] 117 | }, 118 | { 119 | "op": "null", 120 | "name": "embed_13_weight", 121 | "attrs": {"__storage_type__": "1"}, 122 | "inputs": [] 123 | }, 124 | { 125 | "op": "null", 126 | "name": "embed_14_weight", 127 | "attrs": {"__storage_type__": "1"}, 128 | "inputs": [] 129 | }, 130 | { 131 | "op": "null", 132 | "name": "embed_15_weight", 133 | "attrs": {"__storage_type__": "1"}, 134 | "inputs": [] 135 | }, 136 | { 137 | "op": "null", 138 | "name": "embed_16_weight", 139 | "attrs": {"__storage_type__": "1"}, 140 | "inputs": [] 141 | }, 142 | { 143 | "op": "null", 144 | "name": "embed_17_weight", 145 | "attrs": {"__storage_type__": "1"}, 146 | "inputs": [] 147 | }, 148 | { 149 | "op": "null", 150 | "name": "embed_18_weight", 151 | "attrs": {"__storage_type__": "1"}, 152 | "inputs": [] 153 | }, 154 | { 155 | "op": "null", 156 | "name": "embed_19_weight", 157 | "attrs": {"__storage_type__": "1"}, 158 | "inputs": [] 159 | }, 160 | { 161 | "op": "null", 162 | "name": "embed_20_weight", 163 | "attrs": {"__storage_type__": "1"}, 164 | "inputs": [] 165 | }, 166 | { 167 | "op": "null", 168 | "name": "embed_21_weight", 169 | "attrs": {"__storage_type__": "1"}, 170 | "inputs": [] 171 | }, 172 | { 173 | "op": "null", 174 | "name": "embed_22_weight", 175 | "attrs": {"__storage_type__": "1"}, 176 | "inputs": [] 177 | }, 178 | { 179 | "op": "null", 
180 | "name": "embed_23_weight", 181 | "attrs": {"__storage_type__": "1"}, 182 | "inputs": [] 183 | }, 184 | { 185 | "op": "null", 186 | "name": "embed_24_weight", 187 | "attrs": {"__storage_type__": "1"}, 188 | "inputs": [] 189 | }, 190 | { 191 | "op": "null", 192 | "name": "embed_25_weight", 193 | "attrs": {"__storage_type__": "1"}, 194 | "inputs": [] 195 | }, 196 | { 197 | "op": "SliceSplitEmbeddingConcatFuse", 198 | "name": "SliceSplitEmbeddingConcatFuse_0", 199 | "attrs": { 200 | "concat_dim": "1", 201 | "cont_begin": "(0,26)", 202 | "cont_end": "(None,39)", 203 | "embed_begin": "(0,0)", 204 | "embed_end": "(None,26)", 205 | "input_dims": "[1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000]", 206 | "num_outputs": "26", 207 | "output_dims": "[32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]", 208 | "squeeze_axis": "1" 209 | }, 210 | "inputs": [ 211 | [5, 0, 0], 212 | [6, 0, 0], 213 | [7, 0, 0], 214 | [8, 0, 0], 215 | [9, 0, 0], 216 | [10, 0, 0], 217 | [11, 0, 0], 218 | [12, 0, 0], 219 | [13, 0, 0], 220 | [14, 0, 0], 221 | [15, 0, 0], 222 | [16, 0, 0], 223 | [17, 0, 0], 224 | [18, 0, 0], 225 | [19, 0, 0], 226 | [20, 0, 0], 227 | [21, 0, 0], 228 | [22, 0, 0], 229 | [23, 0, 0], 230 | [24, 0, 0], 231 | [25, 0, 0], 232 | [26, 0, 0], 233 | [27, 0, 0], 234 | [28, 0, 0], 235 | [29, 0, 0], 236 | [30, 0, 0], 237 | [31, 0, 0] 238 | ] 239 | }, 240 | { 241 | "op": "null", 242 | "name": "fullyconnected0_weight", 243 | "attrs": {"num_hidden": "1024"}, 244 | "inputs": [] 245 | }, 246 | { 247 | "op": "null", 248 | "name": "fullyconnected0_bias", 249 | "attrs": {"num_hidden": "1024"}, 250 | "inputs": [] 251 | }, 252 | { 253 | "op": "FullyConnected", 254 | "name": "fullyconnected0", 255 | "attrs": {"num_hidden": "1024"}, 256 | "inputs": [[32, 0, 0], [33, 0, 0], [34, 0, 0]] 257 | }, 258 | { 259 | "op": "Activation", 260 | "name": "activation0", 261 | "attrs": {"act_type": "relu"}, 262 | "inputs": [[35, 0, 0]] 263 | }, 264 | { 265 | "op": "null", 266 | "name": "fullyconnected1_weight", 267 | "attrs": {"num_hidden": "512"}, 268 | "inputs": [] 269 | }, 270 | { 271 | "op": "null", 272 | "name": "fullyconnected1_bias", 273 | "attrs": {"num_hidden": "512"}, 274 | "inputs": [] 275 | }, 276 | { 277 | "op": "FullyConnected", 278 | "name": "fullyconnected1", 279 | "attrs": {"num_hidden": "512"}, 280 | "inputs": [[36, 0, 0], [37, 0, 0], [38, 0, 0]] 281 | }, 282 | { 283 | "op": "Activation", 284 | "name": "activation1", 285 | "attrs": {"act_type": "relu"}, 286 | "inputs": [[39, 0, 0]] 287 | }, 288 | { 289 | "op": "null", 290 | "name": "fullyconnected2_weight", 291 | "attrs": {"num_hidden": "256"}, 292 | "inputs": [] 293 | }, 294 | { 295 | "op": "null", 296 | "name": "fullyconnected2_bias", 297 | "attrs": {"num_hidden": "256"}, 298 | "inputs": [] 299 | }, 300 | { 301 | "op": "FullyConnected", 302 | "name": "fullyconnected2", 303 | "attrs": {"num_hidden": "256"}, 304 | "inputs": [[40, 0, 0], [41, 0, 0], [42, 0, 0]] 305 | }, 306 | { 307 | "op": "elemwise_add", 308 | "name": "_plus0", 309 | "inputs": [[4, 0, 0], [43, 0, 0]] 310 | }, 311 | { 312 | "op": "null", 313 | "name": "softmax_label", 314 | "inputs": [] 315 | }, 316 | { 317 | "op": "SoftmaxOutput", 318 | "name": "model", 319 | "inputs": [[44, 0, 0], [45, 0, 0]] 320 | } 321 | ], 322 | "arg_nodes": [ 323 | 0, 324 | 1, 325 | 3, 326 | 5, 327 | 6, 328 | 7, 329 | 8, 330 | 9, 331 | 10, 332 | 11, 333 | 12, 334 | 13, 335 | 14, 336 | 15, 337 | 16, 338 | 17, 339 | 18, 
340 | 19, 341 | 20, 342 | 21, 343 | 22, 344 | 23, 345 | 24, 346 | 25, 347 | 26, 348 | 27, 349 | 28, 350 | 29, 351 | 30, 352 | 31, 353 | 33, 354 | 34, 355 | 37, 356 | 38, 357 | 41, 358 | 42, 359 | 45 360 | ], 361 | "node_row_ptr": [ 362 | 0, 363 | 1, 364 | 2, 365 | 3, 366 | 4, 367 | 5, 368 | 6, 369 | 7, 370 | 8, 371 | 9, 372 | 10, 373 | 11, 374 | 12, 375 | 13, 376 | 14, 377 | 15, 378 | 16, 379 | 17, 380 | 18, 381 | 19, 382 | 20, 383 | 21, 384 | 22, 385 | 23, 386 | 24, 387 | 25, 388 | 26, 389 | 27, 390 | 28, 391 | 29, 392 | 30, 393 | 31, 394 | 32, 395 | 33, 396 | 34, 397 | 35, 398 | 36, 399 | 37, 400 | 38, 401 | 39, 402 | 40, 403 | 41, 404 | 42, 405 | 43, 406 | 44, 407 | 45, 408 | 46, 409 | 47 410 | ], 411 | "heads": [[46, 0, 0]], 412 | "attrs": {"mxnet_version": ["int", 10500]} 413 | } 414 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/quorem/qr_embedding_bag.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | # Quotient-Remainder Trick 7 | # 8 | # Description: Applies quotient remainder-trick to embeddings to reduce 9 | # embedding sizes. 10 | # 11 | # References: 12 | # [1] Hao-Jun Michael Shi, Dheevatsa Mudigere, Maxim Naumov, Jiyan Yang, 13 | # "Compositional Embeddings Using Complementary Partitions for Memory-Efficient 14 | # Recommendation Systems", CoRR, arXiv:1909.02107, 2019 15 | 16 | 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | import torch 19 | import torch.nn as nn 20 | import torch.nn.functional as F 21 | from torch.nn.parameter import Parameter 22 | import numpy as np 23 | 24 | 25 | class QREmbeddingBag(nn.Module): 26 | r"""Computes sums or means over two 'bags' of embeddings, one using the quotient 27 | of the indices and the other using the remainder of the indices, without 28 | instantiating the intermediate embeddings, then performs an operation to combine these. 29 | 30 | For bags of constant length and no :attr:`per_sample_weights`, this class 31 | 32 | * with ``mode="sum"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.sum(dim=0)``, 33 | * with ``mode="mean"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.mean(dim=0)``, 34 | * with ``mode="max"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.max(dim=0)``. 35 | 36 | However, :class:`~torch.nn.EmbeddingBag` is much more time and memory efficient than using a chain of these 37 | operations. 38 | 39 | QREmbeddingBag also supports per-sample weights as an argument to the forward 40 | pass. This scales the output of the Embedding before performing a weighted 41 | reduction as specified by ``mode``. If :attr:`per_sample_weights`` is passed, the 42 | only supported ``mode`` is ``"sum"``, which computes a weighted sum according to 43 | :attr:`per_sample_weights`. 44 | 45 | Known Issues: 46 | Autograd breaks with multiple GPUs. It breaks only with multiple embeddings. 47 | 48 | Args: 49 | num_categories (int): total number of unique categories. The input indices must be in 50 | 0, 1, ..., num_categories - 1. 51 | embedding_dim (list): list of sizes for each embedding vector in each table. If ``"add"`` 52 | or ``"mult"`` operation are used, these embedding dimensions must be 53 | the same. 
If a single embedding_dim is used, then it will use this 54 | embedding_dim for both embedding tables. 55 | num_collisions (int): number of collisions to enforce. 56 | operation (string, optional): ``"concat"``, ``"add"``, or ``"mult". Specifies the operation 57 | to compose embeddings. ``"concat"`` concatenates the embeddings, 58 | ``"add"`` sums the embeddings, and ``"mult"`` multiplies 59 | (component-wise) the embeddings. 60 | Default: ``"mult"`` 61 | max_norm (float, optional): If given, each embedding vector with norm larger than :attr:`max_norm` 62 | is renormalized to have norm :attr:`max_norm`. 63 | norm_type (float, optional): The p of the p-norm to compute for the :attr:`max_norm` option. Default ``2``. 64 | scale_grad_by_freq (boolean, optional): if given, this will scale gradients by the inverse of frequency of 65 | the words in the mini-batch. Default ``False``. 66 | Note: this option is not supported when ``mode="max"``. 67 | mode (string, optional): ``"sum"``, ``"mean"`` or ``"max"``. Specifies the way to reduce the bag. 68 | ``"sum"`` computes the weighted sum, taking :attr:`per_sample_weights` 69 | into consideration. ``"mean"`` computes the average of the values 70 | in the bag, ``"max"`` computes the max value over each bag. 71 | Default: ``"mean"`` 72 | sparse (bool, optional): if ``True``, gradient w.r.t. :attr:`weight` matrix will be a sparse tensor. See 73 | Notes for more details regarding sparse gradients. Note: this option is not 74 | supported when ``mode="max"``. 75 | 76 | Attributes: 77 | weight (Tensor): the learnable weights of each embedding table is the module of shape 78 | `(num_embeddings, embedding_dim)` initialized using a uniform distribution 79 | with sqrt(1 / num_categories). 80 | 81 | Inputs: :attr:`input` (LongTensor), :attr:`offsets` (LongTensor, optional), and 82 | :attr:`per_index_weights` (Tensor, optional) 83 | 84 | - If :attr:`input` is 2D of shape `(B, N)`, 85 | 86 | it will be treated as ``B`` bags (sequences) each of fixed length ``N``, and 87 | this will return ``B`` values aggregated in a way depending on the :attr:`mode`. 88 | :attr:`offsets` is ignored and required to be ``None`` in this case. 89 | 90 | - If :attr:`input` is 1D of shape `(N)`, 91 | 92 | it will be treated as a concatenation of multiple bags (sequences). 93 | :attr:`offsets` is required to be a 1D tensor containing the 94 | starting index positions of each bag in :attr:`input`. Therefore, 95 | for :attr:`offsets` of shape `(B)`, :attr:`input` will be viewed as 96 | having ``B`` bags. Empty bags (i.e., having 0-length) will have 97 | returned vectors filled by zeros. 98 | 99 | per_sample_weights (Tensor, optional): a tensor of float / double weights, or None 100 | to indicate all weights should be taken to be ``1``. If specified, :attr:`per_sample_weights` 101 | must have exactly the same shape as input and is treated as having the same 102 | :attr:`offsets`, if those are not ``None``. Only supported for ``mode='sum'``. 
103 | 104 | 105 | Output shape: `(B, embedding_dim)` 106 | 107 | """ 108 | __constants__ = ['num_categories', 'embedding_dim', 'num_collisions', 109 | 'operation', 'max_norm', 'norm_type', 'scale_grad_by_freq', 110 | 'mode', 'sparse'] 111 | 112 | def __init__(self, num_categories, embedding_dim, num_collisions, 113 | operation='mult', max_norm=None, norm_type=2., 114 | scale_grad_by_freq=False, mode='mean', sparse=False, 115 | _weight=None): 116 | super(QREmbeddingBag, self).__init__() 117 | 118 | assert operation in ['concat', 'mult', 'add'], 'Not valid operation!' 119 | 120 | self.num_categories = num_categories 121 | if isinstance(embedding_dim, int) or len(embedding_dim) == 1: 122 | self.embedding_dim = [embedding_dim, embedding_dim] 123 | else: 124 | self.embedding_dim = embedding_dim 125 | self.num_collisions = num_collisions 126 | self.operation = operation 127 | self.max_norm = max_norm 128 | self.norm_type = norm_type 129 | self.scale_grad_by_freq = scale_grad_by_freq 130 | 131 | if self.operation == 'add' or self.operation == 'mult': 132 | assert self.embedding_dim[0] == self.embedding_dim[1], \ 133 | 'Embedding dimensions do not match!' 134 | 135 | self.num_embeddings = [int(np.ceil(num_categories / num_collisions)), 136 | num_collisions] 137 | 138 | if _weight is None: 139 | self.weight_q = Parameter(torch.Tensor(self.num_embeddings[0], self.embedding_dim[0])) 140 | self.weight_r = Parameter(torch.Tensor(self.num_embeddings[1], self.embedding_dim[1])) 141 | self.reset_parameters() 142 | else: 143 | assert list(_weight[0].shape) == [self.num_embeddings[0], self.embedding_dim[0]], \ 144 | 'Shape of weight for quotient table does not match num_embeddings and embedding_dim' 145 | assert list(_weight[1].shape) == [self.num_embeddings[1], self.embedding_dim[1]], \ 146 | 'Shape of weight for remainder table does not match num_embeddings and embedding_dim' 147 | self.weight_q = Parameter(_weight[0]) 148 | self.weight_r = Parameter(_weight[1]) 149 | self.mode = mode 150 | self.sparse = sparse 151 | 152 | def reset_parameters(self): 153 | nn.init.uniform_(self.weight_q, np.sqrt(1 / self.num_categories)) 154 | nn.init.uniform_(self.weight_r, np.sqrt(1 / self.num_categories)) 155 | 156 | def forward(self, input, offsets=None, per_sample_weights=None): 157 | input_q = (input / self.num_collisions).long() 158 | input_r = torch.remainder(input, self.num_collisions).long() 159 | 160 | embed_q = F.embedding_bag(input_q, self.weight_q, offsets, self.max_norm, 161 | self.norm_type, self.scale_grad_by_freq, self.mode, 162 | self.sparse, per_sample_weights) 163 | embed_r = F.embedding_bag(input_r, self.weight_r, offsets, self.max_norm, 164 | self.norm_type, self.scale_grad_by_freq, self.mode, 165 | self.sparse, per_sample_weights) 166 | 167 | if self.operation == 'concat': 168 | embed = torch.cat((embed_q, embed_r), dim=1) 169 | elif self.operation == 'add': 170 | embed = embed_q + embed_r 171 | elif self.operation == 'mult': 172 | embed = embed_q * embed_r 173 | 174 | return embed 175 | 176 | def extra_repr(self): 177 | s = '{num_embeddings}, {embedding_dim}' 178 | if self.max_norm is not None: 179 | s += ', max_norm={max_norm}' 180 | if self.norm_type != 2: 181 | s += ', norm_type={norm_type}' 182 | if self.scale_grad_by_freq is not False: 183 | s += ', scale_grad_by_freq={scale_grad_by_freq}' 184 | s += ', mode={mode}' 185 | return s.format(**self.__dict__) 186 | -------------------------------------------------------------------------------- 
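A minimal usage sketch of `QREmbeddingBag` as defined above (the sizes here are illustrative, not taken from any DLRM config). With a 2-D input of shape `(B, N)` and no offsets, each row is treated as one bag, and the output has shape `(B, embedding_dim)`:

```python
import torch

# Illustrative sizes: 10,000 categories split across a quotient table of
# ceil(10000/4) = 2500 rows and a remainder table of 4 rows; 'mult'
# composes the two 16-dim lookups element-wise.
bag = QREmbeddingBag(num_categories=10000, embedding_dim=16,
                     num_collisions=4, operation='mult', mode='sum')
indices = torch.randint(0, 10000, (8, 3))  # 8 bags of 3 category ids each
out = bag(indices)                         # offsets must be None for 2-D input
print(out.shape)                           # torch.Size([8, 16])
```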
/mxnet/blog/mxnet_v1.5_release/single-instance-cnn-mxnet-1.5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "MXNet Model FP32 multi-instance Inference Performance " 4 | echo "Testing FP32 base models" 5 | echo "Installing mxnet 1.5" 6 | pip install mxnet 7 | echo "Downloading source code from incubator-mxnet repo" 8 | git clone https://github.com/apache/incubator-mxnet 9 | cd incubator-mxnet 10 | 11 | export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 12 | export vCPUs=`cat /proc/cpuinfo | grep processor | wc -l` 13 | export OMP_NUM_THREADS=$((vCPUs / 4)) 14 | echo "Test with OMP_NUM_THREADS="$OMP_NUM_THREADS 15 | export MXNET_SUBGRAPH_BACKEND=MKLDNN 16 | 17 | # Launch dummy data Inference 18 | #RN18 19 | cd ./example/quantization 20 | python imagenet_gen_qsym_mkldnn.py --model=resnet18_v1 --num-calib-batches=5 --calib-mode=naive 21 | echo "-----ResNet18 FP32 single-inst-----" 22 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 23 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 24 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 25 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 26 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 27 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 28 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 29 | 30 | #RN50 31 | python imagenet_gen_qsym_mkldnn.py --model=resnet50_v1 --num-calib-batches=5 --calib-mode=naive 32 | echo "-----ResNet50 FP32 single-inst-----" 33 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 34 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 35 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 36 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 37 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 38 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 39 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 40 | 41 | #RN101 42 | python imagenet_gen_qsym_mkldnn.py --model=resnet101_v1 --num-calib-batches=5 --calib-mode=naive 43 | echo "-----ResNet101 FP32 single-inst-----" 44 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=1 --num-inference-batches=1000 
--ctx=cpu --benchmark=True 45 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 46 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 47 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 48 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 49 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 50 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 51 | 52 | #Squeezenet1.0 53 | python imagenet_gen_qsym_mkldnn.py --model=squeezenet1.0 --num-calib-batches=5 --calib-mode=naive 54 | echo "-----SqueezeNet1.0 FP32 single-inst-----" 55 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 56 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 57 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 58 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 59 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 60 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 61 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 62 | 63 | #MobileNet1.0 64 | python imagenet_gen_qsym_mkldnn.py --model=mobilenet1.0 --num-calib-batches=5 --calib-mode=naive 65 | echo "-----MobileNet v1 FP32 single-inst-----" 66 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 67 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 68 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 69 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 70 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 71 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 72 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 73 | 74 | #MobileNet2.0 75 | python imagenet_gen_qsym_mkldnn.py --model=mobilenetv2_1.0 --num-calib-batches=5 
--calib-mode=naive 76 | echo "-----MobileNet v2 FP32 single-inst-----" 77 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 78 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 79 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 80 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 81 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 82 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 83 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 84 | 85 | #inception v3 86 | python imagenet_gen_qsym_mkldnn.py --model=inceptionv3 --image-shape=3,299,299 --num-calib-batches=5 --calib-mode=naive 87 | echo "-----Inception v3 FP32 single-inst-----" 88 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 89 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 90 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 91 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 92 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 93 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 94 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 95 | 96 | #ResNet152-v2 97 | python imagenet_gen_qsym_mkldnn.py --model=imagenet1k-resnet-152 --num-calib-batches=5 --calib-mode=naive 98 | echo "-----ResNet152-v2 FP32 single-inst-----" 99 | python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 100 | python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 101 | python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 102 | python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 103 | python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=16 
--num-inference-batches=1000 --ctx=cpu --benchmark=True 104 | python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 105 | python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 106 | -------------------------------------------------------------------------------- /mxnet/blog/mxnet_v1.5_release/single-instance-fp32-cnn-mxnet-mkl1.5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "MXNet Model FP32 single-instance Inference Performance" 4 | echo "Testing FP32 base models" 5 | echo "Installing mxnet-mkl 1.5" 6 | pip install mxnet-mkl 7 | echo "Downloading source code from incubator-mxnet repo" 8 | git clone https://github.com/apache/incubator-mxnet 9 | cd incubator-mxnet 10 | 11 | export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 12 | export vCPUs=`cat /proc/cpuinfo | grep processor | wc -l` 13 | export OMP_NUM_THREADS=$((vCPUs / 4)) 14 | echo "Test with OMP_NUM_THREADS="$OMP_NUM_THREADS 15 | export MXNET_SUBGRAPH_BACKEND=MKLDNN 16 | 17 | # Launch dummy data Inference 18 | #RN18 19 | cd ./example/quantization 20 | python imagenet_gen_qsym_mkldnn.py --model=resnet18_v1 --num-calib-batches=5 --calib-mode=naive 21 | echo "-----ResNet18 FP32 single-inst-----" 22 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 23 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 24 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 25 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 26 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 27 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 28 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 29 | 30 | #RN50 31 | python imagenet_gen_qsym_mkldnn.py --model=resnet50_v1 --num-calib-batches=5 --calib-mode=naive 32 | echo "-----ResNet50 FP32 single-inst-----" 33 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 34 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 35 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 36 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 37 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 38 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=32 
--num-inference-batches=1000 --ctx=cpu --benchmark=True 39 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 40 | 41 | #RN101 42 | python imagenet_gen_qsym_mkldnn.py --model=resnet101_v1 --num-calib-batches=5 --calib-mode=naive 43 | echo "-----ResNet101 FP32 single-inst-----" 44 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 45 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 46 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 47 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 48 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 49 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 50 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 51 | 52 | #Squeezenet1.0 53 | python imagenet_gen_qsym_mkldnn.py --model=squeezenet1.0 --num-calib-batches=5 --calib-mode=naive 54 | echo "-----SqueezeNet1.0 FP32 single-inst-----" 55 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 56 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 57 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 58 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 59 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 60 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 61 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 62 | 63 | #MobileNet1.0 64 | python imagenet_gen_qsym_mkldnn.py --model=mobilenet1.0 --num-calib-batches=5 --calib-mode=naive 65 | echo "-----MobileNet v1 FP32 single-inst-----" 66 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 67 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 68 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 69 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 70 | python imagenet_inference.py 
--symbol-file=./model/mobilenet1.0-symbol.json --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 71 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 72 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 73 | 74 | #MobileNet2.0 75 | python imagenet_gen_qsym_mkldnn.py --model=mobilenetv2_1.0 --num-calib-batches=5 --calib-mode=naive 76 | echo "-----MobileNet v2 FP32 single-inst-----" 77 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 78 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 79 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 80 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 81 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 82 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 83 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 84 | 85 | #inception v3 86 | python imagenet_gen_qsym_mkldnn.py --model=inceptionv3 --image-shape=3,299,299 --num-calib-batches=5 --calib-mode=naive 87 | echo "-----Inception v3 FP32 single-inst-----" 88 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 89 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 90 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 91 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 92 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 93 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 94 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 95 | 96 | #ResNet152-v2 97 | python imagenet_gen_qsym_mkldnn.py --model=imagenet1k-resnet-152 --num-calib-batches=5 --calib-mode=naive 98 | echo "-----ResNet152-v2 FP32 single-inst-----" 99 | python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 100 | python imagenet_inference.py 
--symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 101 | python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 102 | python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 103 | python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 104 | python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 105 | python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 106 | -------------------------------------------------------------------------------- /pytorch/benchmark_tools/run_caffe2.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | ## @package caffe2_tools 3 | # Module caffe2.tools.run_caffe2 4 | """ 5 | the main entry point to run a caffe2 model 6 | """ 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | from __future__ import unicode_literals 11 | 12 | import os 13 | import sys 14 | import timeit 15 | import logging 16 | import argparse 17 | 18 | LOG_FORMAT = "%(levelname)s:%(message)s" 19 | 20 | def ArgError(error): 21 | """ 22 | print an error message when a required argument is missing 23 | """ 24 | logging.error("Please set {}. " 25 | "OR, refer to the help of this script (-h)" 26 | .format(error)) 27 | 28 | def Calibration(args, extra_args): 29 | """ 30 | function to do calibration. 31 | """ 32 | if not args.model: 33 | ArgError("model to run (-m)") 34 | return 35 | if args.print_net_def: 36 | import inference as inf 37 | inf.PrintNetDef(args.model, args.print_net_def) 38 | return 39 | if not args.device: 40 | ArgError("device (-d)") 41 | return 42 | if ( 43 | not args.dummydata and 44 | not args.images_path and 45 | not "CAFFE2_INF_IMG_PATH" in os.environ 46 | ): 47 | ArgError("the path of input images (-p)") 48 | return 49 | import inference as inf 50 | inf.Calibration(args, extra_args) 51 | 52 | 53 | def Inference(args, extra_args): 54 | """ 55 | function to do inference. 
56 | """ 57 | if not args.model: 58 | ArgError("model to run (-m)") 59 | return 60 | if args.print_net_def: 61 | import inference as inf 62 | inf.PrintNetDef(args.model, args.print_net_def) 63 | return 64 | if not args.device: 65 | ArgError("device (-d)") 66 | return 67 | if ( 68 | not args.dummydata and 69 | not args.images_path and 70 | not "CAFFE2_INF_IMG_PATH" in os.environ 71 | ): 72 | ArgError("the path of input images (-p)") 73 | return 74 | import inference as inf 75 | inf.Run(args, extra_args) 76 | 77 | 78 | def GetArgumentParser(): 79 | """ 80 | to parse the argument 81 | """ 82 | parser = argparse.ArgumentParser(description="The scripts to run Caffe2.\n" 83 | "for example, to run alexnet inference:\n" 84 | "./run_caffe2.py -m alexnet\n" 85 | " -p /path/to/input/image\n" 86 | " -v /path/to/image/validate/index/file\n" 87 | ) 88 | parser.add_argument( 89 | "-a", "--optimization", 90 | type=str, 91 | help="Enable optimizations for running mode, split by comma.\n" 92 | "(Set 'all' to enable all optimizations for current running mode)\n" 93 | "-For inference, available optimizations:\n" 94 | "bn_folding,bn_inplace,fusion_conv_relu,fusion_conv_sum,remove_dropout," 95 | "int8_mode.\n" 96 | "-For training, available optimizations:\n" 97 | " " 98 | ) 99 | parser.add_argument( 100 | "-b", "--batch_size", 101 | type=int, 102 | default=1, 103 | help="The batch size. (DEFAULT: %(default)i)" 104 | ) 105 | parser.add_argument( 106 | "-c", "--crop_size", 107 | type=int, 108 | default=None, 109 | help="The crop size of input image. (DEFAULT: %(default)s)" 110 | ) 111 | parser.add_argument( 112 | "-d", "--device", 113 | type=str, 114 | default="ideep", 115 | help="Choose device to run. cpu, gpu or ideep." 116 | "(DEFAULT: %(default)s)" 117 | ) 118 | parser.add_argument( 119 | "-e", "--log_level", 120 | type=str, 121 | default="warning", 122 | help="The log level to show off. debug, info, warning, error, critical." 123 | "(DEFAULT: %(default)s)" 124 | ) 125 | parser.add_argument( 126 | "-f", "--forward_only", 127 | action='store_true', 128 | help="If set, only run the forward path." 129 | "(DEFAULT: %(default)s)" 130 | ) 131 | parser.add_argument( 132 | "-g", "--log", 133 | type=str, 134 | help="The log file path." 135 | ) 136 | parser.add_argument( 137 | "-i", "--iterations", 138 | type=int, 139 | help="Number of iterations to run the network." 140 | ) 141 | parser.add_argument( 142 | "-j", "--post_images_path", 143 | type=str, 144 | default=None, 145 | help="The path to store post images." 146 | ) 147 | parser.add_argument( 148 | "-l", "--label_file", 149 | type=str, 150 | help="The input label index file." 151 | ) 152 | parser.add_argument( 153 | "-m", "--model", 154 | type=str, 155 | help="The model to run." 156 | ) 157 | parser.add_argument( 158 | "-n", "--net_type", 159 | type=str, 160 | default="simple", 161 | help="The net type for Caffe2.(DEFAULT: %(default)s)" 162 | ) 163 | parser.add_argument( 164 | "-o", "--output_file", 165 | type=str, 166 | default=None, 167 | help="The output file to save the results of validating or label check." 168 | ) 169 | parser.add_argument( 170 | "-calib", "--calib_algo", 171 | type=str, 172 | help="The algorithm of calibration. 
absmax, moving_average, or l_divergence" 173 | ) 174 | parser.add_argument( 175 | "-single_iter_calib", "--single_iter_calib", 176 | action='store_true', 177 | help="Perform calibration on a single batch of images or not" 178 | ) 179 | parser.add_argument( 180 | "-iter_calib", "--iter_calib", 181 | type=int, 182 | default=None, 183 | help="Number of iterations to run calibration for. (DEFAULT: %(default)s)" 184 | ) 185 | parser.add_argument( 186 | "-int8", "--int8_model", 187 | action='store_true', 188 | help="Use the int8 model, instead of fp32 model." 189 | ) 190 | parser.add_argument( 191 | "-onnx", "--onnx_model", 192 | action='store_true', 193 | help="Use the onnx model, instead of caffe2 model." 194 | ) 195 | parser.add_argument( 196 | "-p", "--images_path", 197 | type=str, 198 | help="The path of input images." 199 | ) 200 | parser.add_argument( 201 | "-tp", "--tr_images_path", 202 | type=str, 203 | help="The path of input images for training." 204 | ) 205 | parser.add_argument( 206 | "-q", "--annotations", 207 | type=str, 208 | help="The path of Annotations file for VOC" 209 | ) 210 | parser.add_argument( 211 | "-r", "--mode", 212 | type=str, 213 | default="inference", 214 | help="Choose running mode. inference, calibration or training." 215 | "(DEFAULT: %(default)s)" 216 | ) 217 | parser.add_argument( 218 | "-calibf", "--calibration_file", 219 | type=str, 220 | help="Use the images in calibration_file for int8 calibration." 221 | ) 222 | parser.add_argument( 223 | "-s", "--show_supported_models", 224 | action='store_true', 225 | help="Show all supported models for inference." 226 | ) 227 | parser.add_argument( 228 | "-t", "--profile", 229 | action='store_true', 230 | help="Trigger profile on current topology." 231 | ) 232 | parser.add_argument( 233 | "-u", "--dummydata", 234 | action='store_true', 235 | help="Use a dummy dataset." 236 | ) 237 | parser.add_argument( 238 | "-uv", "--dummyvalue", 239 | type=str, 240 | default="random", 241 | help="The fill value for dummydata." 242 | "(DEFAULT: %(default)s)" 243 | ) 244 | parser.add_argument( 245 | "-v", "--validation_file", 246 | type=str, 247 | help="The input validation index file." 248 | ) 249 | parser.add_argument( 250 | "-w", "--warmup_iterations", 251 | type=int, 252 | default=0, 253 | help="Number of warm-up iterations before benchmarking." 254 | "(DEFAULT: %(default)i)" 255 | ) 256 | parser.add_argument( 257 | "-x", "--print_net_def", 258 | type=str, 259 | default=None, 260 | help="If set, only print out the net definition for the model.\n" 261 | "predict_net for topology, init_net for weight data." 262 | "(DEFAULT: %(default)s)" 263 | ) 264 | parser.add_argument( 265 | "-y", "--cosim", 266 | action='store_true', 267 | help="Trigger cosim on current topology." 268 | ) 269 | parser.add_argument( 270 | "-yi", "--int8_cosim", 271 | action='store_true', 272 | help="Trigger int8 cosim on current topology." 273 | ) 274 | parser.add_argument( 275 | "-z", "--noptimize", 276 | action='store_true', 277 | help="Do not trigger optimization on current topology." 278 | ) 279 | return parser 280 | 281 | 282 | if __name__ == '__main__': 283 | args, extra_args = GetArgumentParser().parse_known_args() 284 | LOG_LEVEL_MAP = { 285 | "debug": logging.DEBUG, 286 | "info": logging.INFO, 287 | "warning": logging.WARNING, 288 | "error": logging.ERROR, 289 | "critical": logging.CRITICAL, 290 | } 291 | if args.log_level.lower() in LOG_LEVEL_MAP: 292 | log_level = LOG_LEVEL_MAP[args.log_level.lower()] 293 | else: 294 | log_level = None 295 | logging.warning("Wrong log level {}. 
Ignored!".format(args.log_level)) 296 | logging.basicConfig( 297 | format=LOG_FORMAT, 298 | filename=args.log, 299 | filemode="w", 300 | level=log_level) 301 | 302 | if args.show_supported_models: 303 | import inference.models as m 304 | 305 | m.ShowModels() 306 | elif len(sys.argv) == 1: 307 | GetArgumentParser().print_help() 308 | else: 309 | type_map = { 310 | "inference": Inference, 311 | "calibration": Calibration, 312 | } 313 | if args.mode.lower() in type_map: 314 | start_time = timeit.default_timer() 315 | type_map[args.mode.lower()](args, extra_args) 316 | elapsed_time = timeit.default_timer() - start_time 317 | logging.warning("Total time in {} mode: {:.10f} seconds" 318 | .format(args.mode, elapsed_time)) 319 | else: 320 | logging.error("Wrong running mode {}. Exit!".format(args.mode)) 321 | -------------------------------------------------------------------------------- /pytorch/ResNet50/README.md: -------------------------------------------------------------------------------- 1 | # Guide to run ResNet50 with FP32/BF16 data type 2 | 3 | ## Verified on 4 | 5 | | Item | Value | 6 | | -: | :- | 7 | | OS | Ubuntu 20.04 LTS | 8 | | Compiler | gcc 8.4.0 | 9 | | Memory | DDR4 3200MHz | 10 | 11 | ## Prepare your running environment 12 | 1. Setup for PyTorch build environment: 13 | ``` 14 | wget https://repo.continuum.io/archive/Anaconda3-5.0.0-Linux-x86_64.sh -O anaconda3.sh 15 | chmod +x anaconda3.sh 16 | ./anaconda3.sh -b -p ~/anaconda3 17 | ./anaconda3/bin/conda create -yn pytorch python=3.7 18 | export PATH=~/anaconda3/bin:$PATH 19 | source ./anaconda3/bin/activate pytorch 20 | pip install sklearn onnx 21 | conda config --append channels intel 22 | conda install ninja pyyaml setuptools cmake cffi typing 23 | conda install intel-openmp mkl mkl-include numpy -c intel --no-update-deps 24 | ``` 25 | 26 | 2. Build and install PyTorch 27 | ``` 28 | git clone https://github.com/pytorch/pytorch.git 29 | cd pytorch 30 | git checkout gh/xiaobingsuper/18/orig 31 | python setup.py clean 32 | git submodule sync && git submodule update --init --recursive 33 | wget https://github.com/hongzhen1/pytorch/commit/3511d7f6bd2060e20cf77b770ae32ff538700f37.diff -O dataloader.diff 34 | git apply dataloader.diff 35 | cd third_party/ideep/ && git checkout master && git pull && git checkout pytorch_dnnl_dev && cd ../../ 36 | git add third_party/ideep && git submodule sync && git submodule update --init --recursive 37 | cd third_party/ideep # make sure ideep commit is 2bf943e 38 | cd ../../ 39 | pip install -r requirements.txt 40 | export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}/lib 41 | python setup.py install 42 | ``` 43 | 44 | 3. Install jemalloc 45 | ``` 46 | cd .. 47 | git clone https://github.com/jemalloc/jemalloc.git 48 | cd jemalloc 49 | ./autogen.sh 50 | ./configure --prefix=$HOME/.local 51 | make 52 | make install 53 | ``` 54 | 55 | 4. download imagenet dataset 56 | reference: https://github.com/facebookarchive/fb.resnet.torch/blob/master/INSTALL.md#download-the-imagenet-dataset 57 | 58 | 5. install vision & imagenet 59 | ``` 60 | cd .. 61 | git clone https://github.com/pytorch/vision 62 | cd vision 63 | python setup.py install 64 | 65 | cd .. 
66 | git clone https://github.com/intel/optimized-models.git 67 | cd optimized-models/imagenet/imagenet 68 | ``` 69 | 70 | ## Example: 71 | 72 | Core(s) per socket: 24 73 | 74 | ``` 75 | export DNNL_PRIMITIVE_CACHE_CAPACITY=1024 76 | export DATA_PATH= 77 | ``` 78 | 79 | **Note:** change ip address (xxx.xxx.xxx.xxx) in the following commands to the one in your environment. 80 | 81 | ### FP32: 82 | * training benchmark (4 instances, 24 cores/ins): 83 | ``` 84 | export LD_PRELOAD=$HOME/.local/lib/libjemalloc.so 85 | MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C0-23 -m0 python -u main_multinode.py -a resnet50 --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=0 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" & MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C24-47 -m1 python -u main_multinode.py -a resnet50 --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=1 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" & MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C48-71 -m2 python -u main_multinode.py -a resnet50 --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=2 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" & MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C72-95 -m3 python -u main_multinode.py -a resnet50 --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=3 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" & 86 | ``` 87 | 88 | * training accuracy for multi-nodes (4 nodes, batch_size=64 for every node): 89 | 90 | **Legends:** 91 | 92 | | flag | description | 93 | | -: | - | 94 | | -j | number of cores per node | 95 | | --world-size | total number of nodes | 96 | | --rank | rank (index) of this node | 97 | | batch_size | 256/nodes | 98 | 99 | **Commands run on nodes:** 100 | 101 | * on node0 102 | ``` 103 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 104 | python -u main.py --lr 0.1 -a resnet50 --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=0 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 105 | ``` 106 | 107 | * on node1 108 | ``` 109 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 110 | python -u main.py --lr 0.1 -a resnet50 --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=1 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 111 | ``` 112 | 113 | * on node2 114 | ``` 115 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 116 | python -u main.py --lr 0.1 -a resnet50 --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=2 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 117 | ``` 118 | 119 | * on node3 120 | ``` 121 | 
export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 122 | python -u main.py --lr 0.1 -a resnet50 --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=3 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 123 | ``` 124 | 125 | * inference throughput benchmark (4 instances, 24 cores/ins): 126 | ``` 127 | export LD_PRELOAD=$HOME/.local/lib/libjemalloc.so 128 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 129 | bash run_inference_cpu_multi_instance.sh resnet50 130 | ``` 131 | 132 | * inference realtime benchmark (24 instances, 4 cores/ins): 133 | ``` 134 | export LD_PRELOAD=$HOME/.local/lib/libjemalloc.so 135 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 136 | bash run_inference_cpu_multi_instance_latency.sh resnet50 137 | ``` 138 | 139 | * inference accuracy: 140 | ``` 141 | bash run_inference_cpu_accuracy.sh resnet50 142 | ``` 143 | 144 | ### BF16: 145 | * training benchmark (4 instances, 24 cores/ins): 146 | ``` 147 | export LD_PRELOAD=$HOME/.local/lib/libjemalloc.so 148 | MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C0-23 -m0 python -u main_multinode.py -a resnet50 --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=0 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --bf16 & MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C24-47 -m1 python -u main_multinode.py -a resnet50 --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=1 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --bf16 & MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C48-71 -m2 python -u main_multinode.py -a resnet50 --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=2 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --bf16 & MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C72-95 -m3 python -u main_multinode.py -a resnet50 --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=3 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --bf16 & 149 | ``` 150 | 151 | * training accuracy (4 nodes, batch_size=64 for every node): 152 | 153 | **Legends:** 154 | 155 | | flag | description | 156 | | -: | - | 157 | | -j | number of cores per node | 158 | | --world-size | total number of nodes | 159 | | --rank | rank (index) of this node | 160 | | batch_size | 256/nodes | 161 | 162 | **Commands run on nodes:** 163 | 164 | * on node0 165 | ``` 166 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 167 | python -u main.py --lr 0.1 -a resnet50 --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=0 --dist-backend=gloo 
--dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 --bf16 168 | ``` 169 | * on node1 170 | ``` 171 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 172 | python -u main.py --lr 0.1 -a resnet50 --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=1 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 --bf16 173 | ``` 174 | * on node2 175 | ``` 176 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 177 | python -u main.py --lr 0.1 -a resnet50 --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=2 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 --bf16 178 | ``` 179 | * on node3 180 | ``` 181 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 182 | python -u main.py --lr 0.1 -a resnet50 --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=3 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 --bf16 183 | ``` 184 | 185 | * inference throughput benchmark (4 instances, 24 cores/ins): 186 | ``` 187 | export LD_PRELOAD=$HOME/.local/lib/libjemalloc.so 188 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 189 | bash run_inference_cpu_multi_instance.sh resnet50 bf16 190 | ``` 191 | 192 | * inference realtime benchmark (24 instances, 4 cores/ins): 193 | ``` 194 | export LD_PRELOAD=$HOME/.local/lib/libjemalloc.so 195 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 196 | bash run_inference_cpu_multi_instance_latency.sh resnet50 bf16 197 | ``` 198 | 199 | * inference accuracy: 200 | ``` 201 | bash run_inference_cpu_accuracy.sh resnet50 bf16 202 | ``` 203 | -------------------------------------------------------------------------------- /pytorch/ResNext101_32x4d/README.md: -------------------------------------------------------------------------------- 1 | # Guide to run ResNext101_32x4d with FP32/BF16 data type 2 | 3 | ## Verified on 4 | 5 | | Item | Value | 6 | | -: | :- | 7 | | OS | Ubuntu 20.04 LTS | 8 | | Compiler | gcc 8.4.0 | 9 | | Memory | DDR4 3200MHz | 10 | 11 | ## Prepare your running environment 12 | 1. Setup for PyTorch build environment: 13 | ``` 14 | wget https://repo.continuum.io/archive/Anaconda3-5.0.0-Linux-x86_64.sh -O anaconda3.sh 15 | chmod +x anaconda3.sh 16 | ./anaconda3.sh -b -p ~/anaconda3 17 | ./anaconda3/bin/conda create -yn pytorch python=3.7 18 | export PATH=~/anaconda3/bin:$PATH 19 | source ./anaconda3/bin/activate pytorch 20 | pip install sklearn onnx 21 | conda config --append channels intel 22 | conda install ninja pyyaml setuptools cmake cffi typing 23 | conda install intel-openmp mkl mkl-include numpy -c intel --no-update-deps 24 | ``` 25 | 26 | 2. 
Build and install PyTorch 27 | ``` 28 | git clone https://github.com/pytorch/pytorch.git 29 | cd pytorch 30 | git checkout gh/xiaobingsuper/18/orig 31 | python setup.py clean 32 | git submodule sync && git submodule update --init --recursive 33 | wget https://github.com/hongzhen1/pytorch/commit/3511d7f6bd2060e20cf77b770ae32ff538700f37.diff -O dataloader.diff 34 | git apply dataloader.diff 35 | cd third_party/ideep/ && git checkout master && git pull && git checkout pytorch_dnnl_dev && cd ../../ 36 | git add third_party/ideep && git submodule sync && git submodule update --init --recursive 37 | cd third_party/ideep # make sure ideep commit is 2bf943e 38 | cd ../../ 39 | pip install -r requirements.txt 40 | export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}/lib 41 | python setup.py install 42 | ``` 43 | 44 | 3. Install jemalloc 45 | ``` 46 | cd .. 47 | git clone https://github.com/jemalloc/jemalloc.git 48 | cd jemalloc 49 | ./autogen.sh 50 | ./configure --prefix=$HOME/.local # so that $HOME/.local/lib/libjemalloc.so matches the LD_PRELOAD used below 51 | make 52 | make install 53 | ``` 54 | 55 | 4. download imagenet dataset 56 | reference: https://github.com/facebookarchive/fb.resnet.torch/blob/master/INSTALL.md#download-the-imagenet-dataset 57 | 58 | 5. install vision & imagenet 59 | ``` 60 | cd .. 61 | git clone https://github.com/intel/optimized-models.git 62 | 63 | git clone https://github.com/pytorch/vision 64 | cd vision 65 | cp ../optimized-models/imagenet/imagenet/resnet.py torchvision/models/resnet.py 66 | python setup.py install 67 | cd ../optimized-models/imagenet/imagenet 68 | ``` 69 | 70 | ## Example: 71 | 72 | Core(s) per socket: 24 73 | 74 | ``` 75 | export DNNL_PRIMITIVE_CACHE_CAPACITY=1024 76 | export DATA_PATH= 77 | ``` 78 | 79 | **Note:** change ip address (xxx.xxx.xxx.xxx) in the following commands to the one in your environment. 
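The `-C`/`-m` bindings in the FP32/BF16 commands below assume the "Core(s) per socket: 24" layout noted above (four NUMA nodes with 24 cores each). As a minimal sketch for checking your own topology before editing those ranges (plain `lscpu`/`numactl` usage, nothing specific to this repo):

```
# Show socket/core counts and the NUMA node layout so the numactl
# -C (core range) and -m (memory node) arguments can be adapted.
lscpu | grep -E 'Socket|Core|NUMA'
numactl --hardware
```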
80 | 81 | ### FP32: 82 | * training benchmark (4 instances, 24 cores/ins): 83 | ``` 84 | export LD_PRELOAD=$HOME/.local/lib/libjemalloc.so 85 | MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C0-23 -m0 python -u main_multinode.py -a resnext101_32x4d --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=0 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" & MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C24-47 -m1 python -u main_multinode.py -a resnext101_32x4d --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=1 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" & MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C48-71 -m2 python -u main_multinode.py -a resnext101_32x4d --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=2 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" & MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C72-95 -m3 python -u main_multinode.py -a resnext101_32x4d --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=3 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" & 86 | ``` 87 | 88 | * training accuracy for multi-nodes (4 nodes, batch_size=64 for every node): 89 | 90 | **Legends:** 91 | 92 | | flag | description | 93 | | -: | - | 94 | | -j | number of cores per node | 95 | | --world-size | total number of nodes | 96 | | --rank | rank (index) of this node | 97 | | batch_size | 256/nodes | 98 | 99 | **Commands run on nodes** 100 | 101 | * on node0 102 | ``` 103 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 104 | python -u main.py --lr 0.1 -a resnext101_32x4d --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=0 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 105 | ``` 106 | * on node1 107 | ``` 108 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 109 | python -u main.py --lr 0.1 -a resnext101_32x4d --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=1 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 110 | ``` 111 | * on node2 112 | ``` 113 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 114 | python -u main.py --lr 0.1 -a resnext101_32x4d --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=2 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 115 | ``` 116 | * on node3 117 | ``` 118 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 119 | python -u main.py --lr 0.1 -a resnext101_32x4d --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=3 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 120 | ``` 
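For readability, the four-instance training benchmark at the top of this section can also be written as a loop. This is only a hedged restatement of that command (same flags, same placeholder IP, same 4x24-core assumption), with `wait` added so the shell blocks until all ranks finish:

```
#!/bin/bash
# Launch one training process per NUMA node (ranks 0-3, 24 cores each),
# mirroring the 4-instance FP32 training benchmark command above.
export LD_PRELOAD=$HOME/.local/lib/libjemalloc.so
export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000"
for rank in 0 1 2 3; do
  start=$((rank * 24))
  end=$((start + 23))
  KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 \
  OMP_NUM_THREADS=24 numactl -C${start}-${end} -m${rank} \
    python -u main_multinode.py -a resnext101_32x4d --mkldnn $DATA_PATH \
      -b 128 -j 24 --world-size=4 --rank=${rank} --dist-backend=gloo \
      --dist-url="tcp://xxx.xxx.xxx.xxx:7689" &
done
wait
```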
121 | 122 | * inference throughput benchmark (4 instances, 24 cores/ins): 123 | ``` 124 | # batch_size=128 125 | export LD_PRELOAD=$HOME/.local/lib/libjemalloc.so 126 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 127 | bash run_inference_cpu_multi_instance.sh resnext101_32x4d 128 | ``` 129 | 130 | * inference realtime benchmark (24 instances, 4 cores/ins): 131 | ``` 132 | # batch_size=128 133 | export LD_PRELOAD=$HOME/.local/lib/libjemalloc.so 134 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 135 | bash run_inference_cpu_multi_instance_latency.sh resnext101_32x4d 136 | ``` 137 | 138 | * inference accuracy: 139 | ``` 140 | bash run_inference_cpu_accuracy.sh resnext101_32x4d 141 | ``` 142 | 143 | ### BF16: 144 | * training benchmark (4 instances, 24 cores/ins): 145 | ``` 146 | export LD_PRELOAD=$HOME/.local/lib/libjemalloc.so 147 | MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C0-23 -m0 python -u main_multinode.py -a resnext101_32x4d --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=0 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --bf16 & MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C24-47 -m1 python -u main_multinode.py -a resnext101_32x4d --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=1 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --bf16 & MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C48-71 -m2 python -u main_multinode.py -a resnext101_32x4d --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=2 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --bf16 & MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C72-95 -m3 python -u main_multinode.py -a resnext101_32x4d --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=3 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --bf16 148 | ``` 149 | 150 | * training accuracy (4 nodes, batch_size=64 for every node): 151 | 152 | **Legends:** 153 | 154 | | flag | description | 155 | | -: | - | 156 | | -j | number of cores per node | 157 | | --world-size | total number of nodes | 158 | | --rank | rank (index) of this node | 159 | | batch_size | 256/nodes | 160 | 161 | **Commands run on nodes** 162 | 163 | * on node0 164 | ``` 165 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 166 | python -u main.py --lr 0.1 -a resnext101_32x4d --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=0 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 --bf16 167 | ``` 168 | * on node1 169 | ``` 170 | export
MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 171 | python -u main.py --lr 0.1 -a resnext101_32x4d --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=1 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 --bf16 172 | ``` 173 | * on node2 174 | ``` 175 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 176 | python -u main.py --lr 0.1 -a resnext101_32x4d --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=2 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 --bf16 177 | ``` 178 | * on node3 179 | ``` 180 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 181 | python -u main.py --lr 0.1 -a resnext101_32x4d --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=3 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 --bf16 182 | ``` 183 | 184 | * inference throughput benchmark (4 instances, 24 cores/ins): 185 | ``` 186 | export LD_PRELOAD=$HOME/.local/lib/libjemalloc.so 187 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 188 | bash run_inference_cpu_multi_instance.sh resnext101_32x4d bf16 189 | ``` 190 | 191 | * inference realtime benchmark (24 instances, 4 cores/ins): 192 | ``` 193 | export LD_PRELOAD=$HOME/.local/lib/libjemalloc.so 194 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 195 | bash run_inference_cpu_multi_instance_latency.sh resnext101_32x4d bf16 196 | ``` 197 | 198 | * inference accuracy: 199 | ``` 200 | bash run_inference_cpu_accuracy.sh resnext101_32x4d bf16 201 | ``` 202 | --------------------------------------------------------------------------------