├── pytorch ├── benchmark_tools │ ├── common │ │ ├── __init__.py │ │ └── common_mlperf.py │ ├── inference │ │ ├── caffe │ │ │ ├── __init__.py │ │ │ └── proto │ │ │ │ └── __init__.py │ │ ├── __init__.py │ │ ├── models │ │ │ ├── resnet50_v1 │ │ │ │ └── __model_def │ │ │ ├── resnext101_32x4d │ │ │ │ └── __model_def │ │ │ ├── resnet50 │ │ │ │ └── __model_def │ │ │ └── __init__.py │ │ └── calibration_int8.py │ └── run_caffe2.py ├── imagenet │ └── imagenet │ │ ├── requirements.txt │ │ ├── run_inference_cpu_accuracy.sh │ │ ├── run_inference_cpu_multi_instance_latency.sh │ │ ├── run_inference_cpu_multi_instance.sh │ │ └── README.md ├── dlrm │ ├── dlrm │ │ ├── input │ │ │ ├── trace.log │ │ │ ├── dist_emb_0.log │ │ │ ├── dist_emb_1.log │ │ │ └── dist_emb_2.log │ │ ├── requirements.txt │ │ ├── CODE_OF_CONDUCT.md │ │ ├── cython │ │ │ ├── cython_compile.py │ │ │ └── cython_criteo.py │ │ ├── LICENSE │ │ ├── CONTRIBUTING.md │ │ ├── bench │ │ │ ├── dlrm_s_criteo_kaggle.sh │ │ │ ├── dlrm_s_criteo_terabyte.sh │ │ │ └── dlrm_s_benchmark.sh │ │ ├── test │ │ │ └── dlrm_s_test.sh │ │ └── quorem │ │ │ └── qr_embedding_bag.py │ └── README.md ├── RESNET50V1.md ├── distributed │ └── README.md ├── README.md ├── ResNet50 │ └── README.md └── ResNext101_32x4d │ └── README.md ├── mxnet ├── wide_deep_criteo │ ├── launch_train.sh │ ├── launch_inference.sh │ ├── getdata.sh │ ├── model.py │ ├── data.py │ ├── train.py │ ├── README.md │ ├── inference.py │ ├── wd_gen_qsym_subgraph.py │ ├── wd_gen_qsym_subgraph_update.py │ └── update_model │ │ └── embedding-fuse.json └── blog │ ├── mxnet_v1.5_release │ ├── single-instance-rnn-mxnet-1.5.sh │ ├── single-instance-rnn-mxnet-mkl1.5.sh │ ├── 2instance-rnn-mxnet1.5.sh │ ├── 2instance-rnn-mxnet-mkl1.5.sh │ ├── rnn_benchmark.py │ ├── single-instance-cnn-mxnet-1.5.sh │ └── single-instance-fp32-cnn-mxnet-mkl1.5.sh │ └── medium_vnni │ ├── ec2_benchmark_base.sh │ └── ec2_benchmark_int8.sh ├── README.md └── third-party-programs.txt /pytorch/benchmark_tools/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pytorch/benchmark_tools/inference/caffe/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pytorch/benchmark_tools/inference/caffe/proto/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pytorch/imagenet/imagenet/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision 3 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/input/trace.log: -------------------------------------------------------------------------------- 1 | 1, 2, 3, 4, 5, 3, 4, 1, 1, 6, 3 2 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/requirements.txt: -------------------------------------------------------------------------------- 1 | future 2 | numpy 3 | onnx 4 | pydot 5 | torch 6 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/input/dist_emb_0.log: -------------------------------------------------------------------------------- 1 | 1, 2, 3, 4, 5, 6 2 | 0, 1, 3, 4, 5 3 | 0.55, 0.64, 0.82, 0.91, 1.0 4 | 
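The three `dist_emb_*.log` inputs here each hold three comma-separated rows; the third row is monotonically increasing and ends at 1.0, so it reads like a cumulative distribution over the values in the second row. A minimal sketch of loading such a file and sampling from it under that assumption (the `load_dist_log` helper and the row semantics are assumptions, not documented in the repo):

```python
import numpy as np

def load_dist_log(path):
    """Parse a dist_emb_*.log into one float array per non-empty line."""
    with open(path) as f:
        return [np.array([float(v) for v in line.split(",")])
                for line in f if line.strip()]

rows = load_dist_log("pytorch/dlrm/dlrm/input/dist_emb_0.log")
support, cdf = rows[1], rows[2]  # assumed: values and their cumulative probabilities
samples = support[np.searchsorted(cdf, np.random.rand(8))]  # inverse-CDF sampling
```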
-------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/input/dist_emb_1.log: -------------------------------------------------------------------------------- 1 | 1, 2, 3, 4, 5, 6 2 | 0, 1, 3, 4, 5 3 | 0.55, 0.64, 0.82, 0.91, 1.0 4 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/input/dist_emb_2.log: -------------------------------------------------------------------------------- 1 | 1, 2, 3, 4, 5, 6 2 | 0, 1, 3, 4, 5 3 | 0.55, 0.64, 0.82, 0.91, 1.0 4 | -------------------------------------------------------------------------------- /mxnet/wide_deep_criteo/launch_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 4 | export OMP_NUM_THREADS=56 5 | 6 | python train.py --batch-size=1024 --data-dir=./data 7 | -------------------------------------------------------------------------------- /pytorch/benchmark_tools/inference/__init__.py: -------------------------------------------------------------------------------- 1 | """init file of inference""" 2 | from inference.inference_caffe2 import Run 3 | from inference.inference_caffe2 import PrintNetDef 4 | from inference.calibration_int8 import Calibration 5 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to. 4 | Please read the [full text](https://code.fb.com/codeofconduct/) 5 | so that you can understand what actions will and will not be tolerated. 6 | -------------------------------------------------------------------------------- /mxnet/wide_deep_criteo/launch_inference.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 4 | export OMP_NUM_THREADS=28 5 | 6 | echo 7 | echo "Running inference in benchmark mode..." 8 | numactl --physcpubind=0-27 --membind=0 python inference.py 9 | 10 | echo 11 | echo "Running inference in accuracy mode..."
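# As in benchmark mode above, pin the run to cores 0-27 (assumed to be socket 0)
# and to that socket's local memory; the extra --accuracy True flag selects
# inference.py's accuracy-checking path instead of the pure benchmark run, as
# the echoes above indicate.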
12 | numactl --physcpubind=0-27 --membind=0 python inference.py --accuracy True 13 | 14 | -------------------------------------------------------------------------------- /pytorch/benchmark_tools/inference/models/resnet50_v1/__model_def: -------------------------------------------------------------------------------- 1 | [Model Name] 2 | ResNet50_v1 3 | 4 | [Model Type] 5 | Caffe legacy 6 | 7 | [Output Type] 8 | Possibility 9 | 10 | [Model Description] 11 | Model definition for Caffe2 12 | 13 | [Init Net] 14 | init_net.pb 15 | 16 | [Init Net Int8] 17 | init_net_int8.pb 18 | 19 | [Predict Net] 20 | predict_net.pb 21 | 22 | [Predict Net Int8] 23 | predict_net_int8.pb 24 | 25 | [Onnx Model] 26 | resnet50_onnx.pb 27 | 28 | [Crop Size] 29 | 224 30 | 31 | [Image Mean] 32 | 104 117 123 33 | 34 | [Scale] 35 | 0.0078125 36 | 37 | [Train Proto] 38 | 39 | 40 | [Deploy Proto] 41 | 42 | 43 | -------------------------------------------------------------------------------- /mxnet/wide_deep_criteo/getdata.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | echo 4 | echo "Starting to download the large Criteo dataset; this might take a long time" 5 | echo 6 | 7 | DATA_DIR="./data" 8 | if [[ ! -d "${DATA_DIR}" ]]; then 9 | echo "${DATA_DIR} doesn't exist, will create one"; 10 | mkdir -p data 11 | fi 12 | 13 | #training set 14 | echo "Downloading the training dataset..." 15 | wget -P ./data https://storage.googleapis.com/dataset-uploader/criteo-kaggle/large_version/train.csv; 16 | 17 | #validation set 18 | echo "Downloading the validation dataset..." 19 | wget -P ./data https://storage.googleapis.com/dataset-uploader/criteo-kaggle/large_version/eval.csv; 20 | -------------------------------------------------------------------------------- README.md: -------------------------------------------------------------------------------- 1 | DISCONTINUATION OF PROJECT 2 | 3 | This project will no longer be maintained by Intel. 4 | 5 | Intel has ceased development and contributions including, but not limited to, maintenance, bug fixes, new releases, or updates, to this project. 6 | 7 | Intel no longer accepts patches to this project. 8 | 9 | If you have an ongoing need to use this project, are interested in independently developing it, or would like to maintain patches for the open source software community, please create your own fork of this project. 10 | 11 | Contact: webadmin@linux.intel.com 12 | optimized-models 13 | ================== 14 | 15 | Intel-optimized models for easy reproduction by users.
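The `__model_def` files in this tree (resnet50_v1 above, resnext101_32x4d and resnet50 below) use a simple layout: a `[Section]` header line followed by its value on the next line. `pytorch/benchmark_tools/inference/models/__init__.py`, shown later in this listing, reads each field with `defs.index(header) + 1`. A minimal sketch of that lookup (the `read_field` helper is illustrative, not part of the repo):

```python
def read_field(lines, header, default=None):
    """Return the line right after `[header]`, or `default` if the section is absent."""
    key = "[%s]" % header
    return lines[lines.index(key) + 1] if key in lines else default

with open("inference/models/resnet50_v1/__model_def") as f:
    lines = [line.rstrip("\n") for line in f]

crop_size = int(read_field(lines, "Crop Size"))   # 224
scale = float(read_field(lines, "Scale"))         # 0.0078125
init_int8 = read_field(lines, "Init Net Int8")    # init_net_int8.pb
```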
16 | -------------------------------------------------------------------------------- /pytorch/benchmark_tools/inference/models/resnext101_32x4d/__model_def: -------------------------------------------------------------------------------- 1 | [Model Name] 2 | ResNeXt101_32x4d 3 | 4 | [Model Type] 5 | Normal 6 | 7 | [Output Type] 8 | Possibility 9 | 10 | [Model Description] 11 | Model definition for Caffe2 12 | 13 | [Init Net] 14 | init_net.pb 15 | 16 | [Init Net Int8] 17 | init_onnx_int8.pb 18 | 19 | [Predict Net] 20 | predict_net.pb 21 | 22 | [Predict Net Int8] 23 | predict_onnx_int8.pb 24 | 25 | [Onnx Model] 26 | resnext101_32x4d.onnx 27 | 28 | [Crop Size] 29 | 224 30 | 31 | [Image Mean] 32 | 0.485 0.456 0.406 33 | 34 | [Scale] 35 | 4.3668 4.4643 4.4444 36 | 37 | [Need Normalize] 38 | true 39 | 40 | [Color Format] 41 | RGB 42 | 43 | [Train Proto] 44 | 45 | 46 | [Deploy Proto] 47 | 48 | 49 | -------------------------------------------------------------------------------- /pytorch/benchmark_tools/inference/models/resnet50/__model_def: -------------------------------------------------------------------------------- 1 | [Model Name] 2 | ResNet50 3 | 4 | [Model Type] 5 | Caffe legacy 6 | 7 | [Output Type] 8 | Possibility 9 | 10 | [Model Description] 11 | Model definition for Caffe2 12 | 13 | [Init Net] 14 | init_net.pb 15 | 16 | [Init Net Int8] 17 | init_onnx_int8.pb 18 | 19 | [Predict Net] 20 | predict_net.pb 21 | 22 | [Predict Net Int8] 23 | predict_onnx_int8.pb 24 | 25 | [Onnx Model] 26 | resnet50.onnx 27 | 28 | [Crop Size] 29 | 224 30 | 31 | [Image Mean] 32 | 0.485 0.456 0.406 33 | 34 | [Scale] 35 | 4.36681223 4.46428571 4.44444444 36 | 37 | [Need Normalize] 38 | true 39 | 40 | [Color Format] 41 | RGB 42 | 43 | [Train Proto] 44 | 45 | 46 | [Deploy Proto] 47 | 48 | 49 | [Model Source] 50 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/cython/cython_compile.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | # 6 | # Description: compile .so from python code 7 | 8 | from __future__ import absolute_import, division, print_function, unicode_literals 9 | 10 | from setuptools import setup 11 | from Cython.Build import cythonize 12 | from distutils.extension import Extension 13 | 14 | ext_modules = [ 15 | Extension( 16 | "data_utils_cython", 17 | ["data_utils_cython.pyx"], 18 | extra_compile_args=['-O3'], 19 | extra_link_args=['-O3'], 20 | ) 21 | ] 22 | 23 | setup( 24 | name='data_utils_cython', 25 | ext_modules=cythonize(ext_modules) 26 | ) 27 | -------------------------------------------------------------------------------- /mxnet/blog/mxnet_v1.5_release/single-instance-rnn-mxnet-1.5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "MXNet Model FP32 single-instance LSTM Inference Performance" 4 | echo "Testing FP32 base models" 5 | echo "Installing mxnet 1.5" 6 | pip install mxnet 7 | 8 | export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 9 | export vCPUs=`cat /proc/cpuinfo | grep processor | wc -l` 10 | export OMP_NUM_THREADS=$((vCPUs / 4)) 11 | echo "Test with OMP_NUM_THREADS="$OMP_NUM_THREADS 12 | 13 | echo "-----LSTM FP32 4-layers inference-----" 14 | numactl --cpunodebind=0 --physcpubind=0-$((OMP_NUM_THREADS-1)) --membind=0 python rnn_benchmark.py --cell_type lstm --layer_num 4 15 | echo "-----LSTM FP32 8-layers inference-----" 16 | numactl --cpunodebind=0 --physcpubind=0-$((OMP_NUM_THREADS-1)) --membind=0 python rnn_benchmark.py --cell_type lstm --layer_num 8 17 | 18 | -------------------------------------------------------------------------------- /mxnet/blog/mxnet_v1.5_release/single-instance-rnn-mxnet-mkl1.5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "MXNet Model FP32 single-instance LSTM Inference Performance" 4 | echo "Testing FP32 base models" 5 | echo "Installing mxnet-mkl 1.5" 6 | pip install mxnet-mkl 7 | 8 | export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 9 | export vCPUs=`cat /proc/cpuinfo | grep processor | wc -l` 10 | export OMP_NUM_THREADS=$((vCPUs / 4)) 11 | echo "Test with OMP_NUM_THREADS="$OMP_NUM_THREADS 12 | 13 | echo "-----LSTM FP32 4-layers inference-----" 14 | numactl --cpunodebind=0 --physcpubind=0-$((OMP_NUM_THREADS-1)) --membind=0 python rnn_benchmark.py --cell_type lstm --layer_num 4 15 | echo "-----LSTM FP32 8-layers inference-----" 16 | numactl --cpunodebind=0 --physcpubind=0-$((OMP_NUM_THREADS-1)) --membind=0 python rnn_benchmark.py --cell_type lstm --layer_num 8 17 | 18 | -------------------------------------------------------------------------------- /mxnet/blog/mxnet_v1.5_release/2instance-rnn-mxnet1.5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "MXNet Model FP32 multi-instance LSTM Inference Performance" 4 | echo "Testing FP32 base models" 5 | echo "Installing mxnet1.5" 6 | pip install mxnet 7 | 8 | export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 9 | 10 | echo "-----LSTM FP32 4-layers multi-instance inference-----" 11 | OMP_NUM_THREADS=24 numactl --cpunodebind=0 --physcpubind=0-23 --membind=0 python rnn_benchmark.py --cell_type lstm --layer_num 4 & 12 | OMP_NUM_THREADS=24 numactl --cpunodebind=1 --physcpubind=24-47 --membind=1 python rnn_benchmark.py --cell_type lstm --layer_num 4 13 | 14 | echo "-----LSTM FP32 8-layers multi-instance inference-----" 15 | OMP_NUM_THREADS=24 numactl --cpunodebind=0 
--physcpubind=0-23 --membind=0 python rnn_benchmark.py --cell_type lstm --layer_num 8 & 16 | OMP_NUM_THREADS=24 numactl --cpunodebind=1 --physcpubind=24-47 --membind=1 python rnn_benchmark.py --cell_type lstm --layer_num 8 17 | -------------------------------------------------------------------------------- /mxnet/blog/mxnet_v1.5_release/2instance-rnn-mxnet-mkl1.5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "MXNet Model FP32 multi-instance LSTM Inference Performance" 4 | echo "Testing FP32 base models" 5 | echo "Installing mxnet-mkl 1.5" 6 | pip install mxnet-mkl 7 | 8 | export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 9 | 10 | echo "-----LSTM FP32 4-layers multi-instance inference-----" 11 | OMP_NUM_THREADS=24 numactl --cpunodebind=0 --physcpubind=0-23 --membind=0 python rnn_benchmark.py --cell_type lstm --layer_num 4 & 12 | OMP_NUM_THREADS=24 numactl --cpunodebind=1 --physcpubind=24-47 --membind=1 python rnn_benchmark.py --cell_type lstm --layer_num 4 13 | 14 | echo "-----LSTM FP32 8-layers multi-instance inference-----" 15 | OMP_NUM_THREADS=24 numactl --cpunodebind=0 --physcpubind=0-23 --membind=0 python rnn_benchmark.py --cell_type lstm --layer_num 8 & 16 | OMP_NUM_THREADS=24 numactl --cpunodebind=1 --physcpubind=24-47 --membind=1 python rnn_benchmark.py --cell_type lstm --layer_num 8 17 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Facebook, Inc. and its affiliates. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to DLRM 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `master`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 
14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Facebook's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## Coding Style 30 | * 4 spaces for indentation rather than tabs 31 | * 80 character line length 32 | * in general, please maintain a consistent style with the rest of the code 33 | 34 | ## License 35 | By contributing to DLRM, you agree that your contributions will be licensed 36 | under the LICENSE file in the root directory of this source tree. 37 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/bench/dlrm_s_criteo_kaggle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | #WARNING: must have compiled PyTorch and caffe2 8 | 9 | #check if extra argument is passed to the test 10 | if [[ $# == 1 ]]; then 11 | dlrm_extra_option=$1 12 | else 13 | dlrm_extra_option="" 14 | fi 15 | #echo $dlrm_extra_option 16 | 17 | dlrm_pt_bin="python dlrm_s_pytorch.py" 18 | dlrm_c2_bin="python dlrm_s_caffe2.py" 19 | 20 | echo "run pytorch ..." 21 | # WARNING: the following parameters will be set based on the data set 22 | # --arch-embedding-size=... (sparse feature sizes) 23 | # --arch-mlp-bot=... (the input to the first layer of bottom mlp) 24 | $dlrm_pt_bin --arch-sparse-feature-size=16 --arch-mlp-bot="13-512-256-64-16" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --raw-data-file=./input/train.txt --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time --test-mini-batch-size=16384 --test-num-workers=16 $dlrm_extra_option 2>&1 | tee run_kaggle_pt.log 25 | 26 | echo "run caffe2 ..." 27 | # WARNING: the following parameters will be set based on the data set 28 | # --arch-embedding-size=... (sparse feature sizes) 29 | # --arch-mlp-bot=... (the input to the first layer of bottom mlp) 30 | $dlrm_c2_bin --arch-sparse-feature-size=16 --arch-mlp-bot="13-512-256-64-16" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --raw-data-file=./input/train.txt --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time $dlrm_extra_option 2>&1 | tee run_kaggle_c2.log 31 | 32 | echo "done" 33 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/bench/dlrm_s_criteo_terabyte.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | #WARNING: must have compiled PyTorch and caffe2 8 | 9 | #check if extra argument is passed to the test 10 | if [[ $# == 1 ]]; then 11 | dlrm_extra_option=$1 12 | else 13 | dlrm_extra_option="" 14 | fi 15 | #echo $dlrm_extra_option 16 | 17 | dlrm_pt_bin="python dlrm_s_pytorch.py" 18 | dlrm_c2_bin="python dlrm_s_caffe2.py" 19 | 20 | echo "run pytorch ..." 21 | # WARNING: the following parameters will be set based on the data set 22 | # --arch-embedding-size=... (sparse feature sizes) 23 | # --arch-mlp-bot=... (the input to the first layer of bottom mlp) 24 | $dlrm_pt_bin --arch-sparse-feature-size=64 --arch-mlp-bot="13-512-256-64" --arch-mlp-top="512-512-256-1" --max-ind-range=10000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=2048 --print-freq=1024 --print-time --test-mini-batch-size=16384 --test-num-workers=16 $dlrm_extra_option 2>&1 | tee run_terabyte_pt.log 25 | 26 | echo "run caffe2 ..." 27 | # WARNING: the following parameters will be set based on the data set 28 | # --arch-embedding-size=... (sparse feature sizes) 29 | # --arch-mlp-bot=... (the input to the first layer of bottom mlp) 30 | $dlrm_c2_bin --arch-sparse-feature-size=64 --arch-mlp-bot="13-512-256-64" --arch-mlp-top="512-512-256-1" --max-ind-range=10000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=2048 --print-freq=1024 --print-time $dlrm_extra_option 2>&1 | tee run_terabyte_c2.log 31 | 32 | echo "done" 33 | -------------------------------------------------------------------------------- /pytorch/imagenet/imagenet/run_inference_cpu_accuracy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | ############################################################################### 4 | ### How to run? 
5 | ### 1) install PyTorch (internal build) 6 | ### 2) install torchvision: for benchmarking ResNext101_32x4d, follow these steps: 7 | ### 1) git clone -b v0.5.0 https://github.com/pytorch/vision.git 8 | ### 2) replace original resnet.py with this folder's resnet.py 9 | ### 3) python setup.py install 10 | ### 3) conda install jemalloc 11 | ### 4) export LD_PRELOAD="/YOUR_CONDA_PATH/envs/YOUR_CONDA_ENV/lib/libjemalloc.so 12 | ### /opt/intel/compilers_and_libraries/linux/lib/intel64/libiomp5.so" 13 | ### 5) bash run_inference_cpu_accuracy.sh resnet50/resnext101_32x4d bf16 14 | ### 15 | ############################################################################### 16 | 17 | export DNNL_PRIMITIVE_CACHE_CAPACITY=1024 18 | 19 | ARGS="" 20 | if [ "$1" == "resnet50" ]; then 21 | ARGS="$ARGS resnet50" 22 | echo "### running resnet50 model" 23 | else 24 | ARGS="$ARGS resnext101_32x4d" 25 | echo "### running resnext101_32x4d model" 26 | fi 27 | 28 | data_type=$2 29 | 30 | #echo "$data_type" 31 | 32 | if [ "$2" == "bf16" ]; then 33 | ARGS="$ARGS --bf16" 34 | echo "### running bf16 datatype" 35 | fi 36 | 37 | CORES=`lscpu | grep Core | awk '{print $4}'` 38 | SOCKETS=`lscpu | grep Socket | awk '{print $2}'` 39 | TOTAL_CORES=`expr $CORES \* $SOCKETS` 40 | 41 | KMP_SETTING="KMP_AFFINITY=granularity=fine,compact,1,0" 42 | 43 | BATCH_SIZE=256 44 | 45 | export OMP_NUM_THREADS=$TOTAL_CORES 46 | export $KMP_SETTING 47 | 48 | echo -e "### using OMP_NUM_THREADS=$TOTAL_CORES" 49 | echo -e "### using $KMP_SETTING\n\n" 50 | sleep 3 51 | 52 | if [ "$1" == "resnet50" ]; then 53 | python -u main.py -e -a $ARGS --mkldnn --pretrained -j $TOTAL_CORES $DATA_PATH -b $BATCH_SIZE 54 | else 55 | python -u main.py -e -a $ARGS --mkldnn --pretrained -j $TOTAL_CORES $DATA_PATH -b $BATCH_SIZE --checkpoint-dir checkpoints/resnext101_32x4d/checkpoint.pth.tar 56 | fi 57 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/test/dlrm_s_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | #WARNING: must have compiled PyTorch and caffe2 8 | 9 | #check if extra argument is passed to the test 10 | if [[ $# == 1 ]]; then 11 | dlrm_extra_option=$1 12 | else 13 | dlrm_extra_option="" 14 | fi 15 | #echo $dlrm_extra_option 16 | 17 | dlrm_py="python dlrm_s_pytorch.py" 18 | dlrm_c2="python dlrm_s_caffe2.py" 19 | 20 | echo "Running commands ..."
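# The four runs below push the same tiny configurations (growing mini-batch
# size, data size, and epoch count, all with --debug-mode) through both the
# PyTorch and the Caffe2 implementation; the outputs (ppp* vs. ccc*) are
# diffed pairwise at the end, and a correct run shows no numeric differences.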
21 | #run pytorch 22 | echo $dlrm_py 23 | $dlrm_py --mini-batch-size=1 --data-size=1 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp1 24 | $dlrm_py --mini-batch-size=2 --data-size=4 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp2 25 | $dlrm_py --mini-batch-size=2 --data-size=5 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp3 26 | $dlrm_py --mini-batch-size=2 --data-size=5 --nepochs=3 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ppp4 27 | 28 | #run caffe2 29 | echo $dlrm_c2 30 | $dlrm_c2 --mini-batch-size=1 --data-size=1 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ccc1 31 | $dlrm_c2 --mini-batch-size=2 --data-size=4 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ccc2 32 | $dlrm_c2 --mini-batch-size=2 --data-size=5 --nepochs=1 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ccc3 33 | $dlrm_c2 --mini-batch-size=2 --data-size=5 --nepochs=3 --arch-interaction-op=dot --learning-rate=0.1 --debug-mode $dlrm_extra_option > ccc4 34 | 35 | echo "Checking results ..." 36 | #check results 37 | #WARNING: a correct test will have no difference in numeric values 38 | #(but might have some verbal difference, e.g. due to warnings) 39 | #in the output file 40 | echo "diff test1 (no numeric values in the output = SUCCESS)" 41 | diff ccc1 ppp1 42 | echo "diff test2 (no numeric values in the output = SUCCESS)" 43 | diff ccc2 ppp2 44 | echo "diff test3 (no numeric values in the output = SUCCESS)" 45 | diff ccc3 ppp3 46 | echo "diff test4 (no numeric values in the output = SUCCESS)" 47 | diff ccc4 ppp4 48 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/cython/cython_criteo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | # Description: run dataset pre-processing in standalone mode 7 | # WARNING: These steps are required to work with Cython 8 | # 1. Install Cython 9 | # > sudo yum install Cython 10 | # 2. Please copy data_utils.py into data_utils_cython.pyx 11 | # 3. Compile the data_utils_cython.pyx to generate .so 12 | # (it's important to keep the extension .pyx rather than .py 13 | # so that the C/C++ .so, not the .py, is loaded at import time) 14 | # > python cython_compile.py build_ext --inplace 15 | # This should create data_utils_cython.so, which can be loaded below with "import" 16 | # 4. Run standalone dataset preprocessing to generate .npz files 17 | # a. Kaggle 18 | # > python cython_criteo.py --data-set=kaggle --raw-data-file=./input/train.txt 19 | # --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz 20 | # b.
Terabyte 21 | # > python cython_criteo.py --max-ind-range=10000000 [--memory-map] --data-set=terabyte 22 | # --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz 23 | 24 | from __future__ import absolute_import, division, print_function, unicode_literals 25 | 26 | import data_utils_cython as duc 27 | 28 | if __name__ == "__main__": 29 | ### import packages ### 30 | import argparse 31 | 32 | ### parse arguments ### 33 | parser = argparse.ArgumentParser( 34 | description="Preprocess Criteo dataset" 35 | ) 36 | # model related parameters 37 | parser.add_argument("--max-ind-range", type=int, default=-1) 38 | parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1] 39 | parser.add_argument("--data-randomize", type=str, default="total") # or day or none 40 | parser.add_argument("--memory-map", action="store_true", default=False) 41 | parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte 42 | parser.add_argument("--raw-data-file", type=str, default="") 43 | parser.add_argument("--processed-data-file", type=str, default="") 44 | args = parser.parse_args() 45 | 46 | duc.loadDataset( 47 | args.data_set, 48 | args.max_ind_range, 49 | args.data_sub_sample_rate, 50 | args.data_randomize, 51 | "train", 52 | args.raw_data_file, 53 | args.processed_data_file, 54 | args.memory_map 55 | ) 56 | -------------------------------------------------------------------------------- /mxnet/wide_deep_criteo/model.py: -------------------------------------------------------------------------------- 1 | """Wide and Deep Model Definition""" 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 
18 | 19 | import mxnet as mx 20 | 21 | def wide_deep_model(num_linear_features, num_embed_features, num_cont_features, 22 | input_dims, hidden_units): 23 | """wide and deep model definition and generation""" 24 | # wide model 25 | csr_data = mx.symbol.Variable("csr_data", stype='csr') 26 | label = mx.symbol.Variable("softmax_label") 27 | 28 | norm_init = mx.initializer.Normal(sigma=0.01) 29 | # weight with row_sparse storage type to enable sparse gradient updates 30 | weight = mx.symbol.Variable("linear_weight", shape=(num_linear_features, hidden_units[3]), 31 | init=norm_init, stype='row_sparse') 32 | bias = mx.symbol.Variable("linear_bias", shape=(hidden_units[3],)) 33 | dot = mx.symbol.sparse.dot(csr_data, weight) 34 | linear_out = mx.symbol.broadcast_add(dot, bias) 35 | # deep model 36 | dns_data = mx.symbol.Variable("dns_data") 37 | # embedding features 38 | x = mx.symbol.slice(data=dns_data, begin=(0, 0), 39 | end=(None, num_embed_features)) 40 | embeds = mx.symbol.split(data=x, num_outputs=num_embed_features, squeeze_axis=1) 41 | # continuous features 42 | x = mx.symbol.slice(data=dns_data, begin=(0, num_embed_features), 43 | end=(None, num_embed_features + num_cont_features)) 44 | features = [x] 45 | 46 | for i, embed in enumerate(embeds): 47 | embed_weight = mx.symbol.Variable('embed_%d_weight' % i, stype='row_sparse') 48 | features.append(mx.symbol.sparse.Embedding(data=embed, weight=embed_weight, 49 | input_dim=input_dims, output_dim=hidden_units[0], sparse_grad=True)) 50 | 51 | hidden = mx.symbol.concat(*features, dim=1) 52 | hidden = mx.symbol.FullyConnected(data=hidden, num_hidden=hidden_units[1]) 53 | hidden = mx.symbol.Activation(data=hidden, act_type='relu') 54 | hidden = mx.symbol.FullyConnected(data=hidden, num_hidden=hidden_units[2]) 55 | hidden = mx.symbol.Activation(data=hidden, act_type='relu') 56 | deep_out = mx.symbol.FullyConnected(data=hidden, num_hidden=hidden_units[3]) 57 | 58 | out = mx.symbol.SoftmaxOutput(linear_out + deep_out, label, name='model') 59 | return out 60 | -------------------------------------------------------------------------------- /pytorch/imagenet/imagenet/run_inference_cpu_multi_instance_latency.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | ###################################################################### 4 | ### How to run? 5 | ### 1) install PyTorch (internal build) 6 | ### 2) install torchvision: for benchmarking ResNext101_32x4d, follow these steps: 7 | ### 1) git clone -b v0.5.0 https://github.com/pytorch/vision.git 8 | ### 2) replace original resnet.py with this folder's resnet.py 9 | ### 3) python setup.py install 10 | ### 3) conda install jemalloc 11 | ### 4) export LD_PRELOAD="/YOUR_CONDA_PATH/envs/YOUR_CONDA_ENV/lib/libjemalloc.so 12 | ### /opt/intel/compilers_and_libraries/linux/lib/intel64/libiomp5.so" 13 | ### 5) Test CPU latency (14 instances, 4 cores/instance).
Just run: 14 | ### bash run_inference_cpu_multi_instance_latency.sh resnet50/resnext101_32x4d bf16 15 | ### 16 | ###################################################################### 17 | 18 | export DNNL_PRIMITIVE_CACHE_CAPACITY=1024 19 | 20 | ARGS="" 21 | if [[ "$1" == "resnet50" ]] 22 | then 23 | ARGS="$ARGS resnet50" 24 | echo "### running resnet50 model" 25 | else 26 | ARGS="$ARGS resnext101_32x4d" 27 | echo "### running resnext101_32x4d model" 28 | fi 29 | 30 | data_type=$2 31 | 32 | echo "$data_type" 33 | 34 | if [[ "$2" == "bf16" ]] 35 | then 36 | ARGS="$ARGS --bf16" 37 | echo "### running bf16 datatype" 38 | fi 39 | 40 | CORES=`lscpu | grep Core | awk '{print $4}'` 41 | SOCKETS=`lscpu | grep Socket | awk '{print $2}'` 42 | TOTAL_CORES=`expr $CORES \* $SOCKETS` 43 | 44 | # change this number to adjust number of instances 45 | CORES_PER_INSTANCE=4 46 | 47 | KMP_SETTING="KMP_AFFINITY=granularity=fine,compact,1,0" 48 | 49 | BATCH_SIZE=1 50 | 51 | export OMP_NUM_THREADS=$CORES_PER_INSTANCE 52 | export $KMP_SETTING 53 | 54 | echo -e "### using OMP_NUM_THREADS=$CORES_PER_INSTANCE" 55 | echo -e "### using $KMP_SETTING\n\n" 56 | sleep 3 57 | 58 | INSTANCES=`expr $TOTAL_CORES / $CORES_PER_INSTANCE` 59 | LAST_INSTANCE=`expr $INSTANCES - 1` 60 | INSTANCES_PER_SOCKET=`expr $INSTANCES / $SOCKETS` 61 | for i in $(seq 1 $LAST_INSTANCE); do 62 | numa_node_i=`expr $i / $INSTANCES_PER_SOCKET` 63 | start_core_i=`expr $i \* $CORES_PER_INSTANCE` 64 | end_core_i=`expr $start_core_i + $CORES_PER_INSTANCE - 1` 65 | LOG_i=inference_cpu_bs${BATCH_SIZE}_ins${i}.txt 66 | 67 | echo "### running on instance $i, numa node $numa_node_i, core list {$start_core_i, $end_core_i}..." 68 | numactl --physcpubind=$start_core_i-$end_core_i --membind=$numa_node_i python -u main.py -e -a $ARGS \ 69 | --mkldnn --dummy -j $CORES_PER_INSTANCE $DATA_PATH -b $BATCH_SIZE 2>&1 | tee $LOG_i & 70 | done 71 | 72 | 73 | numa_node_0=0 74 | start_core_0=0 75 | end_core_0=`expr $CORES_PER_INSTANCE - 1` 76 | LOG_0=inference_cpu_bs${BATCH_SIZE}_ins0.txt 77 | 78 | echo "### running on instance 0, numa node $numa_node_0, core list {$start_core_0, $end_core_0}...\n\n" 79 | numactl --physcpubind=$start_core_0-$end_core_0 --membind=$numa_node_0 python -u main.py -e -a $ARGS \ 80 | --mkldnn --dummy -j $CORES_PER_INSTANCE $DATA_PATH -b $BATCH_SIZE 2>&1 | tee $LOG_0 81 | 82 | sleep 10 83 | echo -e "\n\n Sum sentences/s together:" 84 | for i in $(seq 0 $LAST_INSTANCE); do 85 | log=inference_cpu_bs${BATCH_SIZE}_ins${i}.txt 86 | tail -n 2 $log 87 | done 88 | -------------------------------------------------------------------------------- /pytorch/imagenet/imagenet/run_inference_cpu_multi_instance.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | ############################################################################### 4 | ### How to run? 5 | ### 1) install PyTorch (internal build) 6 | ### 2) install torchvision: for benchmarking ResNext101_32x4d, follow these steps: 7 | ### 1) git clone -b v0.5.0 https://github.com/pytorch/vision.git 8 | ### 2) replace original resnet.py with this folder's resnet.py 9 | ### 3) python setup.py install 10 | ### 3) conda install jemalloc 11 | ### 4) export LD_PRELOAD="/YOUR_CONDA_PATH/envs/YOUR_CONDA_ENV/lib/libjemalloc.so 12 | ### /opt/intel/compilers_and_libraries/linux/lib/intel64/libiomp5.so" 13 | ### 5) Test CPU throughput (2 instances, 28 cores/instance).
Just run: 14 | ### bash run_inference_cpu_multi_instance.sh resnet50/resnext101_32x4d bf16 15 | ### 16 | ############################################################################### 17 | 18 | export DNNL_PRIMITIVE_CACHE_CAPACITY=1024 19 | 20 | ARGS="" 21 | if [[ "$1" == "resnet50" ]] 22 | then 23 | ARGS="$ARGS resnet50" 24 | echo "### running resnet50 model" 25 | else 26 | ARGS="$ARGS resnext101_32x4d" 27 | echo "### running resnext101_32x4d model" 28 | fi 29 | 30 | data_type=$2 31 | 32 | echo "$data_type" 33 | 34 | if [[ "$2" == "bf16" ]] 35 | then 36 | ARGS="$ARGS --bf16" 37 | echo "### running bf16 datatype" 38 | fi 39 | 40 | if [[ "$3" == "disable-mkldnn" ]] 41 | then 42 | unset LD_PRELOAD 43 | echo "### running non mkldnn model" 44 | else 45 | ARGS="$ARGS --mkldnn" 46 | echo "### running mkldnn backend" 47 | fi 48 | 49 | CORES=`lscpu | grep Core | awk '{print $4}'` 50 | SOCKETS=`lscpu | grep Socket | awk '{print $2}'` 51 | TOTAL_CORES=`expr $CORES \* $SOCKETS` 52 | 53 | # change this number to adjust number of instances 54 | CORES_PER_INSTANCE=$CORES 55 | 56 | KMP_SETTING="KMP_AFFINITY=granularity=fine,compact,1,0" 57 | 58 | BATCH_SIZE=128 59 | 60 | export OMP_NUM_THREADS=$CORES_PER_INSTANCE 61 | export $KMP_SETTING 62 | 63 | echo -e "### using OMP_NUM_THREADS=$CORES_PER_INSTANCE" 64 | echo -e "### using $KMP_SETTING\n\n" 65 | sleep 3 66 | 67 | INSTANCES=`expr $TOTAL_CORES / $CORES_PER_INSTANCE` 68 | LAST_INSTANCE=`expr $INSTANCES - 1` 69 | INSTANCES_PER_SOCKET=`expr $INSTANCES / $SOCKETS` 70 | for i in $(seq 1 $LAST_INSTANCE); do 71 | numa_node_i=`expr $i / $INSTANCES_PER_SOCKET` 72 | start_core_i=`expr $i \* $CORES_PER_INSTANCE` 73 | end_core_i=`expr $start_core_i + $CORES_PER_INSTANCE - 1` 74 | LOG_i=inference_cpu_bs${BATCH_SIZE}_ins${i}.txt 75 | 76 | echo "### running on instance $i, numa node $numa_node_i, core list {$start_core_i, $end_core_i}..."
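# Each background instance below is pinned to its own contiguous range of
# physical cores and memory-bound to the NUMA node computed above; stdout is
# teed to a per-instance log so the per-instance throughput can be summed at
# the end of the script.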
77 | numactl --physcpubind=$start_core_i-$end_core_i --membind=$numa_node_i python -u main.py -e -a $ARGS \ 78 | --dummy -j $CORES_PER_INSTANCE $DATA_PATH -b $BATCH_SIZE 2>&1 | tee $LOG_i & 79 | done 80 | 81 | numa_node_0=0 82 | start_core_0=0 83 | end_core_0=`expr $CORES_PER_INSTANCE - 1` 84 | LOG_0=inference_cpu_bs${BATCH_SIZE}_ins0.txt 85 | 86 | echo "### running on instance 0, numa node $numa_node_0, core list {$start_core_0, $end_core_0}...\n\n" 87 | numactl --physcpubind=$start_core_0-$end_core_0 --membind=$numa_node_0 python -u main.py -e -a $ARGS \ 88 | --dummy -j $CORES_PER_INSTANCE $DATA_PATH -b $BATCH_SIZE 2>&1 | tee $LOG_0 89 | 90 | sleep 10 91 | echo -e "\n\n Sum sentences/s together:" 92 | for i in $(seq 0 $LAST_INSTANCE); do 93 | log=inference_cpu_bs${BATCH_SIZE}_ins${i}.txt 94 | tail -n 2 $log 95 | done 96 | -------------------------------------------------------------------------------- /mxnet/blog/mxnet_v1.5_release/rnn_benchmark.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import time 3 | import logging 4 | import argparse 5 | 6 | rnncell_type = ['rnn', 'lstm', 'gru', 'sru'] 7 | 8 | 9 | parser = argparse.ArgumentParser(description='MxNet RNN benchmark') 10 | parser.add_argument('--gpu', '-p', action='store_true', default=False, help="use the GPU; default is CPU") 11 | parser.add_argument('--cell_type', '-cell', type=str, default='lstm', 12 | help="cell type, can be \"LSTM, GRU, RNN, SRU\", default is LSTM.") 13 | parser.add_argument('--layer_num', '-l', type=int, default=1, help="layer num, default is 1.") 14 | 15 | 16 | warm_up = 20 17 | iter_num = 200 18 | 19 | 20 | def fused_module(input_shape, cell_type, layer_nums=1, ctx=mx.cpu(), layout="TNC"): 21 | 22 | assert cell_type in rnncell_type 23 | 24 | bs = input_shape[0] 25 | seq_len = input_shape[1] 26 | embed_dim = input_shape[2] 27 | hidden_size = input_shape[3] 28 | if layout == 'NTC': 29 | dshape = (bs, seq_len, embed_dim) 30 | elif layout == 'TNC': 31 | logging.warning('layout TNC is used!') 32 | dshape = (seq_len, bs, embed_dim) 33 | data = mx.sym.Variable('data') 34 | label = mx.sym.Variable('softmax_label') 35 | 36 | if cell_type == 'lstm': 37 | lstm_cell = mx.rnn.FusedRNNCell( 38 | hidden_size, num_layers=layer_nums, mode='lstm', get_next_state=False, prefix='l0_') 39 | rnn_sym, _ = lstm_cell.unroll( 40 | seq_len, data, layout=layout, merge_outputs=True) 41 | elif cell_type == 'gru': 42 | gru_cell = mx.rnn.FusedRNNCell(hidden_size, num_layers=layer_nums, mode='gru', prefix='l0_') 43 | rnn_sym, _ = gru_cell.unroll( 44 | seq_len, data, layout=layout, merge_outputs=True) 45 | else: raise NotImplementedError('only lstm and gru cells are implemented in this benchmark') 46 | mod = mx.mod.Module(rnn_sym, label_names=None, context=ctx) 47 | mod.bind(data_shapes=[('data', dshape)], label_shapes=None) 48 | 49 | mod.init_params(initializer=mx.init.Xavier(magnitude=2.)) 50 | mod.init_optimizer(optimizer='sgd') 51 | return mod 52 | 53 | 54 | def rnncell_score_fused(mod): 55 | batch = mx.io.DataBatch(data=[mx.random.uniform(shape=mod.data_shapes[0][1])], label=[]) 56 | tic = time.time() 57 | 58 | mod.forward(batch, is_train=False) 59 | output = mod.get_outputs()[0] 60 | output.wait_to_read() 61 | 62 | fwd = time.time() - tic 63 | return fwd 64 | 65 | 66 | if __name__ == '__main__': 67 | 68 | ''' 69 | cell: unidirection-lstm 70 | hidden_size: 512/1024 71 | BS: 1/32 72 | sentence length/time step: 50/ 73 | layers: 1/4 74 | 75 | ''' 76 | # [bs, sequence length, embedding size, hidden size] 77 | input_shape_list = [[1, 50, 512, 512], [1, 50, 1024, 1024],
[32, 50, 512, 512], [32, 50, 1024, 1024]] 78 | 79 | logging.basicConfig(level=logging.INFO) 80 | args = parser.parse_args() 81 | if args.gpu: 82 | ctx = mx.gpu(0) 83 | else: 84 | ctx = mx.cpu() 85 | 86 | cell = args.cell_type 87 | layer_nums = args.layer_num 88 | 89 | logging.warning('Fused RNN API Inference benchmarking started') 90 | 91 | 92 | for input_shape in input_shape_list: 93 | total_fwd = 0 94 | mod = fused_module(input_shape, cell, layer_nums, ctx) 95 | # mod.save_checkpoint('gnmt', 0) 96 | for i in range(warm_up + iter_num): 97 | fwd = rnncell_score_fused(mod) 98 | if i >= warm_up: 99 | total_fwd += fwd 100 | 101 | total_fwd = total_fwd / iter_num 102 | logging.info(str(input_shape) + ' time cost ' + str(total_fwd) + 's samples/sec = ' + str(input_shape[0]/total_fwd)) 103 | 104 | -------------------------------------------------------------------------------- /pytorch/RESNET50V1.md: -------------------------------------------------------------------------------- 1 | # Guide to Run the PyTorch/Caffe2 ResNet50 v1 Model 2 | 3 | - please use v1.0.5 4 | 5 | ## Download the Caffe ResNet50 v1 model 6 | 7 | ``` 8 | Download the Resnet-50-deploy.prototxt and Resnet-50-model.caffemodel from https://onedrive.live.com/?authkey=%21AAFW2-FVoxeVRck&id=4006CBB8476FF777%2117887&cid=4006CBB8476FF777 9 | The model is provided by https://github.com/KaimingHe/deep-residual-networks 10 | ``` 11 | 12 | 13 | ## Get the PyTorch source from GitHub, prepare mkl2019, and build 14 | 15 | ``` 16 | git clone https://github.com/jgong5/pytorch -b int8_with_more_opts 17 | cd pytorch && git submodule update --init --recursive 18 | ``` 19 | 20 | ``` 21 | Download mkl from https://anaconda.org/anaconda/mkl/files?version=2019.3 and extract it to the mkl2019 folder 22 | Download mkl-include from https://anaconda.org/anaconda/mkl-include/files and extract it to the mkl2019 folder 23 | Copy the system MKL folder mkl/lib/intel64 to mkl2019/lib/ 24 | Copy the system MKL file mkl/lib/libiomp5.so to mkl2019/lib/ 25 | ``` 26 | 27 | ``` 28 | export USE_MKLDNN=ON MKLDNN_USE_CBLAS=ON 29 | export MKLROOT=location/to/mkl2019 30 | python setup.py build 31 | ``` 32 | 33 | ## Convert the Caffe model to a PyTorch/Caffe2 model 34 | 35 | 36 | ``` 37 | export PYTHONPATH=src/to/caffe2/build 38 | cd pytorch/benchmark_tools 39 | python inference/caffe_translator.py Resnet-50-deploy.prototxt Resnet-50-model.caffemodel 40 | 41 | ``` 42 | You will get init_net.pb and predict_net.pb under the folder where you run the command. 43 | 44 | ## Copy the weight file and model file to the tools folder 45 | 46 | ``` 47 | cp init_net.pb inference/models/resnet50_v1/ 48 | 49 | cp predict_net.pb inference/models/resnet50_v1/ 50 | ``` 51 | 52 | ## Prepare the dataset 53 | 54 | ``` 55 | Please download the ImageNet dataset and validation file from the official site 56 | http://image-net.org/download.php 57 | 58 | Note: 59 | - ImageNet does not own the copyright of the images. For researchers and educators who wish to use the images for non-commercial research and/or educational purposes, ImageNet can provide access through their site under certain conditions and terms.
60 | 61 | ``` 62 | 63 | ## Prepare the calibration dataset 64 | 65 | ``` 66 | Copy ILSVRC2012_val_00033000.JPEG through ILSVRC2012_val_00033999.JPEG (1000 images in total) from the downloaded ImageNet dataset folder to the calibration folder 67 | find /path/to/your/dataset -type f | grep -E 'ILSVRC2012_val_00033[0-9]*' | xargs -i cp {} /path/to/your/calibration_dataset 68 | ``` 69 | 70 | ## Run calibration 71 | 72 | ``` 73 | export PYTHONPATH=/the/path/to/your/pytorch/src 74 | export LD_PRELOAD=the/location/of/libiomp5.so #libiomp5.so can be found under your mkl folder 75 | export OMP_NUM_THREADS=28 KMP_AFFINITY=proclist=[0-27],granularity=thread,explicit #28 is an example; set it to the number of cores in one socket of your CPU 76 | ./run_caffe2.py -m $modelname -p calibration_folder -v validation_file -b "batchsize" -r calibration -o . --onnx 77 | 78 | Two files will be generated under the folder; copy them to inference/models/resnet50_v1 79 | cp init_net_int8.pb inference/models/resnet50_v1/init_net_int8.pb 80 | cp predict_net_int8.pb inference/models/resnet50_v1/predict_net_int8.pb 81 | 82 | ``` 83 | 84 | 85 | ## Run the int8 model 86 | 87 | ``` 88 | cd pytorch/benchmark_tools 89 | ./run_numctl.sh 90 | ``` 91 | 92 | 93 | 94 | 95 | ## Parse the result; the output of both the fp32 and int8 models looks like below 96 | 97 | ``` 98 | Images per second: 345.5456113865 99 | Total computing time: 144.6986978054 seconds 100 | Total image processing time: 491.1261794567 seconds 101 | Total model loading time: 4.4210910797 seconds 102 | Total images: 50000 103 | 104 | ``` 105 | Just use 'Images per second' as the throughput 106 | 107 | -------------------------------------------------------------------------------- /mxnet/blog/medium_vnni/ec2_benchmark_base.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "MXNet Model Quantization Performance" 3 | echo "Testing FP32 base models" 4 | echo "Installing mxnet-mkl 1.5.0b20190623" 5 | pip install --pre mxnet-mkl==1.5.0b20190623 6 | echo "Downloading source code from incubator-mxnet repo" 7 | git clone https://github.com/apache/incubator-mxnet 8 | cd incubator-mxnet 9 | git checkout f44f6cfbe752fd8b8036307cecf6a30a30ad8557 10 | 11 | export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 12 | export vCPUs=`cat /proc/cpuinfo | grep processor | wc -l` 13 | export OMP_NUM_THREADS=$((vCPUs / 2)) 14 | echo "Test with OMP_NUM_THREADS="$OMP_NUM_THREADS 15 | 16 | # Reduce remote memory access 17 | export NNVM_EXEC_MATCH_RANGE=1 18 | unset MXNET_SUBGRAPH_BACKEND 19 | 20 | echo "=========test image classification models==========" 21 | cd ./example/quantization 22 | echo "=============resnet50_v1===============" 23 | echo "1. calibrating resnet50_v1 with calib-mode=naive, use 5 batches to do calibration" 24 | python imagenet_gen_qsym_mkldnn.py --model=resnet50_v1 --num-calib-batches=5 --calib-mode=naive 25 | echo "2. testing throughput of fp32 resnet50_v1" 26 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True 27 | echo "3. testing latency of fp32 resnet50_v1" 28 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=1 --num-inference-batches=500 --ctx=cpu --benchmark=True 29 | echo "=============resnet101_v1===============" 30 | echo "1.
calibrating resnet101_v1 with calib-mode=naive, use 5 batches to do calibration" 31 | python imagenet_gen_qsym_mkldnn.py --model=resnet101_v1 --num-calib-batches=5 --calib-mode=naive 32 | echo "2. testing throughput of fp32 resnet101_v1" 33 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True 34 | echo "3. testing latency of fp32 resnet101_v1" 35 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=1 --num-inference-batches=500 --ctx=cpu --benchmark=True 36 | echo "=============mobilenet1.0===============" 37 | echo "1. calibrating mobilenet1.0 with calib-mode=naive, use 5 batches to do calibration" 38 | python imagenet_gen_qsym_mkldnn.py --model=mobilenet1.0 --num-calib-batches=5 --calib-mode=naive 39 | echo "2. testing throughput of fp32 mobilenet1.0" 40 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True 41 | echo "3. testing latency of fp32 mobilenet1.0" 42 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=1 --num-inference-batches=500 --ctx=cpu --benchmark=True 43 | echo "=============inceptionv3===============" 44 | echo "1. calibrating inceptionv3 with calib-mode=naive, use 5 batches to do calibration" 45 | python imagenet_gen_qsym_mkldnn.py --model=inceptionv3 --image-shape=3,299,299 --num-calib-batches=5 --calib-mode=naive 46 | echo "2. testing throughput of fp32 inceptionv3" 47 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True 48 | echo "3. testing latency of fp32 inceptionv3" 49 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=1 --num-inference-batches=500 --ctx=cpu --benchmark=True 50 | 51 | echo "=========test image detection models==========" 52 | echo "==============SSD VGG16================" 53 | echo "1. downloading model" 54 | cd ../ssd 55 | cd model/ && wget http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_vgg16_reduced_300-dd479559.zip 56 | unzip ssd_vgg16_reduced_300-dd479559.zip && mv ssd_vgg16_reduced_300-dd479559.params ssd_vgg16_reduced_300-0000.params && mv ssd_vgg16_reduced_300-symbol-dd479559.json ssd_vgg16_reduced_300-symbol.json 57 | cd ../data && wget http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/ssd-val-fc19a535.zip 58 | unzip ssd-val-fc19a535.zip && mv ssd-val-fc19a535.idx val.idx && mv ssd-val-fc19a535.lst val.lst && mv ssd-val-fc19a535.rec val.rec 59 | cd .. 60 | echo "2. testing throughput of fp32 SSD VGG16" 61 | python benchmark_score.py --batch_size=224 --deploy --prefix=./model/ssd_ 62 | echo "3. testing latency of fp32 SSD VGG16" 63 | python benchmark_score.py --batch_size=1 --deploy --prefix=./model/ssd_ 64 | -------------------------------------------------------------------------------- /mxnet/wide_deep_criteo/data.py: -------------------------------------------------------------------------------- 1 | """Processing data for criteo kaggle dataset""" 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. 
The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | 19 | from csv import DictReader 20 | import os 21 | import numpy as np 22 | import mxnet as mx 23 | 24 | 25 | def get_uci_criteo(data_dir, data_name): 26 | """Get preprocessed data to feed into model""" 27 | data_file = os.path.join(data_dir, data_name) 28 | if (not os.path.exists(data_file)): 29 | print("Dataset " + data_file + " not present") 30 | csr, dns, label = preprocess_uci_criteo(data_name) 31 | return csr, dns, label 32 | 33 | 34 | 35 | # Label - Target variable that indicates if an ad was clicked (1) or not (0). 36 | # I1-I13 - A total of 13 columns of integer features (mostly count features). 37 | # C1-C26 - A total of 26 columns of categorical features. The values of 38 | # these features have been hashed onto 32 bits for anonymization purposes. 39 | CONTINUOUS_COLUMNS = ["I"+str(i) for i in range(1, 14)] # 1-13 inclusive 40 | CATEGORICAL_COLUMNS = ["C"+str(i) for i in range(1, 27)] # 1-26 inclusive 41 | LABEL_COLUMN = ["clicked"] 42 | 43 | TRAIN_DATA_COLUMNS = LABEL_COLUMN + CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS 44 | FEATURE_COLUMNS = CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS 45 | max_dict = {'I1': 1539, 'I2': 22066, 'I3': 65535, 'I4': 561, 'I5': 2655388, 'I6': 233523, 46 | 'I7': 26297, 'I8': 5106, 'I9': 24376, 'I10': 9, 'I11': 181, 'I12': 1807, 'I13': 6879} 47 | min_dict = {'I1': 0, 'I2': -3, 'I3': 0, 'I4': 0, 'I5': 0, 'I6': 0, 'I7': 0, 'I8': 0, 48 | 'I9': 0, 'I10': 0, 'I11': 0, 'I12': 0, 'I13': 0} 49 | 50 | 51 | def preprocess_uci_criteo(data_name): 52 | """Data preprocessing for criteo kaggle dataset""" 53 | hash_bucket_size = 1000 54 | #cont_defaults = [[0] for i in range(1, 14)] 55 | #cate_defaults = [[" "] for i in range(1, 27)] 56 | #label_defaults = [[0]] 57 | #column_headers = TRAIN_DATA_COLUMNS 58 | #record_defaults = label_defaults + cont_defaults + cate_defaults 59 | 60 | label_list = [] 61 | csr_list = [] 62 | dns_list = [] 63 | 64 | #csr_ncols = len(CATEGORICAL_COLUMNS) * hash_bucket_size 65 | dns_ncols = len(CONTINUOUS_COLUMNS) + len(CATEGORICAL_COLUMNS) 66 | with open(data_name) as f: 67 | for row in DictReader(f, fieldnames=TRAIN_DATA_COLUMNS): 68 | label_list.append(row['clicked']) 69 | # Sparse base columns. 70 | for name in CATEGORICAL_COLUMNS: 71 | csr_list.append((hash(row[name]) % hash_bucket_size, 1.0)) 72 | 73 | 74 | dns_row = [0] * dns_ncols 75 | dns_dim = 0 76 | # Embed wide columns into deep columns 77 | for col in CATEGORICAL_COLUMNS: 78 | dns_row[dns_dim] = hash(row[col].strip()) % hash_bucket_size 79 | dns_dim += 1 80 | # Continuous base columns. 81 | scale = 1 #align with Google WnD paper 82 | for col in CONTINUOUS_COLUMNS: 83 | #dns_row[dns_dim] = float(row[col].strip()) 84 | orig_range = float(max_dict[col] - min_dict[col]) 85 | dns_row[dns_dim] = (float(row[col].strip()) - min_dict[col]) * scale / orig_range 86 | dns_dim += 1 87 | # No transformations. 
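            # At this point dns_row holds the 26 hashed categorical ids followed
            # by the 13 min-max-scaled continuous values, and csr_list gained one
            # (hashed_index, 1.0) pair per categorical column, so the CSR indptr
            # built below advances in fixed steps of len(CATEGORICAL_COLUMNS).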
88 | 89 | dns_list.append(dns_row) 90 | data_list = [item[1] for item in csr_list] 91 | indices_list = [item[0] for item in csr_list] 92 | indptr_list = range(0, len(indices_list) + 1, len(CATEGORICAL_COLUMNS)) 93 | csr = mx.nd.sparse.csr_matrix((data_list, indices_list, indptr_list), 94 | shape=(len(label_list), hash_bucket_size * len(CATEGORICAL_COLUMNS))) 95 | dns = np.array(dns_list) 96 | label = np.array(label_list) 97 | return csr, dns, label 98 | -------------------------------------------------------------------------------- /pytorch/benchmark_tools/inference/models/__init__.py: -------------------------------------------------------------------------------- 1 | """parse model def""" 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import logging 9 | 10 | class Models: 11 | """def model class""" 12 | model_def = {} 13 | model_def_file = "__model_def" 14 | 15 | def __init__(self): 16 | if len(self.model_def) == 0: 17 | self._ParseAllModels() 18 | 19 | def _ParseAllModels(self): 20 | path = os.path.dirname(__file__) 21 | dirs = os.listdir(path) 22 | for d in dirs: 23 | dir_path = os.path.join(path, d) 24 | full_path = os.path.join(dir_path, self.model_def_file) 25 | if os.path.isfile(full_path): 26 | with open(full_path, 'r') as mf: 27 | lines = [line.rstrip('\n') for line in mf.readlines()] 28 | self._LoadModelDef(lines, dir_path) 29 | 30 | def _LoadModelDef(self, defs, path): 31 | model_name = defs[defs.index("[Model Name]") + 1].lower() 32 | # model type in lowercase, e.g. "caffe legacy", "normal", "prototext" 33 | model_type = defs[defs.index("[Model Type]") + 1].lower() 34 | # output type in lowercase, e.g. "possibility", "segmentation", "post image" 35 | output_type = defs[defs.index("[Output Type]") + 1].lower() 36 | init_net = os.path.join(path, defs[defs.index("[Init Net]") + 1]) 37 | predict_net = os.path.join(path, defs[defs.index("[Predict Net]") + 1]) 38 | init_net_int8 = None 39 | if "[Init Net Int8]" in defs: 40 | init_net_int8 = os.path.join(path, defs[defs.index("[Init Net Int8]") + 1]) 41 | predict_net_int8 = None 42 | if "[Predict Net Int8]" in defs: 43 | predict_net_int8 = os.path.join(path, defs[defs.index("[Predict Net Int8]") + 1]) 44 | onnx_model = None 45 | if "[Onnx Model]" in defs: 46 | onnx_model = os.path.join(path, defs[defs.index("[Onnx Model]") + 1]) 47 | crop_size = defs[defs.index("[Crop Size]") + 1] 48 | image_mean = defs[defs.index("[Image Mean]") + 1] 49 | scale = 1 50 | if "[Scale]" in defs: 51 | scale = defs[defs.index("[Scale]") +1] 52 | rescale_size = 256 53 | if "[ReScale Size]" in defs: 54 | rescale_size = defs[defs.index("[ReScale Size]") +1] 55 | if len(image_mean) > 0: 56 | image_mean = os.path.join(path, image_mean) 57 | else: 58 | image_mean = None 59 | allow_device_override = True 60 | need_normalize = False 61 | if "[Allow Device Override]" in defs: 62 | allow_device_override = defs[defs.index("[Allow Device Override]") +1].lower() in ('yes', 'true', 't', '1') 63 | if "[Need Normalize]" in defs: 64 | need_normalize = defs[defs.index("[Need Normalize]") +1].lower() in ('yes', 'true', 't', '1') 65 | color_format = None 66 | if "[Color Format]" in defs: 67 | color_format = defs[defs.index("[Color Format]") + 1] 68 | 69 | if model_name in self.model_def: 70 | logging.warning("Already has model: {}. Ignored!" 
.format(model_name)) 72 | else: 73 | self.model_def[model_name] = { 74 | "model_name" : model_name, 75 | "model_type" : model_type, 76 | "output_type" : output_type, 77 | "init_net" : init_net, 78 | "predict_net" : predict_net, 79 | "init_net_int8" : init_net_int8, 80 | "predict_net_int8" : predict_net_int8, 81 | "onnx_model" : onnx_model, 82 | "crop_size" : crop_size, 83 | "image_mean" : image_mean, 84 | "scale" : scale, 85 | "rescale_size" : rescale_size, 86 | "allow_device_override": allow_device_override, 87 | "need_normalize" : need_normalize, 88 | "color_format" : color_format, 89 | } 90 | 91 | 92 | def ShowModels(): 93 | models = Models() 94 | logging.critical("All supported models for inference:\n{}" 95 | .format([str(s) for s in models.model_def])) 96 | 97 | def IsSupported(model): 98 | models = Models() 99 | return (model.lower() in models.model_def) 100 | 101 | def GetModelInfo(model): 102 | models = Models() 103 | return models.model_def[model.lower()] 104 | -------------------------------------------------------------------------------- /pytorch/distributed/README.md: -------------------------------------------------------------------------------- 1 | # Distributed Training with OneCCL in PyTorch 2 | 3 | ## Install anaconda 3.0 and Dependencies 4 | ```bash 5 | wget https://repo.continuum.io/archive/Anaconda3-5.0.0-Linux-x86_64.sh -O anaconda3.sh 6 | chmod +x anaconda3.sh 7 | ./anaconda3.sh -b -p ~/anaconda3 8 | ./anaconda3/bin/conda create -n pytorch-ccl python=3.7 9 | export PATH=~/anaconda3/bin:$PATH 10 | source ./anaconda3/bin/activate pytorch-ccl 11 | conda config --append channels intel 12 | conda install ninja pyyaml setuptools cmake cffi typing 13 | conda install intel-openmp mkl mkl-include numpy -c intel --no-update-deps 14 | ``` 15 | ## Install PyTorch 16 | ```bash 17 | git clone https://github.com/pytorch/pytorch.git && cd pytorch 18 | git submodule sync && git submodule update --init --recursive 19 | python setup.py install 20 | ``` 21 | ## Install oneCCL 22 | ```bash 23 | git clone https://github.com/oneapi-src/oneCCL.git 24 | cd {path-to-oneCCL} 25 | mkdir build && cd build 26 | cmake .. -DCMAKE_INSTALL_PREFIX=~/.local 27 | make -j install 28 | ``` 29 | ## Install torch-ccl 30 | ```bash 31 | git clone https://github.com/intel/torch-ccl.git && cd torch-ccl 32 | source ~/.local/env/setvars.sh 33 | python setup.py install 34 | ``` 35 | ## Demo for using OneCCL in PyTorch 36 | ```python 37 | import os 38 | import torch 39 | import torch.nn as nn 40 | from torch.nn.parallel import DistributedDataParallel as DDP 41 | import torch.distributed as dist 42 | import torch_ccl 43 | 44 | class Model(nn.Module): 45 | def __init__(self): 46 | super(Model, self).__init__() 47 | self.linear = nn.Linear(4, 5) 48 | 49 | def forward(self, input): 50 | return self.linear(input) 51 | 52 | 53 | if __name__ == "__main__": 54 | 55 | os.environ['RANK'] = os.environ.get('PMI_RANK', '-1') # environment values must be strings 56 | os.environ['WORLD_SIZE'] = os.environ.get('PMI_SIZE', '-1') 57 | 58 | # Initialize the process group with ccl backend 59 | dist.init_process_group(backend='ccl') 60 | 61 | model = Model() 62 | if dist.get_world_size() > 1: 63 | model = DDP(model) 64 | 65 | for i in range(3): 66 | input = torch.randn(2, 4) 67 | labels = torch.randn(2, 5) 68 | loss_fn = nn.MSELoss() 69 | optimizer = torch.optim.SGD(model.parameters(), lr=0.001) 70 | optimizer.zero_grad() # clear gradients accumulated in the previous iteration 71 | # forward 72 | res = model(input) 73 | L = loss_fn(res, labels) 74 | 75 | # backward 76 | L.backward() 77 | 78 | # update 79 | optimizer.step() 80 | ```
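The demo reads `PMI_RANK` / `PMI_SIZE`, which MPI launchers such as `mpiexec.hydra` set automatically for each rank. A minimal two-process launch sketch, assuming the demo above is saved as `demo.py` (a hypothetical file name):
```bash
# PMI_RANK / PMI_SIZE are injected by the launcher for each rank.
source ~/.local/env/setvars.sh
mpiexec.hydra -np 2 -ppn 2 python -u demo.py
```
See the next section for production-grade pinning of CCL worker threads and compute threads.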
81 | ## Run Scripts & CPU Affinity 82 | 1. Distributed Training on Single Node 83 | ```bash 84 | source ~/.local/env/setvars.sh 85 | export LD_PRELOAD="${CONDA_PREFIX}/lib/libiomp5.so" 86 | export MASTER_ADDR="127.0.0.1" 87 | export MASTER_PORT="29500" 88 | 89 | # Example: 90 | # Run 2 processes on 2 sockets. (28 cores/socket, 4 cores for CCL, 24 cores for computation) 91 | # 92 | # CCL_WORKER_COUNT is the number of CCL worker threads per process. 93 | # CCL_WORKER_COUNT, CCL_WORKER_AFFINITY and I_MPI_PIN_DOMAIN should be consistent. 94 | 95 | export CCL_WORKER_COUNT=4 96 | export CCL_WORKER_AFFINITY="0,1,2,3,28,29,30,31" 97 | 98 | mpiexec.hydra -np 2 -ppn 2 -l -genv I_MPI_PIN_DOMAIN=[0x0000000FFFFFF0,0xFFFFFF00000000] \ 99 | -genv KMP_BLOCKTIME=1 -genv KMP_AFFINITY=granularity=fine,compact,1,0 \ 100 | -genv OMP_NUM_THREADS=24 python -u ut_memory.py 101 | ``` 102 | 2. Distributed Training on Multiple Nodes 103 | ```bash 104 | source ~/.local/env/setvars.sh 105 | export LD_PRELOAD="${CONDA_PREFIX}/lib/libiomp5.so" 106 | export MASTER_ADDR="10.xxx.xxx.xxx" # IP address of the node from which the MPI command is launched 107 | export MASTER_PORT="29500" 108 | 109 | # Example: 110 | # Run 4 processes on 2 Nodes, 2 sockets/Node (28 cores/socket, 4 cores for CCL, 24 cores for computation) 111 | # 112 | # CCL_WORKER_COUNT is the number of CCL worker threads per process. 113 | # CCL_WORKER_COUNT, CCL_WORKER_AFFINITY and I_MPI_PIN_DOMAIN should be consistent. 114 | # 115 | # `hostfile`: list every node's IP address in this file 116 | 117 | export CCL_WORKER_COUNT=4 118 | export CCL_WORKER_AFFINITY="0,1,2,3,28,29,30,31" 119 | 120 | mpiexec.hydra -f hostfile -np 4 -ppn 2 -l -genv I_MPI_PIN_DOMAIN=[0x0000000FFFFFF0,0xFFFFFF00000000] \ 121 | -genv KMP_BLOCKTIME=1 -genv KMP_AFFINITY=granularity=fine,compact,1,0 \ 122 | -genv OMP_NUM_THREADS=24 python -u ut_memory.py 123 | ``` 124 | -------------------------------------------------------------------------------- /third-party-programs.txt: -------------------------------------------------------------------------------- 1 | optimized-models Third Party Programs File 2 | 3 | This file contains the list of third party software (“third party programs”) contained 4 | in the Intel software and their required notices and/or license terms. This third party 5 | software, even if included with the distribution of the Intel software, may be governed 6 | by separate license terms, including without limitation, third party license terms, other 7 | Intel software license terms, and open source software license terms. These separate license 8 | terms govern your use of the third party programs as set forth in the “third-party-programs.txt” or other similarly named text file. 9 | 10 | Third party programs and their corresponding required notices and/or license terms are listed below. 11 | 12 | ------------------------------------------------------------- 13 | 1. dlrm 14 | MIT License 15 | 16 | Copyright (c) Facebook, Inc. and its affiliates. 17 | 18 | Permission is hereby granted, free of charge, to any person obtaining a copy 19 | of this software and associated documentation files (the "Software"), to deal 20 | in the Software without restriction, including without limitation the rights 21 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 22 | copies of the Software, and to permit persons to whom the Software is 23 | furnished to do so, subject to the following conditions: 24 | 25 | The above copyright notice and this permission notice shall be included in all 26 | copies or substantial portions of the Software.
27 | 28 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 29 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 30 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 31 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 32 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 33 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 34 | SOFTWARE. 35 | 36 | ------------------------------------------------------------- 37 | 2. PyTorch examples 38 | BSD 3-Clause License 39 | 40 | Copyright (c) 2017, 41 | All rights reserved. 42 | 43 | Redistribution and use in source and binary forms, with or without 44 | modification, are permitted provided that the following conditions are met: 45 | 46 | * Redistributions of source code must retain the above copyright notice, this 47 | list of conditions and the following disclaimer. 48 | 49 | * Redistributions in binary form must reproduce the above copyright notice, 50 | this list of conditions and the following disclaimer in the documentation 51 | and/or other materials provided with the distribution. 52 | 53 | * Neither the name of the copyright holder nor the names of its 54 | contributors may be used to endorse or promote products derived from 55 | this software without specific prior written permission. 56 | 57 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 58 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 59 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 60 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 61 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 62 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 63 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 64 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 65 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 66 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 67 | 68 | ------------------------------------------------------------- 69 | 3. torchvision 70 | BSD 3-Clause License 71 | 72 | Copyright (c) Soumith Chintala 2016, 73 | All rights reserved. 74 | 75 | Redistribution and use in source and binary forms, with or without 76 | modification, are permitted provided that the following conditions are met: 77 | 78 | * Redistributions of source code must retain the above copyright notice, this 79 | list of conditions and the following disclaimer. 80 | 81 | * Redistributions in binary form must reproduce the above copyright notice, 82 | this list of conditions and the following disclaimer in the documentation 83 | and/or other materials provided with the distribution. 84 | 85 | * Neither the name of the copyright holder nor the names of its 86 | contributors may be used to endorse or promote products derived from 87 | this software without specific prior written permission. 88 | 89 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 90 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 91 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 92 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 93 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 94 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 95 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 96 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 97 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 98 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 99 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/bench/dlrm_s_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | #check if extra argument is passed to the test 8 | if [[ $# == 1 ]]; then 9 | dlrm_extra_option=$1 10 | else 11 | dlrm_extra_option="" 12 | fi 13 | #echo $dlrm_extra_option 14 | 15 | cpu=1 16 | gpu=1 17 | pt=1 18 | c2=1 19 | 20 | ncores=28 #12 #6 21 | nsockets="0" 22 | 23 | ngpus="1 2 4 8" 24 | 25 | numa_cmd="numactl --physcpubind=0-$((ncores-1)) -m $nsockets" #run on one socket, without HT 26 | dlrm_pt_bin="python dlrm_s_pytorch.py" 27 | dlrm_c2_bin="python dlrm_s_caffe2.py" 28 | 29 | data=random #synthetic 30 | print_freq=100 31 | rand_seed=727 32 | 33 | c2_net="async_scheduling" 34 | 35 | #Model param 36 | mb_size=2048 #1024 #512 #256 37 | nbatches=1000 #500 #100 38 | bot_mlp="512-512-64" 39 | top_mlp="1024-1024-1024-1" 40 | emb_size=64 41 | nindices=100 42 | emb="1000000-1000000-1000000-1000000-1000000-1000000-1000000-1000000" 43 | interaction="dot" 44 | tnworkers=0 45 | tmb_size=16384 46 | 47 | #_args="--mini-batch-size="${mb_size}\ 48 | _args=" --num-batches="${nbatches}\ 49 | " --data-generation="${data}\ 50 | " --arch-mlp-bot="${bot_mlp}\ 51 | " --arch-mlp-top="${top_mlp}\ 52 | " --arch-sparse-feature-size="${emb_size}\ 53 | " --arch-embedding-size="${emb}\ 54 | " --num-indices-per-lookup="${nindices}\ 55 | " --arch-interaction-op="${interaction}\ 56 | " --numpy-rand-seed="${rand_seed}\ 57 | " --print-freq="${print_freq}\ 58 | " --print-time"\ 59 | " --enable-profiling " 60 | 61 | c2_args=" --caffe2-net-type="${c2_net} 62 | 63 | 64 | # CPU Benchmarking 65 | if [ $cpu = 1 ]; then 66 | echo "--------------------------------------------" 67 | echo "CPU Benchmarking - running on $ncores cores" 68 | echo "--------------------------------------------" 69 | if [ $pt = 1 ]; then 70 | outf="model1_CPU_PT_$ncores.log" 71 | outp="dlrm_s_pytorch.prof" 72 | echo "-------------------------------" 73 | echo "Running PT (log file: $outf)" 74 | echo "-------------------------------" 75 | cmd="$numa_cmd $dlrm_pt_bin --mini-batch-size=$mb_size --test-mini-batch-size=$tmb_size --test-num-workers=$tnworkers $_args $dlrm_extra_option > $outf" 76 | echo $cmd 77 | eval $cmd 78 | min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}') 79 | echo "Min time per iteration = $min" 80 | # move profiling file(s) 81 | mv $outp ${outf//".log"/".prof"} 82 | mv ${outp//".prof"/".json"} ${outf//".log"/".json"} 83 | 84 | fi 85 | if [ $c2 = 1 ]; then 86 | outf="model1_CPU_C2_$ncores.log" 87 | outp="dlrm_s_caffe2.prof" 88 | echo "-------------------------------" 89 | echo "Running C2 (log file: $outf)" 90 | echo 
"-------------------------------" 91 | cmd="$numa_cmd $dlrm_c2_bin --mini-batch-size=$mb_size $_args $c2_args $dlrm_extra_option 1> $outf 2> $outp" 92 | echo $cmd 93 | eval $cmd 94 | min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}') 95 | echo "Min time per iteration = $min" 96 | # move profiling file (collected from stderr above) 97 | mv $outp ${outf//".log"/".prof"} 98 | fi 99 | fi 100 | 101 | # GPU Benchmarking 102 | if [ $gpu = 1 ]; then 103 | echo "--------------------------------------------" 104 | echo "GPU Benchmarking - running on $ngpus GPUs" 105 | echo "--------------------------------------------" 106 | for _ng in $ngpus 107 | do 108 | # weak scaling 109 | # _mb_size=$((mb_size*_ng)) 110 | # strong scaling 111 | _mb_size=$((mb_size*1)) 112 | _gpus=$(seq -s, 0 $((_ng-1))) 113 | cuda_arg="CUDA_VISIBLE_DEVICES=$_gpus" 114 | echo "-------------------" 115 | echo "Using GPUS: "$_gpus 116 | echo "-------------------" 117 | if [ $pt = 1 ]; then 118 | outf="model1_GPU_PT_$_ng.log" 119 | outp="dlrm_s_pytorch.prof" 120 | echo "-------------------------------" 121 | echo "Running PT (log file: $outf)" 122 | echo "-------------------------------" 123 | cmd="$cuda_arg $dlrm_pt_bin --mini-batch-size=$_mb_size --test-mini-batch-size=$tmb_size --test-num-workers=$tnworkers $_args --use-gpu $dlrm_extra_option > $outf" 124 | echo $cmd 125 | eval $cmd 126 | min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}') 127 | echo "Min time per iteration = $min" 128 | # move profiling file(s) 129 | mv $outp ${outf//".log"/".prof"} 130 | mv ${outp//".prof"/".json"} ${outf//".log"/".json"} 131 | fi 132 | if [ $c2 = 1 ]; then 133 | outf="model1_GPU_C2_$_ng.log" 134 | outp="dlrm_s_caffe2.prof" 135 | echo "-------------------------------" 136 | echo "Running C2 (log file: $outf)" 137 | echo "-------------------------------" 138 | cmd="$cuda_arg $dlrm_c2_bin --mini-batch-size=$_mb_size $_args $c2_args --use-gpu $dlrm_extra_option 1> $outf 2> $outp" 139 | echo $cmd 140 | eval $cmd 141 | min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}') 142 | echo "Min time per iteration = $min" 143 | # move profiling file (collected from stderr above) 144 | mv $outp ${outf//".log"/".prof"} 145 | fi 146 | done 147 | fi 148 | -------------------------------------------------------------------------------- /pytorch/README.md: -------------------------------------------------------------------------------- 1 | # Guide to run resnet50 fp32 and int8 models. please use release v1.0.0. 2 | 3 | - For resnet50_v1 guide, please see [`RESNET50V1.md`](RESNET50V1.md). 
4 | 5 | ## Download resnet50 pytorch model 6 | 7 | ``` 8 | wget https://download.pytorch.org/models/resnet50-19c8e357.pth 9 | ``` 10 | ## Download the resnext pytorch model (optional) 11 | ``` 12 | wget http://data.lip6.fr/cadene/pretrainedmodels/resnext101_32x4d-29e315fa.pth 13 | ``` 14 | 15 | ## Install legacy pytorch for converting the model from pytorch to onnx 16 | 17 | ``` 18 | pip install torchvision 19 | ``` 20 | 21 | ## Get pytorch source from github 22 | 23 | ``` 24 | git clone https://github.com/pytorch/pytorch.git && cd pytorch 25 | git submodule update --init --recursive 26 | python setup.py build 27 | ``` 28 | 29 | ## Convert the pytorch model to an onnx model 30 | The code below is an example for resnet50: 31 | ``` 32 | import torch 33 | import torchvision.models as models 34 | from torch.autograd import Variable 35 | model = models.resnet50(pretrained=False) 36 | m = torch.load('resnet50-19c8e357.pth') 37 | model.load_state_dict(m) 38 | model.train(False) 39 | x = Variable(torch.randn(1, 3, 224, 224)) 40 | y = model(x) 41 | torch_out = torch.onnx._export(model, 42 | x, 43 | "resnet50.onnx", 44 | export_params=True) 45 | ```
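To sanity-check the exported graph before copying it, you can run ONNX's structural checker; a minimal sketch, assuming the `onnx` Python package is installed:
```
import onnx
# Load the exported graph and verify it is structurally valid ONNX.
model = onnx.load("resnet50.onnx")
onnx.checker.check_model(model)
print("resnet50.onnx passed the ONNX checker")
```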
46 | ## Copy onnx file to tools folder 47 | 48 | ``` 49 | cp resnet50.onnx inference/models/resnet50/ 50 | ``` 51 | 52 | ## Prepare dataset 53 | 54 | ``` 55 | Please download the imagenet dataset and validation file from the official site 56 | http://image-net.org/download.php 57 | 58 | Note: 59 | - ImageNet does not own the copyright of the images. For researchers and educators who wish to use the images for non-commercial research and/or educational purposes, ImageNet can provide access through their site under certain conditions and terms. 60 | 61 | 62 | 63 | ``` 64 | ## Prepare calibration dataset 65 | 66 | ``` 67 | Copy ILSVRC2012_val_00033000.JPEG through ILSVRC2012_val_00033999.JPEG (1000 images in total) from the downloaded imagenet dataset folder to the calibration folder 68 | find /path/to/your/dataset -type f | grep -E 'ILSVRC2012_val_00033[0-9]*' | xargs -i cp {} /path/to/your/calibration_dataset 69 | ``` 70 | 71 | ## Run calibration 72 | 73 | 74 | ``` 75 | export PYTHONPATH=/the/path/to/your/pytorch/src 76 | export LD_PRELOAD=the/location/of/libiomp5.so #libiomp5.so can be found under your mkl folder 77 | export OMP_NUM_THREADS=28 KMP_AFFINITY=proclist=[0-27],granularity=thread,explicit #28 is an example; it is the number of cores in one socket of your cpu 78 | ./run_caffe2.py -m $modelname -p calibration_folder -v validation_file -b "batchsize" -r calibration -o . --onnx 79 | 80 | Two files will be generated under the folder; copy them to inference/models/resnet50: 81 | cp init_net_int8.pb inference/models/resnet50/init_onnx_int8.pb 82 | cp predict_net_int8.pb inference/models/resnet50/predict_onnx_int8.pb 83 | 84 | ``` 85 | 86 | ## Run fp32 model 87 | 88 | ``` 89 | export PYTHONPATH=/the/path/to/your/pytorch/src 90 | export LD_PRELOAD=the/location/of/libiomp5.so #libiomp5.so can be found under your mkl folder 91 | export OMP_NUM_THREADS=28 KMP_AFFINITY=proclist=[0-27],granularity=thread,explicit #28 is an example; it is the number of cores in one socket of your cpu 92 | 93 | ./run_caffe2.py -m $modelname -p imagenet_folder -v validation_file -b "batchsize" -w 5 --onnx 94 | ``` 95 | If you want to run with dummy data, please use the below command 96 | ``` 97 | export PYTHONPATH=/the/path/to/your/pytorch/src 98 | export LD_PRELOAD=the/location/of/libiomp5.so #libiomp5.so can be found under your mkl folder 99 | export OMP_NUM_THREADS=28 KMP_AFFINITY=proclist=[0-27],granularity=thread,explicit #28 is an example; it is the number of cores in one socket of your cpu 100 | 101 | ./run_caffe2.py -m $modelname -b "batchsize" -w 5 -u -i 1000 --onnx 102 | ``` 103 | 104 | ## Run int8 model 105 | 106 | ``` 107 | export PYTHONPATH=/the/path/to/your/pytorch/src 108 | export LD_PRELOAD=the/location/of/libiomp5.so #libiomp5.so can be found under your mkl folder 109 | export OMP_NUM_THREADS=28 KMP_AFFINITY=proclist=[0-27],granularity=thread,explicit #28 is an example; it is the number of cores in one socket of your cpu 110 | 111 | ./run_caffe2.py -m $modelname -p imagenet_folder -v validation_file -b "batchsize" -w 5 -int8 112 | ``` 113 | If you want to run with dummy data, please use the below command 114 | ``` 115 | export PYTHONPATH=/the/path/to/your/pytorch/src 116 | export LD_PRELOAD=the/location/of/libiomp5.so #libiomp5.so can be found under your mkl folder 117 | export OMP_NUM_THREADS=28 KMP_AFFINITY=proclist=[0-27],granularity=thread,explicit #28 is an example; it is the number of cores in one socket of your cpu 118 | 119 | ./run_caffe2.py -m $modelname -b "batchsize" -w 5 -u -i 1000 -int8 120 | ``` 121 | 122 | 123 | 124 | 125 | ## Parse the result: the output of both fp32 and int8 models looks like the following 126 | 127 | ``` 128 | Images per second: 345.5456113865 129 | Total computing time: 144.6986978054 seconds 130 | Total image processing time: 491.1261794567 seconds 131 | Total model loading time: 4.4210910797 seconds 132 | Total images: 50000 133 | Accuracy: 75.36400% 134 | Top5Accuracy: 92.54200% 135 | 136 | ``` 137 | Just use 'Images per second' as the Throughput, 'Accuracy' as the Top1 accuracy and 'Top5Accuracy' as the Top5 Accuracy. 138 | 139 | -------------------------------------------------------------------------------- /pytorch/dlrm/README.md: -------------------------------------------------------------------------------- 1 | # Guide to run DLRM with FP32/BF16 data type 2 | 3 | ## Verified on 4 | 5 | | Item | Value | 6 | | -: | :- | 7 | | OS | Ubuntu 20.04 LTS | 8 | | Compiler | gcc 8.4.0 | 9 | | Memory | DDR4 3200MHz, 192GB/socket | 10 | 11 | ## Prepare your running environment 12 | 13 | 1. Install anaconda 3.0 14 | ``` 15 | wget https://repo.continuum.io/archive/Anaconda3-5.0.0-Linux-x86_64.sh -O anaconda3.sh 16 | chmod +x anaconda3.sh 17 | ./anaconda3.sh -b -p ~/anaconda3 18 | ./anaconda3/bin/conda create -n dlrm python=3.7 19 | ``` 20 | 21 | 2.
Setup anaconda virtual environment for DLRM 22 | ``` 23 | export PATH=~/anaconda3/bin:$PATH 24 | source ./anaconda3/bin/activate dlrm 25 | ``` 26 | 27 | 3. Install dependencies 28 | ``` 29 | # 1. 30 | pip install sklearn onnx tqdm lark-parser 31 | 32 | #2. 33 | conda config --append channels intel 34 | conda install ninja pyyaml setuptools cmake cffi typing 35 | conda install intel-openmp mkl mkl-include numpy -c intel --no-update-deps 36 | 37 | #3. 38 | wget https://github.com/gperftools/gperftools/releases/download/gperftools-2.7.90/gperftools-2.7.90.tar.gz 39 | tar -xzf gperftools-2.7.90.tar.gz 40 | cd gperftools-2.7.90 41 | ./configure --prefix=$HOME/.local 42 | make && make install 43 | ``` 44 | 45 | 4. Clone source code and build 46 | 47 | ``` 48 | # PyTorch 49 | git clone https://github.com/pytorch/pytorch.git && cd pytorch 50 | git checkout tags/v1.5.0-rc3 -b v1.5-rc3 51 | git submodule sync && git submodule update --init --recursive 52 | 53 | # extension 54 | git clone https://github.com/intel/intel-extension-for-pytorch.git && cd intel-extension-for-pytorch 55 | git checkout cpx-y20m06 56 | git submodule update --init --recursive 57 | 58 | # prepare patch to PyTorch 59 | cp {path/to/intel-pytorch-extension}/torch_patches/dlrm_fp32.patch {path/to/pytorch}/ 60 | cp {path/to/intel-pytorch-extension}/torch_patches/dpcpp-v1.5-rc3.patch {path/to/pytorch}/ 61 | 62 | # build PyTorch 63 | cd {path/to/pytorch} 64 | patch -p1 < dpcpp-v1.5-rc3.patch 65 | patch -p1 < dlrm_fp32.patch 66 | python setup.py install 67 | 68 | # build extension 69 | cd {path/to/intel-pytorch-extension} 70 | python setup.py install 71 | 72 | # DLRM 73 | git clone https://github.com/facebookresearch/dlrm.git 74 | cd {path/to/dlrm} 75 | git checkout 4705ea122d3cc693367f54e937db28c9c673d71b 76 | cp {path/to/intel-pytorch-extension}/torch_patches/models/mlperf_dlrm_ipex_OneDNN.diff ./ 77 | patch -p1 < mlperf_dlrm_ipex_OneDNN.diff 78 | ``` 79 | 80 | 5. Download data 81 | ``` 82 | cd /tmp && mkdir input 83 | curl -O http://azuremlsampleexperiments.blob.core.windows.net/criteo/day_{$(seq -s , 0 23)}.gz 84 | # unzip all downloaded files into the `input` folder. 85 | ``` 86 | 87 | 6. Running cmd 88 | ``` 89 | cd {path/to/dlrm} 90 | ################### NOTICE ############################### 91 | # configurable parameters in {bench/run_and_time.sh} according to your machine. 92 | ncores=24 # cores/socket 93 | nsockets=0 # numa 94 | DATASET_PATH=/tmp/input # dataset location for DLRM 95 | ################### NOTICE END ########################### 96 | 97 | # FP32 cmd 98 | ./bench/run_and_time.sh 99 | 100 | # BF16 cmd 101 | ./bench/run_and_time.sh bf16 102 | ``` 103 | 104 | --- 105 | # Guide to run DLRM Facebook Model with INT8 data type 106 | 107 | ## Verified on 108 | 109 | | Item | Value | 110 | | -: | :- | 111 | | OS | Ubuntu 20.04 LTS | 112 | | Compiler | gcc 8.4.0 | 113 | | Memory | DDR4 3200MHz, 192GB/socket | 114 | 115 | 1. Install anaconda 3.0 116 | ``` 117 | wget https://repo.continuum.io/archive/Anaconda3-5.0.0-Linux-x86_64.sh -O anaconda3.sh 118 | chmod +x anaconda3.sh 119 | ./anaconda3.sh -b -p ~/anaconda3 120 | ./anaconda3/bin/conda create -n dlrm python=3.7 121 | ``` 122 | 123 | 2. Setup anaconda virtual environment for DLRM 124 | ``` 125 | export PATH=~/anaconda3/bin:$PATH 126 | source ./anaconda3/bin/activate dlrm 127 | ``` 128 | 129 | 3. Install dependencies 130 | ``` 131 | # 1. 132 | pip install sklearn onnx tqdm 133 | 134 | # 2.
135 | conda config --append channels intel 136 | conda install ninja pyyaml setuptools cmake cffi typing 137 | conda install intel-openmp mkl mkl-include numpy -c intel --no-update-deps 138 | 139 | # 3. 140 | conda install jemalloc 141 | ``` 142 | 143 | 4. Clone source code and build 144 | ``` 145 | # PyTorch 146 | git clone https://github.com/pytorch/pytorch.git 147 | cd pytorch 148 | git checkout tags/v1.5.0 -b v1.5 149 | git submodule sync && git submodule update --init --recursive 150 | 151 | # prepare patch to PyTorch 152 | wget https://github.com/pytorch/pytorch/commit/cf28c6a31a5189a47007fb3907a248b3548ae7fd.patch 153 | 154 | # build PyTorch 155 | git apply cf28c6a31a5189a47007fb3907a248b3548ae7fd.patch 156 | python setup.py install 157 | 158 | # get DLRM model 159 | git clone https://github.com/intel/optimized-models.git 160 | cd optimized-models/pytorch/dlrm/dlrm 161 | ``` 162 | 163 | 5. Set environment 164 | ``` 165 | export LD_PRELOAD=${CONDA_PREFIX}/lib/libjemalloc.so:${CONDA_PREFIX}/lib/libiomp5.so 166 | ``` 167 | 168 | 6. Test command 169 | ``` 170 | # FP32 171 | OMP_NUM_THREADS=1 numactl --physcpubind=0-23 --membind=0 python dlrm_s_pytorch.py --mini-batch-size=16 --num-batches=1000 --data-generation=random --arch-mlp-bot=512-512-64 --arch-mlp-top=1024-1024-1024-1 --arch-sparse-feature-size=64 --arch-embedding-size=1000000-1000000-1000000-1000000-1000000-1000000-1000000-1000000 --num-indices-per-lookup=100 --arch-interaction-op=dot --numpy-rand-seed=727 --print-freq=100 --print-time --inference-only --share-weight --num-instance=24 > model1_CPU_PT_24_fp32_inference.log 172 | 173 | # INT8 174 | OMP_NUM_THREADS=1 numactl --physcpubind=0-23 --membind=0 python dlrm_s_pytorch.py --mini-batch-size=16 --num-batches=1000 --data-generation=random --arch-mlp-bot=512-512-64 --arch-mlp-top=1024-1024-1024-1 --arch-sparse-feature-size=64 --arch-embedding-size=1000000-1000000-1000000-1000000-1000000-1000000-1000000-1000000 --num-indices-per-lookup=100 --arch-interaction-op=dot --numpy-rand-seed=727 --print-freq=100 --print-time --inference-only --share-weight --do-int8-inference --num-instance=24 > model1_CPU_PT_24_int8_inference.log 175 | ``` 176 | -------------------------------------------------------------------------------- /mxnet/wide_deep_criteo/train.py: -------------------------------------------------------------------------------- 1 | """WnD training script""" 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 
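# Example invocation (hypothetical paths; all flags are defined by the argparse
# options below, and train.csv / eval.csv are expected inside --data-dir):
#   python train.py --data-dir ./large_version --batch-size 1024 --optimizer adam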
18 | 19 | import argparse 20 | import os 21 | import pickle 22 | import mxnet as mx 23 | #from mxnet.test_utils import * 24 | from data import get_uci_criteo 25 | from model import wide_deep_model 26 | 27 | parser = argparse.ArgumentParser(description="Run sparse wide and deep classification ", 28 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 29 | parser.add_argument('--num-epoch', type=int, default=1, 30 | help='number of epochs to train') 31 | parser.add_argument('--batch-size', type=int, default=1000, 32 | help='number of examples per batch') 33 | parser.add_argument('--lr', type=float, default=0.001, 34 | help='learning rate') 35 | parser.add_argument('--cuda', action='store_true', default=False, 36 | help='Train on GPU with CUDA') 37 | parser.add_argument('--optimizer', type=str, default='adam', 38 | help='what optimizer to use', 39 | choices=["ftrl", "sgd", "adam"]) 40 | parser.add_argument('--log-interval', type=int, default=100, 41 | help='number of batches to wait before logging training status') 42 | parser.add_argument('--data-dir', type=str, default='large_version', 43 | help='folder for data') 44 | 45 | # Related to feature engineering, please see preprocess in data.py 46 | CRITEO = { 47 | 'train': 'train.csv', 48 | 'test': 'eval.csv', 49 | 'num_linear_features': 26000, 50 | 'num_embed_features': 26, 51 | 'num_cont_features': 13, 52 | 'embed_input_dims': 1000, 53 | 'hidden_units': [32, 1024, 512, 256], 54 | } 55 | def save_object(filename, obj): 56 | with open(filename, 'wb') as output: # Overwrites any existing file. 57 | pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL) 58 | if __name__ == '__main__': 59 | import logging 60 | 61 | head = '%(asctime)-15s %(message)s' 62 | logging.basicConfig(level=logging.INFO, format=head) 63 | 64 | # arg parser 65 | args = parser.parse_args() 66 | logging.info(args) 67 | num_epoch = args.num_epoch 68 | batch_size = args.batch_size 69 | optimizer = args.optimizer 70 | log_interval = args.log_interval 71 | lr = args.lr 72 | ctx = mx.gpu(0) if args.cuda else mx.cpu() 73 | 74 | # dataset 75 | data_dir = os.path.join(os.getcwd(), args.data_dir) 76 | train_data = os.path.join(data_dir, CRITEO['train']) 77 | val_data = os.path.join(data_dir, CRITEO['test']) 78 | train_csr, train_dns, train_label = get_uci_criteo(data_dir, train_data) 79 | val_csr, val_dns, val_label = get_uci_criteo(data_dir, val_data) 80 | 81 | save_object('val_csr.pkl', val_csr) 82 | save_object('val_dns.pkl', val_dns) 83 | save_object('val_label.pkl', val_label) 84 | save_object('train_csr.pkl', train_csr) 85 | save_object('train_dns.pkl', train_dns) 86 | save_object('train_label.pkl', train_label) 87 | 88 | model = wide_deep_model(CRITEO['num_linear_features'], CRITEO['num_embed_features'], 89 | CRITEO['num_cont_features'], CRITEO['embed_input_dims'], 90 | CRITEO['hidden_units']) 91 | 92 | # data iterator 93 | train_data = mx.io.NDArrayIter({'csr_data': train_csr, 'dns_data': train_dns}, 94 | {'softmax_label': train_label}, batch_size, 95 | shuffle=True, last_batch_handle='discard') 96 | eval_data = mx.io.NDArrayIter({'csr_data': val_csr, 'dns_data': val_dns}, 97 | {'softmax_label': val_label}, batch_size, 98 | shuffle=True, last_batch_handle='discard') 99 | 100 | # module 101 | mod = mx.mod.Module(symbol=model, context=ctx, data_names=['csr_data', 'dns_data'], 102 | label_names=['softmax_label']) 103 | mod.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label) 104 | mod.init_params() 105 | optim = mx.optimizer.create(optimizer, 
learning_rate=lr, rescale_grad=1.0 / batch_size) 106 | mod.init_optimizer(optimizer=optim) 107 | # use accuracy as the metric 108 | metric = mx.metric.create(['acc']) 109 | # get the sparse weight parameter 110 | speedometer = mx.callback.Speedometer(batch_size, log_interval) 111 | 112 | logging.info('Training started ...') 113 | 114 | data_iter = iter(train_data) 115 | for epoch in range(num_epoch): 116 | nbatch = 0 117 | metric.reset() 118 | for batch in data_iter: 119 | nbatch += 1 120 | mod.forward_backward(batch) 121 | # update all parameters (including the weight parameter) 122 | mod.update() 123 | # update training metric 124 | mod.update_metric(metric, batch.label) 125 | speedometer_param = mx.model.BatchEndParam(epoch=epoch, nbatch=nbatch, 126 | eval_metric=metric, locals=locals()) 127 | speedometer(speedometer_param) 128 | # evaluate metric on validation dataset 129 | score = mod.score(eval_data, ['acc']) 130 | logging.info('epoch %d, accuracy = %s', epoch, score[0][1]) 131 | 132 | mod.save_checkpoint("checkpoint", epoch, save_optimizer_states=False) 133 | # reset the iterator for next pass of data 134 | data_iter.reset() 135 | 136 | logging.info('Training completed.') 137 | -------------------------------------------------------------------------------- /mxnet/blog/medium_vnni/ec2_benchmark_int8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "MXNet Model Quantization Performance" 3 | echo "Testing INT8 quantized models" 4 | echo "Installing mxnet-mkl 1.5.0b20190623" 5 | pip install --pre mxnet-mkl==1.5.0b20190623 6 | echo "downloading source code from incubator-mxnet repo" 7 | git clone https://github.com/apache/incubator-mxnet 8 | cd incubator-mxnet 9 | git checkout f44f6cfbe752fd8b8036307cecf6a30a30ad8557 10 | 11 | export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 12 | export vCPUs=`cat /proc/cpuinfo | grep processor | wc -l` 13 | export OMP_NUM_THREADS=$((vCPUs / 2)) 14 | echo "Test with OMP_NUM_THREADS="$OMP_NUM_THREADS 15 | 16 | # Reduce remote memory access 17 | export NNVM_EXEC_MATCH_RANGE=1 18 | # USE MKLDNN AS SUBGRAPH BACKEND 19 | export MXNET_SUBGRAPH_BACKEND=MKLDNN 20 | echo "Testing with MXNET_SUBGRAPH_BACKEND="$MXNET_SUBGRAPH_BACKEND 21 | 22 | echo "=========test image classification models==========" 23 | cd ./example/quantization 24 | echo "=============resnet50_v1===============" 25 | echo "1. calibrating resnet50_v1 with calib-mode=naive, use 5 batches to do calibration" 26 | python imagenet_gen_qsym_mkldnn.py --model=resnet50_v1 --num-calib-batches=5 --calib-mode=naive 27 | echo "2. testing throughput of fp32 resnet50_v1" 28 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True 29 | echo "3. testing latency of fp32 resnet50_v1" 30 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=1 --num-inference-batches=500 --ctx=cpu --benchmark=True 31 | echo "4. testing throughput of int8 resnet50_v1" 32 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-quantized-5batches-naive-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True 33 | echo "5. testing latency of int8 resnet50_v1" 34 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-quantized-5batches-naive-symbol.json --batch-size=1 --num-inference-batches=500 --ctx=cpu --benchmark=True 35 | echo "=============resnet101_v1===============" 36 | echo "1. 
calibrating resnet101_v1 with calib-mode=naive, use 5 batches to do calibration" 37 | python imagenet_gen_qsym_mkldnn.py --model=resnet101_v1 --num-calib-batches=5 --calib-mode=naive 38 | echo "2. testing throughput of fp32 resnet101_v1" 39 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True 40 | echo "3. testing latency of fp32 resnet101_v1" 41 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=1 --num-inference-batches=500 --ctx=cpu --benchmark=True 42 | echo "4. testing throughput of int8 resnet101_v1" 43 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-quantized-5batches-naive-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True 44 | echo "5. testing latency of int8 resnet101_v1" 45 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-quantized-5batches-naive-symbol.json --batch-size=1 --num-inference-batches=500 --ctx=cpu --benchmark=True 46 | echo "=============mobilenet1.0===============" 47 | echo "1. calibrating mobilenet1.0 with calib-mode=naive, use 5 batches to do calibration" 48 | python imagenet_gen_qsym_mkldnn.py --model=mobilenet1.0 --num-calib-batches=5 --calib-mode=naive 49 | echo "2. testing throughput of fp32 mobilenet1.0" 50 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True 51 | echo "3. testing latency of fp32 mobilenet1.0" 52 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=1 --num-inference-batches=500 --ctx=cpu --benchmark=True 53 | echo "4. testing throughput of int8 mobilenet1.0" 54 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-quantized-5batches-naive-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True 55 | echo "5. testing latency of int8 mobilenet1.0" 56 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-quantized-5batches-naive-symbol.json --batch-size=1 --num-inference-batches=500 --ctx=cpu --benchmark=True 57 | echo "=============inceptionv3===============" 58 | echo "1. calibrating inceptionv3 with calib-mode=naive, use 5 batches to do calibration" 59 | python imagenet_gen_qsym_mkldnn.py --model=inceptionv3 --image-shape=3,299,299 --num-calib-batches=5 --calib-mode=naive 60 | echo "2. testing throughput of fp32 inceptionv3" 61 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True 62 | echo "3. testing latency of fp32 inceptionv3" 63 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=1 --num-inference-batches=500 --ctx=cpu --benchmark=True 64 | echo "4. testing throughput of int8 inceptionv3" 65 | python imagenet_inference.py --symbol-file=./model/inceptionv3-quantized-5batches-naive-symbol.json --image-shape=3,299,299 --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True 66 | echo "5. testing latency of int8 inceptionv3" 67 | python imagenet_inference.py --symbol-file=./model/inceptionv3-quantized-5batches-naive-symbol.json --image-shape=3,299,299 --batch-size=1 --num-inference-batches=500 --ctx=cpu --benchmark=True 68 | echo "=========test image detection models==========" 69 | echo "==============SSD VGG16================" 70 | echo "1. 
downloading model" 71 | cd ../ssd 72 | cd model/ && wget http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_vgg16_reduced_300-dd479559.zip 73 | unzip ssd_vgg16_reduced_300-dd479559.zip && mv ssd_vgg16_reduced_300-dd479559.params ssd_vgg16_reduced_300-0000.params && mv ssd_vgg16_reduced_300-symbol-dd479559.json ssd_vgg16_reduced_300-symbol.json 74 | cd ../data && wget http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/ssd-val-fc19a535.zip 75 | unzip ssd-val-fc19a535.zip && mv ssd-val-fc19a535.idx val.idx && mv ssd-val-fc19a535.lst val.lst && mv ssd-val-fc19a535.rec val.rec 76 | cd .. 77 | echo "2. quantizing model" 78 | python quantization.py 79 | echo "3. testing throughput of fp32 SSD VGG16" 80 | python benchmark_score.py --batch_size=224 --deploy --prefix=./model/ssd_ 81 | echo "4. testing throughput of int8 SSD VGG16" 82 | python benchmark_score.py --batch_size=224 --deploy --prefix=./model/cqssd_ 83 | echo "5. testing latency of fp32 SSD VGG16" 84 | python benchmark_score.py --batch_size=1 --deploy --prefix=./model/ssd_ 85 | echo "6. testing latency of int8 SSD VGG16" 86 | python benchmark_score.py --batch_size=1 --deploy --prefix=./model/cqssd_ 87 | -------------------------------------------------------------------------------- /mxnet/wide_deep_criteo/README.md: -------------------------------------------------------------------------------- 1 | ## TERMS OF USE: 2 | PLEASE NOTE THAT YOUR USE OF AND ACCESS TO KAGGLE'S SERVICES ARE SUBJECT TO THE TERMS. IF YOU DO NOT AGREE TO ALL OF THEM, YOU MAY NOT USE OR ACCESS THE SERVICES IN ANY MANNER. FOR DETAILS SEE THE LINK: https://www.kaggle.com/terms 3 | 4 | How to get the dataset: 5 | Go to the link: https://storage.googleapis.com/dataset-uploader/criteo-kaggle/large_version to start downloading the criteo large dataset; it might take a long time. 6 | ``` 7 | mkdir large_version 8 | #Downloading the training dataset... 9 | wget -P ./large_version https://storage.googleapis.com/dataset-uploader/criteo-kaggle/large_version/train.csv 10 | #Downloading the validation dataset... 11 | wget -P ./large_version https://storage.googleapis.com/dataset-uploader/criteo-kaggle/large_version/eval.csv 12 | ``` 13 | 14 | # 1. Steps to reproduce performance with OOB MXNet 15 | ``` 16 | git clone --recursive https://github.com/apache/incubator-mxnet.git 17 | cd incubator-mxnet 18 | git submodule update --recursive 19 | make -j USE_MKLDNN=1 USE_BLAS=mkl USE_OPENCV=1 20 | cd python 21 | python setup.py install [--user] 22 | export LD_LIBRARY_PATH=$PWD/lib:$LD_LIBRARY_PATH 23 | export PYTHONPATH=$PWD/python:$PYTHONPATH 24 | ``` 25 | ## Run the wide&deep: 26 | ``` 27 | cd optimized-models/mxnet/wide_deep_criteo/ 28 | python train.py 29 | python wd_gen_qsym_subgraph_update.py 30 | export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 31 | export OMP_NUM_THREADS=24 32 | ``` 33 | ### Performance 34 | ``` 35 | # FP32 36 | numactl --physcpubind=0-23 --membind=0 python inference.py 37 | # Int8 38 | numactl --physcpubind=0-23 --membind=0 python inference.py --symbol-file=WD-quantized-162batches-naive-symbol.json --param-file=WD-quantized-0000.params 39 | ``` 40 | ### Accuracy 41 | ``` 42 | # FP32 43 | numactl --physcpubind=0-23 --membind=0 python inference.py --accuracy 44 | # Int8 45 | numactl --physcpubind=0-23 --membind=0 python inference.py --symbol-file=WD-quantized-162batches-naive-symbol.json --param-file=WD-quantized-0000.params --accuracy 46 | ``` 47 | 48 |
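Since all of the runs above assume MXNet was built with `USE_MKLDNN=1`, it can be worth confirming the installed package actually has MKL-DNN enabled before benchmarking. A minimal check, assuming MXNet >= 1.5 (where `mxnet.runtime.Features` is available):
```
# Prints True if the build has MKL-DNN compiled in.
python -c "from mxnet.runtime import Features; print(Features().is_enabled('MKLDNN'))"
```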
# 2. Steps to reproduce performance with OOB MXNet and optimization patch 49 | ``` 50 | git clone --recursive https://github.com/apache/incubator-mxnet.git 51 | cd incubator-mxnet 52 | git submodule update --recursive 53 | git checkout 5d2a4510c2c226c6921a8a213d04461f68ca7173 54 | git apply --ignore-space-change --ignore-whitespace patch/patch.update 55 | make -j USE_MKLDNN=1 USE_BLAS=mkl USE_OPENCV=1 56 | cd python 57 | python setup.py install [--user] 58 | export LD_LIBRARY_PATH=$PWD/lib:$LD_LIBRARY_PATH 59 | export PYTHONPATH=$PWD/python:$PYTHONPATH 60 | ``` 61 | > Note: The patch.update changes are under review in [PR#14491](https://github.com/apache/incubator-mxnet/pull/14491) and [PR#14492](https://github.com/apache/incubator-mxnet/pull/14492). Once they are merged into master, no patches are needed. 62 | 63 | ## Run the wide&deep: 64 | ``` 65 | cd optimized-models/mxnet/wide_deep_criteo/ 66 | python train.py 67 | python wd_gen_qsym_subgraph_update.py 68 | export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 69 | export OMP_NUM_THREADS=24 70 | ``` 71 | ### Performance 72 | ``` 73 | # FP32 74 | numactl --physcpubind=0-23 --membind=0 python inference.py --symbol-file=./update_model/embedding-fuse.json --param-file=checkpoint-0000.params 75 | # Int8 76 | numactl --physcpubind=0-23 --membind=0 python inference.py --symbol-file=./update_model/embedding_fuse-quantized-1953batches-naive-symbol.json --param-file=WD-quantized-0000.params 77 | ``` 78 | ### Accuracy 79 | ``` 80 | # FP32 81 | numactl --physcpubind=0-23 --membind=0 python inference.py --symbol-file=./update_model/embedding-fuse.json --param-file=checkpoint-0000.params --accuracy 82 | # Int8 83 | numactl --physcpubind=0-23 --membind=0 python inference.py --symbol-file=./update_model/embedding_fuse-quantized-1953batches-naive-symbol.json --param-file=WD-quantized-0000.params --accuracy 84 | ``` 85 | 86 | # 3. Steps to reproduce performance with OOB MXNet and all internal optimization patches [Best so far] 87 | ``` 88 | git clone --recursive https://github.com/apache/incubator-mxnet.git 89 | cd incubator-mxnet 90 | git submodule update --recursive 91 | git checkout f1de8e51999ce3acaa95538d21a91fe43a0286ec 92 | git apply --ignore-space-change --ignore-whitespace patch/patch.diff 93 | cd 3rdparty/mkldnn 94 | git checkout 08bd90cca77683dd5d1c98068cea8b92ed05784d 95 | cd ../..
96 | make -j USE_MKLDNN=1 USE_BLAS=mkl USE_OPENCV=1 97 | cd python 98 | python setup.py install [--user] 99 | export LD_LIBRARY_PATH=$PWD/lib:$LD_LIBRARY_PATH 100 | export PYTHONPATH=$PWD/python:$PYTHONPATH 101 | ``` 102 | ## Run the wide&deep: 103 | ``` 104 | cd optimized-models/mxnet/wide_deep_criteo/ 105 | python train.py 106 | python wd_gen_qsym_subgraph.py 107 | export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 108 | export OMP_NUM_THREADS=24 109 | ``` 110 | ### Performance 111 | ``` 112 | # FP32 113 | numactl --physcpubind=0-23 --membind=0 python inference.py --symbol-file=./model/embedding-fuse.json --param-file=checkpoint-0000.params 114 | # Int8 115 | numactl --physcpubind=0-23 --membind=0 python inference.py --symbol-file=./model/embedding_fuse-quantized-1953batches-naive-symbol.json --param-file=WD-quantized-0000.params 116 | ``` 117 | ### Accuracy 118 | ``` 119 | # FP32 120 | numactl --physcpubind=0-23 --membind=0 python inference.py --symbol-file=./model/embedding-fuse.json --param-file=checkpoint-0000.params --accuracy 121 | # Int8 122 | numactl --physcpubind=0-23 --membind=0 python inference.py --symbol-file=./model/embedding_fuse-quantized-1953batches-naive-symbol.json --param-file=WD-quantized-0000.params --accuracy 123 | ``` 124 | 125 | # FP32 Outputs 126 | ``` 127 | INFO:logger:Performance Mode 128 | INFO:logger:batch size = 1024 for inference 129 | INFO:logger:label_name = softmax_label 130 | INFO:logger:Loading symbol from file dl_framework-optimized-models/mxnet/wide_deep_criteo/embedding-fuse.json 131 | INFO:logger:Loading params from file dl_framework-optimized-models/mxnet/wide_deep_criteo/checkpoint-0000.params 132 | INFO:logger:Running model embedding-fuse.json for inference 133 | INFO:logger:Run [7812] Batchs Speed: xxxxxx.xx samples/sec 134 | ``` 135 | 136 | # Int8 Outputs 137 | ``` 138 | INFO:logger:Performance Mode 139 | INFO:logger:batch size = 1024 for inference 140 | INFO:logger:label_name = softmax_label 141 | INFO:logger:Loading symbol from file dl_framework-optimized-models/mxnet/wide_deep_criteo/embedding_fuse-quantized-1953batches-naive-symbol.json 142 | INFO:logger:Loading params from file dl_framework-optimized-models/mxnet/wide_deep_criteo/WD-quantized-0000.params 143 | INFO:logger:Running model embedding_fuse-quantized-1953batches-naive-symbol.json for inference 144 | INFO:logger:Run [7812] Batchs Speed: xxxxxx.xx samples/sec 145 | ``` 146 | -------------------------------------------------------------------------------- /mxnet/wide_deep_criteo/inference.py: -------------------------------------------------------------------------------- 1 | """inference script to support accuracy and performance benchmark""" 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied.
See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | 19 | import argparse 20 | from datetime import datetime 21 | import logging 22 | import ctypes 23 | import time 24 | import os 25 | import pickle 26 | import mxnet as mx 27 | 28 | from mxnet import nd 29 | from mxnet.base import check_call, _LIB 30 | 31 | 32 | 33 | def load_model(_symbol_file, _param_file, _logger=None): 34 | """load existing symbol model""" 35 | cur_path = os.path.dirname(os.path.realpath(__file__)) 36 | symbol_file_path = os.path.join(cur_path, _symbol_file) 37 | if _logger is not None: 38 | _logger.info('Loading symbol from file %s' % symbol_file_path) 39 | symbol = mx.sym.load(symbol_file_path) 40 | 41 | param_file_path = os.path.join(cur_path, _param_file) 42 | if _logger is not None: 43 | _logger.info('Loading params from file %s' % param_file_path) 44 | save_dict = nd.load(param_file_path) 45 | _arg_params = {} 46 | _aux_params = {} 47 | for k, v in save_dict.items(): 48 | tp, name = k.split(':', 1) 49 | if tp == 'arg': 50 | _arg_params[name] = v 51 | if tp == 'aux': 52 | _aux_params[name] = v 53 | return symbol, _arg_params, _aux_params 54 | 55 | def advance_data_iter(data_iter, n): 56 | """use to warm up data for performance benchmark""" 57 | assert n >= 0 58 | if n == 0: 59 | return data_iter 60 | has_next_batch = True 61 | while has_next_batch: 62 | try: 63 | data_iter.next() 64 | n -= 1 65 | if n == 0: 66 | return data_iter 67 | except StopIteration: 68 | has_next_batch = False 69 | 70 | CRITEO = { 71 | 'train': 'train.csv', 72 | 'test': 'eval.csv', 73 | 'num_linear_features': 26000, 74 | 'num_embed_features': 26, 75 | 'num_cont_features': 13, 76 | 'embed_input_dims': 1000, 77 | 'hidden_units': [32, 1024, 512, 256], 78 | } 79 | def load_object(filename): 80 | with open(filename, 'rb') as input: 81 | return pickle.load(input) 82 | if __name__ == '__main__': 83 | parser = argparse.ArgumentParser(description='Score a model on a dataset') 84 | 85 | parser.add_argument('--symbol-file', type=str, default='checkpoint-symbol.json', help='symbol file path') 86 | parser.add_argument('--param-file', type=str, default='checkpoint-0000.params', help='param file path') 87 | parser.add_argument('--batch-size', type=int, default=1024) 88 | parser.add_argument('--label-name', type=str, default='softmax_label') 89 | parser.add_argument('--accuracy', action='store_true') 90 | parser.add_argument('--shuffle-dataset', action='store_true', default=True, 91 | help='shuffle the calibration dataset') 92 | parser.add_argument('--num-omp-threads', type=int, default=28) 93 | parser.add_argument('--num-batches', type=int, default=100000) 94 | parser.add_argument('--num-warmup', type=int, default=5000) 95 | parser.add_argument('--cuda', action='store_true', help='Inference on GPU with CUDA') 96 | parser.add_argument('--gpu-id', type=int, default=0) 97 | args = parser.parse_args() 98 | 99 | ctx = mx.gpu(args.gpu_id) if args.cuda else mx.cpu() 100 | 101 | logging.basicConfig() 102 | logger = logging.getLogger('logger') 103 | logger.setLevel(logging.INFO) 104 | 105 | if args.accuracy is True: 106 | logger.info('Accuracy Mode') 107 | else: 108 | logger.info('Performance Mode') 109 | 110 | symbol_file = args.symbol_file 111 | param_file = args.param_file 112 | 113 | 114 | batch_size = args.batch_size 115 | logger.info('batch size = %d for inference', batch_size) 116 | label_name = args.label_name 117 | logger.info('label_name = %s', label_name) 118 | 119 | if args.accuracy is 
False: 120 | val_csr = load_object('train_csr.pkl') 121 | val_dns = load_object('train_dns.pkl') 122 | val_label = load_object('train_label.pkl') 123 | else: 124 | val_csr = load_object('val_csr.pkl') 125 | val_dns = load_object('val_dns.pkl') 126 | val_label = load_object('val_label.pkl') 127 | 128 | # creating data iterator 129 | data = mx.io.NDArrayIter({'csr_data': val_csr, 'dns_data': val_dns}, 130 | {'softmax_label': val_label}, batch_size, 131 | shuffle=False, last_batch_handle='discard') 132 | 133 | # loading model 134 | sym, arg_params, aux_params = load_model(symbol_file, param_file, logger) 135 | 136 | 137 | # make sure that fp32 inference works on the same images as calibrated quantized model 138 | 139 | logger.info('Running model %s for inference', symbol_file) 140 | 141 | acc_m = mx.metric.create('acc') 142 | mod = mx.mod.Module(symbol=sym, context=ctx, data_names=['csr_data', 'dns_data'], label_names=[label_name, ]) 143 | mod.bind(for_training=False, 144 | data_shapes=data.provide_data, 145 | label_shapes=data.provide_label) 146 | mod.set_params(arg_params, aux_params) 147 | 148 | check_call(_LIB.MXSetNumOMPThreads(ctypes.c_int(args.num_omp_threads))) 149 | batch_data = [] 150 | nbatch = 0 151 | while nbatch < args.num_batches: 152 | for batch in data: 153 | batch_data.append(batch) 154 | nbatch += 1 155 | if nbatch < args.num_batches: 156 | continue 157 | else: 158 | break 159 | data.hard_reset() 160 | #for data warmup 161 | wi = args.num_warmup 162 | i = 0 163 | for batch in batch_data: 164 | if i < wi: 165 | mod.forward(batch, is_train=False) 166 | i += 1 167 | else: 168 | break 169 | data.hard_reset() 170 | mx.nd.waitall() 171 | #real run 172 | if "DO_WIDE_DEEP_PROFILING" in os.environ: 173 | print("wide_deep profiling start !!!!!!!!!!!!!") 174 | mx.profiler.set_config(profile_symbolic=True, profile_imperative=True, profile_memory=False, profile_api=False) 175 | mx.profiler.set_state('run') 176 | nbatch = 0 177 | tic = time.time() 178 | logger.info('INFERENCING STARTED: %s', datetime.now().strftime("%m/%d/%Y %H:%M:%S.%f")[:-3]) 179 | for batch in batch_data: 180 | nbatch += 1 181 | mod.forward(batch, is_train=False) 182 | if args.accuracy is True: 183 | for output in mod.get_outputs(): 184 | output.wait_to_read() 185 | mod.update_metric(acc_m, batch.label) 186 | else: 187 | mx.nd.waitall() 188 | logger.info('INFERENCING FINISHED: %s', datetime.now().strftime("%m/%d/%Y %H:%M:%S.%f")[:-3]) 189 | speed = nbatch * batch_size / (time.time() - tic) 190 | logger.info("Run [%d] Batchs \tSpeed: %.2f samples/sec", nbatch, speed) 191 | if args.accuracy is True: 192 | logger.info(acc_m.get()) 193 | if "DO_WIDE_DEEP_PROFILING" in os.environ: 194 | print("wide_deep profiling end !") 195 | mx.profiler.set_state('stop') 196 | profiler_info = mx.profiler.dumps() 197 | print(profiler_info) 198 | -------------------------------------------------------------------------------- /mxnet/wide_deep_criteo/wd_gen_qsym_subgraph.py: -------------------------------------------------------------------------------- 1 | """Generate quantized graph based on original fp32 graph""" 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. 
You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | 19 | import argparse 20 | import os 21 | import logging 22 | #import ctypes 23 | import pickle 24 | from mxnet import nd 25 | import mxnet as mx 26 | from mxnet.contrib.quantization import quantize_model 27 | 28 | 29 | 30 | def load_model(symbol_file, param_file, mlogger=None): 31 | """load existing symbol model""" 32 | cur_path = os.path.dirname(os.path.realpath(__file__)) 33 | symbol_file_path = os.path.join(cur_path, symbol_file) 34 | if mlogger is not None: 35 | mlogger.info('Loading symbol from file %s' % symbol_file_path) 36 | symbol = mx.sym.load(symbol_file_path) 37 | 38 | param_file_path = os.path.join(cur_path, param_file) 39 | if mlogger is not None: 40 | mlogger.info('Loading params from file %s' % param_file_path) 41 | save_dict = nd.load(param_file_path) 42 | marg_params = {} 43 | maux_params = {} 44 | for k, v in save_dict.items(): 45 | tp, name = k.split(':', 1) 46 | if tp == 'arg': 47 | marg_params[name] = v 48 | if tp == 'aux': 49 | maux_params[name] = v 50 | return symbol, marg_params, maux_params 51 | 52 | 53 | def save_symbol(fname, symbol, slogger=None): 54 | if slogger is not None: 55 | slogger.info('Saving symbol into file at %s' % fname) 56 | symbol.save(fname) 57 | 58 | 59 | def save_params(fname, parg_params, paux_params, plogger=None): 60 | if plogger is not None: 61 | plogger.info('Saving params into file at %s' % fname) 62 | save_dict = {('arg:%s' % k): v.as_in_context(mx.cpu()) for k, v in parg_params.items()} 63 | save_dict.update({('aux:%s' % k): v.as_in_context(mx.cpu()) for k, v in paux_params.items()}) 64 | mx.nd.save(fname, save_dict) 65 | 66 | def load_object(filename): 67 | with open(filename, 'rb') as input: 68 | return pickle.load(input) 69 | if __name__ == '__main__': 70 | parser = argparse.ArgumentParser(description='Generate a calibrated quantized model from a FP32 model') 71 | parser.add_argument('--ctx', type=str, default='cpu') 72 | 73 | parser.add_argument('--batch-size', type=int, default=10000) 74 | parser.add_argument('--label-name', type=str, default='softmax_label') 75 | parser.add_argument('--calib-dataset', type=str, default='data/adult.data', 76 | help='path of the calibration dataset') 77 | parser.add_argument('--num-calib-batches', type=int, default=162, 78 | help='number of batches for calibration') 79 | parser.add_argument('--exclude-first-conv', action='store_true', default=True, 80 | help='excluding quantizing the first conv layer since the' 81 | ' number of channels is usually not a multiple of 4 in that layer' 82 | ' which does not satisfy the requirement of cuDNN') 83 | parser.add_argument('--calib-mode', type=str, default='naive', 84 | help='calibration mode used for generating calibration table for the quantized symbol; supports' 85 | ' 1. none: no calibration will be used. The thresholds for quantization will be calculated' 86 | ' on the fly. This will result in inference speed slowdown and loss of accuracy' 87 | ' in general.' 88 | ' 2. naive: simply take min and max values of layer outputs as thresholds for' 89 | ' quantization. 
In general, the inference accuracy worsens with more examples used in' 90 | ' calibration. It is recommended to use `entropy` mode as it produces more accurate' 91 | ' inference results.' 92 | ' 3. entropy: calculate KL divergence of the fp32 output and quantized output for optimal' 93 | ' thresholds. This mode is expected to produce the best inference accuracy of all three' 94 | ' kinds of quantized models if the calibration dataset is representative enough of the' 95 | ' inference dataset.') 96 | parser.add_argument('--quantized-dtype', type=str, default='uint8', 97 | choices=['int8', 'uint8'], 98 | help='quantization destination data type for input data') 99 | args = parser.parse_args() 100 | 101 | if args.ctx == 'gpu': 102 | ctx = mx.gpu(0) 103 | elif args.ctx == 'cpu': 104 | ctx = mx.cpu(0) 105 | else: 106 | raise ValueError('ctx %s is not supported in this script' % args.ctx) 107 | 108 | logging.basicConfig() 109 | logger = logging.getLogger('logger') 110 | logger.setLevel(logging.INFO) 111 | 112 | # get batch size 113 | batch_size = args.batch_size 114 | logger.info('batch size = %d for calibration', batch_size) 115 | # get number of batches for calibration 116 | num_calib_batches = args.num_calib_batches 117 | 118 | calib_mode = args.calib_mode 119 | if calib_mode != 'none': 120 | logger.info('number of batches = %d for calibration', num_calib_batches) 121 | 122 | val_csr = load_object('val_csr.pkl') 123 | val_dns = load_object('val_dns.pkl') 124 | val_label = load_object('val_label.pkl') 125 | 126 | # creating data iterator 127 | data = mx.io.NDArrayIter({'csr_data': val_csr, 'dns_data': val_dns}, 128 | {'softmax_label': val_label}, batch_size, 129 | shuffle=True, last_batch_handle='discard') 130 | # loading model 131 | sym, arg_params, aux_params = load_model('checkpoint-symbol.json', 'checkpoint-0000.params', logger) 132 | calib_layer = lambda name: (name.find('fullyconnected') != -1 or name.find('FullyConnected') != -1) 133 | cqsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params, 134 | data_names=['csr_data', 'dns_data'], 135 | label_names=['softmax_label', ], 136 | ctx=ctx, 137 | calib_mode=calib_mode, calib_data=data, 138 | num_calib_examples=num_calib_batches*batch_size, 139 | calib_layer=calib_layer, quantized_dtype=args.quantized_dtype, 140 | calib_quantize_op=True, 141 | logger=logger) 142 | if calib_mode == 'entropy': 143 | suffix = '-quantized-%dbatches-entropy' % num_calib_batches 144 | elif calib_mode == 'naive': 145 | suffix = '-quantized-%dbatches-naive' % num_calib_batches 146 | else: 147 | raise ValueError('unknown calibration mode %s received, only supports `none`, `naive`, and `entropy`' 148 | % calib_mode) 149 | prefix = 'WD' 150 | sym_name = '%s-symbol.json' % (prefix + suffix) 151 | cqsym = cqsym.get_backend_symbol('MKLDNN_POST_FC_QUANTIZE') 152 | cqsym = cqsym.get_backend_symbol('MKLDNN_QFC_POST_RELU_FUSED') 153 | cqsym = cqsym.get_backend_symbol('MKLDNN_PARALLEL_EMBEDDING') 154 | save_symbol(sym_name, cqsym, logger) 155 | param_name = '%s-%04d.params' % (prefix + '-quantized', 0) 156 | save_params(param_name, qarg_params, aux_params, logger) 157 | -------------------------------------------------------------------------------- /mxnet/wide_deep_criteo/wd_gen_qsym_subgraph_update.py: -------------------------------------------------------------------------------- 1 | """Generate quantized graph based on original fp32 graph""" 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more 
contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | 19 | import argparse 20 | import os 21 | import logging 22 | #import ctypes 23 | import pickle 24 | from mxnet import nd 25 | import mxnet as mx 26 | from mxnet.contrib.quantization import quantize_model 27 | 28 | 29 | 30 | def load_model(symbol_file, param_file, mlogger=None): 31 | """load existing symbol model""" 32 | cur_path = os.path.dirname(os.path.realpath(__file__)) 33 | symbol_file_path = os.path.join(cur_path, symbol_file) 34 | if mlogger is not None: 35 | mlogger.info('Loading symbol from file %s' % symbol_file_path) 36 | symbol = mx.sym.load(symbol_file_path) 37 | 38 | param_file_path = os.path.join(cur_path, param_file) 39 | if mlogger is not None: 40 | mlogger.info('Loading params from file %s' % param_file_path) 41 | save_dict = nd.load(param_file_path) 42 | marg_params = {} 43 | maux_params = {} 44 | for k, v in save_dict.items(): 45 | tp, name = k.split(':', 1) 46 | if tp == 'arg': 47 | marg_params[name] = v 48 | if tp == 'aux': 49 | maux_params[name] = v 50 | return symbol, marg_params, maux_params 51 | 52 | 53 | def save_symbol(fname, symbol, slogger=None): 54 | if slogger is not None: 55 | slogger.info('Saving symbol into file at %s' % fname) 56 | symbol.save(fname) 57 | 58 | 59 | def save_params(fname, parg_params, paux_params, plogger=None): 60 | if plogger is not None: 61 | plogger.info('Saving params into file at %s' % fname) 62 | save_dict = {('arg:%s' % k): v.as_in_context(mx.cpu()) for k, v in parg_params.items()} 63 | save_dict.update({('aux:%s' % k): v.as_in_context(mx.cpu()) for k, v in paux_params.items()}) 64 | mx.nd.save(fname, save_dict) 65 | 66 | def load_object(filename): 67 | with open(filename, 'rb') as input: 68 | return pickle.load(input) 69 | if __name__ == '__main__': 70 | parser = argparse.ArgumentParser(description='Generate a calibrated quantized model from a FP32 model') 71 | parser.add_argument('--ctx', type=str, default='cpu') 72 | 73 | parser.add_argument('--batch-size', type=int, default=10000) 74 | parser.add_argument('--label-name', type=str, default='softmax_label') 75 | parser.add_argument('--calib-dataset', type=str, default='data/adult.data', 76 | help='path of the calibration dataset') 77 | parser.add_argument('--num-calib-batches', type=int, default=162, 78 | help='number of batches for calibration') 79 | parser.add_argument('--exclude-first-conv', action='store_true', default=True, 80 | help='excluding quantizing the first conv layer since the' 81 | ' number of channels is usually not a multiple of 4 in that layer' 82 | ' which does not satisfy the requirement of cuDNN') 83 | parser.add_argument('--calib-mode', type=str, default='naive', 84 | help='calibration mode used for generating calibration table for the quantized symbol; supports' 85 | ' 1. 
none: no calibration will be used. The thresholds for quantization will be calculated' 86 | ' on the fly. This will result in inference speed slowdown and loss of accuracy' 87 | ' in general.' 88 | ' 2. naive: simply take min and max values of layer outputs as thresholds for' 89 | ' quantization. In general, the inference accuracy worsens with more examples used in' 90 | ' calibration. It is recommended to use `entropy` mode as it produces more accurate' 91 | ' inference results.' 92 | ' 3. entropy: calculate KL divergence of the fp32 output and quantized output for optimal' 93 | ' thresholds. This mode is expected to produce the best inference accuracy of all three' 94 | ' kinds of quantized models if the calibration dataset is representative enough of the' 95 | ' inference dataset.') 96 | parser.add_argument('--quantized-dtype', type=str, default='uint8', 97 | choices=['int8', 'uint8'], 98 | help='quantization destination data type for input data') 99 | args = parser.parse_args() 100 | 101 | if args.ctx == 'gpu': 102 | ctx = mx.gpu(0) 103 | elif args.ctx == 'cpu': 104 | ctx = mx.cpu(0) 105 | else: 106 | raise ValueError('ctx %s is not supported in this script' % args.ctx) 107 | 108 | logging.basicConfig() 109 | logger = logging.getLogger('logger') 110 | logger.setLevel(logging.INFO) 111 | 112 | # get batch size 113 | batch_size = args.batch_size 114 | logger.info('batch size = %d for calibration', batch_size) 115 | # get number of batches for calibration 116 | num_calib_batches = args.num_calib_batches 117 | 118 | calib_mode = args.calib_mode 119 | if calib_mode != 'none': 120 | logger.info('number of batches = %d for calibration', num_calib_batches) 121 | 122 | val_csr = load_object('val_csr.pkl') 123 | val_dns = load_object('val_dns.pkl') 124 | val_label = load_object('val_label.pkl') 125 | 126 | # creating data iterator 127 | data = mx.io.NDArrayIter({'csr_data': val_csr, 'dns_data': val_dns}, 128 | {'softmax_label': val_label}, batch_size, 129 | shuffle=True, last_batch_handle='discard') 130 | # loading model 131 | sym, arg_params, aux_params = load_model('checkpoint-symbol.json', 'checkpoint-0000.params', logger) 132 | 133 | calib_layer = lambda name: (name.find('fullyconnected') != -1 or \ 134 | name.find('FullyConnected') != -1 or \ 135 | name.find('fully_connected') != -1 or \ 136 | name.find('concat0_output') != -1) 137 | sym = sym.get_backend_symbol('MKLDNN') 138 | excluded_sym_names = ['concat0', '_plus0'] 139 | cqsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params, 140 | data_names=['csr_data', 'dns_data'], 141 | label_names=['softmax_label', ], 142 | ctx=ctx, excluded_sym_names=excluded_sym_names, 143 | calib_mode=calib_mode, calib_data=data, 144 | num_calib_examples=num_calib_batches*batch_size, 145 | calib_layer=calib_layer, quantized_dtype=args.quantized_dtype, 146 | logger=logger) 147 | if calib_mode == 'entropy': 148 | suffix = '-quantized-%dbatches-entropy' % num_calib_batches 149 | elif calib_mode == 'naive': 150 | suffix = '-quantized-%dbatches-naive' % num_calib_batches 151 | else: 152 | raise ValueError('unknown calibration mode %s received, only supports `none`, `naive`, and `entropy`' 153 | % calib_mode) 154 | prefix = 'WD' 155 | sym_name = '%s-symbol.json' % (prefix + suffix) 156 | cqsym = cqsym.get_backend_symbol('MKLDNN_QUANTIZE') 157 | save_symbol(sym_name, cqsym, logger) 158 | param_name = '%s-%04d.params' % (prefix + '-quantized', 0) 159 | save_params(param_name, qarg_params, aux_params, logger) 160 | 
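Both quantization scripts end by saving a calibrated symbol (with the defaults above: `WD-quantized-162batches-naive-symbol.json`) and a params file (`WD-quantized-0000.params`). As a minimal sketch of how those saved artifacts might be scored afterwards, reusing the `load_model` and `load_object` helpers defined in this script, and assuming an MXNet build that provides the MKLDNN subgraph passes plus the `val_*.pkl` files on disk:

```python
import mxnet as mx

# Sketch: score the calibrated int8 model on the pickled validation set.
sym, arg_params, aux_params = load_model('WD-quantized-162batches-naive-symbol.json',
                                         'WD-quantized-0000.params')
data = mx.io.NDArrayIter({'csr_data': load_object('val_csr.pkl'),
                          'dns_data': load_object('val_dns.pkl')},
                         {'softmax_label': load_object('val_label.pkl')},
                         batch_size=10000, shuffle=False, last_batch_handle='discard')
mod = mx.mod.Module(symbol=sym, context=mx.cpu(),
                    data_names=['csr_data', 'dns_data'], label_names=['softmax_label'])
mod.bind(for_training=False, data_shapes=data.provide_data,
         label_shapes=data.provide_label)
mod.set_params(arg_params, aux_params)
acc = mx.metric.create('acc')
mod.score(data, acc)  # iterate the validation iterator and update the metric
print(acc.get())
```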
-------------------------------------------------------------------------------- /pytorch/imagenet/imagenet/README.md: -------------------------------------------------------------------------------- 1 | # ImageNet training in PyTorch 2 | 3 | This implements training of popular model architectures, such as ResNet, AlexNet, and VGG on the ImageNet dataset. 4 | 5 | ## Requirements 6 | 7 | - Install PyTorch ([pytorch.org](http://pytorch.org)) 8 | - `pip install -r requirements.txt` 9 | - Download the ImageNet dataset and move validation images to labeled subfolders 10 | - To do this, you can use the following script: https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh 11 | 12 | ## Training 13 | 14 | To train a model, run `main.py` with the desired model architecture and the path to the ImageNet dataset: 15 | 16 | ```bash 17 | python main.py -a resnet18 [imagenet-folder with train and val folders] 18 | ``` 19 | 20 | The default learning rate schedule starts at 0.1 and decays by a factor of 10 every 30 epochs. This is appropriate for ResNet and models with batch normalization, but too high for AlexNet and VGG. Use 0.01 as the initial learning rate for AlexNet or VGG: 21 | 22 | ```bash 23 | python main.py -a alexnet --lr 0.01 [imagenet-folder with train and val folders] 24 | ``` 25 | 26 | ## Multi-processing Distributed Data Parallel Training ON GPU 27 | 28 | You should always use the NCCL backend for multi-processing distributed training since it currently provides the best distributed training performance. 29 | 30 | ### Single node, multiple GPUs: 31 | 32 | ```bash 33 | python main.py -a resnet50 --dist-url 'tcp://127.0.0.1:FREEPORT' --dist-backend 'nccl' --multiprocessing-distributed --world-size 1 --rank 0 [imagenet-folder with train and val folders] 34 | ``` 35 | 36 | ### Multiple nodes: 37 | 38 | Node 0: 39 | ```bash 40 | python main.py -a resnet50 --dist-url 'tcp://IP_OF_NODE0:FREEPORT' --dist-backend 'nccl' --multiprocessing-distributed --world-size 2 --rank 0 [imagenet-folder with train and val folders] 41 | ``` 42 | 43 | Node 1: 44 | ```bash 45 | python main.py -a resnet50 --dist-url 'tcp://IP_OF_NODE0:FREEPORT' --dist-backend 'nccl' --multiprocessing-distributed --world-size 2 --rank 1 [imagenet-folder with train and val folders] 46 | ``` 47 | 48 | ## Multi-processing Distributed Data Parallel Training ON CPU 49 | 50 | ### One node, 2 instances: 51 | ```bash 52 | python main.py -a resnet18 --dist-url 'tcp://192.168.20.11:22384' --dist-backend 'gloo' --ppn 2 --world-size 1 --rank 0 -b 128 --mkldnn --multiprocessing-distributed /lustre/dataset/imagenet/img/ 53 | ``` 54 | ### One node (with two sockets), 2 instances (adjust the num_threads in the running script): 55 | ```bash 56 | ./run_socket.sh 57 | ``` 58 | ### Two nodes, 2 instances on each: 59 | 60 | Node 1: 61 | ```bash 62 | python main.py -a resnet18 --dist-url 'tcp://192.168.20.11:22384' --dist-backend 'gloo' --ppn 2 --world-size 2 --rank 0 -b 128 --mkldnn --multiprocessing-distributed /lustre/dataset/imagenet/img/ 63 | ``` 64 | 65 | Node 2: 66 | ```bash 67 | python main.py -a resnet18 --dist-url 'tcp://192.168.20.11:22384' --dist-backend 'gloo' --ppn 2 --world-size 2 --rank 1 -b 128 --mkldnn --multiprocessing-distributed /lustre/dataset/imagenet/img/ 68 | ``` 69 | 70 | ## INT8 inference 71 | 72 | The resnet50 and resnext101 models are currently supported. 
73 | Run `main.py` with the desired model architecture and the path to the ImageNet dataset: 74 | 75 | ```bash 76 | python -u main.py -e -j $workers -a resnet50 -b 16 --INT8 "INT8_only" -qs "perChannel" --iter-calib 2500 -w 50 -qe "fbgemm" -i 100 [imagenet-folder with train and val folders] 77 | ``` 78 | 79 | ## Usage 80 | 81 | ``` 82 | usage: main.py [-h] [-a ARCH] [-j N] [--epochs N] [--start-epoch N] [-b N] 83 | [--lr LR] [--momentum M] [--wd W] [-p N] [--resume PATH] [-e] 84 | [--pretrained] [--world-size WORLD_SIZE] [--rank RANK] 85 | [--ppn PPN] [--dist-url DIST_URL] [--dist-backend DIST_BACKEND] 86 | [--seed SEED] [--gpu GPU] [--multiprocessing-distributed] 87 | [--mkldnn] [--no-cuda] [-i N] [--iter-calib N] [-qe QENGINE] 88 | [-w N] [--INT8 INT8] [-t] [-qs QSCHEME] [-r] 89 | DIR 90 | 91 | PyTorch ImageNet Training 92 | 93 | positional arguments: 94 | DIR path to dataset 95 | 96 | optional arguments: 97 | -h, --help show this help message and exit 98 | -a ARCH, --arch ARCH model architecture: alexnet | densenet121 | 99 | densenet161 | densenet169 | densenet201 | googlenet | 100 | inception_v3 | mnasnet0_5 | mnasnet0_75 | mnasnet1_0 | 101 | mnasnet1_3 | mobilenet_v2 | resnet101 | resnet152 | 102 | resnet18 | resnet34 | resnet50 | resnext101_32x8d | 103 | resnext50_32x4d | shufflenet_v2_x0_5 | 104 | shufflenet_v2_x1_0 | shufflenet_v2_x1_5 | 105 | shufflenet_v2_x2_0 | squeezenet1_0 | squeezenet1_1 | 106 | vgg11 | vgg11_bn | vgg13 | vgg13_bn | vgg16 | vgg16_bn 107 | | vgg19 | vgg19_bn | wide_resnet101_2 | 108 | wide_resnet50_2 (default: resnet18) 109 | -j N, --workers N number of data loading workers (default: 4) 110 | --epochs N number of total epochs to run 111 | --start-epoch N manual epoch number (useful on restarts) 112 | -b N, --batch-size N mini-batch size (default: 256), this is the total 113 | batch size of all GPUs on the current node when using 114 | Data Parallel or Distributed Data Parallel 115 | --lr LR, --learning-rate LR 116 | initial learning rate 117 | --momentum M momentum 118 | --wd W, --weight-decay W 119 | weight decay (default: 1e-4) 120 | -p N, --print-freq N print frequency (default: 10) 121 | --resume PATH path to latest checkpoint (default: none) 122 | -e, --evaluate evaluate model on validation set 123 | --pretrained use pre-trained model 124 | --world-size WORLD_SIZE 125 | number of nodes for distributed training 126 | --rank RANK node rank for distributed training 127 | --ppn PPN number of processes on each node of distributed 128 | training 129 | --dist-url DIST_URL url used to set up distributed training 130 | --dist-backend DIST_BACKEND 131 | distributed backend 132 | --seed SEED seed for initializing training. 133 | --gpu GPU GPU id to use. 134 | --multiprocessing-distributed 135 | Use multi-processing distributed training to launch N 136 | processes per node, which has N GPUs. This is the 137 | fastest way to use PyTorch for either single node or 138 | multi node data parallel training 139 | --mkldnn use mkldnn weight cache 140 | --no-cuda disable CUDA 141 | -i N, --iterations N number of total iterations to run 142 | --iter-calib N number of iterations when calibration to run 143 | -qe QENGINE, --qengine QENGINE 144 | Choose qengine to run. "all", "fbgemm" or 145 | "mkldnn".(DEFAULT: all) 146 | -w N, --warmup-iterations N 147 | number of warmup iterations to run 148 | --INT8 INT8 Choose run mode. "no_INT8", "calibration_olny", 149 | "INT8_only", "INT8_and_fp32".(DEFAULT: no_INT8) 150 | -t, --profile Trigger profile on current topology. 
151 | -qs QSCHEME, --qscheme QSCHEME 152 | The scheme of quantizer:"perTensor", "perChannel" 153 | -r, --reduce_range Choose reduce range flag. True or False. 154 | ``` 155 | ## Tips 156 | 157 | To get better performance with the MKLDNN backend, use a better memory allocator such as TCmalloc or Jemalloc. 158 | ### How to use TCmalloc 159 | 1. Install TCmalloc: 160 | ``` 161 | git clone https://github.com/gperftools/gperftools.git 162 | ./autogen.sh 163 | ./configure 164 | make 165 | make check  # optional 166 | make install 167 | make clean 168 | ``` 169 | 2. Use TCmalloc: 170 | `export LD_PRELOAD=/lib/libtcmalloc.so` 171 | 3. Fine-tune: 172 | https://gperftools.github.io/gperftools/tcmalloc.html 173 | 174 | ### How to use Jemalloc 175 | 1. Install Jemalloc: 176 | https://github.com/jemalloc/jemalloc/blob/dev/INSTALL.md 177 | 2. Use Jemalloc: 178 | `export LD_PRELOAD=/lib/libjemalloc.so` 179 | 3. Fine-tune: 180 | https://github.com/jemalloc/jemalloc/blob/dev/TUNING.md 181 | -------------------------------------------------------------------------------- /pytorch/benchmark_tools/common/common_mlperf.py: -------------------------------------------------------------------------------- 1 | """ 2 | dataset related classes and methods 3 | """ 4 | 5 | # pylint: disable=unused-argument,missing-docstring 6 | 7 | import logging 8 | import sys 9 | import time 10 | 11 | import numpy as np 12 | 13 | from PIL import Image 14 | 15 | logging.basicConfig(level=logging.INFO) 16 | log = logging.getLogger("dataset") 17 | 18 | class Item(): 19 | def __init__(self, label, img, idx): 20 | self.label = label 21 | self.img = img 22 | self.idx = idx 23 | self.start = time.time() 24 | 25 | 26 | def usleep(sec): 27 | if sys.platform == 'win32': 28 | # on Windows time.sleep() doesn't work too well 29 | import ctypes 30 | kernel32 = ctypes.windll.kernel32 31 | timer = kernel32.CreateWaitableTimerA(ctypes.c_void_p(), True, ctypes.c_void_p()) 32 | delay = ctypes.c_longlong(int(-1 * (10 * 1000000 * sec))) 33 | kernel32.SetWaitableTimer(timer, ctypes.byref(delay), 0, ctypes.c_void_p(), ctypes.c_void_p(), False) 34 | kernel32.WaitForSingleObject(timer, 0xffffffff) 35 | else: 36 | time.sleep(sec) 37 | 38 | 39 | class Dataset(): 40 | def __init__(self): 41 | self.arrival = None 42 | self.image_list = [] 43 | self.label_list = [] 44 | self.image_list_inmemory = {} 45 | self.last_loaded = -1 46 | 47 | def preprocess(self, use_cache=True): 48 | raise NotImplementedError("Dataset:preprocess") 49 | 50 | def get_item_count(self): 51 | return len(self.image_list) 52 | 53 | def get_list(self): 54 | raise NotImplementedError("Dataset:get_list") 55 | 56 | def load_query_samples(self, sample_list): 57 | self.image_list_inmemory = {} 58 | for sample in sample_list: 59 | self.image_list_inmemory[sample], _ = self.get_item(sample) 60 | self.last_loaded = time.time() 61 | 62 | def unload_query_samples(self, sample_list): 63 | if sample_list: 64 | for sample in sample_list: 65 | if sample in self.image_list_inmemory: 66 | del self.image_list_inmemory[sample] 67 | else: 68 | self.image_list_inmemory = {} 69 | 70 | def get_samples(self, id_list): 71 | data = np.array([self.image_list_inmemory[id] for id in id_list]) 72 | return data, self.label_list[id_list] 73 | 74 | def get_item_loc(self, id): 75 | raise NotImplementedError("Dataset:get_item_loc") 76 | 77 | 78 | # 79 | # Post processing 80 | # 81 | class PostProcessCommon: 82 | def __init__(self, offset=0): 83 | self.offset = offset 84 | self.good = 0 85 | self.total = 0 86 | 87 | def 
__call__(self, results, ids, expected=None, result_dict=None): 88 | processed_results = [] 89 | n = len(results[0]) 90 | for idx in range(0, n): 91 | result = results[0][idx] + self.offset 92 | processed_results.append([result]) 93 | if result == expected[idx]: 94 | self.good += 1 95 | self.total += n 96 | return processed_results 97 | 98 | def add_results(self, results): 99 | pass 100 | 101 | def start(self): 102 | self.good = 0 103 | self.total = 0 104 | 105 | def finalize(self, results, ds=False, output_dir=None): 106 | results["good"] = self.good 107 | results["total"] = self.total 108 | 109 | 110 | class PostProcessArgMax: 111 | def __init__(self, offset=0): 112 | self.offset = offset 113 | self.good = 0 114 | self.total = 0 115 | 116 | def __call__(self, results, ids, expected=None, result_dict=None): 117 | processed_results = [] 118 | results = np.argmax(results[0], axis=1) 119 | n = results.shape[0] 120 | for idx in range(0, n): 121 | result = results[idx] + self.offset 122 | processed_results.append([result]) 123 | if result == expected[idx]: 124 | self.good += 1 125 | self.total += n 126 | return processed_results 127 | 128 | def add_results(self, results): 129 | pass 130 | 131 | def start(self): 132 | self.good = 0 133 | self.total = 0 134 | 135 | def finalize(self, results, ds=False, output_dir=None): 136 | results["good"] = self.good 137 | results["total"] = self.total 138 | 139 | 140 | # 141 | # pre-processing 142 | # 143 | 144 | def center_crop(img, out_height, out_width): 145 | width, height = img.size 146 | left = (width - out_width) / 2 147 | right = (width + out_width) / 2 148 | top = (height - out_height) / 2 149 | bottom = (height + out_height) / 2 150 | img = img.crop((left, top, right, bottom)) 151 | return img 152 | 153 | 154 | def resize_with_aspectratio(img, out_height, out_width, scale=87.5): 155 | width, height = img.size 156 | new_height = int(100. * out_height / scale) 157 | new_width = int(100. 
* out_width / scale) 158 | if height > width: 159 | w = new_width 160 | h = int(new_height * height / width) 161 | else: 162 | h = new_height 163 | w = int(new_width * width / height) 164 | img = img.resize((w, h), Image.BILINEAR) 165 | return img 166 | 167 | 168 | def pre_process_vgg(img, dims=None, need_transpose=False): 169 | if img.mode != 'RGB': 170 | img = img.convert('RGB') 171 | 172 | output_height, output_width, _ = dims 173 | 174 | img = resize_with_aspectratio(img, output_height, output_width) 175 | img = center_crop(img, output_height, output_width) 176 | img = np.asarray(img, dtype='float32') 177 | 178 | # normalize image 179 | means = np.array([123.68, 116.78, 103.94], dtype=np.float32) 180 | img -= means 181 | # transpose if needed 182 | if need_transpose: 183 | img = img.transpose([2, 0, 1]) 184 | return img 185 | 186 | 187 | def pre_process_mobilenet(img, dims=None, need_transpose=False): 188 | if img.mode != 'RGB': 189 | img = img.convert('RGB') 190 | 191 | output_height, output_width, _ = dims 192 | 193 | img = resize_with_aspectratio(img, output_height, output_width) 194 | img = center_crop(img, output_height, output_width) 195 | img = np.asarray(img, dtype='float32') 196 | 197 | img /= 255.0 198 | img -= 0.5 199 | img *= 2 200 | 201 | # transpose if needed 202 | if need_transpose: 203 | img = img.transpose([2, 0, 1]) 204 | return img 205 | 206 | 207 | def pre_process_coco_mobilenet(img, dims=None, need_transpose=False): 208 | if img.mode != 'RGB': 209 | img = img.convert('RGB') 210 | 211 | img_data = np.array(img.getdata()) 212 | img_data = img_data.astype(np.uint8) 213 | (im_width, im_height) = img.size 214 | img = img_data.reshape(im_height, im_width, 3) 215 | # transpose if needed 216 | if need_transpose: 217 | img = img.transpose([2, 0, 1]) 218 | return img 219 | 220 | def pre_process_coco_pt_mobilenet(img, dims=None, need_transpose=False): 221 | if img.mode != 'RGB': 222 | img = img.convert('RGB') 223 | 224 | img_data = np.array(img.getdata()) 225 | img_data = img_data.astype(np.float32) 226 | (im_width, im_height) = img.size 227 | img = img_data.reshape(im_height, im_width, 3) 228 | img -= 127.5 229 | img /= 127.5 230 | # transpose if needed 231 | if need_transpose: 232 | img = img.transpose([2, 0, 1]) 233 | return img 234 | 235 | def pre_process_coco_resnet34(img, dims=None, need_transpose=False): 236 | if img.mode != 'RGB': 237 | img = img.convert('RGB') 238 | 239 | if dims != None: 240 | im_height, im_width, _ = dims 241 | img = img.resize((im_width, im_height), Image.BILINEAR) 242 | mean = np.array([0.485, 0.456, 0.406], dtype=np.float32) 243 | std = np.array([0.229, 0.224, 0.225], dtype=np.float32) 244 | img_data = np.array(img.getdata(), dtype=np.float32) 245 | (im_width, im_height) = img.size 246 | img = img_data.reshape(im_height, im_width, 3) 247 | img = img / 255. 
- mean 248 | img = img / std 249 | if need_transpose: 250 | img = img.transpose([2, 0, 1]) 251 | 252 | return img 253 | 254 | def pre_process_coco_resnet34_tf(img, dims=None, need_transpose=False): 255 | if img.mode != 'RGB': 256 | img = img.convert('RGB') 257 | 258 | if dims != None: 259 | im_height, im_width, _ = dims 260 | img = img.resize((im_width, im_height), Image.BILINEAR) # PIL.Image.BILINEAR 2 261 | mean = np.array([123.68, 116.78, 103.94], dtype=np.float32) 262 | img_data = np.array(img.getdata(), dtype=np.float32) 263 | (im_width, im_height) = img.size 264 | img = img_data.reshape(im_height, im_width, 3) 265 | img = img - mean 266 | if need_transpose: 267 | img = img.transpose([2, 0, 1]) 268 | 269 | return img 270 | -------------------------------------------------------------------------------- /pytorch/benchmark_tools/inference/calibration_int8.py: -------------------------------------------------------------------------------- 1 | """ 2 | module to run calibration 3 | """ 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | from __future__ import unicode_literals 8 | 9 | import sys 10 | import os 11 | import logging 12 | import numpy as np 13 | from caffe2.proto import caffe2_pb2 14 | from caffe2.python import core, workspace 15 | from caffe2.python import transformations as tf 16 | import inference.models as m 17 | from common import common_caffe2 as cc2 18 | 19 | def Calibration(args, extra_args): 20 | """ 21 | function to run calibration 22 | """ 23 | 24 | if not m.IsSupported(args.model): 25 | logging.error("Not supported model: {}".format(args.model)) 26 | m.ShowModels() 27 | return 28 | 29 | images_path = None 30 | if args.images_path: 31 | images_path = os.path.abspath(args.images_path) 32 | elif "CAFFE2_INF_IMG_PATH" in os.environ: 33 | images_path = os.path.abspath(os.environ["CAFFE2_INF_IMG_PATH"]) 34 | 35 | batch_size = 1 36 | if args.batch_size: 37 | batch_size = int(args.batch_size) 38 | if batch_size <= 0: 39 | logging.error("Invalid batch size {}. 
Exit!".format(batch_size)) 40 | return 41 | 42 | iterations = args.iterations if args.iterations else sys.maxsize 43 | logging.warning("Run Caffe2 in inference mode with args:\n{}" 44 | .format(vars(args))) 45 | 46 | model_info = m.GetModelInfo(args.model) 47 | logging.warning("The inference inputs of {0} model:\n{1}" 48 | .format( 49 | args.model, 50 | {str(k): str(v) for k, v in model_info.items()} 51 | )) 52 | 53 | crop_size = int(model_info["crop_size"]) 54 | if args.crop_size: 55 | crop_size = args.crop_size 56 | 57 | need_normalize = False 58 | if model_info["need_normalize"]: 59 | need_normalize = True 60 | 61 | mean = 128 62 | if str(model_info["image_mean"]) != 'None': 63 | mean_tmp = ((model_info["image_mean"]).split('/')[-1]).split(' ') 64 | if need_normalize: 65 | mean = np.zeros([3, crop_size, crop_size], dtype=np.float) 66 | mean[0, :, :] = float(mean_tmp[0]) # 104 67 | mean[1, :, :] = float(mean_tmp[1]) # 117 68 | mean[2, :, :] = float(mean_tmp[2]) # 124 69 | 70 | else: 71 | mean = np.zeros([3, crop_size, crop_size], dtype=np.int32) 72 | mean[0, :, :] = int(mean_tmp[0]) # 104 73 | mean[1, :, :] = int(mean_tmp[1]) # 117 74 | mean[2, :, :] = int(mean_tmp[2]) # 124 75 | 76 | scale = [1] 77 | if str(model_info["scale"]) != '': 78 | scale = (model_info["scale"]).split(' ') 79 | rescale_size = 256 80 | if str(model_info["rescale_size"]) != '': 81 | rescale_size = int(model_info["rescale_size"]) 82 | color_format = "BGR" 83 | if str(model_info["color_format"]) != '': 84 | color_format = model_info["color_format"] 85 | if args.onnx_model: 86 | init_def, predict_def = cc2.OnnxToCaffe2(model_info["onnx_model"]) 87 | else: 88 | with open(model_info["init_net"], 'rb') as i: 89 | if model_info["model_type"] == "prototext" or \ 90 | model_info["init_net"].split('.')[-1] == "pbtxt": 91 | import google.protobuf.text_format as ptxt 92 | init_def = ptxt.Parse(i.read(), caffe2_pb2.NetDef()) 93 | else: 94 | init_def = caffe2_pb2.NetDef() 95 | init_def.ParseFromString(i.read()) 96 | with open(model_info["predict_net"], 'rb') as p: 97 | if model_info["model_type"] == "prototext" or \ 98 | model_info["predict_net"].split('.')[-1] == "pbtxt": 99 | import google.protobuf.text_format as ptxt 100 | predict_def = ptxt.Parse(p.read(), caffe2_pb2.NetDef()) 101 | else: 102 | predict_def = caffe2_pb2.NetDef() 103 | predict_def.ParseFromString(p.read()) 104 | 105 | if model_info["model_type"] == "caffe legacy": 106 | cc2.MergeScaleBiasInBN(predict_def) 107 | cc2.RemoveUselessExternalInput(predict_def) 108 | 109 | dev_map = { 110 | "cpu": caffe2_pb2.CPU, 111 | "gpu": caffe2_pb2.CUDA, 112 | "cuda": caffe2_pb2.CUDA, 113 | "mkldnn": caffe2_pb2.MKLDNN, 114 | "opengl": caffe2_pb2.OPENGL, 115 | "opencl": caffe2_pb2.OPENCL, 116 | "ideep": caffe2_pb2.IDEEP, 117 | } 118 | device_opts = caffe2_pb2.DeviceOption() 119 | if args.device.lower() in dev_map: 120 | device_opts.device_type = dev_map[args.device.lower()] 121 | else: 122 | logging.error("Wrong device {}. Exit!".format(args.device)) 123 | return 124 | 125 | logging.warning("Start running calibration") 126 | 127 | if args.calibration_file: 128 | images, _ = cc2.ImageProc.BatchImagesByName(images_path, args.calibration_file, batch_size, iterations) 129 | else: 130 | images, _ = cc2.ImageProc.BatchImages(images_path, batch_size, iterations) 131 | # for kl_divergence calibration, we use the first 100 images to get 132 | # the min and max values, and the remaining images are used to compute the histogram. 
133 | # if the len(images) <= 100, we extend the images with themselves. 134 | def data_gen(): 135 | images_calib = images 136 | if args.single_iter_calib: 137 | images_calib = [images[args.iter_calib]] 138 | for raw in images_calib: 139 | if model_info["model_type"] == "mlperf legacy vgg": 140 | imgs, oshape = cc2.ImageProc.PreprocessImagesMLPerfVGG(raw) 141 | elif model_info["model_type"] == "mlperf legacy mb": 142 | imgs, oshape = cc2.ImageProc.PreprocessImagesMLPerfMB(raw) 143 | else: 144 | imgs, _ = cc2.ImageProc.PreprocessImages( 145 | raw, crop_size, rescale_size, mean, scale, 1, need_normalize, color_format) 146 | #imgs, _ = cc2.ImageProc.PreprocessImagesByThreading( 147 | # raw, crop_size,rescale_size, mean, scale, 1) 148 | yield imgs 149 | del imgs 150 | 151 | cc2.UpdateDeviceOption(device_opts, init_def) 152 | workspace.RunNetOnce(init_def) 153 | 154 | cc2.UpdateDeviceOption(device_opts, predict_def) 155 | net = core.Net(model_info["model_name"]) 156 | net.Proto().CopyFrom(predict_def) 157 | if args.device.lower() == 'ideep' and not args.noptimize: 158 | logging.warning('Optimizing module {} ....................' 159 | .format(model_info["model_name"])) 160 | tf.optimizeForMKLDNN(net) 161 | predict_def = net.Proto() 162 | if predict_def.op[-1].type == 'Accuracy': 163 | init_label = np.ones((batch_size), dtype=np.int32) 164 | label = net.AddExternalInput('label') 165 | workspace.FeedBlob(label, init_label, device_opts) 166 | for i, op in enumerate(predict_def.op): 167 | if op.type == 'Accuracy': 168 | workspace.FeedBlob(str(predict_def.op[i].output[0]), init_label, device_opts) 169 | 170 | from inference.calibrator import Calibrator, KLCalib, AbsmaxCalib, EMACalib 171 | algorithm = AbsmaxCalib() 172 | kind = os.environ.get('INT8CALIB') 173 | if args.calib_algo: 174 | kind = args.calib_algo 175 | if kind == "absmax": 176 | algorithm = AbsmaxCalib() 177 | elif kind == "moving_average": 178 | ema_alpha = 0.5 179 | algorithm = EMACalib(ema_alpha) 180 | elif kind == "kl_divergence": 181 | kl_iter_num_for_range = 500 182 | while len(images) < 2*kl_iter_num_for_range: 183 | images += images 184 | algorithm = KLCalib(kl_iter_num_for_range) 185 | logging.warning('Use {} calibration method....................'.format(kind)) 186 | 187 | i = 0 188 | length = len(images) 189 | calib = Calibrator(algorithm, device_opts) 190 | for data in data_gen(): 191 | i += 1 192 | workspace.FeedBlob(predict_def.op[0].input[0], data, device_opts) 193 | logging.warning("in progress {}/{}(batch/batch total)".format(i, length)) 194 | calib.RunCalibIter(workspace, predict_def) 195 | 196 | predict_quantized, init_quantized = calib.DepositQuantizedModule(workspace, predict_def) 197 | 198 | cc2.SaveModel(args.output_file + '/init_net_int8.pb', init_quantized, 199 | args.output_file + '/predict_net_int8.pb', predict_quantized) 200 | cc2.SaveModelPtxt(args.output_file + '/predict_net_int8.pbtxt', predict_quantized) 201 | cc2.SaveModelPtxt(args.output_file + '/init_net_int8.pbtxt', init_quantized) 202 | 203 | 204 | if __name__ == '__main__': 205 | logging.critical("Do not run this script independently!") 206 | exit() 207 | -------------------------------------------------------------------------------- /mxnet/wide_deep_criteo/update_model/embedding-fuse.json: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": [ 3 | { 4 | "op": "null", 5 | "name": "csr_data", 6 | "attrs": {"__storage_type__": "2"}, 7 | "inputs": [] 8 | }, 9 | { 10 | "op": "null", 11 | "name": 
"linear_weight", 12 | "attrs": { 13 | "__init__": "[\"normal\", {\"sigma\": 0.01}]", 14 | "__shape__": "(26000, 256)", 15 | "__storage_type__": "1" 16 | }, 17 | "inputs": [] 18 | }, 19 | { 20 | "op": "dot", 21 | "name": "dot0", 22 | "inputs": [[0, 0, 0], [1, 0, 0]] 23 | }, 24 | { 25 | "op": "null", 26 | "name": "linear_bias", 27 | "attrs": {"__shape__": "(256,)"}, 28 | "inputs": [] 29 | }, 30 | { 31 | "op": "broadcast_add", 32 | "name": "broadcast_add0", 33 | "inputs": [[2, 0, 0], [3, 0, 0]] 34 | }, 35 | { 36 | "op": "null", 37 | "name": "dns_data", 38 | "inputs": [] 39 | }, 40 | { 41 | "op": "null", 42 | "name": "embed_0_weight", 43 | "attrs": {"__storage_type__": "1"}, 44 | "inputs": [] 45 | }, 46 | { 47 | "op": "null", 48 | "name": "embed_1_weight", 49 | "attrs": {"__storage_type__": "1"}, 50 | "inputs": [] 51 | }, 52 | { 53 | "op": "null", 54 | "name": "embed_2_weight", 55 | "attrs": {"__storage_type__": "1"}, 56 | "inputs": [] 57 | }, 58 | { 59 | "op": "null", 60 | "name": "embed_3_weight", 61 | "attrs": {"__storage_type__": "1"}, 62 | "inputs": [] 63 | }, 64 | { 65 | "op": "null", 66 | "name": "embed_4_weight", 67 | "attrs": {"__storage_type__": "1"}, 68 | "inputs": [] 69 | }, 70 | { 71 | "op": "null", 72 | "name": "embed_5_weight", 73 | "attrs": {"__storage_type__": "1"}, 74 | "inputs": [] 75 | }, 76 | { 77 | "op": "null", 78 | "name": "embed_6_weight", 79 | "attrs": {"__storage_type__": "1"}, 80 | "inputs": [] 81 | }, 82 | { 83 | "op": "null", 84 | "name": "embed_7_weight", 85 | "attrs": {"__storage_type__": "1"}, 86 | "inputs": [] 87 | }, 88 | { 89 | "op": "null", 90 | "name": "embed_8_weight", 91 | "attrs": {"__storage_type__": "1"}, 92 | "inputs": [] 93 | }, 94 | { 95 | "op": "null", 96 | "name": "embed_9_weight", 97 | "attrs": {"__storage_type__": "1"}, 98 | "inputs": [] 99 | }, 100 | { 101 | "op": "null", 102 | "name": "embed_10_weight", 103 | "attrs": {"__storage_type__": "1"}, 104 | "inputs": [] 105 | }, 106 | { 107 | "op": "null", 108 | "name": "embed_11_weight", 109 | "attrs": {"__storage_type__": "1"}, 110 | "inputs": [] 111 | }, 112 | { 113 | "op": "null", 114 | "name": "embed_12_weight", 115 | "attrs": {"__storage_type__": "1"}, 116 | "inputs": [] 117 | }, 118 | { 119 | "op": "null", 120 | "name": "embed_13_weight", 121 | "attrs": {"__storage_type__": "1"}, 122 | "inputs": [] 123 | }, 124 | { 125 | "op": "null", 126 | "name": "embed_14_weight", 127 | "attrs": {"__storage_type__": "1"}, 128 | "inputs": [] 129 | }, 130 | { 131 | "op": "null", 132 | "name": "embed_15_weight", 133 | "attrs": {"__storage_type__": "1"}, 134 | "inputs": [] 135 | }, 136 | { 137 | "op": "null", 138 | "name": "embed_16_weight", 139 | "attrs": {"__storage_type__": "1"}, 140 | "inputs": [] 141 | }, 142 | { 143 | "op": "null", 144 | "name": "embed_17_weight", 145 | "attrs": {"__storage_type__": "1"}, 146 | "inputs": [] 147 | }, 148 | { 149 | "op": "null", 150 | "name": "embed_18_weight", 151 | "attrs": {"__storage_type__": "1"}, 152 | "inputs": [] 153 | }, 154 | { 155 | "op": "null", 156 | "name": "embed_19_weight", 157 | "attrs": {"__storage_type__": "1"}, 158 | "inputs": [] 159 | }, 160 | { 161 | "op": "null", 162 | "name": "embed_20_weight", 163 | "attrs": {"__storage_type__": "1"}, 164 | "inputs": [] 165 | }, 166 | { 167 | "op": "null", 168 | "name": "embed_21_weight", 169 | "attrs": {"__storage_type__": "1"}, 170 | "inputs": [] 171 | }, 172 | { 173 | "op": "null", 174 | "name": "embed_22_weight", 175 | "attrs": {"__storage_type__": "1"}, 176 | "inputs": [] 177 | }, 178 | { 179 | "op": "null", 
180 | "name": "embed_23_weight", 181 | "attrs": {"__storage_type__": "1"}, 182 | "inputs": [] 183 | }, 184 | { 185 | "op": "null", 186 | "name": "embed_24_weight", 187 | "attrs": {"__storage_type__": "1"}, 188 | "inputs": [] 189 | }, 190 | { 191 | "op": "null", 192 | "name": "embed_25_weight", 193 | "attrs": {"__storage_type__": "1"}, 194 | "inputs": [] 195 | }, 196 | { 197 | "op": "SliceSplitEmbeddingConcatFuse", 198 | "name": "SliceSplitEmbeddingConcatFuse_0", 199 | "attrs": { 200 | "concat_dim": "1", 201 | "cont_begin": "(0,26)", 202 | "cont_end": "(None,39)", 203 | "embed_begin": "(0,0)", 204 | "embed_end": "(None,26)", 205 | "input_dims": "[1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000]", 206 | "num_outputs": "26", 207 | "output_dims": "[32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]", 208 | "squeeze_axis": "1" 209 | }, 210 | "inputs": [ 211 | [5, 0, 0], 212 | [6, 0, 0], 213 | [7, 0, 0], 214 | [8, 0, 0], 215 | [9, 0, 0], 216 | [10, 0, 0], 217 | [11, 0, 0], 218 | [12, 0, 0], 219 | [13, 0, 0], 220 | [14, 0, 0], 221 | [15, 0, 0], 222 | [16, 0, 0], 223 | [17, 0, 0], 224 | [18, 0, 0], 225 | [19, 0, 0], 226 | [20, 0, 0], 227 | [21, 0, 0], 228 | [22, 0, 0], 229 | [23, 0, 0], 230 | [24, 0, 0], 231 | [25, 0, 0], 232 | [26, 0, 0], 233 | [27, 0, 0], 234 | [28, 0, 0], 235 | [29, 0, 0], 236 | [30, 0, 0], 237 | [31, 0, 0] 238 | ] 239 | }, 240 | { 241 | "op": "null", 242 | "name": "fullyconnected0_weight", 243 | "attrs": {"num_hidden": "1024"}, 244 | "inputs": [] 245 | }, 246 | { 247 | "op": "null", 248 | "name": "fullyconnected0_bias", 249 | "attrs": {"num_hidden": "1024"}, 250 | "inputs": [] 251 | }, 252 | { 253 | "op": "FullyConnected", 254 | "name": "fullyconnected0", 255 | "attrs": {"num_hidden": "1024"}, 256 | "inputs": [[32, 0, 0], [33, 0, 0], [34, 0, 0]] 257 | }, 258 | { 259 | "op": "Activation", 260 | "name": "activation0", 261 | "attrs": {"act_type": "relu"}, 262 | "inputs": [[35, 0, 0]] 263 | }, 264 | { 265 | "op": "null", 266 | "name": "fullyconnected1_weight", 267 | "attrs": {"num_hidden": "512"}, 268 | "inputs": [] 269 | }, 270 | { 271 | "op": "null", 272 | "name": "fullyconnected1_bias", 273 | "attrs": {"num_hidden": "512"}, 274 | "inputs": [] 275 | }, 276 | { 277 | "op": "FullyConnected", 278 | "name": "fullyconnected1", 279 | "attrs": {"num_hidden": "512"}, 280 | "inputs": [[36, 0, 0], [37, 0, 0], [38, 0, 0]] 281 | }, 282 | { 283 | "op": "Activation", 284 | "name": "activation1", 285 | "attrs": {"act_type": "relu"}, 286 | "inputs": [[39, 0, 0]] 287 | }, 288 | { 289 | "op": "null", 290 | "name": "fullyconnected2_weight", 291 | "attrs": {"num_hidden": "256"}, 292 | "inputs": [] 293 | }, 294 | { 295 | "op": "null", 296 | "name": "fullyconnected2_bias", 297 | "attrs": {"num_hidden": "256"}, 298 | "inputs": [] 299 | }, 300 | { 301 | "op": "FullyConnected", 302 | "name": "fullyconnected2", 303 | "attrs": {"num_hidden": "256"}, 304 | "inputs": [[40, 0, 0], [41, 0, 0], [42, 0, 0]] 305 | }, 306 | { 307 | "op": "elemwise_add", 308 | "name": "_plus0", 309 | "inputs": [[4, 0, 0], [43, 0, 0]] 310 | }, 311 | { 312 | "op": "null", 313 | "name": "softmax_label", 314 | "inputs": [] 315 | }, 316 | { 317 | "op": "SoftmaxOutput", 318 | "name": "model", 319 | "inputs": [[44, 0, 0], [45, 0, 0]] 320 | } 321 | ], 322 | "arg_nodes": [ 323 | 0, 324 | 1, 325 | 3, 326 | 5, 327 | 6, 328 | 7, 329 | 8, 330 | 9, 331 | 10, 332 | 11, 333 | 12, 334 | 13, 335 | 14, 336 | 15, 337 | 16, 338 | 17, 339 | 18, 
340 | 19, 341 | 20, 342 | 21, 343 | 22, 344 | 23, 345 | 24, 346 | 25, 347 | 26, 348 | 27, 349 | 28, 350 | 29, 351 | 30, 352 | 31, 353 | 33, 354 | 34, 355 | 37, 356 | 38, 357 | 41, 358 | 42, 359 | 45 360 | ], 361 | "node_row_ptr": [ 362 | 0, 363 | 1, 364 | 2, 365 | 3, 366 | 4, 367 | 5, 368 | 6, 369 | 7, 370 | 8, 371 | 9, 372 | 10, 373 | 11, 374 | 12, 375 | 13, 376 | 14, 377 | 15, 378 | 16, 379 | 17, 380 | 18, 381 | 19, 382 | 20, 383 | 21, 384 | 22, 385 | 23, 386 | 24, 387 | 25, 388 | 26, 389 | 27, 390 | 28, 391 | 29, 392 | 30, 393 | 31, 394 | 32, 395 | 33, 396 | 34, 397 | 35, 398 | 36, 399 | 37, 400 | 38, 401 | 39, 402 | 40, 403 | 41, 404 | 42, 405 | 43, 406 | 44, 407 | 45, 408 | 46, 409 | 47 410 | ], 411 | "heads": [[46, 0, 0]], 412 | "attrs": {"mxnet_version": ["int", 10500]} 413 | } 414 | -------------------------------------------------------------------------------- /pytorch/dlrm/dlrm/quorem/qr_embedding_bag.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | # Quotient-Remainder Trick 7 | # 8 | # Description: Applies quotient remainder-trick to embeddings to reduce 9 | # embedding sizes. 10 | # 11 | # References: 12 | # [1] Hao-Jun Michael Shi, Dheevatsa Mudigere, Maxim Naumov, Jiyan Yang, 13 | # "Compositional Embeddings Using Complementary Partitions for Memory-Efficient 14 | # Recommendation Systems", CoRR, arXiv:1909.02107, 2019 15 | 16 | 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | import torch 19 | import torch.nn as nn 20 | import torch.nn.functional as F 21 | from torch.nn.parameter import Parameter 22 | import numpy as np 23 | 24 | 25 | class QREmbeddingBag(nn.Module): 26 | r"""Computes sums or means over two 'bags' of embeddings, one using the quotient 27 | of the indices and the other using the remainder of the indices, without 28 | instantiating the intermediate embeddings, then performs an operation to combine these. 29 | 30 | For bags of constant length and no :attr:`per_sample_weights`, this class 31 | 32 | * with ``mode="sum"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.sum(dim=0)``, 33 | * with ``mode="mean"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.mean(dim=0)``, 34 | * with ``mode="max"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.max(dim=0)``. 35 | 36 | However, :class:`~torch.nn.EmbeddingBag` is much more time and memory efficient than using a chain of these 37 | operations. 38 | 39 | QREmbeddingBag also supports per-sample weights as an argument to the forward 40 | pass. This scales the output of the Embedding before performing a weighted 41 | reduction as specified by ``mode``. If :attr:`per_sample_weights`` is passed, the 42 | only supported ``mode`` is ``"sum"``, which computes a weighted sum according to 43 | :attr:`per_sample_weights`. 44 | 45 | Known Issues: 46 | Autograd breaks with multiple GPUs. It breaks only with multiple embeddings. 47 | 48 | Args: 49 | num_categories (int): total number of unique categories. The input indices must be in 50 | 0, 1, ..., num_categories - 1. 51 | embedding_dim (list): list of sizes for each embedding vector in each table. If ``"add"`` 52 | or ``"mult"`` operation are used, these embedding dimensions must be 53 | the same. 
If a single embedding_dim is used, then it will use this 54 | embedding_dim for both embedding tables. 55 | num_collisions (int): number of collisions to enforce. 56 | operation (string, optional): ``"concat"``, ``"add"``, or ``"mult". Specifies the operation 57 | to compose embeddings. ``"concat"`` concatenates the embeddings, 58 | ``"add"`` sums the embeddings, and ``"mult"`` multiplies 59 | (component-wise) the embeddings. 60 | Default: ``"mult"`` 61 | max_norm (float, optional): If given, each embedding vector with norm larger than :attr:`max_norm` 62 | is renormalized to have norm :attr:`max_norm`. 63 | norm_type (float, optional): The p of the p-norm to compute for the :attr:`max_norm` option. Default ``2``. 64 | scale_grad_by_freq (boolean, optional): if given, this will scale gradients by the inverse of frequency of 65 | the words in the mini-batch. Default ``False``. 66 | Note: this option is not supported when ``mode="max"``. 67 | mode (string, optional): ``"sum"``, ``"mean"`` or ``"max"``. Specifies the way to reduce the bag. 68 | ``"sum"`` computes the weighted sum, taking :attr:`per_sample_weights` 69 | into consideration. ``"mean"`` computes the average of the values 70 | in the bag, ``"max"`` computes the max value over each bag. 71 | Default: ``"mean"`` 72 | sparse (bool, optional): if ``True``, gradient w.r.t. :attr:`weight` matrix will be a sparse tensor. See 73 | Notes for more details regarding sparse gradients. Note: this option is not 74 | supported when ``mode="max"``. 75 | 76 | Attributes: 77 | weight (Tensor): the learnable weights of each embedding table is the module of shape 78 | `(num_embeddings, embedding_dim)` initialized using a uniform distribution 79 | with sqrt(1 / num_categories). 80 | 81 | Inputs: :attr:`input` (LongTensor), :attr:`offsets` (LongTensor, optional), and 82 | :attr:`per_index_weights` (Tensor, optional) 83 | 84 | - If :attr:`input` is 2D of shape `(B, N)`, 85 | 86 | it will be treated as ``B`` bags (sequences) each of fixed length ``N``, and 87 | this will return ``B`` values aggregated in a way depending on the :attr:`mode`. 88 | :attr:`offsets` is ignored and required to be ``None`` in this case. 89 | 90 | - If :attr:`input` is 1D of shape `(N)`, 91 | 92 | it will be treated as a concatenation of multiple bags (sequences). 93 | :attr:`offsets` is required to be a 1D tensor containing the 94 | starting index positions of each bag in :attr:`input`. Therefore, 95 | for :attr:`offsets` of shape `(B)`, :attr:`input` will be viewed as 96 | having ``B`` bags. Empty bags (i.e., having 0-length) will have 97 | returned vectors filled by zeros. 98 | 99 | per_sample_weights (Tensor, optional): a tensor of float / double weights, or None 100 | to indicate all weights should be taken to be ``1``. If specified, :attr:`per_sample_weights` 101 | must have exactly the same shape as input and is treated as having the same 102 | :attr:`offsets`, if those are not ``None``. Only supported for ``mode='sum'``. 
103 | 104 | 105 | Output shape: `(B, embedding_dim)` 106 | 107 | """ 108 | __constants__ = ['num_categories', 'embedding_dim', 'num_collisions', 109 | 'operation', 'max_norm', 'norm_type', 'scale_grad_by_freq', 110 | 'mode', 'sparse'] 111 | 112 | def __init__(self, num_categories, embedding_dim, num_collisions, 113 | operation='mult', max_norm=None, norm_type=2., 114 | scale_grad_by_freq=False, mode='mean', sparse=False, 115 | _weight=None): 116 | super(QREmbeddingBag, self).__init__() 117 | 118 | assert operation in ['concat', 'mult', 'add'], 'Not valid operation!' 119 | 120 | self.num_categories = num_categories 121 | if isinstance(embedding_dim, int) or len(embedding_dim) == 1: 122 | self.embedding_dim = [embedding_dim, embedding_dim] 123 | else: 124 | self.embedding_dim = embedding_dim 125 | self.num_collisions = num_collisions 126 | self.operation = operation 127 | self.max_norm = max_norm 128 | self.norm_type = norm_type 129 | self.scale_grad_by_freq = scale_grad_by_freq 130 | 131 | if self.operation == 'add' or self.operation == 'mult': 132 | assert self.embedding_dim[0] == self.embedding_dim[1], \ 133 | 'Embedding dimensions do not match!' 134 | 135 | self.num_embeddings = [int(np.ceil(num_categories / num_collisions)), 136 | num_collisions] 137 | 138 | if _weight is None: 139 | self.weight_q = Parameter(torch.Tensor(self.num_embeddings[0], self.embedding_dim[0])) 140 | self.weight_r = Parameter(torch.Tensor(self.num_embeddings[1], self.embedding_dim[1])) 141 | self.reset_parameters() 142 | else: 143 | assert list(_weight[0].shape) == [self.num_embeddings[0], self.embedding_dim[0]], \ 144 | 'Shape of weight for quotient table does not match num_embeddings and embedding_dim' 145 | assert list(_weight[1].shape) == [self.num_embeddings[1], self.embedding_dim[1]], \ 146 | 'Shape of weight for remainder table does not match num_embeddings and embedding_dim' 147 | self.weight_q = Parameter(_weight[0]) 148 | self.weight_r = Parameter(_weight[1]) 149 | self.mode = mode 150 | self.sparse = sparse 151 | 152 | def reset_parameters(self): 153 | nn.init.uniform_(self.weight_q, np.sqrt(1 / self.num_categories)) 154 | nn.init.uniform_(self.weight_r, np.sqrt(1 / self.num_categories)) 155 | 156 | def forward(self, input, offsets=None, per_sample_weights=None): 157 | input_q = (input / self.num_collisions).long() 158 | input_r = torch.remainder(input, self.num_collisions).long() 159 | 160 | embed_q = F.embedding_bag(input_q, self.weight_q, offsets, self.max_norm, 161 | self.norm_type, self.scale_grad_by_freq, self.mode, 162 | self.sparse, per_sample_weights) 163 | embed_r = F.embedding_bag(input_r, self.weight_r, offsets, self.max_norm, 164 | self.norm_type, self.scale_grad_by_freq, self.mode, 165 | self.sparse, per_sample_weights) 166 | 167 | if self.operation == 'concat': 168 | embed = torch.cat((embed_q, embed_r), dim=1) 169 | elif self.operation == 'add': 170 | embed = embed_q + embed_r 171 | elif self.operation == 'mult': 172 | embed = embed_q * embed_r 173 | 174 | return embed 175 | 176 | def extra_repr(self): 177 | s = '{num_embeddings}, {embedding_dim}' 178 | if self.max_norm is not None: 179 | s += ', max_norm={max_norm}' 180 | if self.norm_type != 2: 181 | s += ', norm_type={norm_type}' 182 | if self.scale_grad_by_freq is not False: 183 | s += ', scale_grad_by_freq={scale_grad_by_freq}' 184 | s += ', mode={mode}' 185 | return s.format(**self.__dict__) 186 | -------------------------------------------------------------------------------- 
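A minimal usage sketch of `QREmbeddingBag` as defined above (the sizes here are illustrative, not taken from any DLRM config). With a 2-D input of shape `(B, N)` and no offsets, each row is treated as one bag, and the output has shape `(B, embedding_dim)`:

```python
import torch

# Illustrative sizes: 10,000 categories split across a quotient table of
# ceil(10000/4) = 2500 rows and a remainder table of 4 rows; 'mult'
# composes the two 16-dim lookups element-wise.
bag = QREmbeddingBag(num_categories=10000, embedding_dim=16,
                     num_collisions=4, operation='mult', mode='sum')
indices = torch.randint(0, 10000, (8, 3))  # 8 bags of 3 category ids each
out = bag(indices)                         # offsets must be None for 2-D input
print(out.shape)                           # torch.Size([8, 16])
```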
/mxnet/blog/mxnet_v1.5_release/single-instance-cnn-mxnet-1.5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "MXNet Model FP32 multi-instance Inference Performance " 4 | echo "Testing FP32 base models" 5 | echo "Installing mxnet 1.5" 6 | pip install mxnet 7 | echo "Downloading source code from incubator-mxnet repo" 8 | git clone https://github.com/apache/incubator-mxnet 9 | cd incubator-mxnet 10 | 11 | export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 12 | export vCPUs=`cat /proc/cpuinfo | grep processor | wc -l` 13 | export OMP_NUM_THREADS=$((vCPUs / 4)) 14 | echo "Test with OMP_NUM_THREADS="$OMP_NUM_THREADS 15 | export MXNET_SUBGRAPH_BACKEND=MKLDNN 16 | 17 | # Launch dummy data Inference 18 | #RN18 19 | cd ./example/quantization 20 | python imagenet_gen_qsym_mkldnn.py --model=resnet18_v1 --num-calib-batches=5 --calib-mode=naive 21 | echo "-----ResNet18 FP32 single-inst-----" 22 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 23 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 24 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 25 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 26 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 27 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 28 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 29 | 30 | #RN50 31 | python imagenet_gen_qsym_mkldnn.py --model=resnet50_v1 --num-calib-batches=5 --calib-mode=naive 32 | echo "-----ResNet50 FP32 single-inst-----" 33 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 34 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 35 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 36 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 37 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 38 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 39 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 40 | 41 | #RN101 42 | python imagenet_gen_qsym_mkldnn.py --model=resnet101_v1 --num-calib-batches=5 --calib-mode=naive 43 | echo "-----ResNet101 FP32 single-inst-----" 44 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=1 --num-inference-batches=1000 
--ctx=cpu --benchmark=True 45 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 46 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 47 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 48 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 49 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 50 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 51 | 52 | #Squeezenet1.0 53 | python imagenet_gen_qsym_mkldnn.py --model=squeezenet1.0 --num-calib-batches=5 --calib-mode=naive 54 | echo "-----SqueezeNet1.0 FP32 single-inst-----" 55 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 56 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 57 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 58 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 59 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 60 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 61 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 62 | 63 | #MobileNet1.0 64 | python imagenet_gen_qsym_mkldnn.py --model=mobilenet1.0 --num-calib-batches=5 --calib-mode=naive 65 | echo "-----MobileNet v1 FP32 single-inst-----" 66 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 67 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 68 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 69 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 70 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 71 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 72 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 73 | 74 | #MobileNet2.0 75 | python imagenet_gen_qsym_mkldnn.py --model=mobilenetv2_1.0 --num-calib-batches=5 
--calib-mode=naive 76 | echo "-----MobileNet v2 FP32 single-inst-----" 77 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 78 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 79 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 80 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 81 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 82 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 83 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 84 | 85 | #inception v3 86 | python imagenet_gen_qsym_mkldnn.py --model=inceptionv3 --image-shape=3,299,299 --num-calib-batches=5 --calib-mode=naive 87 | echo "-----Inception v3 FP32 single-inst-----" 88 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 89 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 90 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 91 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 92 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 93 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 94 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 95 | 96 | #ResNet152-v2 97 | python imagenet_gen_qsym_mkldnn.py --model=imagenet1k-resnet-152 --num-calib-batches=5 --calib-mode=naive 98 | echo "-----ResNet152-v2 FP32 single-inst-----" 99 | python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 100 | python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 101 | python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 102 | python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 103 | python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=16 
--num-inference-batches=1000 --ctx=cpu --benchmark=True 104 | python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 105 | python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 106 | -------------------------------------------------------------------------------- /mxnet/blog/mxnet_v1.5_release/single-instance-fp32-cnn-mxnet-mkl1.5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "MXNet Model FP32 single-instance Inference Performance" 4 | echo "Testing FP32 base models" 5 | echo "Installing mxnet-mkl 1.5" 6 | pip install mxnet-mkl 7 | echo "Downloading source code from incubator-mxnet repo" 8 | git clone https://github.com/apache/incubator-mxnet 9 | cd incubator-mxnet 10 | 11 | export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 12 | export vCPUs=`cat /proc/cpuinfo | grep processor | wc -l` 13 | export OMP_NUM_THREADS=$((vCPUs / 4)) 14 | echo "Test with OMP_NUM_THREADS="$OMP_NUM_THREADS 15 | export MXNET_SUBGRAPH_BACKEND=MKLDNN 16 | 17 | # Launch dummy data Inference 18 | #RN18 19 | cd ./example/quantization 20 | python imagenet_gen_qsym_mkldnn.py --model=resnet18_v1 --num-calib-batches=5 --calib-mode=naive 21 | echo "-----ResNet18 FP32 single-inst-----" 22 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 23 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 24 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 25 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 26 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 27 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 28 | python imagenet_inference.py --symbol-file=./model/resnet18_v1-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 29 | 30 | #RN50 31 | python imagenet_gen_qsym_mkldnn.py --model=resnet50_v1 --num-calib-batches=5 --calib-mode=naive 32 | echo "-----ResNet50 FP32 single-inst-----" 33 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 34 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 35 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 36 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 37 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 38 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=32 
--num-inference-batches=1000 --ctx=cpu --benchmark=True 39 | python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 40 | 41 | #RN101 42 | python imagenet_gen_qsym_mkldnn.py --model=resnet101_v1 --num-calib-batches=5 --calib-mode=naive 43 | echo "-----ResNet101 FP32 single-inst-----" 44 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 45 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 46 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 47 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 48 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 49 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 50 | python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 51 | 52 | #Squeezenet1.0 53 | python imagenet_gen_qsym_mkldnn.py --model=squeezenet1.0 --num-calib-batches=5 --calib-mode=naive 54 | echo "-----SqueezeNet1.0 FP32 single-inst-----" 55 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 56 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 57 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 58 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 59 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 60 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 61 | python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 62 | 63 | #MobileNet1.0 64 | python imagenet_gen_qsym_mkldnn.py --model=mobilenet1.0 --num-calib-batches=5 --calib-mode=naive 65 | echo "-----MobileNet v1 FP32 single-inst-----" 66 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 67 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 68 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 69 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 70 | python imagenet_inference.py 
--symbol-file=./model/mobilenet1.0-symbol.json --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 71 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 72 | python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 73 | 74 | #MobileNet2.0 75 | python imagenet_gen_qsym_mkldnn.py --model=mobilenetv2_1.0 --num-calib-batches=5 --calib-mode=naive 76 | echo "-----MobileNet v2 FP32 single-inst-----" 77 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 78 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 79 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 80 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 81 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 82 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 83 | python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 84 | 85 | #inception v3 86 | python imagenet_gen_qsym_mkldnn.py --model=inceptionv3 --image-shape=3,299,299 --num-calib-batches=5 --calib-mode=naive 87 | echo "-----Inception v3 FP32 single-inst-----" 88 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 89 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 90 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 91 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 92 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 93 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 94 | python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 95 | 96 | #ResNet152-v2 97 | python imagenet_gen_qsym_mkldnn.py --model=imagenet1k-resnet-152 --num-calib-batches=5 --calib-mode=naive 98 | echo "-----ResNet152-v2 FP32 single-inst-----" 99 | python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=1 --num-inference-batches=1000 --ctx=cpu --benchmark=True 100 | python imagenet_inference.py 
--symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=2 --num-inference-batches=1000 --ctx=cpu --benchmark=True 101 | python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=4 --num-inference-batches=1000 --ctx=cpu --benchmark=True 102 | python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=8 --num-inference-batches=1000 --ctx=cpu --benchmark=True 103 | python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=16 --num-inference-batches=1000 --ctx=cpu --benchmark=True 104 | python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=32 --num-inference-batches=1000 --ctx=cpu --benchmark=True 105 | python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=64 --num-inference-batches=1000 --ctx=cpu --benchmark=True 106 | -------------------------------------------------------------------------------- /pytorch/benchmark_tools/run_caffe2.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | ## @package caffe2_tools 3 | # Module caffe2.tools.run_caffe2 4 | """ 5 | the main entry point to run a caffe2 model 6 | """ 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | from __future__ import unicode_literals 11 | 12 | import os 13 | import sys 14 | import timeit 15 | import logging 16 | import argparse 17 | 18 | LOG_FORMAT = "%(levelname)s:%(message)s" 19 | 20 | def ArgError(error): 21 | """ 22 | print an error message when a required argument is missing 23 | """ 24 | logging.error("Please set {}. " 25 | "OR, refer to the help of this script (-h)" 26 | .format(error)) 27 | 28 | def Calibration(args, extra_args): 29 | """ 30 | function to do calibration. 31 | """ 32 | if not args.model: 33 | ArgError("model to run (-m)") 34 | return 35 | if args.print_net_def: 36 | import inference as inf 37 | inf.PrintNetDef(args.model, args.print_net_def) 38 | return 39 | if not args.device: 40 | ArgError("device (-d)") 41 | return 42 | if ( 43 | not args.dummydata and 44 | not args.images_path and 45 | not "CAFFE2_INF_IMG_PATH" in os.environ 46 | ): 47 | ArgError("the path of input images (-p)") 48 | return 49 | import inference as inf 50 | inf.Calibration(args, extra_args) 51 | 52 | 53 | def Inference(args, extra_args): 54 | """ 55 | function to do inference. 
56 | """ 57 | if not args.model: 58 | ArgError("model to run (-m)") 59 | return 60 | if args.print_net_def: 61 | import inference as inf 62 | inf.PrintNetDef(args.model, args.print_net_def) 63 | return 64 | if not args.device: 65 | ArgError("device (-d)") 66 | return 67 | if ( 68 | not args.dummydata and 69 | not args.images_path and 70 | not "CAFFE2_INF_IMG_PATH" in os.environ 71 | ): 72 | ArgError("the path of input images (-p)") 73 | return 74 | import inference as inf 75 | inf.Run(args, extra_args) 76 | 77 | 78 | def GetArgumentParser(): 79 | """ 80 | to parse the argument 81 | """ 82 | parser = argparse.ArgumentParser(description="The scripts to run Caffe2.\n" 83 | "for example, to run alexnet inference:\n" 84 | "./run_caffe2.py -m alexnet\n" 85 | " -p /path/to/input/image\n" 86 | " -v /path/to/image/validate/index/file\n" 87 | ) 88 | parser.add_argument( 89 | "-a", "--optimization", 90 | type=str, 91 | help="Enable optimizations for running mode, split by comma.\n" 92 | "(Set 'all' to enable all optimizations for current running mode)\n" 93 | "-For inference, available optimizations:\n" 94 | "bn_folding,bn_inplace,fusion_conv_relu,fusion_conv_sum,remove_dropout," 95 | "int8_mode.\n" 96 | "-For training, available optimizations:\n" 97 | " " 98 | ) 99 | parser.add_argument( 100 | "-b", "--batch_size", 101 | type=int, 102 | default=1, 103 | help="The batch size. (DEFAULT: %(default)i)" 104 | ) 105 | parser.add_argument( 106 | "-c", "--crop_size", 107 | type=int, 108 | default=None, 109 | help="The crop size of input image. (DEFAULT: %(default)s)" 110 | ) 111 | parser.add_argument( 112 | "-d", "--device", 113 | type=str, 114 | default="ideep", 115 | help="Choose device to run. cpu, gpu or ideep." 116 | "(DEFAULT: %(default)s)" 117 | ) 118 | parser.add_argument( 119 | "-e", "--log_level", 120 | type=str, 121 | default="warning", 122 | help="The log level to show off. debug, info, warning, error, critical." 123 | "(DEFAULT: %(default)s)" 124 | ) 125 | parser.add_argument( 126 | "-f", "--forward_only", 127 | action='store_true', 128 | help="If set, only run the forward path." 129 | "(DEFAULT: %(default)s)" 130 | ) 131 | parser.add_argument( 132 | "-g", "--log", 133 | type=str, 134 | help="The log file path." 135 | ) 136 | parser.add_argument( 137 | "-i", "--iterations", 138 | type=int, 139 | help="Number of iterations to run the network." 140 | ) 141 | parser.add_argument( 142 | "-j", "--post_images_path", 143 | type=str, 144 | default=None, 145 | help="The path to store post images." 146 | ) 147 | parser.add_argument( 148 | "-l", "--label_file", 149 | type=str, 150 | help="The input label index file." 151 | ) 152 | parser.add_argument( 153 | "-m", "--model", 154 | type=str, 155 | help="The model to run." 156 | ) 157 | parser.add_argument( 158 | "-n", "--net_type", 159 | type=str, 160 | default="simple", 161 | help="The net type for Caffe2.(DEFAULT: %(default)s)" 162 | ) 163 | parser.add_argument( 164 | "-o", "--output_file", 165 | type=str, 166 | default=None, 167 | help="The output file to save the results of validating or label check." 168 | ) 169 | parser.add_argument( 170 | "-calib", "--calib_algo", 171 | type=str, 172 | help="The algorithm of calibration. 
absmax, moving_average, or l_divergence" 173 | ) 174 | parser.add_argument( 175 | "-single_iter_calib", "--single_iter_calib", 176 | action='store_true', 177 | help="Perform calibration on a single batch of images or not" 178 | ) 179 | parser.add_argument( 180 | "-iter_calib", "--iter_calib", 181 | type=int, 182 | default=None, 183 | help="Number of iterations to run calibration for. (DEFAULT: %(default)s)" 184 | ) 185 | parser.add_argument( 186 | "-int8", "--int8_model", 187 | action='store_true', 188 | help="Use the int8 model, instead of fp32 model." 189 | ) 190 | parser.add_argument( 191 | "-onnx", "--onnx_model", 192 | action='store_true', 193 | help="Use the onnx model, instead of caffe2 model." 194 | ) 195 | parser.add_argument( 196 | "-p", "--images_path", 197 | type=str, 198 | help="The path of input images." 199 | ) 200 | parser.add_argument( 201 | "-tp", "--tr_images_path", 202 | type=str, 203 | help="The path of input images for training." 204 | ) 205 | parser.add_argument( 206 | "-q", "--annotations", 207 | type=str, 208 | help="The path of Annotations file for VOC" 209 | ) 210 | parser.add_argument( 211 | "-r", "--mode", 212 | type=str, 213 | default="inference", 214 | help="Choose running mode. inference, calibration or training." 215 | "(DEFAULT: %(default)s)" 216 | ) 217 | parser.add_argument( 218 | "-calibf", "--calibration_file", 219 | type=str, 220 | help="Use the images in calibration_file for int8 calibration." 221 | ) 222 | parser.add_argument( 223 | "-s", "--show_supported_models", 224 | action='store_true', 225 | help="Show all supported models for inference." 226 | ) 227 | parser.add_argument( 228 | "-t", "--profile", 229 | action='store_true', 230 | help="Trigger profile on current topology." 231 | ) 232 | parser.add_argument( 233 | "-u", "--dummydata", 234 | action='store_true', 235 | help="Use a dummy dataset." 236 | ) 237 | parser.add_argument( 238 | "-uv", "--dummyvalue", 239 | type=str, 240 | default="random", 241 | help="The fill value for dummydata." 242 | "(DEFAULT: %(default)s)" 243 | ) 244 | parser.add_argument( 245 | "-v", "--validation_file", 246 | type=str, 247 | help="The input validation index file." 248 | ) 249 | parser.add_argument( 250 | "-w", "--warmup_iterations", 251 | type=int, 252 | default=0, 253 | help="Number of warm-up iterations before benchmarking." 254 | "(DEFAULT: %(default)i)" 255 | ) 256 | parser.add_argument( 257 | "-x", "--print_net_def", 258 | type=str, 259 | default=None, 260 | help="If set, only print out the net definition for the model.\n" 261 | "predict_net for topology, init_net for weight data." 262 | "(DEFAULT: %(default)s)" 263 | ) 264 | parser.add_argument( 265 | "-y", "--cosim", 266 | action='store_true', 267 | help="Trigger cosim on current topology." 268 | ) 269 | parser.add_argument( 270 | "-yi", "--int8_cosim", 271 | action='store_true', 272 | help="Trigger int8 cosim on current topology." 273 | ) 274 | parser.add_argument( 275 | "-z", "--noptimize", 276 | action='store_true', 277 | help="Do not trigger optimization on current topology." 278 | ) 279 | return parser 280 | 281 | 282 | if __name__ == '__main__': 283 | args, extra_args = GetArgumentParser().parse_known_args() 284 | LOG_LEVEL_MAP = { 285 | "debug": logging.DEBUG, 286 | "info": logging.INFO, 287 | "warning": logging.WARNING, 288 | "error": logging.ERROR, 289 | "critical": logging.CRITICAL, 290 | } 291 | if args.log_level.lower() in LOG_LEVEL_MAP: 292 | log_level = LOG_LEVEL_MAP[args.log_level.lower()] 293 | else: 294 | log_level = None 295 | logging.warning("Wrong log level {}. 
Ignored!".format(args.log_level)) 296 | logging.basicConfig( 297 | format=LOG_FORMAT, 298 | filename=args.log, 299 | filemode="w", 300 | level=log_level) 301 | 302 | if args.show_supported_models: 303 | import inference.models as m 304 | 305 | m.ShowModels() 306 | elif len(sys.argv) == 1: 307 | GetArgumentParser().print_help() 308 | else: 309 | type_map = { 310 | "inference": Inference, 311 | "calibration": Calibration, 312 | } 313 | if args.mode.lower() in type_map: 314 | start_time = timeit.default_timer() 315 | type_map[args.mode.lower()](args, extra_args) 316 | elapsed_time = timeit.default_timer() - start_time 317 | logging.warning("Total time in {} mode: {:.10f} seconds" 318 | .format(args.mode, elapsed_time)) 319 | else: 320 | logging.error("Wrong running mode {}. Exit!".format(args.mode)) 321 | -------------------------------------------------------------------------------- /pytorch/ResNet50/README.md: -------------------------------------------------------------------------------- 1 | # Guide to run ResNet50 with FP32/BF16 data type 2 | 3 | ## Verified on 4 | 5 | | Item | Value | 6 | | -: | :- | 7 | | OS | Ubuntu 20.04 LTS | 8 | | Compiler | gcc 8.4.0 | 9 | | Memory | DDR4 3200MHz | 10 | 11 | ## Prepare your running environment 12 | 1. Setup for PyTorch build environment: 13 | ``` 14 | wget https://repo.continuum.io/archive/Anaconda3-5.0.0-Linux-x86_64.sh -O anaconda3.sh 15 | chmod +x anaconda3.sh 16 | ./anaconda3.sh -b -p ~/anaconda3 17 | ./anaconda3/bin/conda create -yn pytorch python=3.7 18 | export PATH=~/anaconda3/bin:$PATH 19 | source ./anaconda3/bin/activate pytorch 20 | pip install sklearn onnx 21 | conda config --append channels intel 22 | conda install ninja pyyaml setuptools cmake cffi typing 23 | conda install intel-openmp mkl mkl-include numpy -c intel --no-update-deps 24 | ``` 25 | 26 | 2. Build and install PyTorch 27 | ``` 28 | git clone https://github.com/pytorch/pytorch.git 29 | cd pytorch 30 | git checkout gh/xiaobingsuper/18/orig 31 | python setup.py clean 32 | git submodule sync && git submodule update --init --recursive 33 | wget https://github.com/hongzhen1/pytorch/commit/3511d7f6bd2060e20cf77b770ae32ff538700f37.diff -O dataloader.diff 34 | git apply dataloader.diff 35 | cd third_party/ideep/ && git checkout master && git pull && git checkout pytorch_dnnl_dev && cd ../../ 36 | git add third_party/ideep && git submodule sync && git submodule update --init --recursive 37 | cd third_party/ideep # make sure ideep commit is 2bf943e 38 | cd ../../ 39 | pip install -r requirements.txt 40 | export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}/lib 41 | python setup.py install 42 | ``` 43 | 44 | 3. Install jemalloc 45 | ``` 46 | cd .. 47 | git clone https://github.com/jemalloc/jemalloc.git 48 | cd jemalloc 49 | ./autogen.sh 50 | ./configure --prefix=$HOME/.local 51 | make 52 | make install 53 | ``` 54 | 55 | 4. download imagenet dataset 56 | reference: https://github.com/facebookarchive/fb.resnet.torch/blob/master/INSTALL.md#download-the-imagenet-dataset 57 | 58 | 5. install vision & imagenet 59 | ``` 60 | cd .. 61 | git clone https://github.com/pytorch/vision 62 | cd vision 63 | python setup.py install 64 | 65 | cd .. 
66 | git clone https://github.com/intel/optimized-models.git 67 | cd optimized-models/imagenet/imagenet 68 | ``` 69 | 70 | ## Example: 71 | 72 | Core(s) per socket: 24 73 | 74 | ``` 75 | export DNNL_PRIMITIVE_CACHE_CAPACITY=1024 76 | export DATA_PATH= 77 | ``` 78 | 79 | **Note:** change ip address (xxx.xxx.xxx.xxx) in the following commands to the one in your environment. 80 | 81 | ### FP32: 82 | * training benchmark (4 instances, 24 cores/ins): 83 | ``` 84 | export LD_PRELOAD=$HOME/.local/lib/libjemalloc.so 85 | MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C0-23 -m0 python -u main_multinode.py -a resnet50 --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=0 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" & MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C24-47 -m1 python -u main_multinode.py -a resnet50 --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=1 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" & MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C48-71 -m2 python -u main_multinode.py -a resnet50 --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=2 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" & MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C72-95 -m3 python -u main_multinode.py -a resnet50 --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=3 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" & 86 | ``` 87 | 88 | * training accuracy for multi-nodes (4 nodes, batch_size=64 for every node): 89 | 90 | **Legends:** 91 | 92 | | flag | description | 93 | | -: | - | 94 | | -j | number of cores per node | 95 | | --world-size | total number of nodes | 96 | | --rank | rank (index) of this node | 97 | | batch_size | 256/nodes | 98 | 99 | **Commands run on nodes:** 100 | 101 | * on node0 102 | ``` 103 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 104 | python -u main.py --lr 0.1 -a resnet50 --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=0 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 105 | ``` 106 | 107 | * on node1 108 | ``` 109 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 110 | python -u main.py --lr 0.1 -a resnet50 --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=1 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 111 | ``` 112 | 113 | * on node2 114 | ``` 115 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 116 | python -u main.py --lr 0.1 -a resnet50 --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=2 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 117 | ``` 118 | 119 | * on node3 120 | ``` 121 | 
export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 122 | python -u main.py --lr 0.1 -a resnet50 --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=3 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 123 | ``` 124 | 125 | * inference throughput benchmark (4 instances, 24 cores/ins): 126 | ``` 127 | export LD_PRELOAD=$HOME/.local/lib/libjemalloc.so 128 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 129 | bash run_inference_cpu_multi_instance.sh resnet50 130 | ``` 131 | 132 | * inference realtime benchmark (24 instances, 4 cores/ins): 133 | ``` 134 | export LD_PRELOAD=$HOME/.local/lib/libjemalloc.so 135 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 136 | bash run_inference_cpu_multi_instance_latency.sh resnet50 137 | ``` 138 | 139 | * inference accuracy: 140 | ``` 141 | bash run_inference_cpu_accuracy.sh resnet50 142 | ``` 143 | 144 | ### BF16: 145 | * training benchmark (4 instances, 24 cores/ins): 146 | ``` 147 | export LD_PRELOAD=$HOME/.local/lib/libjemalloc.so 148 | MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C0-23 -m0 python -u main_multinode.py -a resnet50 --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=0 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --bf16 & MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C24-47 -m1 python -u main_multinode.py -a resnet50 --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=1 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --bf16 & MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C48-71 -m2 python -u main_multinode.py -a resnet50 --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=2 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --bf16 & MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C72-95 -m3 python -u main_multinode.py -a resnet50 --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=3 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --bf16 & 149 | ``` 150 | 151 | * training accuracy (4 nodes, batch_size=64 for every node): 152 | 153 | **Legends:** 154 | 155 | | flag | description | 156 | | -: | - | 157 | | -j | number of cores per node | 158 | | --world-size | total number of nodes | 159 | | --rank | rank (index) of this node | 160 | | batch_size | 256/nodes | 161 | 162 | **Commands run on nodes:** 163 | 164 | * on node0 165 | ``` 166 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 167 | python -u main.py --lr 0.1 -a resnet50 --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=0 --dist-backend=gloo 
--dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 --bf16 168 | ``` 169 | * on node1 170 | ``` 171 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 172 | python -u main.py --lr 0.1 -a resnet50 --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=1 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 --bf16 173 | ``` 174 | * on node2 175 | ``` 176 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 177 | python -u main.py --lr 0.1 -a resnet50 --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=2 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 --bf16 178 | ``` 179 | * on node3 180 | ``` 181 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 182 | python -u main.py --lr 0.1 -a resnet50 --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=3 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 --bf16 183 | ``` 184 | 185 | * inference throughput benchmark (4 instances, 24 cores/ins): 186 | ``` 187 | export LD_PRELOAD=$HOME/.local/lib/libjemalloc.so 188 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 189 | bash run_inference_cpu_multi_instance.sh resnet50 bf16 190 | ``` 191 | 192 | * inference realtime benchmark (24 instances, 4 cores/ins): 193 | ``` 194 | export LD_PRELOAD=$HOME/.local/lib/libjemalloc.so 195 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 196 | bash run_inference_cpu_multi_instance_latency.sh resnet50 bf16 197 | ``` 198 | 199 | * inference accuracy: 200 | ``` 201 | bash run_inference_cpu_accuracy.sh resnet50 bf16 202 | ``` 203 | -------------------------------------------------------------------------------- /pytorch/ResNext101_32x4d/README.md: -------------------------------------------------------------------------------- 1 | # Guide to run ResNext101_32x4d with FP32/BF16 data type 2 | 3 | ## Verified on 4 | 5 | | Item | Value | 6 | | -: | :- | 7 | | OS | Ubuntu 20.04 LTS | 8 | | Compiler | gcc 8.4.0 | 9 | | Memory | DDR4 3200MHz | 10 | 11 | ## Prepare your running environment 12 | 1. Setup for PyTorch build environment: 13 | ``` 14 | wget https://repo.continuum.io/archive/Anaconda3-5.0.0-Linux-x86_64.sh -O anaconda3.sh 15 | chmod +x anaconda3.sh 16 | ./anaconda3.sh -b -p ~/anaconda3 17 | ./anaconda3/bin/conda create -yn pytorch python=3.7 18 | export PATH=~/anaconda3/bin:$PATH 19 | source ./anaconda3/bin/activate pytorch 20 | pip install sklearn onnx 21 | conda config --append channels intel 22 | conda install ninja pyyaml setuptools cmake cffi typing 23 | conda install intel-openmp mkl mkl-include numpy -c intel --no-update-deps 24 | ``` 25 | 26 | 2. 
Build and install PyTorch 27 | ``` 28 | git clone https://github.com/pytorch/pytorch.git 29 | cd pytorch 30 | git checkout gh/xiaobingsuper/18/orig 31 | python setup.py clean 32 | git submodule sync && git submodule update --init --recursive 33 | wget https://github.com/hongzhen1/pytorch/commit/3511d7f6bd2060e20cf77b770ae32ff538700f37.diff -O dataloader.diff 34 | git apply dataloader.diff 35 | cd third_party/ideep/ && git checkout master && git pull && git checkout pytorch_dnnl_dev && cd ../../ 36 | git add third_party/ideep && git submodule sync && git submodule update --init --recursive 37 | cd third_party/ideep # make sure ideep commit is 2bf943e 38 | cd ../../ 39 | pip install -r requirements.txt 40 | export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}/lib 41 | python setup.py install 42 | ``` 43 | 44 | 3. Install jemalloc 45 | ``` 46 | cd .. 47 | git clone https://github.com/jemalloc/jemalloc.git 48 | cd jemalloc 49 | ./autogen.sh 50 | ./configure --prefix=$HOME/.local # so that $HOME/.local/lib/libjemalloc.so matches the LD_PRELOAD used below 51 | make 52 | make install 53 | ``` 54 | 55 | 4. download imagenet dataset 56 | reference: https://github.com/facebookarchive/fb.resnet.torch/blob/master/INSTALL.md#download-the-imagenet-dataset 57 | 58 | 5. install vision & imagenet 59 | ``` 60 | cd .. 61 | git clone https://github.com/intel/optimized-models.git 62 | 63 | git clone https://github.com/pytorch/vision 64 | cd vision 65 | cp ../optimized-models/imagenet/imagenet/resnet.py torchvision/models/resnet.py 66 | python setup.py install 67 | cd ../optimized-models/imagenet/imagenet 68 | ``` 69 | 70 | ## Example: 71 | 72 | Core(s) per socket: 24 73 | 74 | ``` 75 | export DNNL_PRIMITIVE_CACHE_CAPACITY=1024 76 | export DATA_PATH= 77 | ``` 78 | 79 | **Note:** change ip address (xxx.xxx.xxx.xxx) in the following commands to the one in your environment. 
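The `-C`/`-m` bindings in the FP32/BF16 commands below assume the "Core(s) per socket: 24" layout noted above (four NUMA nodes with 24 cores each). As a minimal sketch for checking your own topology before editing those ranges (plain `lscpu`/`numactl` usage, nothing specific to this repo):

```
# Show socket/core counts and the NUMA node layout so the numactl
# -C (core range) and -m (memory node) arguments can be adapted.
lscpu | grep -E 'Socket|Core|NUMA'
numactl --hardware
```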
80 | 81 | ### FP32: 82 | * training benchmark (4 instances, 24 cores/ins): 83 | ``` 84 | export LD_PRELOAD=$HOME/.local/lib/libjemalloc.so 85 | MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C0-23 -m0 python -u main_multinode.py -a resnext101_32x4d --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=0 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" & MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C24-47 -m1 python -u main_multinode.py -a resnext101_32x4d --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=1 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" & MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C48-71 -m2 python -u main_multinode.py -a resnext101_32x4d --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=2 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" & MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C72-95 -m3 python -u main_multinode.py -a resnext101_32x4d --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=3 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" & 86 | ``` 87 | 88 | * training accuracy for multi-nodes (4 nodes, batch_size=64 for every node): 89 | 90 | **Legends:** 91 | 92 | | flag | description | 93 | | -: | - | 94 | | -j | number of cores per node | 95 | | --world-size | total number of nodes | 96 | | --rank | rank (index) of this node | 97 | | batch_size | 256/nodes | 98 | 99 | **Commands run on nodes** 100 | 101 | * on node0 102 | ``` 103 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 104 | python -u main.py --lr 0.1 -a resnext101_32x4d --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=0 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 105 | ``` 106 | * on node1 107 | ``` 108 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 109 | python -u main.py --lr 0.1 -a resnext101_32x4d --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=1 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 110 | ``` 111 | * on node2 112 | ``` 113 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 114 | python -u main.py --lr 0.1 -a resnext101_32x4d --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=2 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 115 | ``` 116 | * on node3 117 | ``` 118 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 119 | python -u main.py --lr 0.1 -a resnext101_32x4d --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=3 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 120 | ``` 
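For readability, the four-instance training benchmark at the top of this section can also be written as a loop. This is only a hedged restatement of that command (same flags, same placeholder IP, same 4x24-core assumption), with `wait` added so the shell blocks until all ranks finish:

```
#!/bin/bash
# Launch one training process per NUMA node (ranks 0-3, 24 cores each),
# mirroring the 4-instance FP32 training benchmark command above.
export LD_PRELOAD=$HOME/.local/lib/libjemalloc.so
export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000"
for rank in 0 1 2 3; do
  start=$((rank * 24))
  end=$((start + 23))
  KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 \
  OMP_NUM_THREADS=24 numactl -C${start}-${end} -m${rank} \
    python -u main_multinode.py -a resnext101_32x4d --mkldnn $DATA_PATH \
      -b 128 -j 24 --world-size=4 --rank=${rank} --dist-backend=gloo \
      --dist-url="tcp://xxx.xxx.xxx.xxx:7689" &
done
wait
```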
121 | 122 | * inference throughput benchmark (4 instances, 24 cores/ins): 123 | ``` 124 | # batch_size=128 125 | export LD_PRELOAD=$HOME/.local/lib/libjemalloc.so 126 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 127 | bash run_inference_cpu_multi_instance.sh resnext101_32x4d 128 | ``` 129 | 130 | * inference realtime benchmark (24 instances, 4 cores/ins): 131 | ``` 132 | # batch_size=128 133 | export LD_PRELOAD=$HOME/.local/lib/libjemalloc.so 134 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 135 | bash run_inference_cpu_multi_instance_latency.sh resnext101_32x4d 136 | ``` 137 | 138 | * inference accuracy: 139 | ``` 140 | bash run_inference_cpu_accuracy.sh resnext101_32x4d 141 | ``` 142 | 143 | ### BF16: 144 | * training benchmark (4 instances, 24 cores/ins): 145 | ``` 146 | export LD_PRELOAD=$HOME/.local/lib/libjemalloc.so 147 | MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C0-23 -m0 python -u main_multinode.py -a resnext101_32x4d --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=0 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --bf16 & MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C24-47 -m1 python -u main_multinode.py -a resnext101_32x4d --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=1 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --bf16 & MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C48-71 -m2 python -u main_multinode.py -a resnext101_32x4d --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=2 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --bf16 & MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" KMP_BLOCKTIME=1 KMP_HW_SUBSET=1t KMP_AFFINITY=granularity=fine,compact,1,0 OMP_NUM_THREADS=24 numactl -C72-95 -m3 python -u main_multinode.py -a resnext101_32x4d --mkldnn $DATA_PATH -b 128 -j 24 --world-size=4 --rank=3 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --bf16 148 | ``` 149 | 150 | * training accuracy (4 nodes, batch_size=64 for every node): 151 | 152 | **Legends:** 153 | 154 | | flag | description | 155 | | -: | - | 156 | | -j | number of cores per node | 157 | | --world-size | total number of nodes | 158 | | --rank | rank (index) of this node | 159 | | batch_size | 256/nodes | 160 | 161 | **Commands run on nodes** 162 | 163 | * on node0 164 | ``` 165 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 166 | python -u main.py --lr 0.1 -a resnext101_32x4d --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=0 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 --bf16 167 | ``` 168 | * on node1 169 | ``` 170 | export
MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 171 | python -u main.py --lr 0.1 -a resnext101_32x4d --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=1 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 --bf16 172 | ``` 173 | * on node2 174 | ``` 175 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 176 | python -u main.py --lr 0.1 -a resnext101_32x4d --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=2 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 --bf16 177 | ``` 178 | * on node3 179 | ``` 180 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 181 | python -u main.py --lr 0.1 -a resnext101_32x4d --mkldnn $DATA_PATH -b 64 -j 48 --world-size=4 --rank=3 --dist-backend=gloo --dist-url="tcp://xxx.xxx.xxx.xxx:7689" --seed 1 --bf16 182 | ``` 183 | 184 | * inference throughput benchmark (4 instances, 24 cores/ins): 185 | ``` 186 | export LD_PRELOAD=$HOME/.local/lib/libjemalloc.so 187 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 188 | bash run_inference_cpu_multi_instance.sh resnext101_32x4d bf16 189 | ``` 190 | 191 | * inference realtime benchmark (24 instances, 4 cores/ins): 192 | ``` 193 | export LD_PRELOAD=$HOME/.local/lib/libjemalloc.so 194 | export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" 195 | bash run_inference_cpu_multi_instance_latency.sh resnext101_32x4d bf16 196 | ``` 197 | 198 | * inference accuracy: 199 | ``` 200 | bash run_inference_cpu_accuracy.sh resnext101_32x4d bf16 201 | ``` 202 | --------------------------------------------------------------------------------