├── LICENSE.txt ├── MANIFEST.in ├── Makefile ├── README.md ├── build_helper.py ├── docs ├── Makefile ├── make.bat └── source │ ├── build_helper.rst │ ├── conf.py │ ├── index.rst │ ├── ncnnqat.rst │ └── setup.rst ├── ncnnqat ├── __init__.py └── quantize.py ├── pyproject.toml ├── setup.cfg ├── setup.py ├── src ├── fake_quantize.cpp ├── fake_quantize.cu └── fake_quantize.h └── tests ├── ssd300 ├── main.py └── src │ ├── __init__.py │ ├── coco.py │ ├── coco_pipeline.py │ ├── data.py │ ├── distributed.py │ ├── evaluate.py │ ├── model.py │ ├── train.py │ └── utils.py └── test_cifar10.py /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Shisen Chen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 9 | of the Software, and to permit persons to whom the Software is furnished to do 10 | so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenShisen/ncnnqat/253a413264507cf90089d1aa0e30c0ef30087cfe/MANIFEST.in -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Uncomment for debugging 2 | # DEBUG := 1 3 | # Pretty build 4 | # Q ?= @ 5 | 6 | CXX := g++ 7 | python := python3 8 | PYTHON_HEADER_DIR := $(shell python -c 'from distutils.sysconfig import get_python_inc; print(get_python_inc())') 9 | PYTORCH_INCLUDES := $(shell python -c 'from torch.utils.cpp_extension import include_paths; [print(p) for p in include_paths()]') 10 | PYTORCH_LIBRARIES := $(shell python -c 'from torch.utils.cpp_extension import library_paths; [print(p) for p in library_paths()]') 11 | 12 | CUDA_DIR := $(shell python -c 'from torch.utils.cpp_extension import _find_cuda_home; print(_find_cuda_home())') 13 | WITH_ABI := $(shell python -c 'import torch; print(int(torch._C._GLIBCXX_USE_CXX11_ABI))') 14 | INCLUDE_DIRS := ./ $(CUDA_DIR)/include 15 | INCLUDE_DIRS += $(PYTHON_HEADER_DIR) 16 | INCLUDE_DIRS += $(PYTORCH_INCLUDES) 17 | 18 | # Custom (MKL/ATLAS/OpenBLAS) include and lib directories. 
19 | # BLAS_INCLUDE := /path/to/your/blas 20 | # BLAS_LIB := /path/to/your/blas 21 | 22 | SRC_DIR := ./src 23 | OBJ_DIR := ./obj 24 | CPP_SRCS := $(wildcard $(SRC_DIR)/*.cpp) 25 | CU_SRCS := $(wildcard $(SRC_DIR)/*.cu) 26 | OBJS := $(patsubst $(SRC_DIR)/%.cpp,$(OBJ_DIR)/%.o,$(CPP_SRCS)) 27 | CU_OBJS := $(patsubst $(SRC_DIR)/%.cu,$(OBJ_DIR)/cuda/%.o,$(CU_SRCS)) 28 | #STATIC_LIB := $(OBJ_DIR)/libquant_impl.a 29 | STATIC_LIB := $(OBJ_DIR)/libquant_cuda.a 30 | 31 | 32 | CUDA_ARCH := -gencode arch=compute_50,code=sm_50 \ 33 | -gencode arch=compute_52,code=sm_52 \ 34 | -gencode arch=compute_60,code=sm_60 \ 35 | -gencode arch=compute_61,code=sm_61 \ 36 | -gencode arch=compute_70,code=sm_70 \ 37 | -gencode arch=compute_75,code=sm_75 \ 38 | -gencode arch=compute_75,code=compute_75 39 | 40 | 41 | LIBRARIES += stdc++ cudart c10 caffe2 torch torch_python caffe2_gpu 42 | 43 | 44 | ifeq ($(DEBUG), 1) 45 | COMMON_FLAGS += -DDEBUG -g -O0 46 | NVCCFLAGS += -g -G # -rdc true 47 | else 48 | COMMON_FLAGS += -DNDEBUG -O3 49 | endif 50 | 51 | WARNINGS := -Wall -Wno-sign-compare -Wcomment 52 | INCLUDE_DIRS += $(BLAS_INCLUDE) 53 | CXXFLAGS += -MMD -MP 54 | COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) \ 55 | -DTORCH_API_INCLUDE_EXTENSION_H -D_GLIBCXX_USE_CXX11_ABI=$(WITH_ABI) 56 | CXXFLAGS += -pthread -fPIC -fwrapv -std=c++14 $(COMMON_FLAGS) $(WARNINGS) 57 | NVCCFLAGS += -std=c++14 -ccbin=$(CXX) -Xcompiler -fPIC -use_fast_math $(COMMON_FLAGS) 58 | 59 | default: $(STATIC_LIB) 60 | 61 | $(OBJ_DIR): 62 | @ mkdir -p $@ 63 | @ mkdir -p $@/cuda 64 | 65 | $(OBJ_DIR)/%.o: $(SRC_DIR)/%.cpp | $(OBJ_DIR) 66 | @ echo CXX $< 67 | $(Q)$(CXX) $< $(CXXFLAGS) -c -o $@ 68 | 69 | $(OBJ_DIR)/cuda/%.o: $(SRC_DIR)/%.cu | $(OBJ_DIR) 70 | @ echo NVCC $< 71 | $(Q)nvcc $(NVCCFLAGS) $(CUDA_ARCH) -M $< -o ${@:.o=.d} \ 72 | -odir $(@D) 73 | $(Q)nvcc $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@ 74 | 75 | $(STATIC_LIB): $(OBJS) $(CU_OBJS) | $(OBJ_DIR) 76 | $(RM) -f $(STATIC_LIB) 77 | $(RM) -rf build dist 78 | @ echo LD -o $@ 79 | ar rc $(STATIC_LIB) $(OBJS) $(CU_OBJS) 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | build: 88 | $(python) setup.py build 89 | 90 | upload: 91 | $(python) setup.py sdist bdist_wheel 92 | 93 | clean: 94 | $(RM) -rf build dist ncnnqat.egg-info 95 | 96 | test: 97 | nosetests -s tests/test_merge_freeze_bn.py --nologcapture 98 | 99 | lint: 100 | pylint ncnnqat --reports=n 101 | 102 | lintfull: 103 | pylint ncnnqat 104 | 105 | install: 106 | $(python) setup.py install 107 | 108 | uninstall: 109 | $(python) setup.py install --record install.log 110 | cat install.log | xargs rm -rf 111 | $(RM) install.log 112 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # ncnnqat 4 | 5 | ncnnqat is a quantization-aware training (QAT) package for NCNN on PyTorch. 6 | 7 |
8 | 9 | ## Table of Contents 10 | 11 | - [ncnnqat](#ncnnqat) 12 | - [Table of Contents](#table-of-contents) 13 | - [Installation](#installation) 14 | - [Usage](#usage) 15 | - [Code Examples](#code-examples) 16 | - [Results](#results) 17 | - [Todo](#todo) 18 | 19 | 20 |
21 | 22 | ## Installation 23 | 24 | * Supported Platforms: Linux 25 | * Accelerators and GPUs: NVIDIA GPUs via CUDA driver ***10.1***. 26 | * Dependencies: 27 | * python >= 3.5, < 4 28 | * pytorch >= 1.6 29 | * numpy >= 1.18.1 30 | * onnx >= 1.7.0 31 | * onnx-simplifier >= 0.3.6 32 | 33 | * Install ncnnqat from PyPI: 34 | ```shell 35 | $ pip install ncnnqat (to do....) 36 | ``` 37 | Installing from source is currently recommended. 38 | * or install ncnnqat from the repository: 39 | ```shell 40 | $ git clone https://github.com/ChenShisen/ncnnqat 41 | $ cd ncnnqat 42 | $ make install 43 | ``` 44 | 45 |
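After `make install`, you can optionally sanity-check that the CUDA extension was built and is importable. A minimal sketch (it assumes a visible NVIDIA GPU; `quant_cuda` is the extension module name used by this repo's `setup.py`):

```python
# Quick post-install check: the Python package and the compiled CUDA op should both import.
import torch
import quant_cuda                      # compiled from src/fake_quantize.cpp / .cu
from ncnnqat import register_quantization_hook, merge_freeze_bn, save_table

print(torch.cuda.is_available())             # expected: True
print(hasattr(quant_cuda, "fake_quantize"))  # expected: True (op bound in fake_quantize.cpp)
```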
46 | 47 | ## Usage 48 | 49 | 50 | * register_quantization_hook and merge_freeze_bn 51 | 52 | (Fine-tuning from a well-trained model is suggested; otherwise, enable quantization only after a few epochs of float training.) 53 | 54 | ```python 55 | from ncnnqat import unquant_weight, merge_freeze_bn, register_quantization_hook 56 | ... 57 | ... 58 | for epoch in range(epoch_train): 59 | model.train() 60 | if epoch==well_epoch: 61 | register_quantization_hook(model) 62 | if epoch>=well_epoch: 63 | model = merge_freeze_bn(model) # switches BN to eval() mode during training 64 | ... 65 | ``` 66 | 67 | * Unquantize weights before updating them 68 | 69 | ```python 70 | ... 71 | ... 72 | if epoch>=well_epoch: 73 | model.apply(unquant_weight) # restore the original float weights for the optimizer update 74 | optimizer.step() 75 | ... 76 | ``` 77 | 78 | * Save the weights and the NCNN quantization table after training 79 | 80 | 81 | ```python 82 | ... 83 | ... 84 | onnx_path = "./xxx/model.onnx" 85 | table_path="./xxx/model.table" 86 | dummy_input = torch.randn(1, 3, img_size, img_size, device='cuda') 87 | input_names = [ "input" ] 88 | output_names = [ "fc" ] 89 | torch.onnx.export(model, dummy_input, onnx_path, verbose=False, input_names=input_names, output_names=output_names) 90 | save_table(model,onnx_path=onnx_path,table=table_path) 91 | 92 | ... 93 | ``` 94 | If you wrap the model with `model = nn.DataParallel(model)`, PyTorch cannot export it with `torch.onnx.export` directly. Save the `state_dict` first, load it into a fresh single-GPU model, and then export the ONNX model. 95 | 96 | ```python 97 | ... 98 | ... 99 | model_s = new_net() # a fresh single-GPU instance of the network 100 | model_s.cuda() 101 | register_quantization_hook(model_s) 102 | #model_s = merge_freeze_bn(model_s) 103 | onnx_path = "./xxx/model.onnx" 104 | table_path="./xxx/model.table" 105 | dummy_input = torch.randn(1, 3, img_size, img_size, device='cuda') 106 | input_names = [ "input" ] 107 | output_names = [ "fc" ] 108 | model_s.load_state_dict({k.replace('module.',''):v for k,v in model.state_dict().items()}) # strip the 'module.' prefix added by nn.DataParallel 109 | 110 | torch.onnx.export(model_s, dummy_input, onnx_path, verbose=False, input_names=input_names, output_names=output_names) 111 | save_table(model_s,onnx_path=onnx_path,table=table_path) 112 | 113 | 114 | ... 115 | ``` 116 | A consolidated end-to-end sketch is shown below. 117 | 118 |
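Putting the fragments above together, a minimal end-to-end training loop could look like the following sketch. It is illustrative only: `Net`, `train_loader`, `epoch_train`, `well_epoch`, and `img_size` are placeholders for your own model, data, and schedule, not part of this package.

```python
import torch
from ncnnqat import unquant_weight, merge_freeze_bn, register_quantization_hook, save_table

model = Net().cuda()                                    # placeholder: your own network
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

for epoch in range(epoch_train):
    model.train()
    if epoch == well_epoch:                             # start QAT once the float model is reasonably trained
        register_quantization_hook(model)
    if epoch >= well_epoch:
        model = merge_freeze_bn(model)                  # fold BN into conv and keep BN in eval() mode
    for data, target in train_loader:
        optimizer.zero_grad()
        loss = criterion(model(data.cuda()), target.cuda())
        loss.backward()
        if epoch >= well_epoch:
            model.apply(unquant_weight)                 # restore float weights before the optimizer update
        optimizer.step()

# After training: export ONNX and write the NCNN quantization table.
dummy_input = torch.randn(1, 3, img_size, img_size, device='cuda')
torch.onnx.export(model, dummy_input, "model.onnx", verbose=False,
                  input_names=["input"], output_names=["fc"])
save_table(model, onnx_path="model.onnx", table="model.table")
```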
119 | 120 | ## Code Examples 121 | 122 | CIFAR-10 quantization-aware training example: 123 | 124 | ```python tests/test_cifar10.py``` 125 | 126 | SSD300 quantization-aware training example: 127 | 128 | ``` 129 | ln -s /your_coco_path/coco ./tests/ssd300/data 130 | ``` 131 | ``` 132 | python -m torch.distributed.launch \ 133 | --nproc_per_node=4 \ 134 | --nnodes=1 \ 135 | --node_rank=0 \ 136 | ./tests/ssd300/main.py \ 137 | -d ./tests/ssd300/data/coco 138 | ``` 139 | ``` 140 | python ./tests/ssd300/main.py --onnx_save # load the model state dict, export the ONNX model and the NCNN table 141 | ``` 142 | 143 |
144 | 145 | ## Results 146 | 147 | * Cifar10 148 | 149 | 150 | result: 151 | 152 | | net | fp32(onnx) | ncnnqat | ncnn aciq | ncnn kl | 153 | | -------- | -------- | -------- | -------- | -------- | 154 | | mobilenet_v2 | 0.91 | 0.9066 | 0.9033 | 0.9066 | 155 | | resnet18 | 0.94 | 0.93333 | 0.9367 | 0.937| 156 | 157 | 158 | * SSD300(resnet18|coco) 159 | 160 | 161 | ``` 162 | fp32: 163 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.193 164 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.344 165 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.191 166 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.042 167 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.195 168 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.328 169 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.199 170 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.293 171 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.309 172 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.084 173 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.326 174 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.501 175 | Current AP: 0.19269 176 | 177 | ncnnqat: 178 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.192 179 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.342 180 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.194 181 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.041 182 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.194 183 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.327 184 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.197 185 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.291 186 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.307 187 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.082 188 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.325 189 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.497 190 | Current AP: 0.19202 191 | ``` 192 | 193 | 194 |
195 | 196 | ## Todo 197 | 198 | .... 199 | -------------------------------------------------------------------------------- /build_helper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import subprocess 4 | import sys 5 | import tempfile 6 | from distutils import ccompiler 7 | 8 | 9 | def print_warning(*lines): 10 | print('**************************************************') 11 | for line in lines: 12 | print('*** WARNING: %s' % line) 13 | print('**************************************************') 14 | 15 | 16 | def get_path(key): 17 | return os.environ.get(key, '').split(os.pathsep) 18 | 19 | 20 | def search_on_path(filenames): 21 | for p in get_path('PATH'): 22 | for filename in filenames: 23 | full = os.path.join(p, filename) 24 | if os.path.exists(full): 25 | return os.path.abspath(full) 26 | return None 27 | 28 | 29 | minimum_cuda_version = 10010 30 | maxinum_cuda_version = 10030 31 | minimum_cudnn_version = 7000 32 | 33 | 34 | def get_compiler_setting(): 35 | nvcc_path = search_on_path(('nvcc', 'nvcc.exe')) 36 | cuda_path_default = None 37 | if nvcc_path is None: 38 | print_warning('nvcc not in path.', 'Please set path to nvcc.') 39 | else: 40 | cuda_path_default = os.path.normpath( 41 | os.path.join(os.path.dirname(nvcc_path), '..')) 42 | 43 | cuda_path = os.environ.get('CUDA_PATH', '') # Nvidia default on Windows 44 | if len(cuda_path) > 0 and cuda_path != cuda_path_default: 45 | print_warning('nvcc path != CUDA_PATH', 46 | 'nvcc path: %s' % cuda_path_default, 47 | 'CUDA_PATH: %s' % cuda_path) 48 | 49 | if not os.path.exists(cuda_path): 50 | cuda_path = cuda_path_default 51 | 52 | if not cuda_path and os.path.exists('/usr/local/cuda'): 53 | cuda_path = '/usr/local/cuda' 54 | 55 | include_dirs = [] 56 | library_dirs = [] 57 | define_macros = [] 58 | 59 | if cuda_path: 60 | include_dirs.append(os.path.join(cuda_path, 'include')) 61 | if sys.platform == 'win32': 62 | library_dirs.append(os.path.join(cuda_path, 'bin')) 63 | library_dirs.append(os.path.join(cuda_path, 'lib', 'x64')) 64 | else: 65 | library_dirs.append(os.path.join(cuda_path, 'lib64')) 66 | library_dirs.append(os.path.join(cuda_path, 'lib')) 67 | if sys.platform == 'darwin': 68 | library_dirs.append('/usr/local/cuda/lib') 69 | 70 | return { 71 | 'include_dirs': include_dirs, 72 | 'library_dirs': library_dirs, 73 | 'define_macros': define_macros, 74 | 'language': 'c++', 75 | } 76 | 77 | 78 | def check_cuda_version(): 79 | compiler = ccompiler.new_compiler() 80 | settings = get_compiler_setting() 81 | try: 82 | out = build_and_run(compiler, 83 | ''' 84 | #include 85 | #include 86 | int main(int argc, char* argv[]) { 87 | printf("%d", CUDA_VERSION); 88 | return 0; 89 | } 90 | ''', 91 | include_dirs=settings['include_dirs']) 92 | 93 | except Exception as e: 94 | print_warning('Cannot check CUDA version', str(e)) 95 | return False 96 | 97 | cuda_version = int(out) 98 | if cuda_version < minimum_cuda_version: 99 | print_warning('CUDA version is too old: %d' % cuda_version, 100 | 'CUDA v10.1 or CUDA v10.2 is required') 101 | return False 102 | if cuda_version > maxinum_cuda_version: 103 | print_warning('CUDA version is too new: %d' % cuda_version, 104 | 'CUDA v10.1 or CUDA v10.2 is required') 105 | 106 | return True 107 | 108 | 109 | def check_cudnn_version(): 110 | compiler = ccompiler.new_compiler() 111 | settings = get_compiler_setting() 112 | try: 113 | out = build_and_run(compiler, 114 | ''' 115 | #include 116 | #include 117 | int main(int argc, 
char* argv[]) { 118 | printf("%d", CUDNN_VERSION); 119 | return 0; 120 | } 121 | ''', 122 | include_dirs=settings['include_dirs']) 123 | 124 | except Exception as e: 125 | print_warning('Cannot check cuDNN version\n{0}'.format(e)) 126 | return False 127 | 128 | cudnn_version = int(out) 129 | if cudnn_version < minimum_cudnn_version: 130 | print_warning('cuDNN version is too old: %d' % cudnn_version, 131 | 'cuDNN v7 or newer is required') 132 | return False 133 | 134 | return True 135 | 136 | 137 | def build_and_run(compiler, 138 | source, 139 | libraries=(), 140 | include_dirs=(), 141 | library_dirs=()): 142 | temp_dir = tempfile.mkdtemp() 143 | 144 | try: 145 | fname = os.path.join(temp_dir, 'a.cpp') 146 | with open(fname, 'w') as f: 147 | f.write(source) 148 | 149 | objects = compiler.compile([fname], 150 | output_dir=temp_dir, 151 | include_dirs=include_dirs) 152 | 153 | try: 154 | postargs = ['/MANIFEST'] if sys.platform == 'win32' else [] 155 | compiler.link_executable(objects, 156 | os.path.join(temp_dir, 'a'), 157 | libraries=libraries, 158 | library_dirs=library_dirs, 159 | extra_postargs=postargs, 160 | target_lang='c++') 161 | except Exception as e: 162 | msg = 'Cannot build a stub file.\nOriginal error: {0}'.format(e) 163 | raise Exception(msg) 164 | 165 | try: 166 | out = subprocess.check_output(os.path.join(temp_dir, 'a')) 167 | return out 168 | 169 | except Exception as e: 170 | msg = 'Cannot execute a stub file.\nOriginal error: {0}'.format(e) 171 | raise Exception(msg) 172 | 173 | finally: 174 | shutil.rmtree(temp_dir, ignore_errors=True) 175 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/build_helper.rst: -------------------------------------------------------------------------------- 1 | build\_helper module 2 | ==================== 3 | 4 | .. automodule:: build_helper 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | import os 4 | import sys 5 | sys.path.insert(0, os.path.abspath('./../../')) 6 | 7 | 8 | # -- Project information ----------------------------------------------------- 9 | 10 | project = 'ncnnqat' 11 | copyright = '2021, Shisen Chen' 12 | author = 'Shisen Chen' 13 | 14 | # The short X.Y version 15 | version = '' 16 | # The full version, including alpha/beta/rc tags 17 | release = '0.1.0' 18 | 19 | 20 | # -- General configuration --------------------------------------------------- 21 | 22 | # If your documentation needs a minimal Sphinx version, state it here. 23 | # 24 | # needs_sphinx = '1.0' 25 | 26 | # Add any Sphinx extension module names here, as strings. They can be 27 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 28 | # ones. 29 | extensions = [ 30 | 'sphinx.ext.todo', 31 | 'sphinx.ext.githubpages', 32 | 'sphinx.ext.autodoc', 33 | ] 34 | 35 | # Add any paths that contain templates here, relative to this directory. 36 | templates_path = ['_templates'] 37 | 38 | # The suffix(es) of source filenames. 39 | # You can specify multiple suffix as a list of string: 40 | # 41 | # source_suffix = ['.rst', '.md'] 42 | source_suffix = '.rst' 43 | 44 | # The master toctree document. 45 | master_doc = 'index' 46 | 47 | # The language for content autogenerated by Sphinx. Refer to documentation 48 | # for a list of supported languages. 49 | # 50 | # This is also used if you do content translation via gettext catalogs. 51 | # Usually you set "language" from the command line for these cases. 52 | language = None 53 | 54 | # List of patterns, relative to source directory, that match files and 55 | # directories to ignore when looking for source files. 56 | # This pattern also affects html_static_path and html_extra_path . 57 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 58 | 59 | # The name of the Pygments (syntax highlighting) style to use. 60 | pygments_style = 'sphinx' 61 | 62 | 63 | # -- Options for HTML output ------------------------------------------------- 64 | 65 | # The theme to use for HTML and HTML Help pages. See the documentation for 66 | # a list of builtin themes. 67 | # 68 | 69 | # Theme options are theme-specific and customize the look and feel of a theme 70 | # further. For a list of options available for each theme, see the 71 | # documentation. 72 | # 73 | # html_theme_options = {} 74 | 75 | # Add any paths that contain custom static files (such as style sheets) here, 76 | # relative to this directory. They are copied after the builtin static files, 77 | # so a file named "default.css" will overwrite the builtin "default.css". 
78 | html_static_path = ['_static'] 79 | 80 | # Custom sidebar templates, must be a dictionary that maps document names 81 | # to template names. 82 | # 83 | # The default sidebars (for documents that don't match any pattern) are 84 | # defined by theme itself. Builtin themes are using these templates by 85 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 86 | # 'searchbox.html']``. 87 | # 88 | # html_sidebars = {} 89 | html_theme = 'sphinx_rtd_theme' 90 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. ncnnqat documentation master file, created by 2 | sphinx-quickstart on Fri Aug 21 03:52:34 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to ncnnqat's documentation! 7 | =================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | 14 | 15 | Indices and tables 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | -------------------------------------------------------------------------------- /docs/source/ncnnqat.rst: -------------------------------------------------------------------------------- 1 | ncnnqat package 2 | =============== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | ncnnqat 10 | 11 | Module contents 12 | --------------- 13 | 14 | .. automodule:: ncnnqat 15 | :members: 16 | :undoc-members: 17 | :show-inheritance: 18 | -------------------------------------------------------------------------------- /docs/source/setup.rst: -------------------------------------------------------------------------------- 1 | setup module 2 | ============ 3 | 4 | .. automodule:: setup 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /ncnnqat/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | try: 3 | from .quantize import unquant_weight, freeze_bn, \ 4 | merge_freeze_bn, register_quantization_hook,save_table 5 | except: 6 | raise 7 | __all__ = [ 8 | "unquant_weight", "freeze_bn", "merge_freeze_bn", \ 9 | "register_quantization_hook","save_table"] 10 | 11 | -------------------------------------------------------------------------------- /ncnnqat/quantize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import logging 4 | import torch 5 | import numpy as np 6 | import onnx 7 | 8 | from quant_cuda import fake_quantize 9 | 10 | class FakeQuantCuda(): 11 | r""" 12 | """ 13 | def __init__(self, 14 | bit_width=8, 15 | type=1, 16 | c=1 17 | ): 18 | 19 | self._bit_width = bit_width 20 | self._type = type 21 | self._c = c 22 | 23 | 24 | def __call__(self, tensor,tensor_scale,tensor_movMax=None, aciq=0): #type=0,1,2=pre_conv_activate,w,after_conv_activate 25 | r""" Converts float weights to quantized weights. 
26 | 27 | Args: 28 | - tensor: input data (weight or activation tensor). 29 | - tensor_scale: quantization scale of the tensor (updated from the kernel output). 30 | - tensor_movMax: moving maximum of the tensor (used for activations, type 0). 31 | - aciq: use the ACIQ clipping method when set; off by default (max-based scaling is used). 32 | """ 33 | 34 | #print(self._type,self._bit_width) 35 | #tensor.data = fake_quantize_c(tensor.data.detach().clone(),tensor_s.data.detach().clone(),self._bit_width,self._type) 36 | 37 | out = fake_quantize(tensor.data.detach().clone(),self._bit_width,self._type,self._c,aciq) 38 | tensor.data = out[0] 39 | tensor_scale.data = out[1] 40 | if self._type==0: 41 | tensor_movMax.data = out[2] 42 | #print("tensor_scale",tensor_scale) 43 | 44 | return tensor,tensor_scale,tensor_movMax 45 | 46 | 47 | 48 | 49 | def _fuse_conv_bn_weights(conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b): 50 | """ Fuse convolution and batch norm weights. 51 | 52 | Args: 53 | conv_w (torch.nn.Parameter): convolution weight. 54 | conv_b (torch.nn.Parameter): convolution bias. 55 | bn_rm (torch.nn.Parameter): batch norm running mean. 56 | bn_rv (torch.nn.Parameter): batch norm running variance. 57 | bn_eps (torch.nn.Parameter): batch norm epsilon. 58 | bn_w (torch.nn.Parameter): batch norm weight. 59 | bn_b (torch.nn.Parameter): batch norm bias. 60 | 61 | Returns: 62 | conv_w(torch.nn.Parameter): fused convolution weight. 63 | conv_b(torch.nn.Parameter): fused convolution bias. 64 | """ 65 | 66 | if conv_b is None: 67 | conv_b = bn_rm.new_zeros(bn_rm.shape) 68 | bn_var_rsqrt = torch.rsqrt(bn_rv + bn_eps) 69 | 70 | conv_w = conv_w * \ 71 | (bn_w * bn_var_rsqrt).reshape([-1] + [1] * (len(conv_w.shape) - 1)) 72 | conv_b = (conv_b - bn_rm) * bn_var_rsqrt * bn_w + bn_b 73 | 74 | return torch.nn.Parameter(conv_w), torch.nn.Parameter(conv_b) 75 | 76 | 77 | def _fuse_conv_bn(conv, bn): 78 | conv.weight, conv.bias = \ 79 | _fuse_conv_bn_weights(conv.weight, conv.bias, 80 | bn.running_mean, bn.running_var, bn.eps, bn.weight, bn.bias) 81 | return conv 82 | 83 | 84 | def _fuse_modules(model): 85 | r"""Fuses conv/bn module pairs at the weight level. 86 | 87 | Fuses only the following sequence of modules: 88 | conv, bn 89 | All other sequences are left unchanged. 90 | For these sequences, modules are fused at the weight level; the model structure is kept unchanged. 91 | 92 | Arguments: 93 | model: Model containing the modules to be fused 94 | 95 | Returns: 96 | model with fused modules.
97 | 98 | """ 99 | children = list(model.named_children()) 100 | conv_module = None 101 | conv_name = None 102 | 103 | for name, child in children: 104 | if isinstance(child, (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, 105 | torch.nn.BatchNorm3d)): 106 | if isinstance(conv_module, (torch.nn.Conv2d, torch.nn.Conv3d)): 107 | conv_module = _fuse_conv_bn(conv_module, child) 108 | model._modules[conv_name] = conv_module 109 | child.eval() 110 | child.running_mean = child.running_mean.new_full( 111 | child.running_mean.shape, 0) 112 | child.running_var = child.running_var.new_full( 113 | child.running_var.shape, 1) 114 | 115 | if child.weight is not None: 116 | child.weight.data = child.weight.data.new_full( 117 | child.weight.shape, 1) 118 | if child.bias is not None: 119 | child.bias.data = child.bias.data.new_full( 120 | child.bias.shape, 0) 121 | #print(child,child.bias) 122 | child.track_running_stats = False 123 | child.momentum = 0 124 | child.eps = 0 125 | #child.affine = False 126 | conv_module = None 127 | elif isinstance(child, (torch.nn.Conv2d, torch.nn.Conv3d)): 128 | conv_module = child 129 | conv_name = name 130 | else: 131 | _fuse_modules(child) 132 | return model 133 | 134 | 135 | def freeze_bn(m, freeze_bn_affine=True): 136 | """Freeze batch normalization. 137 | reference: https://arxiv.org/abs/1806.08342 138 | 139 | 140 | Args: 141 | - m (nn.module): torch module 142 | - freeze_bn_affine (bool, optional): Freeze affine scale and 143 | translation factor or not. Defaults: True. 144 | """ 145 | 146 | if isinstance( 147 | m, 148 | (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d)): 149 | 150 | m.eval() 151 | if freeze_bn_affine: 152 | m.weight.requires_grad = False 153 | m.bias.requires_grad = False 154 | 155 | 156 | def merge_freeze_bn(model): 157 | """merge batch norm's weight into convolution, then freeze it. 158 | 159 | Args: 160 | model (nn.module): model. 161 | 162 | Returns: 163 | [nn.module]: model. 164 | """ 165 | model = _fuse_modules(model) #merge conv bn ; mean 0 std 1 gama 1 beta 0 166 | model.apply(freeze_bn) # bn backward = false,bn not train 167 | return model 168 | 169 | 170 | def unquant_weight(m): 171 | """ unquantize weight before update weight, avoid training turbulence. 172 | 173 | Args: 174 | - m (nn.module): torch module. 175 | """ 176 | try: 177 | if hasattr(m, "weight_origin") and m.weight is not None: 178 | m.weight.data.copy_(m.weight_origin.data) 179 | except AttributeError: 180 | pass 181 | except TypeError: 182 | pass 183 | 184 | ''' 185 | def quant_dequant_weight(m): 186 | """ quant weight manually. 187 | 188 | Args: 189 | - m (nn.module): torch module. 190 | """ 191 | quant_handle = FakeQuantCuda() 192 | try: 193 | if hasattr(m, "weight_origin") and m.weight is not None: 194 | m.weight_origin.data.copy_(m.weight.data) 195 | m.weight.data = quant_handle(m.weight.data.detach().clone()) 196 | except AttributeError: 197 | pass 198 | except TypeError: 199 | pass 200 | ''' 201 | 202 | def _quantizing_activation_ncnn(module, input): 203 | """ quantize per-layer activation(input of layer) before layer calculate. 204 | 205 | Args: 206 | - module (nn.module): torch module. 207 | - input : layer input(tuple) ,torch tensor (nchw or n**). 
208 | """ 209 | #GOOGLE QAT movMax = movMax*momenta + max(abs(tensor))*(1-momenta) momenta = 0.95 210 | #print("input.shape",input[0].shape) 211 | aciq = 0 212 | quant_handle = FakeQuantCuda(type=0,bit_width=8,c=1) 213 | list_modified = [] 214 | if isinstance(input, tuple): 215 | for item in input: 216 | aciq = 0 217 | item_type = item.dtype 218 | if item.numel()/item.shape[0]>8000: 219 | aciq = 1 220 | #quant_tuple = quant_handle(item.float(),module.activation_scale.data.detach().clone()) 221 | quant_tuple = quant_handle(item.float(),module.activation_scale.data.detach().clone(),tensor_movMax=module.activation_movMax.data.detach().clone(),aciq=aciq) 222 | item = quant_tuple[0] 223 | if item.dtype!=item_type: 224 | #print(item.dtype,item_type) 225 | item.to(item_type) 226 | module.activation_scale.data = quant_tuple[1] 227 | module.activation_movMax.data = quant_tuple[2] 228 | #print(quant_tuple[2]) 229 | list_modified.append(item) 230 | 231 | else: 232 | input_type = input.dtype 233 | if input.numel()/input.shape[0]>8000: 234 | aciq = 1 235 | #quant_tuple = quant_handle(input.float(),module.activation_scale.data.detach().clone()) 236 | quant_tuple = quant_handle(input.float(),module.activation_scale.data.detach().clone(),tensor_movMax=module.activation_movMax.data.detach().clone(),aciq=aciq) 237 | input = quant_tuple[0] 238 | module.activation_scale.data = quant_tuple[1] 239 | module.activation_movMax.data = quant_tuple[2] 240 | if input.dtype!=input_type: 241 | input.to(input_type) 242 | list_modified.append(input) 243 | tuple_input = tuple(list_modified) 244 | return tuple_input 245 | def _quantizing_weight_ncnn(module, input): 246 | """ quantize per-channel weight before layer calculate. 247 | 248 | Args: 249 | - module (nn.module): torch module. 250 | - input : layer input(tuple) ,torch tensor (nchw or n**). 251 | """ 252 | module_shape = module.weight.shape 253 | #print("module_shape",module_shape) 254 | channel = module_shape[0] #oikk 255 | if isinstance(module,(torch.nn.Conv2d)) and module.groups!=1: #depthwise 256 | channel = module.groups 257 | bit_width = 8 258 | if isinstance(module,(torch.nn.Conv2d)) and module.stride==(1,1) and module.dilation==(1,1) and module.kernel_size==(3,3) and module.groups==1: #winnograd f(4,3) 259 | bit_width=6 260 | 261 | aciq = 0 262 | weight_numel = module.weight.numel() 263 | if weight_numel/channel>8000: #when > 8000 , max_var > threshold 264 | aciq = 1 265 | #print("aciq",aciq,module) 266 | 267 | 268 | quant_handle = FakeQuantCuda(type=1,bit_width=bit_width,c=channel) 269 | # print("quantizing weight.") 270 | # print(module.weight[0][0][0]) 271 | module.weight_origin.data.copy_(module.weight.data) #copy float data to a new place 272 | 273 | quant_tuple = quant_handle(module.weight.data.detach().clone(),module.weight_scale.data.detach().clone(),aciq=aciq)#把原始数据 quant——dequant 此时数据是有损的,计算损失后,把备份数据考回原处做梯度计算 274 | module.weight.data = quant_tuple[0] 275 | module.weight_scale.data = quant_tuple[1] 276 | # print(module.weight[0][0][0]) 277 | #print(module.weight_scale) 278 | 279 | 280 | def register_quantization_hook(model, 281 | quant_weight=True, 282 | quant_activation=True, 283 | ): 284 | """register quantization hook for model. 285 | 286 | Args: 287 | model (:class:`Module`): Module. 288 | 289 | Returns: 290 | Module: self 291 | """ 292 | 293 | # weight quantizing. 
294 | logger = logging.getLogger(__name__) 295 | logger.setLevel(logging.INFO) 296 | 297 | for _, module in model._modules.items(): 298 | #print("module",module) 299 | if len(list(module.children())) > 0: 300 | register_quantization_hook(module, quant_weight, quant_activation) 301 | else: 302 | if quant_weight and hasattr(module,"weight") and module.weight is not None and isinstance( 303 | module, (torch.nn.Conv2d,torch.nn.Linear)): 304 | module.register_buffer('weight_origin', module.weight.detach().clone()) #数据备份空间 305 | #module.register_buffer("weight_scale", torch.ones([1,model._modules["conv1"].weight.shape[0]], dtype=torch.float).cuda()) #weight scale 306 | #module.register_buffer("weight_scale", torch.ones([1,module.weight.shape[0]], dtype=torch.float).cuda()) #weight scale module.weight.shape =[o,i,k,k] 307 | module.register_buffer("weight_scale", torch.ones([module.weight.shape[0]], dtype=torch.float).cuda()) #weight scale module.weight.shape =[o,i,k,k] 308 | 309 | 310 | module.register_forward_pre_hook(_quantizing_weight_ncnn) 311 | logger.info("Quantizing weight of %s", str(module)) 312 | 313 | 314 | module.register_buffer("activation_scale", torch.tensor([1], dtype=torch.float).cuda()) 315 | module.register_buffer("activation_movMax", torch.tensor([1], dtype=torch.float).cuda()) 316 | #module.register_buffer("activation_momenta", torch.tensor([1], dtype=torch.float).cuda()) 317 | module.register_forward_pre_hook(_quantizing_activation_ncnn) 318 | logger.info("Quantizing activation of %s", str(module)) 319 | 320 | return model 321 | 322 | def save_table(torch_model,onnx_path="model.onnx",table="model.table"): 323 | f = open(table,"w",encoding='utf8') 324 | static_dict_org = torch_model.state_dict() 325 | static_dict = {k.replace('module.',''):v for k,v in static_dict_org.items()} 326 | 327 | 328 | model = onnx.load(onnx_path) 329 | node = model.graph.node 330 | node_num = len(node) 331 | 332 | tail_layer = "_param_0" 333 | split_char = " " 334 | tab_char = "\n" 335 | tail_len = 6 336 | for each in range(node_num): 337 | if node[each].op_type not in ["Conv","Gemm"]: 338 | continue 339 | #print(node[each].op_type) 340 | pre_name = node[each].input[1] 341 | #print(pre_name) 342 | #print(pre_name.replace(pre_name.split(".")[-1],"weight_scale")) 343 | scale_data = static_dict[pre_name.replace(pre_name.split(".")[-1],"weight_scale")] 344 | list_scale = scale_data.cpu().numpy().flatten().tolist() 345 | #print(node[each].name,node[each].op_type,node[each].input) 346 | f.write(node[each].name + tail_layer) 347 | for d in list_scale: 348 | d = float(d) 349 | f.write(split_char + "{:.6f}".format(d)) 350 | f.write(tab_char) 351 | for each in range(node_num): 352 | if node[each].op_type not in ["Conv","Gemm"]: 353 | continue 354 | pre_name = node[each].input[1] 355 | scale_data = static_dict[pre_name.replace(pre_name.split(".")[-1],"activation_scale")] 356 | list_scale = scale_data.cpu().numpy().flatten().tolist() 357 | #print(node[each].name,node[each].op_type,node[each].input) 358 | f.write(node[each].name) 359 | for d in list_scale: 360 | d = float(d) 361 | f.write(split_char + "{:.6f}".format(d)) 362 | 363 | f.write(tab_char) 364 | f.close() 365 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=40.8.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 
-------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | license_files = LICENSE.txt 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import pathlib 3 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 4 | 5 | from build_helper import check_cuda_version 6 | assert(check_cuda_version()) 7 | 8 | import os 9 | os.system('make -j%d' % os.cpu_count()) 10 | 11 | here = pathlib.Path(__file__).parent.resolve() 12 | long_description = (here / 'README.md').read_text(encoding='utf-8') 13 | 14 | setup( 15 | name='ncnnqat', 16 | version='0.1.0', 17 | description='A ncnn quantization aware training tool on pytorch.', 18 | long_description=long_description, 19 | long_description_content_type='text/markdown', 20 | url='https://github.com/ChenShisen/ncnnqat', 21 | author='Shisen Chen', 22 | author_email='napoleo54css@gmail.com', 23 | license='MIT', 24 | classifiers=[ 25 | 'Development Status :: 5 - Production/Stable', 26 | "Intended Audience :: Science/Research", 27 | 'Intended Audience :: Developers', 28 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 29 | "Topic :: Software Development :: Libraries :: Python Modules", 30 | 'License :: OSI Approved :: MIT License', 31 | 'Programming Language :: Python :: 3', 32 | 'Programming Language :: Python :: 3.5', 33 | 'Programming Language :: Python :: 3.6', 34 | 'Programming Language :: Python :: 3.7', 35 | 'Programming Language :: Python :: 3.8', 36 | 'Programming Language :: Python :: 3 :: Only', 37 | ], 38 | keywords=[ 39 | "ncnn" 40 | "quantization aware training", 41 | "deep learning", 42 | "neural network", 43 | "CNN", 44 | "machine learning", 45 | ], 46 | packages=find_packages(), 47 | 48 | python_requires='>=3.5, <4', 49 | install_requires=[ 50 | "torch >= 1.5", 51 | "numpy >= 1.18.1", 52 | "onnx >= 1.7.0", 53 | "onnx-simplifier >= 0.3.6" 54 | ], 55 | extras_require={ 56 | 'test': ["torchvision>=0.4", 57 | "nose", 58 | "ddt" 59 | ], 60 | 'docs': [ 61 | 'sphinx==2.4.4', 62 | 'sphinx_rtd_theme' 63 | ] 64 | }, 65 | ext_modules=[ 66 | CUDAExtension( 67 | #name="quant_impl", 68 | name="quant_cuda", 69 | sources=[ 70 | "./src/fake_quantize.cpp", 71 | ], 72 | libraries=['quant_cuda'], 73 | library_dirs=['obj'], 74 | ) 75 | ], 76 | cmdclass={'build_ext': BuildExtension}, 77 | #test_suite="ncnnqat.test.test_cifar10", 78 | ) 79 | -------------------------------------------------------------------------------- /src/fake_quantize.cpp: -------------------------------------------------------------------------------- 1 | #include "fake_quantize.h" 2 | 3 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") 4 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") 5 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 6 | 7 | std::vector fake_quantize(Tensor a, int bit_width,int type,int c,int aciq){ 8 | CHECK_INPUT(a); 9 | return fake_quantize_cuda(a, bit_width,type,c,aciq); 10 | } 11 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){ 12 | m.def("fake_quantize", &fake_quantize, "NCNN Fake Quantization (CUDA)"); 13 | } 14 | 15 | 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /src/fake_quantize.cu: 
-------------------------------------------------------------------------------- 1 | #include "fake_quantize.h" 2 | 3 | 4 | __global__ void max_reduce(float* __restrict__ data,float* out_ptr,int width,int lg_n) //preset data[i] >=0 5 | { 6 | __shared__ float* middleware[blockSize]; 7 | const float min_positive_float = 1e-6; 8 | int row = blockIdx.x * width + threadIdx.x; 9 | int bid = blockIdx.x; 10 | int tid = threadIdx.x; 11 | int tid_tmp = threadIdx.x; 12 | 13 | //if(tid*(middleware[tid])) middleware[tid] = &(data[Row]); 21 | if(fabs(data[Row])>fabs(*(middleware[tid]))) middleware[tid] = data+row; 22 | row+=blockSize; 23 | tid_tmp +=blockSize; 24 | } 25 | __syncthreads(); 26 | 27 | //for(int i=blockSize/2; i>0; i/=2) 28 | for(int i=lg_n/2; i>0; i/=2) 29 | { 30 | if(tidfabs(*(middleware[tid]))) middleware[tid]=middleware[tid+i]; 33 | } 34 | __syncthreads(); 35 | } 36 | 37 | if(tid==0) out_ptr[bid] = fabs(*(middleware[0])); 38 | } 39 | __global__ void fake_quantize_layer_google(float* __restrict__ a, 40 | float* o, 41 | float* o1, 42 | float* mov_max, 43 | int size, 44 | int bit_width, 45 | float* max_entry) 46 | { 47 | int index = blockIdx.x * blockDim.x + threadIdx.x; 48 | if (index < size) 49 | { 50 | const float momenta = 0.95; 51 | float mov_max_tmp = mov_max[0]; 52 | if(mov_max_tmp<1e-6) mov_max_tmp=fabs(*max_entry); //movMax dafault 0 ,now first step set it a non zero data 53 | else mov_max_tmp= mov_max_tmp * momenta + fabs(*max_entry) * (1.-momenta); // #GOOGLE QAT : movMax = movMax*momenta + max(abs(tensor))*(1-momenta) momenta = 0.95 54 | float data_scale = __powf(2.,bit_width-1.)-1; 55 | 56 | float scale; 57 | if(mov_max_tmp < 1e-6) scale = __fdividef(data_scale,1e-6); 58 | else scale = __fdividef(data_scale,mov_max_tmp); 59 | 60 | int o_int = round(a[index]*scale); 61 | //o[index] = __fdividef(round(a[index]*scale),scale); 62 | if(o_int>data_scale) o_int=(int)data_scale; 63 | else if(o_int<-data_scale) o_int=(int)(-data_scale); 64 | else {}; 65 | o[index] = __fdividef(o_int*1.,scale); 66 | 67 | if(index==0) 68 | { 69 | o1[0] = scale; 70 | mov_max[0] = mov_max_tmp; 71 | } 72 | } 73 | } 74 | 75 | 76 | __global__ void fake_quantize_layer_aciq(float* __restrict__ a, 77 | float* o, 78 | float* o1, 79 | float* mov_max, 80 | int feature_pixl_num, 81 | int size, 82 | int bit_width, 83 | float* max_entry) 84 | { 85 | int index = blockIdx.x * blockDim.x + threadIdx.x; 86 | if (index < size) 87 | { 88 | const float momenta = 0.95; 89 | float mov_max_tmp = mov_max[0]; 90 | if(mov_max_tmp<1e-6) mov_max_tmp=fabs(*max_entry); //movMax dafault 0 ,now first step set it a non zero data 91 | else mov_max_tmp= fabs(*max_entry);//mov_max_tmp * momenta + fabs(*max_entry) * (1.-momenta); // #GOOGLE QAT : movMax = movMax*momenta + max(abs(tensor))*(1-momenta) momenta = 0.95 92 | float data_scale = __powf(2.,bit_width-1.)-1; 93 | 94 | const float alpha_gaussian[8] = {0, 1.71063519, 2.15159277, 2.55913646, 2.93620062, 3.28691474, 3.6151146, 3.92403714}; 95 | const double gaussian_const = (0.5 * 0.35) * (1 + sqrt(3.14159265358979323846 * __logf(4.))); 96 | double std = (mov_max_tmp * 2 * gaussian_const) / sqrt(2 * __logf(feature_pixl_num)); 97 | float threshold = (float)(alpha_gaussian[bit_width - 1] * std); 98 | 99 | float scale; 100 | if(threshold < 1e-6) scale = __fdividef(data_scale,1e-6); 101 | else scale = __fdividef(data_scale,threshold); 102 | //float o_index = __fdividef(round(a[index]*scale),scale); 103 | int o_int = round(a[index]*scale); 104 | //o[index] = 
__fdividef(round(a[index]*scale),scale); 105 | if(o_int>data_scale) o_int=(int)data_scale; 106 | else if(o_int<-data_scale) o_int=(int)(-data_scale); 107 | else {}; 108 | o[index] = __fdividef(o_int*1.,scale); 109 | 110 | if(index==0) 111 | { 112 | o1[0] = scale; 113 | mov_max[0] = mov_max_tmp; 114 | } 115 | } 116 | } 117 | 118 | __global__ void fake_quantize_channel_aciq(float* __restrict__ a, 119 | float* o, 120 | float* o1, 121 | int size, 122 | int bit_width, 123 | float* max_entry_arr, //max_entry_arr already>0 124 | int channel_num) 125 | { 126 | int index = blockIdx.x * blockDim.x + threadIdx.x; 127 | if (index < size) 128 | { 129 | int channel = index/channel_num; 130 | float* max_entry = max_entry_arr+channel; 131 | float data_scale = __powf(2.,bit_width-1.)-1; 132 | if((*max_entry) < 1e-6) 133 | { 134 | //if(index%channel_num==0) o1[channel] = scale; 135 | *max_entry = 1e-6; 136 | //return; 137 | } 138 | const float alpha_gaussian[8] = {0, 1.71063519, 2.15159277, 2.55913646, 2.93620062, 3.28691474, 3.6151146, 3.92403714}; 139 | const double gaussian_const = (0.5 * 0.35) * (1 + sqrt(3.14159265358979323846 * __logf(4.))); 140 | double std = ((*max_entry) * 2 * gaussian_const) / sqrt(2 * __logf(channel_num)); 141 | float threshold = (float)(alpha_gaussian[bit_width - 1] * std); 142 | 143 | float scale = __fdividef(data_scale,threshold); 144 | int o_int = round(a[index]*scale); 145 | if(o_int>data_scale) o_int=(int)data_scale; 146 | else if(o_int<-data_scale) o_int=(int)(-data_scale); 147 | else {}; 148 | o[index] = __fdividef(o_int*1.,scale); 149 | if(index%channel_num==0) o1[channel] = scale; 150 | } 151 | } 152 | __global__ void fake_quantize_channel_cuda(float* __restrict__ a, 153 | float* o, 154 | float* o1, 155 | int size, 156 | int bit_width, 157 | float* max_entry_arr, //max_entry_arr already>0 158 | int channel_num) 159 | { 160 | int index = blockIdx.x * blockDim.x + threadIdx.x; 161 | if (index < size) 162 | { 163 | int channel = index/channel_num; 164 | float* max_entry = max_entry_arr+channel; 165 | float data_scale = __powf(2.,bit_width-1.)-1; 166 | if((*max_entry) < 1e-6) 167 | { 168 | //if(index%channel_num==0) o1[channel] = scale; 169 | *max_entry = 1e-6; 170 | //return; 171 | } 172 | float scale = __fdividef(data_scale,*max_entry); 173 | o[index] = __fdividef(round(a[index]*scale),scale); 174 | if(index%channel_num==0) o1[channel] = scale; 175 | } 176 | } 177 | std::vector fake_quantize_activate_cuda(Tensor a, int bit_width ,int aciq) 178 | { 179 | auto o = at::zeros_like(a); //q out 180 | auto o1 = at::zeros({1}, a.options()); //scale 181 | auto mov_max = at::zeros({1}, a.options()); //max of tensor #GOOGLE QAT movMax = movMax*momenta + max(abs(tensor))*(1-momenta) momenta = 0.95 182 | int64_t size = a.numel(); 183 | 184 | int batch_size = a.size(0);//batchsize 185 | int feature_pixl_num = size/batch_size; 186 | 187 | Tensor max_entry = at::max(at::abs(a)); 188 | int blockNums = (size + blockSize - 1) / blockSize; 189 | 190 | if(aciq==0) //movmax 191 | { 192 | //printf("layer_max...."); 193 | fake_quantize_layer_google<<>>(a.data_ptr(), 194 | o.data_ptr(), 195 | o1.data_ptr(), 196 | mov_max.data_ptr(), 197 | size, 198 | bit_width, 199 | max_entry.data_ptr()); 200 | } 201 | else // aciq 202 | { 203 | //printf("layer_aciq...."); 204 | fake_quantize_layer_aciq<<>>(a.data_ptr(), 205 | o.data_ptr(), 206 | o1.data_ptr(), 207 | mov_max.data_ptr(), 208 | feature_pixl_num, 209 | size, 210 | bit_width, 211 | max_entry.data_ptr()); 212 | } 213 | return {o,o1,mov_max}; 214 | } 
215 | 216 | 217 | std::vector fake_quantize_weight_cuda(Tensor a, int bit_width,int c ,int aciq) 218 | { 219 | auto o = at::zeros_like(a); //q out 220 | auto o1 = at::zeros({c}, a.options()); //scale 221 | int64_t size = a.numel(); 222 | 223 | int blockNums = (size + blockSize - 1) / blockSize; 224 | int channel_num = size/c; 225 | auto max_entry_arr = at::zeros({c}, a.options()); 226 | 227 | int lg_n = ceil(log2(channel_num*1.)); //2^x - channel_num >0 228 | lg_n = pow(2,lg_n); //2^x 229 | if(lg_n>blockSize) lg_n=blockSize; // 230 | 231 | max_reduce <<>> (a.data_ptr(), 232 | max_entry_arr.data_ptr(), 233 | channel_num, 234 | lg_n); //c block , each block get a max value 235 | 236 | if(aciq==0) 237 | { 238 | //printf("weight_max...."); 239 | fake_quantize_channel_cuda<<>>(a.data_ptr(), 240 | o.data_ptr(), 241 | o1.data_ptr(), 242 | size, 243 | bit_width, 244 | max_entry_arr.data_ptr(), //max_entry_arr already>0 245 | channel_num); 246 | } 247 | else 248 | { 249 | //printf("weight_aciq...."); 250 | fake_quantize_channel_aciq<<>>(a.data_ptr(), 251 | o.data_ptr(), 252 | o1.data_ptr(), 253 | size, 254 | bit_width, 255 | max_entry_arr.data_ptr(), //max_entry_arr already>0 256 | channel_num); 257 | 258 | } 259 | return {o,o1}; 260 | } 261 | 262 | 263 | std::vector fake_quantize_cuda(Tensor a, int bit_width,int type,int c,int aciq) 264 | { 265 | /* 266 | https://arxiv.org/pdf/1806.08342.pdf 2.5 267 | For weights,we use the actual minimum and maximum values to determine the quantizer parameters. 268 | For activations, we use the moving average of the minimum and maximum values across batches to determine the quantizer parameters. 269 | float 6 7 ,double 15 16 270 | */ 271 | if(type==0) return fake_quantize_activate_cuda(a,bit_width,aciq); //type==0 per layer 272 | else return fake_quantize_weight_cuda(a,bit_width,c,aciq); //type==1 perchannel 273 | } 274 | 275 | 276 | 277 | -------------------------------------------------------------------------------- /src/fake_quantize.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | const int blockSize = 1024; 13 | //#define blockSize 1024 14 | 15 | using namespace at; 16 | 17 | 18 | 19 | std::vector fake_quantize_cuda(Tensor a, int bit_width=8,int type=1,int c=1,int aciq=0); 20 | 21 | std::vector fake_quantize_activate_cuda(Tensor a, int bit_width ,int aciq); 22 | std::vector fake_quantize_weight_cuda(Tensor a, int bit_width,int c,int aciq); 23 | 24 | 25 | __global__ void max_reduce(float* __restrict__ data,float* out_ptr,int width,int lg_n); 26 | 27 | 28 | 29 | 30 | __global__ void fake_quantize_layer_google(float* __restrict__ a, 31 | float* o, 32 | float* o1, 33 | float* mov_max, 34 | int size, 35 | int bit_width, 36 | float* max_entry); 37 | __global__ void fake_quantize_layer_aciq(float* __restrict__ a, 38 | float* o, 39 | float* o1, 40 | float* mov_max, 41 | int feature_pixl_num, 42 | int size, 43 | int bit_width, 44 | float* max_entry); 45 | 46 | __global__ void fake_quantize_channel_cuda(float* __restrict__ a, 47 | float* o, 48 | float* o1, 49 | int size, 50 | int bit_width, 51 | float* max_entry_arr, 52 | int channel_num); 53 | __global__ void fake_quantize_channel_aciq(float* __restrict__ a, 54 | float* o, 55 | float* o1, 56 | int size, 57 | int bit_width, 58 | float* max_entry_arr, 59 | int channel_num); 60 | 61 | 
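For readability, here is a rough pure-PyTorch sketch of the symmetric, per-output-channel, max-based fake quantization that `fake_quantize_channel_cuda` above implements (the ACIQ clipping and the moving-max activation paths are omitted). It is a reference illustration only, not part of the package:

```python
import torch

def fake_quantize_per_channel(weight: torch.Tensor, bit_width: int = 8):
    """Quantize-dequantize a weight tensor with one scale per output channel.

    scale = (2**(bit_width-1) - 1) / max(|w|) per channel, mirroring the
    max-based path of fake_quantize_channel_cuda; returns (dequantized, scale).
    """
    qmax = 2.0 ** (bit_width - 1) - 1.0                      # 127 for 8-bit, 31 for 6-bit
    flat = weight.reshape(weight.shape[0], -1)               # [out_channels, rest]
    max_abs = flat.abs().max(dim=1).values.clamp_min(1e-6)   # avoid division by zero
    scale = qmax / max_abs                                   # one scale per output channel
    q = torch.round(flat * scale.unsqueeze(1))               # quantize
    dq = (q / scale.unsqueeze(1)).reshape_as(weight)         # dequantize back to float
    return dq, scale
```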
-------------------------------------------------------------------------------- /tests/ssd300/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | from argparse import ArgumentParser 5 | import math 6 | import numpy as np 7 | import time 8 | import torch 9 | from torch.optim.lr_scheduler import MultiStepLR 10 | import torch.utils.data.distributed 11 | from torchsummary import summary 12 | 13 | from ncnnqat import merge_freeze_bn, register_quantization_hook,save_table 14 | 15 | from src.model import model, Loss 16 | from src.utils import dboxes300_coco, Encoder 17 | 18 | from src.evaluate import evaluate 19 | from src.train import train_loop, tencent_trick 20 | from src.data import * 21 | 22 | 23 | #os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3,4,5' 24 | os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3' 25 | #os.environ['CUDA_VISIBLE_DEVICES'] = '0,1' 26 | 27 | 28 | 29 | class Logger: 30 | def __init__(self, batch_size, local_rank, n_gpu, print_freq=20): 31 | self.batch_size = batch_size 32 | self.local_rank = local_rank 33 | self.n_gpu = n_gpu 34 | self.print_freq = print_freq 35 | 36 | self.processed_samples = 0 37 | self.epochs_times = [] 38 | self.epochs_speeds = [] 39 | 40 | 41 | def update_iter(self, epoch, iteration, loss): 42 | if self.local_rank != 0: 43 | return 44 | 45 | if iteration % self.print_freq == 0: 46 | print('Epoch: {:2d}, Iteration: {}, Loss: {}'.format(epoch, iteration, loss)) 47 | 48 | self.processed_samples = self.processed_samples + self.batch_size 49 | 50 | def start_epoch(self): 51 | self.epoch_start = time.time() 52 | 53 | def end_epoch(self): 54 | epoch_time = time.time() - self.epoch_start 55 | epoch_speed = self.processed_samples / epoch_time 56 | 57 | self.epochs_times.append(epoch_time) 58 | self.epochs_speeds.append(epoch_speed) 59 | self.processed_samples = 0 60 | 61 | if self.local_rank == 0: 62 | print('Epoch {:2d} finished. 
Time: {:4f} s, Speed: {:4f} img/sec, Average speed: {:4f}' 63 | .format(len(self.epochs_times)-1, epoch_time, epoch_speed * self.n_gpu, self.average_speed() * self.n_gpu)) 64 | 65 | def average_speed(self): 66 | return sum(self.epochs_speeds) / len(self.epochs_speeds) 67 | 68 | 69 | def make_parser(): 70 | epoch_all = 65 71 | epoch_qat = epoch_all-5 if epoch_all-5>0 else epoch_all 72 | 73 | eval_list = [0,epoch_all-1] if epoch_all-1>0 else [0] 74 | 75 | parser = ArgumentParser( 76 | description="Train Single Shot MultiBox Detector on COCO") 77 | parser.add_argument( 78 | '--data', '-d', type=str, default='./data/coco', required=False, 79 | help='path to test and training data files') 80 | parser.add_argument( 81 | '--epochs', '-e', type=int, default=epoch_all, #65 82 | help='number of epochs for training') 83 | parser.add_argument( 84 | '--qat-epoch', '-q', type=int, default=epoch_qat, 85 | help='epoch of qat begaining') 86 | parser.add_argument( 87 | '--batch-size', '--bs', type=int, default=32, 88 | help='number of examples for each iteration') 89 | parser.add_argument( 90 | '--eval-batch-size', '--ebs', type=int, default=32, 91 | help='number of examples for each evaluation iteration') 92 | parser.add_argument( 93 | '--seed', '-s', type=int, default=0, 94 | help='manually set random seed for torch') 95 | parser.add_argument( 96 | '--evaluation', nargs='*', type=int, 97 | default=eval_list,#[0, 48, 53, 59,63, 64,65], 98 | help='epochs at which to evaluate') 99 | parser.add_argument( 100 | '--multistep', nargs='*', type=int, default=[43, 54], 101 | help='epochs at which to decay learning rate') 102 | parser.add_argument( 103 | '--target', type=float, default=None, 104 | help='target mAP to assert against at the end') 105 | 106 | #save model 107 | parser.add_argument('--check-save', '--s', type=bool, default=True) 108 | parser.add_argument( 109 | '--check-point', '-c', type=str, default='./models', required=False, 110 | help='path to model save files') 111 | parser.add_argument('--onnx_save', action='store_true') 112 | 113 | # Hyperparameters 114 | parser.add_argument( 115 | '--learning-rate', '--lr', type=float, default=2.6e-3, help='learning rate') 116 | parser.add_argument( 117 | '--momentum', '-m', type=float, default=0.9, 118 | help='momentum argument for SGD optimizer') 119 | parser.add_argument( 120 | '--weight-decay', '--wd', type=float, default=0.0005, 121 | help='momentum argument for SGD optimizer') 122 | parser.add_argument('--warmup', type=int, default=None) 123 | parser.add_argument( 124 | '--backbone', type=str, default='resnet18', 125 | choices=['resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152']) 126 | parser.add_argument('--num-workers', type=int, default=4) 127 | parser.add_argument('--fp16-mode', type=str, default='static', choices=['off', 'static', 'amp'], 128 | help='Half precission mode to use') 129 | 130 | # Distributed 131 | parser.add_argument('--local_rank', default=0, type=int, 132 | help='Used for multi-process training. 
Can either be manually set ' + 133 | 'or automatically set by using \'python -m multiproc\'.') 134 | 135 | # Pipeline control 136 | parser.add_argument( 137 | '--data_pipeline', type=str, default='dali', choices=['dali', 'no_dali'], 138 | help='data preprocessing pipline to use') 139 | 140 | return parser 141 | 142 | 143 | def train(args): 144 | 145 | args.distributed = False 146 | if 'WORLD_SIZE' in os.environ: 147 | print('WORLD_SIZE in os.environ',os.environ['WORLD_SIZE'],args.local_rank) 148 | args.distributed = int(os.environ['WORLD_SIZE']) > 1 149 | print(args.distributed) 150 | if args.distributed: 151 | torch.cuda.set_device(args.local_rank) 152 | torch.distributed.init_process_group(backend='nccl', init_method='env://') 153 | args.N_gpu = torch.distributed.get_world_size() 154 | else: 155 | args.N_gpu = 1 156 | 157 | dboxes = dboxes300_coco() 158 | encoder = Encoder(dboxes) 159 | cocoGt = get_coco_ground_truth(args) 160 | 161 | ssd300 = model(args) 162 | 163 | loss_func = Loss(dboxes) 164 | loss_func.cuda() 165 | 166 | 167 | args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32) 168 | iteration = 0 169 | 170 | optimizer = torch.optim.SGD( 171 | tencent_trick(ssd300), 172 | lr=args.learning_rate, 173 | momentum=args.momentum, 174 | weight_decay=args.weight_decay) 175 | 176 | scheduler = MultiStepLR( 177 | optimizer=optimizer, 178 | milestones=args.multistep, 179 | gamma=0.1) 180 | 181 | 182 | 183 | val_dataloader, inv_map = get_val_dataloader(args) 184 | train_loader = get_train_loader(args, dboxes) 185 | 186 | #print(inv_map) 187 | #print(val_dataset.label_info) 188 | 189 | acc = 0 190 | acc_best = 0 191 | epoch_check = 0 192 | logger = Logger(args.batch_size, args.local_rank, args.N_gpu) 193 | 194 | for epoch in range(epoch_check, args.epochs): 195 | logger.start_epoch() 196 | #scheduler.step() 197 | #print(ssd300) 198 | '''qat''' 199 | if epoch==args.qat_epoch: 200 | register_quantization_hook(ssd300) 201 | ssd300 = merge_freeze_bn(ssd300) 202 | print("qat hook...") 203 | if epoch>args.qat_epoch: 204 | ssd300 = merge_freeze_bn(ssd300) 205 | print("merge bn ...") 206 | '''qat''' 207 | 208 | iteration = train_loop( 209 | ssd300, loss_func, epoch, optimizer, 210 | train_loader, iteration, logger, args) 211 | scheduler.step() 212 | logger.end_epoch() 213 | 214 | if epoch in args.evaluation: 215 | acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args) 216 | if args.local_rank == 0: 217 | print('Epoch {:2d}, Accuracy: {:4f} mAP'.format(epoch, acc)) 218 | 219 | 220 | if acc>=acc_best and args.local_rank == 0: 221 | acc_best = acc 222 | 223 | if args.distributed: 224 | model_dict = ssd300.module.state_dict() 225 | else: 226 | model_dict = ssd300.state_dict() 227 | torch.save({ 228 | 'epoch': epoch+1, 229 | 'model_state_dict': model_dict, 230 | 'optimizer_state_dict': optimizer.state_dict(), 231 | 'val_acc':acc_best, 232 | "inv_map":inv_map, 233 | "scheduler":scheduler.state_dict(), 234 | }, args.checkpoint) 235 | 236 | if args.data_pipeline == 'dali': 237 | train_loader.reset() 238 | 239 | return acc, logger.average_speed() 240 | 241 | 242 | if __name__ == "__main__": 243 | parser = make_parser() 244 | args = parser.parse_args() 245 | 246 | 247 | if args.onnx_save: #after train ,load model , save onnx model and ncnn table 248 | #python main.py --onnx_save 249 | onnx_path = os.path.join(args.check_point,"model.onnx") 250 | table_path = os.path.join(args.check_point,"model.table") 251 | #print(onnx_path) 252 | checkpoint = 
os.path.join(args.check_point,"model.pt") 253 | #print(checkpoint) 254 | ssd300 = model(args,onnx_save=args.onnx_save) 255 | summary(ssd300, input_size=(3, 300, 300), device='cpu') 256 | ssd300.cuda() 257 | '''qat''' 258 | register_quantization_hook(ssd300) 259 | ssd300 = merge_freeze_bn(ssd300) 260 | '''qat''' 261 | if os.path.exists(checkpoint): 262 | print("loadmodel from checkpoint...") 263 | checkpoint_load = torch.load(checkpoint,map_location='cpu') 264 | #ssd300.module.load_state_dict(checkpoint_load['model_state_dict']) #donot know ssd300 is distributed 265 | ssd300.load_state_dict({k.replace('module.',''):v for k,v in checkpoint_load['model_state_dict'].items()}) 266 | print("loadmodel from checkpoint end...") 267 | ssd300.eval() 268 | input_names = [ "input" ] 269 | #output_names = [ "SSD300-184" ] 270 | output_names = [ "Conv2d-93" ] 271 | dummy_input = torch.ones([1, 3, 300, 300]).cuda() 272 | #dummy_input = torch.randn(1, 3, 300, 300, device='cuda') 273 | torch.onnx.export(ssd300, dummy_input, onnx_path, verbose=False, input_names=input_names, output_names=output_names) 274 | save_table(ssd300,onnx_path=onnx_path,table=table_path) 275 | else: 276 | args.checkpoint = os.path.join(args.check_point,"model.pt") 277 | if args.local_rank == 0: 278 | os.makedirs(args.check_point, exist_ok=True) 279 | 280 | torch.backends.cudnn.benchmark = True 281 | 282 | if args.fp16_mode != 'off': 283 | args.fp16 = True 284 | else: 285 | args.fp16 = False 286 | #print(args) 287 | start_time = time.time() 288 | acc, avg_speed = train(args) 289 | # avg_speed is reported per node, adjust for the global speed 290 | try: 291 | num_shards = torch.distributed.get_world_size() 292 | except RuntimeError: 293 | num_shards = 1 294 | avg_speed = num_shards * avg_speed 295 | training_time = time.time() - start_time 296 | 297 | if args.local_rank == 0: 298 | print("Training end: Average speed: {:3f} img/sec, Total time: {:3f} sec, Final accuracy: {:3f} mAP" 299 | .format(avg_speed, training_time, acc)) 300 | 301 | if args.target is not None: 302 | if args.target > acc: 303 | print('Target mAP of {} not met. 
Possible regression'.format(args.target)) 304 | sys.exit(1) 305 | ''' 306 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.253 307 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.429 308 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.262 309 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.075 310 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.273 311 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.397 312 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.240 313 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.349 314 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.367 315 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.122 316 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.406 317 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.551 318 | 214img/sec 319 | 320 | dali 218img/sec 321 | 322 | 323 | warmup 200 + dali + fp16 324 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.256 325 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.434 326 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.263 327 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.078 328 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.274 329 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.408 330 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.240 331 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.352 332 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.368 333 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.126 334 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.403 335 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.563 336 | Current AP: 0.25628 337 | Epoch 64, Accuracy: 0.256285 mAP 338 | DONE (t=9.45s). 
339 | Training end: Average speed: 232.580538 img/sec, Total time: 35018.003625 sec, Final accuracy: 0.256285 mAP 340 | 341 | 342 | 343 | not qat 344 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.193 345 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.344 346 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.191 347 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.042 348 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.195 349 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.328 350 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.199 351 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.293 352 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.309 353 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.084 354 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.326 355 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.501 356 | Current AP: 0.19269 357 | 358 | qat resnet18 359 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.192 360 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.342 361 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.194 362 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.041 363 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.194 364 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.327 365 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.197 366 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.291 367 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.307 368 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.082 369 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.325 370 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.497 371 | Current AP: 0.19202 372 | ''' -------------------------------------------------------------------------------- /tests/ssd300/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenShisen/ncnnqat/253a413264507cf90089d1aa0e30c0ef30087cfe/tests/ssd300/src/__init__.py -------------------------------------------------------------------------------- /tests/ssd300/src/coco.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | __version__ = '2.0' 3 | # Interface for accessing the Microsoft COCO dataset. 4 | 5 | # Microsoft COCO is a large image dataset designed for object detection, 6 | # segmentation, and caption generation. pycocotools is a Python API that 7 | # assists in loading, parsing and visualizing the annotations in COCO. 8 | # Please visit http://mscoco.org/ for more information on COCO, including 9 | # for the data, paper, and tutorials. The exact format of the annotations 10 | # is also described on the COCO website. For example usage of the pycocotools 11 | # please see pycocotools_demo.ipynb. In addition to this API, please download both 12 | # the COCO images and annotations in order to run the demo. 
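# A minimal usage sketch (illustrative only; the annotation path below is a
# placeholder, not a file shipped with this repository):
#   coco = COCO('annotations/instances_val2017.json')
#   cat_ids = coco.getCatIds(catNms=['person'])
#   img_ids = coco.getImgIds(catIds=cat_ids)
#   anns = coco.loadAnns(coco.getAnnIds(imgIds=img_ids[:1]))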
13 | 14 | # An alternative to using the API is to load the annotations directly 15 | # into Python dictionary 16 | # Using the API provides additional utility functions. Note that this API 17 | # supports both *instance* and *caption* annotations. In the case of 18 | # captions not all functions are defined (e.g. categories are undefined). 19 | 20 | # The following API functions are defined: 21 | # COCO - COCO api class that loads COCO annotation file and prepare data structures. 22 | # decodeMask - Decode binary mask M encoded via run-length encoding. 23 | # encodeMask - Encode binary mask M using run-length encoding. 24 | # getAnnIds - Get ann ids that satisfy given filter conditions. 25 | # getCatIds - Get cat ids that satisfy given filter conditions. 26 | # getImgIds - Get img ids that satisfy given filter conditions. 27 | # loadAnns - Load anns with the specified ids. 28 | # loadCats - Load cats with the specified ids. 29 | # loadImgs - Load imgs with the specified ids. 30 | # annToMask - Convert segmentation in an annotation to binary mask. 31 | # showAnns - Display the specified annotations. 32 | # loadRes - Load algorithm results and create API for accessing them. 33 | # download - Download COCO images from mscoco.org server. 34 | # Throughout the API "ann"=annotation, "cat"=category, and "img"=image. 35 | # Help on each functions can be accessed by: "help COCO>function". 36 | 37 | # See also COCO>decodeMask, 38 | # COCO>encodeMask, COCO>getAnnIds, COCO>getCatIds, 39 | # COCO>getImgIds, COCO>loadAnns, COCO>loadCats, 40 | # COCO>loadImgs, COCO>annToMask, COCO>showAnns 41 | 42 | # Microsoft COCO Toolbox. version 2.0 43 | # Data, paper, and tutorials available at: http://mscoco.org/ 44 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2014. 45 | # Licensed under the Simplified BSD License [see bsd.txt] 46 | 47 | import json 48 | import time 49 | import matplotlib.pyplot as plt 50 | from matplotlib.collections import PatchCollection 51 | from matplotlib.patches import Polygon 52 | import numpy as np 53 | import copy 54 | import itertools 55 | from pycocotools import mask as maskUtils 56 | import os 57 | from collections import defaultdict 58 | import sys 59 | PYTHON_VERSION = sys.version_info[0] 60 | if PYTHON_VERSION == 2: 61 | from urllib import urlretrieve 62 | elif PYTHON_VERSION == 3: 63 | from urllib.request import urlretrieve 64 | 65 | 66 | def _isArrayLike(obj): 67 | return hasattr(obj, '__iter__') and hasattr(obj, '__len__') 68 | 69 | 70 | class COCO: 71 | def __init__(self, annotation_file=None): 72 | """ 73 | Constructor of Microsoft COCO helper class for reading and visualizing annotations. 74 | :param annotation_file (str): location of annotation file 75 | :param image_folder (str): location to the folder that hosts images. 
76 | :return: 77 | """ 78 | # load dataset 79 | self.dataset,self.anns,self.cats,self.imgs = dict(),dict(),dict(),dict() 80 | self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list) 81 | if not annotation_file == None: 82 | print('loading annotations into memory...') 83 | tic = time.time() 84 | dataset = json.load(open(annotation_file, 'r')) 85 | assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset)) 86 | print('Done (t={:0.2f}s)'.format(time.time()- tic)) 87 | self.dataset = dataset 88 | self.createIndex() 89 | 90 | def createIndex(self): 91 | # create index 92 | print('creating index...') 93 | anns, cats, imgs = {}, {}, {} 94 | imgToAnns,catToImgs = defaultdict(list),defaultdict(list) 95 | if 'annotations' in self.dataset: 96 | for ann in self.dataset['annotations']: 97 | imgToAnns[ann['image_id']].append(ann) 98 | anns[ann['id']] = ann 99 | 100 | if 'images' in self.dataset: 101 | for img in self.dataset['images']: 102 | imgs[img['id']] = img 103 | 104 | if 'categories' in self.dataset: 105 | for cat in self.dataset['categories']: 106 | cats[cat['id']] = cat 107 | 108 | if 'annotations' in self.dataset and 'categories' in self.dataset: 109 | for ann in self.dataset['annotations']: 110 | catToImgs[ann['category_id']].append(ann['image_id']) 111 | 112 | print('index created!') 113 | 114 | # create class members 115 | self.anns = anns 116 | self.imgToAnns = imgToAnns 117 | self.catToImgs = catToImgs 118 | self.imgs = imgs 119 | self.cats = cats 120 | 121 | def info(self): 122 | """ 123 | Print information about the annotation file. 124 | :return: 125 | """ 126 | for key, value in self.dataset['info'].items(): 127 | print('{}: {}'.format(key, value)) 128 | 129 | def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None): 130 | """ 131 | Get ann ids that satisfy given filter conditions. default skips that filter 132 | :param imgIds (int array) : get anns for given imgs 133 | catIds (int array) : get anns for given cats 134 | areaRng (float array) : get anns for given area range (e.g. [0 inf]) 135 | iscrowd (boolean) : get anns for given crowd label (False or True) 136 | :return: ids (int array) : integer array of ann ids 137 | """ 138 | imgIds = imgIds if _isArrayLike(imgIds) else [imgIds] 139 | catIds = catIds if _isArrayLike(catIds) else [catIds] 140 | 141 | if len(imgIds) == len(catIds) == len(areaRng) == 0: 142 | anns = self.dataset['annotations'] 143 | else: 144 | if not len(imgIds) == 0: 145 | lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns] 146 | anns = list(itertools.chain.from_iterable(lists)) 147 | else: 148 | anns = self.dataset['annotations'] 149 | anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds] 150 | anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]] 151 | if not iscrowd == None: 152 | ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd] 153 | else: 154 | ids = [ann['id'] for ann in anns] 155 | return ids 156 | 157 | def getCatIds(self, catNms=[], supNms=[], catIds=[]): 158 | """ 159 | filtering parameters. default skips that filter. 
160 | :param catNms (str array) : get cats for given cat names 161 | :param supNms (str array) : get cats for given supercategory names 162 | :param catIds (int array) : get cats for given cat ids 163 | :return: ids (int array) : integer array of cat ids 164 | """ 165 | catNms = catNms if _isArrayLike(catNms) else [catNms] 166 | supNms = supNms if _isArrayLike(supNms) else [supNms] 167 | catIds = catIds if _isArrayLike(catIds) else [catIds] 168 | 169 | if len(catNms) == len(supNms) == len(catIds) == 0: 170 | cats = self.dataset['categories'] 171 | else: 172 | cats = self.dataset['categories'] 173 | cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms] 174 | cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms] 175 | cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds] 176 | ids = [cat['id'] for cat in cats] 177 | return ids 178 | 179 | def getImgIds(self, imgIds=[], catIds=[]): 180 | ''' 181 | Get img ids that satisfy given filter conditions. 182 | :param imgIds (int array) : get imgs for given ids 183 | :param catIds (int array) : get imgs with all given cats 184 | :return: ids (int array) : integer array of img ids 185 | ''' 186 | imgIds = imgIds if _isArrayLike(imgIds) else [imgIds] 187 | catIds = catIds if _isArrayLike(catIds) else [catIds] 188 | 189 | if len(imgIds) == len(catIds) == 0: 190 | ids = self.imgs.keys() 191 | else: 192 | ids = set(imgIds) 193 | for i, catId in enumerate(catIds): 194 | if i == 0 and len(ids) == 0: 195 | ids = set(self.catToImgs[catId]) 196 | else: 197 | ids &= set(self.catToImgs[catId]) 198 | return list(ids) 199 | 200 | def loadAnns(self, ids=[]): 201 | """ 202 | Load anns with the specified ids. 203 | :param ids (int array) : integer ids specifying anns 204 | :return: anns (object array) : loaded ann objects 205 | """ 206 | if _isArrayLike(ids): 207 | return [self.anns[id] for id in ids] 208 | elif type(ids) == int: 209 | return [self.anns[ids]] 210 | 211 | def loadCats(self, ids=[]): 212 | """ 213 | Load cats with the specified ids. 214 | :param ids (int array) : integer ids specifying cats 215 | :return: cats (object array) : loaded cat objects 216 | """ 217 | if _isArrayLike(ids): 218 | return [self.cats[id] for id in ids] 219 | elif type(ids) == int: 220 | return [self.cats[ids]] 221 | 222 | def loadImgs(self, ids=[]): 223 | """ 224 | Load anns with the specified ids. 225 | :param ids (int array) : integer ids specifying img 226 | :return: imgs (object array) : loaded img objects 227 | """ 228 | if _isArrayLike(ids): 229 | return [self.imgs[id] for id in ids] 230 | elif type(ids) == int: 231 | return [self.imgs[ids]] 232 | 233 | def showAnns(self, anns): 234 | """ 235 | Display the specified annotations. 
236 | :param anns (array of object): annotations to display 237 | :return: None 238 | """ 239 | if len(anns) == 0: 240 | return 0 241 | if 'segmentation' in anns[0] or 'keypoints' in anns[0]: 242 | datasetType = 'instances' 243 | elif 'caption' in anns[0]: 244 | datasetType = 'captions' 245 | else: 246 | raise Exception('datasetType not supported') 247 | if datasetType == 'instances': 248 | ax = plt.gca() 249 | ax.set_autoscale_on(False) 250 | polygons = [] 251 | color = [] 252 | for ann in anns: 253 | c = (np.random.random((1, 3))*0.6+0.4).tolist()[0] 254 | if 'segmentation' in ann: 255 | if type(ann['segmentation']) == list: 256 | # polygon 257 | for seg in ann['segmentation']: 258 | poly = np.array(seg).reshape((int(len(seg)/2), 2)) 259 | polygons.append(Polygon(poly)) 260 | color.append(c) 261 | else: 262 | # mask 263 | t = self.imgs[ann['image_id']] 264 | if type(ann['segmentation']['counts']) == list: 265 | rle = maskUtils.frPyObjects([ann['segmentation']], t['height'], t['width']) 266 | else: 267 | rle = [ann['segmentation']] 268 | m = maskUtils.decode(rle) 269 | img = np.ones( (m.shape[0], m.shape[1], 3) ) 270 | if ann['iscrowd'] == 1: 271 | color_mask = np.array([2.0,166.0,101.0])/255 272 | if ann['iscrowd'] == 0: 273 | color_mask = np.random.random((1, 3)).tolist()[0] 274 | for i in range(3): 275 | img[:,:,i] = color_mask[i] 276 | ax.imshow(np.dstack( (img, m*0.5) )) 277 | if 'keypoints' in ann and type(ann['keypoints']) == list: 278 | # turn skeleton into zero-based index 279 | sks = np.array(self.loadCats(ann['category_id'])[0]['skeleton'])-1 280 | kp = np.array(ann['keypoints']) 281 | x = kp[0::3] 282 | y = kp[1::3] 283 | v = kp[2::3] 284 | for sk in sks: 285 | if np.all(v[sk]>0): 286 | plt.plot(x[sk],y[sk], linewidth=3, color=c) 287 | plt.plot(x[v>0], y[v>0],'o',markersize=8, markerfacecolor=c, markeredgecolor='k',markeredgewidth=2) 288 | plt.plot(x[v>1], y[v>1],'o',markersize=8, markerfacecolor=c, markeredgecolor=c, markeredgewidth=2) 289 | p = PatchCollection(polygons, facecolor=color, linewidths=0, alpha=0.4) 290 | ax.add_collection(p) 291 | p = PatchCollection(polygons, facecolor='none', edgecolors=color, linewidths=2) 292 | ax.add_collection(p) 293 | elif datasetType == 'captions': 294 | for ann in anns: 295 | print(ann['caption']) 296 | 297 | def loadRes(self, resFile): 298 | """ 299 | Load result file and return a result api object. 
300 | :param resFile (str) : file name of result file 301 | :return: res (obj) : result api object 302 | """ 303 | res = COCO() 304 | res.dataset['images'] = [img for img in self.dataset['images']] 305 | 306 | print('Loading and preparing results...') 307 | tic = time.time() 308 | if type(resFile) == str: #or type(resFile) == unicode: 309 | anns = json.load(open(resFile)) 310 | elif type(resFile) == np.ndarray: 311 | anns = self.loadNumpyAnnotations(resFile) 312 | else: 313 | anns = resFile 314 | assert type(anns) == list, 'results in not an array of objects' 315 | annsImgIds = [ann['image_id'] for ann in anns] 316 | assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \ 317 | 'Results do not correspond to current coco set' 318 | if 'caption' in anns[0]: 319 | imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns]) 320 | res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds] 321 | for id, ann in enumerate(anns): 322 | ann['id'] = id+1 323 | elif 'bbox' in anns[0] and not anns[0]['bbox'] == []: 324 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 325 | for id, ann in enumerate(anns): 326 | bb = ann['bbox'] 327 | x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]] 328 | if not 'segmentation' in ann: 329 | ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]] 330 | ann['area'] = bb[2]*bb[3] 331 | ann['id'] = id+1 332 | ann['iscrowd'] = 0 333 | elif 'segmentation' in anns[0]: 334 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 335 | for id, ann in enumerate(anns): 336 | # now only support compressed RLE format as segmentation results 337 | ann['area'] = maskUtils.area(ann['segmentation']) 338 | if not 'bbox' in ann: 339 | ann['bbox'] = maskUtils.toBbox(ann['segmentation']) 340 | ann['id'] = id+1 341 | ann['iscrowd'] = 0 342 | elif 'keypoints' in anns[0]: 343 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 344 | for id, ann in enumerate(anns): 345 | s = ann['keypoints'] 346 | x = s[0::3] 347 | y = s[1::3] 348 | x0,x1,y0,y1 = np.min(x), np.max(x), np.min(y), np.max(y) 349 | ann['area'] = (x1-x0)*(y1-y0) 350 | ann['id'] = id + 1 351 | ann['bbox'] = [x0,y0,x1-x0,y1-y0] 352 | print('DONE (t={:0.2f}s)'.format(time.time()- tic)) 353 | 354 | res.dataset['annotations'] = anns 355 | res.createIndex() 356 | return res 357 | 358 | def download(self, tarDir = None, imgIds = [] ): 359 | ''' 360 | Download COCO images from mscoco.org server. 
361 | :param tarDir (str): COCO results directory name 362 | imgIds (list): images to be downloaded 363 | :return: 364 | ''' 365 | if tarDir is None: 366 | print('Please specify target directory') 367 | return -1 368 | if len(imgIds) == 0: 369 | imgs = self.imgs.values() 370 | else: 371 | imgs = self.loadImgs(imgIds) 372 | N = len(imgs) 373 | if not os.path.exists(tarDir): 374 | os.makedirs(tarDir) 375 | for i, img in enumerate(imgs): 376 | tic = time.time() 377 | fname = os.path.join(tarDir, img['file_name']) 378 | if not os.path.exists(fname): 379 | urlretrieve(img['coco_url'], fname) 380 | print('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic)) 381 | 382 | def loadNumpyAnnotations(self, data): 383 | """ 384 | Convert result data from a numpy array [Nx7] where each row contains {imageID,x1,y1,w,h,score,class} 385 | :param data (numpy.ndarray) 386 | :return: annotations (python nested list) 387 | """ 388 | print('Converting ndarray to lists...') 389 | assert(type(data) == np.ndarray) 390 | print(data.shape) 391 | assert(data.shape[1] == 7) 392 | N = data.shape[0] 393 | ann = [] 394 | for i in range(N): 395 | if i % 1000000 == 0: 396 | print('{}/{}'.format(i,N)) 397 | ann += [{ 398 | 'image_id' : int(data[i, 0]), 399 | 'bbox' : [ data[i, 1], data[i, 2], data[i, 3], data[i, 4] ], 400 | 'score' : data[i, 5], 401 | 'category_id': int(data[i, 6]), 402 | }] 403 | return ann 404 | 405 | def annToRLE(self, ann): 406 | """ 407 | Convert annotation which can be polygons, uncompressed RLE to RLE. 408 | :return: binary mask (numpy 2D array) 409 | """ 410 | t = self.imgs[ann['image_id']] 411 | h, w = t['height'], t['width'] 412 | segm = ann['segmentation'] 413 | if type(segm) == list: 414 | # polygon -- a single object might consist of multiple parts 415 | # we merge all parts into one mask rle code 416 | rles = maskUtils.frPyObjects(segm, h, w) 417 | rle = maskUtils.merge(rles) 418 | elif type(segm['counts']) == list: 419 | # uncompressed RLE 420 | rle = maskUtils.frPyObjects(segm, h, w) 421 | else: 422 | # rle 423 | rle = ann['segmentation'] 424 | return rle 425 | 426 | def annToMask(self, ann): 427 | """ 428 | Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask. 429 | :return: binary mask (numpy 2D array) 430 | """ 431 | rle = self.annToRLE(ann) 432 | m = maskUtils.decode(rle) 433 | return m 434 | -------------------------------------------------------------------------------- /tests/ssd300/src/coco_pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
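# Pipeline summary: fn.readers.coco loads images and ltrb boxes (sharded per
# rank), fn.random_bbox_crop applies the SSD-style random crop, the cropped
# image is decoded on GPU, resized to 300x300, color-jittered, randomly
# flipped and normalized, and fn.box_encoder matches the boxes against the
# default anchors before the batch is returned as (images, bboxes, labels).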
14 | 15 | 16 | import torch 17 | from nvidia.dali.pipeline import pipeline_def 18 | import nvidia.dali.types as types 19 | import nvidia.dali.fn as fn 20 | 21 | 22 | @pipeline_def 23 | def create_coco_pipeline(default_boxes, args): 24 | try: 25 | shard_id = torch.distributed.get_rank() 26 | num_shards = torch.distributed.get_world_size() 27 | except RuntimeError: 28 | shard_id = 0 29 | num_shards = 1 30 | 31 | images, bboxes, labels = fn.readers.coco(file_root=args.train_coco_root, 32 | annotations_file=args.train_annotate, 33 | skip_empty=True, 34 | shard_id=shard_id, 35 | num_shards=num_shards, 36 | ratio=True, 37 | ltrb=True, 38 | random_shuffle=False, 39 | shuffle_after_epoch=True, 40 | name="Reader") 41 | 42 | crop_begin, crop_size, bboxes, labels = fn.random_bbox_crop(bboxes, labels, 43 | device="cpu", 44 | aspect_ratio=[0.5, 2.0], 45 | thresholds=[0, 0.1, 0.3, 0.5, 0.7, 0.9], 46 | scaling=[0.3, 1.0], 47 | bbox_layout="xyXY", 48 | allow_no_crop=True, 49 | num_attempts=50) 50 | images = fn.decoders.image_slice(images, crop_begin, crop_size, device="mixed", output_type=types.RGB) 51 | flip_coin = fn.random.coin_flip(probability=0.5) 52 | images = fn.resize(images, 53 | resize_x=300, 54 | resize_y=300, 55 | min_filter=types.DALIInterpType.INTERP_TRIANGULAR) 56 | 57 | #saturation = fn.uniform(range=[0.5, 1.5]) 58 | #contrast = fn.uniform(range=[0.5, 1.5]) 59 | #brightness = fn.uniform(range=[0.875, 1.125]) 60 | #hue = fn.uniform(range=[-0.5, 0.5]) 61 | 62 | saturation = fn.random.uniform(range=[0.5, 1.5]) 63 | contrast = fn.random.uniform(range=[0.5, 1.5]) 64 | brightness = fn.random.uniform(range=[0.875, 1.125]) 65 | hue = fn.random.uniform(range=[-0.5, 0.5]) 66 | 67 | images = fn.hsv(images, dtype=types.FLOAT, hue=hue, saturation=saturation) # use float to avoid clipping and 68 | # quantizing the intermediate result 69 | images = fn.brightness_contrast(images, 70 | contrast_center = 128, # input is in float, but in 0..255 range 71 | dtype = types.UINT8, 72 | brightness = brightness, 73 | contrast = contrast) 74 | 75 | dtype = types.FLOAT16 if args.fp16 else types.FLOAT 76 | 77 | bboxes = fn.bb_flip(bboxes, ltrb=True, horizontal=flip_coin) 78 | images = fn.crop_mirror_normalize(images, 79 | crop=(300, 300), 80 | mean=[0.485 * 255, 0.456 * 255, 0.406 * 255], 81 | std=[0.229 * 255, 0.224 * 255, 0.225 * 255], 82 | mirror=flip_coin, 83 | dtype=dtype, 84 | output_layout="CHW", 85 | pad_output=False) 86 | 87 | bboxes, labels = fn.box_encoder(bboxes, labels, 88 | criteria=0.5, 89 | anchors=default_boxes.as_ltrb_list()) 90 | 91 | labels=labels.gpu() 92 | bboxes=bboxes.gpu() 93 | 94 | return images, bboxes, labels 95 | -------------------------------------------------------------------------------- /tests/ssd300/src/data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | import torch 5 | from torch.utils.data import DataLoader 6 | 7 | from src.utils import dboxes300_coco, COCODetection, SSDTransformer 8 | from src.coco import COCO 9 | from src.coco_pipeline import create_coco_pipeline 10 | from nvidia.dali.plugin.pytorch import DALIGenericIterator, LastBatchPolicy 11 | 12 | 13 | def set_seeds(args): 14 | torch.cuda.set_device(args.local_rank) 15 | device = torch.device('cuda') 16 | 17 | if args.distributed: 18 | args.seed = broadcast_seeds(args.seed, device) 19 | local_seed = (args.seed + torch.distributed.get_rank()) % 2**32 20 | local_rank = torch.distributed.get_rank() 21 | 22 | #local_seed = args.seed % 2**32 23 | 
#local_rank = 0 24 | else: 25 | local_seed = args.seed % 2**32 26 | local_rank = 0 27 | 28 | print("Rank", local_rank, "using seed = {}".format(local_seed)) 29 | 30 | torch.manual_seed(local_seed) 31 | np.random.seed(seed=local_seed) 32 | 33 | return local_seed 34 | 35 | 36 | def broadcast_seeds(seed, device): 37 | if torch.distributed.is_initialized(): 38 | seeds_tensor = torch.LongTensor([seed]).to(device) 39 | torch.distributed.broadcast(seeds_tensor, 0) 40 | seed = seeds_tensor.item() 41 | return seed 42 | 43 | 44 | def get_train_pytorch_loader(args, num_workers, default_boxes): 45 | dataset = COCODetection( 46 | args.train_coco_root, 47 | args.train_annotate, 48 | SSDTransformer(default_boxes, args, (300, 300), val=False)) 49 | 50 | if args.distributed: 51 | train_sampler = torch.utils.data.distributed.DistributedSampler(dataset) 52 | #train_sampler = None 53 | else: 54 | train_sampler = None 55 | 56 | train_dataloader = DataLoader( 57 | dataset, 58 | batch_size=args.batch_size, 59 | shuffle=(train_sampler is None), 60 | sampler=train_sampler, 61 | drop_last=True, 62 | num_workers=num_workers) 63 | 64 | return train_dataloader 65 | 66 | 67 | def get_train_dali_loader(args, default_boxes, local_seed): 68 | train_pipe = create_coco_pipeline( 69 | default_boxes, 70 | args, 71 | batch_size=args.batch_size, 72 | num_threads=args.num_workers, 73 | device_id=args.local_rank, 74 | seed=local_seed) 75 | 76 | train_loader = DALIGenericIterator( 77 | train_pipe, 78 | ["images", "boxes", "labels"], 79 | reader_name="Reader", 80 | last_batch_policy=LastBatchPolicy.FILL) 81 | 82 | return train_loader 83 | 84 | 85 | def get_train_loader(args, dboxes): 86 | args.train_annotate = os.path.join( 87 | args.data, "annotations/instances_train2017.json") 88 | args.train_coco_root = os.path.join(args.data, "train2017") 89 | 90 | local_seed = set_seeds(args) 91 | 92 | if args.data_pipeline == 'no_dali': 93 | return get_train_pytorch_loader(args, args.num_workers, dboxes) 94 | elif args.data_pipeline == 'dali': 95 | return get_train_dali_loader(args, dboxes, local_seed) 96 | 97 | 98 | def get_val_dataset(args): 99 | dboxes = dboxes300_coco() 100 | val_trans = SSDTransformer(dboxes, args,(300, 300), val=True) 101 | 102 | val_annotate = os.path.join(args.data, "annotations/instances_val2017.json") 103 | val_coco_root = os.path.join(args.data, "val2017") 104 | 105 | val_coco = COCODetection(val_coco_root, val_annotate, val_trans) 106 | return val_coco 107 | 108 | 109 | def get_val_dataloader(args): 110 | dataset = get_val_dataset(args) 111 | inv_map = {v: k for k, v in dataset.label_map.items()} 112 | print(dataset.label_info) 113 | 114 | if args.distributed: 115 | val_sampler = torch.utils.data.distributed.DistributedSampler(dataset) 116 | #val_sampler = None 117 | else: 118 | val_sampler = None 119 | 120 | val_dataloader = DataLoader( 121 | dataset, 122 | batch_size=args.eval_batch_size, 123 | shuffle=False, # Note: distributed sampler is shuffled :( 124 | sampler=val_sampler, 125 | num_workers=args.num_workers) 126 | 127 | return val_dataloader, inv_map 128 | 129 | 130 | def get_coco_ground_truth(args): 131 | val_annotate = os.path.join(args.data, "annotations/instances_val2017.json") 132 | cocoGt = COCO(annotation_file=val_annotate) 133 | return cocoGt 134 | -------------------------------------------------------------------------------- /tests/ssd300/src/distributed.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch._utils import 
_flatten_dense_tensors, _unflatten_dense_tensors 3 | import torch.distributed as dist 4 | from torch.nn.modules import Module 5 | 6 | ''' 7 | This version of DistributedDataParallel is designed to be used in conjunction with the multiproc.py 8 | launcher included with this example. It assumes that your run is using multiprocess with 1 9 | GPU/process, that the model is on the correct device, and that torch.set_device has been 10 | used to set the device. 11 | 12 | Parameters are broadcasted to the other processes on initialization of DistributedDataParallel, 13 | and will be allreduced at the finish of the backward pass. 14 | ''' 15 | class DistributedDataParallel(Module): 16 | 17 | def __init__(self, module): 18 | super(DistributedDataParallel, self).__init__() 19 | self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False 20 | 21 | self.module = module 22 | 23 | for p in self.module.state_dict().values(): 24 | if not torch.is_tensor(p): 25 | continue 26 | if dist._backend == dist.dist_backend.NCCL: 27 | assert p.is_cuda, "NCCL backend only supports model parameters to be on GPU." 28 | dist.broadcast(p, 0) 29 | 30 | def allreduce_params(): 31 | if(self.needs_reduction): 32 | self.needs_reduction = False 33 | buckets = {} 34 | for param in self.module.parameters(): 35 | if param.requires_grad and param.grad is not None: 36 | tp = param.data.type() 37 | if tp not in buckets: 38 | buckets[tp] = [] 39 | buckets[tp].append(param) 40 | if self.warn_on_half: 41 | if torch.cuda.HalfTensor in buckets: 42 | print("WARNING: gloo dist backend for half parameters may be extremely slow." + 43 | " It is recommended to use the NCCL backend in this case.") 44 | self.warn_on_half = False 45 | 46 | for tp in buckets: 47 | bucket = buckets[tp] 48 | grads = [param.grad.data for param in bucket] 49 | coalesced = _flatten_dense_tensors(grads) 50 | dist.all_reduce(coalesced) 51 | coalesced /= dist.get_world_size() 52 | for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)): 53 | buf.copy_(synced) 54 | 55 | for param in list(self.module.parameters()): 56 | def allreduce_hook(*unused): 57 | param._execution_engine.queue_callback(allreduce_params) 58 | if param.requires_grad: 59 | param.register_hook(allreduce_hook) 60 | 61 | def forward(self, *inputs, **kwargs): 62 | self.needs_reduction = True 63 | return self.module(*inputs, **kwargs) 64 | 65 | ''' 66 | def _sync_buffers(self): 67 | buffers = list(self.module._all_buffers()) 68 | if len(buffers) > 0: 69 | # cross-node buffer sync 70 | flat_buffers = _flatten_dense_tensors(buffers) 71 | dist.broadcast(flat_buffers, 0) 72 | for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)): 73 | buf.copy_(synced) 74 | def train(self, mode=True): 75 | # Clear NCCL communicator and CUDA event cache of the default group ID, 76 | # These cache will be recreated at the later call. This is currently a 77 | # work-around for a potential NCCL deadlock. 
78 | if dist._backend == dist.dist_backend.NCCL: 79 | dist._clear_group_cache() 80 | super(DistributedDataParallel, self).train(mode) 81 | self.module.train(mode) 82 | ''' 83 | -------------------------------------------------------------------------------- /tests/ssd300/src/evaluate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | import numpy as np 4 | from contextlib import redirect_stdout 5 | import io 6 | 7 | from pycocotools.cocoeval import COCOeval 8 | 9 | 10 | def evaluate(model, coco, cocoGt, encoder, inv_map, args): 11 | if args.distributed: 12 | N_gpu = torch.distributed.get_world_size() 13 | else: 14 | N_gpu = 1 15 | 16 | model.eval() 17 | model.cuda() 18 | 19 | ret = [] 20 | start = time.time() 21 | 22 | # for idx, image_id in enumerate(coco.img_keys): 23 | for nbatch, (img, img_id, img_size, _, _) in enumerate(coco): 24 | print("Parsing batch: {}/{}".format(nbatch, len(coco)), end='\r') 25 | with torch.no_grad(): 26 | inp = img.cuda() 27 | if args.fp16: 28 | #inp = inp.half() 29 | pass 30 | 31 | # Get predictions 32 | ploc, plabel = model(inp) 33 | ploc, plabel = ploc.float(), plabel.float() 34 | 35 | # Handle the batch of predictions produced 36 | # This is slow, but consistent with old implementation. 37 | for idx in range(ploc.shape[0]): 38 | # ease-of-use for specific predictions 39 | ploc_i = ploc[idx, :, :].unsqueeze(0) 40 | plabel_i = plabel[idx, :, :].unsqueeze(0) 41 | 42 | try: 43 | result = encoder.decode_batch(ploc_i, plabel_i, 0.50, 200)[0] 44 | except: 45 | # raise 46 | print("") 47 | print("No object detected in idx: {}".format(idx)) 48 | continue 49 | 50 | htot, wtot = img_size[0][idx].item(), img_size[1][idx].item() 51 | loc, label, prob = [r.cpu().numpy() for r in result] 52 | for loc_, label_, prob_ in zip(loc, label, prob): 53 | ret.append([img_id[idx], loc_[0] * wtot, \ 54 | loc_[1] * htot, 55 | (loc_[2] - loc_[0]) * wtot, 56 | (loc_[3] - loc_[1]) * htot, 57 | prob_, 58 | inv_map[label_]]) 59 | 60 | # Now we have all predictions from this rank, gather them all together 61 | # if necessary 62 | ret = np.array(ret).astype(np.float32) 63 | 64 | # Multi-GPU eval 65 | if args.distributed: 66 | # NCCL backend means we can only operate on GPU tensors 67 | ret_copy = torch.tensor(ret).cuda() 68 | # Everyone exchanges the size of their results 69 | ret_sizes = [torch.tensor(0).cuda() for _ in range(N_gpu)] 70 | 71 | torch.cuda.synchronize() 72 | torch.distributed.all_gather(ret_sizes, torch.tensor(ret_copy.shape[0]).cuda()) 73 | torch.cuda.synchronize() 74 | 75 | # Get the maximum results size, as all tensors must be the same shape for 76 | # the all_gather call we need to make 77 | max_size = 0 78 | sizes = [] 79 | for s in ret_sizes: 80 | max_size = max(max_size, s.item()) 81 | sizes.append(s.item()) 82 | 83 | # Need to pad my output to max_size in order to use in all_gather 84 | ret_pad = torch.cat([ret_copy, torch.zeros(max_size - ret_copy.shape[0], 7, dtype=torch.float32).cuda()]) 85 | 86 | # allocate storage for results from all other processes 87 | other_ret = [torch.zeros(max_size, 7, dtype=torch.float32).cuda() for i in range(N_gpu)] 88 | # Everyone exchanges (padded) results 89 | 90 | torch.cuda.synchronize() 91 | torch.distributed.all_gather(other_ret, ret_pad) 92 | torch.cuda.synchronize() 93 | 94 | # Now need to reconstruct the _actual_ results from the padded set using slices. 
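        # sizes[i] holds how many rows rank i actually produced before padding
        # to max_size, so slicing other_ret[i][:sizes[i]] drops the zero rows
        # again and the concatenation below contains exactly one row per
        # detection gathered from all ranks.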
95 | cat_tensors = [] 96 | for i in range(N_gpu): 97 | cat_tensors.append(other_ret[i][:sizes[i]][:]) 98 | 99 | final_results = torch.cat(cat_tensors).cpu().numpy() 100 | else: 101 | # Otherwise full results are just our results 102 | final_results = ret 103 | 104 | if args.local_rank == 0: 105 | print("") 106 | print("Predicting Ended, total time: {:.2f} s".format(time.time() - start)) 107 | 108 | cocoDt = cocoGt.loadRes(final_results) 109 | 110 | E = COCOeval(cocoGt, cocoDt, iouType='bbox') 111 | E.evaluate() 112 | E.accumulate() 113 | if args.local_rank == 0: 114 | E.summarize() 115 | print("Current AP: {:.5f}".format(E.stats[0])) 116 | else: 117 | # fix for cocoeval indiscriminate prints 118 | with redirect_stdout(io.StringIO()): 119 | E.summarize() 120 | 121 | # put your model in training mode back on 122 | model.train() 123 | 124 | return E.stats[0] # Average Precision (AP) @[ IoU=050:0.95 | area= all | maxDets=100 ] 125 | 126 | -------------------------------------------------------------------------------- /tests/ssd300/src/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchvision.models.resnet import resnet18, resnet34, resnet50, resnet101, resnet152 4 | from torch.nn.parallel import DistributedDataParallel as DDP 5 | 6 | 7 | 8 | class ResNet(nn.Module): 9 | def __init__(self, backbone='resnet50'): 10 | super().__init__() 11 | if backbone == 'resnet18': 12 | backbone = resnet18(pretrained=True) 13 | self.out_channels = [256, 512, 512, 256, 256, 128] 14 | elif backbone == 'resnet34': 15 | backbone = resnet34(pretrained=True) 16 | self.out_channels = [256, 512, 512, 256, 256, 256] 17 | elif backbone == 'resnet50': 18 | backbone = resnet50(pretrained=True) 19 | self.out_channels = [1024, 512, 512, 256, 256, 256] 20 | elif backbone == 'resnet101': 21 | backbone = resnet101(pretrained=True) 22 | self.out_channels = [1024, 512, 512, 256, 256, 256] 23 | else: # backbone == 'resnet152': 24 | backbone = resnet152(pretrained=True) 25 | self.out_channels = [1024, 512, 512, 256, 256, 256] 26 | 27 | 28 | self.feature_extractor = nn.Sequential(*list(backbone.children())[:7]) 29 | 30 | conv4_block1 = self.feature_extractor[-1][0] 31 | 32 | conv4_block1.conv1.stride = (1, 1) 33 | conv4_block1.conv2.stride = (1, 1) 34 | conv4_block1.downsample[0].stride = (1, 1) 35 | 36 | def forward(self, x): 37 | x = self.feature_extractor(x) 38 | return x 39 | 40 | 41 | class SSD300(nn.Module): 42 | def __init__(self, backbone='resnet50'): 43 | super().__init__() 44 | 45 | self.feature_extractor = ResNet(backbone=backbone) 46 | 47 | self.label_num = 81 # number of COCO classes 48 | self._build_additional_features(self.feature_extractor.out_channels) 49 | self.num_defaults = [4, 6, 6, 6, 4, 4] 50 | self.loc = [] 51 | self.conf = [] 52 | 53 | for nd, oc in zip(self.num_defaults, self.feature_extractor.out_channels): 54 | self.loc.append(nn.Conv2d(oc, nd * 4, kernel_size=3, padding=1)) 55 | self.conf.append(nn.Conv2d(oc, nd * self.label_num, kernel_size=3, padding=1)) 56 | 57 | self.loc = nn.ModuleList(self.loc) 58 | self.conf = nn.ModuleList(self.conf) 59 | self._init_weights() 60 | 61 | def _build_additional_features(self, input_size): 62 | self.additional_blocks = [] 63 | for i, (input_size, output_size, channels) in enumerate(zip(input_size[:-1], input_size[1:], [256, 256, 128, 128, 128])): 64 | if i < 3: 65 | layer = nn.Sequential( 66 | nn.Conv2d(input_size, channels, kernel_size=1, bias=False), 67 | 
nn.BatchNorm2d(channels), 68 | nn.ReLU(inplace=True), 69 | nn.Conv2d(channels, output_size, kernel_size=3, padding=1, stride=2, bias=False), 70 | nn.BatchNorm2d(output_size), 71 | nn.ReLU(inplace=True), 72 | ) 73 | else: 74 | layer = nn.Sequential( 75 | nn.Conv2d(input_size, channels, kernel_size=1, bias=False), 76 | nn.BatchNorm2d(channels), 77 | nn.ReLU(inplace=True), 78 | nn.Conv2d(channels, output_size, kernel_size=3, bias=False), 79 | nn.BatchNorm2d(output_size), 80 | nn.ReLU(inplace=True), 81 | ) 82 | 83 | self.additional_blocks.append(layer) 84 | 85 | self.additional_blocks = nn.ModuleList(self.additional_blocks) 86 | 87 | def _init_weights(self): 88 | layers = [*self.additional_blocks, *self.loc, *self.conf] 89 | for layer in layers: 90 | for param in layer.parameters(): 91 | if param.dim() > 1: nn.init.xavier_uniform_(param) 92 | 93 | # Shape the classifier to the view of bboxes 94 | def bbox_view(self, src, loc, conf): 95 | ret = [] 96 | for s, l, c in zip(src, loc, conf): 97 | ret.append((l(s).view(s.size(0), 4, -1), c(s).view(s.size(0), self.label_num, -1))) 98 | 99 | locs, confs = list(zip(*ret)) 100 | locs, confs = torch.cat(locs, 2).contiguous(), torch.cat(confs, 2).contiguous() 101 | return locs, confs 102 | 103 | def forward(self, x): 104 | x = self.feature_extractor(x) 105 | 106 | detection_feed = [x] 107 | for l in self.additional_blocks: 108 | x = l(x) 109 | detection_feed.append(x) 110 | 111 | # Feature Map 38x38x4, 19x19x6, 10x10x6, 5x5x6, 3x3x4, 1x1x4 112 | locs, confs = self.bbox_view(detection_feed, self.loc, self.conf) 113 | 114 | # For SSD 300, shall return nbatch x 8732 x {nlabels, nlocs} results 115 | return locs, confs 116 | 117 | def model(args,onnx_save=False): 118 | ssd300 = SSD300(backbone=args.backbone) 119 | 120 | if onnx_save: 121 | return ssd300 122 | 123 | ssd300.cuda() 124 | 125 | if args.distributed: 126 | ssd300 = DDP(ssd300,device_ids=[args.local_rank],output_device=args.local_rank,find_unused_parameters=True) 127 | 128 | return ssd300 129 | 130 | 131 | class Loss(nn.Module): 132 | """ 133 | Implements the loss as the sum of the followings: 134 | 1. Confidence Loss: All labels, with hard negative mining 135 | 2. 
Localization Loss: Only on positive labels 136 | Suppose input dboxes has the shape 8732x4 137 | """ 138 | def __init__(self, dboxes): 139 | super(Loss, self).__init__() 140 | self.scale_xy = 1.0/dboxes.scale_xy 141 | self.scale_wh = 1.0/dboxes.scale_wh 142 | 143 | #self.sl1_loss = nn.SmoothL1Loss(reduce=False) 144 | #self.sl1_loss = nn.SmoothL1Loss(reduce=None) 145 | self.sl1_loss = nn.SmoothL1Loss(reduction='none',reduce=None) 146 | self.dboxes = nn.Parameter(dboxes(order="xywh").transpose(0, 1).unsqueeze(dim = 0), 147 | requires_grad=False) 148 | # Two factor are from following links 149 | # http://jany.st/post/2017-11-05-single-shot-detector-ssd-from-scratch-in-tensorflow.html 150 | #self.con_loss = nn.CrossEntropyLoss(reduce=False) 151 | self.con_loss = nn.CrossEntropyLoss(reduce=None,reduction='none') 152 | 153 | def _loc_vec(self, loc): 154 | """ 155 | Generate Location Vectors 156 | """ 157 | gxy = self.scale_xy*(loc[:, :2, :] - self.dboxes[:, :2, :])/self.dboxes[:, 2:, ] 158 | gwh = self.scale_wh*(loc[:, 2:, :]/self.dboxes[:, 2:, :]).log() 159 | return torch.cat((gxy, gwh), dim=1).contiguous() 160 | 161 | def forward(self, ploc, plabel, gloc, glabel): 162 | """ 163 | ploc, plabel: Nx4x8732, Nxlabel_numx8732 164 | predicted location and labels 165 | 166 | gloc, glabel: Nx4x8732, Nx8732 167 | ground truth location and labels 168 | """ 169 | mask = glabel > 0 170 | pos_num = mask.sum(dim=1) 171 | 172 | vec_gd = self._loc_vec(gloc) 173 | 174 | # sum on four coordinates, and mask 175 | #print(ploc.shape, vec_gd.shape) 176 | 177 | 178 | #org: 179 | #sl1 = self.sl1_loss(ploc, vec_gd).sum(dim=1) 180 | #change: 181 | sl1 = self.sl1_loss(ploc, vec_gd)#.sum(dim=1) 182 | sl1 = torch.sum(sl1,dim=1) 183 | 184 | #print(sl1.shape,mask.shape) 185 | sl1 = (mask.float()*sl1).sum(dim=1) 186 | 187 | # hard negative mining 188 | con = self.con_loss(plabel, glabel) 189 | 190 | # postive mask will never selected 191 | #print(con.shape) 192 | con_neg = con.clone() 193 | #print(mask.shape) 194 | #print(con_neg.shape) 195 | con_neg[mask] = 0 196 | _, con_idx = con_neg.sort(dim=1, descending=True) 197 | _, con_rank = con_idx.sort(dim=1) 198 | 199 | # number of negative three times positive 200 | neg_num = torch.clamp(3*pos_num, max=mask.size(1)).unsqueeze(-1) 201 | neg_mask = con_rank < neg_num 202 | 203 | #print(con.shape, mask.shape, neg_mask.shape) 204 | closs = (con*(mask.float() + neg_mask.float())).sum(dim=1) 205 | 206 | # avoid no object detected 207 | total_loss = sl1 + closs 208 | num_mask = (pos_num > 0).float() 209 | pos_num = pos_num.float().clamp(min=1e-6) 210 | ret = (total_loss*num_mask/pos_num).mean(dim=0) 211 | return ret 212 | -------------------------------------------------------------------------------- /tests/ssd300/src/train.py: -------------------------------------------------------------------------------- 1 | from torch.autograd import Variable 2 | import torch 3 | import time 4 | from torch.cuda.amp import autocast as autocast, GradScaler 5 | from ncnnqat import unquant_weight 6 | scaler = GradScaler() 7 | def train_loop(model, loss_func, epoch, optim, train_loader, iteration, logger, args): 8 | for nbatch, data in enumerate(train_loader): 9 | if args.data_pipeline == 'no_dali': 10 | (img, _, img_size, bbox, label) = data 11 | img = img.cuda() 12 | bbox = bbox.cuda() 13 | label = label.cuda() 14 | else: 15 | img = data[0]["images"] 16 | bbox = data[0]["boxes"] 17 | label = data[0]["labels"] 18 | label = label.type(torch.cuda.LongTensor) 19 | 20 | #print(img.dtype) 21 | 
#print(bbox.dtype) 22 | #print(label.dtype) 23 | #print("====================================") 24 | #boxes_in_batch = len(label.nonzero()) 25 | boxes_in_batch = len(label.nonzero(as_tuple=True)) 26 | 27 | 28 | if boxes_in_batch != 0: 29 | 30 | 31 | trans_bbox = bbox.transpose(1, 2).contiguous().cuda() 32 | 33 | label = label.cuda() 34 | gloc = Variable(trans_bbox, requires_grad=False) 35 | glabel = Variable(label, requires_grad=False) 36 | 37 | with autocast(): 38 | ploc, plabel = model(img) 39 | ploc, plabel = ploc.float(), plabel.float() 40 | loss = loss_func(ploc, plabel, gloc, glabel) 41 | 42 | 43 | logger.update_iter(epoch, iteration, loss.item()) 44 | 45 | if args.fp16: 46 | scaler.scale(loss).backward() 47 | else: 48 | loss.backward() 49 | 50 | if args.warmup is not None: 51 | warmup(optim, args.warmup, iteration, args.learning_rate) 52 | if args.fp16: 53 | scaler.step(optim) 54 | scaler.update() 55 | else: 56 | '''qat''' 57 | if epoch >= args.qat_epoch: 58 | model.apply(unquant_weight) 59 | '''qat''' 60 | optim.step() 61 | optim.zero_grad() 62 | iteration += 1 63 | 64 | return iteration 65 | 66 | 67 | def warmup(optim, warmup_iters, iteration, base_lr): 68 | if iteration < warmup_iters: 69 | new_lr = 1. * base_lr / warmup_iters * iteration 70 | for param_group in optim.param_groups: 71 | param_group['lr'] = new_lr 72 | 73 | 74 | def tencent_trick(model): 75 | """ 76 | Divide parameters into 2 groups. 77 | First group is BNs and all biases. 78 | Second group is the remaining model's parameters. 79 | Weight decay will be disabled in first group (aka tencent trick). 80 | """ 81 | decay, no_decay = [], [] 82 | for name, param in model.named_parameters(): 83 | if not param.requires_grad: 84 | continue # frozen weights 85 | if len(param.shape) == 1 or name.endswith(".bias"): 86 | no_decay.append(param) 87 | else: 88 | decay.append(param) 89 | return [{'params': no_decay, 'weight_decay': 0.0}, 90 | {'params': decay}] 91 | -------------------------------------------------------------------------------- /tests/ssd300/src/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision.transforms as transforms 3 | import torch.utils.data as data 4 | from PIL import Image 5 | import os 6 | import numpy as np 7 | import random 8 | import itertools 9 | import torch.nn.functional as F 10 | import json 11 | import time 12 | import bz2 13 | import pickle 14 | from math import sqrt 15 | # from src.coco_pipeline import COCOReaderPipeline 16 | 17 | 18 | # This function is from https://github.com/kuangliu/pytorch-ssd. 
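# For boxes in ltrb format the function below evaluates, for every pair,
#   IoU(A, B) = area(A intersect B) / (area(A) + area(B) - area(A intersect B))
# by broadcasting box1 (N, 4) against box2 (M, 4) to produce an (N, M) result.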
19 | def calc_iou_tensor(box1, box2): 20 | """ Calculation of IoU based on two boxes tensor, 21 | Reference to https://github.com/kuangliu/pytorch-src 22 | input: 23 | box1 (N, 4) 24 | box2 (M, 4) 25 | output: 26 | IoU (N, M) 27 | """ 28 | N = box1.size(0) 29 | M = box2.size(0) 30 | 31 | be1 = box1.unsqueeze(1).expand(-1, M, -1) 32 | be2 = box2.unsqueeze(0).expand(N, -1, -1) 33 | 34 | # Left Top & Right Bottom 35 | lt = torch.max(be1[:,:,:2], be2[:,:,:2]) 36 | #mask1 = (be1[:,:, 0] < be2[:,:, 0]) ^ (be1[:,:, 1] < be2[:,:, 1]) 37 | #mask1 = ~mask1 38 | rb = torch.min(be1[:,:,2:], be2[:,:,2:]) 39 | #mask2 = (be1[:,:, 2] < be2[:,:, 2]) ^ (be1[:,:, 3] < be2[:,:, 3]) 40 | #mask2 = ~mask2 41 | 42 | delta = rb - lt 43 | delta[delta < 0] = 0 44 | intersect = delta[:,:,0]*delta[:,:,1] 45 | #*mask1.float()*mask2.float() 46 | 47 | delta1 = be1[:,:,2:] - be1[:,:,:2] 48 | area1 = delta1[:,:,0]*delta1[:,:,1] 49 | delta2 = be2[:,:,2:] - be2[:,:,:2] 50 | area2 = delta2[:,:,0]*delta2[:,:,1] 51 | 52 | iou = intersect/(area1 + area2 - intersect) 53 | return iou 54 | 55 | 56 | # This function is from https://github.com/kuangliu/pytorch-ssd. 57 | class Encoder(object): 58 | """ 59 | Inspired by https://github.com/kuangliu/pytorch-src 60 | Transform between (bboxes, lables) <-> SSD output 61 | 62 | dboxes: default boxes in size 8732 x 4, 63 | encoder: input ltrb format, output xywh format 64 | decoder: input xywh format, output ltrb format 65 | 66 | encode: 67 | input : bboxes_in (Tensor nboxes x 4), labels_in (Tensor nboxes) 68 | output : bboxes_out (Tensor 8732 x 4), labels_out (Tensor 8732) 69 | criteria : IoU threshold of bboexes 70 | 71 | decode: 72 | input : bboxes_in (Tensor 8732 x 4), scores_in (Tensor 8732 x nitems) 73 | output : bboxes_out (Tensor nboxes x 4), labels_out (Tensor nboxes) 74 | criteria : IoU threshold of bboexes 75 | max_output : maximum number of output bboxes 76 | """ 77 | 78 | def __init__(self, dboxes): 79 | self.dboxes = dboxes(order="ltrb") 80 | self.dboxes_xywh = dboxes(order="xywh").unsqueeze(dim=0) 81 | self.nboxes = self.dboxes.size(0) 82 | #print("# Bounding boxes: {}".format(self.nboxes)) 83 | self.scale_xy = dboxes.scale_xy 84 | self.scale_wh = dboxes.scale_wh 85 | 86 | def encode(self, bboxes_in, labels_in, criteria = 0.5): 87 | 88 | ious = calc_iou_tensor(bboxes_in, self.dboxes) 89 | best_dbox_ious, best_dbox_idx = ious.max(dim=0) 90 | best_bbox_ious, best_bbox_idx = ious.max(dim=1) 91 | 92 | # set best ious 2.0 93 | best_dbox_ious.index_fill_(0, best_bbox_idx, 2.0) 94 | 95 | idx = torch.arange(0, best_bbox_idx.size(0), dtype=torch.int64) 96 | best_dbox_idx[best_bbox_idx[idx]] = idx 97 | 98 | # filter IoU > 0.5 99 | masks = best_dbox_ious > criteria 100 | labels_out = torch.zeros(self.nboxes, dtype=torch.long) 101 | #print(maxloc.shape, labels_in.shape, labels_out.shape) 102 | labels_out[masks] = labels_in[best_dbox_idx[masks]] 103 | bboxes_out = self.dboxes.clone() 104 | bboxes_out[masks, :] = bboxes_in[best_dbox_idx[masks], :] 105 | # Transform format to xywh format 106 | x, y, w, h = 0.5*(bboxes_out[:, 0] + bboxes_out[:, 2]), \ 107 | 0.5*(bboxes_out[:, 1] + bboxes_out[:, 3]), \ 108 | -bboxes_out[:, 0] + bboxes_out[:, 2], \ 109 | -bboxes_out[:, 1] + bboxes_out[:, 3] 110 | bboxes_out[:, 0] = x 111 | bboxes_out[:, 1] = y 112 | bboxes_out[:, 2] = w 113 | bboxes_out[:, 3] = h 114 | return bboxes_out, labels_out 115 | 116 | def scale_back_batch(self, bboxes_in, scores_in): 117 | """ 118 | Do scale and transform from xywh to ltrb 119 | suppose input Nx4xnum_bbox 
Nxlabel_numxnum_bbox 120 | """ 121 | if bboxes_in.device == torch.device("cpu"): 122 | self.dboxes = self.dboxes.cpu() 123 | self.dboxes_xywh = self.dboxes_xywh.cpu() 124 | else: 125 | self.dboxes = self.dboxes.cuda() 126 | self.dboxes_xywh = self.dboxes_xywh.cuda() 127 | 128 | bboxes_in = bboxes_in.permute(0, 2, 1) 129 | scores_in = scores_in.permute(0, 2, 1) 130 | #print(bboxes_in.device, scores_in.device, self.dboxes_xywh.device) 131 | 132 | bboxes_in[:, :, :2] = self.scale_xy*bboxes_in[:, :, :2] 133 | bboxes_in[:, :, 2:] = self.scale_wh*bboxes_in[:, :, 2:] 134 | 135 | bboxes_in[:, :, :2] = bboxes_in[:, :, :2]*self.dboxes_xywh[:, :, 2:] + self.dboxes_xywh[:, :, :2] 136 | bboxes_in[:, :, 2:] = bboxes_in[:, :, 2:].exp()*self.dboxes_xywh[:, :, 2:] 137 | 138 | # Transform format to ltrb 139 | l, t, r, b = bboxes_in[:, :, 0] - 0.5*bboxes_in[:, :, 2],\ 140 | bboxes_in[:, :, 1] - 0.5*bboxes_in[:, :, 3],\ 141 | bboxes_in[:, :, 0] + 0.5*bboxes_in[:, :, 2],\ 142 | bboxes_in[:, :, 1] + 0.5*bboxes_in[:, :, 3] 143 | 144 | bboxes_in[:, :, 0] = l 145 | bboxes_in[:, :, 1] = t 146 | bboxes_in[:, :, 2] = r 147 | bboxes_in[:, :, 3] = b 148 | 149 | return bboxes_in, F.softmax(scores_in, dim=-1) 150 | 151 | def decode_batch(self, bboxes_in, scores_in, criteria = 0.45, max_output=200): 152 | bboxes, probs = self.scale_back_batch(bboxes_in, scores_in) 153 | 154 | output = [] 155 | for bbox, prob in zip(bboxes.split(1, 0), probs.split(1, 0)): 156 | bbox = bbox.squeeze(0) 157 | prob = prob.squeeze(0) 158 | output.append(self.decode_single(bbox, prob, criteria, max_output)) 159 | #print(output[-1]) 160 | return output 161 | 162 | # perform non-maximum suppression 163 | def decode_single(self, bboxes_in, scores_in, criteria, max_output, max_num=200): 164 | # Reference to https://github.com/amdegroot/ssd.pytorch 165 | 166 | bboxes_out = [] 167 | scores_out = [] 168 | labels_out = [] 169 | 170 | for i, score in enumerate(scores_in.split(1, 1)): 171 | # skip background 172 | # print(score[score>0.90]) 173 | if i == 0: continue 174 | # print(i) 175 | 176 | score = score.squeeze(1) 177 | mask = score > 0.05 178 | 179 | bboxes, score = bboxes_in[mask, :], score[mask] 180 | if score.size(0) == 0: continue 181 | 182 | score_sorted, score_idx_sorted = score.sort(dim=0) 183 | 184 | # select max_output indices 185 | score_idx_sorted = score_idx_sorted[-max_num:] 186 | candidates = [] 187 | #maxdata, maxloc = scores_in.sort() 188 | 189 | while score_idx_sorted.numel() > 0: 190 | idx = score_idx_sorted[-1].item() 191 | bboxes_sorted = bboxes[score_idx_sorted, :] 192 | bboxes_idx = bboxes[idx, :].unsqueeze(dim=0) 193 | iou_sorted = calc_iou_tensor(bboxes_sorted, bboxes_idx).squeeze() 194 | # we only need iou < criteria 195 | score_idx_sorted = score_idx_sorted[iou_sorted < criteria] 196 | candidates.append(idx) 197 | 198 | bboxes_out.append(bboxes[candidates, :]) 199 | scores_out.append(score[candidates]) 200 | labels_out.extend([i]*len(candidates)) 201 | 202 | bboxes_out, labels_out, scores_out = torch.cat(bboxes_out, dim=0), \ 203 | torch.tensor(labels_out, dtype=torch.long), \ 204 | torch.cat(scores_out, dim=0) 205 | 206 | 207 | _, max_ids = scores_out.sort(dim=0) 208 | max_ids = max_ids[-max_output:] 209 | return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids] 210 | 211 | 212 | class DefaultBoxes(object): 213 | def __init__(self, fig_size, feat_size, steps, scales, aspect_ratios, \ 214 | scale_xy=0.1, scale_wh=0.2): 215 | 216 | self.feat_size = feat_size 217 | self.fig_size = fig_size 218 | 219 | 
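# scale_xy / scale_wh (0.1 / 0.2 by default) are the SSD box-encoding scales:
# scale_back_batch above decodes predictions as
#   cx = dbox_cx + scale_xy * t_x * dbox_w      (same for cy, with dbox_h)
#   w  = dbox_w * exp(scale_wh * t_w)           (same for h)
# They are exposed through the scale_xy / scale_wh properties below so the
# Encoder can reuse the same values.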
self.scale_xy_ = scale_xy 220 | self.scale_wh_ = scale_wh 221 | 222 | # According to https://github.com/weiliu89/caffe 223 | # Calculation method slightly different from paper 224 | self.steps = steps 225 | self.scales = scales 226 | 227 | fk = fig_size/np.array(steps) 228 | self.aspect_ratios = aspect_ratios 229 | 230 | self.default_boxes = [] 231 | # size of feature and number of feature 232 | for idx, sfeat in enumerate(self.feat_size): 233 | 234 | sk1 = scales[idx]/fig_size 235 | sk2 = scales[idx+1]/fig_size 236 | sk3 = sqrt(sk1*sk2) 237 | all_sizes = [(sk1, sk1), (sk3, sk3)] 238 | 239 | for alpha in aspect_ratios[idx]: 240 | w, h = sk1*sqrt(alpha), sk1/sqrt(alpha) 241 | all_sizes.append((w, h)) 242 | all_sizes.append((h, w)) 243 | for w, h in all_sizes: 244 | for i, j in itertools.product(range(sfeat), repeat=2): 245 | cx, cy = (j+0.5)/fk[idx], (i+0.5)/fk[idx] 246 | self.default_boxes.append((cx, cy, w, h)) 247 | 248 | self.dboxes = torch.tensor(self.default_boxes, dtype = torch.float) 249 | self.dboxes.clamp_(min=0, max=1) 250 | # For IoU calculation 251 | self.dboxes_ltrb = self.dboxes.clone() 252 | self.dboxes_ltrb[:, 0] = self.dboxes[:, 0] - 0.5 * self.dboxes[:, 2] 253 | self.dboxes_ltrb[:, 1] = self.dboxes[:, 1] - 0.5 * self.dboxes[:, 3] 254 | self.dboxes_ltrb[:, 2] = self.dboxes[:, 0] + 0.5 * self.dboxes[:, 2] 255 | self.dboxes_ltrb[:, 3] = self.dboxes[:, 1] + 0.5 * self.dboxes[:, 3] 256 | 257 | @property 258 | def scale_xy(self): 259 | return self.scale_xy_ 260 | 261 | @property 262 | def scale_wh(self): 263 | return self.scale_wh_ 264 | 265 | def as_ltrb_list(self): 266 | return [x for x in self.dboxes_ltrb.view(-1).numpy()] 267 | 268 | def __call__(self, order="ltrb"): 269 | if order == "ltrb": return self.dboxes_ltrb 270 | if order == "xywh": return self.dboxes 271 | 272 | 273 | def dboxes300_coco(): 274 | figsize = 300 275 | feat_size = [38, 19, 10, 5, 3, 1] 276 | steps = [8, 16, 32, 64, 100, 300] 277 | # use the scales here: https://github.com/amdegroot/ssd.pytorch/blob/master/data/config.py 278 | scales = [21, 45, 99, 153, 207, 261, 315] 279 | aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] 280 | dboxes = DefaultBoxes(figsize, feat_size, steps, scales, aspect_ratios) 281 | return dboxes 282 | 283 | 284 | # This function is from https://github.com/chauhan-utk/ssd.DomainAdaptation. 285 | class SSDCropping(object): 286 | """ Cropping for SSD, according to original paper 287 | Choose between following 3 conditions: 288 | 1. Preserve the original image 289 | 2. Random crop minimum IoU is among 0.1, 0.3, 0.5, 0.7, 0.9 290 | 3. 
Random crop 291 | Reference to https://github.com/chauhan-utk/src.DomainAdaptation 292 | """ 293 | def __init__(self): 294 | 295 | self.sample_options = ( 296 | # Do nothing 297 | None, 298 | # min IoU, max IoU 299 | (0.1, None), 300 | (0.3, None), 301 | (0.5, None), 302 | (0.7, None), 303 | (0.9, None), 304 | # no IoU requirements 305 | (None, None), 306 | ) 307 | 308 | def __call__(self, img, img_size, bboxes, labels): 309 | 310 | # Ensure always return cropped image 311 | while True: 312 | mode = random.choice(self.sample_options) 313 | 314 | if mode is None: 315 | return img, img_size, bboxes, labels 316 | 317 | htot, wtot = img_size 318 | 319 | min_iou, max_iou = mode 320 | min_iou = float("-inf") if min_iou is None else min_iou 321 | max_iou = float("+inf") if max_iou is None else max_iou 322 | 323 | # Implementation use 50 iteration to find possible candidate 324 | for _ in range(1): 325 | # suze of each sampled path in [0.1, 1] 0.3*0.3 approx. 0.1 326 | w = random.uniform(0.3 , 1.0) 327 | h = random.uniform(0.3 , 1.0) 328 | 329 | if w/h < 0.5 or w/h > 2: 330 | continue 331 | 332 | # left 0 ~ wtot - w, top 0 ~ htot - h 333 | left = random.uniform(0, 1.0 - w) 334 | top = random.uniform(0, 1.0 - h) 335 | 336 | right = left + w 337 | bottom = top + h 338 | 339 | ious = calc_iou_tensor(bboxes, torch.tensor([[left, top, right, bottom]])) 340 | 341 | # tailor all the bboxes and return 342 | if not ((ious > min_iou) & (ious < max_iou)).all(): 343 | continue 344 | 345 | # discard any bboxes whose center not in the cropped image 346 | xc = 0.5*(bboxes[:, 0] + bboxes[:, 2]) 347 | yc = 0.5*(bboxes[:, 1] + bboxes[:, 3]) 348 | 349 | masks = (xc > left) & (xc < right) & (yc > top) & (yc < bottom) 350 | 351 | # if no such boxes, continue searching again 352 | if not masks.any(): 353 | continue 354 | 355 | bboxes[bboxes[:, 0] < left, 0] = left 356 | bboxes[bboxes[:, 1] < top, 1] = top 357 | bboxes[bboxes[:, 2] > right, 2] = right 358 | bboxes[bboxes[:, 3] > bottom, 3] = bottom 359 | 360 | #print(left, top, right, bottom) 361 | #print(labels, bboxes, masks) 362 | bboxes = bboxes[masks, :] 363 | labels = labels[masks] 364 | 365 | left_idx = int(left*wtot) 366 | top_idx = int(top*htot) 367 | right_idx = int(right*wtot) 368 | bottom_idx = int(bottom*htot) 369 | #print(left_idx,top_idx,right_idx,bottom_idx) 370 | #img = img[:, top_idx:bottom_idx, left_idx:right_idx] 371 | img = img.crop((left_idx, top_idx, right_idx, bottom_idx)) 372 | 373 | bboxes[:, 0] = (bboxes[:, 0] - left)/w 374 | bboxes[:, 1] = (bboxes[:, 1] - top)/h 375 | bboxes[:, 2] = (bboxes[:, 2] - left)/w 376 | bboxes[:, 3] = (bboxes[:, 3] - top)/h 377 | 378 | htot = bottom_idx - top_idx 379 | wtot = right_idx - left_idx 380 | return img, (htot, wtot), bboxes, labels 381 | 382 | 383 | class RandomHorizontalFlip(object): 384 | def __init__(self, p=0.5): 385 | self.p = p 386 | 387 | def __call__(self, image, bboxes): 388 | if random.random() < self.p: 389 | bboxes[:, 0], bboxes[:, 2] = 1.0 - bboxes[:, 2], 1.0 - bboxes[:, 0] 390 | return image.transpose(Image.FLIP_LEFT_RIGHT), bboxes 391 | return image, bboxes 392 | 393 | 394 | # Do data augumentation 395 | class SSDTransformer(object): 396 | """ SSD Data Augumentation, according to original paper 397 | Composed by several steps: 398 | Cropping 399 | Resize 400 | Flipping 401 | Jittering 402 | """ 403 | def __init__(self, dboxes, args, size = (300, 300), val=False): 404 | 405 | self.args = args 406 | self.size = size 407 | self.val = val 408 | 409 | self.dboxes_ = dboxes 410 | self.encoder = 
Encoder(self.dboxes_) 411 | self.crop = SSDCropping() 412 | 413 | train_trans = [transforms.Resize(self.size)] 414 | train_trans.append(transforms.ColorJitter( 415 | brightness=0.125, 416 | contrast=0.5, 417 | saturation=0.5, 418 | hue=0.05)) 419 | train_trans.append(transforms.ToTensor()) 420 | 421 | self.img_trans = transforms.Compose(train_trans) 422 | self.hflip = RandomHorizontalFlip() 423 | 424 | # All PyTorch Tensor will be normalized 425 | # https://discuss.pytorch.org/t/how-to-preprocess-input-for-pre-trained-networks/683 426 | self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], 427 | std=[0.229, 0.224, 0.225]) 428 | 429 | self.trans_val = transforms.Compose([ 430 | transforms.Resize(self.size), 431 | transforms.ToTensor(), 432 | self.normalize]) 433 | 434 | @property 435 | def dboxes(self): 436 | return self.dboxes_ 437 | 438 | def __call__(self, img, img_size, bbox=None, label=None, max_num=200): 439 | if self.val: 440 | bbox_out = torch.zeros(max_num, 4) 441 | label_out = torch.zeros(max_num, dtype=torch.long) 442 | bbox_out[:bbox.size(0), :] = bbox 443 | label_out[:label.size(0)] = label 444 | return self.trans_val(img), img_size, bbox_out, label_out 445 | 446 | img, img_size, bbox, label = self.crop(img, img_size, bbox, label) 447 | img, bbox = self.hflip(img, bbox) 448 | img = self.img_trans(img).contiguous() 449 | img = self.normalize(img) 450 | bbox, label = self.encoder.encode(bbox, label) 451 | 452 | return img, img_size, bbox, label 453 | 454 | # Implement a datareader for COCO dataset 455 | class COCODetection(data.Dataset): 456 | def __init__(self, img_folder, annotate_file, transform=None): 457 | self.img_folder = img_folder 458 | self.annotate_file = annotate_file 459 | 460 | # Start processing annotation 461 | with open(annotate_file) as fin: 462 | self.data = json.load(fin) 463 | 464 | self.images = {} 465 | 466 | self.label_map = {} 467 | self.label_info = {} 468 | #print("Parsing COCO data...") 469 | start_time = time.time() 470 | # 0 stand for the background 471 | cnt = 0 472 | self.label_info[cnt] = "background" 473 | for cat in self.data["categories"]: 474 | cnt += 1 475 | self.label_map[cat["id"]] = cnt 476 | self.label_info[cnt] = cat["name"] 477 | 478 | # build inference for images 479 | for img in self.data["images"]: 480 | img_id = img["id"] 481 | img_name = img["file_name"] 482 | img_size = (img["height"],img["width"]) 483 | #print(img_name) 484 | if img_id in self.images: raise Exception("dulpicated image record") 485 | self.images[img_id] = (img_name, img_size, []) 486 | 487 | # read bboxes 488 | for bboxes in self.data["annotations"]: 489 | img_id = bboxes["image_id"] 490 | category_id = bboxes["category_id"] 491 | bbox = bboxes["bbox"] 492 | bbox_label = self.label_map[bboxes["category_id"]] 493 | self.images[img_id][2].append((bbox, bbox_label)) 494 | 495 | for k, v in list(self.images.items()): 496 | if len(v[2]) == 0: 497 | #print("empty image: {}".format(k)) 498 | self.images.pop(k) 499 | 500 | self.img_keys = list(self.images.keys()) 501 | self.transform = transform 502 | 503 | @property 504 | def labelnum(self): 505 | return len(self.label_info) 506 | 507 | @staticmethod 508 | def load(pklfile): 509 | #print("Loading from {}".format(pklfile)) 510 | with bz2.open(pklfile, "rb") as fin: 511 | ret = pickle.load(fin) 512 | return ret 513 | 514 | def save(self, pklfile): 515 | #print("Saving to {}".format(pklfile)) 516 | with bz2.open(pklfile, "wb") as fout: 517 | pickle.dump(self, fout) 518 | 519 | 520 | def __len__(self): 521 | 
return len(self.images) 522 | 523 | def __getitem__(self, idx): 524 | img_id = self.img_keys[idx] 525 | img_data = self.images[img_id] 526 | fn = img_data[0] 527 | img_path = os.path.join(self.img_folder, fn) 528 | img = Image.open(img_path).convert("RGB") 529 | 530 | htot, wtot = img_data[1] 531 | bbox_sizes = [] 532 | bbox_labels = [] 533 | 534 | #for (xc, yc, w, h), bbox_label in img_data[2]: 535 | for (l,t,w,h), bbox_label in img_data[2]: 536 | r = l + w 537 | b = t + h 538 | #l, t, r, b = xc - 0.5*w, yc - 0.5*h, xc + 0.5*w, yc + 0.5*h 539 | bbox_size = (l/wtot, t/htot, r/wtot, b/htot) 540 | bbox_sizes.append(bbox_size) 541 | bbox_labels.append(bbox_label) 542 | 543 | bbox_sizes = torch.tensor(bbox_sizes) 544 | bbox_labels = torch.tensor(bbox_labels) 545 | 546 | 547 | if self.transform != None: 548 | img, (htot, wtot), bbox_sizes, bbox_labels = \ 549 | self.transform(img, (htot, wtot), bbox_sizes, bbox_labels) 550 | else: 551 | pass 552 | 553 | return img, img_id, (htot, wtot), bbox_sizes, bbox_labels 554 | 555 | 556 | def draw_patches(img, bboxes, labels, order="xywh", label_map={}): 557 | 558 | import matplotlib.pyplot as plt 559 | import matplotlib.patches as patches 560 | # Suppose bboxes in fractional coordinate: 561 | # cx, cy, w, h 562 | # img = img.numpy() 563 | img = np.array(img) 564 | labels = np.array(labels) 565 | bboxes = bboxes.numpy() 566 | 567 | if label_map: 568 | labels = [label_map.get(l) for l in labels] 569 | 570 | if order == "ltrb": 571 | xmin, ymin, xmax, ymax = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3] 572 | cx, cy, w, h = (xmin + xmax)/2, (ymin + ymax)/2, xmax - xmin, ymax - ymin 573 | else: 574 | cx, cy, w, h = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3] 575 | 576 | htot, wtot,_ = img.shape 577 | cx *= wtot 578 | cy *= htot 579 | w *= wtot 580 | h *= htot 581 | 582 | bboxes = zip(cx, cy, w, h) 583 | 584 | plt.imshow(img) 585 | ax = plt.gca() 586 | for (cx, cy, w, h), label in zip(bboxes, labels): 587 | if label == "background": continue 588 | ax.add_patch(patches.Rectangle((cx-0.5*w, cy-0.5*h), 589 | w, h, fill=False, color="r")) 590 | bbox_props = dict(boxstyle="round", fc="y", ec="0.5", alpha=0.3) 591 | ax.text(cx-0.5*w, cy-0.5*h, label, ha="center", va="center", size=15, bbox=bbox_props) 592 | plt.show() 593 | -------------------------------------------------------------------------------- /tests/test_cifar10.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import os 3 | import copy 4 | from ncnnqat import unquant_weight, merge_freeze_bn, register_quantization_hook,save_table 5 | import unittest 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | from torch.optim import lr_scheduler 11 | from torchvision import models 12 | from torchvision import datasets,utils 13 | from torch.autograd import Variable 14 | import torchvision 15 | import torchvision.transforms as transforms 16 | from torchsummary import summary 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3' 25 | os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3,4,5' 26 | #os.environ['CUDA_VISIBLE_DEVICES'] = '0' 27 | 28 | 29 | 30 | 31 | def net_builder(class_num,net_name="mobile_netv2"): 32 | if net_name == "mobile_netv2": 33 | net = models.mobilenet_v2(pretrained=True) 34 | net.classifier = nn.Sequential(nn.Linear(1280, 1000), nn.ReLU(True),nn.Dropout(0.5),nn.Linear(1000, class_num)) 35 | elif net_name == "resnet18": 
36 | net = models.resnet18(pretrained=True) 37 | num_ftrs = net.fc.in_features 38 | net.fc = nn.Linear(num_ftrs, class_num) 39 | else: 40 | raise ValueError("net_name not in(mobile_netv2,resnet18)") 41 | return net 42 | 43 | 44 | class Mbnet(unittest.TestCase): 45 | def test(self): 46 | num_workers = 10 47 | 48 | 49 | net_name="resnet18" 50 | net_name="mobile_netv2" 51 | 52 | class_num = 10 53 | 54 | img_size = 224 55 | batch_size = 128 56 | epoch_all = 50 57 | epoch_merge_bn = epoch_all-5 58 | 59 | #maybe cuda out of memery,set test epoch in a small count 60 | epoch_all = 4 61 | epoch_merge_bn = epoch_all-2 62 | 63 | checkpoint = "./model.pt" 64 | pre_list = ["train","val"] 65 | dataloaders = {} 66 | 67 | transform = transforms.Compose([ 68 | transforms.Resize(img_size), 69 | transforms.ToTensor(), 70 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) 71 | ]) 72 | trainset = torchvision.datasets.CIFAR10(root='./data', 73 | train=True, 74 | download=True, 75 | transform=transform) 76 | dataloaders['train'] = torch.utils.data.DataLoader(trainset, 77 | batch_size=batch_size, 78 | shuffle=True, 79 | num_workers=2) 80 | testset = torchvision.datasets.CIFAR10(root='./data', 81 | train=False, 82 | download=True, 83 | transform=transform) 84 | dataloaders['val'] = torch.utils.data.DataLoader(testset, 85 | batch_size=batch_size, 86 | shuffle=True, 87 | num_workers=2) 88 | 89 | 90 | 91 | dummy_input = torch.randn(1, 3, img_size, img_size, device='cuda') 92 | input_names = [ "input" ] 93 | output_names = [ "fc" ] #mobilenet 94 | 95 | 96 | 97 | net = net_builder(10,net_name=net_name) 98 | 99 | if torch.cuda.device_count() > 1: 100 | print("Let's use", torch.cuda.device_count(), "GPUs!") 101 | net = nn.DataParallel(net) 102 | net.cuda() 103 | criterion = nn.CrossEntropyLoss() 104 | optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9) 105 | exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1) 106 | 107 | best_model_wts = copy.deepcopy(net.state_dict()) 108 | best_acc = 0.0 109 | best_acc_org = 0.0 110 | print("training:") 111 | state_dict_merge = False 112 | for epoch in range(epoch_all): 113 | net.train() 114 | if epoch == epoch_merge_bn: 115 | best_acc_org = best_acc 116 | #save not use qat model 117 | if torch.cuda.device_count() > 1: 118 | net_t = net_builder(class_num,net_name=net_name) 119 | net_t.cuda() 120 | net_t.load_state_dict({k.replace('module.',''):v for k,v in best_model_wts.items()}) 121 | torch.onnx.export(net_t, dummy_input, "mobilenet_org.onnx", verbose=False, input_names=input_names, output_names=output_names) 122 | print("export org onnx") 123 | else: 124 | torch.onnx.export(net, dummy_input, "mobilenet_org.onnx", verbose=False, input_names=input_names, output_names=output_names) 125 | print("export org onnx") 126 | register_quantization_hook(net) 127 | net = merge_freeze_bn(net) 128 | 129 | best_acc = 0. 
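# QAT schedule in this loop (summarizing the branches around this point):
#   epoch == epoch_merge_bn     -> export the float baseline ONNX, then call
#                                  register_quantization_hook() + merge_freeze_bn()
#   epoch == epoch_merge_bn + 1 -> merge/freeze BN again and snapshot these first
#                                  QAT weights as the new baseline
#   epoch >  epoch_merge_bn + 1 -> keep re-applying merge_freeze_bn() every epoch
# Inside the batch loop, net.apply(unquant_weight) runs right before
# optimizer.step() once epoch >= epoch_merge_bn (the same pattern used in
# tests/ssd300/src/train.py), so the gradient update is applied to the
# unquantized float copy of the weights.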
130 | if epoch == epoch_merge_bn+1: 131 | net = merge_freeze_bn(net) 132 | print("merge bn") 133 | best_model_wts = copy.deepcopy(net.state_dict()) #first epoch of qat ,save model as baseline 134 | if epoch > epoch_merge_bn+1: 135 | print("merge bn") 136 | net = merge_freeze_bn(net) 137 | 138 | running_loss = 0.0 139 | bath_term = 20 140 | for index, data in enumerate(dataloaders['train']): 141 | inputs, labels = data 142 | inputs, labels = Variable(inputs.cuda()), Variable( 143 | labels.cuda()) 144 | optimizer.zero_grad() 145 | outputs = net(inputs) 146 | loss = criterion(outputs, labels) 147 | loss.backward() 148 | if epoch >= epoch_merge_bn: 149 | net.apply(unquant_weight) 150 | 151 | optimizer.step() 152 | 153 | running_loss += loss.item() 154 | if index % bath_term == 100: 155 | print(' epoch %3d, Iter %5d, loss: %.3f' % (epoch + 1, index + 1, running_loss / bath_term)) 156 | running_loss = 0.0 157 | exp_lr_scheduler.step() 158 | 159 | net.eval() 160 | correct = total = 0 161 | for data in dataloaders['val']: 162 | images, labels = data 163 | outputs = net(Variable(images.cuda())) 164 | _, predicted = torch.max(outputs.data, 1) 165 | correct += (predicted == labels.cuda()).sum() 166 | total += labels.size(0) 167 | print('Epoch: {} Accuracy: {}'.format(str(epoch),str(100.0 * correct.cpu().numpy() / total))) 168 | epoch_acc = 100.0 * correct / total 169 | if epoch_acc >= best_acc: 170 | best_acc = epoch_acc 171 | best_model_wts = copy.deepcopy(net.state_dict()) 172 | print("get best ....") 173 | net.load_state_dict(best_model_wts) 174 | print('Finished Training.') 175 | 176 | net.eval() 177 | correct = total = 0 178 | for data in dataloaders['val']: 179 | images, labels = data 180 | outputs = net(Variable(images.cuda())) 181 | _, predicted = torch.max(outputs.data, 1) 182 | correct += (predicted == labels.cuda()).sum() 183 | total += labels.size(0) 184 | print('Accuracy: {}'.format(str(100.0 * correct.cpu().numpy() / total))) 185 | 186 | if torch.cuda.device_count() > 1: 187 | net_t = net_builder(class_num,net_name=net_name) 188 | net_t.cuda() 189 | register_quantization_hook(net_t) 190 | net_t = merge_freeze_bn(net_t) 191 | 192 | net_t.load_state_dict({k.replace('module.',''):v for k,v in net.state_dict().items()}) 193 | torch.onnx.export(net_t, dummy_input, "mobilenet.onnx", verbose=False, input_names=input_names, output_names=output_names) #保存模型 194 | save_table(net_t,onnx_path="mobilenet.onnx",table="mobilenet.table") 195 | print("export qat onnx") 196 | else: 197 | torch.onnx.export(net, dummy_input, "mobilenet.onnx", verbose=False, input_names=input_names, output_names=output_names) 198 | save_table(net,onnx_path="mobilenet.onnx",table="mobilenet.table") 199 | print("export qat onnx") 200 | print(best_acc_org,best_acc) 201 | if __name__ == "__main__": 202 | suite = unittest.TestSuite() 203 | suite.addTest(Mbnet("test")) 204 | runner = unittest.TextTestRunner() 205 | runner.run(suite) 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | --------------------------------------------------------------------------------
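The QAT workflow exercised by tests/test_cifar10.py boils down to the sketch below. The network, data, epoch count and output file names are illustrative placeholders; only the ncnnqat calls (register_quantization_hook, merge_freeze_bn, unquant_weight, save_table) and the order they are applied in are taken from the test and from tests/ssd300/src/train.py. The test also trains in float for a few epochs before switching QAT on; that warm-up is omitted here.

import torch
import torch.nn as nn
from torchvision import models
from ncnnqat import register_quantization_hook, merge_freeze_bn, unquant_weight, save_table

# Placeholder model and synthetic data; any CUDA model with a standard training loop works.
net = models.resnet18().cuda()
optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
criterion = nn.CrossEntropyLoss()
loader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(torch.randn(8, 3, 224, 224),
                                   torch.randint(0, 10, (8,))),
    batch_size=4)

register_quantization_hook(net)          # attach fake-quantization hooks
net = merge_freeze_bn(net)               # merge and freeze BatchNorm

for epoch in range(2):                   # placeholder QAT epoch count
    net = merge_freeze_bn(net)           # re-merge BN each epoch, as in the test
    net.train()
    for images, labels in loader:
        optimizer.zero_grad()
        loss = criterion(net(images.cuda()), labels.cuda())
        loss.backward()
        net.apply(unquant_weight)        # restore float weights before the update
        optimizer.step()

# Export the quantization-aware model and the ncnn calibration table.
dummy_input = torch.randn(1, 3, 224, 224, device="cuda")
torch.onnx.export(net, dummy_input, "model_qat.onnx",
                  input_names=["input"], output_names=["fc"])
save_table(net, onnx_path="model_qat.onnx", table="model_qat.table")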