├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── coco.py
├── data.py
├── gpuplot.py
├── logs
│   ├── .keep
│   ├── masking
│   ├── ssd300.fp16.b16.k256.pytorch.bench
│   ├── ssd300.fp16.b16.k256.pytorch.coco
│   ├── ssd300.fp16.b16.k256.pytorch.qdrep
│   ├── ssd300.fp16.b16.k256.trt.bench
│   ├── ssd300.fp16.b16.k256.trt.coco
│   ├── ssd300.fp16.b16.k256.trt.qdrep
│   ├── ssd300.fp16.b16.k256.trt.svg
│   ├── ssd300.fp32.b16.k256.pytorch.bench
│   ├── ssd300.fp32.b16.k256.pytorch.coco
│   ├── ssd300.fp32.b16.k256.pytorch.qdrep
│   ├── ssd300.fp32.b16.k256.trt.coco
│   ├── ssd300.fp32.b16.k256.trt.qdrep
│   ├── ssd300.fp32.b16.k256.trt.svg
│   ├── ssd300.int8.b16.k256.trt.coco
│   ├── ssd300.int8.b16.k256.trt.qdrep
│   ├── ssd300.int8.b16.k256.trt.svg
│   └── subscript_assignment
├── masking.py
├── models
│   └── .keep
├── optrec.py
├── ssd300_baseline.py
├── ssd300_trt.py
└── subscript_assignment.py

/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvcr.io/nvidia/pytorch:20.10-py3
2 | 
3 | RUN python -c "import torch; torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_ssd', model_math='fp32')" 2>/dev/null | :
4 | RUN python -c "import torch; torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_ssd', model_math='fp16')" 2>/dev/null | :
5 | 
6 | # Nvidia Apex for mixed-precision inference
7 | RUN git clone https://github.com/NVIDIA/apex.git /build/apex
8 | WORKDIR /build/apex
9 | RUN pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
10 | 
11 | # use pbridger fork instead of NV repo - only change is NMS plugin updated to handle FP16
12 | WORKDIR /build
13 | RUN git clone --single-branch --branch release/7.2 https://github.com/pbridger/TensorRT.git
14 | WORKDIR /build/TensorRT
15 | RUN mkdir build && git submodule update --init --recursive
16 | WORKDIR /build/TensorRT/build
17 | # set GPU_ARCHS to match your GPU architecture (70 = Volta, 75 = Turing, 80 = Ampere), though this repo only makes sense for >= Volta (70)
18 | RUN cmake .. -DTRT_LIB_DIR=`pwd`/lib -DTRT_OUT_DIR=`pwd`/out -DGPU_ARCHS="75" -DBUILD_SAMPLES=OFF
19 | RUN make -j$(nproc) && make install && cp lib/* /usr/lib/x86_64-linux-gnu/
20 | 
21 | WORKDIR /build/TensorRT/tools/onnx-graphsurgeon
22 | RUN make install
23 | 
24 | RUN pip install pycuda
25 | RUN pip install pynvml
26 | 
27 | WORKDIR /app
28 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2020 Paul Bridger
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | CONTAINER_NAME := tensorrt-ssd300:latest 3 | # update to match your COCO 2017 location, or remove that volume mapping if you don't need to run --mode=coco 4 | DOCKER_CMD := docker run -it --rm --gpus=all --privileged=true --net=bridge --ulimit core=0 --ipc=host -v $(shell pwd):/app -v /data/coco2017:/data/coco2017 5 | PROFILE_CMD := profile -t cuda,cublas,cudnn,nvtx,osrt --force-overwrite=true --duration=30 --delay=6 6 | 7 | 8 | ### External - to be used from outside the container ### 9 | 10 | build-container: Dockerfile 11 | docker build -f $< -t ${CONTAINER_NAME} . 12 | 13 | run-container: build-container 14 | ${DOCKER_CMD} ${CONTAINER_NAME} 15 | 16 | 17 | logs/%.svg: logs/%.rec 18 | cat $< | svg-term --no-cursor > $@ 19 | 20 | 21 | logs/ssd300.fp32.b16.k256.pytorch.rec: 22 | rm -f logs/ssd300.fp32.b16.k256.pytorch.bench $@ 23 | asciinema rec $@.tmp -c 'make --no-print-directory logs/ssd300.fp32.b16.k256.pytorch.bench sleep' 24 | python optrec.py $@.tmp $@ 25 | 26 | logs/ssd300.fp16.b16.k256.pytorch.rec: 27 | rm -f logs/ssd300.fp16.b16.k256.pytorch.bench $@ 28 | asciinema rec $@.tmp -c 'make --no-print-directory logs/ssd300.fp16.b16.k256.pytorch.bench sleep' 29 | python optrec.py $@.tmp $@ 30 | 31 | logs/ssd300.fp32.b16.k256.trt.rec: 32 | rm -f logs/ssd300.fp32.b16.k256.trt.bench $@ 33 | asciinema rec $@.tmp -c 'make --no-print-directory logs/ssd300.fp32.b16.k256.trt.bench sleep' 34 | python optrec.py $@.tmp $@ 35 | 36 | logs/ssd300.fp16.b16.k256.trt.rec: 37 | rm -f logs/ssd300.fp16.b16.k256.trt.bench $@ 38 | asciinema rec $@.tmp -c 'make --no-print-directory logs/ssd300.fp16.b16.k256.trt.bench sleep' 39 | python optrec.py $@.tmp $@ 40 | 41 | logs/ssd300.int8.b16.k256.trt.rec: 42 | rm -f logs/ssd300.int8.b16.k256.trt.bench $@ 43 | asciinema rec $@.tmp -c 'make --no-print-directory logs/ssd300.int8.b16.k256.trt.bench sleep' 44 | python optrec.py $@.tmp $@ 45 | 46 | sleep: 47 | @sleep 4 48 | @echo '-' 49 | 50 | 51 | ### Internal - to be used from within the container (after run-container) ### 52 | 53 | ### Build models 54 | 55 | models/ssd300.fp32.b16.k256.plan: 56 | python ssd300_trt.py --mode=export --precision=fp32 --batch-dim=16 --topk=256 --runtime=trt 57 | 58 | models/ssd300.fp16.b16.k256.plan: 59 | python ssd300_trt.py --mode=export --precision=fp16 --batch-dim=16 --topk=256 --runtime=trt 60 | 61 | models/ssd300.int8.b16.k256.plan: 62 | python ssd300_trt.py --mode=export --precision=int8 --batch-dim=16 --topk=256 --runtime=trt 63 | 64 | models: models/ssd300.fp32.b16.k256.plan models/ssd300.fp16.b16.k256.plan models/ssd300.int8.b16.k256.plan 65 | 66 | ### COCO evaluation 67 | 68 | logs/ssd300.fp32.b16.k256.pytorch.coco: ssd300_trt.py 69 | python $< --mode=coco --precision=fp32 --batch-dim=16 --topk=256 --runtime=pytorch --output-path=$@ 70 | 71 | logs/ssd300.fp16.b16.k256.pytorch.coco: ssd300_trt.py 72 | python $< --mode=coco --precision=fp16 --batch-dim=16 --topk=256 --runtime=pytorch --output-path=$@ 73 | 74 | logs/ssd300.fp32.b16.k256.trt.coco: ssd300_trt.py models/ssd300.fp32.b16.k256.plan 75 | python $< --mode=coco 
--precision=fp32 --batch-dim=16 --topk=256 --runtime=trt --output-path=$@ 76 | 77 | logs/ssd300.fp16.b16.k256.trt.coco: ssd300_trt.py models/ssd300.fp16.b16.k256.plan 78 | python $< --mode=coco --precision=fp16 --batch-dim=16 --topk=256 --runtime=trt --output-path=$@ 79 | 80 | logs/ssd300.int8.b16.k256.trt.coco: ssd300_trt.py models/ssd300.int8.b16.k256.plan 81 | python $< --mode=coco --precision=int8 --batch-dim=16 --topk=256 --runtime=trt --output-path=$@ 82 | 83 | coco: logs/ssd300.fp32.b16.k256.pytorch.coco logs/ssd300.fp16.b16.k256.pytorch.coco logs/ssd300.fp32.b16.k256.trt.coco logs/ssd300.fp16.b16.k256.trt.coco logs/ssd300.int8.b16.k256.trt.coco 84 | 85 | ### Throughput benchmarking 86 | 87 | logs/ssd300.fp32.b16.k256.pytorch.bench: ssd300_trt.py 88 | python $< --mode=bench --precision=fp32 --batch-dim=16 --topk=256 --runtime=pytorch --output-path=$@ 89 | 90 | logs/ssd300.fp16.b16.k256.pytorch.bench: ssd300_trt.py 91 | python $< --mode=bench --precision=fp16 --batch-dim=16 --topk=256 --runtime=pytorch --output-path=$@ 92 | 93 | logs/ssd300.fp32.b16.k256.trt.bench: ssd300_trt.py models/ssd300.fp32.b16.k256.plan 94 | python $< --mode=bench --precision=fp32 --batch-dim=16 --topk=256 --runtime=trt --output-path=$@ 95 | 96 | logs/ssd300.fp16.b16.k256.trt.bench: ssd300_trt.py models/ssd300.fp16.b16.k256.plan 97 | python $< --mode=bench --precision=fp16 --batch-dim=16 --topk=256 --runtime=trt --output-path=$@ 98 | 99 | logs/ssd300.int8.b16.k256.trt.bench: ssd300_trt.py models/ssd300.int8.b16.k256.plan 100 | python $< --mode=bench --precision=int8 --batch-dim=16 --topk=256 --runtime=trt --output-path=$@ 101 | 102 | bench: logs/ssd300.fp32.b16.k256.pytorch.bench logs/ssd300.fp16.b16.k256.pytorch.bench logs/ssd300.fp32.b16.k256.trt.bench logs/ssd300.fp16.b16.k256.trt.bench logs/ssd300.int8.b16.k256.trt.bench 103 | 104 | ### Nsight systems report generation 105 | 106 | logs/ssd300.fp32.b16.k256.pytorch.qdrep: ssd300_trt.py 107 | nsys ${PROFILE_CMD} -o $@ python $< --mode=bench --precision=fp32 --batch-dim=16 --topk=256 --runtime=pytorch 108 | 109 | logs/ssd300.fp16.b16.k256.pytorch.qdrep: ssd300_trt.py 110 | nsys ${PROFILE_CMD} -o $@ python $< --mode=bench --precision=fp16 --batch-dim=16 --topk=256 --runtime=pytorch 111 | 112 | logs/ssd300.fp32.b16.k256.trt.qdrep: ssd300_trt.py models/ssd300.fp32.b16.k256.plan 113 | nsys ${PROFILE_CMD} -o $@ python $< --mode=bench --precision=fp32 --batch-dim=16 --topk=256 --runtime=trt 114 | 115 | logs/ssd300.fp16.b16.k256.trt.qdrep: ssd300_trt.py models/ssd300.fp16.b16.k256.plan 116 | nsys ${PROFILE_CMD} -o $@ python $< --mode=bench --precision=fp16 --batch-dim=16 --topk=256 --runtime=trt 117 | 118 | logs/ssd300.int8.b16.k256.trt.qdrep: ssd300_trt.py models/ssd300.int8.b16.k256.plan 119 | nsys ${PROFILE_CMD} -o $@ python $< --mode=bench --precision=int8 --batch-dim=16 --topk=256 --runtime=trt 120 | 121 | qdrep: logs/ssd300.fp32.b16.k256.pytorch.qdrep logs/ssd300.fp16.b16.k256.pytorch.qdrep logs/ssd300.fp32.b16.k256.trt.qdrep logs/ssd300.fp16.b16.k256.trt.qdrep logs/ssd300.int8.b16.k256.trt.qdrep 122 | 123 | ### Logs for article 124 | 125 | logs/subscript_assignment: subscript_assignment.py 126 | -python $< >$@ 2>&1 127 | 128 | logs/masking: masking.py 129 | -python $< >$@ 2>&1 130 | 131 | logs: logs/masking logs/subscript_assignment 132 | 133 | 134 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tensorrt-ssd300-8bit-quantized 
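
Quantized TensorRT inference for NVIDIA's SSD300 object detector: fp32/fp16 PyTorch and fp32/fp16/int8 TensorRT configurations, benchmarked for throughput and evaluated for accuracy on COCO 2017, with Nsight Systems profiling. Everything is driven through the Makefile: container build, TensorRT engine export, benchmarking and evaluation.

A typical workflow looks like the sketch below (all targets are defined in the Makefile above; the COCO volume path assumes the default in DOCKER_CMD):

```sh
# on the host: build the image and start a container with GPU access
make run-container

# inside the container: export the fp32/fp16/int8 TensorRT engines
make models

# throughput benchmarks and COCO accuracy evaluation
make bench
make coco
```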
2 | 
3 | ## Getting started
4 | 
5 | - To run COCO evaluation (`make coco`) you'll need the COCO 2017 dataset. If it isn't at /data/coco2017, update the Makefile's DOCKER_CMD volume mapping to point at your COCO location.
6 | 
7 | 
--------------------------------------------------------------------------------
/coco.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | __author__ = 'tylin'
16 | __version__ = '2.0'
17 | # Interface for accessing the Microsoft COCO dataset.
18 | 
19 | # Microsoft COCO is a large image dataset designed for object detection,
20 | # segmentation, and caption generation. pycocotools is a Python API that
21 | # assists in loading, parsing and visualizing the annotations in COCO.
22 | # Please visit http://mscoco.org/ for more information on COCO, including
23 | # for the data, paper, and tutorials. The exact format of the annotations
24 | # is also described on the COCO website. For example usage of the pycocotools
25 | # please see pycocotools_demo.ipynb. In addition to this API, please download both
26 | # the COCO images and annotations in order to run the demo.
27 | 
28 | # An alternative to using the API is to load the annotations directly
29 | # into a Python dictionary.
30 | # Using the API provides additional utility functions. Note that this API
31 | # supports both *instance* and *caption* annotations. In the case of
32 | # captions not all functions are defined (e.g. categories are undefined).
33 | 
34 | # The following API functions are defined:
35 | #  COCO       - COCO api class that loads a COCO annotation file and prepares data structures.
36 | #  decodeMask - Decode binary mask M encoded via run-length encoding.
37 | #  encodeMask - Encode binary mask M using run-length encoding.
38 | #  getAnnIds  - Get ann ids that satisfy given filter conditions.
39 | #  getCatIds  - Get cat ids that satisfy given filter conditions.
40 | #  getImgIds  - Get img ids that satisfy given filter conditions.
41 | #  loadAnns   - Load anns with the specified ids.
42 | #  loadCats   - Load cats with the specified ids.
43 | #  loadImgs   - Load imgs with the specified ids.
44 | #  annToMask  - Convert segmentation in an annotation to binary mask.
45 | #  showAnns   - Display the specified annotations.
46 | #  loadRes    - Load algorithm results and create API for accessing them.
47 | #  download   - Download COCO images from mscoco.org server.
48 | # Throughout the API "ann"=annotation, "cat"=category, and "img"=image.
49 | # Help on each function can be accessed by: "help COCO>function".
50 | 
51 | # See also COCO>decodeMask,
52 | # COCO>encodeMask, COCO>getAnnIds, COCO>getCatIds,
53 | # COCO>getImgIds, COCO>loadAnns, COCO>loadCats,
54 | # COCO>loadImgs, COCO>annToMask, COCO>showAnns
55 | 
56 | # Microsoft COCO Toolbox.      version 2.0
57 | # Data, paper, and tutorials available at:  http://mscoco.org/
58 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2014.
59 | # Licensed under the Simplified BSD License [see bsd.txt]
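
# Example usage of this API (a sketch; the annotation path is illustrative,
# matching what data.py passes in):
#
#   from coco import COCO
#   coco_gt = COCO('annotations/instances_val2017.json')  # load and index annotations
#   img_ids = coco_gt.getImgIds()                         # all image ids in the set
#   ann_ids = coco_gt.getAnnIds(imgIds=img_ids[:1])       # annotations for one image
#   anns = coco_gt.loadAnns(ann_ids)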
60 | 
61 | import json
62 | import time
63 | import matplotlib.pyplot as plt
64 | from matplotlib.collections import PatchCollection
65 | from matplotlib.patches import Polygon
66 | import numpy as np
67 | import copy
68 | import itertools
69 | from pycocotools import mask as maskUtils
70 | import os
71 | from collections import defaultdict
72 | import sys
73 | PYTHON_VERSION = sys.version_info[0]
74 | if PYTHON_VERSION == 2:
75 |     from urllib import urlretrieve
76 | elif PYTHON_VERSION == 3:
77 |     from urllib.request import urlretrieve
78 | 
79 | 
80 | def _isArrayLike(obj):
81 |     return hasattr(obj, '__iter__') and hasattr(obj, '__len__')
82 | 
83 | 
84 | class COCO:
85 |     def __init__(self, annotation_file=None):
86 |         """
87 |         Constructor of Microsoft COCO helper class for reading and visualizing annotations.
88 |         :param annotation_file (str): location of annotation file
89 |         :param image_folder (str): location to the folder that hosts images.
90 |         :return:
91 |         """
92 |         # load dataset
93 |         self.dataset, self.anns, self.cats, self.imgs = dict(), dict(), dict(), dict()
94 |         self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list)
95 |         if not annotation_file == None:
96 |             print('loading annotations into memory...')
97 |             tic = time.time()
98 |             dataset = json.load(open(annotation_file, 'r'))
99 |             assert type(dataset) == dict, 'annotation file format {} not supported'.format(type(dataset))
100 |             print('Done (t={:0.2f}s)'.format(time.time() - tic))
101 |             self.dataset = dataset
102 |             self.createIndex()
103 | 
104 |     def createIndex(self):
105 |         # create index
106 |         print('creating index...')
107 |         anns, cats, imgs = {}, {}, {}
108 |         imgToAnns, catToImgs = defaultdict(list), defaultdict(list)
109 |         if 'annotations' in self.dataset:
110 |             for ann in self.dataset['annotations']:
111 |                 imgToAnns[ann['image_id']].append(ann)
112 |                 anns[ann['id']] = ann
113 | 
114 |         if 'images' in self.dataset:
115 |             for img in self.dataset['images']:
116 |                 imgs[img['id']] = img
117 | 
118 |         if 'categories' in self.dataset:
119 |             for cat in self.dataset['categories']:
120 |                 cats[cat['id']] = cat
121 | 
122 |         if 'annotations' in self.dataset and 'categories' in self.dataset:
123 |             for ann in self.dataset['annotations']:
124 |                 catToImgs[ann['category_id']].append(ann['image_id'])
125 | 
126 |         print('index created!')
127 | 
128 |         # create class members
129 |         self.anns = anns
130 |         self.imgToAnns = imgToAnns
131 |         self.catToImgs = catToImgs
132 |         self.imgs = imgs
133 |         self.cats = cats
134 | 
135 |     def info(self):
136 |         """
137 |         Print information about the annotation file.
138 |         :return:
139 |         """
140 |         for key, value in self.dataset['info'].items():
141 |             print('{}: {}'.format(key, value))
142 | 
143 |     def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None):
144 |         """
145 |         Get ann ids that satisfy given filter conditions. default skips that filter
146 |         :param imgIds  (int array)   : get anns for given imgs
147 |                catIds  (int array)   : get anns for given cats
148 |                areaRng (float array) : get anns for given area range (e.g.
[0 inf]) 149 | iscrowd (boolean) : get anns for given crowd label (False or True) 150 | :return: ids (int array) : integer array of ann ids 151 | """ 152 | imgIds = imgIds if _isArrayLike(imgIds) else [imgIds] 153 | catIds = catIds if _isArrayLike(catIds) else [catIds] 154 | 155 | if len(imgIds) == len(catIds) == len(areaRng) == 0: 156 | anns = self.dataset['annotations'] 157 | else: 158 | if not len(imgIds) == 0: 159 | lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns] 160 | anns = list(itertools.chain.from_iterable(lists)) 161 | else: 162 | anns = self.dataset['annotations'] 163 | anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds] 164 | anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]] 165 | if not iscrowd == None: 166 | ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd] 167 | else: 168 | ids = [ann['id'] for ann in anns] 169 | return ids 170 | 171 | def getCatIds(self, catNms=[], supNms=[], catIds=[]): 172 | """ 173 | filtering parameters. default skips that filter. 174 | :param catNms (str array) : get cats for given cat names 175 | :param supNms (str array) : get cats for given supercategory names 176 | :param catIds (int array) : get cats for given cat ids 177 | :return: ids (int array) : integer array of cat ids 178 | """ 179 | catNms = catNms if _isArrayLike(catNms) else [catNms] 180 | supNms = supNms if _isArrayLike(supNms) else [supNms] 181 | catIds = catIds if _isArrayLike(catIds) else [catIds] 182 | 183 | if len(catNms) == len(supNms) == len(catIds) == 0: 184 | cats = self.dataset['categories'] 185 | else: 186 | cats = self.dataset['categories'] 187 | cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms] 188 | cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms] 189 | cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds] 190 | ids = [cat['id'] for cat in cats] 191 | return ids 192 | 193 | def getImgIds(self, imgIds=[], catIds=[]): 194 | ''' 195 | Get img ids that satisfy given filter conditions. 196 | :param imgIds (int array) : get imgs for given ids 197 | :param catIds (int array) : get imgs with all given cats 198 | :return: ids (int array) : integer array of img ids 199 | ''' 200 | imgIds = imgIds if _isArrayLike(imgIds) else [imgIds] 201 | catIds = catIds if _isArrayLike(catIds) else [catIds] 202 | 203 | if len(imgIds) == len(catIds) == 0: 204 | ids = self.imgs.keys() 205 | else: 206 | ids = set(imgIds) 207 | for i, catId in enumerate(catIds): 208 | if i == 0 and len(ids) == 0: 209 | ids = set(self.catToImgs[catId]) 210 | else: 211 | ids &= set(self.catToImgs[catId]) 212 | return list(ids) 213 | 214 | def loadAnns(self, ids=[]): 215 | """ 216 | Load anns with the specified ids. 217 | :param ids (int array) : integer ids specifying anns 218 | :return: anns (object array) : loaded ann objects 219 | """ 220 | if _isArrayLike(ids): 221 | return [self.anns[id] for id in ids] 222 | elif type(ids) == int: 223 | return [self.anns[ids]] 224 | 225 | def loadCats(self, ids=[]): 226 | """ 227 | Load cats with the specified ids. 
228 | :param ids (int array) : integer ids specifying cats 229 | :return: cats (object array) : loaded cat objects 230 | """ 231 | if _isArrayLike(ids): 232 | return [self.cats[id] for id in ids] 233 | elif type(ids) == int: 234 | return [self.cats[ids]] 235 | 236 | def loadImgs(self, ids=[]): 237 | """ 238 | Load anns with the specified ids. 239 | :param ids (int array) : integer ids specifying img 240 | :return: imgs (object array) : loaded img objects 241 | """ 242 | if _isArrayLike(ids): 243 | return [self.imgs[id] for id in ids] 244 | elif type(ids) == int: 245 | return [self.imgs[ids]] 246 | 247 | def showAnns(self, anns): 248 | """ 249 | Display the specified annotations. 250 | :param anns (array of object): annotations to display 251 | :return: None 252 | """ 253 | if len(anns) == 0: 254 | return 0 255 | if 'segmentation' in anns[0] or 'keypoints' in anns[0]: 256 | datasetType = 'instances' 257 | elif 'caption' in anns[0]: 258 | datasetType = 'captions' 259 | else: 260 | raise Exception('datasetType not supported') 261 | if datasetType == 'instances': 262 | ax = plt.gca() 263 | ax.set_autoscale_on(False) 264 | polygons = [] 265 | color = [] 266 | for ann in anns: 267 | c = (np.random.random((1, 3))*0.6+0.4).tolist()[0] 268 | if 'segmentation' in ann: 269 | if type(ann['segmentation']) == list: 270 | # polygon 271 | for seg in ann['segmentation']: 272 | poly = np.array(seg).reshape((int(len(seg)/2), 2)) 273 | polygons.append(Polygon(poly)) 274 | color.append(c) 275 | else: 276 | # mask 277 | t = self.imgs[ann['image_id']] 278 | if type(ann['segmentation']['counts']) == list: 279 | rle = maskUtils.frPyObjects([ann['segmentation']], t['height'], t['width']) 280 | else: 281 | rle = [ann['segmentation']] 282 | m = maskUtils.decode(rle) 283 | img = np.ones( (m.shape[0], m.shape[1], 3) ) 284 | if ann['iscrowd'] == 1: 285 | color_mask = np.array([2.0,166.0,101.0])/255 286 | if ann['iscrowd'] == 0: 287 | color_mask = np.random.random((1, 3)).tolist()[0] 288 | for i in range(3): 289 | img[:,:,i] = color_mask[i] 290 | ax.imshow(np.dstack( (img, m*0.5) )) 291 | if 'keypoints' in ann and type(ann['keypoints']) == list: 292 | # turn skeleton into zero-based index 293 | sks = np.array(self.loadCats(ann['category_id'])[0]['skeleton'])-1 294 | kp = np.array(ann['keypoints']) 295 | x = kp[0::3] 296 | y = kp[1::3] 297 | v = kp[2::3] 298 | for sk in sks: 299 | if np.all(v[sk]>0): 300 | plt.plot(x[sk],y[sk], linewidth=3, color=c) 301 | plt.plot(x[v>0], y[v>0],'o',markersize=8, markerfacecolor=c, markeredgecolor='k',markeredgewidth=2) 302 | plt.plot(x[v>1], y[v>1],'o',markersize=8, markerfacecolor=c, markeredgecolor=c, markeredgewidth=2) 303 | p = PatchCollection(polygons, facecolor=color, linewidths=0, alpha=0.4) 304 | ax.add_collection(p) 305 | p = PatchCollection(polygons, facecolor='none', edgecolors=color, linewidths=2) 306 | ax.add_collection(p) 307 | elif datasetType == 'captions': 308 | for ann in anns: 309 | print(ann['caption']) 310 | 311 | def loadRes(self, resFile): 312 | """ 313 | Load result file and return a result api object. 
314 | :param resFile (str) : file name of result file 315 | :return: res (obj) : result api object 316 | """ 317 | res = COCO() 318 | res.dataset['images'] = [img for img in self.dataset['images']] 319 | 320 | print('Loading and preparing results...') 321 | tic = time.time() 322 | if type(resFile) == str: #or type(resFile) == unicode: 323 | anns = json.load(open(resFile)) 324 | elif type(resFile) == np.ndarray: 325 | anns = self.loadNumpyAnnotations(resFile) 326 | else: 327 | anns = resFile 328 | assert type(anns) == list, 'results in not an array of objects' 329 | annsImgIds = [ann['image_id'] for ann in anns] 330 | assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \ 331 | 'Results do not correspond to current coco set' 332 | if 'caption' in anns[0]: 333 | imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns]) 334 | res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds] 335 | for id, ann in enumerate(anns): 336 | ann['id'] = id+1 337 | elif 'bbox' in anns[0] and not anns[0]['bbox'] == []: 338 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 339 | for id, ann in enumerate(anns): 340 | bb = ann['bbox'] 341 | x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]] 342 | if not 'segmentation' in ann: 343 | ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]] 344 | ann['area'] = bb[2]*bb[3] 345 | ann['id'] = id+1 346 | ann['iscrowd'] = 0 347 | elif 'segmentation' in anns[0]: 348 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 349 | for id, ann in enumerate(anns): 350 | # now only support compressed RLE format as segmentation results 351 | ann['area'] = maskUtils.area(ann['segmentation']) 352 | if not 'bbox' in ann: 353 | ann['bbox'] = maskUtils.toBbox(ann['segmentation']) 354 | ann['id'] = id+1 355 | ann['iscrowd'] = 0 356 | elif 'keypoints' in anns[0]: 357 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 358 | for id, ann in enumerate(anns): 359 | s = ann['keypoints'] 360 | x = s[0::3] 361 | y = s[1::3] 362 | x0,x1,y0,y1 = np.min(x), np.max(x), np.min(y), np.max(y) 363 | ann['area'] = (x1-x0)*(y1-y0) 364 | ann['id'] = id + 1 365 | ann['bbox'] = [x0,y0,x1-x0,y1-y0] 366 | print('DONE (t={:0.2f}s)'.format(time.time()- tic)) 367 | 368 | res.dataset['annotations'] = anns 369 | res.createIndex() 370 | return res 371 | 372 | def download(self, tarDir = None, imgIds = [] ): 373 | ''' 374 | Download COCO images from mscoco.org server. 
375 | :param tarDir (str): COCO results directory name 376 | imgIds (list): images to be downloaded 377 | :return: 378 | ''' 379 | if tarDir is None: 380 | print('Please specify target directory') 381 | return -1 382 | if len(imgIds) == 0: 383 | imgs = self.imgs.values() 384 | else: 385 | imgs = self.loadImgs(imgIds) 386 | N = len(imgs) 387 | if not os.path.exists(tarDir): 388 | os.makedirs(tarDir) 389 | for i, img in enumerate(imgs): 390 | tic = time.time() 391 | fname = os.path.join(tarDir, img['file_name']) 392 | if not os.path.exists(fname): 393 | urlretrieve(img['coco_url'], fname) 394 | print('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic)) 395 | 396 | def loadNumpyAnnotations(self, data): 397 | """ 398 | Convert result data from a numpy array [Nx7] where each row contains {imageID,x1,y1,w,h,score,class} 399 | :param data (numpy.ndarray) 400 | :return: annotations (python nested list) 401 | """ 402 | print('Converting ndarray to lists...') 403 | assert(type(data) == np.ndarray) 404 | print(data.shape) 405 | assert(data.shape[1] == 7) 406 | N = data.shape[0] 407 | ann = [] 408 | for i in range(N): 409 | if i % 1000000 == 0: 410 | print('{}/{}'.format(i,N)) 411 | ann += [{ 412 | 'image_id' : int(data[i, 0]), 413 | 'bbox' : [ data[i, 1], data[i, 2], data[i, 3], data[i, 4] ], 414 | 'score' : data[i, 5], 415 | 'category_id': int(data[i, 6]), 416 | }] 417 | return ann 418 | 419 | def annToRLE(self, ann): 420 | """ 421 | Convert annotation which can be polygons, uncompressed RLE to RLE. 422 | :return: binary mask (numpy 2D array) 423 | """ 424 | t = self.imgs[ann['image_id']] 425 | h, w = t['height'], t['width'] 426 | segm = ann['segmentation'] 427 | if type(segm) == list: 428 | # polygon -- a single object might consist of multiple parts 429 | # we merge all parts into one mask rle code 430 | rles = maskUtils.frPyObjects(segm, h, w) 431 | rle = maskUtils.merge(rles) 432 | elif type(segm['counts']) == list: 433 | # uncompressed RLE 434 | rle = maskUtils.frPyObjects(segm, h, w) 435 | else: 436 | # rle 437 | rle = ann['segmentation'] 438 | return rle 439 | 440 | def annToMask(self, ann): 441 | """ 442 | Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask. 
443 |         :return: binary mask (numpy 2D array)
444 |         """
445 |         rle = self.annToRLE(ann)
446 |         m = maskUtils.decode(rle)
447 |         return m
448 | 
--------------------------------------------------------------------------------
/data.py:
--------------------------------------------------------------------------------
1 | 'adapted from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/Detection/SSD/src/utils.py'
2 | 
3 | import os
4 | import math
5 | import json, pickle, bz2
6 | import itertools
7 | from coco import COCO
8 | 
9 | from PIL import Image
10 | import torch
11 | from torch.utils.data import DataLoader, Dataset
12 | import torchvision.transforms as transforms
13 | 
14 | 
15 | def init_dboxes(model_dtype=torch.float32):
16 |     fig_size = 300
17 |     feat_size = [38, 19, 10, 5, 3, 1]
18 |     steps = [8, 16, 32, 64, 100, 300]
19 |     scales = [21, 45, 99, 153, 207, 261, 315]
20 |     aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
21 | 
22 |     fk = fig_size / torch.tensor(steps).float()
23 | 
24 |     dboxes = []
25 |     # size of feature and number of feature
26 |     for idx, sfeat in enumerate(feat_size):
27 |         sk1 = scales[idx] / fig_size
28 |         sk2 = scales[idx + 1] / fig_size
29 |         sk3 = math.sqrt(sk1 * sk2)
30 |         all_sizes = [(sk1, sk1), (sk3, sk3)]
31 | 
32 |         for alpha in aspect_ratios[idx]:
33 |             w, h = sk1 * math.sqrt(alpha), sk1 / math.sqrt(alpha)
34 |             all_sizes.append((w, h))
35 |             all_sizes.append((h, w))
36 | 
37 |         for w, h in all_sizes:
38 |             for i, j in itertools.product(range(sfeat), repeat=2):
39 |                 cx, cy = (j + 0.5) / fk[idx], (i + 0.5) / fk[idx]
40 |                 dboxes.append((cx, cy, w, h))
41 | 
42 |     return torch.tensor(
43 |         dboxes,
44 |         dtype=model_dtype,
45 |         device='cuda'
46 |     ).clamp(0, 1)
47 | 
48 | 
49 | class COCODetection(Dataset):
50 |     def __init__(self, img_folder, annotate_file, transform):
51 |         self.img_folder = img_folder
52 |         self.annotate_file = annotate_file
53 | 
54 |         # Start processing annotation
55 |         with open(annotate_file) as fin:
56 |             self.data = json.load(fin)
57 | 
58 |         self.images = {}
59 | 
60 |         self.label_map = {}
61 |         self.label_info = {}
62 |         # 0 stands for the background
63 |         cnt = 0
64 |         self.label_info[cnt] = "background"
65 |         for cat in self.data["categories"]:
66 |             cnt += 1
67 |             self.label_map[cat["id"]] = cnt
68 |             self.label_info[cnt] = cat["name"]
69 | 
70 |         # build an index of images
71 |         for img in self.data["images"]:
72 |             img_id = img["id"]
73 |             img_name = img["file_name"]
74 |             img_size = (img["height"], img["width"])
75 |             if img_id in self.images: raise Exception("duplicated image record")
76 |             self.images[img_id] = (img_name, img_size, [])
77 | 
78 |         # read bboxes
79 |         for bboxes in self.data["annotations"]:
80 |             img_id = bboxes["image_id"]
81 |             category_id = bboxes["category_id"]
82 |             bbox = bboxes["bbox"]
83 |             bbox_label = self.label_map[bboxes["category_id"]]
84 |             self.images[img_id][2].append((bbox, bbox_label))
85 | 
86 |         for k, v in list(self.images.items()):
87 |             if len(v[2]) == 0:
88 |                 self.images.pop(k)
89 | 
90 |         self.img_keys = list(self.images.keys())
91 |         self.transform = transform
92 | 
93 |     @property
94 |     def labelnum(self):
95 |         return len(self.label_info)
96 | 
97 |     @staticmethod
98 |     def load(pklfile):
99 |         with bz2.open(pklfile, "rb") as fin:
100 |             ret = pickle.load(fin)
101 |         return ret
102 | 
103 |     def save(self, pklfile):
104 |         with bz2.open(pklfile, "wb") as fout:
105 |             pickle.dump(self, fout)
106 | 
107 | 
108 |     def __len__(self):
109 |         return len(self.images)
110 | 
111 |     def __getitem__(self, idx):
112 |         img_id = self.img_keys[idx]
113 |         img_data = 
self.images[img_id] 114 | fn = img_data[0] 115 | img_path = os.path.join(self.img_folder, fn) 116 | img = Image.open(img_path).convert("RGB") 117 | 118 | htot, wtot = img_data[1] 119 | bbox_sizes = [] 120 | bbox_labels = [] 121 | 122 | #for (xc, yc, w, h), bbox_label in img_data[2]: 123 | for (l,t,w,h), bbox_label in img_data[2]: 124 | r = l + w 125 | b = t + h 126 | #l, t, r, b = xc - 0.5*w, yc - 0.5*h, xc + 0.5*w, yc + 0.5*h 127 | bbox_size = (l/wtot, t/htot, r/wtot, b/htot) 128 | bbox_sizes.append(bbox_size) 129 | bbox_labels.append(bbox_label) 130 | 131 | bbox_sizes = torch.tensor(bbox_sizes) 132 | bbox_labels = torch.tensor(bbox_labels) 133 | 134 | max_num = 200 135 | bbox_out = torch.zeros(max_num, 4) 136 | label_out = torch.zeros(max_num, dtype=torch.long) 137 | bbox_out[:bbox_sizes.size(0), :] = bbox_sizes 138 | label_out[:bbox_labels.size(0)] = bbox_labels 139 | 140 | img = self.transform(img) 141 | return img, img_id, (htot, wtot), bbox_out, label_out 142 | 143 | 144 | 145 | def get_val_dataloader(args): 146 | transformer = transforms.Compose([ 147 | transforms.Resize((300, 300)), 148 | transforms.ToTensor(), 149 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 150 | ]) 151 | 152 | dataset = COCODetection( 153 | os.path.join(args.data, "val2017"), 154 | os.path.join(args.data, "annotations/instances_val2017.json"), 155 | transformer 156 | ) 157 | 158 | return DataLoader( 159 | dataset, 160 | batch_size=args.batch_dim, 161 | shuffle=False, # Note: distributed sampler is shuffled :( 162 | sampler=None, 163 | num_workers=args.num_workers 164 | ) 165 | 166 | 167 | def get_coco_ground_truth(args): 168 | return COCO(annotation_file=os.path.join(args.data, "annotations/instances_val2017.json")) 169 | 170 | -------------------------------------------------------------------------------- /gpuplot.py: -------------------------------------------------------------------------------- 1 | import sys, io 2 | import time 3 | import numpy as np 4 | import threading 5 | 6 | from PIL import Image 7 | from pynvml import * # all functions have nvml prefix 8 | nvmlInit() 9 | 10 | 11 | dot = '•' 12 | space = ' ' 13 | horizontal_line = '\u2500' 14 | vertical_line = '\u2502' 15 | fill = '\u2588' 16 | tau = '\u03A4' 17 | background_color = np.array([0x28, 0x2d, 0x35]) # svg-term background 18 | foreground_color = np.array([128, 128, 128]) 19 | cold = np.array([0x3f, 0x5e, 0xfb]) 20 | hot = np.array([0xfc, 0x46, 0x6b]) 21 | not_nv = np.array([0x83, 0x60, 0xc3]) 22 | nv = np.array([0x2e, 0xbf, 0x91]) 23 | sky = np.array([0x1c, 0x92, 0xd2]) 24 | foreground_color = np.array([0xf2, 0xfc, 0xfe]) 25 | summer_low = np.array([0x22, 0xc1, 0xc3]) 26 | summer_hi = np.array([0xfd, 0xbb, 0x2d]) 27 | 28 | 29 | def write(s, stdout=sys.stdout): 30 | stdout.write(s) 31 | 32 | 33 | def reset(): 34 | write('\033c') 35 | 36 | 37 | def wrap_color(rgb, s): 38 | r, g, b = map(int, rgb) 39 | return f'\033[38;2;{r};{g};{b}m{s}\033[0m' 40 | 41 | 42 | def pixel_seq_to_ascii(pixel_rgb, char, count): 43 | if char == space or (pixel_rgb == background_color).all(): 44 | return space * count 45 | return wrap_color(pixel_rgb, char * count) 46 | 47 | 48 | def pixels_to_ascii(pixels, chars): 49 | rendered_rows = [] 50 | for row in range(pixels.shape[0]): 51 | row_elements = [] 52 | current_pixel, current_char = None, None 53 | pixel_count = 0 54 | 55 | for column in range(pixels.shape[1]): 56 | new_pixel, new_char = pixels[row, column], chars[row, column] 57 | if current_pixel is not None and new_char == 
current_char and (new_pixel == current_pixel).all():
58 |                 pixel_count += 1
59 |             else:
60 |                 if current_pixel is not None:
61 |                     row_elements.append(pixel_seq_to_ascii(current_pixel, current_char, pixel_count))
62 |                 current_pixel = new_pixel
63 |                 current_char = new_char
64 |                 pixel_count = 1
65 | 
66 |         if current_pixel is not None:
67 |             row_elements.append(pixel_seq_to_ascii(current_pixel, current_char, pixel_count))
68 |         rendered_rows.append(''.join(row_elements))
69 | 
70 |     return '\n'.join(rendered_rows)
71 | 
72 | 
73 | def data_to_dot_matrix(xs, ys, size_wh, y_lim=(None, None)):
74 |     dw = len(xs)
75 |     y_min, y_max = y_lim
76 |     if y_min is None:
77 |         y_min = min(ys)
78 |     if y_max is None:
79 |         y_max = max(ys)
80 |     dh = y_max - y_min
81 | 
82 |     w, h = size_wh
83 |     dot_matrix = np.zeros((h, w), dtype=np.float32)
84 |     values = np.zeros((w,), dtype=np.float32)
85 | 
86 |     for c in range(w):
87 |         dc = dw * c / w
88 |         dc, dc_fraction = int(dc // 1), dc % 1
89 |         dy = ys[dc]
90 |         if dc + 1 < dw:
91 |             dy += dc_fraction * (ys[dc+1] - ys[dc])
92 |         dy_proportion = (dy - y_min) / dh
93 |         r = max(0, min(1, dy_proportion)) * h
94 |         r, r_fraction = int(r // 1), r % 1
95 | 
96 |         dot_matrix[h - r:, c] = 1.0
97 |         if h - r - 1 >= 0:
98 |             dot_matrix[h - r - 1, c] = r_fraction
99 |         values[c] = dy_proportion
100 | 
101 |     return dot_matrix, values
102 | 
103 | 
104 | def dot_matrix_to_pixels(dot_matrix, values, low=foreground_color, high=foreground_color):
105 |     pixels = np.ones((*dot_matrix.shape, 3)) * background_color
106 |     chars = np.full(dot_matrix.shape, space, dtype='<U1')
107 | 
108 |     chars[dot_matrix > 0] = dot
109 | 
110 |     colors = np.expand_dims(values, -1) * (high - low) + low
111 | 
112 |     dot_matrix = np.expand_dims(dot_matrix, -1)
113 |     pixels = ((1 - dot_matrix) * background_color) + (dot_matrix * colors)
114 | 
115 |     return pixels, chars
116 | 
117 | 
118 | def render_axes(pixels, chars, title, x_ticks, y_ticks):
119 |     ph, pw, pd = pixels.shape
120 |     new_pixels = np.zeros((ph + 3, pw + 8, pd), dtype=pixels.dtype)
121 |     new_pixels[1:ph + 1, :pw, :] = pixels
122 |     new_chars = np.full((ph + 3, pw + 8), space, dtype=' 0:
209 |                     break
210 | 
211 |                 temp.append(nvmlDeviceGetTemperature(gpu, 0))
212 | 
213 |                 mem = nvmlDeviceGetMemoryInfo(gpu)
214 |                 mem_used.append(mem.used / 2 ** 30)
215 | 
216 |                 util = nvmlDeviceGetUtilizationRates(gpu)
217 |                 gpu_util.append(util.gpu)
218 | 
219 |                 temp = temp[-max_data_points:]
220 |                 gpu_util = gpu_util[-max_data_points:]
221 |                 mem_used = mem_used[-max_data_points:]
222 |                 while len(fps) > max_data_points:
223 |                     fps.pop(0)
224 |                 x = [t * sample_period for t in range(-len(temp), 0)]
225 | 
226 |                 reset()
227 | 
228 |                 # log_pixels, log_chars = log_to_pixels(captured_stdout, chart_wh)
229 | 
230 |                 fps_dm, fps_values = data_to_dot_matrix(x, fps, chart_wh, y_lim=(fps_min, fps_max))
231 |                 fps_pixels, fps_chars = dot_matrix_to_pixels(fps_dm, fps_values, low=sky, high=foreground_color)
232 |                 fps_pixels, fps_chars = render_axes(
233 |                     fps_pixels, fps_chars,
234 |                     'THROUGHPUT (FPS)',
235 |                     [(0, f'{tau}{int(x[0])}s')],
236 |                     [(0, f'{fps_max}'), (chart_wh[1], f'{fps_min}'), (int(chart_wh[1] * (fps_max - fps[-1]) / (fps_max - fps_min)), f'{fps[-1]:.0f}')]
237 |                 )
238 | 
239 |                 temp_dm, temp_values = data_to_dot_matrix(x, temp, chart_wh, y_lim=(temp_min, temp_max))
240 |                 temp_pixels, temp_chars = dot_matrix_to_pixels(temp_dm, temp_values, low=cold, high=hot)
241 |                 temp_pixels, temp_chars = render_axes(
242 |                     temp_pixels, temp_chars,
243 |                     'GPU TEMP (C)',
244 |                     [(0, f'{tau}{int(x[0])}s')],
245 |                     [(0, f'{temp_max}C'), (chart_wh[1], f'{temp_min}C'),
(int(chart_wh[1] * (temp_max - temp[-1]) / (temp_max - temp_min)), f'{temp[-1]:.0f}C')] 246 | ) 247 | 248 | gpu_dm, gpu_values = data_to_dot_matrix(x, gpu_util, chart_wh, y_lim=(gpu_min, gpu_max)) 249 | gpu_pixels, gpu_chars = dot_matrix_to_pixels(gpu_dm, gpu_values, low=not_nv, high=nv) 250 | gpu_pixels, gpu_chars = render_axes( 251 | gpu_pixels, gpu_chars, 252 | 'GPU UTILIZATION (%)', 253 | [(0, f'{tau}{int(x[0])}s')], 254 | [(0, f'{gpu_max}%'), (chart_wh[1], f'{gpu_min:3.0f}%'), (int(chart_wh[1] * (gpu_max - gpu_util[-1]) / (gpu_max - gpu_min)), f'{gpu_util[-1]:3.0f}%')] 255 | ) 256 | 257 | mem_dm, mem_values = data_to_dot_matrix(x, mem_used, chart_wh, y_lim=(mem_min, mem_max)) 258 | mem_pixels, mem_chars = dot_matrix_to_pixels(mem_dm, mem_values, low=summer_low, high=summer_hi) 259 | mem_pixels, mem_chars = render_axes( 260 | mem_pixels, mem_chars, 261 | 'GPU MEM (GB)', 262 | [(0, f'{tau}{int(x[0])}s')], 263 | [(0, f'{mem_max:3.1f}'), (chart_wh[1], f'{mem_min:3.1f}'), (int(chart_wh[1] * (mem_max - mem_used[-1]) / (mem_max - mem_min)), f'{mem_used[-1]:3.1f}')] 264 | ) 265 | 266 | l_pixels = np.concatenate(( 267 | fps_pixels, 268 | temp_pixels, 269 | ), axis=0) 270 | 271 | l_chars = np.concatenate(( 272 | fps_chars, 273 | temp_chars, 274 | ), axis=0) 275 | 276 | r_pixels = np.concatenate(( 277 | gpu_pixels, 278 | mem_pixels, 279 | ), axis=0) 280 | 281 | r_chars = np.concatenate(( 282 | gpu_chars, 283 | mem_chars, 284 | ), axis=0) 285 | 286 | pixels = np.concatenate((l_pixels, r_pixels), axis=1) 287 | chars = np.concatenate((l_chars, r_chars), axis=1) 288 | 289 | time.sleep(0.001) 290 | write(pixels_to_ascii(pixels, chars) + '\n', stdout=real_stdout) 291 | 292 | time.sleep(max(0, sample_period - (time.time() - sample_time))) 293 | finally: 294 | sys.stdout = real_stdout 295 | sys.stderr = real_stderr 296 | 297 | bg_thread = threading.Thread(target=bg_thread_fn, args=(chart_wh, max_data_points, fps, fps_lock), daemon=True) 298 | 299 | def gen_update_fps(fps_lock, fps): 300 | def update_fps(latest_fps): 301 | with fps_lock: 302 | fps[-1] = latest_fps 303 | return update_fps 304 | 305 | return gen_update_fps(fps_lock, fps), bg_thread 306 | 307 | -------------------------------------------------------------------------------- /logs/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pbridger/tensorrt-ssd300-8bit-quantized/c92fffcf6c0a618970981ce5e88c06aab0c7934e/logs/.keep -------------------------------------------------------------------------------- /logs/masking: -------------------------------------------------------------------------------- 1 | [TensorRT] ERROR: INVALID_ARGUMENT: getPluginCreator could not find plugin NonZero version 1 2 | /opt/conda/lib/python3.6/site-packages/torch/onnx/symbolic_opset9.py:2329: UserWarning: Exporting aten::index operator with indices of type Byte. Only 1-D indices are supported. In any other case, this will produce an incorrect ONNX graph. 3 | warnings.warn("Exporting aten::index operator with indices of type Byte. " 4 | /opt/conda/lib/python3.6/site-packages/torch/onnx/symbolic_opset9.py:591: UserWarning: This model contains a squeeze operation on dimension 1 on an input with unknown shape. Note that if the size of dimension 1 of the input is not 1, the ONNX model will return an error. Opset version 11 supports squeezing on non-singleton dimensions, it is recommended to export this model using opset version 11 or higher. 
5 | "version 11 or higher.") 6 | exporting Masking to models/masking.onnx 7 | compiling models/masking.onnx with TensorRT 8 | -------------------------------------------------------------------------------- /logs/ssd300.fp16.b16.k256.pytorch.bench: -------------------------------------------------------------------------------- 1 | 500 batches, 8000 images, 18.97 seconds total 2 | 2.4 ms per image 3 | 421.8 FPS 4 | -------------------------------------------------------------------------------- /logs/ssd300.fp16.b16.k256.pytorch.coco: -------------------------------------------------------------------------------- 1 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.25044 2 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.42427 3 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.25505 4 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.07417 5 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.26863 6 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.39955 7 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.23678 8 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.34402 9 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.36105 10 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.11822 11 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.39443 12 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.54920 13 | -------------------------------------------------------------------------------- /logs/ssd300.fp16.b16.k256.pytorch.qdrep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pbridger/tensorrt-ssd300-8bit-quantized/c92fffcf6c0a618970981ce5e88c06aab0c7934e/logs/ssd300.fp16.b16.k256.pytorch.qdrep -------------------------------------------------------------------------------- /logs/ssd300.fp16.b16.k256.trt.bench: -------------------------------------------------------------------------------- 1 | 500 batches, 8000 images, 8.59 seconds total 2 | 1.1 ms per image 3 | 931.1 FPS 4 | -------------------------------------------------------------------------------- /logs/ssd300.fp16.b16.k256.trt.coco: -------------------------------------------------------------------------------- 1 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.25022 2 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.42477 3 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.25491 4 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.07376 5 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.26836 6 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.39985 7 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.23762 8 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.34404 9 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.36037 10 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.11687 11 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.39541 12 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.54943 13 | -------------------------------------------------------------------------------- /logs/ssd300.fp16.b16.k256.trt.qdrep: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/pbridger/tensorrt-ssd300-8bit-quantized/c92fffcf6c0a618970981ce5e88c06aab0c7934e/logs/ssd300.fp16.b16.k256.trt.qdrep -------------------------------------------------------------------------------- /logs/ssd300.fp32.b16.k256.pytorch.bench: -------------------------------------------------------------------------------- 1 | 500 batches, 8000 images, 31.43 seconds total 2 | 3.9 ms per image 3 | 254.5 FPS 4 | -------------------------------------------------------------------------------- /logs/ssd300.fp32.b16.k256.pytorch.coco: -------------------------------------------------------------------------------- 1 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.25041 2 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.42413 3 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.25521 4 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.07433 5 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.26849 6 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.40030 7 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.23688 8 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.34397 9 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.36104 10 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.11849 11 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.39450 12 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.54971 13 | -------------------------------------------------------------------------------- /logs/ssd300.fp32.b16.k256.pytorch.qdrep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pbridger/tensorrt-ssd300-8bit-quantized/c92fffcf6c0a618970981ce5e88c06aab0c7934e/logs/ssd300.fp32.b16.k256.pytorch.qdrep -------------------------------------------------------------------------------- /logs/ssd300.fp32.b16.k256.trt.coco: -------------------------------------------------------------------------------- 1 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.25040 2 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.42430 3 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.25516 4 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.07426 5 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.26855 6 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.40043 7 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.23687 8 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.34343 9 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.35993 10 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.11745 11 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.39354 12 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.54890 13 | -------------------------------------------------------------------------------- /logs/ssd300.fp32.b16.k256.trt.qdrep: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pbridger/tensorrt-ssd300-8bit-quantized/c92fffcf6c0a618970981ce5e88c06aab0c7934e/logs/ssd300.fp32.b16.k256.trt.qdrep -------------------------------------------------------------------------------- /logs/ssd300.int8.b16.k256.trt.coco: -------------------------------------------------------------------------------- 1 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.24766 2 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.42047 3 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.25266 4 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.07075 5 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.26687 6 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.40063 7 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.23473 8 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.33937 9 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.35550 10 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.11317 11 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.38830 12 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.54646 13 | -------------------------------------------------------------------------------- /logs/ssd300.int8.b16.k256.trt.qdrep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pbridger/tensorrt-ssd300-8bit-quantized/c92fffcf6c0a618970981ce5e88c06aab0c7934e/logs/ssd300.int8.b16.k256.trt.qdrep -------------------------------------------------------------------------------- /logs/subscript_assignment: -------------------------------------------------------------------------------- 1 | [TensorRT] WARNING: /build/TensorRT/parsers/onnx/onnx2trt_utils.cpp:220: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32. 
2 | [TensorRT] WARNING: /build/TensorRT/parsers/onnx/onnx2trt_utils.cpp:246: One or more weights outside the range of INT32 was clamped
3 | [TensorRT] ERROR: INVALID_ARGUMENT: getPluginCreator could not find plugin ScatterND version 1
4 | exporting SubscriptAssign to models/subscript_assign.onnx
5 | compiling models/subscript_assign.onnx with TensorRT
6 | 
--------------------------------------------------------------------------------
/masking.py:
--------------------------------------------------------------------------------
1 | import sys, io
2 | import torch
3 | import tensorrt as trt
4 | 
5 | class Masking(torch.nn.Module):
6 |     def __init__(self):
7 |         super().__init__()
8 | 
9 |     def forward(self, X):
10 |         X = X[X.sum(dim=-1) > 0]
11 |         return X
12 | 
13 | 
14 | if __name__ == '__main__':
15 |     m = Masking()
16 |     onnx_filename = 'models/masking.onnx'
17 | 
18 |     print('exporting Masking to', onnx_filename)
19 |     torch.onnx.export(
20 |         m,
21 |         torch.randn((10, 10)),
22 |         onnx_filename,
23 |         opset_version=11
24 |     )
25 | 
26 |     print('compiling', onnx_filename, 'with TensorRT')
27 |     logger = trt.Logger()
28 |     network_flags = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
29 | 
30 |     with trt.Builder(logger) as builder, builder.create_network(network_flags) as network, trt.OnnxParser(network, logger) as parser:
31 |         if not parser.parse(open(onnx_filename, 'rb').read()):
32 |             sys.exit(1)
33 |         engine = builder.build_cuda_engine(network)
34 | 
--------------------------------------------------------------------------------
/models/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pbridger/tensorrt-ssd300-8bit-quantized/c92fffcf6c0a618970981ce5e88c06aab0c7934e/models/.keep
--------------------------------------------------------------------------------
/optrec.py:
--------------------------------------------------------------------------------
1 | import sys, json
2 | 
3 | _, input_path, output_path = sys.argv
4 | 
5 | header, *lines = open(input_path).readlines()
6 | fout = open(output_path, 'w')
7 | 
8 | fout.write(header)
9 | 
10 | min_gap = 0.1
11 | max_gap = 1.0
12 | 
13 | prev_offset = None
14 | prev_text = None
15 | running_shortcut = 0.
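# The loop below compacts an asciinema v2 recording: after the header, each
# line is a JSON event [offset, mode, text]. Events arriving within min_gap
# seconds of the pending event are merged into it, and any pause longer than
# max_gap seconds is clipped to max_gap, with the excess accumulated in
# running_shortcut and subtracted from every later timestamp.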
16 | 
17 | for line in lines:
18 |     offset, mode, text = json.loads(line)
19 |     if prev_offset is None:
20 |         prev_offset = offset
21 |         prev_text = text
22 |     elif offset - prev_offset < min_gap:
23 |         prev_text += text
24 |     else:
25 |         fout.write(json.dumps([prev_offset - running_shortcut, 'o', prev_text]) + '\n')
26 | 
27 |         if offset - prev_offset > max_gap:
28 |             running_shortcut += (offset - prev_offset) - max_gap
29 | 
30 |         prev_offset = offset
31 |         prev_text = text
32 | 
33 | if prev_offset is not None:
34 |     fout.write(json.dumps([prev_offset - running_shortcut, 'o', prev_text]) + '\n')
35 | 
36 | 
--------------------------------------------------------------------------------
/ssd300_baseline.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import contextlib
3 | import math, itertools
4 | import torch, torchvision
5 | 
6 | 
7 | # context manager to help keep track of ranges of time using NVTX
8 | @contextlib.contextmanager
9 | def nvtx_range(msg):
10 |     depth = torch.cuda.nvtx.range_push(msg)
11 |     try:
12 |         yield depth
13 |     finally:
14 |         torch.cuda.nvtx.range_pop()
15 | 
16 | 
17 | def init_dboxes(model_dtype):
18 |     'adapted from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/Detection/SSD/src/utils.py'
19 |     fig_size = 300
20 |     feat_size = [38, 19, 10, 5, 3, 1]
21 |     steps = [8, 16, 32, 64, 100, 300]
22 |     scales = [21, 45, 99, 153, 207, 261, 315]
23 |     aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
24 | 
25 |     fk = fig_size / torch.tensor(steps).float()
26 | 
27 |     dboxes = []
28 |     # size of feature and number of feature
29 |     for idx, sfeat in enumerate(feat_size):
30 |         sk1 = scales[idx] / fig_size
31 |         sk2 = scales[idx + 1] / fig_size
32 |         sk3 = math.sqrt(sk1 * sk2)
33 |         all_sizes = [(sk1, sk1), (sk3, sk3)]
34 | 
35 |         for alpha in aspect_ratios[idx]:
36 |             w, h = sk1 * math.sqrt(alpha), sk1 / math.sqrt(alpha)
37 |             all_sizes.append((w, h))
38 |             all_sizes.append((h, w))
39 | 
40 |         for w, h in all_sizes:
41 |             for i, j in itertools.product(range(sfeat), repeat=2):
42 |                 cx, cy = (j + 0.5) / fk[idx], (i + 0.5) / fk[idx]
43 |                 dboxes.append((cx, cy, w, h))
44 | 
45 |     return torch.tensor(
46 |         dboxes,
47 |         dtype=model_dtype,
48 |         device='cuda'
49 |     ).clamp(0, 1)
50 | 
51 | 
52 | class SSD300(torch.nn.Module):
53 |     def __init__(self, detection_threshold, model_precision, batch_dim):
54 |         super().__init__()
55 |         self.detector = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_ssd', model_math=model_precision).eval()
56 |         self.detection_threshold = torch.nn.Parameter(torch.tensor(detection_threshold), requires_grad=False)
57 |         self.model_dtype = torch.float16 if model_precision == 'fp16' else torch.float32
58 |         self.batch_dim = batch_dim
59 |         self.class_dim = 81
60 |         self.scale_xy = 0.1
61 |         self.scale_wh = 0.2
62 |         self.dboxes_xywh = torch.nn.Parameter(init_dboxes(self.model_dtype).unsqueeze(dim=0), requires_grad=False)
63 |         self.box_dim = self.dboxes_xywh.size(1)
64 |         self.buffer_nchw = torch.nn.Parameter(torch.zeros((batch_dim, 3, 300, 300), dtype=self.model_dtype), requires_grad=False)
65 |         self.class_dim_tensor = torch.nn.Parameter(torch.tensor([self.class_dim]), requires_grad=False)
66 |         self.class_indexes = torch.nn.Parameter(torch.arange(self.class_dim).repeat(self.batch_dim * self.box_dim), requires_grad=False)
67 |         self.image_indexes = torch.nn.Parameter(
68 |             (torch.ones(self.box_dim * self.class_dim) * torch.arange(1, self.batch_dim + 1).unsqueeze(-1)).view(-1),
69 |             requires_grad=False
70 |         )
71 | 
72 |     def preprocess(self, 
73 |         'normalize'
74 |         with nvtx_range('preprocess'):
75 |             # Nvidia SSD300 code normalizes with mean 128 and std-dev 256, i.e. maps [0, 255] into [-1, 1]
76 |             return (2 * (image_nchw.to(self.model_dtype) / 255) - 1)
77 | 
78 |     def forward(self, image_nchw):
79 |         image_batch = self.preprocess(image_nchw)
80 |         locs, labels = self.detector(image_batch)
81 |         return self.postprocess(locs, labels)
82 | 
83 |     def xywh_to_xyxy(self, bboxes_batch, scores_batch):
84 |         bboxes_batch = bboxes_batch.permute(0, 2, 1)
85 |         scores_batch = scores_batch.permute(0, 2, 1)
86 | 
87 |         bboxes_batch[:, :, :2] = self.scale_xy * bboxes_batch[:, :, :2]
88 |         bboxes_batch[:, :, 2:] = self.scale_wh * bboxes_batch[:, :, 2:]
89 | 
90 |         bboxes_batch[:, :, :2] = bboxes_batch[:, :, :2] * self.dboxes_xywh[:, :, 2:] + self.dboxes_xywh[:, :, :2]
91 |         bboxes_batch[:, :, 2:] = bboxes_batch[:, :, 2:].exp() * self.dboxes_xywh[:, :, 2:]
92 | 
93 |         # transform format to ltrb
94 |         l, t, r, b = bboxes_batch[:, :, 0] - 0.5 * bboxes_batch[:, :, 2],\
95 |             bboxes_batch[:, :, 1] - 0.5 * bboxes_batch[:, :, 3],\
96 |             bboxes_batch[:, :, 0] + 0.5 * bboxes_batch[:, :, 2],\
97 |             bboxes_batch[:, :, 1] + 0.5 * bboxes_batch[:, :, 3]
98 | 
99 |         bboxes_batch[:, :, 0] = l
100 |         bboxes_batch[:, :, 1] = t
101 |         bboxes_batch[:, :, 2] = r
102 |         bboxes_batch[:, :, 3] = b
103 | 
104 |         return bboxes_batch, torch.nn.functional.softmax(scores_batch, dim=-1)
105 | 
106 |     def postprocess(self, locs, labels):
107 |         with nvtx_range('postprocess'):
108 |             locs, probs = self.xywh_to_xyxy(locs, labels)
109 | 
110 |             # flatten batch and classes
111 |             # Exporting the operator repeat_interleave to ONNX opset version 11 is not supported.
112 |             flat_locs = locs.reshape(-1, 4).repeat_interleave(self.class_dim_tensor, dim=0)
113 |             flat_probs = probs.view(-1)
114 | 
115 |             # only do NMS on detections over threshold, and ignore background (0)
116 |             threshold_mask = (flat_probs > self.detection_threshold) & (self.class_indexes > 0)
117 | 
118 |             flat_locs = flat_locs[threshold_mask]
119 |             flat_probs = flat_probs[threshold_mask]
120 |             class_indexes = self.class_indexes[threshold_mask]
121 |             image_indexes = self.image_indexes[threshold_mask]
122 | 
123 |             nms_mask = torchvision.ops.boxes.batched_nms(
124 |                 flat_locs,
125 |                 flat_probs,
126 |                 image_indexes * self.class_dim + class_indexes,  # collision-free group id: one NMS group per (image, class) pair
127 |                 iou_threshold=0.7
128 |             )
129 | 
130 |             bboxes = flat_locs[nms_mask]
131 |             probs = flat_probs[nms_mask]
132 |             class_indexes = class_indexes[nms_mask]
133 |             return bboxes, probs, class_indexes
134 | 
135 | 
136 | if __name__ == '__main__':
137 |     batch_dim = 16
138 |     model = SSD300(0.05, 'fp32', batch_dim).to('cpu').eval()
139 |     torch.onnx.export(
140 |         model,
141 |         torch.randn((batch_dim, 3, 300, 300)),
142 |         'models/ssd300.baseline.onnx',
143 |         opset_version=11
144 |     )
145 | 
146 | 
--------------------------------------------------------------------------------
/ssd300_trt.py:
--------------------------------------------------------------------------------
1 | import os, sys, time
2 | import math
3 | import io, queue, threading
4 | from pprint import pprint
5 | 
6 | import numpy as np
7 | import torch, torchvision
8 | from pycocotools.cocoeval import COCOeval
9 | 
10 | import tensorrt as trt
11 | import onnx
12 | from onnx import shape_inference, helper, TensorProto
13 | import onnx_graphsurgeon as gs
14 | 
15 | import pycuda.driver as cuda
16 | import pycuda.autoinit
17 | 
18 | from data import get_val_dataloader, get_coco_ground_truth, init_dboxes
19 | import gpuplot
20 | 
21 | 
22 | class Int8Calibrator(trt.IInt8EntropyCalibrator2):
23 |     def __init__(self,
args): 24 | super().__init__() 25 | self.batch_dim = args.batch_dim 26 | self.dataloader = iter(get_val_dataloader(args)) 27 | self.current_batch = None # for ref-counting 28 | self.cache_path = 'calibration.cache' 29 | 30 | def get_batch_size(self): 31 | return self.batch_dim 32 | 33 | def get_batch(self, tensor_names): 34 | # assume same order as in dataset 35 | try: 36 | tensor_nchw, _, heights_widths, _, r_e = next(self.dataloader) 37 | self.current_batch = tensor_nchw.cuda(), heights_widths[0].cuda(), heights_widths[1].cuda() 38 | return [t.data_ptr() for t in self.current_batch] 39 | except StopIteration: 40 | return None 41 | 42 | def read_calibration_cache(self): 43 | if os.path.exists(self.cache_path): 44 | with open(self.cache_path, 'rb') as f: 45 | return f.read() 46 | 47 | def write_calibration_cache(self, cache): 48 | with open(self.cache_path, 'wb') as f: 49 | f.write(cache) 50 | 51 | 52 | 53 | class SSD300(torch.nn.Module): 54 | def __init__(self, topk, detection_threshold, iou_threshold, model_precision, batch_dim, trt_path=None, onnx_export=False): 55 | super().__init__() 56 | self.topk = torch.nn.Parameter(torch.tensor(topk, dtype=torch.int32), requires_grad=False) 57 | self.detection_threshold = torch.nn.Parameter(torch.tensor(detection_threshold), requires_grad=False) 58 | self.model_dtype = torch.float16 if model_precision == 'fp16' else torch.float32 59 | self.batch_dim = batch_dim 60 | self.class_dim = 81 61 | self.foreground_class_dim = self.class_dim - 1 62 | self.scale_xy = 0.1 63 | self.scale_wh = 0.2 64 | self.scale_xyxywhwh = torch.nn.Parameter(torch.tensor([ 65 | self.scale_xy, 66 | self.scale_xy, 67 | self.scale_wh, 68 | self.scale_wh 69 | ]), requires_grad=False) 70 | self.scale_wh_delta = torch.nn.Parameter(torch.tensor([-0.5, -0.5, 0.5, 0.5]), requires_grad=False) 71 | self.iou_threshold = iou_threshold 72 | self.dboxes_xywh = torch.nn.Parameter(init_dboxes(self.model_dtype).unsqueeze(dim=0), requires_grad=False) 73 | self.box_dim = torch.nn.Parameter(torch.tensor(self.dboxes_xywh.size(1)), requires_grad=False) 74 | self.buffer_nchw = torch.nn.Parameter(torch.zeros((batch_dim, 3, 300, 300), dtype=self.model_dtype), requires_grad=False) 75 | self.class_indexes = torch.nn.Parameter(torch.arange(1, self.class_dim).repeat(self.batch_dim * self.topk), requires_grad=False) 76 | self.image_indexes = torch.nn.Parameter( 77 | (torch.ones(self.topk * self.foreground_class_dim, dtype=torch.int32) * torch.arange(self.batch_dim).unsqueeze(-1)).view(-1), 78 | requires_grad=False 79 | ) 80 | self.onnx_export = onnx_export 81 | self.trt_engine = None 82 | if trt_path: 83 | print('loading TRT engine from', trt_path, '...') 84 | self.trt_logger = trt.Logger() 85 | trt.init_libnvinfer_plugins(self.trt_logger, '') 86 | with open(trt_path, 'rb') as f, trt.Runtime(self.trt_logger) as runtime: 87 | self.trt_engine = runtime.deserialize_cuda_engine(f.read()) 88 | self.trt_stream = cuda.Stream() 89 | self.trt_context = self.trt_engine.create_execution_context() 90 | else: 91 | self.detector = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_ssd', model_math=model_precision).eval() 92 | 93 | 94 | def forward(self, tensor_nchw, image_heights, image_widths): 95 | if self.onnx_export: 96 | return self.forward_pytorch(tensor_nchw, image_heights, image_widths) 97 | else: 98 | return self.forward_coco(tensor_nchw, image_heights, image_widths) 99 | 100 | 101 | def forward_pytorch(self, tensor_nchw, image_heights, image_widths): 102 | locs, scores = self.detector(tensor_nchw) 
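# Tensor shapes through the steps below, assuming the stock NVIDIA SSD300
# head (8732 prior boxes, 81 classes including background) -- these numbers
# follow from init_dboxes and class_dim, not from inspecting this exact
# checkpoint:
#   locs:   (N, 4, 8732)  -> permute -> (N, 8732, 4)   box regression offsets
#   scores: (N, 81, 8732) -> permute -> (N, 8732, 81)  -> softmax over classes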
103 |         locs = locs.permute(0, 2, 1)
104 |         locs = self.rescale_locs(locs)
105 | 
106 |         scores = scores.permute(0, 2, 1)
107 |         probs = torch.nn.functional.softmax(scores, dim=-1)
108 | 
109 |         locs, probs = self.reshape_for_topk(locs, probs)
110 |         bboxes = self.locs_to_xyxy(locs, image_heights, image_widths)
111 |         return bboxes, probs
112 | 
113 | 
114 |     def forward_trt(self, tensor_nchw, image_heights, image_widths):
115 |         trt_outputs, bindings = [], []
116 |         np_to_torch_type = {
117 |             np.float32: torch.float32,
118 |             np.float16: torch.float16,
119 |             np.int32: torch.int32,
120 |             np.int64: torch.int64,
121 |         }
122 |         inputs = {'tensor_nchw': tensor_nchw, 'image_heights': image_heights, 'image_widths': image_widths}  # look up inputs by engine binding name
123 |         for binding_name in self.trt_engine:
124 |             shape = self.trt_engine.get_binding_shape(binding_name)
125 |             dtype = trt.nptype(self.trt_engine.get_binding_dtype(binding_name))
126 |             torch_type = np_to_torch_type[dtype]
127 | 
128 |             if self.trt_engine.binding_is_input(binding_name):
129 |                 torch_input = inputs[binding_name].to(torch_type)
130 |                 bindings.append(int(torch_input.data_ptr()))
131 |             else:
132 |                 torch_output = torch.zeros(tuple(shape), dtype=torch_type, device='cuda')
133 |                 trt_outputs.append(torch_output)
134 |                 bindings.append(int(torch_output.data_ptr()))
135 | 
136 |         self.trt_context.execute_async_v2(bindings=bindings, stream_handle=self.trt_stream.handle)
137 |         self.trt_stream.synchronize()
138 | 
139 |         return trt_outputs
140 | 
141 | 
142 |     def trt_postprocess(self, batch_dim, num_detections, bboxes, probs, class_indexes):
143 |         # select valid detections and flatten batch/box/class dimensions
144 |         num_detections = num_detections.expand(-1, self.topk)
145 |         detection_mask = num_detections > torch.arange(self.topk, dtype=torch.int32, device='cuda').unsqueeze(0)  # broadcasts over the batch dim
146 | 
147 |         probs = probs.masked_select(detection_mask)
148 |         class_indexes = self.class_indexes[class_indexes.to(torch.int64)].masked_select(detection_mask)
149 | 
150 |         image_indexes = torch.arange(batch_dim, dtype=torch.int64, device='cuda').unsqueeze(-1).expand(-1, self.topk)
151 |         image_indexes = image_indexes.masked_select(detection_mask)
152 | 
153 |         bboxes = bboxes.masked_select(detection_mask.unsqueeze(-1).expand_as(bboxes))
154 |         bboxes = bboxes.unsqueeze(-1).reshape(-1, 4)
155 | 
156 |         return bboxes, probs, class_indexes, image_indexes
157 | 
158 | 
159 |     def forward_coco(self, tensor_nchw, image_heights, image_widths):
160 |         if self.trt_engine:
161 |             bboxes, probs, class_indexes, image_indexes = self.trt_postprocess(
162 |                 tensor_nchw.size(0),
163 |                 *self.forward_trt(tensor_nchw, image_heights, image_widths)
164 |             )
165 |         else:
166 |             bboxes, probs = self.forward_pytorch(tensor_nchw, image_heights, image_widths)
167 |             bboxes, probs, class_indexes, image_indexes = self.topk_and_nms(bboxes, probs)
168 |         return self.xyxy_to_xywh(bboxes), probs, class_indexes, image_indexes
169 | 
170 | 
171 |     def rescale_locs(self, locs):
172 |         locs *= self.scale_xyxywhwh
173 | 
174 |         xy = locs[:, :, :2] * self.dboxes_xywh[:, :, 2:] + self.dboxes_xywh[:, :, :2]
175 |         wh = locs[:, :, 2:].exp() * self.dboxes_xywh[:, :, 2:]
176 | 
177 |         wh_delta = torch.cat([wh, wh], dim=-1) * self.scale_wh_delta
178 |         cxycxy = torch.cat([xy, xy], dim=-1)
179 |         return cxycxy + wh_delta
180 | 
181 | 
182 |     def reshape_for_topk(self, locs, probs):
183 |         locs = locs.unsqueeze(-2)
184 |         locs = locs.expand(locs.size(0), self.box_dim, self.foreground_class_dim, locs.size(3))
185 |         probs = probs[:, :, 1:]
186 |         return locs, probs
187 | 
188 | 
189 |     def topk_and_nms(self, locs, probs):
190 |         probs, top_prob_indexes = probs.topk(self.topk,
dim=1) 191 | flat_probs = probs.reshape(-1).contiguous() 192 | 193 | locs = locs.gather(1, top_prob_indexes.unsqueeze(-1).expand(*top_prob_indexes.size(), 4)) 194 | flat_locs = locs.reshape(-1, 4).contiguous() 195 | 196 | # only do NMS on detections over threshold 197 | threshold_mask = flat_probs > self.detection_threshold 198 | 199 | flat_locs = flat_locs[threshold_mask] 200 | flat_probs = flat_probs[threshold_mask] 201 | class_indexes = self.class_indexes[threshold_mask] 202 | image_indexes = self.image_indexes[threshold_mask] 203 | 204 | nms_mask = torchvision.ops.boxes.batched_nms( 205 | flat_locs, 206 | flat_probs, 207 | class_indexes * (image_indexes + 1), # do not multiply class_indexes by 0 208 | iou_threshold=self.iou_threshold 209 | ) 210 | 211 | return ( 212 | flat_locs[nms_mask], 213 | flat_probs[nms_mask], 214 | class_indexes[nms_mask], 215 | image_indexes[nms_mask] 216 | ) 217 | 218 | 219 | def locs_to_xyxy(self, locs, image_heights, image_widths): 220 | image_heights = image_heights.reshape(-1, 1, 1, 1) 221 | image_widths = image_widths.reshape(-1, 1, 1, 1) 222 | 223 | image_wh = torch.cat([image_widths, image_heights], dim=-1) 224 | 225 | xy = locs[:, :, :, 0:2] * image_wh 226 | wh = (locs[:, :, :, 2:4] - locs[:, :, :, 0:2]) * image_wh # surely this could just be locs[:, :, :, 2:4] * image_wh and then return cat([xy, xy2])? 227 | 228 | return torch.cat([xy, xy + wh], dim=-1) 229 | 230 | 231 | def xyxy_to_xywh(self, xyxy): 232 | return torch.cat([xyxy[:, :2], xyxy[:, 2:] - xyxy[:, :2]], dim=-1) 233 | 234 | 235 | def eval_coco(args): 236 | device = torch.device(args.device) 237 | 238 | model = SSD300( 239 | args.topk, args.detection_threshold, args.iou_threshold, args.precision, args.batch_dim, args.trt_path 240 | ).to(device).eval() 241 | 242 | dataloader = get_val_dataloader(args) 243 | inv_map = {v: k for k, v in dataloader.dataset.label_map.items()} 244 | 245 | coco_ground_truth = get_coco_ground_truth(args) 246 | 247 | results = None 248 | start = time.time() 249 | 250 | for nbatch, (X, img_id, img_size, _, _) in enumerate(dataloader): 251 | print('Inference batch: {}/{}'.format(nbatch, len(dataloader)), end='\r') 252 | with torch.no_grad(): 253 | batch_dim = X.size(0) 254 | if args.precision == 'fp16': 255 | X = X.to(torch.float16) 256 | X = X.to(device) 257 | image_heights, image_widths = [i.to(device) for i in img_size] 258 | 259 | if batch_dim < args.batch_dim: 260 | num_pad = args.batch_dim - batch_dim 261 | X = torch.cat([X, X[-1].expand(num_pad, *X[-1].size())], dim=0) 262 | image_heights = torch.cat([image_heights, image_heights[-1].repeat(num_pad)], dim=0) 263 | image_widths = torch.cat([image_widths, image_widths[-1].repeat(num_pad)], dim=0) 264 | 265 | bboxes, probs, class_indexes, image_indexes = model.forward_coco(X, image_heights, image_widths) 266 | 267 | # filter out pad results 268 | small_batch_filter = image_indexes < batch_dim 269 | bboxes = bboxes[small_batch_filter] 270 | probs = probs[small_batch_filter] 271 | class_indexes = class_indexes[small_batch_filter] 272 | image_indexes = image_indexes[small_batch_filter] 273 | 274 | mapped_labels = class_indexes.to('cpu') 275 | mapped_labels.apply_(lambda i: inv_map[i]) 276 | image_ids = img_id[image_indexes] 277 | 278 | batch_results = torch.cat([ 279 | image_ids.cpu().unsqueeze(-1), 280 | bboxes.cpu(), 281 | probs.cpu().unsqueeze(-1), 282 | mapped_labels.unsqueeze(-1) 283 | ], dim=1) 284 | 285 | if results is not None: 286 | results = torch.cat([results, batch_results], dim=0) 287 | else: 288 | results 
= batch_results 289 | 290 | print() 291 | print(f'DONE (t={time.time() - start:.2f}).') 292 | 293 | results = results.numpy().astype(np.float32) 294 | 295 | coco_detections = coco_ground_truth.loadRes(results) 296 | 297 | E = COCOeval(coco_ground_truth, coco_detections, iouType='bbox') 298 | E.evaluate() 299 | E.accumulate() 300 | stdout = sys.stdout 301 | try: 302 | if args.output_path: 303 | sys.stdout = open(args.output_path, 'w') 304 | E.summarize() 305 | finally: 306 | if args.output_path: 307 | sys.stdout.close() 308 | sys.stdout = stdout 309 | print('mAP: {:.5f}'.format(E.stats[0])) 310 | 311 | 312 | def export_engine(args): 313 | onnx_module = build_onnx(args) 314 | build_trt_engine(onnx_module, args) 315 | 316 | 317 | def build_onnx(args): 318 | device = torch.device('cpu') 319 | val_dataloader = get_val_dataloader(args) 320 | 321 | for nbatch, (X, img_id, img_size, _, _) in enumerate(val_dataloader): 322 | inputs = X, img_size[0], img_size[1] 323 | break 324 | 325 | model = SSD300(args.topk, args.detection_threshold, args.iou_threshold, 'fp32', args.batch_dim, None, onnx_export=True).to(device).eval() 326 | 327 | onnx_buf = io.BytesIO() 328 | torch.onnx.export( 329 | model, 330 | inputs, 331 | onnx_buf, 332 | input_names=('tensor_nchw', 'image_heights', 'image_widths'), 333 | output_names=('bboxes', 'probs'), 334 | opset_version=11, 335 | export_params=True 336 | ) 337 | onnx_buf.seek(0) 338 | onnx_module = shape_inference.infer_shapes(onnx.load(onnx_buf)) 339 | 340 | while len(onnx_module.graph.output): 341 | onnx_module.graph.output.remove(onnx_module.graph.output[0]) 342 | onnx_module.graph.output.extend([ 343 | helper.make_tensor_value_info('num_detections', TensorProto.INT32, [-1]), 344 | helper.make_tensor_value_info('nms_bboxes', TensorProto.FLOAT, [-1, -1, -1]), 345 | helper.make_tensor_value_info('nms_probs', TensorProto.FLOAT, [-1, -1]), 346 | helper.make_tensor_value_info('nms_classes', TensorProto.FLOAT, [-1, -1]), 347 | ]) 348 | 349 | graph = gs.import_onnx(onnx_module) 350 | 351 | attrs = { 352 | 'shareLocation': False, 353 | 'numClasses': 80, 354 | 'backgroundLabelId': -1, 355 | 'topK': args.topk, # per-class, pre NMS 356 | 'keepTopK': args.topk, # across-classes, per image 357 | 'scoreThreshold': args.detection_threshold, 358 | 'iouThreshold': args.iou_threshold, 359 | 'isNormalized': False, 360 | 'clipBoxes': False, 361 | } 362 | 363 | ts = graph.tensors() 364 | 365 | nms_layer = graph.layer( 366 | op='BatchedNMSDynamic_TRT', 367 | attrs=attrs, 368 | inputs=[ts['bboxes'], ts['probs']], 369 | outputs=[ts['num_detections'], ts['nms_bboxes'], ts['nms_probs'], ts['nms_classes']] 370 | ) 371 | 372 | graph.cleanup() 373 | graph.toposort() 374 | 375 | onnx_module = gs.export_onnx(graph) 376 | onnx_path = os.path.splitext(args.trt_path)[0] + '.onnx' 377 | print('saving ONNX model to', onnx_path) 378 | onnx.save(onnx_module, onnx_path) 379 | return onnx_module 380 | 381 | 382 | 383 | def build_trt_engine(onnx_module, args): 384 | logger = trt.Logger() 385 | 386 | network_flags = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) 387 | 388 | with trt.Builder(logger) as builder, builder.create_network(network_flags) as network, trt.OnnxParser(network, logger) as parser: 389 | builder.max_workspace_size = 2 ** 31 # 2 GB 390 | builder.max_batch_size = args.batch_dim 391 | builder.fp16_mode = args.precision != 'fp32' 392 | if args.precision == 'int8': 393 | builder.int8_mode = True 394 | builder.int8_calibrator = Int8Calibrator(args) 395 | 396 | print('parsing 
ONNX...')
397 |         onnx_buf = io.BytesIO()
398 |         onnx.save(onnx_module, onnx_buf)
399 |         onnx_buf.seek(0)
400 |         if not parser.parse(onnx_buf.read()):
401 |             print(parser.num_errors, 'parser errors:')
402 |             for i in range(parser.num_errors):
403 |                 print(parser.get_error(i))
404 |             return None  # abort: cannot build an engine from a partial parse
405 |         print('inputs:')
406 |         inputs = {
407 |             t.name: t.shape
408 |             for t in [
409 |                 network.get_input(i)
410 |                 for i in range(network.num_inputs)
411 |             ]
412 |         }
413 |         pprint(inputs)
414 |         print('outputs:')
415 |         outputs = {
416 |             t.name: t.shape
417 |             for t in [
418 |                 network.get_output(i)
419 |                 for i in range(network.num_outputs)
420 |             ]
421 |         }
422 |         pprint(outputs)
423 | 
424 |         print('building CUDA engine...')
425 |         engine = builder.build_cuda_engine(network)
426 |         if engine:
427 |             print('saving CUDA engine to', args.trt_path)
428 |             with open(args.trt_path, 'wb') as mf:
429 |                 mf.write(engine.serialize())
430 | 
431 |         return engine
432 | 
433 | 
434 | 
435 | def benchmark(args):
436 |     app_start = time.time()
437 | 
438 |     prewarm_iters = 50
439 |     bench_secs = 10
440 | 
441 |     val_dataloader = get_val_dataloader(args)
442 | 
443 |     for nbatch, (tensor_nchw, img_id, (image_heights, image_widths), _, _) in enumerate(val_dataloader):
444 |         tensor_nchw, image_heights, image_widths = [t.to('cuda') for t in (tensor_nchw, image_heights, image_widths)]
445 |         break
446 | 
447 |     batch_dim = tensor_nchw.size(0)
448 | 
449 |     update_fps, plot_thread = gpuplot.bg_plot(
450 |         num_gpus=args.num_devices,
451 |         sample_hz=5,
452 |     )
453 | 
454 |     max_times = 10
455 |     batch_times = []
456 |     last_update = time.time()
457 |     update_period = 0.5
458 | 
459 |     if args.runtime == 'pytorch':
460 |         print(f'Runtime: PyTorch\nPrecision: {args.precision}\nBatch-dim: {args.batch_dim}\nTop-k: {args.topk}')
461 |         model = SSD300(args.topk, args.detection_threshold, args.iou_threshold, args.precision, args.batch_dim, args.trt_path)
462 |         model = model.eval().to('cuda')
463 | 
464 |         if args.precision == 'fp16':
465 |             tensor_nchw, image_heights, image_widths = [t.to(torch.float16) for t in (tensor_nchw, image_heights, image_widths)]
466 | 
467 |         plot_thread.start()
468 | 
469 |         print('Prewarming model')
470 |         for i in range(prewarm_iters):
471 |             model(tensor_nchw, image_heights, image_widths)
472 |             batch_times = (batch_times + [time.time()])[-max_times:]
473 | 
474 |         print(f'Beginning benchmark (+{time.time() - app_start:.1f})...')
475 |         start_time = time.time()
476 | 
477 |         bench_iters = 0
478 |         while True:
479 |             model(tensor_nchw, image_heights, image_widths)
480 |             batch_times = (batch_times + [time.time()])[-max_times:]
481 |             if batch_times[-1] > last_update + update_period and len(batch_times) > 1:
482 |                 last_update = batch_times[-1]
483 |                 update_fps(args.batch_dim * (len(batch_times) - 1) / (batch_times[-1] - batch_times[0]))
484 |             bench_iters += 1
485 |             if time.time() > start_time + bench_secs:
486 |                 break
487 | 
488 |     elif args.runtime == 'trt':
489 |         print(f'Runtime: TensorRT\nPrecision: {args.precision}\nBatch-dim: {args.batch_dim}\nTop-k: {args.topk}')
490 |         np_to_torch_type = {
491 |             np.float32: torch.float32,
492 |             np.float16: torch.float16,
493 |             np.int32: torch.int32,
494 |             np.int64: torch.int64,
495 |         }
496 | 
497 |         devices = [cuda.Device(i) for i in range(args.num_devices)]
498 |         contexts = [devices[i].make_context() for i in range(args.num_devices)]
499 | 
500 |         for d in devices:
501 |             pycuda.autoinit.context.pop()  # make_context pushed each new context; pop them all so each is pushed explicitly below
502 | 
503 |         context_detail = []
504 | 
505 |         for device_id, context in enumerate(contexts):
506 |             context.push()
507 |             try:
508 |                 torch_device = torch.device('cuda', device_id)
509 |                 streams = [cuda.Stream() for i in range(args.num_streams_per_device)]
510 | 
511 |                 tensors = {
512 |                     name: t.clone().to(torch_device)
513 |                     for name, t in [
514 |                         ('tensor_nchw', tensor_nchw),
515 |                         ('image_heights', image_heights),
516 |                         ('image_widths', image_widths)
517 |                     ]
518 |                 }
519 | 
520 |                 model = SSD300(args.topk, args.detection_threshold, args.iou_threshold, args.precision, args.batch_dim, args.trt_path)
521 | 
522 |                 trt_outputs, bindings = [[] for i in range(args.num_streams_per_device)], [[] for i in range(args.num_streams_per_device)]
523 | 
524 |                 for stream_id in range(args.num_streams_per_device):
525 |                     for binding_name in model.trt_engine:
526 |                         shape = model.trt_engine.get_binding_shape(binding_name)
527 |                         dtype = trt.nptype(model.trt_engine.get_binding_dtype(binding_name))
528 |                         torch_type = np_to_torch_type[dtype]
529 | 
530 |                         if model.trt_engine.binding_is_input(binding_name):
531 |                             torch_input = tensors[binding_name].to(torch_type)
532 |                             bindings[stream_id].append(int(torch_input.data_ptr()))
533 |                         else:
534 |                             torch_output = torch.zeros(tuple(shape), dtype=torch_type, device=torch_device)
535 |                             trt_outputs[stream_id].append(torch_output)
536 |                             bindings[stream_id].append(int(torch_output.data_ptr()))
537 | 
538 |                 context_detail.append({
539 |                     'streams': streams,
540 |                     'model': model,
541 |                     'trt_outputs': trt_outputs,
542 |                     'bindings': bindings
543 |                 })
544 | 
545 |             finally:
546 |                 context.pop()
547 | 
548 |         event_queue = queue.Queue(args.num_devices * args.num_streams_per_device)
549 | 
550 |         def sync_streams(update_fps, batch_times, max_times, last_update, update_period):
551 |             while True:
552 |                 ce = event_queue.get()
553 |                 if ce is None:
554 |                     break
555 |                 else:
556 |                     context, e = ce
557 |                     context.push()
558 |                     e.synchronize()
559 |                     context.pop()
560 | 
561 |                     batch_times = (batch_times + [time.time()])[-max_times:]
562 |                     if batch_times[-1] > last_update + update_period and len(batch_times) > 1:
563 |                         last_update = batch_times[-1]
564 |                         update_fps(args.batch_dim * (len(batch_times) - 1) / (batch_times[-1] - batch_times[0]))
565 | 
566 |         sync_thread = threading.Thread(target=sync_streams, args=(update_fps, batch_times, max_times, last_update, update_period))
567 |         sync_thread.start()
568 | 
569 |         plot_thread.start()
570 | 
571 |         # for benchmarking purposes, just run model repeatedly on initial batch of inputs
572 |         bench_iters = 0
573 |         while True:
574 |             if bench_iters == 0:
575 |                 print('Prewarming model')
576 |             elif bench_iters == prewarm_iters:
577 |                 print(f'Beginning benchmark (+{time.time() - app_start:.1f})...')
578 |                 start_time = time.time()
579 |             elif bench_iters > prewarm_iters and time.time() > start_time + bench_secs:
580 |                 break
581 | 
582 |             context_id = bench_iters % len(context_detail)
583 |             context = contexts[context_id]
584 |             context.push()
585 |             try:
586 |                 detail = context_detail[context_id]
587 |                 stream_id = (bench_iters - context_id) % len(detail['streams'])
588 |                 stream = detail['streams'][stream_id]
589 |                 detail['model'].trt_context.execute_async_v2(
590 |                     bindings=detail['bindings'][stream_id],
591 |                     stream_handle=stream.handle
592 |                 )
593 |                 event = cuda.Event(cuda.event_flags.DISABLE_TIMING)
594 |                 event_queue.put((context, event.record(stream)))
595 |             finally:
596 |                 context.pop()
597 | 
598 |             bench_iters += 1
599 | 
600 |         event_queue.put(None)
601 |         sync_thread.join()  # drains the FIFO queue, synchronizing every outstanding event before timing stops
602 | 
603 |         bench_iters -= prewarm_iters
604 | 
605 |         total_time = time.time() - start_time
606 | 
607 |         update_fps(None)
608 |         plot_thread.join()
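# Worked example of the statistics printed below (illustrative numbers, not a
# measured result): with batch_dim=16 and 625 timed batches over 10.0 seconds,
# images = 625 * 16 = 10000, latency = 1000 * 10.0 / 10000 = 1.0 ms per image,
# throughput = 10000 / 10.0 = 1000.0 FPS.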
609 | 
610 |         print(f'{bench_iters} batches, {bench_iters * batch_dim} images, {total_time:.2f} seconds total')
611 |         print(f'{1000 * total_time / (bench_iters * batch_dim):.1f} ms per image')
612 |         print(f'{(bench_iters * batch_dim) / total_time:.1f} FPS')
613 | 
614 |         if args.output_path:
615 |             with open(args.output_path, 'w') as fout:
616 |                 print(f'{bench_iters} batches, {bench_iters * batch_dim} images, {total_time:.2f} seconds total', file=fout)
617 |                 print(f'{1000 * total_time / (bench_iters * batch_dim):.1f} ms per image', file=fout)
618 |                 print(f'{(bench_iters * batch_dim) / total_time:.1f} FPS', file=fout)
619 | 
620 | 
621 | 
622 | def parse_args():
623 |     import argparse
624 |     p = argparse.ArgumentParser()
625 |     p.add_argument('--mode', choices=['coco', 'export', 'bench'], default='coco')
626 |     p.add_argument('--runtime', choices=['pytorch', 'trt'], default='pytorch')
627 |     p.add_argument('--output-path')
628 | 
629 |     p.add_argument('--device', default=('cuda:0' if torch.cuda.is_available() else 'cpu'))
630 |     p.add_argument('--detection-threshold', default=0.05, type=float)
631 |     p.add_argument('--iou-threshold', default=0.5, type=float)
632 |     p.add_argument('--topk', default=256, type=int)
633 |     p.add_argument('--batch-dim', default=16, type=int)
634 |     p.add_argument('--precision', default='fp16')
635 | 
636 |     p.add_argument('--num-streams-per-device', type=int, default=4)
637 |     p.add_argument('--num-devices', type=int, default=1)
638 | 
639 |     p.add_argument('--eval-batch-size', default=None)
640 |     p.add_argument('--data', default='/data/coco2017')
641 |     p.add_argument('--num-workers', default=2, type=int)
642 | 
643 |     args = p.parse_args()
644 |     args.eval_batch_size = args.batch_dim
645 | 
646 |     if args.runtime == 'trt':
647 |         args.trt_path = f'models/ssd300.{args.precision}.b{args.batch_dim}.k{args.topk}.plan'
648 |     else:
649 |         args.trt_path = None
650 | 
651 |     if args.mode == 'coco' and args.precision == 'int8' and args.runtime != 'trt':
652 |         print('incompatible args: int8 COCO evaluation requires --runtime=trt')
653 |         sys.exit(1)
654 | 
655 |     return args
656 | 
657 | 
658 | 
659 | if __name__ == '__main__':
660 |     args = parse_args()
661 | 
662 |     if args.mode == 'export':
663 |         export_engine(args)
664 |     elif args.mode == 'coco':
665 |         eval_coco(args)
666 |     elif args.mode == 'bench':
667 |         benchmark(args)
668 | 
669 | 
670 | 
671 | 
--------------------------------------------------------------------------------
/subscript_assignment.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import torch
3 | import tensorrt as trt
4 | 
5 | class SubscriptAssign(torch.nn.Module):
6 |     def __init__(self):
7 |         super().__init__()
8 | 
9 |     def forward(self, X):
10 |         # in-place subscript assignment exports to ONNX as a ScatterND node
11 |         X[:, :2] = 0
12 |         return X
13 | 
14 | 
15 | if __name__ == '__main__':
16 |     m = SubscriptAssign()
17 |     onnx_filename = 'models/subscript_assign.onnx'
18 | 
19 |     print('exporting SubscriptAssign to', onnx_filename)
20 |     torch.onnx.export(
21 |         m,
22 |         torch.randn((10, 10)),
23 |         onnx_filename,
24 |         opset_version=11
25 |     )
26 | 
27 |     print('compiling', onnx_filename, 'with TensorRT')
28 |     logger = trt.Logger()
29 |     network_flags = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
30 | 
31 |     with trt.Builder(logger) as builder, builder.create_network(network_flags) as network, trt.OnnxParser(network, logger) as parser:
32 |         if not parser.parse(open(onnx_filename, 'rb').read()):
33 |             for i in range(parser.num_errors):
34 |                 print(parser.get_error(i))
35 |             sys.exit(1)
36 |         engine = builder.build_cuda_engine(network)
37 | 
--------------------------------------------------------------------------------
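Note on the ScatterND failure demonstrated above: the in-place subscript assignment can be
rewritten as an out-of-place masked multiply, which exports to simple elementwise ONNX ops
that the TensorRT 7.2 parser handles without the ScatterND plugin. A minimal sketch (module
name hypothetical, not part of this repo; equivalent to X[:, :2] = 0 for a 2-D input):

import torch

class SubscriptAssignNoScatter(torch.nn.Module):
    def forward(self, X):
        # zero columns 0 and 1 without in-place indexing, so no ScatterND node
        # is emitted during ONNX export
        col_mask = (torch.arange(X.size(1), device=X.device) >= 2).to(X.dtype)
        return X * col_mask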