├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── coco.py
├── data.py
├── gpuplot.py
├── logs
│   ├── .keep
│   ├── masking
│   ├── ssd300.fp16.b16.k256.pytorch.bench
│   ├── ssd300.fp16.b16.k256.pytorch.coco
│   ├── ssd300.fp16.b16.k256.pytorch.qdrep
│   ├── ssd300.fp16.b16.k256.trt.bench
│   ├── ssd300.fp16.b16.k256.trt.coco
│   ├── ssd300.fp16.b16.k256.trt.qdrep
│   ├── ssd300.fp16.b16.k256.trt.svg
│   ├── ssd300.fp32.b16.k256.pytorch.bench
│   ├── ssd300.fp32.b16.k256.pytorch.coco
│   ├── ssd300.fp32.b16.k256.pytorch.qdrep
│   ├── ssd300.fp32.b16.k256.trt.coco
│   ├── ssd300.fp32.b16.k256.trt.qdrep
│   ├── ssd300.fp32.b16.k256.trt.svg
│   ├── ssd300.int8.b16.k256.trt.coco
│   ├── ssd300.int8.b16.k256.trt.qdrep
│   ├── ssd300.int8.b16.k256.trt.svg
│   └── subscript_assignment
├── masking.py
├── models
│   └── .keep
├── optrec.py
├── ssd300_baseline.py
├── ssd300_trt.py
└── subscript_assignment.py

/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvcr.io/nvidia/pytorch:20.10-py3
2 | 
3 | RUN python -c "import torch; torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_ssd', model_math='fp32')" 2>/dev/null | :
4 | RUN python -c "import torch; torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_ssd', model_math='fp16')" 2>/dev/null | :
5 | 
6 | # Nvidia Apex for mixed-precision inference
7 | RUN git clone https://github.com/NVIDIA/apex.git /build/apex
8 | WORKDIR /build/apex
9 | RUN pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
10 | 
11 | # use pbridger fork instead of NV repo - only change is NMS plugin updated to handle FP16
12 | WORKDIR /build
13 | RUN git clone --single-branch --branch release/7.2 https://github.com/pbridger/TensorRT.git
14 | WORKDIR /build/TensorRT
15 | RUN mkdir build && git submodule update --init --recursive
16 | WORKDIR /build/TensorRT/build
17 | # set GPU_ARCHS to match your GPU architecture (70 = Volta, 75 = Turing, 80 = Ampere), though this repo only makes sense for >= Volta (70)
18 | RUN cmake .. -DTRT_LIB_DIR=`pwd`/lib -DTRT_OUT_DIR=`pwd`/out -DGPU_ARCHS="75" -DBUILD_SAMPLES=OFF
19 | RUN make -j$(nproc) && make install && cp lib/* /usr/lib/x86_64-linux-gnu/
20 | 
21 | WORKDIR /build/TensorRT/tools/onnx-graphsurgeon
22 | RUN make install
23 | 
24 | RUN pip install pycuda
25 | RUN pip install pynvml
26 | 
27 | WORKDIR /app
28 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2020 Paul Bridger
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | CONTAINER_NAME := tensorrt-ssd300:latest 3 | # update to match your COCO 2017 location, or remove that volume mapping if you don't need to run --mode=coco 4 | DOCKER_CMD := docker run -it --rm --gpus=all --privileged=true --net=bridge --ulimit core=0 --ipc=host -v $(shell pwd):/app -v /data/coco2017:/data/coco2017 5 | PROFILE_CMD := profile -t cuda,cublas,cudnn,nvtx,osrt --force-overwrite=true --duration=30 --delay=6 6 | 7 | 8 | ### External - to be used from outside the container ### 9 | 10 | build-container: Dockerfile 11 | docker build -f $< -t ${CONTAINER_NAME} . 12 | 13 | run-container: build-container 14 | ${DOCKER_CMD} ${CONTAINER_NAME} 15 | 16 | 17 | logs/%.svg: logs/%.rec 18 | cat $< | svg-term --no-cursor > $@ 19 | 20 | 21 | logs/ssd300.fp32.b16.k256.pytorch.rec: 22 | rm -f logs/ssd300.fp32.b16.k256.pytorch.bench $@ 23 | asciinema rec $@.tmp -c 'make --no-print-directory logs/ssd300.fp32.b16.k256.pytorch.bench sleep' 24 | python optrec.py $@.tmp $@ 25 | 26 | logs/ssd300.fp16.b16.k256.pytorch.rec: 27 | rm -f logs/ssd300.fp16.b16.k256.pytorch.bench $@ 28 | asciinema rec $@.tmp -c 'make --no-print-directory logs/ssd300.fp16.b16.k256.pytorch.bench sleep' 29 | python optrec.py $@.tmp $@ 30 | 31 | logs/ssd300.fp32.b16.k256.trt.rec: 32 | rm -f logs/ssd300.fp32.b16.k256.trt.bench $@ 33 | asciinema rec $@.tmp -c 'make --no-print-directory logs/ssd300.fp32.b16.k256.trt.bench sleep' 34 | python optrec.py $@.tmp $@ 35 | 36 | logs/ssd300.fp16.b16.k256.trt.rec: 37 | rm -f logs/ssd300.fp16.b16.k256.trt.bench $@ 38 | asciinema rec $@.tmp -c 'make --no-print-directory logs/ssd300.fp16.b16.k256.trt.bench sleep' 39 | python optrec.py $@.tmp $@ 40 | 41 | logs/ssd300.int8.b16.k256.trt.rec: 42 | rm -f logs/ssd300.int8.b16.k256.trt.bench $@ 43 | asciinema rec $@.tmp -c 'make --no-print-directory logs/ssd300.int8.b16.k256.trt.bench sleep' 44 | python optrec.py $@.tmp $@ 45 | 46 | sleep: 47 | @sleep 4 48 | @echo '-' 49 | 50 | 51 | ### Internal - to be used from within the container (after run-container) ### 52 | 53 | ### Build models 54 | 55 | models/ssd300.fp32.b16.k256.plan: 56 | python ssd300_trt.py --mode=export --precision=fp32 --batch-dim=16 --topk=256 --runtime=trt 57 | 58 | models/ssd300.fp16.b16.k256.plan: 59 | python ssd300_trt.py --mode=export --precision=fp16 --batch-dim=16 --topk=256 --runtime=trt 60 | 61 | models/ssd300.int8.b16.k256.plan: 62 | python ssd300_trt.py --mode=export --precision=int8 --batch-dim=16 --topk=256 --runtime=trt 63 | 64 | models: models/ssd300.fp32.b16.k256.plan models/ssd300.fp16.b16.k256.plan models/ssd300.int8.b16.k256.plan 65 | 66 | ### COCO evaluation 67 | 68 | logs/ssd300.fp32.b16.k256.pytorch.coco: ssd300_trt.py 69 | python $< --mode=coco --precision=fp32 --batch-dim=16 --topk=256 --runtime=pytorch --output-path=$@ 70 | 71 | logs/ssd300.fp16.b16.k256.pytorch.coco: ssd300_trt.py 72 | python $< --mode=coco --precision=fp16 --batch-dim=16 --topk=256 --runtime=pytorch --output-path=$@ 73 | 74 | logs/ssd300.fp32.b16.k256.trt.coco: ssd300_trt.py models/ssd300.fp32.b16.k256.plan 75 | python $< --mode=coco 
--precision=fp32 --batch-dim=16 --topk=256 --runtime=trt --output-path=$@ 76 | 77 | logs/ssd300.fp16.b16.k256.trt.coco: ssd300_trt.py models/ssd300.fp16.b16.k256.plan 78 | python $< --mode=coco --precision=fp16 --batch-dim=16 --topk=256 --runtime=trt --output-path=$@ 79 | 80 | logs/ssd300.int8.b16.k256.trt.coco: ssd300_trt.py models/ssd300.int8.b16.k256.plan 81 | python $< --mode=coco --precision=int8 --batch-dim=16 --topk=256 --runtime=trt --output-path=$@ 82 | 83 | coco: logs/ssd300.fp32.b16.k256.pytorch.coco logs/ssd300.fp16.b16.k256.pytorch.coco logs/ssd300.fp32.b16.k256.trt.coco logs/ssd300.fp16.b16.k256.trt.coco logs/ssd300.int8.b16.k256.trt.coco 84 | 85 | ### Throughput benchmarking 86 | 87 | logs/ssd300.fp32.b16.k256.pytorch.bench: ssd300_trt.py 88 | python $< --mode=bench --precision=fp32 --batch-dim=16 --topk=256 --runtime=pytorch --output-path=$@ 89 | 90 | logs/ssd300.fp16.b16.k256.pytorch.bench: ssd300_trt.py 91 | python $< --mode=bench --precision=fp16 --batch-dim=16 --topk=256 --runtime=pytorch --output-path=$@ 92 | 93 | logs/ssd300.fp32.b16.k256.trt.bench: ssd300_trt.py models/ssd300.fp32.b16.k256.plan 94 | python $< --mode=bench --precision=fp32 --batch-dim=16 --topk=256 --runtime=trt --output-path=$@ 95 | 96 | logs/ssd300.fp16.b16.k256.trt.bench: ssd300_trt.py models/ssd300.fp16.b16.k256.plan 97 | python $< --mode=bench --precision=fp16 --batch-dim=16 --topk=256 --runtime=trt --output-path=$@ 98 | 99 | logs/ssd300.int8.b16.k256.trt.bench: ssd300_trt.py models/ssd300.int8.b16.k256.plan 100 | python $< --mode=bench --precision=int8 --batch-dim=16 --topk=256 --runtime=trt --output-path=$@ 101 | 102 | bench: logs/ssd300.fp32.b16.k256.pytorch.bench logs/ssd300.fp16.b16.k256.pytorch.bench logs/ssd300.fp32.b16.k256.trt.bench logs/ssd300.fp16.b16.k256.trt.bench logs/ssd300.int8.b16.k256.trt.bench 103 | 104 | ### Nsight systems report generation 105 | 106 | logs/ssd300.fp32.b16.k256.pytorch.qdrep: ssd300_trt.py 107 | nsys ${PROFILE_CMD} -o $@ python $< --mode=bench --precision=fp32 --batch-dim=16 --topk=256 --runtime=pytorch 108 | 109 | logs/ssd300.fp16.b16.k256.pytorch.qdrep: ssd300_trt.py 110 | nsys ${PROFILE_CMD} -o $@ python $< --mode=bench --precision=fp16 --batch-dim=16 --topk=256 --runtime=pytorch 111 | 112 | logs/ssd300.fp32.b16.k256.trt.qdrep: ssd300_trt.py models/ssd300.fp32.b16.k256.plan 113 | nsys ${PROFILE_CMD} -o $@ python $< --mode=bench --precision=fp32 --batch-dim=16 --topk=256 --runtime=trt 114 | 115 | logs/ssd300.fp16.b16.k256.trt.qdrep: ssd300_trt.py models/ssd300.fp16.b16.k256.plan 116 | nsys ${PROFILE_CMD} -o $@ python $< --mode=bench --precision=fp16 --batch-dim=16 --topk=256 --runtime=trt 117 | 118 | logs/ssd300.int8.b16.k256.trt.qdrep: ssd300_trt.py models/ssd300.int8.b16.k256.plan 119 | nsys ${PROFILE_CMD} -o $@ python $< --mode=bench --precision=int8 --batch-dim=16 --topk=256 --runtime=trt 120 | 121 | qdrep: logs/ssd300.fp32.b16.k256.pytorch.qdrep logs/ssd300.fp16.b16.k256.pytorch.qdrep logs/ssd300.fp32.b16.k256.trt.qdrep logs/ssd300.fp16.b16.k256.trt.qdrep logs/ssd300.int8.b16.k256.trt.qdrep 122 | 123 | ### Logs for article 124 | 125 | logs/subscript_assignment: subscript_assignment.py 126 | -python $< >$@ 2>&1 127 | 128 | logs/masking: masking.py 129 | -python $< >$@ 2>&1 130 | 131 | logs: logs/masking logs/subscript_assignment 132 | 133 | 134 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tensorrt-ssd300-8bit-quantized 
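
Quantized TensorRT inference for NVIDIA's SSD300 object detector: fp32/fp16 PyTorch and fp32/fp16/int8 TensorRT configurations, benchmarked for throughput and evaluated for accuracy on COCO 2017, with Nsight Systems profiling. Everything is driven through the Makefile: container build, TensorRT engine export, benchmarking and evaluation.

A typical workflow looks like the sketch below (all targets are defined in the Makefile above; the COCO volume path assumes the default in DOCKER_CMD):

```sh
# on the host: build the image and start a container with GPU access
make run-container

# inside the container: export the fp32/fp16/int8 TensorRT engines
make models

# throughput benchmarks and COCO accuracy evaluation
make bench
make coco
```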
2 | 
3 | ## Getting started
4 | 
5 | - To run COCO evaluation (`make coco`) you'll need the COCO 2017 dataset. If it isn't at /data/coco2017, update the Makefile's DOCKER_CMD volume mapping to point at your COCO location.
6 | 
7 | 
--------------------------------------------------------------------------------
/coco.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | __author__ = 'tylin'
16 | __version__ = '2.0'
17 | # Interface for accessing the Microsoft COCO dataset.
18 | 
19 | # Microsoft COCO is a large image dataset designed for object detection,
20 | # segmentation, and caption generation. pycocotools is a Python API that
21 | # assists in loading, parsing and visualizing the annotations in COCO.
22 | # Please visit http://mscoco.org/ for more information on COCO, including
23 | # for the data, paper, and tutorials. The exact format of the annotations
24 | # is also described on the COCO website. For example usage of the pycocotools
25 | # please see pycocotools_demo.ipynb. In addition to this API, please download both
26 | # the COCO images and annotations in order to run the demo.
27 | 
28 | # An alternative to using the API is to load the annotations directly
29 | # into a Python dictionary.
30 | # Using the API provides additional utility functions. Note that this API
31 | # supports both *instance* and *caption* annotations. In the case of
32 | # captions not all functions are defined (e.g. categories are undefined).
33 | 
34 | # The following API functions are defined:
35 | #  COCO       - COCO api class that loads a COCO annotation file and prepares data structures.
36 | #  decodeMask - Decode binary mask M encoded via run-length encoding.
37 | #  encodeMask - Encode binary mask M using run-length encoding.
38 | #  getAnnIds  - Get ann ids that satisfy given filter conditions.
39 | #  getCatIds  - Get cat ids that satisfy given filter conditions.
40 | #  getImgIds  - Get img ids that satisfy given filter conditions.
41 | #  loadAnns   - Load anns with the specified ids.
42 | #  loadCats   - Load cats with the specified ids.
43 | #  loadImgs   - Load imgs with the specified ids.
44 | #  annToMask  - Convert segmentation in an annotation to binary mask.
45 | #  showAnns   - Display the specified annotations.
46 | #  loadRes    - Load algorithm results and create API for accessing them.
47 | #  download   - Download COCO images from mscoco.org server.
48 | # Throughout the API "ann"=annotation, "cat"=category, and "img"=image.
49 | # Help on each function can be accessed by: "help COCO>function".
50 | 
51 | # See also COCO>decodeMask,
52 | # COCO>encodeMask, COCO>getAnnIds, COCO>getCatIds,
53 | # COCO>getImgIds, COCO>loadAnns, COCO>loadCats,
54 | # COCO>loadImgs, COCO>annToMask, COCO>showAnns
55 | 
56 | # Microsoft COCO Toolbox.      version 2.0
57 | # Data, paper, and tutorials available at:  http://mscoco.org/
58 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2014.
59 | # Licensed under the Simplified BSD License [see bsd.txt]
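
# Example usage of this API (a sketch; the annotation path is illustrative,
# matching what data.py passes in):
#
#   from coco import COCO
#   coco_gt = COCO('annotations/instances_val2017.json')  # load and index annotations
#   img_ids = coco_gt.getImgIds()                         # all image ids in the set
#   ann_ids = coco_gt.getAnnIds(imgIds=img_ids[:1])       # annotations for one image
#   anns = coco_gt.loadAnns(ann_ids)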
60 | 
61 | import json
62 | import time
63 | import matplotlib.pyplot as plt
64 | from matplotlib.collections import PatchCollection
65 | from matplotlib.patches import Polygon
66 | import numpy as np
67 | import copy
68 | import itertools
69 | from pycocotools import mask as maskUtils
70 | import os
71 | from collections import defaultdict
72 | import sys
73 | PYTHON_VERSION = sys.version_info[0]
74 | if PYTHON_VERSION == 2:
75 |     from urllib import urlretrieve
76 | elif PYTHON_VERSION == 3:
77 |     from urllib.request import urlretrieve
78 | 
79 | 
80 | def _isArrayLike(obj):
81 |     return hasattr(obj, '__iter__') and hasattr(obj, '__len__')
82 | 
83 | 
84 | class COCO:
85 |     def __init__(self, annotation_file=None):
86 |         """
87 |         Constructor of Microsoft COCO helper class for reading and visualizing annotations.
88 |         :param annotation_file (str): location of annotation file
89 |         :param image_folder (str): location to the folder that hosts images.
90 |         :return:
91 |         """
92 |         # load dataset
93 |         self.dataset, self.anns, self.cats, self.imgs = dict(), dict(), dict(), dict()
94 |         self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list)
95 |         if not annotation_file == None:
96 |             print('loading annotations into memory...')
97 |             tic = time.time()
98 |             dataset = json.load(open(annotation_file, 'r'))
99 |             assert type(dataset) == dict, 'annotation file format {} not supported'.format(type(dataset))
100 |             print('Done (t={:0.2f}s)'.format(time.time() - tic))
101 |             self.dataset = dataset
102 |             self.createIndex()
103 | 
104 |     def createIndex(self):
105 |         # create index
106 |         print('creating index...')
107 |         anns, cats, imgs = {}, {}, {}
108 |         imgToAnns, catToImgs = defaultdict(list), defaultdict(list)
109 |         if 'annotations' in self.dataset:
110 |             for ann in self.dataset['annotations']:
111 |                 imgToAnns[ann['image_id']].append(ann)
112 |                 anns[ann['id']] = ann
113 | 
114 |         if 'images' in self.dataset:
115 |             for img in self.dataset['images']:
116 |                 imgs[img['id']] = img
117 | 
118 |         if 'categories' in self.dataset:
119 |             for cat in self.dataset['categories']:
120 |                 cats[cat['id']] = cat
121 | 
122 |         if 'annotations' in self.dataset and 'categories' in self.dataset:
123 |             for ann in self.dataset['annotations']:
124 |                 catToImgs[ann['category_id']].append(ann['image_id'])
125 | 
126 |         print('index created!')
127 | 
128 |         # create class members
129 |         self.anns = anns
130 |         self.imgToAnns = imgToAnns
131 |         self.catToImgs = catToImgs
132 |         self.imgs = imgs
133 |         self.cats = cats
134 | 
135 |     def info(self):
136 |         """
137 |         Print information about the annotation file.
138 |         :return:
139 |         """
140 |         for key, value in self.dataset['info'].items():
141 |             print('{}: {}'.format(key, value))
142 | 
143 |     def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None):
144 |         """
145 |         Get ann ids that satisfy given filter conditions. default skips that filter
146 |         :param imgIds  (int array)   : get anns for given imgs
147 |                catIds  (int array)   : get anns for given cats
148 |                areaRng (float array) : get anns for given area range (e.g.
[0 inf]) 149 | iscrowd (boolean) : get anns for given crowd label (False or True) 150 | :return: ids (int array) : integer array of ann ids 151 | """ 152 | imgIds = imgIds if _isArrayLike(imgIds) else [imgIds] 153 | catIds = catIds if _isArrayLike(catIds) else [catIds] 154 | 155 | if len(imgIds) == len(catIds) == len(areaRng) == 0: 156 | anns = self.dataset['annotations'] 157 | else: 158 | if not len(imgIds) == 0: 159 | lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns] 160 | anns = list(itertools.chain.from_iterable(lists)) 161 | else: 162 | anns = self.dataset['annotations'] 163 | anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds] 164 | anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]] 165 | if not iscrowd == None: 166 | ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd] 167 | else: 168 | ids = [ann['id'] for ann in anns] 169 | return ids 170 | 171 | def getCatIds(self, catNms=[], supNms=[], catIds=[]): 172 | """ 173 | filtering parameters. default skips that filter. 174 | :param catNms (str array) : get cats for given cat names 175 | :param supNms (str array) : get cats for given supercategory names 176 | :param catIds (int array) : get cats for given cat ids 177 | :return: ids (int array) : integer array of cat ids 178 | """ 179 | catNms = catNms if _isArrayLike(catNms) else [catNms] 180 | supNms = supNms if _isArrayLike(supNms) else [supNms] 181 | catIds = catIds if _isArrayLike(catIds) else [catIds] 182 | 183 | if len(catNms) == len(supNms) == len(catIds) == 0: 184 | cats = self.dataset['categories'] 185 | else: 186 | cats = self.dataset['categories'] 187 | cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms] 188 | cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms] 189 | cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds] 190 | ids = [cat['id'] for cat in cats] 191 | return ids 192 | 193 | def getImgIds(self, imgIds=[], catIds=[]): 194 | ''' 195 | Get img ids that satisfy given filter conditions. 196 | :param imgIds (int array) : get imgs for given ids 197 | :param catIds (int array) : get imgs with all given cats 198 | :return: ids (int array) : integer array of img ids 199 | ''' 200 | imgIds = imgIds if _isArrayLike(imgIds) else [imgIds] 201 | catIds = catIds if _isArrayLike(catIds) else [catIds] 202 | 203 | if len(imgIds) == len(catIds) == 0: 204 | ids = self.imgs.keys() 205 | else: 206 | ids = set(imgIds) 207 | for i, catId in enumerate(catIds): 208 | if i == 0 and len(ids) == 0: 209 | ids = set(self.catToImgs[catId]) 210 | else: 211 | ids &= set(self.catToImgs[catId]) 212 | return list(ids) 213 | 214 | def loadAnns(self, ids=[]): 215 | """ 216 | Load anns with the specified ids. 217 | :param ids (int array) : integer ids specifying anns 218 | :return: anns (object array) : loaded ann objects 219 | """ 220 | if _isArrayLike(ids): 221 | return [self.anns[id] for id in ids] 222 | elif type(ids) == int: 223 | return [self.anns[ids]] 224 | 225 | def loadCats(self, ids=[]): 226 | """ 227 | Load cats with the specified ids. 
228 | :param ids (int array) : integer ids specifying cats 229 | :return: cats (object array) : loaded cat objects 230 | """ 231 | if _isArrayLike(ids): 232 | return [self.cats[id] for id in ids] 233 | elif type(ids) == int: 234 | return [self.cats[ids]] 235 | 236 | def loadImgs(self, ids=[]): 237 | """ 238 | Load anns with the specified ids. 239 | :param ids (int array) : integer ids specifying img 240 | :return: imgs (object array) : loaded img objects 241 | """ 242 | if _isArrayLike(ids): 243 | return [self.imgs[id] for id in ids] 244 | elif type(ids) == int: 245 | return [self.imgs[ids]] 246 | 247 | def showAnns(self, anns): 248 | """ 249 | Display the specified annotations. 250 | :param anns (array of object): annotations to display 251 | :return: None 252 | """ 253 | if len(anns) == 0: 254 | return 0 255 | if 'segmentation' in anns[0] or 'keypoints' in anns[0]: 256 | datasetType = 'instances' 257 | elif 'caption' in anns[0]: 258 | datasetType = 'captions' 259 | else: 260 | raise Exception('datasetType not supported') 261 | if datasetType == 'instances': 262 | ax = plt.gca() 263 | ax.set_autoscale_on(False) 264 | polygons = [] 265 | color = [] 266 | for ann in anns: 267 | c = (np.random.random((1, 3))*0.6+0.4).tolist()[0] 268 | if 'segmentation' in ann: 269 | if type(ann['segmentation']) == list: 270 | # polygon 271 | for seg in ann['segmentation']: 272 | poly = np.array(seg).reshape((int(len(seg)/2), 2)) 273 | polygons.append(Polygon(poly)) 274 | color.append(c) 275 | else: 276 | # mask 277 | t = self.imgs[ann['image_id']] 278 | if type(ann['segmentation']['counts']) == list: 279 | rle = maskUtils.frPyObjects([ann['segmentation']], t['height'], t['width']) 280 | else: 281 | rle = [ann['segmentation']] 282 | m = maskUtils.decode(rle) 283 | img = np.ones( (m.shape[0], m.shape[1], 3) ) 284 | if ann['iscrowd'] == 1: 285 | color_mask = np.array([2.0,166.0,101.0])/255 286 | if ann['iscrowd'] == 0: 287 | color_mask = np.random.random((1, 3)).tolist()[0] 288 | for i in range(3): 289 | img[:,:,i] = color_mask[i] 290 | ax.imshow(np.dstack( (img, m*0.5) )) 291 | if 'keypoints' in ann and type(ann['keypoints']) == list: 292 | # turn skeleton into zero-based index 293 | sks = np.array(self.loadCats(ann['category_id'])[0]['skeleton'])-1 294 | kp = np.array(ann['keypoints']) 295 | x = kp[0::3] 296 | y = kp[1::3] 297 | v = kp[2::3] 298 | for sk in sks: 299 | if np.all(v[sk]>0): 300 | plt.plot(x[sk],y[sk], linewidth=3, color=c) 301 | plt.plot(x[v>0], y[v>0],'o',markersize=8, markerfacecolor=c, markeredgecolor='k',markeredgewidth=2) 302 | plt.plot(x[v>1], y[v>1],'o',markersize=8, markerfacecolor=c, markeredgecolor=c, markeredgewidth=2) 303 | p = PatchCollection(polygons, facecolor=color, linewidths=0, alpha=0.4) 304 | ax.add_collection(p) 305 | p = PatchCollection(polygons, facecolor='none', edgecolors=color, linewidths=2) 306 | ax.add_collection(p) 307 | elif datasetType == 'captions': 308 | for ann in anns: 309 | print(ann['caption']) 310 | 311 | def loadRes(self, resFile): 312 | """ 313 | Load result file and return a result api object. 
314 | :param resFile (str) : file name of result file 315 | :return: res (obj) : result api object 316 | """ 317 | res = COCO() 318 | res.dataset['images'] = [img for img in self.dataset['images']] 319 | 320 | print('Loading and preparing results...') 321 | tic = time.time() 322 | if type(resFile) == str: #or type(resFile) == unicode: 323 | anns = json.load(open(resFile)) 324 | elif type(resFile) == np.ndarray: 325 | anns = self.loadNumpyAnnotations(resFile) 326 | else: 327 | anns = resFile 328 | assert type(anns) == list, 'results in not an array of objects' 329 | annsImgIds = [ann['image_id'] for ann in anns] 330 | assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \ 331 | 'Results do not correspond to current coco set' 332 | if 'caption' in anns[0]: 333 | imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns]) 334 | res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds] 335 | for id, ann in enumerate(anns): 336 | ann['id'] = id+1 337 | elif 'bbox' in anns[0] and not anns[0]['bbox'] == []: 338 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 339 | for id, ann in enumerate(anns): 340 | bb = ann['bbox'] 341 | x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]] 342 | if not 'segmentation' in ann: 343 | ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]] 344 | ann['area'] = bb[2]*bb[3] 345 | ann['id'] = id+1 346 | ann['iscrowd'] = 0 347 | elif 'segmentation' in anns[0]: 348 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 349 | for id, ann in enumerate(anns): 350 | # now only support compressed RLE format as segmentation results 351 | ann['area'] = maskUtils.area(ann['segmentation']) 352 | if not 'bbox' in ann: 353 | ann['bbox'] = maskUtils.toBbox(ann['segmentation']) 354 | ann['id'] = id+1 355 | ann['iscrowd'] = 0 356 | elif 'keypoints' in anns[0]: 357 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 358 | for id, ann in enumerate(anns): 359 | s = ann['keypoints'] 360 | x = s[0::3] 361 | y = s[1::3] 362 | x0,x1,y0,y1 = np.min(x), np.max(x), np.min(y), np.max(y) 363 | ann['area'] = (x1-x0)*(y1-y0) 364 | ann['id'] = id + 1 365 | ann['bbox'] = [x0,y0,x1-x0,y1-y0] 366 | print('DONE (t={:0.2f}s)'.format(time.time()- tic)) 367 | 368 | res.dataset['annotations'] = anns 369 | res.createIndex() 370 | return res 371 | 372 | def download(self, tarDir = None, imgIds = [] ): 373 | ''' 374 | Download COCO images from mscoco.org server. 
375 | :param tarDir (str): COCO results directory name 376 | imgIds (list): images to be downloaded 377 | :return: 378 | ''' 379 | if tarDir is None: 380 | print('Please specify target directory') 381 | return -1 382 | if len(imgIds) == 0: 383 | imgs = self.imgs.values() 384 | else: 385 | imgs = self.loadImgs(imgIds) 386 | N = len(imgs) 387 | if not os.path.exists(tarDir): 388 | os.makedirs(tarDir) 389 | for i, img in enumerate(imgs): 390 | tic = time.time() 391 | fname = os.path.join(tarDir, img['file_name']) 392 | if not os.path.exists(fname): 393 | urlretrieve(img['coco_url'], fname) 394 | print('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic)) 395 | 396 | def loadNumpyAnnotations(self, data): 397 | """ 398 | Convert result data from a numpy array [Nx7] where each row contains {imageID,x1,y1,w,h,score,class} 399 | :param data (numpy.ndarray) 400 | :return: annotations (python nested list) 401 | """ 402 | print('Converting ndarray to lists...') 403 | assert(type(data) == np.ndarray) 404 | print(data.shape) 405 | assert(data.shape[1] == 7) 406 | N = data.shape[0] 407 | ann = [] 408 | for i in range(N): 409 | if i % 1000000 == 0: 410 | print('{}/{}'.format(i,N)) 411 | ann += [{ 412 | 'image_id' : int(data[i, 0]), 413 | 'bbox' : [ data[i, 1], data[i, 2], data[i, 3], data[i, 4] ], 414 | 'score' : data[i, 5], 415 | 'category_id': int(data[i, 6]), 416 | }] 417 | return ann 418 | 419 | def annToRLE(self, ann): 420 | """ 421 | Convert annotation which can be polygons, uncompressed RLE to RLE. 422 | :return: binary mask (numpy 2D array) 423 | """ 424 | t = self.imgs[ann['image_id']] 425 | h, w = t['height'], t['width'] 426 | segm = ann['segmentation'] 427 | if type(segm) == list: 428 | # polygon -- a single object might consist of multiple parts 429 | # we merge all parts into one mask rle code 430 | rles = maskUtils.frPyObjects(segm, h, w) 431 | rle = maskUtils.merge(rles) 432 | elif type(segm['counts']) == list: 433 | # uncompressed RLE 434 | rle = maskUtils.frPyObjects(segm, h, w) 435 | else: 436 | # rle 437 | rle = ann['segmentation'] 438 | return rle 439 | 440 | def annToMask(self, ann): 441 | """ 442 | Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask. 
443 |         :return: binary mask (numpy 2D array)
444 |         """
445 |         rle = self.annToRLE(ann)
446 |         m = maskUtils.decode(rle)
447 |         return m
448 | 
--------------------------------------------------------------------------------
/data.py:
--------------------------------------------------------------------------------
1 | 'adapted from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/Detection/SSD/src/utils.py'
2 | 
3 | import os
4 | import math
5 | import json, pickle, bz2
6 | import itertools
7 | from coco import COCO
8 | 
9 | from PIL import Image
10 | import torch
11 | from torch.utils.data import DataLoader, Dataset
12 | import torchvision.transforms as transforms
13 | 
14 | 
15 | def init_dboxes(model_dtype=torch.float32):
16 |     fig_size = 300
17 |     feat_size = [38, 19, 10, 5, 3, 1]
18 |     steps = [8, 16, 32, 64, 100, 300]
19 |     scales = [21, 45, 99, 153, 207, 261, 315]
20 |     aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
21 | 
22 |     fk = fig_size / torch.tensor(steps).float()
23 | 
24 |     dboxes = []
25 |     # size of feature and number of feature
26 |     for idx, sfeat in enumerate(feat_size):
27 |         sk1 = scales[idx] / fig_size
28 |         sk2 = scales[idx + 1] / fig_size
29 |         sk3 = math.sqrt(sk1 * sk2)
30 |         all_sizes = [(sk1, sk1), (sk3, sk3)]
31 | 
32 |         for alpha in aspect_ratios[idx]:
33 |             w, h = sk1 * math.sqrt(alpha), sk1 / math.sqrt(alpha)
34 |             all_sizes.append((w, h))
35 |             all_sizes.append((h, w))
36 | 
37 |         for w, h in all_sizes:
38 |             for i, j in itertools.product(range(sfeat), repeat=2):
39 |                 cx, cy = (j + 0.5) / fk[idx], (i + 0.5) / fk[idx]
40 |                 dboxes.append((cx, cy, w, h))
41 | 
42 |     return torch.tensor(
43 |         dboxes,
44 |         dtype=model_dtype,
45 |         device='cuda'
46 |     ).clamp(0, 1)
47 | 
48 | 
49 | class COCODetection(Dataset):
50 |     def __init__(self, img_folder, annotate_file, transform):
51 |         self.img_folder = img_folder
52 |         self.annotate_file = annotate_file
53 | 
54 |         # Start processing annotation
55 |         with open(annotate_file) as fin:
56 |             self.data = json.load(fin)
57 | 
58 |         self.images = {}
59 | 
60 |         self.label_map = {}
61 |         self.label_info = {}
62 |         # 0 stands for the background
63 |         cnt = 0
64 |         self.label_info[cnt] = "background"
65 |         for cat in self.data["categories"]:
66 |             cnt += 1
67 |             self.label_map[cat["id"]] = cnt
68 |             self.label_info[cnt] = cat["name"]
69 | 
70 |         # build an index of images
71 |         for img in self.data["images"]:
72 |             img_id = img["id"]
73 |             img_name = img["file_name"]
74 |             img_size = (img["height"], img["width"])
75 |             if img_id in self.images: raise Exception("duplicated image record")
76 |             self.images[img_id] = (img_name, img_size, [])
77 | 
78 |         # read bboxes
79 |         for bboxes in self.data["annotations"]:
80 |             img_id = bboxes["image_id"]
81 |             category_id = bboxes["category_id"]
82 |             bbox = bboxes["bbox"]
83 |             bbox_label = self.label_map[bboxes["category_id"]]
84 |             self.images[img_id][2].append((bbox, bbox_label))
85 | 
86 |         for k, v in list(self.images.items()):
87 |             if len(v[2]) == 0:
88 |                 self.images.pop(k)
89 | 
90 |         self.img_keys = list(self.images.keys())
91 |         self.transform = transform
92 | 
93 |     @property
94 |     def labelnum(self):
95 |         return len(self.label_info)
96 | 
97 |     @staticmethod
98 |     def load(pklfile):
99 |         with bz2.open(pklfile, "rb") as fin:
100 |             ret = pickle.load(fin)
101 |         return ret
102 | 
103 |     def save(self, pklfile):
104 |         with bz2.open(pklfile, "wb") as fout:
105 |             pickle.dump(self, fout)
106 | 
107 | 
108 |     def __len__(self):
109 |         return len(self.images)
110 | 
111 |     def __getitem__(self, idx):
112 |         img_id = self.img_keys[idx]
113 |         img_data = 
self.images[img_id] 114 | fn = img_data[0] 115 | img_path = os.path.join(self.img_folder, fn) 116 | img = Image.open(img_path).convert("RGB") 117 | 118 | htot, wtot = img_data[1] 119 | bbox_sizes = [] 120 | bbox_labels = [] 121 | 122 | #for (xc, yc, w, h), bbox_label in img_data[2]: 123 | for (l,t,w,h), bbox_label in img_data[2]: 124 | r = l + w 125 | b = t + h 126 | #l, t, r, b = xc - 0.5*w, yc - 0.5*h, xc + 0.5*w, yc + 0.5*h 127 | bbox_size = (l/wtot, t/htot, r/wtot, b/htot) 128 | bbox_sizes.append(bbox_size) 129 | bbox_labels.append(bbox_label) 130 | 131 | bbox_sizes = torch.tensor(bbox_sizes) 132 | bbox_labels = torch.tensor(bbox_labels) 133 | 134 | max_num = 200 135 | bbox_out = torch.zeros(max_num, 4) 136 | label_out = torch.zeros(max_num, dtype=torch.long) 137 | bbox_out[:bbox_sizes.size(0), :] = bbox_sizes 138 | label_out[:bbox_labels.size(0)] = bbox_labels 139 | 140 | img = self.transform(img) 141 | return img, img_id, (htot, wtot), bbox_out, label_out 142 | 143 | 144 | 145 | def get_val_dataloader(args): 146 | transformer = transforms.Compose([ 147 | transforms.Resize((300, 300)), 148 | transforms.ToTensor(), 149 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 150 | ]) 151 | 152 | dataset = COCODetection( 153 | os.path.join(args.data, "val2017"), 154 | os.path.join(args.data, "annotations/instances_val2017.json"), 155 | transformer 156 | ) 157 | 158 | return DataLoader( 159 | dataset, 160 | batch_size=args.batch_dim, 161 | shuffle=False, # Note: distributed sampler is shuffled :( 162 | sampler=None, 163 | num_workers=args.num_workers 164 | ) 165 | 166 | 167 | def get_coco_ground_truth(args): 168 | return COCO(annotation_file=os.path.join(args.data, "annotations/instances_val2017.json")) 169 | 170 | -------------------------------------------------------------------------------- /gpuplot.py: -------------------------------------------------------------------------------- 1 | import sys, io 2 | import time 3 | import numpy as np 4 | import threading 5 | 6 | from PIL import Image 7 | from pynvml import * # all functions have nvml prefix 8 | nvmlInit() 9 | 10 | 11 | dot = '•' 12 | space = ' ' 13 | horizontal_line = '\u2500' 14 | vertical_line = '\u2502' 15 | fill = '\u2588' 16 | tau = '\u03A4' 17 | background_color = np.array([0x28, 0x2d, 0x35]) # svg-term background 18 | foreground_color = np.array([128, 128, 128]) 19 | cold = np.array([0x3f, 0x5e, 0xfb]) 20 | hot = np.array([0xfc, 0x46, 0x6b]) 21 | not_nv = np.array([0x83, 0x60, 0xc3]) 22 | nv = np.array([0x2e, 0xbf, 0x91]) 23 | sky = np.array([0x1c, 0x92, 0xd2]) 24 | foreground_color = np.array([0xf2, 0xfc, 0xfe]) 25 | summer_low = np.array([0x22, 0xc1, 0xc3]) 26 | summer_hi = np.array([0xfd, 0xbb, 0x2d]) 27 | 28 | 29 | def write(s, stdout=sys.stdout): 30 | stdout.write(s) 31 | 32 | 33 | def reset(): 34 | write('\033c') 35 | 36 | 37 | def wrap_color(rgb, s): 38 | r, g, b = map(int, rgb) 39 | return f'\033[38;2;{r};{g};{b}m{s}\033[0m' 40 | 41 | 42 | def pixel_seq_to_ascii(pixel_rgb, char, count): 43 | if char == space or (pixel_rgb == background_color).all(): 44 | return space * count 45 | return wrap_color(pixel_rgb, char * count) 46 | 47 | 48 | def pixels_to_ascii(pixels, chars): 49 | rendered_rows = [] 50 | for row in range(pixels.shape[0]): 51 | row_elements = [] 52 | current_pixel, current_char = None, None 53 | pixel_count = 0 54 | 55 | for column in range(pixels.shape[1]): 56 | new_pixel, new_char = pixels[row, column], chars[row, column] 57 | if current_pixel is not None and new_char == 
current_char and (new_pixel == current_pixel).all():
58 |                 pixel_count += 1
59 |             else:
60 |                 if current_pixel is not None:
61 |                     row_elements.append(pixel_seq_to_ascii(current_pixel, current_char, pixel_count))
62 |                 current_pixel = new_pixel
63 |                 current_char = new_char
64 |                 pixel_count = 1
65 | 
66 |         if current_pixel is not None:
67 |             row_elements.append(pixel_seq_to_ascii(current_pixel, current_char, pixel_count))
68 |         rendered_rows.append(''.join(row_elements))
69 | 
70 |     return '\n'.join(rendered_rows)
71 | 
72 | 
73 | def data_to_dot_matrix(xs, ys, size_wh, y_lim=(None, None)):
74 |     dw = len(xs)
75 |     y_min, y_max = y_lim
76 |     if y_min is None:
77 |         y_min = min(ys)
78 |     if y_max is None:
79 |         y_max = max(ys)
80 |     dh = y_max - y_min
81 | 
82 |     w, h = size_wh
83 |     dot_matrix = np.zeros((h, w), dtype=np.float32)
84 |     values = np.zeros((w,), dtype=np.float32)
85 | 
86 |     for c in range(w):
87 |         dc = dw * c / w
88 |         dc, dc_fraction = int(dc // 1), dc % 1
89 |         dy = ys[dc]
90 |         if dc + 1 < dw:
91 |             dy += dc_fraction * (ys[dc+1] - ys[dc])
92 |         dy_proportion = (dy - y_min) / dh
93 |         r = max(0, min(1, dy_proportion)) * h
94 |         r, r_fraction = int(r // 1), r % 1
95 | 
96 |         dot_matrix[h - r:, c] = 1.0
97 |         if h - r - 1 >= 0:
98 |             dot_matrix[h - r - 1, c] = r_fraction
99 |         values[c] = dy_proportion
100 | 
101 |     return dot_matrix, values
102 | 
103 | 
104 | def dot_matrix_to_pixels(dot_matrix, values, low=foreground_color, high=foreground_color):
105 |     pixels = np.ones((*dot_matrix.shape, 3)) * background_color
106 |     chars = np.full(dot_matrix.shape, space, dtype='<U1')
107 | 
108 |     chars[dot_matrix > 0] = dot
109 | 
110 |     colors = np.expand_dims(values, -1) * (high - low) + low
111 | 
112 |     dot_matrix = np.expand_dims(dot_matrix, -1)
113 |     pixels = ((1 - dot_matrix) * background_color) + (dot_matrix * colors)
114 | 
115 |     return pixels, chars
116 | 
117 | 
118 | def render_axes(pixels, chars, title, x_ticks, y_ticks):
119 |     ph, pw, pd = pixels.shape
120 |     new_pixels = np.zeros((ph + 3, pw + 8, pd), dtype=pixels.dtype)
121 |     new_pixels[1:ph + 1, :pw, :] = pixels
122 |     new_chars = np.full((ph + 3, pw + 8), space, dtype=' 0:
209 |                     break
210 | 
211 |                 temp.append(nvmlDeviceGetTemperature(gpu, 0))
212 | 
213 |                 mem = nvmlDeviceGetMemoryInfo(gpu)
214 |                 mem_used.append(mem.used / 2 ** 30)
215 | 
216 |                 util = nvmlDeviceGetUtilizationRates(gpu)
217 |                 gpu_util.append(util.gpu)
218 | 
219 |                 temp = temp[-max_data_points:]
220 |                 gpu_util = gpu_util[-max_data_points:]
221 |                 mem_used = mem_used[-max_data_points:]
222 |                 while len(fps) > max_data_points:
223 |                     fps.pop(0)
224 |                 x = [t * sample_period for t in range(-len(temp), 0)]
225 | 
226 |                 reset()
227 | 
228 |                 # log_pixels, log_chars = log_to_pixels(captured_stdout, chart_wh)
229 | 
230 |                 fps_dm, fps_values = data_to_dot_matrix(x, fps, chart_wh, y_lim=(fps_min, fps_max))
231 |                 fps_pixels, fps_chars = dot_matrix_to_pixels(fps_dm, fps_values, low=sky, high=foreground_color)
232 |                 fps_pixels, fps_chars = render_axes(
233 |                     fps_pixels, fps_chars,
234 |                     'THROUGHPUT (FPS)',
235 |                     [(0, f'{tau}{int(x[0])}s')],
236 |                     [(0, f'{fps_max}'), (chart_wh[1], f'{fps_min}'), (int(chart_wh[1] * (fps_max - fps[-1]) / (fps_max - fps_min)), f'{fps[-1]:.0f}')]
237 |                 )
238 | 
239 |                 temp_dm, temp_values = data_to_dot_matrix(x, temp, chart_wh, y_lim=(temp_min, temp_max))
240 |                 temp_pixels, temp_chars = dot_matrix_to_pixels(temp_dm, temp_values, low=cold, high=hot)
241 |                 temp_pixels, temp_chars = render_axes(
242 |                     temp_pixels, temp_chars,
243 |                     'GPU TEMP (C)',
244 |                     [(0, f'{tau}{int(x[0])}s')],
245 |                     [(0, f'{temp_max}C'), (chart_wh[1], f'{temp_min}C'),
(int(chart_wh[1] * (temp_max - temp[-1]) / (temp_max - temp_min)), f'{temp[-1]:.0f}C')] 246 | ) 247 | 248 | gpu_dm, gpu_values = data_to_dot_matrix(x, gpu_util, chart_wh, y_lim=(gpu_min, gpu_max)) 249 | gpu_pixels, gpu_chars = dot_matrix_to_pixels(gpu_dm, gpu_values, low=not_nv, high=nv) 250 | gpu_pixels, gpu_chars = render_axes( 251 | gpu_pixels, gpu_chars, 252 | 'GPU UTILIZATION (%)', 253 | [(0, f'{tau}{int(x[0])}s')], 254 | [(0, f'{gpu_max}%'), (chart_wh[1], f'{gpu_min:3.0f}%'), (int(chart_wh[1] * (gpu_max - gpu_util[-1]) / (gpu_max - gpu_min)), f'{gpu_util[-1]:3.0f}%')] 255 | ) 256 | 257 | mem_dm, mem_values = data_to_dot_matrix(x, mem_used, chart_wh, y_lim=(mem_min, mem_max)) 258 | mem_pixels, mem_chars = dot_matrix_to_pixels(mem_dm, mem_values, low=summer_low, high=summer_hi) 259 | mem_pixels, mem_chars = render_axes( 260 | mem_pixels, mem_chars, 261 | 'GPU MEM (GB)', 262 | [(0, f'{tau}{int(x[0])}s')], 263 | [(0, f'{mem_max:3.1f}'), (chart_wh[1], f'{mem_min:3.1f}'), (int(chart_wh[1] * (mem_max - mem_used[-1]) / (mem_max - mem_min)), f'{mem_used[-1]:3.1f}')] 264 | ) 265 | 266 | l_pixels = np.concatenate(( 267 | fps_pixels, 268 | temp_pixels, 269 | ), axis=0) 270 | 271 | l_chars = np.concatenate(( 272 | fps_chars, 273 | temp_chars, 274 | ), axis=0) 275 | 276 | r_pixels = np.concatenate(( 277 | gpu_pixels, 278 | mem_pixels, 279 | ), axis=0) 280 | 281 | r_chars = np.concatenate(( 282 | gpu_chars, 283 | mem_chars, 284 | ), axis=0) 285 | 286 | pixels = np.concatenate((l_pixels, r_pixels), axis=1) 287 | chars = np.concatenate((l_chars, r_chars), axis=1) 288 | 289 | time.sleep(0.001) 290 | write(pixels_to_ascii(pixels, chars) + '\n', stdout=real_stdout) 291 | 292 | time.sleep(max(0, sample_period - (time.time() - sample_time))) 293 | finally: 294 | sys.stdout = real_stdout 295 | sys.stderr = real_stderr 296 | 297 | bg_thread = threading.Thread(target=bg_thread_fn, args=(chart_wh, max_data_points, fps, fps_lock), daemon=True) 298 | 299 | def gen_update_fps(fps_lock, fps): 300 | def update_fps(latest_fps): 301 | with fps_lock: 302 | fps[-1] = latest_fps 303 | return update_fps 304 | 305 | return gen_update_fps(fps_lock, fps), bg_thread 306 | 307 | -------------------------------------------------------------------------------- /logs/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pbridger/tensorrt-ssd300-8bit-quantized/c92fffcf6c0a618970981ce5e88c06aab0c7934e/logs/.keep -------------------------------------------------------------------------------- /logs/masking: -------------------------------------------------------------------------------- 1 | [TensorRT] ERROR: INVALID_ARGUMENT: getPluginCreator could not find plugin NonZero version 1 2 | /opt/conda/lib/python3.6/site-packages/torch/onnx/symbolic_opset9.py:2329: UserWarning: Exporting aten::index operator with indices of type Byte. Only 1-D indices are supported. In any other case, this will produce an incorrect ONNX graph. 3 | warnings.warn("Exporting aten::index operator with indices of type Byte. " 4 | /opt/conda/lib/python3.6/site-packages/torch/onnx/symbolic_opset9.py:591: UserWarning: This model contains a squeeze operation on dimension 1 on an input with unknown shape. Note that if the size of dimension 1 of the input is not 1, the ONNX model will return an error. Opset version 11 supports squeezing on non-singleton dimensions, it is recommended to export this model using opset version 11 or higher. 
5 | "version 11 or higher.") 6 | exporting Masking to models/masking.onnx 7 | compiling models/masking.onnx with TensorRT 8 | -------------------------------------------------------------------------------- /logs/ssd300.fp16.b16.k256.pytorch.bench: -------------------------------------------------------------------------------- 1 | 500 batches, 8000 images, 18.97 seconds total 2 | 2.4 ms per image 3 | 421.8 FPS 4 | -------------------------------------------------------------------------------- /logs/ssd300.fp16.b16.k256.pytorch.coco: -------------------------------------------------------------------------------- 1 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.25044 2 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.42427 3 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.25505 4 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.07417 5 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.26863 6 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.39955 7 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.23678 8 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.34402 9 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.36105 10 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.11822 11 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.39443 12 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.54920 13 | -------------------------------------------------------------------------------- /logs/ssd300.fp16.b16.k256.pytorch.qdrep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pbridger/tensorrt-ssd300-8bit-quantized/c92fffcf6c0a618970981ce5e88c06aab0c7934e/logs/ssd300.fp16.b16.k256.pytorch.qdrep -------------------------------------------------------------------------------- /logs/ssd300.fp16.b16.k256.trt.bench: -------------------------------------------------------------------------------- 1 | 500 batches, 8000 images, 8.59 seconds total 2 | 1.1 ms per image 3 | 931.1 FPS 4 | -------------------------------------------------------------------------------- /logs/ssd300.fp16.b16.k256.trt.coco: -------------------------------------------------------------------------------- 1 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.25022 2 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.42477 3 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.25491 4 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.07376 5 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.26836 6 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.39985 7 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.23762 8 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.34404 9 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.36037 10 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.11687 11 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.39541 12 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.54943 13 | -------------------------------------------------------------------------------- /logs/ssd300.fp16.b16.k256.trt.qdrep: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/pbridger/tensorrt-ssd300-8bit-quantized/c92fffcf6c0a618970981ce5e88c06aab0c7934e/logs/ssd300.fp16.b16.k256.trt.qdrep -------------------------------------------------------------------------------- /logs/ssd300.fp32.b16.k256.pytorch.bench: -------------------------------------------------------------------------------- 1 | 500 batches, 8000 images, 31.43 seconds total 2 | 3.9 ms per image 3 | 254.5 FPS 4 | -------------------------------------------------------------------------------- /logs/ssd300.fp32.b16.k256.pytorch.coco: -------------------------------------------------------------------------------- 1 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.25041 2 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.42413 3 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.25521 4 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.07433 5 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.26849 6 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.40030 7 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.23688 8 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.34397 9 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.36104 10 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.11849 11 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.39450 12 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.54971 13 | -------------------------------------------------------------------------------- /logs/ssd300.fp32.b16.k256.pytorch.qdrep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pbridger/tensorrt-ssd300-8bit-quantized/c92fffcf6c0a618970981ce5e88c06aab0c7934e/logs/ssd300.fp32.b16.k256.pytorch.qdrep -------------------------------------------------------------------------------- /logs/ssd300.fp32.b16.k256.trt.coco: -------------------------------------------------------------------------------- 1 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.25040 2 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.42430 3 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.25516 4 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.07426 5 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.26855 6 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.40043 7 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.23687 8 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.34343 9 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.35993 10 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.11745 11 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.39354 12 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.54890 13 | -------------------------------------------------------------------------------- /logs/ssd300.fp32.b16.k256.trt.qdrep: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pbridger/tensorrt-ssd300-8bit-quantized/c92fffcf6c0a618970981ce5e88c06aab0c7934e/logs/ssd300.fp32.b16.k256.trt.qdrep -------------------------------------------------------------------------------- /logs/ssd300.int8.b16.k256.trt.coco: -------------------------------------------------------------------------------- 1 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.24766 2 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.42047 3 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.25266 4 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.07075 5 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.26687 6 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.40063 7 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.23473 8 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.33937 9 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.35550 10 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.11317 11 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.38830 12 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.54646 13 | -------------------------------------------------------------------------------- /logs/ssd300.int8.b16.k256.trt.qdrep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pbridger/tensorrt-ssd300-8bit-quantized/c92fffcf6c0a618970981ce5e88c06aab0c7934e/logs/ssd300.int8.b16.k256.trt.qdrep -------------------------------------------------------------------------------- /logs/subscript_assignment: -------------------------------------------------------------------------------- 1 | [TensorRT] WARNING: /build/TensorRT/parsers/onnx/onnx2trt_utils.cpp:220: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32. 
2 | [TensorRT] WARNING: /build/TensorRT/parsers/onnx/onnx2trt_utils.cpp:246: One or more weights outside the range of INT32 was clamped
3 | [TensorRT] ERROR: INVALID_ARGUMENT: getPluginCreator could not find plugin ScatterND version 1
4 | exporting SubscriptAssign to models/subscript_assign.onnx
5 | compiling models/subscript_assign.onnx with TensorRT
6 | 
--------------------------------------------------------------------------------
/masking.py:
--------------------------------------------------------------------------------
1 | import sys, io
2 | import torch
3 | import tensorrt as trt
4 | 
5 | class Masking(torch.nn.Module):
6 |     def __init__(self):
7 |         super().__init__()
8 | 
9 |     def forward(self, X):
10 |         X = X[X.sum(dim=-1) > 0]
11 |         return X
12 | 
13 | 
14 | if __name__ == '__main__':
15 |     m = Masking()
16 |     onnx_filename = 'models/masking.onnx'
17 | 
18 |     print('exporting Masking to', onnx_filename)
19 |     torch.onnx.export(
20 |         m,
21 |         torch.randn((10, 10)),
22 |         onnx_filename,
23 |         opset_version=11
24 |     )
25 | 
26 |     print('compiling', onnx_filename, 'with TensorRT')
27 |     logger = trt.Logger()
28 |     network_flags = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
29 | 
30 |     with trt.Builder(logger) as builder, builder.create_network(network_flags) as network, trt.OnnxParser(network, logger) as parser:
31 |         if not parser.parse(open(onnx_filename, 'rb').read()):
32 |             sys.exit(1)
33 |         engine = builder.build_cuda_engine(network)
34 | 
--------------------------------------------------------------------------------
/models/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pbridger/tensorrt-ssd300-8bit-quantized/c92fffcf6c0a618970981ce5e88c06aab0c7934e/models/.keep
--------------------------------------------------------------------------------
/optrec.py:
--------------------------------------------------------------------------------
1 | import sys, json
2 | 
3 | _, input_path, output_path = sys.argv
4 | 
5 | header, *lines = open(input_path).readlines()
6 | fout = open(output_path, 'w')
7 | 
8 | fout.write(header)
9 | 
10 | min_gap = 0.1
11 | max_gap = 1.0
12 | 
13 | prev_offset = None
14 | prev_text = None
15 | running_shortcut = 0.
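# The loop below compacts an asciinema v2 recording: after the header, each
# line is a JSON event [offset, mode, text]. Events arriving within min_gap
# seconds of the pending event are merged into it, and any pause longer than
# max_gap seconds is clipped to max_gap, with the excess accumulated in
# running_shortcut and subtracted from every later timestamp.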
16 | 
17 | for line in lines:
18 |     offset, mode, text = json.loads(line)
19 |     if prev_offset is None:
20 |         prev_offset = offset
21 |         prev_text = text
22 |     elif offset - prev_offset < min_gap:
23 |         prev_text += text
24 |     else:
25 |         fout.write(json.dumps([prev_offset - running_shortcut, 'o', prev_text]) + '\n')
26 | 
27 |         if offset - prev_offset > max_gap:
28 |             running_shortcut += (offset - prev_offset) - max_gap
29 | 
30 |         prev_offset = offset
31 |         prev_text = text
32 | 
33 | if prev_offset is not None:
34 |     fout.write(json.dumps([prev_offset - running_shortcut, 'o', prev_text]) + '\n')
35 | 
36 | 
--------------------------------------------------------------------------------
/ssd300_baseline.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import contextlib
3 | import math, itertools
4 | import torch, torchvision
5 | 
6 | 
7 | # context manager to help keep track of ranges of time using NVTX
8 | @contextlib.contextmanager
9 | def nvtx_range(msg):
10 |     depth = torch.cuda.nvtx.range_push(msg)
11 |     try:
12 |         yield depth
13 |     finally:
14 |         torch.cuda.nvtx.range_pop()
15 | 
16 | 
17 | def init_dboxes(model_dtype):
18 |     'adapted from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/Detection/SSD/src/utils.py'
19 |     fig_size = 300
20 |     feat_size = [38, 19, 10, 5, 3, 1]
21 |     steps = [8, 16, 32, 64, 100, 300]
22 |     scales = [21, 45, 99, 153, 207, 261, 315]
23 |     aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
24 | 
25 |     fk = fig_size / torch.tensor(steps).float()
26 | 
27 |     dboxes = []
28 |     # size of feature and number of feature
29 |     for idx, sfeat in enumerate(feat_size):
30 |         sk1 = scales[idx] / fig_size
31 |         sk2 = scales[idx + 1] / fig_size
32 |         sk3 = math.sqrt(sk1 * sk2)
33 |         all_sizes = [(sk1, sk1), (sk3, sk3)]
34 | 
35 |         for alpha in aspect_ratios[idx]:
36 |             w, h = sk1 * math.sqrt(alpha), sk1 / math.sqrt(alpha)
37 |             all_sizes.append((w, h))
38 |             all_sizes.append((h, w))
39 | 
40 |         for w, h in all_sizes:
41 |             for i, j in itertools.product(range(sfeat), repeat=2):
42 |                 cx, cy = (j + 0.5) / fk[idx], (i + 0.5) / fk[idx]
43 |                 dboxes.append((cx, cy, w, h))
44 | 
45 |     return torch.tensor(
46 |         dboxes,
47 |         dtype=model_dtype,
48 |         device='cuda'
49 |     ).clamp(0, 1)
50 | 
51 | 
52 | class SSD300(torch.nn.Module):
53 |     def __init__(self, detection_threshold, model_precision, batch_dim):
54 |         super().__init__()
55 |         self.detector = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_ssd', model_math=model_precision).eval()
56 |         self.detection_threshold = torch.nn.Parameter(torch.tensor(detection_threshold), requires_grad=False)
57 |         self.model_dtype = torch.float16 if model_precision == 'fp16' else torch.float32
58 |         self.batch_dim = batch_dim
59 |         self.class_dim = 81
60 |         self.scale_xy = 0.1
61 |         self.scale_wh = 0.2
62 |         self.dboxes_xywh = torch.nn.Parameter(init_dboxes(self.model_dtype).unsqueeze(dim=0), requires_grad=False)
63 |         self.box_dim = self.dboxes_xywh.size(1)
64 |         self.buffer_nchw = torch.nn.Parameter(torch.zeros((batch_dim, 3, 300, 300), dtype=self.model_dtype), requires_grad=False)
65 |         self.class_dim_tensor = torch.nn.Parameter(torch.tensor([self.class_dim]), requires_grad=False)
66 |         self.class_indexes = torch.nn.Parameter(torch.arange(self.class_dim).repeat(self.batch_dim * self.box_dim), requires_grad=False)
67 |         self.image_indexes = torch.nn.Parameter(
68 |             (torch.ones(self.box_dim * self.class_dim) * torch.arange(1, self.batch_dim + 1).unsqueeze(-1)).view(-1),
69 |             requires_grad=False
70 |         )
71 | 
72 |     def preprocess(self, 
73 |         'normalize'
74 |         with nvtx_range('preprocess'):
75 |             # Nvidia SSD300 code normalizes with mean 128 and std-dev 256, i.e. maps [0, 255] into [-1, 1]
76 |             return (2 * (image_nchw.to(self.model_dtype) / 255) - 1)
77 | 
78 |     def forward(self, image_nchw):
79 |         image_batch = self.preprocess(image_nchw)
80 |         locs, labels = self.detector(image_batch)
81 |         return self.postprocess(locs, labels)
82 | 
83 |     def xywh_to_xyxy(self, bboxes_batch, scores_batch):
84 |         bboxes_batch = bboxes_batch.permute(0, 2, 1)
85 |         scores_batch = scores_batch.permute(0, 2, 1)
86 | 
87 |         bboxes_batch[:, :, :2] = self.scale_xy * bboxes_batch[:, :, :2]
88 |         bboxes_batch[:, :, 2:] = self.scale_wh * bboxes_batch[:, :, 2:]
89 | 
90 |         bboxes_batch[:, :, :2] = bboxes_batch[:, :, :2] * self.dboxes_xywh[:, :, 2:] + self.dboxes_xywh[:, :, :2]
91 |         bboxes_batch[:, :, 2:] = bboxes_batch[:, :, 2:].exp() * self.dboxes_xywh[:, :, 2:]
92 | 
93 |         # transform format to ltrb
94 |         l, t, r, b = bboxes_batch[:, :, 0] - 0.5 * bboxes_batch[:, :, 2],\
95 |             bboxes_batch[:, :, 1] - 0.5 * bboxes_batch[:, :, 3],\
96 |             bboxes_batch[:, :, 0] + 0.5 * bboxes_batch[:, :, 2],\
97 |             bboxes_batch[:, :, 1] + 0.5 * bboxes_batch[:, :, 3]
98 | 
99 |         bboxes_batch[:, :, 0] = l
100 |         bboxes_batch[:, :, 1] = t
101 |         bboxes_batch[:, :, 2] = r
102 |         bboxes_batch[:, :, 3] = b
103 | 
104 |         return bboxes_batch, torch.nn.functional.softmax(scores_batch, dim=-1)
105 | 
106 |     def postprocess(self, locs, labels):
107 |         with nvtx_range('postprocess'):
108 |             locs, probs = self.xywh_to_xyxy(locs, labels)
109 | 
110 |             # flatten batch and classes
111 |             # Exporting the operator repeat_interleave to ONNX opset version 11 is not supported.
112 |             flat_locs = locs.reshape(-1, 4).repeat_interleave(self.class_dim_tensor, dim=0)
113 |             flat_probs = probs.view(-1)
114 | 
115 |             # only do NMS on detections over threshold, and ignore background (0)
116 |             threshold_mask = (flat_probs > self.detection_threshold) & (self.class_indexes > 0)
117 | 
118 |             flat_locs = flat_locs[threshold_mask]
119 |             flat_probs = flat_probs[threshold_mask]
120 |             class_indexes = self.class_indexes[threshold_mask]
121 |             image_indexes = self.image_indexes[threshold_mask]
122 | 
123 |             nms_mask = torchvision.ops.boxes.batched_nms(
124 |                 flat_locs,
125 |                 flat_probs,
126 |                 image_indexes * self.class_dim + class_indexes,  # collision-free group id: one NMS group per (image, class) pair
127 |                 iou_threshold=0.7
128 |             )
129 | 
130 |             bboxes = flat_locs[nms_mask]
131 |             probs = flat_probs[nms_mask]
132 |             class_indexes = class_indexes[nms_mask]
133 |             return bboxes, probs, class_indexes
134 | 
135 | 
136 | if __name__ == '__main__':
137 |     batch_dim = 16
138 |     model = SSD300(0.05, 'fp32', batch_dim).to('cpu').eval()
139 |     torch.onnx.export(
140 |         model,
141 |         torch.randn((batch_dim, 3, 300, 300)),
142 |         'models/ssd300.baseline.onnx',
143 |         opset_version=11
144 |     )
145 | 
146 | 
--------------------------------------------------------------------------------
/ssd300_trt.py:
--------------------------------------------------------------------------------
1 | import os, sys, time
2 | import math
3 | import io, queue, threading
4 | from pprint import pprint
5 | 
6 | import numpy as np
7 | import torch, torchvision
8 | from pycocotools.cocoeval import COCOeval
9 | 
10 | import tensorrt as trt
11 | import onnx
12 | from onnx import shape_inference, helper, TensorProto
13 | import onnx_graphsurgeon as gs
14 | 
15 | import pycuda.driver as cuda
16 | import pycuda.autoinit
17 | 
18 | from data import get_val_dataloader, get_coco_ground_truth, init_dboxes
19 | import gpuplot
20 | 
21 | 
22 | class Int8Calibrator(trt.IInt8EntropyCalibrator2):
23 |     def __init__(self,
args): 24 | super().__init__() 25 | self.batch_dim = args.batch_dim 26 | self.dataloader = iter(get_val_dataloader(args)) 27 | self.current_batch = None # for ref-counting 28 | self.cache_path = 'calibration.cache' 29 | 30 | def get_batch_size(self): 31 | return self.batch_dim 32 | 33 | def get_batch(self, tensor_names): 34 | # assume same order as in dataset 35 | try: 36 | tensor_nchw, _, heights_widths, _, r_e = next(self.dataloader) 37 | self.current_batch = tensor_nchw.cuda(), heights_widths[0].cuda(), heights_widths[1].cuda() 38 | return [t.data_ptr() for t in self.current_batch] 39 | except StopIteration: 40 | return None 41 | 42 | def read_calibration_cache(self): 43 | if os.path.exists(self.cache_path): 44 | with open(self.cache_path, 'rb') as f: 45 | return f.read() 46 | 47 | def write_calibration_cache(self, cache): 48 | with open(self.cache_path, 'wb') as f: 49 | f.write(cache) 50 | 51 | 52 | 53 | class SSD300(torch.nn.Module): 54 | def __init__(self, topk, detection_threshold, iou_threshold, model_precision, batch_dim, trt_path=None, onnx_export=False): 55 | super().__init__() 56 | self.topk = torch.nn.Parameter(torch.tensor(topk, dtype=torch.int32), requires_grad=False) 57 | self.detection_threshold = torch.nn.Parameter(torch.tensor(detection_threshold), requires_grad=False) 58 | self.model_dtype = torch.float16 if model_precision == 'fp16' else torch.float32 59 | self.batch_dim = batch_dim 60 | self.class_dim = 81 61 | self.foreground_class_dim = self.class_dim - 1 62 | self.scale_xy = 0.1 63 | self.scale_wh = 0.2 64 | self.scale_xyxywhwh = torch.nn.Parameter(torch.tensor([ 65 | self.scale_xy, 66 | self.scale_xy, 67 | self.scale_wh, 68 | self.scale_wh 69 | ]), requires_grad=False) 70 | self.scale_wh_delta = torch.nn.Parameter(torch.tensor([-0.5, -0.5, 0.5, 0.5]), requires_grad=False) 71 | self.iou_threshold = iou_threshold 72 | self.dboxes_xywh = torch.nn.Parameter(init_dboxes(self.model_dtype).unsqueeze(dim=0), requires_grad=False) 73 | self.box_dim = torch.nn.Parameter(torch.tensor(self.dboxes_xywh.size(1)), requires_grad=False) 74 | self.buffer_nchw = torch.nn.Parameter(torch.zeros((batch_dim, 3, 300, 300), dtype=self.model_dtype), requires_grad=False) 75 | self.class_indexes = torch.nn.Parameter(torch.arange(1, self.class_dim).repeat(self.batch_dim * self.topk), requires_grad=False) 76 | self.image_indexes = torch.nn.Parameter( 77 | (torch.ones(self.topk * self.foreground_class_dim, dtype=torch.int32) * torch.arange(self.batch_dim).unsqueeze(-1)).view(-1), 78 | requires_grad=False 79 | ) 80 | self.onnx_export = onnx_export 81 | self.trt_engine = None 82 | if trt_path: 83 | print('loading TRT engine from', trt_path, '...') 84 | self.trt_logger = trt.Logger() 85 | trt.init_libnvinfer_plugins(self.trt_logger, '') 86 | with open(trt_path, 'rb') as f, trt.Runtime(self.trt_logger) as runtime: 87 | self.trt_engine = runtime.deserialize_cuda_engine(f.read()) 88 | self.trt_stream = cuda.Stream() 89 | self.trt_context = self.trt_engine.create_execution_context() 90 | else: 91 | self.detector = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_ssd', model_math=model_precision).eval() 92 | 93 | 94 | def forward(self, tensor_nchw, image_heights, image_widths): 95 | if self.onnx_export: 96 | return self.forward_pytorch(tensor_nchw, image_heights, image_widths) 97 | else: 98 | return self.forward_coco(tensor_nchw, image_heights, image_widths) 99 | 100 | 101 | def forward_pytorch(self, tensor_nchw, image_heights, image_widths): 102 | locs, scores = self.detector(tensor_nchw) 
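# Tensor shapes through the steps below, assuming the stock NVIDIA SSD300
# head (8732 prior boxes, 81 classes including background) -- these numbers
# follow from init_dboxes and class_dim, not from inspecting this exact
# checkpoint:
#   locs:   (N, 4, 8732)  -> permute -> (N, 8732, 4)   box regression offsets
#   scores: (N, 81, 8732) -> permute -> (N, 8732, 81)  -> softmax over classes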
103 |         locs = locs.permute(0, 2, 1)
104 |         locs = self.rescale_locs(locs)
105 | 
106 |         scores = scores.permute(0, 2, 1)
107 |         probs = torch.nn.functional.softmax(scores, dim=-1)
108 | 
109 |         locs, probs = self.reshape_for_topk(locs, probs)
110 |         bboxes = self.locs_to_xyxy(locs, image_heights, image_widths)
111 |         return bboxes, probs
112 | 
113 | 
114 |     def forward_trt(self, tensor_nchw, image_heights, image_widths):
115 |         trt_outputs, bindings = [], []
116 |         np_to_torch_type = {
117 |             np.float32: torch.float32,
118 |             np.float16: torch.float16,
119 |             np.int32: torch.int32,
120 |             np.int64: torch.int64,
121 |         }
122 |         inputs = {'tensor_nchw': tensor_nchw, 'image_heights': image_heights, 'image_widths': image_widths}  # look up inputs by engine binding name
123 |         for binding_name in self.trt_engine:
124 |             shape = self.trt_engine.get_binding_shape(binding_name)
125 |             dtype = trt.nptype(self.trt_engine.get_binding_dtype(binding_name))
126 |             torch_type = np_to_torch_type[dtype]
127 | 
128 |             if self.trt_engine.binding_is_input(binding_name):
129 |                 torch_input = inputs[binding_name].to(torch_type)
130 |                 bindings.append(int(torch_input.data_ptr()))
131 |             else:
132 |                 torch_output = torch.zeros(tuple(shape), dtype=torch_type, device='cuda')
133 |                 trt_outputs.append(torch_output)
134 |                 bindings.append(int(torch_output.data_ptr()))
135 | 
136 |         self.trt_context.execute_async_v2(bindings=bindings, stream_handle=self.trt_stream.handle)
137 |         self.trt_stream.synchronize()
138 | 
139 |         return trt_outputs
140 | 
141 | 
142 |     def trt_postprocess(self, batch_dim, num_detections, bboxes, probs, class_indexes):
143 |         # select valid detections and flatten batch/box/class dimensions
144 |         num_detections = num_detections.expand(-1, self.topk)
145 |         detection_mask = num_detections > torch.arange(self.topk, dtype=torch.int32, device='cuda').unsqueeze(0)  # broadcasts over the batch dim
146 | 
147 |         probs = probs.masked_select(detection_mask)
148 |         class_indexes = self.class_indexes[class_indexes.to(torch.int64)].masked_select(detection_mask)
149 | 
150 |         image_indexes = torch.arange(batch_dim, dtype=torch.int64, device='cuda').unsqueeze(-1).expand(-1, self.topk)
151 |         image_indexes = image_indexes.masked_select(detection_mask)
152 | 
153 |         bboxes = bboxes.masked_select(detection_mask.unsqueeze(-1).expand_as(bboxes))
154 |         bboxes = bboxes.unsqueeze(-1).reshape(-1, 4)
155 | 
156 |         return bboxes, probs, class_indexes, image_indexes
157 | 
158 | 
159 |     def forward_coco(self, tensor_nchw, image_heights, image_widths):
160 |         if self.trt_engine:
161 |             bboxes, probs, class_indexes, image_indexes = self.trt_postprocess(
162 |                 tensor_nchw.size(0),
163 |                 *self.forward_trt(tensor_nchw, image_heights, image_widths)
164 |             )
165 |         else:
166 |             bboxes, probs = self.forward_pytorch(tensor_nchw, image_heights, image_widths)
167 |             bboxes, probs, class_indexes, image_indexes = self.topk_and_nms(bboxes, probs)
168 |         return self.xyxy_to_xywh(bboxes), probs, class_indexes, image_indexes
169 | 
170 | 
171 |     def rescale_locs(self, locs):
172 |         locs *= self.scale_xyxywhwh
173 | 
174 |         xy = locs[:, :, :2] * self.dboxes_xywh[:, :, 2:] + self.dboxes_xywh[:, :, :2]
175 |         wh = locs[:, :, 2:].exp() * self.dboxes_xywh[:, :, 2:]
176 | 
177 |         wh_delta = torch.cat([wh, wh], dim=-1) * self.scale_wh_delta
178 |         cxycxy = torch.cat([xy, xy], dim=-1)
179 |         return cxycxy + wh_delta
180 | 
181 | 
182 |     def reshape_for_topk(self, locs, probs):
183 |         locs = locs.unsqueeze(-2)
184 |         locs = locs.expand(locs.size(0), self.box_dim, self.foreground_class_dim, locs.size(3))
185 |         probs = probs[:, :, 1:]
186 |         return locs, probs
187 | 
188 | 
189 |     def topk_and_nms(self, locs, probs):
190 |         probs, top_prob_indexes = probs.topk(self.topk,
dim=1) 191 | flat_probs = probs.reshape(-1).contiguous() 192 | 193 | locs = locs.gather(1, top_prob_indexes.unsqueeze(-1).expand(*top_prob_indexes.size(), 4)) 194 | flat_locs = locs.reshape(-1, 4).contiguous() 195 | 196 | # only do NMS on detections over threshold 197 | threshold_mask = flat_probs > self.detection_threshold 198 | 199 | flat_locs = flat_locs[threshold_mask] 200 | flat_probs = flat_probs[threshold_mask] 201 | class_indexes = self.class_indexes[threshold_mask] 202 | image_indexes = self.image_indexes[threshold_mask] 203 | 204 | nms_mask = torchvision.ops.boxes.batched_nms( 205 | flat_locs, 206 | flat_probs, 207 | class_indexes * (image_indexes + 1), # do not multiply class_indexes by 0 208 | iou_threshold=self.iou_threshold 209 | ) 210 | 211 | return ( 212 | flat_locs[nms_mask], 213 | flat_probs[nms_mask], 214 | class_indexes[nms_mask], 215 | image_indexes[nms_mask] 216 | ) 217 | 218 | 219 | def locs_to_xyxy(self, locs, image_heights, image_widths): 220 | image_heights = image_heights.reshape(-1, 1, 1, 1) 221 | image_widths = image_widths.reshape(-1, 1, 1, 1) 222 | 223 | image_wh = torch.cat([image_widths, image_heights], dim=-1) 224 | 225 | xy = locs[:, :, :, 0:2] * image_wh 226 | wh = (locs[:, :, :, 2:4] - locs[:, :, :, 0:2]) * image_wh # surely this could just be locs[:, :, :, 2:4] * image_wh and then return cat([xy, xy2])? 227 | 228 | return torch.cat([xy, xy + wh], dim=-1) 229 | 230 | 231 | def xyxy_to_xywh(self, xyxy): 232 | return torch.cat([xyxy[:, :2], xyxy[:, 2:] - xyxy[:, :2]], dim=-1) 233 | 234 | 235 | def eval_coco(args): 236 | device = torch.device(args.device) 237 | 238 | model = SSD300( 239 | args.topk, args.detection_threshold, args.iou_threshold, args.precision, args.batch_dim, args.trt_path 240 | ).to(device).eval() 241 | 242 | dataloader = get_val_dataloader(args) 243 | inv_map = {v: k for k, v in dataloader.dataset.label_map.items()} 244 | 245 | coco_ground_truth = get_coco_ground_truth(args) 246 | 247 | results = None 248 | start = time.time() 249 | 250 | for nbatch, (X, img_id, img_size, _, _) in enumerate(dataloader): 251 | print('Inference batch: {}/{}'.format(nbatch, len(dataloader)), end='\r') 252 | with torch.no_grad(): 253 | batch_dim = X.size(0) 254 | if args.precision == 'fp16': 255 | X = X.to(torch.float16) 256 | X = X.to(device) 257 | image_heights, image_widths = [i.to(device) for i in img_size] 258 | 259 | if batch_dim < args.batch_dim: 260 | num_pad = args.batch_dim - batch_dim 261 | X = torch.cat([X, X[-1].expand(num_pad, *X[-1].size())], dim=0) 262 | image_heights = torch.cat([image_heights, image_heights[-1].repeat(num_pad)], dim=0) 263 | image_widths = torch.cat([image_widths, image_widths[-1].repeat(num_pad)], dim=0) 264 | 265 | bboxes, probs, class_indexes, image_indexes = model.forward_coco(X, image_heights, image_widths) 266 | 267 | # filter out pad results 268 | small_batch_filter = image_indexes < batch_dim 269 | bboxes = bboxes[small_batch_filter] 270 | probs = probs[small_batch_filter] 271 | class_indexes = class_indexes[small_batch_filter] 272 | image_indexes = image_indexes[small_batch_filter] 273 | 274 | mapped_labels = class_indexes.to('cpu') 275 | mapped_labels.apply_(lambda i: inv_map[i]) 276 | image_ids = img_id[image_indexes] 277 | 278 | batch_results = torch.cat([ 279 | image_ids.cpu().unsqueeze(-1), 280 | bboxes.cpu(), 281 | probs.cpu().unsqueeze(-1), 282 | mapped_labels.unsqueeze(-1) 283 | ], dim=1) 284 | 285 | if results is not None: 286 | results = torch.cat([results, batch_results], dim=0) 287 | else: 288 | results 
= batch_results 289 | 290 | print() 291 | print(f'DONE (t={time.time() - start:.2f}).') 292 | 293 | results = results.numpy().astype(np.float32) 294 | 295 | coco_detections = coco_ground_truth.loadRes(results) 296 | 297 | E = COCOeval(coco_ground_truth, coco_detections, iouType='bbox') 298 | E.evaluate() 299 | E.accumulate() 300 | stdout = sys.stdout 301 | try: 302 | if args.output_path: 303 | sys.stdout = open(args.output_path, 'w') 304 | E.summarize() 305 | finally: 306 | if args.output_path: 307 | sys.stdout.close() 308 | sys.stdout = stdout 309 | print('mAP: {:.5f}'.format(E.stats[0])) 310 | 311 | 312 | def export_engine(args): 313 | onnx_module = build_onnx(args) 314 | build_trt_engine(onnx_module, args) 315 | 316 | 317 | def build_onnx(args): 318 | device = torch.device('cpu') 319 | val_dataloader = get_val_dataloader(args) 320 | 321 | for nbatch, (X, img_id, img_size, _, _) in enumerate(val_dataloader): 322 | inputs = X, img_size[0], img_size[1] 323 | break 324 | 325 | model = SSD300(args.topk, args.detection_threshold, args.iou_threshold, 'fp32', args.batch_dim, None, onnx_export=True).to(device).eval() 326 | 327 | onnx_buf = io.BytesIO() 328 | torch.onnx.export( 329 | model, 330 | inputs, 331 | onnx_buf, 332 | input_names=('tensor_nchw', 'image_heights', 'image_widths'), 333 | output_names=('bboxes', 'probs'), 334 | opset_version=11, 335 | export_params=True 336 | ) 337 | onnx_buf.seek(0) 338 | onnx_module = shape_inference.infer_shapes(onnx.load(onnx_buf)) 339 | 340 | while len(onnx_module.graph.output): 341 | onnx_module.graph.output.remove(onnx_module.graph.output[0]) 342 | onnx_module.graph.output.extend([ 343 | helper.make_tensor_value_info('num_detections', TensorProto.INT32, [-1]), 344 | helper.make_tensor_value_info('nms_bboxes', TensorProto.FLOAT, [-1, -1, -1]), 345 | helper.make_tensor_value_info('nms_probs', TensorProto.FLOAT, [-1, -1]), 346 | helper.make_tensor_value_info('nms_classes', TensorProto.FLOAT, [-1, -1]), 347 | ]) 348 | 349 | graph = gs.import_onnx(onnx_module) 350 | 351 | attrs = { 352 | 'shareLocation': False, 353 | 'numClasses': 80, 354 | 'backgroundLabelId': -1, 355 | 'topK': args.topk, # per-class, pre NMS 356 | 'keepTopK': args.topk, # across-classes, per image 357 | 'scoreThreshold': args.detection_threshold, 358 | 'iouThreshold': args.iou_threshold, 359 | 'isNormalized': False, 360 | 'clipBoxes': False, 361 | } 362 | 363 | ts = graph.tensors() 364 | 365 | nms_layer = graph.layer( 366 | op='BatchedNMSDynamic_TRT', 367 | attrs=attrs, 368 | inputs=[ts['bboxes'], ts['probs']], 369 | outputs=[ts['num_detections'], ts['nms_bboxes'], ts['nms_probs'], ts['nms_classes']] 370 | ) 371 | 372 | graph.cleanup() 373 | graph.toposort() 374 | 375 | onnx_module = gs.export_onnx(graph) 376 | onnx_path = os.path.splitext(args.trt_path)[0] + '.onnx' 377 | print('saving ONNX model to', onnx_path) 378 | onnx.save(onnx_module, onnx_path) 379 | return onnx_module 380 | 381 | 382 | 383 | def build_trt_engine(onnx_module, args): 384 | logger = trt.Logger() 385 | 386 | network_flags = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) 387 | 388 | with trt.Builder(logger) as builder, builder.create_network(network_flags) as network, trt.OnnxParser(network, logger) as parser: 389 | builder.max_workspace_size = 2 ** 31 # 2 GB 390 | builder.max_batch_size = args.batch_dim 391 | builder.fp16_mode = args.precision != 'fp32' 392 | if args.precision == 'int8': 393 | builder.int8_mode = True 394 | builder.int8_calibrator = Int8Calibrator(args) 395 | 396 | print('parsing 
ONNX...')
397 |         onnx_buf = io.BytesIO()
398 |         onnx.save(onnx_module, onnx_buf)
399 |         onnx_buf.seek(0)
400 |         if not parser.parse(onnx_buf.read()):
401 |             print(parser.num_errors, 'parser errors:')
402 |             for i in range(parser.num_errors):
403 |                 print(parser.get_error(i))
404 |             return None  # abort: cannot build an engine from a partial parse
405 |         print('inputs:')
406 |         inputs = {
407 |             t.name: t.shape
408 |             for t in [
409 |                 network.get_input(i)
410 |                 for i in range(network.num_inputs)
411 |             ]
412 |         }
413 |         pprint(inputs)
414 |         print('outputs:')
415 |         outputs = {
416 |             t.name: t.shape
417 |             for t in [
418 |                 network.get_output(i)
419 |                 for i in range(network.num_outputs)
420 |             ]
421 |         }
422 |         pprint(outputs)
423 | 
424 |         print('building CUDA engine...')
425 |         engine = builder.build_cuda_engine(network)
426 |         if engine:
427 |             print('saving CUDA engine to', args.trt_path)
428 |             with open(args.trt_path, 'wb') as mf:
429 |                 mf.write(engine.serialize())
430 | 
431 |         return engine
432 | 
433 | 
434 | 
435 | def benchmark(args):
436 |     app_start = time.time()
437 | 
438 |     prewarm_iters = 50
439 |     bench_secs = 10
440 | 
441 |     val_dataloader = get_val_dataloader(args)
442 | 
443 |     for nbatch, (tensor_nchw, img_id, (image_heights, image_widths), _, _) in enumerate(val_dataloader):
444 |         tensor_nchw, image_heights, image_widths = [t.to('cuda') for t in (tensor_nchw, image_heights, image_widths)]
445 |         break
446 | 
447 |     batch_dim = tensor_nchw.size(0)
448 | 
449 |     update_fps, plot_thread = gpuplot.bg_plot(
450 |         num_gpus=args.num_devices,
451 |         sample_hz=5,
452 |     )
453 | 
454 |     max_times = 10
455 |     batch_times = []
456 |     last_update = time.time()
457 |     update_period = 0.5
458 | 
459 |     if args.runtime == 'pytorch':
460 |         print(f'Runtime: PyTorch\nPrecision: {args.precision}\nBatch-dim: {args.batch_dim}\nTop-k: {args.topk}')
461 |         model = SSD300(args.topk, args.detection_threshold, args.iou_threshold, args.precision, args.batch_dim, args.trt_path)
462 |         model = model.eval().to('cuda')
463 | 
464 |         if args.precision == 'fp16':
465 |             tensor_nchw, image_heights, image_widths = [t.to(torch.float16) for t in (tensor_nchw, image_heights, image_widths)]
466 | 
467 |         plot_thread.start()
468 | 
469 |         print('Prewarming model')
470 |         for i in range(prewarm_iters):
471 |             model(tensor_nchw, image_heights, image_widths)
472 |             batch_times = (batch_times + [time.time()])[-max_times:]
473 | 
474 |         print(f'Beginning benchmark (+{time.time() - app_start:.1f})...')
475 |         start_time = time.time()
476 | 
477 |         bench_iters = 0
478 |         while True:
479 |             model(tensor_nchw, image_heights, image_widths)
480 |             batch_times = (batch_times + [time.time()])[-max_times:]
481 |             if batch_times[-1] > last_update + update_period and len(batch_times) > 1:
482 |                 last_update = batch_times[-1]
483 |                 update_fps(args.batch_dim * (len(batch_times) - 1) / (batch_times[-1] - batch_times[0]))
484 |             bench_iters += 1
485 |             if time.time() > start_time + bench_secs:
486 |                 break
487 | 
488 |     elif args.runtime == 'trt':
489 |         print(f'Runtime: TensorRT\nPrecision: {args.precision}\nBatch-dim: {args.batch_dim}\nTop-k: {args.topk}')
490 |         np_to_torch_type = {
491 |             np.float32: torch.float32,
492 |             np.float16: torch.float16,
493 |             np.int32: torch.int32,
494 |             np.int64: torch.int64,
495 |         }
496 | 
497 |         devices = [cuda.Device(i) for i in range(args.num_devices)]
498 |         contexts = [devices[i].make_context() for i in range(args.num_devices)]
499 | 
500 |         for d in devices:
501 |             pycuda.autoinit.context.pop()  # make_context pushed each new context; pop them all so each is pushed explicitly below
502 | 
503 |         context_detail = []
504 | 
505 |         for device_id, context in enumerate(contexts):
506 |             context.push()
507 |             try:
508 |                 torch_device = torch.device('cuda', device_id)
509 |                 streams = [cuda.Stream() for i in range(args.num_streams_per_device)]
510 | 
511 |                 tensors = {
512 |                     name: t.clone().to(torch_device)
513 |                     for name, t in [
514 |                         ('tensor_nchw', tensor_nchw),
515 |                         ('image_heights', image_heights),
516 |                         ('image_widths', image_widths)
517 |                     ]
518 |                 }
519 | 
520 |                 model = SSD300(args.topk, args.detection_threshold, args.iou_threshold, args.precision, args.batch_dim, args.trt_path)
521 | 
522 |                 trt_outputs, bindings = [[] for i in range(args.num_streams_per_device)], [[] for i in range(args.num_streams_per_device)]
523 | 
524 |                 for stream_id in range(args.num_streams_per_device):
525 |                     for binding_name in model.trt_engine:
526 |                         shape = model.trt_engine.get_binding_shape(binding_name)
527 |                         dtype = trt.nptype(model.trt_engine.get_binding_dtype(binding_name))
528 |                         torch_type = np_to_torch_type[dtype]
529 | 
530 |                         if model.trt_engine.binding_is_input(binding_name):
531 |                             torch_input = tensors[binding_name].to(torch_type)
532 |                             bindings[stream_id].append(int(torch_input.data_ptr()))
533 |                         else:
534 |                             torch_output = torch.zeros(tuple(shape), dtype=torch_type, device=torch_device)
535 |                             trt_outputs[stream_id].append(torch_output)
536 |                             bindings[stream_id].append(int(torch_output.data_ptr()))
537 | 
538 |                 context_detail.append({
539 |                     'streams': streams,
540 |                     'model': model,
541 |                     'trt_outputs': trt_outputs,
542 |                     'bindings': bindings
543 |                 })
544 | 
545 |             finally:
546 |                 context.pop()
547 | 
548 |         event_queue = queue.Queue(args.num_devices * args.num_streams_per_device)
549 | 
550 |         def sync_streams(update_fps, batch_times, max_times, last_update, update_period):
551 |             while True:
552 |                 ce = event_queue.get()
553 |                 if ce is None:
554 |                     break
555 |                 else:
556 |                     context, e = ce
557 |                     context.push()
558 |                     e.synchronize()
559 |                     context.pop()
560 | 
561 |                     batch_times = (batch_times + [time.time()])[-max_times:]
562 |                     if batch_times[-1] > last_update + update_period and len(batch_times) > 1:
563 |                         last_update = batch_times[-1]
564 |                         update_fps(args.batch_dim * (len(batch_times) - 1) / (batch_times[-1] - batch_times[0]))
565 | 
566 |         sync_thread = threading.Thread(target=sync_streams, args=(update_fps, batch_times, max_times, last_update, update_period))
567 |         sync_thread.start()
568 | 
569 |         plot_thread.start()
570 | 
571 |         # for benchmarking purposes, just run model repeatedly on initial batch of inputs
572 |         bench_iters = 0
573 |         while True:
574 |             if bench_iters == 0:
575 |                 print('Prewarming model')
576 |             elif bench_iters == prewarm_iters:
577 |                 print(f'Beginning benchmark (+{time.time() - app_start:.1f})...')
578 |                 start_time = time.time()
579 |             elif bench_iters > prewarm_iters and time.time() > start_time + bench_secs:
580 |                 break
581 | 
582 |             context_id = bench_iters % len(context_detail)
583 |             context = contexts[context_id]
584 |             context.push()
585 |             try:
586 |                 detail = context_detail[context_id]
587 |                 stream_id = (bench_iters - context_id) % len(detail['streams'])
588 |                 stream = detail['streams'][stream_id]
589 |                 detail['model'].trt_context.execute_async_v2(
590 |                     bindings=detail['bindings'][stream_id],
591 |                     stream_handle=stream.handle
592 |                 )
593 |                 event = cuda.Event(cuda.event_flags.DISABLE_TIMING)
594 |                 event_queue.put((context, event.record(stream)))
595 |             finally:
596 |                 context.pop()
597 | 
598 |             bench_iters += 1
599 | 
600 |         event_queue.put(None)
601 |         sync_thread.join()  # drains the FIFO queue, synchronizing every outstanding event before timing stops
602 | 
603 |         bench_iters -= prewarm_iters
604 | 
605 |         total_time = time.time() - start_time
606 | 
607 |         update_fps(None)
608 |         plot_thread.join()
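# Worked example of the statistics printed below (illustrative numbers, not a
# measured result): with batch_dim=16 and 625 timed batches over 10.0 seconds,
# images = 625 * 16 = 10000, latency = 1000 * 10.0 / 10000 = 1.0 ms per image,
# throughput = 10000 / 10.0 = 1000.0 FPS.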
609 | 
610 |         print(f'{bench_iters} batches, {bench_iters * batch_dim} images, {total_time:.2f} seconds total')
611 |         print(f'{1000 * total_time / (bench_iters * batch_dim):.1f} ms per image')
612 |         print(f'{(bench_iters * batch_dim) / total_time:.1f} FPS')
613 | 
614 |         if args.output_path:
615 |             with open(args.output_path, 'w') as fout:
616 |                 print(f'{bench_iters} batches, {bench_iters * batch_dim} images, {total_time:.2f} seconds total', file=fout)
617 |                 print(f'{1000 * total_time / (bench_iters * batch_dim):.1f} ms per image', file=fout)
618 |                 print(f'{(bench_iters * batch_dim) / total_time:.1f} FPS', file=fout)
619 | 
620 | 
621 | 
622 | def parse_args():
623 |     import argparse
624 |     p = argparse.ArgumentParser()
625 |     p.add_argument('--mode', choices=['coco', 'export', 'bench'], default='coco')
626 |     p.add_argument('--runtime', choices=['pytorch', 'trt'], default='pytorch')
627 |     p.add_argument('--output-path')
628 | 
629 |     p.add_argument('--device', default=('cuda:0' if torch.cuda.is_available() else 'cpu'))
630 |     p.add_argument('--detection-threshold', default=0.05, type=float)
631 |     p.add_argument('--iou-threshold', default=0.5, type=float)
632 |     p.add_argument('--topk', default=256, type=int)
633 |     p.add_argument('--batch-dim', default=16, type=int)
634 |     p.add_argument('--precision', default='fp16')
635 | 
636 |     p.add_argument('--num-streams-per-device', type=int, default=4)
637 |     p.add_argument('--num-devices', type=int, default=1)
638 | 
639 |     p.add_argument('--eval-batch-size', default=None)
640 |     p.add_argument('--data', default='/data/coco2017')
641 |     p.add_argument('--num-workers', default=2, type=int)
642 | 
643 |     args = p.parse_args()
644 |     args.eval_batch_size = args.batch_dim
645 | 
646 |     if args.runtime == 'trt':
647 |         args.trt_path = f'models/ssd300.{args.precision}.b{args.batch_dim}.k{args.topk}.plan'
648 |     else:
649 |         args.trt_path = None
650 | 
651 |     if args.mode == 'coco' and args.precision == 'int8' and args.runtime != 'trt':
652 |         print('incompatible args: int8 COCO evaluation requires --runtime=trt')
653 |         sys.exit(1)
654 | 
655 |     return args
656 | 
657 | 
658 | 
659 | if __name__ == '__main__':
660 |     args = parse_args()
661 | 
662 |     if args.mode == 'export':
663 |         export_engine(args)
664 |     elif args.mode == 'coco':
665 |         eval_coco(args)
666 |     elif args.mode == 'bench':
667 |         benchmark(args)
668 | 
669 | 
670 | 
671 | 
--------------------------------------------------------------------------------
/subscript_assignment.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import torch
3 | import tensorrt as trt
4 | 
5 | class SubscriptAssign(torch.nn.Module):
6 |     def __init__(self):
7 |         super().__init__()
8 | 
9 |     def forward(self, X):
10 |         # in-place subscript assignment exports to ONNX as a ScatterND node
11 |         X[:, :2] = 0
12 |         return X
13 | 
14 | 
15 | if __name__ == '__main__':
16 |     m = SubscriptAssign()
17 |     onnx_filename = 'models/subscript_assign.onnx'
18 | 
19 |     print('exporting SubscriptAssign to', onnx_filename)
20 |     torch.onnx.export(
21 |         m,
22 |         torch.randn((10, 10)),
23 |         onnx_filename,
24 |         opset_version=11
25 |     )
26 | 
27 |     print('compiling', onnx_filename, 'with TensorRT')
28 |     logger = trt.Logger()
29 |     network_flags = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
30 | 
31 |     with trt.Builder(logger) as builder, builder.create_network(network_flags) as network, trt.OnnxParser(network, logger) as parser:
32 |         if not parser.parse(open(onnx_filename, 'rb').read()):
33 |             for i in range(parser.num_errors):
34 |                 print(parser.get_error(i))
35 |             sys.exit(1)
36 |         engine = builder.build_cuda_engine(network)
37 | 
--------------------------------------------------------------------------------
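Note on the ScatterND failure demonstrated above: the in-place subscript assignment can be
rewritten as an out-of-place masked multiply, which exports to simple elementwise ONNX ops
that the TensorRT 7.2 parser handles without the ScatterND plugin. A minimal sketch (module
name hypothetical, not part of this repo; equivalent to X[:, :2] = 0 for a 2-D input):

import torch

class SubscriptAssignNoScatter(torch.nn.Module):
    def forward(self, X):
        # zero columns 0 and 1 without in-place indexing, so no ScatterND node
        # is emitted during ONNX export
        col_mask = (torch.arange(X.size(1), device=X.device) >= 2).to(X.dtype)
        return X * col_mask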