├── LICENSE ├── README.md ├── calib_train_image └── 此处存放INT8量化需要的训练图像约30660张.txt ├── checkpoint └── 该文件夹存放训练好的DETR pth模型文件.txt ├── cpp ├── CMakeLists.txt ├── model │ └── ONNX模型文件.txt ├── res.jpg └── src │ └── main.cc ├── detr_pth2onnx.py ├── generate_batch_plan.py ├── inference_detr_onnx.py ├── inference_detr_trt.py ├── model ├── README.md ├── __init__.py ├── backbone.py ├── box_ops.py ├── detr.py ├── hubconf.py ├── matcher.py ├── misc.py ├── position_encoding.py ├── segmentation.py └── transformer.py ├── performance_accuracy_detr.py ├── performance_time_detr.py ├── pic ├── average_diff_percentage.png ├── bug1.png ├── bug1_onnx.png ├── bug2.png ├── bug2_onnx.png ├── bug3.png ├── bug4.png ├── latency_vs_throughput.png ├── test_fp16.jpg ├── test_fp32.jpg └── time.png ├── requirements.txt ├── trt_int8_quant.py └── trt_util ├── __init__.py ├── calibrator.py ├── common.py ├── plot_box.py ├── process_img.py └── trt_lite.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## TensorRT for DETR 2 | 3 | **美迪康AI Lab** 4 | 5 | :white_check_mark: :tada::tada: 美迪康AI Lab在2022 TensorRT Transformer模型优化大赛,获得了 **一等奖**,我们连同该项目一起迁移到: 6 | 7 | 8 | :white_check_mark: :tada: :tada::tada: 在本次比赛中,该项目获得了 **二等奖** :tada::tada::tada: 9 | 10 | :white_check_mark: :tada: :tada::tada: :tada::tada::tada: 11 | 12 | 13 | 14 | #### 0. 环境配置 15 | 16 | + TensorRT Docker镜像环境:`nvcr.io/nvidia/tensorrt:21.03-py3`(TensorRT-7.2.2.3),需要Host中安装好Docker和Nvidia-Docker2和版本为`Driver Version: 460.32.03`的显卡驱动. 17 | + 在Docker镜像内需要安装相应的Python库,可以在项目下执行`pip3 install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com`进行安装. 18 | + 显卡类型:Tesla T4 (16G显存). 19 | + CUDA 11.2, cuDNN-8.1. 20 | + 系统信息为:`Linux version 4.15.0-139-generic (buildd@lgw01-amd64-035) (gcc version 7.5.0 (Ubuntu 7.5.0-3ubuntu1~18.04)) #143-Ubuntu SMP Tue Mar 16 01:30:17 UTC 2021`. 21 | 22 | **项目结构及说明** 23 | 24 | ```shell 25 | . 
26 | ├── model # DETR模型相关的python代码(主要来源https://github.com/facebookresearch/detr) 27 | │   ├── README.md 28 | │   ├── __init__.py 29 | │   ├── backbone.py # backbone resnet50 30 | │   ├── box_ops.py 31 | │   ├── detr.py # DETR model build 32 | │   ├── hubconf.py 33 | │   ├── matcher.py 34 | │   ├── misc.py 35 | │   ├── position_encoding.py # position_encoding,支持sine和自学习,默认是sine 36 | │   ├── segmentation.py # 分割的模型的build 37 | │   └── transformer.py # transformer的encoder和decoder包括多头的自注意力,Skip, FFN 38 | | 39 | ├── trt_util # TensorRT相关的辅助方法 40 | │ ├── __init__.py 41 | │ ├── calibrator.py # INT8量化的calibrator 42 | │ ├── common.py #host与device数据交互,TensorRT序列化engine及调用(支持FP32,FP16,INT8),Dynamic shape序列化engine及调用(支持FP32,FP16,INT8) 43 | │ ├── plot_box.py # 画出detr推断预测的box 44 | │ ├── process_img.py # detr图像预处理,支持numpy,torchvision, cupy 45 | │ └── trt_lite.py # tensorrt性能测试辅助方法,基于https://github.com/NVIDIA/trt-samples-for-hackathon-cn/blob/master/python/修改 46 | | 47 | ├── calib_train_image # INT8量化的数据约30660张, 开源代码该部分内容被删除 48 | │   ├── A_57b26b46_2e1e_11eb_9d64_00d861c69d42.jpg 49 | │   ├── ... ... 50 | │   └── N9_50667548_2e21_11eb_ac9b_00d861c69d42.jpg 51 | | 52 | ├── test # 性能测试需要的测试图像约1000张,开源代码该部分内容被删除 53 | │   ├── test_c6d6ecec_2fd1_11eb_b773_00d861c69d42.jpg 54 | │   ├── ... ... 55 | │   └── test_d4c4ea34_2fd1_11eb_9f0e_00d861c69d42.jpg 56 | | 57 | ├── checkpoint # DETR Pytorch 模型,开源代码该部分仅提供模型下载链接 58 | │   ├── detr_resnet50.pth 59 | │   └── log.txt 60 | ├── pic # README 静态资源文件 61 | | 62 | ├── detr_pth2onnx.py # pytorch 转onnx支持static,dynamic shape, btached, onnx check, onnx-simplifier, onnx-graphsurgeon 63 | ├── generate_batch_plan.py # 生成batched static tensorrt 序列化engine文件,支持FP32,FP16,任意batch size 64 | ├── inference_detr_onnx.py # onnx runtime模型推断,支持static,dynamic shape,用于验证onnx的正确性 65 | ├── inference_detr_trt.py # tensorrt模型推断,支持,static,dynamic shape,FP32,FP16,INT8并检验engine是否存在,不存在调用序列化程序 66 | ├── performance_accuracy_detr.py # TensorRT识别精度的计算和可视化 67 | ├── performance_time_detr.py # TensorRT benchmark的计算和可视化 68 | ├── trt_int8_quant.py # INT8量化,并生成量化模型的engine和cache文件 69 | | 70 | ├── requirements.txt # Python package list 71 | ├── LICENSE 72 | └── README.md 73 | 74 | # 说明: 75 | # 1. README提供过程中用到的Linux 相关命令,比如 trtexec, polygraphy, Nsight Systems的使用 76 | # 2. 用到的模型文件包括.pth,.onnx,.plan文件在README中提供百度云盘的下载地址 77 | # 3. 
项目过程中产生的log文件比如,测试benchmark生成的数据,序列化engine过程中的日志,polygraphy日志,Nsight Systems生成UI文件均在README中提供百度云盘下载地址 78 | 79 | ``` 80 | 81 | 82 | 83 | 84 | 85 | ### 1.Pytorch checkpoint to ONNX 86 | 87 | ```shell 88 | # pytorch to onnx 89 | $ python3 detr_pth2onnx.py -h 90 | 91 | # batch_size=1, static 92 | # 在项目下生成detr.onnx和detr_sim.onnx(simplify后的onnx) 93 | $ python3 detr_pth2onnx.py --model_dir ./checkpoint/detr_resnet50.pth --check --onnx_dir ./detr.onnx 94 | 95 | # dynamic shape 96 | # 在项目下生成detr_dynamic.onnx和detr_dynamic_sim.onnx 97 | $ python3 detr_pth2onnx.py --model_dir ./checkpoint/detr_resnet50.pth --check --onnx_dir ./detr_dynamic.onnx --dynamic_axes 98 | 99 | # batch_size=n, static 100 | # 生成./output/detr_batch_{n}.onnx和output/detr_batch_{n}_sim.onnx 101 | $ python3 detr_pth2onnx.py --model_dir ./checkpoint/detr_resnet50.pth --check --onnx_dir ./output/detr_batch_2.onnx --batch_size=2 102 | 103 | ``` 104 | 105 | 106 | 107 | **simplify的其他方式** 108 | 109 | ```shell 110 | # onnx-simplifier 111 | # static 112 | $ python3 -m onnxsim detr.onnx detr_sim.onnx 113 | # dynamic 114 | $ python3 -m onnxsim detr_dynamic.onnx detr_dynamic_sim.onnx --input-shape "inputs:1,3,800,800" --dynamic-input-shape 115 | ``` 116 | 117 | onnxruntime测试onnx模型 118 | 119 | ```shell 120 | $ python3 inference_detr_onnx.py 121 | ``` 122 | 123 | 124 | 125 | **注意**:上述过程生成的detr_sim.onnx文件序列化engine后,TensorRT推断结果全部为0! 126 | 127 | ```python 128 | # 需要onnx-graphsurgeon做如下修改 (该代码导师提供) 129 | import onnx 130 | import onnx_graphsurgeon as gs 131 | 132 | graph = gs.import_onnx(onnx.load("./detr_sim.onnx")) 133 | for node in graph.nodes: 134 | if node.name == "Gather_2682": 135 | print(node.inputs[1]) 136 | node.inputs[1].values = np.int64(5) 137 | print(node.inputs[1]) 138 | elif node.name == "Gather_2684": 139 | print(node.inputs[1]) 140 | node.inputs[1].values = np.int64(5) 141 | print(node.inputs[1]) 142 | 143 | onnx.save(gs.export_onnx(graph),'changed.onnx') 144 | 145 | ``` 146 | 147 | 148 | 149 | ### 2.TensorRT Inference in FP32 or FP16 Mode 150 | 151 | 生成TensorRT序列化engine文件并调用,有两种方式: 152 | 153 | + 1.使用python实现 154 | 155 | ```shell 156 | # 提供build engine和反序列化engine进行推断 157 | # inference_detr_trt.py支持FP32,FP16的build engine和engine的推断,同时支持static shape和Dynamic shape的推断,前处理,后处理和结果可视化,支持INT8量化后engine的推断,包括static shape, dynamic shape及前处理后处理和结果可视化 158 | 159 | # static shape 160 | # FP32 161 | $ python3 inference_detr_trt.py -h 162 | $ python3 inference_detr_trt.py --model_dir ./detr_sim.onnx --engine_dir ./detr.plan --image_dir ./test 163 | 164 | # FP16 165 | $ python3 inference_detr_trt.py --model_dir ./detr_sim.onnx --engine_dir ./detr_fp16.plan --image_dir ./test --fp16 166 | 167 | # INT8 168 | $ python3 inference_detr_trt.py --model_dir ./detr_sim.onnx --engine_dir ./detr_int8.plan --image_dir ./test --int8 169 | 170 | # dynamic shape 171 | $ python3 inference_detr_trt.py --model_dir ./detr_sim.onnx --engine_dir ./detr.plan --image_dir ./test --dynamic --batch_size=8 172 | 173 | # 生成batch的engine 174 | $ python3 generate_batch_plan.py --model_dir ./output/detr_batch_{n}_sim.onnx --engine_dir ./output/detr_batch_{n}_fp16.plan --batch_size={n} --fp16 175 | # eg 176 | $ python3 generate_batch_plan.py --model_dir ./output/detr_batch_2_sim.onnx --engine_dir ./output/detr_batch_2.plan --batch_size=2 177 | $ python3 generate_batch_plan.py --model_dir ./output/detr_batch_2_sim.onnx --engine_dir ./output/detr_batch_2_fp16.plan --batch_size=2 --fp16 178 | ``` 179 | 180 | TensorRT Inference的结果Demo(上trt fp32,下trt fp16): 181 | 182 |
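为便于理解,上述脚本封装的 build-and-infer 流程可以用下面一个极简示意(minimal sketch)概括。该示意基于 TensorRT 7.2 Python API、static shape、batch size 1;其中的路径、binding 顺序(`inputs` → `pred_logits` → `pred_boxes`)以及 22 类的输出形状均为根据本项目做出的假设,实际维护的逻辑在 `trt_util/common.py` 与 `inference_detr_trt.py` 中:

```python
# Minimal sketch only -- the maintained logic lives in trt_util/common.py and
# inference_detr_trt.py. Paths, binding order and output shapes are assumptions.
import numpy as np
import tensorrt as trt
import pycuda.autoinit            # creates a CUDA context
import pycuda.driver as cuda

TRT_LOGGER = trt.Logger(trt.Logger.INFO)

def build_static_engine(onnx_path, fp16=False):
    flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    with trt.Builder(TRT_LOGGER) as builder, \
         builder.create_network(flag) as network, \
         trt.OnnxParser(network, TRT_LOGGER) as parser:
        with open(onnx_path, "rb") as f:
            if not parser.parse(f.read()):
                raise RuntimeError(str(parser.get_error(0)))
        config = builder.create_builder_config()
        config.max_workspace_size = 1 << 30           # 1 GiB workspace
        if fp16:
            config.set_flag(trt.BuilderFlag.FP16)
        return builder.build_engine(network, config)

engine = build_static_engine("./detr_sim.onnx", fp16=False)
context = engine.create_execution_context()

# Host/device buffers: one input (1x3x800x800) and two outputs.
h_in  = cuda.pagelocked_empty((1, 3, 800, 800), dtype=np.float32)
h_out = [cuda.pagelocked_empty((1, 100, 22), dtype=np.float32),   # pred_logits
         cuda.pagelocked_empty((1, 100, 4),  dtype=np.float32)]   # pred_boxes
d_in  = cuda.mem_alloc(h_in.nbytes)
d_out = [cuda.mem_alloc(o.nbytes) for o in h_out]

h_in[...] = np.random.rand(1, 3, 800, 800)    # stand-in for a preprocessed image
cuda.memcpy_htod(d_in, h_in)
context.execute_v2([int(d_in), int(d_out[0]), int(d_out[1])])
for h, d in zip(h_out, d_out):
    cuda.memcpy_dtoh(h, d)
print(h_out[0].shape, h_out[1].shape)
```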
183 | 184 | + 2.使用trtexec 185 | 186 | 187 | 188 | ```shell 189 | # static shape 190 | trtexec --verbose --onnx=detr.onnx --saveEngine=detr.plan # error 191 | trtexec --verbose --onnx=detr_sim.onnx --saveEngine=detr.plan 192 | 193 | trtexec --verbose --onnx=detr_sim.onnx --saveEngine=detr.plan --fp16 194 | 195 | # dynamic shape (error) 196 | # FP32 197 | trtexec --verbose --onnx=detr_dynamic_sim.onnx --saveEngine=detr_dynamic.plan --optShapes=input:1x3x800x800 --minShapes=input:1x3x800x800 --maxShapes=input:16x3x800x800 --workspace=10240 198 | 199 | # FP16 200 | trtexec --verbose --onnx=detr_dynamic_sim.onnx --saveEngine=detr_dynamic_fp16.plan --optShapes=input:1x3x800x800 --minShapes=input:1x3x800x800 --maxShapes=input:64x3x800x800 --fp16 201 | 202 | ``` 203 | 204 | 205 | 206 | > 该过程遇到的问题: 207 | > 208 | > 1. 在tag为20.12-py 的TensorRT镜像中,onnx转trt engine文件时出现了**Myelin Error** (fig1,fig2)[**该问题由导师协助解决,将TensorRT的Docker镜像的tag换成21.03-py**] 209 | > 2. 如果不做onnx的simplify,无法序列化engine,报错如fig3 [**解决办法是进行onnx-simplifier**] 210 | > 3. Dynamic Shape可以正常通过torch.onnx.export获得并且在onnxruntime下可正常调用,但是在序列化engine时,无法正常工作(我们提供了dynamic shape的序列化方法和序列化后engine的调用方法,但是遗憾无法序列化dynamic时的engine),序列化engine的错误信息如fig4.[**该问题现在依然没有解决,未来的工作希望基于TensorRT API重新搭建网络或Plugin添加不支持的层**] 211 | > 212 | 213 | 214 | 215 | 216 | 217 | ![bug1:低版本tensorrt问题](./pic/bug1.png) 218 | 219 | **fig1:Myelin Error** 220 | 221 | 222 | 223 | ![bug1对应结点](./pic/bug1_onnx.png) 224 | 225 | **fig2: Myeline Error 对应ONNX结点** 226 | 227 | 228 | 229 | ![bug3.png](./pic/bug3.png) 230 | 231 | **fig3:onnx不做simplify无法序列化** 232 | 233 | 234 | 235 | ![bug4.png](./pic/bug4.png) 236 | 237 | **fig4:detr_dynamic_sim.onnx无法序列化engine,进而dynamic shape代码部分完成后无法进行dynamic shape的测试** 238 | 239 | 240 | 241 | ### 3.TensorRT Inference Time(IT) and Mean Value of Relative Error(MVRE) 242 | 243 | ```shell 244 | # Inference Time(IT) 245 | $ python3 performance_time_detr.py 246 | 247 | # Mean value of relative error (MVRE) 248 | $ python3 performance_accuracy_detr.py 249 | 250 | # Nsight Systems 251 | $ nsys profile -o nsight_detr_out python3 performance_time_detr.py 252 | $ nsys-ui nsight_detr_out 253 | ``` 254 | 255 | + benchmark的计算中关于Latency和Throughput的计算,设及的nRound为1000次,该统计保证预处理和后处理相同的条件下仅包含模型加速推断部分的统计 256 | + benchmark的计算中关于平均相对误差的计算采用预测Score和预测Box分开分别计算,使用1000张测试图片进行测试 257 | 258 | 259 | 260 | ![性能对比表格](./pic/time.png) 261 | 262 |
Performance comparison
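上表中 Latency/Throughput 的统计方式可以用下面一个假设性的计时示意说明(仅为示意,实际实现见 `performance_time_detr.py`,nRound = 1000,且不含前处理与后处理):

```python
# Hypothetical timing helper -- not the repo's implementation.
import time

def benchmark(context, bindings, batch_size=1, n_round=1000, warmup=50):
    """Time n_round synchronous executions of an already-built TensorRT engine.

    `context` is an IExecutionContext, `bindings` the list of device pointers
    (ints). Pre-processing and post-processing are deliberately excluded.
    """
    for _ in range(warmup):                  # stabilise GPU clocks / allocator
        context.execute_v2(bindings)
    start = time.time()
    for _ in range(n_round):
        context.execute_v2(bindings)         # execute_v2 blocks until finished
    total = time.time() - start
    latency_ms = total / n_round * 1000.0
    throughput = n_round * batch_size / total    # images per second
    return latency_ms, throughput
```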
263 | 264 | 265 | 266 | ![lantencyvsthroughput](./pic/latency_vs_throughput.png) 267 | 268 |
Latency vs Throughput
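下图中的平均相对误差(MVRE)对预测 Score 与预测 Box 分开统计,并在 1000 张测试图像上取平均;下面给出一个假设性的单组输出对比示意(实际实现见 `performance_accuracy_detr.py`):

```python
# Hypothetical MVRE helper -- illustrative only.
import numpy as np

def mean_relative_error(reference, candidate, eps=1e-12):
    """Mean relative error between a reference output (e.g. PyTorch/onnxruntime)
    and a TensorRT output, both flattened to 1-D."""
    reference = np.asarray(reference, dtype=np.float64).ravel()
    candidate = np.asarray(candidate, dtype=np.float64).ravel()
    return float(np.mean(np.abs(candidate - reference) / (np.abs(reference) + eps)))

# Scores and boxes are compared separately and then averaged over the test set:
# score_mvre = mean_relative_error(torch_scores, trt_scores)
# box_mvre   = mean_relative_error(torch_boxes,  trt_boxes)
```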
269 | 270 | ![识别精度](pic/average_diff_percentage.png) 271 | 272 |
测试1000张图像,平均相对精度基本满足条件,正常的FP32在1e-6,FP16在1e-3数量级 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | >该过程遇到的问题: 281 | > 282 | >4.关于平均相对精度的统计中发现,TensorRT序列化的engine在推断后的结果全部为0(使用`polygraphy run detr_sim.onnx --trt --onnxrt --onnx-outputs mark all --trt-outputs mark all`查看原因,得到错误信息如fig5,fig6在第1775个节点后就出现了错误;我们是正常把detr_sim.onnx序列换成功了,但是序列化的engine的识别结果不正确)[**可能是TensorRT ONNX parser权重解析的一个BUG**,这个问题已经在导师的帮助下解决,解决方式参考[第一节](### 1.Pytorch checkpoint to ONNX ),基于onnx-graphsurgeon修改了结点的信息] **BUG** 283 | 284 | 285 | 286 | ![bug2.png](pic/bug2.png) 287 | 288 | **fig5: 通过polygraphy 查看信息** 289 | 290 | 291 | 292 | ![bug2_onnx.png](./pic/bug2_onnx.png) 293 | 294 | **fig6: 定位到polygraphy 找到的错误结点** 295 | 296 | 297 | 298 | ### 4.INT8 量化 299 | 300 | 1. INT8量化序列化INT8 Engine文件 301 | 302 | ```shell 303 | # generate int8 model 304 | $ python3 trt_int8_quant.py -h 305 | 306 | $ python3 trt_int8_quant.py --onnx_model_path ./detr_sim.onnx --engine_model_path ./detr_int8.plan --calib_img_dir ./calib_train_image --calibration_table ./detr_calibration.cache --int8 307 | 308 | ``` 309 | 310 | 使用约30660张训练图像进行INT8量化. 311 | 312 | 313 | 314 | > 该过程遇到的问题: 315 | > 316 | > 5.失败了,cache文件没有出来! 检查代码并没有发现什么异常,序列化的engine出来了(感觉并没有INT8量化),并没有cache文件出来。[**可能TensorRT的BUG**, 原因(导师解释):因为onnx模型中有`where op`,这个`where op`只支持myelin作为backend,且没有int8实现,所以就直接跳过int8 calibration了,导师提供的解决办法是写一个`plugin`来替代`where op`,即使这样也不一定就能够进行int8 calibration] **BUG** 317 | 318 | ​ 319 | 320 | 2. TesorRT Inference in INT8 321 | 322 | ```shell 323 | $ python3 inference_detr_trt.py -h 324 | $ python3 inference_detr_trt.py --model_dir ./detr_sim.onnx --engine_dir ./detr_int8.plan --int8 325 | ``` 326 | 327 | 提供了INT8推理引擎的推断代码. 328 | 329 | 330 | 331 | ### 5.Profile每一层的耗时 332 | 333 | ```shell 334 | $ trtexec --verbose --onnx=detr_sim.onnx --saveEngine=detr_batch1.plan 335 | $ trtexec --loadEngine=detr_batch1.plan --batch=1 336 | ``` 337 | 338 | ```shell 339 | $ trtexec --loadEngine=detr_batch1.plan --batch=1 --dumpProfile |& tee profile.txt 340 | ``` 341 | 342 | 343 | 344 | ```shell 345 | # 每一层的耗时,因为层数比较多,这里仅列出耗时比较长一些层的例子和耗时比较少的一些层的例子 346 | 347 | # 耗时比较多的层的例子: 348 | -------------------------------------------------------------------------- 349 | Layer Time (ms) Avg. Time (ms) Time % 350 | --------------------------------------------------------------------------- 351 | Conv_107 + Relu_110 53.99 0.6427 1.8 352 | Conv_123 + Add_126 + Relu_127 41.26 0.4912 1.4 353 | Conv_298 21.66 0.2579 0.7 354 | 355 | 上述类型的层有很多个 356 | --------------------------------------------------------------------------- 357 | 358 | # 耗时比较少的层的例子 359 | -------------------------------------------------------------------------- 360 | Layer Time (ms) Avg. Time (ms) Time % 361 | --------------------------------------------------------------------------- 362 | MatMul_487 3.29 0.0392 0.1 363 | Add_488 1.51 0.0179 0.1 364 | ReduceMean_546 0.95 0.0113 0.0 365 | Sub_547 1.22 0.0145 0.0 366 | Add_690 + Relu_691 4.71 0.0560 0.2 367 | 368 | 上述类型的层有很多个 369 | --------------------------------------------------------------------------- 370 | 371 | # 结论: 372 | 373 | 1. DETR的backbone部分主要涉及ResNet-50是主要TensorRT inference的耗时的地方 374 | 2. transformer的encoder和decoder耗时较少 375 | ``` 376 | 377 | 378 | 379 | ### 6.未来的工作 380 | 381 | 382 | 383 | 1. 在ONNX序列化engine的过程中,发现不做onnx-simplifer序列化engine是有错误的,错误可以参考fig3,这可能是因为其中的一些op tensorRT目前还不支持,未来打算基于DETR网络结构通过TensorRT API搭建网络,实现前向推理加速; 384 | 385 | 3. INT8量化无法生成cache文件(写一个plugin来替代where op, 看是否可以解决INT8量化的问题); 386 | 4. 
Dynamic shape的ONNX文件进行onnx-simplifier后,依然无法序列化engine,其错误信息和fig4相同,原因是onnx-simplifier并没有对dynamic shape的onnx起到任何简化作用,dynamic shape的onnx模型文件的op和1中面临的问题相同,下一步的工作就是基于TensorRT API或Plugin重新调整网络。 387 | 388 | 389 | 390 | ### 7.连接地址 391 | 392 | 1. 项目模型下载地址包括.pth,.onnx,.plan模型文件: 链接:https://pan.baidu.com/s/1IsHHfFi5zphpbfGTmvPIag 提取码:detr 393 | 2. 项目中生成的日志文件下载地址: 链接:https://pan.baidu.com/s/1rvG2ApC67Jt61t3ISZA3Dg 提取码:logs 394 | 3. DETR参考官方REPO: 395 | 4. DETR Paper: 396 | 5. 项目中参考的代码地址1: 397 | 6. 项目中参考的代码地址2: 398 | 399 | 400 | 401 | ### 8.提交的TensorRT的BUG 402 | 403 | 1. INT8量化,cache文件没有出来, 检查代码并没有发现什么异常,序列化的engine出来了(感觉并没有INT8量化),并没有cache文件出来。因为onnx模型中有`where op`,这个`where op`只支持myelin作为backend,且没有INT8实现,所以就直接跳过int8 calibration ; 404 | 2. .关于平均相对精度的统计中发现,TensorRT序列化的engine在推断后的结果全部为0(使用`polygraphy run detr_sim.onnx --trt --onnxrt --onnx-outputs mark all --trt-outputs mark all`查看原因,得到错误信息如fig5,fig6在第1775个节点后就出现了错误;我们是正常把detr_sim.onnx序列换成功了,但是序列化的engine的识别结果不正确)(**是TensorRT ONNX parser参数解析的一个BUG**,这个问题已经在导师的帮助下解决,解决方式参考[第一节](### 1.Pytorch checkpoint to ONNX ),基于onnx-graphsurgeon修改了结点的信息) 405 | 406 | 407 | 408 | ### 9.TensorRT C++实现 :white_check_mark: 409 | 410 | 1. cmake 411 | 412 | ```shell 413 | cd cpp 414 | cmake . 415 | ``` 416 | 417 | ``` 418 | root@8d80a7f44e59:/workspace/05_detr# cmake . 419 | -- The C compiler identification is GNU 9.3.0 420 | -- The CXX compiler identification is GNU 9.3.0 421 | -- Check for working C compiler: /usr/bin/cc 422 | -- Check for working C compiler: /usr/bin/cc -- works 423 | -- Detecting C compiler ABI info 424 | -- Detecting C compiler ABI info - done 425 | -- Detecting C compile features 426 | -- Detecting C compile features - done 427 | -- Check for working CXX compiler: /usr/bin/c++ 428 | -- Check for working CXX compiler: /usr/bin/c++ -- works 429 | -- Detecting CXX compiler ABI info 430 | -- Detecting CXX compiler ABI info - done 431 | -- Detecting CXX compile features 432 | -- Detecting CXX compile features - done 433 | -- Found OpenCV: /workspace/opencv-4.5.2/build (found version "4.5.2") 434 | -- Looking for pthread.h 435 | -- Looking for pthread.h - found 436 | -- Looking for pthread_create 437 | -- Looking for pthread_create - not found 438 | -- Looking for pthread_create in pthreads 439 | -- Looking for pthread_create in pthreads - not found 440 | -- Looking for pthread_create in pthread 441 | -- Looking for pthread_create in pthread - found 442 | -- Found Threads: TRUE 443 | -- Found CUDA: /usr/local/cuda (found version "11.2") 444 | -- cmake success!!! DETR by xj 445 | -- Configuring done 446 | -- Generating done 447 | -- Build files have been written to: /workspace/05_detr 448 | 449 | ``` 450 | 451 | 452 | 453 | 2. make 454 | 455 | ```shell 456 | make 457 | ``` 458 | 459 | ``` 460 | root@8d80a7f44e59:/workspace/05_detr# make 461 | Scanning dependencies of target detr 462 | [ 50%] Building CXX object CMakeFiles/detr.dir/src/main.cc.o 463 | [100%] Linking CXX executable detr 464 | [100%] Built target detr 465 | 466 | ``` 467 | 468 | 469 | 470 | 3. 
test 471 | 472 | ```shell 473 | ./detr ./test_img/test.jpg true 474 | 475 | # true 先解析ONNX模型,然后序列化Engine 476 | # false 直接从本地Engine文件反序列化 477 | ``` 478 | 479 | 480 | 481 | ![](cpp/res.jpg) 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | 525 | 526 | 527 | 528 | 529 | 530 | 531 | -------------------------------------------------------------------------------- /calib_train_image/此处存放INT8量化需要的训练图像约30660张.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataXujing/TensorRT-DETR/5b32fba6a627f5430928e6a86a1a922eb447f99d/calib_train_image/此处存放INT8量化需要的训练图像约30660张.txt -------------------------------------------------------------------------------- /checkpoint/该文件夹存放训练好的DETR pth模型文件.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataXujing/TensorRT-DETR/5b32fba6a627f5430928e6a86a1a922eb447f99d/checkpoint/该文件夹存放训练好的DETR pth模型文件.txt -------------------------------------------------------------------------------- /cpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | project(detr) 4 | 5 | # opencv 6 | set(OpenCV_DIR /workspace/opencv-4.5.2/build) 7 | find_package(OpenCV REQUIRED) 8 | 9 | # ${var}表示引用这个变量 10 | find_package(CUDA REQUIRED) 11 | include_directories(${CUDA_INCLUDE_DIRS}) 12 | include_directories(${TensorRT_INCLUDE_DIRS}) 13 | # find_library(CUDA) 14 | # find_library(NVINFER NAMES nvinfer) 15 | # find_library(NVPARSERS NAMES nvparsers) 16 | # find_library(NVONNXPARSERS NAMES nvonnxparser) 17 | 18 | find_library(NVINFER NAMES nvinfer) 19 | find_library(NVPARSERS NAMES nvparsers) 20 | find_library(NVONNXPARSERS NAMES nvonnxparser) 21 | 22 | find_library(CUDNN_LIBRARY 23 | NAMES libcudnn.so${__cudnn_ver_suffix} libcudnn${__cudnn_ver_suffix}.dylib ${__cudnn_lib_win_name} 24 | PATHS $ENV{LD_LIBRARY_PATH} ${__libpath_cudart} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} ${CMAKE_INSTALL_PREFIX} 25 | PATH_SUFFIXES lib lib64 bin 26 | DOC "CUDNN library." 27 | ) 28 | 29 | 30 | file(GLOB_RECURSE _HEAD ${CMAKE_CURRENT_LIST_DIR}/src/*.h 31 | ${CMAKE_CURRENT_LIST_DIR}/src/*.cuh 32 | ) 33 | 34 | file(GLOB _SRC ${CMAKE_CURRENT_LIST_DIR}/src/*.cc 35 | ${CMAKE_CURRENT_LIST_DIR}/src/*.cu 36 | ) 37 | 38 | 39 | add_executable (${PROJECT_NAME} ${_SRC} ${_HEAD}) 40 | target_link_libraries(${PROJECT_NAME} 41 | ${NVINFER} 42 | ${NVONNXPARSERS} 43 | ${CUDA_LIBRARIES} 44 | ${CUDA_CUBLAS_LIBRARIES} 45 | ${CUDNN_LIBRARY} 46 | ${OpenCV_LIBS}) 47 | 48 | message(STATUS "cmake success!!! 
DETR by xj") 49 | -------------------------------------------------------------------------------- /cpp/model/ONNX模型文件.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataXujing/TensorRT-DETR/5b32fba6a627f5430928e6a86a1a922eb447f99d/cpp/model/ONNX模型文件.txt -------------------------------------------------------------------------------- /cpp/res.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataXujing/TensorRT-DETR/5b32fba6a627f5430928e6a86a1a922eb447f99d/cpp/res.jpg -------------------------------------------------------------------------------- /cpp/src/main.cc: -------------------------------------------------------------------------------- 1 | 2 | // detr trt demo 3 | // xj 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "cuda_runtime_api.h" 12 | #include "NvOnnxParser.h" 13 | #include "NvInfer.h" 14 | 15 | #define BATCH_SIZE 1 16 | #define INPUT_W 800 17 | #define INPUT_H 800 18 | #define INPUT_SIZE 800 19 | #define NUM_CLASS 22 20 | #define NUM_QURREY 100 //detr默认是100 21 | #define PROB_THRESH 0.7 22 | 23 | 24 | 25 | using namespace std; 26 | using namespace cv; 27 | 28 | std::vector class_names = {"NA", "Class A", "Class B", "Class C", "Class D", "Class E", "Class F", 29 | "Class G", "Class H", "Class I", "Class J", "Class K", "Class L", "Class M", 30 | "Class N", "Class O", "Class P", "Class Q", "Class R", "Class S", "Class T","Class U"}; 31 | 32 | 33 | 34 | class Logger : public nvinfer1::ILogger 35 | { 36 | public: 37 | Logger(Severity severity = Severity::kWARNING) : reportableSeverity(severity) //初始化参数列表 38 | { 39 | } 40 | 41 | void log(Severity severity, const char* msg) override 42 | { 43 | // suppress messages with severity enum value greater than the reportable 44 | if (severity > reportableSeverity) 45 | return; 46 | 47 | switch (severity) 48 | { 49 | case Severity::kINTERNAL_ERROR: 50 | std::cerr << "INTERNAL_ERROR: "; 51 | break; 52 | case Severity::kERROR: 53 | std::cerr << "ERROR: "; 54 | break; 55 | case Severity::kWARNING: 56 | std::cerr << "WARNING: "; 57 | break; 58 | case Severity::kINFO: 59 | std::cerr << "INFO: "; 60 | break; 61 | default: 62 | std::cerr << "UNKNOWN: "; 63 | break; 64 | } 65 | std::cerr << msg << std::endl; 66 | } 67 | 68 | Severity reportableSeverity; 69 | }; 70 | 71 | // 这一部分可以通过trtexec实现 72 | void onnxTotrt(const std::string& model_file, // name of the onnx model 73 | nvinfer1::IHostMemory** trt_model_stream, // output buffer for the TensorRT model 74 | Logger g_logger_, 75 | bool do_engine = true) { 76 | 77 | int verbosity = static_cast(nvinfer1::ILogger::Severity::kWARNING); 78 | 79 | // -- create the builder ------------------/ 80 | const auto explicit_batch = static_cast(BATCH_SIZE) 81 | << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); 82 | nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(g_logger_); 83 | nvinfer1::INetworkDefinition* network = builder->createNetworkV2(explicit_batch); 84 | 85 | // --create the parser to load onnx file---/ 86 | auto parser = nvonnxparser::createParser(*network, g_logger_); 87 | if (!parser->parseFromFile(model_file.c_str(), verbosity)) { 88 | std::string msg("failed to parse onnx file"); 89 | g_logger_.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str()); 90 | exit(EXIT_FAILURE); 91 | } 92 | 93 | // -- build the config for pass in specific parameters ---/ 94 | 
builder->setMaxBatchSize(BATCH_SIZE); 95 | nvinfer1::IBuilderConfig* config = builder->createBuilderConfig(); 96 | config->setMaxWorkspaceSize(1 << 30); 97 | nvinfer1::ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); 98 | 99 | // std::cout <<"engine bindings dimension" << engine->getNbBindings() << std::endl; 100 | 101 | // -- serialize the engine,then close everything down --/ 102 | *trt_model_stream = engine->serialize(); 103 | 104 | //------- 序列化engine保存 105 | if (do_engine) { 106 | 107 | // serialize Model 108 | // IHostMemory *trt_model_stream = engine->serialize(); 109 | std::string serialize_str; 110 | std::ofstream serialize_output_stream; 111 | serialize_str.resize((*trt_model_stream)->size()); 112 | memcpy((void*)serialize_str.data(), (*trt_model_stream)->data(), (*trt_model_stream)->size()); 113 | serialize_output_stream.open("./model/detr.trt"); 114 | serialize_output_stream << serialize_str; 115 | serialize_output_stream.close(); 116 | 117 | } 118 | 119 | parser->destroy(); 120 | engine->destroy(); 121 | network->destroy(); 122 | config->destroy(); 123 | builder->destroy(); 124 | }; 125 | 126 | //前处理 127 | void preprocess(cv::Mat& img, float dstdata_arr[]) { 128 | 129 | cv::Mat img_rgb; 130 | cv::cvtColor(img, img_rgb, cv::COLOR_BGR2RGB); 131 | cv::resize(img_rgb, img_rgb, cv::Size(INPUT_SIZE, INPUT_SIZE), cv::INTER_LINEAR); 132 | cv::Mat img_rgb_float; 133 | img_rgb.convertTo(img_rgb_float, CV_32FC3, 1 / 255.0); // 转float 归一化 134 | 135 | std::vector rgbChannels(3); 136 | std::vector dstdata; 137 | cv::split(img_rgb_float, rgbChannels); 138 | 139 | 140 | for (auto i = 0; i < rgbChannels.size(); i++) { 141 | std::vector data = std::vector(rgbChannels[i].reshape(1, 1)); 142 | 143 | for (int j = 0; j < data.size(); j++) { 144 | if (i == 0) { 145 | dstdata.push_back((data[j] - 0.485) / 0.229); 146 | } 147 | else if (i == 1) { 148 | dstdata.push_back((data[j] - 0.456) / 0.224); 149 | } 150 | else { 151 | dstdata.push_back((data[j] - 0.406) / 0.225); 152 | } 153 | } 154 | } 155 | 156 | std::copy(dstdata.begin(), dstdata.end(), dstdata_arr); 157 | 158 | // return dstdata_arr; 159 | } 160 | 161 | 162 | 163 | //后处理 164 | 165 | // 定义box 166 | struct Bbox { 167 | float xmin; 168 | float ymin; 169 | float xmax; 170 | float ymax; 171 | float score; 172 | int cid; 173 | }; 174 | 175 | // 把box画在图像上 176 | cv::Mat renderBoundingBox(cv::Mat image, const std::vector &bboxes) { 177 | for (auto it : bboxes) { 178 | float score = it.score; 179 | //std::cout << score; 180 | cv::rectangle(image, cv::Point(it.xmin, it.ymin), cv::Point(it.xmax, it.ymax), cv::Scalar(255, 204, 0), 2); 181 | std::string pred_class = class_names[it.cid]; 182 | std::string label_text = pred_class + ": " + std::to_string(score); 183 | cv::putText(image, label_text, cv::Point(it.xmin, it.ymin-10), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(0, 204, 255)); 184 | } 185 | return image; 186 | } 187 | 188 | 189 | // softmax 190 | template 191 | int softmax(const T* src, T* dst, int length) { 192 | const T alpha = *std::max_element(src, src + length); 193 | T denominator{ 0 }; 194 | 195 | for (int i = 0; i < length; ++i) { 196 | //dst[i] = std::exp(src[i] - alpha); 197 | dst[i] = std::exp(src[i]); 198 | 199 | denominator += dst[i]; 200 | } 201 | 202 | for (int i = 0; i < length; ++i) { 203 | dst[i] /= denominator; 204 | } 205 | 206 | return 0; 207 | } 208 | 209 | 210 | // 后处理 211 | vector postprocess(std::vector origin_output, const int &iw, const int &ih) { 212 | 213 | vector bboxes; 214 | Bbox bbox; 215 | 216 
| float* Logits = origin_output[0]; 217 | float* Boxes = origin_output[1]; 218 | 219 | for (int i = 0; i < NUM_QURREY; i++) { 220 | std::vector Probs; 221 | std::vector Boxes_wh; 222 | for (int j = 0; j < 22; j++) { 223 | Probs.push_back(Logits[i * 22 + j]); 224 | } 225 | 226 | int length = Probs.size(); 227 | std::vector dst(length); 228 | 229 | softmax(Probs.data(), dst.data(), length); 230 | 231 | auto maxPosition = std::max_element(dst.begin(), dst.end() - 1); 232 | //std::cout << maxPosition - dst.begin() << " | " << *maxPosition << std::endl; 233 | 234 | 235 | if (*maxPosition < PROB_THRESH) { 236 | Probs.clear(); 237 | Boxes_wh.clear(); 238 | continue; 239 | } 240 | else { 241 | bbox.score = *maxPosition; 242 | bbox.cid = maxPosition - dst.begin(); 243 | 244 | float cx = Boxes[i * 4]; 245 | float cy = Boxes[i * 4 + 1]; 246 | float cw = Boxes[i * 4 + 2]; 247 | float ch = Boxes[i * 4 + 3]; 248 | 249 | float x1 = (cx - 0.5 * cw) * iw; 250 | float y1 = (cy - 0.5 * ch) * ih; 251 | float x2 = (cx + 0.5 * cw) * iw; 252 | float y2 = (cy + 0.5 * ch) * ih; 253 | 254 | bbox.xmin = x1; 255 | bbox.ymin = y1; 256 | bbox.xmax = x2; 257 | bbox.ymax = y2; 258 | 259 | bboxes.push_back(bbox); 260 | 261 | Probs.clear(); 262 | Boxes_wh.clear(); 263 | } 264 | 265 | } 266 | return bboxes; 267 | 268 | } 269 | 270 | 271 | 272 | 273 | float h_input[INPUT_SIZE * INPUT_SIZE * 3]; //images 274 | float h_output_1[100 * 22]; //pred_logits 275 | float h_output_2[100 * 4]; //pred_boxes 276 | 277 | 278 | 279 | 280 | int main(int argc, char **argv) { 281 | 282 | std::string do_engine = argv[2]; 283 | 284 | // --initial a logger 285 | Logger g_logger_; 286 | nvinfer1::IHostMemory* trt_model_stream{ nullptr }; 287 | std::string onnx_file = "./model/detr_sim.onnx"; 288 | 289 | // --Pass the params recorded in ONNX_file to trt_model_stream --/ 290 | 291 | if (do_engine == "true") { 292 | 293 | onnxTotrt(onnx_file, &trt_model_stream, g_logger_); 294 | if (trt_model_stream == nullptr) 295 | { 296 | std::cerr << "Failed to load ONNX file " << std::endl; 297 | } 298 | 299 | // --deserialize the engine from the stream --- / 300 | nvinfer1::IRuntime* engine_runtime = nvinfer1::createInferRuntime(g_logger_); 301 | if (engine_runtime == nullptr) 302 | { 303 | std::cerr << "Failed to create TensorRT Runtime object." << std::endl; 304 | } 305 | 306 | // --load the infer engine -----/ 307 | nvinfer1::ICudaEngine* engine_infer = engine_runtime->deserializeCudaEngine(trt_model_stream->data(), trt_model_stream->size(), nullptr); 308 | if (engine_infer == nullptr) 309 | { 310 | std::cerr << "Failed to create TensorRT Engine." << std::endl; 311 | } 312 | nvinfer1::IExecutionContext* engine_context = engine_infer->createExecutionContext(); 313 | 314 | // --destroy stream ---/. 
315 | trt_model_stream->destroy(); 316 | std::cout << "loaded trt model , do inference" << std::endl; 317 | 318 | 319 | /////////////////////////////////////////////////////////////////// 320 | // enqueue them up 321 | ////////////////////////////////////////////////////////////////// 322 | 323 | // 加载数据,前处理 324 | 325 | cv::Mat image; 326 | image = cv::imread(argv[1], 1); 327 | 328 | // -- allocate host memory ------------/ 329 | preprocess(image, h_input); 330 | 331 | //申请显存指针 332 | //cudaMalloc的第一个参数传递的是存储在cpu内存中的指针变量的地址, 333 | //cudaMalloc在执行完成后,向这个地址中写入了一个地址值(此地址值是GPU显存里的) 334 | void* buffers[3]; 335 | cudaMalloc(&buffers[0], INPUT_SIZE * INPUT_SIZE * 3 * sizeof(float)); //<- inputs 336 | cudaMalloc(&buffers[1], 100 * 22 * sizeof(float)); //<- pred_logits 337 | cudaMalloc(&buffers[2], 100 * 4 * sizeof(float)); //<- pred_boxes 338 | 339 | 340 | 341 | // cudaMemcpy用于在主机(Host)和设备(Device)之间往返的传递数据,用法如下: 342 | 343 | // 主机到设备:cudaMemcpy(d_A,h_A,nBytes,cudaMemcpyHostToDevice) 344 | // 设备到主机:cudaMemcpy(h_A,d_A,nBytes,cudaMemcpyDeviceToHost) 345 | // 注意:该函数是同步执行函数,在未完成数据的转移操作之前会锁死并一直占有CPU进程的控制权,所以不用再添加cudaDeviceSynchronize()函数 346 | cudaMemcpy(buffers[0], h_input, INPUT_SIZE * INPUT_SIZE * 3 * sizeof(float), cudaMemcpyHostToDevice); 347 | 348 | // -- do execute --------// 349 | // int16_t, int32_t..., 等, 使用typedef facility定义特定大小intergers在不同的机器上, 并提供了代码可移植性。s 350 | int32_t BATCH_SIZE_ = 1; 351 | //engine_context->execute(BATCH_SIZE_, buffers); 352 | engine_context->executeV2(buffers); 353 | 354 | 355 | cudaMemcpy(h_output_1, buffers[1],100 * 22 * sizeof(float),cudaMemcpyDeviceToHost); 356 | cudaMemcpy(h_output_2, buffers[2], 100 * 4 * sizeof(float), cudaMemcpyDeviceToHost); 357 | 358 | 359 | 360 | 361 | std::cout << "开始打印TensorRT返回的结果:" << std::endl; 362 | std::vector output = { h_output_1 ,h_output_2 }; 363 | 364 | // 后处理 365 | vector bboxes = postprocess(output, image.cols, image.rows); 366 | 367 | cv::Mat showImage; 368 | showImage = renderBoundingBox(image, bboxes); 369 | cv::imwrite("res.jpg", showImage); 370 | 371 | 372 | cudaFree(buffers[0]); 373 | cudaFree(buffers[1]); 374 | cudaFree(buffers[2]); 375 | 376 | //engine_runtime->destroy(); 377 | //engine_infer->destroy(); 378 | 379 | 380 | } 381 | else { 382 | 383 | // 如果基于序列化的engine,直接在engine文件中反序列化 384 | nvinfer1::IRuntime* engine_runtime = nvinfer1::createInferRuntime(g_logger_); 385 | std::string cached_path = "./model/detr.trt"; 386 | std::ifstream fin(cached_path); 387 | std::string cached_engine = ""; 388 | while (fin.peek() != EOF) { 389 | std::stringstream buffer; 390 | buffer << fin.rdbuf(); 391 | cached_engine.append(buffer.str()); 392 | } 393 | fin.close(); 394 | nvinfer1::ICudaEngine* engine_infer = engine_runtime->deserializeCudaEngine(cached_engine.data(), cached_engine.size(), nullptr); 395 | int num_index = engine_infer->getNbBindings(); 396 | int input_index = engine_infer->getBindingIndex("inputs"); //1x3x800 X 800 397 | //std::string input_name = engine_infer->getBindingName(0) 398 | int output_index_1 = engine_infer->getBindingIndex("pred_logits"); 399 | int output_index_2 = engine_infer->getBindingIndex("pred_boxes"); 400 | 401 | nvinfer1::IExecutionContext* engine_context = engine_infer->createExecutionContext(); 402 | 403 | if (engine_context == nullptr) 404 | { 405 | std::cerr << "Failed to create TensorRT Execution Context." 
<< std::endl; 406 | } 407 | 408 | // cached_engine->destroy(); 409 | std::cout << "loaded trt model , do inference" << std::endl; 410 | 411 | 412 | /////////////////////////////////////////////////////////////////// 413 | // enqueue them up 414 | ////////////////////////////////////////////////////////////////// 415 | 416 | // 加载数据,前处理 417 | cv::Mat image; 418 | image = cv::imread(argv[1], 1); 419 | 420 | // -- allocate host memory ------------/ 421 | 422 | preprocess(image, h_input); 423 | //image.release(); 424 | 425 | 426 | //申请显存指针 427 | //cudaMalloc的第一个参数传递的是存储在cpu内存中的指针变量的地址, 428 | //cudaMalloc在执行完成后,向这个地址中写入了一个地址值(此地址值是GPU显存里的) 429 | void* buffers[3]; 430 | cudaMalloc(&buffers[0], INPUT_SIZE * INPUT_SIZE * 3 * sizeof(float)); //<- inputs 431 | cudaMalloc(&buffers[1], 100 * 22 * sizeof(float)); //<- pred_logits 432 | cudaMalloc(&buffers[2], 100 * 4 * sizeof(float)); //<- pred_boxes 433 | 434 | // cudaMemcpy用于在主机(Host)和设备(Device)之间往返的传递数据,用法如下: 435 | 436 | // 主机到设备:cudaMemcpy(d_A,h_A,nBytes,cudaMemcpyHostToDevice) 437 | // 设备到主机:cudaMemcpy(h_A,d_A,nBytes,cudaMemcpyDeviceToHost) 438 | // 注意:该函数是同步执行函数,在未完成数据的转移操作之前会锁死并一直占有CPU进程的控制权,所以不用再添加cudaDeviceSynchronize()函数 439 | cudaMemcpy(buffers[0], h_input, INPUT_SIZE * INPUT_SIZE * 3 * sizeof(float), cudaMemcpyHostToDevice); 440 | 441 | // -- do execute --------// 442 | // int16_t, int32_t..., 等, 使用typedef facility定义特定大小intergers在不同的机器上, 并提供了代码可移植性。s 443 | int32_t BATCH_SIZE_ = 1; 444 | //engine_context->execute(BATCH_SIZE_, buffers); 445 | engine_context->executeV2(buffers); 446 | 447 | 448 | cudaMemcpy(h_output_1, buffers[1], 100 * 22 * sizeof(float), cudaMemcpyDeviceToHost); 449 | cudaMemcpy(h_output_2, buffers[2], 100 * 4 * sizeof(float), cudaMemcpyDeviceToHost); 450 | 451 | std::cout << "开始打印TensorRT返回的结果:" << std::endl; 452 | std::vector output = { h_output_1 ,h_output_2 }; 453 | 454 | // 后处理 455 | vector bboxes = postprocess(output, image.cols, image.rows); 456 | 457 | std::cout << "后处理完成!" << std::endl; 458 | 459 | 460 | cv::Mat showImage; 461 | showImage = renderBoundingBox(image, bboxes); 462 | cv::imwrite("res.jpg", showImage); 463 | 464 | 465 | cudaFree(buffers[0]); 466 | cudaFree(buffers[1]); 467 | cudaFree(buffers[2]); 468 | 469 | 470 | 471 | //engine_runtime->destroy(); 472 | //engine_infer->destroy(); 473 | 474 | 475 | } 476 | 477 | // // cudaStreamDestroy(stream); 478 | 479 | 480 | return 0; 481 | } 482 | -------------------------------------------------------------------------------- /detr_pth2onnx.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | # ~~~Medcare AI Lab~~~ 17 | 18 | import io 19 | import sys 20 | import argparse 21 | 22 | import numpy as np 23 | import onnx 24 | import onnxruntime 25 | from onnxsim import simplify 26 | import onnx_graphsurgeon as gs 27 | 28 | import torch 29 | from model.misc import nested_tensor_from_tensor_list 30 | from model.hubconf import detr_resnet50 31 | 32 | 33 | class ONNXExporter: 34 | 35 | @classmethod 36 | def setUpClass(cls): 37 | torch.manual_seed(123) 38 | 39 | def run_model(self, model, onnx_path,inputs_list, dynamic_axes=False, tolerate_small_mismatch=False, do_constant_folding=True, 40 | output_names=None, input_names=None): 41 | model.eval() 42 | 43 | onnx_io = io.BytesIO() 44 | onnx_path = onnx_path 45 | 46 | torch.onnx.export(model, inputs_list[0], onnx_io, 47 | input_names=input_names, output_names=output_names,export_params=True,training=False,opset_version=12) 48 | torch.onnx.export(model, inputs_list[0], onnx_path, 49 | input_names=input_names, output_names=output_names,export_params=True,training=False,opset_version=12) 50 | 51 | print(f"[INFO] ONNX model export success! save path: {onnx_path}") 52 | 53 | # validate the exported model with onnx runtime 54 | for test_inputs in inputs_list: 55 | with torch.no_grad(): 56 | if isinstance(test_inputs, torch.Tensor) or isinstance(test_inputs, list): 57 | # test_inputs = (nested_tensor_from_tensor_list(test_inputs),) 58 | test_inputs = (test_inputs,) 59 | test_ouputs = model(*test_inputs) 60 | if isinstance(test_ouputs, torch.Tensor): 61 | test_ouputs = (test_ouputs,) 62 | self.ort_validate(onnx_io, test_inputs, test_ouputs, tolerate_small_mismatch) 63 | 64 | print("[INFO] Validate the exported model with onnx runtime success!") 65 | 66 | # dynamic_shape 67 | if dynamic_axes: 68 | # dynamic_axes = [int(ax) for ax in list(dynamic_axes)] 69 | torch.onnx.export(model, inputs_list[0], './detr_dynamic.onnx', dynamic_axes={input_names[0]: {0:'-1'},output_names[0]:{0:'-1'},output_names[1]:{0:'-1'}}, 70 | input_names=input_names, output_names=output_names, verbose=True, opset_version=12) 71 | 72 | print(f"[INFO] Dynamic Shape ONNX model export success! Dynamic shape:{dynamic_axes} save path: ./detr_dynamic.onnx") 73 | 74 | def ort_validate(self, onnx_io, inputs, outputs, tolerate_small_mismatch=False): 75 | 76 | inputs, _ = torch.jit._flatten(inputs) 77 | outputs, _ = torch.jit._flatten(outputs) 78 | 79 | def to_numpy(tensor): 80 | if tensor.requires_grad: 81 | return tensor.detach().cpu().numpy() 82 | else: 83 | return tensor.cpu().numpy() 84 | 85 | inputs = list(map(to_numpy, inputs)) 86 | outputs = list(map(to_numpy, outputs)) 87 | 88 | ort_session = onnxruntime.InferenceSession(onnx_io.getvalue()) 89 | # compute onnxruntime output prediction 90 | ort_inputs = dict((ort_session.get_inputs()[i].name, inpt) for i, inpt in enumerate(inputs)) 91 | ort_outs = ort_session.run(None, ort_inputs) 92 | for i in range(0, len(outputs)): 93 | try: 94 | torch.testing.assert_allclose(outputs[i], ort_outs[i], rtol=1e-03, atol=1e-05) 95 | except AssertionError as error: 96 | if tolerate_small_mismatch: 97 | self.assertIn("(0.00%)", str(error), str(error)) 98 | else: 99 | raise 100 | 101 | @staticmethod 102 | def check_onnx(onnx_path): 103 | model = onnx.load(onnx_path) 104 | onnx.checker.check_model(model) 105 | print(f"[INFO] ONNX model: {onnx_path} check success!") 106 | 107 | 108 | @staticmethod 109 | def onnx_change(onnx_path): 110 | '''该部分代码由导师提供,解决trt inference 全是0的问题,感谢!!! 
111 | ''' 112 | node_configs = [(2682,2684),(2775,2777),(2961,2963),(3333,3335),(4077,4079)] 113 | if 'batch_2' in onnx_path: 114 | node_number = node_configs[1] 115 | elif 'batch_4' in onnx_path: 116 | node_number = node_configs[2] 117 | elif 'batch_8' in onnx_path: 118 | node_number = node_configs[3] 119 | elif 'batch_16' in onnx_path: 120 | node_number = node_configs[4] 121 | else: 122 | node_number = node_configs[0] 123 | 124 | graph = gs.import_onnx(onnx.load(onnx_path)) 125 | for node in graph.nodes: 126 | if node.name == f"Gather_{node_number[0]}": 127 | print(node.inputs[1]) 128 | node.inputs[1].values = np.int64(5) 129 | print(node.inputs[1]) 130 | elif node.name == f"Gather_{node_number[1]}": 131 | print(node.inputs[1]) 132 | node.inputs[1].values = np.int64(5) 133 | print(node.inputs[1]) 134 | 135 | onnx.save(gs.export_onnx(graph),onnx_path) 136 | print(f"[INFO] onnx修改完成, 保存在{onnx_path}.") 137 | 138 | 139 | 140 | 141 | 142 | if __name__ == '__main__': 143 | 144 | parser = argparse.ArgumentParser(description='DETR Model to ONNX Model') 145 | parser.add_argument('--model_dir', type= str , default='./checkpoint/detr_resnet50.pth', help='DETR Pytorch Model Saved Dir') 146 | parser.add_argument('--dynamic_axes', action="store_true", help='Dynamic ONNX Model') 147 | parser.add_argument('--check', action="store_true", help='Check Your ONNX Model') 148 | parser.add_argument('--onnx_dir', type=str,default="./detr.onnx", help="Check ONNX Model's dir") 149 | parser.add_argument('--batch_size', type=int,default=1, help="Batch Size") 150 | 151 | 152 | args = parser.parse_args() 153 | 154 | # load torch model 155 | detr = detr_resnet50(pretrained=False,num_classes=20+1).eval() # max label index add 1 156 | state_dict = torch.load(args.model_dir,map_location='cuda') # pytorch model path 157 | detr.load_state_dict(state_dict["model"]) 158 | 159 | # input 160 | dummy_image = [torch.ones(args.batch_size, 3, 800, 800) ] 161 | 162 | # to onnx 163 | onnx_export = ONNXExporter() 164 | onnx_export.run_model(detr,args.onnx_dir, dummy_image,input_names=['inputs'],dynamic_axes=args.dynamic_axes, 165 | output_names=["pred_logits", "pred_boxes"],tolerate_small_mismatch=True) 166 | 167 | # check onnx model 168 | if args.check: 169 | ONNXExporter.check_onnx(args.onnx_dir) 170 | 171 | 172 | print('[INFO] Simplifying model...') 173 | model = onnx.load(args.onnx_dir) 174 | # simplifying dynamic model 175 | simplified_model, check = simplify(model, 176 | input_shapes={'inputs': [args.batch_size, 3, 800, 800]}, 177 | dynamic_input_shape=args.dynamic_axes) 178 | 179 | 180 | onnx.save(simplified_model,(args.onnx_dir[:-5]+"_sim.onnx")) 181 | 182 | # onnx change 183 | onnx_export.onnx_change(args.onnx_dir[:-5]+"_sim.onnx") 184 | 185 | 186 | # simplifer onnx 187 | # $ python3 -m onnxsim detr.onnx detr_sim.onnx 188 | # $ python3 -m onnxsim detr_dynamic.onnx detr_dynamic_sim.onnx 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | -------------------------------------------------------------------------------- /generate_batch_plan.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | # ~~~Medcare AI Lab~~~ 17 | 18 | import os 19 | import sys 20 | import time 21 | import cv2 22 | from PIL import Image 23 | import argparse 24 | 25 | import pycuda.driver as cuda 26 | import pycuda.autoinit 27 | import cupy as cp 28 | import numpy as np 29 | import tensorrt as trt 30 | 31 | 32 | from trt_util.common import allocate_buffers,do_inference_v2,build_engine_onnx 33 | from trt_util.process_img import preprocess_np,preprocess_torch_v1 34 | from trt_util.plot_box import plot_box, CLASSES 35 | 36 | TRT_LOGGER = trt.Logger(trt.Logger.INFO) 37 | 38 | 39 | def main(onnx_model_file,engine_file,fp16=False,batch_size=1): 40 | 41 | # Build a TensorRT engine. 42 | with build_engine_onnx(onnx_model_file,engine_file,FP16=fp16,batch_size=batch_size,verbose=False) as engine: 43 | inputs, outputs, bindings, stream = allocate_buffers(engine) 44 | # Contexts are used to perform inference. 45 | with engine.create_execution_context() as context: 46 | 47 | print("------Engine Infor:---------") 48 | print(engine.max_batch_size) 49 | print(engine.get_binding_shape(0)) 50 | print(engine.get_binding_shape(1)) 51 | print(engine.get_binding_shape(2)) 52 | 53 | print("------Context Infor:---------") 54 | print(context.get_binding_shape(0)) 55 | print(context.get_binding_shape(1)) 56 | print(context.get_binding_shape(1)) 57 | 58 | 59 | if __name__ == '__main__': 60 | 61 | parser = argparse.ArgumentParser(description='Create TensorRT Engine in FP32 ,FP16 Mode ') 62 | parser.add_argument('--model_dir', type= str , default='./output/detr_sim.onnx', help='ONNX Model Path') 63 | parser.add_argument('--engine_dir', type= str , default='./output/detr_batch_.plan', help='TensorRT Engine File') 64 | 65 | parser.add_argument('--fp16', action="store_true", help='Open FP16 Mode or Not, if True You Should Load FP16 Engine File') 66 | parser.add_argument('--batch_size', type=int , default=2, help='Batch size, static=2') 67 | 68 | 69 | args = parser.parse_args() 70 | 71 | main(args.model_dir,args.engine_dir,args.fp16,args.batch_size) 72 | -------------------------------------------------------------------------------- /inference_detr_onnx.py: -------------------------------------------------------------------------------- 1 | 2 | # 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | # ~~~Medcare AI Lab~~~ 18 | 19 | 20 | 21 | import cv2 22 | from PIL import Image 23 | import numpy as np 24 | import os 25 | import time 26 | 27 | # onnxruntime requires python 3.5 or above 28 | try: 29 | import onnxruntime 30 | except ImportError: 31 | onnxruntime = None 32 | 33 | import torch 34 | from torch import nn 35 | import torchvision.transforms as T 36 | 37 | torch.set_grad_enabled(False) 38 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 39 | print("[INFO] 当前使用{}做推断".format(device)) 40 | 41 | 42 | # 图像数据处理 43 | transform = T.Compose([ 44 | T.Resize((800,800)), # PIL.Image.BILINEAR 45 | T.ToTensor(), 46 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 47 | ]) 48 | 49 | # 将xywh转xyxy 50 | def box_cxcywh_to_xyxy(x): 51 | x = torch.from_numpy(x) 52 | x_c, y_c, w, h = x.unbind(1) 53 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 54 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 55 | return torch.stack(b, dim=1) 56 | 57 | # 将0-1映射到图像 58 | def rescale_bboxes(out_bbox, size): 59 | img_w, img_h = size 60 | b = box_cxcywh_to_xyxy(out_bbox) 61 | b = b.cpu().numpy() 62 | b = b * np.array([img_w, img_h, img_w, img_h], dtype=np.float32) 63 | return b 64 | 65 | # plot box by opencv 66 | def plot_result(pil_img, prob, boxes,save_name=None,imshow=False, imwrite=False): 67 | LABEL = ["NA","Class A","Class B","Class C","Class D","Class E","Class F", 68 | "Class G","Class H","Class I","Class J","Class K","Class L","Class M", 69 | "Class N","Class O","Class P","Class Q","Class R","Class S","Class T"] 70 | opencvImage = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR) 71 | 72 | if len(prob) == 0: 73 | print("[INFO] NO box detect !!! ") 74 | if imwrite: 75 | if not os.path.exists("./result/pred_no"): 76 | os.makedirs("./result/pred_no") 77 | cv2.imwrite(os.path.join("./result/pred_no",save_name),opencvImage) 78 | return 79 | 80 | for p, (xmin, ymin, xmax, ymax) in zip(prob, boxes): 81 | 82 | cl = p.argmax() 83 | if not cl in [6,7]: 84 | continue; 85 | label_text = '{}: {}%'.format(LABEL[cl],round(p[cl]*100,2)) 86 | 87 | cv2.rectangle(opencvImage, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (255, 255, 0), 2) 88 | cv2.putText(opencvImage, label_text,(int(xmin)+10, int(ymin)+30), cv2.FONT_HERSHEY_SIMPLEX, 1, 89 | (255, 255, 0), 2) 90 | 91 | if imshow: 92 | cv2.imshow('detect', opencvImage) 93 | cv2.waitKey(0) 94 | 95 | if imwrite: 96 | if not os.path.exists("./result/pred"): 97 | os.makedirs('./result/pred') 98 | cv2.imwrite('./result/pred/{}'.format(save_name), opencvImage) 99 | 100 | 101 | def detect_onnx(ort_session,im,prob_threshold=0.7): 102 | # compute onnxruntime output prediction 103 | # 前处理 104 | img = transform(im).unsqueeze(0).cpu().numpy() 105 | 106 | ort_inputs = {"inputs":img} 107 | start = time.time() 108 | scores,boxs = ort_session.run(None, ort_inputs) 109 | 110 | # 后处理 + 也可以加NMS 111 | probas = torch.from_numpy(np.array(scores)).softmax(-1)[0, :, :-1] 112 | keep = probas.max(-1).values > prob_threshold 113 | end = time.time() 114 | 115 | probas = probas.cpu().detach().numpy() 116 | keep = keep.cpu().detach().numpy() 117 | 118 | # convert boxes from [0; 1] to image scales 119 | bboxes_scaled = rescale_bboxes(boxs[0, keep], im.size) 120 | print(f"onnxruntime Time: {end-start}s") 121 | 122 | return probas[keep] ,bboxes_scaled 123 | 124 | 125 | 126 | if __name__ == "__main__": 127 | 128 | onnx_path = "./detr_dynamic_sim.onnx" 129 | ort_session = onnxruntime.InferenceSession(onnx_path) 130 | files = os.listdir("./test") 131 | 132 | for file in files: 133 | 
img_path = os.path.join("./test",file) 134 | im = Image.open(img_path) 135 | 136 | scores, boxes = detect_onnx(ort_session,im) 137 | 138 | print(scores) 139 | print(boxes) 140 | # plot_result(im, scores, boxes,save_name=file,imshow=False, imwrite=True) 141 | # print("[INFO] {} time: {} done!!!".format(file,None)) 142 | 143 | 144 | 145 | -------------------------------------------------------------------------------- /inference_detr_trt.py: -------------------------------------------------------------------------------- 1 | 2 | # 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # ~~~Medcare AI Lab~~~ 18 | 19 | import os 20 | import sys 21 | import time 22 | import cv2 23 | from PIL import Image 24 | import argparse 25 | 26 | import pycuda.driver as cuda 27 | import pycuda.autoinit 28 | import cupy as cp 29 | import numpy as np 30 | import tensorrt as trt 31 | 32 | from trt_util.common import allocate_buffers,do_inference_v2,build_engine_onnx 33 | from trt_util.process_img import preprocess_np,preprocess_torch_v1 34 | from trt_util.plot_box import plot_box, CLASSES 35 | 36 | TRT_LOGGER = trt.Logger(trt.Logger.INFO) 37 | 38 | 39 | 40 | def engine_infer(engine,context,inputs, outputs, bindings, stream,test_image): 41 | 42 | # image_input, img_raw, _ = preprocess_np(test_image) 43 | image_input, img_raw, _ = preprocess_torch_v1(test_image) 44 | inputs[0].host = image_input.astype(np.float32).ravel() # device-to-host-to-device copy; this path is replaced in the performance comparison 45 | 46 | start = time.time() 47 | scores,boxs = do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, input_tensor=image_input) 48 | print(f"[INFO] Inference time: {time.time()-start}s") 49 | 50 | # print(scores) 51 | # print(boxs) 52 | 53 | output_shapes = [(1,100,22), (1,100,4)] 54 | scores = scores.reshape(output_shapes[0]) 55 | boxs = boxs.reshape(output_shapes[1]) 56 | 57 | return scores,boxs,img_raw 58 | 59 | 60 | 61 | def main(onnx_model_file,engine_file,image_dir,fp16=False,int8=False,batch_size=1,dynamic=False): 62 | 63 | test_images = [test_image for test_image in os.listdir(image_dir)] 64 | 65 | if int8: 66 | # only load the plan engine file 67 | if not os.path.exists(engine_file): 68 | raise FileNotFoundError("[Error] INT8 mode requires an existing engine plan file. Please check!") 69 | with open(engine_file, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime: 70 | engine = runtime.deserialize_cuda_engine(f.read()) 71 | with engine.create_execution_context() as context: 72 | inputs, outputs, bindings, stream = allocate_buffers(engine) 73 | # print(dir(context)) 74 | 75 | if dynamic: 76 | context.active_optimization_profile = 0  # added for dynamic shape 77 | origin_inputshape=context.get_binding_shape(0) 78 | if origin_inputshape[0]==-1: 79 | origin_inputshape[0] = batch_size 80 | context.set_binding_shape(0,(origin_inputshape)) 81 | print(f"[INFO] INT8 mode. Dynamic: {dynamic}. 
Deserialize from: {engine_file}.") 82 | 83 | for test_image in test_images: 84 | 85 | scores,boxs, img_raw = engine_infer(engine,context,inputs, outputs, bindings, stream, os.path.join(image_dir,test_image)) 86 | 87 | print(f"[INFO] trt inference done. save result in: ./trt_infer_res/int8/{test_image}") 88 | if not os.path.exists("./trt_infer_res/int8"): 89 | os.makedirs("./trt_infer_res/int8") 90 | plot_box(img_raw, scores, boxs, prob_threshold=0.7, save_fig=os.path.join('./trt_infer_res/int8',test_image)) 91 | 92 | 93 | else: 94 | # Build a TensorRT engine. 95 | with build_engine_onnx(onnx_model_file,engine_file,FP16=fp16,verbose=False,dynamic_input=dynamic) as engine: 96 | inputs, outputs, bindings, stream = allocate_buffers(engine) 97 | # Contexts are used to perform inference. 98 | with engine.create_execution_context() as context: 99 | print(engine.get_binding_shape(0)) 100 | print(engine.get_binding_shape(1)) 101 | print(engine.get_binding_shape(2)) 102 | 103 | print(context.get_binding_shape(0)) 104 | print(context.get_binding_shape(1)) 105 | # Load a normalized test case into the host input page-locked buffer. 106 | if dynamic: 107 | context.active_optimization_profile = 0  # added for dynamic shape 108 | origin_inputshape=context.get_binding_shape(0) 109 | if origin_inputshape[0]==-1: 110 | origin_inputshape[0] = batch_size 111 | context.set_binding_shape(0,(origin_inputshape)) 112 | 113 | print(f"[INFO] FP16 mode is: {fp16}, Dynamic: {dynamic}. Deserialize from: {engine_file}.") 114 | 115 | for test_image in test_images: 116 | scores,boxs, img_raw = engine_infer(engine,context,inputs, outputs, bindings, stream, os.path.join(image_dir,test_image)) 117 | 118 | if fp16: 119 | save_dir = "./trt_infer_res/fp16" 120 | else: 121 | save_dir = "./trt_infer_res/fp32" 122 | 123 | print(f"[INFO] trt inference done. 
save result in: {save_dir}/{test_image}") 124 | if not os.path.exists(save_dir): 125 | os.makedirs(save_dir) 126 | plot_box(img_raw, scores, boxs, prob_threshold=0.7, save_fig=os.path.join(save_dir,test_image)) 127 | 128 | 129 | 130 | 131 | if __name__ == '__main__': 132 | 133 | parser = argparse.ArgumentParser(description='Inference by TensorRT in FP32, FP16 or INT8 mode.') 134 | parser.add_argument('--model_dir', type=str, default='./detr_sim.onnx', help='ONNX Model Path') 135 | parser.add_argument('--engine_dir', type=str, default='./detr.plan', help='TensorRT Engine File') 136 | parser.add_argument('--image_dir', type=str, default="./test", help='Test Image Dir') 137 | 138 | parser.add_argument('--fp16', action="store_true", help='Enable FP16 mode; if set, load an FP16 engine file') 139 | parser.add_argument('--int8', action="store_true", help='Enable INT8 mode; if set, load an INT8 engine file') 140 | parser.add_argument('--batch_size', type=int, default=1, help='Batch size, static=1') 141 | parser.add_argument('--dynamic', action="store_true", help='Use dynamic shapes for TensorRT inference') 142 | 143 | 144 | args = parser.parse_args() 145 | 146 | main(args.model_dir,args.engine_dir,args.image_dir,args.fp16,args.int8,args.batch_size,args.dynamic) 147 | 148 | 149 | -------------------------------------------------------------------------------- /model/README.md: -------------------------------------------------------------------------------- 1 | Note: the code in this folder comes from Facebook's open-source DETR project: https://github.com/facebookresearch/detr 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | from .detr import build 3 | 4 | 5 | def build_model(args): 6 | return build(args) 7 | -------------------------------------------------------------------------------- /model/backbone.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Backbone modules. 4 | """ 5 | from collections import OrderedDict 6 | 7 | import torch 8 | import torch.nn.functional as F 9 | import torchvision 10 | from torch import nn 11 | from torchvision.models._utils import IntermediateLayerGetter 12 | from typing import Dict, List 13 | 14 | from .misc import NestedTensor, is_main_process 15 | 16 | from .position_encoding import build_position_encoding 17 | 18 | 19 | class FrozenBatchNorm2d(torch.nn.Module): 20 | """ 21 | BatchNorm2d where the batch statistics and the affine parameters are fixed. 22 | 23 | Copy-paste from torchvision.misc.ops with added eps before rsqrt, 24 | without which any other models than torchvision.models.resnet[18,34,50,101] 25 | produce nans. 
26 | """ 27 | 28 | def __init__(self, n): 29 | super(FrozenBatchNorm2d, self).__init__() 30 | self.register_buffer("weight", torch.ones(n)) 31 | self.register_buffer("bias", torch.zeros(n)) 32 | self.register_buffer("running_mean", torch.zeros(n)) 33 | self.register_buffer("running_var", torch.ones(n)) 34 | 35 | def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, 36 | missing_keys, unexpected_keys, error_msgs): 37 | num_batches_tracked_key = prefix + 'num_batches_tracked' 38 | if num_batches_tracked_key in state_dict: 39 | del state_dict[num_batches_tracked_key] 40 | 41 | super(FrozenBatchNorm2d, self)._load_from_state_dict( 42 | state_dict, prefix, local_metadata, strict, 43 | missing_keys, unexpected_keys, error_msgs) 44 | 45 | def forward(self, x): 46 | # move reshapes to the beginning 47 | # to make it fuser-friendly 48 | w = self.weight.reshape(1, -1, 1, 1) 49 | b = self.bias.reshape(1, -1, 1, 1) 50 | rv = self.running_var.reshape(1, -1, 1, 1) 51 | rm = self.running_mean.reshape(1, -1, 1, 1) 52 | eps = 1e-5 53 | scale = w * (rv + eps).rsqrt() 54 | bias = b - rm * scale 55 | return x * scale + bias 56 | 57 | 58 | class BackboneBase(nn.Module): 59 | 60 | def __init__(self, backbone: nn.Module, train_backbone: bool, num_channels: int, return_interm_layers: bool): 61 | super().__init__() 62 | for name, parameter in backbone.named_parameters(): 63 | if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name: 64 | parameter.requires_grad_(False) 65 | if return_interm_layers: 66 | return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"} 67 | else: 68 | return_layers = {'layer4': "0"} 69 | self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) 70 | self.num_channels = num_channels 71 | 72 | def forward(self, tensor_list: NestedTensor): 73 | xs = self.body(tensor_list.tensors) 74 | # out: Dict[str, NestedTensor] = {} 75 | out = {} 76 | for name, x in xs.items(): 77 | m = tensor_list.mask 78 | assert m is not None 79 | mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] 80 | out[name] = NestedTensor(x, mask) 81 | return out 82 | 83 | 84 | class Backbone(BackboneBase): 85 | """ResNet backbone with frozen BatchNorm.""" 86 | def __init__(self, name: str, 87 | train_backbone: bool, 88 | return_interm_layers: bool, 89 | dilation: bool): 90 | backbone = getattr(torchvision.models, name)( 91 | replace_stride_with_dilation=[False, False, dilation], 92 | pretrained=is_main_process(), norm_layer=FrozenBatchNorm2d) 93 | num_channels = 512 if name in ('resnet18', 'resnet34') else 2048 94 | super().__init__(backbone, train_backbone, num_channels, return_interm_layers) 95 | 96 | 97 | class Joiner(nn.Sequential): 98 | def __init__(self, backbone, position_embedding): 99 | super().__init__(backbone, position_embedding) 100 | 101 | def forward(self, tensor_list: NestedTensor): 102 | xs = self[0](tensor_list) 103 | # out: List[NestedTensor] = [] 104 | out = [] 105 | 106 | pos = [] 107 | for name, x in xs.items(): 108 | out.append(x) 109 | # position encoding 110 | pos.append(self[1](x).to(x.tensors.dtype)) 111 | 112 | return out, pos 113 | 114 | 115 | def build_backbone(args): 116 | position_embedding = build_position_encoding(args) 117 | train_backbone = args.lr_backbone > 0 118 | return_interm_layers = args.masks 119 | backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation) 120 | model = Joiner(backbone, position_embedding) 121 | model.num_channels = 
backbone.num_channels 122 | return model 123 | -------------------------------------------------------------------------------- /model/box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Utilities for bounding box manipulation and GIoU. 4 | """ 5 | import torch 6 | from torchvision.ops.boxes import box_area 7 | 8 | 9 | def box_cxcywh_to_xyxy(x): 10 | x_c, y_c, w, h = x.unbind(-1) 11 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 12 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 13 | return torch.stack(b, dim=-1) 14 | 15 | 16 | def box_xyxy_to_cxcywh(x): 17 | x0, y0, x1, y1 = x.unbind(-1) 18 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 19 | (x1 - x0), (y1 - y0)] 20 | return torch.stack(b, dim=-1) 21 | 22 | 23 | # modified from torchvision to also return the union 24 | def box_iou(boxes1, boxes2): 25 | area1 = box_area(boxes1) 26 | area2 = box_area(boxes2) 27 | 28 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 29 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 30 | 31 | wh = (rb - lt).clamp(min=0) # [N,M,2] 32 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 33 | 34 | union = area1[:, None] + area2 - inter 35 | 36 | iou = inter / union 37 | return iou, union 38 | 39 | 40 | def generalized_box_iou(boxes1, boxes2): 41 | """ 42 | Generalized IoU from https://giou.stanford.edu/ 43 | 44 | The boxes should be in [x0, y0, x1, y1] format 45 | 46 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 47 | and M = len(boxes2) 48 | """ 49 | # degenerate boxes gives inf / nan results 50 | # so do an early check 51 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 52 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 53 | iou, union = box_iou(boxes1, boxes2) 54 | 55 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 56 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 57 | 58 | wh = (rb - lt).clamp(min=0) # [N,M,2] 59 | area = wh[:, :, 0] * wh[:, :, 1] 60 | 61 | return iou - (area - union) / area 62 | 63 | 64 | def masks_to_boxes(masks): 65 | """Compute the bounding boxes around the provided masks 66 | 67 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 68 | 69 | Returns a [N, 4] tensors, with the boxes in xyxy format 70 | """ 71 | if masks.numel() == 0: 72 | return torch.zeros((0, 4), device=masks.device) 73 | 74 | h, w = masks.shape[-2:] 75 | 76 | y = torch.arange(0, h, dtype=torch.float) 77 | x = torch.arange(0, w, dtype=torch.float) 78 | y, x = torch.meshgrid(y, x) 79 | 80 | x_mask = (masks * x.unsqueeze(0)) 81 | x_max = x_mask.flatten(1).max(-1)[0] 82 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 83 | 84 | y_mask = (masks * y.unsqueeze(0)) 85 | y_max = y_mask.flatten(1).max(-1)[0] 86 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 87 | 88 | return torch.stack([x_min, y_min, x_max, y_max], 1) 89 | -------------------------------------------------------------------------------- /model/detr.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | DETR model and criterion classes. 4 | """ 5 | import torch 6 | import torch.nn.functional as F 7 | from torch import nn 8 | 9 | from . 
import box_ops 10 | from .misc import (NestedTensor, nested_tensor_from_tensor_list, 11 | accuracy, get_world_size, interpolate, 12 | is_dist_avail_and_initialized) 13 | 14 | from .backbone import build_backbone 15 | from .matcher import build_matcher 16 | from .segmentation import (DETRsegm, PostProcessPanoptic, PostProcessSegm, 17 | dice_loss, sigmoid_focal_loss) 18 | from .transformer import build_transformer 19 | 20 | 21 | class DETR(nn.Module): 22 | """ This is the DETR module that performs object detection """ 23 | def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=False): 24 | """ Initializes the model. 25 | Parameters: 26 | backbone: torch module of the backbone to be used. See backbone.py 27 | transformer: torch module of the transformer architecture. See transformer.py 28 | num_classes: number of object classes 29 | num_queries: number of object queries, ie detection slot. This is the maximal number of objects 30 | DETR can detect in a single image. For COCO, we recommend 100 queries. 31 | aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. 32 | """ 33 | super().__init__() 34 | self.num_queries = num_queries 35 | self.transformer = transformer 36 | hidden_dim = transformer.d_model 37 | self.class_embed = nn.Linear(hidden_dim, num_classes + 1) 38 | self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3) 39 | self.query_embed = nn.Embedding(num_queries, hidden_dim) 40 | self.input_proj = nn.Conv2d(backbone.num_channels, hidden_dim, kernel_size=1) 41 | self.backbone = backbone 42 | self.aux_loss = aux_loss 43 | 44 | def forward(self, samples: NestedTensor): 45 | """ The forward expects a NestedTensor, which consists of: 46 | - samples.tensor: batched images, of shape [batch_size x 3 x H x W] 47 | - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels 48 | 49 | It returns a dict with the following elements: 50 | - "pred_logits": the classification logits (including no-object) for all queries. 51 | Shape= [batch_size x num_queries x (num_classes + 1)] 52 | - "pred_boxes": The normalized boxes coordinates for all queries, represented as 53 | (center_x, center_y, height, width). These values are normalized in [0, 1], 54 | relative to the size of each individual image (disregarding possible padding). 55 | See PostProcess for information on how to retrieve the unnormalized bounding box. 56 | - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of 57 | dictionnaries containing the two above keys for each decoder layer. 58 | """ 59 | if isinstance(samples, (list, torch.Tensor)): 60 | samples = nested_tensor_from_tensor_list(samples) 61 | features, pos = self.backbone(samples) 62 | 63 | src, mask = features[-1].decompose() 64 | assert mask is not None 65 | hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[-1])[0] 66 | 67 | outputs_class = self.class_embed(hs) 68 | outputs_coord = self.bbox_embed(hs).sigmoid() 69 | out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]} 70 | if self.aux_loss: 71 | out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord) 72 | return out 73 | 74 | @torch.jit.unused 75 | def _set_aux_loss(self, outputs_class, outputs_coord): 76 | # this is a workaround to make torchscript happy, as torchscript 77 | # doesn't support dictionary with non-homogeneous values, such 78 | # as a dict having both a Tensor and a list. 
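# Each dict below pairs the class logits and box predictions of one intermediate decoder layer;
# the final layer's predictions are returned separately as 'pred_logits' / 'pred_boxes' in forward() above.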
79 | return [{'pred_logits': a, 'pred_boxes': b} 80 | for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] 81 | 82 | 83 | class SetCriterion(nn.Module): 84 | """ This class computes the loss for DETR. 85 | The process happens in two steps: 86 | 1) we compute hungarian assignment between ground truth boxes and the outputs of the model 87 | 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) 88 | """ 89 | def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses): 90 | """ Create the criterion. 91 | Parameters: 92 | num_classes: number of object categories, omitting the special no-object category 93 | matcher: module able to compute a matching between targets and proposals 94 | weight_dict: dict containing as key the names of the losses and as values their relative weight. 95 | eos_coef: relative classification weight applied to the no-object category 96 | losses: list of all the losses to be applied. See get_loss for list of available losses. 97 | """ 98 | super().__init__() 99 | self.num_classes = num_classes 100 | self.matcher = matcher 101 | self.weight_dict = weight_dict 102 | self.eos_coef = eos_coef 103 | self.losses = losses 104 | empty_weight = torch.ones(self.num_classes + 1) 105 | empty_weight[-1] = self.eos_coef 106 | self.register_buffer('empty_weight', empty_weight) 107 | 108 | def loss_labels(self, outputs, targets, indices, num_boxes, log=True): 109 | """Classification loss (NLL) 110 | targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] 111 | """ 112 | assert 'pred_logits' in outputs 113 | src_logits = outputs['pred_logits'] 114 | 115 | idx = self._get_src_permutation_idx(indices) 116 | target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) 117 | target_classes = torch.full(src_logits.shape[:2], self.num_classes, 118 | dtype=torch.int64, device=src_logits.device) 119 | target_classes[idx] = target_classes_o 120 | 121 | loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight) 122 | losses = {'loss_ce': loss_ce} 123 | 124 | if log: 125 | # TODO this should probably be a separate loss, not hacked in this one here 126 | losses['class_error'] = 100 - accuracy(src_logits[idx], target_classes_o)[0] 127 | return losses 128 | 129 | @torch.no_grad() 130 | def loss_cardinality(self, outputs, targets, indices, num_boxes): 131 | """ Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes 132 | This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients 133 | """ 134 | pred_logits = outputs['pred_logits'] 135 | device = pred_logits.device 136 | tgt_lengths = torch.as_tensor([len(v["labels"]) for v in targets], device=device) 137 | # Count the number of predictions that are NOT "no-object" (which is the last class) 138 | card_pred = (pred_logits.argmax(-1) != pred_logits.shape[-1] - 1).sum(1) 139 | card_err = F.l1_loss(card_pred.float(), tgt_lengths.float()) 140 | losses = {'cardinality_error': card_err} 141 | return losses 142 | 143 | def loss_boxes(self, outputs, targets, indices, num_boxes): 144 | """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss 145 | targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4] 146 | The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size. 
147 | """ 148 | assert 'pred_boxes' in outputs 149 | idx = self._get_src_permutation_idx(indices) 150 | src_boxes = outputs['pred_boxes'][idx] 151 | target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0) 152 | 153 | loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none') 154 | 155 | losses = {} 156 | losses['loss_bbox'] = loss_bbox.sum() / num_boxes 157 | 158 | loss_giou = 1 - torch.diag(box_ops.generalized_box_iou( 159 | box_ops.box_cxcywh_to_xyxy(src_boxes), 160 | box_ops.box_cxcywh_to_xyxy(target_boxes))) 161 | losses['loss_giou'] = loss_giou.sum() / num_boxes 162 | return losses 163 | 164 | def loss_masks(self, outputs, targets, indices, num_boxes): 165 | """Compute the losses related to the masks: the focal loss and the dice loss. 166 | targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w] 167 | """ 168 | assert "pred_masks" in outputs 169 | 170 | src_idx = self._get_src_permutation_idx(indices) 171 | tgt_idx = self._get_tgt_permutation_idx(indices) 172 | src_masks = outputs["pred_masks"] 173 | src_masks = src_masks[src_idx] 174 | masks = [t["masks"] for t in targets] 175 | # TODO use valid to mask invalid areas due to padding in loss 176 | target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() 177 | target_masks = target_masks.to(src_masks) 178 | target_masks = target_masks[tgt_idx] 179 | 180 | # upsample predictions to the target size 181 | src_masks = interpolate(src_masks[:, None], size=target_masks.shape[-2:], 182 | mode="bilinear", align_corners=False) 183 | src_masks = src_masks[:, 0].flatten(1) 184 | 185 | target_masks = target_masks.flatten(1) 186 | target_masks = target_masks.view(src_masks.shape) 187 | losses = { 188 | "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes), 189 | "loss_dice": dice_loss(src_masks, target_masks, num_boxes), 190 | } 191 | return losses 192 | 193 | def _get_src_permutation_idx(self, indices): 194 | # permute predictions following indices 195 | batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) 196 | src_idx = torch.cat([src for (src, _) in indices]) 197 | return batch_idx, src_idx 198 | 199 | def _get_tgt_permutation_idx(self, indices): 200 | # permute targets following indices 201 | batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) 202 | tgt_idx = torch.cat([tgt for (_, tgt) in indices]) 203 | return batch_idx, tgt_idx 204 | 205 | def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs): 206 | loss_map = { 207 | 'labels': self.loss_labels, 208 | 'cardinality': self.loss_cardinality, 209 | 'boxes': self.loss_boxes, 210 | 'masks': self.loss_masks 211 | } 212 | assert loss in loss_map, 'do you really want to compute {} loss?'.format(loss) 213 | return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs) 214 | 215 | def forward(self, outputs, targets): 216 | """ This performs the loss computation. 217 | Parameters: 218 | outputs: dict of tensors, see the output specification of the model for the format 219 | targets: list of dicts, such that len(targets) == batch_size. 
220 | The expected keys in each dict depends on the losses applied, see each loss' doc 221 | """ 222 | outputs_without_aux = {k: v for k, v in outputs.items() if k != 'aux_outputs'} 223 | 224 | # Retrieve the matching between the outputs of the last layer and the targets 225 | indices = self.matcher(outputs_without_aux, targets) 226 | 227 | # Compute the average number of target boxes accross all nodes, for normalization purposes 228 | num_boxes = sum(len(t["labels"]) for t in targets) 229 | num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) 230 | if is_dist_avail_and_initialized(): 231 | torch.distributed.all_reduce(num_boxes) 232 | num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item() 233 | 234 | # Compute all the requested losses 235 | losses = {} 236 | for loss in self.losses: 237 | losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) 238 | 239 | # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. 240 | if 'aux_outputs' in outputs: 241 | for i, aux_outputs in enumerate(outputs['aux_outputs']): 242 | indices = self.matcher(aux_outputs, targets) 243 | for loss in self.losses: 244 | if loss == 'masks': 245 | # Intermediate masks losses are too costly to compute, we ignore them. 246 | continue 247 | kwargs = {} 248 | if loss == 'labels': 249 | # Logging is enabled only for the last layer 250 | kwargs = {'log': False} 251 | l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs) 252 | l_dict = {k + '_{}'.format(i): v for k, v in l_dict.items()} 253 | losses.update(l_dict) 254 | 255 | return losses 256 | 257 | 258 | class PostProcess(nn.Module): 259 | """ This module converts the model's output into the format expected by the coco api""" 260 | @torch.no_grad() 261 | def forward(self, outputs, target_sizes): 262 | """ Perform the computation 263 | Parameters: 264 | outputs: raw outputs of the model 265 | target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch 266 | For evaluation, this must be the original image size (before any data augmentation) 267 | For visualization, this should be the image size after data augment, but before padding 268 | """ 269 | out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes'] 270 | 271 | assert len(out_logits) == len(target_sizes) 272 | assert target_sizes.shape[1] == 2 273 | 274 | prob = F.softmax(out_logits, -1) 275 | scores, labels = prob[..., :-1].max(-1) 276 | 277 | # convert to [x0, y0, x1, y1] format 278 | boxes = box_ops.box_cxcywh_to_xyxy(out_bbox) 279 | # and from relative [0, 1] to absolute [0, height] coordinates 280 | img_h, img_w = target_sizes.unbind(1) 281 | scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) 282 | boxes = boxes * scale_fct[:, None, :] 283 | 284 | results = [{'scores': s, 'labels': l, 'boxes': b} for s, l, b in zip(scores, labels, boxes)] 285 | 286 | return results 287 | 288 | 289 | class MLP(nn.Module): 290 | """ Very simple multi-layer perceptron (also called FFN)""" 291 | 292 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 293 | super().__init__() 294 | self.num_layers = num_layers 295 | h = [hidden_dim] * (num_layers - 1) 296 | self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) 297 | 298 | def forward(self, x): 299 | for i, layer in enumerate(self.layers): 300 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 301 | return x 
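# --- Illustrative usage sketch (not part of the original DETR code) ---
# PostProcess above turns raw DETR outputs into absolute-coordinate boxes. The helper below is a
# hypothetical, self-contained example with dummy tensors whose shapes follow the docstrings above;
# it sketches how the module is typically called and is not an API that ships with this repository.
def _postprocess_usage_sketch():
    logits = torch.randn(2, 100, 92)    # [batch_size, num_queries, num_classes + 1]
    boxes = torch.rand(2, 100, 4)       # normalized (center_x, center_y, w, h)
    outputs = {'pred_logits': logits, 'pred_boxes': boxes}
    target_sizes = torch.tensor([[480, 640], [600, 800]])   # original [height, width] of each image
    results = PostProcess()(outputs, target_sizes)
    # Each entry holds 'scores', 'labels' and 'boxes' in absolute [x0, y0, x1, y1] pixel coordinates.
    return results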
302 | 303 | 304 | # def build(args): 305 | # num_classes = 20 if args.dataset_file != 'coco' else 91 306 | # if args.dataset_file == "coco_panoptic": 307 | # num_classes = 250 308 | # device = torch.device(args.device) 309 | 310 | # backbone = build_backbone(args) 311 | 312 | # transformer = build_transformer(args) 313 | 314 | # model = DETR( 315 | # backbone, 316 | # transformer, 317 | # num_classes=num_classes, 318 | # num_queries=args.num_queries, 319 | # aux_loss=args.aux_loss, 320 | # ) 321 | # if args.masks: 322 | # model = DETRsegm(model, freeze_detr=(args.frozen_weights is not None)) 323 | # matcher = build_matcher(args) 324 | # weight_dict = {'loss_ce': 1, 'loss_bbox': args.bbox_loss_coef} 325 | # weight_dict['loss_giou'] = args.giou_loss_coef 326 | # if args.masks: 327 | # weight_dict["loss_mask"] = args.mask_loss_coef 328 | # weight_dict["loss_dice"] = args.dice_loss_coef 329 | # # TODO this is a hack 330 | # if args.aux_loss: 331 | # aux_weight_dict = {} 332 | # for i in range(args.dec_layers - 1): 333 | # aux_weight_dict.update({k + '_{}'.format(i): v for k, v in weight_dict.items()}) 334 | # weight_dict.update(aux_weight_dict) 335 | 336 | # losses = ['labels', 'boxes', 'cardinality'] 337 | # if args.masks: 338 | # losses += ["masks"] 339 | # criterion = SetCriterion(num_classes, matcher=matcher, weight_dict=weight_dict, 340 | # eos_coef=args.eos_coef, losses=losses) 341 | # criterion.to(device) 342 | # postprocessors = {'bbox': PostProcess()} 343 | # if args.masks: 344 | # postprocessors['segm'] = PostProcessSegm() 345 | # if args.dataset_file == "coco_panoptic": 346 | # is_thing_map = {i: i <= 90 for i in range(201)} 347 | # postprocessors["panoptic"] = PostProcessPanoptic(is_thing_map, threshold=0.85) 348 | 349 | # return model, criterion, postprocessors 350 | 351 | 352 | def build(args): 353 | num_classes = 3 + 1 # <--------------- 354 | if args.dataset_file == "coco_panoptic": # 全景分割 355 | num_classes = 3 + 1 # <------------- 356 | device = torch.device(args.device) 357 | 358 | backbone = build_backbone(args) 359 | 360 | transformer = build_transformer(args) 361 | 362 | model = DETR( 363 | backbone, 364 | transformer, 365 | num_classes=num_classes, 366 | num_queries=args.num_queries, 367 | aux_loss=args.aux_loss, 368 | ) 369 | if args.masks: 370 | model = DETRsegm(model, freeze_detr=(args.frozen_weights is not None)) 371 | matcher = build_matcher(args) 372 | weight_dict = {'loss_ce': 1, 'loss_bbox': args.bbox_loss_coef} 373 | weight_dict['loss_giou'] = args.giou_loss_coef 374 | if args.masks: 375 | weight_dict["loss_mask"] = args.mask_loss_coef 376 | weight_dict["loss_dice"] = args.dice_loss_coef 377 | # TODO this is a hack 378 | if args.aux_loss: 379 | aux_weight_dict = {} 380 | for i in range(args.dec_layers - 1): 381 | aux_weight_dict.update({k + '_{}'.format(i): v for k, v in weight_dict.items()}) 382 | weight_dict.update(aux_weight_dict) 383 | 384 | losses = ['labels', 'boxes', 'cardinality'] 385 | if args.masks: 386 | losses += ["masks"] 387 | criterion = SetCriterion(num_classes, matcher=matcher, weight_dict=weight_dict, 388 | eos_coef=args.eos_coef, losses=losses) 389 | criterion.to(device) 390 | postprocessors = {'bbox': PostProcess()} 391 | if args.masks: 392 | postprocessors['segm'] = PostProcessSegm() 393 | if args.dataset_file == "coco_panoptic": 394 | is_thing_map = {i: i <= 90 for i in range(201)} 395 | postprocessors["panoptic"] = PostProcessPanoptic(is_thing_map, threshold=0.85) 396 | 397 | return model, criterion, postprocessors 398 | 
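# --- Illustrative ONNX export sketch (not part of the original detr.py) ---
# The actual conversion in this repository is done by detr_pth2onnx.py; the sketch below only shows
# the general idea, assuming `model` is a built DETR instance with its trained checkpoint already
# loaded, and reuses the torch/nn imports at the top of this file. The wrapper returns a tuple
# because plain tensors export more predictably than DETR's dict output. Names such as
# DETROnnxWrapper and export_detr_onnx are hypothetical, not part of the repo's API.
class DETROnnxWrapper(nn.Module):
    def __init__(self, detr_model):
        super().__init__()
        self.detr_model = detr_model

    def forward(self, inputs):
        out = self.detr_model(inputs)
        return out['pred_logits'], out['pred_boxes']


def export_detr_onnx(model, onnx_path="detr.onnx", height=800, width=800):
    wrapper = DETROnnxWrapper(model).eval()
    dummy = torch.randn(1, 3, height, width)
    torch.onnx.export(
        wrapper, dummy, onnx_path,
        input_names=["inputs"],                       # same feed name used by inference_detr_onnx.py
        output_names=["pred_logits", "pred_boxes"],
        opset_version=12,
        dynamic_axes={"inputs": {0: "batch_size"}},   # only needed for the dynamic-shape engines
    )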
-------------------------------------------------------------------------------- /model/hubconf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | 3 | import torch 4 | 5 | from .backbone import Backbone, Joiner 6 | from .detr import DETR, PostProcess 7 | from .position_encoding import PositionEmbeddingSine 8 | from .segmentation import DETRsegm, PostProcessPanoptic 9 | from .transformer import Transformer 10 | 11 | dependencies = ["torch", "torchvision"] 12 | 13 | 14 | def _make_detr(backbone_name: str, dilation=False, num_classes=91, mask=False): 15 | hidden_dim = 256 16 | backbone = Backbone(backbone_name, train_backbone=True, return_interm_layers=mask, dilation=dilation) 17 | pos_enc = PositionEmbeddingSine(hidden_dim // 2, normalize=True) 18 | backbone_with_pos_enc = Joiner(backbone, pos_enc) 19 | backbone_with_pos_enc.num_channels = backbone.num_channels 20 | transformer = Transformer(d_model=hidden_dim, return_intermediate_dec=True) 21 | detr = DETR(backbone_with_pos_enc, transformer, num_classes=num_classes, num_queries=100) 22 | if mask: 23 | return DETRsegm(detr) 24 | return detr 25 | 26 | 27 | def detr_resnet50(pretrained=False, num_classes=91, return_postprocessor=False): 28 | """ 29 | DETR R50 with 6 encoder and 6 decoder layers. 30 | 31 | Achieves 42/62.4 AP/AP50 on COCO val5k. 32 | """ 33 | model = _make_detr("resnet50", dilation=False, num_classes=num_classes) 34 | if pretrained: 35 | checkpoint = torch.hub.load_state_dict_from_url( 36 | url="https://dl.fbaipublicfiles.com/detr/detr-r50-e632da11.pth", map_location="cpu", check_hash=True 37 | ) 38 | model.load_state_dict(checkpoint["model"]) 39 | if return_postprocessor: 40 | return model, PostProcess() 41 | return model 42 | 43 | 44 | def detr_resnet50_dc5(pretrained=False, num_classes=91, return_postprocessor=False): 45 | """ 46 | DETR-DC5 R50 with 6 encoder and 6 decoder layers. 47 | 48 | The last block of ResNet-50 has dilation to increase 49 | output resolution. 50 | Achieves 43.3/63.1 AP/AP50 on COCO val5k. 51 | """ 52 | model = _make_detr("resnet50", dilation=True, num_classes=num_classes) 53 | if pretrained: 54 | checkpoint = torch.hub.load_state_dict_from_url( 55 | url="https://dl.fbaipublicfiles.com/detr/detr-r50-dc5-f0fb7ef5.pth", map_location="cpu", check_hash=True 56 | ) 57 | model.load_state_dict(checkpoint["model"]) 58 | if return_postprocessor: 59 | return model, PostProcess() 60 | return model 61 | 62 | 63 | def detr_resnet101(pretrained=False, num_classes=91, return_postprocessor=False): 64 | """ 65 | DETR-DC5 R101 with 6 encoder and 6 decoder layers. 66 | 67 | Achieves 43.5/63.8 AP/AP50 on COCO val5k. 68 | """ 69 | model = _make_detr("resnet101", dilation=False, num_classes=num_classes) 70 | if pretrained: 71 | checkpoint = torch.hub.load_state_dict_from_url( 72 | url="https://dl.fbaipublicfiles.com/detr/detr-r101-2c7b67e5.pth", map_location="cpu", check_hash=True 73 | ) 74 | model.load_state_dict(checkpoint["model"]) 75 | if return_postprocessor: 76 | return model, PostProcess() 77 | return model 78 | 79 | 80 | def detr_resnet101_dc5(pretrained=False, num_classes=91, return_postprocessor=False): 81 | """ 82 | DETR-DC5 R101 with 6 encoder and 6 decoder layers. 83 | 84 | The last block of ResNet-101 has dilation to increase 85 | output resolution. 86 | Achieves 44.9/64.7 AP/AP50 on COCO val5k. 
87 | """ 88 | model = _make_detr("resnet101", dilation=True, num_classes=num_classes) 89 | if pretrained: 90 | checkpoint = torch.hub.load_state_dict_from_url( 91 | url="https://dl.fbaipublicfiles.com/detr/detr-r101-dc5-a2e86def.pth", map_location="cpu", check_hash=True 92 | ) 93 | model.load_state_dict(checkpoint["model"]) 94 | if return_postprocessor: 95 | return model, PostProcess() 96 | return model 97 | 98 | 99 | def detr_resnet50_panoptic( 100 | pretrained=False, num_classes=250, threshold=0.85, return_postprocessor=False 101 | ): 102 | """ 103 | DETR R50 with 6 encoder and 6 decoder layers. 104 | Achieves 43.4 PQ on COCO val5k. 105 | 106 | threshold is the minimum confidence required for keeping segments in the prediction 107 | """ 108 | model = _make_detr("resnet50", dilation=False, num_classes=num_classes, mask=True) 109 | is_thing_map = {i: i <= 90 for i in range(250)} 110 | if pretrained: 111 | checkpoint = torch.hub.load_state_dict_from_url( 112 | url="https://dl.fbaipublicfiles.com/detr/detr-r50-panoptic-00ce5173.pth", 113 | map_location="cpu", 114 | check_hash=True, 115 | ) 116 | model.load_state_dict(checkpoint["model"]) 117 | if return_postprocessor: 118 | return model, PostProcessPanoptic(is_thing_map, threshold=threshold) 119 | return model 120 | 121 | 122 | def detr_resnet50_dc5_panoptic( 123 | pretrained=False, num_classes=250, threshold=0.85, return_postprocessor=False 124 | ): 125 | """ 126 | DETR-DC5 R50 with 6 encoder and 6 decoder layers. 127 | 128 | The last block of ResNet-50 has dilation to increase 129 | output resolution. 130 | Achieves 44.6 on COCO val5k. 131 | 132 | threshold is the minimum confidence required for keeping segments in the prediction 133 | """ 134 | model = _make_detr("resnet50", dilation=True, num_classes=num_classes, mask=True) 135 | is_thing_map = {i: i <= 90 for i in range(250)} 136 | if pretrained: 137 | checkpoint = torch.hub.load_state_dict_from_url( 138 | url="https://dl.fbaipublicfiles.com/detr/detr-r50-dc5-panoptic-da08f1b1.pth", 139 | map_location="cpu", 140 | check_hash=True, 141 | ) 142 | model.load_state_dict(checkpoint["model"]) 143 | if return_postprocessor: 144 | return model, PostProcessPanoptic(is_thing_map, threshold=threshold) 145 | return model 146 | 147 | 148 | def detr_resnet101_panoptic( 149 | pretrained=False, num_classes=250, threshold=0.85, return_postprocessor=False 150 | ): 151 | """ 152 | DETR-DC5 R101 with 6 encoder and 6 decoder layers. 153 | 154 | Achieves 45.1 PQ on COCO val5k. 155 | 156 | threshold is the minimum confidence required for keeping segments in the prediction 157 | """ 158 | model = _make_detr("resnet101", dilation=False, num_classes=num_classes, mask=True) 159 | is_thing_map = {i: i <= 90 for i in range(250)} 160 | if pretrained: 161 | checkpoint = torch.hub.load_state_dict_from_url( 162 | url="https://dl.fbaipublicfiles.com/detr/detr-r101-panoptic-40021d53.pth", 163 | map_location="cpu", 164 | check_hash=True, 165 | ) 166 | model.load_state_dict(checkpoint["model"]) 167 | if return_postprocessor: 168 | return model, PostProcessPanoptic(is_thing_map, threshold=threshold) 169 | return model 170 | -------------------------------------------------------------------------------- /model/matcher.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Modules to compute the matching cost and solve the corresponding LSAP. 
4 | """ 5 | import torch 6 | from scipy.optimize import linear_sum_assignment 7 | from torch import nn 8 | 9 | from .box_ops import box_cxcywh_to_xyxy, generalized_box_iou 10 | 11 | 12 | class HungarianMatcher(nn.Module): 13 | """This class computes an assignment between the targets and the predictions of the network 14 | 15 | For efficiency reasons, the targets don't include the no_object. Because of this, in general, 16 | there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, 17 | while the others are un-matched (and thus treated as non-objects). 18 | """ 19 | 20 | def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1): 21 | """Creates the matcher 22 | 23 | Params: 24 | cost_class: This is the relative weight of the classification error in the matching cost 25 | cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost 26 | cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost 27 | """ 28 | super().__init__() 29 | self.cost_class = cost_class 30 | self.cost_bbox = cost_bbox 31 | self.cost_giou = cost_giou 32 | assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0" 33 | 34 | @torch.no_grad() 35 | def forward(self, outputs, targets): 36 | """ Performs the matching 37 | 38 | Params: 39 | outputs: This is a dict that contains at least these entries: 40 | "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits 41 | "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates 42 | 43 | targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: 44 | "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth 45 | objects in the target) containing the class labels 46 | "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates 47 | 48 | Returns: 49 | A list of size batch_size, containing tuples of (index_i, index_j) where: 50 | - index_i is the indices of the selected predictions (in order) 51 | - index_j is the indices of the corresponding selected targets (in order) 52 | For each batch element, it holds: 53 | len(index_i) = len(index_j) = min(num_queries, num_target_boxes) 54 | """ 55 | bs, num_queries = outputs["pred_logits"].shape[:2] 56 | 57 | # We flatten to compute the cost matrices in a batch 58 | out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1) # [batch_size * num_queries, num_classes] 59 | out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] 60 | 61 | # Also concat the target labels and boxes 62 | tgt_ids = torch.cat([v["labels"] for v in targets]) 63 | tgt_bbox = torch.cat([v["boxes"] for v in targets]) 64 | 65 | # Compute the classification cost. Contrary to the loss, we don't use the NLL, 66 | # but approximate it in 1 - proba[target class]. 67 | # The 1 is a constant that doesn't change the matching, it can be ommitted. 
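# For example, a target with class id 7 contributes -out_prob[q, 7] to the matching cost of every
# query q, so a higher predicted probability for the right class means a lower matching cost.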
68 | cost_class = -out_prob[:, tgt_ids] 69 | 70 | # Compute the L1 cost between boxes 71 | cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1) 72 | 73 | # Compute the giou cost betwen boxes 74 | cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox)) 75 | 76 | # Final cost matrix 77 | C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou 78 | C = C.view(bs, num_queries, -1).cpu() 79 | 80 | sizes = [len(v["boxes"]) for v in targets] 81 | indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))] 82 | return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] 83 | 84 | 85 | def build_matcher(args): 86 | return HungarianMatcher(cost_class=args.set_cost_class, cost_bbox=args.set_cost_bbox, cost_giou=args.set_cost_giou) 87 | -------------------------------------------------------------------------------- /model/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Misc functions, including distributed helpers. 4 | 5 | Mostly copy-paste from torchvision references. 6 | """ 7 | import os 8 | import subprocess 9 | import time 10 | from collections import defaultdict, deque 11 | import datetime 12 | import pickle 13 | from typing import Optional, List 14 | 15 | import torch 16 | import torch.distributed as dist 17 | from torch import Tensor 18 | 19 | # needed due to empty tensor bug in pytorch and torchvision 0.5 20 | import torchvision 21 | if float(torchvision.__version__[:3]) < 0.7: 22 | from torchvision.ops import _new_empty_tensor 23 | from torchvision.ops.misc import _output_size 24 | 25 | 26 | class SmoothedValue(object): 27 | """Track a series of values and provide access to smoothed values over a 28 | window or the global series average. 29 | """ 30 | 31 | def __init__(self, window_size=20, fmt=None): 32 | if fmt is None: 33 | fmt = "{median:.4f} ({global_avg:.4f})" 34 | self.deque = deque(maxlen=window_size) 35 | self.total = 0.0 36 | self.count = 0 37 | self.fmt = fmt 38 | 39 | def update(self, value, n=1): 40 | self.deque.append(value) 41 | self.count += n 42 | self.total += value * n 43 | 44 | def synchronize_between_processes(self): 45 | """ 46 | Warning: does not synchronize the deque! 
47 | """ 48 | if not is_dist_avail_and_initialized(): 49 | return 50 | t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') 51 | dist.barrier() 52 | dist.all_reduce(t) 53 | t = t.tolist() 54 | self.count = int(t[0]) 55 | self.total = t[1] 56 | 57 | @property 58 | def median(self): 59 | d = torch.tensor(list(self.deque)) 60 | return d.median().item() 61 | 62 | @property 63 | def avg(self): 64 | d = torch.tensor(list(self.deque), dtype=torch.float32) 65 | return d.mean().item() 66 | 67 | @property 68 | def global_avg(self): 69 | return self.total / self.count 70 | 71 | @property 72 | def max(self): 73 | return max(self.deque) 74 | 75 | @property 76 | def value(self): 77 | return self.deque[-1] 78 | 79 | def __str__(self): 80 | return self.fmt.format( 81 | median=self.median, 82 | avg=self.avg, 83 | global_avg=self.global_avg, 84 | max=self.max, 85 | value=self.value) 86 | 87 | 88 | def all_gather(data): 89 | """ 90 | Run all_gather on arbitrary picklable data (not necessarily tensors) 91 | Args: 92 | data: any picklable object 93 | Returns: 94 | list[data]: list of data gathered from each rank 95 | """ 96 | world_size = get_world_size() 97 | if world_size == 1: 98 | return [data] 99 | 100 | # serialized to a Tensor 101 | buffer = pickle.dumps(data) 102 | storage = torch.ByteStorage.from_buffer(buffer) 103 | tensor = torch.ByteTensor(storage).to("cuda") 104 | 105 | # obtain Tensor size of each rank 106 | local_size = torch.tensor([tensor.numel()], device="cuda") 107 | size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)] 108 | dist.all_gather(size_list, local_size) 109 | size_list = [int(size.item()) for size in size_list] 110 | max_size = max(size_list) 111 | 112 | # receiving Tensor from all ranks 113 | # we pad the tensor because torch all_gather does not support 114 | # gathering tensors of different shapes 115 | tensor_list = [] 116 | for _ in size_list: 117 | tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda")) 118 | if local_size != max_size: 119 | padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda") 120 | tensor = torch.cat((tensor, padding), dim=0) 121 | dist.all_gather(tensor_list, tensor) 122 | 123 | data_list = [] 124 | for size, tensor in zip(size_list, tensor_list): 125 | buffer = tensor.cpu().numpy().tobytes()[:size] 126 | data_list.append(pickle.loads(buffer)) 127 | 128 | return data_list 129 | 130 | 131 | def reduce_dict(input_dict, average=True): 132 | """ 133 | Args: 134 | input_dict (dict): all the values will be reduced 135 | average (bool): whether to do average or sum 136 | Reduce the values in the dictionary from all processes so that all processes 137 | have the averaged results. Returns a dict with the same fields as 138 | input_dict, after reduction. 
139 | """ 140 | world_size = get_world_size() 141 | if world_size < 2: 142 | return input_dict 143 | with torch.no_grad(): 144 | names = [] 145 | values = [] 146 | # sort the keys so that they are consistent across processes 147 | for k in sorted(input_dict.keys()): 148 | names.append(k) 149 | values.append(input_dict[k]) 150 | values = torch.stack(values, dim=0) 151 | dist.all_reduce(values) 152 | if average: 153 | values /= world_size 154 | reduced_dict = {k: v for k, v in zip(names, values)} 155 | return reduced_dict 156 | 157 | 158 | class MetricLogger(object): 159 | def __init__(self, delimiter="\t"): 160 | self.meters = defaultdict(SmoothedValue) 161 | self.delimiter = delimiter 162 | 163 | def update(self, **kwargs): 164 | for k, v in kwargs.items(): 165 | if isinstance(v, torch.Tensor): 166 | v = v.item() 167 | assert isinstance(v, (float, int)) 168 | self.meters[k].update(v) 169 | 170 | def __getattr__(self, attr): 171 | if attr in self.meters: 172 | return self.meters[attr] 173 | if attr in self.__dict__: 174 | return self.__dict__[attr] 175 | raise AttributeError("'{}' object has no attribute '{}'".format( 176 | type(self).__name__, attr)) 177 | 178 | def __str__(self): 179 | loss_str = [] 180 | for name, meter in self.meters.items(): 181 | loss_str.append( 182 | "{}: {}".format(name, str(meter)) 183 | ) 184 | return self.delimiter.join(loss_str) 185 | 186 | def synchronize_between_processes(self): 187 | for meter in self.meters.values(): 188 | meter.synchronize_between_processes() 189 | 190 | def add_meter(self, name, meter): 191 | self.meters[name] = meter 192 | 193 | def log_every(self, iterable, print_freq, header=None): 194 | i = 0 195 | if not header: 196 | header = '' 197 | start_time = time.time() 198 | end = time.time() 199 | iter_time = SmoothedValue(fmt='{avg:.4f}') 200 | data_time = SmoothedValue(fmt='{avg:.4f}') 201 | space_fmt = ':' + str(len(str(len(iterable)))) + 'd' 202 | if torch.cuda.is_available(): 203 | log_msg = self.delimiter.join([ 204 | header, 205 | '[{0' + space_fmt + '}/{1}]', 206 | 'eta: {eta}', 207 | '{meters}', 208 | 'time: {time}', 209 | 'data: {data}', 210 | 'max mem: {memory:.0f}' 211 | ]) 212 | else: 213 | log_msg = self.delimiter.join([ 214 | header, 215 | '[{0' + space_fmt + '}/{1}]', 216 | 'eta: {eta}', 217 | '{meters}', 218 | 'time: {time}', 219 | 'data: {data}' 220 | ]) 221 | MB = 1024.0 * 1024.0 222 | for obj in iterable: 223 | data_time.update(time.time() - end) 224 | yield obj 225 | iter_time.update(time.time() - end) 226 | if i % print_freq == 0 or i == len(iterable) - 1: 227 | eta_seconds = iter_time.global_avg * (len(iterable) - i) 228 | eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) 229 | if torch.cuda.is_available(): 230 | print(log_msg.format( 231 | i, len(iterable), eta=eta_string, 232 | meters=str(self), 233 | time=str(iter_time), data=str(data_time), 234 | memory=torch.cuda.max_memory_allocated() / MB)) 235 | else: 236 | print(log_msg.format( 237 | i, len(iterable), eta=eta_string, 238 | meters=str(self), 239 | time=str(iter_time), data=str(data_time))) 240 | i += 1 241 | end = time.time() 242 | total_time = time.time() - start_time 243 | total_time_str = str(datetime.timedelta(seconds=int(total_time))) 244 | print('{} Total time: {} ({:.4f} s / it)'.format( 245 | header, total_time_str, total_time / len(iterable))) 246 | 247 | 248 | def get_sha(): 249 | cwd = os.path.dirname(os.path.abspath(__file__)) 250 | 251 | def _run(command): 252 | return subprocess.check_output(command, cwd=cwd).decode('ascii').strip() 
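# The defaults below are returned unchanged if any git command fails (for example when the code is
# run outside a git checkout); see the try/except that follows.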
253 | sha = 'N/A' 254 | diff = "clean" 255 | branch = 'N/A' 256 | try: 257 | sha = _run(['git', 'rev-parse', 'HEAD']) 258 | subprocess.check_output(['git', 'diff'], cwd=cwd) 259 | diff = _run(['git', 'diff-index', 'HEAD']) 260 | diff = "has uncommited changes" if diff else "clean" 261 | branch = _run(['git', 'rev-parse', '--abbrev-ref', 'HEAD']) 262 | except Exception: 263 | pass 264 | message = "sha: {}, status: {}, branch: {}".format(sha,diff,branch) 265 | return message 266 | 267 | 268 | def collate_fn(batch): 269 | batch = list(zip(*batch)) 270 | batch[0] = nested_tensor_from_tensor_list(batch[0]) 271 | return tuple(batch) 272 | 273 | 274 | def _max_by_axis(the_list): 275 | # type: (List[List[int]]) -> List[int] 276 | maxes = the_list[0] 277 | for sublist in the_list[1:]: 278 | for index, item in enumerate(sublist): 279 | maxes[index] = max(maxes[index], item) 280 | return maxes 281 | 282 | 283 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 284 | # TODO make this more general 285 | if tensor_list[0].ndim == 3: 286 | if torchvision._is_tracing(): 287 | # nested_tensor_from_tensor_list() does not export well to ONNX 288 | # call _onnx_nested_tensor_from_tensor_list() instead 289 | return _onnx_nested_tensor_from_tensor_list(tensor_list) 290 | 291 | # TODO make it support different-sized images 292 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 293 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 294 | batch_shape = [len(tensor_list)] + max_size 295 | b, c, h, w = batch_shape 296 | dtype = tensor_list[0].dtype 297 | device = tensor_list[0].device 298 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 299 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 300 | for img, pad_img, m in zip(tensor_list, tensor, mask): 301 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 302 | m[: img.shape[1], :img.shape[2]] = False 303 | else: 304 | raise ValueError('not supported') 305 | return NestedTensor(tensor, mask) 306 | 307 | 308 | # _onnx_nested_tensor_from_tensor_list() is an implementation of 309 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
310 | @torch.jit.unused 311 | def _onnx_nested_tensor_from_tensor_list(tensor_list): 312 | max_size = [] 313 | for i in range(tensor_list[0].dim()): 314 | max_size_i = torch.max(torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)).to(torch.int64) 315 | max_size.append(max_size_i) 316 | max_size = tuple(max_size) 317 | 318 | # work around for 319 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 320 | # m[: img.shape[1], :img.shape[2]] = False 321 | # which is not yet supported in onnx 322 | padded_imgs = [] 323 | padded_masks = [] 324 | for img in tensor_list: 325 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 326 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 327 | padded_imgs.append(padded_img) 328 | 329 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) 330 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) 331 | padded_masks.append(padded_mask.to(torch.bool)) 332 | 333 | tensor = torch.stack(padded_imgs) 334 | mask = torch.stack(padded_masks) 335 | 336 | return NestedTensor(tensor, mask=mask) 337 | 338 | 339 | class NestedTensor(object): 340 | def __init__(self, tensors, mask: Optional[Tensor]): 341 | self.tensors = tensors 342 | self.mask = mask 343 | 344 | def to(self, device): 345 | # type: (Device) -> NestedTensor # noqa 346 | cast_tensor = self.tensors.to(device) 347 | mask = self.mask 348 | if mask is not None: 349 | assert mask is not None 350 | cast_mask = mask.to(device) 351 | else: 352 | cast_mask = None 353 | return NestedTensor(cast_tensor, cast_mask) 354 | 355 | def decompose(self): 356 | return self.tensors, self.mask 357 | 358 | def __repr__(self): 359 | return str(self.tensors) 360 | 361 | 362 | def setup_for_distributed(is_master): 363 | """ 364 | This function disables printing when not in master process 365 | """ 366 | import builtins as __builtin__ 367 | builtin_print = __builtin__.print 368 | 369 | def print(*args, **kwargs): 370 | force = kwargs.pop('force', False) 371 | if is_master or force: 372 | builtin_print(*args, **kwargs) 373 | 374 | __builtin__.print = print 375 | 376 | 377 | def is_dist_avail_and_initialized(): 378 | if not dist.is_available(): 379 | return False 380 | if not dist.is_initialized(): 381 | return False 382 | return True 383 | 384 | 385 | def get_world_size(): 386 | if not is_dist_avail_and_initialized(): 387 | return 1 388 | return dist.get_world_size() 389 | 390 | 391 | def get_rank(): 392 | if not is_dist_avail_and_initialized(): 393 | return 0 394 | return dist.get_rank() 395 | 396 | 397 | def is_main_process(): 398 | return get_rank() == 0 399 | 400 | 401 | def save_on_master(*args, **kwargs): 402 | if is_main_process(): 403 | torch.save(*args, **kwargs) 404 | 405 | 406 | def init_distributed_mode(args): 407 | if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: 408 | args.rank = int(os.environ["RANK"]) 409 | args.world_size = int(os.environ['WORLD_SIZE']) 410 | args.gpu = int(os.environ['LOCAL_RANK']) 411 | elif 'SLURM_PROCID' in os.environ: 412 | args.rank = int(os.environ['SLURM_PROCID']) 413 | args.gpu = args.rank % torch.cuda.device_count() 414 | else: 415 | print('Not using distributed mode') 416 | args.distributed = False 417 | return 418 | 419 | args.distributed = True 420 | 421 | torch.cuda.set_device(args.gpu) 422 | args.dist_backend = 'nccl' 423 | print('| distributed init (rank {}): {}'.format( 424 | args.rank, args.dist_url), flush=True) 425 | 
torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 426 | world_size=args.world_size, rank=args.rank) 427 | torch.distributed.barrier() 428 | setup_for_distributed(args.rank == 0) 429 | 430 | 431 | @torch.no_grad() 432 | def accuracy(output, target, topk=(1,)): 433 | """Computes the precision@k for the specified values of k""" 434 | if target.numel() == 0: 435 | return [torch.zeros([], device=output.device)] 436 | maxk = max(topk) 437 | batch_size = target.size(0) 438 | 439 | _, pred = output.topk(maxk, 1, True, True) 440 | pred = pred.t() 441 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 442 | 443 | res = [] 444 | for k in topk: 445 | correct_k = correct[:k].view(-1).float().sum(0) 446 | res.append(correct_k.mul_(100.0 / batch_size)) 447 | return res 448 | 449 | 450 | def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None): 451 | # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor 452 | """ 453 | Equivalent to nn.functional.interpolate, but with support for empty batch sizes. 454 | This will eventually be supported natively by PyTorch, and this 455 | class can go away. 456 | """ 457 | if float(torchvision.__version__[:3]) < 0.7: 458 | if input.numel() > 0: 459 | return torch.nn.functional.interpolate( 460 | input, size, scale_factor, mode, align_corners 461 | ) 462 | 463 | output_shape = _output_size(2, input, size, scale_factor) 464 | output_shape = list(input.shape[:-2]) + list(output_shape) 465 | return _new_empty_tensor(input, output_shape) 466 | else: 467 | return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners) 468 | -------------------------------------------------------------------------------- /model/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Various positional encodings for the transformer. 4 | """ 5 | import math 6 | import torch 7 | from torch import nn 8 | 9 | from .misc import NestedTensor 10 | 11 | 12 | class PositionEmbeddingSine(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 
16 | """ 17 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 18 | super().__init__() 19 | self.num_pos_feats = num_pos_feats 20 | self.temperature = temperature 21 | self.normalize = normalize 22 | if scale is not None and normalize is False: 23 | raise ValueError("normalize should be True if scale is passed") 24 | if scale is None: 25 | scale = 2 * math.pi 26 | self.scale = scale 27 | 28 | def forward(self, tensor_list: NestedTensor): 29 | x = tensor_list.tensors 30 | mask = tensor_list.mask 31 | assert mask is not None 32 | not_mask = ~mask 33 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 34 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 35 | if self.normalize: 36 | eps = 1e-6 37 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 38 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 39 | 40 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 41 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 42 | 43 | pos_x = x_embed[:, :, :, None] / dim_t 44 | pos_y = y_embed[:, :, :, None] / dim_t 45 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 46 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 47 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 48 | return pos 49 | 50 | 51 | class PositionEmbeddingLearned(nn.Module): 52 | """ 53 | Absolute pos embedding, learned. 54 | """ 55 | def __init__(self, num_pos_feats=256): 56 | super().__init__() 57 | self.row_embed = nn.Embedding(50, num_pos_feats) 58 | self.col_embed = nn.Embedding(50, num_pos_feats) 59 | self.reset_parameters() 60 | 61 | def reset_parameters(self): 62 | nn.init.uniform_(self.row_embed.weight) 63 | nn.init.uniform_(self.col_embed.weight) 64 | 65 | def forward(self, tensor_list: NestedTensor): 66 | x = tensor_list.tensors 67 | h, w = x.shape[-2:] 68 | i = torch.arange(w, device=x.device) 69 | j = torch.arange(h, device=x.device) 70 | x_emb = self.col_embed(i) 71 | y_emb = self.row_embed(j) 72 | pos = torch.cat([ 73 | x_emb.unsqueeze(0).repeat(h, 1, 1), 74 | y_emb.unsqueeze(1).repeat(1, w, 1), 75 | ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) 76 | return pos 77 | 78 | 79 | def build_position_encoding(args): 80 | N_steps = args.hidden_dim // 2 81 | if args.position_embedding in ('v2', 'sine'): 82 | # TODO find a better way of exposing other arguments 83 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True) 84 | elif args.position_embedding in ('v3', 'learned'): 85 | position_embedding = PositionEmbeddingLearned(N_steps) 86 | else: 87 | raise ValueError("not supported {}".format(args.position_embedding)) 88 | 89 | return position_embedding 90 | -------------------------------------------------------------------------------- /model/segmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | This file provides the definition of the convolutional heads used to predict masks, as well as the losses 4 | """ 5 | import io 6 | from collections import defaultdict 7 | from typing import List, Optional 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | from torch import Tensor 13 | from PIL import Image 14 | 15 | from . 
import box_ops as box_ops 16 | from .misc import NestedTensor, interpolate, nested_tensor_from_tensor_list 17 | 18 | try: 19 | from panopticapi.utils import id2rgb, rgb2id 20 | except ImportError: 21 | pass 22 | 23 | 24 | class DETRsegm(nn.Module): 25 | def __init__(self, detr, freeze_detr=False): 26 | super().__init__() 27 | self.detr = detr 28 | 29 | if freeze_detr: 30 | for p in self.parameters(): 31 | p.requires_grad_(False) 32 | 33 | hidden_dim, nheads = detr.transformer.d_model, detr.transformer.nhead 34 | self.bbox_attention = MHAttentionMap(hidden_dim, hidden_dim, nheads, dropout=0.0) 35 | self.mask_head = MaskHeadSmallConv(hidden_dim + nheads, [1024, 512, 256], hidden_dim) 36 | 37 | def forward(self, samples: NestedTensor): 38 | if isinstance(samples, (list, torch.Tensor)): 39 | samples = nested_tensor_from_tensor_list(samples) 40 | features, pos = self.detr.backbone(samples) 41 | 42 | bs = features[-1].tensors.shape[0] 43 | 44 | src, mask = features[-1].decompose() 45 | assert mask is not None 46 | src_proj = self.detr.input_proj(src) 47 | hs, memory = self.detr.transformer(src_proj, mask, self.detr.query_embed.weight, pos[-1]) 48 | 49 | outputs_class = self.detr.class_embed(hs) 50 | outputs_coord = self.detr.bbox_embed(hs).sigmoid() 51 | out = {"pred_logits": outputs_class[-1], "pred_boxes": outputs_coord[-1]} 52 | if self.detr.aux_loss: 53 | out['aux_outputs'] = self.detr._set_aux_loss(outputs_class, outputs_coord) 54 | 55 | # FIXME h_boxes takes the last one computed, keep this in mind 56 | bbox_mask = self.bbox_attention(hs[-1], memory, mask=mask) 57 | 58 | seg_masks = self.mask_head(src_proj, bbox_mask, [features[2].tensors, features[1].tensors, features[0].tensors]) 59 | outputs_seg_masks = seg_masks.view(bs, self.detr.num_queries, seg_masks.shape[-2], seg_masks.shape[-1]) 60 | 61 | out["pred_masks"] = outputs_seg_masks 62 | return out 63 | 64 | 65 | def _expand(tensor, length: int): 66 | return tensor.unsqueeze(1).repeat(1, int(length), 1, 1, 1).flatten(0, 1) 67 | 68 | 69 | class MaskHeadSmallConv(nn.Module): 70 | """ 71 | Simple convolutional head, using group norm. 
72 | Upsampling is done using a FPN approach 73 | """ 74 | 75 | def __init__(self, dim, fpn_dims, context_dim): 76 | super().__init__() 77 | 78 | inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64] 79 | self.lay1 = torch.nn.Conv2d(dim, dim, 3, padding=1) 80 | self.gn1 = torch.nn.GroupNorm(8, dim) 81 | self.lay2 = torch.nn.Conv2d(dim, inter_dims[1], 3, padding=1) 82 | self.gn2 = torch.nn.GroupNorm(8, inter_dims[1]) 83 | self.lay3 = torch.nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1) 84 | self.gn3 = torch.nn.GroupNorm(8, inter_dims[2]) 85 | self.lay4 = torch.nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1) 86 | self.gn4 = torch.nn.GroupNorm(8, inter_dims[3]) 87 | self.lay5 = torch.nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1) 88 | self.gn5 = torch.nn.GroupNorm(8, inter_dims[4]) 89 | self.out_lay = torch.nn.Conv2d(inter_dims[4], 1, 3, padding=1) 90 | 91 | self.dim = dim 92 | 93 | self.adapter1 = torch.nn.Conv2d(fpn_dims[0], inter_dims[1], 1) 94 | self.adapter2 = torch.nn.Conv2d(fpn_dims[1], inter_dims[2], 1) 95 | self.adapter3 = torch.nn.Conv2d(fpn_dims[2], inter_dims[3], 1) 96 | 97 | for m in self.modules(): 98 | if isinstance(m, nn.Conv2d): 99 | nn.init.kaiming_uniform_(m.weight, a=1) 100 | nn.init.constant_(m.bias, 0) 101 | 102 | def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]): 103 | x = torch.cat([_expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1) 104 | 105 | x = self.lay1(x) 106 | x = self.gn1(x) 107 | x = F.relu(x) 108 | x = self.lay2(x) 109 | x = self.gn2(x) 110 | x = F.relu(x) 111 | 112 | cur_fpn = self.adapter1(fpns[0]) 113 | if cur_fpn.size(0) != x.size(0): 114 | cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) 115 | x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") 116 | x = self.lay3(x) 117 | x = self.gn3(x) 118 | x = F.relu(x) 119 | 120 | cur_fpn = self.adapter2(fpns[1]) 121 | if cur_fpn.size(0) != x.size(0): 122 | cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) 123 | x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") 124 | x = self.lay4(x) 125 | x = self.gn4(x) 126 | x = F.relu(x) 127 | 128 | cur_fpn = self.adapter3(fpns[2]) 129 | if cur_fpn.size(0) != x.size(0): 130 | cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) 131 | x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") 132 | x = self.lay5(x) 133 | x = self.gn5(x) 134 | x = F.relu(x) 135 | 136 | x = self.out_lay(x) 137 | return x 138 | 139 | 140 | class MHAttentionMap(nn.Module): 141 | """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)""" 142 | 143 | def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True): 144 | super().__init__() 145 | self.num_heads = num_heads 146 | self.hidden_dim = hidden_dim 147 | self.dropout = nn.Dropout(dropout) 148 | 149 | self.q_linear = nn.Linear(query_dim, hidden_dim, bias=bias) 150 | self.k_linear = nn.Linear(query_dim, hidden_dim, bias=bias) 151 | 152 | nn.init.zeros_(self.k_linear.bias) 153 | nn.init.zeros_(self.q_linear.bias) 154 | nn.init.xavier_uniform_(self.k_linear.weight) 155 | nn.init.xavier_uniform_(self.q_linear.weight) 156 | self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5 157 | 158 | def forward(self, q, k, mask: Optional[Tensor] = None): 159 | q = self.q_linear(q) 160 | k = F.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias) 161 | qh = q.view(q.shape[0], q.shape[1], 
self.num_heads, self.hidden_dim // self.num_heads) 162 | kh = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1]) 163 | weights = torch.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh) 164 | 165 | if mask is not None: 166 | weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), float("-inf")) 167 | weights = F.softmax(weights.flatten(2), dim=-1).view_as(weights) 168 | weights = self.dropout(weights) 169 | return weights 170 | 171 | 172 | def dice_loss(inputs, targets, num_boxes): 173 | """ 174 | Compute the DICE loss, similar to generalized IOU for masks 175 | Args: 176 | inputs: A float tensor of arbitrary shape. 177 | The predictions for each example. 178 | targets: A float tensor with the same shape as inputs. Stores the binary 179 | classification label for each element in inputs 180 | (0 for the negative class and 1 for the positive class). 181 | """ 182 | inputs = inputs.sigmoid() 183 | inputs = inputs.flatten(1) 184 | numerator = 2 * (inputs * targets).sum(1) 185 | denominator = inputs.sum(-1) + targets.sum(-1) 186 | loss = 1 - (numerator + 1) / (denominator + 1) 187 | return loss.sum() / num_boxes 188 | 189 | 190 | def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): 191 | """ 192 | Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. 193 | Args: 194 | inputs: A float tensor of arbitrary shape. 195 | The predictions for each example. 196 | targets: A float tensor with the same shape as inputs. Stores the binary 197 | classification label for each element in inputs 198 | (0 for the negative class and 1 for the positive class). 199 | alpha: (optional) Weighting factor in range (0,1) to balance 200 | positive vs negative examples. Default = -1 (no weighting). 201 | gamma: Exponent of the modulating factor (1 - p_t) to 202 | balance easy vs hard examples. 
203 | Returns: 204 | Loss tensor 205 | """ 206 | prob = inputs.sigmoid() 207 | ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") 208 | p_t = prob * targets + (1 - prob) * (1 - targets) 209 | loss = ce_loss * ((1 - p_t) ** gamma) 210 | 211 | if alpha >= 0: 212 | alpha_t = alpha * targets + (1 - alpha) * (1 - targets) 213 | loss = alpha_t * loss 214 | 215 | return loss.mean(1).sum() / num_boxes 216 | 217 | 218 | class PostProcessSegm(nn.Module): 219 | def __init__(self, threshold=0.5): 220 | super().__init__() 221 | self.threshold = threshold 222 | 223 | @torch.no_grad() 224 | def forward(self, results, outputs, orig_target_sizes, max_target_sizes): 225 | assert len(orig_target_sizes) == len(max_target_sizes) 226 | max_h, max_w = max_target_sizes.max(0)[0].tolist() 227 | outputs_masks = outputs["pred_masks"].squeeze(2) 228 | outputs_masks = F.interpolate(outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False) 229 | outputs_masks = (outputs_masks.sigmoid() > self.threshold).cpu() 230 | 231 | for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)): 232 | img_h, img_w = t[0], t[1] 233 | results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1) 234 | results[i]["masks"] = F.interpolate( 235 | results[i]["masks"].float(), size=tuple(tt.tolist()), mode="nearest" 236 | ).byte() 237 | 238 | return results 239 | 240 | 241 | class PostProcessPanoptic(nn.Module): 242 | """This class converts the output of the model to the final panoptic result, in the format expected by the 243 | coco panoptic API """ 244 | 245 | def __init__(self, is_thing_map, threshold=0.85): 246 | """ 247 | Parameters: 248 | is_thing_map: This is a whose keys are the class ids, and the values a boolean indicating whether 249 | the class is a thing (True) or a stuff (False) class 250 | threshold: confidence threshold: segments with confidence lower than this will be deleted 251 | """ 252 | super().__init__() 253 | self.threshold = threshold 254 | self.is_thing_map = is_thing_map 255 | 256 | def forward(self, outputs, processed_sizes, target_sizes=None): 257 | """ This function computes the panoptic prediction from the model's predictions. 258 | Parameters: 259 | outputs: This is a dict coming directly from the model. See the model doc for the content. 260 | processed_sizes: This is a list of tuples (or torch tensors) of sizes of the images that were passed to the 261 | model, ie the size after data augmentation but before batching. 262 | target_sizes: This is a list of tuples (or torch tensors) corresponding to the requested final size 263 | of each prediction. 
If left to None, it will default to the processed_sizes 264 | """ 265 | if target_sizes is None: 266 | target_sizes = processed_sizes 267 | assert len(processed_sizes) == len(target_sizes) 268 | out_logits, raw_masks, raw_boxes = outputs["pred_logits"], outputs["pred_masks"], outputs["pred_boxes"] 269 | assert len(out_logits) == len(raw_masks) == len(target_sizes) 270 | preds = [] 271 | 272 | def to_tuple(tup): 273 | if isinstance(tup, tuple): 274 | return tup 275 | return tuple(tup.cpu().tolist()) 276 | 277 | for cur_logits, cur_masks, cur_boxes, size, target_size in zip( 278 | out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes 279 | ): 280 | # we filter empty queries and detection below threshold 281 | scores, labels = cur_logits.softmax(-1).max(-1) 282 | keep = labels.ne(outputs["pred_logits"].shape[-1] - 1) & (scores > self.threshold) 283 | cur_scores, cur_classes = cur_logits.softmax(-1).max(-1) 284 | cur_scores = cur_scores[keep] 285 | cur_classes = cur_classes[keep] 286 | cur_masks = cur_masks[keep] 287 | cur_masks = interpolate(cur_masks[None], to_tuple(size), mode="bilinear").squeeze(0) 288 | cur_boxes = box_ops.box_cxcywh_to_xyxy(cur_boxes[keep]) 289 | 290 | h, w = cur_masks.shape[-2:] 291 | assert len(cur_boxes) == len(cur_classes) 292 | 293 | # It may be that we have several predicted masks for the same stuff class. 294 | # In the following, we track the list of masks ids for each stuff class (they are merged later on) 295 | cur_masks = cur_masks.flatten(1) 296 | stuff_equiv_classes = defaultdict(lambda: []) 297 | for k, label in enumerate(cur_classes): 298 | if not self.is_thing_map[label.item()]: 299 | stuff_equiv_classes[label.item()].append(k) 300 | 301 | def get_ids_area(masks, scores, dedup=False): 302 | # This helper function creates the final panoptic segmentation image 303 | # It also returns the area of the masks that appears on the image 304 | 305 | m_id = masks.transpose(0, 1).softmax(-1) 306 | 307 | if m_id.shape[-1] == 0: 308 | # We didn't detect any mask :( 309 | m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device) 310 | else: 311 | m_id = m_id.argmax(-1).view(h, w) 312 | 313 | if dedup: 314 | # Merge the masks corresponding to the same stuff class 315 | for equiv in stuff_equiv_classes.values(): 316 | if len(equiv) > 1: 317 | for eq_id in equiv: 318 | m_id.masked_fill_(m_id.eq(eq_id), equiv[0]) 319 | 320 | final_h, final_w = to_tuple(target_size) 321 | 322 | seg_img = Image.fromarray(id2rgb(m_id.view(h, w).cpu().numpy())) 323 | seg_img = seg_img.resize(size=(final_w, final_h), resample=Image.NEAREST) 324 | 325 | np_seg_img = ( 326 | torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes())).view(final_h, final_w, 3).numpy() 327 | ) 328 | m_id = torch.from_numpy(rgb2id(np_seg_img)) 329 | 330 | area = [] 331 | for i in range(len(scores)): 332 | area.append(m_id.eq(i).sum().item()) 333 | return area, seg_img 334 | 335 | area, seg_img = get_ids_area(cur_masks, cur_scores, dedup=True) 336 | if cur_classes.numel() > 0: 337 | # We know filter empty masks as long as we find some 338 | while True: 339 | filtered_small = torch.as_tensor( 340 | [area[i] <= 4 for i, c in enumerate(cur_classes)], dtype=torch.bool, device=keep.device 341 | ) 342 | if filtered_small.any().item(): 343 | cur_scores = cur_scores[~filtered_small] 344 | cur_classes = cur_classes[~filtered_small] 345 | cur_masks = cur_masks[~filtered_small] 346 | area, seg_img = get_ids_area(cur_masks, cur_scores) 347 | else: 348 | break 349 | 350 | else: 351 | cur_classes = 
torch.ones(1, dtype=torch.long, device=cur_classes.device) 352 | 353 | segments_info = [] 354 | for i, a in enumerate(area): 355 | cat = cur_classes[i].item() 356 | segments_info.append({"id": i, "isthing": self.is_thing_map[cat], "category_id": cat, "area": a}) 357 | del cur_classes 358 | 359 | with io.BytesIO() as out: 360 | seg_img.save(out, format="PNG") 361 | predictions = {"png_string": out.getvalue(), "segments_info": segments_info} 362 | preds.append(predictions) 363 | return preds 364 | -------------------------------------------------------------------------------- /model/transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | DETR Transformer class. 4 | 5 | Copy-paste from torch.nn.Transformer with modifications: 6 | * positional encodings are passed in MHattention 7 | * extra LN at the end of encoder is removed 8 | * decoder returns a stack of activations from all decoding layers 9 | """ 10 | import copy 11 | from typing import Optional, List 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | from torch import nn, Tensor 16 | 17 | 18 | class Transformer(nn.Module): 19 | 20 | def __init__(self, d_model=512, nhead=8, num_encoder_layers=6, 21 | num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, 22 | activation="relu", normalize_before=False, 23 | return_intermediate_dec=False): 24 | super().__init__() 25 | 26 | encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, 27 | dropout, activation, normalize_before) 28 | encoder_norm = nn.LayerNorm(d_model) if normalize_before else None 29 | self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) 30 | 31 | decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, 32 | dropout, activation, normalize_before) 33 | decoder_norm = nn.LayerNorm(d_model) 34 | self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm, 35 | return_intermediate=return_intermediate_dec) 36 | 37 | self._reset_parameters() 38 | 39 | self.d_model = d_model 40 | self.nhead = nhead 41 | 42 | def _reset_parameters(self): 43 | for p in self.parameters(): 44 | if p.dim() > 1: 45 | nn.init.xavier_uniform_(p) 46 | 47 | def forward(self, src, mask, query_embed, pos_embed): 48 | # flatten NxCxHxW to HWxNxC 49 | bs, c, h, w = src.shape 50 | src = src.flatten(2).permute(2, 0, 1) 51 | pos_embed = pos_embed.flatten(2).permute(2, 0, 1) 52 | query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) 53 | mask = mask.flatten(1) 54 | 55 | tgt = torch.zeros_like(query_embed) 56 | memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) 57 | hs = self.decoder(tgt, memory, memory_key_padding_mask=mask, 58 | pos=pos_embed, query_pos=query_embed) 59 | return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w) 60 | 61 | 62 | class TransformerEncoder(nn.Module): 63 | 64 | def __init__(self, encoder_layer, num_layers, norm=None): 65 | super().__init__() 66 | self.layers = _get_clones(encoder_layer, num_layers) 67 | self.num_layers = num_layers 68 | self.norm = norm 69 | 70 | def forward(self, src, 71 | mask: Optional[Tensor] = None, 72 | src_key_padding_mask: Optional[Tensor] = None, 73 | pos: Optional[Tensor] = None): 74 | output = src 75 | 76 | for layer in self.layers: 77 | output = layer(output, src_mask=mask, 78 | src_key_padding_mask=src_key_padding_mask, pos=pos) 79 | 80 | if self.norm is not None: 81 | output = self.norm(output) 82 | 83 | 
return output 84 | 85 | 86 | class TransformerDecoder(nn.Module): 87 | 88 | def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): 89 | super().__init__() 90 | self.layers = _get_clones(decoder_layer, num_layers) 91 | self.num_layers = num_layers 92 | self.norm = norm 93 | self.return_intermediate = return_intermediate 94 | 95 | def forward(self, tgt, memory, 96 | tgt_mask: Optional[Tensor] = None, 97 | memory_mask: Optional[Tensor] = None, 98 | tgt_key_padding_mask: Optional[Tensor] = None, 99 | memory_key_padding_mask: Optional[Tensor] = None, 100 | pos: Optional[Tensor] = None, 101 | query_pos: Optional[Tensor] = None): 102 | output = tgt 103 | 104 | intermediate = [] 105 | 106 | for layer in self.layers: 107 | output = layer(output, memory, tgt_mask=tgt_mask, 108 | memory_mask=memory_mask, 109 | tgt_key_padding_mask=tgt_key_padding_mask, 110 | memory_key_padding_mask=memory_key_padding_mask, 111 | pos=pos, query_pos=query_pos) 112 | if self.return_intermediate: 113 | intermediate.append(self.norm(output)) 114 | 115 | if self.norm is not None: 116 | output = self.norm(output) 117 | if self.return_intermediate: 118 | intermediate.pop() 119 | intermediate.append(output) 120 | 121 | if self.return_intermediate: 122 | return torch.stack(intermediate) 123 | 124 | return output.unsqueeze(0) 125 | 126 | 127 | class TransformerEncoderLayer(nn.Module): 128 | 129 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, 130 | activation="relu", normalize_before=False): 131 | super().__init__() 132 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 133 | # Implementation of Feedforward model 134 | self.linear1 = nn.Linear(d_model, dim_feedforward) 135 | self.dropout = nn.Dropout(dropout) 136 | self.linear2 = nn.Linear(dim_feedforward, d_model) 137 | 138 | self.norm1 = nn.LayerNorm(d_model) 139 | self.norm2 = nn.LayerNorm(d_model) 140 | self.dropout1 = nn.Dropout(dropout) 141 | self.dropout2 = nn.Dropout(dropout) 142 | 143 | self.activation = _get_activation_fn(activation) 144 | self.normalize_before = normalize_before 145 | 146 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 147 | return tensor if pos is None else tensor + pos 148 | 149 | def forward_post(self, 150 | src, 151 | src_mask: Optional[Tensor] = None, 152 | src_key_padding_mask: Optional[Tensor] = None, 153 | pos: Optional[Tensor] = None): 154 | q = k = self.with_pos_embed(src, pos) 155 | src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, 156 | key_padding_mask=src_key_padding_mask)[0] 157 | src = src + self.dropout1(src2) 158 | src = self.norm1(src) 159 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) 160 | src = src + self.dropout2(src2) 161 | src = self.norm2(src) 162 | return src 163 | 164 | def forward_pre(self, src, 165 | src_mask: Optional[Tensor] = None, 166 | src_key_padding_mask: Optional[Tensor] = None, 167 | pos: Optional[Tensor] = None): 168 | src2 = self.norm1(src) 169 | q = k = self.with_pos_embed(src2, pos) 170 | src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask, 171 | key_padding_mask=src_key_padding_mask)[0] 172 | src = src + self.dropout1(src2) 173 | src2 = self.norm2(src) 174 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) 175 | src = src + self.dropout2(src2) 176 | return src 177 | 178 | def forward(self, src, 179 | src_mask: Optional[Tensor] = None, 180 | src_key_padding_mask: Optional[Tensor] = None, 181 | pos: Optional[Tensor] = None): 182 | if self.normalize_before: 183 | 
return self.forward_pre(src, src_mask, src_key_padding_mask, pos) 184 | return self.forward_post(src, src_mask, src_key_padding_mask, pos) 185 | 186 | 187 | class TransformerDecoderLayer(nn.Module): 188 | 189 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, 190 | activation="relu", normalize_before=False): 191 | super().__init__() 192 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 193 | self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 194 | # Implementation of Feedforward model 195 | self.linear1 = nn.Linear(d_model, dim_feedforward) 196 | self.dropout = nn.Dropout(dropout) 197 | self.linear2 = nn.Linear(dim_feedforward, d_model) 198 | 199 | self.norm1 = nn.LayerNorm(d_model) 200 | self.norm2 = nn.LayerNorm(d_model) 201 | self.norm3 = nn.LayerNorm(d_model) 202 | self.dropout1 = nn.Dropout(dropout) 203 | self.dropout2 = nn.Dropout(dropout) 204 | self.dropout3 = nn.Dropout(dropout) 205 | 206 | self.activation = _get_activation_fn(activation) 207 | self.normalize_before = normalize_before 208 | 209 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 210 | return tensor if pos is None else tensor + pos 211 | 212 | def forward_post(self, tgt, memory, 213 | tgt_mask: Optional[Tensor] = None, 214 | memory_mask: Optional[Tensor] = None, 215 | tgt_key_padding_mask: Optional[Tensor] = None, 216 | memory_key_padding_mask: Optional[Tensor] = None, 217 | pos: Optional[Tensor] = None, 218 | query_pos: Optional[Tensor] = None): 219 | q = k = self.with_pos_embed(tgt, query_pos) 220 | tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, 221 | key_padding_mask=tgt_key_padding_mask)[0] 222 | tgt = tgt + self.dropout1(tgt2) 223 | tgt = self.norm1(tgt) 224 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), 225 | key=self.with_pos_embed(memory, pos), 226 | value=memory, attn_mask=memory_mask, 227 | key_padding_mask=memory_key_padding_mask)[0] 228 | tgt = tgt + self.dropout2(tgt2) 229 | tgt = self.norm2(tgt) 230 | tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) 231 | tgt = tgt + self.dropout3(tgt2) 232 | tgt = self.norm3(tgt) 233 | return tgt 234 | 235 | def forward_pre(self, tgt, memory, 236 | tgt_mask: Optional[Tensor] = None, 237 | memory_mask: Optional[Tensor] = None, 238 | tgt_key_padding_mask: Optional[Tensor] = None, 239 | memory_key_padding_mask: Optional[Tensor] = None, 240 | pos: Optional[Tensor] = None, 241 | query_pos: Optional[Tensor] = None): 242 | tgt2 = self.norm1(tgt) 243 | q = k = self.with_pos_embed(tgt2, query_pos) 244 | tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, 245 | key_padding_mask=tgt_key_padding_mask)[0] 246 | tgt = tgt + self.dropout1(tgt2) 247 | tgt2 = self.norm2(tgt) 248 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos), 249 | key=self.with_pos_embed(memory, pos), 250 | value=memory, attn_mask=memory_mask, 251 | key_padding_mask=memory_key_padding_mask)[0] 252 | tgt = tgt + self.dropout2(tgt2) 253 | tgt2 = self.norm3(tgt) 254 | tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) 255 | tgt = tgt + self.dropout3(tgt2) 256 | return tgt 257 | 258 | def forward(self, tgt, memory, 259 | tgt_mask: Optional[Tensor] = None, 260 | memory_mask: Optional[Tensor] = None, 261 | tgt_key_padding_mask: Optional[Tensor] = None, 262 | memory_key_padding_mask: Optional[Tensor] = None, 263 | pos: Optional[Tensor] = None, 264 | query_pos: Optional[Tensor] = None): 265 | if self.normalize_before: 266 | return 
self.forward_pre(tgt, memory, tgt_mask, memory_mask, 267 | tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) 268 | return self.forward_post(tgt, memory, tgt_mask, memory_mask, 269 | tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) 270 | 271 | 272 | def _get_clones(module, N): 273 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) 274 | 275 | 276 | def build_transformer(args): 277 | return Transformer( 278 | d_model=args.hidden_dim, 279 | dropout=args.dropout, 280 | nhead=args.nheads, 281 | dim_feedforward=args.dim_feedforward, 282 | num_encoder_layers=args.enc_layers, 283 | num_decoder_layers=args.dec_layers, 284 | normalize_before=args.pre_norm, 285 | return_intermediate_dec=True, 286 | ) 287 | 288 | 289 | def _get_activation_fn(activation): 290 | """Return an activation function given a string""" 291 | if activation == "relu": 292 | return F.relu 293 | if activation == "gelu": 294 | return F.gelu 295 | if activation == "glu": 296 | return F.glu 297 | raise RuntimeError("activation should be relu/gelu, not {}.".format(activation)) 298 | -------------------------------------------------------------------------------- /performance_accuracy_detr.py: -------------------------------------------------------------------------------- 1 | 2 | # 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
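For readers wiring the Transformer above into their own code, the following minimal sketch shows the expected input/output shapes. The hyperparameters mirror the usual DETR defaults (hidden_dim 256, 8 heads, 6 encoder and 6 decoder layers, 100 object queries); the feature-map size and batch size are illustrative only.

import torch

model = Transformer(d_model=256, nhead=8, num_encoder_layers=6,
                    num_decoder_layers=6, return_intermediate_dec=True)

bs, c, h, w = 2, 256, 25, 25
src = torch.rand(bs, c, h, w)                   # backbone feature map
mask = torch.zeros(bs, h, w, dtype=torch.bool)  # False = valid pixel, True = padding
query_embed = torch.rand(100, c)                # learned object queries
pos_embed = torch.rand(bs, c, h, w)             # positional encoding, same shape as src

hs, memory = model(src, mask, query_embed, pos_embed)
print(hs.shape)      # (6, 2, 100, 256): one decoder output per layer when return_intermediate_dec=True
print(memory.shape)  # (2, 256, 25, 25): encoder memory reshaped back to the feature-map layout
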
16 | # 17 | # ~~~Medcare AI Lab~~~ 18 | 19 | import os 20 | import torch 21 | import torchvision 22 | from torchsummary import summary 23 | import time 24 | import pycuda.driver as cuda 25 | import pycuda.autoinit 26 | import numpy as np 27 | 28 | from trt_util.process_img import PyTorchTensorHolder 29 | from trt_util.trt_lite import TrtLite 30 | from trt_util.process_img import preprocess_torch_v1,preprocess_torch_v2 31 | 32 | import matplotlib.pyplot as plt 33 | import matplotlib 34 | matplotlib.use('Agg') 35 | 36 | from model.hubconf import detr_resnet50 37 | device = torch.device("cuda:0") 38 | 39 | # model torch 40 | model_path='checkpoint/detr_resnet50.pth' 41 | detr = detr_resnet50(pretrained=False,num_classes=20+1).eval() # <------这里类别需要+1 42 | state_dict = torch.load(model_path) # <-----------修改加载模型的路径 43 | detr.load_state_dict(state_dict["model"]) 44 | detr.to(device) 45 | 46 | # model trt fp32 47 | engine_path="./detr.plan" 48 | trt_32 = TrtLite(engine_file_path=engine_path) 49 | 50 | # model trt fp16 51 | engine_path_16="./detr_fp16.plan" 52 | trt_16 = TrtLite(engine_file_path=engine_path_16) 53 | 54 | 55 | def acc_static_torch(test_dir,detr=detr): 56 | 57 | img, _, _ = preprocess_torch_v2(test_dir) 58 | outputs = detr(img.to(device)) 59 | 60 | scores = outputs['pred_logits'].cpu().detach().numpy() 61 | boxes = outputs['pred_boxes'].cpu().detach().numpy() 62 | 63 | return scores.ravel(), boxes.ravel() 64 | 65 | 66 | def acc_static_trt(test_dir,trt_=trt_32): 67 | 68 | # trt_.print_info() 69 | i2shape = {0: (1, 3, 800, 800)} 70 | io_info = trt_.get_io_info(i2shape) 71 | d_buffers = trt_.allocate_io_buffers(i2shape,True) 72 | output_data_trt_prob = np.zeros(io_info[1][2], dtype=np.float32) 73 | output_data_trt_box = np.zeros(io_info[2][2], dtype=np.float32) 74 | 75 | img, _, _ = preprocess_torch_v2(test_dir) 76 | d_buffers[0] = img.to(device) 77 | 78 | trt_.execute([t.data_ptr() for t in d_buffers], i2shape) 79 | 80 | scores = d_buffers[1].cpu().numpy().ravel() 81 | boxes = d_buffers[2].cpu().numpy().ravel() 82 | 83 | return scores, boxes 84 | 85 | 86 | 87 | 88 | if __name__ == "__main__": 89 | 90 | # 平均相对精度的计算 91 | # np.mean(np.abs(output_data_pytorch - output_data_trt) / np.abs(output_data_pytorch)) 92 | files = os.listdir("./test") 93 | 94 | Average_diff_perc_socre_32 = [] 95 | Average_diff_perc_box_32 = [] 96 | 97 | Average_diff_perc_socre_16 = [] 98 | Average_diff_perc_box_16 = [] 99 | 100 | for file in files[:1000]: 101 | print(file) 102 | file_path = os.path.join("./test/",file) 103 | 104 | torch_score, torch_box = acc_static_torch(file_path) 105 | 106 | # fp32 107 | trt_score_32,trt_box_32 = acc_static_trt(file_path,trt_=trt_32) 108 | 109 | # fp16 110 | trt_score_16,trt_box_16 = acc_static_trt(file_path,trt_=trt_16) 111 | 112 | adp_score_32 = np.mean(np.abs(torch_score - trt_score_32) / np.abs(torch_score)) 113 | adp_box_32 = np.mean(np.abs(torch_box - trt_box_32) / np.abs(torch_box)) 114 | 115 | adp_score_16 = np.mean(np.abs(torch_score - trt_score_16) / np.abs(torch_score)) 116 | adp_box_16 = np.mean(np.abs(torch_box - trt_box_16) / np.abs(torch_box)) 117 | 118 | Average_diff_perc_socre_32.append(adp_score_32) 119 | Average_diff_perc_box_32.append(adp_box_32) 120 | 121 | Average_diff_perc_socre_16.append(adp_score_16) 122 | Average_diff_perc_box_16.append(adp_box_16) 123 | 124 | 125 | print("-"*50) 126 | print(f"trt FP32 Score的平均相对精度:{Average_diff_perc_socre_32}") 127 | print(f"trt FP32 Box的平均相对精度:{Average_diff_perc_box_32}") 128 | 129 | print(f"trt FP16 
Score的平均相对精度:{Average_diff_perc_socre_16}") 130 | print(f"trt FP16 Box的平均相对精度:{Average_diff_perc_box_16}") 131 | print("-"*50) 132 | 133 | # plot Average diff percentage 134 | plt.rcParams['figure.figsize'] = (16.0, 9.0) # 单位是inches 135 | fig,subs=plt.subplots(2,2) 136 | subs[0][0].plot(np.arange(len(files[:1000])), Average_diff_perc_socre_32, 'ro',label='FP32(Score)') 137 | subs[0][0].axhline(y=1e-6,color='b',linestyle='--') 138 | subs[0][0].axhline(y=1e-5,color='b',linestyle='--') 139 | 140 | subs[0][0].set_xlabel('Test Image ID') 141 | subs[0][0].set_ylabel('Average Diff Percentage') 142 | subs[0][0].legend() 143 | 144 | subs[0][1].plot(np.arange(len(files[:1000])), Average_diff_perc_box_32, 'bv',label='FP32(Box)') 145 | subs[0][1].axhline(y=1e-6,color='r',linestyle='--') 146 | subs[0][1].axhline(y=1e-5,color='r',linestyle='--') 147 | subs[0][1].set_xlabel('Test Image ID') 148 | subs[0][1].set_ylabel('Average Diff Percentage') 149 | subs[0][1].legend() 150 | 151 | subs[1][0].plot(np.arange(len(files[:1000])), Average_diff_perc_socre_16, 'g^',label='FP16(Score)') 152 | subs[1][0].axhline(y=1e-3,color='k',linestyle='--') 153 | subs[1][0].axhline(y=1e-2,color='k',linestyle='--') 154 | subs[1][0].set_xlabel('Test Image ID') 155 | subs[1][0].set_ylabel('Average Diff Percentage') 156 | subs[1][0].legend() 157 | 158 | subs[1][1].plot(np.arange(len(files[:1000])), Average_diff_perc_box_16, 'k*',label='FP16(Box)') 159 | subs[1][1].axhline(y=1e-3,color='g',linestyle='--') 160 | subs[1][1].axhline(y=1e-2,color='g',linestyle='--') 161 | subs[1][1].set_xlabel('Test Image ID') 162 | subs[1][1].set_ylabel('Average Diff Percentage') 163 | subs[1][1].legend() 164 | 165 | plt.savefig("./average_diff_percentage.png") 166 | plt.close() 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /performance_time_detr.py: -------------------------------------------------------------------------------- 1 | 2 | # 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
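The accuracy script above boils down to one metric: the mean relative difference between the PyTorch output and the TensorRT output (the formula quoted in its own comment). A tiny self-contained sketch of that computation, with invented numbers:

import numpy as np

output_pytorch = np.array([0.82, -1.40, 2.05], dtype=np.float32)  # reference (PyTorch)
output_trt     = np.array([0.81, -1.41, 2.04], dtype=np.float32)  # engine output (TensorRT)

avg_diff = np.mean(np.abs(output_pytorch - output_trt) / np.abs(output_pytorch))
print(avg_diff)   # ~0.008, i.e. the engine deviates from PyTorch by roughly 0.8% on average
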
16 | # 17 | # ~~~Medcare AI Lab~~~ 18 | 19 | 20 | import torch 21 | import torchvision 22 | from torchsummary import summary 23 | import time 24 | import pycuda 25 | import pycuda.driver as cuda 26 | import pycuda.autoinit 27 | import numpy as np 28 | 29 | from trt_util.process_img import PyTorchTensorHolder 30 | from trt_util.trt_lite import TrtLite 31 | 32 | import matplotlib.pyplot as plt 33 | import matplotlib 34 | matplotlib.use('Agg') 35 | 36 | 37 | from model.hubconf import detr_resnet50 38 | device = torch.device("cuda:0") 39 | 40 | 41 | def time_static_torch(input_data,model_path='checkpoint/detr_resnet50.pth',batch_size=1,nRound=1000): 42 | detr = detr_resnet50(pretrained=False,num_classes=20+1).eval() 43 | state_dict = torch.load(model_path) 44 | detr.load_state_dict(state_dict["model"]) 45 | detr.to(device) 46 | 47 | torch.cuda.synchronize() 48 | t0 = time.time() 49 | 50 | for i in range(nRound): 51 | detr(input_data) 52 | 53 | torch.cuda.synchronize() 54 | 55 | Latency_pytorch = (time.time() - t0)*1000 / nRound 56 | Throughput_pytorch = 1000/Latency_pytorch*batch_size 57 | 58 | # 清空释放显存 59 | del detr 60 | input_data.cpu() 61 | del input_data 62 | torch.cuda.empty_cache() 63 | 64 | return Latency_pytorch, Throughput_pytorch 65 | 66 | 67 | 68 | def time_static_trt(input_data,engine_path,batch_size=1,nRound=1000): 69 | trt_ = TrtLite(engine_file_path=engine_path) 70 | # trt_.print_info() 71 | # if batch_size == 1: 72 | # i2shape = {0: (1, 3, 800, 800)} 73 | # else: 74 | # i2shape = batch_size 75 | i2shape = {0: (batch_size, 3, 800, 800)} 76 | io_info = trt_.get_io_info(i2shape) 77 | d_buffers = trt_.allocate_io_buffers(i2shape, True) 78 | output_data_trt_prob = np.zeros(io_info[1][2], dtype=np.float32) 79 | output_data_trt_box = np.zeros(io_info[2][2], dtype=np.float32) 80 | 81 | d_buffers[0] = input_data 82 | 83 | torch.cuda.synchronize() 84 | t0 = time.time() 85 | 86 | for i in range(nRound): 87 | trt_.execute([t.data_ptr() for t in d_buffers], i2shape) 88 | # output_data_trt_prob = d_buffers[1].cpu().numpy() 89 | # output_data_trt_box = d_buffers[2].cpu().numpy() 90 | 91 | torch.cuda.synchronize() 92 | 93 | Latency_trt = (time.time() - t0) *1000 / nRound 94 | Throughput_trt = 1000/Latency_trt*batch_size 95 | 96 | # 释放显存 97 | del trt_ 98 | input_data.cpu() 99 | del input_data 100 | d_buffers[0].cpu() 101 | try: 102 | del d_buffers[0] 103 | d_buffers[1].cpu() 104 | del d_buffers[1] 105 | d_buffers[2].cpu() 106 | del d_buffers[2] 107 | except: 108 | pass 109 | torch.cuda.empty_cache() 110 | 111 | return Latency_trt, Throughput_trt 112 | 113 | 114 | 115 | if __name__ == "__main__": 116 | 117 | 118 | # # Latency and Throughput 119 | # Pytorch batch size = 32时 out of memory, 因此我们仅对比了batch size是16以下的batch 120 | # batch size =32 121 | # RuntimeError: CUDA out of memory. 
122 | # Tried to allocate 314.00 MiB (GPU 0; 14.76 GiB total capacity; 13.03 GiB already allocated; 123 | # 230.75 MiB free; 13.55 GiB reserved in total by PyTorch) 124 | 125 | batch_sizes = [16,8,4,2,1] 126 | # batch_sizes = [16] 127 | 128 | # FP32 129 | static_torch_32 = {'batch_size':[],'latency':[],'throughput':[],'LSU':[],"TSU":[]} 130 | static_trt_32 = {'batch_size':[],'latency':[],'throughput':[],'LSU':[],"TSU":[]} 131 | static_trt_16 = {'batch_size':[],'latency':[],'throughput':[],'LSU':[],"TSU":[]} 132 | 133 | for batch_size in batch_sizes: 134 | print(f"[INFO] 当前测试的batch size为:{batch_size}") 135 | torch.manual_seed(0) 136 | input_data = torch.randn(batch_size, 3, 800, 800, dtype=torch.float32, device='cuda') 137 | 138 | # torch 139 | print("[INFO] 正在进行pytorch测试") 140 | l_torch,t_torch = time_static_torch(input_data=input_data,model_path='checkpoint/detr_resnet50.pth',batch_size=batch_size,nRound=1000) 141 | 142 | print("[INFO] 释放模型") 143 | time.sleep(10) 144 | 145 | # fp32 146 | if batch_size == 1: 147 | batch_plan = "./detr.plan" 148 | else: 149 | batch_plan = f"./output/detr_batch_{batch_size}.plan" 150 | print("[INFO] 正在进行trt FP32测试") 151 | l_trt,t_trt = time_static_trt(input_data=input_data,engine_path=batch_plan,batch_size=batch_size,nRound=1000) 152 | 153 | lsu = round(l_trt / l_torch,2) 154 | tsu = round(t_trt /t_torch,2) 155 | 156 | time.sleep(10) 157 | 158 | # fp16 159 | if batch_size == 1: 160 | batch_plan = "./detr_fp16.plan" 161 | else: 162 | batch_plan = f"./output/detr_batch_{batch_size}_fp16.plan" 163 | print("[INFO] 正在进行trt FP16测试") 164 | l_trt_16,t_trt_16 = time_static_trt(input_data=input_data,engine_path=batch_plan,batch_size=batch_size,nRound=1000) 165 | 166 | lsu_16 = round(l_trt_16 / l_torch,2) 167 | tsu_16 = round(t_trt_16 /t_torch,2) 168 | 169 | input_data.cpu() 170 | del input_data 171 | torch.cuda.empty_cache() 172 | time.sleep(10) 173 | 174 | static_torch_32['batch_size'].append(batch_size) 175 | static_torch_32['latency'].append(l_torch) 176 | static_torch_32['throughput'].append(t_torch) 177 | static_torch_32['LSU'].append("1x") 178 | static_torch_32['TSU'].append("1x") 179 | 180 | static_trt_32['batch_size'].append(batch_size) 181 | static_trt_32['latency'].append(l_trt) 182 | static_trt_32['throughput'].append(t_trt) 183 | static_trt_32['LSU'].append(str(lsu)+"x") 184 | static_trt_32['TSU'].append(str(tsu)+"x") 185 | 186 | static_trt_16['batch_size'].append(batch_size) 187 | static_trt_16['latency'].append(l_trt_16) 188 | static_trt_16['throughput'].append(t_trt_16) 189 | static_trt_16['LSU'].append(str(lsu_16)+"x") 190 | static_trt_16['TSU'].append(str(tsu_16)+"x") 191 | 192 | print("-"*50) 193 | print("torch:") 194 | print(static_torch_32) 195 | print("trt fp32:") 196 | print(static_trt_32) 197 | print("trt fp16:") 198 | print(static_trt_16) 199 | print("-"*50) 200 | 201 | # plot latency vs throughput 202 | torch_x = static_torch_32['latency'] 203 | torch_y = static_torch_32['throughput'] 204 | 205 | trt_32_x = static_trt_32['latency'] 206 | trt_32_y = static_trt_32['throughput'] 207 | 208 | trt_16_x = static_trt_16['latency'] 209 | trt_16_y = static_trt_16['throughput'] 210 | 211 | plt.rcParams['figure.figsize'] = (16.0, 9.0) 212 | plt.plot(torch_x, torch_y, 'ro--',label='Pytorch') 213 | for i,(a, b) in enumerate(zip(torch_x, torch_y)): 214 | # plt.text(a+15,b-0.15,'(%d,%d,%d)'%(batch_size[i],a,b),ha='center', va='bottom',fontdict={'size': 10, 'color': 'r'}) 215 | plt.text(a+15,b-0.15,f'Batch:{batch_sizes[i]}',ha='center', 
va='bottom',fontdict={'size': 10, 'color': 'r'}) 216 | 217 | 218 | plt.plot(trt_32_x, trt_32_y, 'b^--',label='TensorRT(FP32)') 219 | for i,(a, b) in enumerate(zip(trt_32_x, trt_32_y)): 220 | # plt.text(a+15,b-0.15,'(%d,%d)'%(a,b),ha='center', va='bottom',fontdict={'size': 10, 'color': 'b'}) 221 | plt.text(a+15,b-0.15,f'Batch:{batch_sizes[i]}',ha='center', va='bottom',fontdict={'size': 10, 'color': 'b'}) 222 | 223 | 224 | plt.plot(trt_16_x, trt_16_y, 'g*--',label='TensorRT(FP16)') 225 | for i,(a, b) in enumerate(zip(trt_16_x, trt_16_y)): 226 | # plt.text(a+15,b-0.15,'(%d,%d)'%(a,b),ha='center', va='bottom',fontdict={'size': 10, 'color': 'g'}) 227 | plt.text(a+15,b-0.15,f'Batch:{batch_sizes[i]}',ha='center', va='bottom',fontdict={'size': 10, 'color': 'g'}) 228 | if batch_sizes[i] in [4,8]: 229 | plt.annotate(f"({int(a)},{int(b)})",xy=(a,b),xytext=(a*0.9,b*0.9),arrowprops=dict(arrowstyle='->',connectionstyle='arc3,rad=.2')) 230 | 231 | plt.xlabel('Latency (ms)') 232 | plt.ylabel('Throughput') 233 | plt.legend() 234 | plt.savefig("./latency_vs_throughput.png") 235 | plt.close() 236 | 237 | 238 | 239 | 240 | 241 | -------------------------------------------------------------------------------- /pic/average_diff_percentage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataXujing/TensorRT-DETR/5b32fba6a627f5430928e6a86a1a922eb447f99d/pic/average_diff_percentage.png -------------------------------------------------------------------------------- /pic/bug1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataXujing/TensorRT-DETR/5b32fba6a627f5430928e6a86a1a922eb447f99d/pic/bug1.png -------------------------------------------------------------------------------- /pic/bug1_onnx.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataXujing/TensorRT-DETR/5b32fba6a627f5430928e6a86a1a922eb447f99d/pic/bug1_onnx.png -------------------------------------------------------------------------------- /pic/bug2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataXujing/TensorRT-DETR/5b32fba6a627f5430928e6a86a1a922eb447f99d/pic/bug2.png -------------------------------------------------------------------------------- /pic/bug2_onnx.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataXujing/TensorRT-DETR/5b32fba6a627f5430928e6a86a1a922eb447f99d/pic/bug2_onnx.png -------------------------------------------------------------------------------- /pic/bug3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataXujing/TensorRT-DETR/5b32fba6a627f5430928e6a86a1a922eb447f99d/pic/bug3.png -------------------------------------------------------------------------------- /pic/bug4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataXujing/TensorRT-DETR/5b32fba6a627f5430928e6a86a1a922eb447f99d/pic/bug4.png -------------------------------------------------------------------------------- /pic/latency_vs_throughput.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataXujing/TensorRT-DETR/5b32fba6a627f5430928e6a86a1a922eb447f99d/pic/latency_vs_throughput.png 
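The timing loop in performance_time_detr.py above follows a standard pattern: synchronize, run the model nRound times, synchronize again, then derive per-batch latency and throughput. A condensed sketch of that pattern (model, input_data and nRound are placeholders, and a CUDA device is assumed):

import time
import torch

def measure(model, input_data, batch_size, nRound=100):
    torch.cuda.synchronize()                          # make sure pending GPU work is done
    t0 = time.time()
    for _ in range(nRound):
        model(input_data)
    torch.cuda.synchronize()                          # wait for the last forward pass to finish
    latency_ms = (time.time() - t0) * 1000 / nRound   # average time per batch, in ms
    throughput = 1000 / latency_ms * batch_size       # images processed per second
    return latency_ms, throughput
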
-------------------------------------------------------------------------------- /pic/test_fp16.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataXujing/TensorRT-DETR/5b32fba6a627f5430928e6a86a1a922eb447f99d/pic/test_fp16.jpg -------------------------------------------------------------------------------- /pic/test_fp32.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataXujing/TensorRT-DETR/5b32fba6a627f5430928e6a86a1a922eb447f99d/pic/test_fp32.jpg -------------------------------------------------------------------------------- /pic/time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataXujing/TensorRT-DETR/5b32fba6a627f5430928e6a86a1a922eb447f99d/pic/time.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.4 2 | colored==1.4.2 3 | cupy-cuda112==8.6.0 4 | cycler==0.10.0 5 | dataclasses==0.6 6 | decorator==4.4.2 7 | fastrlock==0.6 8 | future==0.18.2 9 | graphsurgeon @ file:///TensorRT-7.2.2.3/graphsurgeon/graphsurgeon-0.4.5-py2.py3-none-any.whl 10 | kiwisolver==1.3.1 11 | Mako==1.1.4 12 | MarkupSafe==1.1.1 13 | matplotlib==3.4.1 14 | numpy==1.20.1 15 | nvidia-pyindex==1.0.8 16 | onnx==1.8.1 17 | onnx-graphsurgeon==0.2.7 18 | onnx-simplifier==0.3.4 19 | onnx2pytorch==0.2.0 20 | onnxoptimizer==0.2.5 21 | onnxruntime==1.7.0 22 | opencv-python==4.5.1.48 23 | Pillow==8.1.1 24 | polygraphy==0.25.0 25 | protobuf==3.15.5 26 | pycuda==2020.1 27 | pyparsing==2.4.7 28 | python-dateutil==2.8.1 29 | pytools==2021.1 30 | scipy==1.6.2 31 | six==1.15.0 32 | tensorrt @ file:///TensorRT-7.2.2.3/python/tensorrt-7.2.2.3-cp38-none-linux_x86_64.whl 33 | torch==1.6.0 34 | torchsummary==1.5.1 35 | torchvision 36 | typing-extensions==3.7.4.3 37 | uff @ file:///TensorRT-7.2.2.3/uff/uff-0.6.9-py2.py3-none-any.whl 38 | -------------------------------------------------------------------------------- /trt_int8_quant.py: -------------------------------------------------------------------------------- 1 | 2 | # 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | # ~~~Medcare AI Lab~~~ 18 | 19 | import os 20 | import glob 21 | import cv2 22 | from PIL import Image 23 | import numpy as np 24 | import argparse 25 | 26 | import torchvision.transforms as T 27 | from trt_util.common import build_engine_onnx_v2 28 | from trt_util.calibrator import Calibrator 29 | 30 | 31 | transform = T.Compose([ 32 | T.Resize((800,800)), # PIL.Image.BILINEAR 33 | T.ToTensor(), 34 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 35 | ]) 36 | 37 | def preprocess(img_pil): 38 | img = transform(img_pil).cpu().numpy() 39 | return img 40 | 41 | # def preprocess(img_pil): 42 | # img = img_pil.resize((800, 800),Image.BILINEAR) 43 | # img = np.array(img).astype(np.float32) / 255.0 44 | # img = img.transpose(2,0,1) 45 | # # print(img.shape) 46 | # img = (img - np.array([ [[0.485]], [[0.456]], [[0.406]] ]))/np.array([ [[0.229]], [[0.224]], [[0.225]] ]) 47 | 48 | # # img = img.transpose(1,2,0) 49 | # # img = np.expand_dims(img, axis=0) 50 | # img = np.ascontiguousarray(img) 51 | # img = np.array(img).astype(np.float32) 52 | # print(img.shape) 53 | # return img 54 | 55 | class DataLoader: 56 | def __init__(self,calib_img_dir="./calib_train_image",batch=1,batch_size=32): 57 | self.index = 0 58 | self.length = batch 59 | self.batch_size = batch_size 60 | self.calib_img_dir = calib_img_dir 61 | # self.img_list = [i.strip() for i in open('calib.txt').readlines()] 62 | self.img_list = glob.glob(os.path.join(self.calib_img_dir, "*.jpg")) 63 | print(f'[INFO] found all {len(self.img_list)} images to calib.') 64 | assert len(self.img_list) > self.batch_size * self.length, '[Error] {} must contains more than {} images to calib'.format(self.calib_img_dir,self.batch_size * self.length) 65 | self.calibration_data = np.zeros((self.batch_size,3,800,800), dtype=np.float32) 66 | 67 | def reset(self): 68 | self.index = 0 69 | 70 | def next_batch(self): 71 | if self.index < self.length: 72 | for i in range(self.batch_size): 73 | assert os.path.exists(self.img_list[i + self.index * self.batch_size]), '[Error] Batch not found!!' 74 | # data preprocess 75 | img = Image.open(self.img_list[i + self.index * self.batch_size]) 76 | # img = cv2.imread(self.img_list[i + self.index * self.batch_size]) 77 | img = preprocess(img) 78 | # self.calibration_data[i] = np.ones((3,800,800), dtype=np.float32) 79 | self.calibration_data[i] = img 80 | 81 | self.index += 1 82 | return np.ascontiguousarray(self.calibration_data, dtype=np.float32) 83 | else: 84 | return np.array([]) 85 | 86 | def __len__(self): 87 | return self.length 88 | 89 | def main(onnx_model_path,engine_model_path,calib_img_dir,calibration_table,fp16,int8,batch,batch_size): 90 | 91 | fp16_mode = fp16 92 | int8_mode = int8 93 | 94 | # calibration 95 | calibration_stream = DataLoader(calib_img_dir=calib_img_dir,batch=batch,batch_size=batch_size) 96 | engine_model_path = engine_model_path 97 | 98 | # 校准产生校准表,但是我们并没有生成校准表! 
99 | engine_fixed = build_engine_onnx_v2(onnx_model_path, engine_model_path, fp16_mode=fp16_mode, 100 | int8_mode=int8_mode,max_batch_size=batch_size, calibration_stream=calibration_stream, 101 | calibration_table_path=calibration_table, save_engine=True) 102 | assert engine_fixed, '[Error] Broken engine_fixed' 103 | print('[INFO] ====> onnx to tensorrt completed !\n') 104 | 105 | if __name__ == '__main__': 106 | 107 | parser = argparse.ArgumentParser(description='TensorRT INT8 Quant.') 108 | parser.add_argument('--onnx_model_path', type= str , default='./detr_sim.onnx', help='ONNX Model Path') 109 | parser.add_argument('--engine_model_path', type= str , default='./detr_int8.plan', help='TensorRT Engine File') 110 | parser.add_argument('--calib_img_dir', type= str , default='./calib_train_image', help='Calib Image Dir') 111 | parser.add_argument('--calibration_table', type=str,default="./detr_calibration.cache", help='Calibration Table') 112 | parser.add_argument('--batch', type=int,default=958, help='Number of Batch: [total_image/batch_size]') # 30660/batch_size 113 | parser.add_argument('--batch_size', type=int,default=32, help='Batch Size') 114 | 115 | parser.add_argument('--fp16', action="store_true", help='Open FP16 Mode') 116 | parser.add_argument('--int8', action="store_true", help='Open INT8 Mode') 117 | 118 | args = parser.parse_args() 119 | main(args.onnx_model_path,args.engine_model_path,args.calib_img_dir,args.calibration_table, 120 | args.fp16,args.int8,args.batch,args.batch_size) 121 | 122 | # python3 trt_int8_quant.py --onnx_model_path ./detr_sim.onnx --engine_model_path ./detr_int8.plan --calib_img_dir ./calib_train_image --calibration_table ./detr_calibration.cache --batch 1 --int8 123 | 124 | -------------------------------------------------------------------------------- /trt_util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataXujing/TensorRT-DETR/5b32fba6a627f5430928e6a86a1a922eb447f99d/trt_util/__init__.py -------------------------------------------------------------------------------- /trt_util/calibrator.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
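The DataLoader defined above is consumed one batch at a time until next_batch() returns an empty array. A short sketch of that consumption loop (the directory path is a placeholder and must contain enough .jpg images for the requested number of batches):

loader = DataLoader(calib_img_dir="./calib_train_image", batch=2, batch_size=4)
while True:
    batch = loader.next_batch()      # np.float32 array of shape (4, 3, 800, 800)
    if not batch.size:               # an empty array signals the stream is exhausted
        break
    print(batch.shape)
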
15 | # 16 | # ~~~Medcare AI Lab~~~ 17 | 18 | import os 19 | import tensorrt as trt 20 | import pycuda.driver as cuda 21 | import pycuda.autoinit 22 | import numpy as np 23 | import ctypes 24 | 25 | 26 | 27 | class Calibrator(trt.IInt8EntropyCalibrator2): 28 | '''calibrator 29 | IInt8EntropyCalibrator2 30 | IInt8LegacyCalibrator 31 | IInt8EntropyCalibrator 32 | IInt8MinMaxCalibrator 33 | 34 | ''' 35 | def __init__(self, stream, cache_file=""): 36 | trt.IInt8EntropyCalibrator2.__init__(self) 37 | self.stream = stream 38 | self.d_input = cuda.mem_alloc(self.stream.calibration_data.nbytes) 39 | self.cache_file = cache_file 40 | # print(self.cache_file) 41 | stream.reset() 42 | 43 | 44 | def get_batch_size(self): 45 | return self.stream.batch_size 46 | 47 | def get_batch(self, names): 48 | 49 | batch = self.stream.next_batch() 50 | if not batch.size: 51 | return None 52 | 53 | cuda.memcpy_htod(self.d_input, batch) 54 | return [int(self.d_input)] 55 | 56 | def read_calibration_cache(self): 57 | # If there is a cache, use it instead of calibrating again. Otherwise, implicitly return None. 58 | if os.path.exists(self.cache_file): 59 | with open(self.cache_file, "rb") as f: 60 | print(f"[INFO] Using calibration cache to save time: {self.cache_file}") 61 | return f.read() 62 | 63 | def write_calibration_cache(self, cache): 64 | with open(self.cache_file, "wb") as f: 65 | print(f"[INFO] Caching calibration data for future use: {self.cache_file}") 66 | f.write(cache) 67 | -------------------------------------------------------------------------------- /trt_util/common.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | # ~~~Medcare AI Lab~~~ 17 | # 该部分代码参考了TensorRT官方示例完成,对相关方法进行修改 18 | # 19 | 20 | 21 | import pycuda.driver as cuda 22 | #https://documen.tician.de/pycuda/driver.html 23 | import pycuda.autoinit 24 | import numpy as np 25 | import tensorrt as trt 26 | from .calibrator import Calibrator 27 | 28 | import sys, os 29 | import time 30 | 31 | # TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE) 32 | # TRT_LOGGER = trt.Logger(trt.Logger.INFO) 33 | TRT_LOGGER = trt.Logger() 34 | 35 | # Allocate host and device buffers, and create a stream. 
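# A minimal sketch of how the Calibrator defined in trt_util/calibrator.py above is wired
# into an INT8 build (assumes a DataLoader-style `stream` object exposing batch_size,
# next_batch() and reset(); the cache path is a placeholder):
#
#     calib = Calibrator(stream, cache_file="./detr_calibration.cache")
#     config.int8_calibrator = calib   # TensorRT polls get_batch() repeatedly while building
#
# If the cache file already exists, read_calibration_cache() returns its contents and the
# image pipeline above is skipped entirely.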
36 | class HostDeviceMem(object): 37 | def __init__(self, host_mem, device_mem): 38 | self.host = host_mem 39 | self.device = device_mem 40 | 41 | def __str__(self): 42 | return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device) 43 | 44 | def __repr__(self): 45 | return self.__str__() 46 | 47 | 48 | def allocate_buffers(engine): 49 | inputs = [] 50 | outputs = [] 51 | bindings = [] 52 | stream = cuda.Stream() 53 | for binding in engine: 54 | size = trt.volume(engine.get_binding_shape(binding)) # <--------- the main diff to v2 55 | dtype = trt.nptype(engine.get_binding_dtype(binding)) 56 | # Allocate host and device buffers 57 | host_mem = cuda.pagelocked_empty(size, dtype) 58 | device_mem = cuda.mem_alloc(host_mem.nbytes) 59 | # Append the device buffer to device bindings. 60 | bindings.append(int(device_mem)) 61 | # Append to the appropriate list. 62 | if engine.binding_is_input(binding): 63 | inputs.append(HostDeviceMem(host_mem, device_mem)) 64 | else: 65 | outputs.append(HostDeviceMem(host_mem, device_mem)) 66 | return inputs, outputs, bindings, stream 67 | 68 | 69 | 70 | def allocate_buffers_v2(engine): 71 | inputs = [] 72 | outputs = [] 73 | bindings = [] 74 | stream = cuda.Stream() 75 | for binding in engine: 76 | size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size 77 | dtype = trt.nptype(engine.get_binding_dtype(binding)) 78 | # Allocate host and device buffers 79 | host_mem = cuda.pagelocked_empty(size, dtype) 80 | device_mem = cuda.mem_alloc(host_mem.nbytes) 81 | # Append the device buffer to device bindings. 82 | bindings.append(int(device_mem)) 83 | # Append to the appropriate list. 84 | if engine.binding_is_input(binding): 85 | inputs.append(HostDeviceMem(host_mem, device_mem)) 86 | else: 87 | outputs.append(HostDeviceMem(host_mem, device_mem)) 88 | return inputs, outputs, bindings, stream 89 | 90 | 91 | # do inference multi outputs 92 | def do_inference_v2(context, bindings, inputs, outputs, stream, input_tensor): 93 | # Transfer input data to the GPU. 94 | [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] 95 | 96 | # Run inference. 97 | context.execute_async_v2(bindings=bindings, stream_handle=stream.handle) 98 | # Transfer predictions back from the GPU. 99 | [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] 100 | # Synchronize the stream 101 | stream.synchronize() 102 | # Return only the host outputs. 103 | return [out.host for out in outputs] 104 | 105 | # The onnx path is used for Pytorch models. 106 | def build_engine_onnx(model_file,engine_file,FP16=False,verbose=False,dynamic_input=False,batch_size=1): 107 | 108 | def get_engine(): 109 | EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) 110 | # with trt.Builder(TRT_LOGGER) as builder, builder.create_network(EXPLICIT_BATCH) as network,builder.create_builder_config() as config, trt.OnnxParser(network,TRT_LOGGER) as parser: 111 | with trt.Builder(TRT_LOGGER) as builder, builder.create_network(EXPLICIT_BATCH) as network, builder.create_builder_config() as config,\ 112 | trt.OnnxParser(network,TRT_LOGGER) as parser: 113 | # Workspace size is the maximum amount of memory available to the builder while building an engine. 
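            # A minimal sketch of how allocate_buffers_v2 and do_inference_v2 above are
            # combined at inference time (illustrative; `engine` is a deserialized
            # ICudaEngine and `img` a (1,3,800,800) float32 array -- placeholder names):
            #
            #     inputs, outputs, bindings, stream = allocate_buffers_v2(engine)
            #     with engine.create_execution_context() as context:
            #         np.copyto(inputs[0].host, img.ravel())
            #         outs = do_inference_v2(context, bindings, inputs, outputs, stream, img)
            #     # outs holds one flat host array per output binding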
114 | builder.max_workspace_size = 6 << 30 # 6G 115 | builder.max_batch_size = batch_size 116 | # config.max_batch_size = 2 117 | 118 | if FP16: 119 | print("[INFO] Open FP16 Mode!") 120 | # config.set_flag(tensorrt.BuilderFlag.FP16) 121 | builder.fp16_mode = True 122 | 123 | with open(model_file, 'rb') as model: 124 | parser.parse(model.read()) 125 | if verbose: 126 | print(">"*50) 127 | for error in range(parser.num_errors): 128 | print(parser.get_error(error)) 129 | 130 | network.get_input(0).shape = [ batch_size, 3, 800, 800 ] 131 | 132 | if dynamic_input: 133 | profile = builder.create_optimization_profile(); 134 | profile.set_shape("inputs", (1,3,800,800), (8,3,800,800), (64,3,800,800)) 135 | config.add_optimization_profile(profile) 136 | 137 | # builder engine 138 | engine = builder.build_cuda_engine(network) 139 | print("[INFO] Completed creating Engine!") 140 | with open(engine_file, "wb") as f: 141 | f.write(engine.serialize()) 142 | return engine 143 | 144 | if os.path.exists(engine_file): 145 | # If a serialized engine exists, use it instead of building an engine. 146 | print("[INFO] Reading engine from file {}".format(engine_file)) 147 | with open(engine_file, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime: 148 | return runtime.deserialize_cuda_engine(f.read()) 149 | else: 150 | return get_engine() 151 | 152 | 153 | # int8 quant 154 | def build_engine_onnx_v2(onnx_file_path="", engine_file_path="",fp16_mode=False, int8_mode=False, \ 155 | max_batch_size=1,calibration_stream=None, calibration_table_path="", save_engine=False): 156 | """Attempts to load a serialized engine if available, otherwise builds a new TensorRT engine and saves it.""" 157 | def build_engine(max_batch_size, save_engine): 158 | """Takes an ONNX file and creates a TensorRT engine to run inference with""" 159 | with trt.Builder(TRT_LOGGER) as builder, builder.create_network(1) as network,\ 160 | builder.create_builder_config() as config,trt.OnnxParser(network, TRT_LOGGER) as parser: 161 | 162 | # parse onnx model file 163 | if not os.path.exists(onnx_file_path): 164 | quit(f'[Error]ONNX file {onnx_file_path} not found') 165 | print(f'[INFO] Loading ONNX file from path {onnx_file_path}...') 166 | with open(onnx_file_path, 'rb') as model: 167 | print('[INFO] Beginning ONNX file parsing') 168 | parser.parse(model.read()) 169 | assert network.num_layers > 0, '[Error] Failed to parse ONNX model. 
\ 170 | Please check if the ONNX model is compatible ' 171 | print('[INFO] Completed parsing of ONNX file') 172 | print(f'[INFO] Building an engine from file {onnx_file_path}; this may take a while...') 173 | 174 | # build trt engine 175 | builder.max_batch_size = max_batch_size 176 | # config.max_workspace_size = 2 << 30 # 2GB 177 | builder.max_workspace_size = 2 << 30 # 2GB 178 | builder.fp16_mode = fp16_mode 179 | if int8_mode: 180 | builder.int8_mode = int8_mode 181 | # config.set_flag(trt.BuilderFlag.INT8) 182 | assert calibration_stream, '[Error] a calibration_stream should be provided for int8 mode' 183 | config.int8_calibrator = Calibrator(calibration_stream, calibration_table_path) 184 | # builder.int8_calibrator = Calibrator(calibration_stream, calibration_table_path) 185 | print('[INFO] Int8 mode enabled') 186 | engine = builder.build_cuda_engine(network) 187 | if engine is None: 188 | print('[INFO] Failed to create the engine') 189 | return None 190 | print("[INFO] Completed creating the engine") 191 | if save_engine: 192 | with open(engine_file_path, "wb") as f: 193 | f.write(engine.serialize()) 194 | return engine 195 | 196 | if os.path.exists(engine_file_path): 197 | # If a serialized engine exists, load it instead of building a new one. 198 | print(f"[INFO] Reading engine from file {engine_file_path}") 199 | with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime: 200 | return runtime.deserialize_cuda_engine(f.read()) 201 | else: 202 | return build_engine(max_batch_size, save_engine) 203 | 204 | 205 | -------------------------------------------------------------------------------- /trt_util/plot_box.py: -------------------------------------------------------------------------------- 1 | 2 | # 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | #
17 | # ~~~Medcare AI Lab~~~
18 |
19 |
20 | import numpy as np
21 | import torch
22 | from PIL import Image
23 | import matplotlib.pyplot as plt
24 | import matplotlib
25 | matplotlib.use('Agg')
26 | import cupy as cp
27 |
28 |
29 | CLASSES = ["NA","Class A","Class B","Class C","Class D","Class E","Class F",
30 |            "Class G","Class H","Class I","Class J","Class K","Class L","Class M",
31 |            "Class N","Class O","Class P","Class Q","Class R","Class S","Class T","Class U"]
32 |
33 | COLORS = [[0.000, 0.447, 0.741], [0.850, 0.325, 0.098], [0.929, 0.694, 0.125],
34 |           [0.494, 0.184, 0.556], [0.466, 0.674, 0.188], [0.301, 0.745, 0.933]]
35 |
36 | # convert boxes from (cx, cy, w, h) to (x1, y1, x2, y2)
37 | def box_cxcywh_to_xyxy(x):
38 |     x = torch.from_numpy(x)
39 |     x_c, y_c, w, h = x.unbind(1)
40 |     b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
41 |          (x_c + 0.5 * w), (y_c + 0.5 * h)]
42 |     return torch.stack(b, dim=1)
43 |
44 | # rescale boxes from the normalized [0, 1] range to absolute image coordinates
45 | def rescale_bboxes(out_bbox, size):
46 |     img_w, img_h = size
47 |     b = box_cxcywh_to_xyxy(out_bbox)
48 |     b = b.cpu().numpy()
49 |     b = b * np.array([img_w, img_h, img_w, img_h], dtype=np.float32)
50 |     return b
51 |
52 |
53 | def plot_box(pil_img, prob, boxes, prob_threshold=0.1, save_fig=''):
54 |
55 |     # drop boxes whose class confidence falls below the threshold
56 |     # print(prob)
57 |     # print(boxes)
58 |     prob = torch.from_numpy(prob).softmax(-1)[0,:,:-1]
59 |     keep = prob.max(-1).values >= prob_threshold
60 |     # convert boxes from [0; 1] to image scales
61 |     prob = prob.cpu().detach().numpy()
62 |     keep = keep.cpu().detach().numpy()
63 |
64 |     boxes = rescale_bboxes(boxes[0, keep], pil_img.size)
65 |     prob = prob[keep]
66 |
67 |     # print("----------------*--------------------")
68 |     # print(f"prob: {prob}")
69 |     # print(f"box: {boxes}")
70 |     # print("----------------*--------------------")
71 |
72 |     # plot box
73 |     plt.figure(figsize=(16,10))
74 |     plt.imshow(pil_img)
75 |     ax = plt.gca()
76 |     for p, (xmin, ymin, xmax, ymax), c in zip(prob, boxes.tolist(), COLORS * 100):
77 |         ax.add_patch(plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
78 |                                    fill=False, color=c, linewidth=3))
79 |         cl = p.argmax()
80 |         text = f'{CLASSES[cl]}: {p[cl]:0.2f}'
81 |         ax.text(xmin, ymin, text, fontsize=15,
82 |                 bbox=dict(facecolor='yellow', alpha=0.5))
83 |     plt.axis('off')
84 |     # plt.show()
85 |     if not save_fig == '':
86 |         plt.savefig(save_fig,transparent=True, dpi=300, pad_inches = 0)
87 |
88 |     plt.close()
89 |
--------------------------------------------------------------------------------
/trt_util/process_img.py:
--------------------------------------------------------------------------------
1 |
2 | #
3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
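# A minimal sketch of how the helpers in trt_util/plot_box.py above fit together
# (illustrative; the (1, 100, len(CLASSES)) logits and (1, 100, 4) normalized cxcywh
# boxes are assumed DETR output shapes, and "res.jpg" is a placeholder path):
#
#     scores, boxes = ...               # raw network outputs as numpy arrays
#     plot_box(pil_img, scores, boxes, prob_threshold=0.5, save_fig="res.jpg")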
16 | #
17 | # ~~~Medcare AI Lab~~~
18 |
19 |
20 |
21 | import numpy as np
22 | from PIL import Image
23 |
24 | import torch
25 | from torch import nn
26 | import torchvision.transforms as T
27 |
28 | import pycuda.driver as cuda
29 | import pycuda.autoinit
30 |
31 | import cupy as cp
32 | from cupy.core.dlpack import toDlpack
33 | from cupy.core.dlpack import fromDlpack
34 | from torch.utils.dlpack import to_dlpack
35 | from torch.utils.dlpack import from_dlpack
36 |
37 | def preprocess_np(img_path):
38 |     '''preprocess using numpy
39 |     '''
40 |     im = Image.open(img_path)
41 |     img = im.resize((800, 800),Image.BILINEAR)
42 |     img = np.array(img).astype(np.float32) / 255.0
43 |     img = img.transpose(2,0,1)
44 |     # print(img.shape)
45 |     img = (img - np.array([ [[0.485]], [[0.456]], [[0.406]] ]))/np.array([ [[0.229]], [[0.224]], [[0.225]] ])
46 |
47 |     # img = img.transpose(1,2,0)
48 |     img = np.expand_dims(img, axis=0)
49 |     img = np.ascontiguousarray(img)
50 |     img = np.array(img).astype(np.float32)
51 |
52 |     return img, im, im.size
53 |
54 |
55 | class PyTorchTensorHolder(pycuda.driver.PointerHolderBase):
56 |     '''Source:
57 |     https://github.com/NVIDIA/trt-samples-for-hackathon-cn/blob/master/python/app_onnx_resnet50.py
58 |     '''
59 |     def __init__(self, tensor):
60 |         super(PyTorchTensorHolder, self).__init__()
61 |         self.tensor = tensor
62 |     def get_pointer(self):
63 |         return self.tensor.data_ptr()
64 |
65 | transform = T.Compose([
66 |     T.Resize((800,800)), # PIL.Image.BILINEAR
67 |     T.ToTensor(),
68 |     T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
69 | ])
70 |
71 |
72 | def preprocess_torch(img_path):
73 |     '''preprocess using torchvision
74 |     '''
75 |     im = Image.open(img_path)
76 |     img = transform(im).unsqueeze(0)
77 |     img = PyTorchTensorHolder(img)
78 |     return img, im, im.size
79 |
80 | def preprocess_torch_v1(img_path):  # torchvision transform, returned as a numpy blob
81 |     im = Image.open(img_path)
82 |     img = transform(im).unsqueeze(0).cpu().numpy()
83 |     return img, im, im.size
84 |
85 | def preprocess_torch_v2(img_path):  # torchvision transform, returned as a torch tensor
86 |     im = Image.open(img_path)
87 |     img = transform(im).unsqueeze(0)
88 |     return img, im, im.size
89 |
90 |
91 | def preprocess_cu(img_np):  # GPU preprocessing with cupy; img_np is an HWC image array
92 |     mean_cp = cp.array([ [[0.485]], [[0.456]], [[0.406]] ])
93 |     std_cp = cp.array([ [[0.229]], [[0.224]], [[0.225]] ])
94 |
95 |     img_cu = cp.divide(cp.asarray(img_np,dtype=cp.float32),255.0)
96 |     img_cu = img_cu.transpose(2,0,1)
97 |     img_cu = cp.subtract(img_cu,mean_cp)
98 |     img_cu = cp.divide(img_cu,std_cp)
99 |
100 |     # cupy to torch tensor
101 |     # img_tensor = from_dlpack(toDlpack(img_cu))
102 |
103 |     return img_cu
--------------------------------------------------------------------------------
/trt_util/trt_lite.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
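# A minimal sketch comparing the preprocessing paths in trt_util/process_img.py above
# (illustrative; "test.jpg" is a placeholder image path):
#
#     blob_np, im, size = preprocess_np("test.jpg")        # pure numpy, (1, 3, 800, 800) float32
#     blob_t, im, size = preprocess_torch_v1("test.jpg")   # torchvision transform, back to numpy
#     print(np.abs(blob_np - blob_t).max())                # small gap from differing resize filters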
15 | # 16 | # ~~~Medcare AI Lab~~~ 17 | # 代码来源:https://github.com/NVIDIA/trt-samples-for-hackathon-cn/blob/master/python/ (仅做简单修改) 18 | 19 | from functools import reduce 20 | import tensorrt 21 | import torch 22 | import numpy as np 23 | 24 | class TrtLite: 25 | def __init__(self, build_engine_proc = None, build_engine_params = None, engine_file_path = None): 26 | logger = tensorrt.Logger(tensorrt.Logger.INFO) 27 | if engine_file_path is None: 28 | with tensorrt.Builder(logger) as builder: 29 | if build_engine_params is not None: 30 | self.engine = build_engine_proc(builder, *build_engine_params) 31 | else: 32 | self.engine = build_engine_proc(builder) 33 | else: 34 | with open(engine_file_path, 'rb') as f, tensorrt.Runtime(logger) as runtime: 35 | self.engine = runtime.deserialize_cuda_engine(f.read()) 36 | self.context = self.engine.create_execution_context() 37 | 38 | def __del__(self): 39 | self.engine = None 40 | self.context = None 41 | print("[INFO] 释放模型") 42 | 43 | def save_to_file(self, engine_file_path): 44 | with open(engine_file_path, 'wb') as f: 45 | f.write(self.engine.serialize()) 46 | 47 | def get_io_info(self, input_desc): 48 | def to_numpy_dtype(trt_dtype): 49 | tb = { 50 | tensorrt.DataType.BOOL: np.dtype('bool'), 51 | tensorrt.DataType.FLOAT: np.dtype('float32'), 52 | tensorrt.DataType.HALF: np.dtype('float16'), 53 | tensorrt.DataType.INT32: np.dtype('int32'), 54 | tensorrt.DataType.INT8: np.dtype('int8'), 55 | } 56 | return tb[trt_dtype] 57 | 58 | if isinstance(input_desc, dict): 59 | if self.engine.has_implicit_batch_dimension: 60 | print('Engine was built with static-shaped input so you should provide batch_size instead of i2shape') 61 | return 62 | i2shape = input_desc 63 | for i, shape in i2shape.items(): 64 | self.context.set_binding_shape(i, shape) 65 | return [(self.engine.get_binding_name(i), self.engine.binding_is_input(i), 66 | tuple(self.context.get_binding_shape(i)), to_numpy_dtype(self.engine.get_binding_dtype(i))) for i in range(self.engine.num_bindings)] 67 | 68 | batch_size = input_desc 69 | return [(self.engine.get_binding_name(i), 70 | self.engine.binding_is_input(i), 71 | (batch_size,) + tuple(self.context.get_binding_shape(i)), 72 | to_numpy_dtype(self.engine.get_binding_dtype(i))) for i in range(self.engine.num_bindings)] 73 | 74 | def allocate_io_buffers(self, input_desc, on_gpu): 75 | io_info = self.get_io_info(input_desc) 76 | if io_info is None: 77 | return 78 | if on_gpu: 79 | cuda = torch.device('cuda') 80 | np2pth = { 81 | np.dtype('bool'): torch.bool, 82 | np.dtype('float32'): torch.float32, 83 | np.dtype('float16'): torch.float16, 84 | np.dtype('int32'): torch.int32, 85 | np.dtype('int8'): torch.int8, 86 | } 87 | return [torch.empty(i[2], dtype=np2pth[i[3]], device=cuda) for i in io_info] 88 | else: 89 | return [np.zeros(i[2], i[3]) for i in io_info] 90 | 91 | def execute(self, bindings, input_desc, stream_handle = 0, input_consumed = None): 92 | if isinstance(input_desc, dict): 93 | i2shape = input_desc 94 | for i, shape in i2shape.items(): 95 | self.context.set_binding_shape(i, shape) 96 | self.context.execute_async_v2(bindings, stream_handle, input_consumed) 97 | return 98 | 99 | batch_size = input_desc 100 | self.context.execute_async(batch_size, bindings, stream_handle, input_consumed) 101 | 102 | def print_info(self): 103 | print("Batch dimension is", "implicit" if self.engine.has_implicit_batch_dimension else "explicit") 104 | for i in range(self.engine.num_bindings): 105 | print("input" if self.engine.binding_is_input(i) else 
"output", 106 | self.engine.get_binding_name(i), self.engine.get_binding_dtype(i), 107 | self.engine.get_binding_shape(i), 108 | -1 if -1 in self.engine.get_binding_shape(i) else reduce( 109 | lambda x, y: x * y, self.engine.get_binding_shape(i)) * self.engine.get_binding_dtype(i).itemsize) --------------------------------------------------------------------------------