├── CLA.md ├── LICENSE.md ├── README.md ├── deepstream_yolo ├── README.md ├── config_infer_primary_yoloV4.txt ├── config_infer_primary_yoloV7.txt ├── deepstream_app_config_yolo.txt ├── labels.txt └── nvdsinfer_custom_impl_Yolo │ ├── Makefile │ ├── nvdsparsebbox_Yolo.cpp │ └── nvdsparsebbox_Yolo_cuda.cu ├── tensorrt_yolov4 ├── Makefile ├── Makefile.config ├── README.md ├── data │ ├── demo.jpg │ └── demo_out.jpg └── source │ ├── Makefile │ ├── SampleYolo.cpp │ ├── SampleYolo.hpp │ ├── generate_coco_image_list.py │ ├── main.cpp │ └── onnx_add_nms_plugin.py ├── tensorrt_yolov7 ├── CMakeLists.txt ├── README.md ├── imgs │ ├── horses.jpg │ └── zidane.jpg ├── samples │ ├── detect.cpp │ ├── validate_coco.cpp │ └── video_detect.cpp ├── src │ ├── Yolov7.cpp │ ├── Yolov7.h │ ├── argsParser.cpp │ ├── argsParser.h │ └── tools.h └── test_coco_map.py └── yolov7_qat ├── README.md ├── doc ├── Guidance_of_QAT_performance_optimization.md └── imgs │ ├── QATConv.png │ ├── QATFlow.png │ ├── int8_q_recommended_procedure.png │ ├── monkey-patch-qat-conv-fp16-issue_ptq.png │ ├── monkey-patch-qat-conv-fp16-issue_ptqonnx.png │ ├── monkey-patch-qat-conv-fp16-issue_qat.png │ ├── monkey-patch-qat-conv-fp16-issue_qatonnx.png │ ├── monkey-patch-qat-conv-fp16-issue_qatonnx_edit.png │ └── monkey-patch-qat-maxpooling-qat.png ├── quantization ├── quantize.py └── rules.py └── scripts ├── detect-trt.py ├── draw-engine.py ├── eval-trt.py ├── eval-trt.sh ├── qat-yolov5.py ├── qat.py ├── quantize_utils.py └── trt-int8.py /CLA.md: -------------------------------------------------------------------------------- 1 | ## Individual Contributor License Agreement (CLA) 2 | 3 | **Thank you for submitting your contributions to this project.** 4 | 5 | By signing this CLA, you agree that the following terms apply to all of your past, present and future contributions 6 | to the project. 7 | 8 | ### License. 9 | 10 | You hereby represent that all present, past and future contributions are governed by the 11 | [MIT License](https://opensource.org/licenses/MIT) 12 | copyright statement. 13 | 14 | This entails that to the extent possible under law, you transfer all copyright and related or neighboring rights 15 | of the code or documents you contribute to the project itself or its maintainers. 16 | Furthermore you also represent that you have the authority to perform the above waiver 17 | with respect to the entirety of you contributions. 18 | 19 | ### Moral Rights. 20 | 21 | To the fullest extent permitted under applicable law, you hereby waive, and agree not to 22 | assert, all of your “moral rights” in or relating to your contributions for the benefit of the project. 23 | 24 | ### Third Party Content. 
25 | 26 | If your Contribution includes or is based on any source code, object code, bug fixes, configuration changes, tools, 27 | specifications, documentation, data, materials, feedback, information or other works of authorship that were not 28 | authored by you (“Third Party Content”) or if you are aware of any third party intellectual property or proprietary 29 | rights associated with your Contribution (“Third Party Rights”), 30 | then you agree to include with the submission of your Contribution full details respecting such Third Party 31 | Content and Third Party Rights, including, without limitation, identification of which aspects of your 32 | Contribution contain Third Party Content or are associated with Third Party Rights, the owner/author of the 33 | Third Party Content and Third Party Rights, where you obtained the Third Party Content, and any applicable 34 | third party license terms or restrictions respecting the Third Party Content and Third Party Rights. For greater 35 | certainty, the foregoing obligations respecting the identification of Third Party Content and Third Party Rights 36 | do not apply to any portion of a Project that is incorporated into your Contribution to that same Project. 37 | 38 | ### Representations. 39 | 40 | You represent that, other than the Third Party Content and Third Party Rights identified by 41 | you in accordance with this Agreement, you are the sole author of your Contributions and are legally entitled 42 | to grant the foregoing licenses and waivers in respect of your Contributions. If your Contributions were 43 | created in the course of your employment with your past or present employer(s), you represent that such 44 | employer(s) has authorized you to make your Contributions on behalf of such employer(s) or such employer 45 | (s) has waived all of their right, title or interest in or to your Contributions. 46 | 47 | ### Disclaimer. 48 | 49 | To the fullest extent permitted under applicable law, your Contributions are provided on an "as is" 50 | basis, without any warranties or conditions, express or implied, including, without limitation, any implied 51 | warranties or conditions of non-infringement, merchantability or fitness for a particular purpose. You are not 52 | required to provide support for your Contributions, except to the extent you desire to provide support. 53 | 54 | ### No Obligation. 55 | 56 | You acknowledge that the maintainers of this project are under no obligation to use or incorporate your contributions 57 | into the project. The decision to use or incorporate your contributions into the project will be made at the 58 | sole discretion of the maintainers or their authorized delegates. 59 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 
135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 
194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Yolo DeepStream 2 | 3 | ## Description 4 | 5 | This repo has four parts: 6 | ### 1) yolov7_qat 7 | In [yolov7_qat](yolov7_qat), we use [TensorRT's pytorch-quantization tool](https://github.com/NVIDIA/TensorRT/tree/main/tools/pytorch-quantization) to fine-tune YOLOv7 with quantization-aware training (QAT), starting from the pre-trained weights. 8 | The resulting QAT model reaches the same TensorRT performance as PTQ on Jetson AGX Orin, while its accuracy (mAP) drops only slightly. 9 | 10 | ### 2) tensorrt_yolov7 11 | In [tensorrt_yolov7](tensorrt_yolov7), we provide a standalone C++ YOLOv7 sample application. You can use trtexec to convert FP32 ONNX models, or QAT-INT8 models exported from [yolov7_qat](yolov7_qat), into TensorRT engines, and then pass the engine to the application as input. It can run detection on images/videos or measure mAP on the COCO dataset. 12 | 13 | ### 3) deepstream_yolo 14 | The [deepstream_yolo](deepstream_yolo) sample shows how to integrate YOLO models, with customized output-layer parsing for detected objects, into the DeepStream SDK. 15 | 16 | ### 4) tensorrt_yolov4 17 | The [tensorrt_yolov4](tensorrt_yolov4) directory contains a standalone TensorRT sample for YOLOv4. 18 | 19 | ## Performance 20 | For the YoloV7 sample: 21 | 22 | The table below shows the end-to-end performance of processing 1080p videos with this sample application. 23 | - Testing devices: 24 | 25 | 1. Jetson AGX Orin 64GB (PowerMode: MAXN + GPU-freq: 1.3GHz + CPU: 12-core-2.2GHz) 26 | 27 | 2. Tesla T4 28 | 29 | |Device |precision |Number<br/>of streams | Batch Size | trtexec FPS| deepstream-app FPS<br/>with cuda-post-process |deepstream-app FPS<br/>with cpu-post-process| 30 | |----------- |----------- |----------------- | -----------|----------- |-----------|-----------| 31 | | Orin-X| FP16 | 1 | 1 | 126 | 124 | 120 | 32 | | Orin-X| FP16 | 16 | 16 | 162 | 145 | 135 | 33 | | Orin-X| Int8(PTQ/QAT)| 1 | 1 | 180 | 175 | 128 | 34 | | Orin-X| Int8(PTQ/QAT)| 16 | 16 | 264 | 264 | 135 | 35 | | T4 | FP16 | 1 | 1 | 132 | 125 | 123 | 36 | | T4 | FP16 | 16 | 16 | 169 | 169 | 123 | 37 | | T4 | Int8(PTQ/QAT)| 1 | 1 | 208 | 170 | 127 | 38 | | T4 | Int8(PTQ/QAT)| 16 | 16 | 305 | 300 | 132 | 39 | 40 | 41 | - Note: cudaGraph is not enabled for trtexec because DeepStream does not support cudaGraph. 42 | 43 | ## Code structure 44 | ```bash 45 | ├── deepstream_yolo 46 | │ ├── config_infer_primary_yoloV4.txt # config file for the yolov4 model 47 | │ ├── config_infer_primary_yoloV7.txt # config file for the yolov7 model 48 | │ ├── deepstream_app_config_yolo.txt # DeepStream reference app configuration file for using YOLO models as the primary detector 49 | │ ├── labels.txt # labels for coco detection 50 | │ ├── nvdsinfer_custom_impl_Yolo 51 | │ │ ├── Makefile 52 | │ │ └── nvdsparsebbox_Yolo.cpp # output layer parsing function for detected objects for the Yolo model 53 | │ └── README.md 54 | ├── README.md 55 | ├── tensorrt_yolov4 56 | │ ├── data 57 | │ │ ├── demo.jpg # the demo image 58 | │ │ └── demo_out.jpg # image detection output of the demo image 59 | │ ├── Makefile 60 | │ ├── Makefile.config 61 | │ ├── README.md 62 | │ └── source 63 | │ ├── generate_coco_image_list.py # python script to get the list of image names from an MS COCO annotation or information file 64 | │ ├── main.cpp # program main entrance where parameters are configured 65 | │ ├── Makefile 66 | │ ├── onnx_add_nms_plugin.py # python script to add a BatchedNMSPlugin node to the ONNX model 67 | │ ├── SampleYolo.cpp # yolov4 inference class functions definition file 68 | │ └── SampleYolo.hpp # yolov4 inference class definition file 69 | ├── tensorrt_yolov7 70 | │ ├── CMakeLists.txt 71 | │ ├── imgs # the demo images 72 | │ │ ├── horses.jpg 73 | │ │ └── zidane.jpg 74 | │ ├── README.md 75 | │ ├── samples 76 | │ │ ├── detect.cpp # detection app for image detection 77 | │ │ ├── validate_coco.cpp # COCO dataset validation app 78 | │ │ └── video_detect.cpp # detection app for video detection 79 | │ ├── src 80 | │ │ ├── argsParser.cpp # argsParser helper class for command-line parsing 81 | │ │ ├── argsParser.h # argsParser helper class for command-line parsing 82 | │ │ ├── tools.h # helper functions for the Yolov7 class 83 | │ │ ├── Yolov7.cpp # Class Yolov7 84 | │ │ └── Yolov7.h # Class Yolov7 85 | │ └── test_coco_map.py # tool for testing COCO mAP with a json file 86 | └── yolov7_qat 87 | ├── doc 88 | │ ├── Guidance_of_QAT_performance_optimization.md # guidance on Q&DQ node insertion and placement for the pytorch-quantization tool 89 | ├── quantization 90 | │ ├── quantize.py # helper class for quantizing the yolov7 model 91 | │ └── rules.py # rules and restrictions for Q&DQ node insertion 92 | ├── README.md 93 | └── scripts 94 | ├── detect-trt.py # detect an image with a tensorrt engine 95 | ├── draw-engine.py # draw the tensorrt engine as a graph 96 | ├── eval-trt.py # script for evaluating tensorrt mAP 97 | ├── eval-trt.sh # command-line script for evaluating tensorrt mAP 98 | ├── qat.py # main function for QAT and PTQ 99 | └── trt-int8.py # tensorrt built-in calibration 100 | ``` 101 | -------------------------------------------------------------------------------- /deepstream_yolo/README.md:
-------------------------------------------------------------------------------- 1 | # Deploy YOLO Models With DeepStream # 2 | 3 | **This sample shows how to integrate YOLO models with customized output layer parsing for detected objects with DeepStreamSDK.** 4 | 5 | ## 1. Sample contents: ## 6 | - `deepstream_app_config_yolo.txt`: DeepStream reference app configuration file for using YOLO models as the primary detector. 7 | - `config_infer_primary_yoloV4.txt`: Configuration file for the GStreamer nvinfer plugin for the YoloV4 detector model. 8 | - `config_infer_primary_yoloV7.txt`: Configuration file for the GStreamer nvinfer plugin for the YoloV7 detector model. 9 | - `nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp`: Output layer parsing function for detected objects for the Yolo models. 10 | 11 | ## 2. Prerequisites: ## 12 | 13 | ### 2.1 Please make sure DeepStream 6.1.1+ is properly installed ### 14 | 15 | ### 2.2 Generate the Model ### 16 | #### YoloV4 17 | 18 | - Go to this pytorch repository, where you can convert a YOLOv4 Pytorch model into **ONNX** 19 | - Other well-known YOLOv4 pytorch repositories as references: 20 | - 21 | - 22 | - 23 | - 24 | - Or you can download the reference ONNX model directly from here ([link](https://drive.google.com/file/d/1tp1xzeey4YBSd8nGd-dkn8Ymii9ordEj/view?usp=sharing)). 25 | 26 | #### YOLOv7 27 | Following the guide https://github.com/WongKinYiu/yolov7#export, export an ONNX model with dynamic batch size and a single output: 28 | ```bash 29 | $ python export.py --weights ./yolov7.pt --grid --simplify --topk-all 100 --iou-thres 0.65 --conf-thres 0.35 --img-size 640 640 --dynamic-batch 30 | ``` 31 | or use the QAT model exported from [yolov7_qat](../yolov7_qat). 32 | ## 3. Download and Run ## 33 | 34 | ```sh 35 | $ cd ~/ 36 | $ git clone https://github.com/NVIDIA-AI-IOT/yolo_deepstream.git 37 | $ cd ~/yolo_deepstream/deepstream_yolo/nvdsinfer_custom_impl_Yolo 38 | $ make 39 | $ cd .. 40 | ``` 41 | Make sure the model exists under ~/yolo_deepstream/deepstream_yolo/. Change the "config-file" parameter in the "deepstream_app_config_yolo.txt" configuration file to the nvinfer configuration file for the model you want to run with. 42 | |Model|Nvinfer Configuration File| 43 | |-----------|----------| 44 | |YoloV4|config_infer_primary_yoloV4.txt| 45 | |YoloV7|config_infer_primary_yoloV7.txt| 46 | 47 | ``` 48 | $ deepstream-app -c deepstream_app_config_yolo.txt 49 | ``` 50 | ## 4. CUDA Post Processing 51 | 52 | This sample provides two ways of doing YOLOv7 post-processing (decoding the YOLO output; NMS is not included): a CPU version and a GPU version. 53 | - The CPU implementation can be found in: [nvdsparsebbox_Yolo.cpp](deepstream_yolo/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp) 54 | - The CUDA implementation can be found in: [nvdsparsebbox_Yolo_cuda.cu](deepstream_yolo/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo_cuda.cu) 55 | 56 | CUDA post-processing is used by default.
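For reference, these are the settings that select the default CUDA parser, as they appear in `config_infer_primary_yoloV7.txt` in this sample; the custom parsing library is the one built with `make` in step 3:

```
# decode the YOLOv7 output on the GPU and skip the device-to-host output copy
parse-bbox-func-name=NvDsInferParseCustomYoloV7_cuda
disable-output-host-copy=1
custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
```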
To enable CPU post-processing: 57 | in [config_infer_primary_yoloV7.txt](deepstream_yolo/config_infer_primary_yoloV7.txt) 58 | 59 | - `parse-bbox-func-name=NvDsInferParseCustomYoloV7_cuda` -> `parse-bbox-func-name=NvDsInferParseCustomYoloV7` 60 | - `disable-output-host-copy=1` -> `disable-output-host-copy=0` 61 | 62 | The performance of the CPU-post-processing and CUDA-post-processing result can be found in [Performance](https://github.com/NVIDIA-AI-IOT/yolo_deepstream#performance) 63 | 64 | -------------------------------------------------------------------------------- /deepstream_yolo/config_infer_primary_yoloV4.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a 6 | # copy of this software and associated documentation files (the "Software"), 7 | # to deal in the Software without restriction, including without limitation 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | # and/or sell copies of the Software, and to permit persons to whom the 10 | # Software is furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | # DEALINGS IN THE SOFTWARE. 22 | ################################################################################ 23 | 24 | # Following properties are mandatory when engine files are not specified: 25 | # int8-calib-file(Only in INT8), model-file-format 26 | # Caffemodel mandatory properties: model-file, proto-file, output-blob-names 27 | # UFF: uff-file, input-dims, uff-input-blob-name, output-blob-names 28 | # ONNX: onnx-file 29 | # 30 | # Mandatory properties for detectors: 31 | # num-detected-classes 32 | # 33 | # Optional properties for detectors: 34 | # cluster-mode(Default=Group Rectangles), interval(Primary mode only, Default=0) 35 | # custom-lib-path 36 | # parse-bbox-func-name 37 | # 38 | # Mandatory properties for classifiers: 39 | # classifier-threshold, is-classifier 40 | # 41 | # Optional properties for classifiers: 42 | # classifier-async-mode(Secondary mode only, Default=false) 43 | # 44 | # Optional properties in secondary mode: 45 | # operate-on-gie-id(Default=0), operate-on-class-ids(Defaults to all classes), 46 | # input-object-min-width, input-object-min-height, input-object-max-width, 47 | # input-object-max-height 48 | # 49 | # Following properties are always recommended: 50 | # batch-size(Default=1) 51 | # 52 | # Other optional properties: 53 | # net-scale-factor(Default=1), network-mode(Default=0 i.e FP32), 54 | # model-color-format(Default=0 i.e. 
RGB) model-engine-file, labelfile-path, 55 | # mean-file, gie-unique-id(Default=0), offsets, process-mode (Default=1 i.e. primary), 56 | # custom-lib-path, network-mode(Default=0 i.e FP32) 57 | # 58 | # The values in the config file are overridden by values set through GObject 59 | # properties. 60 | 61 | [property] 62 | gpu-id=0 63 | net-scale-factor=0.0039215697906911373 64 | #0=RGB, 1=BGR 65 | model-color-format=0 66 | onnx-file=yolov4_-1_3_416_416_nms_dynamic.onnx 67 | model-engine-file=yolov4_-1_3_416_416_nms_dynamic.onnx_b16_gpu0_fp16.engine 68 | labelfile-path=labels.txt 69 | batch-size=16 70 | ## 0=FP32, 1=INT8, 2=FP16 mode 71 | network-mode=2 72 | num-detected-classes=80 73 | gie-unique-id=1 74 | network-type=0 75 | is-classifier=0 76 | ## 0=Group Rectangles, 1=DBSCAN, 2=NMS, 3= DBSCAN+NMS Hybrid, 4 = None(No clustering) 77 | cluster-mode=2 78 | maintain-aspect-ratio=1 79 | parse-bbox-func-name=NvDsInferParseCustomYoloV4 80 | custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so 81 | #scaling-filter=0 82 | #scaling-compute-hw=0 83 | 84 | [class-attrs-all] 85 | nms-iou-threshold=0.6 86 | pre-cluster-threshold=0.4 87 | -------------------------------------------------------------------------------- /deepstream_yolo/config_infer_primary_yoloV7.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a 6 | # copy of this software and associated documentation files (the "Software"), 7 | # to deal in the Software without restriction, including without limitation 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | # and/or sell copies of the Software, and to permit persons to whom the 10 | # Software is furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | # DEALINGS IN THE SOFTWARE. 
22 | ################################################################################ 23 | 24 | # Following properties are mandatory when engine files are not specified: 25 | # int8-calib-file(Only in INT8), model-file-format 26 | # Caffemodel mandatory properties: model-file, proto-file, output-blob-names 27 | # UFF: uff-file, input-dims, uff-input-blob-name, output-blob-names 28 | # ONNX: onnx-file 29 | # 30 | # Mandatory properties for detectors: 31 | # num-detected-classes 32 | # 33 | # Optional properties for detectors: 34 | # cluster-mode(Default=Group Rectangles), interval(Primary mode only, Default=0) 35 | # custom-lib-path 36 | # parse-bbox-func-name 37 | # 38 | # Mandatory properties for classifiers: 39 | # classifier-threshold, is-classifier 40 | # 41 | # Optional properties for classifiers: 42 | # classifier-async-mode(Secondary mode only, Default=false) 43 | # 44 | # Optional properties in secondary mode: 45 | # operate-on-gie-id(Default=0), operate-on-class-ids(Defaults to all classes), 46 | # input-object-min-width, input-object-min-height, input-object-max-width, 47 | # input-object-max-height 48 | # 49 | # Following properties are always recommended: 50 | # batch-size(Default=1) 51 | # 52 | # Other optional properties: 53 | # net-scale-factor(Default=1), network-mode(Default=0 i.e FP32), 54 | # model-color-format(Default=0 i.e. RGB) model-engine-file, labelfile-path, 55 | # mean-file, gie-unique-id(Default=0), offsets, process-mode (Default=1 i.e. primary), 56 | # custom-lib-path, network-mode(Default=0 i.e FP32) 57 | # 58 | # The values in the config file are overridden by values set through GObject 59 | # properties. 60 | 61 | [property] 62 | gpu-id=0 63 | net-scale-factor=0.0039215697906911373 64 | #0=RGB, 1=BGR 65 | model-color-format=0 66 | onnx-file=yolov7.onnx 67 | labelfile-path=labels.txt 68 | ## 0=FP32, 1=INT8, 2=FP16 mode 69 | network-mode=2 70 | num-detected-classes=80 71 | gie-unique-id=1 72 | network-type=0 73 | is-classifier=0 74 | ## 1=DBSCAN, 2=NMS, 3= DBSCAN+NMS Hybrid, 4 = None(No clustering) 75 | cluster-mode=2 76 | maintain-aspect-ratio=1 77 | symmetric-padding=1 78 | ## Bilinear Interpolation 79 | scaling-filter=1 80 | #parse-bbox-func-name=NvDsInferParseCustomYoloV7 81 | parse-bbox-func-name=NvDsInferParseCustomYoloV7_cuda 82 | #disable-output-host-copy=0 83 | disable-output-host-copy=1 84 | custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so 85 | #scaling-compute-hw=0 86 | ## start from DS6.2 87 | crop-objects-to-roi-boundary=1 88 | 89 | 90 | [class-attrs-all] 91 | #nms-iou-threshold=0.3 92 | #threshold=0.7 93 | nms-iou-threshold=0.65 94 | pre-cluster-threshold=0.25 95 | topk=300 96 | 97 | -------------------------------------------------------------------------------- /deepstream_yolo/deepstream_app_config_yolo.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | # SPDX-License-Identifier: MIT 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a 6 | # copy of this software and associated documentation files (the "Software"), 7 | # to deal in the Software without restriction, including without limitation 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | # and/or sell copies of the Software, and to permit persons to whom the 10 | # Software is furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | # DEALINGS IN THE SOFTWARE. 22 | ################################################################################ 23 | 24 | [application] 25 | enable-perf-measurement=1 26 | perf-measurement-interval-sec=5 27 | #gie-kitti-output-dir=streamscl 28 | 29 | [tiled-display] 30 | enable=0 31 | rows=4 32 | columns=4 33 | width=1280 34 | height=720 35 | gpu-id=0 36 | #(0): nvbuf-mem-default - Default memory allocated, specific to particular platform 37 | #(1): nvbuf-mem-cuda-pinned - Allocate Pinned/Host cuda memory, applicable for Tesla 38 | #(2): nvbuf-mem-cuda-device - Allocate Device cuda memory, applicable for Tesla 39 | #(3): nvbuf-mem-cuda-unified - Allocate Unified cuda memory, applicable for Tesla 40 | #(4): nvbuf-mem-surface-array - Allocate Surface Array memory, applicable for Jetson 41 | nvbuf-memory-type=0 42 | 43 | [source0] 44 | enable=1 45 | #Type - 1=CameraV4L2 2=URI 3=MultiURI 46 | type=3 47 | uri=file:/opt/nvidia/deepstream/deepstream/samples/streams/sample_1080p_h264.mp4 48 | num-sources=16 49 | gpu-id=0 50 | # (0): memtype_device - Memory type Device 51 | # (1): memtype_pinned - Memory type Host Pinned 52 | # (2): memtype_unified - Memory type Unified 53 | cudadec-memtype=0 54 | 55 | [sink0] 56 | enable=1 57 | #Type - 1=FakeSink 2=EglSink 3=File 58 | type=3 59 | sync=0 60 | source-id=0 61 | gpu-id=0 62 | nvbuf-memory-type=0 63 | #1=mp4 2=mkv 64 | container=1 65 | #1=h264 2=h265 66 | codec=1 67 | output-file=yolov4.mp4 68 | 69 | [osd] 70 | enable=1 71 | gpu-id=0 72 | border-width=1 73 | text-size=12 74 | text-color=1;1;1;1; 75 | text-bg-color=0.3;0.3;0.3;1 76 | font=Serif 77 | show-clock=0 78 | clock-x-offset=800 79 | clock-y-offset=820 80 | clock-text-size=12 81 | clock-color=1;0;0;0 82 | nvbuf-memory-type=0 83 | 84 | [streammux] 85 | gpu-id=0 86 | ##Boolean property to inform muxer that sources are live 87 | live-source=0 88 | batch-size=16 89 | ##time out in usec, to wait after the first buffer is available 90 | ##to push the batch even if the complete batch is not formed 91 | batched-push-timeout=40000 92 | ## Set muxer output width and height 93 | width=1280 94 | height=720 95 | ##Enable to maintain aspect ratio wrt source, and allow black borders, works 96 | ##along with width, height properties 97 | enable-padding=0 98 | nvbuf-memory-type=0 99 | 100 | # config-file property is mandatory for any gie section. 
101 | # Other properties are optional and if set will override the properties set in 102 | # the infer config file. 103 | [primary-gie] 104 | enable=1 105 | gpu-id=0 106 | labelfile-path=labels.txt 107 | batch-size=16 108 | #Required by the app for OSD, not a plugin property 109 | bbox-border-color0=1;0;0;1 110 | bbox-border-color1=0;1;1;1 111 | bbox-border-color2=0;0;1;1 112 | bbox-border-color3=0;1;0;1 113 | interval=0 114 | gie-unique-id=1 115 | nvbuf-memory-type=0 116 | config-file=config_infer_primary_yoloV4.txt 117 | #config-file=config_infer_primary_yoloV7.txt 118 | 119 | [tracker] 120 | enable=0 121 | # For NvDCF and DeepSORT tracker, tracker-width and tracker-height must be a multiple of 32, respectively 122 | tracker-width=640 123 | tracker-height=384 124 | ll-lib-file=/opt/nvidia/deepstream/deepstream/lib/libnvds_nvmultiobjecttracker.so 125 | # ll-config-file required to set different tracker types 126 | # ll-config-file=/opt/nvidia/deepstream/deepstream/samples/configs/deepstream-app/config_tracker_IOU.yml 127 | ll-config-file=/opt/nvidia/deepstream/deepstream/samples/configs/deepstream-app/config_tracker_NvDCF_perf.yml 128 | # ll-config-file=/opt/nvidia/deepstream/deepstream/samples/configs/deepstream-app/config_tracker_NvDCF_accuracy.yml 129 | # ll-config-file=/opt/nvidia/deepstream/deepstream/samples/configs/deepstream-app/config_tracker_DeepSORT.yml 130 | gpu-id=0 131 | enable-batch-process=1 132 | enable-past-frame=1 133 | display-tracking-id=1 134 | 135 | [tests] 136 | file-loop=0 137 | -------------------------------------------------------------------------------- /deepstream_yolo/labels.txt: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /deepstream_yolo/nvdsinfer_custom_impl_Yolo/Makefile: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | # SPDX-License-Identifier: MIT 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a 6 | # copy of this software and associated documentation files (the "Software"), 7 | # to deal in the Software without restriction, including without limitation 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | # and/or sell copies of the Software, and to permit persons to whom the 10 | # Software is furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | # DEALINGS IN THE SOFTWARE. 22 | ################################################################################ 23 | 24 | CC:= g++ 25 | NVCC:=/usr/local/cuda/bin/nvcc 26 | 27 | CFLAGS:= -Wall -std=c++11 -shared -fPIC -Wno-error=deprecated-declarations 28 | CFLAGS+= -I/opt/nvidia/deepstream/deepstream/sources/includes/ -I/usr/local/cuda/include 29 | 30 | CUFLAGS:= -std=c++14 -shared 31 | CUFLAGS+= -I/opt/nvidia/deepstream/deepstream/sources/includes/ -I/usr/local/cuda/include 32 | LIBS:= -lnvinfer_plugin -lnvinfer -lnvparsers -L/usr/local/cuda/lib64 -lcudart -lcublas -lstdc++fs 33 | LFLAGS:= -shared -Wl,--start-group $(LIBS) -Wl,--end-group 34 | 35 | INCS:= $(wildcard *.h) 36 | SRCFILES:= nvdsparsebbox_Yolo.cpp\ 37 | nvdsparsebbox_Yolo_cuda.cu 38 | 39 | TARGET_LIB:= libnvdsinfer_custom_impl_Yolo.so 40 | 41 | TARGET_OBJS:= $(SRCFILES:.cpp=.o) 42 | TARGET_OBJS:= $(TARGET_OBJS:.cu=.o) 43 | 44 | all: $(TARGET_LIB) 45 | 46 | %.o: %.cpp $(INCS) Makefile 47 | $(CC) -c -o $@ $(CFLAGS) $< 48 | 49 | %.o: %.cu $(INCS) Makefile 50 | $(NVCC) -c -o $@ --compiler-options '-fPIC' $(CUFLAGS) $< 51 | 52 | $(TARGET_LIB) : $(TARGET_OBJS) 53 | $(CC) -o $@ $(TARGET_OBJS) $(LFLAGS) 54 | 55 | clean: 56 | rm -rf $(TARGET_LIB) *.o 57 | -------------------------------------------------------------------------------- /deepstream_yolo/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 
14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include "nvdsinfer_custom_impl.h" 33 | 34 | static const int NUM_CLASSES_YOLO = 80; 35 | 36 | float clamp(const float val, const float minVal, const float maxVal) 37 | { 38 | assert(minVal <= maxVal); 39 | return std::min(maxVal, std::max(minVal, val)); 40 | } 41 | 42 | extern "C" bool NvDsInferParseCustomYoloV4( 43 | std::vector const& outputLayersInfo, 44 | NvDsInferNetworkInfo const& networkInfo, 45 | NvDsInferParseDetectionParams const& detectionParams, 46 | std::vector& objectList); 47 | 48 | extern "C" bool NvDsInferParseCustomYoloV7( 49 | std::vector const& outputLayersInfo, 50 | NvDsInferNetworkInfo const& networkInfo, 51 | NvDsInferParseDetectionParams const& detectionParams, 52 | std::vector& objectList); 53 | 54 | /* YOLOv4 implementations */ 55 | static NvDsInferParseObjectInfo convertBBoxYoloV4(const float& bx1, const float& by1, const float& bx2, 56 | const float& by2, const uint& netW, const uint& netH) 57 | { 58 | NvDsInferParseObjectInfo b; 59 | // Restore coordinates to network input resolution 60 | 61 | float x1 = bx1 * netW; 62 | float y1 = by1 * netH; 63 | float x2 = bx2 * netW; 64 | float y2 = by2 * netH; 65 | 66 | x1 = clamp(x1, 0, netW); 67 | y1 = clamp(y1, 0, netH); 68 | x2 = clamp(x2, 0, netW); 69 | y2 = clamp(y2, 0, netH); 70 | 71 | b.left = x1; 72 | b.width = clamp(x2 - x1, 0, netW); 73 | b.top = y1; 74 | b.height = clamp(y2 - y1, 0, netH); 75 | 76 | return b; 77 | } 78 | 79 | static void addBBoxProposalYoloV4(const float bx, const float by, const float bw, const float bh, 80 | const uint& netW, const uint& netH, const int maxIndex, 81 | const float maxProb, std::vector& binfo) 82 | { 83 | NvDsInferParseObjectInfo bbi = convertBBoxYoloV4(bx, by, bw, bh, netW, netH); 84 | if (bbi.width < 1 || bbi.height < 1) return; 85 | 86 | bbi.detectionConfidence = maxProb; 87 | bbi.classId = maxIndex; 88 | binfo.push_back(bbi); 89 | } 90 | 91 | static std::vector 92 | decodeYoloV4Tensor( 93 | const float* boxes, const float* scores, 94 | const uint num_bboxes, NvDsInferParseDetectionParams const& detectionParams, 95 | const uint& netW, const uint& netH) 96 | { 97 | std::vector binfo; 98 | 99 | uint bbox_location = 0; 100 | uint score_location = 0; 101 | for (uint b = 0; b < num_bboxes; ++b) 102 | { 103 | float bx1 = boxes[bbox_location]; 104 | float by1 = boxes[bbox_location + 1]; 105 | float bx2 = boxes[bbox_location + 2]; 106 | float by2 = boxes[bbox_location + 3]; 107 | 108 | float maxProb = 0.0f; 109 | int maxIndex = -1; 110 | 111 | for (uint c = 0; c < detectionParams.numClassesConfigured; ++c) 112 | { 113 | float prob = scores[score_location + c]; 114 | if (prob > maxProb) 115 | { 116 | maxProb = prob; 117 | maxIndex = c; 118 | } 119 | } 120 | 121 | if (maxProb > detectionParams.perClassPreclusterThreshold[maxIndex]) 122 | { 123 | addBBoxProposalYoloV4(bx1, by1, bx2, by2, netW, netH, maxIndex, maxProb, binfo); 124 | } 125 | 126 | bbox_location 
+= 4; 127 | score_location += detectionParams.numClassesConfigured; 128 | } 129 | 130 | return binfo; 131 | } 132 | 133 | extern "C" bool NvDsInferParseCustomYoloV4( 134 | std::vector const& outputLayersInfo, 135 | NvDsInferNetworkInfo const& networkInfo, 136 | NvDsInferParseDetectionParams const& detectionParams, 137 | std::vector& objectList) 138 | { 139 | if (NUM_CLASSES_YOLO != detectionParams.numClassesConfigured) 140 | { 141 | std::cerr << "WARNING: Num classes mismatch. Configured:" 142 | << detectionParams.numClassesConfigured 143 | << ", detected by network: " << NUM_CLASSES_YOLO << std::endl; 144 | } 145 | 146 | std::vector objects; 147 | 148 | const NvDsInferLayerInfo &boxes = outputLayersInfo[0]; // num_boxes x 4 149 | const NvDsInferLayerInfo &scores = outputLayersInfo[1]; // num_boxes x num_classes 150 | 151 | // 3 dimensional: [num_boxes, 1, 4] 152 | assert(boxes.inferDims.numDims == 3); 153 | // 2 dimensional: [num_boxes, num_classes] 154 | assert(scores.inferDims.numDims == 2); 155 | 156 | // The second dimension should be num_classes 157 | assert(detectionParams.numClassesConfigured == scores.inferDims.d[1]); 158 | 159 | uint num_bboxes = boxes.inferDims.d[0]; 160 | 161 | // std::cout << "Network Info: " << networkInfo.height << " " << networkInfo.width << std::endl; 162 | 163 | std::vector outObjs = 164 | decodeYoloV4Tensor( 165 | (const float*)(boxes.buffer), (const float*)(scores.buffer), num_bboxes, detectionParams, 166 | networkInfo.width, networkInfo.height); 167 | 168 | objects.insert(objects.end(), outObjs.begin(), outObjs.end()); 169 | 170 | objectList = objects; 171 | 172 | return true; 173 | } 174 | /* YOLOv4 implementations end*/ 175 | 176 | /*Yolov7 bbox parser*/ 177 | static NvDsInferParseObjectInfo convertBBoxYoloV7(const float& bx, const float& by, const float& bw, 178 | const float& bh, const int& stride, const uint& netW, 179 | const uint& netH) 180 | { 181 | NvDsInferParseObjectInfo b; 182 | // Restore coordinates to network input resolution 183 | float xCenter = bx * stride; 184 | float yCenter = by * stride; 185 | float x0 = xCenter - bw / 2; 186 | float y0 = yCenter - bh / 2; 187 | float x1 = x0 + bw; 188 | float y1 = y0 + bh; 189 | 190 | x0 = clamp(x0, 0, netW); 191 | y0 = clamp(y0, 0, netH); 192 | x1 = clamp(x1, 0, netW); 193 | y1 = clamp(y1, 0, netH); 194 | 195 | b.left = x0; 196 | b.width = clamp(x1 - x0, 0, netW); 197 | b.top = y0; 198 | b.height = clamp(y1 - y0, 0, netH); 199 | 200 | return b; 201 | } 202 | 203 | static void addBBoxProposalYoloV7(const float bx, const float by, const float bw, const float bh, 204 | const uint stride, const uint& netW, const uint& netH, const int maxIndex, 205 | const float maxProb, std::vector& binfo) 206 | { 207 | NvDsInferParseObjectInfo bbi = convertBBoxYoloV7(bx, by, bw, bh, stride, netW, netH); 208 | if (bbi.width < 1 || bbi.height < 1) return; 209 | 210 | bbi.detectionConfidence = maxProb; 211 | bbi.classId = maxIndex; 212 | binfo.push_back(bbi); 213 | } 214 | 215 | static bool NvDsInferParseYoloV7( 216 | std::vector const& outputLayersInfo, 217 | NvDsInferNetworkInfo const& networkInfo, 218 | NvDsInferParseDetectionParams const& detectionParams, 219 | std::vector& objectList) 220 | { 221 | 222 | 223 | if (outputLayersInfo.empty()) { 224 | std::cerr << "Could not find output layer in bbox parsing" << std::endl;; 225 | return false; 226 | } 227 | const NvDsInferLayerInfo &layer = outputLayersInfo[0]; 228 | 229 | if (NUM_CLASSES_YOLO != detectionParams.numClassesConfigured) 230 | { 231 | std::cerr << 
"WARNING: Num classes mismatch. Configured:" 232 | << detectionParams.numClassesConfigured 233 | << ", detected by network: " << NUM_CLASSES_YOLO << std::endl; 234 | } 235 | 236 | std::vector objects; 237 | 238 | float* data = (float*)layer.buffer; 239 | const int dimensions = layer.inferDims.d[1]; 240 | int rows = layer.inferDims.numElements / layer.inferDims.d[1]; 241 | 242 | for (int i = 0; i < rows; ++i) { 243 | //85 = x, y, w, h, maxProb, score0......score79 244 | float bx = data[ 0]; 245 | float by = data[ 1]; 246 | float bw = data[ 2]; 247 | float bh = data[ 3]; 248 | float maxProb = data[ 4]; 249 | int maxIndex = data[ 5]; 250 | float * classes_scores = data + 5; 251 | 252 | float maxScore = 0; 253 | int index = 0; 254 | for (int j = 0 ;j < NUM_CLASSES_YOLO; j++){ 255 | if(*classes_scores > maxScore){ 256 | index = j; 257 | maxScore = *classes_scores; 258 | } 259 | classes_scores++; 260 | } 261 | 262 | maxIndex = index; 263 | data += dimensions; 264 | 265 | addBBoxProposalYoloV7(bx, by, bw, bh, 1, networkInfo.width, networkInfo.height, maxIndex, maxProb, objects); 266 | } 267 | objectList = objects; 268 | return true; 269 | } 270 | 271 | extern "C" bool NvDsInferParseCustomYoloV7( 272 | std::vector const& outputLayersInfo, 273 | NvDsInferNetworkInfo const& networkInfo, 274 | NvDsInferParseDetectionParams const& detectionParams, 275 | std::vector& objectList) 276 | { 277 | return NvDsInferParseYoloV7 ( 278 | outputLayersInfo, networkInfo, detectionParams, objectList); 279 | } 280 | 281 | /* Check that the custom function has been defined correctly */ 282 | CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomYoloV4); 283 | CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomYoloV7); 284 | -------------------------------------------------------------------------------- /deepstream_yolo/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo_cuda.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 
22 | */ 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include "nvdsinfer_custom_impl.h" 32 | #include "nvtx3/nvToolsExt.h" 33 | #include 34 | #include 35 | 36 | static const int NUM_CLASSES_YOLO = 80; 37 | #define OBJECTLISTSIZE 25200 38 | #define BLOCKSIZE 1024 39 | thrust::device_vector objects_v(OBJECTLISTSIZE); 40 | 41 | extern "C" bool NvDsInferParseCustomYoloV7_cuda( 42 | std::vector const& outputLayersInfo, 43 | NvDsInferNetworkInfo const& networkInfo, 44 | NvDsInferParseDetectionParams const& detectionParams, 45 | std::vector& objectList); 46 | 47 | 48 | __global__ void decodeYoloV7Tensor_cuda(NvDsInferParseObjectInfo *binfo/*output*/, float* data, int dimensions, int rows, 49 | int netW, int netH, float Threshold){ 50 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 51 | if(idx < rows) { 52 | data = data + idx * dimensions; 53 | float maxProb = data[ 4]; 54 | //maxProb < Threshold, directly return 55 | if(maxProb < Threshold){ 56 | binfo[idx].detectionConfidence = 0.0; 57 | return; 58 | } 59 | float bx = data[ 0]; 60 | float by = data[ 1]; 61 | float bw = data[ 2]; 62 | float bh = data[ 3]; 63 | int maxIndex = 0; 64 | float * classes_scores = (float *)(data + 5); 65 | float maxScore = 0; 66 | int index = 0; 67 | 68 | #pragma unroll 69 | for (int j = 0 ;j < NUM_CLASSES_YOLO; j++){ 70 | if(*classes_scores > maxScore){ 71 | index = j; 72 | maxScore = *classes_scores; 73 | } 74 | classes_scores++; 75 | } 76 | if(maxProb * maxScore < Threshold){ 77 | binfo[idx].detectionConfidence = 0.0; 78 | return; 79 | } 80 | maxIndex = index; 81 | float stride = 1.0; 82 | float xCenter = bx * stride; 83 | float yCenter = by * stride; 84 | float x0 = xCenter - bw / 2.0; 85 | float y0 = yCenter - bh / 2.0; 86 | float x1 = x0 + bw; 87 | float y1 = y0 + bh; 88 | x0 = fminf(float(netW), fmaxf(float(0.0), x0)); 89 | y0 = fminf(float(netH), fmaxf(float(0.0), y0)); 90 | x1 = fminf(float(netW), fmaxf(float(0.0), x1)); 91 | y1 = fminf(float(netH), fmaxf(float(0.0), y1)); 92 | binfo[idx].left = x0; 93 | binfo[idx].top = y0; 94 | binfo[idx].width = fminf(float(netW), fmaxf(float(0.0), x1-x0)); 95 | binfo[idx].height = fminf(float(netH), fmaxf(float(0.0), y1-y0)); 96 | binfo[idx].detectionConfidence = maxProb * maxScore; 97 | binfo[idx].classId = maxIndex; 98 | } 99 | return; 100 | } 101 | static bool NvDsInferParseYoloV7_cuda( 102 | std::vector const& outputLayersInfo, 103 | NvDsInferNetworkInfo const& networkInfo, 104 | NvDsInferParseDetectionParams const& detectionParams, 105 | std::vector& objectList) 106 | { 107 | 108 | if (outputLayersInfo.empty()) { 109 | std::cerr << "Could not find output layer in bbox parsing" << std::endl;; 110 | return false; 111 | } 112 | const NvDsInferLayerInfo &layer = outputLayersInfo[0]; 113 | 114 | if (NUM_CLASSES_YOLO != detectionParams.numClassesConfigured) 115 | { 116 | std::cerr << "WARNING: Num classes mismatch. 
Configured:" 117 | << detectionParams.numClassesConfigured 118 | << ", detected by network: " << NUM_CLASSES_YOLO << std::endl; 119 | } 120 | 121 | float* data = (float*)layer.buffer; 122 | const int dimensions = layer.inferDims.d[1]; 123 | int rows = layer.inferDims.numElements / layer.inferDims.d[1]; 124 | 125 | int GRIDSIZE = ((OBJECTLISTSIZE-1)/BLOCKSIZE)+1; 126 | //find the min threshold 127 | float min_PreclusterThreshold = *(std::min_element(detectionParams.perClassPreclusterThreshold.begin(), 128 | detectionParams.perClassPreclusterThreshold.end())); 129 | decodeYoloV7Tensor_cuda<<>> 130 | (thrust::raw_pointer_cast(objects_v.data()), data, dimensions, rows, networkInfo.width, 131 | networkInfo.height, min_PreclusterThreshold); 132 | objectList.resize(OBJECTLISTSIZE); 133 | thrust::copy(objects_v.begin(),objects_v.end(),objectList.begin());//the same as cudamemcpy 134 | 135 | return true; 136 | } 137 | 138 | extern "C" bool NvDsInferParseCustomYoloV7_cuda( 139 | std::vector const& outputLayersInfo, 140 | NvDsInferNetworkInfo const& networkInfo, 141 | NvDsInferParseDetectionParams const& detectionParams, 142 | std::vector& objectList) 143 | { 144 | nvtxRangePush("NvDsInferParseYoloV7"); 145 | bool ret = NvDsInferParseYoloV7_cuda ( 146 | outputLayersInfo, networkInfo, detectionParams, objectList); 147 | 148 | nvtxRangePop(); 149 | return ret; 150 | } 151 | 152 | /* Check that the custom function has been defined correctly */ 153 | CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomYoloV7_cuda); 154 | -------------------------------------------------------------------------------- /tensorrt_yolov4/Makefile: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a 6 | # copy of this software and associated documentation files (the "Software"), 7 | # to deal in the Software without restriction, including without limitation 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | # and/or sell copies of the Software, and to permit persons to whom the 10 | # Software is furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | # DEALINGS IN THE SOFTWARE. 
22 | ################################################################################ 23 | 24 | 25 | SHELL=/bin/bash -o pipefail 26 | TARGET?=$(shell uname -m) 27 | LIBDIR?=lib 28 | VERBOSE?=0 29 | ifeq ($(VERBOSE), 1) 30 | AT= 31 | else 32 | AT=@ 33 | endif 34 | CUDA_TRIPLE=x86_64-linux 35 | CUBLAS_TRIPLE=x86_64-linux-gnu 36 | DLSW_TRIPLE=x86_64-linux-gnu 37 | ifeq ($(TARGET), aarch64) 38 | CUDA_TRIPLE=aarch64-linux 39 | CUBLAS_TRIPLE=aarch64-linux-gnu 40 | DLSW_TRIPLE=aarch64-linux-gnu 41 | endif 42 | ifeq ($(TARGET), qnx) 43 | CUDA_TRIPLE=aarch64-qnx 44 | CUBLAS_TRIPLE=aarch64-qnx-gnu 45 | DLSW_TRIPLE=aarch64-unknown-nto-qnx 46 | endif 47 | ifeq ($(TARGET), ppc64le) 48 | CUDA_TRIPLE=ppc64le-linux 49 | CUBLAS_TRIPLE=ppc64le-linux 50 | DLSW_TRIPLE=ppc64le-linux 51 | endif 52 | ifeq ($(TARGET), android64) 53 | DLSW_TRIPLE=aarch64-linux-androideabi 54 | CUDA_TRIPLE=$(DLSW_TRIPLE) 55 | CUBLAS_TRIPLE=$(DLSW_TRIPLE) 56 | endif 57 | export TARGET 58 | export VERBOSE 59 | export LIBDIR 60 | export CUDA_TRIPLE 61 | export CUBLAS_TRIPLE 62 | export DLSW_TRIPLE 63 | 64 | ifeq ($(SAFE_PDK), 1) 65 | # Only dlaSafetyRuntime is currently able to execute with safety pdk. 66 | samples = dlaSafetyRuntime 67 | else 68 | samples = sampleAlgorithmSelector sampleCharRNN sampleDynamicReshape sampleFasterRCNN sampleGoogleNet sampleINT8 sampleINT8API sampleMLP sampleMNIST sampleMNISTAPI sampleNMT sampleMovieLens sampleOnnxMNIST sampleUffPluginV2Ext sampleReformatFreeIO sampleSSD sampleUffFasterRCNN sampleUffMaskRCNN sampleUffMNIST sampleUffSSD trtexec samplePlugin 69 | 70 | 71 | # sampleMovieLensMPS should only be compiled for Linux targets. 72 | # sample uses Linux specific shared memory and IPC libraries. 73 | ifeq ($(TARGET),x86_64) 74 | samples += sampleMovieLensMPS 75 | endif 76 | 77 | # sampleNvmedia/dlaSafetyRuntime/dlaSafetyBuilder should only be compiled with DLA enabled. 78 | ifeq ($(ENABLE_DLA),1) 79 | samples += sampleNvmedia 80 | samples += dlaSafetyRuntime 81 | samples += dlaSafetyBuilder 82 | endif 83 | endif 84 | 85 | .PHONY: all clean help 86 | all: 87 | $(AT)$(foreach sample,$(samples), $(MAKE) -C $(sample) &&) : 88 | 89 | clean: 90 | $(AT)$(foreach sample,$(samples), $(MAKE) clean -C $(sample) &&) : 91 | 92 | help: 93 | $(AT)echo "Sample building help menu." 94 | $(AT)echo "Samples:" 95 | $(AT)$(foreach sample,$(samples), echo -e "\t$(sample)" &&) : 96 | $(AT)echo -e "\nCommands:" 97 | $(AT)echo -e "\tall - build all samples." 98 | $(AT)echo -e "\tclean - clean all samples." 99 | $(AT)echo -e "\nVariables:" 100 | $(AT)echo -e "\tTARGET - Specify the target to build for." 101 | $(AT)echo -e "\tVERBOSE - Specify verbose output." 102 | $(AT)echo -e "\tCUDA_INSTALL_DIR - Directory where cuda installs to." 103 | -------------------------------------------------------------------------------- /tensorrt_yolov4/README.md: -------------------------------------------------------------------------------- 1 | # YOLOv4 Standalone Program of Multi-Tasks 2 | 3 | ## 1. 
Contents
4 | 
5 | - **`common`** Common code dependencies and utilities
6 | - **`source`** Source code of the standalone program
7 |   - `main.cpp`: Program entry point, where the parameters are configured
8 |   - `SampleYolo.hpp`: YOLOv4 inference class definition file
9 |   - `SampleYolo.cpp`: YOLOv4 inference class implementation file
10 |   - `onnx_add_nms_plugin.py`: Python script to add a BatchedNMSPlugin node into the ONNX model
11 |   - `generate_coco_image_list.py`: Python script to get the list of image names from an MS COCO annotation or image-info file
12 | 
13 | - **`data`** This directory holds:
14 |   - `yolov4.onnx`: the ONNX model (user generated)
15 |   - `yolov4.engine`: the TensorRT engine (generated by this program)
16 |   - `demo.jpg`: the demo image (already exists)
17 |   - `demo_out.jpg`: detection output for the demo image (already exists, but is overwritten by the program)
18 |   - `names.txt`: MS COCO dataset label names (has to be downloaded or generated via the COCO API)
19 |   - `categories.txt`: MS COCO dataset categories where IDs and names are separated by `"\t"` (has to be generated via the COCO API)
20 |   - `val2017.txt`: MS COCO validation set image list (has to be generated from the corresponding COCO annotation file)
21 |   - `valdev2017.txt`: MS COCO test set image list (has to be generated from the corresponding COCO annotation file)
22 |   - `coco_result.json`: MS COCO detection output (generated by this program)
23 | 
24 | 
25 | ## 2. Prerequisites before building & running the YOLOv4 standalone sample ##
26 | 
27 | ### 2.1 Download TensorRT (7.1 or higher; you can skip this step if TensorRT 7.1 is already installed) ###
28 | 
29 | - Download TensorRT from the NVIDIA developer page: 
30 | - Install the deb package or unpack the tar file.
31 | 
32 | ### 2.2 Download and build TensorRT OSS ###
33 | 
34 | - Refer to the README files in 
35 |   - Go to if you are working on a Jetson platform
36 |   - Go to if you are working on an x86 platform
37 | 
38 | - Follow the guidance in the README to clone the repository and build `libnvinfer_plugin.so.7.x.x`
39 | 
40 | - Rename `/lib/libnvinfer_plugin.so.7.x.x` to `/lib/libnvinfer_plugin.so.7.x.x.back`
41 | 
42 | - Copy `/build/out/libnvinfer_plugin.so.7.x.x` into `/lib`
43 | 
44 | ### 2.3 Generate the YOLOv4 ONNX model with a BatchedNMSPlugin node included ###
45 | 
46 | #### Step 1 Generate the YOLOv4 ONNX model (`CSPDarknet-53 CNN + YOLO header CNN + YOLO layers`) ####
47 | 
48 | - Here is one of the YOLOv4 PyTorch repositories that can guide you through generating an ONNX model of YOLOv4.
49 | You can convert the pretrained DarkNet model into ONNX directly; alternatively you can 1) convert the DarkNet model into PyTorch, 2) train the PyTorch model on your own dataset, and 3) then convert it into ONNX.
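
The repositories referenced in this section ship their own export scripts, but the core of the conversion is a standard `torch.onnx.export` call. A minimal sketch follows; `build_yolov4()` and the weight file name are placeholders for whatever the repository you follow provides, while the 416x416 input and the `input`/`boxes`/`confs` tensor names match what this sample and `onnx_add_nms_plugin.py` expect:

```py
import torch

# Assumption: build_yolov4() is provided by the YOLOv4 PyTorch repository you are using
model = build_yolov4()
model.load_state_dict(torch.load("yolov4.pth", map_location="cpu"))
model.eval()

dummy = torch.zeros(1, 3, 416, 416)          # explicit batch size 1, 3x416x416 input
torch.onnx.export(
    model, dummy, "yolov4_1_3_416_416.onnx",
    input_names=["input"],                   # matches params.inputTensorNames in main.cpp
    output_names=["boxes", "confs"],         # names expected by onnx_add_nms_plugin.py
    opset_version=11,
)
```
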
50 | 51 | - Other famous YOLOv4 pytorch repositories as references: 52 | - 53 | - 54 | - 55 | - 56 | 57 | 58 | #### Step 2 Add into YOLOv4 ONNX model the BatchedNMSPlugin (`CSPDarknet-53 CNN + YOLO header CNN + YOLO layers + BatchedNMSPlugin`) 59 | 60 | **How can I add `BatchedNMSPlugin` node into ONNX model?** 61 | 62 | - Open `source_gpu_nms/onnx_add_nms_plugin.py` 63 | 64 | - Update attribute values to suit your model 65 | 66 | Example: 67 | ```py 68 | attrs["shareLocation"] = 1 69 | attrs["backgroundLabelId"] = -1 70 | attrs["numClasses"] = 80 71 | attrs["topK"] = topK # from program arguments 72 | attrs["keepTopK"] = keepTopK # from program arguments 73 | attrs["scoreThreshold"] = 0.3 74 | attrs["iouThreshold"] = 0.6 75 | attrs["isNormalized"] = 1 76 | attrs["clipBoxes"] = 1 77 | ``` 78 | 79 | - Copy `onnx_add_nms_plugin.py` into `/tools/onnx-graphsurgeon` 80 | 81 | - Go to `/tools/onnx-graphsurgeon` and execute `onnx_add_nms_plugin.py` 82 | 83 | ```sh 84 | cd /tools/onnx-graphsurgeon 85 | python onnx_add_nms_plugin.py -f -t -k 86 | ``` 87 | 88 | ## 3. How can I build and run YOLOv4 standalone program? ## 89 | 90 | ### 3.1 Add common source code includes ### 91 | 92 | - This YOLOv4 standalone sample depends on the same common includes as other C++ samples of TensorRT. 93 | - Option 1: Add a link to `/TensorRT-7.1.x.x/samples/common` in `tensorrt_yolov4` 94 | ``` 95 | cd /yolov4_sample/tensorrt_yolov4 96 | ln -s /TensorRT-7.1.x.x/samples/common common 97 | ``` 98 | - Option 2: Simply copy common includes into `tensorrt_yolov4` 99 | ``` 100 | cd /yolov4_sample/tensorrt_yolov4 101 | cp -r /TensorRT-7.1.x.x/samples/common common ./ 102 | ``` 103 | 104 | ### 3.2 OpenCV dependencies ### 105 | 106 | - Note: There are OpenCV dependencies in this program. Please check if there are OpenCV includes in /usr/include/opencv and if OpenCV libraries like `-lopencv_core` and `-lopencv_imgproc` are installed. 107 | 108 | - Follow README and documents of this repository **** to install OpenCV if corresponding includes and libraries do not exist. 
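
A quick pre-flight check along these lines can save a failed build; the package names and paths below are typical for Ubuntu and may differ on your system:

```sh
# Check whether OpenCV development headers and libraries are present
pkg-config --modversion opencv4 || pkg-config --modversion opencv
ls -d /usr/include/opencv* 2>/dev/null
ldconfig -p | grep -E "libopencv_(core|imgproc)"
# On Ubuntu, the development package can usually be installed with:
sudo apt-get install libopencv-dev
```
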
109 | 110 | ### 3.3 Compile and build ### 111 | 112 | 113 | ```sh 114 | cd /yolov4_sample/yolo_cpp_standalone/source_gpu_nms 115 | make clean 116 | make -j 117 | ``` 118 | 119 | 120 | ### 3.4 Basic program parameters ### 121 | 122 | - Step1: Use text editor to open `main.cpp` in `/YOLOv4_Sample/tensorrt_yolov4/source` 123 | 124 | - Step2: Go to where function `initializeSampleParams()` is defined 125 | 126 | - Step3: You will find some basic configurations in `initializeSampleParams()` like follows: 127 | 128 | ```cpp 129 | // This argument is for calibration of int8 130 | // Int8 calibration is not available until now 131 | // You have to prepare samples for int8 calibration by yourself 132 | params.nbCalBatches = 80; 133 | 134 | // The engine file to generate or to load 135 | // The engine file does not exist: 136 | // This program will try to load onnx file and convert onnx into engine 137 | // The engine file exists: 138 | // This program will load the engine file directly 139 | params.engingFileName = "../data/yolov4.engine"; 140 | 141 | // The onnx file to load 142 | params.onnxFileName = "../data/yolov4.onnx"; 143 | 144 | // Input tensor name of ONNX file & engine file 145 | params.inputTensorNames.push_back("input"); 146 | 147 | // Old batch configuration, it is zero if explicitBatch flag is true for the tensorrt engine 148 | // May be deprecated in the future 149 | params.batchSize = 0; 150 | 151 | // Number of classes (usually 80, but can be other values) 152 | params.outputClsSize = 80; 153 | 154 | // topK parameter of BatchedNMSPlugin 155 | params.topK = 2000; 156 | 157 | // keepTopK parameter of BatchedNMSPlugin 158 | params.keepTopK = 1000; 159 | 160 | // Batch size, you can modify to other batch size values if needed 161 | params.explicitBatchSize = 1; 162 | 163 | params.inputImageName = "../data/demo.jpg"; 164 | params.cocoClassNamesFileName = "../data/coco.names"; 165 | params.cocoClassIDFileName = "../data/categories.txt"; 166 | 167 | // Config number of DLA cores, -1 if there is no DLA core 168 | params.dlaCore = -1; 169 | ``` 170 | 171 | - Step4: Copy and rename the ONNX file (`BatchedNMSPlugin` node included) to the location defined by `initializeSampleParams()` 172 | 173 | 174 | ### 3.5 Run this program to convert ONNX file into Engine file ### 175 | 176 | - This program will automatically convert ONNX into engine if engine does not exist. 177 | - Command: 178 | - To generate Engine of fp32 mode: 179 | ``` 180 | ../bin/yolov4 181 | ``` 182 | - To generate Engine of fp16 mode: 183 | ``` 184 | ../bin/yolov4 --fp16 185 | ``` 186 | 187 | ### 3.6 Specific program parameters for `demo` mode, `speed` mode and `coco` mode ### 188 | 189 | #### 3.6.1 To run this program in `demo` mode 190 | 191 | - Command: 192 | 193 | ``` 194 | ../bin/yolov4 --demo 195 | ``` 196 | 197 | - This program will feed the demo image into YOLOv4 engine and write detection output as an image. 198 | - Please make sure `params.demo = 1` if you want to run this program in demo mode. 
199 | 200 | ```cpp 201 | // Configurations to run a demo image 202 | params.demo = 1; 203 | params.outputImageName = "../data/demo_out.jpg"; 204 | ``` 205 | 206 | #### 3.6.2 To run this program in `speed` mode 207 | 208 | - Command: 209 | 210 | ``` 211 | ../bin/yolov4 --speed 212 | ``` 213 | 214 | - This program will repeatedly feed the demo image into engine to accumulate time consumed in each iteration 215 | - Please make sure `params.speedTest = 1` if you want to run this program in speed mode 216 | 217 | ```cpp 218 | // Configurations to run speed test 219 | params.speedTest = 1; 220 | params.speedTestItrs = 1000; 221 | ``` 222 | 223 | #### 3.6.3 To run this program in `coco` mode 224 | 225 | - Command: 226 | 227 | ``` 228 | ../bin/yolov4 --coco 229 | ``` 230 | 231 | - Corresponding configuration in `initializeSampleParams()` would be like this: 232 | 233 | ```cpp 234 | // Configurations of Test on COCO dataset 235 | params.cocoTest = 1; 236 | params.cocoClassNamesFileName = "../data/coco.names"; 237 | params.cocoClassIDFileName = "../data/categories.txt"; 238 | params.cocoImageListFileName = "../data/val2017.txt"; 239 | params.cocoTestResultFileName = "../data/coco_result.json"; 240 | params.cocoImageDir = "../data/val2017"; 241 | ``` 242 | 243 | **Note: COCO dataset is just an example, you can use your own validation set or test set to validate YOLOv4 model trained by your own training set** 244 | 245 | - Step 1: Download MS COCO images and annotations from 246 | 247 | - Images for validation: 248 | - Annotations for training and validation: 249 | - Images for test: 250 | - Image info for test: 251 | 252 | - Step 2: Clone COCO API repository from and use COCO API to generate `categories.txt` 253 | 254 | - Format of `categories.txt` must follow this rule: IDs and names are separated by "\t". 
255 | 256 | ``` 257 | 1 person 258 | 2 bicycle 259 | 2 car 260 | 4 motorcycle 261 | 5 airplane 262 | ``` 263 | 264 | - COCO API example that can help you distill categories from COCO dataset (You can have a look at `cocoapi\PythonAPI\pycocoDemo.ipynb` of for more details): 265 | 266 | ```py 267 | # display COCO categories and supercategories 268 | cats = coco.loadCats(coco.getCatIds()) 269 | nms=[cat['name'] for cat in cats] 270 | print('COCO categories: \n{}\n'.format(' '.join(nms))) 271 | ``` 272 | 273 | 274 | - Step 3: Generate image list file using python script `generate_coco_image_list.py` 275 | 276 | ``` 277 | python generate_coco_image_list.py 278 | ``` 279 | 280 | - For example, to generate validation image list, the command would be: 281 | 282 | ``` 283 | python generate_coco_image_list.py instances_val2017.json val2017.txt 284 | ``` 285 | - For example, to generate test-dev image list, the command would be: 286 | ``` 287 | python generate_coco_image_list.py image_info_test-dev2017.json testdev2017.txt 288 | ``` 289 | 290 | - This program will read image names from the list file whose path should be the same as `params.cocoImageListFileName`, and then feed these images located in `params.cocoImageDir` to YOLOv4 engine 291 | - Please make sure `params.cocoTest = 1` and images exist in `params.cocoImageDir` 292 | 293 | 294 | -------------------------------------------------------------------------------- /tensorrt_yolov4/data/demo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-AI-IOT/yolo_deepstream/e9b75770ea58713d1cb3902d67c36e11acb888d7/tensorrt_yolov4/data/demo.jpg -------------------------------------------------------------------------------- /tensorrt_yolov4/data/demo_out.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-AI-IOT/yolo_deepstream/e9b75770ea58713d1cb3902d67c36e11acb888d7/tensorrt_yolov4/data/demo_out.jpg -------------------------------------------------------------------------------- /tensorrt_yolov4/source/Makefile: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a 6 | # copy of this software and associated documentation files (the "Software"), 7 | # to deal in the Software without restriction, including without limitation 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | # and/or sell copies of the Software, and to permit persons to whom the 10 | # Software is furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | # DEALINGS IN THE SOFTWARE. 22 | ################################################################################ 23 | 24 | OUTNAME_RELEASE = yolov4 25 | OUTNAME_DEBUG = yolov4_debug 26 | EXTRA_DIRECTORIES = ../common 27 | .NOTPARALLEL: 28 | MAKEFILE ?= ../Makefile.config 29 | include $(MAKEFILE) 30 | -------------------------------------------------------------------------------- /tensorrt_yolov4/source/SampleYolo.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | //! 25 | //! SampleYolo.cpp 26 | //! This file contains the implementation of the YOLOv4 sample. It creates the network using 27 | //! the YOLOv4 ONNX model. 28 | 29 | #pragma once 30 | 31 | #include "BatchStream.h" 32 | #include "EntropyCalibrator.h" 33 | #include "argsParser.h" 34 | #include "buffers.h" 35 | #include "common.h" 36 | #include "logger.h" 37 | 38 | #include "NvOnnxParser.h" 39 | #include "NvInfer.h" 40 | #include 41 | 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | 48 | #include 49 | #include 50 | #include 51 | #include 52 | 53 | //! 54 | //! \brief The SampleYoloParams structure groups the additional parameters required by 55 | //! the SSD sample. 56 | //! 
57 | struct SampleYoloParams : public samplesCommon::OnnxSampleParams 58 | { 59 | int outputClsSize = 80; //!< The number of output classes 60 | int topK = 2000; 61 | int keepTopK = 1000; //!< The maximum number of detection post-NMS 62 | int nbCalBatches = 100; //!< The number of batches for calibration 63 | int demo = 0; 64 | int speedTest = 0; 65 | int cocoTest = 0; 66 | size_t speedTestItrs = 1000; 67 | int explicitBatchSize = 1; 68 | std::vector inputShape; 69 | std::vector> outputShapes; 70 | std::string inputImageName; 71 | std::string outputImageName; 72 | std::string calibrationBatches; //!< The path to calibration batches 73 | std::string engingFileName; 74 | std::string cocoClassNamesFileName; 75 | std::string cocoClassIDFileName; 76 | std::string cocoImageListFileName; 77 | std::string cocoImageOutputDir; 78 | std::string cocoTestResultFileName; 79 | std::string cocoImageDir; 80 | }; 81 | 82 | struct BoundingBox 83 | { 84 | float x1; 85 | float y1; 86 | float x2; 87 | float y2; 88 | float score; 89 | int cls; 90 | }; 91 | 92 | enum NMS_TYPE 93 | { 94 | MIN, 95 | UNION, 96 | }; 97 | 98 | struct SpeedInfo 99 | { 100 | long long preProcess; 101 | long long model; 102 | long long postProcess; 103 | 104 | SpeedInfo() : 105 | preProcess {0}, 106 | model {0}, 107 | postProcess {0} 108 | {} 109 | 110 | void printTimeConsmued() 111 | { 112 | std::cout << "Time consumed in preProcess: " << this->preProcess << std::endl; 113 | std::cout << "Time consumed in model: " << this->model << std::endl; 114 | std::cout << "Time consumed in postProcess: " << this->postProcess << std::endl; 115 | } 116 | }; 117 | 118 | class BoundingBoxComparator 119 | { 120 | public: 121 | bool operator() (const BoundingBox & b1, const BoundingBox & b2) 122 | { 123 | return b1.score > b2.score; 124 | } 125 | }; 126 | 127 | class StringComparator 128 | { 129 | public: 130 | bool operator() (const std::string & first, const std::string & second) const 131 | { 132 | return first < second; 133 | } 134 | }; 135 | 136 | //! \brief The SampleYolo class implements the SSD sample 137 | //! 138 | //! \details It creates the network using a caffe model 139 | //! 140 | class SampleYolo 141 | { 142 | template 143 | using SampleUniquePtr = std::unique_ptr; 144 | 145 | public: 146 | static const std::string gSampleName; 147 | 148 | SampleYolo(const SampleYoloParams& params); 149 | 150 | //! 151 | //! \brief Function builds the network engine 152 | //! 153 | bool build(); 154 | 155 | //! 156 | //! \brief Runs the TensorRT inference engine for this sample 157 | //! 158 | bool infer(); 159 | 160 | //! 161 | //! \brief Cleans up any state created in the sample class 162 | //! 163 | bool teardown(); 164 | 165 | private: 166 | SampleYoloParams mParams; //!< The parameters for the sample. 167 | 168 | nvinfer1::Dims mInputDims; //!< The dimensions of the input to the network. 169 | 170 | cv::Mat mSampleImage; 171 | 172 | SpeedInfo mSpeedInfo; 173 | 174 | //std::vector> mPPMs; //!< PPMs of test images 175 | 176 | std::shared_ptr mEngine; //!< The TensorRT engine used to run the network 177 | 178 | std::vector mClasses; 179 | 180 | std::map mClassesMap; 181 | 182 | std::vector mImageFiles; 183 | 184 | std::ofstream mCocoResult; 185 | 186 | std::vector image_rows; 187 | std::vector image_cols; 188 | std::vector image_pad_rows; 189 | std::vector image_pad_cols; 190 | 191 | size_t mImageIdx; 192 | 193 | //! 194 | //! \brief Parses an ONNX model for YOLO and creates a TensorRT network 195 | //! 
196 | bool constructNetwork(SampleUniquePtr& builder, 197 | SampleUniquePtr& network, SampleUniquePtr& config, 198 | SampleUniquePtr& parser); 199 | 200 | //! 201 | //! \brief Reads the input and mean data, preprocesses, and stores the result in a managed buffer 202 | //! 203 | bool processInput_aspectRatio(const samplesCommon::BufferManager& buffers); 204 | 205 | bool processInput(const samplesCommon::BufferManager& buffers); 206 | 207 | //! 208 | //! \brief Filters output detections and verify results 209 | //! 210 | bool verifyOutput_aspectRatio(const samplesCommon::BufferManager& buffers); 211 | 212 | bool verifyOutput(const samplesCommon::BufferManager& buffers); 213 | 214 | //! 215 | //! \brief To check if certain file exists given the path 216 | //! 217 | bool fileExists(const std::string& name) 218 | { 219 | std::ifstream f(name.c_str()); 220 | return f.good(); 221 | } 222 | 223 | bool infer_iteration(SampleUniquePtr &context, samplesCommon::BufferManager &buffers); 224 | 225 | std::vector> get_bboxes(int batch_size, int keep_topk, 226 | int32_t *num_detections, float *mnsed_boxes, float *mnsed_scores, float *mnsed_classes); 227 | 228 | void draw_bboxes(const std::vector &bboxes, cv::Mat &img); 229 | 230 | void draw_coco_test_bboxes(const std::vector &bboxes, cv::Mat &img, int img_id); 231 | 232 | long long now_in_milliseconds(); 233 | }; 234 | 235 | -------------------------------------------------------------------------------- /tensorrt_yolov4/source/generate_coco_image_list.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a 6 | # copy of this software and associated documentation files (the "Software"), 7 | # to deal in the Software without restriction, including without limitation 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | # and/or sell copies of the Software, and to permit persons to whom the 10 | # Software is furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | # DEALINGS IN THE SOFTWARE. 
22 | ################################################################################ 23 | 24 | import re 25 | import sys 26 | 27 | json_file_name = sys.argv[1] 28 | img_list_name = sys.argv[2] 29 | 30 | json_text = None 31 | with open(json_file_name, 'r') as f: 32 | json_text = f.read() 33 | 34 | matched_list = re.findall( r'\"([0-9]+.jpg)\"', json_text) 35 | 36 | with open(img_list_name, 'w') as f: 37 | for img_name in matched_list: 38 | f.write(img_name) 39 | f.write('\n') 40 | -------------------------------------------------------------------------------- /tensorrt_yolov4/source/main.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | #include "SampleYolo.hpp" 24 | 25 | //! 26 | //! \brief Prints the help information for running this sample 27 | //! 28 | void printHelpInfo() 29 | { 30 | std::cout << "--help Display help information" << std::endl; 31 | std::cout << "--demo This app will run demo if this option is set" 32 | << std::endl; 33 | std::cout << "--speed This app will run speed test if this option is set" 34 | << std::endl; 35 | std::cout << "--coco This app will run COCO dataset if this option is set" 36 | << std::endl; 37 | std::cout << "--fp16 Specify to run in fp16 mode." << std::endl; 38 | std::cout << "--int8 Specify to run in int8 mode." 
<< std::endl; 39 | } 40 | 41 | SampleYoloParams specifyInputAndOutputNamesAndShapes(SampleYoloParams ¶ms) 42 | { 43 | params.inputShape = std::vector {params.explicitBatchSize, 3, 416, 416}; 44 | 45 | // Output shapes when BatchedNMSPlugin is available 46 | params.outputShapes.push_back(std::vector{params.explicitBatchSize, 1}); 47 | params.outputShapes.push_back(std::vector{params.explicitBatchSize, params.keepTopK, 4}); 48 | params.outputShapes.push_back(std::vector{params.explicitBatchSize, params.keepTopK}); 49 | params.outputShapes.push_back(std::vector{params.explicitBatchSize, params.keepTopK}); 50 | 51 | // Output tensors when BatchedNMSPlugin is available 52 | params.outputTensorNames.push_back("num_detections"); 53 | params.outputTensorNames.push_back("nmsed_boxes"); 54 | params.outputTensorNames.push_back("nmsed_scores"); 55 | params.outputTensorNames.push_back("nmsed_classes"); 56 | 57 | return params; 58 | } 59 | 60 | //! 61 | //! \brief Initializes members of the params struct using the command line args 62 | //! 63 | SampleYoloParams initializeSampleParams(std::vector args) 64 | { 65 | SampleYoloParams params; 66 | 67 | // This argument is for calibration of int8 68 | // Int8 calibration is not available until now 69 | // You have to prepare samples for int8 calibration by yourself 70 | params.nbCalBatches = 80; 71 | 72 | // The engine file to generate or to load 73 | // The engine file does not exist: 74 | // This program will try to load onnx file and convert onnx into engine 75 | // The engine file exists: 76 | // This program will load the engine file directly 77 | params.engingFileName = "../data/yolov4.engine"; 78 | 79 | // The onnx file to load 80 | params.onnxFileName = "../data/yolov4.onnx"; 81 | 82 | // Input tensor name of ONNX file & engine file 83 | params.inputTensorNames.push_back("input"); 84 | 85 | // Old batch configuration, it is zero if explicitBatch flag is true for the tensorrt engine 86 | // May be deprecated in the future 87 | params.batchSize = 0; 88 | 89 | // Number of classes (usually 80, but can be other values) 90 | params.outputClsSize = 80; 91 | 92 | // topK parameter of BatchedNMSPlugin 93 | params.topK = 2000; 94 | 95 | // keepTopK parameter of BatchedNMSPlugin 96 | params.keepTopK = 1000; 97 | 98 | // Batch size, you can modify to other batch size values if needed 99 | params.explicitBatchSize = 1; 100 | 101 | params.inputImageName = "../data/demo.jpg"; 102 | params.cocoClassNamesFileName = "../data/names.txt"; 103 | params.cocoClassIDFileName = "../data/categories.txt"; 104 | 105 | // Config number of DLA cores, -1 if there is no DLA core 106 | params.dlaCore = -1; 107 | 108 | for (auto &arg : args) 109 | { 110 | if (arg == "--help") 111 | { 112 | printHelpInfo(); 113 | } 114 | else if (arg == "--demo") 115 | { 116 | // Configurations to run a demo image 117 | params.demo = 1; 118 | params.outputImageName = "../data/demo_out.jpg"; 119 | } 120 | else if (arg == "--speed") 121 | { 122 | // Configurations to run speed test 123 | params.speedTest = 1; 124 | params.speedTestItrs = 1000; 125 | } 126 | else if (arg == "--coco") 127 | { 128 | // Configurations of Test on COCO dataset 129 | params.cocoTest = 1; 130 | params.cocoImageListFileName = "../data/val2017.txt"; 131 | params.cocoTestResultFileName = "../data/coco_result.json"; 132 | params.cocoImageDir = "../data/val2017"; 133 | } 134 | else if (arg == "--int8") 135 | { 136 | params.int8 = true; 137 | } 138 | else if (arg == "--fp16") 139 | { 140 | params.fp16 = true; 141 | } 142 | } 143 | 
144 | specifyInputAndOutputNamesAndShapes(params); 145 | 146 | return params; 147 | } 148 | 149 | int main(int argc, char** argv) 150 | { 151 | std::vector args; 152 | for (int i = 0; i < argc; ++i) 153 | { 154 | args.push_back(std::string(argv[i])); 155 | } 156 | 157 | auto sampleTest = sample::gLogger.defineTest(SampleYolo::gSampleName, argc, argv); 158 | 159 | sample::gLogger.reportTestStart(sampleTest); 160 | 161 | SampleYolo sample(initializeSampleParams(args)); 162 | 163 | sample::gLogInfo << "Building and running a GPU inference engine for Yolo" << std::endl; 164 | 165 | if (!sample.build()) 166 | { 167 | return sample::gLogger.reportFail(sampleTest); 168 | } 169 | 170 | sample::gLogInfo << "Loading or building yolo model done" << std::endl; 171 | 172 | if (!sample.infer()) 173 | { 174 | return sample::gLogger.reportFail(sampleTest); 175 | } 176 | 177 | sample::gLogInfo << "Inference of yolo model done" << std::endl; 178 | 179 | if (!sample.teardown()) 180 | { 181 | return sample::gLogger.reportFail(sampleTest); 182 | } 183 | 184 | return EXIT_SUCCESS; // sample::gLogger.reportPass(sampleTest); 185 | } 186 | -------------------------------------------------------------------------------- /tensorrt_yolov4/source/onnx_add_nms_plugin.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a 6 | # copy of this software and associated documentation files (the "Software"), 7 | # to deal in the Software without restriction, including without limitation 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | # and/or sell copies of the Software, and to permit persons to whom the 10 | # Software is furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | # DEALINGS IN THE SOFTWARE. 
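
The script that follows hard-codes the tensor names `boxes` and `confs` (see `tensors["boxes"]` and `tensors["confs"]` below), so an ONNX export that used different output names will fail with a `KeyError`. A quick, illustrative way to verify the names before running it (the model path is an example):

```py
import onnx

model = onnx.load("yolov4_1_3_416_416.onnx")   # example path
print([o.name for o in model.graph.output])    # should include "boxes" and "confs"
print([i.name for i in model.graph.input])     # should include "input"
```
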
22 | ################################################################################ 23 | 24 | 25 | #!/usr/bin/env python3 26 | import onnx_graphsurgeon as gs 27 | import argparse 28 | import onnx 29 | import numpy as np 30 | 31 | def create_and_add_plugin_node(graph, topK, keepTopK): 32 | 33 | batch_size = graph.inputs[0].shape[0] 34 | input_h = graph.inputs[0].shape[2] 35 | input_w = graph.inputs[0].shape[3] 36 | 37 | tensors = graph.tensors() 38 | boxes_tensor = tensors["boxes"] 39 | confs_tensor = tensors["confs"] 40 | 41 | num_detections = gs.Variable(name="num_detections").to_variable(dtype=np.int32, shape=[batch_size, 1]) 42 | nmsed_boxes = gs.Variable(name="nmsed_boxes").to_variable(dtype=np.float32, shape=[batch_size, keepTopK, 4]) 43 | nmsed_scores = gs.Variable(name="nmsed_scores").to_variable(dtype=np.float32, shape=[batch_size, keepTopK]) 44 | nmsed_classes = gs.Variable(name="nmsed_classes").to_variable(dtype=np.float32, shape=[batch_size, keepTopK]) 45 | 46 | new_outputs = [num_detections, nmsed_boxes, nmsed_scores, nmsed_classes] 47 | 48 | mns_node = gs.Node( 49 | op="BatchedNMS_TRT", 50 | attrs=create_attrs(input_h, input_w, topK, keepTopK), 51 | inputs=[boxes_tensor, confs_tensor], 52 | outputs=new_outputs) 53 | 54 | graph.nodes.append(mns_node) 55 | graph.outputs = new_outputs 56 | 57 | return graph.cleanup().toposort() 58 | 59 | 60 | 61 | 62 | def create_attrs(input_h, input_w, topK, keepTopK): 63 | 64 | num_anchors = 3 65 | 66 | h1 = input_h // 8 67 | h2 = input_h // 16 68 | h3 = input_h // 32 69 | 70 | w1 = input_w // 8 71 | w2 = input_w // 16 72 | w3 = input_w // 32 73 | 74 | num_boxes = num_anchors * (h1 * w1 + h2 * w2 + h3 * w3) 75 | 76 | attrs = {} 77 | 78 | attrs["shareLocation"] = 1 79 | attrs["backgroundLabelId"] = -1 80 | attrs["numClasses"] = 80 81 | attrs["topK"] = topK 82 | attrs["keepTopK"] = keepTopK 83 | attrs["scoreThreshold"] = 0.4 84 | attrs["iouThreshold"] = 0.6 85 | attrs["isNormalized"] = 1 86 | attrs["clipBoxes"] = 1 87 | 88 | # 001 is the default plugin version the parser will search for, and therefore can be omitted, 89 | # but we include it here for illustrative purposes. 90 | attrs["plugin_version"] = "1" 91 | 92 | return attrs 93 | 94 | 95 | def main(): 96 | parser = argparse.ArgumentParser(description="Add batchedNMSPlugin") 97 | parser.add_argument("-f", "--model", help="Path to the ONNX model generated by export_model.py", default="yolov4_1_3_416_416.onnx") 98 | parser.add_argument("-t", "--topK", help="number of bounding boxes for nms", default=2000) 99 | parser.add_argument("-k", "--keepTopK", help="bounding boxes to be kept per image", default=1000) 100 | 101 | args, _ = parser.parse_known_args() 102 | 103 | graph = gs.import_onnx(onnx.load(args.model)) 104 | 105 | graph = create_and_add_plugin_node(graph, int(args.topK), int(args.keepTopK)) 106 | 107 | onnx.save(gs.export_onnx(graph), args.model + ".nms.onnx") 108 | 109 | 110 | if __name__ == '__main__': 111 | main() 112 | 113 | -------------------------------------------------------------------------------- /tensorrt_yolov7/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | # SPDX-License-Identifier: MIT 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a 6 | # copy of this software and associated documentation files (the "Software"), 7 | # to deal in the Software without restriction, including without limitation 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | # and/or sell copies of the Software, and to permit persons to whom the 10 | # Software is furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | # DEALINGS IN THE SOFTWARE. 22 | ################################################################################ 23 | cmake_minimum_required( VERSION 3.0 ) 24 | 25 | project( YOLOV7 ) 26 | enable_language( CUDA ) 27 | find_package(CUDA) 28 | set( CMAKE_C_STANDARD 99 ) 29 | set( CMAKE_CXX_STANDARD 11 ) 30 | set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g -fpic -fpie -fpermissive -std=c++11 -pthread" ) 31 | 32 | find_package(OpenCV REQUIRED) 33 | include_directories( ${OpenCV_INCLUDE_DIRS}) 34 | find_package(jsoncpp CONFIG REQUIRED) 35 | 36 | 37 | 38 | # global include_directories 39 | include_directories( /usr/local/cuda/include ) 40 | #add judgement about system: 41 | 42 | MESSAGE(STATUS "CMAKE_HOST_SYSTEM_PROCESSOR is ${CMAKE_HOST_SYSTEM_PROCESSOR}") 43 | 44 | if (${CMAKE_HOST_SYSTEM_PROCESSOR} EQUAL aarch64) 45 | include_directories( /usr/include/aarch64-linux-gnu/ ) # for jetson 46 | elseif(${CMAKE_HOST_SYSTEM_PROCESSOR} EQUAL x86_64) 47 | include_directories( /usr/lib/x86_64-linux-gnu/ ) 48 | endif() 49 | 50 | include_directories( "${CMAKE_SOURCE_DIR}/src/" ) 51 | include_directories( "/usr/include/jsoncpp/") 52 | # global definitions 53 | add_definitions( -w) 54 | 55 | # global library path 56 | if (${CMAKE_HOST_SYSTEM_PROCESSOR} EQUAL aarch64) 57 | link_directories( "/usr/lib/aarch64-linux-gnu/" ) 58 | elseif(${CMAKE_HOST_SYSTEM_PROCESSOR} EQUAL x86_64) 59 | link_directories( "/usr/lib/x86_64-linux-gnu/" ) 60 | endif() 61 | 62 | link_directories( "/usr/lib/" ) 63 | link_directories( "/usr/local/lib/") 64 | link_directories( "/usr/local/cuda/lib64/" ) 65 | 66 | FILE(GLOB_RECURSE YOLO_SRC src/*.cpp ) 67 | add_library( yolo SHARED ${YOLO_SRC} ) 68 | target_link_libraries(yolo PRIVATE nvinfer) 69 | target_link_libraries(yolo PRIVATE nvinfer_plugin) 70 | target_link_libraries(yolo PRIVATE nvparsers) 71 | target_link_libraries(yolo PRIVATE nvonnxparser cudart ${OpenCV_LIBS}) 72 | 73 | add_executable(detect samples/detect.cpp ) 74 | target_link_libraries(detect yolo cudart ${OpenCV_LIBS} ) 75 | 76 | add_executable(video_detect samples/video_detect.cpp ) 77 | target_link_libraries(video_detect yolo cudart ${OpenCV_LIBS} ) 78 | 79 | add_executable(validate_coco samples/validate_coco.cpp ) 80 | target_link_libraries(validate_coco yolo cudart ${OpenCV_LIBS} ) 81 | target_link_libraries(validate_coco jsoncpp) 82 | 
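
`find_package(OpenCV REQUIRED)` and `find_package(jsoncpp CONFIG REQUIRED)` above resolve against the default system locations. The README that follows shows the plain out-of-source build; if OpenCV was built from source into a custom prefix, a configure step along these lines (the prefix path is illustrative) points CMake at it:

```bash
mkdir -p build && cd build
# OpenCV_DIR must point at the directory containing OpenCVConfig.cmake
cmake .. -DOpenCV_DIR=/opt/opencv/lib/cmake/opencv4
make -j$(nproc)
```
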
-------------------------------------------------------------------------------- /tensorrt_yolov7/README.md: --------------------------------------------------------------------------------
1 | # YOLOv7 TensorRT C++
2 | 
3 | ## Description
4 | This is a YOLOv7 TensorRT C++ app. First, use trtexec to convert the ONNX model to an FP32 or FP16 TensorRT engine, or to an INT8 TensorRT engine from the QAT model fine-tuned with [yolov7_qat](../yolov7_qat).
5 | Then you can use the `detect`/`video_detect` apps to run detection on a list of images (the number of images must be smaller than the batch size of the model) or on a video, or use the `validate_coco` app to measure the mAP of the TensorRT engine.
6 | ## Prerequisites
7 | #### Install OpenCV
8 | - Note: This program depends on OpenCV 4.
9 | Follow the README and documentation of this repository https://github.com/opencv/opencv to install OpenCV.
10 | If you want to use the `video_detect` app, please install OpenCV with `ffmpeg` enabled.
11 | 
12 | #### Install jsoncpp libs
13 | The jsoncpp library is used to write the COCO-dataset validation results to a JSON file.
14 | ```bash
15 | $ sudo apt-get install libjsoncpp-dev
16 | ```
17 | ## Build and Run the yolov7-TensorRT app
18 | ### Build
19 | ```bash
20 | $ mkdir build && cd build
21 | $ cmake ..
22 | $ make -j4
23 | ```
24 | 
25 | ### Prepare TensorRT engines
26 | 
27 | Convert the ONNX model to a TensorRT engine:
28 | ```bash
29 | # fp32 model
30 | $ /usr/src/tensorrt/bin/trtexec --onnx=yolov7.onnx --saveEngine=yolov7fp32.engine
31 | # fp16 model
32 | $ /usr/src/tensorrt/bin/trtexec --onnx=yolov7.onnx --saveEngine=yolov7fp16.engine --fp16
33 | # int8 QAT model, i.e. the onnx model with Q&DQ nodes
34 | $ /usr/src/tensorrt/bin/trtexec --onnx=yolov7qat.onnx --saveEngine=yolov7QAT.engine --fp16 --int8
35 | ```
36 | ### Detection & Validation
37 | - Detect on images:
38 | ```bash
39 | $ ./build/detect --engine=yolov7db4fp32.engine --img=./imgs/horses.jpg,./imgs/zidane.jpg
40 | ```
41 | - Detect on a video:
42 |   - note: only batch size = 1 is supported for now.
43 | ```bash
44 | $ ./build/video_detect --engine=./yolov7fp32.engine --video=YOUR_VIDEO_PATH.mp4
45 | ```
46 | - Validate mAP on the COCO dataset
47 |   - note: validate_coco only supports models with input size `[batchsize, 3, 672, 672]`
48 | ```bash
49 | $ ./build/validate_coco --engine=./yolov7fp32.engine --coco=/YOUR/COCO/DATA/PATH/
50 | --------------------------------------------------------
51 | Yolov7 initialized from: yolov7672.engine
52 | input : images , shape : [ 1,3,672,672,]
53 | output : output , shape : [ 1,27783,85,]
54 | --------------------------------------------------------
55 | 5000 / 5000
56 | predict result has been written to ./predict.json
57 | 
58 | $ python test_coco_map.py --predict ./predict.json --coco /YOUR/COCO/DATA/PATH/
59 | ...
60 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.51005
61 | ...
62 | ``` 63 | -------------------------------------------------------------------------------- /tensorrt_yolov7/imgs/horses.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-AI-IOT/yolo_deepstream/e9b75770ea58713d1cb3902d67c36e11acb888d7/tensorrt_yolov7/imgs/horses.jpg -------------------------------------------------------------------------------- /tensorrt_yolov7/imgs/zidane.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-AI-IOT/yolo_deepstream/e9b75770ea58713d1cb3902d67c36e11acb888d7/tensorrt_yolov7/imgs/zidane.jpg -------------------------------------------------------------------------------- /tensorrt_yolov7/samples/detect.cpp: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | * SPDX-License-Identifier: MIT 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a 7 | * copy of this software and associated documentation files (the "Software"), 8 | * to deal in the Software without restriction, including without limitation 9 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | * and/or sell copies of the Software, and to permit persons to whom the 11 | * Software is furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | * DEALINGS IN THE SOFTWARE. 23 | */ 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | 36 | std::vector parse_img_paths(argsParser& cmdLine) { 37 | return cmdLine.ParseStringList("img"); 38 | } 39 | 40 | std::string parse_model_path(argsParser& cmdLine) { 41 | const char* engine_path_str = cmdLine.ParseString("engine"); 42 | std::string engine_path; 43 | if (engine_path_str) engine_path = std::string(engine_path_str); 44 | return engine_path; 45 | } 46 | 47 | bool print_help() { 48 | printf("--------------------------------------------------------------------------------------------------------\n"); 49 | printf("---------------------------- yolov7 images detector ---------------------------------------------\n"); 50 | printf(" '--help': print help information \n"); 51 | printf(" '--engine=yolov7.engine' Load yolov7 trt-engine \n"); 52 | printf(" '--img=img1,jpg,img2.jpg,img3.jpg' specify the path of the images, split by `,`\n"); 53 | return true; 54 | } 55 | 56 | 57 | int main(int argc, char** argv){ 58 | 59 | argsParser cmdLine(argc, argv); 60 | //! 
parse device_flag, see parse_device_flag 61 | if(cmdLine.ParseFlag("help")) { print_help(); return 0; } 62 | 63 | std::string engine_path = parse_model_path(cmdLine); 64 | std::vector img_paths = parse_img_paths(cmdLine); 65 | // print img paths 66 | std::cout<<"input "< bgr_imgs; 75 | for(int i = 0; i< img_paths.size();i++){ 76 | bgr_imgs.push_back(cv::imread(img_paths[i])); 77 | } 78 | 79 | std::cout<<"preprocess start"<>> nmsresults = yolov7.PostProcess(); 90 | 91 | for(int j =0; j < nmsresults.size();j++){ 92 | Yolov7::DrawBoxesonGraph(bgr_imgs[j],nmsresults[j]); 93 | std::string output_path = img_paths[j] + "detect" + std::to_string(j)+".jpg"; 94 | cv::imwrite(output_path, bgr_imgs[j]); 95 | std::cout<<"detectec image written to: "< 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | std::string parse_model_path(argsParser& cmdLine) { 38 | const char* engine_path_str = cmdLine.ParseString("engine"); 39 | std::string engine_path; 40 | if (engine_path_str) engine_path = std::string(engine_path_str); 41 | return engine_path; 42 | } 43 | 44 | std::string parse_coco_path(argsParser& cmdLine) { 45 | const char* coco_path_str = cmdLine.ParseString("coco"); 46 | std::string coco_path; 47 | if (coco_path_str) coco_path = std::string(coco_path_str); 48 | return coco_path; 49 | } 50 | 51 | bool print_help() { 52 | printf("--------------------------------------------------------------------------------------------------------\n"); 53 | printf("---------------------------- yolov7 coco validate tool ---------------------------------------------\n"); 54 | printf(" '--help': print help information \n"); 55 | printf(" '--engine=yolov7.engine' Load yolov7 trt-engine \n"); 56 | printf(" '--coco=./data/coco/' specify the path of the coco dataset\n"); 57 | return true; 58 | } 59 | 60 | int coco80_to_coco91_class(int id) { 61 | //# converts 80-index (val2014) to 91-index (paper) 62 | // # https://tech.amikelive.com/node-718/what-object-categories-labels-are-in-coco-dataset/ 63 | // # a = np.loadtxt('data/coco.names', dtype='str', delimiter='\n') 64 | // # b = np.loadtxt('data/coco_paper.names', dtype='str', delimiter='\n') 65 | // # x1 = [list(a[i] == b).index(True) + 1 for i in range(80)] # darknet to coco 66 | // # x2 = [list(b[i] == a).index(True) if any(b[i] == a) else None for i in range(91)] # coco to darknet 67 | std::vector x = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 68 | 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 69 | 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90}; 70 | return x[id]; 71 | } 72 | std::vector xyxy2xywh(float x0, float x1, float x2, float x3){ 73 | // # Convert nx4 boxes from [x1, y1, x2, y2] to [x, y, w, h] where xy1=top-left, xy2=bottom-right 74 | // y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x) 75 | std::vector y; 76 | y.resize(4); 77 | y[0] = (x0 + x2) / 2;// # x center 78 | y[1] = (x1 + x3) / 2;// # y center 79 | y[2] = x2 - x0;// # width 80 | y[3] = x3 - x1;// # height 81 | y[0] -= y[2]/2; 82 | y[1] -= y[3]/2; 83 | // box[:, :2] -= box[:, 2:] / 2 84 | 85 | return y; 86 | } 87 | 88 | int number_classes = 80; 89 | 90 | std::vector readCocoPaths(std::string coco_file_path) { 91 | std::vector result; 92 | std::ifstream coco_test_file(coco_file_path); 93 | std::string line; 94 | 
std::string folder_path = coco_file_path.substr(0, coco_file_path.find_last_of("/")+1); 95 | if(coco_test_file) { 96 | while(getline(coco_test_file, line)){ 97 | 98 | result.push_back(folder_path+line); 99 | // std::cout<<"folder_path+line:"< bgr_imgs; 121 | std::vector imgPathList = readCocoPaths(coco_path);; 122 | std::vector>> batchNmsResult; 123 | int maxBatchsize = yolov7.getInputDim().d[0]; 124 | 125 | 126 | Json::Value root; 127 | Json::FastWriter writer; 128 | 129 | for(int i = 0 ; i < imgPathList.size(); ){ 130 | //infer with a batch 131 | for(int j = 0; j < maxBatchsize && i nchwMats = yolov7.preProcess4Validate(bgr_imgs); 137 | 138 | printf("\r%d / %d", i, imgPathList.size()); 139 | fflush(stdout); 140 | 141 | yolov7.infer(); 142 | 143 | batchNmsResult = yolov7.PostProcess(0.65, 0.001); 144 | 145 | for(int j = 0; j< batchNmsResult.size();j++){ 146 | int imgth = i - batchNmsResult.size() + j; 147 | // processing the name. eg: ./images/train2017/000000000250.jpg will be processed as 250 148 | int image_id = stoi(imgPathList[imgth].substr(imgPathList[imgth].length()-16, imgPathList[imgth].find_last_of(".")-(imgPathList[imgth].length()-16))); 149 | for(int k = 0; k point = xyxy2xywh(batchNmsResult[j][k][0],batchNmsResult[j][k][1],batchNmsResult[j][k][2],batchNmsResult[j][k][3]); 157 | bboxObj.append(point[0]); 158 | bboxObj.append(point[1]); 159 | bboxObj.append(point[2]); 160 | bboxObj.append(point[3]); 161 | OneResult["bbox"] = bboxObj; 162 | root.append(OneResult); 163 | } 164 | } 165 | bgr_imgs.clear(); 166 | } 167 | 168 | std::string json_file = writer.write(root); 169 | std::ofstream out("./predict.json"); 170 | out << json_file; 171 | std::cout< 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | 36 | std::string parse_video_path(argsParser& cmdLine) { 37 | const char* video_path_str = cmdLine.ParseString("video"); 38 | std::string video_path; 39 | if (video_path_str) video_path = std::string(video_path_str); 40 | return video_path; 41 | } 42 | 43 | std::string parse_model_path(argsParser& cmdLine) { 44 | const char* engine_path_str = cmdLine.ParseString("engine"); 45 | std::string engine_path; 46 | if (engine_path_str) engine_path = std::string(engine_path_str); 47 | return engine_path; 48 | } 49 | 50 | bool print_help() { 51 | printf("--------------------------------------------------------------------------------------------------------\n"); 52 | printf("---------------------------- yolov7 images detector ---------------------------------------------\n"); 53 | printf(" '--help': print help information \n"); 54 | printf(" '--engine=yolov7.engine' Load yolov7 trt-engine \n"); 55 | printf(" '--video=video.mp4' specify the path of the video \n"); 56 | return true; 57 | } 58 | 59 | 60 | int main(int argc, char** argv){ 61 | 62 | argsParser cmdLine(argc, argv); 63 | //! parse device_flag, see parse_device_flag 64 | if(cmdLine.ParseFlag("help")) { print_help(); return 0; } 65 | 66 | std::string engine_path = parse_model_path(cmdLine); 67 | std::string video_path = parse_video_path(cmdLine); 68 | 69 | Yolov7 yolov7(engine_path); 70 | 71 | cv::VideoCapture capture; 72 | cv::Mat frame; 73 | frame= capture.open(video_path); 74 | if(!capture.isOpened()) 75 | { 76 | printf("can not open ... 
please check whether your opencv has installed with ffmpeg..\n"); 77 | return -1; 78 | } 79 | cv::Size size = cv::Size(capture.get(cv::CAP_PROP_FRAME_WIDTH), capture.get(cv::CAP_PROP_FRAME_HEIGHT)); 80 | cv::VideoWriter writer; 81 | writer.open(std::string(video_path+".detect.mp4"), cv::VideoWriter::fourcc('M', 'J', 'P', 'G'), 10, size, true); 82 | std::vector framev; 83 | std::vector>> nmsresults; 84 | int total_frame_count = capture.get(cv::CAP_PROP_FRAME_COUNT); 85 | int i = 0; 86 | while (capture.read(frame)){ 87 | framev.push_back(frame); 88 | yolov7.preProcess(framev); 89 | yolov7.infer(); 90 | nmsresults = yolov7.PostProcess(); 91 | Yolov7::DrawBoxesonGraph(frame,nmsresults[0]); 92 | writer.write(frame); 93 | framev.clear(); 94 | i++; 95 | printf("\r%d / %d", i, total_frame_count); 96 | fflush(stdout); 97 | } 98 | capture.release(); 99 | std::cout<<"Done..."< 32 | #include 33 | #include 34 | #include 35 | #include "NvInfer.h" 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | //opencv for preprocessing & postprocessing 43 | #include 44 | #include 45 | #include 46 | #include 47 | 48 | class Yolov7 { 49 | public: 50 | //! 51 | //! \brief init Yolov7 class object 52 | //! 53 | //! \param engine_path The path of trt engine file 54 | //! 55 | Yolov7(std::string engine_path); 56 | 57 | //! 58 | //! \brief preprocess a list of image, the image will remembered inside the class by Yolov7 object 59 | //! 60 | //! \param cv_img input images with BGR-UInt8, the size of the vector must smmaller than the maxBatchsize of the model 61 | //! 62 | std::vector preProcess(std::vector &cv_img);// 63 | 64 | //! 65 | //! \brief run tensorRT inference with the data preProcessed 66 | //! 67 | int infer(); 68 | 69 | //! 70 | //! \brief PostProcess, will decode and nms the batch inference result of yolov7 71 | //! 72 | //! \param cv_img 73 | //! \return return all the nms result of Yolov7 74 | //! 75 | std::vector>> PostProcess(float iou_thres = 0.45f, float conf_thres = 0.25f); 76 | 77 | //! 78 | //! \brief Get the input dimenssion of the model 79 | //! 80 | //! \return return Dims of input 81 | //! 82 | nvinfer1::Dims getInputDim(); 83 | 84 | //! 85 | //! \brief Get the output dimenssion of the model 86 | //! 87 | //! \return return the Dims of output 88 | //! 89 | nvinfer1::Dims getOutputDim(); 90 | 91 | //! 92 | //! \brief Draw boxes on bgr image 93 | //! \param bgr_img The images need to be drawed with boxes 94 | //! \param nmsresult nms result get from PostProcess function 95 | //! 96 | static int Yolov7::DrawBoxesonGraph(cv::Mat &bgr_img, std::vector> nmsresult); 97 | 98 | //! 99 | //! \brief preprocess a list of image for validate mAP on coco dataset! the model must have a [batchsize, 3, 672, 672] input 100 | //! 101 | //! \param cv_img input images with BGR-UInt8, the size of the vector must smmaller than the maxBatchsize of the model 102 | //! 103 | std::vector preProcess4Validate(std::vector &cv_img); 104 | 105 | //! 106 | //! \brief PostProcess for validate mAP on coco dataset!, will decode the batch inference result of yolov7 107 | //! 108 | //! \param cv_img 109 | //! \return return all the nms result of Yolov7 110 | //! 
111 | std::vector>> PostProcess4Validate(float iou_thres = 0.45f, float conf_thres = 0.25f); 112 | private: 113 | 114 | int pushImg(void *imgBuffer, int numImg, bool fromCPU = true); 115 | 116 | std::vector>> decode_yolov7_result(float conf_thres); 117 | std::vector>> yolov7_nms(std::vector>> &bboxes, float iou_thres); 118 | std::vector> nms(std::vector> &bboxes, float iou_thres); 119 | 120 | //TODO: to be imp 121 | void CudaGraphEndCapture(cudaStream_t stream); 122 | 123 | void CudaGraphBeginCapture(cudaStream_t stream); 124 | 125 | bool CudaGraphLaunch(cudaStream_t stream); 126 | 127 | bool enableCudaGraph(); 128 | 129 | void ReportArgs(); 130 | 131 | private: 132 | 133 | int mImgPushed; 134 | int mMaxBatchSize; 135 | bool mDynamicBatch; 136 | 137 | //stream and event 138 | std::unique_ptr mStream; 139 | std::unique_ptr mEvent; 140 | 141 | // trt objects 142 | std::unique_ptr> mRuntime; 143 | std::unique_ptr> mEngine; 144 | std::unique_ptr> mContext; 145 | std::vector>> mBindings; 146 | 147 | std::vector mBindingArray; 148 | std::vector mHostOutputBuffer; 149 | std::vector mHostNMSBuffer; 150 | 151 | std::string mEnginePath; 152 | nvinfer1::Dims mInputDim; //maxB,3,640,640 153 | nvinfer1::Dims mOutputDim; 154 | int mImgBufferSize;//sizeof(float)x3x640x640 155 | 156 | //cuda graph objects 157 | cudaGraph_t mGraph{}; 158 | cudaGraphExec_t mGraphExec{}; 159 | 160 | std::vector> md2i; 161 | 162 | bool mCudaGraphEnabled; 163 | 164 | //TODOs 165 | //! 166 | //! get how many imgs has been totally processed 167 | //! 168 | // caculate fps real time 169 | unsigned long long mLast_inference_time; 170 | unsigned long long mTotal_inference_time; 171 | int mInference_count; 172 | public: 173 | int imgProcessed() { return mInference_count; }; 174 | }; 175 | -------------------------------------------------------------------------------- /tensorrt_yolov7/src/argsParser.cpp: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | * SPDX-License-Identifier: MIT 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a 7 | * copy of this software and associated documentation files (the "Software"), 8 | * to deal in the Software without restriction, including without limitation 9 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | * and/or sell copies of the Software, and to permit persons to whom the 11 | * Software is furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | * DEALINGS IN THE SOFTWARE. 
23 | */ 24 | 25 | 26 | #include "argsParser.h" 27 | 28 | // constructor 29 | argsParser::argsParser(const int pArgc, char** pArgv) { 30 | argc = pArgc; 31 | argv = pArgv; 32 | } 33 | // ParseFlag 34 | bool argsParser::ParseFlag(std::string string_ref) const { 35 | if (argc < 1) return false; 36 | 37 | for (int i = 0; i < argc; i++) { 38 | const int string_start = std::string(argv[i]).find_last_of('-') + 1; 39 | if (string_start == 0) continue; 40 | 41 | const char* string_argv = &argv[i][string_start]; 42 | 43 | const char* equal_pos = strchr(string_argv, '='); 44 | 45 | const int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv); 46 | const int length = (int)(string_ref.size()); 47 | 48 | if (length == argv_length && !strncasecmp(string_argv, string_ref.c_str(), length)) return true; 49 | } 50 | return false; 51 | } 52 | 53 | // ParseString 54 | const char* argsParser::ParseString(std::string string_ref) const { 55 | if (argc < 1) return NULL; 56 | 57 | for (int i = 0; i < argc; i++) { 58 | const int string_start = std::string(argv[i]).find_last_of('-') + 1; 59 | 60 | if (string_start == 0) continue; 61 | 62 | char* string_argv = (char*)&argv[i][string_start]; 63 | const int length = (int)(string_ref.size()); 64 | 65 | if (!strncasecmp(string_argv, string_ref.c_str(), length)) return (string_argv + length + 1); 66 | //*string_retval = &string_argv[length+1]; 67 | } 68 | return NULL; 69 | } 70 | 71 | 72 | // ParseStringList eg. img1,img2,img3 73 | std::vector argsParser::ParseStringList(std::string argName, const char delimiter) const{ 74 | const char* ListStr = ParseString(argName); 75 | std::vector result; 76 | if (ListStr == NULL) return result; 77 | int string_start = 0; 78 | int string_end = 0; 79 | 80 | int strLen = (int)strlen(ListStr); 81 | while(string_end < strLen){ 82 | while (delimiter != ListStr[string_end] && string_end < strLen) string_end++; 83 | result.push_back(std::string(ListStr).substr(string_start,string_end-string_start)); 84 | string_end++; 85 | string_start = string_end; 86 | } 87 | return result; 88 | } 89 | -------------------------------------------------------------------------------- /tensorrt_yolov7/src/argsParser.h: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | * SPDX-License-Identifier: MIT 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a 7 | * copy of this software and associated documentation files (the "Software"), 8 | * to deal in the Software without restriction, including without limitation 9 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | * and/or sell copies of the Software, and to permit persons to whom the 11 | * Software is furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL 19 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | * DEALINGS IN THE SOFTWARE. 23 | */ 24 | 25 | 26 | #ifndef __COMMAND_LINE_H_ 27 | #define __COMMAND_LINE_H_ 28 | 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | #include 35 | #include 36 | 37 | /** 38 | * args line parser 39 | */ 40 | class argsParser { 41 | public: 42 | argsParser(const int argc, char** argv); 43 | 44 | /** 45 | * Parse Flag 46 | */ 47 | bool ParseFlag(const std::string argName) const; 48 | 49 | /** 50 | * Parse String 51 | */ 52 | const char* ParseString(const std::string argName) const; 53 | // const char* ParseString2(const std::string argName, const char* defaultValue = NULL, bool allowOtherDelimiters = true) const; 54 | 55 | /** 56 | * Parse String list delimited by "," 57 | */ 58 | std::vector ParseStringList(std::string argName, const char delimiter = ',') const; 59 | 60 | /** 61 | * The argument count that the object was created with from main() 62 | */ 63 | int argc; 64 | char** argv; 65 | }; 66 | 67 | #endif 68 | -------------------------------------------------------------------------------- /tensorrt_yolov7/src/tools.h: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | * SPDX-License-Identifier: MIT 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a 7 | * copy of this software and associated documentation files (the "Software"), 8 | * to deal in the Software without restriction, including without limitation 9 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | * and/or sell copies of the Software, and to permit persons to whom the 11 | * Software is furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | * DEALINGS IN THE SOFTWARE. 
23 | */ 24 | 25 | #ifndef __TOOLS_H__ 26 | #define __TOOLS_H__ 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include "NvInfer.h" 33 | 34 | void checkCudaErrors(cudaError_t err) { 35 | if (err != cudaSuccess) throw std::runtime_error(cudaGetErrorName(err)); 36 | } 37 | 38 | // Logger for TensorRT info/warning/errors 39 | class Logger : public nvinfer1::ILogger { 40 | public: 41 | Logger(Severity severity = Severity::kWARNING) : reportableSeverity(severity) {} 42 | 43 | void log(Severity severity, const char* msg) noexcept override { 44 | // suppress messages with severity enum value greater than the reportable 45 | if (severity > reportableSeverity) return; 46 | 47 | switch (severity) { 48 | case Severity::kINTERNAL_ERROR: 49 | std::cerr << "INTERNAL_ERROR: "; 50 | break; 51 | case Severity::kERROR: 52 | std::cerr << "ERROR: "; 53 | break; 54 | case Severity::kWARNING: 55 | std::cerr << "WARNING: "; 56 | break; 57 | case Severity::kINFO: 58 | std::cerr << "INFO: "; 59 | break; 60 | default: 61 | std::cerr << "UNKNOWN: "; 62 | break; 63 | } 64 | std::cerr << msg << std::endl; 65 | } 66 | 67 | Severity reportableSeverity; 68 | }; 69 | template 70 | struct TrtDeleter { 71 | void operator()(T* p) noexcept { 72 | if (p != nullptr) delete p; 73 | } 74 | }; 75 | 76 | template 77 | struct CuMemDeleter { 78 | void operator()(T* p) noexcept { checkCudaErrors(cudaFree(p)); } 79 | }; 80 | 81 | template 82 | std::unique_ptr> mallocCudaMem(size_t nbElems) { 83 | T* ptr = nullptr; 84 | checkCudaErrors(cudaMalloc((void**)&ptr, sizeof(T) * nbElems)); 85 | return std::unique_ptr>{ptr}; 86 | } 87 | 88 | struct EventDeleter { 89 | void operator()(CUevent_st* event) noexcept { checkCudaErrors(cudaEventDestroy(event)); } 90 | }; 91 | struct StreamDeleter { 92 | void operator()(CUstream_st* stream) noexcept { checkCudaErrors(cudaStreamDestroy(stream)); } 93 | }; 94 | 95 | std::unique_ptr makeCudaEvent(int flags) { 96 | cudaEvent_t event; 97 | checkCudaErrors(cudaEventCreateWithFlags(&event, flags)); 98 | return std::unique_ptr{event}; 99 | } 100 | 101 | std::unique_ptr makeCudaStream(int flags, int priority) { 102 | cudaStream_t stream; 103 | checkCudaErrors(cudaStreamCreateWithPriority(&stream, flags, priority)); 104 | return std::unique_ptr{stream}; 105 | } 106 | 107 | 108 | 109 | #endif 110 | -------------------------------------------------------------------------------- /tensorrt_yolov7/test_coco_map.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a 6 | # copy of this software and associated documentation files (the "Software"), 7 | # to deal in the Software without restriction, including without limitation 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | # and/or sell copies of the Software, and to permit persons to whom the 10 | # Software is furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | # DEALINGS IN THE SOFTWARE. 22 | ################################################################################ 23 | import json 24 | import os 25 | import argparse 26 | 27 | if __name__ == '__main__': 28 | parser = argparse.ArgumentParser(prog='test.py') 29 | parser.add_argument('--predict', type=str, default='./predict.json', help='model.pt path(s)') 30 | parser.add_argument('--coco', type=str, default='./coco/', help='*.data path') 31 | opt = parser.parse_args() 32 | print('\nEvaluating pycocotools mAP... saving %s...' % opt.predict) 33 | try: # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb 34 | from pycocotools.coco import COCO 35 | from pycocotools.cocoeval import COCOeval 36 | anno = COCO(opt.coco+"/annotations/instances_val2017.json") # init annotations api 37 | pred = anno.loadRes(opt.predict) # init predictions api 38 | eval = COCOeval(anno, pred, 'bbox') 39 | # if is_coco: 40 | # eval.params.imgIds = [int(Path(x).stem) for x in dataloader.dataset.img_files] # image IDs to evaluate 41 | eval.evaluate() 42 | eval.accumulate() 43 | eval.summarize() 44 | map, map50 = eval.stats[:2] # update results (mAP@0.5:0.95, mAP@0.5) 45 | except Exception as e: 46 | print(f'pycocotools unable to run: {e}') 47 | -------------------------------------------------------------------------------- /yolov7_qat/README.md: -------------------------------------------------------------------------------- 1 | # YoloV7 Quantization Aware Training 2 | ## Description 3 | We use [TensorRT's pytorch quntization tool](https://github.com/NVIDIA/TensorRT/tree/main/tools/pytorch-quantization) to finetune training QAT yolov7 from the pre-trained weight, then export the model to onnx and deploy it with TensorRT. The accuray and performance can be found in below table. 4 | 5 | | Method | Calibration method | mAPval
0.5 | mAPval 0.5:0.95 | batch-1 fps Jetson Orin-X | batch-16 fps
Jetson Orin-X |weight| 6 | | ---- | ---- |---- |---- |----|----|-| 7 | | pytorch FP16 | - | 0.6972 | 0.5120 |-|-|[yolov7.pt](https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7.pt)| 8 | | pytorch PTQ-INT8 | Histogram(MSE) | 0.6957 | 0.5100 |-|-|[yolov7_ptq.pt](https://nvidia.box.com/shared/static/j0rclm9k2ymj6ahdx55dxnnskzq91flh) [yolov7_ptq_640.onnx](https://nvidia.box.com/shared/static/rlv3buq7sei2log2d3beyg1jhjyw59hn)| 9 | | pytorch QAT-INT8 | Histogram(MSE) | 0.6961 | 0.5111 |-|-|[yolov7_qat.pt](https://nvidia.box.com/shared/static/vph9af9rbe7ed7ibfnajsk248mw9nq9f)| 10 | | TensorRT FP16| - | 0.6973 | 0.5124 |140 |168|[yolov7.onnx](https://nvidia.box.com/shared/static/rmh8rttesg4cgrysb2qm12udpvd95as1) | 11 | | TensorRT PTQ-INT8 | TensorRT built in EntropyCalibratorV2 | 0.6317 | 0.4573 |207|264|-| 12 | | TensorRT QAT-INT8 | Histogram(MSE) | 0.6962 | 0.5113 |207|266|[yolov7_qat_640.onnx](https://nvidia.box.com/shared/static/v1ze885p35hfjl96xtw8s0xbcpv64tfr)| 13 | - network input resolution: 3x640x640 14 | - note: trtexec cudaGraph is enabled 15 | 16 | ## How To QAT Training 17 | ### 1.Setup 18 | 19 | Suggest to use docker environment. 20 | ```bash 21 | $ docker pull nvcr.io/nvidia/pytorch:22.09-py3 22 | ``` 23 | 24 | 1. Clone and apply patch 25 | ```bash 26 | # use this YoloV7 as a sample base 27 | git clone https://github.com/WongKinYiu/yolov7.git 28 | cp -r yolov_deepstream/yolov7_qat/* yolov7/ 29 | ``` 30 | 31 | 2. Install dependencies 32 | ```bash 33 | $ pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com 34 | ``` 35 | 36 | 3. Download dataset and pretrained model 37 | ```bash 38 | $ bash scripts/get_coco.sh 39 | $ wget https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7.pt 40 | ``` 41 | 42 | ### 2. Start QAT training 43 | ```bash 44 | $ python scripts/qat.py quantize yolov7.pt --ptq=ptq.pt --qat=qat.pt --eval-ptq --eval-origin 45 | ``` 46 | This script includes steps below: 47 | - Insert Q&DQ nodes to get fake-quant pytorch model
48 | The [PyTorch quantization tool](https://github.com/NVIDIA/TensorRT/tree/main/tools/pytorch-quantization) can insert Q&DQ nodes automatically. For the yolov7 model, however, automatic insertion does not reach PTQ-level performance: in explicit-quantization (QAT) mode, TensorRT strictly follows the placement of the Q/DQ nodes when deciding each layer's precision, and some of the automatically added Q&DQ nodes cannot be fused with other layers, which introduces extra, useless precision conversions. Our script therefore analyzes and configures the Q&DQ nodes for yolov7 in a rule-based manner, so that their placement is optimal for TensorRT and all nodes run in INT8 (confirmed with [trt-engine-explorer](https://github.com/NVIDIA/TensorRT/tree/main/tools/experimental/trt-engine-explorer), see [scripts/draw-engine.py](./scripts/draw-engine.py)). For the details of these rules, please refer to [quantization/rules.py](./quantization/rules.py); for guidance on Q&DQ insertion, please refer to [Guidance_of_QAT_performance_optimization](./doc/Guidance_of_QAT_performance_optimization.md) 49 | 50 | - PTQ calibration
51 | After inserting the Q&DQ nodes, we recommend running PTQ calibration first. In our experiments, `Histogram(MSE)` is the best PTQ calibration method for yolov7. 52 | Note: if you are satisfied with the PTQ result, you can also skip QAT. 53 | 54 | - QAT training
55 | After QAT, need to finetune traning our model. after getting the accuracy we are satisfied, Saving the weights to files 56 | 57 | ### 3. Export onnx 58 | ```bash 59 | $ python scripts/qat.py export qat.pt --size=640 --save=qat.onnx --dynamic 60 | ``` 61 | 62 | ### 4. Evaluate model accuracy on coco 63 | ```bash 64 | $ bash scripts/eval-trt.sh qat.pt 65 | ``` 66 | 67 | ### 5. Benchmark 68 | ```bash 69 | $ /usr/src/tensorrt/bin/trtexec --onnx=qat.onnx --int8 --fp16 --workspace=1024000 --minShapes=images:4x3x640x640 --optShapes=images:4x3x640x640 --maxShapes=images:4x3x640x640 70 | ``` 71 | 72 | 73 | ## Quantization Yolov7-Tiny 74 | ```bash 75 | $ python scripts/qat.py quantize yolov7-tiny.pt --qat=qat.pt --ptq=ptq.pt --ignore-policy="model\.77\.m\.(.*)|model\.0\.(.*)" --supervision-stride=1 --eval-ptq --eval-origin 76 | ``` 77 | 78 | ## Note 79 | - For YoloV5, please use the script `scripts/qat-yolov5.py`. This adds QAT support for `Add operator`, making it more performant. 80 | - Please refer to the `quantize.replace_bottleneck_forward` function to handle the `Add operator`. 81 | -------------------------------------------------------------------------------- /yolov7_qat/doc/Guidance_of_QAT_performance_optimization.md: -------------------------------------------------------------------------------- 1 | 2 | # Get QAT models' best performance on TensorRT 3 | 4 | ## 1. Description 5 | This guidance will show how to get the best performance QAT model on yolov7. 6 | 7 | There are two workflows for quantizing networks in TensorRT, one is Post-training quantization (PTQ).(ref:[tensorrt-developer-guide/intro-quantization](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#intro-quantization)). The other is QAT.(ref:[tensorrt-developer-guide/work-with-qat-networks](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#work-with-qat-networks). In PTQ mode, TensorRT will have the best performance, as it always choose the best layer fusion tactics and fastest kernels to make the global optimal network enqueue graph. 8 | In QAT modes, the enqueue graph is designed by user. Which depends on the QDQ placement, The accuracy conversion and layer fusion strategies in the network are selected strictly according to the QDQ placement.(About the Q&DQ processing of TensorRT, please refer :[TensorRT-developer-guide: Processing of Q/DQ Networks](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#tensorrt-process-qdq)). That is, If we want to get the best performance of QAT, The Q&DQ nodes must make sure: 9 | 1. All the computationally intensive layers will run with INT8. 10 | 2. Q&DQ can not break down the layer fusion of QAT model. 11 | 3. Do not have unnecessary data conversion between INT8 and FLOAT 12 | 13 | One effective way to get best performance of QAT is comparing the enqueue graph of QAT-TensorRT model with PTQ, and ensure they are the same. 14 | 15 | ## 2. Workflow 16 | Our solution is: verbosing the QAT-Graph and compare with the PTQ-Graph. And back to fineTune the Q&DQ nodes placement. The procedure can be summaried as below. 17 | 1. Insert QDQ in the model and export it to onnx 18 | 2. Convert PTQ-Onnx and QAT-onnx to TensorRT model and draw the TensorRT-model-graph 19 | 3. Compare the TensorRT-enqueue-Graph and performance between QAT and PTQ 20 | 4. If the QAT Graph is different from PTQ Graph and the performance also wrose. modify the QDQ placement. Back to Step 1. Else, to Step 5 21 | 5. 
Run PTQ benchmark and QAT benchmark to verify 22 | 23 | QATFlow 24 | 25 | For the layer-fusion rules: We can refer: [TensorRT-developer-guide: Types of Fusions](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#fusion-types) 26 | For the tools for verbosing the TensorRT-model graph:[github-TensorRT: trt-engine-explorer](https://github.com/NVIDIA/TensorRT/tree/main/tools/experimental/trt-engine-explorer)(ref: [blog:exploring-tensorrt-engines-with-trex](https://developer.nvidia.com/blog/exploring-tensorrt-engines-with-trex/)) 27 | 28 | 29 | ## 3. Step by step guidance of QAT optimization on yolov7 30 | 31 | Now we will step by step optimizing a QAT model performance, We only care about the performance rather than accuracy at this time as we had not starting finetune the accuracy with training. 32 | we use pytorch-quantization tool [pytorch-quantization](https://github.com/NVIDIA/TensorRT/blob/main/tools/pytorch-quantization) to quantize our pytorch model. And export onnx model with Q&DQ nodes. 33 | This package provides a number of quantized layer modules, which contain quantizers for inputs and weights. e.g. `quant_nn.QuantLinear`, which can be used in place of `nn.Linear. ` These quantized layers can be substituted automatically, via monkey-patching, or by manually modifying the model definition. 34 | Automatic layer substitution is done with `quant_modules`. This should be called before model creation.[ref: [pytorch-quantization-toolkit-tutorials](https://docs.nvidia.com/deeplearning/tensorrt/pytorch-quantization-toolkit/docs/tutorials/quant_resnet50.html#quantizing-resnet50)] 35 | 36 | ### 1) Insert QDQ to model with monkey-patch quantization 37 | 38 | with `quant_modules.initialize()` and `quant_modules.deactivate()`. The tool will automatic insert Q&DQ nodes in the network. 39 | 40 | ```python 41 | quant_modules.initialize() 42 | # Load PyTorch model 43 | device = select_device(opt.device) 44 | model = Model(opt.cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device) 45 | labels = model.names 46 | quant_modules.deactivate() 47 | ``` 48 | calibrate the onnx model to get the scale of Q&DQ nodes. 49 | ```python 50 | def calibrate_model(model, model_name, data_loader, num_calib_batch, calibrator,hist_percentile, out_dir, device): 51 | """ 52 | Feed data to the network and calibrate. 
53 | Arguments: 54 | model: classification model 55 | model_name: name to use when creating state files 56 | data_loader: calibration data set 57 | num_calib_batch: amount of calibration passes to perform 58 | calibrator: type of calibration to use (max/histogram) 59 | hist_percentile: percentiles to be used for historgram calibration 60 | out_dir: dir to save state files in 61 | """ 62 | if num_calib_batch > 0: 63 | print("Calibrating model") 64 | with torch.no_grad(): 65 | collect_stats(model, data_loader, num_calib_batch, device) 66 | if not calibrator == "histogram": 67 | compute_amax(model, method="max") 68 | calib_output = os.path.join( 69 | out_dir, 70 | F"{model_name}-max-{num_calib_batch*data_loader.batch_size}.pth") 71 | ckpt = {'model': deepcopy(model)} 72 | torch.save(ckpt, calib_output) 73 | else: 74 | for percentile in hist_percentile: 75 | print(F"{percentile} percentile calibration") 76 | compute_amax(model, method="percentile") 77 | calib_output = os.path.join( 78 | out_dir, 79 | F"{model_name}-percentile-{percentile}-{num_calib_batch*data_loader.batch_size}.pth") 80 | ckpt = {'model': deepcopy(model)} 81 | torch.save(ckpt, calib_output) 82 | for method in ["mse", "entropy"]: 83 | print(F"{method} calibration") 84 | compute_amax(model, method=method) 85 | calib_output = os.path.join( 86 | out_dir, 87 | F"{model_name}-{method}-{num_calib_batch*data_loader.batch_size}.pth") 88 | ckpt = {'model': deepcopy(model)} 89 | torch.save(ckpt, calib_output) 90 | ``` 91 | ### 2) export the calibrated-pytorch model to onnx 92 | ```python 93 | quant_nn.TensorQuantizer.use_fb_fake_quant = True 94 | torch.onnx.export(model, img, f, verbose=False, opset_version=13, input_names['images'], 95 | output_names=output_names, 96 | dynamic_axes=dynamic_axes) 97 | quant_nn.TensorQuantizer.use_fb_fake_quant = False 98 | ``` 99 | ***Now we got a onnx model with Q&DQ layers. TensorRT will process the onnx model with QDQ nodes as QAT models, With this way. Calibration is no longer needed as TensorRT will automatically performs INT8 quantization based on scales of Q and DQ nodes.*** 100 | 101 | TIPS: We calibrate the pytorch model with fake-quant, the exported onnx will have Q&DQ nodes. In the eye of pytorch, it is a ptq-model as we only did a calibration but no finetune training. But in the eye of TensorRT, as long as there are Q&DQ nodes inside the onnx, TensorRT will regard it as a QAT model. 102 | 103 | ### 3) Run TensorRT benchmark and export layers information to json 104 | we can export the TensorRT-engine-graph and profile information with flag `--exportLayerInfo=layer.json --profilingVerbosity=detailed --exportProfile=profile.json`. 105 | first we export fp32 onnx model 106 | ```bash 107 | $ python export.py --weights ./yolov7.pt --grid --simplify --topk-all 100 --iou-thres 0.65 --conf-thres 0.35 --img-size 640 640 108 | ``` 109 | Then we copy the onnx to target device, Here we use Jetson OrinX as our target device, TensorRT has different behavior on different GPUs. 
So the test must run on your final target device 110 | 111 | Run PTQ benchmark 112 | ```bash 113 | $ /usr/src/tensorrt/bin/trtexec --onnx=yolov7.onnx --fp16 --int8 --verbose --saveEngine=yolov7_ptq.engine --workspace=1024000 --warmUp=500 --duration=10 --useCudaGraph --useSpinWait --noDataTransfers --exportLayerInfo=yolov7_ptq_layer.json --profilingVerbosity=detailed --exportProfile=yolov7_ptq_profile.json 114 | ``` 115 | Run fp16 benchmark 116 | ```bash 117 | $ /usr/src/tensorrt/bin/trtexec --onnx=yolov7.onnx --fp16 --verbose --saveEngine=yolov7_fp16.engine --workspace=1024000 --warmUp=500 --duration=10 --useCudaGraph --useSpinWait --noDataTransfers --exportLayerInfo=yolov7_fp16_layer.json --profilingVerbosity=detailed --exportProfile=yolov7_fp16_profile.json 118 | ``` 119 | Run QAT benchmark 120 | ```bash 121 | $ /usr/src/tensorrt/bin/trtexec --onnx=yolov7_qat.onnx --fp16 --int8 --verbose --saveEngine=yolov7_qat.engine --workspace=1024000 --warmUp=500 --duration=10 --useCudaGraph --useSpinWait --noDataTransfers --exportLayerInfo=yolov7_qat_layer.json --profilingVerbosity=detailed --exportProfile=yolov7_qat_profile.json 122 | ``` 123 | 124 | Run QAT_mask detect benchmark 125 | ```bash 126 | $ /usr/src/tensorrt/bin/trtexec --onnx=yolov7_qat_maskdet.onnx --fp16 --int8 --verbose --saveEngine=yolov7_qat_maskdet.engine --workspace=1024000 --warmUp=500 --duration=10 --useCudaGraph --useSpinWait --noDataTransfers --exportLayerInfo=yolov7_qat_maskdet_layer.json --profilingVerbosity=detailed --exportProfile=yolov7_qat_maskdet_profile.json 127 | ``` 128 | 129 | We can get the fps from the log: 130 | The PTQ performance is : 131 | ```bash 132 | [I] Throughput: 206.562 qps 133 | ``` 134 | The fp16 performance is : 135 | ```bash 136 | [I] Throughput: 139.597 qps 137 | ``` 138 | The version 1 QAT performance is: 139 | ```bash 140 | [I] Throughput: 180.439 qps 141 | ``` 142 | That is not a good performance as we expect, Let's look insight the reason 143 | 144 | ### 4) Draw Engine graph 145 | 146 | we use TensorRT opensource tool: [trt-engine-explorer](https://github.com/NVIDIA/TensorRT/tree/main/tools/experimental/trt-engine-explorer) drawing the enqueue graph of TensorRT. This tool take the trtexec exported layer json information as input. 
147 | Use the below code to draw the TensorRT-Engine-graph.(edit from `trt-engine-explorer/utils/draw_engine.py`) 148 | 149 | ```python 150 | import graphviz 151 | from trex import * 152 | import argparse 153 | import shutil 154 | 155 | 156 | def draw_engine(engine_json_fname: str, engine_profile_fname: str): 157 | graphviz_is_installed = shutil.which("dot") is not None 158 | if not graphviz_is_installed: 159 | print("graphviz is required but it is not installed.\n") 160 | print("To install on Ubuntu:") 161 | print("sudo apt --yes install graphviz") 162 | exit() 163 | 164 | plan = EnginePlan(engine_json_fname, engine_profile_fname) 165 | formatter = layer_type_formatter 166 | display_regions = True 167 | expand_layer_details = False 168 | 169 | graph = to_dot(plan, formatter, 170 | display_regions=display_regions, 171 | expand_layer_details=expand_layer_details) 172 | render_dot(graph, engine_json_fname, 'svg') 173 | 174 | 175 | if __name__ == "__main__": 176 | parser = argparse.ArgumentParser() 177 | parser.add_argument('--layer', help="name of engine JSON file to draw") 178 | parser.add_argument('--profile', help="name of profile JSON file to draw") 179 | args = parser.parse_args() 180 | draw_engine(engine_json_fname=args.layer,engine_profile_fname=args.profile) 181 | ``` 182 | draw the graph: 183 | ```bash 184 | $ python draw_engine.py --layer yolov7_qat_layer.json --profile yolov7_qat_profile.json 185 | $ python draw_engine.py --layer yolov7_ptq_layer.json --profile yolov7_ptq_profile.json 186 | ``` 187 | we get `yolov7_qat_layer.json.svg` and `yolov7_ptq_layer.json.svg` 188 | 189 | Let's see the difference: 190 | 191 | monkey-patch-qat-conv-fp16-issue_ptqonnxmonkey-patch-qat-conv-fp16-issue_ptqmonkey-patch-qat-conv-fp16-issue_qatonnxmonkey-patch-qat-conv-fp16-issue_qatonnx 192 | 193 | -
pic1: the convolution layers before the first concat layer in the onnx model
194 | - pic2: pic1's TensorRT-graph
195 | - pic3: the qat-onnx model
196 | - pic4: pic3's TensorRT-graph
197 | - (click to see the full pictures)
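Besides inspecting the two SVGs visually, it can be convenient to diff the exported layer-info JSONs directly. The snippet below is a minimal sketch (not part of this repository) for summarizing layer precisions and counting reformat layers; it assumes the trtexec layer-info file is either a JSON list of layer dicts or a dict with a `Layers` list, and that each layer dict carries `LayerType` and `Outputs` entries with a `Format/Datatype` string. The exact key names vary between TensorRT versions, so adjust them to match your files.

```python
# precision_summary.py -- rough helper for comparing trtexec layer-info JSONs.
# Assumption (verify against your TensorRT version): layer dicts expose
# "LayerType" and "Outputs" entries with a "Format/Datatype" string.
import json
import sys
from collections import Counter

def load_layers(path):
    with open(path) as f:
        data = json.load(f)
    # some versions wrap the list in a dict under "Layers"
    return data["Layers"] if isinstance(data, dict) else data

def summarize(path):
    layers = [l for l in load_layers(path) if isinstance(l, dict)]
    reformats = [l for l in layers if "Reformat" in str(l.get("LayerType", ""))]
    dtypes = Counter()
    for layer in layers:
        for out in layer.get("Outputs", []):
            if isinstance(out, dict):
                dtypes[out.get("Format/Datatype", "unknown")] += 1
    print(f"{path}: {len(layers)} layers, {len(reformats)} reformat layers")
    for fmt, n in dtypes.most_common():
        print(f"  {fmt}: {n} output tensors")

if __name__ == "__main__":
    # e.g. python precision_summary.py yolov7_ptq_layer.json yolov7_qat_layer.json
    for json_path in sys.argv[1:]:
        summarize(json_path)
```
A QAT engine that matches PTQ should show roughly the same reformat-layer count and the same int8/fp16 tensor distribution as the PTQ engine.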
198 | 199 | ### 5) Gap analysis and QDQ placement optimization 200 | There are many useless int8->fp16 and fp16->int8 data conversions in our QAT model. The reason is that TensorRT enforces the Q/DQ rules so that inference stays numerically consistent with training (we do not see any fp32 tensors here because TensorRT assumes fp16 has the same accuracy as fp32). 201 | In other words, if we want to remove these useless data-format conversions, we must edit our QDQ nodes to match the fusion rules of TensorRT QAT. 202 | From the PTQ & QAT engine graphs we can observe that the concat layer is eliminated by TensorRT and all inputs and outputs of the concat are merged into one tensor (marked with red arrows in the picture below). If we do not guarantee that the scales of the Q&DQ nodes (marked with green circles in the picture below) on these tensors are the same, there will be redundant precision conversions in our graph. 203 | 204 | monkey-patch-qat-conv-fp16-issue_qatonnx_edit 205 | 206 | For every network structure like this, we need to apply the same restriction. There is also a special case to take care of: Q/DQ nodes can cross some layers according to the commute rules in [TensorRT-developer-guide:tensorrt-process-qdq](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#tensorrt-process-qdq), e.g. max-pooling. 207 | The DQ nodes marked with a red circle will cross the MaxPool layer, and TensorRT will then treat the crossed MaxPool layer as int8 precision. This is a similar situation to concat: we should restrict the scales of these Q&DQ nodes to be the same as the Q&DQ nodes in the green circle, to avoid generating useless data-format conversions here. 208 | 209 | monkey-patch-qat-maxpooling-qat.png 210 | 211 | ### 6) Optimized QAT model's performance 212 | Now we apply all the restrictions mentioned above and test the performance: 213 | 214 | We still use trtexec to benchmark the onnx model: 215 | ```bash 216 | $ /usr/src/tensorrt/bin/trtexec --onnx=yolov7_qat_maskdet.onnx --fp16 --int8 --verbose --saveEngine=yolov7_qat_optimized.engine --workspace=1024000 --warmUp=500 --duration=10 --useCudaGraph --useSpinWait --noDataTransfers --exportLayerInfo=yolov7_qat_optimized_layer.json --profilingVerbosity=detailed --exportProfile=yolov7_qat_optimized_profile.json 217 | [I] Throughput: 207.267 qps 218 | ``` 219 | This performance is almost the same as the PTQ performance. 220 | 221 | Next, we can finetune (train) the model to improve its accuracy.
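As a concrete illustration of the scale-sharing restriction described above, the sketch below shows one way the conv pairs reported by `quantization/rules.py` (`find_quantizer_pairs`) could be applied to a pytorch-quantization model: the paired layers simply reuse one `TensorQuantizer` object, so the exported Q/DQ nodes around the fused concat/maxpool carry identical scales. This is only a hypothetical sketch, assuming the names returned from the onnx graph line up with the torch module names; the repository's own quantization scripts are the reference implementation.

```python
# Hypothetical sketch: share one input quantizer between matched conv layers so
# their Q/DQ scales are identical after onnx export. Assumes the model was
# quantized with pytorch-quantization (whose QuantConv2d modules expose an
# `_input_quantizer`) and that find_quantizer_pairs names match torch modules.
from quantization.rules import find_quantizer_pairs

def apply_shared_scales(model, onnx_file):
    named = dict(model.named_modules())  # e.g. "model.11.conv" -> QuantConv2d
    for major_name, sub_name in find_quantizer_pairs(onnx_file):
        major, sub = named.get(major_name), named.get(sub_name)
        if major is None or sub is None:
            continue  # onnx/torch name mismatch; skip this pair
        # Reuse the same TensorQuantizer object: both layers now quantize their
        # inputs with one scale, so TensorRT sees matching Q/DQ parameters
        # around the fused concat/maxpool and does not insert reformat layers.
        sub._input_quantizer = major._input_quantizer
```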
222 | -------------------------------------------------------------------------------- /yolov7_qat/doc/imgs/QATConv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-AI-IOT/yolo_deepstream/e9b75770ea58713d1cb3902d67c36e11acb888d7/yolov7_qat/doc/imgs/QATConv.png -------------------------------------------------------------------------------- /yolov7_qat/doc/imgs/QATFlow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-AI-IOT/yolo_deepstream/e9b75770ea58713d1cb3902d67c36e11acb888d7/yolov7_qat/doc/imgs/QATFlow.png -------------------------------------------------------------------------------- /yolov7_qat/doc/imgs/int8_q_recommended_procedure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-AI-IOT/yolo_deepstream/e9b75770ea58713d1cb3902d67c36e11acb888d7/yolov7_qat/doc/imgs/int8_q_recommended_procedure.png -------------------------------------------------------------------------------- /yolov7_qat/doc/imgs/monkey-patch-qat-conv-fp16-issue_ptq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-AI-IOT/yolo_deepstream/e9b75770ea58713d1cb3902d67c36e11acb888d7/yolov7_qat/doc/imgs/monkey-patch-qat-conv-fp16-issue_ptq.png -------------------------------------------------------------------------------- /yolov7_qat/doc/imgs/monkey-patch-qat-conv-fp16-issue_ptqonnx.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-AI-IOT/yolo_deepstream/e9b75770ea58713d1cb3902d67c36e11acb888d7/yolov7_qat/doc/imgs/monkey-patch-qat-conv-fp16-issue_ptqonnx.png -------------------------------------------------------------------------------- /yolov7_qat/doc/imgs/monkey-patch-qat-conv-fp16-issue_qat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-AI-IOT/yolo_deepstream/e9b75770ea58713d1cb3902d67c36e11acb888d7/yolov7_qat/doc/imgs/monkey-patch-qat-conv-fp16-issue_qat.png -------------------------------------------------------------------------------- /yolov7_qat/doc/imgs/monkey-patch-qat-conv-fp16-issue_qatonnx.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-AI-IOT/yolo_deepstream/e9b75770ea58713d1cb3902d67c36e11acb888d7/yolov7_qat/doc/imgs/monkey-patch-qat-conv-fp16-issue_qatonnx.png -------------------------------------------------------------------------------- /yolov7_qat/doc/imgs/monkey-patch-qat-conv-fp16-issue_qatonnx_edit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-AI-IOT/yolo_deepstream/e9b75770ea58713d1cb3902d67c36e11acb888d7/yolov7_qat/doc/imgs/monkey-patch-qat-conv-fp16-issue_qatonnx_edit.png -------------------------------------------------------------------------------- /yolov7_qat/doc/imgs/monkey-patch-qat-maxpooling-qat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-AI-IOT/yolo_deepstream/e9b75770ea58713d1cb3902d67c36e11acb888d7/yolov7_qat/doc/imgs/monkey-patch-qat-maxpooling-qat.png -------------------------------------------------------------------------------- /yolov7_qat/quantization/rules.py: 
-------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a 6 | # copy of this software and associated documentation files (the "Software"), 7 | # to deal in the Software without restriction, including without limitation 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | # and/or sell copies of the Software, and to permit persons to whom the 10 | # Software is furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | # DEALINGS IN THE SOFTWARE. 22 | ################################################################################ 23 | import onnx 24 | 25 | def find_with_input_node(model, name): 26 | for node in model.graph.node: 27 | if len(node.input) > 0 and name in node.input: 28 | return node 29 | 30 | def find_all_with_input_node(model, name): 31 | all = [] 32 | for node in model.graph.node: 33 | if len(node.input) > 0 and name in node.input: 34 | all.append(node) 35 | return all 36 | 37 | def find_with_output_node(model, name): 38 | for node in model.graph.node: 39 | if len(node.output) > 0 and name in node.output: 40 | return node 41 | 42 | def find_with_no_change_parent_node(model, node): 43 | parent = find_with_output_node(model, node.input[0]) 44 | if parent is not None: 45 | if parent.op_type in ["Concat", "MaxPool"]: 46 | return find_with_no_change_parent_node(model, parent) 47 | return parent 48 | 49 | def find_quantizelinear_conv(model, qnode): 50 | dq = find_with_input_node(model, qnode.output[0]) 51 | conv = find_with_input_node(model, dq.output[0]) 52 | return conv 53 | 54 | 55 | def find_quantize_conv_name(model, weight_qname): 56 | dq = find_with_output_node(model, weight_qname) 57 | q = find_with_output_node(model, dq.input[0]) 58 | return ".".join(q.input[0].split(".")[:-1]) 59 | 60 | def find_quantizer_pairs(onnx_file): 61 | 62 | model = onnx.load(onnx_file) 63 | match_pairs = [] 64 | for node in model.graph.node: 65 | if node.op_type == "Concat": 66 | qnodes = find_all_with_input_node(model, node.output[0]) 67 | major = None 68 | for qnode in qnodes: 69 | if qnode.op_type != "QuantizeLinear": 70 | continue 71 | 72 | conv = find_quantizelinear_conv(model, qnode) 73 | if major is None: 74 | major = find_quantize_conv_name(model, conv.input[1]) 75 | else: 76 | match_pairs.append([major, find_quantize_conv_name(model, conv.input[1])]) 77 | 78 | for subnode in model.graph.node: 79 | if len(subnode.input) > 0 and subnode.op_type == "QuantizeLinear" and subnode.input[0] in node.input: 80 | subconv = find_quantizelinear_conv(model, subnode) 81 | match_pairs.append([major, 
find_quantize_conv_name(model, subconv.input[1])]) 82 | 83 | elif node.op_type == "MaxPool": 84 | qnode = find_with_input_node(model, node.output[0]) 85 | if not (qnode and qnode.op_type == "QuantizeLinear"): 86 | continue 87 | 88 | major = find_quantizelinear_conv(model, qnode) 89 | major = find_quantize_conv_name(model, major.input[1]) 90 | same_input_nodes = find_all_with_input_node(model, node.input[0]) 91 | 92 | for same_input_node in same_input_nodes: 93 | if same_input_node.op_type == "QuantizeLinear": 94 | subconv = find_quantizelinear_conv(model, same_input_node) 95 | match_pairs.append([major, find_quantize_conv_name(model, subconv.input[1])]) 96 | return match_pairs 97 | -------------------------------------------------------------------------------- /yolov7_qat/scripts/detect-trt.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a 6 | # copy of this software and associated documentation files (the "Software"), 7 | # to deal in the Software without restriction, including without limitation 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | # and/or sell copies of the Software, and to permit persons to whom the 10 | # Software is furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | # DEALINGS IN THE SOFTWARE. 22 | ################################################################################ 23 | import argparse 24 | import time 25 | from pathlib import Path 26 | 27 | import cv2 28 | import torch 29 | import torch.backends.cudnn as cudnn 30 | from numpy import random 31 | import numpy as np 32 | from models.experimental import attempt_load 33 | from utils.datasets import LoadStreams, LoadImages 34 | from utils.general import check_img_size, check_requirements, check_imshow, non_max_suppression, apply_classifier, \ 35 | scale_coords, xyxy2xywh, strip_optimizer, set_logging, increment_path 36 | from utils.plots import plot_one_box 37 | from utils.torch_utils import select_device, load_classifier, time_synchronized, TracedModel 38 | 39 | import pycuda.autoinit 40 | import pycuda.driver as cuda 41 | import tensorrt as trt 42 | # Allocates all buffers required for an engine, i.e. host/device inputs/outputs. 43 | # Simple helper data class that's a little nicer to use than a 2-tuple. 
44 | names = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', \ 45 | 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant',\ 46 | 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', \ 47 | 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',\ 48 | 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', \ 49 | 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',\ 50 | 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',\ 51 | 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', \ 52 | 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli',\ 53 | 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',\ 54 | 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',\ 55 | 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', \ 56 | 'teddy bear', 'hair drier', 'toothbrush'] 57 | 58 | class HostDeviceMem(object): 59 | def __init__(self, host_mem, device_mem): 60 | self.host = host_mem 61 | self.device = device_mem 62 | 63 | def __str__(self): 64 | return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device) 65 | 66 | def __repr__(self): 67 | return self.__str__() 68 | 69 | def allocate_buffers(engine): 70 | inputs = [] 71 | outputs = [] 72 | bindings = [] 73 | stream = cuda.Stream() 74 | for binding in engine: 75 | size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size 76 | print("binding shape: ", engine.get_binding_shape(binding)) 77 | dtype = trt.nptype(engine.get_binding_dtype(binding)) 78 | # Allocate host and device buffers 79 | host_mem = cuda.pagelocked_empty(size, dtype) 80 | device_mem = cuda.mem_alloc(host_mem.nbytes) 81 | # Append the device buffer to device bindings. 82 | bindings.append(int(device_mem)) 83 | # Append to the appropriate list. 84 | if engine.binding_is_input(binding): 85 | inputs.append(HostDeviceMem(host_mem, device_mem)) 86 | else: 87 | outputs.append(HostDeviceMem(host_mem, device_mem)) 88 | return inputs, outputs, bindings, stream 89 | 90 | def do_inference_v2(context, bindings, inputs, outputs, stream): 91 | # Transfer input data to the GPU. 92 | [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] 93 | # Run inference. 94 | context.execute_async_v2(bindings=bindings, stream_handle=stream.handle) 95 | # Transfer predictions back from the GPU. 96 | [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] 97 | # Synchronize the stream 98 | stream.synchronize() 99 | # Return only the host outputs. 
100 | return [out.host for out in outputs] 101 | 102 | def detect(save_img=False): 103 | source, view_img, save_txt, imgsz = opt.source, opt.view_img, opt.save_txt, opt.img_size 104 | save_img = not opt.nosave and not source.endswith('.txt') # save inference images 105 | webcam = source.isnumeric() or source.endswith('.txt') or source.lower().startswith( 106 | ('rtsp://', 'rtmp://', 'http://', 'https://')) 107 | 108 | # Directories 109 | save_dir = Path(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok)) # increment run 110 | (save_dir / 'labels' if save_txt else save_dir).mkdir(parents=True, exist_ok=True) # make dir 111 | 112 | # Initialize 113 | set_logging() 114 | device = select_device(opt.device) # device will be avilable for NMS 115 | 116 | # Set Dataloader 117 | vid_path, vid_writer = None, None 118 | if webcam: 119 | view_img = check_imshow() 120 | cudnn.benchmark = True # set True to speed up constant image size inference 121 | dataset = LoadStreams(source, img_size=imgsz) 122 | else: 123 | dataset = LoadImages(source, img_size=imgsz, auto=False) 124 | 125 | colors = [[random.randint(0, 255) for _ in range(3)] for _ in names] 126 | 127 | ####### start trt objects 128 | logger = trt.Logger(trt.Logger.INFO) 129 | f = open(opt.engine, 'rb') 130 | runtime = trt.Runtime(logger) 131 | engine = runtime.deserialize_cuda_engine(f.read()) 132 | inputs, outputs, bindings, stream = allocate_buffers(engine) 133 | outputshape = [engine.get_binding_shape(binding) for binding in engine][1] 134 | 135 | t0 = time.time() 136 | for path, img, im0s, vid_cap in dataset: 137 | img = img.astype(np.float32) 138 | img /= 255.0 # 0 - 255 to 0.0 - 1.0 139 | if len(img.shape) == 3: 140 | img = np.expand_dims(img, 0) 141 | 142 | inputs[0].host = img 143 | context = engine.create_execution_context() 144 | trt_outputs = do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream = stream) 145 | trt_outputs = torch.Tensor(trt_outputs[0].reshape(outputshape)) 146 | # Inference 147 | t1 = time_synchronized() 148 | 149 | # pred = trt_outputs 150 | t2 = time_synchronized() 151 | 152 | # Apply NMS 153 | trt_outputs = non_max_suppression(trt_outputs, opt.conf_thres, opt.iou_thres, classes=opt.classes, agnostic=opt.agnostic_nms) 154 | t3 = time_synchronized() 155 | 156 | # Process detections 157 | for i, det in enumerate(trt_outputs): # detections per image 158 | if webcam: # batch_size >= 1 159 | p, s, im0, frame = path[i], '%g: ' % i, im0s[i].copy(), dataset.count 160 | else: 161 | p, s, im0, frame = path, '', im0s, getattr(dataset, 'frame', 0) 162 | 163 | p = Path(p) # to Path 164 | save_path = str(save_dir / p.name) # img.jpg 165 | txt_path = str(save_dir / 'labels' / p.stem) + ('' if dataset.mode == 'image' else f'_{frame}') # img.txt 166 | gn = torch.tensor(im0.shape)[[1, 0, 1, 0]] # normalization gain whwh 167 | if len(det): 168 | # Rescale boxes from img_size to im0 size 169 | det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round() 170 | 171 | # Print results 172 | for c in det[:, -1].unique(): 173 | n = (det[:, -1] == c).sum() # detections per class 174 | s += f"{n} {names[int(c)]}{'s' * (n > 1)}, " # add to string 175 | 176 | # Write results 177 | for *xyxy, conf, cls in reversed(det): 178 | if save_txt: # Write to file 179 | xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist() # normalized xywh 180 | line = (cls, *xywh, conf) if opt.save_conf else (cls, *xywh) # label format 181 | with open(txt_path + '.txt', 'a') as f: 182 | 
f.write(('%g ' * len(line)).rstrip() % line + '\n') 183 | 184 | if save_img or view_img: # Add bbox to image 185 | label = f'{names[int(cls)]} {conf:.2f}' 186 | plot_one_box(xyxy, im0, label=label, color=colors[int(cls)], line_thickness=1) 187 | 188 | # Print time (inference + NMS) 189 | print(f'{s}Done. ({(1E3 * (t2 - t1)):.1f}ms) Inference, ({(1E3 * (t3 - t2)):.1f}ms) NMS') 190 | 191 | # Stream results 192 | if view_img: 193 | cv2.imshow(str(p), im0) 194 | cv2.waitKey(1) # 1 millisecond 195 | 196 | # Save results (image with detections) 197 | if save_img: 198 | if dataset.mode == 'image': 199 | cv2.imwrite(save_path, im0) 200 | print(f" The image with the result is saved in: {save_path}") 201 | else: # 'video' or 'stream' 202 | if vid_path != save_path: # new video 203 | vid_path = save_path 204 | if isinstance(vid_writer, cv2.VideoWriter): 205 | vid_writer.release() # release previous video writer 206 | if vid_cap: # video 207 | fps = vid_cap.get(cv2.CAP_PROP_FPS) 208 | w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH)) 209 | h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 210 | else: # stream 211 | fps, w, h = 30, im0.shape[1], im0.shape[0] 212 | save_path += '.mp4' 213 | vid_writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h)) 214 | vid_writer.write(im0) 215 | 216 | if save_txt or save_img: 217 | s = f"\n{len(list(save_dir.glob('labels/*.txt')))} labels saved to {save_dir / 'labels'}" if save_txt else '' 218 | #print(f"Results saved to {save_dir}{s}") 219 | 220 | print(f'Done. ({time.time() - t0:.3f}s)') 221 | 222 | 223 | if __name__ == '__main__': 224 | parser = argparse.ArgumentParser() 225 | parser.add_argument('--engine', type=str, default='yolov7.engine', help='model.pt path(s)') 226 | parser.add_argument('--source', type=str, default='inference/images', help='source') # file/folder, 0 for webcam 227 | parser.add_argument('--img-size', type=int, default=640, help='inference size (pixels)') 228 | parser.add_argument('--conf-thres', type=float, default=0.25, help='object confidence threshold') 229 | parser.add_argument('--iou-thres', type=float, default=0.45, help='IOU threshold for NMS') 230 | parser.add_argument('--device', default='', help='cuda device, i.e. 
0 or 0,1,2,3 or cpu') 231 | parser.add_argument('--view-img', action='store_true', help='display results') 232 | parser.add_argument('--save-txt', action='store_true', help='save results to *.txt') 233 | parser.add_argument('--save-conf', action='store_true', help='save confidences in --save-txt labels') 234 | parser.add_argument('--nosave', action='store_true', help='do not save images/videos') 235 | parser.add_argument('--classes', nargs='+', type=int, help='filter by class: --class 0, or --class 0 2 3') 236 | parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS') 237 | parser.add_argument('--augment', action='store_true', help='augmented inference') 238 | parser.add_argument('--project', default='runs/detect', help='save results to project/name') 239 | parser.add_argument('--name', default='exp', help='save results to project/name') 240 | parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment') 241 | opt = parser.parse_args() 242 | print(opt) 243 | 244 | detect() 245 | -------------------------------------------------------------------------------- /yolov7_qat/scripts/draw-engine.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a 6 | # copy of this software and associated documentation files (the "Software"), 7 | # to deal in the Software without restriction, including without limitation 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | # and/or sell copies of the Software, and to permit persons to whom the 10 | # Software is furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | # DEALINGS IN THE SOFTWARE. 22 | ################################################################################ 23 | 24 | 25 | """ 26 | This script generates an SVG diagram of the input engine graph SVG file. 
27 | Note:
28 | this script depends on trt-engine-explorer: https://github.com/NVIDIA/TensorRT/tree/main/tools/experimental/trt-engine-explorer
29 | it also requires graphviz, which can be installed with:
30 | $ sudo apt-get --yes install graphviz
31 | $ python3 -m pip install graphviz networkx
32 | """
33 | 
34 | import graphviz
35 | from trex import *
36 | import argparse
37 | import shutil
38 | 
39 | 
40 | def draw_engine(engine_json_fname: str, engine_profile_fname: str):
41 | graphviz_is_installed = shutil.which("dot") is not None
42 | if not graphviz_is_installed:
43 | print("graphviz is required but it is not installed.\n")
44 | print("To install on Ubuntu:")
45 | print("sudo apt --yes install graphviz")
46 | exit()
47 | 
48 | plan = EnginePlan(engine_json_fname, engine_profile_fname)
49 | formatter = layer_type_formatter
50 | display_regions = True
51 | expand_layer_details = False
52 | 
53 | graph = to_dot(plan, formatter,
54 | display_regions=display_regions,
55 | expand_layer_details=expand_layer_details)
56 | render_dot(graph, engine_json_fname, 'svg')
57 | 
58 | 
59 | if __name__ == "__main__":
60 | parser = argparse.ArgumentParser()
61 | parser.add_argument('--layer', help="name of engine JSON file to draw")
62 | parser.add_argument('--profile', help="name of profile JSON file to draw")
63 | args = parser.parse_args()
64 | draw_engine(engine_json_fname=args.layer,engine_profile_fname=args.profile)
65 | 
-------------------------------------------------------------------------------- /yolov7_qat/scripts/eval-trt.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # This script only works on quantized (PTQ/QAT) models; running it on the original, non-quantized model will raise errors.
4 | weight=$1
5 | prefix=${weight%.*}
6 | onnx=${prefix}.onnx
7 | graph=${prefix}.graph
8 | engine=${prefix}.engine
9 | 
10 | # The exported ONNX must use a 672x672 input.
11 | python scripts/qat.py export $weight --dynamic --save=$onnx --size=672
12 | 
13 | # For higher QPS, add the --fp16 flag so the unquantized parts (e.g. the detect layer) can run in FP16.
14 | trtexec --onnx=$onnx \
15 | --saveEngine=${engine} --int8 --buildOnly --memPoolSize=workspace:1024MiB \
16 | --dumpLayerInfo --exportLayerInfo=${graph} --profilingVerbosity=detailed
17 | 
18 | python scripts/draw-engine.py --layer=${graph}
19 | python scripts/eval-trt.py --engine=${engine}
20 | 
-------------------------------------------------------------------------------- /yolov7_qat/scripts/qat.py: --------------------------------------------------------------------------------
1 | ################################################################################
2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: MIT
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a
6 | # copy of this software and associated documentation files (the "Software"),
7 | # to deal in the Software without restriction, including without limitation
8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 | # and/or sell copies of the Software, and to permit persons to whom the
10 | # Software is furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in
13 | # all copies or substantial portions of the Software.
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | # DEALINGS IN THE SOFTWARE. 22 | ################################################################################ 23 | import sys 24 | import os 25 | 26 | # Add the current directory to PYTHONPATH for YoloV7 27 | sys.path.insert(0, os.path.abspath(".")) 28 | pydir = os.path.dirname(__file__) 29 | 30 | import yaml 31 | import collections 32 | import warnings 33 | import argparse 34 | import json 35 | from pathlib import Path 36 | 37 | # PyTorch 38 | import torch 39 | import torch.nn as nn 40 | 41 | # YoloV7 42 | import test 43 | from models.yolo import Model 44 | from models.common import Conv 45 | from utils.datasets import create_dataloader 46 | from utils.google_utils import attempt_download 47 | from utils.general import init_seeds 48 | 49 | import quantization.quantize as quantize 50 | 51 | # Disable all warning 52 | warnings.filterwarnings("ignore") 53 | 54 | 55 | class SummaryTool: 56 | def __init__(self, file): 57 | self.file = file 58 | self.data = [] 59 | 60 | def append(self, item): 61 | self.data.append(item) 62 | json.dump(self.data, open(self.file, "w"), indent=4) 63 | 64 | 65 | # Load YoloV7 Model 66 | def load_yolov7_model(weight, device) -> Model: 67 | 68 | attempt_download(weight) 69 | model = torch.load(weight, map_location=device)["model"] 70 | for m in model.modules(): 71 | if type(m) is nn.Upsample: 72 | m.recompute_scale_factor = None # torch 1.11.0 compatibility 73 | elif type(m) is Conv: 74 | m._non_persistent_buffers_set = set() # pytorch 1.6.0 compatibility 75 | 76 | model.float() 77 | model.eval() 78 | 79 | with torch.no_grad(): 80 | model.fuse() 81 | return model 82 | 83 | 84 | def create_coco_train_dataloader(cocodir, batch_size=10): 85 | 86 | with open("data/hyp.scratch.p5.yaml") as f: 87 | hyp = yaml.load(f, Loader=yaml.SafeLoader) # load hyps 88 | 89 | loader = create_dataloader( 90 | f"{cocodir}/train2017.txt", 91 | imgsz=640, 92 | batch_size=batch_size, 93 | opt=collections.namedtuple("Opt", "single_cls")(False), 94 | augment=True, hyp=hyp, rect=False, cache=False, stride=32,pad=0, image_weights=False)[0] 95 | return loader 96 | 97 | 98 | def create_coco_val_dataloader(cocodir, batch_size=10, keep_images=None): 99 | 100 | loader = create_dataloader( 101 | f"{cocodir}/val2017.txt", 102 | imgsz=640, 103 | batch_size=batch_size, 104 | opt=collections.namedtuple("Opt", "single_cls")(False), 105 | augment=False, hyp=None, rect=True, cache=False,stride=32,pad=0.5, image_weights=False)[0] 106 | 107 | def subclass_len(self): 108 | if keep_images is not None: 109 | return keep_images 110 | return len(self.img_files) 111 | 112 | loader.dataset.__len__ = subclass_len 113 | return loader 114 | 115 | 116 | def evaluate_coco(model, dataloader, using_cocotools = False, save_dir=".", conf_thres=0.001, iou_thres=0.65): 117 | 118 | if save_dir and os.path.dirname(save_dir) != "": 119 | os.makedirs(os.path.dirname(save_dir), exist_ok=True) 120 | 121 | return test.test( 122 | "data/coco.yaml", 123 | save_dir=Path(save_dir), 124 | dataloader=dataloader, 
conf_thres=conf_thres,iou_thres=iou_thres,model=model,is_coco=True, 125 | plots=False,half_precision=True,save_json=using_cocotools)[0][3] 126 | 127 | 128 | def export_onnx(model : Model, file, size=640, dynamic_batch=False): 129 | 130 | device = next(model.parameters()).device 131 | model.float() 132 | 133 | dummy = torch.zeros(1, 3, size, size, device=device) 134 | model.model[-1].concat = True 135 | grid_old_func = model.model[-1]._make_grid 136 | model.model[-1]._make_grid = lambda *args: torch.from_numpy(grid_old_func(*args).data.numpy()) 137 | 138 | quantize.export_onnx(model, dummy, file, opset_version=13, 139 | input_names=["images"], output_names=["outputs"], 140 | dynamic_axes={"images": {0: "batch"}, "outputs": {0: "batch"}} if dynamic_batch else None 141 | ) 142 | model.model[-1].concat = False 143 | model.model[-1]._make_grid = grid_old_func 144 | 145 | 146 | def cmd_quantize(weight, cocodir, device, ignore_policy, save_ptq, save_qat, supervision_stride, iters, eval_origin, eval_ptq): 147 | quantize.initialize() 148 | 149 | if save_ptq and os.path.dirname(save_ptq) != "": 150 | os.makedirs(os.path.dirname(save_ptq), exist_ok=True) 151 | 152 | if save_qat and os.path.dirname(save_qat) != "": 153 | os.makedirs(os.path.dirname(save_qat), exist_ok=True) 154 | 155 | device = torch.device(device) 156 | model = load_yolov7_model(weight, device) 157 | train_dataloader = create_coco_train_dataloader(cocodir) 158 | val_dataloader = create_coco_val_dataloader(cocodir) 159 | quantize.replace_to_quantization_module(model, ignore_policy=ignore_policy) 160 | quantize.apply_custom_rules_to_quantizer(model, export_onnx) 161 | quantize.calibrate_model(model, train_dataloader, device) 162 | 163 | json_save_dir = "." if os.path.dirname(save_ptq) == "" else os.path.dirname(save_ptq) 164 | summary_file = os.path.join(json_save_dir, "summary.json") 165 | summary = SummaryTool(summary_file) 166 | 167 | if eval_origin: 168 | print("Evaluate Origin...") 169 | with quantize.disable_quantization(model): 170 | ap = evaluate_coco(model, val_dataloader, True, json_save_dir) 171 | summary.append(["Origin", ap]) 172 | 173 | if eval_ptq: 174 | print("Evaluate PTQ...") 175 | ap = evaluate_coco(model, val_dataloader, True, json_save_dir) 176 | summary.append(["PTQ", ap]) 177 | 178 | if save_ptq: 179 | print(f"Save ptq model to {save_ptq}") 180 | torch.save({"model": model}, save_ptq) 181 | 182 | if save_qat is None: 183 | print("Done as save_qat is None.") 184 | return 185 | 186 | best_ap = 0 187 | def per_epoch(model, epoch, lr): 188 | 189 | nonlocal best_ap 190 | ap = evaluate_coco(model, val_dataloader, True, json_save_dir) 191 | summary.append([f"QAT{epoch}", ap]) 192 | 193 | if ap > best_ap: 194 | print(f"Save qat model to {save_qat} @ {ap:.5f}") 195 | best_ap = ap 196 | torch.save({"model": model}, save_qat) 197 | 198 | def preprocess(datas): 199 | return datas[0].to(device).float() / 255.0 200 | 201 | def supervision_policy(): 202 | supervision_list = [] 203 | for item in model.model: 204 | supervision_list.append(id(item)) 205 | 206 | keep_idx = list(range(0, len(model.model) - 1, supervision_stride)) 207 | keep_idx.append(len(model.model) - 2) 208 | def impl(name, module): 209 | if id(module) not in supervision_list: return False 210 | idx = supervision_list.index(id(module)) 211 | if idx in keep_idx: 212 | print(f"Supervision: {name} will compute loss with origin model during QAT training") 213 | else: 214 | print(f"Supervision: {name} no compute loss during QAT training, that is unsupervised only and 
doesn't mean don't learn") 215 | return idx in keep_idx 216 | return impl 217 | 218 | quantize.finetune( 219 | model, train_dataloader, per_epoch, early_exit_batchs_per_epoch=iters, 220 | preprocess=preprocess, supervision_policy=supervision_policy()) 221 | 222 | 223 | def cmd_export(weight, save, size, dynamic): 224 | 225 | quantize.initialize() 226 | if save is None: 227 | name = os.path.basename(weight) 228 | name = name[:name.rfind('.')] 229 | save = os.path.join(os.path.dirname(weight), name + ".onnx") 230 | 231 | export_onnx(torch.load(weight, map_location="cpu")["model"], save, size, dynamic_batch=dynamic) 232 | print(f"Save onnx to {save}") 233 | 234 | 235 | def cmd_sensitive_analysis(weight, device, cocodir, summary_save, num_image): 236 | 237 | quantize.initialize() 238 | device = torch.device(device) 239 | model = load_yolov7_model(weight, device) 240 | train_dataloader = create_coco_train_dataloader(cocodir) 241 | val_dataloader = create_coco_val_dataloader(cocodir, keep_images=None if num_image is None or num_image < 1 else num_image) 242 | quantize.replace_to_quantization_module(model) 243 | quantize.calibrate_model(model, train_dataloader, device) 244 | 245 | summary = SummaryTool(summary_save) 246 | print("Evaluate PTQ...") 247 | ap = evaluate_coco(model, val_dataloader) 248 | summary.append([ap, "PTQ"]) 249 | 250 | print("Sensitive analysis by each layer...") 251 | for i in range(0, len(model.model)): 252 | layer = model.model[i] 253 | if quantize.have_quantizer(layer): 254 | print(f"Quantization disable model.{i}") 255 | quantize.disable_quantization(layer).apply() 256 | ap = evaluate_coco(model, val_dataloader) 257 | summary.append([ap, f"model.{i}"]) 258 | quantize.enable_quantization(layer).apply() 259 | else: 260 | print(f"ignore model.{i} because it is {type(layer)}") 261 | 262 | summary = sorted(summary.data, key=lambda x:x[0], reverse=True) 263 | print("Sensitive summary:") 264 | for n, (ap, name) in enumerate(summary[:10]): 265 | print(f"Top{n}: Using fp16 {name}, ap = {ap:.5f}") 266 | 267 | 268 | def cmd_test(weight, device, cocodir, confidence, nmsthres): 269 | 270 | device = torch.device(device) 271 | model = load_yolov7_model(weight, device) 272 | val_dataloader = create_coco_val_dataloader(cocodir) 273 | evaluate_coco(model, val_dataloader, True, conf_thres=confidence, iou_thres=nmsthres) 274 | 275 | 276 | if __name__ == "__main__": 277 | 278 | parser = argparse.ArgumentParser(prog='qat.py') 279 | subps = parser.add_subparsers(dest="cmd") 280 | exp = subps.add_parser("export", help="Export weight to onnx file") 281 | exp.add_argument("weight", type=str, default="yolov7.pt", help="export pt file") 282 | exp.add_argument("--save", type=str, required=False, help="export onnx file") 283 | exp.add_argument("--size", type=int, default=640, help="export input size") 284 | exp.add_argument("--dynamic", action="store_true", help="export dynamic batch") 285 | 286 | qat = subps.add_parser("quantize", help="PTQ/QAT finetune ...") 287 | qat.add_argument("weight", type=str, nargs="?", default="yolov7.pt", help="weight file") 288 | qat.add_argument("--cocodir", type=str, default="/datav/dataset/coco", help="coco directory") 289 | qat.add_argument("--device", type=str, default="cuda:0", help="device") 290 | qat.add_argument("--ignore-policy", type=str, default="model\.105\.m\.(.*)", help="regx") 291 | qat.add_argument("--ptq", type=str, default="ptq.pt", help="file") 292 | qat.add_argument("--qat", type=str, default=None, help="file") 293 | 
qat.add_argument("--supervision-stride", type=int, default=1, help="supervision stride") 294 | qat.add_argument("--iters", type=int, default=200, help="iters per epoch") 295 | qat.add_argument("--eval-origin", action="store_true", help="do eval for origin model") 296 | qat.add_argument("--eval-ptq", action="store_true", help="do eval for ptq model") 297 | 298 | sensitive = subps.add_parser("sensitive", help="Sensitive layer analysis") 299 | sensitive.add_argument("weight", type=str, nargs="?", default="yolov7.pt", help="weight file") 300 | sensitive.add_argument("--device", type=str, default="cuda:0", help="device") 301 | sensitive.add_argument("--cocodir", type=str, default="/datav/dataset/coco", help="coco directory") 302 | sensitive.add_argument("--summary", type=str, default="sensitive-summary.json", help="summary save file") 303 | sensitive.add_argument("--num-image", type=int, default=None, help="number of image to evaluate") 304 | 305 | testcmd = subps.add_parser("test", help="Do evaluate") 306 | testcmd.add_argument("weight", type=str, default="yolov7.pt", help="weight file") 307 | testcmd.add_argument("--cocodir", type=str, default="/datav/dataset/coco", help="coco directory") 308 | testcmd.add_argument("--device", type=str, default="cuda:0", help="device") 309 | testcmd.add_argument("--confidence", type=float, default=0.001, help="confidence threshold") 310 | testcmd.add_argument("--nmsthres", type=float, default=0.65, help="nms threshold") 311 | 312 | args = parser.parse_args() 313 | init_seeds(57) 314 | 315 | if args.cmd == "export": 316 | cmd_export(args.weight, args.save, args.size, args.dynamic) 317 | elif args.cmd == "quantize": 318 | print(args) 319 | cmd_quantize( 320 | args.weight, args.cocodir, args.device, args.ignore_policy, 321 | args.ptq, args.qat, args.supervision_stride, args.iters, 322 | args.eval_origin, args.eval_ptq 323 | ) 324 | elif args.cmd == "sensitive": 325 | cmd_sensitive_analysis(args.weight, args.device, args.cocodir, args.summary, args.num_image) 326 | elif args.cmd == "test": 327 | cmd_test(args.weight, args.device, args.cocodir, args.confidence, args.nmsthres) 328 | else: 329 | parser.print_help() 330 | -------------------------------------------------------------------------------- /yolov7_qat/scripts/quantize_utils.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a 6 | # copy of this software and associated documentation files (the "Software"), 7 | # to deal in the Software without restriction, including without limitation 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | # and/or sell copies of the Software, and to permit persons to whom the 10 | # Software is furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL
18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 | # DEALINGS IN THE SOFTWARE.
22 | ################################################################################
23 | 
24 | import onnx_graphsurgeon as gs
25 | from onnx_graphsurgeon.ir.tensor import Variable
26 | import onnx
27 | import numpy as np
28 | import argparse
29 | import logging
30 | 
31 | LAYER_ID = 0
32 | TENSOR_ID = 0
33 | def get_qparams_constants(node_to_quantize_name, scale_init=0.5, zero_point_init=0):
34 | global LAYER_ID, TENSOR_ID
35 | """ ATTENTION: "node_to_quantize_name" needs to be different every time this function is called.
36 | Otherwise, "scale, zero_point" are overwritten.
37 | TODO: ensure that this happens! The same goes for
38 | "q_out and dq_out = gs.Variable(UNIQUE_NAME)"
39 | 
40 | :param node_to_quantize_name:
41 | :param scale_init:
42 | :param zero_point_init:
43 | :return: 2 gs.Constants (scale and zero-point).
44 | """
45 | scale = gs.Constant(
46 | name=node_to_quantize_name + "_scale" + str(TENSOR_ID),
47 | values=np.array(scale_init, dtype=np.float32))
48 | TENSOR_ID = TENSOR_ID + 1
49 | zero_point = gs.Constant(
50 | name=node_to_quantize_name + "_zero_point" + str(TENSOR_ID),
51 | values=np.array(zero_point_init, dtype=np.int8)
52 | )
53 | TENSOR_ID = TENSOR_ID + 1
54 | return scale, zero_point
55 | 
56 | def quantize_tensor(graph, tensor_to_quantize, scale, name_suffix=""):
57 | global LAYER_ID, TENSOR_ID
58 | output_nodes = tensor_to_quantize['x'].outputs
59 | nodes_and_quantized = []
60 | nodes_inputidx = []
61 | 
62 | for node in output_nodes:
63 | for idx, inp in enumerate(node.inputs):
64 | if inp.name == tensor_to_quantize['x'].name:
65 | nodes_and_quantized.append(node)
66 | nodes_inputidx.append(idx)
67 | break
68 | 
69 | # QuantizeLinear node
70 | q_scale, q_zero_point = get_qparams_constants(tensor_to_quantize['x'].name + "_inp_q" + name_suffix, scale_init=scale)
71 | q_out = gs.Variable(name=tensor_to_quantize['x'].name + "_QuantizeLinear_out" + name_suffix + str(TENSOR_ID))
72 | TENSOR_ID = TENSOR_ID + 1
73 | quant_node = gs.Node(
74 | op="QuantizeLinear",
75 | name="QuantI_"+ tensor_to_quantize['x'].name + str(LAYER_ID),
76 | inputs=[tensor_to_quantize["x"], q_scale, q_zero_point],
77 | outputs=[q_out]
78 | )
79 | LAYER_ID = LAYER_ID + 1
80 | # DequantizeLinear node
81 | dq_scale, dq_zero_point = get_qparams_constants(tensor_to_quantize['x'].name + "_inp_dq" + name_suffix, scale_init=scale)
82 | dq_out = gs.Variable(name=tensor_to_quantize['x'].name + "_DequantizeLinear_out" + name_suffix + str(TENSOR_ID))
83 | TENSOR_ID = TENSOR_ID + 1
84 | dequant_node = gs.Node(
85 | op="DequantizeLinear",
86 | name="DequantI_"+ tensor_to_quantize['x'].name + str(LAYER_ID),
87 | inputs=[q_out, dq_scale, dq_zero_point],
88 | outputs=[dq_out]
89 | )
90 | LAYER_ID = LAYER_ID + 1
91 | # Rewire every consumer collected above to read from the dequantized output instead of the original tensor
92 | for i, node in enumerate(nodes_and_quantized):
93 | node.inputs[nodes_inputidx[i]] = dq_out
94 | 
95 | graph.nodes.extend([quant_node, dequant_node])
96 | return graph
97 | 
98 | def quantize_input(graph, node_to_quantize, node_to_quantize_input, scale, name_suffix=""):
99 | global LAYER_ID, TENSOR_ID
100 | # QuantizeLinear node
101 | q_scale, q_zero_point = get_qparams_constants(node_to_quantize.name + "_inp_q" + name_suffix, scale_init=scale)
102 | q_out = gs.Variable(name=node_to_quantize.name +
"_QuantizeLinear_out" + name_suffix + name_suffix + str(TENSOR_ID)) 103 | TENSOR_ID = TENSOR_ID + 1 104 | quant_node = gs.Node( 105 | op="QuantizeLinear", 106 | name="QuantI_"+ node_to_quantize.name + str(LAYER_ID), 107 | inputs=[node_to_quantize_input["x"], q_scale, q_zero_point], 108 | outputs=[q_out] 109 | ) 110 | LAYER_ID = LAYER_ID + 1 111 | 112 | # DequantizeLinear node 113 | dq_scale, dq_zero_point = get_qparams_constants(node_to_quantize.name + "_inp_dq" + name_suffix, scale_init=scale) 114 | dq_out = gs.Variable(name=node_to_quantize.name + "_DequantizeLinear_out" + name_suffix + name_suffix + str(TENSOR_ID)) 115 | TENSOR_ID = TENSOR_ID + 1 116 | dequant_node = gs.Node( 117 | op="DequantizeLinear", 118 | name="DequantI_"+ node_to_quantize.name + str(LAYER_ID), 119 | inputs=[q_out, dq_scale, dq_zero_point], 120 | outputs=[dq_out] 121 | ) 122 | LAYER_ID = LAYER_ID + 1 123 | 124 | node_to_quantize.inputs[node_to_quantize_input["idx"]] = dq_out 125 | graph.nodes.extend([quant_node, dequant_node]) 126 | 127 | graph.cleanup().toposort() 128 | return graph 129 | 130 | 131 | def quantize_weight(graph, node_to_quantize, node_to_quantize_weight, axis=0, name_suffix=""): 132 | global LAYER_ID, TENSOR_ID 133 | """ 134 | When connected to the weight, the "y_scale" parameter can be recovered directly from the Weight matrix. 135 | See official doc: https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#intro-quantization 136 | 137 | :param graph: 138 | :param node_to_quantize: 139 | :param node_to_quantize_weight: 140 | :param axis: 141 | :param name_suffix: 142 | :return: 143 | """ 144 | shape = node_to_quantize_weight["x"].shape[axis] 145 | # Recover "y_scale" from weight matrix 146 | weight_matrix = node_to_quantize_weight["x"].values 147 | y_scale_arr = [] 148 | # Recover "y_scale" for each batch. If axis != 0, move the desired axis to the be idx=0. 
149 | if axis !=0: 150 | weight_matrix = np.moveaxis(weight_matrix, [axis], [0]) 151 | # for bais 1d-weight 152 | if len(weight_matrix.shape) == 1: 153 | weight_matrix = np.expand_dims(weight_matrix, axis=0) 154 | for w in weight_matrix[:]: 155 | dyn_range = max(abs(w.min()), abs(w.max())) 156 | y_scale = dyn_range / 127.0 157 | y_scale_arr.append(y_scale) 158 | 159 | # QuantizeLinear node 160 | q_scale, q_zero_point = get_qparams_constants( 161 | node_to_quantize.name + "_weight_q" + name_suffix, 162 | scale_init=y_scale_arr, # * np.ones(shape=(shape,)), 163 | zero_point_init=np.zeros(shape=(shape,)), 164 | ) 165 | q_out = gs.Variable(name=node_to_quantize.name + "_QuantizeLinear_weight_out" + name_suffix + str(TENSOR_ID)) 166 | TENSOR_ID = TENSOR_ID + 1 167 | quant_node = gs.Node( 168 | op="QuantizeLinear", 169 | name="QuantW_"+ node_to_quantize.name + str(LAYER_ID), 170 | inputs=[node_to_quantize_weight["x"], q_scale, q_zero_point], 171 | outputs=[q_out], 172 | attrs={"axis": axis} 173 | ) 174 | LAYER_ID = LAYER_ID + 1 175 | 176 | 177 | # DequantizeLinear node 178 | dq_scale, dq_zero_point = get_qparams_constants( 179 | node_to_quantize.name + "_weight_dq" + name_suffix, 180 | scale_init=y_scale_arr, # * np.ones(shape=(shape,)), 181 | zero_point_init=np.zeros(shape=(shape,)), 182 | ) 183 | TENSOR_ID = TENSOR_ID + 1 184 | dq_out = gs.Variable(name=node_to_quantize.name + "_DequantizeLinear_weight_out" + name_suffix + str(TENSOR_ID)) 185 | dequant_node = gs.Node( 186 | op="DequantizeLinear", 187 | name="DequantW_"+ node_to_quantize.name + str(LAYER_ID), 188 | inputs=[q_out, dq_scale, dq_zero_point], 189 | outputs=[dq_out], 190 | attrs={"axis": axis} 191 | ) 192 | LAYER_ID = LAYER_ID + 1 193 | 194 | node_to_quantize.inputs[node_to_quantize_weight["idx"]] = dq_out 195 | graph.nodes.extend([quant_node, dequant_node]) 196 | 197 | graph.cleanup().toposort() 198 | return graph 199 | 200 | def get_node_to_quantize_infos(node_to_quantize, disableResAdd:bool): 201 | # Separate inputs into activation ('Variable' type) and weight ('Constant' type). 202 | node_to_quantize_input = [] 203 | node_to_quantize_weight = [] 204 | for idx, inp in enumerate(node_to_quantize.inputs): 205 | if isinstance(inp, Variable): 206 | node_to_quantize_input.append({"x": inp, "idx": idx}) 207 | # residual add, will not work with bias add 208 | if node_to_quantize.op == "Add" and (not disableResAdd) and len(node_to_quantize_input) == 2: 209 | node_to_quantize_input = [node_to_quantize_input[0]] 210 | else: # Constant 211 | if ( 212 | len(node_to_quantize_weight) == 0 213 | and node_to_quantize.op not in ["Add", "BatchNormalization"] 214 | and len(inp.shape) > 1 215 | ): 216 | # 1) Only quantize the Weight, not Bias 217 | # 2) Do not quantize bias matrix in BiasAdd ops 218 | # 3) Only save weight matrices with shape > 1 (Conv 4D, MatMul 2D) 219 | node_to_quantize_weight.append({"x": inp, "idx": idx}) 220 | 221 | # for bias add after matmul 222 | elif( 223 | len(node_to_quantize_weight) == 0 224 | and node_to_quantize.op =="Add" 225 | and isinstance(node_to_quantize.inputs[0], gs.Constant)): 226 | node_to_quantize_weight.append({"x": inp, "idx": idx}) 227 | 228 | 229 | return node_to_quantize_input, node_to_quantize_weight 230 | 231 | def quantize_node_automatically(graph, node_to_quantize, scale, disableResAdd:bool): 232 | """ 233 | Quantizes a node according to information in graph.json (generated from the PTQ engine building step. 
234 | 
235 | :return:
236 | """
237 | node_to_quantize_input, node_to_quantize_weight = get_node_to_quantize_infos(node_to_quantize, disableResAdd)
238 | 
239 | # Quantize inputs
240 | input_was_quantized = False
241 | # Quantizable layer
242 | for i, node_inp in enumerate(node_to_quantize_input):
243 | graph = quantize_input(graph, node_to_quantize, node_inp, scale, name_suffix=str(i))
244 | input_was_quantized = True
245 | 
246 | # Quantize weights
247 | for i, node_weight in enumerate(node_to_quantize_weight):
248 | if input_was_quantized:
249 | graph = quantize_weight(
250 | graph,
251 | node_to_quantize,
252 | node_weight,
253 | axis=1 if node_to_quantize.op in ["MatMul", "ConvTranspose"] else 0, # TODO: automate axis detection by checking the expected layer output and extracting the axis that matches the desired dimension.
254 | name_suffix=str(i)
255 | )
256 | return graph
257 | 
258 | def quantize_tensor_automatically(graph, tensor_to_quantize, scale):
259 | """
260 | Quantizes a tensor
261 | 
262 | :return:
263 | """
264 | tensor_to_quantize = [{'x':tensor_to_quantize},]
265 | # Quantizable tensor
266 | for i, tensor_inp in enumerate(tensor_to_quantize):
267 | graph = quantize_tensor(graph, tensor_inp, scale, name_suffix=str(i))
268 | return graph
269 | 
270 | 
271 | def quant_one_node(graph, node_name, scale=0.04370, disableResAdd:bool = False):
272 | nodes = graph.nodes
273 | node_to_quantize = [x for x in nodes if x.name == node_name]
274 | if len(node_to_quantize) == 0:
275 | logging.warning(f"node {node_name} not found, skipping"); return graph
276 | if len(node_to_quantize) > 1:
277 | logging.error(f"found multiple nodes named {node_name}; using the first match")
278 | node_to_quantize = node_to_quantize[0]
279 | graph = quantize_node_automatically(graph, node_to_quantize, scale, disableResAdd)
280 | return graph
281 | 
282 | def quant_one_tensor(graph, tensor_name, scale=0.04370):
283 | # nodes = graph.nodes
284 | tensors = graph.tensors()
285 | tensor_to_quantize = [tensor for name, tensor in tensors.items() if tensor.name == tensor_name]
286 | if len(tensor_to_quantize) == 0:
287 | logging.warning(f"tensor {tensor_name} not found, skipping"); return graph
288 | if len(tensor_to_quantize) > 1:
289 | logging.error(f"found multiple tensors named {tensor_name}; using the first match")
290 | 
291 | tensor_to_quantize = tensor_to_quantize[0]
292 | graph = quantize_tensor_automatically(graph, tensor_to_quantize, scale)
293 | return graph
294 | 
295 | def quant_node_of_list(graph, op_name_list:list, disableResAdd:bool):
296 | for op in op_name_list:
297 | graph = quant_one_node(graph, op, disableResAdd=disableResAdd)
298 | ##TODO: support a per-node scale, e.g. an entry like "Conv1:0.03"
299 | return graph
300 | 
301 | def quant_tensor_of_list(graph, tensor_name_list:list):
302 | for tensor in tensor_name_list:
303 | graph = quant_one_tensor(graph, tensor)
304 | return graph
305 | 
306 | # def quant_all_nodes_of_type():
307 | # return None
308 | 
309 | def quant_onnx(model_path, output_model_path, nodes_name_to_quant, tensors_name_to_quant, disableResAdd:bool):
310 | model = onnx.load(model_path)
311 | model = onnx.shape_inference.infer_shapes(model)
312 | graph = gs.import_onnx(model)
313 | graph = quant_node_of_list(graph, nodes_name_to_quant, disableResAdd)
314 | graph = quant_tensor_of_list(graph, tensors_name_to_quant)
315 | graph.cleanup()
316 | new_model = gs.export_onnx(graph)
317 | onnx.save(new_model, output_model_path)
318 | 
319 | if __name__ == "__main__":
320 | parser = argparse.ArgumentParser(description='insert Q/DQ nodes for the given nodes/tensors of an ONNX model and write the quantized
output') 321 | parser.add_argument('--model', default='model.onnx', type=str, help='the onnx model') 322 | parser.add_argument('--output_model', default='', type=str, help='the output model') 323 | parser.add_argument('--nodes', nargs='+', type=str, help='the input nodes list you want to quant',default=[]) 324 | parser.add_argument('--disableResAdd', action='store_true', help='if enabled this flag, residual add will have two inputs') 325 | 326 | parser.add_argument('--tensors', nargs='+', type=str, help='the tensors list you want to quant',default=[]) 327 | 328 | args = parser.parse_args() 329 | print(args) 330 | quant_onnx(args.model, args.output_model, args.nodes, args.tensors, args.disableResAdd) 331 | -------------------------------------------------------------------------------- /yolov7_qat/scripts/trt-int8.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: MIT 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a 6 | # copy of this software and associated documentation files (the "Software"), 7 | # to deal in the Software without restriction, including without limitation 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | # and/or sell copies of the Software, and to permit persons to whom the 10 | # Software is furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | # DEALINGS IN THE SOFTWARE. 
22 | ################################################################################ 23 | import tensorrt as trt 24 | import pycuda.driver as cuda 25 | import pycuda.autoinit 26 | 27 | import numpy as np 28 | import random 29 | import cv2 30 | 31 | # For ../common.py 32 | import sys, os 33 | TRT_LOGGER = trt.Logger() 34 | 35 | 36 | def load_yolov7_coco_image(cocodir, topn = None): 37 | 38 | files = os.listdir(cocodir) 39 | files = [file for file in files if file.endswith(".jpg")] 40 | 41 | if topn is not None: 42 | np.random.seed(31) 43 | np.random.shuffle(files) 44 | files = files[:topn] 45 | 46 | datas = [] 47 | 48 | # dataloader is setup pad=0.5 49 | for i, file in enumerate(files): 50 | if i == 0: continue 51 | if (i + 1) % 200 == 0: 52 | print(f"Load {i + 1} / {len(files)} ...") 53 | 54 | img = cv2.imread(os.path.join(cocodir, file)) 55 | from_ = img.shape[1], img.shape[0] 56 | to_ = 640, 640 57 | scale = min(to_[0] / from_[0], to_[1] / from_[1]) 58 | 59 | # low accuracy 60 | # M = np.array([ 61 | # [scale, 0, 16], 62 | # [0, scale, 16], # same to pytorch 63 | # ]) 64 | 65 | # more accuracy 66 | M = np.array([ 67 | [scale, 0, -scale * from_[0] * 0.5 + to_[0] * 0.5 + scale * 0.5 - 0.5 + 16], 68 | [0, scale, -scale * from_[1] * 0.5 + to_[1] * 0.5 + scale * 0.5 - 0.5 + 16], # same to pytorch 69 | ]) 70 | input = cv2.warpAffine(img, M, (672, 672), borderValue=(114, 114, 114)) 71 | input = input[..., ::-1].transpose(2, 0, 1)[None] # BGR->RGB, HWC->CHW, CHW->1CHW 72 | input = (input / 255.0).astype(np.float32) 73 | datas.append(input) 74 | 75 | return np.concatenate(datas, axis=0) 76 | 77 | 78 | class MNISTEntropyCalibrator(trt.IInt8EntropyCalibrator2): 79 | def __init__(self, training_data, cache_file, batch_size=64): 80 | # Whenever you specify a custom constructor for a TensorRT class, 81 | # you MUST call the constructor of the parent explicitly. 82 | trt.IInt8EntropyCalibrator2.__init__(self) 83 | 84 | self.cache_file = cache_file 85 | self.batch_size = batch_size 86 | self.current_index = 0 87 | 88 | # Every time get_batch is called, the next batch of size batch_size will be copied to the device and returned. 89 | if not os.path.exists(cache_file): 90 | 91 | # Allocate enough memory for a whole batch. 92 | self.data = load_yolov7_coco_image(training_data, 1000) 93 | self.device_input = cuda.mem_alloc(self.data[0].nbytes * self.batch_size) 94 | 95 | def get_batch_size(self): 96 | return self.batch_size 97 | 98 | # TensorRT passes along the names of the engine bindings to the get_batch function. 99 | # You don't necessarily have to use them, but they can be useful to understand the order of 100 | # the inputs. The bindings list is expected to have the same ordering as 'names'. 101 | def get_batch(self, names): 102 | if self.current_index + self.batch_size > self.data.shape[0]: 103 | return None 104 | 105 | current_batch = int(self.current_index / self.batch_size) 106 | if current_batch % 10 == 0: 107 | print("Calibrating batch {:}, containing {:} images".format(current_batch, self.batch_size)) 108 | 109 | batch = self.data[self.current_index : self.current_index + self.batch_size].ravel() 110 | cuda.memcpy_htod(self.device_input, batch) 111 | self.current_index += self.batch_size 112 | return [self.device_input] 113 | 114 | def read_calibration_cache(self): 115 | # If there is a cache, use it instead of calibrating again. Otherwise, implicitly return None. 
116 | if os.path.exists(self.cache_file): 117 | with open(self.cache_file, "rb") as f: 118 | return f.read() 119 | 120 | def write_calibration_cache(self, cache): 121 | with open(self.cache_file, "wb") as f: 122 | f.write(cache) 123 | 124 | 125 | def build_int8_engine(onnx_file, calib, batch_size=32): 126 | with trt.Builder( 127 | TRT_LOGGER 128 | ) as builder, builder.create_network(1) as network, builder.create_builder_config() as config: 129 | # We set the builder batch size to be the same as the calibrator's, as we use the same batches 130 | # during inference. Note that this is not required in general, and inference batch size is 131 | # independent of calibration batch size. 132 | builder.max_batch_size = batch_size 133 | config.max_workspace_size = 1024 * 1024 * 1024 # 1024 MB 134 | config.set_flag(trt.BuilderFlag.INT8) 135 | config.int8_calibrator = calib 136 | with trt.OnnxParser(network, TRT_LOGGER) as parser: 137 | parser.parse_from_file(onnx_file) 138 | # network.mark_output(model_tensors.find(ModelData.OUTPUT_NAME)) 139 | # Build engine and do int8 calibration. 140 | plan = builder.build_serialized_network(network, config) 141 | return bytes(plan) 142 | 143 | 144 | def replace_suffix(file, new_suffix): 145 | r = file.rfind(".") 146 | return f"{file[:r]}{new_suffix}" 147 | 148 | 149 | def main(): 150 | # Now we create a calibrator and give it the location of our calibration data. 151 | # We also allow it to cache calibration data for faster engine building. 152 | onnxfile = "yolov7.onnx" 153 | calibration_cache = replace_suffix(onnxfile, ".cache") 154 | engine_file = replace_suffix(onnxfile, ".engine") 155 | calib = MNISTEntropyCalibrator("/datav/dataset/coco/images/train2017/", cache_file=calibration_cache) 156 | 157 | # Inference batch size can be different from calibration batch size. 158 | batch_size = 1 159 | engine_data = build_int8_engine(onnxfile, calib, batch_size) 160 | 161 | with open(engine_file, "wb") as f: 162 | f.write(engine_data) 163 | 164 | 165 | if __name__ == "__main__": 166 | main() 167 | --------------------------------------------------------------------------------
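Usage sketch (illustrative only, not taken from the repository docs): combining the scripts above, an end-to-end QAT workflow might look like the following, where yolov7.pt and /path/to/coco are placeholders for your weight file and COCO root, and commands are run from the YOLOv7 working directory that contains scripts/.

# PTQ calibration + QAT fine-tuning; saves ptq.pt / qat.pt and a summary.json with per-stage mAP
python scripts/qat.py quantize yolov7.pt --cocodir=/path/to/coco --ptq=ptq.pt --qat=qat.pt --eval-origin --eval-ptq
# Optional: per-layer sensitivity analysis of the PTQ model
python scripts/qat.py sensitive yolov7.pt --cocodir=/path/to/coco --num-image=500
# Export the quantized checkpoint to ONNX (672x672, dynamic batch), build an INT8 engine with trtexec,
# draw the engine graph, and evaluate the engine, all via the helper script above
bash scripts/eval-trt.sh qat.pt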