├── CLA.md
├── LICENSE.md
├── README.md
├── deepstream_yolo
    ├── README.md
    ├── config_infer_primary_yoloV4.txt
    ├── config_infer_primary_yoloV7.txt
    ├── deepstream_app_config_yolo.txt
    ├── labels.txt
    └── nvdsinfer_custom_impl_Yolo
    │   ├── Makefile
    │   ├── nvdsparsebbox_Yolo.cpp
    │   └── nvdsparsebbox_Yolo_cuda.cu
├── tensorrt_yolov4
    ├── Makefile
    ├── Makefile.config
    ├── README.md
    ├── data
    │   ├── demo.jpg
    │   └── demo_out.jpg
    └── source
    │   ├── Makefile
    │   ├── SampleYolo.cpp
    │   ├── SampleYolo.hpp
    │   ├── generate_coco_image_list.py
    │   ├── main.cpp
    │   └── onnx_add_nms_plugin.py
├── tensorrt_yolov7
    ├── CMakeLists.txt
    ├── README.md
    ├── imgs
    │   ├── horses.jpg
    │   └── zidane.jpg
    ├── samples
    │   ├── detect.cpp
    │   ├── validate_coco.cpp
    │   └── video_detect.cpp
    ├── src
    │   ├── Yolov7.cpp
    │   ├── Yolov7.h
    │   ├── argsParser.cpp
    │   ├── argsParser.h
    │   └── tools.h
    └── test_coco_map.py
└── yolov7_qat
    ├── README.md
    ├── doc
        ├── Guidance_of_QAT_performance_optimization.md
        └── imgs
        │   ├── QATConv.png
        │   ├── QATFlow.png
        │   ├── int8_q_recommended_procedure.png
        │   ├── monkey-patch-qat-conv-fp16-issue_ptq.png
        │   ├── monkey-patch-qat-conv-fp16-issue_ptqonnx.png
        │   ├── monkey-patch-qat-conv-fp16-issue_qat.png
        │   ├── monkey-patch-qat-conv-fp16-issue_qatonnx.png
        │   ├── monkey-patch-qat-conv-fp16-issue_qatonnx_edit.png
        │   └── monkey-patch-qat-maxpooling-qat.png
    ├── quantization
        ├── quantize.py
        └── rules.py
    └── scripts
        ├── detect-trt.py
        ├── draw-engine.py
        ├── eval-trt.py
        ├── eval-trt.sh
        ├── qat-yolov5.py
        ├── qat.py
        ├── quantize_utils.py
        └── trt-int8.py


/CLA.md:
--------------------------------------------------------------------------------
 1 | ## Individual Contributor License Agreement (CLA)
 2 | 
 3 | **Thank you for submitting your contributions to this project.**
 4 | 
 5 | By signing this CLA, you agree that the following terms apply to all of your past, present and future contributions
 6 | to the project.
 7 | 
 8 | ### License.
 9 | 
10 | You hereby represent that all present, past and future contributions are governed by the
11 | [MIT License](https://opensource.org/licenses/MIT)
12 | copyright statement.
13 | 
14 | This entails that to the extent possible under law, you transfer all copyright and related or neighboring rights
15 | of the code or documents you contribute to the project itself or its maintainers.
16 | Furthermore you also represent that you have the authority to perform the above waiver
17 | with respect to the entirety of you contributions.
18 | 
19 | ### Moral Rights.
20 | 
21 | To the fullest extent permitted under applicable law, you hereby waive, and agree not to
22 | assert, all of your “moral rights” in or relating to your contributions for the benefit of the project.
23 | 
24 | ### Third Party Content.
25 | 
26 | If your Contribution includes or is based on any source code, object code, bug fixes, configuration changes, tools,
27 | specifications, documentation, data, materials, feedback, information or other works of authorship that were not
28 | authored by you (“Third Party Content”) or if you are aware of any third party intellectual property or proprietary
29 | rights associated with your Contribution (“Third Party Rights”),
30 | then you agree to include with the submission of your Contribution full details respecting such Third Party
31 | Content and Third Party Rights, including, without limitation, identification of which aspects of your
32 | Contribution contain Third Party Content or are associated with Third Party Rights, the owner/author of the
33 | Third Party Content and Third Party Rights, where you obtained the Third Party Content, and any applicable
34 | third party license terms or restrictions respecting the Third Party Content and Third Party Rights. For greater
35 | certainty, the foregoing obligations respecting the identification of Third Party Content and Third Party Rights
36 | do not apply to any portion of a Project that is incorporated into your Contribution to that same Project.
37 | 
38 | ### Representations.
39 | 
40 | You represent that, other than the Third Party Content and Third Party Rights identified by
41 | you in accordance with this Agreement, you are the sole author of your Contributions and are legally entitled
42 | to grant the foregoing licenses and waivers in respect of your Contributions. If your Contributions were
43 | created in the course of your employment with your past or present employer(s), you represent that such
44 | employer(s) has authorized you to make your Contributions on behalf of such employer(s) or such employer
45 | (s) has waived all of their right, title or interest in or to your Contributions.
46 | 
47 | ### Disclaimer.
48 | 
49 | To the fullest extent permitted under applicable law, your Contributions are provided on an "as is"
50 | basis, without any warranties or conditions, express or implied, including, without limitation, any implied
51 | warranties or conditions of non-infringement, merchantability or fitness for a particular purpose. You are not
52 | required to provide support for your Contributions, except to the extent you desire to provide support.
53 | 
54 | ### No Obligation.
55 | 
56 | You acknowledge that the maintainers of this project are under no obligation to use or incorporate your contributions
57 | into the project. The decision to use or incorporate your contributions into the project will be made at the
58 | sole discretion of the maintainers or their authorized delegates.
59 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "[]"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright [yyyy] [name of copyright owner]
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.
203 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Yolo DeepStream
  2 | 
  3 | ##  Description
  4 | 
  5 | This repo have 4 parts:
  6 | ### 1) yolov7_qat
  7 | In [yolov7_qat](yolov7_qat), We use [TensorRT's pytorch quntization tool](https://github.com/NVIDIA/TensorRT/tree/main/tools/pytorch-quantization) to Finetune training QAT yolov7 from the pre-trained weight. 
  8 | Finally we get the same performance of PTQ in TensorRT on Jetson OrinX. And the accuracy(mAP) of the model only dropped a little.
  9 | 
 10 | ### 2) tensorrt_yolov7
 11 | In [tensorrt_yolov7](tensorrt_yolov7), We provide a standalone c++ yolov7-app sample here. You can use trtexec to convert FP32 onnx models or QAT-int8 models exported from repo [yolov7_qat](yolov7_qat) to trt-engines. And set the trt-engine as yolov7-app's input. It can do detections on images/videos. Or test mAP on COCO dataset.
 12 | 
 13 | ### 3) deepstream_yolo
 14 | In [deepstream_yolo](deepstream_yolo), This sample shows how to integrate YOLO models with customized output layer parsing for detected objects with DeepStreamSDK.
 15 | 
 16 | ### 4) tensorrt_yolov4
 17 | In [tensorrt_yolov4](tensorrt_yolov4), This sample shows a standalone tensorrt-sample for yolov4.
 18 | 
 19 | ## Performance
 20 | For YoloV7 sample:
 21 | 
 22 | Below table shows the end-to-end performance of processing 1080p videos with this sample application.
 23 | - Testing Device : 
 24 | 
 25 |   1. Jetson AGX Orin 64GB(PowerMode:MAXN + GPU-freq:1.3GHz + CPU:12-core-2.2GHz)
 26 | 
 27 |   2. Tesla T4
 28 | 
 29 | |Device      |precision      |Number <br>of streams | Batch Size | trtexec FPS| deepstream-app FPS<br>with cuda-post-process |deepstream-app FPS<br> with cpu-post-process|
 30 | |-----------    |-----------    |----------------- | -----------|----------- |-----------|-----------|
 31 | |  Orin-X|  FP16         |  1               |     1      |       126  | 124       |   120     |
 32 | |  Orin-X|  FP16         |  16              |    16      |       162  | 145       |   135     |
 33 | |  Orin-X|  Int8(PTQ/QAT)|  1               |     1      |       180  | 175       |   128      |
 34 | |  Orin-X|  Int8(PTQ/QAT)|  16              |    16      |       264  | 264       |   135      |
 35 | |  T4    |  FP16         |  1               |     1      |      132   |    125    |  123      |
 36 | |  T4    |  FP16         |  16              |    16      |      169   |   169     |   123     |
 37 | |  T4    |  Int8(PTQ/QAT)|  1               |     1      |     208    |   170     |    127    |
 38 | |  T4    |  Int8(PTQ/QAT)|  16              |    16      |     305    |  300      |   132      |
 39 | 
 40 | 
 41 | - note: trtexec cudaGraph not enabled as deepstream not support cudaGraph
 42 | 
 43 | ## Code structure
 44 | ```bash
 45 | ├── deepstream_yolo
 46 | │   ├── config_infer_primary_yoloV4.txt # config file for yolov4 model
 47 | │   ├── config_infer_primary_yoloV7.txt # config file for yolov7 model
 48 | │   ├── deepstream_app_config_yolo.txt # deepStream reference app configuration file for using YOLOv models as the primary detector.
 49 | │   ├── labels.txt # labels for coco detection # output layer parsing function for detected objects for the Yolo model.
 50 | │   ├── nvdsinfer_custom_impl_Yolo 
 51 | │   │   ├── Makefile
 52 | │   │   └── nvdsparsebbox_Yolo.cpp 
 53 | │   └── README.md 
 54 | ├── README.md
 55 | ├── tensorrt_yolov4
 56 | │   ├── data 
 57 | │   │   ├── demo.jpg # the demo image
 58 | │   │   └── demo_out.jpg # image detection output of the demo image
 59 | │   ├── Makefile
 60 | │   ├── Makefile.config
 61 | │   ├── README.md
 62 | │   └── source
 63 | │       ├── generate_coco_image_list.py # python script to get list of image names from MS COCO annotation or information file
 64 | │       ├── main.cpp # program main entrance where parameters are configured here
 65 | │       ├── Makefile
 66 | │       ├── onnx_add_nms_plugin.py # python script to add BatchedNMSPlugin node into ONNX model
 67 | │       ├── SampleYolo.cpp # yolov4 inference class functions definition file
 68 | │       └── SampleYolo.hpp # yolov4 inference class definition file
 69 | ├── tensorrt_yolov7
 70 | │   ├── CMakeLists.txt
 71 | │   ├── imgs # the demo images
 72 | │   │   ├── horses.jpg 
 73 | │   │   └── zidane.jpg
 74 | │   ├── README.md
 75 | │   ├── samples 
 76 | │   │   ├── detect.cpp # detection app for images detection
 77 | │   │   ├── validate_coco.cpp # validate coco dataset app
 78 | │   │   └── video_detect.cpp # detection app for video detection
 79 | │   ├── src
 80 | │   │   ├── argsParser.cpp # argsParser helper class for commandline parsing
 81 | │   │   ├── argsParser.h # argsParser helper class for commandline parsing
 82 | │   │   ├── tools.h # helper function for yolov7 class
 83 | │   │   ├── Yolov7.cpp # Class Yolov7
 84 | │   │   └── Yolov7.h # Class Yolov7
 85 | │   └── test_coco_map.py # tool for test coco map with json file
 86 | └── yolov7_qat
 87 |     ├── doc
 88 |     │   ├── Guidance_of_QAT_performance_optimization.md # guidance for Q&DQ insert and placement for pytorch-quantization tool
 89 |     ├── quantization
 90 |     │   ├── quantize.py # helper class for quantize yolov7 model
 91 |     │   └── rules.py # rules for Q&DQ nodes insert and restrictions
 92 |     ├── README.md 
 93 |     └── scripts
 94 |         ├── detect-trt.py # detect a image with tensorrt engine
 95 |         ├── draw-engine.py # draw tensorrt engine to graph
 96 |         ├── eval-trt.py # the script for evalating tensorrt mAP
 97 |         ├── eval-trt.sh # the command lne script for evaluating tensorrt mAP
 98 |         ├── qat.py # main function for QAT and PTQ
 99 |         └── trt-int8.py # tensorrt build-in calibration
100 | ```
101 | 


--------------------------------------------------------------------------------
/deepstream_yolo/README.md:
--------------------------------------------------------------------------------
 1 | # Deploy YOLO Models With DeepStream #
 2 | 
 3 | **This sample shows how to integrate YOLO models with customized output layer parsing for detected objects with DeepStreamSDK.**
 4 | 
 5 | ## 1. Sample contents: ##
 6 | - `deepstream_app_config_yolo.txt`: DeepStream reference app configuration file for using YOLO models as the primary detector.
 7 | - `config_infer_primary_yoloV4.txt`: Configuration file for the GStreamer nvinfer plugin for the YoloV4 detector model.
 8 | - `config_infer_primary_yoloV7.txt`: Configuration file for the GStreamer nvinfer plugin for the YoloV7 detector model.
 9 | - `nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp`: Output layer parsing function for detected objects for the Yolo models.
10 | 
11 | ## 2. Pre-requisites: ##
12 | 
13 | ### 2.1 Please make sure DeepStream 6.1.1+ is properly installed ###
14 | 
15 | ### 2.2 Generate Model ###
16 | #### YoloV4 
17 | 
18 | - Go to this pytorch repository <https://github.com/Tianxiaomo/pytorch-YOLOv4> where you can convert YOLOv4 Pytorch model into **ONNX**
19 | - Other famous YOLOv4 pytorch repositories as references:
20 |   - <https://github.com/WongKinYiu/PyTorch_YOLOv4>
21 |   - <https://github.com/bubbliiiing/yolov4-pytorch>
22 |   - <https://github.com/maudzung/Complex-YOLOv4-Pytorch>
23 |   - <https://github.com/AllanYiin/YoloV4>
24 | - Or you can download reference ONNX model directly from here ([link](https://drive.google.com/file/d/1tp1xzeey4YBSd8nGd-dkn8Ymii9ordEj/view?usp=sharing)).  
25 | 
26 | #### YOLOv7
27 | following the guide https://github.com/WongKinYiu/yolov7#export, export a dynamic-batch-1-output onnx-model
28 | ```bash
29 | $ python export.py --weights ./yolov7.pt --grid --simplify --topk-all 100 --iou-thres 0.65 --conf-thres 0.35 --img-size 640 640 --dynamic-batch
30 | ```
31 | or using the qat model exported from [yolov7_qat](../yolov7_qat)
32 | ## 3. Download and Run ##
33 | 
34 | ```sh
35 |   $ cd ~/
36 |   $ git clone https://github.com/NVIDIA-AI-IOT/yolo_deepstream.git
37 |   $ cd ~/yolo_deepstream/deepstream_yolo/nvdsinfer_custom_impl_Yolo
38 |   $ make
39 |   $ cd ..
40 | ```
41 |   Make sure the model exists under ~/yolo_deepstream/deepstream_yolo/. Change the "config-file" parameter in the "deepstream_app_config_yolo.txt" configuration file to the nvinfer configuration file for the model you want to run with. 
42 | |Model|Nvinfer Configuration File|
43 | |-----------|----------|
44 | |YoloV4|config_infer_primary_yoloV4.txt|
45 | |YoloV7|config_infer_primary_yoloV7.txt|
46 | 
47 | ```  
48 |   $ deepstream-app -c deepstream_app_config_yolo.txt
49 | ```
50 | ## 4. CUDA Post Processing
51 | 
52 | this sample provide two ways of yolov7 post-processing(decoce yolo result, not include NMS), CPU version and GPU version
53 | - CPU implement can be found in: [nvdsparsebbox_Yolo.cpp](deepstream_yolo/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp)
54 | - CUDA implement can be found in: [nvdsparsebbox_Yolo_cuda.cu](deepstream_yolo/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo_cuda.cu)
55 | 
56 | Default will use CUDA-post processing. To enable CPU post-processing:
57 | in [config_infer_primary_yoloV7.txt](deepstream_yolo/config_infer_primary_yoloV7.txt)
58 | 
59 | - `parse-bbox-func-name=NvDsInferParseCustomYoloV7_cuda` -> `parse-bbox-func-name=NvDsInferParseCustomYoloV7`
60 | - `disable-output-host-copy=1` -> `disable-output-host-copy=0`
61 | 
62 | The performance of the CPU-post-processing and CUDA-post-processing result can be found in [Performance](https://github.com/NVIDIA-AI-IOT/yolo_deepstream#performance)
63 | 
64 | 


--------------------------------------------------------------------------------
/deepstream_yolo/config_infer_primary_yoloV4.txt:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 | # SPDX-License-Identifier: MIT
 4 | #
 5 | # Permission is hereby granted, free of charge, to any person obtaining a
 6 | # copy of this software and associated documentation files (the "Software"),
 7 | # to deal in the Software without restriction, including without limitation
 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 | # and/or sell copies of the Software, and to permit persons to whom the
10 | # Software is furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in
13 | # all copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 | # DEALINGS IN THE SOFTWARE.
22 | ################################################################################
23 | 
24 | # Following properties are mandatory when engine files are not specified:
25 | #   int8-calib-file(Only in INT8), model-file-format
26 | #   Caffemodel mandatory properties: model-file, proto-file, output-blob-names
27 | #   UFF: uff-file, input-dims, uff-input-blob-name, output-blob-names
28 | #   ONNX: onnx-file
29 | #
30 | # Mandatory properties for detectors:
31 | #   num-detected-classes
32 | #
33 | # Optional properties for detectors:
34 | #   cluster-mode(Default=Group Rectangles), interval(Primary mode only, Default=0)
35 | #   custom-lib-path
36 | #   parse-bbox-func-name
37 | #
38 | # Mandatory properties for classifiers:
39 | #   classifier-threshold, is-classifier
40 | #
41 | # Optional properties for classifiers:
42 | #   classifier-async-mode(Secondary mode only, Default=false)
43 | #
44 | # Optional properties in secondary mode:
45 | #   operate-on-gie-id(Default=0), operate-on-class-ids(Defaults to all classes),
46 | #   input-object-min-width, input-object-min-height, input-object-max-width,
47 | #   input-object-max-height
48 | #
49 | # Following properties are always recommended:
50 | #   batch-size(Default=1)
51 | #
52 | # Other optional properties:
53 | #   net-scale-factor(Default=1), network-mode(Default=0 i.e FP32),
54 | #   model-color-format(Default=0 i.e. RGB) model-engine-file, labelfile-path,
55 | #   mean-file, gie-unique-id(Default=0), offsets, process-mode (Default=1 i.e. primary),
56 | #   custom-lib-path, network-mode(Default=0 i.e FP32)
57 | #
58 | # The values in the config file are overridden by values set through GObject
59 | # properties.
60 | 
61 | [property]
62 | gpu-id=0
63 | net-scale-factor=0.0039215697906911373
64 | #0=RGB, 1=BGR
65 | model-color-format=0
66 | onnx-file=yolov4_-1_3_416_416_nms_dynamic.onnx
67 | model-engine-file=yolov4_-1_3_416_416_nms_dynamic.onnx_b16_gpu0_fp16.engine
68 | labelfile-path=labels.txt
69 | batch-size=16
70 | ## 0=FP32, 1=INT8, 2=FP16 mode
71 | network-mode=2
72 | num-detected-classes=80
73 | gie-unique-id=1
74 | network-type=0
75 | is-classifier=0
76 | ## 0=Group Rectangles, 1=DBSCAN, 2=NMS, 3= DBSCAN+NMS Hybrid, 4 = None(No clustering)
77 | cluster-mode=2
78 | maintain-aspect-ratio=1
79 | parse-bbox-func-name=NvDsInferParseCustomYoloV4
80 | custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
81 | #scaling-filter=0
82 | #scaling-compute-hw=0
83 | 
84 | [class-attrs-all]
85 | nms-iou-threshold=0.6
86 | pre-cluster-threshold=0.4
87 | 


--------------------------------------------------------------------------------
/deepstream_yolo/config_infer_primary_yoloV7.txt:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 | # SPDX-License-Identifier: MIT
 4 | #
 5 | # Permission is hereby granted, free of charge, to any person obtaining a
 6 | # copy of this software and associated documentation files (the "Software"),
 7 | # to deal in the Software without restriction, including without limitation
 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 | # and/or sell copies of the Software, and to permit persons to whom the
10 | # Software is furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in
13 | # all copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 | # DEALINGS IN THE SOFTWARE.
22 | ################################################################################
23 | 
24 | # Following properties are mandatory when engine files are not specified:
25 | #   int8-calib-file(Only in INT8), model-file-format
26 | #   Caffemodel mandatory properties: model-file, proto-file, output-blob-names
27 | #   UFF: uff-file, input-dims, uff-input-blob-name, output-blob-names
28 | #   ONNX: onnx-file
29 | #
30 | # Mandatory properties for detectors:
31 | #   num-detected-classes
32 | #
33 | # Optional properties for detectors:
34 | #   cluster-mode(Default=Group Rectangles), interval(Primary mode only, Default=0)
35 | #   custom-lib-path
36 | #   parse-bbox-func-name
37 | #
38 | # Mandatory properties for classifiers:
39 | #   classifier-threshold, is-classifier
40 | #
41 | # Optional properties for classifiers:
42 | #   classifier-async-mode(Secondary mode only, Default=false)
43 | #
44 | # Optional properties in secondary mode:
45 | #   operate-on-gie-id(Default=0), operate-on-class-ids(Defaults to all classes),
46 | #   input-object-min-width, input-object-min-height, input-object-max-width,
47 | #   input-object-max-height
48 | #
49 | # Following properties are always recommended:
50 | #   batch-size(Default=1)
51 | #
52 | # Other optional properties:
53 | #   net-scale-factor(Default=1), network-mode(Default=0 i.e FP32),
54 | #   model-color-format(Default=0 i.e. RGB) model-engine-file, labelfile-path,
55 | #   mean-file, gie-unique-id(Default=0), offsets, process-mode (Default=1 i.e. primary),
56 | #   custom-lib-path, network-mode(Default=0 i.e FP32)
57 | #
58 | # The values in the config file are overridden by values set through GObject
59 | # properties.
60 | 
61 | [property]
62 | gpu-id=0
63 | net-scale-factor=0.0039215697906911373
64 | #0=RGB, 1=BGR
65 | model-color-format=0
66 | onnx-file=yolov7.onnx
67 | labelfile-path=labels.txt
68 | ## 0=FP32, 1=INT8, 2=FP16 mode
69 | network-mode=2
70 | num-detected-classes=80
71 | gie-unique-id=1
72 | network-type=0
73 | is-classifier=0
74 | ## 1=DBSCAN, 2=NMS, 3= DBSCAN+NMS Hybrid, 4 = None(No clustering)
75 | cluster-mode=2
76 | maintain-aspect-ratio=1
77 | symmetric-padding=1
78 | ## Bilinear Interpolation
79 | scaling-filter=1
80 | #parse-bbox-func-name=NvDsInferParseCustomYoloV7
81 | parse-bbox-func-name=NvDsInferParseCustomYoloV7_cuda
82 | #disable-output-host-copy=0
83 | disable-output-host-copy=1
84 | custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
85 | #scaling-compute-hw=0
86 | ## start from DS6.2
87 | crop-objects-to-roi-boundary=1
88 | 
89 | 
90 | [class-attrs-all]
91 | #nms-iou-threshold=0.3
92 | #threshold=0.7
93 | nms-iou-threshold=0.65
94 | pre-cluster-threshold=0.25
95 | topk=300
96 | 
97 | 


--------------------------------------------------------------------------------
/deepstream_yolo/deepstream_app_config_yolo.txt:
--------------------------------------------------------------------------------
  1 | ################################################################################
  2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  3 | # SPDX-License-Identifier: MIT
  4 | #
  5 | # Permission is hereby granted, free of charge, to any person obtaining a
  6 | # copy of this software and associated documentation files (the "Software"),
  7 | # to deal in the Software without restriction, including without limitation
  8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
  9 | # and/or sell copies of the Software, and to permit persons to whom the
 10 | # Software is furnished to do so, subject to the following conditions:
 11 | #
 12 | # The above copyright notice and this permission notice shall be included in
 13 | # all copies or substantial portions of the Software.
 14 | #
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 21 | # DEALINGS IN THE SOFTWARE.
 22 | ################################################################################
 23 | 
 24 | [application]
 25 | enable-perf-measurement=1
 26 | perf-measurement-interval-sec=5
 27 | #gie-kitti-output-dir=streamscl
 28 | 
 29 | [tiled-display]
 30 | enable=0
 31 | rows=4
 32 | columns=4
 33 | width=1280
 34 | height=720
 35 | gpu-id=0
 36 | #(0): nvbuf-mem-default - Default memory allocated, specific to particular platform
 37 | #(1): nvbuf-mem-cuda-pinned - Allocate Pinned/Host cuda memory, applicable for Tesla
 38 | #(2): nvbuf-mem-cuda-device - Allocate Device cuda memory, applicable for Tesla
 39 | #(3): nvbuf-mem-cuda-unified - Allocate Unified cuda memory, applicable for Tesla
 40 | #(4): nvbuf-mem-surface-array - Allocate Surface Array memory, applicable for Jetson
 41 | nvbuf-memory-type=0
 42 | 
 43 | [source0]
 44 | enable=1
 45 | #Type - 1=CameraV4L2 2=URI 3=MultiURI
 46 | type=3
 47 | uri=file:/opt/nvidia/deepstream/deepstream/samples/streams/sample_1080p_h264.mp4
 48 | num-sources=16
 49 | gpu-id=0
 50 | # (0): memtype_device   - Memory type Device
 51 | # (1): memtype_pinned   - Memory type Host Pinned
 52 | # (2): memtype_unified  - Memory type Unified
 53 | cudadec-memtype=0
 54 | 
 55 | [sink0]
 56 | enable=1
 57 | #Type - 1=FakeSink 2=EglSink 3=File
 58 | type=3
 59 | sync=0
 60 | source-id=0
 61 | gpu-id=0
 62 | nvbuf-memory-type=0
 63 | #1=mp4 2=mkv
 64 | container=1
 65 | #1=h264 2=h265
 66 | codec=1
 67 | output-file=yolov4.mp4
 68 | 
 69 | [osd]
 70 | enable=1
 71 | gpu-id=0
 72 | border-width=1
 73 | text-size=12
 74 | text-color=1;1;1;1;
 75 | text-bg-color=0.3;0.3;0.3;1
 76 | font=Serif
 77 | show-clock=0
 78 | clock-x-offset=800
 79 | clock-y-offset=820
 80 | clock-text-size=12
 81 | clock-color=1;0;0;0
 82 | nvbuf-memory-type=0
 83 | 
 84 | [streammux]
 85 | gpu-id=0
 86 | ##Boolean property to inform muxer that sources are live
 87 | live-source=0
 88 | batch-size=16
 89 | ##time out in usec, to wait after the first buffer is available
 90 | ##to push the batch even if the complete batch is not formed
 91 | batched-push-timeout=40000
 92 | ## Set muxer output width and height
 93 | width=1280
 94 | height=720
 95 | ##Enable to maintain aspect ratio wrt source, and allow black borders, works
 96 | ##along with width, height properties
 97 | enable-padding=0
 98 | nvbuf-memory-type=0
 99 | 
100 | # config-file property is mandatory for any gie section.
101 | # Other properties are optional and if set will override the properties set in
102 | # the infer config file.
103 | [primary-gie]
104 | enable=1
105 | gpu-id=0
106 | labelfile-path=labels.txt
107 | batch-size=16
108 | #Required by the app for OSD, not a plugin property
109 | bbox-border-color0=1;0;0;1
110 | bbox-border-color1=0;1;1;1
111 | bbox-border-color2=0;0;1;1
112 | bbox-border-color3=0;1;0;1
113 | interval=0
114 | gie-unique-id=1
115 | nvbuf-memory-type=0
116 | config-file=config_infer_primary_yoloV4.txt
117 | #config-file=config_infer_primary_yoloV7.txt
118 | 
119 | [tracker]
120 | enable=0
121 | # For NvDCF and DeepSORT tracker, tracker-width and tracker-height must be a multiple of 32, respectively
122 | tracker-width=640
123 | tracker-height=384
124 | ll-lib-file=/opt/nvidia/deepstream/deepstream/lib/libnvds_nvmultiobjecttracker.so
125 | # ll-config-file required to set different tracker types
126 | # ll-config-file=/opt/nvidia/deepstream/deepstream/samples/configs/deepstream-app/config_tracker_IOU.yml
127 | ll-config-file=/opt/nvidia/deepstream/deepstream/samples/configs/deepstream-app/config_tracker_NvDCF_perf.yml
128 | # ll-config-file=/opt/nvidia/deepstream/deepstream/samples/configs/deepstream-app/config_tracker_NvDCF_accuracy.yml
129 | # ll-config-file=/opt/nvidia/deepstream/deepstream/samples/configs/deepstream-app/config_tracker_DeepSORT.yml
130 | gpu-id=0
131 | enable-batch-process=1
132 | enable-past-frame=1
133 | display-tracking-id=1
134 | 
135 | [tests]
136 | file-loop=0
137 | 


--------------------------------------------------------------------------------
/deepstream_yolo/labels.txt:
--------------------------------------------------------------------------------
 1 | person
 2 | bicycle
 3 | car
 4 | motorbike
 5 | aeroplane
 6 | bus
 7 | train
 8 | truck
 9 | boat
10 | traffic light
11 | fire hydrant
12 | stop sign
13 | parking meter
14 | bench
15 | bird
16 | cat
17 | dog
18 | horse
19 | sheep
20 | cow
21 | elephant
22 | bear
23 | zebra
24 | giraffe
25 | backpack
26 | umbrella
27 | handbag
28 | tie
29 | suitcase
30 | frisbee
31 | skis
32 | snowboard
33 | sports ball
34 | kite
35 | baseball bat
36 | baseball glove
37 | skateboard
38 | surfboard
39 | tennis racket
40 | bottle
41 | wine glass
42 | cup
43 | fork
44 | knife
45 | spoon
46 | bowl
47 | banana
48 | apple
49 | sandwich
50 | orange
51 | broccoli
52 | carrot
53 | hot dog
54 | pizza
55 | donut
56 | cake
57 | chair
58 | sofa
59 | pottedplant
60 | bed
61 | diningtable
62 | toilet
63 | tvmonitor
64 | laptop
65 | mouse
66 | remote
67 | keyboard
68 | cell phone
69 | microwave
70 | oven
71 | toaster
72 | sink
73 | refrigerator
74 | book
75 | clock
76 | vase
77 | scissors
78 | teddy bear
79 | hair drier
80 | toothbrush
81 | 


--------------------------------------------------------------------------------
/deepstream_yolo/nvdsinfer_custom_impl_Yolo/Makefile:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 | # SPDX-License-Identifier: MIT
 4 | #
 5 | # Permission is hereby granted, free of charge, to any person obtaining a
 6 | # copy of this software and associated documentation files (the "Software"),
 7 | # to deal in the Software without restriction, including without limitation
 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 | # and/or sell copies of the Software, and to permit persons to whom the
10 | # Software is furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in
13 | # all copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 | # DEALINGS IN THE SOFTWARE.
22 | ################################################################################
23 | 
24 | CC:= g++
25 | NVCC:=/usr/local/cuda/bin/nvcc
26 | 
27 | CFLAGS:= -Wall -std=c++11 -shared -fPIC -Wno-error=deprecated-declarations
28 | CFLAGS+= -I/opt/nvidia/deepstream/deepstream/sources/includes/ -I/usr/local/cuda/include
29 | 
30 | CUFLAGS:= -std=c++14 -shared 
31 | CUFLAGS+= -I/opt/nvidia/deepstream/deepstream/sources/includes/ -I/usr/local/cuda/include
32 | LIBS:= -lnvinfer_plugin -lnvinfer -lnvparsers -L/usr/local/cuda/lib64 -lcudart -lcublas -lstdc++fs
33 | LFLAGS:= -shared -Wl,--start-group $(LIBS) -Wl,--end-group
34 | 
35 | INCS:= $(wildcard *.h)
36 | SRCFILES:= nvdsparsebbox_Yolo.cpp\
37 | 			nvdsparsebbox_Yolo_cuda.cu
38 | 
39 | TARGET_LIB:= libnvdsinfer_custom_impl_Yolo.so
40 | 
41 | TARGET_OBJS:= $(SRCFILES:.cpp=.o)
42 | TARGET_OBJS:= $(TARGET_OBJS:.cu=.o)
43 | 
44 | all: $(TARGET_LIB)
45 | 
46 | %.o: %.cpp $(INCS) Makefile
47 | 	$(CC) -c -o $@ $(CFLAGS) $<
48 | 
49 | %.o: %.cu $(INCS) Makefile
50 | 	$(NVCC) -c -o $@ --compiler-options '-fPIC' $(CUFLAGS)  $<
51 | 
52 | $(TARGET_LIB) : $(TARGET_OBJS)
53 | 	$(CC) -o $@  $(TARGET_OBJS) $(LFLAGS)
54 | 
55 | clean:
56 | 	rm -rf $(TARGET_LIB) *.o
57 | 


--------------------------------------------------------------------------------
/deepstream_yolo/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  3 |  * SPDX-License-Identifier: MIT
  4 |  *
  5 |  * Permission is hereby granted, free of charge, to any person obtaining a
  6 |  * copy of this software and associated documentation files (the "Software"),
  7 |  * to deal in the Software without restriction, including without limitation
  8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  9 |  * and/or sell copies of the Software, and to permit persons to whom the
 10 |  * Software is furnished to do so, subject to the following conditions:
 11 |  *
 12 |  * The above copyright notice and this permission notice shall be included in
 13 |  * all copies or substantial portions of the Software.
 14 |  *
 15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 21 |  * DEALINGS IN THE SOFTWARE.
 22 |  */
 23 | 
 24 | 
 25 | #include <algorithm>
 26 | #include <cassert>
 27 | #include <cmath>
 28 | #include <cstring>
 29 | #include <fstream>
 30 | #include <iostream>
 31 | #include <unordered_map>
 32 | #include "nvdsinfer_custom_impl.h"
 33 | 
 34 | static const int NUM_CLASSES_YOLO = 80;
 35 | 
 36 | float clamp(const float val, const float minVal, const float maxVal)
 37 | {
 38 |     assert(minVal <= maxVal);
 39 |     return std::min(maxVal, std::max(minVal, val));
 40 | }
 41 | 
 42 | extern "C" bool NvDsInferParseCustomYoloV4(
 43 |     std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
 44 |     NvDsInferNetworkInfo const& networkInfo,
 45 |     NvDsInferParseDetectionParams const& detectionParams,
 46 |     std::vector<NvDsInferParseObjectInfo>& objectList);
 47 | 
 48 | extern "C" bool NvDsInferParseCustomYoloV7(
 49 |     std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
 50 |     NvDsInferNetworkInfo const& networkInfo,
 51 |     NvDsInferParseDetectionParams const& detectionParams,
 52 |     std::vector<NvDsInferParseObjectInfo>& objectList);
 53 | 
 54 | /* YOLOv4 implementations */
 55 | static NvDsInferParseObjectInfo convertBBoxYoloV4(const float& bx1, const float& by1, const float& bx2,
 56 |                                      const float& by2, const uint& netW, const uint& netH)
 57 | {
 58 |     NvDsInferParseObjectInfo b;
 59 |     // Restore coordinates to network input resolution
 60 | 
 61 |     float x1 = bx1 * netW;
 62 |     float y1 = by1 * netH;
 63 |     float x2 = bx2 * netW;
 64 |     float y2 = by2 * netH;
 65 | 
 66 |     x1 = clamp(x1, 0, netW);
 67 |     y1 = clamp(y1, 0, netH);
 68 |     x2 = clamp(x2, 0, netW);
 69 |     y2 = clamp(y2, 0, netH);
 70 | 
 71 |     b.left = x1;
 72 |     b.width = clamp(x2 - x1, 0, netW);
 73 |     b.top = y1;
 74 |     b.height = clamp(y2 - y1, 0, netH);
 75 | 
 76 |     return b;
 77 | }
 78 | 
 79 | static void addBBoxProposalYoloV4(const float bx, const float by, const float bw, const float bh,
 80 |                      const uint& netW, const uint& netH, const int maxIndex,
 81 |                      const float maxProb, std::vector<NvDsInferParseObjectInfo>& binfo)
 82 | {
 83 |     NvDsInferParseObjectInfo bbi = convertBBoxYoloV4(bx, by, bw, bh, netW, netH);
 84 |     if (bbi.width < 1 || bbi.height < 1) return;
 85 | 
 86 |     bbi.detectionConfidence = maxProb;
 87 |     bbi.classId = maxIndex;
 88 |     binfo.push_back(bbi);
 89 | }
 90 | 
 91 | static std::vector<NvDsInferParseObjectInfo>
 92 | decodeYoloV4Tensor(
 93 |     const float* boxes, const float* scores,
 94 |     const uint num_bboxes, NvDsInferParseDetectionParams const& detectionParams,
 95 |     const uint& netW, const uint& netH)
 96 | {
 97 |     std::vector<NvDsInferParseObjectInfo> binfo;
 98 | 
 99 |     uint bbox_location = 0;
100 |     uint score_location = 0;
101 |     for (uint b = 0; b < num_bboxes; ++b)
102 |     {
103 |         float bx1 = boxes[bbox_location];
104 |         float by1 = boxes[bbox_location + 1];
105 |         float bx2 = boxes[bbox_location + 2];
106 |         float by2 = boxes[bbox_location + 3];
107 | 
108 |         float maxProb = 0.0f;
109 |         int maxIndex = -1;
110 | 
111 |         for (uint c = 0; c < detectionParams.numClassesConfigured; ++c)
112 |         {
113 |             float prob = scores[score_location + c];
114 |             if (prob > maxProb)
115 |             {
116 |                 maxProb = prob;
117 |                 maxIndex = c;
118 |             }
119 |         }
120 | 
121 |         if (maxProb > detectionParams.perClassPreclusterThreshold[maxIndex])
122 |         {
123 |             addBBoxProposalYoloV4(bx1, by1, bx2, by2, netW, netH, maxIndex, maxProb, binfo);
124 |         }
125 | 
126 |         bbox_location += 4;
127 |         score_location += detectionParams.numClassesConfigured;
128 |     }
129 | 
130 |     return binfo;
131 | }
132 | 
133 | extern "C" bool NvDsInferParseCustomYoloV4(
134 |     std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
135 |     NvDsInferNetworkInfo const& networkInfo,
136 |     NvDsInferParseDetectionParams const& detectionParams,
137 |     std::vector<NvDsInferParseObjectInfo>& objectList)
138 | {
139 |     if (NUM_CLASSES_YOLO != detectionParams.numClassesConfigured)
140 |     {
141 |         std::cerr << "WARNING: Num classes mismatch. Configured:"
142 |                   << detectionParams.numClassesConfigured
143 |                   << ", detected by network: " << NUM_CLASSES_YOLO << std::endl;
144 |     }
145 | 
146 |     std::vector<NvDsInferParseObjectInfo> objects;
147 | 
148 |     const NvDsInferLayerInfo &boxes = outputLayersInfo[0]; // num_boxes x 4
149 |     const NvDsInferLayerInfo &scores = outputLayersInfo[1]; // num_boxes x num_classes
150 | 
151 |     // 3 dimensional: [num_boxes, 1, 4]
152 |     assert(boxes.inferDims.numDims == 3);
153 |     // 2 dimensional: [num_boxes, num_classes]
154 |     assert(scores.inferDims.numDims == 2);
155 | 
156 |     // The second dimension should be num_classes
157 |     assert(detectionParams.numClassesConfigured == scores.inferDims.d[1]);
158 |     
159 |     uint num_bboxes = boxes.inferDims.d[0];
160 | 
161 |     // std::cout << "Network Info: " << networkInfo.height << "  " << networkInfo.width << std::endl;
162 | 
163 |     std::vector<NvDsInferParseObjectInfo> outObjs =
164 |         decodeYoloV4Tensor(
165 |             (const float*)(boxes.buffer), (const float*)(scores.buffer), num_bboxes, detectionParams,
166 |             networkInfo.width, networkInfo.height);
167 | 
168 |     objects.insert(objects.end(), outObjs.begin(), outObjs.end());
169 | 
170 |     objectList = objects;
171 | 
172 |     return true;
173 | }
174 | /* YOLOv4 implementations end*/
175 | 
176 | /*Yolov7 bbox parser*/
177 | static NvDsInferParseObjectInfo convertBBoxYoloV7(const float& bx, const float& by, const float& bw,
178 |                                      const float& bh, const int& stride, const uint& netW,
179 |                                      const uint& netH)
180 | {
181 |     NvDsInferParseObjectInfo b;
182 |     // Restore coordinates to network input resolution
183 |     float xCenter = bx * stride;
184 |     float yCenter = by * stride;
185 |     float x0 = xCenter - bw / 2;
186 |     float y0 = yCenter - bh / 2;
187 |     float x1 = x0 + bw;
188 |     float y1 = y0 + bh;
189 | 
190 |     x0 = clamp(x0, 0, netW);
191 |     y0 = clamp(y0, 0, netH);
192 |     x1 = clamp(x1, 0, netW);
193 |     y1 = clamp(y1, 0, netH);
194 | 
195 |     b.left = x0;
196 |     b.width = clamp(x1 - x0, 0, netW);
197 |     b.top = y0;
198 |     b.height = clamp(y1 - y0, 0, netH);
199 | 
200 |     return b;
201 | }
202 | 
203 | static void addBBoxProposalYoloV7(const float bx, const float by, const float bw, const float bh,
204 |                      const uint stride, const uint& netW, const uint& netH, const int maxIndex,
205 |                      const float maxProb, std::vector<NvDsInferParseObjectInfo>& binfo)
206 | {
207 |     NvDsInferParseObjectInfo bbi = convertBBoxYoloV7(bx, by, bw, bh, stride, netW, netH);
208 |     if (bbi.width < 1 || bbi.height < 1) return;
209 | 
210 |     bbi.detectionConfidence = maxProb;
211 |     bbi.classId = maxIndex;
212 |     binfo.push_back(bbi);
213 | }
214 | 
215 | static bool NvDsInferParseYoloV7(
216 |     std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
217 |     NvDsInferNetworkInfo const& networkInfo,
218 |     NvDsInferParseDetectionParams const& detectionParams,
219 |     std::vector<NvDsInferParseObjectInfo>& objectList)
220 | {
221 |  
222 | 
223 |     if (outputLayersInfo.empty()) {
224 |         std::cerr << "Could not find output layer in bbox parsing" << std::endl;;
225 |         return false;
226 |     }
227 |     const NvDsInferLayerInfo &layer = outputLayersInfo[0];
228 | 
229 |     if (NUM_CLASSES_YOLO != detectionParams.numClassesConfigured)
230 |     {
231 |         std::cerr << "WARNING: Num classes mismatch. Configured:"
232 |                   << detectionParams.numClassesConfigured
233 |                   << ", detected by network: " << NUM_CLASSES_YOLO << std::endl;
234 |     }
235 | 
236 |     std::vector<NvDsInferParseObjectInfo> objects;
237 | 
238 |     float* data = (float*)layer.buffer;
239 |     const int dimensions = layer.inferDims.d[1];
240 |     int rows = layer.inferDims.numElements / layer.inferDims.d[1];
241 | 
242 |     for (int i = 0; i < rows; ++i) {
243 |         //85 = x, y, w, h, maxProb, score0......score79
244 |         float bx = data[ 0];
245 |         float by = data[ 1];
246 |         float bw = data[ 2];
247 |         float bh = data[ 3];
248 |         float maxProb = data[ 4];
249 |         int  maxIndex = data[ 5];
250 |         float * classes_scores = data + 5;
251 |         
252 |         float maxScore = 0;
253 |         int index = 0;
254 |         for (int j = 0 ;j < NUM_CLASSES_YOLO; j++){
255 |            if(*classes_scores > maxScore){
256 |               index = j;
257 |               maxScore = *classes_scores;
258 |            }
259 |            classes_scores++;
260 |         }
261 | 	
262 |         maxIndex = index;
263 |         data += dimensions;
264 |         
265 |         addBBoxProposalYoloV7(bx, by, bw, bh, 1, networkInfo.width, networkInfo.height, maxIndex, maxProb, objects);    
266 |     }
267 |     objectList = objects;
268 |     return true;
269 | }
270 | 
271 | extern "C" bool NvDsInferParseCustomYoloV7(
272 |     std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
273 |     NvDsInferNetworkInfo const& networkInfo,
274 |     NvDsInferParseDetectionParams const& detectionParams,
275 |     std::vector<NvDsInferParseObjectInfo>& objectList)
276 | {
277 |     return NvDsInferParseYoloV7 (
278 |         outputLayersInfo, networkInfo, detectionParams, objectList);
279 | }
280 | 
281 | /* Check that the custom function has been defined correctly */
282 | CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomYoloV4);
283 | CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomYoloV7);
284 | 


--------------------------------------------------------------------------------
/deepstream_yolo/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo_cuda.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  3 |  * SPDX-License-Identifier: MIT
  4 |  *
  5 |  * Permission is hereby granted, free of charge, to any person obtaining a
  6 |  * copy of this software and associated documentation files (the "Software"),
  7 |  * to deal in the Software without restriction, including without limitation
  8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  9 |  * and/or sell copies of the Software, and to permit persons to whom the
 10 |  * Software is furnished to do so, subject to the following conditions:
 11 |  *
 12 |  * The above copyright notice and this permission notice shall be included in
 13 |  * all copies or substantial portions of the Software.
 14 |  *
 15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 21 |  * DEALINGS IN THE SOFTWARE.
 22 |  */
 23 |  
 24 | #include <algorithm>
 25 | #include <cassert>
 26 | #include <cmath>
 27 | #include <cstring>
 28 | #include <fstream>
 29 | #include <iostream>
 30 | #include <unordered_map>
 31 | #include "nvdsinfer_custom_impl.h"
 32 | #include "nvtx3/nvToolsExt.h"
 33 | #include <thrust/host_vector.h>
 34 | #include <thrust/device_vector.h>
 35 | 
 36 | static const int NUM_CLASSES_YOLO = 80;
 37 | #define OBJECTLISTSIZE 25200
 38 | #define BLOCKSIZE  1024
 39 | thrust::device_vector<NvDsInferParseObjectInfo> objects_v(OBJECTLISTSIZE);
 40 | 
 41 | extern "C" bool NvDsInferParseCustomYoloV7_cuda(
 42 |     std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
 43 |     NvDsInferNetworkInfo const& networkInfo,
 44 |     NvDsInferParseDetectionParams const& detectionParams,
 45 |     std::vector<NvDsInferParseObjectInfo>& objectList);
 46 | 
 47 | 
 48 | __global__ void decodeYoloV7Tensor_cuda(NvDsInferParseObjectInfo *binfo/*output*/, float* data, int dimensions, int rows,
 49 |                                         int netW, int netH, float Threshold){
 50 |     int idx = blockIdx.x * blockDim.x + threadIdx.x;
 51 |     if(idx < rows) {
 52 |         data = data + idx * dimensions;
 53 |         float maxProb = data[ 4];
 54 |         //maxProb < Threshold, directly return
 55 |         if(maxProb < Threshold){
 56 |             binfo[idx].detectionConfidence = 0.0;
 57 |             return;
 58 |         }
 59 |         float bx = data[ 0];
 60 |         float by = data[ 1];
 61 |         float bw = data[ 2];
 62 |         float bh = data[ 3];
 63 |         int  maxIndex = 0;
 64 |         float * classes_scores = (float *)(data + 5);
 65 |         float maxScore = 0;
 66 |         int index = 0;
 67 | 
 68 |         #pragma unroll
 69 |         for (int j = 0 ;j < NUM_CLASSES_YOLO; j++){
 70 |            if(*classes_scores > maxScore){
 71 |               index = j;
 72 |               maxScore = *classes_scores;
 73 |            }
 74 |            classes_scores++;
 75 |         }
 76 |         if(maxProb * maxScore < Threshold){
 77 |             binfo[idx].detectionConfidence = 0.0;
 78 |             return;
 79 |         }
 80 |         maxIndex = index;
 81 |         float stride = 1.0;
 82 |         float xCenter = bx * stride;
 83 |         float yCenter = by * stride;
 84 |         float x0 = xCenter - bw / 2.0;
 85 |         float y0 = yCenter - bh / 2.0;
 86 |         float x1 = x0 + bw;
 87 |         float y1 = y0 + bh;
 88 |         x0 = fminf(float(netW), fmaxf(float(0.0), x0));
 89 |         y0 = fminf(float(netH), fmaxf(float(0.0), y0));
 90 |         x1 = fminf(float(netW), fmaxf(float(0.0), x1));
 91 |         y1 = fminf(float(netH), fmaxf(float(0.0), y1));
 92 |         binfo[idx].left = x0;
 93 |         binfo[idx].top = y0;
 94 |         binfo[idx].width = fminf(float(netW), fmaxf(float(0.0), x1-x0));
 95 |         binfo[idx].height = fminf(float(netH), fmaxf(float(0.0), y1-y0));
 96 |         binfo[idx].detectionConfidence = maxProb * maxScore;
 97 |         binfo[idx].classId = maxIndex;
 98 |     }
 99 |     return;
100 | }
101 | static bool NvDsInferParseYoloV7_cuda(
102 |     std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
103 |     NvDsInferNetworkInfo const& networkInfo,
104 |     NvDsInferParseDetectionParams const& detectionParams,
105 |     std::vector<NvDsInferParseObjectInfo>& objectList)
106 | {
107 |  
108 |     if (outputLayersInfo.empty()) {
109 |         std::cerr << "Could not find output layer in bbox parsing" << std::endl;;
110 |         return false;
111 |     }
112 |     const NvDsInferLayerInfo &layer = outputLayersInfo[0];
113 | 
114 |     if (NUM_CLASSES_YOLO != detectionParams.numClassesConfigured)
115 |     {
116 |         std::cerr << "WARNING: Num classes mismatch. Configured:"
117 |                   << detectionParams.numClassesConfigured
118 |                   << ", detected by network: " << NUM_CLASSES_YOLO << std::endl;
119 |     }
120 | 
121 |     float* data = (float*)layer.buffer;
122 |     const int dimensions = layer.inferDims.d[1];
123 |     int rows = layer.inferDims.numElements / layer.inferDims.d[1];
124 |     
125 |     int GRIDSIZE = ((OBJECTLISTSIZE-1)/BLOCKSIZE)+1;
126 |     //find the min threshold
127 |     float min_PreclusterThreshold = *(std::min_element(detectionParams.perClassPreclusterThreshold.begin(),
128 |         detectionParams.perClassPreclusterThreshold.end()));
129 |     decodeYoloV7Tensor_cuda<<<GRIDSIZE,BLOCKSIZE>>>
130 |         (thrust::raw_pointer_cast(objects_v.data()), data, dimensions, rows, networkInfo.width, 
131 |         networkInfo.height, min_PreclusterThreshold);
132 |     objectList.resize(OBJECTLISTSIZE);
133 |     thrust::copy(objects_v.begin(),objects_v.end(),objectList.begin());//the same as cudamemcpy
134 | 
135 |     return true;
136 | }
137 | 
138 | extern "C" bool NvDsInferParseCustomYoloV7_cuda(
139 |     std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
140 |     NvDsInferNetworkInfo const& networkInfo,
141 |     NvDsInferParseDetectionParams const& detectionParams,
142 |     std::vector<NvDsInferParseObjectInfo>& objectList)
143 | {
144 |     nvtxRangePush("NvDsInferParseYoloV7");
145 |     bool ret = NvDsInferParseYoloV7_cuda (
146 |         outputLayersInfo, networkInfo, detectionParams, objectList);
147 | 
148 |     nvtxRangePop();
149 |     return ret;
150 | }
151 | 
152 | /* Check that the custom function has been defined correctly */
153 | CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomYoloV7_cuda);
154 | 


--------------------------------------------------------------------------------
/tensorrt_yolov4/Makefile:
--------------------------------------------------------------------------------
  1 | ################################################################################
  2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  3 | # SPDX-License-Identifier: MIT
  4 | #
  5 | # Permission is hereby granted, free of charge, to any person obtaining a
  6 | # copy of this software and associated documentation files (the "Software"),
  7 | # to deal in the Software without restriction, including without limitation
  8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
  9 | # and/or sell copies of the Software, and to permit persons to whom the
 10 | # Software is furnished to do so, subject to the following conditions:
 11 | #
 12 | # The above copyright notice and this permission notice shall be included in
 13 | # all copies or substantial portions of the Software.
 14 | #
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 21 | # DEALINGS IN THE SOFTWARE.
 22 | ################################################################################
 23 | 
 24 | 
 25 | SHELL=/bin/bash -o pipefail
 26 | TARGET?=$(shell uname -m)
 27 | LIBDIR?=lib
 28 | VERBOSE?=0
 29 | ifeq ($(VERBOSE), 1)
 30 | AT=
 31 | else
 32 | AT=@
 33 | endif
 34 | CUDA_TRIPLE=x86_64-linux
 35 | CUBLAS_TRIPLE=x86_64-linux-gnu
 36 | DLSW_TRIPLE=x86_64-linux-gnu
 37 | ifeq ($(TARGET), aarch64)
 38 | CUDA_TRIPLE=aarch64-linux
 39 | CUBLAS_TRIPLE=aarch64-linux-gnu
 40 | DLSW_TRIPLE=aarch64-linux-gnu
 41 | endif
 42 | ifeq ($(TARGET), qnx)
 43 | CUDA_TRIPLE=aarch64-qnx
 44 | CUBLAS_TRIPLE=aarch64-qnx-gnu
 45 | DLSW_TRIPLE=aarch64-unknown-nto-qnx
 46 | endif
 47 | ifeq ($(TARGET), ppc64le)
 48 | CUDA_TRIPLE=ppc64le-linux
 49 | CUBLAS_TRIPLE=ppc64le-linux
 50 | DLSW_TRIPLE=ppc64le-linux
 51 | endif
 52 | ifeq ($(TARGET), android64)
 53 | DLSW_TRIPLE=aarch64-linux-androideabi
 54 | CUDA_TRIPLE=$(DLSW_TRIPLE)
 55 | CUBLAS_TRIPLE=$(DLSW_TRIPLE)
 56 | endif
 57 | export TARGET
 58 | export VERBOSE
 59 | export LIBDIR
 60 | export CUDA_TRIPLE
 61 | export CUBLAS_TRIPLE
 62 | export DLSW_TRIPLE
 63 | 
 64 | ifeq ($(SAFE_PDK), 1)
 65 |   # Only dlaSafetyRuntime is currently able to execute with safety pdk.
 66 |   samples = dlaSafetyRuntime
 67 | else
 68 |   samples = sampleAlgorithmSelector sampleCharRNN sampleDynamicReshape sampleFasterRCNN sampleGoogleNet sampleINT8 sampleINT8API sampleMLP sampleMNIST sampleMNISTAPI sampleNMT sampleMovieLens sampleOnnxMNIST sampleUffPluginV2Ext sampleReformatFreeIO sampleSSD sampleUffFasterRCNN sampleUffMaskRCNN sampleUffMNIST sampleUffSSD trtexec samplePlugin
 69 | 
 70 | 
 71 |   # sampleMovieLensMPS should only be compiled for Linux targets.
 72 |   # sample uses Linux specific shared memory and IPC libraries.
 73 |   ifeq ($(TARGET),x86_64)
 74 |     samples += sampleMovieLensMPS
 75 |   endif
 76 | 
 77 |   # sampleNvmedia/dlaSafetyRuntime/dlaSafetyBuilder should only be compiled with DLA enabled.
 78 |   ifeq ($(ENABLE_DLA),1)
 79 | 	samples += sampleNvmedia
 80 | 	samples += dlaSafetyRuntime
 81 | 	samples += dlaSafetyBuilder
 82 |   endif
 83 | endif
 84 | 
 85 | .PHONY: all clean help
 86 | all:
 87 | 	$(AT)$(foreach sample,$(samples), $(MAKE) -C $(sample) &&) :
 88 | 
 89 | clean:
 90 | 	$(AT)$(foreach sample,$(samples), $(MAKE) clean -C $(sample) &&) :
 91 | 
 92 | help:
 93 | 	$(AT)echo "Sample building help menu."
 94 | 	$(AT)echo "Samples:"
 95 | 	$(AT)$(foreach sample,$(samples), echo -e "\t$(sample)" &&) :
 96 | 	$(AT)echo -e "\nCommands:"
 97 | 	$(AT)echo -e "\tall - build all samples."
 98 | 	$(AT)echo -e "\tclean - clean all samples."
 99 | 	$(AT)echo -e "\nVariables:"
100 | 	$(AT)echo -e "\tTARGET - Specify the target to build for."
101 | 	$(AT)echo -e "\tVERBOSE - Specify verbose output."
102 | 	$(AT)echo -e "\tCUDA_INSTALL_DIR - Directory where cuda installs to."
103 | 


--------------------------------------------------------------------------------
/tensorrt_yolov4/README.md:
--------------------------------------------------------------------------------
  1 | # YOLOv4 Standalone Program of Multi-Tasks 
  2 | 
  3 | ## 1. Contents
  4 | 
  5 | - **`common`** Some common code dependencies and utilities 
  6 | - **`source`** Source code of standalone Program
  7 |     - `main.cpp`: Program main entrance where parameters are configured here
  8 |     - `SampleYolo.hpp`: YOLOv4 inference class definition file
  9 |     - `SampleYolo.cpp`: YOLOv4 inference class functions definition file
 10 |     - `onnx_add_nms_plugin.py`: Python script to add BatchedNMSPlugin node into ONNX model
 11 |     - `generate_coco_image_list.py`: Python script to get list of image names from MS COCO annotation or information file
 12 | 
 13 | - **`data`** This directory saves:
 14 |     - `yolov4.onnx`: the ONNX model (User generated)
 15 |     - `yolov4.engine`: the TensorRT engine model (would be generated by this program)
 16 |     - `demo.jpg`: The demo image (Already exists)
 17 |     - `demo_out.jpg`: Image detection output of the demo image (Already exists, but would be renewed by the program)
 18 |     - `names.txt`: MS COCO dataset label names (have to be downloaded or generated via COCO API)
 19 |     - `categories.txt`: MS COCO dataset categories where IDs and names are separated by `"\t"` (have to be generated via COCO API)
 20 |     - `val2017.txt`: MS COCO validation set image list (have to be generated from corresponding COCO annotation file)
 21 |     - `valdev2017.txt`: MS COCO test set image list (have to be generated from corresponding COCO annotation file)
 22 |     - `coco_result.json`: MS COCO dataset output (would be generated by this program)
 23 | 
 24 | 
 25 | ## 2 Prerequisites before building & running YOLOv4 standalone ##
 26 | 
 27 | ### 2.1 Download TensorRT (higher than 7.1, you can ignore this step if TensorRT 7.1 is already installed) ###
 28 | 
 29 | - Download TensorRT from NVIDIA developer page: <https://developer.nvidia.com/nvidia-tensorrt-7x-download>
 30 | - Install or depackage the deb file or tar file.
 31 | 
 32 | ### 2.2 Download and build TensorRT OSS ###
 33 | 
 34 | - Refer to README files in <https://github.com/NVIDIA-AI-IOT/deepstream_tlt_apps/tree/master/TRT-OSS>
 35 |     - Go to <https://github.com/NVIDIA-AI-IOT/deepstream_tlt_apps/tree/master/TRT-OSS/Jetson> if you are working on Jetson platform
 36 |     - Go to <https://github.com/NVIDIA-AI-IOT/deepstream_tlt_apps/tree/master/TRT-OSS/x86> if you are working on x86 platform
 37 | 
 38 | - Follow guidences in README to clone repository <https://github.com/NVIDIA/TensorRT> and build `libnvinfer_plugin.so.7.x.x` 
 39 | 
 40 | - Rename `<TensorRT_dir>/lib/libnvinfer_plugin.so.7.x.x` to `<TensorRT_dir>/lib/libnvinfer_plugin.so.7.x.x.back`
 41 | 
 42 | - Copy `<TensorRT_OSS_dir>/build/out/libnvinfer_plugin.so.7.x.x` into `<TensorRT7.1_GA_dir>/lib`
 43 | 
 44 | ### 2.3 Generate YOLOv4 ONNX model with BatchedNMSPlugin node included ###
 45 | 
 46 | #### Step 1 Generate YOLOv4 ONNX model (`CSPDarknet-53 CNN + YOLO header CNN + YOLO layers`) ####
 47 | 
 48 | - Here is one of the YOLOv4 Pytorch repositories <https://github.com/Tianxiaomo/pytorch-YOLOv4> that can guide you to generate an ONNX model of YOLOv4.
 49 | You can convert from the pretrained DarkNet model into ONNX directly; but you can also 1) convert the DarkNet model into Pytorch, 2) train the Pytorch model using your own dataset, and 3) then convert into ONNX.
 50 | 
 51 | - Other famous YOLOv4 pytorch repositories as references:
 52 |   - <https://github.com/WongKinYiu/PyTorch_YOLOv4>
 53 |   - <https://github.com/bubbliiiing/yolov4-pytorch>
 54 |   - <https://github.com/maudzung/Complex-YOLOv4-Pytorch>
 55 |   - <https://github.com/AllanYiin/YoloV4>
 56 | 
 57 | 
 58 | #### Step 2 Add into YOLOv4 ONNX model the BatchedNMSPlugin (`CSPDarknet-53 CNN + YOLO header CNN + YOLO layers + BatchedNMSPlugin`)
 59 | 
 60 | **How can I add `BatchedNMSPlugin` node into ONNX model?**
 61 | 
 62 | - Open `source_gpu_nms/onnx_add_nms_plugin.py`
 63 | 
 64 | - Update attribute values to suit your model
 65 | 
 66 | Example:
 67 | ```py
 68 |     attrs["shareLocation"] = 1
 69 |     attrs["backgroundLabelId"] = -1
 70 |     attrs["numClasses"] = 80
 71 |     attrs["topK"] = topK # from program arguments
 72 |     attrs["keepTopK"] = keepTopK # from program arguments
 73 |     attrs["scoreThreshold"] = 0.3
 74 |     attrs["iouThreshold"] = 0.6
 75 |     attrs["isNormalized"] = 1
 76 |     attrs["clipBoxes"] = 1
 77 | ```
 78 | 
 79 | - Copy `onnx_add_nms_plugin.py` into `<TensorRT_OSS_dir>/tools/onnx-graphsurgeon`
 80 | 
 81 | - Go to `<TensorRT_OSS_dir>/tools/onnx-graphsurgeon` and execute `onnx_add_nms_plugin.py`
 82 | 
 83 | ```sh
 84 | cd <TensorRT_OSS_dir>/tools/onnx-graphsurgeon
 85 | python onnx_add_nms_plugin.py -f <yolov4_onnx_file> -t <topk_value> -k <keep_topk_value>
 86 | ```
 87 | 
 88 | ## 3. How can I build and run YOLOv4 standalone program? ##
 89 | 
 90 | ### 3.1 Add common source code includes ###
 91 | 
 92 | - This YOLOv4 standalone sample depends on the same common includes as other C++ samples of TensorRT.
 93 |     - Option 1: Add a link to `<where_tensorRT_is_installed>/TensorRT-7.1.x.x/samples/common` in `tensorrt_yolov4`
 94 |     ```
 95 |     cd <dir_on_your_machine>/yolov4_sample/tensorrt_yolov4
 96 |     ln -s <where_tensorRT_is_installed>/TensorRT-7.1.x.x/samples/common common
 97 |     ```
 98 |     - Option 2: Simply copy common includes into `tensorrt_yolov4`
 99 |     ```
100 |     cd <dir_on_your_machine>/yolov4_sample/tensorrt_yolov4
101 |     cp -r <where_tensorRT_is_installed>/TensorRT-7.1.x.x/samples/common common ./
102 |     ```
103 | 
104 | ### 3.2 OpenCV dependencies ###
105 | 
106 | - Note: There are OpenCV dependencies in this program. Please check if there are OpenCV includes in /usr/include/opencv and if OpenCV libraries like `-lopencv_core` and `-lopencv_imgproc` are installed.
107 | 
108 | - Follow README and documents of this repository **<https://github.com/opencv/opencv>** to install OpenCV if corresponding includes and libraries do not exist.
109 | 
110 | ### 3.3 Compile and build ###
111 | 
112 | 
113 | ```sh
114 | cd <dir_on_your_machine>/yolov4_sample/yolo_cpp_standalone/source_gpu_nms
115 | make clean
116 | make -j<num_processors>
117 | ```
118 | 
119 | 
120 | ### 3.4 Basic program parameters ###
121 | 
122 | - Step1: Use text editor to open `main.cpp` in `<dir_on_your_machine>/YOLOv4_Sample/tensorrt_yolov4/source`
123 | 
124 | - Step2: Go to where function `initializeSampleParams()` is defined
125 | 
126 | - Step3: You will find some basic configurations in `initializeSampleParams()` like follows:
127 | 
128 | ```cpp
129 |     // This argument is for calibration of int8
130 |     // Int8 calibration is not available until now
131 |     // You have to prepare samples for int8 calibration by yourself 
132 |     params.nbCalBatches = 80;
133 | 
134 |     // The engine file to generate or to load
135 |     // The engine file does not exist:
136 |     //     This program will try to load onnx file and convert onnx into engine
137 |     // The engine file exists:
138 |     //     This program will load the engine file directly
139 |     params.engingFileName = "../data/yolov4.engine";
140 | 
141 |     // The onnx file to load
142 |     params.onnxFileName = "../data/yolov4.onnx";
143 |     
144 |     // Input tensor name of ONNX file & engine file
145 |     params.inputTensorNames.push_back("input");
146 |     
147 |     // Old batch configuration, it is zero if explicitBatch flag is true for the tensorrt engine
148 |     // May be deprecated in the future
149 |     params.batchSize = 0;
150 |     
151 |     // Number of classes (usually 80, but can be other values)
152 |     params.outputClsSize = 80;
153 |     
154 |     // topK parameter of BatchedNMSPlugin
155 |     params.topK = 2000;
156 |     
157 |     // keepTopK parameter of BatchedNMSPlugin
158 |     params.keepTopK = 1000;
159 | 
160 |     // Batch size, you can modify to other batch size values if needed
161 |     params.explicitBatchSize = 1;
162 | 
163 |     params.inputImageName = "../data/demo.jpg";
164 |     params.cocoClassNamesFileName = "../data/coco.names";
165 |     params.cocoClassIDFileName = "../data/categories.txt";
166 | 
167 |     // Config number of DLA cores, -1 if there is no DLA core
168 |     params.dlaCore = -1;
169 | ```
170 | 
171 | - Step4: Copy and rename the ONNX file (`BatchedNMSPlugin` node included) to the location defined by `initializeSampleParams()`
172 | 
173 | 
174 | ### 3.5 Run this program to convert ONNX file into Engine file ###
175 | 
176 | - This program will automatically convert ONNX into engine if engine does not exist.
177 | - Command:
178 |     - To generate Engine of fp32 mode:
179 |     ```
180 |     ../bin/yolov4
181 |     ```
182 |     - To generate Engine of fp16 mode:
183 |     ```
184 |     ../bin/yolov4 --fp16
185 |     ```
186 | 
187 | ### 3.6 Specific program parameters for `demo` mode, `speed` mode and `coco` mode ###
188 | 
189 | #### 3.6.1 To run this program in `demo` mode
190 | 
191 | - Command:
192 | 
193 | ```
194 | ../bin/yolov4 --demo
195 | ```
196 | 
197 | - This program will feed the demo image into YOLOv4 engine and write detection output as an image.
198 | - Please make sure `params.demo = 1` if you want to run this program in demo mode.
199 | 
200 | ```cpp
201 |     // Configurations to run a demo image
202 |     params.demo = 1;
203 |     params.outputImageName = "../data/demo_out.jpg";
204 | ```
205 | 
206 | #### 3.6.2 To run this program in `speed` mode
207 | 
208 | - Command:
209 | 
210 | ```
211 | ../bin/yolov4 --speed
212 | ```
213 | 
214 | - This program will repeatedly feed the demo image into engine to accumulate time consumed in each iteration
215 | - Please make sure `params.speedTest = 1` if you want to run this program in speed mode
216 | 
217 | ```cpp
218 |     // Configurations to run speed test
219 |     params.speedTest = 1;
220 |     params.speedTestItrs = 1000;
221 | ```
222 | 
223 | #### 3.6.3 To run this program in `coco` mode
224 | 
225 | - Command:
226 | 
227 | ```
228 | ../bin/yolov4 --coco
229 | ```
230 | 
231 | - Corresponding configuration in `initializeSampleParams()` would be like this:
232 | 
233 | ```cpp
234 |     // Configurations of Test on COCO dataset
235 |     params.cocoTest = 1;
236 |     params.cocoClassNamesFileName = "../data/coco.names";
237 |     params.cocoClassIDFileName = "../data/categories.txt";
238 |     params.cocoImageListFileName = "../data/val2017.txt";
239 |     params.cocoTestResultFileName = "../data/coco_result.json";
240 |     params.cocoImageDir = "../data/val2017";
241 | ```
242 | 
243 | **Note: COCO dataset is just an example, you can use your own validation set or test set to validate YOLOv4 model trained by your own training set**
244 | 
245 | - Step 1: Download MS COCO images and annotations from <https://cocodataset.org/#download>
246 | 
247 |     - Images for validation: <http://images.cocodataset.org/zips/val2017.zip>
248 |     - Annotations for training and validation: <http://images.cocodataset.org/annotations/annotations_trainval2017.zip>
249 |     - Images for test: <http://images.cocodataset.org/zips/test2017.zip>
250 |     - Image info for test: <http://images.cocodataset.org/annotations/image_info_test2017.zip>
251 | 
252 | - Step 2: Clone COCO API repository from <https://github.com/cocodataset/cocoapi> and use COCO API to generate `categories.txt`
253 | 
254 |     - Format of `categories.txt` must follow this rule: IDs and names are separated by "\t".
255 | 
256 |     ```
257 |     1   person
258 |     2   bicycle
259 |     2   car
260 |     4	motorcycle
261 |     5	airplane
262 |     ```
263 | 
264 |     - COCO API example that can help you distill categories from COCO dataset (You can have a look at `cocoapi\PythonAPI\pycocoDemo.ipynb` of <https://github.com/cocodataset/cocoapi> for more details):
265 | 
266 |     ```py
267 |     # display COCO categories and supercategories
268 |     cats = coco.loadCats(coco.getCatIds())
269 |     nms=[cat['name'] for cat in cats]
270 |     print('COCO categories: \n{}\n'.format(' '.join(nms)))
271 |     ```
272 | 
273 | 
274 | - Step 3: Generate image list file using python script `generate_coco_image_list.py`
275 | 
276 | ```
277 | python generate_coco_image_list.py <json file of image annotations> <image list text>
278 | ```
279 | 
280 | - For example, to generate validation image list, the command would be:
281 | 
282 |     ```
283 |     python generate_coco_image_list.py instances_val2017.json val2017.txt
284 |     ```
285 | - For example, to generate test-dev image list, the command would be:
286 |     ```
287 |     python generate_coco_image_list.py image_info_test-dev2017.json testdev2017.txt
288 |     ```
289 | 
290 | - This program will read image names from the list file whose path should be the same as `params.cocoImageListFileName`, and then feed these images located in `params.cocoImageDir` to YOLOv4 engine
291 | - Please make sure `params.cocoTest = 1` and images exist in `params.cocoImageDir`
292 | 
293 | 
294 | 


--------------------------------------------------------------------------------
/tensorrt_yolov4/data/demo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-AI-IOT/yolo_deepstream/e9b75770ea58713d1cb3902d67c36e11acb888d7/tensorrt_yolov4/data/demo.jpg


--------------------------------------------------------------------------------
/tensorrt_yolov4/data/demo_out.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-AI-IOT/yolo_deepstream/e9b75770ea58713d1cb3902d67c36e11acb888d7/tensorrt_yolov4/data/demo_out.jpg


--------------------------------------------------------------------------------
/tensorrt_yolov4/source/Makefile:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 | # SPDX-License-Identifier: MIT
 4 | #
 5 | # Permission is hereby granted, free of charge, to any person obtaining a
 6 | # copy of this software and associated documentation files (the "Software"),
 7 | # to deal in the Software without restriction, including without limitation
 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 | # and/or sell copies of the Software, and to permit persons to whom the
10 | # Software is furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in
13 | # all copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 | # DEALINGS IN THE SOFTWARE.
22 | ################################################################################
23 | 
24 | OUTNAME_RELEASE = yolov4
25 | OUTNAME_DEBUG   = yolov4_debug
26 | EXTRA_DIRECTORIES = ../common
27 | .NOTPARALLEL:
28 | MAKEFILE ?= ../Makefile.config
29 | include $(MAKEFILE)
30 | 


--------------------------------------------------------------------------------
/tensorrt_yolov4/source/SampleYolo.hpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  3 |  * SPDX-License-Identifier: MIT
  4 |  *
  5 |  * Permission is hereby granted, free of charge, to any person obtaining a
  6 |  * copy of this software and associated documentation files (the "Software"),
  7 |  * to deal in the Software without restriction, including without limitation
  8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  9 |  * and/or sell copies of the Software, and to permit persons to whom the
 10 |  * Software is furnished to do so, subject to the following conditions:
 11 |  *
 12 |  * The above copyright notice and this permission notice shall be included in
 13 |  * all copies or substantial portions of the Software.
 14 |  *
 15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 21 |  * DEALINGS IN THE SOFTWARE.
 22 |  */
 23 | 
 24 | //!
 25 | //! SampleYolo.cpp
 26 | //! This file contains the implementation of the YOLOv4 sample. It creates the network using
 27 | //! the YOLOv4 ONNX model.
 28 | 
 29 | #pragma once
 30 | 
 31 | #include "BatchStream.h"
 32 | #include "EntropyCalibrator.h"
 33 | #include "argsParser.h"
 34 | #include "buffers.h"
 35 | #include "common.h"
 36 | #include "logger.h"
 37 | 
 38 | #include "NvOnnxParser.h"
 39 | #include "NvInfer.h"
 40 | #include <cuda_runtime_api.h>
 41 | 
 42 | #include <cstdlib>
 43 | #include <fstream>
 44 | #include <iostream>
 45 | #include <sstream>
 46 | #include <map>
 47 | 
 48 | #include <opencv2/opencv.hpp>
 49 | #include <opencv2/core/core.hpp>
 50 | #include <opencv2/highgui/highgui.hpp>
 51 | #include <opencv2/imgproc/imgproc.hpp>
 52 | 
 53 | //!
 54 | //! \brief The SampleYoloParams structure groups the additional parameters required by
 55 | //!         the SSD sample.
 56 | //!
 57 | struct SampleYoloParams : public samplesCommon::OnnxSampleParams
 58 | {
 59 |     int outputClsSize = 80;              //!< The number of output classes
 60 |     int topK = 2000;
 61 |     int keepTopK = 1000;                   //!< The maximum number of detection post-NMS
 62 |     int nbCalBatches = 100;               //!< The number of batches for calibration
 63 |     int demo = 0;
 64 |     int speedTest = 0;
 65 |     int cocoTest = 0;
 66 |     size_t speedTestItrs = 1000;
 67 |     int explicitBatchSize = 1;
 68 |     std::vector<int> inputShape;
 69 |     std::vector<std::vector<int>> outputShapes;
 70 |     std::string inputImageName;
 71 |     std::string outputImageName;
 72 |     std::string calibrationBatches; //!< The path to calibration batches
 73 |     std::string engingFileName;
 74 |     std::string cocoClassNamesFileName;
 75 |     std::string cocoClassIDFileName;
 76 |     std::string cocoImageListFileName;
 77 |     std::string cocoImageOutputDir;
 78 |     std::string cocoTestResultFileName;
 79 |     std::string cocoImageDir;
 80 | };
 81 | 
 82 | struct BoundingBox
 83 | {
 84 |     float x1;
 85 |     float y1;
 86 |     float x2;
 87 |     float y2;
 88 |     float score;
 89 |     int cls;
 90 | };
 91 | 
 92 | enum NMS_TYPE
 93 | {
 94 |     MIN,
 95 |     UNION,
 96 | };
 97 | 
 98 | struct SpeedInfo
 99 | {
100 |     long long preProcess;
101 |     long long model;
102 |     long long postProcess;
103 | 
104 |     SpeedInfo() :
105 |         preProcess {0},
106 |         model {0},
107 |         postProcess {0}
108 |     {}
109 | 
110 |     void printTimeConsmued()
111 |     {
112 |         std::cout << "Time consumed in preProcess: " << this->preProcess << std::endl;
113 |         std::cout << "Time consumed in model: " << this->model << std::endl;
114 |         std::cout << "Time consumed in postProcess: " << this->postProcess << std::endl;
115 |     }
116 | };
117 | 
118 | class BoundingBoxComparator
119 | {
120 | public:
121 |     bool operator() (const BoundingBox & b1, const BoundingBox & b2)
122 |     {
123 |         return b1.score > b2.score;
124 |     }
125 | };
126 | 
127 | class StringComparator
128 | {
129 | public:
130 |     bool operator() (const std::string & first, const std::string & second) const
131 |     { 
132 |         return first < second; 
133 |     }
134 | };
135 | 
136 | //! \brief  The SampleYolo class implements the SSD sample
137 | //!
138 | //! \details It creates the network using a caffe model
139 | //!
140 | class SampleYolo
141 | {
142 |     template <typename T>
143 |     using SampleUniquePtr = std::unique_ptr<T, samplesCommon::InferDeleter>;
144 | 
145 | public:
146 |     static const std::string gSampleName;
147 |     
148 |     SampleYolo(const SampleYoloParams& params);
149 | 
150 |     //!
151 |     //! \brief Function builds the network engine
152 |     //!
153 |     bool build();
154 | 
155 |     //!
156 |     //! \brief Runs the TensorRT inference engine for this sample
157 |     //!
158 |     bool infer();
159 | 
160 |     //!
161 |     //! \brief Cleans up any state created in the sample class
162 |     //!
163 |     bool teardown();
164 | 
165 | private:
166 |     SampleYoloParams mParams; //!< The parameters for the sample.
167 | 
168 |     nvinfer1::Dims mInputDims; //!< The dimensions of the input to the network.
169 | 
170 |     cv::Mat mSampleImage;
171 | 
172 |     SpeedInfo mSpeedInfo;
173 | 
174 |     //std::vector<samplesCommon::PPM<3, 320, 512>> mPPMs; //!< PPMs of test images
175 | 
176 |     std::shared_ptr<nvinfer1::ICudaEngine> mEngine; //!< The TensorRT engine used to run the network
177 | 
178 |     std::vector<std::string> mClasses;
179 | 
180 |     std::map<std::string, int, StringComparator> mClassesMap;
181 | 
182 |     std::vector<std::string> mImageFiles;
183 | 
184 |     std::ofstream mCocoResult;
185 | 
186 |     std::vector<int> image_rows;
187 |     std::vector<int> image_cols;
188 |     std::vector<int> image_pad_rows;
189 |     std::vector<int> image_pad_cols;
190 | 
191 |     size_t mImageIdx;
192 | 
193 |     //!
194 |     //! \brief Parses an ONNX model for YOLO and creates a TensorRT network
195 |     //!
196 |     bool constructNetwork(SampleUniquePtr<nvinfer1::IBuilder>& builder,
197 |         SampleUniquePtr<nvinfer1::INetworkDefinition>& network, SampleUniquePtr<nvinfer1::IBuilderConfig>& config,
198 |         SampleUniquePtr<nvonnxparser::IParser>& parser);
199 | 
200 |     //!
201 |     //! \brief Reads the input and mean data, preprocesses, and stores the result in a managed buffer
202 |     //!
203 |     bool processInput_aspectRatio(const samplesCommon::BufferManager& buffers);
204 | 
205 |     bool processInput(const samplesCommon::BufferManager& buffers);
206 | 
207 |     //!
208 |     //! \brief Filters output detections and verify results
209 |     //!
210 |     bool verifyOutput_aspectRatio(const samplesCommon::BufferManager& buffers);
211 | 
212 |     bool verifyOutput(const samplesCommon::BufferManager& buffers);
213 | 
214 |     //!
215 |     //! \brief To check if certain file exists given the path
216 |     //!
217 |     bool fileExists(const std::string& name)
218 |     {
219 |         std::ifstream f(name.c_str());
220 |         return f.good();
221 |     }
222 | 
223 |     bool infer_iteration(SampleUniquePtr<nvinfer1::IExecutionContext> &context, samplesCommon::BufferManager &buffers);
224 | 
225 |     std::vector<std::vector<BoundingBox>> get_bboxes(int batch_size, int keep_topk,
226 |         int32_t *num_detections, float *mnsed_boxes, float *mnsed_scores, float *mnsed_classes);
227 | 
228 |     void draw_bboxes(const std::vector<BoundingBox> &bboxes, cv::Mat &img);
229 | 
230 |     void draw_coco_test_bboxes(const std::vector<BoundingBox> &bboxes, cv::Mat &img, int img_id);
231 | 
232 |     long long now_in_milliseconds();
233 | };
234 | 
235 | 


--------------------------------------------------------------------------------
/tensorrt_yolov4/source/generate_coco_image_list.py:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 | # SPDX-License-Identifier: MIT
 4 | #
 5 | # Permission is hereby granted, free of charge, to any person obtaining a
 6 | # copy of this software and associated documentation files (the "Software"),
 7 | # to deal in the Software without restriction, including without limitation
 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 | # and/or sell copies of the Software, and to permit persons to whom the
10 | # Software is furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in
13 | # all copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 | # DEALINGS IN THE SOFTWARE.
22 | ################################################################################
23 | 
24 | import re
25 | import sys
26 | 
27 | json_file_name = sys.argv[1]
28 | img_list_name = sys.argv[2]
29 | 
30 | json_text = None
31 | with open(json_file_name, 'r') as f:
32 |     json_text = f.read()
33 | 
34 | matched_list = re.findall( r'\"([0-9]+.jpg)\"', json_text)
35 | 
36 | with open(img_list_name, 'w') as f:
37 |     for img_name in matched_list:
38 |         f.write(img_name)
39 |         f.write('\n')
40 | 


--------------------------------------------------------------------------------
/tensorrt_yolov4/source/main.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  3 |  * SPDX-License-Identifier: MIT
  4 |  *
  5 |  * Permission is hereby granted, free of charge, to any person obtaining a
  6 |  * copy of this software and associated documentation files (the "Software"),
  7 |  * to deal in the Software without restriction, including without limitation
  8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  9 |  * and/or sell copies of the Software, and to permit persons to whom the
 10 |  * Software is furnished to do so, subject to the following conditions:
 11 |  *
 12 |  * The above copyright notice and this permission notice shall be included in
 13 |  * all copies or substantial portions of the Software.
 14 |  *
 15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 21 |  * DEALINGS IN THE SOFTWARE.
 22 |  */
 23 | #include "SampleYolo.hpp"
 24 | 
 25 | //!
 26 | //! \brief Prints the help information for running this sample
 27 | //!
 28 | void printHelpInfo()
 29 | {
 30 |     std::cout << "--help          Display help information" << std::endl;
 31 |     std::cout << "--demo          This app will run demo if this option is set"
 32 |               << std::endl;
 33 |     std::cout << "--speed         This app will run speed test if this option is set"
 34 |               << std::endl;
 35 |     std::cout << "--coco          This app will run COCO dataset if this option is set"
 36 |               << std::endl;
 37 |     std::cout << "--fp16          Specify to run in fp16 mode." << std::endl;
 38 |     std::cout << "--int8          Specify to run in int8 mode." << std::endl;
 39 | }
 40 | 
 41 | SampleYoloParams specifyInputAndOutputNamesAndShapes(SampleYoloParams &params)
 42 | {
 43 |     params.inputShape = std::vector<int> {params.explicitBatchSize, 3, 416, 416};
 44 | 
 45 |     // Output shapes when BatchedNMSPlugin is available
 46 |     params.outputShapes.push_back(std::vector<int>{params.explicitBatchSize, 1});
 47 |     params.outputShapes.push_back(std::vector<int>{params.explicitBatchSize, params.keepTopK, 4});
 48 |     params.outputShapes.push_back(std::vector<int>{params.explicitBatchSize, params.keepTopK});
 49 |     params.outputShapes.push_back(std::vector<int>{params.explicitBatchSize, params.keepTopK});
 50 | 
 51 |     // Output tensors when BatchedNMSPlugin is available
 52 |     params.outputTensorNames.push_back("num_detections");
 53 |     params.outputTensorNames.push_back("nmsed_boxes");
 54 |     params.outputTensorNames.push_back("nmsed_scores");
 55 |     params.outputTensorNames.push_back("nmsed_classes");
 56 | 
 57 |     return params;
 58 | }
 59 | 
 60 | //!
 61 | //! \brief Initializes members of the params struct using the command line args
 62 | //!
 63 | SampleYoloParams initializeSampleParams(std::vector<std::string> args)
 64 | {
 65 |     SampleYoloParams params;
 66 | 
 67 |     // This argument is for calibration of int8
 68 |     // Int8 calibration is not available until now
 69 |     // You have to prepare samples for int8 calibration by yourself 
 70 |     params.nbCalBatches = 80;
 71 | 
 72 |     // The engine file to generate or to load
 73 |     // The engine file does not exist:
 74 |     //     This program will try to load onnx file and convert onnx into engine
 75 |     // The engine file exists:
 76 |     //     This program will load the engine file directly
 77 |     params.engingFileName = "../data/yolov4.engine";
 78 | 
 79 |     // The onnx file to load
 80 |     params.onnxFileName = "../data/yolov4.onnx";
 81 |     
 82 |     // Input tensor name of ONNX file & engine file
 83 |     params.inputTensorNames.push_back("input");
 84 |     
 85 |     // Old batch configuration, it is zero if explicitBatch flag is true for the tensorrt engine
 86 |     // May be deprecated in the future
 87 |     params.batchSize = 0;
 88 |     
 89 |     // Number of classes (usually 80, but can be other values)
 90 |     params.outputClsSize = 80;
 91 |     
 92 |     // topK parameter of BatchedNMSPlugin
 93 |     params.topK = 2000;
 94 |     
 95 |     // keepTopK parameter of BatchedNMSPlugin
 96 |     params.keepTopK = 1000;
 97 | 
 98 |     // Batch size, you can modify to other batch size values if needed
 99 |     params.explicitBatchSize = 1;
100 | 
101 |     params.inputImageName = "../data/demo.jpg";
102 |     params.cocoClassNamesFileName = "../data/names.txt";
103 |     params.cocoClassIDFileName = "../data/categories.txt";
104 | 
105 |     // Config number of DLA cores, -1 if there is no DLA core
106 |     params.dlaCore = -1;
107 | 
108 |     for (auto &arg : args)
109 |     {
110 |         if (arg == "--help")
111 |         {
112 |             printHelpInfo();
113 |         }
114 |         else if (arg == "--demo")
115 |         {
116 |             // Configurations to run a demo image
117 |             params.demo = 1;
118 |             params.outputImageName = "../data/demo_out.jpg";
119 |         }
120 |         else if (arg == "--speed")
121 |         {
122 |             // Configurations to run speed test
123 |             params.speedTest = 1;
124 |             params.speedTestItrs = 1000;
125 |         }
126 |         else if (arg == "--coco")
127 |         {
128 |             // Configurations of Test on COCO dataset
129 |             params.cocoTest = 1;
130 |             params.cocoImageListFileName = "../data/val2017.txt";
131 |             params.cocoTestResultFileName = "../data/coco_result.json";
132 |             params.cocoImageDir = "../data/val2017";
133 |         }
134 |         else if (arg == "--int8")
135 |         {
136 |             params.int8 = true;
137 |         }
138 |         else if (arg == "--fp16")
139 |         {
140 |             params.fp16 = true;
141 |         }
142 |     }
143 | 
144 |     specifyInputAndOutputNamesAndShapes(params);
145 | 
146 |     return params;
147 | }
148 | 
149 | int main(int argc, char** argv)
150 | {
151 |     std::vector<std::string> args;
152 |     for (int i = 0; i < argc; ++i)
153 |     {
154 |         args.push_back(std::string(argv[i]));
155 |     }
156 | 
157 |     auto sampleTest = sample::gLogger.defineTest(SampleYolo::gSampleName, argc, argv);
158 | 
159 |     sample::gLogger.reportTestStart(sampleTest);
160 | 
161 |     SampleYolo sample(initializeSampleParams(args));
162 | 
163 |     sample::gLogInfo << "Building and running a GPU inference engine for Yolo" << std::endl;
164 | 
165 |     if (!sample.build())
166 |     {
167 |         return sample::gLogger.reportFail(sampleTest);
168 |     }
169 | 
170 |     sample::gLogInfo << "Loading or building yolo model done" << std::endl;
171 | 
172 |     if (!sample.infer())
173 |     {
174 |         return sample::gLogger.reportFail(sampleTest);
175 |     }
176 | 
177 |     sample::gLogInfo << "Inference of yolo model done" << std::endl;
178 | 
179 |     if (!sample.teardown())
180 |     {
181 |         return sample::gLogger.reportFail(sampleTest);
182 |     }
183 | 
184 |     return EXIT_SUCCESS; // sample::gLogger.reportPass(sampleTest);
185 | }
186 | 


--------------------------------------------------------------------------------
/tensorrt_yolov4/source/onnx_add_nms_plugin.py:
--------------------------------------------------------------------------------
  1 | ################################################################################
  2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  3 | # SPDX-License-Identifier: MIT
  4 | #
  5 | # Permission is hereby granted, free of charge, to any person obtaining a
  6 | # copy of this software and associated documentation files (the "Software"),
  7 | # to deal in the Software without restriction, including without limitation
  8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
  9 | # and/or sell copies of the Software, and to permit persons to whom the
 10 | # Software is furnished to do so, subject to the following conditions:
 11 | #
 12 | # The above copyright notice and this permission notice shall be included in
 13 | # all copies or substantial portions of the Software.
 14 | #
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 21 | # DEALINGS IN THE SOFTWARE.
 22 | ################################################################################
 23 | 
 24 | 
 25 | #!/usr/bin/env python3
 26 | import onnx_graphsurgeon as gs
 27 | import argparse
 28 | import onnx
 29 | import numpy as np
 30 | 
 31 | def create_and_add_plugin_node(graph, topK, keepTopK):
 32 |     
 33 |     batch_size = graph.inputs[0].shape[0]
 34 |     input_h = graph.inputs[0].shape[2]
 35 |     input_w = graph.inputs[0].shape[3]
 36 | 
 37 |     tensors = graph.tensors()
 38 |     boxes_tensor = tensors["boxes"]
 39 |     confs_tensor = tensors["confs"]
 40 | 
 41 |     num_detections = gs.Variable(name="num_detections").to_variable(dtype=np.int32, shape=[batch_size, 1])
 42 |     nmsed_boxes = gs.Variable(name="nmsed_boxes").to_variable(dtype=np.float32, shape=[batch_size, keepTopK, 4])
 43 |     nmsed_scores = gs.Variable(name="nmsed_scores").to_variable(dtype=np.float32, shape=[batch_size, keepTopK])
 44 |     nmsed_classes = gs.Variable(name="nmsed_classes").to_variable(dtype=np.float32, shape=[batch_size, keepTopK])
 45 | 
 46 |     new_outputs = [num_detections, nmsed_boxes, nmsed_scores, nmsed_classes]
 47 | 
 48 |     mns_node = gs.Node(
 49 |         op="BatchedNMS_TRT",
 50 |         attrs=create_attrs(input_h, input_w, topK, keepTopK),
 51 |         inputs=[boxes_tensor, confs_tensor],
 52 |         outputs=new_outputs)
 53 | 
 54 |     graph.nodes.append(mns_node)
 55 |     graph.outputs = new_outputs
 56 | 
 57 |     return graph.cleanup().toposort()
 58 | 
 59 | 
 60 | 
 61 | 
 62 | def create_attrs(input_h, input_w, topK, keepTopK):
 63 | 
 64 |     num_anchors = 3
 65 | 
 66 |     h1 = input_h // 8
 67 |     h2 = input_h // 16
 68 |     h3 = input_h // 32
 69 | 
 70 |     w1 = input_w // 8
 71 |     w2 = input_w // 16
 72 |     w3 = input_w // 32
 73 | 
 74 |     num_boxes = num_anchors * (h1 * w1 + h2 * w2 + h3 * w3)
 75 | 
 76 |     attrs = {}
 77 | 
 78 |     attrs["shareLocation"] = 1
 79 |     attrs["backgroundLabelId"] = -1
 80 |     attrs["numClasses"] = 80
 81 |     attrs["topK"] = topK
 82 |     attrs["keepTopK"] = keepTopK
 83 |     attrs["scoreThreshold"] = 0.4
 84 |     attrs["iouThreshold"] = 0.6
 85 |     attrs["isNormalized"] = 1
 86 |     attrs["clipBoxes"] = 1
 87 | 
 88 |     # 001 is the default plugin version the parser will search for, and therefore can be omitted,
 89 |     # but we include it here for illustrative purposes.
 90 |     attrs["plugin_version"] = "1"
 91 | 
 92 |     return attrs
 93 | 
 94 | 
 95 | def main():
 96 |     parser = argparse.ArgumentParser(description="Add batchedNMSPlugin")
 97 |     parser.add_argument("-f", "--model", help="Path to the ONNX model generated by export_model.py", default="yolov4_1_3_416_416.onnx")
 98 |     parser.add_argument("-t", "--topK", help="number of bounding boxes for nms", default=2000)
 99 |     parser.add_argument("-k", "--keepTopK", help="bounding boxes to be kept per image", default=1000)
100 | 
101 |     args, _ = parser.parse_known_args()
102 | 
103 |     graph = gs.import_onnx(onnx.load(args.model))
104 |     
105 |     graph = create_and_add_plugin_node(graph, int(args.topK), int(args.keepTopK))
106 |     
107 |     onnx.save(gs.export_onnx(graph), args.model + ".nms.onnx")
108 | 
109 | 
110 | if __name__ == '__main__':
111 |     main()
112 | 
113 | 


--------------------------------------------------------------------------------
/tensorrt_yolov7/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 | # SPDX-License-Identifier: MIT
 4 | #
 5 | # Permission is hereby granted, free of charge, to any person obtaining a
 6 | # copy of this software and associated documentation files (the "Software"),
 7 | # to deal in the Software without restriction, including without limitation
 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 | # and/or sell copies of the Software, and to permit persons to whom the
10 | # Software is furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in
13 | # all copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 | # DEALINGS IN THE SOFTWARE.
22 | ################################################################################
23 | cmake_minimum_required( VERSION 3.0 )
24 | 
25 | project( YOLOV7 )
26 | enable_language( CUDA )
27 | find_package(CUDA)
28 | set( CMAKE_C_STANDARD 99 )
29 | set( CMAKE_CXX_STANDARD 11 )
30 | set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g -fpic -fpie -fpermissive  -std=c++11 -pthread" )
31 | 
32 | find_package(OpenCV REQUIRED)
33 | include_directories( ${OpenCV_INCLUDE_DIRS})
34 | find_package(jsoncpp CONFIG REQUIRED)
35 | 
36 | 
37 | 
38 | # global include_directories
39 | include_directories( /usr/local/cuda/include )
40 | #add judgement about system:
41 | 
42 | MESSAGE(STATUS "CMAKE_HOST_SYSTEM_PROCESSOR is ${CMAKE_HOST_SYSTEM_PROCESSOR}")
43 | 
44 | if (${CMAKE_HOST_SYSTEM_PROCESSOR} EQUAL aarch64)
45 |     include_directories( /usr/include/aarch64-linux-gnu/ ) # for jetson
46 | elseif(${CMAKE_HOST_SYSTEM_PROCESSOR} EQUAL x86_64)
47 |     include_directories( /usr/lib/x86_64-linux-gnu/ )
48 | endif()
49 | 
50 | include_directories( "${CMAKE_SOURCE_DIR}/src/" )
51 | include_directories( "/usr/include/jsoncpp/")
52 | # global definitions
53 | add_definitions( -w)
54 | 
55 | # global library path
56 | if (${CMAKE_HOST_SYSTEM_PROCESSOR} EQUAL aarch64)
57 |     link_directories( "/usr/lib/aarch64-linux-gnu/" )
58 | elseif(${CMAKE_HOST_SYSTEM_PROCESSOR} EQUAL x86_64)
59 |     link_directories( "/usr/lib/x86_64-linux-gnu/" )
60 | endif()
61 | 
62 | link_directories( "/usr/lib/" )
63 | link_directories( "/usr/local/lib/")
64 | link_directories( "/usr/local/cuda/lib64/" )
65 | 
66 | FILE(GLOB_RECURSE YOLO_SRC src/*.cpp )
67 | add_library( yolo SHARED ${YOLO_SRC} )
68 | target_link_libraries(yolo PRIVATE nvinfer)
69 | target_link_libraries(yolo PRIVATE nvinfer_plugin)
70 | target_link_libraries(yolo PRIVATE nvparsers)
71 | target_link_libraries(yolo PRIVATE nvonnxparser cudart ${OpenCV_LIBS})
72 | 
73 | add_executable(detect samples/detect.cpp )
74 | target_link_libraries(detect yolo cudart ${OpenCV_LIBS} )
75 | 
76 | add_executable(video_detect samples/video_detect.cpp )
77 | target_link_libraries(video_detect yolo cudart ${OpenCV_LIBS} )
78 | 
79 | add_executable(validate_coco samples/validate_coco.cpp )
80 | target_link_libraries(validate_coco yolo cudart ${OpenCV_LIBS} )
81 | target_link_libraries(validate_coco jsoncpp)
82 | 


--------------------------------------------------------------------------------
/tensorrt_yolov7/README.md:
--------------------------------------------------------------------------------
 1 | # Yolov7 TensorRT cpp
 2 | 
 3 | ## Description
 4 | This is a yolov7 TensorRT cpp app. Fisrt, using trtexec to convert onnx model to FP32 or FP16 TensorRT engine ,or INT8 TensorRT engine from the QAT model finetuned from [yolov7_qat](../yolov7_qat).
 5 | Then you can use the `detect/video_detect` app to detect a list of images(images number must smaller than the batchsize of the model)/video. or use `validate_coco` app to test mAP of the TensorRT engine.
 6 | ## Prerequisites
 7 | #### Install opencv
 8 | - Note: There are OpenCV4 dependencies in this program. 
 9 | Follow README and documents of this repository https://github.com/opencv/opencv to install OpenCV.
10 | And, if you want use detect_video app, please install opencv with `ffmpeg` enabled
11 | 
12 | #### Install jsoncpp libs
13 | jsoncpp lib is used to write coco-dataset-validate-result to json file. 
14 | ```bash
15 | $ sudo apt-get install libjsoncpp-dev
16 | ```
17 | ## Build and Run yolov7-TensorRT-app
18 | ### Build
19 | ```bash
20 | $ mkdir build && cd build
21 | $ cmake ..
22 | $ make -j4
23 | ```
24 | 
25 | ### Prepare TensorRT engines
26 | 
27 | convert onnx model to tensorrt-engine
28 | ```bash
29 | # fp32 model
30 | $ /usr/src/tensorrt/bin/trtexec --onnx=yolov7.onnx --saveEngine=yolov7fp32.engine
31 | # fp16 model
32 | $ /usr/src/tensorrt/bin/trtexec --onnx=yolov7.onnx --saveEngine=yolov7fp16.engine --fp16
33 | # int8 QAT model, the onnx model with Q&DQ nodes
34 | $ /usr/src/tensorrt/bin/trtexec --onnx=yolov7qat.onnx --saveEngine=yolov7QAT.engine --fp16 --int8
35 | ```
36 | ### Detection & Validate
37 | - detect with image:
38 |     ```bash
39 |     $ ./build/detect --engine=yolov7db4fp32.engine --img=./imgs/horses.jpg,./imgs/zidane.jpg
40 |     ```
41 | - detect with video:
42 |     - note: only support batchsize = 1 now.
43 |     ```bash
44 |     $ ./build/video_detect --engine=./yolov7fp32.engine --video=YOUR_VIDEO_PATH.mp4
45 |     ```
46 | - validate mAP on dataset
47 |     - note: validate_coco only support model inputsize `[batchsize, 3, 672, 672]`
48 |     ```bash
49 |     $ ./build/validate_coco --engine=./yolov7fp32.engine --coco=/YOUR/COCO/DATA/PATH/
50 |     --------------------------------------------------------
51 |     Yolov7 initialized from: yolov7672.engine
52 |     input : images , shape : [ 1,3,672,672,]
53 |     output : output , shape : [ 1,27783,85,]
54 |     --------------------------------------------------------
55 |     5000 / 5000
56 |     predict result has been written to ./predict.json
57 | 
58 |     $ python test_coco_map.py --predict ./predict.json --coco /YOUR/COCO/DATA/PATH/
59 |     ...
60 |     Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.51005
61 |     ...
62 |     ```
63 | 


--------------------------------------------------------------------------------
/tensorrt_yolov7/imgs/horses.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-AI-IOT/yolo_deepstream/e9b75770ea58713d1cb3902d67c36e11acb888d7/tensorrt_yolov7/imgs/horses.jpg


--------------------------------------------------------------------------------
/tensorrt_yolov7/imgs/zidane.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-AI-IOT/yolo_deepstream/e9b75770ea58713d1cb3902d67c36e11acb888d7/tensorrt_yolov7/imgs/zidane.jpg


--------------------------------------------------------------------------------
/tensorrt_yolov7/samples/detect.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | /*
  3 |  * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  4 |  * SPDX-License-Identifier: MIT
  5 |  *
  6 |  * Permission is hereby granted, free of charge, to any person obtaining a
  7 |  * copy of this software and associated documentation files (the "Software"),
  8 |  * to deal in the Software without restriction, including without limitation
  9 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 10 |  * and/or sell copies of the Software, and to permit persons to whom the
 11 |  * Software is furnished to do so, subject to the following conditions:
 12 |  *
 13 |  * The above copyright notice and this permission notice shall be included in
 14 |  * all copies or substantial portions of the Software.
 15 |  *
 16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 19 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 21 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 22 |  * DEALINGS IN THE SOFTWARE.
 23 |  */
 24 | #include <Yolov7.h>
 25 | #include <vector>
 26 | #include <numeric>
 27 | #include <random>
 28 | #include <string>
 29 | #include <opencv2/opencv.hpp>
 30 | #include <opencv2/core/core.hpp>
 31 | #include <opencv2/highgui/highgui.hpp>
 32 | #include <opencv2/imgproc/imgproc.hpp>
 33 | #include <argsParser.h>
 34 | 
 35 | 
 36 | std::vector<std::string> parse_img_paths(argsParser& cmdLine) {
 37 |     return cmdLine.ParseStringList("img");
 38 | }
 39 | 
 40 | std::string parse_model_path(argsParser& cmdLine) {
 41 |     const char* engine_path_str = cmdLine.ParseString("engine");
 42 |     std::string engine_path;
 43 |     if (engine_path_str) engine_path = std::string(engine_path_str);
 44 |     return engine_path;
 45 | }
 46 | 
 47 | bool print_help() {
 48 |     printf("--------------------------------------------------------------------------------------------------------\n");
 49 |     printf("---------------------------- yolov7 images detector ---------------------------------------------\n");
 50 |     printf(" '--help': print help information \n");
 51 |     printf(" '--engine=yolov7.engine' Load yolov7 trt-engine  \n");
 52 |     printf(" '--img=img1,jpg,img2.jpg,img3.jpg' specify the path of the images, split by `,`\n");
 53 |     return true;
 54 | }
 55 | 
 56 | 
 57 | int main(int argc, char** argv){
 58 | 
 59 |     argsParser cmdLine(argc, argv);
 60 |     //! parse device_flag, see parse_device_flag
 61 |     if(cmdLine.ParseFlag("help")) { print_help(); return 0; }
 62 | 
 63 |     std::string engine_path = parse_model_path(cmdLine);
 64 |     std::vector<std::string> img_paths = parse_img_paths(cmdLine);
 65 |     // print img paths
 66 |     std::cout<<"input "<<img_paths.size()<<" images, paths: ";
 67 |     for(int i = 0;i < img_paths.size();i++) {
 68 |         std::cout<< img_paths[i]<<", ";
 69 |     }
 70 |     std::cout<<std::endl;
 71 |     
 72 |     Yolov7 yolov7(engine_path);
 73 | 
 74 |     std::vector<cv::Mat> bgr_imgs;
 75 |     for(int i = 0; i< img_paths.size();i++){
 76 |         bgr_imgs.push_back(cv::imread(img_paths[i]));
 77 |     }
 78 |     
 79 |     std::cout<<"preprocess start"<<std::endl;
 80 | 
 81 |     yolov7.preProcess(bgr_imgs);
 82 |     
 83 |     std::cout<<"inference start"<<std::endl;
 84 | 
 85 |     yolov7.infer();
 86 | 
 87 |     std::cout<<"postprocessing start"<<std::endl;
 88 | 
 89 |     std::vector<std::vector<std::vector<float>>> nmsresults = yolov7.PostProcess();
 90 | 
 91 |     for(int j =0; j < nmsresults.size();j++){
 92 |         Yolov7::DrawBoxesonGraph(bgr_imgs[j],nmsresults[j]);
 93 |         std::string output_path = img_paths[j] + "detect" + std::to_string(j)+".jpg";      
 94 |         cv::imwrite(output_path, bgr_imgs[j]);
 95 |         std::cout<<"detectec image written to: "<<output_path<<std::endl;
 96 |     }
 97 |     
 98 |     std::cout<<"Done..."<<std::endl;
 99 |     
100 | }
101 | 


--------------------------------------------------------------------------------
/tensorrt_yolov7/samples/validate_coco.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | /*
  3 |  * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  4 |  * SPDX-License-Identifier: MIT
  5 |  *
  6 |  * Permission is hereby granted, free of charge, to any person obtaining a
  7 |  * copy of this software and associated documentation files (the "Software"),
  8 |  * to deal in the Software without restriction, including without limitation
  9 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 10 |  * and/or sell copies of the Software, and to permit persons to whom the
 11 |  * Software is furnished to do so, subject to the following conditions:
 12 |  *
 13 |  * The above copyright notice and this permission notice shall be included in
 14 |  * all copies or substantial portions of the Software.
 15 |  *
 16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 19 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 21 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 22 |  * DEALINGS IN THE SOFTWARE.
 23 |  */
 24 | #include <Yolov7.h>
 25 | #include <vector>
 26 | #include <numeric>
 27 | #include <random>
 28 | #include <string>
 29 | #include <opencv2/opencv.hpp>
 30 | #include <opencv2/core/core.hpp>
 31 | #include <opencv2/highgui/highgui.hpp>
 32 | #include <opencv2/imgproc/imgproc.hpp>
 33 | #include <json/json.h>
 34 | #include <fstream>
 35 | #include <argsParser.h>
 36 | 
 37 | std::string parse_model_path(argsParser& cmdLine) {
 38 |     const char* engine_path_str = cmdLine.ParseString("engine");
 39 |     std::string engine_path;
 40 |     if (engine_path_str) engine_path = std::string(engine_path_str);
 41 |     return engine_path;
 42 | }
 43 | 
 44 | std::string parse_coco_path(argsParser& cmdLine) {
 45 |     const char* coco_path_str = cmdLine.ParseString("coco");
 46 |     std::string coco_path;
 47 |     if (coco_path_str) coco_path = std::string(coco_path_str);
 48 |     return coco_path;
 49 | }
 50 | 
 51 | bool print_help() {
 52 |     printf("--------------------------------------------------------------------------------------------------------\n");
 53 |     printf("---------------------------- yolov7 coco validate tool ---------------------------------------------\n");
 54 |     printf(" '--help': print help information \n");
 55 |     printf(" '--engine=yolov7.engine' Load yolov7 trt-engine  \n");
 56 |     printf(" '--coco=./data/coco/' specify the path of the coco dataset\n");
 57 |     return true;
 58 | }
 59 | 
 60 | int coco80_to_coco91_class(int id) {
 61 |     //# converts 80-index (val2014) to 91-index (paper)
 62 |     // # https://tech.amikelive.com/node-718/what-object-categories-labels-are-in-coco-dataset/
 63 |     // # a = np.loadtxt('data/coco.names', dtype='str', delimiter='\n')
 64 |     // # b = np.loadtxt('data/coco_paper.names', dtype='str', delimiter='\n')
 65 |     // # x1 = [list(a[i] == b).index(True) + 1 for i in range(80)]  # darknet to coco
 66 |     // # x2 = [list(b[i] == a).index(True) if any(b[i] == a) else None for i in range(91)]  # coco to darknet
 67 |     std::vector<int> x = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34,
 68 |          35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
 69 |          64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90};
 70 |     return x[id];
 71 | }
 72 | std::vector<float> xyxy2xywh(float x0, float x1, float x2, float x3){
 73 |     // # Convert nx4 boxes from [x1, y1, x2, y2] to [x, y, w, h] where xy1=top-left, xy2=bottom-right
 74 |     // y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
 75 |     std::vector<float> y;
 76 |     y.resize(4);
 77 |     y[0] = (x0 + x2) / 2;//  # x center
 78 |     y[1] = (x1 + x3) / 2;//  # y center
 79 |     y[2] = x2 - x0;//  # width
 80 |     y[3] = x3 - x1;//  # height
 81 |     y[0] -= y[2]/2;
 82 |     y[1] -= y[3]/2;
 83 |     // box[:, :2] -= box[:, 2:] / 2 
 84 |     
 85 |     return y;
 86 | }
 87 | 
 88 | int number_classes = 80;
 89 | 
 90 | std::vector<std::string> readCocoPaths(std::string coco_file_path) {
 91 |     std::vector<std::string> result;
 92 |     std::ifstream coco_test_file(coco_file_path);
 93 |     std::string line;
 94 |     std::string folder_path = coco_file_path.substr(0, coco_file_path.find_last_of("/")+1);
 95 |     if(coco_test_file) {
 96 |         while(getline(coco_test_file, line)){
 97 |             
 98 |             result.push_back(folder_path+line);
 99 |             // std::cout<<"folder_path+line:"<<folder_path+line<<std::endl;
100 |         }
101 |     }
102 |     // std::cout <<"Done"<<folder_path<<std::endl;
103 |     return result;
104 | }
105 | 
106 | 
107 | int main(int argc, char** argv){
108 | 
109 |     argsParser cmdLine(argc, argv);
110 |     //! parse device_flag, see parse_device_flag
111 |     if(cmdLine.ParseFlag("help")) { print_help(); return 0; }
112 | 
113 |     std::string engine_path = parse_model_path(cmdLine);
114 |     std::string coco_path = parse_coco_path(cmdLine);
115 | 
116 |     coco_path += "/val2017.txt";
117 |     Yolov7 yolov7(engine_path);
118 | 
119 |     // containor fr
120 |     std::vector<cv::Mat> bgr_imgs;
121 |     std::vector<std::string> imgPathList = readCocoPaths(coco_path);;
122 |     std::vector<std::vector<std::vector<float>>> batchNmsResult;
123 |     int maxBatchsize = yolov7.getInputDim().d[0];
124 | 
125 |     
126 |     Json::Value root;
127 |     Json::FastWriter writer;
128 | 
129 |     for(int i = 0 ; i < imgPathList.size(); ){
130 |         //infer with a batch
131 |         for(int j = 0; j < maxBatchsize && i<imgPathList.size() ; j++,i++){
132 |             cv::Mat one_img = cv::imread(imgPathList[i]);
133 |             bgr_imgs.push_back(one_img);
134 |         }
135 | 
136 |         std::vector<cv::Mat> nchwMats = yolov7.preProcess4Validate(bgr_imgs);
137 | 
138 |         printf("\r%d / %d", i, imgPathList.size());
139 |         fflush(stdout);
140 |         
141 |         yolov7.infer();
142 |         
143 |         batchNmsResult = yolov7.PostProcess(0.65, 0.001);
144 | 
145 |         for(int j = 0; j< batchNmsResult.size();j++){
146 |             int imgth = i - batchNmsResult.size() + j;
147 |             // processing the name. eg: ./images/train2017/000000000250.jpg will be processed as 250
148 |             int image_id = stoi(imgPathList[imgth].substr(imgPathList[imgth].length()-16, imgPathList[imgth].find_last_of(".")-(imgPathList[imgth].length()-16)));
149 |             for(int k = 0; k <batchNmsResult[j].size();k++){
150 |                 Json::Value OneResult;
151 |                 Json::Value bboxObj;
152 |                 
153 |                 OneResult["image_id"] = image_id;
154 |                 OneResult["category_id"] = coco80_to_coco91_class(batchNmsResult[j][k][4]);
155 |                 OneResult["score"] = batchNmsResult[j][k][5];
156 |                 std::vector<float> point = xyxy2xywh(batchNmsResult[j][k][0],batchNmsResult[j][k][1],batchNmsResult[j][k][2],batchNmsResult[j][k][3]);
157 |                 bboxObj.append(point[0]);
158 |                 bboxObj.append(point[1]);
159 |                 bboxObj.append(point[2]);
160 |                 bboxObj.append(point[3]);
161 |                 OneResult["bbox"] = bboxObj;
162 |                 root.append(OneResult);
163 |             }
164 |         }
165 |         bgr_imgs.clear();
166 |     }
167 |     
168 |     std::string json_file = writer.write(root);
169 |     std::ofstream out("./predict.json");
170 |     out << json_file;
171 |     std::cout<<std::endl<<"predict result has been written to ./predict.json "<<std::endl;
172 |     return 0;
173 | }
174 | 


--------------------------------------------------------------------------------
/tensorrt_yolov7/samples/video_detect.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | /*
  3 |  * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  4 |  * SPDX-License-Identifier: MIT
  5 |  *
  6 |  * Permission is hereby granted, free of charge, to any person obtaining a
  7 |  * copy of this software and associated documentation files (the "Software"),
  8 |  * to deal in the Software without restriction, including without limitation
  9 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 10 |  * and/or sell copies of the Software, and to permit persons to whom the
 11 |  * Software is furnished to do so, subject to the following conditions:
 12 |  *
 13 |  * The above copyright notice and this permission notice shall be included in
 14 |  * all copies or substantial portions of the Software.
 15 |  *
 16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 19 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 21 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 22 |  * DEALINGS IN THE SOFTWARE.
 23 |  */
 24 | #include <Yolov7.h>
 25 | #include <vector>
 26 | #include <numeric>
 27 | #include <random>
 28 | #include <string>
 29 | #include <opencv2/opencv.hpp>
 30 | #include <opencv2/core/core.hpp>
 31 | #include <opencv2/highgui/highgui.hpp>
 32 | #include <opencv2/imgproc/imgproc.hpp>
 33 | #include <argsParser.h>
 34 | 
 35 | 
 36 | std::string parse_video_path(argsParser& cmdLine) {
 37 |     const char* video_path_str = cmdLine.ParseString("video");
 38 |     std::string video_path;
 39 |     if (video_path_str) video_path = std::string(video_path_str);
 40 |     return video_path;
 41 | }
 42 | 
 43 | std::string parse_model_path(argsParser& cmdLine) {
 44 |     const char* engine_path_str = cmdLine.ParseString("engine");
 45 |     std::string engine_path;
 46 |     if (engine_path_str) engine_path = std::string(engine_path_str);
 47 |     return engine_path;
 48 | }
 49 | 
 50 | bool print_help() {
 51 |     printf("--------------------------------------------------------------------------------------------------------\n");
 52 |     printf("---------------------------- yolov7 images detector ---------------------------------------------\n");
 53 |     printf(" '--help': print help information \n");
 54 |     printf(" '--engine=yolov7.engine' Load yolov7 trt-engine  \n");
 55 |     printf(" '--video=video.mp4' specify the path of the video \n");
 56 |     return true;
 57 | }
 58 | 
 59 | 
 60 | int main(int argc, char** argv){
 61 | 
 62 |     argsParser cmdLine(argc, argv);
 63 |     //! parse device_flag, see parse_device_flag
 64 |     if(cmdLine.ParseFlag("help")) { print_help(); return 0; }
 65 | 
 66 |     std::string engine_path = parse_model_path(cmdLine);
 67 |     std::string video_path = parse_video_path(cmdLine);
 68 |     
 69 |     Yolov7 yolov7(engine_path);
 70 | 
 71 |     cv::VideoCapture capture;
 72 |     cv::Mat frame;
 73 |     frame= capture.open(video_path);
 74 |     if(!capture.isOpened())
 75 |     {
 76 |         printf("can not open ... please check whether your opencv has installed with ffmpeg..\n");
 77 |         return -1;
 78 |     }
 79 |     cv::Size size = cv::Size(capture.get(cv::CAP_PROP_FRAME_WIDTH), capture.get(cv::CAP_PROP_FRAME_HEIGHT));
 80 |     cv::VideoWriter writer;
 81 |     writer.open(std::string(video_path+".detect.mp4"), cv::VideoWriter::fourcc('M', 'J', 'P', 'G'), 10, size, true);
 82 |     std::vector<cv::Mat> framev;
 83 |     std::vector<std::vector<std::vector<float>>> nmsresults;
 84 |     int total_frame_count = capture.get(cv::CAP_PROP_FRAME_COUNT);
 85 |     int i = 0;
 86 |     while (capture.read(frame)){
 87 |         framev.push_back(frame);
 88 |         yolov7.preProcess(framev);
 89 |         yolov7.infer();
 90 |         nmsresults = yolov7.PostProcess();
 91 |         Yolov7::DrawBoxesonGraph(frame,nmsresults[0]);
 92 |         writer.write(frame);
 93 |         framev.clear();
 94 |         i++;
 95 |         printf("\r%d / %d", i, total_frame_count);
 96 |         fflush(stdout);
 97 |     }
 98 |     capture.release();
 99 |     std::cout<<"Done..."<<std::endl;
100 |     
101 | }
102 | 


--------------------------------------------------------------------------------
/tensorrt_yolov7/src/Yolov7.h:
--------------------------------------------------------------------------------
  1 | 
  2 | /*
  3 |  * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  4 |  * SPDX-License-Identifier: MIT
  5 |  *
  6 |  * Permission is hereby granted, free of charge, to any person obtaining a
  7 |  * copy of this software and associated documentation files (the "Software"),
  8 |  * to deal in the Software without restriction, including without limitation
  9 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 10 |  * and/or sell copies of the Software, and to permit persons to whom the
 11 |  * Software is furnished to do so, subject to the following conditions:
 12 |  *
 13 |  * The above copyright notice and this permission notice shall be included in
 14 |  * all copies or substantial portions of the Software.
 15 |  *
 16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 19 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 21 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 22 |  * DEALINGS IN THE SOFTWARE.
 23 |  */
 24 | 
 25 | 
 26 | //!
 27 | //! This file contains the implementation of the yolov7 sample. 
 28 | //!
 29 | #pragma once
 30 | 
 31 | #include <iostream>
 32 | #include <string>
 33 | #include <memory>
 34 | #include <cuda_runtime.h>
 35 | #include "NvInfer.h"
 36 | #include <vector>
 37 | #include <NvInferPlugin.h>
 38 | #include <tools.h>
 39 | #include <fstream>
 40 | #include <algorithm>
 41 | #include <numeric>
 42 | //opencv for preprocessing &  postprocessing
 43 | #include <opencv2/opencv.hpp>
 44 | #include <opencv2/core/core.hpp>
 45 | #include <opencv2/highgui/highgui.hpp>
 46 | #include <opencv2/imgproc/imgproc.hpp>
 47 | 
 48 | class Yolov7 {
 49 | public:
 50 |     //!
 51 |     //! \brief init Yolov7 class object
 52 |     //!
 53 |     //! \param engine_path The path of trt engine file
 54 |     //!
 55 |     Yolov7(std::string engine_path);
 56 | 
 57 |     //!
 58 |     //! \brief preprocess a list of image, the image will remembered inside the class by Yolov7 object
 59 |     //!
 60 |     //! \param cv_img  input images with BGR-UInt8, the size of the vector must smmaller than the maxBatchsize of the model
 61 |     //!
 62 |     std::vector<cv::Mat> preProcess(std::vector<cv::Mat> &cv_img);// 
 63 | 
 64 |     //!
 65 |     //! \brief run tensorRT inference with the data preProcessed
 66 |     //!
 67 |     int infer();
 68 | 
 69 |     //!
 70 |     //! \brief PostProcess, will decode and nms the batch inference result of yolov7
 71 |     //!
 72 |     //! \param cv_img  
 73 |     //! \return return all the nms result of Yolov7 
 74 |     //!
 75 |     std::vector<std::vector<std::vector<float>>> PostProcess(float iou_thres = 0.45f, float conf_thres = 0.25f);
 76 |     
 77 |     //!
 78 |     //! \brief Get the input dimenssion of the model
 79 |     //!
 80 |     //! \return return Dims of input
 81 |     //!
 82 |     nvinfer1::Dims getInputDim();
 83 | 
 84 |     //!
 85 |     //! \brief Get the output dimenssion of the model
 86 |     //!
 87 |     //! \return return the Dims of output
 88 |     //!
 89 |     nvinfer1::Dims getOutputDim();
 90 | 
 91 |     //!
 92 |     //! \brief Draw boxes on bgr image
 93 |     //! \param bgr_img The images need to be drawed with boxes
 94 |     //! \param nmsresult nms result get from PostProcess function
 95 |     //!
 96 |     static int Yolov7::DrawBoxesonGraph(cv::Mat &bgr_img, std::vector<std::vector<float>> nmsresult);
 97 | 
 98 |     //!
 99 |     //! \brief preprocess a list of image for validate mAP on coco dataset! the model must have a [batchsize, 3, 672, 672] input
100 |     //!
101 |     //! \param cv_img  input images with BGR-UInt8, the size of the vector must smmaller than the maxBatchsize of the model
102 |     //!
103 |     std::vector<cv::Mat> preProcess4Validate(std::vector<cv::Mat> &cv_img);
104 | 
105 |     //!
106 |     //! \brief PostProcess for validate mAP on coco dataset!, will decode the batch inference result of yolov7
107 |     //!
108 |     //! \param cv_img  
109 |     //! \return return all the nms result of Yolov7 
110 |     //!
111 |     std::vector<std::vector<std::vector<float>>> PostProcess4Validate(float iou_thres = 0.45f, float conf_thres = 0.25f);
112 | private:
113 | 
114 |     int pushImg(void *imgBuffer, int numImg, bool fromCPU = true);
115 | 
116 |     std::vector<std::vector<std::vector<float>>> decode_yolov7_result(float conf_thres);
117 |     std::vector<std::vector<std::vector<float>>> yolov7_nms(std::vector<std::vector<std::vector<float>>> &bboxes, float iou_thres);
118 |     std::vector<std::vector<float>> nms(std::vector<std::vector<float>> &bboxes, float iou_thres);
119 |     
120 |     //TODO: to be imp
121 |     void CudaGraphEndCapture(cudaStream_t stream);
122 | 
123 |     void CudaGraphBeginCapture(cudaStream_t stream);
124 | 
125 |     bool CudaGraphLaunch(cudaStream_t stream);
126 | 
127 |     bool enableCudaGraph();
128 | 
129 |     void ReportArgs();
130 | 
131 | private:
132 | 
133 |     int mImgPushed;
134 |     int mMaxBatchSize;
135 |     bool mDynamicBatch;
136 | 
137 |     //stream and event
138 |     std::unique_ptr<CUstream_st, StreamDeleter> mStream;
139 |     std::unique_ptr<CUevent_st, EventDeleter> mEvent;
140 | 
141 |     // trt objects
142 |     std::unique_ptr<nvinfer1::IRuntime,TrtDeleter<nvinfer1::IRuntime>> mRuntime;
143 |     std::unique_ptr<nvinfer1::ICudaEngine, TrtDeleter<nvinfer1::ICudaEngine>> mEngine;
144 |     std::unique_ptr<nvinfer1::IExecutionContext, TrtDeleter<nvinfer1::IExecutionContext>> mContext;
145 |     std::vector<std::unique_ptr<char, CuMemDeleter<char>>> mBindings;
146 | 
147 |     std::vector<void *> mBindingArray;
148 |     std::vector<float> mHostOutputBuffer;
149 |     std::vector<float> mHostNMSBuffer;
150 | 
151 |     std::string mEnginePath;
152 |     nvinfer1::Dims mInputDim; //maxB,3,640,640
153 |     nvinfer1::Dims mOutputDim;
154 |     int mImgBufferSize;//sizeof(float)x3x640x640
155 | 
156 |     //cuda graph objects
157 |     cudaGraph_t mGraph{};
158 |     cudaGraphExec_t mGraphExec{};
159 | 
160 |     std::vector<std::vector<float>> md2i;
161 | 
162 |     bool mCudaGraphEnabled;
163 | 
164 | //TODOs
165 |     //!
166 |     //! get how many imgs has been totally processed
167 |     //!
168 |     // caculate fps real time
169 |     unsigned long long mLast_inference_time;
170 |     unsigned long long mTotal_inference_time;
171 |     int mInference_count;
172 | public:
173 |     int imgProcessed() { return mInference_count; };
174 | };
175 | 


--------------------------------------------------------------------------------
/tensorrt_yolov7/src/argsParser.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | /*
 3 |  * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 4 |  * SPDX-License-Identifier: MIT
 5 |  *
 6 |  * Permission is hereby granted, free of charge, to any person obtaining a
 7 |  * copy of this software and associated documentation files (the "Software"),
 8 |  * to deal in the Software without restriction, including without limitation
 9 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 |  * and/or sell copies of the Software, and to permit persons to whom the
11 |  * Software is furnished to do so, subject to the following conditions:
12 |  *
13 |  * The above copyright notice and this permission notice shall be included in
14 |  * all copies or substantial portions of the Software.
15 |  *
16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 |  * DEALINGS IN THE SOFTWARE.
23 |  */
24 | 
25 | 
26 | #include "argsParser.h"
27 | 
28 | // constructor
29 | argsParser::argsParser(const int pArgc, char** pArgv) {
30 |     argc = pArgc;
31 |     argv = pArgv;
32 | }
33 | // ParseFlag
34 | bool argsParser::ParseFlag(std::string string_ref) const {
35 |     if (argc < 1) return false;
36 | 
37 |     for (int i = 0; i < argc; i++) {
38 |         const int string_start = std::string(argv[i]).find_last_of('-') + 1; 
39 |         if (string_start == 0) continue;
40 | 
41 |         const char* string_argv = &argv[i][string_start];
42 |         
43 |         const char* equal_pos = strchr(string_argv, '=');
44 | 
45 |         const int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
46 |         const int length = (int)(string_ref.size());
47 | 
48 |         if (length == argv_length && !strncasecmp(string_argv, string_ref.c_str(), length)) return true;
49 |     }
50 |     return false;
51 | }
52 | 
53 | // ParseString
54 | const char* argsParser::ParseString(std::string string_ref) const {
55 |     if (argc < 1) return NULL;
56 | 
57 |     for (int i = 0; i < argc; i++) {
58 |         const int string_start = std::string(argv[i]).find_last_of('-') + 1; 
59 | 
60 |         if (string_start == 0) continue;
61 | 
62 |         char* string_argv = (char*)&argv[i][string_start];
63 |         const int length = (int)(string_ref.size());
64 | 
65 |         if (!strncasecmp(string_argv, string_ref.c_str(), length)) return (string_argv + length + 1);
66 |         //*string_retval = &string_argv[length+1];
67 |     }
68 |     return NULL;
69 | }
70 | 
71 | 
72 | // ParseStringList eg. img1,img2,img3
73 | std::vector<std::string> argsParser::ParseStringList(std::string argName, const char delimiter)  const{
74 |     const char* ListStr = ParseString(argName);
75 |     std::vector<std::string> result;
76 |     if (ListStr == NULL) return result;
77 |     int string_start = 0;
78 |     int string_end = 0;
79 | 
80 |     int strLen = (int)strlen(ListStr);
81 |     while(string_end < strLen){
82 |         while (delimiter != ListStr[string_end] && string_end < strLen) string_end++;
83 |         result.push_back(std::string(ListStr).substr(string_start,string_end-string_start));
84 |         string_end++;
85 |         string_start = string_end;
86 |     }
87 |     return result;
88 | }
89 | 


--------------------------------------------------------------------------------
/tensorrt_yolov7/src/argsParser.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /*
 3 |  * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 4 |  * SPDX-License-Identifier: MIT
 5 |  *
 6 |  * Permission is hereby granted, free of charge, to any person obtaining a
 7 |  * copy of this software and associated documentation files (the "Software"),
 8 |  * to deal in the Software without restriction, including without limitation
 9 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 |  * and/or sell copies of the Software, and to permit persons to whom the
11 |  * Software is furnished to do so, subject to the following conditions:
12 |  *
13 |  * The above copyright notice and this permission notice shall be included in
14 |  * all copies or substantial portions of the Software.
15 |  *
16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 |  * DEALINGS IN THE SOFTWARE.
23 |  */
24 | 
25 | 
26 | #ifndef __COMMAND_LINE_H_
27 | #define __COMMAND_LINE_H_
28 | 
29 | #include <stdint.h>
30 | #include <stdlib.h>
31 | #include <vector>
32 | #include <string>
33 | 
34 | #include <string.h>
35 | #include <strings.h>
36 | 
37 | /**
38 |  * args line parser 
39 |  */
40 | class argsParser {
41 | public:
42 |     argsParser(const int argc, char** argv);
43 | 
44 |     /**
45 |      * Parse Flag
46 |      */
47 |     bool ParseFlag(const std::string argName) const;
48 | 
49 |     /**
50 |      * Parse String
51 |      */
52 |     const char* ParseString(const std::string  argName) const;
53 |     // const char* ParseString2(const std::string argName, const char* defaultValue = NULL, bool allowOtherDelimiters = true) const;
54 | 
55 |     /**
56 |      * Parse String list delimited by ","
57 |      */
58 |     std::vector<std::string> ParseStringList(std::string argName, const char delimiter = ',') const;
59 | 
60 |     /**
61 |      * The argument count that the object was created with from main()
62 |      */
63 |     int argc;
64 |     char** argv;
65 | };
66 | 
67 | #endif
68 | 


--------------------------------------------------------------------------------
/tensorrt_yolov7/src/tools.h:
--------------------------------------------------------------------------------
  1 | 
  2 | /*
  3 |  * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  4 |  * SPDX-License-Identifier: MIT
  5 |  *
  6 |  * Permission is hereby granted, free of charge, to any person obtaining a
  7 |  * copy of this software and associated documentation files (the "Software"),
  8 |  * to deal in the Software without restriction, including without limitation
  9 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 10 |  * and/or sell copies of the Software, and to permit persons to whom the
 11 |  * Software is furnished to do so, subject to the following conditions:
 12 |  *
 13 |  * The above copyright notice and this permission notice shall be included in
 14 |  * all copies or substantial portions of the Software.
 15 |  *
 16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 19 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 21 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 22 |  * DEALINGS IN THE SOFTWARE.
 23 |  */
 24 | 
 25 | #ifndef __TOOLS_H__
 26 | #define __TOOLS_H__
 27 | 
 28 | #include <cuda_runtime.h>
 29 | #include <iostream>
 30 | #include <memory>
 31 | #include <stdexcept>
 32 | #include "NvInfer.h"
 33 | 
 34 | void checkCudaErrors(cudaError_t err) {
 35 |     if (err != cudaSuccess) throw std::runtime_error(cudaGetErrorName(err));
 36 | }
 37 | 
 38 | // Logger for TensorRT info/warning/errors
 39 | class Logger : public nvinfer1::ILogger {
 40 | public:
 41 |     Logger(Severity severity = Severity::kWARNING) : reportableSeverity(severity) {}
 42 | 
 43 |     void log(Severity severity, const char* msg) noexcept override {
 44 |         // suppress messages with severity enum value greater than the reportable
 45 |         if (severity > reportableSeverity) return;
 46 | 
 47 |         switch (severity) {
 48 |         case Severity::kINTERNAL_ERROR:
 49 |             std::cerr << "INTERNAL_ERROR: ";
 50 |             break;
 51 |         case Severity::kERROR:
 52 |             std::cerr << "ERROR: ";
 53 |             break;
 54 |         case Severity::kWARNING:
 55 |             std::cerr << "WARNING: ";
 56 |             break;
 57 |         case Severity::kINFO:
 58 |             std::cerr << "INFO: ";
 59 |             break;
 60 |         default:
 61 |             std::cerr << "UNKNOWN: ";
 62 |             break;
 63 |         }
 64 |         std::cerr << msg << std::endl;
 65 |     }
 66 | 
 67 |     Severity reportableSeverity;
 68 | };
 69 | template <typename T>
 70 | struct TrtDeleter {
 71 |     void operator()(T* p) noexcept {
 72 |         if (p != nullptr) delete p;
 73 |     }
 74 | };
 75 | 
 76 | template <typename T>
 77 | struct CuMemDeleter {
 78 |     void operator()(T* p) noexcept { checkCudaErrors(cudaFree(p)); }
 79 | };
 80 | 
 81 | template <typename T>
 82 | std::unique_ptr<T, CuMemDeleter<T>> mallocCudaMem(size_t nbElems) {
 83 |     T* ptr = nullptr;
 84 |     checkCudaErrors(cudaMalloc((void**)&ptr, sizeof(T) * nbElems));
 85 |     return std::unique_ptr<T, CuMemDeleter<T>>{ptr};
 86 | }
 87 | 
 88 | struct EventDeleter {
 89 |     void operator()(CUevent_st* event) noexcept { checkCudaErrors(cudaEventDestroy(event)); }
 90 | };
 91 | struct StreamDeleter {
 92 |     void operator()(CUstream_st* stream) noexcept { checkCudaErrors(cudaStreamDestroy(stream)); }
 93 | };
 94 | 
 95 | std::unique_ptr<CUevent_st, EventDeleter> makeCudaEvent(int flags) {
 96 |     cudaEvent_t event;
 97 |     checkCudaErrors(cudaEventCreateWithFlags(&event, flags));
 98 |     return std::unique_ptr<CUevent_st, EventDeleter>{event};
 99 | }
100 | 
101 | std::unique_ptr<CUstream_st, StreamDeleter> makeCudaStream(int flags, int priority) {
102 |     cudaStream_t stream;
103 |     checkCudaErrors(cudaStreamCreateWithPriority(&stream, flags, priority));
104 |     return std::unique_ptr<CUstream_st, StreamDeleter>{stream};
105 | }
106 | 
107 | 
108 | 
109 | #endif
110 | 


--------------------------------------------------------------------------------
/tensorrt_yolov7/test_coco_map.py:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 | # SPDX-License-Identifier: MIT
 4 | #
 5 | # Permission is hereby granted, free of charge, to any person obtaining a
 6 | # copy of this software and associated documentation files (the "Software"),
 7 | # to deal in the Software without restriction, including without limitation
 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 | # and/or sell copies of the Software, and to permit persons to whom the
10 | # Software is furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in
13 | # all copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 | # DEALINGS IN THE SOFTWARE.
22 | ################################################################################
23 | import json
24 | import os
25 | import argparse
26 | 
27 | if __name__ == '__main__':
28 |     parser = argparse.ArgumentParser(prog='test.py')
29 |     parser.add_argument('--predict', type=str, default='./predict.json', help='model.pt path(s)')
30 |     parser.add_argument('--coco', type=str, default='./coco/', help='*.data path')
31 |     opt = parser.parse_args()
32 |     print('\nEvaluating pycocotools mAP... saving %s...' % opt.predict)
33 |     try:  # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb
34 |         from pycocotools.coco import COCO
35 |         from pycocotools.cocoeval import COCOeval
36 |         anno = COCO(opt.coco+"/annotations/instances_val2017.json")  # init annotations api
37 |         pred = anno.loadRes(opt.predict)  # init predictions api
38 |         eval = COCOeval(anno, pred, 'bbox')
39 |         # if is_coco:
40 |         #     eval.params.imgIds = [int(Path(x).stem) for x in dataloader.dataset.img_files]  # image IDs to evaluate
41 |         eval.evaluate()
42 |         eval.accumulate()
43 |         eval.summarize()
44 |         map, map50 = eval.stats[:2]  # update results (mAP@0.5:0.95, mAP@0.5)
45 |     except Exception as e:
46 |         print(f'pycocotools unable to run: {e}')
47 | 


--------------------------------------------------------------------------------
/yolov7_qat/README.md:
--------------------------------------------------------------------------------
 1 | # YoloV7 Quantization Aware Training
 2 | ## Description
 3 |  We use [TensorRT's pytorch quntization tool](https://github.com/NVIDIA/TensorRT/tree/main/tools/pytorch-quantization) to finetune training QAT yolov7 from the pre-trained weight, then export the model to onnx and deploy it with TensorRT. The accuray and performance can be found in below table.
 4 | 
 5 | |  Method   | Calibration method  | mAP<sup>val<br>0.5|mAP<sup>val<br>0.5:0.95 |batch-1 fps<br>Jetson Orin-X  |batch-16 fps<br>Jetson Orin-X  |weight|
 6 | |  ----  | ----  |----  |----  |----|----|-|
 7 | | pytorch FP16 | -             | 0.6972 | 0.5120 |-|-|[yolov7.pt](https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7.pt)|
 8 | | pytorch PTQ-INT8  | Histogram(MSE)  | 0.6957 | 0.5100 |-|-|[yolov7_ptq.pt](https://nvidia.box.com/shared/static/j0rclm9k2ymj6ahdx55dxnnskzq91flh) [yolov7_ptq_640.onnx](https://nvidia.box.com/shared/static/rlv3buq7sei2log2d3beyg1jhjyw59hn)|
 9 | | pytorch QAT-INT8  | Histogram(MSE)  | 0.6961 | 0.5111 |-|-|[yolov7_qat.pt](https://nvidia.box.com/shared/static/vph9af9rbe7ed7ibfnajsk248mw9nq9f)|
10 | | TensorRT FP16| -             | 0.6973 | 0.5124 |140 |168|[yolov7.onnx](https://nvidia.box.com/shared/static/rmh8rttesg4cgrysb2qm12udpvd95as1) |
11 | | TensorRT PTQ-INT8 | TensorRT built in EntropyCalibratorV2 | 0.6317 | 0.4573 |207|264|-|
12 | | TensorRT QAT-INT8 | Histogram(MSE)  | 0.6962 | 0.5113 |207|266|[yolov7_qat_640.onnx](https://nvidia.box.com/shared/static/v1ze885p35hfjl96xtw8s0xbcpv64tfr)|
13 |  - network input resolution: 3x640x640
14 |  - note: trtexec cudaGraph is enabled
15 | 
16 | ## How To QAT Training
17 | ### 1.Setup
18 | 
19 | Suggest to use docker environment.
20 | ```bash
21 | $ docker pull nvcr.io/nvidia/pytorch:22.09-py3
22 | ```
23 | 
24 | 1. Clone and apply patch
25 | ```bash
26 | # use this YoloV7 as a sample base 
27 | git clone https://github.com/WongKinYiu/yolov7.git
28 | cp -r yolov_deepstream/yolov7_qat/* yolov7/
29 | ```
30 | 
31 | 2. Install dependencies
32 | ```bash
33 | $ pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com
34 | ```
35 | 
36 | 3. Download dataset and pretrained model
37 | ```bash
38 | $ bash scripts/get_coco.sh
39 | $ wget https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7.pt
40 | ```
41 | 
42 | ### 2. Start QAT training
43 |   ```bash
44 |   $ python scripts/qat.py quantize yolov7.pt --ptq=ptq.pt --qat=qat.pt --eval-ptq --eval-origin
45 |   ```
46 |   This script includes steps below: 
47 |   - Insert Q&DQ nodes to get fake-quant pytorch model<br>
48 |   [Pytorch quntization tool](https://github.com/NVIDIA/TensorRT/tree/main/tools/pytorch-quantization) provides automatic insertion of QDQ function. But for yolov7 model, it can not get the same performance as PTQ, because in Explicit mode(QAT mode), TensorRT will henceforth refer Q/DQ nodes' placement to restrict the precision of the model. Some of the automatic added Q&DQ nodes can not be fused with other layers which will cause some extra useless precision convertion. In our script, We find Some rules and restrictions for yolov7, QDQ nodes are automatically analyzed and configured in a rule-based manner, ensuring that they are optimal under TensorRT. Ensuring that all nodes are running INT8(confirmed with tool:[trt-engine-explorer](https://github.com/NVIDIA/TensorRT/tree/main/tools/experimental/trt-engine-explorer), see [scripts/draw-engine.py](./scripts/draw-engine.py)). for details of this part, please refer [quantization/rules.py](./quantization/rules.py), About the guidance of Q&DQ insert, please refer [Guidance_of_QAT_performance_optimization](./doc/Guidance_of_QAT_performance_optimization.md)
49 | 
50 |   - PTQ calibration<br>
51 |   After inserting Q&DQ nodes, we recommend to run PTQ-Calibration first. Per experiments, `Histogram(MSE)` is the best PTQ calibration method for yolov7.
52 |   Note: if you are satisfied with PTQ result, you could also skip QAT.
53 |   
54 |   - QAT training<br>
55 |   After QAT, need to finetune traning our model. after getting the accuracy we are satisfied, Saving the weights to files
56 | 
57 | ### 3. Export onnx 
58 |   ```bash
59 |   $ python scripts/qat.py export qat.pt --size=640 --save=qat.onnx --dynamic
60 |   ```
61 | 
62 | ### 4. Evaluate model accuracy on coco 
63 |   ```bash
64 |   $ bash scripts/eval-trt.sh qat.pt
65 |   ```
66 | 
67 | ### 5. Benchmark
68 |   ```bash
69 |   $ /usr/src/tensorrt/bin/trtexec --onnx=qat.onnx --int8 --fp16  --workspace=1024000 --minShapes=images:4x3x640x640 --optShapes=images:4x3x640x640 --maxShapes=images:4x3x640x640
70 |   ```
71 | 
72 | 
73 | ## Quantization Yolov7-Tiny
74 | ```bash
75 | $ python scripts/qat.py quantize yolov7-tiny.pt --qat=qat.pt --ptq=ptq.pt --ignore-policy="model\.77\.m\.(.*)|model\.0\.(.*)" --supervision-stride=1 --eval-ptq --eval-origin
76 | ```
77 | 
78 | ## Note
79 | - For YoloV5, please use the script `scripts/qat-yolov5.py`. This adds QAT support for `Add operator`, making it more performant.
80 | - Please refer to the `quantize.replace_bottleneck_forward` function to handle the `Add operator`.
81 | 


--------------------------------------------------------------------------------
/yolov7_qat/doc/Guidance_of_QAT_performance_optimization.md:
--------------------------------------------------------------------------------
  1 | 
  2 | # Get QAT models' best performance on TensorRT
  3 | 
  4 | ## 1. Description
  5 | This guidance will show how to get the best performance QAT model on yolov7.
  6 | 
  7 | There are two workflows for quantizing networks in TensorRT, one is Post-training quantization (PTQ).(ref:[tensorrt-developer-guide/intro-quantization](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#intro-quantization)). The other is QAT.(ref:[tensorrt-developer-guide/work-with-qat-networks](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#work-with-qat-networks). In PTQ mode, TensorRT will have the best performance, as it always choose the best layer fusion tactics and fastest kernels to make the global optimal network enqueue graph.
  8 | In QAT modes, the enqueue graph is designed by user. Which depends on the QDQ placement, The accuracy conversion and layer fusion strategies in the network are selected strictly according to the QDQ placement.(About the Q&DQ processing of TensorRT, please refer :[TensorRT-developer-guide: Processing of Q/DQ Networks](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#tensorrt-process-qdq)). That is, If we want to get the best performance of QAT, The Q&DQ nodes must make sure: 
  9 | 1. All the computationally intensive layers will run with INT8.
 10 | 2. Q&DQ can not break down the layer fusion of QAT model. 
 11 | 3. Do not have unnecessary data conversion between INT8 and FLOAT
 12 | 
 13 | One effective way to get best performance of QAT is comparing the enqueue graph of QAT-TensorRT model with PTQ, and ensure they are the same.
 14 | 
 15 | ## 2. Workflow
 16 | Our solution is: verbosing the QAT-Graph and compare with the PTQ-Graph. And back to fineTune the Q&DQ nodes placement. The procedure can be summaried as below.
 17 | 1. Insert QDQ in the model and export it to onnx
 18 | 2. Convert PTQ-Onnx and QAT-onnx to TensorRT model and draw the TensorRT-model-graph
 19 | 3. Compare the TensorRT-enqueue-Graph and performance between QAT and PTQ
 20 | 4. If the QAT Graph is different from PTQ Graph and the performance also wrose. modify the QDQ placement. Back to Step 1. Else, to Step 5
 21 | 5. Run PTQ benchmark and QAT benchmark to verify
 22 | 
 23 | <img src="./imgs/QATFlow.png" width=50% alt="QATFlow" align=center />
 24 | 
 25 | For the layer-fusion rules: We can refer: [TensorRT-developer-guide: Types of Fusions](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#fusion-types)
 26 | For the tools for verbosing the TensorRT-model graph：[github-TensorRT: trt-engine-explorer](https://github.com/NVIDIA/TensorRT/tree/main/tools/experimental/trt-engine-explorer)(ref: [blog:exploring-tensorrt-engines-with-trex](https://developer.nvidia.com/blog/exploring-tensorrt-engines-with-trex/))
 27 | 
 28 | 
 29 | ## 3. Step by step guidance of QAT optimization on yolov7
 30 | 
 31 | Now we will step by step optimizing a QAT model performance, We only care about the performance rather than accuracy at this time as we had not starting finetune the accuracy with training.
 32 | we use pytorch-quantization tool [pytorch-quantization](https://github.com/NVIDIA/TensorRT/blob/main/tools/pytorch-quantization) to quantize our pytorch model. And export onnx model with Q&DQ nodes.
 33 | This package provides a number of quantized layer modules, which contain quantizers for inputs and weights. e.g. `quant_nn.QuantLinear`, which can be used in place of `nn.Linear. ` These quantized layers can be substituted automatically, via monkey-patching, or by manually modifying the model definition.
 34 | Automatic layer substitution is done with `quant_modules`. This should be called before model creation.[ref: [pytorch-quantization-toolkit-tutorials](https://docs.nvidia.com/deeplearning/tensorrt/pytorch-quantization-toolkit/docs/tutorials/quant_resnet50.html#quantizing-resnet50)]
 35 | 
 36 | ### 1) Insert QDQ to model with monkey-patch quantization
 37 | 
 38 | with `quant_modules.initialize()` and `quant_modules.deactivate()`. The tool will automatic insert Q&DQ nodes in the network.
 39 | 
 40 | ```python
 41 | quant_modules.initialize()
 42 | # Load PyTorch model
 43 | device = select_device(opt.device)
 44 | model = Model(opt.cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)
 45 | labels = model.names
 46 | quant_modules.deactivate()
 47 | ```
 48 | calibrate the onnx model to get the scale of Q&DQ nodes.
 49 | ```python
 50 | def calibrate_model(model, model_name, data_loader, num_calib_batch, calibrator,hist_percentile, out_dir, device):
 51 |     """
 52 |         Feed data to the network and calibrate.
 53 |         Arguments:
 54 |             model: classification model
 55 |             model_name: name to use when creating state files
 56 |             data_loader: calibration data set
 57 |             num_calib_batch: amount of calibration passes to perform
 58 |             calibrator: type of calibration to use (max/histogram)
 59 |             hist_percentile: percentiles to be used for historgram calibration
 60 |             out_dir: dir to save state files in
 61 |     """
 62 |     if num_calib_batch > 0:
 63 |         print("Calibrating model")
 64 |         with torch.no_grad():
 65 |             collect_stats(model, data_loader, num_calib_batch, device)
 66 |         if not calibrator == "histogram":
 67 |             compute_amax(model, method="max")
 68 |             calib_output = os.path.join(
 69 |                 out_dir,
 70 |                 F"{model_name}-max-{num_calib_batch*data_loader.batch_size}.pth")
 71 |             ckpt = {'model': deepcopy(model)}
 72 |             torch.save(ckpt, calib_output)
 73 |         else:
 74 |             for percentile in hist_percentile:
 75 |                 print(F"{percentile} percentile calibration")
 76 |                 compute_amax(model, method="percentile")
 77 |                 calib_output = os.path.join(
 78 |                     out_dir,
 79 |                     F"{model_name}-percentile-{percentile}-{num_calib_batch*data_loader.batch_size}.pth")
 80 |                 ckpt = {'model': deepcopy(model)}
 81 |                 torch.save(ckpt, calib_output)
 82 |             for method in ["mse", "entropy"]:
 83 |                 print(F"{method} calibration")
 84 |                 compute_amax(model, method=method)
 85 |                 calib_output = os.path.join(
 86 |                     out_dir,
 87 |                     F"{model_name}-{method}-{num_calib_batch*data_loader.batch_size}.pth")
 88 |                 ckpt = {'model': deepcopy(model)}
 89 |                 torch.save(ckpt, calib_output)
 90 | ```
 91 | ### 2) export the calibrated-pytorch model to onnx
 92 | ```python
 93 | quant_nn.TensorQuantizer.use_fb_fake_quant = True
 94 | torch.onnx.export(model, img, f, verbose=False, opset_version=13, input_names['images'],
 95 |                 output_names=output_names,
 96 |                 dynamic_axes=dynamic_axes)
 97 | quant_nn.TensorQuantizer.use_fb_fake_quant = False
 98 | ```
 99 | ***Now we got a onnx model with Q&DQ layers. TensorRT will process the onnx model with QDQ nodes as QAT models, With this way. Calibration is no longer needed as TensorRT will automatically performs INT8 quantization based on scales of Q and DQ nodes.***
100 | 
101 | TIPS: We calibrate the pytorch model with fake-quant, the exported onnx will have Q&DQ nodes. In the eye of pytorch, it is a ptq-model as we only did a calibration but no finetune training. But in the eye of TensorRT, as long as there are Q&DQ nodes inside the onnx， TensorRT will regard it as a QAT model.
102 | 
103 | ### 3) Run TensorRT benchmark and export layers information to json
104 | we can export the TensorRT-engine-graph and profile information with flag `--exportLayerInfo=layer.json --profilingVerbosity=detailed --exportProfile=profile.json`.
105 | first we export fp32 onnx model
106 | ```bash
107 | $ python export.py --weights ./yolov7.pt --grid --simplify --topk-all 100 --iou-thres 0.65 --conf-thres 0.35 --img-size 640 640
108 | ```
109 | Then we copy the onnx to target device, Here we use Jetson OrinX as our target device, TensorRT has different behavior on different GPUs. So the test must run on your final target device
110 | 
111 | Run PTQ benchmark
112 | ```bash
113 | $ /usr/src/tensorrt/bin/trtexec --onnx=yolov7.onnx --fp16 --int8 --verbose --saveEngine=yolov7_ptq.engine --workspace=1024000 --warmUp=500 --duration=10  --useCudaGraph --useSpinWait --noDataTransfers --exportLayerInfo=yolov7_ptq_layer.json --profilingVerbosity=detailed --exportProfile=yolov7_ptq_profile.json
114 | ```
115 | Run fp16  benchmark
116 | ```bash
117 | $ /usr/src/tensorrt/bin/trtexec --onnx=yolov7.onnx --fp16  --verbose --saveEngine=yolov7_fp16.engine --workspace=1024000 --warmUp=500 --duration=10  --useCudaGraph --useSpinWait --noDataTransfers --exportLayerInfo=yolov7_fp16_layer.json --profilingVerbosity=detailed --exportProfile=yolov7_fp16_profile.json
118 | ```
119 | Run QAT benchmark
120 | ```bash
121 | $ /usr/src/tensorrt/bin/trtexec --onnx=yolov7_qat.onnx --fp16 --int8 --verbose --saveEngine=yolov7_qat.engine --workspace=1024000 --warmUp=500 --duration=10  --useCudaGraph --useSpinWait --noDataTransfers --exportLayerInfo=yolov7_qat_layer.json --profilingVerbosity=detailed --exportProfile=yolov7_qat_profile.json
122 | ```
123 | 
124 | Run QAT_mask detect benchmark
125 | ```bash
126 | $ /usr/src/tensorrt/bin/trtexec --onnx=yolov7_qat_maskdet.onnx --fp16 --int8 --verbose --saveEngine=yolov7_qat_maskdet.engine --workspace=1024000 --warmUp=500 --duration=10  --useCudaGraph --useSpinWait --noDataTransfers --exportLayerInfo=yolov7_qat_maskdet_layer.json --profilingVerbosity=detailed --exportProfile=yolov7_qat_maskdet_profile.json
127 | ```
128 | 
129 | We can get the fps from the log:
130 | The PTQ performance is :
131 | ```bash
132 | [I] Throughput: 206.562 qps
133 | ```
134 | The fp16 performance is :
135 | ```bash
136 | [I] Throughput: 139.597 qps
137 | ```
138 | The version 1 QAT performance is:
139 | ```bash
140 | [I] Throughput: 180.439 qps
141 | ```
142 | That is not a good performance as we expect, Let's look insight the reason
143 | 
144 | ### 4) Draw Engine graph
145 | 
146 | we use TensorRT opensource tool: [trt-engine-explorer](https://github.com/NVIDIA/TensorRT/tree/main/tools/experimental/trt-engine-explorer) drawing the enqueue graph of TensorRT. This tool take the trtexec exported layer json information as input.
147 | Use the below code to draw the TensorRT-Engine-graph.(edit from `trt-engine-explorer/utils/draw_engine.py`)
148 | 
149 | ```python
150 | import graphviz
151 | from trex import *
152 | import argparse
153 | import shutil
154 | 
155 | 
156 | def draw_engine(engine_json_fname: str, engine_profile_fname: str):
157 |     graphviz_is_installed =  shutil.which("dot") is not None
158 |     if not graphviz_is_installed:
159 |         print("graphviz is required but it is not installed.\n")
160 |         print("To install on Ubuntu:")
161 |         print("sudo apt --yes install graphviz")
162 |         exit()
163 | 
164 |     plan = EnginePlan(engine_json_fname, engine_profile_fname)
165 |     formatter = layer_type_formatter
166 |     display_regions = True
167 |     expand_layer_details = False
168 | 
169 |     graph = to_dot(plan, formatter,
170 |                 display_regions=display_regions,
171 |                 expand_layer_details=expand_layer_details)
172 |     render_dot(graph, engine_json_fname, 'svg')
173 | 
174 | 
175 | if __name__ == "__main__":
176 |     parser = argparse.ArgumentParser()
177 |     parser.add_argument('--layer', help="name of engine JSON file to draw")
178 |     parser.add_argument('--profile', help="name of profile JSON file to draw")
179 |     args = parser.parse_args()
180 |     draw_engine(engine_json_fname=args.layer,engine_profile_fname=args.profile)
181 | ```
182 | draw the graph:
183 | ```bash
184 | $ python draw_engine.py --layer yolov7_qat_layer.json --profile yolov7_qat_profile.json
185 | $ python draw_engine.py --layer yolov7_ptq_layer.json --profile yolov7_ptq_profile.json
186 | ```
187 | we get `yolov7_qat_layer.json.svg` and `yolov7_ptq_layer.json.svg`
188 | 
189 | Let's see the difference:
190 | 
191 | <img src="./imgs/monkey-patch-qat-conv-fp16-issue_ptqonnx.png" width=200 alt="monkey-patch-qat-conv-fp16-issue_ptqonnx" align=center /><img src="./imgs/monkey-patch-qat-conv-fp16-issue_ptq.png" width=200 alt="monkey-patch-qat-conv-fp16-issue_ptq" align=center /><img src="./imgs/monkey-patch-qat-conv-fp16-issue_qatonnx.png" width=200 alt="monkey-patch-qat-conv-fp16-issue_qatonnx" align=center /><img src="./imgs/monkey-patch-qat-conv-fp16-issue_qat.png" width=200 alt="monkey-patch-qat-conv-fp16-issue_qatonnx" align=center />
192 | 
193 | - <center> pic1: The convolution layers before first concat layer in onnx </center>
194 | - <center> pic2: pic1's TensorRT-graph </center>
195 | - <center> pic3: the qat-onnx model </center>
196 | - <center> pic4: pic3's TensorRt-graph </center>
197 | - <center> (click to see full picture) </center>
198 |  
199 | ### 5) Gap analyze and QDQ placement optimization
200 |  There are a lot of useless int8->fp16 and fp16->int8 data convert in our QAT model. That is because : TensorRT will enforce the rules of QDQ to ensure consistent accuracy during inference and training(We didn't see any fp32 tensors here becasue TensorRT believes that fp16 will have the same accuracy as fp32)
201 |  That is to say: If we want to reduce these useless data format convertion, We must edit our QDQ nodes to suit the fusion rules of TensorRT QAT.
202 |  From the PTQ & QAT engine-graph, we can observed that: the concat layer will be reduced in TensorRT and all the input and output of concat will merge to one tensor(marked are red arrows in the below pic). If we do not guarantee the scale of Q&DQ nodes(marked with green circle in the below pic) of these tensors are the same. There will be redundant precision-conversion in our Graph.
203 | 
204 |    <img src="./imgs/monkey-patch-qat-conv-fp16-issue_qatonnx_edit.png" width=70% alt="monkey-patch-qat-conv-fp16-issue_qatonnx_edit" align=center />
205 | 
206 | For all the network-struct like this, We need do the same restrict. There is a special scene we need to take care: QDQ can cross some of the layers according to the commute rules from [TensorRT-developer-guide:tensorrt-process-qdq](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#tensorrt-process-qdq). eg. Max-pooling.
207 | the DQ nodes marked with red circle will cross the MaxPool layer and TensorRT will remember the crossed-MaxPooling layer as int8 precision. Now we meet the similar scence as concat: We should restrict the scale of Q&DQ the same as the Q&DQ in the green circle to avoid generate  useless data format convertion here.
208 | 
209 |    <img src="./imgs/monkey-patch-qat-maxpooling-qat.png" width=50% alt="monkey-patch-qat-maxpooling-qat.png" align=center />
210 | 
211 | ### 6) optimized QAT model's performance
212 | Now we apply all the restriction we have metioned. We can test the performance:
213 | 
214 | we still use trtexec to benchmark the onnx model:
215 | ```bash
216 | $ /usr/src/tensorrt/bin/trtexec --onnx=yolov7_qat_maskdet.onnx --fp16 --int8 --verbose --saveEngine=yolov7_qat_optimized.engine --workspace=1024000 --warmUp=500 --duration=10  --useCudaGraph --useSpinWait --noDataTransfers --exportLayerInfo=yolov7_qat_optimized_layer.json --profilingVerbosity=detailed --exportProfile=yolov7_qat_optimized_profile.json
217 | [I] Throughput: 207.267 qps
218 | ```
219 | This performance is almost the same as PTQ performance.
220 | 
221 | Next we need can finetune training our model to improve the accracy of the model.
222 | 


--------------------------------------------------------------------------------
/yolov7_qat/doc/imgs/QATConv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-AI-IOT/yolo_deepstream/e9b75770ea58713d1cb3902d67c36e11acb888d7/yolov7_qat/doc/imgs/QATConv.png


--------------------------------------------------------------------------------
/yolov7_qat/doc/imgs/QATFlow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-AI-IOT/yolo_deepstream/e9b75770ea58713d1cb3902d67c36e11acb888d7/yolov7_qat/doc/imgs/QATFlow.png


--------------------------------------------------------------------------------
/yolov7_qat/doc/imgs/int8_q_recommended_procedure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-AI-IOT/yolo_deepstream/e9b75770ea58713d1cb3902d67c36e11acb888d7/yolov7_qat/doc/imgs/int8_q_recommended_procedure.png


--------------------------------------------------------------------------------
/yolov7_qat/doc/imgs/monkey-patch-qat-conv-fp16-issue_ptq.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-AI-IOT/yolo_deepstream/e9b75770ea58713d1cb3902d67c36e11acb888d7/yolov7_qat/doc/imgs/monkey-patch-qat-conv-fp16-issue_ptq.png


--------------------------------------------------------------------------------
/yolov7_qat/doc/imgs/monkey-patch-qat-conv-fp16-issue_ptqonnx.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-AI-IOT/yolo_deepstream/e9b75770ea58713d1cb3902d67c36e11acb888d7/yolov7_qat/doc/imgs/monkey-patch-qat-conv-fp16-issue_ptqonnx.png


--------------------------------------------------------------------------------
/yolov7_qat/doc/imgs/monkey-patch-qat-conv-fp16-issue_qat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-AI-IOT/yolo_deepstream/e9b75770ea58713d1cb3902d67c36e11acb888d7/yolov7_qat/doc/imgs/monkey-patch-qat-conv-fp16-issue_qat.png


--------------------------------------------------------------------------------
/yolov7_qat/doc/imgs/monkey-patch-qat-conv-fp16-issue_qatonnx.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-AI-IOT/yolo_deepstream/e9b75770ea58713d1cb3902d67c36e11acb888d7/yolov7_qat/doc/imgs/monkey-patch-qat-conv-fp16-issue_qatonnx.png


--------------------------------------------------------------------------------
/yolov7_qat/doc/imgs/monkey-patch-qat-conv-fp16-issue_qatonnx_edit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-AI-IOT/yolo_deepstream/e9b75770ea58713d1cb3902d67c36e11acb888d7/yolov7_qat/doc/imgs/monkey-patch-qat-conv-fp16-issue_qatonnx_edit.png


--------------------------------------------------------------------------------
/yolov7_qat/doc/imgs/monkey-patch-qat-maxpooling-qat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA-AI-IOT/yolo_deepstream/e9b75770ea58713d1cb3902d67c36e11acb888d7/yolov7_qat/doc/imgs/monkey-patch-qat-maxpooling-qat.png


--------------------------------------------------------------------------------
/yolov7_qat/quantization/rules.py:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 | # SPDX-License-Identifier: MIT
 4 | #
 5 | # Permission is hereby granted, free of charge, to any person obtaining a
 6 | # copy of this software and associated documentation files (the "Software"),
 7 | # to deal in the Software without restriction, including without limitation
 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 | # and/or sell copies of the Software, and to permit persons to whom the
10 | # Software is furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in
13 | # all copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 | # DEALINGS IN THE SOFTWARE.
22 | ################################################################################
23 | import onnx
24 | 
25 | def find_with_input_node(model, name):
26 |     for node in model.graph.node:
27 |         if len(node.input) > 0 and name in node.input:
28 |             return node
29 | 
30 | def find_all_with_input_node(model, name):
31 |     all = []
32 |     for node in model.graph.node:
33 |         if len(node.input) > 0 and name in node.input:
34 |             all.append(node)
35 |     return all
36 | 
37 | def find_with_output_node(model, name):
38 |     for node in model.graph.node:
39 |         if len(node.output) > 0 and name in node.output:
40 |             return node
41 | 
42 | def find_with_no_change_parent_node(model, node):
43 |     parent = find_with_output_node(model, node.input[0])
44 |     if parent is not None:
45 |         if parent.op_type in ["Concat", "MaxPool"]:
46 |             return find_with_no_change_parent_node(model, parent)
47 |     return parent
48 | 
49 | def find_quantizelinear_conv(model, qnode):
50 |     dq   = find_with_input_node(model, qnode.output[0])
51 |     conv = find_with_input_node(model, dq.output[0])
52 |     return conv
53 | 
54 | 
55 | def find_quantize_conv_name(model, weight_qname):
56 |     dq = find_with_output_node(model, weight_qname)
57 |     q  = find_with_output_node(model, dq.input[0])
58 |     return ".".join(q.input[0].split(".")[:-1])
59 | 
60 | def find_quantizer_pairs(onnx_file):
61 | 
62 |     model = onnx.load(onnx_file)
63 |     match_pairs = []
64 |     for node in model.graph.node:   
65 |         if node.op_type == "Concat":
66 |             qnodes = find_all_with_input_node(model, node.output[0])
67 |             major = None
68 |             for qnode in qnodes:
69 |                 if qnode.op_type != "QuantizeLinear":
70 |                     continue
71 |                 
72 |                 conv = find_quantizelinear_conv(model, qnode)
73 |                 if major is None:
74 |                     major = find_quantize_conv_name(model, conv.input[1])
75 |                 else:
76 |                     match_pairs.append([major, find_quantize_conv_name(model, conv.input[1])])
77 | 
78 |                 for subnode in model.graph.node:
79 |                     if len(subnode.input) > 0 and subnode.op_type == "QuantizeLinear" and subnode.input[0] in node.input:
80 |                         subconv = find_quantizelinear_conv(model, subnode)
81 |                         match_pairs.append([major, find_quantize_conv_name(model, subconv.input[1])])
82 | 
83 |         elif node.op_type == "MaxPool":
84 |             qnode = find_with_input_node(model, node.output[0])
85 |             if not (qnode and qnode.op_type == "QuantizeLinear"):
86 |                 continue
87 | 
88 |             major = find_quantizelinear_conv(model, qnode)
89 |             major = find_quantize_conv_name(model, major.input[1])
90 |             same_input_nodes = find_all_with_input_node(model, node.input[0])
91 | 
92 |             for same_input_node in same_input_nodes:
93 |                 if same_input_node.op_type == "QuantizeLinear":
94 |                     subconv = find_quantizelinear_conv(model, same_input_node)
95 |                     match_pairs.append([major, find_quantize_conv_name(model, subconv.input[1])])
96 |     return match_pairs
97 | 


--------------------------------------------------------------------------------
/yolov7_qat/scripts/detect-trt.py:
--------------------------------------------------------------------------------
  1 | ################################################################################
  2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  3 | # SPDX-License-Identifier: MIT
  4 | #
  5 | # Permission is hereby granted, free of charge, to any person obtaining a
  6 | # copy of this software and associated documentation files (the "Software"),
  7 | # to deal in the Software without restriction, including without limitation
  8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
  9 | # and/or sell copies of the Software, and to permit persons to whom the
 10 | # Software is furnished to do so, subject to the following conditions:
 11 | #
 12 | # The above copyright notice and this permission notice shall be included in
 13 | # all copies or substantial portions of the Software.
 14 | #
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 21 | # DEALINGS IN THE SOFTWARE.
 22 | ################################################################################
 23 | import argparse
 24 | import time
 25 | from pathlib import Path
 26 | 
 27 | import cv2
 28 | import torch
 29 | import torch.backends.cudnn as cudnn
 30 | from numpy import random
 31 | import numpy as np
 32 | from models.experimental import attempt_load
 33 | from utils.datasets import LoadStreams, LoadImages
 34 | from utils.general import check_img_size, check_requirements, check_imshow, non_max_suppression, apply_classifier, \
 35 |     scale_coords, xyxy2xywh, strip_optimizer, set_logging, increment_path
 36 | from utils.plots import plot_one_box
 37 | from utils.torch_utils import select_device, load_classifier, time_synchronized, TracedModel
 38 | 
 39 | import pycuda.autoinit
 40 | import pycuda.driver as cuda
 41 | import tensorrt as trt
 42 | # Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
 43 | # Simple helper data class that's a little nicer to use than a 2-tuple.
 44 | names = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', \
 45 |     'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant',\
 46 |     'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', \
 47 |     'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',\
 48 |     'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', \
 49 |     'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',\
 50 |     'baseball bat', 'baseball glove', 'skateboard', 'surfboard',\
 51 |     'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', \
 52 |     'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli',\
 53 |     'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',\
 54 |     'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',\
 55 |     'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', \
 56 |     'teddy bear', 'hair drier', 'toothbrush']
 57 | 
 58 | class HostDeviceMem(object):
 59 |     def __init__(self, host_mem, device_mem):
 60 |         self.host = host_mem
 61 |         self.device = device_mem
 62 | 
 63 |     def __str__(self):
 64 |         return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
 65 | 
 66 |     def __repr__(self):
 67 |         return self.__str__()
 68 | 
 69 | def allocate_buffers(engine):
 70 |     inputs = []
 71 |     outputs = []
 72 |     bindings = []
 73 |     stream = cuda.Stream()
 74 |     for binding in engine:
 75 |         size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
 76 |         print("binding shape: ", engine.get_binding_shape(binding))
 77 |         dtype = trt.nptype(engine.get_binding_dtype(binding))
 78 |         # Allocate host and device buffers
 79 |         host_mem = cuda.pagelocked_empty(size, dtype)
 80 |         device_mem = cuda.mem_alloc(host_mem.nbytes)
 81 |         # Append the device buffer to device bindings.
 82 |         bindings.append(int(device_mem))
 83 |         # Append to the appropriate list.
 84 |         if engine.binding_is_input(binding):
 85 |             inputs.append(HostDeviceMem(host_mem, device_mem))
 86 |         else:
 87 |             outputs.append(HostDeviceMem(host_mem, device_mem))
 88 |     return inputs, outputs, bindings, stream
 89 | 
 90 | def do_inference_v2(context, bindings, inputs, outputs, stream):
 91 |     # Transfer input data to the GPU.
 92 |     [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
 93 |     # Run inference.
 94 |     context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
 95 |     # Transfer predictions back from the GPU.
 96 |     [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
 97 |     # Synchronize the stream
 98 |     stream.synchronize()
 99 |     # Return only the host outputs.
100 |     return [out.host for out in outputs]
101 | 
102 | def detect(save_img=False):
103 |     source,  view_img, save_txt, imgsz = opt.source, opt.view_img, opt.save_txt, opt.img_size
104 |     save_img = not opt.nosave and not source.endswith('.txt')  # save inference images
105 |     webcam = source.isnumeric() or source.endswith('.txt') or source.lower().startswith(
106 |         ('rtsp://', 'rtmp://', 'http://', 'https://'))
107 | 
108 |     # Directories
109 |     save_dir = Path(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok))  # increment run
110 |     (save_dir / 'labels' if save_txt else save_dir).mkdir(parents=True, exist_ok=True)  # make dir
111 | 
112 |     # Initialize
113 |     set_logging()
114 |     device = select_device(opt.device) # device will be avilable for NMS
115 | 
116 |     # Set Dataloader
117 |     vid_path, vid_writer = None, None
118 |     if webcam:
119 |         view_img = check_imshow()
120 |         cudnn.benchmark = True  # set True to speed up constant image size inference
121 |         dataset = LoadStreams(source, img_size=imgsz)
122 |     else:
123 |         dataset = LoadImages(source, img_size=imgsz, auto=False)
124 | 
125 |     colors = [[random.randint(0, 255) for _ in range(3)] for _ in names]
126 |    
127 |     ####### start trt objects
128 |     logger = trt.Logger(trt.Logger.INFO)
129 |     f = open(opt.engine, 'rb') 
130 |     runtime = trt.Runtime(logger)
131 |     engine = runtime.deserialize_cuda_engine(f.read())
132 |     inputs, outputs, bindings, stream = allocate_buffers(engine)
133 |     outputshape = [engine.get_binding_shape(binding) for binding in engine][1]
134 |     
135 |     t0 = time.time()
136 |     for path, img, im0s, vid_cap in dataset:
137 |         img = img.astype(np.float32)
138 |         img /= 255.0  # 0 - 255 to 0.0 - 1.0
139 |         if len(img.shape) == 3:
140 |             img = np.expand_dims(img, 0)
141 | 
142 |         inputs[0].host = img
143 |         context = engine.create_execution_context()
144 |         trt_outputs = do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream = stream)
145 |         trt_outputs = torch.Tensor(trt_outputs[0].reshape(outputshape))
146 |         # Inference
147 |         t1 = time_synchronized()
148 | 
149 |         # pred = trt_outputs
150 |         t2 = time_synchronized()
151 | 
152 |         # Apply NMS
153 |         trt_outputs = non_max_suppression(trt_outputs, opt.conf_thres, opt.iou_thres, classes=opt.classes, agnostic=opt.agnostic_nms)
154 |         t3 = time_synchronized()
155 | 
156 |         # Process detections
157 |         for i, det in enumerate(trt_outputs):  # detections per image
158 |             if webcam:  # batch_size >= 1
159 |                 p, s, im0, frame = path[i], '%g: ' % i, im0s[i].copy(), dataset.count
160 |             else:
161 |                 p, s, im0, frame = path, '', im0s, getattr(dataset, 'frame', 0)
162 | 
163 |             p = Path(p)  # to Path
164 |             save_path = str(save_dir / p.name)  # img.jpg
165 |             txt_path = str(save_dir / 'labels' / p.stem) + ('' if dataset.mode == 'image' else f'_{frame}')  # img.txt
166 |             gn = torch.tensor(im0.shape)[[1, 0, 1, 0]]  # normalization gain whwh
167 |             if len(det):
168 |                 # Rescale boxes from img_size to im0 size
169 |                 det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()
170 | 
171 |                 # Print results
172 |                 for c in det[:, -1].unique():
173 |                     n = (det[:, -1] == c).sum()  # detections per class
174 |                     s += f"{n} {names[int(c)]}{'s' * (n > 1)}, "  # add to string
175 | 
176 |                 # Write results
177 |                 for *xyxy, conf, cls in reversed(det):
178 |                     if save_txt:  # Write to file
179 |                         xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist()  # normalized xywh
180 |                         line = (cls, *xywh, conf) if opt.save_conf else (cls, *xywh)  # label format
181 |                         with open(txt_path + '.txt', 'a') as f:
182 |                             f.write(('%g ' * len(line)).rstrip() % line + '\n')
183 | 
184 |                     if save_img or view_img:  # Add bbox to image
185 |                         label = f'{names[int(cls)]} {conf:.2f}'
186 |                         plot_one_box(xyxy, im0, label=label, color=colors[int(cls)], line_thickness=1)
187 | 
188 |             # Print time (inference + NMS)
189 |             print(f'{s}Done. ({(1E3 * (t2 - t1)):.1f}ms) Inference, ({(1E3 * (t3 - t2)):.1f}ms) NMS')
190 | 
191 |             # Stream results
192 |             if view_img:
193 |                 cv2.imshow(str(p), im0)
194 |                 cv2.waitKey(1)  # 1 millisecond
195 | 
196 |             # Save results (image with detections)
197 |             if save_img:
198 |                 if dataset.mode == 'image':
199 |                     cv2.imwrite(save_path, im0)
200 |                     print(f" The image with the result is saved in: {save_path}")
201 |                 else:  # 'video' or 'stream'
202 |                     if vid_path != save_path:  # new video
203 |                         vid_path = save_path
204 |                         if isinstance(vid_writer, cv2.VideoWriter):
205 |                             vid_writer.release()  # release previous video writer
206 |                         if vid_cap:  # video
207 |                             fps = vid_cap.get(cv2.CAP_PROP_FPS)
208 |                             w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
209 |                             h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
210 |                         else:  # stream
211 |                             fps, w, h = 30, im0.shape[1], im0.shape[0]
212 |                             save_path += '.mp4'
213 |                         vid_writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))
214 |                     vid_writer.write(im0)
215 | 
216 |     if save_txt or save_img:
217 |         s = f"\n{len(list(save_dir.glob('labels/*.txt')))} labels saved to {save_dir / 'labels'}" if save_txt else ''
218 |         #print(f"Results saved to {save_dir}{s}")
219 | 
220 |     print(f'Done. ({time.time() - t0:.3f}s)')
221 | 
222 | 
223 | if __name__ == '__main__':
224 |     parser = argparse.ArgumentParser()
225 |     parser.add_argument('--engine', type=str, default='yolov7.engine', help='model.pt path(s)')
226 |     parser.add_argument('--source', type=str, default='inference/images', help='source')  # file/folder, 0 for webcam
227 |     parser.add_argument('--img-size', type=int, default=640, help='inference size (pixels)')
228 |     parser.add_argument('--conf-thres', type=float, default=0.25, help='object confidence threshold')
229 |     parser.add_argument('--iou-thres', type=float, default=0.45, help='IOU threshold for NMS')
230 |     parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
231 |     parser.add_argument('--view-img', action='store_true', help='display results')
232 |     parser.add_argument('--save-txt', action='store_true', help='save results to *.txt')
233 |     parser.add_argument('--save-conf', action='store_true', help='save confidences in --save-txt labels')
234 |     parser.add_argument('--nosave', action='store_true', help='do not save images/videos')
235 |     parser.add_argument('--classes', nargs='+', type=int, help='filter by class: --class 0, or --class 0 2 3')
236 |     parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS')
237 |     parser.add_argument('--augment', action='store_true', help='augmented inference')
238 |     parser.add_argument('--project', default='runs/detect', help='save results to project/name')
239 |     parser.add_argument('--name', default='exp', help='save results to project/name')
240 |     parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
241 |     opt = parser.parse_args()
242 |     print(opt)
243 | 
244 |     detect()
245 | 


--------------------------------------------------------------------------------
/yolov7_qat/scripts/draw-engine.py:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 | # SPDX-License-Identifier: MIT
 4 | #
 5 | # Permission is hereby granted, free of charge, to any person obtaining a
 6 | # copy of this software and associated documentation files (the "Software"),
 7 | # to deal in the Software without restriction, including without limitation
 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 | # and/or sell copies of the Software, and to permit persons to whom the
10 | # Software is furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in
13 | # all copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 | # DEALINGS IN THE SOFTWARE.
22 | ################################################################################
23 | 
24 | 
25 | """
26 | This script generates an SVG diagram of the input engine graph SVG file.
27 | Note:
28 |     THIS SCRIPT DEPENDS ON LIB: https://github.com/NVIDIA/TensorRT/tree/main/tools/experimental/trt-engine-explorer
29 |     this script requires graphviz which can be installed manually:
30 |     $ sudo apt-get --yes install graphviz
31 |     $ python3 -m pip install graphviz networkx
32 | """
33 | 
34 | import graphviz
35 | from trex import *
36 | import argparse
37 | import shutil
38 | 
39 | 
40 | def draw_engine(engine_json_fname: str, engine_profile_fname: str):
41 |     graphviz_is_installed =  shutil.which("dot") is not None
42 |     if not graphviz_is_installed:
43 |         print("graphviz is required but it is not installed.\n")
44 |         print("To install on Ubuntu:")
45 |         print("sudo apt --yes install graphviz")
46 |         exit()
47 | 
48 |     plan = EnginePlan(engine_json_fname, engine_profile_fname)
49 |     formatter = layer_type_formatter
50 |     display_regions = True
51 |     expand_layer_details = False
52 | 
53 |     graph = to_dot(plan, formatter,
54 |                 display_regions=display_regions,
55 |                 expand_layer_details=expand_layer_details)
56 |     render_dot(graph, engine_json_fname, 'svg')
57 | 
58 | 
59 | if __name__ == "__main__":
60 |     parser = argparse.ArgumentParser()
61 |     parser.add_argument('--layer', help="name of engine JSON file to draw")
62 |     parser.add_argument('--profile', help="name of profile JSON file to draw")
63 |     args = parser.parse_args()
64 |     draw_engine(engine_json_fname=args.layer,engine_profile_fname=args.profile)
65 | 


--------------------------------------------------------------------------------
/yolov7_qat/scripts/eval-trt.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # This script only worked on the Quantization model, otherwise, errors would throw up (Origin model, etc.)
 4 | weight=$1
 5 | prefix=${weight%.*}
 6 | onnx=${prefix}.onnx
 7 | graph=${prefix}.graph
 8 | engine=${prefix}.engine
 9 | 
10 | # onnx must be 672x672 of input
11 | python scripts/qat.py export $weight --dynamic --save=$onnx --size=672
12 | 
13 | # To obtain more QPS can add --fp16 flag for detect layer
14 | trtexec --onnx=$onnx \
15 |     --saveEngine=${engine} --int8 --buildOnly --memPoolSize=workspace:1024MiB \
16 |     --dumpLayerInfo --exportLayerInfo=${graph} --profilingVerbosity=detailed
17 | 
18 | python scripts/draw-engine.py ${graph}
19 | python scripts/eval-trt.py --engine=${engine}
20 | 


--------------------------------------------------------------------------------
/yolov7_qat/scripts/qat.py:
--------------------------------------------------------------------------------
  1 | ################################################################################
  2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  3 | # SPDX-License-Identifier: MIT
  4 | #
  5 | # Permission is hereby granted, free of charge, to any person obtaining a
  6 | # copy of this software and associated documentation files (the "Software"),
  7 | # to deal in the Software without restriction, including without limitation
  8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
  9 | # and/or sell copies of the Software, and to permit persons to whom the
 10 | # Software is furnished to do so, subject to the following conditions:
 11 | #
 12 | # The above copyright notice and this permission notice shall be included in
 13 | # all copies or substantial portions of the Software.
 14 | #
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 21 | # DEALINGS IN THE SOFTWARE.
 22 | ################################################################################
 23 | import sys
 24 | import os
 25 | 
 26 | # Add the current directory to PYTHONPATH for YoloV7
 27 | sys.path.insert(0, os.path.abspath("."))
 28 | pydir = os.path.dirname(__file__)
 29 | 
 30 | import yaml
 31 | import collections
 32 | import warnings
 33 | import argparse
 34 | import json
 35 | from pathlib import Path
 36 | 
 37 | # PyTorch
 38 | import torch
 39 | import torch.nn as nn
 40 | 
 41 | # YoloV7
 42 | import test
 43 | from models.yolo import Model
 44 | from models.common import Conv
 45 | from utils.datasets import create_dataloader
 46 | from utils.google_utils import attempt_download
 47 | from utils.general import init_seeds
 48 | 
 49 | import quantization.quantize as quantize
 50 | 
 51 | # Disable all warning
 52 | warnings.filterwarnings("ignore")
 53 | 
 54 | 
 55 | class SummaryTool:
 56 |     def __init__(self, file):
 57 |         self.file = file
 58 |         self.data = []
 59 | 
 60 |     def append(self, item):
 61 |         self.data.append(item)
 62 |         json.dump(self.data, open(self.file, "w"), indent=4)
 63 | 
 64 | 
 65 | # Load YoloV7 Model
 66 | def load_yolov7_model(weight, device) -> Model:
 67 | 
 68 |     attempt_download(weight)
 69 |     model = torch.load(weight, map_location=device)["model"]
 70 |     for m in model.modules():
 71 |         if type(m) is nn.Upsample:
 72 |             m.recompute_scale_factor = None  # torch 1.11.0 compatibility
 73 |         elif type(m) is Conv:
 74 |             m._non_persistent_buffers_set = set()  # pytorch 1.6.0 compatibility
 75 |             
 76 |     model.float()
 77 |     model.eval()
 78 | 
 79 |     with torch.no_grad():
 80 |         model.fuse()
 81 |     return model
 82 | 
 83 | 
 84 | def create_coco_train_dataloader(cocodir, batch_size=10):
 85 | 
 86 |     with open("data/hyp.scratch.p5.yaml") as f:
 87 |         hyp = yaml.load(f, Loader=yaml.SafeLoader)  # load hyps
 88 | 
 89 |     loader = create_dataloader(
 90 |         f"{cocodir}/train2017.txt", 
 91 |         imgsz=640, 
 92 |         batch_size=batch_size, 
 93 |         opt=collections.namedtuple("Opt", "single_cls")(False),
 94 |         augment=True, hyp=hyp, rect=False, cache=False, stride=32,pad=0, image_weights=False)[0]
 95 |     return loader
 96 | 
 97 | 
 98 | def create_coco_val_dataloader(cocodir, batch_size=10, keep_images=None):
 99 | 
100 |     loader = create_dataloader(
101 |         f"{cocodir}/val2017.txt", 
102 |         imgsz=640, 
103 |         batch_size=batch_size, 
104 |         opt=collections.namedtuple("Opt", "single_cls")(False),
105 |         augment=False, hyp=None, rect=True, cache=False,stride=32,pad=0.5, image_weights=False)[0]
106 | 
107 |     def subclass_len(self):
108 |         if keep_images is not None:
109 |             return keep_images
110 |         return len(self.img_files)
111 | 
112 |     loader.dataset.__len__ = subclass_len
113 |     return loader
114 | 
115 | 
116 | def evaluate_coco(model, dataloader, using_cocotools = False, save_dir=".", conf_thres=0.001, iou_thres=0.65):
117 | 
118 |     if save_dir and os.path.dirname(save_dir) != "":
119 |         os.makedirs(os.path.dirname(save_dir), exist_ok=True)
120 | 
121 |     return test.test(
122 |         "data/coco.yaml", 
123 |         save_dir=Path(save_dir),
124 |         dataloader=dataloader, conf_thres=conf_thres,iou_thres=iou_thres,model=model,is_coco=True,
125 |         plots=False,half_precision=True,save_json=using_cocotools)[0][3]
126 |     
127 | 
128 | def export_onnx(model : Model, file, size=640, dynamic_batch=False):
129 | 
130 |     device = next(model.parameters()).device
131 |     model.float()
132 | 
133 |     dummy = torch.zeros(1, 3, size, size, device=device)
134 |     model.model[-1].concat = True
135 |     grid_old_func = model.model[-1]._make_grid
136 |     model.model[-1]._make_grid = lambda *args: torch.from_numpy(grid_old_func(*args).data.numpy())
137 | 
138 |     quantize.export_onnx(model, dummy, file, opset_version=13, 
139 |         input_names=["images"], output_names=["outputs"], 
140 |         dynamic_axes={"images": {0: "batch"}, "outputs": {0: "batch"}} if dynamic_batch else None
141 |     )
142 |     model.model[-1].concat = False
143 |     model.model[-1]._make_grid = grid_old_func
144 | 
145 | 
146 | def cmd_quantize(weight, cocodir, device, ignore_policy, save_ptq, save_qat, supervision_stride, iters, eval_origin, eval_ptq):
147 |     quantize.initialize()
148 | 
149 |     if save_ptq and os.path.dirname(save_ptq) != "":
150 |         os.makedirs(os.path.dirname(save_ptq), exist_ok=True)
151 | 
152 |     if save_qat and os.path.dirname(save_qat) != "":
153 |         os.makedirs(os.path.dirname(save_qat), exist_ok=True)
154 |     
155 |     device  = torch.device(device)
156 |     model   = load_yolov7_model(weight, device)
157 |     train_dataloader = create_coco_train_dataloader(cocodir)
158 |     val_dataloader   = create_coco_val_dataloader(cocodir)
159 |     quantize.replace_to_quantization_module(model, ignore_policy=ignore_policy)
160 |     quantize.apply_custom_rules_to_quantizer(model, export_onnx)
161 |     quantize.calibrate_model(model, train_dataloader, device)
162 | 
163 |     json_save_dir = "." if os.path.dirname(save_ptq) == "" else os.path.dirname(save_ptq)
164 |     summary_file = os.path.join(json_save_dir, "summary.json")
165 |     summary = SummaryTool(summary_file)
166 | 
167 |     if eval_origin:
168 |         print("Evaluate Origin...")
169 |         with quantize.disable_quantization(model):
170 |             ap = evaluate_coco(model, val_dataloader, True, json_save_dir)
171 |             summary.append(["Origin", ap])
172 | 
173 |     if eval_ptq:
174 |         print("Evaluate PTQ...")
175 |         ap = evaluate_coco(model, val_dataloader, True, json_save_dir)
176 |         summary.append(["PTQ", ap])
177 | 
178 |     if save_ptq:
179 |         print(f"Save ptq model to {save_ptq}")
180 |         torch.save({"model": model}, save_ptq)
181 | 
182 |     if save_qat is None:
183 |         print("Done as save_qat is None.")
184 |         return
185 | 
186 |     best_ap = 0
187 |     def per_epoch(model, epoch, lr):
188 | 
189 |         nonlocal best_ap
190 |         ap = evaluate_coco(model, val_dataloader, True, json_save_dir)
191 |         summary.append([f"QAT{epoch}", ap])
192 | 
193 |         if ap > best_ap:
194 |             print(f"Save qat model to {save_qat} @ {ap:.5f}")
195 |             best_ap = ap
196 |             torch.save({"model": model}, save_qat)
197 | 
198 |     def preprocess(datas):
199 |         return datas[0].to(device).float() / 255.0
200 | 
201 |     def supervision_policy():
202 |         supervision_list = []
203 |         for item in model.model:
204 |             supervision_list.append(id(item))
205 | 
206 |         keep_idx = list(range(0, len(model.model) - 1, supervision_stride))
207 |         keep_idx.append(len(model.model) - 2)
208 |         def impl(name, module):
209 |             if id(module) not in supervision_list: return False
210 |             idx = supervision_list.index(id(module))
211 |             if idx in keep_idx:
212 |                 print(f"Supervision: {name} will compute loss with origin model during QAT training")
213 |             else:
214 |                 print(f"Supervision: {name} no compute loss during QAT training, that is unsupervised only and doesn't mean don't learn")
215 |             return idx in keep_idx
216 |         return impl
217 | 
218 |     quantize.finetune(
219 |         model, train_dataloader, per_epoch, early_exit_batchs_per_epoch=iters, 
220 |         preprocess=preprocess, supervision_policy=supervision_policy())
221 | 
222 | 
223 | def cmd_export(weight, save, size, dynamic):
224 |     
225 |     quantize.initialize()
226 |     if save is None:
227 |         name = os.path.basename(weight)
228 |         name = name[:name.rfind('.')]
229 |         save = os.path.join(os.path.dirname(weight), name + ".onnx")
230 |         
231 |     export_onnx(torch.load(weight, map_location="cpu")["model"], save, size, dynamic_batch=dynamic)
232 |     print(f"Save onnx to {save}")
233 | 
234 | 
235 | def cmd_sensitive_analysis(weight, device, cocodir, summary_save, num_image):
236 | 
237 |     quantize.initialize()
238 |     device  = torch.device(device)
239 |     model   = load_yolov7_model(weight, device)
240 |     train_dataloader = create_coco_train_dataloader(cocodir)
241 |     val_dataloader   = create_coco_val_dataloader(cocodir, keep_images=None if num_image is None or num_image < 1 else num_image)
242 |     quantize.replace_to_quantization_module(model)
243 |     quantize.calibrate_model(model, train_dataloader, device)
244 | 
245 |     summary = SummaryTool(summary_save)
246 |     print("Evaluate PTQ...")
247 |     ap = evaluate_coco(model, val_dataloader)
248 |     summary.append([ap, "PTQ"])
249 | 
250 |     print("Sensitive analysis by each layer...")
251 |     for i in range(0, len(model.model)):
252 |         layer = model.model[i]
253 |         if quantize.have_quantizer(layer):
254 |             print(f"Quantization disable model.{i}")
255 |             quantize.disable_quantization(layer).apply()
256 |             ap = evaluate_coco(model, val_dataloader)
257 |             summary.append([ap, f"model.{i}"])
258 |             quantize.enable_quantization(layer).apply()
259 |         else:
260 |             print(f"ignore model.{i} because it is {type(layer)}")
261 |     
262 |     summary = sorted(summary.data, key=lambda x:x[0], reverse=True)
263 |     print("Sensitive summary:")
264 |     for n, (ap, name) in enumerate(summary[:10]):
265 |         print(f"Top{n}: Using fp16 {name}, ap = {ap:.5f}")
266 | 
267 | 
268 | def cmd_test(weight, device, cocodir, confidence, nmsthres):
269 | 
270 |     device  = torch.device(device)
271 |     model   = load_yolov7_model(weight, device)
272 |     val_dataloader   = create_coco_val_dataloader(cocodir)
273 |     evaluate_coco(model, val_dataloader, True, conf_thres=confidence, iou_thres=nmsthres)
274 | 
275 | 
276 | if __name__ == "__main__":
277 | 
278 |     parser = argparse.ArgumentParser(prog='qat.py')
279 |     subps  = parser.add_subparsers(dest="cmd")
280 |     exp    = subps.add_parser("export", help="Export weight to onnx file")
281 |     exp.add_argument("weight", type=str, default="yolov7.pt", help="export pt file")
282 |     exp.add_argument("--save", type=str, required=False, help="export onnx file")
283 |     exp.add_argument("--size", type=int, default=640, help="export input size")
284 |     exp.add_argument("--dynamic", action="store_true", help="export dynamic batch")
285 | 
286 |     qat = subps.add_parser("quantize", help="PTQ/QAT finetune ...")
287 |     qat.add_argument("weight", type=str, nargs="?", default="yolov7.pt", help="weight file")
288 |     qat.add_argument("--cocodir", type=str, default="/datav/dataset/coco", help="coco directory")
289 |     qat.add_argument("--device", type=str, default="cuda:0", help="device")
290 |     qat.add_argument("--ignore-policy", type=str, default="model\.105\.m\.(.*)", help="regx")
291 |     qat.add_argument("--ptq", type=str, default="ptq.pt", help="file")
292 |     qat.add_argument("--qat", type=str, default=None, help="file")
293 |     qat.add_argument("--supervision-stride", type=int, default=1, help="supervision stride")
294 |     qat.add_argument("--iters", type=int, default=200, help="iters per epoch")
295 |     qat.add_argument("--eval-origin", action="store_true", help="do eval for origin model")
296 |     qat.add_argument("--eval-ptq", action="store_true", help="do eval for ptq model")
297 | 
298 |     sensitive = subps.add_parser("sensitive", help="Sensitive layer analysis")
299 |     sensitive.add_argument("weight", type=str, nargs="?", default="yolov7.pt", help="weight file")
300 |     sensitive.add_argument("--device", type=str, default="cuda:0", help="device")
301 |     sensitive.add_argument("--cocodir", type=str, default="/datav/dataset/coco", help="coco directory")
302 |     sensitive.add_argument("--summary", type=str, default="sensitive-summary.json", help="summary save file")
303 |     sensitive.add_argument("--num-image", type=int, default=None, help="number of image to evaluate")
304 | 
305 |     testcmd = subps.add_parser("test", help="Do evaluate")
306 |     testcmd.add_argument("weight", type=str, default="yolov7.pt", help="weight file")
307 |     testcmd.add_argument("--cocodir", type=str, default="/datav/dataset/coco", help="coco directory")
308 |     testcmd.add_argument("--device", type=str, default="cuda:0", help="device")
309 |     testcmd.add_argument("--confidence", type=float, default=0.001, help="confidence threshold")
310 |     testcmd.add_argument("--nmsthres", type=float, default=0.65, help="nms threshold")
311 | 
312 |     args = parser.parse_args()
313 |     init_seeds(57)
314 | 
315 |     if args.cmd == "export":
316 |         cmd_export(args.weight, args.save, args.size, args.dynamic)
317 |     elif args.cmd == "quantize":
318 |         print(args)
319 |         cmd_quantize(
320 |             args.weight, args.cocodir, args.device, args.ignore_policy, 
321 |             args.ptq, args.qat, args.supervision_stride, args.iters,
322 |             args.eval_origin, args.eval_ptq
323 |         )
324 |     elif args.cmd == "sensitive":
325 |         cmd_sensitive_analysis(args.weight, args.device, args.cocodir, args.summary, args.num_image)
326 |     elif args.cmd == "test":
327 |         cmd_test(args.weight, args.device, args.cocodir, args.confidence, args.nmsthres)
328 |     else:
329 |         parser.print_help()
330 | 


--------------------------------------------------------------------------------
/yolov7_qat/scripts/quantize_utils.py:
--------------------------------------------------------------------------------
  1 | ################################################################################
  2 | # SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  3 | # SPDX-License-Identifier: MIT
  4 | #
  5 | # Permission is hereby granted, free of charge, to any person obtaining a
  6 | # copy of this software and associated documentation files (the "Software"),
  7 | # to deal in the Software without restriction, including without limitation
  8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
  9 | # and/or sell copies of the Software, and to permit persons to whom the
 10 | # Software is furnished to do so, subject to the following conditions:
 11 | #
 12 | # The above copyright notice and this permission notice shall be included in
 13 | # all copies or substantial portions of the Software.
 14 | #
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 21 | # DEALINGS IN THE SOFTWARE.
 22 | ################################################################################
 23 | 
 24 | import onnx_graphsurgeon as gs
 25 | from onnx_graphsurgeon.ir.tensor import Variable
 26 | import onnx
 27 | import numpy as np
 28 | import argparse
 29 | import logging
 30 | 
 31 | LAYER_ID = 0
 32 | TENSOR_ID = 0
 33 | def get_qparams_constants(node_to_quantize_name, scale_init=0.5, zero_point_init=0):
 34 |     global LAYER_ID, TENSOR_ID
 35 |     """ ATTENTION: "node_to_quantize_name" needs to be different every time this function is called.
 36 |     Otherwise, "scale, zero_point" are overwritten.
 37 |     TODO: ensure that this happens! The same goes for
 38 |         "q_out and dq_out = gs.Variable(UNIQUE_NAME)"
 39 | 
 40 |     :param node_to_quantize_name:
 41 |     :param scale_init:
 42 |     :param zero_point_init:
 43 |     :return: 2 gs.Constants (scale and zero-point).
 44 |     """
 45 |     scale = gs.Constant(
 46 |         name=node_to_quantize_name + "_scale" + str(TENSOR_ID),
 47 |         values=np.array(scale_init, dtype=np.float32))
 48 |     TENSOR_ID = TENSOR_ID + 1
 49 |     zero_point = gs.Constant(
 50 |         name=node_to_quantize_name + "_zero_point" + str(TENSOR_ID),
 51 |         values=np.array(zero_point_init, dtype=np.int8)
 52 |     )
 53 |     TENSOR_ID = TENSOR_ID + 1
 54 |     return scale, zero_point
 55 | 
 56 | def quantize_tensor(graph, tensor_to_quantize, scale, name_suffix=""):
 57 |     global LAYER_ID, TENSOR_ID
 58 |     output_nodes = tensor_to_quantize['x'].outputs
 59 |     nodes_and_quantized = []
 60 |     nodes_inputidx = []
 61 | 
 62 |     for node in output_nodes:
 63 |         for idx, inp in enumerate(node.inputs):
 64 |             if inp.name == tensor_to_quantize['x'].name:
 65 |                 nodes_and_quantized.append(node)
 66 |                 nodes_inputidx.append(idx)
 67 |                 break
 68 | 
 69 |     # QuantizeLinear node
 70 |     q_scale, q_zero_point = get_qparams_constants(tensor_to_quantize['x'].name + "_inp_q" + name_suffix, scale_init=scale)
 71 |     q_out = gs.Variable(name=tensor_to_quantize['x'].name + "_QuantizeLinear_out" + name_suffix + str(TENSOR_ID))
 72 |     TENSOR_ID = TENSOR_ID + 1
 73 |     quant_node = gs.Node(
 74 |         op="QuantizeLinear",
 75 |         name="QuantI_"+ tensor_to_quantize['x'].name + str(LAYER_ID),
 76 |         inputs=[tensor_to_quantize["x"], q_scale, q_zero_point],
 77 |         outputs=[q_out]
 78 |     )
 79 |     LAYER_ID = LAYER_ID + 1
 80 |     # DequantizeLinear node
 81 |     dq_scale, dq_zero_point = get_qparams_constants(tensor_to_quantize['x'].name + "_inp_dq" + name_suffix, scale_init=scale)
 82 |     dq_out = gs.Variable(name=tensor_to_quantize['x'].name + "_DequantizeLinear_out" + name_suffix  + str(TENSOR_ID))
 83 |     TENSOR_ID = TENSOR_ID + 1
 84 |     dequant_node = gs.Node(
 85 |         op="DequantizeLinear",
 86 |         name="DequantI_"+ tensor_to_quantize['x'].name + str(LAYER_ID),
 87 |         inputs=[q_out, dq_scale, dq_zero_point],
 88 |         outputs=[dq_out]
 89 |     )
 90 |     LAYER_ID = LAYER_ID + 1
 91 |     #shit code
 92 |     for i, node in enumerate(nodes_and_quantized):
 93 |         node.inputs[nodes_inputidx[i]] = dq_out
 94 | 
 95 |     graph.nodes.extend([quant_node, dequant_node])
 96 |     return graph
 97 | 
 98 | def quantize_input(graph, node_to_quantize, node_to_quantize_input, scale, name_suffix=""):
 99 |     global LAYER_ID, TENSOR_ID
100 |     # QuantizeLinear node
101 |     q_scale, q_zero_point = get_qparams_constants(node_to_quantize.name + "_inp_q" + name_suffix, scale_init=scale)
102 |     q_out = gs.Variable(name=node_to_quantize.name + "_QuantizeLinear_out" + name_suffix  + name_suffix + str(TENSOR_ID))
103 |     TENSOR_ID = TENSOR_ID + 1
104 |     quant_node = gs.Node(
105 |         op="QuantizeLinear",
106 |         name="QuantI_"+ node_to_quantize.name + str(LAYER_ID),
107 |         inputs=[node_to_quantize_input["x"], q_scale, q_zero_point],
108 |         outputs=[q_out]
109 |     )
110 |     LAYER_ID = LAYER_ID + 1
111 | 
112 |     # DequantizeLinear node
113 |     dq_scale, dq_zero_point = get_qparams_constants(node_to_quantize.name + "_inp_dq" + name_suffix, scale_init=scale)
114 |     dq_out = gs.Variable(name=node_to_quantize.name + "_DequantizeLinear_out" + name_suffix + name_suffix + str(TENSOR_ID))
115 |     TENSOR_ID = TENSOR_ID + 1
116 |     dequant_node = gs.Node(
117 |         op="DequantizeLinear",
118 |         name="DequantI_"+ node_to_quantize.name + str(LAYER_ID),
119 |         inputs=[q_out, dq_scale, dq_zero_point],
120 |         outputs=[dq_out]
121 |     )
122 |     LAYER_ID = LAYER_ID + 1
123 | 
124 |     node_to_quantize.inputs[node_to_quantize_input["idx"]] = dq_out
125 |     graph.nodes.extend([quant_node, dequant_node])
126 | 
127 |     graph.cleanup().toposort()
128 |     return graph
129 | 
130 | 
131 | def quantize_weight(graph, node_to_quantize, node_to_quantize_weight, axis=0, name_suffix=""):
132 |     global LAYER_ID, TENSOR_ID
133 |     """
134 |     When connected to the weight, the "y_scale" parameter can be recovered directly from the Weight matrix.
135 |     See official doc: https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#intro-quantization
136 | 
137 |     :param graph:
138 |     :param node_to_quantize:
139 |     :param node_to_quantize_weight:
140 |     :param axis:
141 |     :param name_suffix:
142 |     :return:
143 |     """
144 |     shape = node_to_quantize_weight["x"].shape[axis]
145 |     # Recover "y_scale" from weight matrix
146 |     weight_matrix = node_to_quantize_weight["x"].values
147 |     y_scale_arr = []
148 |     # Recover "y_scale" for each batch. If axis != 0, move the desired axis to the be idx=0.
149 |     if axis !=0:
150 |         weight_matrix = np.moveaxis(weight_matrix, [axis], [0])
151 |     # for bais 1d-weight
152 |     if len(weight_matrix.shape) == 1:
153 |         weight_matrix = np.expand_dims(weight_matrix, axis=0)
154 |     for w in weight_matrix[:]:
155 |         dyn_range = max(abs(w.min()), abs(w.max()))
156 |         y_scale = dyn_range / 127.0
157 |         y_scale_arr.append(y_scale)
158 | 
159 |     # QuantizeLinear node
160 |     q_scale, q_zero_point = get_qparams_constants(
161 |         node_to_quantize.name + "_weight_q" + name_suffix,
162 |         scale_init=y_scale_arr,  # * np.ones(shape=(shape,)),
163 |         zero_point_init=np.zeros(shape=(shape,)),
164 |     )
165 |     q_out = gs.Variable(name=node_to_quantize.name + "_QuantizeLinear_weight_out" + name_suffix + str(TENSOR_ID))
166 |     TENSOR_ID = TENSOR_ID + 1
167 |     quant_node = gs.Node(
168 |         op="QuantizeLinear",
169 |         name="QuantW_"+ node_to_quantize.name + str(LAYER_ID),
170 |         inputs=[node_to_quantize_weight["x"], q_scale, q_zero_point],
171 |         outputs=[q_out],
172 |         attrs={"axis": axis}
173 |     )
174 |     LAYER_ID = LAYER_ID + 1
175 | 
176 | 
177 |     # DequantizeLinear node
178 |     dq_scale, dq_zero_point = get_qparams_constants(
179 |         node_to_quantize.name + "_weight_dq" + name_suffix,
180 |         scale_init=y_scale_arr,  # * np.ones(shape=(shape,)),
181 |         zero_point_init=np.zeros(shape=(shape,)),
182 |     )
183 |     TENSOR_ID = TENSOR_ID + 1
184 |     dq_out = gs.Variable(name=node_to_quantize.name + "_DequantizeLinear_weight_out" + name_suffix + str(TENSOR_ID))
185 |     dequant_node = gs.Node(
186 |         op="DequantizeLinear",
187 |         name="DequantW_"+ node_to_quantize.name + str(LAYER_ID),
188 |         inputs=[q_out, dq_scale, dq_zero_point],
189 |         outputs=[dq_out],
190 |         attrs={"axis": axis}
191 |     )
192 |     LAYER_ID = LAYER_ID + 1
193 | 
194 |     node_to_quantize.inputs[node_to_quantize_weight["idx"]] = dq_out
195 |     graph.nodes.extend([quant_node, dequant_node])
196 | 
197 |     graph.cleanup().toposort()
198 |     return graph
199 | 
200 | def get_node_to_quantize_infos(node_to_quantize, disableResAdd:bool):
201 |     # Separate inputs into activation ('Variable' type) and weight ('Constant' type).
202 |     node_to_quantize_input = []
203 |     node_to_quantize_weight = []
204 |     for idx, inp in enumerate(node_to_quantize.inputs):
205 |         if isinstance(inp, Variable):
206 |             node_to_quantize_input.append({"x": inp, "idx": idx})
207 |             # residual add, will not work with bias add
208 |             if node_to_quantize.op == "Add" and (not disableResAdd) and len(node_to_quantize_input) == 2:
209 |                 node_to_quantize_input = [node_to_quantize_input[0]]
210 |         else:  # Constant
211 |             if (
212 |                     len(node_to_quantize_weight) == 0
213 |                     and node_to_quantize.op not in ["Add", "BatchNormalization"]
214 |                     and len(inp.shape) > 1
215 |             ):
216 |                 # 1) Only quantize the Weight, not Bias
217 |                 # 2) Do not quantize bias matrix in BiasAdd ops
218 |                 # 3) Only save weight matrices with shape > 1 (Conv 4D, MatMul 2D)
219 |                 node_to_quantize_weight.append({"x": inp, "idx": idx})
220 | 
221 |             # for bias add after matmul
222 |             elif(
223 |                     len(node_to_quantize_weight) == 0
224 |                     and node_to_quantize.op =="Add"
225 |                     and isinstance(node_to_quantize.inputs[0], gs.Constant)):
226 |                 node_to_quantize_weight.append({"x": inp, "idx": idx})
227 | 
228 | 
229 |     return node_to_quantize_input, node_to_quantize_weight
230 | 
231 | def quantize_node_automatically(graph, node_to_quantize, scale, disableResAdd:bool):
232 |     """
233 |     Quantizes a node according to information in graph.json (generated from the PTQ engine building step.
234 | 
235 |     :return:
236 |     """
237 |     node_to_quantize_input, node_to_quantize_weight = get_node_to_quantize_infos(node_to_quantize, disableResAdd)
238 | 
239 |     # Quantize inputs
240 |     input_was_quantized = False
241 |     # Quantizable layer
242 |     for i, node_inp in enumerate(node_to_quantize_input):
243 |         graph = quantize_input(graph, node_to_quantize, node_inp, scale, name_suffix=str(i))
244 |         input_was_quantized = True
245 | 
246 |     # Quantize weights
247 |     for i, node_weight in enumerate(node_to_quantize_weight):
248 |         if input_was_quantized:
249 |             graph = quantize_weight(
250 |                 graph,
251 |                 node_to_quantize,
252 |                 node_weight,
253 |                 axis=1 if node_to_quantize.op in ["MatMul", "ConvTranspose"] else 0,  # TODO: Automatize axis detection. Automatize this by checking the expected layer output and extract axis that matches desired dimension.
254 |                 name_suffix=str(i)
255 |             )
256 |     return graph
257 | 
258 | def quantize_tensor_automatically(graph, tensor_to_quantize, scale):
259 |     """
260 |     Quantizes a tensor
261 | 
262 |     :return:
263 |     """
264 |     tensor_to_quantize = [{'x':tensor_to_quantize},]
265 |     # Quantizable tensor
266 |     for i, tensor_inp in enumerate(tensor_to_quantize):
267 |         graph = quantize_tensor(graph, tensor_inp, scale, name_suffix=str(i))
268 |     return graph
269 | 
270 | 
271 | def quant_one_node(graph, node_name, scale=0.04370, disableResAdd:bool = False):
272 |     nodes = graph.nodes
273 |     node_to_quantize = [x for x in nodes if x.name == node_name]
274 |     if len(node_to_quantize) == 0:
275 |         logging.warning(f'node: ',node_name, "did not found, skip")
276 |     if len(node_to_quantize) > 1:
277 |         logging.error(f'found multiple node named: ',node_name)
278 |     node_to_quantize = node_to_quantize[0]
279 |     graph = quantize_node_automatically(graph, node_to_quantize, scale, disableResAdd)
280 |     return graph
281 | 
282 | def quant_one_tensor(graph, tensor_name, scale=0.04370):
283 |     # nodes = graph.nodes
284 |     tensors = graph.tensors()
285 |     tensor_to_quantize = [tensor for name, tensor in tensors.items() if tensor.name == tensor_name]
286 |     if len(tensor_to_quantize) == 0:
287 |         logging.warning(f'tensor: ',tensor_name, "did not found, skip")
288 |     if len(tensor_to_quantize) > 1:
289 |         logging.error(f'found multiple tensor named: ',tensor_name)
290 | 
291 |     tensor_to_quantize = tensor_to_quantize[0]
292 |     graph = quantize_tensor_automatically(graph, tensor_to_quantize, scale)
293 |     return graph
294 | 
295 | def quant_node_of_list(graph, op_name_list:list, disableResAdd:bool):
296 |     for op in op_name_list:
297 |         graph = quant_one_node(graph, op, disableResAdd=disableResAdd)
298 |         ##TODO: if one element is Conv1:0.03, it should be support
299 |     return graph
300 | 
301 | def quant_tensor_of_list(graph, tensor_name_list:list):
302 |     for tensor in tensor_name_list:
303 |         graph = quant_one_tensor(graph, tensor)
304 |     return graph
305 | 
306 | # def quant_all_nodes_of_type():
307 |     # return None
308 | 
309 | def quant_onnx(model_path, output_model_path, nodes_name_to_quant, tensors_name_to_quant, disableResAdd:bool):
310 |     model = onnx.load(model_path)
311 |     model = onnx.shape_inference.infer_shapes(model)
312 |     graph = gs.import_onnx(model)
313 |     graph = quant_node_of_list(graph, nodes_name_to_quant, disableResAdd)
314 |     graph = quant_tensor_of_list(graph, tensors_name_to_quant)
315 |     graph.cleanup()
316 |     new_model = gs.export_onnx(graph)
317 |     onnx.save(new_model, output_model_path)
318 | 
319 | if __name__ == "__main__":
320 |     parser = argparse.ArgumentParser(description='iso_the onnx model with new input and output')
321 |     parser.add_argument('--model', default='model.onnx', type=str, help='the onnx model')
322 |     parser.add_argument('--output_model', default='', type=str, help='the output model')
323 |     parser.add_argument('--nodes', nargs='+', type=str, help='the input nodes list you want to quant',default=[])
324 |     parser.add_argument('--disableResAdd', action='store_true', help='if enabled this flag, residual add will have two inputs')
325 | 
326 |     parser.add_argument('--tensors', nargs='+', type=str, help='the tensors list you want to quant',default=[])
327 | 
328 |     args = parser.parse_args()
329 |     print(args)
330 |     quant_onnx(args.model,  args.output_model, args.nodes, args.tensors, args.disableResAdd)
331 | 


--------------------------------------------------------------------------------
/yolov7_qat/scripts/trt-int8.py:
--------------------------------------------------------------------------------
  1 | ################################################################################
  2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  3 | # SPDX-License-Identifier: MIT
  4 | #
  5 | # Permission is hereby granted, free of charge, to any person obtaining a
  6 | # copy of this software and associated documentation files (the "Software"),
  7 | # to deal in the Software without restriction, including without limitation
  8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
  9 | # and/or sell copies of the Software, and to permit persons to whom the
 10 | # Software is furnished to do so, subject to the following conditions:
 11 | #
 12 | # The above copyright notice and this permission notice shall be included in
 13 | # all copies or substantial portions of the Software.
 14 | #
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 18 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 21 | # DEALINGS IN THE SOFTWARE.
 22 | ################################################################################
 23 | import tensorrt as trt
 24 | import pycuda.driver as cuda
 25 | import pycuda.autoinit
 26 | 
 27 | import numpy as np
 28 | import random
 29 | import cv2
 30 | 
 31 | # For ../common.py
 32 | import sys, os
 33 | TRT_LOGGER = trt.Logger()
 34 | 
 35 | 
 36 | def load_yolov7_coco_image(cocodir, topn = None):
 37 |     
 38 |     files = os.listdir(cocodir)
 39 |     files = [file for file in files if file.endswith(".jpg")]
 40 | 
 41 |     if topn is not None:
 42 |         np.random.seed(31)
 43 |         np.random.shuffle(files)
 44 |         files = files[:topn]
 45 | 
 46 |     datas = []
 47 | 
 48 |     # dataloader is setup pad=0.5
 49 |     for i, file in enumerate(files):
 50 |         if i == 0: continue
 51 |         if (i + 1) % 200 == 0:
 52 |             print(f"Load {i + 1} / {len(files)} ...")
 53 | 
 54 |         img = cv2.imread(os.path.join(cocodir, file))
 55 |         from_ = img.shape[1], img.shape[0]
 56 |         to_   = 640, 640
 57 |         scale = min(to_[0] / from_[0], to_[1] / from_[1])
 58 | 
 59 |         # low accuracy
 60 |         # M = np.array([
 61 |         #     [scale, 0, 16],
 62 |         #     [0, scale, 16],  # same to pytorch
 63 |         # ])
 64 | 
 65 |         # more accuracy
 66 |         M = np.array([
 67 |             [scale, 0, -scale * from_[0]  * 0.5  + to_[0] * 0.5 + scale * 0.5 - 0.5 + 16],
 68 |             [0, scale, -scale * from_[1] * 0.5 + to_[1] * 0.5 + scale * 0.5 - 0.5 + 16],  # same to pytorch
 69 |         ])
 70 |         input = cv2.warpAffine(img, M, (672, 672), borderValue=(114, 114, 114))
 71 |         input = input[..., ::-1].transpose(2, 0, 1)[None]   # BGR->RGB, HWC->CHW, CHW->1CHW
 72 |         input = (input / 255.0).astype(np.float32)
 73 |         datas.append(input)
 74 |         
 75 |     return np.concatenate(datas, axis=0)
 76 |     
 77 | 
 78 | class MNISTEntropyCalibrator(trt.IInt8EntropyCalibrator2):
 79 |     def __init__(self, training_data, cache_file, batch_size=64):
 80 |         # Whenever you specify a custom constructor for a TensorRT class,
 81 |         # you MUST call the constructor of the parent explicitly.
 82 |         trt.IInt8EntropyCalibrator2.__init__(self)
 83 | 
 84 |         self.cache_file = cache_file
 85 |         self.batch_size = batch_size
 86 |         self.current_index = 0
 87 | 
 88 |         # Every time get_batch is called, the next batch of size batch_size will be copied to the device and returned.
 89 |         if not os.path.exists(cache_file):
 90 | 
 91 |             # Allocate enough memory for a whole batch.
 92 |             self.data = load_yolov7_coco_image(training_data, 1000)
 93 |             self.device_input = cuda.mem_alloc(self.data[0].nbytes * self.batch_size)
 94 | 
 95 |     def get_batch_size(self):
 96 |         return self.batch_size
 97 | 
 98 |     # TensorRT passes along the names of the engine bindings to the get_batch function.
 99 |     # You don't necessarily have to use them, but they can be useful to understand the order of
100 |     # the inputs. The bindings list is expected to have the same ordering as 'names'.
101 |     def get_batch(self, names):
102 |         if self.current_index + self.batch_size > self.data.shape[0]:
103 |             return None
104 | 
105 |         current_batch = int(self.current_index / self.batch_size)
106 |         if current_batch % 10 == 0:
107 |             print("Calibrating batch {:}, containing {:} images".format(current_batch, self.batch_size))
108 | 
109 |         batch = self.data[self.current_index : self.current_index + self.batch_size].ravel()
110 |         cuda.memcpy_htod(self.device_input, batch)
111 |         self.current_index += self.batch_size
112 |         return [self.device_input]
113 | 
114 |     def read_calibration_cache(self):
115 |         # If there is a cache, use it instead of calibrating again. Otherwise, implicitly return None.
116 |         if os.path.exists(self.cache_file):
117 |             with open(self.cache_file, "rb") as f:
118 |                 return f.read()
119 | 
120 |     def write_calibration_cache(self, cache):
121 |         with open(self.cache_file, "wb") as f:
122 |             f.write(cache)
123 | 
124 | 
125 | def build_int8_engine(onnx_file, calib, batch_size=32):
126 |     with trt.Builder(
127 |         TRT_LOGGER
128 |     ) as builder, builder.create_network(1) as network, builder.create_builder_config() as config:
129 |         # We set the builder batch size to be the same as the calibrator's, as we use the same batches
130 |         # during inference. Note that this is not required in general, and inference batch size is
131 |         # independent of calibration batch size.
132 |         builder.max_batch_size = batch_size
133 |         config.max_workspace_size = 1024 * 1024 * 1024  # 1024 MB
134 |         config.set_flag(trt.BuilderFlag.INT8)
135 |         config.int8_calibrator = calib
136 |         with trt.OnnxParser(network, TRT_LOGGER) as parser:
137 |             parser.parse_from_file(onnx_file)
138 |         # network.mark_output(model_tensors.find(ModelData.OUTPUT_NAME))
139 |         # Build engine and do int8 calibration.
140 |         plan = builder.build_serialized_network(network, config)
141 |         return bytes(plan)
142 | 
143 | 
144 | def replace_suffix(file, new_suffix):
145 |     r = file.rfind(".")
146 |     return f"{file[:r]}{new_suffix}"
147 | 
148 | 
149 | def main():
150 |     # Now we create a calibrator and give it the location of our calibration data.
151 |     # We also allow it to cache calibration data for faster engine building.
152 |     onnxfile          = "yolov7.onnx"
153 |     calibration_cache = replace_suffix(onnxfile, ".cache")
154 |     engine_file       = replace_suffix(onnxfile, ".engine")
155 |     calib = MNISTEntropyCalibrator("/datav/dataset/coco/images/train2017/", cache_file=calibration_cache)
156 | 
157 |     # Inference batch size can be different from calibration batch size.
158 |     batch_size = 1
159 |     engine_data = build_int8_engine(onnxfile, calib, batch_size)
160 | 
161 |     with open(engine_file, "wb") as f:
162 |         f.write(engine_data)
163 | 
164 | 
165 | if __name__ == "__main__":
166 |     main()
167 | 


--------------------------------------------------------------------------------