├── .gitmodules
├── README.md
├── yolov5l.yaml
├── yolov5m.yaml
├── yolov5s.yaml
├── yolov5x.yaml
├── .gitignore
├── LICENSE
├── yolo.py
└── main.py


/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "yolov5"]
2 | 	path = yolov5
3 | 	url = https://github.com/ultralytics/yolov5
4 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # yolov5-tensorrt
 2 | A tensorrt implementation of yolov5: https://github.com/ultralytics/yolov5
 3 | 
 4 | # requirement
 5 | Please use torch>=1.6.0, onnx>=1.6.0, and TensorRT 7.1+ (which fixes the upsample issue) to run the sample code.  
 6 | onnx-simplifier 0.2.16 is also required.
 7 | 
 8 | # The code
 9 | Adds a newly implemented upsample to get this working with the current combination of ONNX and TensorRT.  
10 | 0. Prepare the environment described above.
11 | 1. Run `git clone` on this repository, then `git submodule update --init`.
12 | 2. Download the weights file (use yolov5/models/export.py).
13 | 3. Run `python main.py` to run the benchmark.
14 | 4. Generally, for images of size 640*640 with batchsize=1, the speedup is 4x on a V100.
15 | 
16 | # Updates
17 | - 20201004: updated to track the yolov5 v3.0 release. Please download the model file from the official website.
18 | 
19 | # TODO
20 | - [x] NMS support
21 | - [ ] dynamic shape or dynamic batchsize support (**won't implement soon because onnx-simplifier only supports fixed shape**)
22 | - [ ] FP16 numerical issue and performance investigation
23 | - [ ] Benchmark


--------------------------------------------------------------------------------
/yolov5l.yaml:
--------------------------------------------------------------------------------
 1 | # parameters
 2 | nc: 80  # number of classes
 3 | depth_multiple: 1.0  # model depth multiple
 4 | width_multiple: 1.0  # layer channel multiple
 5 | 
 6 | # anchors
 7 | anchors:
 8 |   - [10,13, 16,30, 33,23]  # P3/8
 9 |   - [30,61, 62,45, 59,119]  # P4/16
10 |   - [116,90, 156,198, 373,326]  # P5/32
11 | 
12 | # YOLOv5 backbone
13 | backbone:
14 |   # [from, number, module, args]
15 |   [[-1, 1, Focus, [64, 3]],  # 0-P1/2
16 |    [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
17 |    [-1, 3, BottleneckCSP, [128]],
18 |    [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
19 |    [-1, 9, BottleneckCSP, [256]],
20 |    [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
21 |    [-1, 9, BottleneckCSP, [512]],
22 |    [-1, 1, Conv, [1024, 3, 2]],  # 7-P5/32
23 |    [-1, 1, SPP, [1024, [5, 9, 13]]],
24 |    [-1, 3, BottleneckCSP, [1024, False]],  # 9
25 |   ]
26 | 
27 | # YOLOv5 head
28 | head:
29 |   [[-1, 1, Conv, [512, 1, 1]],
30 |    [-1, 1, nn.Upsample, [None, 2, 'nearest']],
31 |    [[-1, 6], 1, Concat, [1]],  # cat backbone P4
32 |    [-1, 3, BottleneckCSP, [512, False]],  # 13
33 | 
34 |    [-1, 1, Conv, [256, 1, 1]],
35 |    [-1, 1, nn.Upsample, [None, 2, 'nearest']],
36 |    [[-1, 4], 1, Concat, [1]],  # cat backbone P3
37 |    [-1, 3, BottleneckCSP, [256, False]],  # 17 (P3/8-small)
38 | 
39 |    [-1, 1, Conv, [256, 3, 2]],
40 |    [[-1, 14], 1, Concat, [1]],  # cat head P4
41 |    [-1, 3, BottleneckCSP, [512, False]],  # 20 (P4/16-medium)
42 | 
43 |    [-1, 1, Conv, [512, 3, 2]],
44 |    [[-1, 10], 1, Concat, [1]],  # cat head P5
45 |    [-1, 3, BottleneckCSP, [1024, False]],  # 23 (P5/32-large)
46 | 
47 |    [[17, 20, 23], 1, Detect, [nc, anchors]],  # Detect(P3, P4, P5)
48 |   ]
49 | 


--------------------------------------------------------------------------------
/yolov5m.yaml:
--------------------------------------------------------------------------------
 1 | # parameters
 2 | nc: 80  # number of classes
 3 | depth_multiple: 0.67  # model depth multiple
 4 | width_multiple: 0.75  # layer channel multiple
 5 | 
 6 | # anchors
 7 | anchors:
 8 |   - [10,13, 16,30, 33,23]  # P3/8
 9 |   - [30,61, 62,45, 59,119]  # P4/16
10 |   - [116,90, 156,198, 373,326]  # P5/32
11 | 
12 | # YOLOv5 backbone
13 | backbone:
14 |   # [from, number, module, args]
15 |   [[-1, 1, Focus, [64, 3]],  # 0-P1/2
16 |    [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
17 |    [-1, 3, BottleneckCSP, [128]],
18 |    [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
19 |    [-1, 9, BottleneckCSP, [256]],
20 |    [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
21 |    [-1, 9, BottleneckCSP, [512]],
22 |    [-1, 1, Conv, [1024, 3, 2]],  # 7-P5/32
23 |    [-1, 1, SPP, [1024, [5, 9, 13]]],
24 |    [-1, 3, BottleneckCSP, [1024, False]],  # 9
25 |   ]
26 | 
27 | # YOLOv5 head
28 | head:
29 |   [[-1, 1, Conv, [512, 1, 1]],
30 |    [-1, 1, nn.Upsample, [None, 2, 'nearest']],
31 |    [[-1, 6], 1, Concat, [1]],  # cat backbone P4
32 |    [-1, 3, BottleneckCSP, [512, False]],  # 13
33 | 
34 |    [-1, 1, Conv, [256, 1, 1]],
35 |    [-1, 1, nn.Upsample, [None, 2, 'nearest']],
36 |    [[-1, 4], 1, Concat, [1]],  # cat backbone P3
37 |    [-1, 3, BottleneckCSP, [256, False]],  # 17 (P3/8-small)
38 | 
39 |    [-1, 1, Conv, [256, 3, 2]],
40 |    [[-1, 14], 1, Concat, [1]],  # cat head P4
41 |    [-1, 3, BottleneckCSP, [512, False]],  # 20 (P4/16-medium)
42 | 
43 |    [-1, 1, Conv, [512, 3, 2]],
44 |    [[-1, 10], 1, Concat, [1]],  # cat head P5
45 |    [-1, 3, BottleneckCSP, [1024, False]],  # 23 (P5/32-large)
46 | 
47 |    [[17, 20, 23], 1, Detect, [nc, anchors]],  # Detect(P3, P4, P5)
48 |   ]
49 | 


--------------------------------------------------------------------------------
/yolov5s.yaml:
--------------------------------------------------------------------------------
 1 | # parameters
 2 | nc: 80  # number of classes
 3 | depth_multiple: 0.33  # model depth multiple
 4 | width_multiple: 0.50  # layer channel multiple
 5 | 
 6 | # anchors
 7 | anchors:
 8 |   - [10,13, 16,30, 33,23]  # P3/8
 9 |   - [30,61, 62,45, 59,119]  # P4/16
10 |   - [116,90, 156,198, 373,326]  # P5/32
11 | 
12 | # YOLOv5 backbone
13 | backbone:
14 |   # [from, number, module, args]
15 |   [[-1, 1, Focus, [64, 3]],  # 0-P1/2
16 |    [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
17 |    [-1, 3, BottleneckCSP, [128]],
18 |    [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
19 |    [-1, 9, BottleneckCSP, [256]],
20 |    [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
21 |    [-1, 9, BottleneckCSP, [512]],
22 |    [-1, 1, Conv, [1024, 3, 2]],  # 7-P5/32
23 |    [-1, 1, SPP, [1024, [5, 9, 13]]],
24 |    [-1, 3, BottleneckCSP, [1024, False]],  # 9
25 |   ]
26 | 
27 | # YOLOv5 head
28 | head:
29 |   [[-1, 1, Conv, [512, 1, 1]],
30 |    [-1, 1, nn.Upsample, [None, 2, 'nearest']],
31 |    [[-1, 6], 1, Concat, [1]],  # cat backbone P4
32 |    [-1, 3, BottleneckCSP, [512, False]],  # 13
33 | 
34 |    [-1, 1, Conv, [256, 1, 1]],
35 |    [-1, 1, nn.Upsample, [None, 2, 'nearest']],
36 |    [[-1, 4], 1, Concat, [1]],  # cat backbone P3
37 |    [-1, 3, BottleneckCSP, [256, False]],  # 17 (P3/8-small)
38 | 
39 |    [-1, 1, Conv, [256, 3, 2]],
40 |    [[-1, 14], 1, Concat, [1]],  # cat head P4
41 |    [-1, 3, BottleneckCSP, [512, False]],  # 20 (P4/16-medium)
42 | 
43 |    [-1, 1, Conv, [512, 3, 2]],
44 |    [[-1, 10], 1, Concat, [1]],  # cat head P5
45 |    [-1, 3, BottleneckCSP, [1024, False]],  # 23 (P5/32-large)
46 | 
47 |    [[17, 20, 23], 1, Detect, [nc, anchors]],  # Detect(P3, P4, P5)
48 |   ]
49 | 


--------------------------------------------------------------------------------
/yolov5x.yaml:
--------------------------------------------------------------------------------
 1 | # parameters
 2 | nc: 80  # number of classes
 3 | depth_multiple: 1.33  # model depth multiple
 4 | width_multiple: 1.25  # layer channel multiple
 5 | 
 6 | # anchors
 7 | anchors:
 8 |   - [10,13, 16,30, 33,23]  # P3/8
 9 |   - [30,61, 62,45, 59,119]  # P4/16
10 |   - [116,90, 156,198, 373,326]  # P5/32
11 | 
12 | # YOLOv5 backbone
13 | backbone:
14 |   # [from, number, module, args]
15 |   [[-1, 1, Focus, [64, 3]],  # 0-P1/2
16 |    [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
17 |    [-1, 3, BottleneckCSP, [128]],
18 |    [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
19 |    [-1, 9, BottleneckCSP, [256]],
20 |    [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
21 |    [-1, 9, BottleneckCSP, [512]],
22 |    [-1, 1, Conv, [1024, 3, 2]],  # 7-P5/32
23 |    [-1, 1, SPP, [1024, [5, 9, 13]]],
24 |    [-1, 3, BottleneckCSP, [1024, False]],  # 9
25 |   ]
26 | 
27 | # YOLOv5 head
28 | head:
29 |   [[-1, 1, Conv, [512, 1, 1]],
30 |    [-1, 1, nn.Upsample, [None, 2, 'nearest']],
31 |    [[-1, 6], 1, Concat, [1]],  # cat backbone P4
32 |    [-1, 3, BottleneckCSP, [512, False]],  # 13
33 | 
34 |    [-1, 1, Conv, [256, 1, 1]],
35 |    [-1, 1, nn.Upsample, [None, 2, 'nearest']],
36 |    [[-1, 4], 1, Concat, [1]],  # cat backbone P3
37 |    [-1, 3, BottleneckCSP, [256, False]],  # 17 (P3/8-small)
38 | 
39 |    [-1, 1, Conv, [256, 3, 2]],
40 |    [[-1, 14], 1, Concat, [1]],  # cat head P4
41 |    [-1, 3, BottleneckCSP, [512, False]],  # 20 (P4/16-medium)
42 | 
43 |    [-1, 1, Conv, [512, 3, 2]],
44 |    [[-1, 10], 1, Concat, [1]],  # cat head P5
45 |    [-1, 3, BottleneckCSP, [1024, False]],  # 23 (P5/32-large)
46 | 
47 |    [[17, 20, 23], 1, Detect, [nc, anchors]],  # Detect(P3, P4, P5)
48 |   ]
49 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
131 | # pytorch weights
132 | *.pt


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/yolo.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import math
  3 | import logging
  4 | from copy import deepcopy
  5 | from pathlib import Path
  6 | 
  7 | import torch
  8 | import torch.nn as nn
  9 | 
 10 | from models.common import Conv, Bottleneck, SPP, DWConv, Focus, BottleneckCSP, Concat
 11 | from models.experimental import MixConv2d, CrossConv, C3
 12 | from utils.general import check_anchor_order, make_divisible, check_file, set_logging
 13 | from utils.torch_utils import (
 14 |     time_synchronized, fuse_conv_and_bn, model_info, scale_img, initialize_weights, select_device)
 15 | 
 16 | logger = logging.getLogger(__name__)
 17 | 
 18 | class Detect(nn.Module):
 19 |     def __init__(self, nc=80, anchors=(), ch=()):  # detection layer
 20 |         super(Detect, self).__init__()
 21 |         self.stride = None  # strides computed during build
 22 |         self.nc = nc  # number of classes
 23 |         self.no = nc + 5  # number of outputs per anchor
 24 |         self.nl = len(anchors)  # number of detection layers
 25 |         self.na = len(anchors[0]) // 2  # number of anchors
 26 |         self.grid = [torch.zeros(1)] * self.nl  # init grid
 27 |         a = torch.tensor(anchors).float().view(self.nl, -1, 2)
 28 |         self.register_buffer('anchors', a)  # shape(nl,na,2)
 29 |         self.register_buffer('anchor_grid', a.clone().view(self.nl, 1, -1, 1, 1, 2))  # shape(nl,1,na,1,1,2)
 30 |         self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch)  # output conv
 31 |         self.export = False  # onnx export
 32 | 
 33 |     def forward(self, x):
 34 |         # x = x.copy()  # for profiling
 35 |         preds = []  # inference output
 36 |         self.training |= self.export
 37 |         for i in range(self.nl):
 38 |             x[i] = self.m[i](x[i])  # conv
 39 |             bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
 40 |             x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
 41 | 
 42 |             if not self.training:  # inference
 43 |                 if self.grid[i].shape[2:4] != x[i].shape[2:4]:
 44 |                     self.grid[i] = self._make_grid(nx, ny).to(x[i].device)
 45 |                 
 46 |                 y = x[i].sigmoid()
 47 |                 #y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i]  # xy
 48 |                 #y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
 49 |                 t0 = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i]
 50 |                 t1 = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]
 51 |                 box_xyxy = self._xywh2xyxy(t0, t1).view(bs, -1, 4)
 52 |                 score = y[...,4:].float().view(bs, -1, self.nc+1)
 53 | 
 54 |                 preds.append(torch.cat([box_xyxy, score], -1))
 55 | 
 56 |         # return x if self.training else (torch.cat(z, 1), x)
 57 |         return x if self.training else [torch.cat(preds, 1)]
 58 | 
 59 | 
 60 |     @staticmethod
 61 |     def _make_grid(nx=20, ny=20):
 62 |         yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
 63 |         return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()
 64 | 
 65 |     @staticmethod
 66 |     def _xywh2xyxy(t0, t1):
 67 |         # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
 68 |         t = t1 / 2
 69 |         x0 = t0 - t
 70 |         x1 = t0 + t
 71 |         return torch.cat([x0, x1], -1)
 72 | 
 73 | 
 74 | class Model(nn.Module):
 75 |     def __init__(self, cfg='yolov5s.yaml', ch=3, nc=None):  # model, input channels, number of classes
 76 |         super(Model, self).__init__()
 77 |         if isinstance(cfg, dict):
 78 |             self.yaml = cfg  # model dict
 79 |         else:  # is *.yaml
 80 |             import yaml  # for torch hub
 81 |             self.yaml_file = Path(cfg).name
 82 |             with open(cfg) as f:
 83 |                 self.yaml = yaml.load(f, Loader=yaml.FullLoader)  # model dict
 84 | 
 85 |         # Define model
 86 |         if nc and nc != self.yaml['nc']:
 87 |             print('Overriding %s nc=%g with nc=%g' % (cfg, self.yaml['nc'], nc))
 88 |             self.yaml['nc'] = nc  # override yaml value
 89 |         self.model, self.save = parse_model(deepcopy(self.yaml), ch=[ch])  # model, savelist, ch_out
 90 |         # print([x.shape for x in self.forward(torch.zeros(1, ch, 64, 64))])
 91 | 
 92 |         # Build strides, anchors
 93 |         m = self.model[-1]  # Detect()
 94 |         if isinstance(m, Detect):
 95 |             s = 128  # 2x min stride
 96 |             m.stride = torch.tensor([s / x.shape[-2] for x in self.forward(torch.zeros(1, ch, s, s))])  # forward
 97 |             m.anchors /= m.stride.view(-1, 1, 1)
 98 |             check_anchor_order(m)
 99 |             self.stride = m.stride
100 |             self._initialize_biases()  # only run once
101 |             # print('Strides: %s' % m.stride.tolist())
102 | 
103 |         # Init weights, biases
104 |         initialize_weights(self)
105 |         self.info()
106 |         print('')
107 | 
108 |     def forward(self, x, augment=False, profile=False):
109 |         if augment:
110 |             img_size = x.shape[-2:]  # height, width
111 |             s = [1, 0.83, 0.67]  # scales
112 |             f = [None, 3, None]  # flips (2-ud, 3-lr)
113 |             y = []  # outputs
114 |             for si, fi in zip(s, f):
115 |                 xi = scale_img(x.flip(fi) if fi else x, si)
116 |                 yi = self.forward_once(xi)[0]  # forward
117 |                 # cv2.imwrite('img%g.jpg' % s, 255 * xi[0].numpy().transpose((1, 2, 0))[:, :, ::-1])  # save
118 |                 yi[..., :4] /= si  # de-scale
119 |                 if fi == 2:
120 |                     yi[..., 1] = img_size[0] - yi[..., 1]  # de-flip ud
121 |                 elif fi == 3:
122 |                     yi[..., 0] = img_size[1] - yi[..., 0]  # de-flip lr
123 |                 y.append(yi)
124 |             return torch.cat(y, 1), None  # augmented inference, train
125 |         else:
126 |             return self.forward_once(x, profile)  # single-scale inference, train
127 | 
128 |     def forward_once(self, x, profile=False):
129 |         y, dt = [], []  # outputs
130 |         for m in self.model:
131 |             if m.f != -1:  # if not from previous layer
132 |                 x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers
133 | 
134 |             if profile:
135 |                 try:
136 |                     import thop
137 |                     o = thop.profile(m, inputs=(x,), verbose=False)[0] / 1E9 * 2  # FLOPS
138 |                 except:
139 |                     o = 0
140 |                 t = time_synchronized()
141 |                 for _ in range(10):
142 |                     _ = m(x)
143 |                 dt.append((time_synchronized() - t) * 100)
144 |                 print('%10.1f%10.0f%10.1fms %-40s' % (o, m.np, dt[-1], m.type))
145 | 
146 |             x = m(x)  # run
147 |             y.append(x if m.i in self.save else None)  # save output
148 | 
149 |         if profile:
150 |             print('%.1fms total' % sum(dt))
151 |         return x
152 | 
153 |     def _initialize_biases(self, cf=None):  # initialize biases into Detect(), cf is class frequency
154 |         # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1.
155 |         m = self.model[-1]  # Detect() module
156 |         for mi, s in zip(m.m, m.stride):  # from
157 |             b = mi.bias.view(m.na, -1)  # conv.bias(255) to (3,85)
158 |             b[:, 4] += math.log(8 / (640 / s) ** 2)  # obj (8 objects per 640 image)
159 |             b[:, 5:] += math.log(0.6 / (m.nc - 0.99)) if cf is None else torch.log(cf / cf.sum())  # cls
160 |             mi.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
161 | 
162 |     def _print_biases(self):
163 |         m = self.model[-1]  # Detect() module
164 |         for mi in m.m:  # from
165 |             b = mi.bias.detach().view(m.na, -1).T  # conv.bias(255) to (3,85)
166 |             print(('%6g Conv2d.bias:' + '%10.3g' * 6) % (mi.weight.shape[1], *b[:5].mean(1).tolist(), b[5:].mean()))
167 | 
168 |     # def _print_weights(self):
169 |     #     for m in self.model.modules():
170 |     #         if type(m) is Bottleneck:
171 |     #             print('%10.3g' % (m.w.detach().sigmoid() * 2))  # shortcut weights
172 | 
173 |     def fuse(self):  # fuse model Conv2d() + BatchNorm2d() layers
174 |         print('Fusing layers... ')
175 |         for m in self.model.modules():
176 |             if type(m) is Conv:
177 |                 m._non_persistent_buffers_set = set()  # pytorch 1.6.0 compatability
178 |                 m.conv = fuse_conv_and_bn(m.conv, m.bn)  # update conv
179 |                 m.bn = None  # remove batchnorm
180 |                 m.forward = m.fuseforward  # update forward
181 |         self.info()
182 |         return self
183 | 
    def info(self):
        """Print model information by delegating to ``model_info``."""
        model_info(self)
186 | 
187 | 
def parse_model(d, ch):  # model_dict, input_channels(3)
    """Build the layer stack described by a model dict.

    Args:
        d: model dict with 'anchors', 'nc', 'depth_multiple', 'width_multiple',
            'backbone' and 'head' entries; each backbone/head row is
            ``[from, number, module, args]``. The ``args`` lists are updated in
            place with their evaluated values.
        ch: list of channel counts; the output channel count of every layer
            built here is appended to it.

    Returns:
        ``(nn.Sequential of all layers, sorted list of layer indices whose
        outputs later layers read)``.
    """
    logger.info('\n%3s%18s%3s%10s  %-40s%-30s' % ('', 'from', 'n', 'params', 'module', 'arguments'))
    anchors, nc, gd, gw = d['anchors'], d['nc'], d['depth_multiple'], d['width_multiple']
    na = (len(anchors[0]) // 2) if isinstance(anchors, list) else anchors  # number of anchors
    no = na * (nc + 5)  # number of outputs = anchors * (classes + 5)

    layers, save, c2 = [], [], ch[-1]  # layers, savelist, ch out
    for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']):  # from, number, module, args
        # NOTE(security): module names and string arguments are eval()'d, so only
        # trusted model configs may be loaded. eval() resolves names against the
        # locals of this function (presumably e.g. nc / anchors), so keep those
        # local names stable.
        m = eval(m) if isinstance(m, str) else m  # eval strings
        for j, a in enumerate(args):
            try:
                args[j] = eval(a) if isinstance(a, str) else a  # eval strings
            except Exception:  # not an evaluable expression: keep the raw string
                pass

        n = max(round(n * gd), 1) if n > 1 else n  # depth gain
        if m in [nn.Conv2d, Conv, Bottleneck, SPP, DWConv, MixConv2d, Focus, CrossConv, BottleneckCSP, C3]:
            c1, c2 = ch[f], args[0]
            # scale the width unless this layer emits the raw detection outputs
            c2 = make_divisible(c2 * gw, 8) if c2 != no else c2

            args = [c1, c2, *args[1:]]
            if m in [BottleneckCSP, C3]:
                args.insert(2, n)  # repeat count is passed to the module itself
                n = 1
        elif m is nn.BatchNorm2d:
            args = [ch[f]]
        elif m is Concat:
            c2 = sum([ch[-1 if x == -1 else x + 1] for x in f])
        elif m is Detect:
            args.append([ch[x + 1] for x in f])
            if isinstance(args[1], int):  # number of anchors
                args[1] = [list(range(args[1] * 2))] * len(f)
        else:
            c2 = ch[f]

        m_ = nn.Sequential(*[m(*args) for _ in range(n)]) if n > 1 else m(*args)  # module
        t = str(m)[8:-2].replace('__main__.', '')  # module type
        n_params = sum([x.numel() for x in m_.parameters()])  # number params
        m_.i, m_.f, m_.type, m_.np = i, f, t, n_params  # attach index, 'from' index, type, number params
        logger.info('%3s%18s%3s%10.0f  %-40s%-30s' % (i, f, n, n_params, t, args))  # print
        save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1)  # append to savelist
        layers.append(m_)
        ch.append(c2)
    return nn.Sequential(*layers), sorted(save)
249 | 
250 | 
if __name__ == '__main__':
    # Command line: which model definition to build and which device to put it on.
    cli = argparse.ArgumentParser()
    cli.add_argument('--cfg', type=str, default='yolov5s.yaml', help='model.yaml')
    cli.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    opt = cli.parse_args()
    opt.cfg = check_file(opt.cfg)  # check file

    set_logging()
    device = select_device(opt.device)

    # Create model and switch it to training mode
    model = Model(opt.cfg).to(device)
    model.train()
263 | 
264 |     # Profile
265 |     # img = torch.rand(8 if torch.cuda.is_available() else 1, 3, 640, 640).to(device)
266 |     # y = model(img, profile=True)
267 | 
268 |     # ONNX export
269 |     # model.model[-1].export = True
270 |     # torch.onnx.export(model, img, opt.cfg.replace('.yaml', '.onnx'), verbose=True, opset_version=11)
271 | 
272 |     # Tensorboard
273 |     # from torch.utils.tensorboard import SummaryWriter
274 |     # tb_writer = SummaryWriter()
275 |     # print("Run 'tensorboard --logdir=models/runs' to view tensorboard at http://localhost:6006/")
276 |     # tb_writer.add_graph(model.model, img)  # add model to tensorboard
277 |     # tb_writer.add_image('test', img[0], dataformats='CWH')  # add model to tensorboard
278 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | sys.path.append('yolov5')
  3 | 
  4 | from models.common import *
  5 | from utils.torch_utils import *
  6 | from utils.datasets import *
  7 | from yolo import *
  8 | import tensorrt as trt
  9 | import pycuda.driver as cuda
 10 | import pycuda.autoinit
 11 | import time
 12 | import copy
 13 | import numpy as np
 14 | import os
 15 | from onnxsim import simplify
 16 | import onnx
 17 | import struct
 18 | import yaml
 19 | import torchvision
 20 | 
 21 | device = select_device('0')
 22 | weights = 'yolov5s.pt'
 23 | model_config = 'yolov5s.yaml'
 24 | TRT_LOGGER = trt.Logger(trt.Logger.INFO)
 25 | EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
 26 | image_loader = LoadImages('yolov5/inference/images', img_size=640)
 27 | image_loader.__iter__()
 28 | _, input_img, _, _ = image_loader.__next__()
 29 | input_img = input_img.astype(np.float)
 30 | input_img /= 255.0
 31 | input_img = np.expand_dims(input_img, axis=0)
 32 | with open(model_config) as f:
 33 |     cfg = yaml.load(f, Loader=yaml.FullLoader)  # model dict
 34 |     num_classes = cfg['nc']
 35 | 
 36 | # nms config
 37 | conf_thres = 0.4
 38 | iou_thres = 0.5
 39 | max_det = 300
 40 | # nms GPU
 41 | topK = 512 # max supported is 4096, if conf_thres is low, such as 0.001, use larger number.
 42 | keepTopK = max_det
 43 | 
 44 | 
 45 | def GiB(val):
 46 |     return val * 1 << 30
 47 | 
 48 | 
 49 | #  different from yolov5/utils/non_max_suppression, xywh2xyxy(x[:, :4]) is no longer needed (contained in Detect())
 50 | def non_max_suppression(prediction, conf_thres=0.1, iou_thres=0.6, merge=False, classes=None, agnostic=False):
 51 |     """Performs Non-Maximum Suppression (NMS) on inference results
 52 | 
 53 |     Returns:
 54 |          detections with shape: nx6 (x1, y1, x2, y2, conf, cls)
 55 |     """
 56 |     if prediction.dtype is torch.float16:
 57 |         prediction = prediction.float()  # to FP32
 58 | 
 59 |     nc = prediction[0].shape[1] - 5  # number of classes
 60 |     xc = prediction[..., 4] > conf_thres  # candidates
 61 | 
 62 |     # Settings
 63 |     min_wh, max_wh = 2, 4096  # (pixels) minimum and maximum box width and height
 64 |     max_det = 300  # maximum number of detections per image
 65 |     time_limit = 10.0  # seconds to quit after
 66 |     redundant = True  # require redundant detections
 67 |     multi_label = nc > 1  # multiple labels per box (adds 0.5ms/img)
 68 | 
 69 |     t = time.time()
 70 |     output = [None] * prediction.shape[0]
 71 |     for xi, x in enumerate(prediction):  # image index, image inference
 72 |         # Apply constraints
 73 |         # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0  # width-height
 74 |         x = x[xc[xi]]  # confidence
 75 | 
 76 |         # If none remain process next image
 77 |         if not x.shape[0]:
 78 |             continue
 79 | 
 80 |         # Compute conf
 81 |         x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf
 82 | 
 83 |         # Box (center x, center y, width, height) to (x1, y1, x2, y2)
 84 |         box = x[:, :4] #xywh2xyxy(x[:, :4])
 85 | 
 86 |         # Detections matrix nx6 (xyxy, conf, cls)
 87 |         if multi_label:
 88 |             i, j = (x[:, 5:] > conf_thres).nonzero().t()
 89 |             x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
 90 |         else:  # best class only
 91 |             conf, j = x[:, 5:].max(1, keepdim=True)
 92 |             x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]
 93 | 
 94 |         # Filter by class
 95 |         if classes:
 96 |             x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
 97 | 
 98 |         # Apply finite constraint
 99 |         # if not torch.isfinite(x).all():
100 |         #     x = x[torch.isfinite(x).all(1)]
101 | 
102 |         # If none remain process next image
103 |         n = x.shape[0]  # number of boxes
104 |         if not n:
105 |             continue
106 | 
107 |         # Sort by confidence
108 |         # x = x[x[:, 4].argsort(descending=True)]
109 | 
110 |         # Batched NMS
111 |         c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
112 |         boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
113 |         i = torchvision.ops.boxes.nms(boxes, scores, iou_thres)
114 |         if i.shape[0] > max_det:  # limit detections
115 |             i = i[:max_det]
116 |         if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
117 |             try:  # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
118 |                 iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
119 |                 weights = iou * scores[None]  # box weights
120 |                 x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)  # merged boxes
121 |                 if redundant:
122 |                     i = i[iou.sum(1) > 1]  # require redundancy
123 |             except:  # possible CUDA error https://github.com/ultralytics/yolov3/issues/1139
124 |                 print(x, i, x.shape, i.shape)
125 |                 pass
126 | 
127 |         output[xi] = x[i]
128 |         if (time.time() - t) > time_limit:
129 |             break  # time limit exceeded
130 | 
131 |     return output
132 | 
133 | 
def load_model():
    """Build the model from ``model_config`` and load matching weights from ``weights``.

    Only checkpoint tensors whose key exists in the freshly built model and
    whose element count matches are loaded (non-strict); everything else keeps
    its initial value.

    Returns:
        The model in eval mode, on the CPU.
    """
    # Load model
    model = Model(model_config)
    # NOTE(security): torch.load unpickles the checkpoint; only load trusted weight files.
    ckpt = torch.load(weights, map_location=torch.device('cpu'))
    target_state = model.state_dict()  # built once instead of once per checkpoint key
    ckpt['model'] = {
        k: v for k, v in ckpt['model'].state_dict().items()
        if k in target_state and target_state[k].numel() == v.numel()
    }
    model.load_state_dict(ckpt['model'], strict=False)
    model.eval()
    return model
143 | 
144 | 
def export_onnx(model, batch_size):
    """Export ``model`` to 'yolov5_<batch_size>.onnx'.

    The export is traced with an all-zero input of shape
    (batch_size, 3, H, W), where H and W are the last two dimensions of the
    module-level ``input_img``.
    """
    _, _, height, width = input_img.shape
    dummy_input = torch.zeros((batch_size, 3, height, width))
    target_path = 'yolov5_{}.onnx'.format(batch_size)
    torch.onnx.export(
        model,
        dummy_input,
        target_path,
        input_names=["data"],
        output_names=["prediction"],
        verbose=True,
        opset_version=11,
        operator_export_type=torch.onnx.OperatorExportTypes.ONNX,
    )
151 | 
152 | 
def simplify_onnx(onnx_path):
    """Simplify the ONNX model at ``onnx_path`` and overwrite the file with the result.

    Raises:
        AssertionError: if the simplifier reports that the simplified model
            could not be validated. Raised explicitly so the check is not
            stripped when Python runs with ``-O``; nothing is saved then.
    """
    model = onnx.load(onnx_path)
    model_simp, check = simplify(model)
    if not check:
        raise AssertionError("Simplified ONNX model could not be validated")
    onnx.save(model_simp, onnx_path)
158 | 
159 | 
def build_engine(onnx_path, using_half):
    """Return a TensorRT engine for ``onnx_path`` with a BatchedNMS_TRT plugin appended.

    If a file named like ``onnx_path`` but ending in '.engine' exists, it is
    deserialized and returned as-is (this function never writes that file).
    Otherwise the ONNX model is parsed, its single output is split into boxes,
    objectness and class scores, scores are multiplied by objectness, and the
    NMS plugin is attached; its four outputs become the network outputs.

    Args:
        onnx_path: path of the (simplified) ONNX model.
        using_half: enable the FP16 builder flag.

    Returns:
        The engine, or None when the ONNX file cannot be parsed.
    """
    trt.init_libnvinfer_plugins(None, '')
    engine_file = onnx_path.replace(".onnx", ".engine")
    if os.path.exists(engine_file):
        with open(engine_file, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(EXPLICIT_BATCH) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_batch_size = 1 # always 1 for explicit batch
        config = builder.create_builder_config()
        config.max_workspace_size = GiB(1)
        if using_half:
            config.set_flag(trt.BuilderFlag.FP16)
        # Load the Onnx model and parse it in order to populate the TensorRT network.
        with open(onnx_path, 'rb') as model:
            if not parser.parse(model.read()):
                print ('ERROR: Failed to parse the ONNX file.')
                for error in range(parser.num_errors):
                    print (parser.get_error(error))
                return None

        # The raw prediction stops being a network output; the NMS outputs replace it.
        previous_output = network.get_output(0)
        network.unmark_output(previous_output)

        # slice boxes, obj_score, class_scores
        strides = trt.Dims([1,1,1])
        starts = trt.Dims([0,0,0])
        bs, num_boxes, _ = previous_output.shape
        shapes = trt.Dims([bs, num_boxes, 4])
        boxes = network.add_slice(previous_output, starts, shapes, strides)
        starts[2] = 4
        shapes[2] = 1
        obj_score = network.add_slice(previous_output, starts, shapes, strides)
        starts[2] = 5
        shapes[2] = num_classes
        scores = network.add_slice(previous_output, starts, shapes, strides)

        # Gathering index 0 num_classes times repeats the objectness along the last axis.
        indices = network.add_constant(trt.Dims([num_classes]), trt.Weights(np.zeros(num_classes, np.int32)))
        gather_layer = network.add_gather(obj_score.get_output(0), indices.get_output(0), 2)

        # scores = obj_score * class_scores => [bs, num_boxes, nc]
        updated_scores = network.add_elementwise(gather_layer.get_output(0), scores.get_output(0), trt.ElementWiseOperation.PROD)

        # reshape box to [bs, num_boxes, 1, 4]
        reshaped_boxes = network.add_shuffle(boxes.get_output(0))
        reshaped_boxes.reshape_dims = trt.Dims([0,0,1,4])

        # add batchedNMSPlugin, inputs:[boxes:(bs, num, 1, 4), scores:(bs, num, 1)]
        trt.init_libnvinfer_plugins(TRT_LOGGER, "")
        registry = trt.get_plugin_registry()
        assert(registry)
        creator = registry.get_plugin_creator("BatchedNMS_TRT", "1")
        assert(creator)

        def int_field(name, value):
            # np.int32 matches the declared INT32 field type; the old np.int alias
            # was 64-bit and no longer exists in NumPy >= 1.24.
            return trt.PluginField(name, np.array([value], dtype=np.int32), trt.PluginFieldType.INT32)

        def float_field(name, value):
            return trt.PluginField(name, np.array([value], dtype=np.float32), trt.PluginFieldType.FLOAT32)

        fc = [
            int_field("shareLocation", 1),
            int_field("backgroundLabelId", -1),
            int_field("numClasses", num_classes),
            int_field("topK", topK),
            int_field("keepTopK", keepTopK),
            float_field("scoreThreshold", conf_thres),
            float_field("iouThreshold", iou_thres),
            int_field("isNormalized", 0),
            int_field("clipBoxes", 0),
        ]

        fc = trt.PluginFieldCollection(fc)
        nms_layer = creator.create_plugin("nms_layer", fc)

        layer = network.add_plugin_v2([reshaped_boxes.get_output(0), updated_scores.get_output(0)], nms_layer)
        layer.get_output(0).name = "num_detections"
        layer.get_output(1).name = "nmsed_boxes"
        layer.get_output(2).name = "nmsed_scores"
        layer.get_output(3).name = "nmsed_classes"
        for i in range(4):
            network.mark_output(layer.get_output(i))

        return builder.build_engine(network, config)
235 | 
236 | 
def allocate_buffers(engine, is_explicit_batch=False, dynamic_shapes=None):
    """Allocate a page-locked host buffer and a device buffer for every engine binding.

    Args:
        engine: TensorRT engine whose bindings are iterated.
        is_explicit_batch: accepted for compatibility; not used here.
        dynamic_shapes: sequence whose first element replaces a leading
            dimension of -1 in a binding shape. Required (non-empty) only when
            such a binding exists.

    Returns:
        ``(inputs, outputs, bindings)``: lists of host/device buffer pairs for
        input and output bindings, and the device addresses of all bindings in
        iteration order.
    """
    if dynamic_shapes is None:  # avoid a shared mutable default argument
        dynamic_shapes = []
    inputs = []
    outputs = []
    bindings = []

    class HostDeviceMem(object):
        """Pair of a host buffer and its device counterpart."""

        def __init__(self, host_mem, device_mem):
            self.host = host_mem
            self.device = device_mem

        def __str__(self):
            return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

        def __repr__(self):
            return self.__str__()

    for binding in engine:
        dims = engine.get_binding_shape(binding)
        print(dims)
        if dims[0] == -1:
            assert(len(dynamic_shapes) > 0)
            dims[0] = dynamic_shapes[0]
        size = trt.volume(dims) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings
272 | 
273 | 
def profile_trt(engine, batch_size, num_warmups=10, num_iters=100):
    """Time TensorRT inference on the sample image and return the last detections.

    Each iteration copies the input to the device, executes the engine and
    copies all outputs back, synchronizing the stream after every stage.
    Iterations with index below ``num_warmups`` are excluded from the printed
    averages.

    Returns:
        A one-element list holding an array with one row per detection of the
        first image: boxes, score and class concatenated along the last axis.
    """
    assert(engine is not None)
    input_img_array = np.array([input_img] * batch_size)

    yolo_inputs, yolo_outputs, yolo_bindings = allocate_buffers(engine, True)

    stream = cuda.Stream()
    with engine.create_execution_context() as context:

        sum_all = 0.
        sum_compute = 0.
        sum_pre = 0.
        sum_post = 0.
        for iteration in range(num_iters):
            pre_t = time.time()
            # set host data
            yolo_inputs[0].host = torch.from_numpy(input_img_array).float().numpy()
            for inp in yolo_inputs:
                cuda.memcpy_htod_async(inp.device, inp.host, stream)
            stream.synchronize()
            start_t = time.time()
            context.execute_async_v2(bindings=yolo_bindings, stream_handle=stream.handle)
            stream.synchronize()
            end_t = time.time()
            for out in yolo_outputs:
                cuda.memcpy_dtoh_async(out.host, out.device, stream)
            stream.synchronize()
            post_t = time.time()

            if iteration < num_warmups:
                continue  # warm-up iterations are not measured
            sum_all += post_t - pre_t
            sum_compute += end_t - start_t
            sum_post += post_t - end_t
            sum_pre += start_t - pre_t

        print("avg GPU time: {}".format(sum_all/(num_iters - num_warmups)))
        print("avg GPU compute time: {}".format(sum_compute/(num_iters - num_warmups)))
        print("avg pre time: {}".format(sum_pre/(num_iters - num_warmups)))
        print("avg post time: {}".format(sum_post/(num_iters - num_warmups)))

        # Outputs are flat host buffers in the order they were marked on the network.
        num_det = int(yolo_outputs[0].host[0, ...])
        kept = slice(0, num_det)
        boxes = np.array(yolo_outputs[1].host).reshape(batch_size, -1, 4)[0, kept, 0:4]
        scores = np.array(yolo_outputs[2].host).reshape(batch_size, -1, 1)[0, kept, 0:1]
        classes = np.array(yolo_outputs[3].host).reshape(batch_size, -1, 1)[0, kept, 0:1]

        return [np.concatenate([boxes, scores, classes], -1)]
323 | 
324 | 
def profile_torch(model, using_half, batch_size, num_warmups=10, num_iters=100):
    """Time PyTorch inference plus NMS on the sample image and return the last detections.

    Args:
        model: model to benchmark; moved to the module-level ``device``.
        using_half: run the model and the input in FP16.
        batch_size: accepted for symmetry with ``profile_trt``; not used here,
            the module-level ``input_img`` is fed as-is.
        num_warmups: number of leading iterations excluded from the averages.
        num_iters: total number of iterations.

    Returns:
        A one-element list holding the detections of the first image as a
        numpy array (an empty 0x6 array when NMS reports none).
    """
    model.to(device)

    total_duration = 0.
    total_compute_duration = 0.
    total_pre_duration = 0.
    total_post_duration = 0.
    if using_half:
        model.half()
    for iteration in range(num_iters):
        pre_t = time.time()
        # set host data
        img = torch.from_numpy(input_img).float().to(device)
        if using_half:
            img = img.half()
        start_t = time.time()
        pred = model(img)
        output = non_max_suppression(pred[0], conf_thres, iou_thres)
        end_t = time.time()
        for part in pred:  # copy the raw outputs back to the host (timed as "post")
            part.cpu()
        post_t = time.time()

        duration = post_t - pre_t
        compute_duration = end_t - start_t
        pre_duration = start_t - pre_t
        post_duration = post_t - end_t
        if iteration >= num_warmups:
            total_duration += duration
            total_compute_duration += compute_duration
            total_post_duration += post_duration
            total_pre_duration += pre_duration

    print("avg GPU time: {}".format(total_duration/(num_iters - num_warmups)))
    print("avg GPU compute time: {}".format(total_compute_duration/(num_iters - num_warmups)))
    print("avg pre time: {}".format(total_pre_duration/(num_iters - num_warmups)))
    print("avg post time: {}".format(total_post_duration/(num_iters - num_warmups)))

    detections = output[0]
    if detections is None:  # NMS may report "no detections" as None
        return [np.zeros((0, 6), dtype=np.float32)]
    return [detections.cpu().numpy()]
364 | 
365 | 
if __name__ == '__main__':
    batch_size = 1 # only works for TRT. perf reported by torch is working on non-batched data.
    using_half = False
    onnx_path = 'yolov5_{}.onnx'.format(batch_size)

    with torch.no_grad():
        # Export the PyTorch model, then simplify the exported ONNX file in place.
        model = load_model()
        export_onnx(model, batch_size)
        simplify_onnx(onnx_path)

        # Benchmark TensorRT first, then PyTorch on the same model.
        engine = build_engine(onnx_path, using_half)
        trt_result = profile_trt(engine, batch_size, 10, 100)
        if using_half:
            model.half()
        torch_result = profile_torch(model, using_half, batch_size, 10, 100)

        print(trt_result)
        print(torch_result)
383 | 
384 |     
385 | 


--------------------------------------------------------------------------------