├── .gitignore
├── CHANGELOG.md
├── CMakeLists.example.txt
├── LICENSE
├── README.md
├── assets
│   ├── 000000000143-det.jpg
│   ├── 000000000143-py-det.jpg
│   ├── 000000000143-py-seg.jpg
│   ├── 000000000143-seg.jpg
│   ├── 000000000144-det.jpg
│   ├── 000000000144-py-det.jpg
│   ├── 000000000144-py-seg.jpg
│   ├── 000000000144-seg.jpg
│   ├── 000000000382-kpt-cpp.jpg
│   ├── 000000000382-kpt-py.jpg
│   ├── clion_screen.png
│   └── export.png
├── checkpoints
│   ├── yolov8n-seg.onnx
│   └── yolov8n.onnx
├── images
│   ├── 000000000143.jpg
│   ├── 000000000144.jpg
│   └── 000000000382.jpg
├── include
│   ├── constants.h
│   ├── nn
│   │   ├── autobackend.h
│   │   └── onnx_model_base.h
│   └── utils
│       ├── augment.h
│       ├── common.h
│       └── ops.h
└── src
    ├── main.cpp
    ├── nn
    │   ├── autobackend.cpp
    │   └── onnx_model_base.cpp
    └── utils
        ├── augment.cpp
        ├── common.cpp
        └── ops.cpp

/.gitignore:
--------------------------------------------------------------------------------
# ide related
.vs/*
.idea/*
# build
.build/*
x64/*
cmake-build-debug/*
cmake-build-debug-visual-studio/*
# ignore CMakeLists.txt but add example file
CMakeLists.txt
# dlls:
*.dll
# vs-like objects
*.sln
*vcxproj*
packages.config
packages/*
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
# Changelog

## 2024-05-09
### Fixed 🔨
* Fixed a memory leak by deleting the `blob` during the `predict_once` method call:
  [PR#7](https://github.com/FourierMourier/yolov8-onnx-cpp/pull/7), by [@dusionlike](https://github.com/dusionlike)

## 2024-04-22
### Fixed 🔨
* Fixed returning unscaled coords for the keypoint task.
  [PR#5](https://github.com/FourierMourier/yolov8-onnx-cpp/pull/5), by [@youngday](https://github.com/youngday)
* Fixed a compilation issue on Linux caused by using the same type for the `model_path` arg
  in the `Ort::Session` constructor on both Windows and Linux.
  [PR#4](https://github.com/FourierMourier/yolov8-onnx-cpp/pull/4),
  by [@bhavya-goyal-22](https://github.com/bhavya-goyal-22) and [FourierMourier](https://github.com/FourierMourier)
--------------------------------------------------------------------------------
/CMakeLists.example.txt:
--------------------------------------------------------------------------------
CMAKE_MINIMUM_REQUIRED(VERSION 3.0.0)
project(YOLOv8CPP)

SET (OpenCV_DIR your/path/to/opencv/build/x64/vc16/lib)  # opencv lib root
SET (OpenCV_BIN_DIR your/path/to/opencv/build/x64/vc16/bin)  # opencv bin root

SET (OpenCV_DEBUG_DLL_FILENAME opencv_world480d.dll)  # change filenames
SET (OpenCV_RELEASE_DLL_FILENAME opencv_world480.dll)  # change filenames

SET (ONNXRUNTIME_DIR your/path/to/onnxruntime-win-x64-1.15.1)  # onnxruntime root

FIND_PACKAGE(OpenCV REQUIRED)

# --- Configure your project files ---
include_directories(include)  # Include your header files directory

# Recursively collect all source files under 'src' directory
file(GLOB_RECURSE CURR_SOURCES src/*.cpp)

# Create the executable
add_executable(YOLOv8CPP ${CURR_SOURCES})

SET(CMAKE_CXX_STANDARD 17)
SET(CMAKE_CXX_STANDARD_REQUIRED ON)

TARGET_INCLUDE_DIRECTORIES(YOLOv8CPP PRIVATE "${ONNXRUNTIME_DIR}/include")

target_compile_features(YOLOv8CPP PRIVATE cxx_std_17)

TARGET_LINK_LIBRARIES(YOLOv8CPP ${OpenCV_LIBS})

if (WIN32)
    TARGET_LINK_LIBRARIES(YOLOv8CPP "${ONNXRUNTIME_DIR}/lib/onnxruntime.lib")

    # some changes to the original version:
    # copy onnxruntime dll
    add_custom_command(TARGET YOLOv8CPP POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
        "${ONNXRUNTIME_DIR}/lib/onnxruntime.dll"
        "$<TARGET_FILE_DIR:YOLOv8CPP>"
    )
    # copy opencv
    #[[ add_custom_command(TARGET YOLOv8CPP POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
        "${OpenCV_DIR}/${OpenCV_DLL_FILENAME}"
        "$<TARGET_FILE_DIR:YOLOv8CPP>"
    )]]
    add_custom_command(TARGET YOLOv8CPP POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
        "${OpenCV_BIN_DIR}/${OpenCV_DEBUG_DLL_FILENAME}"
        "$<TARGET_FILE_DIR:YOLOv8CPP>"
    )
    # add release
    add_custom_command(TARGET YOLOv8CPP POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
        "${OpenCV_BIN_DIR}/${OpenCV_RELEASE_DLL_FILENAME}"
        "$<TARGET_FILE_DIR:YOLOv8CPP>"
    )

endif(WIN32)

if (UNIX)
    TARGET_LINK_LIBRARIES(YOLOv8CPP "${ONNXRUNTIME_DIR}/lib/libonnxruntime.so")
endif(UNIX)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2023 Elshat Akmaev

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# yolov8-onnx-cpp

## Changelog
See the changelog [here](CHANGELOG.md)

## Description
Hello there! yolov8-onnx-cpp is a C++ demo implementation of the YOLOv8 model using the ONNX library.
This project is based on the YOLOv8 model by Ultralytics.
I aimed to replicate the behavior of the Python version and achieve consistent results across various image sizes.

By the way, you don't need to specify names, imgsz, etc. while initializing the model, since we can use the ONNX metadata!

When you export a model to ONNX format in Python, the following code executes:
```python
self.metadata = {
    'description': description,
    'author': 'Ultralytics',
    'license': 'AGPL-3.0 https://ultralytics.com/license',
    'date': datetime.now().isoformat(),
    'version': __version__,
    'stride': int(max(model.stride)),
    'task': model.task,
    'batch': self.args.batch,
    'imgsz': self.imgsz,
    'names': model.names}  # model metadata
if model.task == 'pose':
    self.metadata['kpt_shape'] = model.model[-1].kpt_shape
```

(ultralytics 8.0.160, ultralytics/engine/exporter.py, lines 221-233)

We can use these parameters at least to define the stride, task, names, and image size, as described in the schema below:

![Schema](assets/export.png)

## Supported Tasks and Hardware

| Task     | Supported |
|----------|-----------|
| Detect   | ✔️        |
| Segment  | ✔️        |
| Pose     | ✔️        |
| Classify |           |


| Hardware | Supported |
|----------|-----------|
| CPU      | ✔️        |
| GPU      |           |


## Comparison between Python and C++

I exported `yolov8n.pt`, `yolov8n-seg.pt`, `yolov8n-pose.pt` to ONNX format with an input size of [480, 640] ([height, width]).
For the test I used some images from the COCO128 dataset with image sizes different from the specified input.
This difference in sizes triggered letterboxing. I maintained consistent parameters,
setting `conf=0.3` and `iou=0.5` for all models.
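
For reference, these ONNX checkpoints can be reproduced with the `ultralytics` Python package; a minimal sketch (assuming an ultralytics 8.x release — flag names may vary between versions):

```python
from ultralytics import YOLO

# Export each checkpoint to ONNX with a fixed [height, width] input of [480, 640];
# the metadata shown above (stride, task, names, imgsz, ...) is embedded automatically.
for weights in ("yolov8n.pt", "yolov8n-seg.pt", "yolov8n-pose.pt"):
    YOLO(weights).export(format="onnx", imgsz=[480, 640])  # writes yolov8n.onnx, etc.
```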

Here are the comparison results:


### Pose
Python Result

![Python Result](assets/000000000382-kpt-py.jpg)

C++ Result

![C++ Result](assets/000000000382-kpt-cpp.jpg)

### Segmentation
Python Result 1

![Python Result](assets/000000000143-py-seg.jpg)

C++ Result 1

![C++ Result](assets/000000000143-seg.jpg)

Python Result 2

![Python Result](assets/000000000144-py-seg.jpg)

C++ Result 2

![C++ Result](assets/000000000144-seg.jpg)

### Object detection

Python Result 1

![Python Result](assets/000000000143-py-det.jpg)

C++ Result 1

![C++ Result](assets/000000000143-det.jpg)

Python Result 2

![Python Result](assets/000000000144-py-det.jpg)

C++ Result 2

![C++ Result](assets/000000000144-det.jpg)

## Getting Started
To get started with yolov8-onnx-cpp, follow these steps:

1. Clone the repository:
```shell
git clone https://github.com/FourierMourier/yolov8-onnx-cpp.git
```
2. Set up the additional libraries:

Download [OpenCV here](https://opencv.org/releases/) (4.8.0+)

### Solution-like build
* onnxruntime (1.15+) (NuGet package)

### CMake in a non-Visual Studio-like IDE
Copy `CMakeLists.example.txt` as `CMakeLists.txt` (git-ignored) and edit the following lines:
```cmake

SET (OpenCV_DIR your/path/to/opencv/build/x64/vc16/lib)  # opencv lib root
SET (OpenCV_BIN_DIR your/path/to/opencv/build/x64/vc16/bin)  # opencv bin root

SET (OpenCV_DEBUG_DLL_FILENAME opencv_world480d.dll)  # change filenames
SET (OpenCV_RELEASE_DLL_FILENAME opencv_world480.dll)  # change filenames

SET (ONNXRUNTIME_DIR your/path/to/onnxruntime-win-x64-1.15.1)  # onnxruntime root

```

Even though you'll find only a NuGet package on the [official page](https://onnxruntime.ai/docs/install/#cccwinml-installs),
you can still download a release suitable for CMake for your platform here:
https://github.com/microsoft/onnxruntime/releases

If you're working in a different IDE like CLion rather than Visual Studio, you still have to do the following:

* Install Visual Studio: If you haven't already, consider installing Visual Studio on your Windows system.
You can download the Visual Studio Community edition for free from the official Microsoft website.
Ensure that you select the components necessary for C++ development.
* Configure CLion to Use Visual Studio: Open CLion, go to "File" > "Settings" > "Build, Execution, Deployment" >
"Toolchains." In the "Environment" section, select the Visual Studio toolchain that you installed.
Make sure it points to the correct Visual Studio installation directory.

* CMake Configuration: Ensure that your CMake configuration in CLion specifies the Visual Studio generator
(e.g., "Visual Studio 2022").
This can be set in "File" > "Settings" > "Build, Execution, Deployment" > "CMake" ( > "CMake options").
![clion_screen](assets/clion_screen.png)

That way, issues like
* https://github.com/microsoft/onnxruntime/issues/1175
* https://github.com/microsoft/onnxruntime/issues/9332
* https://github.com/microsoft/onnxruntime/issues/11545
will be gone.

An issue like `"The given version [15] is not supported, only version 1 to 10 is supported in this build"`
(https://github.com/microsoft/onnxruntime/issues/11230)
also should not occur, since you configure the dll in CMake.

Hope that helps!

3. Edit `img_path` / `modelPath` in `./src/main.cpp`:
```cpp
std::string img_path = "./images/000000000143.jpg";
//const std::string& modelPath = "./checkpoints/yolov8n.onnx";  // detection
const std::string& modelPath = "./checkpoints/yolov8n-seg.onnx";  // instance segmentation
const std::string& onnx_provider = OnnxProviders::CPU;  // "cpu";
```
# Usage
Provide an input image to the application, and it will perform object detection using the YOLOv8 model.
Customize the model configuration and parameters in the code as needed, for example as in the sketch below.
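
A minimal end-to-end sketch of the API, condensed from `src/main.cpp` (paths are placeholders; note that `predict_once` takes the thresholds by reference, so they must be lvalues):

```cpp
#include <iostream>
#include <opencv2/opencv.hpp>
#include "nn/autobackend.h"
#include "constants.h"

int main() {
    // names, imgsz, stride, task, etc. are read from the ONNX metadata
    AutoBackendOnnx model("./checkpoints/yolov8n.onnx", "yolov8_inference", OnnxProviders::CPU.c_str());

    cv::Mat img = cv::imread("./images/000000000143.jpg", cv::IMREAD_UNCHANGED);
    float conf = 0.30f, iou = 0.45f, mask_thresh = 0.50f;
    // predict_once letterboxes the image, runs the session, and post-processes by task
    std::vector<YoloResults> objs = model.predict_once(img, conf, iou, mask_thresh, cv::COLOR_BGR2RGB);

    for (const YoloResults& r : objs) {
        std::cout << "class=" << r.class_idx << " conf=" << r.conf
                  << " xywh=[" << r.bbox.x << ", " << r.bbox.y << ", "
                  << r.bbox.width << ", " << r.bbox.height << "]\n";
        // r.mask / r.keypoints are filled for segmentation / pose tasks respectively
    }
    return 0;
}
```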

# References
* [YOLOv8 by Ultralytics](https://github.com/ultralytics/ultralytics)
* [ONNX](https://onnx.ai)
* [OpenCV](https://opencv.org)

# License
This project is licensed under the Apache 2.0 License - see the [LICENSE](LICENSE) file for details.

# Acknowledgments
Some other cool repositories I found useful (and you might too):
* https://github.com/winxos/yolov8_segment_onnx_in_cpp - another project implementing yolov8 segmentation in cpp
* https://github.com/cyrusbehr/YOLOv8-TensorRT-CPP - a TensorRT implementation in cpp
* https://github.com/itsnine/yolov5-onnxruntime/tree/master - yolov5 onnx in C++

This README was created with the assistance of OpenAI's ChatGPT (August 3 Version), a large language model.
You can learn more about it [here](https://chat.openai.com/chat)
--------------------------------------------------------------------------------
/assets/000000000143-det.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FourierMourier/yolov8-onnx-cpp/6f9d6716fe98febd04df553dd71f9d01423a7f2a/assets/000000000143-det.jpg
--------------------------------------------------------------------------------
/assets/000000000143-py-det.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FourierMourier/yolov8-onnx-cpp/6f9d6716fe98febd04df553dd71f9d01423a7f2a/assets/000000000143-py-det.jpg
--------------------------------------------------------------------------------
/assets/000000000143-py-seg.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FourierMourier/yolov8-onnx-cpp/6f9d6716fe98febd04df553dd71f9d01423a7f2a/assets/000000000143-py-seg.jpg
--------------------------------------------------------------------------------
/assets/000000000143-seg.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FourierMourier/yolov8-onnx-cpp/6f9d6716fe98febd04df553dd71f9d01423a7f2a/assets/000000000143-seg.jpg
--------------------------------------------------------------------------------
/assets/000000000144-det.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FourierMourier/yolov8-onnx-cpp/6f9d6716fe98febd04df553dd71f9d01423a7f2a/assets/000000000144-det.jpg
--------------------------------------------------------------------------------
/assets/000000000144-py-det.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FourierMourier/yolov8-onnx-cpp/6f9d6716fe98febd04df553dd71f9d01423a7f2a/assets/000000000144-py-det.jpg
--------------------------------------------------------------------------------
/assets/000000000144-py-seg.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FourierMourier/yolov8-onnx-cpp/6f9d6716fe98febd04df553dd71f9d01423a7f2a/assets/000000000144-py-seg.jpg
--------------------------------------------------------------------------------
/assets/000000000144-seg.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FourierMourier/yolov8-onnx-cpp/6f9d6716fe98febd04df553dd71f9d01423a7f2a/assets/000000000144-seg.jpg
--------------------------------------------------------------------------------
/assets/000000000382-kpt-cpp.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FourierMourier/yolov8-onnx-cpp/6f9d6716fe98febd04df553dd71f9d01423a7f2a/assets/000000000382-kpt-cpp.jpg
--------------------------------------------------------------------------------
/assets/000000000382-kpt-py.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FourierMourier/yolov8-onnx-cpp/6f9d6716fe98febd04df553dd71f9d01423a7f2a/assets/000000000382-kpt-py.jpg
--------------------------------------------------------------------------------
/assets/clion_screen.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FourierMourier/yolov8-onnx-cpp/6f9d6716fe98febd04df553dd71f9d01423a7f2a/assets/clion_screen.png
--------------------------------------------------------------------------------
/assets/export.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FourierMourier/yolov8-onnx-cpp/6f9d6716fe98febd04df553dd71f9d01423a7f2a/assets/export.png
--------------------------------------------------------------------------------
/checkpoints/yolov8n-seg.onnx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FourierMourier/yolov8-onnx-cpp/6f9d6716fe98febd04df553dd71f9d01423a7f2a/checkpoints/yolov8n-seg.onnx
--------------------------------------------------------------------------------
/checkpoints/yolov8n.onnx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FourierMourier/yolov8-onnx-cpp/6f9d6716fe98febd04df553dd71f9d01423a7f2a/checkpoints/yolov8n.onnx
--------------------------------------------------------------------------------
/images/000000000143.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FourierMourier/yolov8-onnx-cpp/6f9d6716fe98febd04df553dd71f9d01423a7f2a/images/000000000143.jpg
--------------------------------------------------------------------------------
/images/000000000144.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FourierMourier/yolov8-onnx-cpp/6f9d6716fe98febd04df553dd71f9d01423a7f2a/images/000000000144.jpg
--------------------------------------------------------------------------------
/images/000000000382.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FourierMourier/yolov8-onnx-cpp/6f9d6716fe98febd04df553dd71f9d01423a7f2a/images/000000000382.jpg
--------------------------------------------------------------------------------
/include/constants.h:
--------------------------------------------------------------------------------
#pragma once
#include <string>

namespace MetadataConstants {
    inline const std::string IMGSZ = "imgsz";
    inline const std::string STRIDE = "stride";
    inline const std::string NC = "nc";
    inline const std::string CH = "ch";
    inline const std::string DATE = "date";
    inline const std::string VERSION = "version";
    inline const std::string TASK = "task";
    inline const std::string BATCH = "batch";
    inline const std::string NAMES = "names";
}

namespace OnnxProviders {
    inline const std::string CPU = "cpu";
    inline const std::string CUDA = "cuda";
}

namespace OnnxInitializers
{
    inline const int UNINITIALIZED_STRIDE = -1;
    inline const int UNINITIALIZED_NC = -1;
}


namespace YoloTasks
{
    inline const std::string SEGMENT = "segment";
    inline const std::string DETECT = "detect";
    inline const std::string POSE = "pose";
    inline const std::string CLASSIFY = "classify";
}
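
These keys mirror the metadata map that the Ultralytics exporter writes (see the README). For illustration only — `OnnxModelBase::getMetadata()` below wraps this kind of lookup — here is a hedged sketch of reading them directly with ONNX Runtime's C++ API (assuming a recent onnxruntime release that provides the `...Allocated` helpers; the path literal is a placeholder and uses the wide-string form required on Windows):

```cpp
#include <onnxruntime_cxx_api.h>
#include <iostream>

int main() {
    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "metadata_demo");
    Ort::SessionOptions opts;
    Ort::Session session(env, L"yolov8n.onnx", opts);  // plain char* path on Linux

    Ort::AllocatorWithDefaultOptions allocator;
    Ort::ModelMetadata metadata = session.GetModelMetadata();
    // Look up the same keys that MetadataConstants declares above
    for (const char* key : {"imgsz", "stride", "task", "names"}) {
        Ort::AllocatedStringPtr value = metadata.LookupCustomMetadataMapAllocated(key, allocator);
        if (value) std::cout << key << " = " << value.get() << std::endl;
    }
    return 0;
}
```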
--------------------------------------------------------------------------------
/include/nn/autobackend.h:
--------------------------------------------------------------------------------
#pragma once
#include <filesystem>
#include <string>
#include <vector>
#include <unordered_map>
#include <opencv2/opencv.hpp>

#include "onnx_model_base.h"
#include "constants.h"

/**
 * @brief Represents the results of YOLO prediction.
 *
 * This structure stores information about a detected object, including its class index,
 * confidence score, bounding box, semantic segmentation mask, and keypoints (if available).
 */
struct YoloResults {
    int class_idx{};                 ///< The class index of the detected object.
    float conf{};                    ///< The confidence score of the detection.
    cv::Rect_<float> bbox;           ///< The bounding box of the detected object.
    cv::Mat mask;                    ///< The semantic segmentation mask (if available).
    std::vector<float> keypoints{};  ///< Keypoints representing the object's pose (if available).
};

struct ImageInfo {
    cv::Size raw_size;  // add additional attrs if you need
};


class AutoBackendOnnx : public OnnxModelBase {
public:
    // constructors
    AutoBackendOnnx(const char* modelPath, const char* logid, const char* provider,
                    const std::vector<int>& imgsz, const int& stride,
                    const int& nc, std::unordered_map<int, std::string> names);

    AutoBackendOnnx(const char* modelPath, const char* logid, const char* provider);

    // getters
    virtual const std::vector<int>& getImgsz();
    virtual const int& getStride();
    virtual const int& getCh();
    virtual const int& getNc();
    virtual const std::unordered_map<int, std::string>& getNames();
    virtual const std::vector<int64_t>& getInputTensorShape();
    virtual const int& getWidth();
    virtual const int& getHeight();
    virtual const cv::Size& getCvSize();
    virtual const std::string& getTask();
    /**
     * @brief Runs object detection on an input image.
     *
     * This method performs object detection on the input image and returns the detected objects as YoloResults.
     *
     * @param image The input image to run object detection on.
     * @param conf The confidence threshold for object detection.
     * @param iou The intersection-over-union (IoU) threshold for non-maximum suppression.
     * @param mask_threshold The threshold for the semantic segmentation mask.
     * @param conversionCode An optional conversion code for image format conversion (e.g., cv::COLOR_BGR2RGB).
     *     Default value is -1, indicating no conversion.
     *     TODO: use some constant from some namespace rather than hardcoded values here
     *
     * @return A vector of YoloResults representing the detected objects.
     */
    virtual std::vector<YoloResults> predict_once(cv::Mat& image, float& conf, float& iou, float& mask_threshold, int conversionCode = -1, bool verbose = true);
    virtual std::vector<YoloResults> predict_once(const std::filesystem::path& imagePath, float& conf, float& iou, float& mask_threshold, int conversionCode = -1, bool verbose = true);
    virtual std::vector<YoloResults> predict_once(const std::string& imagePath, float& conf, float& iou, float& mask_threshold, int conversionCode = -1, bool verbose = true);

    virtual void fill_blob(cv::Mat& image, float*& blob, std::vector<int64_t>& inputTensorShape);
    virtual void postprocess_masks(cv::Mat& output0, cv::Mat& output1, ImageInfo para, std::vector<YoloResults>& output,
                                   int& class_names_num, float& conf_threshold, float& iou_threshold,
                                   int& iw, int& ih, int& mw, int& mh, int& masks_features_num, float mask_threshold = 0.50f);

    virtual void postprocess_detects(cv::Mat& output0, ImageInfo image_info, std::vector<YoloResults>& output,
                                     int& class_names_num, float& conf_threshold, float& iou_threshold);
    virtual void postprocess_kpts(cv::Mat& output0, ImageInfo& image_info, std::vector<YoloResults>& output,
                                  int& class_names_num, float& conf_threshold, float& iou_threshold);
    static void _get_mask2(const cv::Mat& mask_info, const cv::Mat& mask_data, const ImageInfo& image_info, cv::Rect bound, cv::Mat& mask_out,
                           float& mask_thresh, int& iw, int& ih, int& mw, int& mh, int& masks_features_num, bool round_downsampled = false);

protected:
    std::vector<int> imgsz_;
    int stride_ = OnnxInitializers::UNINITIALIZED_STRIDE;
    int nc_ = OnnxInitializers::UNINITIALIZED_NC;
    int ch_ = 3;
    std::unordered_map<int, std::string> names_;
    std::vector<int64_t> inputTensorShape_;
    cv::Size cvSize_;
    std::string task_;
    //cv::MatSize cvMatSize_;
};
--------------------------------------------------------------------------------
/include/nn/onnx_model_base.h:
--------------------------------------------------------------------------------
#pragma once
#include <onnxruntime_cxx_api.h>
#include <string>
#include <vector>
#include <unordered_map>

/*
 * This interface must provide only the arguments required to load any onnx model, regardless of model-specific info -
 * i.e. modelPath will always be required, as will a provider like "cpu" or "cuda", since these are parameters you need
 * to set up the `sessionOptions` or `session` objects properly; but image size is not needed for a pure onnx graph to be loaded, so do NOT include it here
 */
class OnnxModelBase {
public:
    OnnxModelBase(const char* modelPath, const char* logid, const char* provider);
    //OnnxModelBase(); // no default constructor should be there
    //virtual ~OnnxModelBase();
    virtual const std::vector<std::string>& getInputNames();  // = 0
    virtual const std::vector<std::string>& getOutputNames();
    virtual const std::vector<const char*> getOutputNamesCStr();
    virtual const std::vector<const char*> getInputNamesCStr();
    virtual const Ort::ModelMetadata& getModelMetadata();
    virtual const std::unordered_map<std::string, std::string>& getMetadata();
    virtual const char* getModelPath();
    virtual const Ort::Session& getSession();
    //virtual std::vector<Ort::Value> forward(std::vector<Ort::Value> inputTensors);
    virtual std::vector<Ort::Value> forward(std::vector<Ort::Value>& inputTensors);
    Ort::Session session{ nullptr };

protected:
    const char* modelPath_;
    Ort::Env env{ nullptr };

    std::vector<std::string> inputNodeNames;
    std::vector<std::string> outputNodeNames;
    Ort::ModelMetadata model_metadata{ nullptr };
    std::unordered_map<std::string, std::string> metadata;
    std::vector<const char*> outputNamesCStr;
    std::vector<const char*> inputNamesCStr;
};
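
The `forward` implementation itself is not shown in this excerpt; judging by the interface, it wraps `Ort::Session::Run`. A minimal sketch of that underlying call (the helper name and its argument wiring are illustrative, not the repo's actual code):

```cpp
#include <onnxruntime_cxx_api.h>
#include <vector>

// Sketch: the kind of call forward() wraps. inputTensors is built elsewhere
// (see predict_once in src/nn/autobackend.cpp), and the name vectors come from
// getInputNamesCStr() / getOutputNamesCStr().
std::vector<Ort::Value> run_model(Ort::Session& session,
                                  const std::vector<const char*>& inputNames,
                                  const std::vector<const char*>& outputNames,
                                  std::vector<Ort::Value>& inputTensors) {
    return session.Run(Ort::RunOptions{nullptr},
                       inputNames.data(), inputTensors.data(), inputTensors.size(),
                       outputNames.data(), outputNames.size());
}
```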
--------------------------------------------------------------------------------
/include/utils/augment.h:
--------------------------------------------------------------------------------
#pragma once
#include <utility>
#include <opencv2/opencv.hpp>

void letterbox(const cv::Mat& image,
               cv::Mat& outImage,
               const cv::Size& newShape = cv::Size(640, 640),
               cv::Scalar_<double> color = cv::Scalar(),
               bool auto_ = true,
               bool scaleFill = false,
               bool scaleUp = true,
               int stride = 32
);


cv::Mat scale_image(const cv::Mat& resized_mask, const cv::Size& im0_shape,
                    const std::pair<float, cv::Point2f>& ratio_pad = std::make_pair(-1.0f, cv::Point2f(-1.0f, -1.0f)));

void scale_image2(
    cv::Mat& scaled_mask, const cv::Mat& resized_mask, const cv::Size& im0_shape,
    const std::pair<float, cv::Point2f>& ratio_pad = std::make_pair(-1.0f, cv::Point2f(-1.0f, -1.0f))
);
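
A small usage sketch of `letterbox`. The parameter semantics are inferred from the Ultralytics counterpart and should be treated as assumptions (the implementation is not shown in this excerpt): the image is resized with its aspect ratio preserved and padded with `color` up to `newShape`; `auto_` pads only to a multiple of `stride` instead, and `scaleFill` stretches without padding.

```cpp
#include <iostream>
#include <opencv2/opencv.hpp>
#include "utils/augment.h"

int main() {
    cv::Mat img = cv::imread("images/000000000143.jpg");  // e.g. 640x480
    cv::Mat boxed;
    // Fit the image into a fixed 640x640 canvas, padding the remainder with gray
    letterbox(img, boxed, cv::Size(640, 640), cv::Scalar(114, 114, 114),
              /*auto_=*/false, /*scaleFill=*/false, /*scaleUp=*/true, /*stride=*/32);
    std::cout << boxed.cols << "x" << boxed.rows << std::endl;  // 640x640
    return 0;
}
```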
7 | * 8 | * @param img1_shape The shape (height, width) of the input image for the model. 9 | * @param box The bounding box to be scaled, specified as cv::Rect_. 10 | * @param img0_shape The shape (height, width) of the original target image. 11 | * @param ratio_pad An optional parameter that specifies scaling and padding factors as a pair of values. 12 | * The first value (ratio) is used for scaling, and the second value (pad) is used for padding. 13 | * If not provided, default values will be used. 14 | * @param padding An optional boolean parameter that specifies whether padding should be applied. 15 | * If set to true, padding will be applied to the bounding box. 16 | * 17 | * @return A scaled bounding box specified as cv::Rect_. 18 | * 19 | * This function rescales a bounding box from the shape of the input image (img1_shape) to the shape of an original image (img0_shape). 20 | */ 21 | cv::Rect_ scale_boxes(const cv::Size& img1_shape, cv::Rect_& box, const cv::Size& img0_shape, std::pair ratio_pad = std::make_pair(-1.0f, cv::Point2f(-1.0f, -1.0f)), bool padding = true); 22 | void clip_boxes(cv::Rect& box, const cv::Size& shape); 23 | void clip_boxes(cv::Rect_& box, const cv::Size& shape); 24 | void clip_boxes(std::vector& boxes, const cv::Size& shape); 25 | void clip_boxes(std::vector>& boxes, const cv::Size& shape); 26 | 27 | //void clip_coords(cv::Mat& coords, const cv::Size& shape); 28 | //cv::Mat scale_coords(const cv::Size& img1_shape, cv::Mat& coords, const cv::Size& img0_shape); 29 | void clip_coords(std::vector& coords, const cv::Size& shape); 30 | std::vector scale_coords(const cv::Size& img1_shape, std::vector& coords, const cv::Size& img0_shape); 31 | 32 | cv::Mat crop_mask(const cv::Mat& mask, const cv::Rect& box); 33 | 34 | 35 | struct NMSResult{ 36 | std::vector bboxes; 37 | std::vector confidences; 38 | std::vector classes; 39 | std::vector> rest; 40 | }; 41 | 42 | //std::tuple>, std::vector, std::vector, std::vector>> 43 | std::tuple, std::vector, std::vector, std::vector>> 44 | non_max_suppression(const cv::Mat& output0, int class_names_num, int total_features_num, double conf_threshold, float iou_threshold); -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include "nn/onnx_model_base.h" 5 | #include "nn/autobackend.h" 6 | #include 7 | #include 8 | 9 | #include "utils/augment.h" 10 | #include "constants.h" 11 | #include "utils/common.h" 12 | 13 | 14 | namespace fs = std::filesystem; 15 | 16 | 17 | // Define the skeleton and color mappings 18 | std::vector> skeleton = {{16, 14}, {14, 12}, {17, 15}, {15, 13}, {12, 13}, {6, 12}, {7, 13}, {6, 7}, 19 | {6, 8}, {7, 9}, {8, 10}, {9, 11}, {2, 3}, {1, 2}, {1, 3}, {2, 4}, {3, 5}, {4, 6}, {5, 7}}; 20 | 21 | std::vector posePalette = { 22 | cv::Scalar(255, 128, 0), cv::Scalar(255, 153, 51), cv::Scalar(255, 178, 102), cv::Scalar(230, 230, 0), cv::Scalar(255, 153, 255), 23 | cv::Scalar(153, 204, 255), cv::Scalar(255, 102, 255), cv::Scalar(255, 51, 255), cv::Scalar(102, 178, 255), cv::Scalar(51, 153, 255), 24 | cv::Scalar(255, 153, 153), cv::Scalar(255, 102, 102), cv::Scalar(255, 51, 51), cv::Scalar(153, 255, 153), cv::Scalar(102, 255, 102), 25 | cv::Scalar(51, 255, 51), cv::Scalar(0, 255, 0), cv::Scalar(0, 0, 255), cv::Scalar(255, 0, 0), cv::Scalar(255, 255, 255) 26 | }; 27 | 28 | std::vector limbColorIndices = {9, 9, 9, 9, 7, 7, 7, 0, 0, 0, 0, 0, 16, 16, 16, 16, 16, 16, 
--------------------------------------------------------------------------------
/src/main.cpp:
--------------------------------------------------------------------------------
#include <cassert>
#include <filesystem>
#include <iomanip>
#include <iostream>
#include <random>
#include <sstream>
#include <opencv2/opencv.hpp>

#include "nn/onnx_model_base.h"
#include "nn/autobackend.h"
#include "utils/augment.h"
#include "constants.h"
#include "utils/common.h"


namespace fs = std::filesystem;


// Define the skeleton and color mappings
std::vector<std::vector<int>> skeleton = {{16, 14}, {14, 12}, {17, 15}, {15, 13}, {12, 13}, {6, 12}, {7, 13}, {6, 7},
                                          {6, 8}, {7, 9}, {8, 10}, {9, 11}, {2, 3}, {1, 2}, {1, 3}, {2, 4}, {3, 5}, {4, 6}, {5, 7}};

std::vector<cv::Scalar> posePalette = {
    cv::Scalar(255, 128, 0), cv::Scalar(255, 153, 51), cv::Scalar(255, 178, 102), cv::Scalar(230, 230, 0), cv::Scalar(255, 153, 255),
    cv::Scalar(153, 204, 255), cv::Scalar(255, 102, 255), cv::Scalar(255, 51, 255), cv::Scalar(102, 178, 255), cv::Scalar(51, 153, 255),
    cv::Scalar(255, 153, 153), cv::Scalar(255, 102, 102), cv::Scalar(255, 51, 51), cv::Scalar(153, 255, 153), cv::Scalar(102, 255, 102),
    cv::Scalar(51, 255, 51), cv::Scalar(0, 255, 0), cv::Scalar(0, 0, 255), cv::Scalar(255, 0, 0), cv::Scalar(255, 255, 255)
};

std::vector<int> limbColorIndices = {9, 9, 9, 9, 7, 7, 7, 0, 0, 0, 0, 0, 16, 16, 16, 16, 16, 16, 16};
std::vector<int> kptColorIndices = {16, 16, 16, 16, 16, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9};



cv::Scalar generateRandomColor(int numChannels) {
    if (numChannels < 1 || numChannels > 3) {
        throw std::invalid_argument("Invalid number of channels. Must be between 1 and 3.");
    }

    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<int> dis(0, 255);

    cv::Scalar color;
    for (int i = 0; i < numChannels; i++) {
        color[i] = dis(gen);  // for each channel separately generate value
    }

    return color;
}

std::vector<cv::Scalar> generateRandomColors(int class_names_num, int numChannels) {
    std::vector<cv::Scalar> colors;
    for (int i = 0; i < class_names_num; i++) {
        cv::Scalar color = generateRandomColor(numChannels);
        colors.push_back(color);
    }
    return colors;
}

void plot_masks(cv::Mat img, std::vector<YoloResults>& result, std::vector<cv::Scalar> color,
                std::unordered_map<int, std::string>& names)
{
    cv::Mat mask = img.clone();
    for (int i = 0; i < result.size(); i++)
    {
        float left, top;
        left = result[i].bbox.x;
        top = result[i].bbox.y;
        int color_num = i;
        int& class_idx = result[i].class_idx;
        rectangle(img, result[i].bbox, color[result[i].class_idx], 2);

        // try to get string value corresponding to given class_idx
        std::string class_name;
        auto it = names.find(class_idx);
        if (it != names.end()) {
            class_name = it->second;
        }
        else {
            std::cerr << "Warning: class_idx not found in names for class_idx = " << class_idx << std::endl;
            // then convert it to string anyway
            class_name = std::to_string(class_idx);
        }

        if (result[i].mask.rows > 0 && result[i].mask.cols > 0)
        {
            mask(result[i].bbox).setTo(color[result[i].class_idx], result[i].mask);
        }
        std::stringstream labelStream;
        labelStream << class_name << " " << std::fixed << std::setprecision(2) << result[i].conf;
        std::string label = labelStream.str();

        cv::Size text_size = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.6, 2, nullptr);
        cv::Rect rect_to_fill(left - 1, top - text_size.height - 5, text_size.width + 2, text_size.height + 5);
        cv::Scalar text_color = cv::Scalar(255.0, 255.0, 255.0);
        rectangle(img, rect_to_fill, color[result[i].class_idx], -1);

        putText(img, label, cv::Point(left - 1.5, top - 2.5), cv::FONT_HERSHEY_SIMPLEX, 0.6, text_color, 2);
    }
    addWeighted(img, 0.6, mask, 0.4, 0, img);  // add mask to src
    resize(img, img, img.size());
    imshow("img", img);
    cv::waitKey();
}


//void plot_keypoints(cv::Mat& image, const std::vector<std::vector<float>>& keypoints, const cv::Size& shape) {
void plot_keypoints(cv::Mat& image, const std::vector<YoloResults>& results, const cv::Size& shape) {

    int radius = 5;
    bool drawLines = true;

    if (results.empty()) {
        return;
    }

    std::vector<cv::Scalar> limbColorPalette;
    std::vector<cv::Scalar> kptColorPalette;

    for (int index : limbColorIndices) {
        limbColorPalette.push_back(posePalette[index]);
    }

    for (int index : kptColorIndices) {
        kptColorPalette.push_back(posePalette[index]);
    }

    for (const auto& res : results) {
        auto keypoint = res.keypoints;
        bool isPose = keypoint.size() == 51;  // numKeypoints == 17 && 3 values (x, y, conf) per keypoint
        drawLines &= isPose;

        // draw points
        for (int i = 0; i < 17; i++) {
            int idx = i * 3;
            int x_coord = static_cast<int>(keypoint[idx]);
            int y_coord = static_cast<int>(keypoint[idx + 1]);

            if (x_coord % shape.width != 0 && y_coord % shape.height != 0) {
                if (keypoint.size() >= static_cast<size_t>(idx + 3)) {
                    float conf = keypoint[idx + 2];  // per-keypoint confidence from the (x, y, conf) triplet
                    if (conf < 0.5) {
                        continue;
                    }
                }
                cv::Scalar color_k = isPose ? kptColorPalette[i] : cv::Scalar(0, 0, 255);  // Default to red if not in pose mode
                cv::circle(image, cv::Point(x_coord, y_coord), radius, color_k, -1, cv::LINE_AA);
            }
        }
        // draw lines
        if (drawLines) {
            for (int i = 0; i < skeleton.size(); i++) {
                const std::vector<int>& sk = skeleton[i];
                int idx1 = sk[0] - 1;
                int idx2 = sk[1] - 1;

                int idx1_x_pos = idx1 * 3;
                int idx2_x_pos = idx2 * 3;

                int x1 = static_cast<int>(keypoint[idx1_x_pos]);
                int y1 = static_cast<int>(keypoint[idx1_x_pos + 1]);
                int x2 = static_cast<int>(keypoint[idx2_x_pos]);
                int y2 = static_cast<int>(keypoint[idx2_x_pos + 1]);

                float conf1 = keypoint[idx1_x_pos + 2];
                float conf2 = keypoint[idx2_x_pos + 2];

                // Check confidence thresholds
                if (conf1 < 0.5 || conf2 < 0.5) {
                    continue;
                }

                // Check if positions are within bounds
                if (x1 % shape.width == 0 || y1 % shape.height == 0 || x1 < 0 || y1 < 0 ||
                    x2 % shape.width == 0 || y2 % shape.height == 0 || x2 < 0 || y2 < 0) {
                    continue;
                }

                // Draw a line between keypoints
                cv::Scalar color_limb = limbColorPalette[i];
                cv::line(image, cv::Point(x1, y1), cv::Point(x2, y2), color_limb, 2, cv::LINE_AA);
            }
        }
    }
}

void plot_results(cv::Mat img, std::vector<YoloResults>& results,
                  std::vector<cv::Scalar> color, std::unordered_map<int, std::string>& names,
                  const cv::Size& shape
) {

    cv::Mat mask = img.clone();

    int radius = 5;
    bool drawLines = true;

    auto raw_image_shape = img.size();
    std::vector<cv::Scalar> limbColorPalette;
    std::vector<cv::Scalar> kptColorPalette;

    for (int index : limbColorIndices) {
        limbColorPalette.push_back(posePalette[index]);
    }

    for (int index : kptColorIndices) {
        kptColorPalette.push_back(posePalette[index]);
    }

    for (const auto& res : results) {
        float left = res.bbox.x;
        float top = res.bbox.y;
        int color_num = res.class_idx;

        // Draw bounding box
        rectangle(img, res.bbox, color[res.class_idx], 2);

        // Try to get the class name corresponding to the given class_idx
        std::string class_name;
        auto it = names.find(res.class_idx);
        if (it != names.end()) {
            class_name = it->second;
        }
        else {
            std::cerr << "Warning: class_idx not found in names for class_idx = " << res.class_idx << std::endl;
            // Then convert it to a string anyway
            class_name = std::to_string(res.class_idx);
        }

        // Draw mask if available
        if (res.mask.rows > 0 && res.mask.cols > 0) {
            mask(res.bbox).setTo(color[res.class_idx], res.mask);
        }

        // Create label
        std::stringstream labelStream;
        labelStream << class_name << " " << std::fixed << std::setprecision(2) << res.conf;
        std::string label = labelStream.str();

        cv::Size text_size = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.6, 2, nullptr);
        cv::Rect rect_to_fill(left - 1, top - text_size.height - 5, text_size.width + 2, text_size.height + 5);
        cv::Scalar text_color = cv::Scalar(255.0, 255.0, 255.0);
        rectangle(img, rect_to_fill, color[res.class_idx], -1);
        putText(img, label, cv::Point(left - 1.5, top - 2.5), cv::FONT_HERSHEY_SIMPLEX, 0.6, text_color, 2);

        // Check if keypoints are available
        if (!res.keypoints.empty()) {
            auto keypoint = res.keypoints;
            bool isPose = keypoint.size() == 51;  // numKeypoints == 17 && 3 values (x, y, conf) per keypoint
            drawLines &= isPose;

            // draw points
            for (int i = 0; i < 17; i++) {
                int idx = i * 3;
                int x_coord = static_cast<int>(keypoint[idx]);
                int y_coord = static_cast<int>(keypoint[idx + 1]);

                if (x_coord % raw_image_shape.width != 0 && y_coord % raw_image_shape.height != 0) {
                    if (keypoint.size() >= static_cast<size_t>(idx + 3)) {
                        float conf = keypoint[idx + 2];  // per-keypoint confidence from the (x, y, conf) triplet
                        if (conf < 0.5) {
                            continue;
                        }
                    }
                    cv::Scalar color_k = isPose ? kptColorPalette[i] : cv::Scalar(0, 0, 255);  // Default to red if not in pose mode
                    cv::circle(img, cv::Point(x_coord, y_coord), radius, color_k, -1, cv::LINE_AA);
                }
            }
            // draw lines
            if (drawLines) {
                for (int i = 0; i < skeleton.size(); i++) {
                    const std::vector<int>& sk = skeleton[i];
                    int idx1 = sk[0] - 1;
                    int idx2 = sk[1] - 1;

                    int idx1_x_pos = idx1 * 3;
                    int idx2_x_pos = idx2 * 3;

                    int x1 = static_cast<int>(keypoint[idx1_x_pos]);
                    int y1 = static_cast<int>(keypoint[idx1_x_pos + 1]);
                    int x2 = static_cast<int>(keypoint[idx2_x_pos]);
                    int y2 = static_cast<int>(keypoint[idx2_x_pos + 1]);

                    float conf1 = keypoint[idx1_x_pos + 2];
                    float conf2 = keypoint[idx2_x_pos + 2];

                    // Check confidence thresholds
                    if (conf1 < 0.5 || conf2 < 0.5) {
                        continue;
                    }

                    // Check if positions are within bounds
                    if (x1 % raw_image_shape.width == 0 || y1 % raw_image_shape.height == 0 || x1 < 0 || y1 < 0 ||
                        x2 % raw_image_shape.width == 0 || y2 % raw_image_shape.height == 0 || x2 < 0 || y2 < 0) {
                        continue;
                    }

                    // Draw a line between keypoints
                    cv::Scalar color_limb = limbColorPalette[i];
                    cv::line(img, cv::Point(x1, y1), cv::Point(x2, y2), color_limb, 2, cv::LINE_AA);
                }
            }
        }
    }

    // Combine the image and mask
    addWeighted(img, 0.6, mask, 0.4, 0, img);
    //    resize(img, img, img.size());
    //    resize(img, img, shape);
    //    // Show the image
    //    imshow("img", img);
    //    cv::waitKey();
}



int main()
{
    std::string img_path = "../../images/000000000382.jpg";
    //const std::string& modelPath = "./checkpoints/yolov8n.onnx";  // detection
    // vs:
    // const std::string& modelPath = "./checkpoints/yolov8n-seg.onnx";  // instance segmentation
    // clion:
    const std::string& modelPath = "../../checkpoints/yolov8n-pose.onnx";  // pose

    fs::path imageFilePath(img_path);
    fs::path newFilePath = imageFilePath.stem();
    newFilePath += "-kpt-cpp";
    newFilePath += imageFilePath.extension();
    assert(newFilePath != imageFilePath);
    std::cout << "newFilePath: " << newFilePath << std::endl;

    const std::string& onnx_provider = OnnxProviders::CPU;  // "cpu";
    const std::string& onnx_logid = "yolov8_inference2";
    float mask_threshold = 0.5f;  // in python it's 0.5 and you can see that at ultralytics/utils/ops.process_mask line 705 (ultralytics.__version__ == .160)
    float conf_threshold = 0.30f;
    float iou_threshold = 0.45f;  //  0.70f;
    int conversion_code = cv::COLOR_BGR2RGB;
    cv::Mat img = cv::imread(img_path, cv::IMREAD_UNCHANGED);
    if (img.empty()) {
        std::cerr << "Error: Unable to load image" << std::endl;
        return 1;
    }
    AutoBackendOnnx model(modelPath.c_str(), onnx_logid.c_str(), onnx_provider.c_str());
    std::vector<YoloResults> objs = model.predict_once(img, conf_threshold, iou_threshold, mask_threshold, conversion_code);
    std::vector<cv::Scalar> colors = generateRandomColors(model.getNc(), model.getCh());
    std::unordered_map<int, std::string> names = model.getNames();

    std::vector<std::vector<float>> keypointsVector;
    for (const YoloResults& result : objs) {
        keypointsVector.push_back(result.keypoints);
    }

    cv::cvtColor(img, img, cv::COLOR_RGB2BGR);
    cv::Size show_shape = img.size();  // cv::Size(1280, 720); // img.size()
    plot_results(img, objs, colors, names, show_shape);
    //    plot_masks(img, objs, colors, names);
    cv::imshow("img", img);
    cv::waitKey();
    return 0;
}
--------------------------------------------------------------------------------
/src/nn/autobackend.cpp:
--------------------------------------------------------------------------------
#include "nn/autobackend.h"

#include <iostream>
#include <ostream>
#include <filesystem>

#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>
#include <onnxruntime_cxx_api.h>

#include "utils/augment.h"
#include "constants.h"
#include "utils/common.h"
#include "utils/ops.h"


namespace fs = std::filesystem;


AutoBackendOnnx::AutoBackendOnnx(const char* modelPath, const char* logid, const char* provider,
                                 const std::vector<int>& imgsz, const int& stride,
                                 const int& nc, const std::unordered_map<int, std::string> names)
    : OnnxModelBase(modelPath, logid, provider), imgsz_(imgsz), stride_(stride), nc_(nc), names_(names),
      inputTensorShape_()
{
}

AutoBackendOnnx::AutoBackendOnnx(const char* modelPath, const char* logid, const char* provider)
    : OnnxModelBase(modelPath, logid, provider) {
    // the base class has already loaded the model and its metadata;
    // now try to get additional info from metadata like imgsz, stride etc;
    // ideally you should get all of them but you'll raise an error if smth is not in metadata (or not under the appropriate keys)
    const std::unordered_map<std::string, std::string>& base_metadata = OnnxModelBase::getMetadata();

    // post init imgsz
    auto imgsz_iterator = base_metadata.find(MetadataConstants::IMGSZ);
    if (imgsz_iterator != base_metadata.end()) {
        // parse it and convert to an int vector
        std::vector<int> imgsz = convertStringVectorToInts(parseVectorString(imgsz_iterator->second));
        // set it here:
        if (imgsz_.empty()) {
            imgsz_ = imgsz;
        }
    }
    else {
        std::cerr << "Warning: Cannot get imgsz value from metadata" << std::endl;
    }

    // post init stride
    auto stride_item = base_metadata.find(MetadataConstants::STRIDE);
    if (stride_item != base_metadata.end()) {
        // parse it and convert to int
        int stride_int = std::stoi(stride_item->second);
        // set it here:
        if (stride_ == OnnxInitializers::UNINITIALIZED_STRIDE) {
            stride_ = stride_int;
        }
    }
    else {
        std::cerr << "Warning: Cannot get stride value from metadata" << std::endl;
    }
std::unordered_map names = parseNames(names_item->second); 72 | std::cout << "***Names from metadata***" << std::endl; 73 | for (const auto& pair : names) { 74 | std::cout << "Key: " << pair.first << ", Value: " << pair.second << std::endl; 75 | } 76 | // set it here: 77 | if (names_.empty()) { 78 | names_ = names; 79 | } 80 | } 81 | else { 82 | std::cerr << "Warning: Cannot get names value from metadata" << std::endl; 83 | } 84 | 85 | // post init number of classes - you can do that only and only if names_ is not empty and nc was not initialized previously 86 | if (nc_ == OnnxInitializers::UNINITIALIZED_NC && !names_.empty()) { 87 | nc_ = names_.size(); 88 | } 89 | else { 90 | std::cerr << "Warning: Cannot get nc value from metadata (probably names wasn't set)" << std::endl; 91 | } 92 | 93 | if (!imgsz_.empty() && inputTensorShape_.empty()) 94 | { 95 | inputTensorShape_ = { 1, ch_, getHeight(), getWidth() }; 96 | } 97 | 98 | if (!imgsz_.empty()) 99 | { 100 | // Initialize cvSize_ using getHeight() and getWidth() 101 | //cvSize_ = cv::MatSize() 102 | cvSize_ = cv::Size(getWidth(), getHeight()); 103 | //cvMatSize_ = cv::MatSize(cvSize_.width, cvSize_.height); 104 | } 105 | 106 | // task init: 107 | auto task_item = base_metadata.find(MetadataConstants::TASK); 108 | if (task_item != base_metadata.end()) { 109 | // parse it and convert to int iterable 110 | std::string task = std::string(task_item->second); 111 | // set it here: 112 | if (task_.empty()) 113 | { 114 | task_ = task; 115 | } 116 | } 117 | else { 118 | std::cerr << "Warning: Cannot get task value from metadata" << std::endl; 119 | } 120 | 121 | // TODO: raise assert if imgsz_ and task_ were not initialized (since you don't know in that case which postprocessing to use) 122 | 123 | } 124 | 125 | 126 | 127 | const std::vector& AutoBackendOnnx::getImgsz() { 128 | return imgsz_; 129 | } 130 | 131 | const int& AutoBackendOnnx::getHeight() 132 | { 133 | return imgsz_[0]; 134 | } 135 | 136 | const int& AutoBackendOnnx::getWidth() 137 | { 138 | return imgsz_[1]; 139 | } 140 | 141 | const int& AutoBackendOnnx::getStride() { 142 | return stride_; 143 | } 144 | 145 | const int& AutoBackendOnnx::getCh() { 146 | return ch_; 147 | } 148 | 149 | const int& AutoBackendOnnx::getNc() { 150 | return nc_; 151 | } 152 | 153 | const std::unordered_map& AutoBackendOnnx::getNames() { 154 | return names_; 155 | } 156 | 157 | 158 | const cv::Size& AutoBackendOnnx::getCvSize() 159 | { 160 | return cvSize_; 161 | } 162 | 163 | const std::vector& AutoBackendOnnx::getInputTensorShape() 164 | { 165 | return inputTensorShape_; 166 | } 167 | 168 | const std::string& AutoBackendOnnx::getTask() 169 | { 170 | return task_; 171 | } 172 | 173 | std::vector AutoBackendOnnx::predict_once(const std::string& imagePath, float& conf, float& iou, float& mask_threshold, 174 | int conversionCode, bool verbose) { 175 | // Convert the string imagePath to an object of type std::filesystem::path 176 | fs::path imageFilePath(imagePath); 177 | // Call the predict_once method, converting the image to a cv::Mat 178 | return predict_once(imageFilePath, conf, iou, mask_threshold, conversionCode); 179 | } 180 | 181 | std::vector AutoBackendOnnx::predict_once(const fs::path& imagePath, float& conf, float& iou, float& mask_threshold, 182 | int conversionCode, bool verbose) { 183 | // Check if the specified path exists 184 | if (!fs::exists(imagePath)) { 185 | std::cerr << "Error: File does not exist: " << imagePath << std::endl; 186 | // Return an empty vector or throw an exception, 
124 | 
125 | 
126 | 
127 | const std::vector<int>& AutoBackendOnnx::getImgsz() {
128 |     return imgsz_;
129 | }
130 | 
131 | const int& AutoBackendOnnx::getHeight()
132 | {
133 |     return imgsz_[0];
134 | }
135 | 
136 | const int& AutoBackendOnnx::getWidth()
137 | {
138 |     return imgsz_[1];
139 | }
140 | 
141 | const int& AutoBackendOnnx::getStride() {
142 |     return stride_;
143 | }
144 | 
145 | const int& AutoBackendOnnx::getCh() {
146 |     return ch_;
147 | }
148 | 
149 | const int& AutoBackendOnnx::getNc() {
150 |     return nc_;
151 | }
152 | 
153 | const std::unordered_map<int, std::string>& AutoBackendOnnx::getNames() {
154 |     return names_;
155 | }
156 | 
157 | 
158 | const cv::Size& AutoBackendOnnx::getCvSize()
159 | {
160 |     return cvSize_;
161 | }
162 | 
163 | const std::vector<int64_t>& AutoBackendOnnx::getInputTensorShape()
164 | {
165 |     return inputTensorShape_;
166 | }
167 | 
168 | const std::string& AutoBackendOnnx::getTask()
169 | {
170 |     return task_;
171 | }
172 | 
173 | std::vector<YoloResults> AutoBackendOnnx::predict_once(const std::string& imagePath, float& conf, float& iou, float& mask_threshold,
174 |                                                        int conversionCode, bool verbose) {
175 |     // Convert the string imagePath to an object of type std::filesystem::path
176 |     fs::path imageFilePath(imagePath);
177 |     // Call the fs::path overload, which loads the image into a cv::Mat
178 |     return predict_once(imageFilePath, conf, iou, mask_threshold, conversionCode);
179 | }
180 | 
181 | std::vector<YoloResults> AutoBackendOnnx::predict_once(const fs::path& imagePath, float& conf, float& iou, float& mask_threshold,
182 |                                                        int conversionCode, bool verbose) {
183 |     // Check if the specified path exists
184 |     if (!fs::exists(imagePath)) {
185 |         std::cerr << "Error: File does not exist: " << imagePath << std::endl;
186 |         // Return an empty vector or throw an exception, depending on your logic
187 |         return {};
188 |     }
189 | 
190 |     // Load the image into a cv::Mat
191 |     cv::Mat image = cv::imread(imagePath.string(), cv::IMREAD_UNCHANGED);
192 | 
193 |     // Check if loading the image was successful
194 |     if (image.empty()) {
195 |         std::cerr << "Error: Failed to load image: " << imagePath << std::endl;
196 |         // Return an empty vector or throw an exception, depending on your logic
197 |         return {};
198 |     }
199 | 
200 |     // now do some preprocessing based on channels info:
201 |     int required_image_channels = this->getCh();
202 |     // Assert that the number of channels in the input image matches the required number of channels for the model
203 |     if (required_image_channels != image.channels()) {
204 |         const std::string errorMessage = "Error: Number of image channels does not match the required channels.\n"
205 |                                          "Number of channels in the image: " + std::to_string(image.channels());
206 |         throw std::runtime_error(errorMessage);
207 |     }
208 | 
209 |     // Call the overload that does the actual work
210 |     return predict_once(image, conf, iou, mask_threshold, conversionCode);
211 | }
212 | 
213 | 
214 | 
215 | std::vector<YoloResults> AutoBackendOnnx::predict_once(cv::Mat& image, float& conf, float& iou, float& mask_threshold, int conversionCode, bool verbose) {
216 |     double preprocess_time = 0.0;
217 |     double inference_time = 0.0;
218 |     double postprocess_time = 0.0;
219 |     Timer preprocess_timer = Timer(preprocess_time, verbose);
220 |     // 1. preprocess
221 |     float* blob = nullptr;
222 |     std::vector<Ort::Value> inputTensors;
223 |     if (conversionCode >= 0) {
224 |         cv::cvtColor(image, image, conversionCode);
225 |     }
226 |     std::vector<int64_t> inputTensorShape;
227 |     // TODO: for the classify task the preprocessed image will be different (!):
228 |     cv::Mat preprocessed_img;
229 |     cv::Size new_shape = cv::Size(getWidth(), getHeight());
230 |     const bool scaleFill = false;
231 |     const bool auto_ = false;
232 |     letterbox(image, preprocessed_img, new_shape, cv::Scalar(), auto_, scaleFill, true, getStride());
233 |     fill_blob(preprocessed_img, blob, inputTensorShape);
234 |     int64_t inputTensorSize = vector_product(inputTensorShape);
235 |     std::vector<float> inputTensorValues(blob, blob + inputTensorSize);
236 | 
237 |     Ort::MemoryInfo memoryInfo = Ort::MemoryInfo::CreateCpu(
238 |         OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
239 | 
240 |     inputTensors.push_back(Ort::Value::CreateTensor<float>(
241 |         memoryInfo, inputTensorValues.data(), inputTensorSize,
242 |         inputTensorShape.data(), inputTensorShape.size()
243 |     ));
244 |     preprocess_timer.Stop();
245 |     Timer inference_timer = Timer(inference_time, verbose);
246 |     // 2. inference
247 |     std::vector<Ort::Value> outputTensors = forward(inputTensors);
248 |     inference_timer.Stop();
249 |     Timer postprocess_timer = Timer(postprocess_time, verbose);
250 |     // create container for the results
251 |     std::vector<YoloResults> results;
252 |     // 3. postprocess based on task:
253 |     std::unordered_map<int, std::string> names = this->getNames();
254 |     // 4. clean up the blob, since it was allocated with `new` inside `fill_blob`
255 |     //    (inputTensorValues already holds a copy of the data)
256 |     delete[] blob;
257 | 
258 |     int class_names_num = names.size();
259 |     if (task_ == YoloTasks::SEGMENT) {
260 | 
261 |         // get outputs info
262 |         std::vector<int64_t> outputTensor0Shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
263 |         std::vector<int64_t> outputTensor1Shape = outputTensors[1].GetTensorTypeAndShapeInfo().GetShape();
264 |         // get outputs
265 |         float* all_data0 = outputTensors[0].GetTensorMutableData<float>();
266 | 
267 |         cv::Mat output0 = cv::Mat(cv::Size((int)outputTensor0Shape[2], (int)outputTensor0Shape[1]), CV_32F, all_data0).t();  // [bs, features, preds_num] => [bs, preds_num, features]
268 |         auto mask_shape = outputTensor1Shape;
269 |         std::vector<int> mask_sz = { 1, (int)mask_shape[1], (int)mask_shape[2], (int)mask_shape[3] };
270 |         cv::Mat output1 = cv::Mat(mask_sz, CV_32F, outputTensors[1].GetTensorMutableData<float>());
271 | 
272 |         int iw = this->getWidth();
273 |         int ih = this->getHeight();
274 |         int mask_features_num = (int)outputTensor1Shape[1];
275 |         int mh = (int)outputTensor1Shape[2];
276 |         int mw = (int)outputTensor1Shape[3];
277 |         ImageInfo img_info = { image.size() };
278 |         postprocess_masks(output0, output1, img_info, results, class_names_num, conf, iou,
279 |                           iw, ih, mw, mh, mask_features_num, mask_threshold);
280 |     }
281 |     else if (task_ == YoloTasks::DETECT) {
282 |         ImageInfo img_info = { image.size() };
283 |         std::vector<int64_t> outputTensor0Shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
284 |         float* all_data0 = outputTensors[0].GetTensorMutableData<float>();
285 |         cv::Mat output0 = cv::Mat(cv::Size((int)outputTensor0Shape[2], (int)outputTensor0Shape[1]), CV_32F, all_data0).t();  // [bs, features, preds_num] => [bs, preds_num, features]
286 |         postprocess_detects(output0, img_info, results, class_names_num, conf, iou);
287 |     }
288 |     else if (task_ == YoloTasks::POSE) {
289 |         ImageInfo image_info = { image.size() };
290 |         std::vector<int64_t> outputTensor0Shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
291 |         float* all_data0 = outputTensors[0].GetTensorMutableData<float>();
292 |         cv::Mat output0 = cv::Mat(cv::Size((int)outputTensor0Shape[2], (int)outputTensor0Shape[1]), CV_32F, all_data0).t();  // [bs, features, preds_num] => [bs, preds_num, features]
293 |         postprocess_kpts(output0, image_info, results, class_names_num, conf, iou);
294 |     }
295 |     else {
296 |         throw std::runtime_error("NotImplementedError: task: " + task_);
297 |     }
298 | 
299 |     postprocess_timer.Stop();
300 |     if (verbose) {
301 |         std::cout << std::fixed << std::setprecision(1);
302 |         std::cout << "image: " << preprocessed_img.rows << "x" << preprocessed_img.cols << " " << results.size() << " objs, ";
303 |         std::cout << (preprocess_time + inference_time + postprocess_time) * 1000.0 << "ms" << std::endl;
304 |         std::cout << "Speed: " << (preprocess_time * 1000.0) << "ms preprocess, ";
305 |         std::cout << (inference_time * 1000.0) << "ms inference, ";
306 |         std::cout << (postprocess_time * 1000.0) << "ms postprocess per image ";
307 |         std::cout << "at shape (1, " << image.channels() << ", " << preprocessed_img.rows << ", " << preprocessed_img.cols << ")" << std::endl;
308 |     }
309 | 
310 |     return results;
311 | }
312 | 
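The raw head output arrives as [batch, features, predictions] and is transposed above so that each row is one candidate. For stock 640x640 COCO checkpoints the shapes work out roughly as follows; this is a hedged reference based on standard YOLOv8 exports, not values read from this repo's checkpoints:

    // detect : output0 [1, 84, 8400]   -> .t() -> 8400 x 84   (4 box coords + 80 class scores)
    // segment: output0 [1, 116, 8400]                         (4 + 80 + 32 mask coefficients)
    //          output1 [1, 32, 160, 160] mask prototypes      (mask_features_num = 32, mh = mw = 160)
    // pose   : output0 [1, 56, 8400]                          (4 + 1 class + 17 keypoints * {x, y, conf})
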
313 | 
314 | void AutoBackendOnnx::postprocess_masks(cv::Mat& output0, cv::Mat& output1, ImageInfo image_info, std::vector<YoloResults>& output,
315 |                                         int& class_names_num, float& conf_threshold, float& iou_threshold,
316 |                                         int& iw, int& ih, int& mw, int& mh, int& masks_features_num, float mask_threshold /* = 0.5f */)
317 | {
318 |     output.clear();
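In `_get_mask2` further below, each surviving detection's 32 coefficients are combined with the prototype tensor reshaped in the function that follows: conceptually mask = sigmoid(coeffs [1x32] * proto [32x(mh*mw)]), reshaped to mh x mw, upsampled, and cropped to the box. A self-contained sketch of just that core step, with the 32/160x160 sizes assumed as in the shape notes above:

    cv::Mat coeffs = cv::Mat::ones(1, 32, CV_32F);           // one detection's mask coefficients (dummy values)
    cv::Mat proto  = cv::Mat::zeros(32, 160 * 160, CV_32F);  // flattened prototypes (dummy values)
    cv::Mat logits = (coeffs * proto).t();                   // [160*160 x 1]
    logits = logits.reshape(1, { 160, 160 });
    cv::Mat mask;                                            // sigmoid(x) = 1 / (1 + exp(-x))
    cv::exp(-logits, mask);
    mask = 1.0 / (1.0 + mask);
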
319 |     std::vector<int> class_ids;
320 |     std::vector<float> confidences;
321 |     std::vector<cv::Rect> boxes;
322 |     std::vector<std::vector<float>> masks;
323 |     // 4 - your default number of rect parameters {x, y, w, h}
324 |     int data_width = class_names_num + 4 + masks_features_num;
325 |     int rows = output0.rows;
326 |     float* pdata = (float*)output0.data;
327 |     for (int r = 0; r < rows; ++r)
328 |     {
329 |         cv::Mat scores(1, class_names_num, CV_32FC1, pdata + 4);
330 |         cv::Point class_id;
331 |         double max_conf;
332 |         minMaxLoc(scores, nullptr, &max_conf, nullptr, &class_id);
333 |         if (max_conf > conf_threshold)
334 |         {
335 |             masks.push_back(std::vector<float>(pdata + 4 + class_names_num, pdata + data_width));
336 |             class_ids.push_back(class_id.x);
337 |             confidences.push_back((float)max_conf);
338 | 
339 |             float out_w = pdata[2];
340 |             float out_h = pdata[3];
341 |             float out_left = MAX((pdata[0] - 0.5 * out_w + 0.5), 0);
342 |             float out_top = MAX((pdata[1] - 0.5 * out_h + 0.5), 0);
343 |             cv::Rect_<float> bbox = cv::Rect(out_left, out_top, (out_w + 0.5), (out_h + 0.5));
344 |             cv::Rect_<float> scaled_bbox = scale_boxes(getCvSize(), bbox, image_info.raw_size);
345 |             boxes.push_back(scaled_bbox);
346 |         }
347 |         pdata += data_width;  // next pred
348 |     }
349 | 
350 |     //
351 |     //float masks_threshold = 0.50;
352 |     //int top_k = 500;
353 |     //const float& nms_eta = 1.0f;
354 |     std::vector<int> nms_result;
355 |     cv::dnn::NMSBoxes(boxes, confidences, conf_threshold, iou_threshold, nms_result);  // , nms_eta, top_k);
356 | 
357 |     // select the whole protos tensor
358 |     cv::Size downsampled_size = cv::Size(mw, mh);
359 |     std::vector<cv::Range> roi_rangs = { cv::Range(0, 1), cv::Range::all(),
360 |                                          cv::Range(0, downsampled_size.height), cv::Range(0, downsampled_size.width) };
361 |     cv::Mat temp_mask = output1(roi_rangs).clone();
362 |     cv::Mat proto = temp_mask.reshape(0, { masks_features_num, downsampled_size.width * downsampled_size.height });
363 | 
364 |     for (int i = 0; i < (int)nms_result.size(); ++i)
365 |     {
366 |         int idx = nms_result[i];
367 |         boxes[idx] = boxes[idx] & cv::Rect(0, 0, image_info.raw_size.width, image_info.raw_size.height);
368 |         YoloResults result = { class_ids[idx], confidences[idx], boxes[idx] };
369 |         _get_mask2(cv::Mat(masks[idx]).t(), proto, image_info, boxes[idx], result.mask, mask_threshold,
370 |                    iw, ih, mw, mh, masks_features_num);
371 |         output.push_back(result);
372 |     }
373 | }
374 | 
375 | 
376 | void AutoBackendOnnx::postprocess_detects(cv::Mat& output0, ImageInfo image_info, std::vector<YoloResults>& output,
377 |                                           int& class_names_num, float& conf_threshold, float& iou_threshold)
378 | {
379 |     output.clear();
380 |     std::vector<int> class_ids;
381 |     std::vector<float> confidences;
382 |     std::vector<cv::Rect> boxes;
383 | 
384 |     // 4 - your default number of rect parameters {x, y, w, h}
385 |     int data_width = class_names_num + 4;
386 |     int rows = output0.rows;
387 |     float* pdata = (float*)output0.data;
388 | 
389 |     for (int r = 0; r < rows; ++r)
390 |     {
391 |         cv::Mat scores(1, class_names_num, CV_32FC1, pdata + 4);
392 |         cv::Point class_id;
393 |         double max_conf;
394 |         minMaxLoc(scores, nullptr, &max_conf, nullptr, &class_id);
395 | 
396 |         if (max_conf > conf_threshold)
397 |         {
398 | 
399 |             class_ids.push_back(class_id.x);
400 |             confidences.push_back((float)max_conf);
401 | 
402 |             float out_w = pdata[2];
403 |             float out_h = pdata[3];
404 |             float out_left = MAX((pdata[0] - 0.5 * out_w + 0.5), 0);
405 |             float out_top = MAX((pdata[1] - 0.5 * out_h + 0.5), 0);
406 | 
407 |             cv::Rect_<float> bbox = cv::Rect_<float>(out_left, out_top, (out_w + 0.5f), (out_h + 0.5f));
408 |             cv::Rect_<float> scaled_bbox = scale_boxes(getCvSize(), bbox, image_info.raw_size);
409 | 
410 |             boxes.push_back(scaled_bbox);
411 |         }
412 |         pdata += data_width;  // next pred
413 |     }
414 | 
415 |     std::vector<int> nms_result;
416 |     cv::dnn::NMSBoxes(boxes, confidences, conf_threshold, iou_threshold, nms_result);  // , nms_eta, top_k);
417 |     for (int idx : nms_result)
418 |     {
419 |         boxes[idx] = boxes[idx] & cv::Rect(0, 0, image_info.raw_size.width, image_info.raw_size.height);
420 |         YoloResults result = { class_ids[idx], confidences[idx], boxes[idx] };
421 |         output.push_back(result);
422 |     }
423 | }
424 | 
425 | void AutoBackendOnnx::postprocess_kpts(cv::Mat& output0, ImageInfo& image_info, std::vector<YoloResults>& output,
426 |                                        int& class_names_num, float& conf_threshold, float& iou_threshold)
427 | {
428 |     std::vector<cv::Rect> boxes;
429 |     std::vector<float> confidences;
430 |     std::vector<int> class_ids;
431 |     std::vector<std::vector<float>> rest;
432 |     std::tie(boxes, confidences, class_ids, rest) = non_max_suppression(output0, class_names_num, output0.cols, conf_threshold, iou_threshold);
433 |     cv::Size img1_shape = getCvSize();
434 |     auto bound_bbox = cv::Rect_<float>(0, 0, (float)image_info.raw_size.width, (float)image_info.raw_size.height);
435 |     for (int i = 0; i < (int)boxes.size(); i++) {
436 |         // reference (ultralytics python):
437 |         // pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], shape).round()
438 |         // pred_kpts = pred[:, 6:].view(len(pred), *self.model.kpt_shape) if len(pred) else pred[:, 6:]
439 |         // pred_kpts = ops.scale_coords(img.shape[2:], pred_kpts, shape)
440 |         // path = self.batch[0]
441 |         // img_path = path[i] if isinstance(path, list) else path
442 |         // results.append(
443 |         //     Results(orig_img=orig_img,
444 |         //             path=img_path,
445 |         //             names=self.model.names,
446 |         //             boxes=pred[:, :6],
447 |         //             keypoints=pred_kpts))
448 |         cv::Rect_<float> bbox = boxes[i];
449 |         auto scaled_bbox = scale_boxes(img1_shape, bbox, image_info.raw_size);
450 |         scaled_bbox = scaled_bbox & bound_bbox;
451 |         // cv::Mat kpt = cv::Mat(rest[i]).t();
452 |         // scale_coords(img1_shape, kpt, image_info.raw_size);
453 |         // TODO: overload scale_coords so that it accepts a cv::Mat of shape [17, 3],
454 |         //       to be closer to what the python version does
455 |         std::vector<float> kpt = scale_coords(img1_shape, rest[i], image_info.raw_size);
456 |         YoloResults tmp_res = { class_ids[i], confidences[i], scaled_bbox, {}, kpt };
457 |         output.push_back(tmp_res);
458 |     }
459 | }
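`YoloResults::keypoints` comes back flat, as [x0, y0, conf0, x1, y1, conf1, ...]; that stride-of-3 layout is what `scale_coords`/`clip_coords` assume. A hedged drawing sketch over the `objs`/`img` variables from main.cpp; the 0.5 visibility cutoff is an arbitrary choice, not a value from this repo:

    for (const YoloResults& res : objs) {
        for (size_t k = 0; k + 2 < res.keypoints.size(); k += 3) {
            if (res.keypoints[k + 2] > 0.5f)  // keypoint confidence
                cv::circle(img, cv::Point((int)res.keypoints[k], (int)res.keypoints[k + 1]),
                           3, cv::Scalar(0, 0, 255), -1);
        }
    }
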
460 | void AutoBackendOnnx::_get_mask2(const cv::Mat& masks_features,
461 |                                  const cv::Mat& proto,
462 |                                  const ImageInfo& image_info, const cv::Rect bound, cv::Mat& mask_out,
463 |                                  float& mask_thresh, int& iw, int& ih, int& mw, int& mh, int& masks_features_num,
464 |                                  bool round_downsampled)
465 | 
466 | {
467 |     cv::Size img0_shape = image_info.raw_size;
468 |     cv::Size img1_shape = cv::Size(iw, ih);
469 |     cv::Size downsampled_size = cv::Size(mw, mh);
470 | 
471 |     cv::Rect_<float> bound_float(
472 |         static_cast<float>(bound.x),
473 |         static_cast<float>(bound.y),
474 |         static_cast<float>(bound.width),
475 |         static_cast<float>(bound.height)
476 |     );
477 | 
478 |     cv::Rect_<float> downsampled_bbox = scale_boxes(img0_shape, bound_float, downsampled_size);
479 |     cv::Size bound_size = cv::Size(mw, mh);
480 |     clip_boxes(downsampled_bbox, bound_size);
481 | 
482 |     cv::Mat matmul_res = (masks_features * proto).t();
483 |     matmul_res = matmul_res.reshape(1, { downsampled_size.height, downsampled_size.width });
484 |     // apply sigmoid to the mask: sigmoid(x) = 1 / (1 + exp(-x))
485 |     cv::Mat sigmoid_mask;
486 |     exp(-matmul_res, sigmoid_mask);
487 |     sigmoid_mask = 1.0 / (1.0 + sigmoid_mask);
488 |     cv::Mat resized_mask;
489 |     cv::Rect_<float> input_bbox = scale_boxes(img0_shape, bound_float, img1_shape);
490 |     cv::resize(sigmoid_mask, resized_mask, img1_shape, 0, 0, cv::INTER_LANCZOS4);
491 |     cv::Mat pre_out_mask = resized_mask(input_bbox);
492 |     cv::Mat scaled_mask;
493 |     scale_image2(scaled_mask, resized_mask, img0_shape);
494 |     cv::resize(scaled_mask, mask_out, img0_shape);
495 |     mask_out = mask_out(bound) > mask_thresh;
496 | }
497 | 
498 | 
499 | void AutoBackendOnnx::fill_blob(cv::Mat& image, float*& blob, std::vector<int64_t>& inputTensorShape) {
500 | 
501 |     cv::Mat floatImage;
502 |     if (inputTensorShape.empty())
503 |     {
504 |         inputTensorShape = getInputTensorShape();
505 |     }
506 |     int inputChannelsNum = (int)inputTensorShape[1];
507 |     int rtype = CV_32FC3;
508 |     image.convertTo(floatImage, rtype, 1.0f / 255.0);
509 |     blob = new float[floatImage.cols * floatImage.rows * floatImage.channels()];
510 |     cv::Size floatImageSize{ floatImage.cols, floatImage.rows };
511 | 
512 |     // hwc -> chw: each chw[i] is a cv::Mat header aliasing one plane of `blob`,
513 |     // so cv::split writes the channel data straight into the blob buffer
514 |     std::vector<cv::Mat> chw(floatImage.channels());
515 |     for (int i = 0; i < floatImage.channels(); ++i)
516 |     {
517 |         chw[i] = cv::Mat(floatImageSize, CV_32FC1, blob + i * floatImageSize.width * floatImageSize.height);
518 |     }
519 |     cv::split(floatImage, chw);
520 | }
521 | 
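The HWC -> CHW repack in `fill_blob` is equivalent to the explicit indexing below, shown only for reference; C/H/W stand for the channel count and letterboxed image dimensions, and `floatImage`/`blob` are the function's locals (pixels already scaled by 1/255):

    const int C = 3, H = 640, W = 640;  // assumed dims for illustration
    for (int c = 0; c < C; ++c)
        for (int y = 0; y < H; ++y)
            for (int x = 0; x < W; ++x)
                blob[c * H * W + y * W + x] = floatImage.at<cv::Vec3f>(y, x)[c];
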
--------------------------------------------------------------------------------
/src/nn/onnx_model_base.cpp:
--------------------------------------------------------------------------------
1 | #include "nn/onnx_model_base.h"
2 | 
3 | #include <iostream>
4 | #include <algorithm>
5 | #include <string>
6 | 
7 | #include "constants.h"
8 | #include "utils/common.h"
9 | 
10 | 
11 | /**
12 |  * @brief Base class for any ONNX model, regardless of the specific target task.
13 |  *
14 |  * Wraps OrtApi.
15 |  *
16 |  * The caller provides a model path, logid, and provider.
17 |  *
18 |  * See the output logs for more information on warnings/errors that occur while processing the model.
19 |  *
20 |  * @param[in] modelPath Path to the model file.
21 |  * @param[in] logid Log identifier.
22 |  * @param[in] provider Provider (e.g., "CPU" or "CUDA"). (NOTE: for now only CPU is supported)
23 |  */
24 | 
25 | OnnxModelBase::OnnxModelBase(const char* modelPath, const char* logid, const char* provider)
26 |     //: modelPath_(modelPath), env(std::move(env)), session(std::move(session))
27 |     : modelPath_(modelPath)
28 | {
29 | 
30 |     // TODO: passing `ORT_LOGGING_LEVEL_WARNING` by default is unfortunate - for some cases
31 |     //       info level would make sense too
32 |     env = Ort::Env(ORT_LOGGING_LEVEL_WARNING, logid);
33 |     Ort::SessionOptions sessionOptions = Ort::SessionOptions();
34 | 
35 |     std::vector<std::string> availableProviders = Ort::GetAvailableProviders();
36 |     auto cudaAvailable = std::find(availableProviders.begin(), availableProviders.end(), "CUDAExecutionProvider");
37 |     OrtCUDAProviderOptions cudaOption;
38 | 
39 |     if (OnnxProviders::CUDA == provider) {  // compare as std::string, not raw char pointers
40 |         if (cudaAvailable == availableProviders.end()) {
41 |             std::cout << "CUDA is not supported by your ONNXRuntime build. Fallback to CPU." << std::endl;
42 |             //std::cout << "Inference device: CPU" << std::endl;
43 |         }
44 |         else {
45 |             //std::cout << "Inference device: GPU" << std::endl;
46 |             sessionOptions.AppendExecutionProvider_CUDA(cudaOption);
47 |         }
48 |     }
49 | 
50 |     else if (OnnxProviders::CPU == provider) {
51 |         // "cpu" by default
52 |     }
53 |     else
54 |     {
55 |         throw std::runtime_error("NotImplemented provider=" + std::string(provider));
56 |     }
57 | 
58 |     std::cout << "Inference device: " << std::string(provider) << std::endl;
59 | #ifdef _WIN32
60 |     auto modelPathW = get_win_path(modelPath);  // For Windows (wstring)
61 |     session = Ort::Session(env, modelPathW.c_str(), sessionOptions);
62 | #else
63 |     session = Ort::Session(env, modelPath, sessionOptions);  // For Linux (string)
64 | #endif
65 |     //session = Ort::Session(env)
66 |     // https://github.com/microsoft/onnxruntime/issues/14157
67 | 
68 |     // ----------------
69 |     // init input names (copied into std::string members, so the allocated
70 |     // strings below only need to live until the end of this constructor)
71 |     std::vector<Ort::AllocatedStringPtr> inputNodeNameAllocatedStrings;
72 |     Ort::AllocatorWithDefaultOptions allocator;
73 |     auto inputNodesNum = session.GetInputCount();
74 |     for (size_t i = 0; i < inputNodesNum; i++) {
75 |         auto input_name = session.GetInputNameAllocated(i, allocator);
76 |         inputNodeNameAllocatedStrings.push_back(std::move(input_name));
77 |         inputNodeNames.push_back(inputNodeNameAllocatedStrings.back().get());
78 |     }
79 |     // -----------------
80 |     // init output names
81 | 
82 |     auto outputNodesNum = session.GetOutputCount();
83 |     std::vector<Ort::AllocatedStringPtr> outputNodeNameAllocatedStrings;
84 |     Ort::AllocatorWithDefaultOptions output_names_allocator;
85 |     for (size_t i = 0; i < outputNodesNum; i++)
86 |     {
87 |         auto output_name = session.GetOutputNameAllocated(i, output_names_allocator);
88 |         outputNodeNameAllocatedStrings.push_back(std::move(output_name));
89 |         outputNodeNames.push_back(outputNodeNameAllocatedStrings.back().get());
90 |     }
91 |     // -------------------------
92 |     // initialize model metadata
93 |     model_metadata = session.GetModelMetadata();
94 |     Ort::AllocatorWithDefaultOptions metadata_allocator;
95 | 
96 |     std::vector<Ort::AllocatedStringPtr> metadataAllocatedKeys = model_metadata.GetCustomMetadataMapKeysAllocated(metadata_allocator);
97 |     std::vector<std::string> metadata_keys;
98 |     metadata_keys.reserve(metadataAllocatedKeys.size());
99 | 
100 |     for (const Ort::AllocatedStringPtr& allocatedString : metadataAllocatedKeys) {
101 |         metadata_keys.emplace_back(allocatedString.get());
102 |     }
103 | 
104 |     // -------------------------
105 |     // initialize metadata as a dict;
106 |     // even though we know exactly what metadata we intend to use,
107 |     // the base onnx class should not have any ultralytics yolo-specific attributes like stride, task etc, so keep it as clean as possible
108 |     for (const std::string& key : metadata_keys) {
109 |         Ort::AllocatedStringPtr metadata_value = model_metadata.LookupCustomMetadataMapAllocated(key.c_str(), metadata_allocator);
110 |         if (metadata_value != nullptr) {
111 |             auto raw_metadata_value = metadata_value.get();
112 |             metadata[key] = std::string(raw_metadata_value);
113 |         }
114 |     }
115 | 
116 |     // initialize the cached c-string views (pointers into the inputNodeNames/outputNodeNames members)
117 |     for (const std::string& name : outputNodeNames) {
118 |         outputNamesCStr.push_back(name.c_str());
119 |     }
120 | 
121 |     for (const std::string& name : inputNodeNames)
122 |     {
123 |         inputNamesCStr.push_back(name.c_str());
124 |     }
125 | 
126 | }
127 | 
128 | const std::vector<std::string>& OnnxModelBase::getInputNames() {
129 |     return inputNodeNames;
130 | }
131 | 
132 | const std::vector<std::string>& OnnxModelBase::getOutputNames() {
133 |     return outputNodeNames;
134 | }
135 | 
136 | const Ort::ModelMetadata& OnnxModelBase::getModelMetadata()
137 | {
138 |     return model_metadata;
139 | }
140 | 
141 | const std::unordered_map<std::string, std::string>& OnnxModelBase::getMetadata()
142 | {
143 |     return metadata;
144 | }
145 | 
146 | 
147 | const Ort::Session& OnnxModelBase::getSession()
148 | {
149 |     return session;
150 | }
151 | 
152 | const char* OnnxModelBase::getModelPath()
153 | {
154 |     return modelPath_;
155 | }
156 | 
157 | const std::vector<const char*> OnnxModelBase::getOutputNamesCStr()
158 | {
159 |     return outputNamesCStr;
160 | }
161 | 
162 | const std::vector<const char*> OnnxModelBase::getInputNamesCStr()
163 | {
164 |     return inputNamesCStr;
165 | }
166 | 
167 | std::vector<Ort::Value> OnnxModelBase::forward(std::vector<Ort::Value>& inputTensors)
168 | {
169 |     // todo: make runOptions a parameter here
170 | 
171 |     return session.Run(Ort::RunOptions{ nullptr },
172 |                        inputNamesCStr.data(),
173 |                        inputTensors.data(),
174 |                        inputNamesCStr.size(),
175 |                        outputNamesCStr.data(),
176 |                        outputNamesCStr.size());
177 | }
178 | 
179 | //OnnxModelBase::~OnnxModelBase() {
180 | //    // empty body
181 | //}
182 | 
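A minimal sketch of driving the base class directly; normally you would go through AutoBackendOnnx, but this is handy for inspecting what an exported model actually carries. The model path comes from this repo's checkpoints/, while the logid and the "cpu" provider string are my assumptions:

    OnnxModelBase base("checkpoints/yolov8n.onnx", "inspect", "cpu");
    for (const auto& kv : base.getMetadata())
        std::cout << kv.first << " = " << kv.second << std::endl;
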
--------------------------------------------------------------------------------
/src/utils/augment.cpp:
--------------------------------------------------------------------------------
1 | #include "utils/augment.h"
2 | #include <opencv2/core.hpp>
3 | #include <opencv2/imgproc.hpp>
4 | #include <cmath>
5 | 
6 | 
7 | /**
8 |  * \brief padding value used when letterbox changes the image aspect ratio
9 |  */
10 | const int DEFAULT_LETTERBOX_PAD_VALUE = 114;
11 | 
12 | 
13 | void letterbox(const cv::Mat& image,
14 |                cv::Mat& outImage,
15 |                const cv::Size& newShape,
16 |                cv::Scalar_<double> color,
17 |                bool auto_,
18 |                bool scaleFill,
19 |                bool scaleUp, int stride
20 | ) {
21 |     cv::Size shape = image.size();
22 |     float r = std::min(static_cast<float>(newShape.height) / static_cast<float>(shape.height),
23 |                        static_cast<float>(newShape.width) / static_cast<float>(shape.width));
24 |     if (!scaleUp)
25 |         r = std::min(r, 1.0f);
26 | 
27 |     float ratio[2]{ r, r };
28 |     int newUnpad[2]{ static_cast<int>(std::round(static_cast<float>(shape.width) * r)),
29 |                      static_cast<int>(std::round(static_cast<float>(shape.height) * r)) };
30 | 
31 |     auto dw = static_cast<float>(newShape.width - newUnpad[0]);
32 |     auto dh = static_cast<float>(newShape.height - newUnpad[1]);
33 | 
34 |     if (auto_)
35 |     {
36 |         dw = static_cast<float>(static_cast<int>(dw) % stride);
37 |         dh = static_cast<float>(static_cast<int>(dh) % stride);
38 |     }
39 |     else if (scaleFill)
40 |     {
41 |         dw = 0.0f;
42 |         dh = 0.0f;
43 |         newUnpad[0] = newShape.width;
44 |         newUnpad[1] = newShape.height;
45 |         ratio[0] = static_cast<float>(newShape.width) / static_cast<float>(shape.width);
46 |         ratio[1] = static_cast<float>(newShape.height) / static_cast<float>(shape.height);
47 |     }
48 | 
49 |     dw /= 2.0f;
50 |     dh /= 2.0f;
51 | 
52 |     //cv::Mat outImage;
53 |     if (shape.width != newUnpad[0] || shape.height != newUnpad[1])
54 |     {
55 |         cv::resize(image, outImage, cv::Size(newUnpad[0], newUnpad[1]));
56 |     }
57 |     else
58 |     {
59 |         outImage = image.clone();
60 |     }
61 | 
62 |     int top = static_cast<int>(std::round(dh - 0.1f));
63 |     int bottom = static_cast<int>(std::round(dh + 0.1f));
64 |     int left = static_cast<int>(std::round(dw - 0.1f));
65 |     int right = static_cast<int>(std::round(dw + 0.1f));
66 | 
67 | 
68 |     if (color == cv::Scalar()) {
69 |         color = cv::Scalar(DEFAULT_LETTERBOX_PAD_VALUE, DEFAULT_LETTERBOX_PAD_VALUE, DEFAULT_LETTERBOX_PAD_VALUE);
70 |     }
71 | 
72 |     cv::copyMakeBorder(outImage, outImage, top, bottom, left, right, cv::BORDER_CONSTANT, color);
73 | 
74 | }
75 | 
76 | cv::Mat scale_image(const cv::Mat& resized_mask, const cv::Size& im0_shape, const std::pair<float, cv::Point2f>& ratio_pad) {
77 |     cv::Size im1_shape = resized_mask.size();
78 | 
79 |     // Check if resizing is needed
80 |     if (im1_shape == im0_shape) {
81 |         return resized_mask.clone();
82 |     }
83 | 
84 |     float gain, pad_x, pad_y;
85 | 
86 |     if (ratio_pad.first < 0.0f) {
87 |         gain = std::min(static_cast<float>(im1_shape.height) / static_cast<float>(im0_shape.height),
88 |                         static_cast<float>(im1_shape.width) / static_cast<float>(im0_shape.width));
89 |         pad_x = (im1_shape.width - im0_shape.width * gain) / 2.0f;
90 |         pad_y = (im1_shape.height - im0_shape.height * gain) / 2.0f;
91 |     }
92 |     else {
93 |         gain = ratio_pad.first;
94 |         pad_x = ratio_pad.second.x;
95 |         pad_y = ratio_pad.second.y;
96 |     }
97 | 
98 |     int top = static_cast<int>(pad_y);
99 |     int left = static_cast<int>(pad_x);
100 |     int bottom = static_cast<int>(im1_shape.height - pad_y);
101 |     int right = static_cast<int>(im1_shape.width - pad_x);
102 | 
103 |     // Clip and resize the mask
104 |     cv::Rect clipped_rect(left, top, right - left, bottom - top);
105 |     cv::Mat clipped_mask = resized_mask(clipped_rect);
106 |     cv::Mat scaled_mask;
107 |     cv::resize(clipped_mask, scaled_mask, im0_shape);
108 | 
109 |     return scaled_mask;
110 | }
111 | 
112 | 
113 | void scale_image2(cv::Mat& scaled_mask, const cv::Mat& resized_mask, const cv::Size& im0_shape, const std::pair<float, cv::Point2f>& ratio_pad) {
114 |     cv::Size im1_shape = resized_mask.size();
115 | 
116 |     // Check if resizing is needed
117 |     if (im1_shape == im0_shape) {
118 |         scaled_mask = resized_mask.clone();
119 |         return;
120 |     }
121 | 
122 |     float gain, pad_x, pad_y;
123 | 
124 |     if (ratio_pad.first < 0.0f) {
125 |         gain = std::min(static_cast<float>(im1_shape.height) / static_cast<float>(im0_shape.height),
126 |                         static_cast<float>(im1_shape.width) / static_cast<float>(im0_shape.width));
127 |         pad_x = (im1_shape.width - im0_shape.width * gain) / 2.0f;
128 |         pad_y = (im1_shape.height - im0_shape.height * gain) / 2.0f;
129 |     }
130 |     else {
131 |         gain = ratio_pad.first;
132 |         pad_x = ratio_pad.second.x;
133 |         pad_y = ratio_pad.second.y;
134 |     }
135 | 
136 |     int top = static_cast<int>(pad_y);
137 |     int left = static_cast<int>(pad_x);
138 |     int bottom = static_cast<int>(im1_shape.height - pad_y);
139 |     int right = static_cast<int>(im1_shape.width - pad_x);
140 | 
141 |     // Clip and resize the mask
142 |     cv::Rect clipped_rect(left, top, right - left, bottom - top);
143 |     cv::Mat clipped_mask = resized_mask(clipped_rect);
144 |     cv::resize(clipped_mask, scaled_mask, im0_shape);
145 | }
146 | 
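A worked example of the padding arithmetic in letterbox: resizing a 1280x720 frame to 640x640 with auto_=false gives r = min(640/720, 640/1280) = 0.5, so the frame is resized to 640x360 and dh = 280 is split into 140 rows of value-114 padding on top and bottom. The zero frame is just a stand-in input:

    cv::Mat frame = cv::Mat::zeros(720, 1280, CV_8UC3);  // stand-in 1280x720 input
    cv::Mat out;
    letterbox(frame, out, cv::Size(640, 640), cv::Scalar(), /*auto_=*/false,
              /*scaleFill=*/false, /*scaleUp=*/true, /*stride=*/32);
    // out.size() == cv::Size(640, 640), with 140-pixel gray bands top and bottom
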
--------------------------------------------------------------------------------
/src/utils/common.cpp:
--------------------------------------------------------------------------------
1 | #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
2 | #include <codecvt>
3 | #include <locale>
4 | #include <string>
5 | #include "utils/common.h"
6 | 
7 | #include <regex>
8 | //#include <boost/algorithm/string.hpp>
9 | 
10 | #include <sstream>
11 | #include <algorithm>
12 | 
13 | #include <vector>
14 | 
15 | #include <chrono>
16 | #include <iostream>
17 | 
18 | 
19 | 
20 | Timer::Timer(double& accumulator, bool isEnabled)
21 |     : accumulator(accumulator), isEnabled(isEnabled) {
22 |     if (isEnabled) {
23 |         start = std::chrono::high_resolution_clock::now();
24 |     }
25 | }
26 | 
27 | // Stop the timer and update the accumulator
28 | void Timer::Stop() {
29 |     if (isEnabled) {
30 |         auto end = std::chrono::high_resolution_clock::now();
31 |         double duration = std::chrono::duration<double>(end - start).count();
32 |         accumulator += duration;
33 |     }
34 | }
35 | 
36 | // C++14 version
37 | //#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
38 | std::wstring get_win_path(const std::string& modelPath) {
39 | #ifdef _WIN32
40 |     return std::wstring_convert<std::codecvt_utf8<wchar_t>>().from_bytes(modelPath);
41 | #else
42 |     // return modelPath;
43 |     return std::wstring(modelPath.begin(), modelPath.end());
44 | #endif
45 | }
46 | 
47 | 
48 | std::vector<std::string> parseVectorString(const std::string& input) {
49 |     /* The main purpose of this function is to parse the `imgsz` key of the model metadata
50 |      * and extract the height and width values from [height, width] as a vector of strings
51 |      * Args:
52 |      *   input:
53 |      *     expected to be something like [544, 960] or [3,544, 960]
54 |      * Returns:
55 |      *   a vector of strings, each representing an integer
56 |      */
57 |     std::regex number_pattern(R"(\d+)");
58 | 
59 |     std::vector<std::string> result;
60 |     std::sregex_iterator it(input.begin(), input.end(), number_pattern);
61 |     std::sregex_iterator end;
62 | 
63 |     while (it != end) {
64 |         result.push_back(it->str());
65 |         ++it;
66 |     }
67 | 
68 |     return result;
69 | }
70 | 
71 | std::vector<int> convertStringVectorToInts(const std::vector<std::string>& input) {
72 |     std::vector<int> result;
73 | 
74 |     for (const std::string& str : input) {
75 |         try {
76 |             int value = std::stoi(str);
77 |             result.push_back(value);
78 |         }
79 |         catch (const std::invalid_argument& e) {
80 |             // raise an explicit exception
81 |             throw std::invalid_argument("Bad argument (cannot cast): value=" + str);
82 |         }
83 |         catch (const std::out_of_range& e) {
84 |             // check bounds
85 |             throw std::out_of_range("Value out of range: " + str);
86 |         }
87 |     }
88 | 
89 |     return result;
90 | }
91 | 
92 | 
93 | /*
94 | std::unordered_map<int, std::string> parseNames(const std::string& input) {
95 |     std::unordered_map<int, std::string> result;
96 | 
97 |     std::string cleanedInput = input;
98 |     boost::erase_all(cleanedInput, "{");
99 |     boost::erase_all(cleanedInput, "}");
100 | 
101 |     std::vector<std::string> elements;
102 |     boost::split(elements, cleanedInput, boost::is_any_of(","));
103 | 
104 |     for (const std::string& element : elements) {
105 |         std::vector<std::string> keyValue;
106 |         boost::split(keyValue, element, boost::is_any_of(":"));
107 | 
108 |         if (keyValue.size() == 2) {
109 |             int key = std::stoi(boost::trim_copy(keyValue[0]));
110 |             std::string value = boost::trim_copy(keyValue[1]);
111 | 
112 |             result[key] = value;
113 |         }
114 |     }
115 | 
116 |     return result;
117 | }
118 | */
119 | 
120 | std::unordered_map<int, std::string> parseNames(const std::string& input) {
121 |     std::unordered_map<int, std::string> result;
122 | 
123 |     std::string cleanedInput = input;
124 |     cleanedInput.erase(std::remove(cleanedInput.begin(), cleanedInput.end(), '{'), cleanedInput.end());
125 |     cleanedInput.erase(std::remove(cleanedInput.begin(), cleanedInput.end(), '}'), cleanedInput.end());
126 | 
127 |     std::istringstream elementStream(cleanedInput);
128 |     std::string element;
129 |     while (std::getline(elementStream, element, ',')) {
130 |         std::istringstream keyValueStream(element);
131 |         std::string keyStr, value;
132 |         if (std::getline(keyValueStream, keyStr, ':') && std::getline(keyValueStream, value)) {
133 |             int key = std::stoi(keyStr);
134 |             result[key] = value;
135 |         }
136 |     }
137 | 
138 |     return result;
139 | }
140 | 
141 | int64_t vector_product(const std::vector<int64_t>& vec) {
142 |     int64_t result = 1;
143 |     for (int64_t value : vec) {
144 |         result *= value;
145 |     }
146 |     return result;
147 | }
148 | 
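Concrete inputs and outputs for the parsers above, with illustrative strings in the format Ultralytics exports:

    std::vector<int> hw = convertStringVectorToInts(parseVectorString("[640, 640]"));  // {640, 640}
    std::unordered_map<int, std::string> names = parseNames("{0: 'person', 1: 'bicycle'}");
    // note: values keep their surrounding spaces and quotes (e.g. " 'person'"),
    // since this parseNames variant does not trim them the way the boost version did
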
--------------------------------------------------------------------------------
/src/utils/ops.cpp:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | #include "utils/ops.h"
4 | 
5 | 
6 | 
7 | #include <opencv2/core.hpp>
8 | #include <opencv2/imgproc.hpp>
9 | #include <opencv2/dnn.hpp>
10 | #include <algorithm>
11 | 
12 | 
13 | 
14 | void clip_boxes(cv::Rect& box, const cv::Size& shape) {
15 |     box.x = std::max(0, std::min(box.x, shape.width));
16 |     box.y = std::max(0, std::min(box.y, shape.height));
17 |     box.width = std::max(0, std::min(box.width, shape.width - box.x));
18 |     box.height = std::max(0, std::min(box.height, shape.height - box.y));
19 | }
20 | 
21 | void clip_boxes(cv::Rect_<float>& box, const cv::Size& shape) {
22 |     box.x = std::max(0.0f, std::min(box.x, static_cast<float>(shape.width)));
23 |     box.y = std::max(0.0f, std::min(box.y, static_cast<float>(shape.height)));
24 |     box.width = std::max(0.0f, std::min(box.width, static_cast<float>(shape.width) - box.x));
25 |     box.height = std::max(0.0f, std::min(box.height, static_cast<float>(shape.height) - box.y));
26 | }
27 | 
28 | 
29 | void clip_boxes(std::vector<cv::Rect>& boxes, const cv::Size& shape) {
30 |     for (cv::Rect& box : boxes) {
31 |         clip_boxes(box, shape);
32 |     }
33 | }
34 | 
35 | void clip_boxes(std::vector<cv::Rect_<float>>& boxes, const cv::Size& shape) {
36 |     for (cv::Rect_<float>& box : boxes) {
37 |         clip_boxes(box, shape);
38 |     }
39 | }
40 | 
41 | // source: ultralytics/utils/ops.py scale_boxes lines 99+ (ultralytics==8.0.160)
42 | cv::Rect_<float> scale_boxes(const cv::Size& img1_shape, cv::Rect_<float>& box, const cv::Size& img0_shape,
43 |                              std::pair<float, cv::Point2f> ratio_pad = std::make_pair(-1.0f, cv::Point2f(-1.0f, -1.0f)), bool padding = true) {
44 | 
45 |     float gain, pad_x, pad_y;
46 | 
47 |     if (ratio_pad.first < 0.0f) {
48 |         gain = std::min(static_cast<float>(img1_shape.height) / static_cast<float>(img0_shape.height),
49 |                         static_cast<float>(img1_shape.width) / static_cast<float>(img0_shape.width));
50 |         pad_x = roundf((img1_shape.width - img0_shape.width * gain) / 2.0f - 0.1f);
51 |         pad_y = roundf((img1_shape.height - img0_shape.height * gain) / 2.0f - 0.1f);
52 |     }
53 |     else {
54 |         gain = ratio_pad.first;
55 |         pad_x = ratio_pad.second.x;
56 |         pad_y = ratio_pad.second.y;
57 |     }
58 | 
59 |     //cv::Rect scaledCoords(box);
60 |     cv::Rect_<float> scaledCoords(box);
61 | 
62 |     if (padding) {
63 |         scaledCoords.x -= pad_x;
64 |         scaledCoords.y -= pad_y;
65 |     }
66 | 
67 |     scaledCoords.x /= gain;
68 |     scaledCoords.y /= gain;
69 |     scaledCoords.width /= gain;
70 |     scaledCoords.height /= gain;
71 | 
72 |     // Clip the box to the bounds of the image
73 |     clip_boxes(scaledCoords, img0_shape);
74 | 
75 |     return scaledCoords;
76 | }
77 | 
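Continuing the 1280x720 -> 640x640 letterbox example from augment.cpp: here gain = 0.5 and (pad_x, pad_y) = (0, 140), so a model-space box maps back as ((x - 0) / 0.5, (y - 140) / 0.5, w / 0.5, h / 0.5):

    cv::Rect_<float> box(100.f, 200.f, 50.f, 80.f);  // in 640x640 letterboxed space
    cv::Rect_<float> orig = scale_boxes(cv::Size(640, 640), box, cv::Size(1280, 720));
    // orig is approximately (200, 120, 100, 160) in the original 1280x720 image
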
78 | 
79 | //void clip_coords(cv::Mat& coords, const cv::Size& shape) {
80 | //    // Clip x-coordinates to the image width
81 | //    cv::Mat xCoords = coords.col(0);
82 | //    cv::Mat yCoords = coords.col(1);
83 | //
84 | //    for (int i = 0; i < coords.rows; ++i) {
85 | //        xCoords.at<float>(i) = std::max(std::min(xCoords.at<float>(i), static_cast<float>(shape.width - 1)), 0.0f);
86 | //        yCoords.at<float>(i) = std::max(std::min(yCoords.at<float>(i), static_cast<float>(shape.height - 1)), 0.0f);
87 | //    }
88 | //}
89 | 
90 | void clip_coords(std::vector<float>& coords, const cv::Size& shape) {
91 |     // Assuming coords are of shape [1, 17, 3]
92 |     for (size_t i = 0; i < coords.size(); i += 3) {
93 |         coords[i] = std::min(std::max(coords[i], 0.0f), static_cast<float>(shape.width - 1));      // x
94 |         coords[i + 1] = std::min(std::max(coords[i + 1], 0.0f), static_cast<float>(shape.height - 1));  // y
95 |     }
96 | }
97 | 
98 | // source: ultralytics/utils/ops.py scale_coords lines 753+ (ultralytics==8.0.160)
99 | //cv::Mat scale_coords(const cv::Size& img1_shape, cv::Mat& coords, const cv::Size& img0_shape)
100 | //cv::Mat scale_coords(const cv::Size& img1_shape, std::vector<float> coords, const cv::Size& img0_shape)
101 | std::vector<float> scale_coords(const cv::Size& img1_shape, std::vector<float>& coords, const cv::Size& img0_shape)
102 | {
103 |     // cv::Mat scaledCoords = coords.clone();
104 |     std::vector<float> scaledCoords = coords;
105 | 
106 |     // Calculate gain and padding
107 |     double gain = std::min(static_cast<double>(img1_shape.width) / img0_shape.width, static_cast<double>(img1_shape.height) / img0_shape.height);
108 |     cv::Point2d pad((img1_shape.width - img0_shape.width * gain) / 2, (img1_shape.height - img0_shape.height * gain) / 2);
109 | 
110 |     // Apply padding
111 |     // scaledCoords.col(0) = (scaledCoords.col(0) - pad.x);
112 |     // scaledCoords.col(1) = (scaledCoords.col(1) - pad.y);
113 |     // Assuming coords are of shape [1, 17, 3]
114 |     for (size_t i = 0; i < scaledCoords.size(); i += 3) {
115 |         scaledCoords[i] -= pad.x;      // x padding
116 |         scaledCoords[i + 1] -= pad.y;  // y padding
117 |     }
118 | 
119 |     // Scale coordinates
120 |     // scaledCoords.col(0) /= gain;
121 |     // scaledCoords.col(1) /= gain;
122 |     // Assuming coords are of shape [1, 17, 3]
123 |     for (size_t i = 0; i < scaledCoords.size(); i += 3) {
124 |         scaledCoords[i] /= gain;
125 |         scaledCoords[i + 1] /= gain;
126 |     }
127 | 
128 |     clip_coords(scaledCoords, img0_shape);
129 |     return scaledCoords;
130 | }
131 | 
132 | 
133 | cv::Mat crop_mask(const cv::Mat& mask, const cv::Rect& box) {
134 |     int h = mask.rows;
135 |     int w = mask.cols;
136 | 
137 |     int x1 = box.x;
138 |     int y1 = box.y;
139 |     int x2 = box.x + box.width;
140 |     int y2 = box.y + box.height;
141 | 
142 |     cv::Mat cropped_mask = cv::Mat::zeros(h, w, mask.type());
143 | 
144 |     for (int r = 0; r < h; ++r) {
145 |         for (int c = 0; c < w; ++c) {
146 |             if (r >= y1 && r < y2 && c >= x1 && c < x2) {
147 |                 cropped_mask.at<float>(r, c) = mask.at<float>(r, c);
148 |             }
149 |         }
150 |     }
151 | 
152 |     return cropped_mask;
153 | }
154 | 
155 | //std::tuple<std::vector<cv::Rect_<float>>, std::vector<float>, std::vector<int>, std::vector<std::vector<float>>>
156 | std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>, std::vector<std::vector<float>>>
157 | non_max_suppression(const cv::Mat& output0, int class_names_num, int data_width, double conf_threshold,
158 |                     float iou_threshold) {
159 | 
160 |     std::vector<int> class_ids;
161 |     std::vector<float> confidences;
162 |     // std::vector<std::vector<float>> boxes;
163 |     std::vector<cv::Rect> boxes;
164 |     std::vector<std::vector<float>> rest;
165 | 
166 |     int rest_start_pos = class_names_num + 4;
167 |     int rest_features = data_width - rest_start_pos;
168 |     // int data_width = rest_start_pos + total_features_num;
169 | 
170 |     int rows = output0.rows;
171 |     float* pdata = (float*)output0.data;
172 | 
173 |     for (int r = 0; r < rows; ++r) {
174 |         cv::Mat scores(1, class_names_num, CV_32FC1, pdata + 4);
175 |         cv::Point class_id;
176 |         double max_conf;
177 |         minMaxLoc(scores, nullptr, &max_conf, nullptr, &class_id);
178 | 
179 |         if (max_conf > conf_threshold) {
180 | 
181 |             class_ids.push_back(class_id.x);
182 |             confidences.push_back((float)max_conf);
183 | 
184 |             float out_w = pdata[2];
185 |             float out_h = pdata[3];
186 |             float out_left = MAX((pdata[0] - 0.5 * out_w + 0.5), 0);
187 |             float out_top = MAX((pdata[1] - 0.5 * out_h + 0.5), 0);
188 |             cv::Rect_<float> bbox(out_left, out_top, (out_w + 0.5f), (out_h + 0.5f));
189 |             boxes.push_back(bbox);
190 |             if (rest_features > 0) {
191 |                 std::vector<float> rest_data(pdata + rest_start_pos, pdata + data_width);
192 |                 rest.push_back(rest_data);
193 |             }
194 |         }
195 |         pdata += data_width;  // next prediction
196 |     }
197 | 
198 |     //
199 |     //float masks_threshold = 0.50;
200 |     //int top_k = 500;
201 |     //const float& nms_eta = 1.0f;
202 |     std::vector<int> nms_result;
203 |     cv::dnn::NMSBoxes(boxes, confidences, conf_threshold, iou_threshold, nms_result);  // , nms_eta, top_k);
204 |     // cv::dnn::NMSBoxes(boxes, confidences, ...);
205 |     std::vector<int> nms_class_ids;
206 |     std::vector<float> nms_confidences;
207 |     // std::vector<std::vector<float>> boxes;
208 |     std::vector<cv::Rect> nms_boxes;
209 |     std::vector<std::vector<float>> nms_rest;
210 |     for (int idx : nms_result) {
211 |         nms_class_ids.push_back(class_ids[idx]);
212 |         nms_confidences.push_back(confidences[idx]);
213 |         nms_boxes.push_back(boxes[idx]);
214 |         if (!rest.empty()) nms_rest.push_back(rest[idx]);  // rest is empty when the head has no extra features
215 |     }
216 |     return std::make_tuple(nms_boxes, nms_confidences, nms_class_ids, nms_rest);
217 | }
--------------------------------------------------------------------------------