├── .gitignore
├── CMakeLists.txt
├── README.md
├── data
    ├── bus.jpg
    ├── bus.npy
    ├── yolov8n_full.onnx
    ├── yolov8n_no_tail.onnx
    └── yolov8n_no_tail.rknn
├── include
    ├── rknn_api.h
    ├── rknn_net.h
    ├── yolo_multi_npu.h
    └── yolov8_post.h
├── lib
    └── librknnrt.so
├── src
    ├── picture_demo.cc
    ├── rknn_net.cc
    ├── video_demo.cc
    └── yolov8_post.cc
└── tools
    ├── dataset.txt
    ├── modify_no_tail.py
    ├── onnx2rknn_export.py
    ├── yolov8_box_head.py
    └── yolov8_tail.py


/.gitignore:
--------------------------------------------------------------------------------
1 | /build
2 | /.vscode


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.4.1)
 2 | 
 3 | project(rknn_yolov8_demo)
 4 | 
 5 | include_directories(${CMAKE_SOURCE_DIR}/include)
 6 | 
 7 | set(RKNN_RT_LIB ${CMAKE_SOURCE_DIR}/lib/librknnrt.so)
 8 | 
 9 | set(OpenCV_DIR /root/lib/opencv454_install/lib/cmake/opencv4)
10 | find_package(OpenCV REQUIRED)
11 | 
12 | find_package(Threads REQUIRED) # find and include pthreads
13 | 
14 | add_executable(${PROJECT_NAME}_picture
15 |         src/picture_demo.cc
16 |         src/rknn_net.cc
17 |         src/yolov8_post.cc
18 | )
19 | 
20 | add_executable(${PROJECT_NAME}_video
21 |         src/video_demo.cc
22 |         src/rknn_net.cc
23 |         src/yolov8_post.cc
24 | )
25 | 
26 | target_link_libraries(${PROJECT_NAME}_picture
27 |   ${RKNN_RT_LIB}
28 |   ${OpenCV_LIBS}
29 | )
30 | 
31 | target_link_libraries(${PROJECT_NAME}_video
32 |   ${RKNN_RT_LIB}
33 |   ${OpenCV_LIBS}
34 |   Threads::Threads
35 | )


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Yolv8n RK3588 Demo :rocket:
 2 | 
 3 | This repository contains a demonstration of Yolv8n running on an RK3588 device.
 4 | 
 5 | ## How to Use
 6 | 
 7 | ### Configuring OpenCV Installation Directory
 8 | 
 9 | Edit CMakeList.txt to set your OpenCV installation directory:
10 | 
11 | ```
12 | set(OpenCV_DIR /root/lib/opencv454_install/lib/cmake/opencv4)
13 | ```
14 | 
15 | Replace the path with your own OpenCV installation directory.
16 | 
17 | ### Building the Project
18 | 
19 | ```shell
20 | mkdir build
21 | cd build
22 | cmake ..
23 | make
24 | ```
25 | 
26 | ### Running the Demo
27 | 
28 | **Inference from Images**
29 | 
30 | To run the demo using default image:
31 | 
32 | ```shell
33 | rknn_yolov8_demo_picture
34 | ```
35 | 
36 | **Inference from Video**
37 | 
38 | Video inference is currently under development :construction:. To run the demo, please install ffmpeg and rtsp server, modifiy the input_path in src/video_demo.cc and rtsp address.
39 | 
40 | ```
41 | rknn_yolov8_demo_video
42 | ```
43 | 
44 | ## Notes
45 | 
46 | The tools directory contains scripts for model conversion which are intended to be run on a host machine. These include:
47 | 
48 | - modify_no_tail.py
49 | 
50 |   This script removes the tail of the official exported Yolv8n model.
51 | 
52 | - onnx2rknn_export.py
53 | 
54 |   This script converts the modified ONNX model (with the head removed) to an RKNN model.


--------------------------------------------------------------------------------
/data/bus.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aemior/yolov8n_rk3588/6786a52230bd5022a9788e29d94217541586c9d8/data/bus.jpg


--------------------------------------------------------------------------------
/data/bus.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aemior/yolov8n_rk3588/6786a52230bd5022a9788e29d94217541586c9d8/data/bus.npy


--------------------------------------------------------------------------------
/data/yolov8n_full.onnx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aemior/yolov8n_rk3588/6786a52230bd5022a9788e29d94217541586c9d8/data/yolov8n_full.onnx


--------------------------------------------------------------------------------
/data/yolov8n_no_tail.onnx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aemior/yolov8n_rk3588/6786a52230bd5022a9788e29d94217541586c9d8/data/yolov8n_no_tail.onnx


--------------------------------------------------------------------------------
/data/yolov8n_no_tail.rknn:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aemior/yolov8n_rk3588/6786a52230bd5022a9788e29d94217541586c9d8/data/yolov8n_no_tail.rknn


--------------------------------------------------------------------------------
/include/rknn_api.h:
--------------------------------------------------------------------------------
  1 | /****************************************************************************
  2 | *
  3 | *    Copyright (c) 2017 - 2022 by Rockchip Corp.  All rights reserved.
  4 | *
  5 | *    The material in this file is confidential and contains trade secrets
  6 | *    of Rockchip Corporation. This is proprietary information owned by
  7 | *    Rockchip Corporation. No part of this work may be disclosed,
  8 | *    reproduced, copied, transmitted, or used in any way for any purpose,
  9 | *    without the express written permission of Rockchip Corporation.
 10 | *
 11 | *****************************************************************************/
 12 | 
 13 | 
 14 | #ifndef _RKNN_API_H
 15 | #define _RKNN_API_H
 16 | 
 17 | #ifdef __cplusplus
 18 | extern "C" {
 19 | #endif
 20 | 
 21 | #include <stdint.h>
 22 | 
 23 | /*
 24 |     Definition of extended flag for rknn_init.
 25 | */
 26 | /* set high priority context. */
 27 | #define RKNN_FLAG_PRIOR_HIGH                    0x00000000
 28 | 
 29 | /* set medium priority context */
 30 | #define RKNN_FLAG_PRIOR_MEDIUM                  0x00000001
 31 | 
 32 | /* set low priority context. */
 33 | #define RKNN_FLAG_PRIOR_LOW                     0x00000002
 34 | 
 35 | /* asynchronous mode.
 36 |    when enable, rknn_outputs_get will not block for too long because it directly retrieves the result of
 37 |    the previous frame which can increase the frame rate on single-threaded mode, but at the cost of
 38 |    rknn_outputs_get not retrieves the result of the current frame.
 39 |    in multi-threaded mode you do not need to turn this mode on. */
 40 | #define RKNN_FLAG_ASYNC_MASK                    0x00000004
 41 | 
 42 | /* collect performance mode.
 43 |    when enable, you can get detailed performance reports via rknn_query(ctx, RKNN_QUERY_PERF_DETAIL, ...),
 44 |    but it will reduce the frame rate. */
 45 | #define RKNN_FLAG_COLLECT_PERF_MASK             0x00000008
 46 | 
 47 | /* allocate all memory in outside, includes weight/internal/inputs/outputs */
 48 | #define RKNN_FLAG_MEM_ALLOC_OUTSIDE             0x00000010
 49 | 
 50 | /* weight sharing with the same network structure */
 51 | #define RKNN_FLAG_SHARE_WEIGHT_MEM              0x00000020
 52 | 
 53 | /* send fence fd from outside */
 54 | #define RKNN_FLAG_FENCE_IN_OUTSIDE              0x00000040
 55 | 
 56 | /* get fence fd from inside */
 57 | #define RKNN_FLAG_FENCE_OUT_OUTSIDE             0x00000080
 58 | 
 59 | /* dummy init flag: could only get total_weight_size and total_internal_size by rknn_query*/
 60 | #define RKNN_FLAG_COLLECT_MODEL_INFO_ONLY       0x00000100
 61 | 
 62 | /*
 63 |     Error code returned by the RKNN API.
 64 | */
 65 | #define RKNN_SUCC                               0       /* execute succeed. */
 66 | #define RKNN_ERR_FAIL                           -1      /* execute failed. */
 67 | #define RKNN_ERR_TIMEOUT                        -2      /* execute timeout. */
 68 | #define RKNN_ERR_DEVICE_UNAVAILABLE             -3      /* device is unavailable. */
 69 | #define RKNN_ERR_MALLOC_FAIL                    -4      /* memory malloc fail. */
 70 | #define RKNN_ERR_PARAM_INVALID                  -5      /* parameter is invalid. */
 71 | #define RKNN_ERR_MODEL_INVALID                  -6      /* model is invalid. */
 72 | #define RKNN_ERR_CTX_INVALID                    -7      /* context is invalid. */
 73 | #define RKNN_ERR_INPUT_INVALID                  -8      /* input is invalid. */
 74 | #define RKNN_ERR_OUTPUT_INVALID                 -9      /* output is invalid. */
 75 | #define RKNN_ERR_DEVICE_UNMATCH                 -10     /* the device is unmatch, please update rknn sdk
 76 |                                                            and npu driver/firmware. */
 77 | #define RKNN_ERR_INCOMPATILE_PRE_COMPILE_MODEL  -11     /* This RKNN model use pre_compile mode, but not compatible with current driver. */
 78 | #define RKNN_ERR_INCOMPATILE_OPTIMIZATION_LEVEL_VERSION  -12     /* This RKNN model set optimization level, but not compatible with current driver. */
 79 | #define RKNN_ERR_TARGET_PLATFORM_UNMATCH        -13     /* This RKNN model set target platform, but not compatible with current platform. */
 80 | 
 81 | /*
 82 |     Definition for tensor
 83 | */
 84 | #define RKNN_MAX_DIMS                           16      /* maximum dimension of tensor. */
 85 | #define RKNN_MAX_NUM_CHANNEL                    15      /* maximum channel number of input tensor. */
 86 | #define RKNN_MAX_NAME_LEN                       256     /* maximum name lenth of tensor. */
 87 | 
 88 | 
 89 | #ifdef __arm__
 90 | typedef uint32_t rknn_context;
 91 | #else
 92 | typedef uint64_t rknn_context;
 93 | #endif
 94 | 
 95 | 
 96 | /*
 97 |     The query command for rknn_query
 98 | */
 99 | typedef enum _rknn_query_cmd {
100 |     RKNN_QUERY_IN_OUT_NUM = 0,                              /* query the number of input & output tensor. */
101 |     RKNN_QUERY_INPUT_ATTR = 1,                              /* query the attribute of input tensor. */
102 |     RKNN_QUERY_OUTPUT_ATTR = 2,                             /* query the attribute of output tensor. */
103 |     RKNN_QUERY_PERF_DETAIL = 3,                             /* query the detail performance, need set
104 |                                                                RKNN_FLAG_COLLECT_PERF_MASK when call rknn_init,
105 |                                                                this query needs to be valid after rknn_outputs_get. */
106 |     RKNN_QUERY_PERF_RUN = 4,                                /* query the time of run,
107 |                                                                this query needs to be valid after rknn_outputs_get. */
108 |     RKNN_QUERY_SDK_VERSION = 5,                             /* query the sdk & driver version */
109 | 
110 |     RKNN_QUERY_MEM_SIZE = 6,                                /* query the weight & internal memory size */
111 |     RKNN_QUERY_CUSTOM_STRING = 7,                           /* query the custom string */
112 | 
113 |     RKNN_QUERY_NATIVE_INPUT_ATTR = 8,                       /* query the attribute of native input tensor. */
114 |     RKNN_QUERY_NATIVE_OUTPUT_ATTR = 9,                      /* query the attribute of native output tensor. */
115 | 
116 |     RKNN_QUERY_NATIVE_NC1HWC2_INPUT_ATTR = 8,               /* query the attribute of native input tensor. */
117 |     RKNN_QUERY_NATIVE_NC1HWC2_OUTPUT_ATTR = 9,              /* query the attribute of native output tensor. */
118 | 
119 |     RKNN_QUERY_NATIVE_NHWC_INPUT_ATTR = 10,                 /* query the attribute of native input tensor. */
120 |     RKNN_QUERY_NATIVE_NHWC_OUTPUT_ATTR = 11,                /* query the attribute of native output tensor. */
121 | 
122 |     RKNN_QUERY_DEVICE_MEM_INFO = 12,                        /* query the attribute of rknn memory information. */
123 | 
124 |     RKNN_QUERY_CMD_MAX
125 | } rknn_query_cmd;
126 | 
127 | /*
128 |     the tensor data type.
129 | */
130 | typedef enum _rknn_tensor_type {
131 |     RKNN_TENSOR_FLOAT32 = 0,                            /* data type is float32. */
132 |     RKNN_TENSOR_FLOAT16,                                /* data type is float16. */
133 |     RKNN_TENSOR_INT8,                                   /* data type is int8. */
134 |     RKNN_TENSOR_UINT8,                                  /* data type is uint8. */
135 |     RKNN_TENSOR_INT16,                                  /* data type is int16. */
136 |     RKNN_TENSOR_UINT16,                                 /* data type is uint16. */
137 |     RKNN_TENSOR_INT32,                                  /* data type is int32. */
138 |     RKNN_TENSOR_UINT32,                                 /* data type is uint32. */
139 |     RKNN_TENSOR_INT64,                                  /* data type is int64. */
140 |     RKNN_TENSOR_BOOL,
141 | 
142 |     RKNN_TENSOR_TYPE_MAX
143 | } rknn_tensor_type;
144 | 
145 | inline static const char* get_type_string(rknn_tensor_type type)
146 | {
147 |     switch(type) {
148 |     case RKNN_TENSOR_FLOAT32: return "FP32";
149 |     case RKNN_TENSOR_FLOAT16: return "FP16";
150 |     case RKNN_TENSOR_INT8: return "INT8";
151 |     case RKNN_TENSOR_UINT8: return "UINT8";
152 |     case RKNN_TENSOR_INT16: return "INT16";
153 |     case RKNN_TENSOR_UINT16: return "UINT16";
154 |     case RKNN_TENSOR_INT32: return "INT32";
155 |     case RKNN_TENSOR_UINT32: return "UINT32";
156 |     case RKNN_TENSOR_INT64: return "INT64";
157 |     case RKNN_TENSOR_BOOL: return "BOOL";
158 |     default: return "UNKNOW";
159 |     }
160 | }
161 | 
162 | /*
163 |     the quantitative type.
164 | */
165 | typedef enum _rknn_tensor_qnt_type {
166 |     RKNN_TENSOR_QNT_NONE = 0,                           /* none. */
167 |     RKNN_TENSOR_QNT_DFP,                                /* dynamic fixed point. */
168 |     RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC,                  /* asymmetric affine. */
169 | 
170 |     RKNN_TENSOR_QNT_MAX
171 | } rknn_tensor_qnt_type;
172 | 
173 | inline static const char* get_qnt_type_string(rknn_tensor_qnt_type type)
174 | {
175 |     switch(type) {
176 |     case RKNN_TENSOR_QNT_NONE: return "NONE";
177 |     case RKNN_TENSOR_QNT_DFP: return "DFP";
178 |     case RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC: return "AFFINE";
179 |     default: return "UNKNOW";
180 |     }
181 | }
182 | 
183 | /*
184 |     the tensor data format.
185 | */
186 | typedef enum _rknn_tensor_format {
187 |     RKNN_TENSOR_NCHW = 0,                               /* data format is NCHW. */
188 |     RKNN_TENSOR_NHWC,                                   /* data format is NHWC. */
189 |     RKNN_TENSOR_NC1HWC2,                                /* data format is NC1HWC2. */
190 |     RKNN_TENSOR_UNDEFINED,
191 | 
192 |     RKNN_TENSOR_FORMAT_MAX
193 | } rknn_tensor_format;
194 | 
195 | /*
196 |     the mode of running on target NPU core.
197 | */
198 | typedef enum _rknn_core_mask {
199 |     RKNN_NPU_CORE_AUTO = 0,                                       /* default, run on NPU core randomly. */
200 |     RKNN_NPU_CORE_0 = 1,                                          /* run on NPU core 0. */
201 |     RKNN_NPU_CORE_1 = 2,                                          /* run on NPU core 1. */
202 |     RKNN_NPU_CORE_2 = 4,                                          /* run on NPU core 2. */
203 |     RKNN_NPU_CORE_0_1 = RKNN_NPU_CORE_0 | RKNN_NPU_CORE_1,        /* run on NPU core 1 and core 2. */
204 |     RKNN_NPU_CORE_0_1_2 = RKNN_NPU_CORE_0_1 | RKNN_NPU_CORE_2,    /* run on NPU core 1 and core 2 and core 3. */
205 | 
206 |     RKNN_NPU_CORE_UNDEFINED,
207 | } rknn_core_mask;
208 | 
209 | inline static const char* get_format_string(rknn_tensor_format fmt)
210 | {
211 |     switch(fmt) {
212 |     case RKNN_TENSOR_NCHW: return "NCHW";
213 |     case RKNN_TENSOR_NHWC: return "NHWC";
214 |     case RKNN_TENSOR_NC1HWC2: return "NC1HWC2";
215 |     case RKNN_TENSOR_UNDEFINED: return "UNDEFINED";
216 |     default: return "UNKNOW";
217 |     }
218 | }
219 | 
220 | /*
221 |     the information for RKNN_QUERY_IN_OUT_NUM.
222 | */
223 | typedef struct _rknn_input_output_num {
224 |     uint32_t n_input;                                   /* the number of input. */
225 |     uint32_t n_output;                                  /* the number of output. */
226 | } rknn_input_output_num;
227 | 
228 | /*
229 |     the information for RKNN_QUERY_INPUT_ATTR / RKNN_QUERY_OUTPUT_ATTR.
230 | */
231 | typedef struct _rknn_tensor_attr {
232 |     uint32_t index;                                     /* input parameter, the index of input/output tensor,
233 |                                                            need set before call rknn_query. */
234 | 
235 |     uint32_t n_dims;                                    /* the number of dimensions. */
236 |     uint32_t dims[RKNN_MAX_DIMS];                       /* the dimensions array. */
237 |     char name[RKNN_MAX_NAME_LEN];                       /* the name of tensor. */
238 | 
239 |     uint32_t n_elems;                                   /* the number of elements. */
240 |     uint32_t size;                                      /* the bytes size of tensor. */
241 | 
242 |     rknn_tensor_format fmt;                             /* the data format of tensor. */
243 |     rknn_tensor_type type;                              /* the data type of tensor. */
244 |     rknn_tensor_qnt_type qnt_type;                      /* the quantitative type of tensor. */
245 |     int8_t fl;                                          /* fractional length for RKNN_TENSOR_QNT_DFP. */
246 |     int32_t zp;                                         /* zero point for RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC. */
247 |     float scale;                                        /* scale for RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC. */
248 | 
249 |     uint32_t w_stride;                                  /* the stride of tensor along the width dimention of input,
250 |                                                            Note: it is read-only, 0 means equal to width. */
251 |     uint32_t size_with_stride;                          /* the bytes size of tensor with stride. */
252 | 
253 |     uint8_t pass_through;                               /* pass through mode, for rknn_set_io_mem interface.
254 |                                                            if TRUE, the buf data is passed directly to the input node of the rknn model
255 |                                                                     without any conversion. the following variables do not need to be set.
256 |                                                            if FALSE, the buf data is converted into an input consistent with the model
257 |                                                                      according to the following type and fmt. so the following variables
258 |                                                                      need to be set.*/
259 |     uint32_t h_stride;                                  /* the stride along the height dimention of input,
260 |                                                            Note: it is write-only, if it was set to 0, h_stride = height. */
261 | } rknn_tensor_attr;
262 | 
263 | /*
264 |     the information for RKNN_QUERY_PERF_DETAIL.
265 | */
266 | typedef struct _rknn_perf_detail {
267 |     char* perf_data;                                    /* the string pointer of perf detail. don't need free it by user. */
268 |     uint64_t data_len;                                  /* the string length. */
269 | } rknn_perf_detail;
270 | 
271 | /*
272 |     the information for RKNN_QUERY_PERF_RUN.
273 | */
274 | typedef struct _rknn_perf_run {
275 |     int64_t run_duration;                               /* real inference time (us) */
276 | } rknn_perf_run;
277 | 
278 | /*
279 |     the information for RKNN_QUERY_SDK_VERSION.
280 | */
281 | typedef struct _rknn_sdk_version {
282 |     char api_version[256];                              /* the version of rknn api. */
283 |     char drv_version[256];                              /* the version of rknn driver. */
284 | } rknn_sdk_version;
285 | 
286 | /*
287 |     the information for RKNN_QUERY_MEM_SIZE.
288 | */
289 | typedef struct _rknn_mem_size {
290 |     uint32_t total_weight_size;                         /* the weight memory size */
291 |     uint32_t total_internal_size;                       /* the internal memory size, exclude inputs/outputs */
292 |     uint64_t total_dma_allocated_size;                  /* total dma memory allocated size */
293 |     uint32_t total_sram_size;                           /* total system sram size reserved for rknn */
294 |     uint32_t free_sram_size;                            /* free system sram size reserved for rknn */
295 |     uint32_t reserved[10];                              /* reserved */
296 | } rknn_mem_size;
297 | 
298 | /*
299 |     the information for RKNN_QUERY_CUSTOM_STRING.
300 | */
301 | typedef struct _rknn_custom_string {
302 |     char string[1024];                                  /* the string of custom, lengths max to 1024 bytes */
303 | } rknn_custom_string;
304 | 
305 | /*
306 |    The flags of rknn_tensor_mem.
307 | */
308 | typedef enum _rknn_tensor_mem_flags {
309 |     RKNN_TENSOR_MEMORY_FLAGS_ALLOC_INSIDE = 1,           /*Used to mark in rknn_destroy_mem() whether it is necessary to release the "mem" pointer itself.
310 |                                                          If the flag RKNN_TENSOR_MEMORY_FLAGS_ALLOC_INSIDE is set, rknn_destroy_mem() will call free(mem).*/
311 |     RKNN_TENSOR_MEMORY_FLAGS_FROM_FD      = 2,           /*Used to mark in rknn_create_mem_from_fd() whether it is necessary to release the "mem" pointer itself.
312 |                                                          If the flag RKNN_TENSOR_MEMORY_FLAGS_FROM_FD is set, rknn_destroy_mem() will call free(mem).*/
313 |     RKNN_TENSOR_MEMORY_FLAGS_FROM_PHYS    = 3,           /*Used to mark in rknn_create_mem_from_phys() whether it is necessary to release the "mem" pointer itself.
314 |                                                          If the flag RKNN_TENSOR_MEMORY_FLAGS_FROM_PHYS is set, rknn_destroy_mem() will call free(mem).*/
315 |     RKNN_TENSOR_MEMORY_FLAGS_UNKNOWN
316 | } rknn_tensor_mem_flags;
317 | 
318 | /*
319 |     the memory information of tensor.
320 | */
321 | typedef struct _rknn_tensor_memory {
322 |     void*            virt_addr;                         /* the virtual address of tensor buffer. */
323 |     uint64_t         phys_addr;                         /* the physical address of tensor buffer. */
324 |     int32_t          fd;                                /* the fd of tensor buffer. */
325 |     int32_t          offset;                            /* indicates the offset of the memory. */
326 |     uint32_t         size;                              /* the size of tensor buffer. */
327 |     uint32_t         flags;                             /* the flags of tensor buffer, reserved */
328 |     void *           priv_data;                         /* the private data of tensor buffer. */
329 | } rknn_tensor_mem;
330 | 
331 | /*
332 |     the input information for rknn_input_set.
333 | */
334 | typedef struct _rknn_input {
335 |     uint32_t index;                                     /* the input index. */
336 |     void* buf;                                          /* the input buf for index. */
337 |     uint32_t size;                                      /* the size of input buf. */
338 |     uint8_t pass_through;                               /* pass through mode.
339 |                                                            if TRUE, the buf data is passed directly to the input node of the rknn model
340 |                                                                     without any conversion. the following variables do not need to be set.
341 |                                                            if FALSE, the buf data is converted into an input consistent with the model
342 |                                                                      according to the following type and fmt. so the following variables
343 |                                                                      need to be set.*/
344 |     rknn_tensor_type type;                              /* the data type of input buf. */
345 |     rknn_tensor_format fmt;                             /* the data format of input buf.
346 |                                                            currently the internal input format of NPU is NCHW by default.
347 |                                                            so entering NCHW data can avoid the format conversion in the driver. */
348 | } rknn_input;
349 | 
350 | /*
351 |     the output information for rknn_outputs_get.
352 | */
353 | typedef struct _rknn_output {
354 |     uint8_t want_float;                                 /* want transfer output data to float */
355 |     uint8_t is_prealloc;                                /* whether buf is pre-allocated.
356 |                                                            if TRUE, the following variables need to be set.
357 |                                                            if FALSE, the following variables do not need to be set. */
358 |     uint32_t index;                                     /* the output index. */
359 |     void* buf;                                          /* the output buf for index.
360 |                                                            when is_prealloc = FALSE and rknn_outputs_release called,
361 |                                                            this buf pointer will be free and don't use it anymore. */
362 |     uint32_t size;                                      /* the size of output buf. */
363 | } rknn_output;
364 | 
365 | /*
366 |     the extend information for rknn_init.
367 | */
368 | typedef struct _rknn_init_extend {
369 |     rknn_context ctx;                                    /* rknn context */
370 |     int32_t      real_model_offset;                      /* real rknn model file size, only valid when init context with rknn file path */
371 |     uint32_t     real_model_size;                        /* real rknn model file offset, only valid when init context with rknn file path */
372 |     uint8_t      reserved[120];                          /* reserved */
373 | } rknn_init_extend;
374 | 
375 | /*
376 |     the extend information for rknn_run.
377 | */
378 | typedef struct _rknn_run_extend {
379 |     uint64_t frame_id;                                  /* output parameter, indicate current frame id of run. */
380 |     int32_t non_block;                                  /* block flag of run, 0 is block else 1 is non block */
381 |     int32_t timeout_ms;                                 /* timeout for block mode, in milliseconds */
382 |     int32_t fence_fd;                                   /* fence fd from other unit */
383 | } rknn_run_extend;
384 | 
385 | /*
386 |     the extend information for rknn_outputs_get.
387 | */
388 | typedef struct _rknn_output_extend {
389 |     uint64_t frame_id;                                  /* output parameter, indicate the frame id of outputs, corresponds to
390 |                                                            struct rknn_run_extend.frame_id.*/
391 | } rknn_output_extend;
392 | 
393 | 
394 | /*  rknn_init
395 | 
396 |     initial the context and load the rknn model.
397 | 
398 |     input:
399 |         rknn_context* context       the pointer of context handle.
400 |         void* model                 if size > 0, pointer to the rknn model, if size = 0, filepath to the rknn model.
401 |         uint32_t size               the size of rknn model.
402 |         uint32_t flag               extend flag, see the define of RKNN_FLAG_XXX_XXX.
403 |         rknn_init_extend* extend    the extend information of init.
404 |     return:
405 |         int                         error code.
406 | */
407 | int rknn_init(rknn_context* context, void* model, uint32_t size, uint32_t flag, rknn_init_extend* extend);
408 | 
409 | /*  rknn_dup_context
410 | 
411 |     initial the context and load the rknn model.
412 | 
413 |     input:
414 |         rknn_context* context_in       the pointer of context in handle.
415 |         rknn_context* context_out      the pointer of context out handle.
416 |     return:
417 |         int                         error code.
418 | */
419 | int rknn_dup_context(rknn_context* context_in, rknn_context* context_out);
420 | 
421 | /*  rknn_destroy
422 | 
423 |     unload the rknn model and destroy the context.
424 | 
425 |     input:
426 |         rknn_context context        the handle of context.
427 |     return:
428 |         int                         error code.
429 | */
430 | int rknn_destroy(rknn_context context);
431 | 
432 | 
433 | /*  rknn_query
434 | 
435 |     query the information about model or others. see rknn_query_cmd.
436 | 
437 |     input:
438 |         rknn_context context        the handle of context.
439 |         rknn_query_cmd cmd          the command of query.
440 |         void* info                  the buffer point of information.
441 |         uint32_t size               the size of information.
442 |     return:
443 |         int                         error code.
444 | */
445 | int rknn_query(rknn_context context, rknn_query_cmd cmd, void* info, uint32_t size);
446 | 
447 | 
448 | /*  rknn_inputs_set
449 | 
450 |     set inputs information by input index of rknn model.
451 |     inputs information see rknn_input.
452 | 
453 |     input:
454 |         rknn_context context        the handle of context.
455 |         uint32_t n_inputs           the number of inputs.
456 |         rknn_input inputs[]         the arrays of inputs information, see rknn_input.
457 |     return:
458 |         int                         error code
459 | */
460 | int rknn_inputs_set(rknn_context context, uint32_t n_inputs, rknn_input inputs[]);
461 | 
462 | /*
463 |     rknn_set_batch_core_num
464 | 
465 |     set rknn batch core_num.
466 | 
467 |     input:
468 |         rknn_context context        the handle of context.
469 |         int core_num                the core number.
470 |     return:
471 |         int                         error code.
472 | 
473 | */
474 | int rknn_set_batch_core_num(rknn_context context, int core_num);
475 | 
476 | /*  rknn_set_core_mask
477 | 
478 |     set rknn core mask.(only supported on RK3588 now)
479 | 
480 |     RKNN_NPU_CORE_AUTO: auto mode, default value
481 |     RKNN_NPU_CORE_0: core 0 mode
482 |     RKNN_NPU_CORE_1: core 1 mode
483 |     RKNN_NPU_CORE_2: core 2 mode
484 |     RKNN_NPU_CORE_0_1: combine core 0/1 mode
485 |     RKNN_NPU_CORE_0_1_2: combine core 0/1/2 mode
486 | 
487 |     input:
488 |         rknn_context context        the handle of context.
489 |         rknn_core_mask core_mask    the core mask.
490 |     return:
491 |         int                         error code.
492 | */
493 | int rknn_set_core_mask(rknn_context context, rknn_core_mask core_mask);
494 | 
495 | /*  rknn_run
496 | 
497 |     run the model to execute inference.
498 | 
499 |     input:
500 |         rknn_context context        the handle of context.
501 |         rknn_run_extend* extend     the extend information of run.
502 |     return:
503 |         int                         error code.
504 | */
505 | int rknn_run(rknn_context context, rknn_run_extend* extend);
506 | 
507 | 
508 | /*  rknn_wait
509 | 
510 |     wait the model after execute inference.
511 | 
512 |     input:
513 |         rknn_context context        the handle of context.
514 |         rknn_run_extend* extend     the extend information of run.
515 |     return:
516 |         int                         error code.
517 | */
518 | int rknn_wait(rknn_context context, rknn_run_extend* extend);
519 | 
520 | 
521 | /*  rknn_outputs_get
522 | 
523 |     wait the inference to finish and get the outputs.
524 |     this function will block until inference finish.
525 |     the results will set to outputs[].
526 | 
527 |     input:
528 |         rknn_context context        the handle of context.
529 |         uint32_t n_outputs          the number of outputs.
530 |         rknn_output outputs[]       the arrays of output, see rknn_output.
531 |         rknn_output_extend*         the extend information of output.
532 |     return:
533 |         int                         error code.
534 | */
535 | int rknn_outputs_get(rknn_context context, uint32_t n_outputs, rknn_output outputs[], rknn_output_extend* extend);
536 | 
537 | 
538 | /*  rknn_outputs_release
539 | 
540 |     release the outputs that get by rknn_outputs_get.
541 |     after called, the rknn_output[x].buf get from rknn_outputs_get will
542 |     also be free when rknn_output[x].is_prealloc = FALSE.
543 | 
544 |     input:
545 |         rknn_context context        the handle of context.
546 |         uint32_t n_ouputs           the number of outputs.
547 |         rknn_output outputs[]       the arrays of output.
548 |     return:
549 |         int                         error code
550 | */
551 | int rknn_outputs_release(rknn_context context, uint32_t n_ouputs, rknn_output outputs[]);
552 | 
553 | 
554 | /* new api for zero copy */
555 | 
556 | /*  rknn_create_mem_from_phys (memory allocated outside)
557 | 
558 |     initialize tensor memory from physical address.
559 | 
560 |     input:
561 |         rknn_context ctx            the handle of context.
562 |         uint64_t phys_addr          physical address.
563 |         void *virt_addr             virtual address.
564 |         uint32_t size               the size of tensor buffer.
565 |     return:
566 |         rknn_tensor_mem             the pointer of tensor memory information.
567 | */
568 | rknn_tensor_mem* rknn_create_mem_from_phys(rknn_context ctx, uint64_t phys_addr, void *virt_addr, uint32_t size);
569 | 
570 | 
571 | /*  rknn_create_mem_from_fd (memory allocated outside)
572 | 
573 |     initialize tensor memory from file description.
574 | 
575 |     input:
576 |         rknn_context ctx            the handle of context.
577 |         int32_t fd                  file description.
578 |         void *virt_addr             virtual address.
579 |         uint32_t size               the size of tensor buffer.
580 |         int32_t offset              indicates the offset of the memory (virt_addr without offset).
581 |     return:
582 |         rknn_tensor_mem             the pointer of tensor memory information.
583 | */
584 | rknn_tensor_mem* rknn_create_mem_from_fd(rknn_context ctx, int32_t fd, void *virt_addr, uint32_t size, int32_t offset);
585 | 
586 | 
587 | /*  rknn_create_mem_from_mb_blk (memory allocated outside)
588 | 
589 |     create tensor memory from mb_blk.
590 | 
591 |     input:
592 |         rknn_context ctx            the handle of context.
593 |         void *mb_blk                mb_blk allocate from system api.
594 |         int32_t offset              indicates the offset of the memory.
595 |     return:
596 |         rknn_tensor_mem             the pointer of tensor memory information.
597 | */
598 | rknn_tensor_mem* rknn_create_mem_from_mb_blk(rknn_context ctx, void *mb_blk, int32_t offset);
599 | 
600 | 
601 | /*  rknn_create_mem (memory allocated inside)
602 | 
603 |     create tensor memory.
604 | 
605 |     input:
606 |         rknn_context ctx            the handle of context.
607 |         uint32_t size               the size of tensor buffer.
608 |     return:
609 |         rknn_tensor_mem             the pointer of tensor memory information.
610 | */
611 | rknn_tensor_mem* rknn_create_mem(rknn_context ctx, uint32_t size);
612 | 
613 | 
614 | /*  rknn_destroy_mem (support allocate inside and outside)
615 | 
616 |     destroy tensor memory.
617 | 
618 |     input:
619 |         rknn_context ctx            the handle of context.
620 |         rknn_tensor_mem *mem        the pointer of tensor memory information.
621 |     return:
622 |         int                         error code
623 | */
624 | int rknn_destroy_mem(rknn_context ctx, rknn_tensor_mem *mem);
625 | 
626 | 
627 | /*  rknn_set_weight_mem
628 | 
629 |     set the weight memory.
630 | 
631 |     input:
632 |         rknn_context ctx            the handle of context.
633 |         rknn_tensor_mem *mem        the array of tensor memory information
634 |     return:
635 |         int                         error code.
636 | */
637 | int rknn_set_weight_mem(rknn_context ctx, rknn_tensor_mem *mem);
638 | 
639 | 
640 | /*  rknn_set_internal_mem
641 | 
642 |     set the internal memory.
643 | 
644 |     input:
645 |         rknn_context ctx            the handle of context.
646 |         rknn_tensor_mem *mem        the array of tensor memory information
647 |     return:
648 |         int                         error code.
649 | */
650 | int rknn_set_internal_mem(rknn_context ctx, rknn_tensor_mem *mem);
651 | 
652 | 
653 | /*  rknn_set_io_mem
654 | 
655 |     set the input and output tensors buffer.
656 | 
657 |     input:
658 |         rknn_context ctx            the handle of context.
659 |         rknn_tensor_mem *mem        the array of tensor memory information.
660 |         rknn_tensor_attr *attr      the attribute of input or output tensor buffer.
661 |     return:
662 |         int                         error code.
663 | */
664 | int rknn_set_io_mem(rknn_context ctx, rknn_tensor_mem *mem, rknn_tensor_attr *attr);
665 | 
666 | 
667 | #ifdef __cplusplus
668 | } //extern "C"
669 | #endif
670 | 
671 | #endif  //_RKNN_API_H
672 | 


--------------------------------------------------------------------------------
/include/rknn_net.h:
--------------------------------------------------------------------------------
 1 | #ifndef RKNN_NET_H
 2 | #define RKNN_NET_H
 3 | 
 4 | #include "rknn_api.h"
 5 | #include <stddef.h> // or <stdlib.h> for size_t
 6 | #include <stdint.h> // for uint8_t
 7 | #include <stdlib.h>
 8 | 
 9 | 
10 | // Define the struct
11 | typedef struct rknn_net {
12 |     rknn_context* ctx;
13 |     int debug_flag;
14 |     uint8_t* model;
15 |     size_t model_size;
16 |     rknn_tensor_attr* input_attrs;
17 |     rknn_tensor_attr* output_attrs;
18 |     int num_input;
19 |     int num_output;
20 |     int input_height;
21 |     int input_width;
22 |     int input_channel;
23 | } rknn_net;
24 | 
25 | 
26 | // Function prototypes
27 | rknn_net* rknn_net_create(const char* model_path, int target_npu, int debug_flag);
28 | int rknn_net_inference(rknn_net* net, int8_t* input, float** output);
29 | void rknn_net_destroy(rknn_net* net);
30 | static float deqnt_affine_to_f32(int8_t qnt, int32_t zp, float scale) { return ((float)qnt - (float)zp) * scale; }
31 | 
32 | #endif // RKNN_NET_H
33 | 


--------------------------------------------------------------------------------
/include/yolo_multi_npu.h:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <pthread.h>
  4 | #include <string.h>
  5 | #include <opencv2/highgui.hpp>
  6 | #include <opencv2/imgproc.hpp>
  7 | #include <opencv2/imgcodecs.hpp>
  8 | #include "yolov8_post.h"
  9 | #include "rknn_net.h"
 10 | #include <queue>
 11 | #include <mutex>
 12 | #include <condition_variable>
 13 | #include <iostream>
 14 | 
 15 | template <typename T>
 16 | class ThreadSafeQueue {
 17 | private:
 18 |     std::queue<T> queue;
 19 |     mutable std::mutex mutex;
 20 |     std::condition_variable cond_empty;
 21 |     std::condition_variable cond_full;
 22 |     size_t max_size;
 23 | 
 24 | public:
 25 |     ThreadSafeQueue(size_t max_size) : max_size(max_size) {}
 26 | 
 27 |     void enqueue(T t) {
 28 |         std::unique_lock<std::mutex> lock(mutex);
 29 |         cond_full.wait(lock, [&](){ return queue.size() < max_size; });
 30 |         queue.push(t);
 31 |         cond_empty.notify_one();
 32 |     }
 33 | 
 34 |     T dequeue() {
 35 |         std::unique_lock<std::mutex> lock(mutex);
 36 |         cond_empty.wait(lock, [&](){ return !queue.empty(); });
 37 |         T val = queue.front();
 38 |         queue.pop();
 39 |         cond_full.notify_one();
 40 |         return val;
 41 |     }
 42 | };
 43 | 
 44 | 
 45 | #define BUFFER_SIZE 150
 46 | #define ELEMENT_SIZE 3*640*640
 47 | 
 48 | typedef struct {
 49 |     int8_t* data[BUFFER_SIZE];
 50 |     int start; // Index of the oldest element
 51 |     int end;   // Index at which to write new element
 52 |     pthread_mutex_t lock;
 53 |     pthread_cond_t notEmpty; // Signaled when the buffer is not empty
 54 |     pthread_cond_t notFull;  // Signaled when the buffer is not full
 55 |     bool stop;  // Flag to stop the thread
 56 | } CircularBuffer;
 57 | 
 58 | typedef struct {
 59 |     cv::VideoCapture* cam;
 60 |     CircularBuffer* output;    
 61 | } video_read_arg;
 62 | 
 63 | typedef struct {
 64 |     CircularBuffer* input;    
 65 |     CircularBuffer* output;    
 66 |     int* target_npu;
 67 |     std::string* model_path;
 68 | } yolov8_npu_arg;
 69 | 
 70 | void initialize(CircularBuffer *buffer) {
 71 |     buffer->start = 0;
 72 |     buffer->end = 0;
 73 |     for (int i=0; i< BUFFER_SIZE;++i) {
 74 |         buffer->data[i] = (int8_t*)malloc(ELEMENT_SIZE * sizeof(int8_t));
 75 |     }
 76 |     pthread_mutex_init(&buffer->lock, NULL);
 77 |     pthread_cond_init(&buffer->notEmpty, NULL);
 78 |     pthread_cond_init(&buffer->notFull, NULL);
 79 |     buffer->stop = false;
 80 | }
 81 | 
 82 | void enqueue(CircularBuffer *buffer, int8_t* input) {
 83 |     pthread_mutex_lock(&buffer->lock);
 84 |     while ((buffer->end + 1) % BUFFER_SIZE == buffer->start) { // Buffer is full
 85 |         pthread_cond_wait(&buffer->notFull, &buffer->lock);   // Wait until not full
 86 |         if(buffer->stop) {
 87 |             pthread_mutex_unlock(&buffer->lock);
 88 |             buffer->end = 1;
 89 |             buffer->start = 0;
 90 |             return;
 91 |         }
 92 |     }
 93 |     if(buffer->stop) {
 94 |         pthread_mutex_unlock(&buffer->lock);
 95 |         return;
 96 |     }
 97 |     memcpy(buffer->data[buffer->end], input, ELEMENT_SIZE * sizeof(int8_t));
 98 |     buffer->end = (buffer->end + 1) % BUFFER_SIZE;
 99 |     pthread_cond_signal(&buffer->notEmpty); // Signal that the buffer is not empty
100 |     pthread_mutex_unlock(&buffer->lock);
101 | }
102 | 
103 | void dequeue(CircularBuffer *buffer, int8_t* output) {
104 |     pthread_mutex_lock(&buffer->lock);
105 |     while (buffer->end == buffer->start) { // Buffer is empty
106 |         pthread_cond_wait(&buffer->notEmpty, &buffer->lock); // Wait until not empty
107 |         if(buffer->stop) {
108 |             pthread_mutex_unlock(&buffer->lock);
109 |             buffer->end = 1;
110 |             buffer->start = 0;
111 |             return;
112 |         }
113 |     }
114 |     if(buffer->stop) {
115 |         pthread_mutex_unlock(&buffer->lock);
116 |         return;
117 |     }
118 |     memcpy(output, buffer->data[buffer->start], ELEMENT_SIZE * sizeof(int8_t));
119 |     buffer->start = (buffer->start + 1) % BUFFER_SIZE;
120 |     pthread_cond_signal(&buffer->notFull); // Signal that the buffer is not full
121 |     pthread_mutex_unlock(&buffer->lock);
122 | }
123 | 
124 | void release(CircularBuffer *buffer) {
125 |     for (int i=0; i<BUFFER_SIZE; ++i) {
126 |         free(buffer->data[i]);
127 |     }
128 |     free(buffer);
129 | }
130 | 
131 | void* video_read_thread(void* args) {
132 |     video_read_arg* video_args = (video_read_arg*)args;
133 |     cv::VideoCapture* cam = video_args->cam;
134 |     CircularBuffer* output_buffer = video_args->output;
135 |     cv::Mat frame;
136 |     fprintf(stdout, "Video Thread start!\n");
137 |     while (true)
138 |     {
139 |         *cam >> frame;
140 |         if(frame.empty())
141 |             break;
142 |         // Resize the image to 640x640
143 |         cv::resize(frame, frame, cv::Size(640, 640));
144 |         enqueue(output_buffer, (int8_t*)frame.data);
145 |         if(output_buffer->stop) break;
146 |     }
147 |     fprintf(stdout, "Video Thread stop!\n");
148 |     output_buffer->stop = true;
149 |     pthread_cond_signal(&output_buffer->notEmpty); // Signal that the buffer is not empty
150 |     return NULL;
151 | }
152 | 
153 | void* yolov8_npu_thread(void* args) {
154 |     yolov8_npu_arg* yolo_args = (yolov8_npu_arg*)args;
155 |     std::string* model_path = yolo_args->model_path;
156 |     int* target_npu = yolo_args->target_npu;
157 |     CircularBuffer* input_buffer = yolo_args->input;
158 |     CircularBuffer* output_buffer = yolo_args->output;
159 | 
160 |     rknn_net* yolov8 = rknn_net_create((*model_path).c_str(), *target_npu, false);
161 | 
162 |     int8_t* input;
163 |     input = (int8_t*)malloc(3*640*640*sizeof(int8_t));
164 | 
165 |     float* outputs[6];
166 |     outputs[0] = (float*)malloc(1*80*400 * sizeof(float));
167 |     outputs[1] = (float*)malloc(1*64*400 * sizeof(float));
168 |     outputs[2] = (float*)malloc(1*80*1600 * sizeof(float));
169 |     outputs[3] = (float*)malloc(1*64*1600 * sizeof(float));
170 |     outputs[4] = (float*)malloc(1*80*6400 * sizeof(float));
171 |     outputs[5] = (float*)malloc(1*64*6400 * sizeof(float));
172 | 
173 |     fprintf(stdout, "NPU Thread start, NPU:%d\n", *yolo_args->target_npu);
174 |     while (true)
175 |     {
176 |         dequeue(input_buffer, input);
177 |         if(input_buffer->stop) break;
178 |         rknn_net_inference(yolov8, input, outputs);
179 |         std::vector<DetectionResult> results = yolov8_tail_post_process(outputs, 80, 1.0, 1.0, 0.3, 0.4, false);
180 |         cv::Mat output_img(640, 640, CV_8UC3, (uint8_t*)input);
181 |         yolov8_draw_result(results, output_img, coco_classes, 80);
182 |         enqueue(output_buffer, (int8_t*)output_img.data);
183 |     }
184 |     rknn_net_destroy(yolov8);
185 |     for (int i=0; i<6; ++i) {
186 |         free(outputs[i]);
187 |     }
188 |     output_buffer->stop = true;
189 |     pthread_cond_signal(&output_buffer->notEmpty); // Signal that the buffer is not empty
190 |     fprintf(stdout,"NPU Thread stop, NPU:%d\n", *yolo_args->target_npu);
191 |     return NULL;
192 | }


--------------------------------------------------------------------------------
/include/yolov8_post.h:
--------------------------------------------------------------------------------
 1 | #ifndef YOLOV8_POST_H
 2 | #define YOLOV8_POST_H
 3 | #include <stddef.h> // or <stdlib.h> for size_t
 4 | #include <stdint.h> // for uint8_t
 5 | #include <stdlib.h>
 6 | #include <vector>
 7 | #include <opencv2/dnn.hpp>
 8 | #include <opencv2/core.hpp>
 9 | #include <opencv2/highgui.hpp>
10 | #include <opencv2/imgproc.hpp>
11 | 
12 | typedef struct {
13 |     float score;
14 |     int cls;
15 |     int x;
16 |     int y;
17 |     int w;
18 |     int h;
19 | } DetectionResult;
20 | 
21 | const std::string coco_classes[] = {"person", "bicycle", "car", "motorbike ", "aeroplane ", "bus ", "train", "truck ", "boat", "traffic light",
22 |            "fire hydrant", "stop sign ", "parking meter", "bench", "bird", "cat", "dog ", "horse ", "sheep", "cow", "elephant",
23 |            "bear", "zebra ", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite",
24 |            "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife ",
25 |            "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza ", "donut", "cake", "chair", "sofa",
26 |            "pottedplant", "bed", "diningtable", "toilet ", "tvmonitor", "laptop ", "mouse   ", "remote ", "keyboard ", "cell phone", "microwave ",
27 |            "oven ", "toaster", "sink", "refrigerator ", "book", "clock", "vase", "scissors ", "teddy bear ", "hair drier", "toothbrush"};
28 | 
29 | /** @brief Performs tail process for yolov8(no tail, just like dfl).
30 | 
31 | * @param input input[0] is the buffer of confidence of layer_20x20 with shape (1xclass_numx20x20),
32 | *              intpu[1] is the buffer of bbox of layer_20x20 with shape (1x64x20x20),
33 | *              input[2] is the buffer of confidence of layer_40x40 with shape (1xclass_numx40x40),
34 | *              intpu[3] is the buffer of bbox of layer_40x40 with shape (1x64x40x40),
35 | *              input[4] is the buffer of confidence of layer_80x80 with shape (1xclass_numx80x80),
36 | *              intpu[5] is the buffer of bbox of layer_80x80 with shape (1x64x80x80),
37 | * @param output output[0] is the buffer of dlf result with shape (1x8400x4),
38 | *              output[1] is the buffer of the confidence with shape (1x8400xclass_num).
39 | * @param class_num the total number of classes;
40 | */
41 | void yolov8_tail_process(float** input, float** output, int class_num);
42 | 
43 | /** @brief Performs post process for yolov8(no bbox head).
44 | 
45 | * @param input input[0] is the buffer of dlf result with shape (1x8400x4),
46 | *              intpu[1] is the buffer of the confidence with shape (1x8400xclass_num).
47 | * @param class_num the total number of classes;
48 | * @param scale_x the scale of width from origin img compare to 640 equal to origin_img.width/640.
49 | * @param scale_y the scale of height from origin img compare to 640 equal to origin_img.height/640.
50 | * @param score_threshold a threshold used to filter boxes by score.
51 | * @param nms_threshold a threshold used in non maximum suppression.
52 | * @param debug_flat if print the debug imformation.
53 | */
54 | std::vector<DetectionResult> yolov8_post_process(float** input, int class_num, float scale_x, float scale_y, float score_threshold, float nms_threshold, bool debug_flag);
55 | std::vector<DetectionResult> yolov8_tail_post_process(float** input, int class_num, float scale_x, float scale_y, float score_threshold, float nms_threshold, bool debug_flag);
56 | 
57 | /** @brief Draw the detection result to image.
58 | 
59 | * @param result detection results.
60 | * @param img image to draw.
61 | * @param class_namess the name of each class.
62 | * @param num_of_classes the total number of classes.
63 | */
64 | void yolov8_draw_result(std::vector<DetectionResult> results, cv::Mat &img, const std::string* class_names, int num_of_classes);
65 | 
66 | 
67 | 
68 | #endif // YOLOV8_POST_H


--------------------------------------------------------------------------------
/lib/librknnrt.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aemior/yolov8n_rk3588/6786a52230bd5022a9788e29d94217541586c9d8/lib/librknnrt.so


--------------------------------------------------------------------------------
/src/picture_demo.cc:
--------------------------------------------------------------------------------
  1 | #include <getopt.h>
  2 | #include "rknn_net.h"
  3 | #include "yolov8_post.h"
  4 | #include <stdio.h>
  5 | #include <stdlib.h>
  6 | #include <unistd.h>
  7 | #include <opencv2/core/core.hpp>
  8 | #include <opencv2/highgui/highgui.hpp>
  9 | #include <opencv2/imgproc.hpp>
 10 | #include <opencv2/imgcodecs.hpp>
 11 | #include <opencv2/dnn.hpp>
 12 | 
 13 | void print_help() {
 14 |     printf("Usage: your_program [OPTIONS]\n");
 15 |     printf("Options:\n");
 16 |     printf("\t-i, --input\t\tPath to the input image\n");
 17 |     printf("\t-m, --model\t\tPath to the model file\n");
 18 |     printf("\t-o, --output\t\tPath to the output image\n");
 19 |     printf("\t-d, --debug\t\tDebug flag (0 or 1)\n");
 20 |     printf("\t-s, --score_threshold\tScore threshold for filtering boxes\n");
 21 |     printf("\t-n, --nms_threshold\tNMS threshold\n");
 22 |     printf("\t-h, --help\t\tPrint this help message\n");
 23 | }
 24 | 
 25 | int main (int argc, char* argv[]) {
 26 |     // Define options
 27 |     const struct option long_options[] = {
 28 |         {"input",  required_argument, 0, 'i'},
 29 |         {"model",  required_argument, 0, 'm'},
 30 |         {"output", required_argument, 0, 'o'},
 31 |         {"debug",  required_argument, 0, 'd'},
 32 |         {"help",   no_argument,       0, 'h'},
 33 |         {"score_threshold", required_argument, 0, 's'},
 34 |         {"nms_threshold",   required_argument, 0, 'n'},
 35 |         {0, 0, 0, 0}
 36 |     };
 37 | 
 38 |     // Default parameters
 39 |     std::string input_path = "../data/bus.jpg";
 40 |     std::string model_path = "../data/yolov8n_no_tail.rknn";
 41 |     std::string output_path = "./debug.png";
 42 |     float score_threshold = 0.4;
 43 |     float nms_threshold = 0.5;
 44 |     bool debug_flag = true;
 45 | 
 46 |     //Temp var
 47 |     float scale_x, scale_y;
 48 | 
 49 |     // Parse options
 50 |     int opt = 0;
 51 |     while ((opt = getopt_long(argc, argv, "i:m:o:d:h", long_options, NULL)) != -1) {
 52 |         switch (opt) {
 53 |             case 'i':
 54 |                 input_path = optarg;
 55 |                 break;
 56 |             case 'm':
 57 |                 model_path = optarg;
 58 |                 break;
 59 |             case 'o':
 60 |                 output_path = optarg;
 61 |                 break;
 62 |             case 's':
 63 |                 score_threshold = atof(optarg);
 64 |                 break;
 65 |             case 'n':
 66 |                 nms_threshold = atof(optarg);
 67 |                 break;
 68 |             case 'd':
 69 |                 debug_flag = atoi(optarg);
 70 |                 break;
 71 |             case 'h':
 72 |             default:
 73 |                 print_help();
 74 |                 return 0;
 75 |         }
 76 |     }
 77 | 
 78 |     // Load the model
 79 |     rknn_net* yolov8 = rknn_net_create(model_path.c_str(), 0, false);
 80 |     
 81 |     // Read image
 82 |     cv::Mat img = cv::imread(input_path);
 83 |     scale_x = ((float)img.cols) / 640;
 84 |     scale_y = ((float)img.rows) / 640;
 85 |     cv::Mat output_img = img.clone();
 86 |     
 87 |     // Resize the image to 640x640
 88 |     cv::resize(img, img, cv::Size(640, 640));
 89 | 
 90 | 
 91 |     // Allocate data buffer to save the raw result from Neural net
 92 |     float* outputs[6];
 93 |     outputs[0] = (float*)malloc(1*80*400 * sizeof(float));
 94 |     outputs[1] = (float*)malloc(1*64*400 * sizeof(float));
 95 |     outputs[2] = (float*)malloc(1*80*1600 * sizeof(float));
 96 |     outputs[3] = (float*)malloc(1*64*1600 * sizeof(float));
 97 |     outputs[4] = (float*)malloc(1*80*6400 * sizeof(float));
 98 |     outputs[5] = (float*)malloc(1*64*6400 * sizeof(float));
 99 | 
100 |     // Do inference
101 |     rknn_net_inference(yolov8, (int8_t*)img.data, outputs);
102 | 
103 |     // Post process to get the detection result
104 |     std::vector<DetectionResult> results = yolov8_tail_post_process(outputs, 80, scale_x, scale_y, 0.4, 0.8, debug_flag);
105 | 
106 |     yolov8_draw_result(results, output_img, coco_classes, 80);
107 | 
108 |     // Write the image which have detection result
109 |     cv::imwrite(output_path, output_img);
110 | 
111 |     // Release the resource
112 |     rknn_net_destroy(yolov8);
113 |     
114 | 
115 |     return 0;
116 | }
117 | 
118 | 


--------------------------------------------------------------------------------
/src/rknn_net.cc:
--------------------------------------------------------------------------------
  1 | #include "rknn_net.h"
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <string.h>
  5 | #include <iostream>
  6 | #include <chrono>
  7 | 
  8 | class Timer {
  9 | public:
 10 |     Timer() : start_time(std::chrono::high_resolution_clock::now()) {}
 11 | 
 12 |     void reset() {
 13 |         start_time = std::chrono::high_resolution_clock::now();
 14 |     }
 15 | 
 16 |     double elapsed() const {
 17 |         return std::chrono::duration_cast<std::chrono::milliseconds>(
 18 |             std::chrono::high_resolution_clock::now() - start_time).count() / 1000.0;
 19 |     }
 20 | 
 21 | private:
 22 |     std::chrono::high_resolution_clock::time_point start_time;
 23 | };
 24 | 
 25 | const char* get_type(rknn_tensor_type type) {
 26 |     switch (type) {
 27 |         case RKNN_TENSOR_FLOAT32:
 28 |             return "float32";
 29 |         case RKNN_TENSOR_FLOAT16:
 30 |             return "float16";
 31 |         case RKNN_TENSOR_INT8:
 32 |             return "int8";
 33 |         case RKNN_TENSOR_UINT8:
 34 |             return "uint8";
 35 |         case RKNN_TENSOR_INT16:
 36 |             return "int16";
 37 |         case RKNN_TENSOR_UINT16:
 38 |             return "uint16";
 39 |         case RKNN_TENSOR_INT32:
 40 |             return "int32";
 41 |         case RKNN_TENSOR_UINT32:
 42 |             return "uint32";
 43 |         case RKNN_TENSOR_INT64:
 44 |             return "int64";
 45 |         case RKNN_TENSOR_BOOL:
 46 |             return "bool";
 47 |         default:
 48 |             return "unknown";
 49 |     }
 50 | }
 51 | 
 52 | 
 53 | rknn_net* rknn_net_create(const char* model_path, int target_npu, int debug_flag) {
 54 | 
 55 |     FILE* fp = fopen(model_path, "rb");
 56 |     if(fp == NULL) {
 57 |         printf("Failed to open file %s\n", model_path);
 58 |         return NULL;
 59 |     }
 60 | 
 61 |     fseek(fp, 0, SEEK_END);
 62 |     long model_size = ftell(fp);
 63 |     rewind(fp);
 64 | 
 65 |     uint8_t* model = (uint8_t*)malloc(model_size);
 66 |     if(model == NULL) {
 67 |         printf("Failed to malloc memory for model\n");
 68 |         fclose(fp);
 69 |         return NULL;
 70 |     }
 71 | 
 72 |     size_t read_count = fread(model, 1, model_size, fp);
 73 |     if(read_count != model_size) {
 74 |         printf("Failed to read model\n");
 75 |         free(model);
 76 |         fclose(fp);
 77 |         return NULL;
 78 |     }
 79 | 
 80 |     if (debug_flag) {
 81 |         printf("read model from %s\n", model_path);
 82 |         printf("model size = %ld bytes\n", model_size);
 83 |     }
 84 | 
 85 |     fclose(fp);
 86 | 
 87 |     //rknn_context* ctx = NULL;
 88 |     rknn_context* ctx = (rknn_context*)malloc(sizeof(rknn_context));
 89 |     int ret = rknn_init(ctx, model, model_size, RKNN_FLAG_COLLECT_PERF_MASK, NULL);
 90 |     if(ret < 0) {
 91 |         printf("rknn_init fail! ret=%d\n", ret);
 92 |         free(model);
 93 |         free(ctx);
 94 |         return NULL;
 95 |     }
 96 | 
 97 |     rknn_sdk_version version;
 98 |     ret = rknn_query(*ctx, RKNN_QUERY_SDK_VERSION, &version, sizeof(rknn_sdk_version));
 99 |     if(ret < 0) {
100 |         printf("rknn_query SDK version error, ret=%d\n", ret);
101 |         free(model);
102 |         free(ctx);
103 |         return NULL;
104 |     }
105 | 
106 |     if (debug_flag) {
107 |         printf("sdk version: %s driver version: %s\n", version.api_version, version.drv_version);
108 |     }
109 | 
110 |     rknn_input_output_num io_num;
111 |     ret = rknn_query(*ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num));
112 |     if(ret < 0) {
113 |         printf("rknn_query IN_OUT_NUM error, ret=%d\n", ret);
114 |         free(model);
115 |         free(ctx);
116 |         return NULL;
117 |     }
118 | 
119 |     if (debug_flag) {
120 |         printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output);
121 |     }
122 | 
123 |     rknn_tensor_attr* input_attrs = (rknn_tensor_attr*)calloc(io_num.n_input, sizeof(rknn_tensor_attr));
124 |     rknn_tensor_attr* output_attrs = (rknn_tensor_attr*)calloc(io_num.n_output, sizeof(rknn_tensor_attr));
125 | 
126 |     for (int i = 0; i < io_num.n_input; i++) {
127 |         input_attrs[i].index = i;
128 |         ret = rknn_query(*ctx, RKNN_QUERY_NATIVE_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr));
129 |         if (ret < 0) {
130 |             printf("rknn_query INPUT_ATTR error, ret=%d\n", ret);
131 |             free(model);
132 |             free(ctx);
133 |             free(input_attrs);
134 |             free(output_attrs);
135 |             return NULL;
136 |         }
137 |     }
138 | 
139 |     for (int i = 0; i < io_num.n_output; i++) {
140 |         output_attrs[i].index = i;
141 |         ret = rknn_query(*ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr));
142 |         if (ret < 0) {
143 |             printf("rknn_query OUTPUT_ATTR error, ret=%d\n", ret);
144 |             free(model);
145 |             free(ctx);
146 |             free(input_attrs);
147 |             free(output_attrs);
148 |             return NULL;
149 |         }
150 |     }
151 | 
152 |     int channel = 3;
153 |     int width  = 0;
154 |     int height  = 0;
155 | 
156 |     if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) {
157 |         if (debug_flag) {
158 |             printf("model is NCHW input fmt\n");
159 |         }
160 |         channel = input_attrs[0].dims[1];
161 |         height  = input_attrs[0].dims[2];
162 |         width  = input_attrs[0].dims[3];
163 |     } else {
164 |         if (debug_flag) {
165 |             printf("model is NHWC input fmt\n");
166 |         }
167 |         height         = input_attrs[0].dims[1];
168 |         width  = input_attrs[0].dims[2];
169 |         channel = input_attrs[0].dims[3];
170 |     }
171 | 
172 |     if (debug_flag) {
173 |         printf("model input height=%d, width=%d, channel=%d, type:%s\n", height, width, channel, get_type(input_attrs[0].type));
174 |         for (int i=0; i<io_num.n_output; ++i) {
175 |             printf("model output %d:", i);
176 |             for (int j=0; j<output_attrs[i].n_dims; ++j) {
177 |                 printf("%d ", output_attrs[i].dims[j]);
178 |             }
179 |             printf("type:%s\n", get_type(output_attrs[0].type));
180 |         }
181 |     }
182 | 
183 |     switch (target_npu)
184 |     {
185 |     case 0:
186 |         rknn_set_core_mask(*ctx, RKNN_NPU_CORE_0);
187 |         break;
188 |     case 1:
189 |         rknn_set_core_mask(*ctx, RKNN_NPU_CORE_1);
190 |         break;
191 |     case 2:
192 |         rknn_set_core_mask(*ctx, RKNN_NPU_CORE_2);
193 |     default:
194 |         break;
195 |     }
196 | 
197 |     // allocate and initialize rknn_net structure
198 |     rknn_net* net = (rknn_net*)malloc(sizeof(rknn_net));
199 |     net->ctx = ctx;
200 |     net->debug_flag = debug_flag;
201 |     net->model = model;
202 |     net->model_size = model_size;
203 |     net->input_attrs = input_attrs;
204 |     net->output_attrs = output_attrs;
205 |     net->num_input = io_num.n_input;
206 |     net->num_output = io_num.n_output;
207 |     net->input_height = height;
208 |     net->input_width = width;
209 |     net->input_channel = channel;
210 | 
211 |     return net;
212 | }
213 | 
214 | int rknn_net_inference(rknn_net* net, int8_t* input, float** output) {
215 |     rknn_input inputs[net->num_input];
216 |     memset(inputs, 0, sizeof(inputs));
217 | 
218 |     inputs[0].index = 0;
219 |     inputs[0].buf = (void*)input;
220 |     inputs[0].size = net->input_attrs[0].n_elems * sizeof(int8_t);
221 |     inputs[0].pass_through = true;
222 |     inputs[0].type = RKNN_TENSOR_INT8;
223 |     inputs[0].fmt = net->input_attrs[0].fmt;//RKNN_TENSOR_NCHW;
224 | 
225 |     int ret = rknn_inputs_set(*(net->ctx), 1, inputs);
226 |     if(ret < 0) {
227 |         printf("rknn_inputs_set fail! ret=%d\n", ret);
228 |         return 1;
229 |     } else if (net->debug_flag) {
230 |         printf("rknn_inputs_set done.\n");
231 |     }
232 | 
233 |     //Timer t1;
234 |     //for (int j=0; j<50; ++j) {
235 |     //    t1.reset();
236 |     ret = rknn_run(*(net->ctx), NULL);
237 |     //std::cout << ":>" << t1.elapsed() << std::endl;
238 |     //}
239 |     //rknn_perf_detail perf_detail;
240 |     //ret = rknn_query(*(net->ctx), RKNN_QUERY_PERF_DETAIL, &perf_detail, sizeof(perf_detail));
241 |     //for (int j=0; j<perf_detail.data_len; ++j) {
242 |     //    std::cout << perf_detail.perf_data[j];
243 |     //}
244 |     //std::cout << std::endl;
245 | 
246 |     if(ret < 0) {
247 |         printf("rknn_run fail! ret=%d\n", ret);
248 |         return 1;
249 |     } else if (net->debug_flag) {
250 |         printf("rknn_run done.\n");
251 |     }
252 | 
253 |     rknn_output outputs[6];
254 |     memset(outputs, 0, sizeof(outputs));
255 | 
256 |     for (int i=0; i<6; ++i) {
257 |         outputs[i].want_float = true;
258 |         outputs[i].is_prealloc = true;
259 |         outputs[i].buf = output[i];
260 |         outputs[i].index = i;
261 |         outputs[i].size = net->output_attrs[0].n_elems * sizeof(float);
262 |     }
263 | 
264 |     ret = rknn_outputs_get(*(net->ctx), 6, outputs, NULL);
265 |     if(ret < 0) {
266 |         printf("rknn_outputs_get fail! ret=%d\n", ret);
267 |         return 1;
268 |     } else if (net->debug_flag) {
269 |         printf("rknn_outputs_get done.\n");
270 |     }
271 | 
272 |     for (int i=0; i<6; ++i) {
273 |         memcpy(output[i], outputs[i].buf, outputs[i].size);
274 |     }
275 |     rknn_outputs_release(*(net->ctx), 6, outputs);
276 | 
277 |     return 0;
278 | }
279 | 
280 | void rknn_net_destroy(rknn_net* net) {
281 |     if (net == NULL) {
282 |         return;
283 |     }
284 | 
285 |     // Release rknn context
286 |     if (net->ctx != NULL) {
287 |         rknn_destroy(*(net->ctx));
288 |         net->ctx = NULL;
289 |     }
290 | 
291 |     // Release model data
292 |     if (net->model != NULL) {
293 |         free(net->model);
294 |         net->model = NULL;
295 |     }
296 | 
297 |     // Release input tensor attributes
298 |     if (net->input_attrs != NULL) {
299 |         free(net->input_attrs);
300 |         net->input_attrs = NULL;
301 |     }
302 | 
303 |     // Release output tensor attributes
304 |     if (net->output_attrs != NULL) {
305 |         free(net->output_attrs);
306 |         net->output_attrs = NULL;
307 |     }
308 | 
309 |     // Finally, free the rknn_net struct itself
310 |     free(net);
311 | }


--------------------------------------------------------------------------------
/src/video_demo.cc:
--------------------------------------------------------------------------------
  1 | #include "rknn_net.h"
  2 | #include "yolov8_post.h"
  3 | #include "yolo_multi_npu.h"
  4 | #include <stdio.h>
  5 | #include <stdlib.h>
  6 | #include <unistd.h>
  7 | #include <iostream>
  8 | #include <pthread.h>
  9 | #include <signal.h>
 10 | #include <time.h>
 11 | #include <opencv2/core/core.hpp>
 12 | #include <opencv2/highgui/highgui.hpp>
 13 | #include <opencv2/imgproc.hpp>
 14 | #include <opencv2/imgcodecs.hpp>
 15 | #include <opencv2/dnn.hpp>
 16 | #include <chrono>
 17 | 
 18 | void preciseSleep(double seconds)
 19 | {
 20 |     struct timespec ts;
 21 |     ts.tv_sec = (time_t)seconds; // Truncate the decimal part for the 'seconds' component
 22 |     ts.tv_nsec = (long)((seconds - ts.tv_sec) * 1e+9); // Subtract the 'seconds' component from 'seconds', leaving the fractional part
 23 | 
 24 |     nanosleep(&ts, NULL);
 25 | }
 26 | 
 27 | volatile sig_atomic_t flag = 0;
 28 | CircularBuffer input_buffer, output_buffer;
 29 | 
 30 | void handler(int sig) {
 31 |     flag = 1;
 32 |     output_buffer.stop = true;
 33 |     pthread_cond_signal(&output_buffer.notEmpty);
 34 |     pthread_cond_signal(&output_buffer.notFull);
 35 |     printf("\nCtrl+C Press, Wait Thread stop.\n");
 36 |     input_buffer.stop = true;
 37 |     pthread_cond_signal(&input_buffer.notEmpty);
 38 |     pthread_cond_signal(&input_buffer.notFull);
 39 |     sleep(2);
 40 |     printf("STOPED!\n");
 41 | }
 42 | 
 43 | 
 44 | class Timer {
 45 | public:
 46 |     Timer() : start_time(std::chrono::high_resolution_clock::now()) {}
 47 | 
 48 |     void reset() {
 49 |         start_time = std::chrono::high_resolution_clock::now();
 50 |     }
 51 | 
 52 |     double elapsed() const {
 53 |         return std::chrono::duration_cast<std::chrono::milliseconds>(
 54 |             std::chrono::high_resolution_clock::now() - start_time).count() / 1000.0;
 55 |     }
 56 | 
 57 | private:
 58 |     std::chrono::high_resolution_clock::time_point start_time;
 59 | };
 60 | 
 61 | int main () {
 62 |     //std::cout << cv::getBuildInformation() << std::endl;
 63 |     // Default parameters
 64 |     std::string input_path = "/root/data/test_video.mp4";
 65 |     std::string model_path = "../data/yolov8n_no_tail.rknn";
 66 |     std::string output_path = "ffmpeg -loglevel quiet -f rawvideo -pix_fmt bgr24 -s 640x640 -r 28 -i - -an -b:v 20M -f rtsp rtsp://localhost:8848/test_video";
 67 |     float score_threshold = 0.4;
 68 |     float nms_threshold = 0.5;
 69 |     bool debug_flag = false;
 70 |     cv::Mat frame, output_img;
 71 |     //Temp var
 72 |     float scale_x, scale_y;
 73 |     signal(SIGINT, handler);
 74 | 
 75 |     cv::VideoCapture cap(input_path);
 76 |     if(!cap.isOpened()) {
 77 |         printf("Video Open Faild\n");
 78 |         return -1;
 79 |     }  // check if we succeeded
 80 | 
 81 | 
 82 |     //Setup output
 83 |     FILE* ffmpeg = popen(output_path.c_str(), "w");
 84 |     if (!ffmpeg) {
 85 |         fprintf(stderr, "Could not open pipe to ffmpeg\n");
 86 |         return 1;
 87 |     }
 88 |     int8_t* result_img = (int8_t*)malloc(3*640*640*sizeof(int8_t));
 89 | 
 90 |     //Setup FIFO
 91 |     initialize(&input_buffer);
 92 |     initialize(&output_buffer);
 93 | 
 94 |     pthread_t threads[4];
 95 |     int npu_0=0, npu_1=1, npu_2=2;
 96 |     yolov8_npu_arg npu_arg_0, npu_arg_1, npu_arg_2;
 97 |     npu_arg_0.input = &input_buffer;
 98 |     npu_arg_0.output = &output_buffer;
 99 |     npu_arg_0.model_path = &model_path;
100 |     npu_arg_0.target_npu = &npu_0;
101 |     pthread_create(&threads[0], NULL, yolov8_npu_thread, &npu_arg_0);
102 |     npu_arg_1.input = &input_buffer;
103 |     npu_arg_1.output = &output_buffer;
104 |     npu_arg_1.model_path = &model_path;
105 |     npu_arg_1.target_npu = &npu_1;
106 |     pthread_create(&threads[1], NULL, yolov8_npu_thread, &npu_arg_1);
107 |     npu_arg_2.input = &input_buffer;
108 |     npu_arg_2.output = &output_buffer;
109 |     npu_arg_2.model_path = &model_path;
110 |     npu_arg_2.target_npu = &npu_2;
111 |     pthread_create(&threads[2], NULL, yolov8_npu_thread, &npu_arg_2);
112 | 
113 |     video_read_arg video_arg;
114 |     video_arg.cam = &cap;
115 |     video_arg.output = &input_buffer;
116 |     pthread_create(&threads[3], NULL, video_read_thread, &video_arg);
117 | 
118 |     Timer timer;
119 |     size_t cnt=0;
120 |     fprintf(stdout, "Wait for buffering!\n");
121 |     sleep(5);
122 |     fprintf(stdout, "Main process start!\n");
123 |     while(!flag)
124 |     {
125 | 
126 |         timer.reset();
127 |         dequeue(&output_buffer, result_img);
128 |         if (output_buffer.stop) break;
129 |         if (timer.elapsed() < 0.033) preciseSleep(0.033-timer.elapsed());
130 |         fwrite(result_img, sizeof(char), 3*640*640, ffmpeg);
131 |         fprintf(stdout, "\rFPS: %f, total frame:%d    ", 1/timer.elapsed(), ++cnt);
132 |     }
133 | 
134 |     fprintf(stdout, "\nMain process stop!\n");
135 |     input_buffer.stop = true;
136 |     pthread_cond_signal(&input_buffer.notEmpty);
137 |     pthread_cond_signal(&input_buffer.notFull);
138 | 
139 |     // Release the resource
140 |     release(&input_buffer);
141 |     release(&output_buffer);
142 |     free(result_img);
143 |     pclose(ffmpeg);
144 |     return 0;
145 | }
146 | 


--------------------------------------------------------------------------------
/src/yolov8_post.cc:
--------------------------------------------------------------------------------
  1 | #include "yolov8_post.h"
  2 | #include <opencv2/opencv.hpp>
  3 | #include <cstring>
  4 | #include <iostream>
  5 | #include <chrono>
  6 | 
  7 | class Timer {
  8 | public:
  9 |     Timer() : start_time(std::chrono::high_resolution_clock::now()) {}
 10 | 
 11 |     void reset() {
 12 |         start_time = std::chrono::high_resolution_clock::now();
 13 |     }
 14 | 
 15 |     double elapsed() const {
 16 |         return std::chrono::duration_cast<std::chrono::milliseconds>(
 17 |             std::chrono::high_resolution_clock::now() - start_time).count() / 1000.0;
 18 |     }
 19 | 
 20 | private:
 21 |     std::chrono::high_resolution_clock::time_point start_time;
 22 | };
 23 | 
 24 | using namespace cv;
 25 | 
 26 | int l_size[3] = {80, 40, 20}; 
 27 | size_t l_base[3] = {0, 6400, 8000};
 28 | float striders[3] = {8, 16, 32};
 29 | int confi_idx[3] = {4,2,0};
 30 | int box_idx[3] = {5,3,1};
 31 | 
 32 | 
 33 | void get_confidence(float* input, int q_num, int group_size, int* idx, float* value) {
 34 |     int start = q_num * group_size;
 35 |     int end = start + group_size;
 36 |     float max_val = input[start];
 37 |     int max_idx = start;
 38 |     for (int i = start; i < end; i++) {
 39 |         if (input[i] > max_val) {
 40 |             max_val = input[i];
 41 |             max_idx = i;
 42 |         }
 43 |     }
 44 |     *idx = max_idx - start;  // convert absolute index to index within the group
 45 |     *value = max_val;
 46 | }
 47 | 
 48 | float softmax_weightsum(float *input) {
 49 |     double base=0;
 50 |     float output=0;
 51 |     for (int i=0; i<16; ++i) {
 52 |         base += input[i];
 53 |     }
 54 |     for (int i=0; i<16; ++i) {
 55 |         output += i*(input[i]/base);
 56 |     }
 57 |     return output;
 58 | }
 59 | void printMaxElement(const cv::Mat& mat) {
 60 |     double minVal, maxVal;
 61 |     cv::minMaxLoc(mat, &minVal, &maxVal);
 62 |     std::cout << "Max element: " << maxVal << std::endl;
 63 | }
 64 | 
 65 | void yolov8_tail_process(float** input, float** output, int class_num) {
 66 |     Timer t1;
 67 | 
 68 |     // Transpose and concatenate confidence data
 69 |     Mat confi_20(class_num, 400, CV_32F, input[0]);
 70 |     transpose(confi_20, confi_20);
 71 |     Mat confi_40(class_num, 1600, CV_32F, input[2]);
 72 |     transpose(confi_40, confi_40);
 73 |     Mat confi_80(class_num, 6400, CV_32F, input[4]);
 74 |     transpose(confi_80, confi_80);
 75 |     std::memcpy(output[1], confi_80.data, class_num*6400*sizeof(float));
 76 |     std::memcpy(output[1]+class_num*6400, confi_40.data, class_num*1600*sizeof(float));
 77 |     std::memcpy(output[1]+class_num*6400+class_num*1600, confi_20.data, class_num*400*sizeof(float));
 78 | 
 79 |     // Process bbox data
 80 |     float *ip, *op;
 81 |     Mat box_80_raw(64, 6400, CV_32F, input[5]); 
 82 |     transpose(box_80_raw, box_80_raw);
 83 |     exp(box_80_raw, box_80_raw);
 84 |     ip = (float*)box_80_raw.data;
 85 |     op = output[0];
 86 |     for (int i=0; i<6400; ++i) {
 87 |         for (int j=0; j<4; ++j) {
 88 |             op[i*4+j] = softmax_weightsum(ip+(i*64+j*16));
 89 |         }
 90 |     }
 91 |     Mat box_40_raw(64, 1600, CV_32F, input[3]); 
 92 |     transpose(box_40_raw, box_40_raw);
 93 |     exp(box_40_raw, box_40_raw);
 94 |     ip = (float*)box_40_raw.data;
 95 |     op = output[0]+6400*4;
 96 |     for (int i=0; i<1600; ++i) {
 97 |         for (int j=0; j<4; ++j) {
 98 |             op[i*4+j] = softmax_weightsum(ip+(i*64+j*16));
 99 |         }
100 |     }
101 |     Mat box_20_raw(64, 400, CV_32F, input[1]); 
102 |     transpose(box_20_raw, box_20_raw);
103 |     exp(box_20_raw, box_20_raw);
104 |     ip = (float*)box_20_raw.data;
105 |     op = output[0]+8000*4;
106 |     for (int i=0; i<400; ++i) {
107 |         for (int j=0; j<4; ++j) {
108 |             op[i*4+j] = softmax_weightsum(ip+(i*64+j*16));
109 |         }
110 |     }
111 | 
112 |     std::cout << "tail_process:>" << t1.elapsed() << std::endl;
113 |     return;
114 | }
115 | 
116 | void get_max_confidence(float* data, size_t item_size, int pos, int class_num, float* value, int* class_id) {
117 |     *value = 0;
118 |     *class_id = 0;
119 |     for (int i=0; i<class_num; ++i) {
120 |         if (*value <= data[i * item_size + pos]) {
121 |            *value = data[i * item_size + pos];
122 |            *class_id = i;
123 |         }
124 |     }
125 | }
126 | 
127 | float wsum_softmax(float *data, size_t item_size, int pos, int group) {
128 |     double e_sum=0;
129 |     float output=0;
130 |     int base = group*16;
131 |     for (int i=0; i<16; ++i) {
132 |         e_sum += data[(base+i)*item_size+pos];
133 |     }
134 |     for (int i=0; i<16; ++i) {
135 |         output += i*(data[(base+i)*item_size+pos]/e_sum);
136 |     }
137 |     return output;
138 | }
139 | 
140 | void get_rawbbox(float *data, size_t item_size, int pos, int* x1, int* y1, int* x2, int* y2) {
141 | }
142 | 
143 | std::vector<DetectionResult> yolov8_tail_post_process(float** input, int class_num, float scale_x, float scale_y, float score_threshold, float nms_threshold, bool debug_flag) {
144 | 
145 |     //Timer t1;
146 | 
147 |     std::vector<cv::Rect> boxes; // vector of bounding boxes
148 |     std::vector<float> scores; // confidence scores for each box
149 |     std::vector<int> indices; // indices of the raw detection result
150 |     std::vector<int> cls; // vector of class IDs
151 |     std::vector<DetectionResult> results; // result to return;
152 | 
153 |     float score, x1, y1, x2, y2, l_x, l_y, w, h;
154 |     int class_id, cnt=0;
155 |     size_t item_size, base_y;
156 | 
157 |     for (int l=0; l<3; ++l) {
158 |         item_size = l_size[l]*l_size[l];
159 |         Mat tmp_mat(64, item_size, CV_32F, input[box_idx[l]]);
160 |         exp(tmp_mat, tmp_mat);
161 |         for (int y=0; y<l_size[l]; ++y) {
162 |             base_y = y*l_size[l];
163 |             for (int x=0; x<l_size[l]; ++x) {
164 |                 get_max_confidence(input[confi_idx[l]], item_size, base_y+x, class_num, &score, &class_id);
165 |                 if (score < score_threshold) continue;
166 |                 x1 = wsum_softmax(input[box_idx[l]], item_size, base_y+x, 0);
167 |                 y1 = wsum_softmax(input[box_idx[l]], item_size, base_y+x, 1);
168 |                 x2 = wsum_softmax(input[box_idx[l]], item_size, base_y+x, 2);
169 |                 y2 = wsum_softmax(input[box_idx[l]], item_size, base_y+x, 3);
170 | 
171 |                 l_x = x + 0.5 - x1;
172 |                 l_y = y + 0.5 - y1;
173 |                 w = (x1 + x2);
174 |                 h = (y1 + y2);
175 |                 boxes.push_back(cv::Rect(l_x*striders[l]*scale_x, l_y*striders[l]*scale_y, w*striders[l]*scale_x, h*striders[l]*scale_y));    
176 |                 scores.push_back(score);
177 |                 cls.push_back(class_id);
178 |                 indices.push_back(cnt++);
179 |             }
180 |         }
181 |     }
182 | 
183 |     //Do NMS
184 |     cv::dnn::NMSBoxes(boxes, scores, score_threshold, nms_threshold, indices);
185 |     //std::cout << "tail_post_process:>" << t1.elapsed() << std::endl;
186 |     if(debug_flag) {
187 |         printf("========Detection Results========\n");
188 |         printf("classID | score |     bbox\n");
189 |         printf("---------------------------------\n");
190 |     }
191 |     for (int i=0; i<indices.size(); ++i) {
192 |         int idx = indices[i];
193 |         DetectionResult result = {scores[idx], cls[idx], boxes[idx].x, boxes[idx].y, boxes[idx].width, boxes[idx].height};
194 |         results.push_back(result);
195 |         if(debug_flag)
196 |         printf("   %d    | %.2f  | %d %d %d %d\n", cls[idx], scores[idx], boxes[idx].x, boxes[idx].y, boxes[idx].width, boxes[idx].height);
197 |     }
198 |     if(debug_flag) printf("=================================\n");
199 |     return results;
200 | }
201 | 
202 | std::vector<DetectionResult> yolov8_post_process(float** input, int class_num, float scale_x, float scale_y, float score_threshold, float nms_threshold, bool debug_flag) {
203 | 
204 | 
205 |     Timer t1;    
206 |     std::vector<cv::Rect> boxes; // vector of bounding boxes
207 |     std::vector<float> scores; // confidence scores for each box
208 |     std::vector<int> indices; // indices of the raw detection result
209 |     std::vector<int> cls; // vector of class IDs
210 |     std::vector<DetectionResult> results; // result to return;
211 | 
212 |     //Some tmp var
213 |     float score, l_x, l_y, w, h;
214 |     int cl;
215 |     size_t lbp, ybp, xbp, x1p, y1p, x2p, y2p;
216 | 
217 |     //Load detection results from buffers
218 |     for (int l=0; l<3; ++l) {
219 |         lbp = l_base[l];
220 |         for (int y=0; y<l_size[l];++y) {
221 |             ybp = lbp + y * l_size[l];
222 |             for(int x=0; x<l_size[l]; ++x) {
223 |                 xbp = (ybp + x)*4;
224 |                 x1p = xbp;
225 |                 y1p = xbp + 1;
226 |                 x2p = xbp + 2;
227 |                 y2p = xbp + 3;
228 |                 
229 |                 l_x = x + 0.5 - input[0][x1p];
230 |                 l_y = y + 0.5 - input[0][y1p];
231 |                 w = (input[0][x2p] + input[0][x1p]);
232 |                 h = (input[0][y2p] + input[0][y1p]);
233 | 
234 |                 boxes.push_back(cv::Rect(l_x*striders[l]*scale_x, l_y*striders[l]*scale_y, w*striders[l]*scale_x, h*striders[l]*scale_y));    
235 |                 get_confidence(input[1], ybp+x, class_num, &cl, &score);
236 |                 scores.push_back(score);
237 |                 cls.push_back(cl);
238 |                 indices.push_back(ybp+x);
239 |             }
240 |         }
241 |     }
242 | 
243 |     //Do NMS
244 |     cv::dnn::NMSBoxes(boxes, scores, score_threshold, nms_threshold, indices);
245 |     std::cout << "post_process:>" << t1.elapsed() << std::endl;
246 |     if(debug_flag) {
247 |         printf("========Detection Results========\n");
248 |         printf("classID | score |     bbox\n");
249 |         printf("---------------------------------\n");
250 |     }
251 |     for (int i=0; i<indices.size(); ++i) {
252 |         int idx = indices[i];
253 |         DetectionResult result = {scores[idx], cls[idx], boxes[idx].x, boxes[idx].y, boxes[idx].width, boxes[idx].height};
254 |         results.push_back(result);
255 |         if(debug_flag)
256 |         printf("   %d    | %.2f  | %d %d %d %d\n", cls[idx], scores[idx], boxes[idx].x, boxes[idx].y, boxes[idx].width, boxes[idx].height);
257 |     }
258 |     if(debug_flag) printf("=================================\n");
259 |     return results;
260 | }
261 | 
262 | cv::Scalar hsv_to_bgr(int h, int s, int v) {
263 |     cv::Mat hsv(1, 1, CV_8UC3, cv::Scalar(h, s, v));
264 |     cv::Mat bgr;
265 |     cv::cvtColor(hsv, bgr, cv::COLOR_HSV2BGR);
266 |     cv::Scalar bgr_color = bgr.at<cv::Vec3b>(0,0);
267 |     return bgr_color;
268 | }
269 | 
270 | std::vector<cv::Scalar> get_colors(int nums_of_class) {
271 |     std::vector<cv::Scalar> colors;
272 |     for (int i=0; i<nums_of_class; ++i) {
273 |         colors.push_back(hsv_to_bgr((int)((float)i/(float)nums_of_class*255),200,230));
274 |     }
275 |     return colors;
276 | }
277 | 
278 | void yolov8_draw_result(std::vector<DetectionResult> results, cv::Mat &img, const std::string* class_names, int num_of_classes) {
279 |     std::vector<cv::Scalar> colors = get_colors(num_of_classes);
280 |     for (int i=0; i<results.size(); ++i) {
281 |         cv::Rect bbox(results[i].x, results[i].y, results[i].w, results[i].h);
282 |         cv::rectangle(img, bbox, colors[results[i].cls], 2);
283 | 
284 |         // Prepare the text label
285 |         std::string label = class_names[results[i].cls] + ":" + std::to_string(results[i].score);
286 |         
287 |         // Choose a position for the label
288 |         int baseLine;
289 |         cv::Size labelSize = getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
290 | 
291 |         // Draw filled box to put label text in it
292 |         rectangle(img, cv::Rect(cv::Point(results[i].x, results[i].y - labelSize.height), cv::Size(labelSize.width, labelSize.height + baseLine)), colors[results[i].cls], cv::FILLED);
293 | 
294 |         // Put label text on the image
295 |         cv::putText(img, label, cv::Point(results[i].x, results[i].y), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0,0,0));
296 |     }
297 | }
298 | 


--------------------------------------------------------------------------------
/tools/dataset.txt:
--------------------------------------------------------------------------------
1 | ../data/bus.npy
2 | 


--------------------------------------------------------------------------------
/tools/modify_no_tail.py:
--------------------------------------------------------------------------------
 1 | import onnx
 2 | from onnx import helper
 3 | 
 4 | import numpy as np
 5 | 
 6 | # Load the ONNX model
 7 | model = onnx.load('../data/yolov8n_full.onnx')
 8 | 
 9 | # Get the model's graph
10 | graph = model.graph
11 | 
12 | # Find the Concat operation to remove
13 | concat_node = None
14 | for node in graph.node:
15 |     if node.name == '/model.22/Concat_5':
16 |         concat_node = node
17 |         break
18 | 
19 | # If the Concat operation is found, remove it
20 | if concat_node:
21 |     graph.node.remove(concat_node)
22 | 
23 | # Find the nodes producing '/model.22/Mul_2_output_0' and '/model.22/Sigmoid_output_0' and change their outputs
24 | for node in graph.node:
25 |     node_outputs = list(node.output)
26 |     if '/model.22/dfl/Reshape_1_output_0' in node_outputs:
27 |         node_outputs[node_outputs.index('/model.22/dfl/Reshape_1_output_0')] = 'output0'
28 |     elif '/model.22/Sigmoid_output_0' in node_outputs:
29 |         node_outputs[node_outputs.index('/model.22/Sigmoid_output_0')] = 'output1'
30 |     node.output[:] = node_outputs
31 | 
32 | # Remove existing output
33 | for output in list(graph.output):
34 |     graph.output.remove(output)
35 | 
36 | # Cut the Box head
37 | node_to_remove = ['/model.22/Concat','/model.22/Reshape','/model.22/Concat_1','/model.22/Reshape_1','/model.22/Concat_2','/model.22/Reshape_2','/model.22/Concat_3','/model.22/Split','/model.22/dfl/Constant_1', '/model.22/dfl/Constant', '/model.22/Sigmoid', '/model.22/dfl/Reshape_1', '/model.22/dfl/conv/Conv', '/model.22/dfl/Transpose_1', '/model.22/dfl/Softmax', '/model.22/dfl/Transpose', '/model.22/dfl/Reshape', '/model.22/Shape', '/model.22/Gather', '/model.22/Add', '/model.22/Div', '/model.22/Mul', '/model.22/Mul_1', '/model.22/Slice', '/model.22/Slice_1', '/model.22/Sub', '/model.22/Add_1', '/model.22/Add_2', '/model.22/Sub_1', '/model.22/Div_1', '/model.22/Concat_4', '/model.22/Mul_2']
38 | 
39 | for i in range(1,13):
40 |     node_to_remove.append("/model.22/Constant_%d" % i)
41 | 
42 | node_to_remove.append("/model.22/Constant")
43 | 
44 | tmp_node_list = []
45 | for r_node in node_to_remove:
46 |     for node in graph.node:
47 |         if node.name == r_node:
48 |             tmp_node_list.append(node)
49 |             break
50 | for r_node in tmp_node_list:
51 |     graph.node.remove(r_node)
52 | 
53 | # Above is used to remove the tail
54 | 
55 | # Define new nodes
56 | # 80x20x20
57 | sigmoid_node_0 = helper.make_node(
58 |     'Sigmoid',
59 |     name='/tail/Sigmoid_0',
60 |     inputs=['/model.22/cv3.2/cv3.2.2/Conv_output_0'],
61 |     outputs=['/branch_20/confi'],
62 | )
63 | 
64 | # 80x40x40
65 | sigmoid_node_1 = helper.make_node(
66 |     'Sigmoid',
67 |     name='/tail/Sigmoid_1',
68 |     inputs=['/model.22/cv3.1/cv3.1.2/Conv_output_0'],
69 |     outputs=['/branch_40/confi'],
70 | )
71 | 
72 | # 80x80x80
73 | sigmoid_node_2 = helper.make_node(
74 |     'Sigmoid',
75 |     name='/tail/Sigmoid_2',
76 |     inputs=['/model.22/cv3.0/cv3.0.2/Conv_output_0'],
77 |     outputs=['/branch_80/confi'],
78 | )
79 | # Add new nodes to graph
80 | graph.node.extend([sigmoid_node_0, sigmoid_node_1, sigmoid_node_2])
81 | # Create new output Tensors with the specified shape and datatype (float32)
82 | new_output0 = helper.make_tensor_value_info('/branch_20/confi', onnx.TensorProto.FLOAT, [1, 80, 20, 20])
83 | new_output1 = helper.make_tensor_value_info('/model.22/cv2.2/cv2.2.2/Conv_output_0', onnx.TensorProto.FLOAT, [1, 64, 20, 20])
84 | 
85 | new_output2 = helper.make_tensor_value_info('/branch_40/confi', onnx.TensorProto.FLOAT, [1, 80, 40, 40])
86 | new_output3 = helper.make_tensor_value_info('/model.22/cv2.1/cv2.1.2/Conv_output_0', onnx.TensorProto.FLOAT, [1, 64, 40, 40])
87 | 
88 | new_output4 = helper.make_tensor_value_info('/branch_80/confi', onnx.TensorProto.FLOAT, [1, 80, 80, 80])
89 | new_output5 = helper.make_tensor_value_info('/model.22/cv2.0/cv2.0.2/Conv_output_0', onnx.TensorProto.FLOAT, [1, 64, 80, 80])
90 | 
91 | graph.output.extend([new_output0, new_output1, new_output2, new_output3, new_output4, new_output5])
92 | 
93 | # Check the model
94 | result = onnx.shape_inference.infer_shapes(model)
95 | 
96 | # Save the modified model
97 | onnx.save(result, 'yolov8n_no_tail.onnx')
98 | 


--------------------------------------------------------------------------------
/tools/onnx2rknn_export.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import urllib
  3 | import traceback
  4 | import time
  5 | import sys
  6 | import numpy as np
  7 | import cv2
  8 | import yolov8_box_head
  9 | from yolov8_tail import tail_process
 10 | 
 11 | from rknn.api import RKNN
 12 | 
 13 | ONNX_MODEL = '../data/yolov8n_no_boxhead_transpose.onnx'
 14 | RKNN_MODEL = '../data/yolov8n_no_boxhead_transpose.rknn'
 15 | IMG_PATH = '../data/bus.jpg'
 16 | DATASET = '../data/dataset.txt'
 17 | 
 18 | QUANTIZE_ON = True
 19 | 
 20 | TAIL_PROC = True
 21 | 
 22 | OBJ_THRESH = 0.5
 23 | NMS_THRESH = 0.45
 24 | IMG_SIZE = 640
 25 | 
 26 | CLASSES = ("person", "bicycle", "car", "motorbike ", "aeroplane ", "bus ", "train", "truck ", "boat", "traffic light",
 27 |            "fire hydrant", "stop sign ", "parking meter", "bench", "bird", "cat", "dog ", "horse ", "sheep", "cow", "elephant",
 28 |            "bear", "zebra ", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite",
 29 |            "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife ",
 30 |            "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza ", "donut", "cake", "chair", "sofa",
 31 |            "pottedplant", "bed", "diningtable", "toilet ", "tvmonitor", "laptop	", "mouse	", "remote ", "keyboard ", "cell phone", "microwave ",
 32 |            "oven ", "toaster", "sink", "refrigerator ", "book", "clock", "vase", "scissors ", "teddy bear ", "hair drier", "toothbrush ")
 33 | 
 34 | def post_process_yolov8(network_output, img_input, save_path=None):
 35 |     global OBJ_THRESH, NMS_THRESH, CLASSES, HEAD_PROC
 36 | 
 37 |     # Parameters
 38 |     confThreshold = OBJ_THRESH
 39 |     nmsThreshold = NMS_THRESH
 40 | 
 41 |     # Lists to store detected bounding boxes, confidences and class IDs
 42 |     boxes = []
 43 |     confidences = []
 44 |     classIDs = []
 45 | 
 46 |     # Process the raw output
 47 |     for detection in np.transpose(network_output, (2,1,0)):
 48 |     # Extract the bounding box coordinates and class confidences
 49 |         box = detection[0:4]
 50 |         class_confidences = detection[4:]
 51 |         # Get the class ID and its confidence
 52 |         classID = np.argmax(class_confidences)
 53 |         confidence = class_confidences[classID]
 54 | 
 55 |         # Filter out weak predictions
 56 |         if confidence > confThreshold:
 57 |             # Update our list of bounding boxes, confidences and class IDs
 58 |             boxes.append([int(val) for val in box])
 59 |             confidences.append(float(confidence))
 60 |             classIDs.append(classID)
 61 | 
 62 |     indices = cv2.dnn.NMSBoxes(boxes, confidences, confThreshold, nmsThreshold)
 63 |     if len(indices) > 0:
 64 |     # Loop over the indices we are keeping
 65 |         for i in indices.flatten():
 66 |             # Extract the bounding box coordinates
 67 |             (x, y, w, h) = [int(val) for val in boxes[i]]
 68 |             x = x - w//2
 69 |             y = y - h//2
 70 |             text = "{}: {:.4f}".format(CLASSES[classIDs[i]], confidences[i])
 71 |             print(text, "|", "%d,%d,%d,%d" % (x,y,w,h))
 72 | 
 73 |             if save_path != None:
 74 |                 # Draw a bounding box rectangle and label on the image
 75 |                 color = [int(c) for c in np.random.randint(0, 255, size=(3,))]
 76 |                 cv2.rectangle(img_input, (x, y), (x + w, y + h), color, 2)
 77 |                 cv2.putText(img_input, text, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
 78 |     if save_path != None:
 79 |         cv2.imwrite(save_path, img_input)
 80 | 
 81 | 
 82 | 
 83 | 
 84 | 
 85 | if __name__ == '__main__':
 86 | 
 87 | 
 88 |     # Create RKNN object
 89 |     rknn = RKNN(verbose=True)
 90 | 
 91 |     # pre-process config
 92 |     print('--> Config model')
 93 |     rknn.config(mean_values=[[0, 0, 0]], std_values=[[1, 1, 1]], target_platform="rk3588")
 94 |     print('done')
 95 | 
 96 |     # Load ONNX model
 97 |     print('--> Loading model')
 98 |     ret = rknn.load_onnx(model=ONNX_MODEL)
 99 |     if ret != 0:
100 |         print('Load model failed!')
101 |         exit(ret)
102 |     print('done')
103 | 
104 |     # Build model
105 |     print('--> Build model.')
106 |     ret = rknn.build(do_quantization=QUANTIZE_ON, dataset=DATASET)
107 |     if ret != 0:
108 |         print('Build model failed!')
109 |         exit(ret)
110 |     print('done')
111 |     """
112 | 
113 |     # Hybird quantization step 1
114 |     print('--> Hybird quantization step one')
115 |     ret = rknn.hybrid_quantization_step1(dataset=DATASET)
116 |     if ret != 0:
117 |         print('Step one failed!')
118 |         exit(ret)
119 |     print('done')
120 | 
121 |     # Call hybrid_quantization_step2 to generate hybrid quantized RKNN model
122 |     print('--> Hybird quantization step two')
123 |     ret = rknn.hybrid_quantization_step2(
124 |             model_input="./"+ONNX_MODEL.split('.')[0]+".model",
125 |             data_input="./"+ONNX_MODEL.split('.')[0]+".data",
126 |             model_quantization_cfg="./"+ONNX_MODEL.split('.')[0]+".quantization.cfg"
127 |             )
128 |     if ret != 0:
129 |         print('Step two failed!')
130 |         exit(ret)
131 |     print('done')
132 |     """
133 | 
134 |     # Export RKNN model
135 |     print('--> Export rknn model')
136 |     ret = rknn.export_rknn(RKNN_MODEL)
137 |     if ret != 0:
138 |         print('Export rknn model failed!')
139 |         exit(ret)
140 |     print('done')
141 | 
142 |     # Init runtime environment
143 |     print('--> Init runtime environment')
144 |     ret = rknn.init_runtime()
145 |     if ret != 0:
146 |         print('Init runtime environment failed!')
147 |         exit(ret)
148 |     print('done')
149 | 
150 |     # Set inputs
151 |     img = cv2.imread(IMG_PATH)
152 |     blob = cv2.dnn.blobFromImage(img, 1/255.0, (IMG_SIZE, IMG_SIZE), swapRB=True, crop=False)
153 | 
154 |     # Inference
155 |     print('--> Running model')
156 |     outputs = rknn.inference(inputs=[blob], data_format="nchw", inputs_pass_through=[1])
157 |     print('done')
158 | 
159 |     if (TAIL_PROC):
160 |         outputs = tail_process(outputs)
161 |         outputs[0] = yolov8_box_head.yolov8_box_head(outputs[0])
162 | 
163 |     merge_output = np.concatenate(outputs, axis=1)
164 | 
165 |     print('--> Post Process')
166 |     post_process_yolov8(merge_output, img, "./"+ONNX_MODEL.split('.')[0]+"_result.png")
167 |     print('done')
168 | 
169 |     rknn.release()
170 | 


--------------------------------------------------------------------------------
/tools/yolov8_box_head.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | def yolov8_box_head(input):
 4 |     layer_sizes = [80, 40, 20]
 5 |     layer_pointer = [0, 6400, 8000]
 6 |     strides = [8, 16, 32]
 7 |     for l in range(3):
 8 | 
 9 |         x = np.arange(0.5, layer_sizes[l], 1)
10 |         y = np.arange(0.5, layer_sizes[l], 1)
11 |         sx, sy = np.meshgrid(x, y)
12 |         sx = sx.reshape(-1)
13 |         sy = sy.reshape(-1)
14 |         p_s, p_e = layer_pointer[l], layer_pointer[l] + layer_sizes[l]**2
15 | 
16 |         input[0,0,p_s:p_e] = sx - input[0,0,p_s:p_e] 
17 |         input[0,1,p_s:p_e] = sy - input[0,1,p_s:p_e] 
18 |         input[0,2,p_s:p_e] = sx + input[0,2,p_s:p_e] 
19 |         input[0,3,p_s:p_e] = sy + input[0,3,p_s:p_e]
20 | 
21 |         c_x = (input[0,0,p_s:p_e] + input[0,2,p_s:p_e]) / 2
22 |         c_y = (input[0,1,p_s:p_e] + input[0,3,p_s:p_e]) / 2
23 |         w = input[0,2,p_s:p_e] - input[0,0,p_s:p_e]
24 |         h = input[0,3,p_s:p_e] - input[0,1,p_s:p_e]
25 | 
26 |         input[0,0,p_s:p_e] = c_x
27 |         input[0,1,p_s:p_e] = c_y
28 |         input[0,2,p_s:p_e] = w
29 |         input[0,3,p_s:p_e] = h
30 | 
31 |         input[0,:,p_s:p_e] *= strides[l]
32 | 
33 |     return input
34 | 
35 | if __name__ == "__main__":
36 |     import pdb
37 |     pdb.set_trace()
38 |     arr = np.load("yolov8n_no_boxhead_result_0.npy")
39 |     yolov8_box_head(arr)
40 | 
41 | 


--------------------------------------------------------------------------------
/tools/yolov8_tail.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | def softmax(x):
 4 |     e_x = np.exp(x)
 5 |     s_e_x = np.sum(e_x, axis=-1).reshape(-1,4,1)
 6 |     return e_x/s_e_x
 7 | 
 8 | def box_tail(x):
 9 |     x = x.reshape((-1,4,16))
10 |     x = softmax(x)
11 |     return np.sum(x*np.arange(16).reshape(1,1,-1), axis=-1).reshape(-1,4)
12 | 
13 | 
14 | def tail_process(network_outputs):
15 |     a1 = network_outputs[1].transpose((0,2,3,1)).reshape(1,-1,64,1)
16 |     a2 = network_outputs[3].transpose((0,2,3,1)).reshape(1,-1,64,1)
17 |     a3 = network_outputs[5].transpose((0,2,3,1)).reshape(1,-1,64,1)
18 |     bbox_20 = box_tail(network_outputs[1].transpose((0,2,3,1)).reshape(1,-1,64,1)).transpose().reshape((1,4,-1))
19 |     bbox_40 = box_tail(network_outputs[3].transpose((0,2,3,1)).reshape(1,-1,64,1)).transpose().reshape((1,4,-1))
20 |     bbox_80 = box_tail(network_outputs[5].transpose((0,2,3,1)).reshape(1,-1,64,1)).transpose().reshape((1,4,-1))
21 |     bbox = np.concatenate((bbox_80, bbox_40, bbox_20), axis=2)
22 | 
23 |     confi_20 = network_outputs[0].reshape(1,80,-1)
24 |     confi_40 = network_outputs[2].reshape(1,80,-1)
25 |     confi_80 = network_outputs[4].reshape(1,80,-1)
26 |     confi = np.concatenate((confi_80, confi_40, confi_20), axis=2)
27 | 
28 |     return [bbox, confi]
29 | 
30 |     
31 | 
32 | 


--------------------------------------------------------------------------------