├── .gitignore
├── CMakeLists.txt
├── README.md
├── data
│   ├── bus.jpg
│   ├── bus.npy
│   ├── yolov8n_full.onnx
│   ├── yolov8n_no_tail.onnx
│   └── yolov8n_no_tail.rknn
├── include
│   ├── rknn_api.h
│   ├── rknn_net.h
│   ├── yolo_multi_npu.h
│   └── yolov8_post.h
├── lib
│   └── librknnrt.so
├── src
│   ├── picture_demo.cc
│   ├── rknn_net.cc
│   ├── video_demo.cc
│   └── yolov8_post.cc
└── tools
    ├── dataset.txt
    ├── modify_no_tail.py
    ├── onnx2rknn_export.py
    ├── yolov8_box_head.py
    └── yolov8_tail.py

/.gitignore:
--------------------------------------------------------------------------------
1 | /build
2 | /.vscode
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.4.1)
 2 | 
 3 | project(rknn_yolov8_demo)
 4 | 
 5 | include_directories(${CMAKE_SOURCE_DIR}/include)
 6 | 
 7 | set(RKNN_RT_LIB ${CMAKE_SOURCE_DIR}/lib/librknnrt.so)
 8 | 
 9 | set(OpenCV_DIR /root/lib/opencv454_install/lib/cmake/opencv4)
10 | find_package(OpenCV REQUIRED)
11 | 
12 | find_package(Threads REQUIRED) # find and include pthreads
13 | 
14 | add_executable(${PROJECT_NAME}_picture
15 |     src/picture_demo.cc
16 |     src/rknn_net.cc
17 |     src/yolov8_post.cc
18 | )
19 | 
20 | add_executable(${PROJECT_NAME}_video
21 |     src/video_demo.cc
22 |     src/rknn_net.cc
23 |     src/yolov8_post.cc
24 | )
25 | 
26 | target_link_libraries(${PROJECT_NAME}_picture
27 |     ${RKNN_RT_LIB}
28 |     ${OpenCV_LIBS}
29 | )
30 | 
31 | target_link_libraries(${PROJECT_NAME}_video
32 |     ${RKNN_RT_LIB}
33 |     ${OpenCV_LIBS}
34 |     Threads::Threads
35 | )
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Yolov8n RK3588 Demo :rocket:
 2 | 
 3 | This repository contains a demonstration of Yolov8n running on an RK3588 device.
 4 | 
 5 | ## How to Use
 6 | 
 7 | ### Configuring OpenCV Installation Directory
 8 | 
 9 | Edit CMakeLists.txt to set your OpenCV installation directory:
10 | 
11 | ```
12 | set(OpenCV_DIR /root/lib/opencv454_install/lib/cmake/opencv4)
13 | ```
14 | 
15 | Replace the path with your own OpenCV installation directory.
16 | 
17 | ### Building the Project
18 | 
19 | ```shell
20 | mkdir build
21 | cd build
22 | cmake ..
23 | make
24 | ```
25 | 
26 | ### Running the Demo
27 | 
28 | **Inference from Images**
29 | 
30 | To run the demo using the default image:
31 | 
32 | ```shell
33 | rknn_yolov8_demo_picture
34 | ```
35 | 
36 | **Inference from Video**
37 | 
38 | Video inference is currently under development :construction:. To run the demo, install ffmpeg and an RTSP server, then modify input_path and the RTSP address in src/video_demo.cc.
39 | 
40 | ```
41 | rknn_yolov8_demo_video
42 | ```
43 | 
44 | ## Notes
45 | 
46 | The tools directory contains scripts for model conversion, which are intended to be run on a host machine. These include:
47 | 
48 | - modify_no_tail.py
49 | 
50 | This script removes the tail of the officially exported Yolov8n model.
51 | 
52 | - onnx2rknn_export.py
53 | 
54 | This script converts the modified ONNX model (with the tail removed) to an RKNN model.
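55 | 
56 | ## Example Usage
57 | 
58 | The picture demo also accepts options (see src/picture_demo.cc for the full list):
59 | 
60 | ```shell
61 | rknn_yolov8_demo_picture -i ../data/bus.jpg -m ../data/yolov8n_no_tail.rknn -o result.png --score_threshold 0.4 --nms_threshold 0.5
62 | ```
63 | 
64 | To embed the detector in your own program, the flow in src/picture_demo.cc reduces to the sketch below (a minimal sketch, assuming the 80-class no-tail model; error handling and buffer cleanup omitted):
65 | 
66 | ```cpp
67 | #include "rknn_net.h"
68 | #include "yolov8_post.h"
69 | #include <opencv2/opencv.hpp>
70 | #include <cstdlib>
71 | 
72 | int main() {
73 |     rknn_net* net = rknn_net_create("../data/yolov8n_no_tail.rknn", 0, false);
74 |     cv::Mat src = cv::imread("../data/bus.jpg");
75 |     float sx = src.cols / 640.0f, sy = src.rows / 640.0f; // map 640-grid boxes back
76 |     cv::Mat net_in;
77 |     cv::resize(src, net_in, cv::Size(640, 640));
78 | 
79 |     // Six raw outputs: (confidence, bbox) pairs for the 20x20, 40x40, 80x80 grids
80 |     float* outputs[6];
81 |     int elems[6] = {80*400, 64*400, 80*1600, 64*1600, 80*6400, 64*6400};
82 |     for (int i = 0; i < 6; ++i) outputs[i] = (float*)malloc(elems[i] * sizeof(float));
83 | 
84 |     rknn_net_inference(net, (int8_t*)net_in.data, outputs);
85 |     std::vector<DetectionResult> dets =
86 |         yolov8_tail_post_process(outputs, 80, sx, sy, 0.4, 0.5, false);
87 |     yolov8_draw_result(dets, src, coco_classes, 80);
88 |     cv::imwrite("result.png", src);
89 |     rknn_net_destroy(net);
90 |     return 0;
91 | }
92 | ```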
--------------------------------------------------------------------------------
/data/bus.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aemior/yolov8n_rk3588/6786a52230bd5022a9788e29d94217541586c9d8/data/bus.jpg
--------------------------------------------------------------------------------
/data/bus.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aemior/yolov8n_rk3588/6786a52230bd5022a9788e29d94217541586c9d8/data/bus.npy
--------------------------------------------------------------------------------
/data/yolov8n_full.onnx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aemior/yolov8n_rk3588/6786a52230bd5022a9788e29d94217541586c9d8/data/yolov8n_full.onnx
--------------------------------------------------------------------------------
/data/yolov8n_no_tail.onnx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aemior/yolov8n_rk3588/6786a52230bd5022a9788e29d94217541586c9d8/data/yolov8n_no_tail.onnx
--------------------------------------------------------------------------------
/data/yolov8n_no_tail.rknn:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aemior/yolov8n_rk3588/6786a52230bd5022a9788e29d94217541586c9d8/data/yolov8n_no_tail.rknn
--------------------------------------------------------------------------------
/include/rknn_api.h:
--------------------------------------------------------------------------------
  1 | /****************************************************************************
  2 | *
  3 | *    Copyright (c) 2017 - 2022 by Rockchip Corp.  All rights reserved.
  4 | *
  5 | *    The material in this file is confidential and contains trade secrets
  6 | *    of Rockchip Corporation. This is proprietary information owned by
  7 | *    Rockchip Corporation. No part of this work may be disclosed,
  8 | *    reproduced, copied, transmitted, or used in any way for any purpose,
  9 | *    without the express written permission of Rockchip Corporation.
 10 | *
 11 | *****************************************************************************/
 12 | 
 13 | 
 14 | #ifndef _RKNN_API_H
 15 | #define _RKNN_API_H
 16 | 
 17 | #ifdef __cplusplus
 18 | extern "C" {
 19 | #endif
 20 | 
 21 | #include <stdint.h>
 22 | 
 23 | /*
 24 |     Definition of extended flag for rknn_init.
 25 | */
 26 | /* set high priority context. */
 27 | #define RKNN_FLAG_PRIOR_HIGH                    0x00000000
 28 | 
 29 | /* set medium priority context */
 30 | #define RKNN_FLAG_PRIOR_MEDIUM                  0x00000001
 31 | 
 32 | /* set low priority context. */
 33 | #define RKNN_FLAG_PRIOR_LOW                     0x00000002
 34 | 
 35 | /* asynchronous mode.
 36 |    when enable, rknn_outputs_get will not block for too long because it directly retrieves the result of
 37 |    the previous frame which can increase the frame rate on single-threaded mode, but at the cost of
 38 |    rknn_outputs_get not retrieves the result of the current frame.
 39 |    in multi-threaded mode you do not need to turn this mode on. */
 40 | #define RKNN_FLAG_ASYNC_MASK                    0x00000004
 41 | 
 42 | /* collect performance mode.
 43 |    when enable, you can get detailed performance reports via rknn_query(ctx, RKNN_QUERY_PERF_DETAIL, ...),
 44 |    but it will reduce the frame rate. 
*/ 45 | #define RKNN_FLAG_COLLECT_PERF_MASK 0x00000008 46 | 47 | /* allocate all memory in outside, includes weight/internal/inputs/outputs */ 48 | #define RKNN_FLAG_MEM_ALLOC_OUTSIDE 0x00000010 49 | 50 | /* weight sharing with the same network structure */ 51 | #define RKNN_FLAG_SHARE_WEIGHT_MEM 0x00000020 52 | 53 | /* send fence fd from outside */ 54 | #define RKNN_FLAG_FENCE_IN_OUTSIDE 0x00000040 55 | 56 | /* get fence fd from inside */ 57 | #define RKNN_FLAG_FENCE_OUT_OUTSIDE 0x00000080 58 | 59 | /* dummy init flag: could only get total_weight_size and total_internal_size by rknn_query*/ 60 | #define RKNN_FLAG_COLLECT_MODEL_INFO_ONLY 0x00000100 61 | 62 | /* 63 | Error code returned by the RKNN API. 64 | */ 65 | #define RKNN_SUCC 0 /* execute succeed. */ 66 | #define RKNN_ERR_FAIL -1 /* execute failed. */ 67 | #define RKNN_ERR_TIMEOUT -2 /* execute timeout. */ 68 | #define RKNN_ERR_DEVICE_UNAVAILABLE -3 /* device is unavailable. */ 69 | #define RKNN_ERR_MALLOC_FAIL -4 /* memory malloc fail. */ 70 | #define RKNN_ERR_PARAM_INVALID -5 /* parameter is invalid. */ 71 | #define RKNN_ERR_MODEL_INVALID -6 /* model is invalid. */ 72 | #define RKNN_ERR_CTX_INVALID -7 /* context is invalid. */ 73 | #define RKNN_ERR_INPUT_INVALID -8 /* input is invalid. */ 74 | #define RKNN_ERR_OUTPUT_INVALID -9 /* output is invalid. */ 75 | #define RKNN_ERR_DEVICE_UNMATCH -10 /* the device is unmatch, please update rknn sdk 76 | and npu driver/firmware. */ 77 | #define RKNN_ERR_INCOMPATILE_PRE_COMPILE_MODEL -11 /* This RKNN model use pre_compile mode, but not compatible with current driver. */ 78 | #define RKNN_ERR_INCOMPATILE_OPTIMIZATION_LEVEL_VERSION -12 /* This RKNN model set optimization level, but not compatible with current driver. */ 79 | #define RKNN_ERR_TARGET_PLATFORM_UNMATCH -13 /* This RKNN model set target platform, but not compatible with current platform. */ 80 | 81 | /* 82 | Definition for tensor 83 | */ 84 | #define RKNN_MAX_DIMS 16 /* maximum dimension of tensor. */ 85 | #define RKNN_MAX_NUM_CHANNEL 15 /* maximum channel number of input tensor. */ 86 | #define RKNN_MAX_NAME_LEN 256 /* maximum name lenth of tensor. */ 87 | 88 | 89 | #ifdef __arm__ 90 | typedef uint32_t rknn_context; 91 | #else 92 | typedef uint64_t rknn_context; 93 | #endif 94 | 95 | 96 | /* 97 | The query command for rknn_query 98 | */ 99 | typedef enum _rknn_query_cmd { 100 | RKNN_QUERY_IN_OUT_NUM = 0, /* query the number of input & output tensor. */ 101 | RKNN_QUERY_INPUT_ATTR = 1, /* query the attribute of input tensor. */ 102 | RKNN_QUERY_OUTPUT_ATTR = 2, /* query the attribute of output tensor. */ 103 | RKNN_QUERY_PERF_DETAIL = 3, /* query the detail performance, need set 104 | RKNN_FLAG_COLLECT_PERF_MASK when call rknn_init, 105 | this query needs to be valid after rknn_outputs_get. */ 106 | RKNN_QUERY_PERF_RUN = 4, /* query the time of run, 107 | this query needs to be valid after rknn_outputs_get. */ 108 | RKNN_QUERY_SDK_VERSION = 5, /* query the sdk & driver version */ 109 | 110 | RKNN_QUERY_MEM_SIZE = 6, /* query the weight & internal memory size */ 111 | RKNN_QUERY_CUSTOM_STRING = 7, /* query the custom string */ 112 | 113 | RKNN_QUERY_NATIVE_INPUT_ATTR = 8, /* query the attribute of native input tensor. */ 114 | RKNN_QUERY_NATIVE_OUTPUT_ATTR = 9, /* query the attribute of native output tensor. */ 115 | 116 | RKNN_QUERY_NATIVE_NC1HWC2_INPUT_ATTR = 8, /* query the attribute of native input tensor. */ 117 | RKNN_QUERY_NATIVE_NC1HWC2_OUTPUT_ATTR = 9, /* query the attribute of native output tensor. 
*/ 118 | 119 | RKNN_QUERY_NATIVE_NHWC_INPUT_ATTR = 10, /* query the attribute of native input tensor. */ 120 | RKNN_QUERY_NATIVE_NHWC_OUTPUT_ATTR = 11, /* query the attribute of native output tensor. */ 121 | 122 | RKNN_QUERY_DEVICE_MEM_INFO = 12, /* query the attribute of rknn memory information. */ 123 | 124 | RKNN_QUERY_CMD_MAX 125 | } rknn_query_cmd; 126 | 127 | /* 128 | the tensor data type. 129 | */ 130 | typedef enum _rknn_tensor_type { 131 | RKNN_TENSOR_FLOAT32 = 0, /* data type is float32. */ 132 | RKNN_TENSOR_FLOAT16, /* data type is float16. */ 133 | RKNN_TENSOR_INT8, /* data type is int8. */ 134 | RKNN_TENSOR_UINT8, /* data type is uint8. */ 135 | RKNN_TENSOR_INT16, /* data type is int16. */ 136 | RKNN_TENSOR_UINT16, /* data type is uint16. */ 137 | RKNN_TENSOR_INT32, /* data type is int32. */ 138 | RKNN_TENSOR_UINT32, /* data type is uint32. */ 139 | RKNN_TENSOR_INT64, /* data type is int64. */ 140 | RKNN_TENSOR_BOOL, 141 | 142 | RKNN_TENSOR_TYPE_MAX 143 | } rknn_tensor_type; 144 | 145 | inline static const char* get_type_string(rknn_tensor_type type) 146 | { 147 | switch(type) { 148 | case RKNN_TENSOR_FLOAT32: return "FP32"; 149 | case RKNN_TENSOR_FLOAT16: return "FP16"; 150 | case RKNN_TENSOR_INT8: return "INT8"; 151 | case RKNN_TENSOR_UINT8: return "UINT8"; 152 | case RKNN_TENSOR_INT16: return "INT16"; 153 | case RKNN_TENSOR_UINT16: return "UINT16"; 154 | case RKNN_TENSOR_INT32: return "INT32"; 155 | case RKNN_TENSOR_UINT32: return "UINT32"; 156 | case RKNN_TENSOR_INT64: return "INT64"; 157 | case RKNN_TENSOR_BOOL: return "BOOL"; 158 | default: return "UNKNOW"; 159 | } 160 | } 161 | 162 | /* 163 | the quantitative type. 164 | */ 165 | typedef enum _rknn_tensor_qnt_type { 166 | RKNN_TENSOR_QNT_NONE = 0, /* none. */ 167 | RKNN_TENSOR_QNT_DFP, /* dynamic fixed point. */ 168 | RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC, /* asymmetric affine. */ 169 | 170 | RKNN_TENSOR_QNT_MAX 171 | } rknn_tensor_qnt_type; 172 | 173 | inline static const char* get_qnt_type_string(rknn_tensor_qnt_type type) 174 | { 175 | switch(type) { 176 | case RKNN_TENSOR_QNT_NONE: return "NONE"; 177 | case RKNN_TENSOR_QNT_DFP: return "DFP"; 178 | case RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC: return "AFFINE"; 179 | default: return "UNKNOW"; 180 | } 181 | } 182 | 183 | /* 184 | the tensor data format. 185 | */ 186 | typedef enum _rknn_tensor_format { 187 | RKNN_TENSOR_NCHW = 0, /* data format is NCHW. */ 188 | RKNN_TENSOR_NHWC, /* data format is NHWC. */ 189 | RKNN_TENSOR_NC1HWC2, /* data format is NC1HWC2. */ 190 | RKNN_TENSOR_UNDEFINED, 191 | 192 | RKNN_TENSOR_FORMAT_MAX 193 | } rknn_tensor_format; 194 | 195 | /* 196 | the mode of running on target NPU core. 197 | */ 198 | typedef enum _rknn_core_mask { 199 | RKNN_NPU_CORE_AUTO = 0, /* default, run on NPU core randomly. */ 200 | RKNN_NPU_CORE_0 = 1, /* run on NPU core 0. */ 201 | RKNN_NPU_CORE_1 = 2, /* run on NPU core 1. */ 202 | RKNN_NPU_CORE_2 = 4, /* run on NPU core 2. */ 203 | RKNN_NPU_CORE_0_1 = RKNN_NPU_CORE_0 | RKNN_NPU_CORE_1, /* run on NPU core 1 and core 2. */ 204 | RKNN_NPU_CORE_0_1_2 = RKNN_NPU_CORE_0_1 | RKNN_NPU_CORE_2, /* run on NPU core 1 and core 2 and core 3. 
*/ 205 | 206 | RKNN_NPU_CORE_UNDEFINED, 207 | } rknn_core_mask; 208 | 209 | inline static const char* get_format_string(rknn_tensor_format fmt) 210 | { 211 | switch(fmt) { 212 | case RKNN_TENSOR_NCHW: return "NCHW"; 213 | case RKNN_TENSOR_NHWC: return "NHWC"; 214 | case RKNN_TENSOR_NC1HWC2: return "NC1HWC2"; 215 | case RKNN_TENSOR_UNDEFINED: return "UNDEFINED"; 216 | default: return "UNKNOW"; 217 | } 218 | } 219 | 220 | /* 221 | the information for RKNN_QUERY_IN_OUT_NUM. 222 | */ 223 | typedef struct _rknn_input_output_num { 224 | uint32_t n_input; /* the number of input. */ 225 | uint32_t n_output; /* the number of output. */ 226 | } rknn_input_output_num; 227 | 228 | /* 229 | the information for RKNN_QUERY_INPUT_ATTR / RKNN_QUERY_OUTPUT_ATTR. 230 | */ 231 | typedef struct _rknn_tensor_attr { 232 | uint32_t index; /* input parameter, the index of input/output tensor, 233 | need set before call rknn_query. */ 234 | 235 | uint32_t n_dims; /* the number of dimensions. */ 236 | uint32_t dims[RKNN_MAX_DIMS]; /* the dimensions array. */ 237 | char name[RKNN_MAX_NAME_LEN]; /* the name of tensor. */ 238 | 239 | uint32_t n_elems; /* the number of elements. */ 240 | uint32_t size; /* the bytes size of tensor. */ 241 | 242 | rknn_tensor_format fmt; /* the data format of tensor. */ 243 | rknn_tensor_type type; /* the data type of tensor. */ 244 | rknn_tensor_qnt_type qnt_type; /* the quantitative type of tensor. */ 245 | int8_t fl; /* fractional length for RKNN_TENSOR_QNT_DFP. */ 246 | int32_t zp; /* zero point for RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC. */ 247 | float scale; /* scale for RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC. */ 248 | 249 | uint32_t w_stride; /* the stride of tensor along the width dimention of input, 250 | Note: it is read-only, 0 means equal to width. */ 251 | uint32_t size_with_stride; /* the bytes size of tensor with stride. */ 252 | 253 | uint8_t pass_through; /* pass through mode, for rknn_set_io_mem interface. 254 | if TRUE, the buf data is passed directly to the input node of the rknn model 255 | without any conversion. the following variables do not need to be set. 256 | if FALSE, the buf data is converted into an input consistent with the model 257 | according to the following type and fmt. so the following variables 258 | need to be set.*/ 259 | uint32_t h_stride; /* the stride along the height dimention of input, 260 | Note: it is write-only, if it was set to 0, h_stride = height. */ 261 | } rknn_tensor_attr; 262 | 263 | /* 264 | the information for RKNN_QUERY_PERF_DETAIL. 265 | */ 266 | typedef struct _rknn_perf_detail { 267 | char* perf_data; /* the string pointer of perf detail. don't need free it by user. */ 268 | uint64_t data_len; /* the string length. */ 269 | } rknn_perf_detail; 270 | 271 | /* 272 | the information for RKNN_QUERY_PERF_RUN. 273 | */ 274 | typedef struct _rknn_perf_run { 275 | int64_t run_duration; /* real inference time (us) */ 276 | } rknn_perf_run; 277 | 278 | /* 279 | the information for RKNN_QUERY_SDK_VERSION. 280 | */ 281 | typedef struct _rknn_sdk_version { 282 | char api_version[256]; /* the version of rknn api. */ 283 | char drv_version[256]; /* the version of rknn driver. */ 284 | } rknn_sdk_version; 285 | 286 | /* 287 | the information for RKNN_QUERY_MEM_SIZE. 
288 | */ 289 | typedef struct _rknn_mem_size { 290 | uint32_t total_weight_size; /* the weight memory size */ 291 | uint32_t total_internal_size; /* the internal memory size, exclude inputs/outputs */ 292 | uint64_t total_dma_allocated_size; /* total dma memory allocated size */ 293 | uint32_t total_sram_size; /* total system sram size reserved for rknn */ 294 | uint32_t free_sram_size; /* free system sram size reserved for rknn */ 295 | uint32_t reserved[10]; /* reserved */ 296 | } rknn_mem_size; 297 | 298 | /* 299 | the information for RKNN_QUERY_CUSTOM_STRING. 300 | */ 301 | typedef struct _rknn_custom_string { 302 | char string[1024]; /* the string of custom, lengths max to 1024 bytes */ 303 | } rknn_custom_string; 304 | 305 | /* 306 | The flags of rknn_tensor_mem. 307 | */ 308 | typedef enum _rknn_tensor_mem_flags { 309 | RKNN_TENSOR_MEMORY_FLAGS_ALLOC_INSIDE = 1, /*Used to mark in rknn_destroy_mem() whether it is necessary to release the "mem" pointer itself. 310 | If the flag RKNN_TENSOR_MEMORY_FLAGS_ALLOC_INSIDE is set, rknn_destroy_mem() will call free(mem).*/ 311 | RKNN_TENSOR_MEMORY_FLAGS_FROM_FD = 2, /*Used to mark in rknn_create_mem_from_fd() whether it is necessary to release the "mem" pointer itself. 312 | If the flag RKNN_TENSOR_MEMORY_FLAGS_FROM_FD is set, rknn_destroy_mem() will call free(mem).*/ 313 | RKNN_TENSOR_MEMORY_FLAGS_FROM_PHYS = 3, /*Used to mark in rknn_create_mem_from_phys() whether it is necessary to release the "mem" pointer itself. 314 | If the flag RKNN_TENSOR_MEMORY_FLAGS_FROM_PHYS is set, rknn_destroy_mem() will call free(mem).*/ 315 | RKNN_TENSOR_MEMORY_FLAGS_UNKNOWN 316 | } rknn_tensor_mem_flags; 317 | 318 | /* 319 | the memory information of tensor. 320 | */ 321 | typedef struct _rknn_tensor_memory { 322 | void* virt_addr; /* the virtual address of tensor buffer. */ 323 | uint64_t phys_addr; /* the physical address of tensor buffer. */ 324 | int32_t fd; /* the fd of tensor buffer. */ 325 | int32_t offset; /* indicates the offset of the memory. */ 326 | uint32_t size; /* the size of tensor buffer. */ 327 | uint32_t flags; /* the flags of tensor buffer, reserved */ 328 | void * priv_data; /* the private data of tensor buffer. */ 329 | } rknn_tensor_mem; 330 | 331 | /* 332 | the input information for rknn_input_set. 333 | */ 334 | typedef struct _rknn_input { 335 | uint32_t index; /* the input index. */ 336 | void* buf; /* the input buf for index. */ 337 | uint32_t size; /* the size of input buf. */ 338 | uint8_t pass_through; /* pass through mode. 339 | if TRUE, the buf data is passed directly to the input node of the rknn model 340 | without any conversion. the following variables do not need to be set. 341 | if FALSE, the buf data is converted into an input consistent with the model 342 | according to the following type and fmt. so the following variables 343 | need to be set.*/ 344 | rknn_tensor_type type; /* the data type of input buf. */ 345 | rknn_tensor_format fmt; /* the data format of input buf. 346 | currently the internal input format of NPU is NCHW by default. 347 | so entering NCHW data can avoid the format conversion in the driver. */ 348 | } rknn_input; 349 | 350 | /* 351 | the output information for rknn_outputs_get. 352 | */ 353 | typedef struct _rknn_output { 354 | uint8_t want_float; /* want transfer output data to float */ 355 | uint8_t is_prealloc; /* whether buf is pre-allocated. 356 | if TRUE, the following variables need to be set. 357 | if FALSE, the following variables do not need to be set. 
*/ 358 | uint32_t index; /* the output index. */ 359 | void* buf; /* the output buf for index. 360 | when is_prealloc = FALSE and rknn_outputs_release called, 361 | this buf pointer will be free and don't use it anymore. */ 362 | uint32_t size; /* the size of output buf. */ 363 | } rknn_output; 364 | 365 | /* 366 | the extend information for rknn_init. 367 | */ 368 | typedef struct _rknn_init_extend { 369 | rknn_context ctx; /* rknn context */ 370 | int32_t real_model_offset; /* real rknn model file size, only valid when init context with rknn file path */ 371 | uint32_t real_model_size; /* real rknn model file offset, only valid when init context with rknn file path */ 372 | uint8_t reserved[120]; /* reserved */ 373 | } rknn_init_extend; 374 | 375 | /* 376 | the extend information for rknn_run. 377 | */ 378 | typedef struct _rknn_run_extend { 379 | uint64_t frame_id; /* output parameter, indicate current frame id of run. */ 380 | int32_t non_block; /* block flag of run, 0 is block else 1 is non block */ 381 | int32_t timeout_ms; /* timeout for block mode, in milliseconds */ 382 | int32_t fence_fd; /* fence fd from other unit */ 383 | } rknn_run_extend; 384 | 385 | /* 386 | the extend information for rknn_outputs_get. 387 | */ 388 | typedef struct _rknn_output_extend { 389 | uint64_t frame_id; /* output parameter, indicate the frame id of outputs, corresponds to 390 | struct rknn_run_extend.frame_id.*/ 391 | } rknn_output_extend; 392 | 393 | 394 | /* rknn_init 395 | 396 | initial the context and load the rknn model. 397 | 398 | input: 399 | rknn_context* context the pointer of context handle. 400 | void* model if size > 0, pointer to the rknn model, if size = 0, filepath to the rknn model. 401 | uint32_t size the size of rknn model. 402 | uint32_t flag extend flag, see the define of RKNN_FLAG_XXX_XXX. 403 | rknn_init_extend* extend the extend information of init. 404 | return: 405 | int error code. 406 | */ 407 | int rknn_init(rknn_context* context, void* model, uint32_t size, uint32_t flag, rknn_init_extend* extend); 408 | 409 | /* rknn_dup_context 410 | 411 | initial the context and load the rknn model. 412 | 413 | input: 414 | rknn_context* context_in the pointer of context in handle. 415 | rknn_context* context_out the pointer of context out handle. 416 | return: 417 | int error code. 418 | */ 419 | int rknn_dup_context(rknn_context* context_in, rknn_context* context_out); 420 | 421 | /* rknn_destroy 422 | 423 | unload the rknn model and destroy the context. 424 | 425 | input: 426 | rknn_context context the handle of context. 427 | return: 428 | int error code. 429 | */ 430 | int rknn_destroy(rknn_context context); 431 | 432 | 433 | /* rknn_query 434 | 435 | query the information about model or others. see rknn_query_cmd. 436 | 437 | input: 438 | rknn_context context the handle of context. 439 | rknn_query_cmd cmd the command of query. 440 | void* info the buffer point of information. 441 | uint32_t size the size of information. 442 | return: 443 | int error code. 444 | */ 445 | int rknn_query(rknn_context context, rknn_query_cmd cmd, void* info, uint32_t size); 446 | 447 | 448 | /* rknn_inputs_set 449 | 450 | set inputs information by input index of rknn model. 451 | inputs information see rknn_input. 452 | 453 | input: 454 | rknn_context context the handle of context. 455 | uint32_t n_inputs the number of inputs. 456 | rknn_input inputs[] the arrays of inputs information, see rknn_input. 
457 | return: 458 | int error code 459 | */ 460 | int rknn_inputs_set(rknn_context context, uint32_t n_inputs, rknn_input inputs[]); 461 | 462 | /* 463 | rknn_set_batch_core_num 464 | 465 | set rknn batch core_num. 466 | 467 | input: 468 | rknn_context context the handle of context. 469 | int core_num the core number. 470 | return: 471 | int error code. 472 | 473 | */ 474 | int rknn_set_batch_core_num(rknn_context context, int core_num); 475 | 476 | /* rknn_set_core_mask 477 | 478 | set rknn core mask.(only supported on RK3588 now) 479 | 480 | RKNN_NPU_CORE_AUTO: auto mode, default value 481 | RKNN_NPU_CORE_0: core 0 mode 482 | RKNN_NPU_CORE_1: core 1 mode 483 | RKNN_NPU_CORE_2: core 2 mode 484 | RKNN_NPU_CORE_0_1: combine core 0/1 mode 485 | RKNN_NPU_CORE_0_1_2: combine core 0/1/2 mode 486 | 487 | input: 488 | rknn_context context the handle of context. 489 | rknn_core_mask core_mask the core mask. 490 | return: 491 | int error code. 492 | */ 493 | int rknn_set_core_mask(rknn_context context, rknn_core_mask core_mask); 494 | 495 | /* rknn_run 496 | 497 | run the model to execute inference. 498 | 499 | input: 500 | rknn_context context the handle of context. 501 | rknn_run_extend* extend the extend information of run. 502 | return: 503 | int error code. 504 | */ 505 | int rknn_run(rknn_context context, rknn_run_extend* extend); 506 | 507 | 508 | /* rknn_wait 509 | 510 | wait the model after execute inference. 511 | 512 | input: 513 | rknn_context context the handle of context. 514 | rknn_run_extend* extend the extend information of run. 515 | return: 516 | int error code. 517 | */ 518 | int rknn_wait(rknn_context context, rknn_run_extend* extend); 519 | 520 | 521 | /* rknn_outputs_get 522 | 523 | wait the inference to finish and get the outputs. 524 | this function will block until inference finish. 525 | the results will set to outputs[]. 526 | 527 | input: 528 | rknn_context context the handle of context. 529 | uint32_t n_outputs the number of outputs. 530 | rknn_output outputs[] the arrays of output, see rknn_output. 531 | rknn_output_extend* the extend information of output. 532 | return: 533 | int error code. 534 | */ 535 | int rknn_outputs_get(rknn_context context, uint32_t n_outputs, rknn_output outputs[], rknn_output_extend* extend); 536 | 537 | 538 | /* rknn_outputs_release 539 | 540 | release the outputs that get by rknn_outputs_get. 541 | after called, the rknn_output[x].buf get from rknn_outputs_get will 542 | also be free when rknn_output[x].is_prealloc = FALSE. 543 | 544 | input: 545 | rknn_context context the handle of context. 546 | uint32_t n_ouputs the number of outputs. 547 | rknn_output outputs[] the arrays of output. 548 | return: 549 | int error code 550 | */ 551 | int rknn_outputs_release(rknn_context context, uint32_t n_ouputs, rknn_output outputs[]); 552 | 553 | 554 | /* new api for zero copy */ 555 | 556 | /* rknn_create_mem_from_phys (memory allocated outside) 557 | 558 | initialize tensor memory from physical address. 559 | 560 | input: 561 | rknn_context ctx the handle of context. 562 | uint64_t phys_addr physical address. 563 | void *virt_addr virtual address. 564 | uint32_t size the size of tensor buffer. 565 | return: 566 | rknn_tensor_mem the pointer of tensor memory information. 567 | */ 568 | rknn_tensor_mem* rknn_create_mem_from_phys(rknn_context ctx, uint64_t phys_addr, void *virt_addr, uint32_t size); 569 | 570 | 571 | /* rknn_create_mem_from_fd (memory allocated outside) 572 | 573 | initialize tensor memory from file description. 
574 | 575 | input: 576 | rknn_context ctx the handle of context. 577 | int32_t fd file description. 578 | void *virt_addr virtual address. 579 | uint32_t size the size of tensor buffer. 580 | int32_t offset indicates the offset of the memory (virt_addr without offset). 581 | return: 582 | rknn_tensor_mem the pointer of tensor memory information. 583 | */ 584 | rknn_tensor_mem* rknn_create_mem_from_fd(rknn_context ctx, int32_t fd, void *virt_addr, uint32_t size, int32_t offset); 585 | 586 | 587 | /* rknn_create_mem_from_mb_blk (memory allocated outside) 588 | 589 | create tensor memory from mb_blk. 590 | 591 | input: 592 | rknn_context ctx the handle of context. 593 | void *mb_blk mb_blk allocate from system api. 594 | int32_t offset indicates the offset of the memory. 595 | return: 596 | rknn_tensor_mem the pointer of tensor memory information. 597 | */ 598 | rknn_tensor_mem* rknn_create_mem_from_mb_blk(rknn_context ctx, void *mb_blk, int32_t offset); 599 | 600 | 601 | /* rknn_create_mem (memory allocated inside) 602 | 603 | create tensor memory. 604 | 605 | input: 606 | rknn_context ctx the handle of context. 607 | uint32_t size the size of tensor buffer. 608 | return: 609 | rknn_tensor_mem the pointer of tensor memory information. 610 | */ 611 | rknn_tensor_mem* rknn_create_mem(rknn_context ctx, uint32_t size); 612 | 613 | 614 | /* rknn_destroy_mem (support allocate inside and outside) 615 | 616 | destroy tensor memory. 617 | 618 | input: 619 | rknn_context ctx the handle of context. 620 | rknn_tensor_mem *mem the pointer of tensor memory information. 621 | return: 622 | int error code 623 | */ 624 | int rknn_destroy_mem(rknn_context ctx, rknn_tensor_mem *mem); 625 | 626 | 627 | /* rknn_set_weight_mem 628 | 629 | set the weight memory. 630 | 631 | input: 632 | rknn_context ctx the handle of context. 633 | rknn_tensor_mem *mem the array of tensor memory information 634 | return: 635 | int error code. 636 | */ 637 | int rknn_set_weight_mem(rknn_context ctx, rknn_tensor_mem *mem); 638 | 639 | 640 | /* rknn_set_internal_mem 641 | 642 | set the internal memory. 643 | 644 | input: 645 | rknn_context ctx the handle of context. 646 | rknn_tensor_mem *mem the array of tensor memory information 647 | return: 648 | int error code. 649 | */ 650 | int rknn_set_internal_mem(rknn_context ctx, rknn_tensor_mem *mem); 651 | 652 | 653 | /* rknn_set_io_mem 654 | 655 | set the input and output tensors buffer. 656 | 657 | input: 658 | rknn_context ctx the handle of context. 659 | rknn_tensor_mem *mem the array of tensor memory information. 660 | rknn_tensor_attr *attr the attribute of input or output tensor buffer. 661 | return: 662 | int error code. 
663 | */
664 | int rknn_set_io_mem(rknn_context ctx, rknn_tensor_mem *mem, rknn_tensor_attr *attr);
665 | 
666 | 
667 | #ifdef __cplusplus
668 | } //extern "C"
669 | #endif
670 | 
671 | #endif //_RKNN_API_H
672 | 
--------------------------------------------------------------------------------
/include/rknn_net.h:
--------------------------------------------------------------------------------
 1 | #ifndef RKNN_NET_H
 2 | #define RKNN_NET_H
 3 | 
 4 | #include "rknn_api.h"
 5 | #include <stddef.h>  // or <cstddef> for size_t
 6 | #include <stdint.h>  // for uint8_t
 7 | #include <stdbool.h>
 8 | 
 9 | 
10 | // Define the struct
11 | typedef struct rknn_net {
12 |     rknn_context* ctx;
13 |     int debug_flag;
14 |     uint8_t* model;
15 |     size_t model_size;
16 |     rknn_tensor_attr* input_attrs;
17 |     rknn_tensor_attr* output_attrs;
18 |     int num_input;
19 |     int num_output;
20 |     int input_height;
21 |     int input_width;
22 |     int input_channel;
23 | } rknn_net;
24 | 
25 | 
26 | // Function prototypes
27 | rknn_net* rknn_net_create(const char* model_path, int target_npu, int debug_flag);
28 | int rknn_net_inference(rknn_net* net, int8_t* input, float** output);
29 | void rknn_net_destroy(rknn_net* net);
30 | static float deqnt_affine_to_f32(int8_t qnt, int32_t zp, float scale) { return ((float)qnt - (float)zp) * scale; }
31 | 
32 | #endif // RKNN_NET_H
--------------------------------------------------------------------------------
/include/yolo_multi_npu.h:
--------------------------------------------------------------------------------
  1 | #include <queue>
  2 | #include <mutex>
  3 | #include <condition_variable>
  4 | #include <pthread.h>
  5 | #include <cstdio>
  6 | #include <cstdlib>
  7 | #include <cstring>
  8 | #include "yolov8_post.h"
  9 | #include "rknn_net.h"
 10 | #include <string>
 11 | #include <vector>
 12 | #include <unistd.h>
 13 | #include <opencv2/opencv.hpp>
 14 | 
 15 | template <typename T>
 16 | class ThreadSafeQueue {
 17 | private:
 18 |     std::queue<T> queue;
 19 |     mutable std::mutex mutex;
 20 |     std::condition_variable cond_empty;
 21 |     std::condition_variable cond_full;
 22 |     size_t max_size;
 23 | 
 24 | public:
 25 |     ThreadSafeQueue(size_t max_size) : max_size(max_size) {}
 26 | 
 27 |     void enqueue(T t) {
 28 |         std::unique_lock<std::mutex> lock(mutex);
 29 |         cond_full.wait(lock, [&](){ return queue.size() < max_size; });
 30 |         queue.push(t);
 31 |         cond_empty.notify_one();
 32 |     }
 33 | 
 34 |     T dequeue() {
 35 |         std::unique_lock<std::mutex> lock(mutex);
 36 |         cond_empty.wait(lock, [&](){ return !queue.empty(); });
 37 |         T val = queue.front();
 38 |         queue.pop();
 39 |         cond_full.notify_one();
 40 |         return val;
 41 |     }
 42 | };
 43 | 
 44 | 
 45 | #define BUFFER_SIZE 150
 46 | #define ELEMENT_SIZE 3*640*640
 47 | 
 48 | typedef struct {
 49 |     int8_t* data[BUFFER_SIZE];
 50 |     int start; // Index of the oldest element
 51 |     int end;   // Index at which to write new element
 52 |     pthread_mutex_t lock;
 53 |     pthread_cond_t notEmpty; // Signaled when the buffer is not empty
 54 |     pthread_cond_t notFull;  // Signaled when the buffer is not full
 55 |     bool stop; // Flag to stop the thread
 56 | } CircularBuffer;
 57 | 
 58 | typedef struct {
 59 |     cv::VideoCapture* cam;
 60 |     CircularBuffer* output;
 61 | } video_read_arg;
 62 | 
 63 | typedef struct {
 64 |     CircularBuffer* input;
 65 |     CircularBuffer* output;
 66 |     int* target_npu;
 67 |     std::string* model_path;
 68 | } yolov8_npu_arg;
 69 | 
 70 | void initialize(CircularBuffer *buffer) {
 71 |     buffer->start = 0;
 72 |     buffer->end = 0;
 73 |     for (int i=0; i< BUFFER_SIZE;++i) {
 74 |         buffer->data[i] = (int8_t*)malloc(ELEMENT_SIZE * sizeof(int8_t));
 75 |     }
 76 |     pthread_mutex_init(&buffer->lock, NULL);
 77 |     pthread_cond_init(&buffer->notEmpty, NULL);
 78 |     pthread_cond_init(&buffer->notFull, NULL);
 79 |     buffer->stop = false;
 80 | }
 81 | 
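 82 | // A fixed-capacity blocking ring buffer of 640x640x3 int8 frames: enqueue()
 83 | // blocks while the buffer is full, dequeue() blocks while it is empty, so the
 84 | // video reader, the NPU workers, and the writer pace one another. The stop
 85 | // flag is re-checked after every wakeup so that a Ctrl+C in video_demo.cc can
 86 | // unblock threads that are parked on a full or empty buffer.
 87 | 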
 88 | void enqueue(CircularBuffer *buffer, int8_t* input) {
 89 |     pthread_mutex_lock(&buffer->lock);
 90 |     while ((buffer->end + 1) % BUFFER_SIZE == buffer->start) { // Buffer is full
 91 |         pthread_cond_wait(&buffer->notFull, &buffer->lock); // Wait until not full
 92 |         if(buffer->stop) {
 93 |             pthread_mutex_unlock(&buffer->lock);
 94 |             buffer->end = 1;
 95 |             buffer->start = 0;
 96 |             return;
 97 |         }
 98 |     }
 99 |     if(buffer->stop) {
100 |         pthread_mutex_unlock(&buffer->lock);
101 |         return;
102 |     }
103 |     memcpy(buffer->data[buffer->end], input, ELEMENT_SIZE * sizeof(int8_t));
104 |     buffer->end = (buffer->end + 1) % BUFFER_SIZE;
105 |     pthread_cond_signal(&buffer->notEmpty); // Signal that the buffer is not empty
106 |     pthread_mutex_unlock(&buffer->lock);
107 | }
108 | 
109 | void dequeue(CircularBuffer *buffer, int8_t* output) {
110 |     pthread_mutex_lock(&buffer->lock);
111 |     while (buffer->end == buffer->start) { // Buffer is empty
112 |         pthread_cond_wait(&buffer->notEmpty, &buffer->lock); // Wait until not empty
113 |         if(buffer->stop) {
114 |             pthread_mutex_unlock(&buffer->lock);
115 |             buffer->end = 1;
116 |             buffer->start = 0;
117 |             return;
118 |         }
119 |     }
120 |     if(buffer->stop) {
121 |         pthread_mutex_unlock(&buffer->lock);
122 |         return;
123 |     }
124 |     memcpy(output, buffer->data[buffer->start], ELEMENT_SIZE * sizeof(int8_t));
125 |     buffer->start = (buffer->start + 1) % BUFFER_SIZE;
126 |     pthread_cond_signal(&buffer->notFull); // Signal that the buffer is not full
127 |     pthread_mutex_unlock(&buffer->lock);
128 | }
129 | 
130 | void release(CircularBuffer *buffer) {
131 |     for (int i=0; i<BUFFER_SIZE; ++i) {
132 |         free(buffer->data[i]);
133 |     }
134 |     // The buffers themselves are file-scope objects in video_demo.cc, so only
135 |     // destroy the pthread primitives here instead of calling free() on the struct.
136 |     pthread_mutex_destroy(&buffer->lock);
137 |     pthread_cond_destroy(&buffer->notEmpty);
138 |     pthread_cond_destroy(&buffer->notFull);
139 | }
140 | 
141 | void* video_read_thread(void* args) {
142 |     video_read_arg* video_args = (video_read_arg*)args;
143 |     cv::VideoCapture* cam = video_args->cam;
144 |     CircularBuffer* output_buffer = video_args->output;
145 |     cv::Mat frame;
146 |     fprintf(stdout, "Video Thread start!\n");
147 |     while (true)
148 |     {
149 |         *cam >> frame;
150 |         if(frame.empty())
151 |             break;
152 |         // Resize the image to 640x640
153 |         cv::resize(frame, frame, cv::Size(640, 640));
154 |         enqueue(output_buffer, (int8_t*)frame.data);
155 |         if(output_buffer->stop) break;
156 |     }
157 |     fprintf(stdout, "Video Thread stop!\n");
158 |     output_buffer->stop = true;
159 |     pthread_cond_signal(&output_buffer->notEmpty); // Signal that the buffer is not empty
160 |     return NULL;
161 | }
162 | 
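163 | // Worker thread: binds one rknn context to the NPU core named by target_npu,
164 | // then loops dequeue -> inference -> post-process -> draw -> enqueue. The six
165 | // output buffers follow the no-tail model: a (class_num x cells) confidence
166 | // tensor and a (64 x cells) DFL bbox tensor for each of the 20x20 (400),
167 | // 40x40 (1600) and 80x80 (6400) grids.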
168 | void* yolov8_npu_thread(void* args) {
169 |     yolov8_npu_arg* yolo_args = (yolov8_npu_arg*)args;
170 |     std::string* model_path = yolo_args->model_path;
171 |     int* target_npu = yolo_args->target_npu;
172 |     CircularBuffer* input_buffer = yolo_args->input;
173 |     CircularBuffer* output_buffer = yolo_args->output;
174 | 
175 |     rknn_net* yolov8 = rknn_net_create((*model_path).c_str(), *target_npu, false);
176 | 
177 |     int8_t* input;
178 |     input = (int8_t*)malloc(3*640*640*sizeof(int8_t));
179 | 
180 |     float* outputs[6];
181 |     outputs[0] = (float*)malloc(1*80*400 * sizeof(float));
182 |     outputs[1] = (float*)malloc(1*64*400 * sizeof(float));
183 |     outputs[2] = (float*)malloc(1*80*1600 * sizeof(float));
184 |     outputs[3] = (float*)malloc(1*64*1600 * sizeof(float));
185 |     outputs[4] = (float*)malloc(1*80*6400 * sizeof(float));
186 |     outputs[5] = (float*)malloc(1*64*6400 * sizeof(float));
187 | 
188 |     fprintf(stdout, "NPU Thread start, NPU:%d\n", *yolo_args->target_npu);
189 |     while (true)
190 |     {
191 |         dequeue(input_buffer, input);
192 |         if(input_buffer->stop) break;
193 |         rknn_net_inference(yolov8, input, outputs);
194 |         std::vector<DetectionResult> results = yolov8_tail_post_process(outputs, 80, 1.0, 1.0, 0.3, 0.4, false);
195 |         cv::Mat output_img(640, 640, CV_8UC3, (uint8_t*)input);
196 |         yolov8_draw_result(results, output_img, coco_classes, 80);
197 |         enqueue(output_buffer, (int8_t*)output_img.data);
198 |     }
199 |     rknn_net_destroy(yolov8);
200 |     for (int i=0; i<6; ++i) {
201 |         free(outputs[i]);
202 |     }
203 |     free(input);
204 |     output_buffer->stop = true;
205 |     pthread_cond_signal(&output_buffer->notEmpty); // Signal that the buffer is not empty
206 |     fprintf(stdout,"NPU Thread stop, NPU:%d\n", *yolo_args->target_npu);
207 |     return NULL;
208 | }
--------------------------------------------------------------------------------
/include/yolov8_post.h:
--------------------------------------------------------------------------------
 1 | #ifndef YOLOV8_POST_H
 2 | #define YOLOV8_POST_H
 3 | #include <stddef.h>  // or <cstddef> for size_t
 4 | #include <stdint.h>  // for uint8_t
 5 | #include <vector>
 6 | #include <string>
 7 | #include <cstdio>
 8 | #include <cmath>
 9 | #include <opencv2/opencv.hpp>
10 | #include <opencv2/dnn.hpp>
11 | 
12 | typedef struct {
13 |     float score;
14 |     int cls;
15 |     int x;
16 |     int y;
17 |     int w;
18 |     int h;
19 | } DetectionResult;
20 | 
21 | const std::string coco_classes[] = {"person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat", "traffic light",
22 |     "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant",
23 |     "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite",
24 |     "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife",
25 |     "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa",
26 |     "pottedplant", "bed", "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave",
27 |     "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"};
28 | 
29 | /** @brief Performs the tail process for yolov8 (no tail, i.e. the DFL decode).
30 | 
31 |  * @param input input[0] is the buffer of confidence of layer_20x20 with shape (1xclass_numx20x20),
32 |  *              input[1] is the buffer of bbox of layer_20x20 with shape (1x64x20x20),
33 |  *              input[2] is the buffer of confidence of layer_40x40 with shape (1xclass_numx40x40),
34 |  *              input[3] is the buffer of bbox of layer_40x40 with shape (1x64x40x40),
35 |  *              input[4] is the buffer of confidence of layer_80x80 with shape (1xclass_numx80x80),
36 |  *              input[5] is the buffer of bbox of layer_80x80 with shape (1x64x80x80).
37 |  * @param output output[0] is the buffer of the DFL result with shape (1x8400x4),
38 |  *               output[1] is the buffer of the confidence with shape (1x8400xclass_num).
39 |  * @param class_num the total number of classes.
40 |  */
41 | void yolov8_tail_process(float** input, float** output, int class_num);
42 | 
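43 | // DFL decode, for reference: each bbox side is predicted as a 16-bin
44 | // distribution (4 sides x 16 bins = 64 channels). The decode applies exp()
45 | // and then a softmax-weighted sum over the 16 bins (softmax_weightsum() in
46 | // src/yolov8_post.cc), giving the side's distance from the cell center
47 | // (x+0.5, y+0.5) in grid units; the post-process multiplies by the stride
48 | // {8, 16, 32} to obtain pixel coordinates in the 640x640 input.
49 | 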
50 | /** @brief Performs the post process for yolov8 (no bbox head).
51 | 
52 |  * @param input input[0] is the buffer of the DFL result with shape (1x8400x4),
53 |  *              input[1] is the buffer of the confidence with shape (1x8400xclass_num).
54 |  * @param class_num the total number of classes.
55 |  * @param scale_x the width scale of the origin image relative to 640, equal to origin_img.width/640.
56 |  * @param scale_y the height scale of the origin image relative to 640, equal to origin_img.height/640.
57 |  * @param score_threshold a threshold used to filter boxes by score.
58 |  * @param nms_threshold a threshold used in non-maximum suppression.
59 |  * @param debug_flag whether to print debug information.
60 |  */
61 | std::vector<DetectionResult> yolov8_post_process(float** input, int class_num, float scale_x, float scale_y, float score_threshold, float nms_threshold, bool debug_flag);
62 | std::vector<DetectionResult> yolov8_tail_post_process(float** input, int class_num, float scale_x, float scale_y, float score_threshold, float nms_threshold, bool debug_flag);
63 | 
64 | /** @brief Draw the detection results on an image.
65 | 
66 |  * @param results detection results.
67 |  * @param img image to draw on.
68 |  * @param class_names the name of each class.
69 |  * @param num_of_classes the total number of classes.
70 |  */
71 | void yolov8_draw_result(std::vector<DetectionResult> results, cv::Mat &img, const std::string* class_names, int num_of_classes);
72 | 
73 | 
74 | 
75 | #endif // YOLOV8_POST_H
--------------------------------------------------------------------------------
/src/picture_demo.cc:
--------------------------------------------------------------------------------
  1 | #include <opencv2/opencv.hpp>
  2 | #include "rknn_net.h"
  3 | #include "yolov8_post.h"
  4 | #include <getopt.h>
  5 | #include <cstdio>
  6 | #include <cstdlib>
  7 | #include <cstring>
  8 | #include <string>
  9 | #include <vector>
 10 | #include <iostream>
 11 | #include <chrono>
 12 | 
 13 | void print_help() {
 14 |     printf("Usage: your_program [OPTIONS]\n");
 15 |     printf("Options:\n");
 16 |     printf("\t-i, --input\t\tPath to the input image\n");
 17 |     printf("\t-m, --model\t\tPath to the model file\n");
 18 |     printf("\t-o, --output\t\tPath to the output image\n");
 19 |     printf("\t-d, --debug\t\tDebug flag (0 or 1)\n");
 20 |     printf("\t-s, --score_threshold\tScore threshold for filtering boxes\n");
 21 |     printf("\t-n, --nms_threshold\tNMS threshold\n");
 22 |     printf("\t-h, --help\t\tPrint this help message\n");
 23 | }
 24 | 
 25 | int main (int argc, char* argv[]) {
 26 |     // Define options
 27 |     const struct option long_options[] = {
 28 |         {"input", required_argument, 0, 'i'},
 29 |         {"model", required_argument, 0, 'm'},
 30 |         {"output", required_argument, 0, 'o'},
 31 |         {"debug", required_argument, 0, 'd'},
 32 |         {"help", no_argument, 0, 'h'},
 33 |         {"score_threshold", required_argument, 0, 's'},
 34 |         {"nms_threshold", required_argument, 0, 'n'},
 35 |         {0, 0, 0, 0}
 36 |     };
 37 | 
 38 |     // Default parameters
 39 |     std::string input_path = "../data/bus.jpg";
 40 |     std::string model_path = "../data/yolov8n_no_tail.rknn";
 41 |     std::string output_path = "./debug.png";
 42 |     float score_threshold = 0.4;
 43 |     float nms_threshold = 0.5;
 44 |     bool debug_flag = true;
 45 | 
 46 |     //Temp var
 47 |     float scale_x, scale_y;
 48 | 
 49 |     // Parse options
 50 |     int opt = 0;
 51 |     while ((opt = getopt_long(argc, argv, "i:m:o:d:s:n:h", long_options, NULL)) != -1) {
 52 |         switch (opt) {
 53 |             case 'i':
 54 |                 input_path = optarg;
 55 |                 break;
 56 |             case 'm':
 57 |                 model_path = optarg;
 58 |                 break;
 59 |             case 'o':
 60 |                 output_path = optarg;
 61 |                 break;
 62 |             case 's':
 63 |                 score_threshold = atof(optarg);
 64 |                 break;
 65 |             case 'n':
 66 |                 nms_threshold = atof(optarg);
 67 |                 break;
 68 |             case 'd':
 69 |                 debug_flag = atoi(optarg);
 70 |                 break;
 71 |             case 'h':
 72 |             default:
 73 |                 print_help();
 74 |                 return 0;
 75 |         }
 76 |     }
 77 | 
 78 |     // Load the model
 79 |     rknn_net* yolov8 = rknn_net_create(model_path.c_str(), 0, false);
 80 | 
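 81 |     // scale_x/scale_y map detections from the 640x640 network input back to
 82 |     // the source resolution; the resize below is a plain stretch (no
 83 |     // letterboxing), which is why the two axes are scaled independently.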
 84 |     // Read image
 85 |     cv::Mat img = cv::imread(input_path);
 86 |     scale_x = ((float)img.cols) / 640;
 87 |     scale_y = ((float)img.rows) / 640;
 88 |     cv::Mat output_img = img.clone();
 89 | 
 90 |     // Resize the image to 640x640
 91 |     cv::resize(img, img, cv::Size(640, 640));
 92 | 
 93 | 
 94 |     // Allocate data buffers to save the raw result from the neural net
 95 |     float* outputs[6];
 96 |     outputs[0] = (float*)malloc(1*80*400 * sizeof(float));
 97 |     outputs[1] = (float*)malloc(1*64*400 * sizeof(float));
 98 |     outputs[2] = (float*)malloc(1*80*1600 * sizeof(float));
 99 |     outputs[3] = (float*)malloc(1*64*1600 * sizeof(float));
100 |     outputs[4] = (float*)malloc(1*80*6400 * sizeof(float));
101 |     outputs[5] = (float*)malloc(1*64*6400 * sizeof(float));
102 | 
103 |     // Do inference
104 |     rknn_net_inference(yolov8, (int8_t*)img.data, outputs);
105 | 
106 |     // Post process to get the detection result (use the thresholds parsed above)
107 |     std::vector<DetectionResult> results = yolov8_tail_post_process(outputs, 80, scale_x, scale_y, score_threshold, nms_threshold, debug_flag);
108 | 
109 |     yolov8_draw_result(results, output_img, coco_classes, 80);
110 | 
111 |     // Write the image which has the detection result
112 |     cv::imwrite(output_path, output_img);
113 | 
114 |     // Release the resources
115 |     rknn_net_destroy(yolov8);
116 |     for (int i=0; i<6; ++i) {
117 |         free(outputs[i]);
118 |     }
119 | 
120 |     return 0;
121 | }
122 | 
--------------------------------------------------------------------------------
/src/rknn_net.cc:
--------------------------------------------------------------------------------
  1 | #include "rknn_net.h"
  2 | #include <cstdio>
  3 | #include <cstdlib>
  4 | #include <cstring>
  5 | #include <chrono>
  6 | #include <iostream>
  7 | 
  8 | class Timer {
  9 | public:
 10 |     Timer() : start_time(std::chrono::high_resolution_clock::now()) {}
 11 | 
 12 |     void reset() {
 13 |         start_time = std::chrono::high_resolution_clock::now();
 14 |     }
 15 | 
 16 |     double elapsed() const {
 17 |         return std::chrono::duration_cast<std::chrono::milliseconds>(
 18 |             std::chrono::high_resolution_clock::now() - start_time).count() / 1000.0;
 19 |     }
 20 | 
 21 | private:
 22 |     std::chrono::high_resolution_clock::time_point start_time;
 23 | };
 24 | 
 25 | const char* get_type(rknn_tensor_type type) {
 26 |     switch (type) {
 27 |         case RKNN_TENSOR_FLOAT32:
 28 |             return "float32";
 29 |         case RKNN_TENSOR_FLOAT16:
 30 |             return "float16";
 31 |         case RKNN_TENSOR_INT8:
 32 |             return "int8";
 33 |         case RKNN_TENSOR_UINT8:
 34 |             return "uint8";
 35 |         case RKNN_TENSOR_INT16:
 36 |             return "int16";
 37 |         case RKNN_TENSOR_UINT16:
 38 |             return "uint16";
 39 |         case RKNN_TENSOR_INT32:
 40 |             return "int32";
 41 |         case RKNN_TENSOR_UINT32:
 42 |             return "uint32";
 43 |         case RKNN_TENSOR_INT64:
 44 |             return "int64";
 45 |         case RKNN_TENSOR_BOOL:
 46 |             return "bool";
 47 |         default:
 48 |             return "unknown";
 49 |     }
 50 | }
 51 | 
 52 | 
 53 | rknn_net* rknn_net_create(const char* model_path, int target_npu, int debug_flag) {
 54 | 
 55 |     FILE* fp = fopen(model_path, "rb");
 56 |     if(fp == NULL) {
 57 |         printf("Failed to open file %s\n", model_path);
 58 |         return NULL;
 59 |     }
 60 | 
 61 |     fseek(fp, 0, SEEK_END);
 62 |     long model_size = ftell(fp);
 63 |     rewind(fp);
 64 | 
 65 |     uint8_t* model = (uint8_t*)malloc(model_size);
 66 |     if(model == NULL) {
 67 |         printf("Failed to malloc memory for model\n");
 68 |         fclose(fp);
 69 |         return NULL;
 70 |     }
 71 | 
 72 |     size_t read_count = fread(model, 1, model_size, fp);
 73 |     if(read_count != (size_t)model_size) {
 74 |         printf("Failed to read model\n");
 75 |         free(model);
 76 |         fclose(fp);
 77 |         return NULL;
 78 |     }
 79 | 
 80 |     if (debug_flag) {
 81 |         printf("read model from %s\n", model_path);
 82 |         printf("model size = %ld bytes\n", model_size);
 83 |     }
 84 | 
 85 |     fclose(fp);
 86 | 
 87 |     //rknn_context* ctx = NULL;
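 88 |     // The context handle is heap-allocated so it can outlive this function
 89 |     // inside the returned rknn_net struct. RKNN_FLAG_COLLECT_PERF_MASK enables
 90 |     // the detailed perf queries (RKNN_QUERY_PERF_DETAIL) at some cost in frame
 91 |     // rate, as noted in rknn_api.h.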
 92 |     rknn_context* ctx = (rknn_context*)malloc(sizeof(rknn_context));
 93 |     int ret = rknn_init(ctx, model, model_size, RKNN_FLAG_COLLECT_PERF_MASK, NULL);
 94 |     if(ret < 0) {
 95 |         printf("rknn_init fail! ret=%d\n", ret);
 96 |         free(model);
 97 |         free(ctx);
 98 |         return NULL;
 99 |     }
100 | 
101 |     rknn_sdk_version version;
102 |     ret = rknn_query(*ctx, RKNN_QUERY_SDK_VERSION, &version, sizeof(rknn_sdk_version));
103 |     if(ret < 0) {
104 |         printf("rknn_query SDK version error, ret=%d\n", ret);
105 |         free(model);
106 |         free(ctx);
107 |         return NULL;
108 |     }
109 | 
110 |     if (debug_flag) {
111 |         printf("sdk version: %s driver version: %s\n", version.api_version, version.drv_version);
112 |     }
113 | 
114 |     rknn_input_output_num io_num;
115 |     ret = rknn_query(*ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num));
116 |     if(ret < 0) {
117 |         printf("rknn_query IN_OUT_NUM error, ret=%d\n", ret);
118 |         free(model);
119 |         free(ctx);
120 |         return NULL;
121 |     }
122 | 
123 |     if (debug_flag) {
124 |         printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output);
125 |     }
126 | 
127 |     rknn_tensor_attr* input_attrs = (rknn_tensor_attr*)calloc(io_num.n_input, sizeof(rknn_tensor_attr));
128 |     rknn_tensor_attr* output_attrs = (rknn_tensor_attr*)calloc(io_num.n_output, sizeof(rknn_tensor_attr));
129 | 
130 |     for (int i = 0; i < io_num.n_input; i++) {
131 |         input_attrs[i].index = i;
132 |         ret = rknn_query(*ctx, RKNN_QUERY_NATIVE_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr));
133 |         if (ret < 0) {
134 |             printf("rknn_query INPUT_ATTR error, ret=%d\n", ret);
135 |             free(model);
136 |             free(ctx);
137 |             free(input_attrs);
138 |             free(output_attrs);
139 |             return NULL;
140 |         }
141 |     }
142 | 
143 |     for (int i = 0; i < io_num.n_output; i++) {
144 |         output_attrs[i].index = i;
145 |         ret = rknn_query(*ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr));
146 |         if (ret < 0) {
147 |             printf("rknn_query OUTPUT_ATTR error, ret=%d\n", ret);
148 |             free(model);
149 |             free(ctx);
150 |             free(input_attrs);
151 |             free(output_attrs);
152 |             return NULL;
153 |         }
154 |     }
155 | 
156 |     int channel = 3;
157 |     int width = 0;
158 |     int height = 0;
159 | 
160 |     if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) {
161 |         if (debug_flag) {
162 |             printf("model is NCHW input fmt\n");
163 |         }
164 |         channel = input_attrs[0].dims[1];
165 |         height = input_attrs[0].dims[2];
166 |         width = input_attrs[0].dims[3];
167 |     } else {
168 |         if (debug_flag) {
169 |             printf("model is NHWC input fmt\n");
170 |         }
171 |         height = input_attrs[0].dims[1];
172 |         width = input_attrs[0].dims[2];
173 |         channel = input_attrs[0].dims[3];
174 |     }
175 | 
176 |     if (debug_flag) {
177 |         printf("model input height=%d, width=%d, channel=%d, type:%s\n", height, width, channel, get_type(input_attrs[0].type));
178 |         for (int i=0; i<io_num.n_output; ++i) {
179 |             printf("model output[%d] name=%s, n_elems=%d, type:%s\n", i, output_attrs[i].name, output_attrs[i].n_elems, get_type(output_attrs[i].type));
180 |         }
181 |     }
182 | 
183 |     // Bind the context to the NPU core requested by target_npu (RK3588 only).
184 |     rknn_core_mask core_mask = RKNN_NPU_CORE_AUTO;
185 |     switch (target_npu) {
186 |         case 0: core_mask = RKNN_NPU_CORE_0; break;
187 |         case 1: core_mask = RKNN_NPU_CORE_1; break;
188 |         case 2: core_mask = RKNN_NPU_CORE_2; break;
189 |         default: break;
190 |     }
191 |     ret = rknn_set_core_mask(*ctx, core_mask);
192 |     if (ret < 0) {
193 |         printf("rknn_set_core_mask fail! ret=%d\n", ret);
194 |     }
195 | 
196 |     rknn_net* net = (rknn_net*)malloc(sizeof(rknn_net));
197 |     net->ctx = ctx;
198 |     net->debug_flag = debug_flag;
199 |     net->model = model;
200 |     net->model_size = model_size;
201 |     net->input_attrs = input_attrs;
202 |     net->output_attrs = output_attrs;
203 |     net->num_input = io_num.n_input;
204 |     net->num_output = io_num.n_output;
205 |     net->input_height = height;
206 |     net->input_width = width;
207 |     net->input_channel = channel;
208 | 
209 |     return net;
210 | }
211 | 
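212 | // rknn_net_inference() uses pass-through mode: `input` must already be a
213 | // quantized int8 buffer laid out exactly like the model's native input
214 | // (n_elems bytes); the driver does no type or format conversion. `output`
215 | // must hold six pre-allocated float buffers in model order (the demo sizes
216 | // them in picture_demo.cc / yolo_multi_npu.h).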
217 | int rknn_net_inference(rknn_net* net, int8_t* input, float** output) {
218 |     rknn_input inputs[net->num_input];
219 |     memset(inputs, 0, sizeof(inputs));
220 | 
221 |     inputs[0].index = 0;
222 |     inputs[0].buf = (void*)input;
223 |     inputs[0].size = net->input_attrs[0].n_elems * sizeof(int8_t);
224 |     inputs[0].pass_through = true;
225 |     inputs[0].type = RKNN_TENSOR_INT8;
226 |     inputs[0].fmt = net->input_attrs[0].fmt;//RKNN_TENSOR_NCHW;
227 | 
228 |     int ret = rknn_inputs_set(*(net->ctx), 1, inputs);
229 |     if(ret < 0) {
230 |         printf("rknn_inputs_set fail! ret=%d\n", ret);
231 |         return 1;
232 |     } else if (net->debug_flag) {
233 |         printf("rknn_inputs_set done.\n");
234 |     }
235 | 
236 |     //Timer t1;
237 |     //for (int j=0; j<50; ++j) {
238 |     //    t1.reset();
239 |     ret = rknn_run(*(net->ctx), NULL);
240 |     //std::cout << ":>" << t1.elapsed() << std::endl;
241 |     //}
242 |     //rknn_perf_detail perf_detail;
243 |     //ret = rknn_query(*(net->ctx), RKNN_QUERY_PERF_DETAIL, &perf_detail, sizeof(perf_detail));
244 |     //for (int j=0; j<perf_detail.data_len; ++j) {
245 |     //    printf("%c", perf_detail.perf_data[j]);
246 |     //}
247 | 
248 |     if(ret < 0) {
249 |         printf("rknn_run fail! ret=%d\n", ret);
250 |         return 1;
251 |     } else if (net->debug_flag) {
252 |         printf("rknn_run done.\n");
253 |     }
254 | 
255 |     rknn_output outputs[6];
256 |     memset(outputs, 0, sizeof(outputs));
257 | 
258 |     for (int i=0; i<6; ++i) {
259 |         outputs[i].want_float = true;
260 |         outputs[i].is_prealloc = true;
261 |         outputs[i].buf = output[i];
262 |         outputs[i].index = i;
263 |         outputs[i].size = net->output_attrs[i].n_elems * sizeof(float);
264 |     }
265 | 
266 |     ret = rknn_outputs_get(*(net->ctx), 6, outputs, NULL);
267 |     if(ret < 0) {
268 |         printf("rknn_outputs_get fail! ret=%d\n", ret);
269 |         return 1;
270 |     } else if (net->debug_flag) {
271 |         printf("rknn_outputs_get done.\n");
272 |     }
273 | 
274 |     for (int i=0; i<6; ++i) {
275 |         memcpy(output[i], outputs[i].buf, outputs[i].size);
276 |     }
277 |     rknn_outputs_release(*(net->ctx), 6, outputs);
278 | 
279 |     return 0;
280 | }
281 | 
282 | void rknn_net_destroy(rknn_net* net) {
283 |     if (net == NULL) {
284 |         return;
285 |     }
286 | 
287 |     // Release rknn context (and the heap-allocated handle itself)
288 |     if (net->ctx != NULL) {
289 |         rknn_destroy(*(net->ctx));
290 |         free(net->ctx);
291 |         net->ctx = NULL;
292 |     }
293 | 
294 |     // Release model data
295 |     if (net->model != NULL) {
296 |         free(net->model);
297 |         net->model = NULL;
298 |     }
299 | 
300 |     // Release input tensor attributes
301 |     if (net->input_attrs != NULL) {
302 |         free(net->input_attrs);
303 |         net->input_attrs = NULL;
304 |     }
305 | 
306 |     // Release output tensor attributes
307 |     if (net->output_attrs != NULL) {
308 |         free(net->output_attrs);
309 |         net->output_attrs = NULL;
310 |     }
311 | 
312 |     // Finally, free the rknn_net struct itself
313 |     free(net);
314 | }
--------------------------------------------------------------------------------
/src/video_demo.cc:
--------------------------------------------------------------------------------
  1 | #include "rknn_net.h"
  2 | #include "yolov8_post.h"
  3 | #include "yolo_multi_npu.h"
  4 | #include <opencv2/opencv.hpp>
  5 | #include <pthread.h>
  6 | #include <signal.h>
  7 | #include <time.h>
  8 | #include <unistd.h>
  9 | #include <cstdio>
 10 | #include <cstdlib>
 11 | #include <cstring>
 12 | #include <string>
 13 | #include <vector>
 14 | #include <chrono>
 15 | #include <iostream>
 16 | #include <cstdint>
 17 | 
 18 | void preciseSleep(double seconds)
 19 | {
 20 |     struct timespec ts;
 21 |     ts.tv_sec = (time_t)seconds; // Truncate the decimal part for the 'seconds' component
 22 |     ts.tv_nsec = (long)((seconds - ts.tv_sec) * 1e+9); // Keep the fractional part, as nanoseconds
 23 | 
 24 |     nanosleep(&ts, NULL);
 25 | }
 26 | 
 27 | volatile sig_atomic_t flag = 0;
 28 | CircularBuffer input_buffer, output_buffer;
 29 | 
 30 | void handler(int sig) {
 31 |     flag = 1;
 32 |     output_buffer.stop = true;
 33 |     pthread_cond_signal(&output_buffer.notEmpty);
 34 |     pthread_cond_signal(&output_buffer.notFull);
 35 |     printf("\nCtrl+C pressed, waiting for threads to stop.\n");
 36 |     input_buffer.stop = true;
 37 |     pthread_cond_signal(&input_buffer.notEmpty);
 38 |     pthread_cond_signal(&input_buffer.notFull);
 39 |     sleep(2);
 40 |     printf("STOPPED!\n");
 41 | }
 42 | 
 43 | 
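 44 | // Pipeline: one reader thread decodes and resizes frames into input_buffer,
 45 | // three NPU worker threads (one per RK3588 NPU core) consume them and push
 46 | // annotated frames into output_buffer, and the main loop below drains
 47 | // output_buffer into the ffmpeg RTSP pipe, pacing writes to roughly 30 fps
 48 | // (a 0.033 s budget per frame).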
 49 | class Timer {
 50 | public:
 51 |     Timer() : start_time(std::chrono::high_resolution_clock::now()) {}
 52 | 
 53 |     void reset() {
 54 |         start_time = std::chrono::high_resolution_clock::now();
 55 |     }
 56 | 
 57 |     double elapsed() const {
 58 |         return std::chrono::duration_cast<std::chrono::milliseconds>(
 59 |             std::chrono::high_resolution_clock::now() - start_time).count() / 1000.0;
 60 |     }
 61 | 
 62 | private:
 63 |     std::chrono::high_resolution_clock::time_point start_time;
 64 | };
 65 | 
 66 | int main () {
 67 |     //std::cout << cv::getBuildInformation() << std::endl;
 68 |     // Default parameters
 69 |     std::string input_path = "/root/data/test_video.mp4";
 70 |     std::string model_path = "../data/yolov8n_no_tail.rknn";
 71 |     std::string output_path = "ffmpeg -loglevel quiet -f rawvideo -pix_fmt bgr24 -s 640x640 -r 28 -i - -an -b:v 20M -f rtsp rtsp://localhost:8848/test_video";
 72 |     float score_threshold = 0.4;
 73 |     float nms_threshold = 0.5;
 74 |     bool debug_flag = false;
 75 |     cv::Mat frame, output_img;
 76 |     //Temp var
 77 |     float scale_x, scale_y;
 78 |     signal(SIGINT, handler);
 79 | 
 80 |     cv::VideoCapture cap(input_path);
 81 |     if(!cap.isOpened()) {
 82 |         printf("Video Open Failed\n");
 83 |         return -1;
 84 |     } // check if we succeeded
 85 | 
 86 | 
 87 |     //Setup output
 88 |     FILE* ffmpeg = popen(output_path.c_str(), "w");
 89 |     if (!ffmpeg) {
 90 |         fprintf(stderr, "Could not open pipe to ffmpeg\n");
 91 |         return 1;
 92 |     }
 93 |     int8_t* result_img = (int8_t*)malloc(3*640*640*sizeof(int8_t));
 94 | 
 95 |     //Setup FIFO
 96 |     initialize(&input_buffer);
 97 |     initialize(&output_buffer);
 98 | 
 99 |     pthread_t threads[4];
100 |     int npu_0=0, npu_1=1, npu_2=2;
101 |     yolov8_npu_arg npu_arg_0, npu_arg_1, npu_arg_2;
102 |     npu_arg_0.input = &input_buffer;
103 |     npu_arg_0.output = &output_buffer;
104 |     npu_arg_0.model_path = &model_path;
105 |     npu_arg_0.target_npu = &npu_0;
106 |     pthread_create(&threads[0], NULL, yolov8_npu_thread, &npu_arg_0);
107 |     npu_arg_1.input = &input_buffer;
108 |     npu_arg_1.output = &output_buffer;
109 |     npu_arg_1.model_path = &model_path;
110 |     npu_arg_1.target_npu = &npu_1;
111 |     pthread_create(&threads[1], NULL, yolov8_npu_thread, &npu_arg_1);
112 |     npu_arg_2.input = &input_buffer;
113 |     npu_arg_2.output = &output_buffer;
114 |     npu_arg_2.model_path = &model_path;
115 |     npu_arg_2.target_npu = &npu_2;
116 |     pthread_create(&threads[2], NULL, yolov8_npu_thread, &npu_arg_2);
117 | 
118 |     video_read_arg video_arg;
119 |     video_arg.cam = &cap;
120 |     video_arg.output = &input_buffer;
121 |     pthread_create(&threads[3], NULL, video_read_thread, &video_arg);
122 | 
123 |     Timer timer;
124 |     size_t cnt=0;
125 |     fprintf(stdout, "Wait for buffering!\n");
126 |     sleep(5);
127 |     fprintf(stdout, "Main process start!\n");
128 |     while(!flag)
129 |     {
130 | 
131 |         timer.reset();
132 |         dequeue(&output_buffer, result_img);
133 |         if (output_buffer.stop) break;
134 |         if (timer.elapsed() < 0.033) preciseSleep(0.033-timer.elapsed());
135 |         fwrite(result_img, sizeof(char), 3*640*640, ffmpeg);
136 |         fprintf(stdout, "\rFPS: %f, total frames:%zu ", 1/timer.elapsed(), ++cnt);
137 |     }
138 | 
139 |     fprintf(stdout, "\nMain process stop!\n");
140 |     input_buffer.stop = true;
141 |     pthread_cond_signal(&input_buffer.notEmpty);
142 |     pthread_cond_signal(&input_buffer.notFull);
143 | 
144 |     // Release the resources
145 |     release(&input_buffer);
146 |     release(&output_buffer);
147 |     free(result_img);
148 |     pclose(ffmpeg);
149 |     return 0;
150 | }
151 | 
--------------------------------------------------------------------------------
/src/yolov8_post.cc:
--------------------------------------------------------------------------------
"yolov8_post.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | class Timer { 8 | public: 9 | Timer() : start_time(std::chrono::high_resolution_clock::now()) {} 10 | 11 | void reset() { 12 | start_time = std::chrono::high_resolution_clock::now(); 13 | } 14 | 15 | double elapsed() const { 16 | return std::chrono::duration_cast( 17 | std::chrono::high_resolution_clock::now() - start_time).count() / 1000.0; 18 | } 19 | 20 | private: 21 | std::chrono::high_resolution_clock::time_point start_time; 22 | }; 23 | 24 | using namespace cv; 25 | 26 | int l_size[3] = {80, 40, 20}; 27 | size_t l_base[3] = {0, 6400, 8000}; 28 | float striders[3] = {8, 16, 32}; 29 | int confi_idx[3] = {4,2,0}; 30 | int box_idx[3] = {5,3,1}; 31 | 32 | 33 | void get_confidence(float* input, int q_num, int group_size, int* idx, float* value) { 34 | int start = q_num * group_size; 35 | int end = start + group_size; 36 | float max_val = input[start]; 37 | int max_idx = start; 38 | for (int i = start; i < end; i++) { 39 | if (input[i] > max_val) { 40 | max_val = input[i]; 41 | max_idx = i; 42 | } 43 | } 44 | *idx = max_idx - start; // convert absolute index to index within the group 45 | *value = max_val; 46 | } 47 | 48 | float softmax_weightsum(float *input) { 49 | double base=0; 50 | float output=0; 51 | for (int i=0; i<16; ++i) { 52 | base += input[i]; 53 | } 54 | for (int i=0; i<16; ++i) { 55 | output += i*(input[i]/base); 56 | } 57 | return output; 58 | } 59 | void printMaxElement(const cv::Mat& mat) { 60 | double minVal, maxVal; 61 | cv::minMaxLoc(mat, &minVal, &maxVal); 62 | std::cout << "Max element: " << maxVal << std::endl; 63 | } 64 | 65 | void yolov8_tail_process(float** input, float** output, int class_num) { 66 | Timer t1; 67 | 68 | // Transpose and concatenate confidence data 69 | Mat confi_20(class_num, 400, CV_32F, input[0]); 70 | transpose(confi_20, confi_20); 71 | Mat confi_40(class_num, 1600, CV_32F, input[2]); 72 | transpose(confi_40, confi_40); 73 | Mat confi_80(class_num, 6400, CV_32F, input[4]); 74 | transpose(confi_80, confi_80); 75 | std::memcpy(output[1], confi_80.data, class_num*6400*sizeof(float)); 76 | std::memcpy(output[1]+class_num*6400, confi_40.data, class_num*1600*sizeof(float)); 77 | std::memcpy(output[1]+class_num*6400+class_num*1600, confi_20.data, class_num*400*sizeof(float)); 78 | 79 | // Process bbox data 80 | float *ip, *op; 81 | Mat box_80_raw(64, 6400, CV_32F, input[5]); 82 | transpose(box_80_raw, box_80_raw); 83 | exp(box_80_raw, box_80_raw); 84 | ip = (float*)box_80_raw.data; 85 | op = output[0]; 86 | for (int i=0; i<6400; ++i) { 87 | for (int j=0; j<4; ++j) { 88 | op[i*4+j] = softmax_weightsum(ip+(i*64+j*16)); 89 | } 90 | } 91 | Mat box_40_raw(64, 1600, CV_32F, input[3]); 92 | transpose(box_40_raw, box_40_raw); 93 | exp(box_40_raw, box_40_raw); 94 | ip = (float*)box_40_raw.data; 95 | op = output[0]+6400*4; 96 | for (int i=0; i<1600; ++i) { 97 | for (int j=0; j<4; ++j) { 98 | op[i*4+j] = softmax_weightsum(ip+(i*64+j*16)); 99 | } 100 | } 101 | Mat box_20_raw(64, 400, CV_32F, input[1]); 102 | transpose(box_20_raw, box_20_raw); 103 | exp(box_20_raw, box_20_raw); 104 | ip = (float*)box_20_raw.data; 105 | op = output[0]+8000*4; 106 | for (int i=0; i<400; ++i) { 107 | for (int j=0; j<4; ++j) { 108 | op[i*4+j] = softmax_weightsum(ip+(i*64+j*16)); 109 | } 110 | } 111 | 112 | std::cout << "tail_process:>" << t1.elapsed() << std::endl; 113 | return; 114 | } 115 | 116 | void get_max_confidence(float* data, size_t item_size, int pos, int class_num, float* value, int* class_id) { 
121 | void get_max_confidence(float* data, size_t item_size, int pos, int class_num, float* value, int* class_id) {
122 |     *value = 0;
123 |     *class_id = 0;
124 |     for (int i=0; i<class_num; ++i) {
125 |         if (data[pos + i*item_size] > *value) {
126 |             *value = data[pos + i*item_size];
127 |             *class_id = i;
128 |         }
129 |     }
130 | }
131 | 
132 | // Result element type declared in yolov8_post.h (not shown here; the exact name
133 | // may differ) carrying box, class_id and score.
134 | std::vector<DetectResult> yolov8_tail_post_process(float** input, int class_num, float scale_x, float scale_y, float score_threshold, float nms_threshold, bool debug_flag) {
135 | 
136 |     //Timer t1;
137 | 
138 |     std::vector<Rect> boxes;           // vector of bounding boxes
139 |     std::vector<float> scores;         // confidence scores for each box
140 |     std::vector<int> indices;          // indices kept by NMS
141 |     std::vector<int> cls;              // vector of class IDs
142 |     std::vector<DetectResult> results; // result to return
143 | 
144 |     float score, x1, y1, x2, y2, l_x, l_y, w, h;
145 |     int class_id, cnt=0;
146 |     size_t item_size, base_y;
147 | 
148 |     for (int l=0; l<3; ++l) {
149 |         item_size = l_size[l]*l_size[l];
150 |         Mat tmp_mat(64, item_size, CV_32F, input[box_idx[l]]);
151 |         exp(tmp_mat, tmp_mat);
152 |         for (int y=0; y<l_size[l]; ++y) {
153 |             base_y = y * l_size[l];
154 |             for (int x=0; x<l_size[l]; ++x) {
155 |                 get_max_confidence(input[confi_idx[l]], item_size, base_y + x, class_num, &score, &class_id);
156 |                 if (score < score_threshold) continue;
157 |                 // The box tensor is (64, item_size), so the 16 bins of one side
158 |                 // are strided by item_size; reduce each side to its expectation
159 |                 float dist[4];
160 |                 float* bp = (float*)tmp_mat.data;
161 |                 for (int j=0; j<4; ++j) {
162 |                     double base = 0;
163 |                     dist[j] = 0;
164 |                     for (int k=0; k<16; ++k) base += bp[(j*16+k)*item_size + base_y + x];
165 |                     for (int k=0; k<16; ++k) dist[j] += k * (bp[(j*16+k)*item_size + base_y + x] / base);
166 |                 }
167 |                 l_x = x + 0.5f;
168 |                 l_y = y + 0.5f;
169 |                 x1 = (l_x - dist[0]) * striders[l];
170 |                 y1 = (l_y - dist[1]) * striders[l];
171 |                 x2 = (l_x + dist[2]) * striders[l];
172 |                 y2 = (l_y + dist[3]) * striders[l];
173 |                 w = x2 - x1;
174 |                 h = y2 - y1;
175 |                 boxes.push_back(Rect(x1*scale_x, y1*scale_y, w*scale_x, h*scale_y));
176 |                 scores.push_back(score);
177 |                 cls.push_back(class_id);
178 |                 cnt++;
179 |             }
180 |         }
181 |     }
182 | 
183 |     dnn::NMSBoxes(boxes, scores, score_threshold, nms_threshold, indices);
184 |     //std::cout << "tail_post:>" << t1.elapsed() << std::endl;
185 |     if(debug_flag) {
186 |         printf("========Detection Results========\n");
187 |         printf("classID | score | bbox\n");
188 |         printf("---------------------------------\n");
189 |     }
190 |     for (size_t i=0; i<indices.size(); ++i) {
191 |         DetectResult res;
192 |         res.box = boxes[indices[i]];
193 |         res.class_id = cls[indices[i]];
194 |         res.score = scores[indices[i]];
195 |         results.push_back(res);
196 |         if(debug_flag) {
197 |             printf("%7d | %.3f | (%d,%d,%d,%d)\n", res.class_id, res.score,
198 |                    res.box.x, res.box.y, res.box.width, res.box.height);
199 |         }
200 |     }
201 |     return results;
202 | }
203 | 
204 | std::vector<DetectResult> yolov8_post_process(float** input, int class_num, float scale_x, float scale_y, float score_threshold, float nms_threshold, bool debug_flag) {
205 | 
206 |     Timer t1;
207 |     std::vector<Rect> boxes;           // vector of bounding boxes
208 |     std::vector<float> scores;         // confidence scores for each box
209 |     std::vector<int> indices;          // indices kept by NMS
210 |     std::vector<int> cls;              // vector of class IDs
211 |     std::vector<DetectResult> results; // result to return
212 | 
213 |     //Some tmp var
214 |     float score, l_x, l_y, w, h;
215 |     int cl;
216 |     size_t lbp, ybp, xbp, x1p, y1p, x2p, y2p;
217 | 
218 |     //Load detection results from buffers: input[0] holds (8400, 4) decoded side
219 |     //distances and input[1] holds (8400, class_num) row-major class scores,
220 |     //i.e. the layout produced by yolov8_tail_process()
221 |     for (int l=0; l<3; ++l) {
222 |         lbp = l_base[l];
223 |         for (int y=0; y<l_size[l]; ++y) {
224 |             ybp = y*l_size[l];
225 |             for (int x=0; x<l_size[l]; ++x) {
226 |                 xbp = x;
227 |                 get_confidence(input[1], lbp+ybp+xbp, class_num, &cl, &score);
228 |                 if (score < score_threshold) continue;
229 |                 x1p = (lbp+ybp+xbp)*4; // left distance
230 |                 y1p = x1p + 1;         // top distance
231 |                 x2p = x1p + 2;         // right distance
232 |                 y2p = x1p + 3;         // bottom distance
233 |                 l_x = ((x + 0.5f) + (input[0][x2p] - input[0][x1p]) * 0.5f) * striders[l];
234 |                 l_y = ((y + 0.5f) + (input[0][y2p] - input[0][y1p]) * 0.5f) * striders[l];
235 |                 w = (input[0][x1p] + input[0][x2p]) * striders[l];
236 |                 h = (input[0][y1p] + input[0][y2p]) * striders[l];
237 |                 boxes.push_back(Rect((l_x - w/2)*scale_x, (l_y - h/2)*scale_y, w*scale_x, h*scale_y));
238 |                 scores.push_back(score);
239 |                 cls.push_back(cl);
240 |             }
241 |         }
242 |     }
243 | 
244 |     dnn::NMSBoxes(boxes, scores, score_threshold, nms_threshold, indices);
245 |     //std::cout << "post_process:>" << t1.elapsed() << std::endl;
246 |     if(debug_flag) {
247 |         printf("========Detection Results========\n");
248 |         printf("classID | score | bbox\n");
249 |         printf("---------------------------------\n");
250 |     }
251 |     for (size_t i=0; i<indices.size(); ++i) {
252 |         DetectResult res;
253 |         res.box = boxes[indices[i]];
254 |         res.class_id = cls[indices[i]];
255 |         res.score = scores[indices[i]];
256 |         results.push_back(res);
257 |         if(debug_flag) {
258 |             printf("%7d | %.3f | (%d,%d,%d,%d)\n", res.class_id, res.score,
259 |                    res.box.x, res.box.y, res.box.width, res.box.height);
260 |         }
261 |     }
262 |     return results;
263 | }
264 | 
265 | cv::Scalar get_color(int idx) {
266 |     // Map the class index onto the hue circle and convert one HSV pixel to BGR
267 |     cv::Mat color_mat(1, 1, CV_8UC3, cv::Scalar(idx * 179 / 80, 255, 255));
268 |     cv::cvtColor(color_mat, color_mat, cv::COLOR_HSV2BGR);
269 |     cv::Scalar bgr_color = color_mat.at<cv::Vec3b>(0,0);
270 |     return bgr_color;
271 | }
272 | 
273 | std::vector<cv::Scalar> get_colors(int nums_of_class) {
274 |     std::vector<cv::Scalar> colors;
275 |     for (int i=0; i<nums_of_class; ++i) {
276 |         colors.push_back(get_color(i));
277 |     }
278 |     return colors;
279 | }
280 | 
281 | void draw_result(std::vector<DetectResult> results, cv::Mat &img, const std::string* class_names, int num_of_classes) {
282 |     std::vector<cv::Scalar> colors = get_colors(num_of_classes);
283 |     for (size_t i=0; i<results.size(); ++i) {
284 |         cv::rectangle(img, results[i].box, colors[results[i].class_id], 2);
285 |         char label[64];
286 |         snprintf(label, sizeof(label), "%s: %.2f", class_names[results[i].class_id].c_str(), results[i].score);
287 |         cv::putText(img, label, cv::Point(results[i].box.x, results[i].box.y - 5),
288 |                     cv::FONT_HERSHEY_SIMPLEX, 0.5, colors[results[i].class_id], 2);
289 |     }
290 | }
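291 | 
292 | /* Note on the DFL decode used above, with a worked example: each box side is a
293 |  * 16-bin distribution whose softmax expectation is the predicted distance in
294 |  * grid units. Bins that put all their weight on bin 3 decode to a distance of
295 |  * 3.0; uniform bins decode to (0+1+...+15)/16 = 7.5. On the stride-8 layer a
296 |  * distance of 3.0 therefore spans 24 input pixels.
297 |  */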
target_platform="rk3588") 94 | print('done') 95 | 96 | # Load ONNX model 97 | print('--> Loading model') 98 | ret = rknn.load_onnx(model=ONNX_MODEL) 99 | if ret != 0: 100 | print('Load model failed!') 101 | exit(ret) 102 | print('done') 103 | 104 | # Build model 105 | print('--> Build model.') 106 | ret = rknn.build(do_quantization=QUANTIZE_ON, dataset=DATASET) 107 | if ret != 0: 108 | print('Build model failed!') 109 | exit(ret) 110 | print('done') 111 | """ 112 | 113 | # Hybird quantization step 1 114 | print('--> Hybird quantization step one') 115 | ret = rknn.hybrid_quantization_step1(dataset=DATASET) 116 | if ret != 0: 117 | print('Step one failed!') 118 | exit(ret) 119 | print('done') 120 | 121 | # Call hybrid_quantization_step2 to generate hybrid quantized RKNN model 122 | print('--> Hybird quantization step two') 123 | ret = rknn.hybrid_quantization_step2( 124 | model_input="./"+ONNX_MODEL.split('.')[0]+".model", 125 | data_input="./"+ONNX_MODEL.split('.')[0]+".data", 126 | model_quantization_cfg="./"+ONNX_MODEL.split('.')[0]+".quantization.cfg" 127 | ) 128 | if ret != 0: 129 | print('Step two failed!') 130 | exit(ret) 131 | print('done') 132 | """ 133 | 134 | # Export RKNN model 135 | print('--> Export rknn model') 136 | ret = rknn.export_rknn(RKNN_MODEL) 137 | if ret != 0: 138 | print('Export rknn model failed!') 139 | exit(ret) 140 | print('done') 141 | 142 | # Init runtime environment 143 | print('--> Init runtime environment') 144 | ret = rknn.init_runtime() 145 | if ret != 0: 146 | print('Init runtime environment failed!') 147 | exit(ret) 148 | print('done') 149 | 150 | # Set inputs 151 | img = cv2.imread(IMG_PATH) 152 | blob = cv2.dnn.blobFromImage(img, 1/255.0, (IMG_SIZE, IMG_SIZE), swapRB=True, crop=False) 153 | 154 | # Inference 155 | print('--> Running model') 156 | outputs = rknn.inference(inputs=[blob], data_format="nchw", inputs_pass_through=[1]) 157 | print('done') 158 | 159 | if (TAIL_PROC): 160 | outputs = tail_process(outputs) 161 | outputs[0] = yolov8_box_head.yolov8_box_head(outputs[0]) 162 | 163 | merge_output = np.concatenate(outputs, axis=1) 164 | 165 | print('--> Post Process') 166 | post_process_yolov8(merge_output, img, "./"+ONNX_MODEL.split('.')[0]+"_result.png") 167 | print('done') 168 | 169 | rknn.release() 170 | -------------------------------------------------------------------------------- /tools/yolov8_box_head.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def yolov8_box_head(input): 4 | layer_sizes = [80, 40, 20] 5 | layer_pointer = [0, 6400, 8000] 6 | strides = [8, 16, 32] 7 | for l in range(3): 8 | 9 | x = np.arange(0.5, layer_sizes[l], 1) 10 | y = np.arange(0.5, layer_sizes[l], 1) 11 | sx, sy = np.meshgrid(x, y) 12 | sx = sx.reshape(-1) 13 | sy = sy.reshape(-1) 14 | p_s, p_e = layer_pointer[l], layer_pointer[l] + layer_sizes[l]**2 15 | 16 | input[0,0,p_s:p_e] = sx - input[0,0,p_s:p_e] 17 | input[0,1,p_s:p_e] = sy - input[0,1,p_s:p_e] 18 | input[0,2,p_s:p_e] = sx + input[0,2,p_s:p_e] 19 | input[0,3,p_s:p_e] = sy + input[0,3,p_s:p_e] 20 | 21 | c_x = (input[0,0,p_s:p_e] + input[0,2,p_s:p_e]) / 2 22 | c_y = (input[0,1,p_s:p_e] + input[0,3,p_s:p_e]) / 2 23 | w = input[0,2,p_s:p_e] - input[0,0,p_s:p_e] 24 | h = input[0,3,p_s:p_e] - input[0,1,p_s:p_e] 25 | 26 | input[0,0,p_s:p_e] = c_x 27 | input[0,1,p_s:p_e] = c_y 28 | input[0,2,p_s:p_e] = w 29 | input[0,3,p_s:p_e] = h 30 | 31 | input[0,:,p_s:p_e] *= strides[l] 32 | 33 | return input 34 | 35 | if __name__ == "__main__": 36 | import 
-------------------------------------------------------------------------------- /tools/yolov8_box_head.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | # Turn per-anchor (left, top, right, bottom) distances into (cx, cy, w, h)
4 | # boxes in input pixels, one detection layer at a time.
5 | def yolov8_box_head(input):
6 |     layer_sizes = [80, 40, 20]
7 |     layer_pointer = [0, 6400, 8000]
8 |     strides = [8, 16, 32]
9 |     for l in range(3):
10 |         # Anchor-center grid for this layer, in grid units
11 |         x = np.arange(0.5, layer_sizes[l], 1)
12 |         y = np.arange(0.5, layer_sizes[l], 1)
13 |         sx, sy = np.meshgrid(x, y)
14 |         sx = sx.reshape(-1)
15 |         sy = sy.reshape(-1)
16 |         p_s, p_e = layer_pointer[l], layer_pointer[l] + layer_sizes[l]**2
17 | 
18 |         # Distances -> corner coordinates
19 |         input[0,0,p_s:p_e] = sx - input[0,0,p_s:p_e]
20 |         input[0,1,p_s:p_e] = sy - input[0,1,p_s:p_e]
21 |         input[0,2,p_s:p_e] = sx + input[0,2,p_s:p_e]
22 |         input[0,3,p_s:p_e] = sy + input[0,3,p_s:p_e]
23 | 
24 |         # Corners -> center/size
25 |         c_x = (input[0,0,p_s:p_e] + input[0,2,p_s:p_e]) / 2
26 |         c_y = (input[0,1,p_s:p_e] + input[0,3,p_s:p_e]) / 2
27 |         w = input[0,2,p_s:p_e] - input[0,0,p_s:p_e]
28 |         h = input[0,3,p_s:p_e] - input[0,1,p_s:p_e]
29 | 
30 |         input[0,0,p_s:p_e] = c_x
31 |         input[0,1,p_s:p_e] = c_y
32 |         input[0,2,p_s:p_e] = w
33 |         input[0,3,p_s:p_e] = h
34 | 
35 |         input[0,:,p_s:p_e] *= strides[l]
36 | 
37 |     return input
38 | 
39 | if __name__ == "__main__":
40 |     import pdb
41 |     pdb.set_trace()
42 |     arr = np.load("yolov8n_no_boxhead_result_0.npy")
43 |     yolov8_box_head(arr)
44 | 
-------------------------------------------------------------------------------- /tools/yolov8_tail.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | def softmax(x):
4 |     # x has shape (N, 4, 16): softmax over the 16 DFL bins of each box side
5 |     e_x = np.exp(x)
6 |     s_e_x = np.sum(e_x, axis=-1).reshape(-1, 4, 1)
7 |     return e_x / s_e_x
8 | 
9 | def box_tail(x):
10 |     # Reduce each 16-bin distribution to its expectation: one distance per side
11 |     x = x.reshape((-1, 4, 16))
12 |     x = softmax(x)
13 |     return np.sum(x * np.arange(16).reshape(1, 1, -1), axis=-1).reshape(-1, 4)
14 | 
15 | 
16 | def tail_process(network_outputs):
17 |     # Decode the 20/40/80-grid DFL box tensors and stack them as (1, 4, 8400)
18 |     bbox_20 = box_tail(network_outputs[1].transpose((0,2,3,1)).reshape(1,-1,64,1)).transpose().reshape((1,4,-1))
19 |     bbox_40 = box_tail(network_outputs[3].transpose((0,2,3,1)).reshape(1,-1,64,1)).transpose().reshape((1,4,-1))
20 |     bbox_80 = box_tail(network_outputs[5].transpose((0,2,3,1)).reshape(1,-1,64,1)).transpose().reshape((1,4,-1))
21 |     bbox = np.concatenate((bbox_80, bbox_40, bbox_20), axis=2)
22 | 
23 |     # Class scores need no decoding, only flattening and concatenation
24 |     confi_20 = network_outputs[0].reshape(1,80,-1)
25 |     confi_40 = network_outputs[2].reshape(1,80,-1)
26 |     confi_80 = network_outputs[4].reshape(1,80,-1)
27 |     confi = np.concatenate((confi_80, confi_40, confi_20), axis=2)
28 | 
29 |     return [bbox, confi]
30 | 
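31 | # Worked example of the decode chain (illustrative numbers): equal logits on a
32 | # side give a uniform softmax, so box_tail() returns (0+1+...+15)/16 = 7.5 grid
33 | # units for that side. yolov8_box_head.py then converts distances to a box:
34 | # distances (l,t,r,b) = (1.0, 0.5, 2.0, 1.5) at grid cell (x=3, y=2) on the
35 | # stride-8 map give corners (2.5, 2.0)-(5.5, 4.0), i.e. center (4, 3), w=3,
36 | # h=2, which scale to center (32, 24), w=24, h=16 in input pixels.
--------------------------------------------------------------------------------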